1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
65
66 static rtx legitimize_dllimport_symbol (rtx, bool);
67 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
68 static rtx legitimize_pe_coff_symbol (rtx, bool);
69
70 #ifndef CHECK_STACK_LIMIT
71 #define CHECK_STACK_LIMIT (-1)
72 #endif
73
74 /* Return index of given mode in mult and division cost tables. */
75 #define MODE_INDEX(mode) \
76 ((mode) == QImode ? 0 \
77 : (mode) == HImode ? 1 \
78 : (mode) == SImode ? 2 \
79 : (mode) == DImode ? 3 \
80 : 4)
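/* For illustration (assuming the usual cost->mult_init[MODE_INDEX (mode)]
   style lookup used later in this file): MODE_INDEX (SImode) is 2, so an
   SImode multiply cost comes from the third entry of each mult cost array
   in the tables below.  */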
81
82 /* Processor costs (relative to an add) */
83 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
84 #define COSTS_N_BYTES(N) ((N) * 2)
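/* Quick check of the assumption above: an add is one insn and two bytes, so
   it costs COSTS_N_INSNS (1) == 4 on the speed scale and
   COSTS_N_BYTES (2) == 4 on the size scale, keeping the two scales roughly
   comparable.  */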
85
86 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
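/* A rough guide to the stringop_algs initializers below: the first member is
   the algorithm used when the block size is not known at compile time; each
   following {max, alg, noalign} entry selects the algorithm for blocks up to
   MAX bytes, with -1 meaning "all larger sizes".  DUMMY_STRINGOP_ALGS thus
   simply says "always use a libcall".  */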
87
88 const
89 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
90 COSTS_N_BYTES (2), /* cost of an add instruction */
91 COSTS_N_BYTES (3), /* cost of a lea instruction */
92 COSTS_N_BYTES (2), /* variable shift costs */
93 COSTS_N_BYTES (3), /* constant shift costs */
94 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
95 COSTS_N_BYTES (3), /* HI */
96 COSTS_N_BYTES (3), /* SI */
97 COSTS_N_BYTES (3), /* DI */
98 COSTS_N_BYTES (5)}, /* other */
99 0, /* cost of multiply per each bit set */
100 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
101 COSTS_N_BYTES (3), /* HI */
102 COSTS_N_BYTES (3), /* SI */
103 COSTS_N_BYTES (3), /* DI */
104 COSTS_N_BYTES (5)}, /* other */
105 COSTS_N_BYTES (3), /* cost of movsx */
106 COSTS_N_BYTES (3), /* cost of movzx */
107 0, /* "large" insn */
108 2, /* MOVE_RATIO */
109 2, /* cost for loading QImode using movzbl */
110 {2, 2, 2}, /* cost of loading integer registers
111 in QImode, HImode and SImode.
112 Relative to reg-reg move (2). */
113 {2, 2, 2}, /* cost of storing integer registers */
114 2, /* cost of reg,reg fld/fst */
115 {2, 2, 2}, /* cost of loading fp registers
116 in SFmode, DFmode and XFmode */
117 {2, 2, 2}, /* cost of storing fp registers
118 in SFmode, DFmode and XFmode */
119 3, /* cost of moving MMX register */
120 {3, 3}, /* cost of loading MMX registers
121 in SImode and DImode */
122 {3, 3}, /* cost of storing MMX registers
123 in SImode and DImode */
124 3, /* cost of moving SSE register */
125 {3, 3, 3}, /* cost of loading SSE registers
126 in SImode, DImode and TImode */
127 {3, 3, 3}, /* cost of storing SSE registers
128 in SImode, DImode and TImode */
129 3, /* MMX or SSE register to integer */
130 0, /* size of l1 cache */
131 0, /* size of l2 cache */
132 0, /* size of prefetch block */
133 0, /* number of parallel prefetches */
134 2, /* Branch cost */
135 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
136 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
137 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
138 COSTS_N_BYTES (2), /* cost of FABS instruction. */
139 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
140 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
144 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
145 1, /* scalar_stmt_cost. */
146 1, /* scalar load_cost. */
147 1, /* scalar_store_cost. */
148 1, /* vec_stmt_cost. */
149 1, /* vec_to_scalar_cost. */
150 1, /* scalar_to_vec_cost. */
151 1, /* vec_align_load_cost. */
152 1, /* vec_unalign_load_cost. */
153 1, /* vec_store_cost. */
154 1, /* cond_taken_branch_cost. */
155 1, /* cond_not_taken_branch_cost. */
156 };
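/* How these tables are consumed (sketch): option handling later in this file
   points ix86_cost at the table for the processor being tuned for, while code
   optimized for size is costed with ix86_size_cost above, so every table must
   keep the same processor_costs layout.  */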
157
158 /* Processor costs (relative to an add) */
159 static const
160 struct processor_costs i386_cost = { /* 386 specific costs */
161 COSTS_N_INSNS (1), /* cost of an add instruction */
162 COSTS_N_INSNS (1), /* cost of a lea instruction */
163 COSTS_N_INSNS (3), /* variable shift costs */
164 COSTS_N_INSNS (2), /* constant shift costs */
165 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
166 COSTS_N_INSNS (6), /* HI */
167 COSTS_N_INSNS (6), /* SI */
168 COSTS_N_INSNS (6), /* DI */
169 COSTS_N_INSNS (6)}, /* other */
170 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
171 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
172 COSTS_N_INSNS (23), /* HI */
173 COSTS_N_INSNS (23), /* SI */
174 COSTS_N_INSNS (23), /* DI */
175 COSTS_N_INSNS (23)}, /* other */
176 COSTS_N_INSNS (3), /* cost of movsx */
177 COSTS_N_INSNS (2), /* cost of movzx */
178 15, /* "large" insn */
179 3, /* MOVE_RATIO */
180 4, /* cost for loading QImode using movzbl */
181 {2, 4, 2}, /* cost of loading integer registers
182 in QImode, HImode and SImode.
183 Relative to reg-reg move (2). */
184 {2, 4, 2}, /* cost of storing integer registers */
185 2, /* cost of reg,reg fld/fst */
186 {8, 8, 8}, /* cost of loading fp registers
187 in SFmode, DFmode and XFmode */
188 {8, 8, 8}, /* cost of storing fp registers
189 in SFmode, DFmode and XFmode */
190 2, /* cost of moving MMX register */
191 {4, 8}, /* cost of loading MMX registers
192 in SImode and DImode */
193 {4, 8}, /* cost of storing MMX registers
194 in SImode and DImode */
195 2, /* cost of moving SSE register */
196 {4, 8, 16}, /* cost of loading SSE registers
197 in SImode, DImode and TImode */
198 {4, 8, 16}, /* cost of storing SSE registers
199 in SImode, DImode and TImode */
200 3, /* MMX or SSE register to integer */
201 0, /* size of l1 cache */
202 0, /* size of l2 cache */
203 0, /* size of prefetch block */
204 0, /* number of parallel prefetches */
205 1, /* Branch cost */
206 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
207 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
208 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
209 COSTS_N_INSNS (22), /* cost of FABS instruction. */
210 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
211 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
215 DUMMY_STRINGOP_ALGS},
216 1, /* scalar_stmt_cost. */
217 1, /* scalar load_cost. */
218 1, /* scalar_store_cost. */
219 1, /* vec_stmt_cost. */
220 1, /* vec_to_scalar_cost. */
221 1, /* scalar_to_vec_cost. */
222 1, /* vec_align_load_cost. */
223 2, /* vec_unalign_load_cost. */
224 1, /* vec_store_cost. */
225 3, /* cond_taken_branch_cost. */
226 1, /* cond_not_taken_branch_cost. */
227 };
228
229 static const
230 struct processor_costs i486_cost = { /* 486 specific costs */
231 COSTS_N_INSNS (1), /* cost of an add instruction */
232 COSTS_N_INSNS (1), /* cost of a lea instruction */
233 COSTS_N_INSNS (3), /* variable shift costs */
234 COSTS_N_INSNS (2), /* constant shift costs */
235 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
236 COSTS_N_INSNS (12), /* HI */
237 COSTS_N_INSNS (12), /* SI */
238 COSTS_N_INSNS (12), /* DI */
239 COSTS_N_INSNS (12)}, /* other */
240 1, /* cost of multiply per each bit set */
241 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
242 COSTS_N_INSNS (40), /* HI */
243 COSTS_N_INSNS (40), /* SI */
244 COSTS_N_INSNS (40), /* DI */
245 COSTS_N_INSNS (40)}, /* other */
246 COSTS_N_INSNS (3), /* cost of movsx */
247 COSTS_N_INSNS (2), /* cost of movzx */
248 15, /* "large" insn */
249 3, /* MOVE_RATIO */
250 4, /* cost for loading QImode using movzbl */
251 {2, 4, 2}, /* cost of loading integer registers
252 in QImode, HImode and SImode.
253 Relative to reg-reg move (2). */
254 {2, 4, 2}, /* cost of storing integer registers */
255 2, /* cost of reg,reg fld/fst */
256 {8, 8, 8}, /* cost of loading fp registers
257 in SFmode, DFmode and XFmode */
258 {8, 8, 8}, /* cost of storing fp registers
259 in SFmode, DFmode and XFmode */
260 2, /* cost of moving MMX register */
261 {4, 8}, /* cost of loading MMX registers
262 in SImode and DImode */
263 {4, 8}, /* cost of storing MMX registers
264 in SImode and DImode */
265 2, /* cost of moving SSE register */
266 {4, 8, 16}, /* cost of loading SSE registers
267 in SImode, DImode and TImode */
268 {4, 8, 16}, /* cost of storing SSE registers
269 in SImode, DImode and TImode */
270 3, /* MMX or SSE register to integer */
271 4, /* size of l1 cache. The 486 has an 8kB cache
272 shared between code and data, so 4kB is
273 not really precise. */
274 4, /* size of l2 cache */
275 0, /* size of prefetch block */
276 0, /* number of parallel prefetches */
277 1, /* Branch cost */
278 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
279 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
280 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
281 COSTS_N_INSNS (3), /* cost of FABS instruction. */
282 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
283 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
287 DUMMY_STRINGOP_ALGS},
288 1, /* scalar_stmt_cost. */
289 1, /* scalar load_cost. */
290 1, /* scalar_store_cost. */
291 1, /* vec_stmt_cost. */
292 1, /* vec_to_scalar_cost. */
293 1, /* scalar_to_vec_cost. */
294 1, /* vec_align_load_cost. */
295 2, /* vec_unalign_load_cost. */
296 1, /* vec_store_cost. */
297 3, /* cond_taken_branch_cost. */
298 1, /* cond_not_taken_branch_cost. */
299 };
300
301 static const
302 struct processor_costs pentium_cost = {
303 COSTS_N_INSNS (1), /* cost of an add instruction */
304 COSTS_N_INSNS (1), /* cost of a lea instruction */
305 COSTS_N_INSNS (4), /* variable shift costs */
306 COSTS_N_INSNS (1), /* constant shift costs */
307 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
308 COSTS_N_INSNS (11), /* HI */
309 COSTS_N_INSNS (11), /* SI */
310 COSTS_N_INSNS (11), /* DI */
311 COSTS_N_INSNS (11)}, /* other */
312 0, /* cost of multiply per each bit set */
313 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
314 COSTS_N_INSNS (25), /* HI */
315 COSTS_N_INSNS (25), /* SI */
316 COSTS_N_INSNS (25), /* DI */
317 COSTS_N_INSNS (25)}, /* other */
318 COSTS_N_INSNS (3), /* cost of movsx */
319 COSTS_N_INSNS (2), /* cost of movzx */
320 8, /* "large" insn */
321 6, /* MOVE_RATIO */
322 6, /* cost for loading QImode using movzbl */
323 {2, 4, 2}, /* cost of loading integer registers
324 in QImode, HImode and SImode.
325 Relative to reg-reg move (2). */
326 {2, 4, 2}, /* cost of storing integer registers */
327 2, /* cost of reg,reg fld/fst */
328 {2, 2, 6}, /* cost of loading fp registers
329 in SFmode, DFmode and XFmode */
330 {4, 4, 6}, /* cost of storing fp registers
331 in SFmode, DFmode and XFmode */
332 8, /* cost of moving MMX register */
333 {8, 8}, /* cost of loading MMX registers
334 in SImode and DImode */
335 {8, 8}, /* cost of storing MMX registers
336 in SImode and DImode */
337 2, /* cost of moving SSE register */
338 {4, 8, 16}, /* cost of loading SSE registers
339 in SImode, DImode and TImode */
340 {4, 8, 16}, /* cost of storing SSE registers
341 in SImode, DImode and TImode */
342 3, /* MMX or SSE register to integer */
343 8, /* size of l1 cache. */
344 8, /* size of l2 cache */
345 0, /* size of prefetch block */
346 0, /* number of parallel prefetches */
347 2, /* Branch cost */
348 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
349 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
350 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
351 COSTS_N_INSNS (1), /* cost of FABS instruction. */
352 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
353 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
354 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
355 DUMMY_STRINGOP_ALGS},
356 {{libcall, {{-1, rep_prefix_4_byte, false}}},
357 DUMMY_STRINGOP_ALGS},
358 1, /* scalar_stmt_cost. */
359 1, /* scalar load_cost. */
360 1, /* scalar_store_cost. */
361 1, /* vec_stmt_cost. */
362 1, /* vec_to_scalar_cost. */
363 1, /* scalar_to_vec_cost. */
364 1, /* vec_align_load_cost. */
365 2, /* vec_unalign_load_cost. */
366 1, /* vec_store_cost. */
367 3, /* cond_taken_branch_cost. */
368 1, /* cond_not_taken_branch_cost. */
369 };
370
371 static const
372 struct processor_costs pentiumpro_cost = {
373 COSTS_N_INSNS (1), /* cost of an add instruction */
374 COSTS_N_INSNS (1), /* cost of a lea instruction */
375 COSTS_N_INSNS (1), /* variable shift costs */
376 COSTS_N_INSNS (1), /* constant shift costs */
377 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
378 COSTS_N_INSNS (4), /* HI */
379 COSTS_N_INSNS (4), /* SI */
380 COSTS_N_INSNS (4), /* DI */
381 COSTS_N_INSNS (4)}, /* other */
382 0, /* cost of multiply per each bit set */
383 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
384 COSTS_N_INSNS (17), /* HI */
385 COSTS_N_INSNS (17), /* SI */
386 COSTS_N_INSNS (17), /* DI */
387 COSTS_N_INSNS (17)}, /* other */
388 COSTS_N_INSNS (1), /* cost of movsx */
389 COSTS_N_INSNS (1), /* cost of movzx */
390 8, /* "large" insn */
391 6, /* MOVE_RATIO */
392 2, /* cost for loading QImode using movzbl */
393 {4, 4, 4}, /* cost of loading integer registers
394 in QImode, HImode and SImode.
395 Relative to reg-reg move (2). */
396 {2, 2, 2}, /* cost of storing integer registers */
397 2, /* cost of reg,reg fld/fst */
398 {2, 2, 6}, /* cost of loading fp registers
399 in SFmode, DFmode and XFmode */
400 {4, 4, 6}, /* cost of storing fp registers
401 in SFmode, DFmode and XFmode */
402 2, /* cost of moving MMX register */
403 {2, 2}, /* cost of loading MMX registers
404 in SImode and DImode */
405 {2, 2}, /* cost of storing MMX registers
406 in SImode and DImode */
407 2, /* cost of moving SSE register */
408 {2, 2, 8}, /* cost of loading SSE registers
409 in SImode, DImode and TImode */
410 {2, 2, 8}, /* cost of storing SSE registers
411 in SImode, DImode and TImode */
412 3, /* MMX or SSE register to integer */
413 8, /* size of l1 cache. */
414 256, /* size of l2 cache */
415 32, /* size of prefetch block */
416 6, /* number of parallel prefetches */
417 2, /* Branch cost */
418 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
419 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
420 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
421 COSTS_N_INSNS (2), /* cost of FABS instruction. */
422 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
423 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
424 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
425 (we ensure the alignment). For small blocks an inline loop is still a
426 noticeable win; for bigger blocks either rep movsl or rep movsb is the
427 way to go. Rep movsb apparently has a more expensive startup time in the
428 CPU, but after 4K the difference is down in the noise. */
429 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
430 {8192, rep_prefix_4_byte, false},
431 {-1, rep_prefix_1_byte, false}}},
432 DUMMY_STRINGOP_ALGS},
433 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
434 {8192, rep_prefix_4_byte, false},
435 {-1, libcall, false}}},
436 DUMMY_STRINGOP_ALGS},
437 1, /* scalar_stmt_cost. */
438 1, /* scalar load_cost. */
439 1, /* scalar_store_cost. */
440 1, /* vec_stmt_cost. */
441 1, /* vec_to_scalar_cost. */
442 1, /* scalar_to_vec_cost. */
443 1, /* vec_align_load_cost. */
444 2, /* vec_unalign_load_cost. */
445 1, /* vec_store_cost. */
446 3, /* cond_taken_branch_cost. */
447 1, /* cond_not_taken_branch_cost. */
448 };
449
450 static const
451 struct processor_costs geode_cost = {
452 COSTS_N_INSNS (1), /* cost of an add instruction */
453 COSTS_N_INSNS (1), /* cost of a lea instruction */
454 COSTS_N_INSNS (2), /* variable shift costs */
455 COSTS_N_INSNS (1), /* constant shift costs */
456 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
457 COSTS_N_INSNS (4), /* HI */
458 COSTS_N_INSNS (7), /* SI */
459 COSTS_N_INSNS (7), /* DI */
460 COSTS_N_INSNS (7)}, /* other */
461 0, /* cost of multiply per each bit set */
462 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
463 COSTS_N_INSNS (23), /* HI */
464 COSTS_N_INSNS (39), /* SI */
465 COSTS_N_INSNS (39), /* DI */
466 COSTS_N_INSNS (39)}, /* other */
467 COSTS_N_INSNS (1), /* cost of movsx */
468 COSTS_N_INSNS (1), /* cost of movzx */
469 8, /* "large" insn */
470 4, /* MOVE_RATIO */
471 1, /* cost for loading QImode using movzbl */
472 {1, 1, 1}, /* cost of loading integer registers
473 in QImode, HImode and SImode.
474 Relative to reg-reg move (2). */
475 {1, 1, 1}, /* cost of storing integer registers */
476 1, /* cost of reg,reg fld/fst */
477 {1, 1, 1}, /* cost of loading fp registers
478 in SFmode, DFmode and XFmode */
479 {4, 6, 6}, /* cost of storing fp registers
480 in SFmode, DFmode and XFmode */
481
482 1, /* cost of moving MMX register */
483 {1, 1}, /* cost of loading MMX registers
484 in SImode and DImode */
485 {1, 1}, /* cost of storing MMX registers
486 in SImode and DImode */
487 1, /* cost of moving SSE register */
488 {1, 1, 1}, /* cost of loading SSE registers
489 in SImode, DImode and TImode */
490 {1, 1, 1}, /* cost of storing SSE registers
491 in SImode, DImode and TImode */
492 1, /* MMX or SSE register to integer */
493 64, /* size of l1 cache. */
494 128, /* size of l2 cache. */
495 32, /* size of prefetch block */
496 1, /* number of parallel prefetches */
497 1, /* Branch cost */
498 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
499 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
500 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
501 COSTS_N_INSNS (1), /* cost of FABS instruction. */
502 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
503 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
507 DUMMY_STRINGOP_ALGS},
508 1, /* scalar_stmt_cost. */
509 1, /* scalar load_cost. */
510 1, /* scalar_store_cost. */
511 1, /* vec_stmt_cost. */
512 1, /* vec_to_scalar_cost. */
513 1, /* scalar_to_vec_cost. */
514 1, /* vec_align_load_cost. */
515 2, /* vec_unalign_load_cost. */
516 1, /* vec_store_cost. */
517 3, /* cond_taken_branch_cost. */
518 1, /* cond_not_taken_branch_cost. */
519 };
520
521 static const
522 struct processor_costs k6_cost = {
523 COSTS_N_INSNS (1), /* cost of an add instruction */
524 COSTS_N_INSNS (2), /* cost of a lea instruction */
525 COSTS_N_INSNS (1), /* variable shift costs */
526 COSTS_N_INSNS (1), /* constant shift costs */
527 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
528 COSTS_N_INSNS (3), /* HI */
529 COSTS_N_INSNS (3), /* SI */
530 COSTS_N_INSNS (3), /* DI */
531 COSTS_N_INSNS (3)}, /* other */
532 0, /* cost of multiply per each bit set */
533 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
534 COSTS_N_INSNS (18), /* HI */
535 COSTS_N_INSNS (18), /* SI */
536 COSTS_N_INSNS (18), /* DI */
537 COSTS_N_INSNS (18)}, /* other */
538 COSTS_N_INSNS (2), /* cost of movsx */
539 COSTS_N_INSNS (2), /* cost of movzx */
540 8, /* "large" insn */
541 4, /* MOVE_RATIO */
542 3, /* cost for loading QImode using movzbl */
543 {4, 5, 4}, /* cost of loading integer registers
544 in QImode, HImode and SImode.
545 Relative to reg-reg move (2). */
546 {2, 3, 2}, /* cost of storing integer registers */
547 4, /* cost of reg,reg fld/fst */
548 {6, 6, 6}, /* cost of loading fp registers
549 in SFmode, DFmode and XFmode */
550 {4, 4, 4}, /* cost of storing fp registers
551 in SFmode, DFmode and XFmode */
552 2, /* cost of moving MMX register */
553 {2, 2}, /* cost of loading MMX registers
554 in SImode and DImode */
555 {2, 2}, /* cost of storing MMX registers
556 in SImode and DImode */
557 2, /* cost of moving SSE register */
558 {2, 2, 8}, /* cost of loading SSE registers
559 in SImode, DImode and TImode */
560 {2, 2, 8}, /* cost of storing SSE registers
561 in SImode, DImode and TImode */
562 6, /* MMX or SSE register to integer */
563 32, /* size of l1 cache. */
564 32, /* size of l2 cache. Some models
565 have an integrated l2 cache, but
566 optimizing for k6 is not important
567 enough to worry about that. */
568 32, /* size of prefetch block */
569 1, /* number of parallel prefetches */
570 1, /* Branch cost */
571 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
572 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
573 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
574 COSTS_N_INSNS (2), /* cost of FABS instruction. */
575 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
576 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
580 DUMMY_STRINGOP_ALGS},
581 1, /* scalar_stmt_cost. */
582 1, /* scalar load_cost. */
583 1, /* scalar_store_cost. */
584 1, /* vec_stmt_cost. */
585 1, /* vec_to_scalar_cost. */
586 1, /* scalar_to_vec_cost. */
587 1, /* vec_align_load_cost. */
588 2, /* vec_unalign_load_cost. */
589 1, /* vec_store_cost. */
590 3, /* cond_taken_branch_cost. */
591 1, /* cond_not_taken_branch_cost. */
592 };
593
594 static const
595 struct processor_costs athlon_cost = {
596 COSTS_N_INSNS (1), /* cost of an add instruction */
597 COSTS_N_INSNS (2), /* cost of a lea instruction */
598 COSTS_N_INSNS (1), /* variable shift costs */
599 COSTS_N_INSNS (1), /* constant shift costs */
600 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
601 COSTS_N_INSNS (5), /* HI */
602 COSTS_N_INSNS (5), /* SI */
603 COSTS_N_INSNS (5), /* DI */
604 COSTS_N_INSNS (5)}, /* other */
605 0, /* cost of multiply per each bit set */
606 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
607 COSTS_N_INSNS (26), /* HI */
608 COSTS_N_INSNS (42), /* SI */
609 COSTS_N_INSNS (74), /* DI */
610 COSTS_N_INSNS (74)}, /* other */
611 COSTS_N_INSNS (1), /* cost of movsx */
612 COSTS_N_INSNS (1), /* cost of movzx */
613 8, /* "large" insn */
614 9, /* MOVE_RATIO */
615 4, /* cost for loading QImode using movzbl */
616 {3, 4, 3}, /* cost of loading integer registers
617 in QImode, HImode and SImode.
618 Relative to reg-reg move (2). */
619 {3, 4, 3}, /* cost of storing integer registers */
620 4, /* cost of reg,reg fld/fst */
621 {4, 4, 12}, /* cost of loading fp registers
622 in SFmode, DFmode and XFmode */
623 {6, 6, 8}, /* cost of storing fp registers
624 in SFmode, DFmode and XFmode */
625 2, /* cost of moving MMX register */
626 {4, 4}, /* cost of loading MMX registers
627 in SImode and DImode */
628 {4, 4}, /* cost of storing MMX registers
629 in SImode and DImode */
630 2, /* cost of moving SSE register */
631 {4, 4, 6}, /* cost of loading SSE registers
632 in SImode, DImode and TImode */
633 {4, 4, 5}, /* cost of storing SSE registers
634 in SImode, DImode and TImode */
635 5, /* MMX or SSE register to integer */
636 64, /* size of l1 cache. */
637 256, /* size of l2 cache. */
638 64, /* size of prefetch block */
639 6, /* number of parallel prefetches */
640 5, /* Branch cost */
641 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
642 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
643 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
644 COSTS_N_INSNS (2), /* cost of FABS instruction. */
645 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
646 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
647 /* For some reason, Athlon deals better with the REP prefix (relative to
648 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
649 and 128 bytes for memset. */
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
653 DUMMY_STRINGOP_ALGS},
654 1, /* scalar_stmt_cost. */
655 1, /* scalar load_cost. */
656 1, /* scalar_store_cost. */
657 1, /* vec_stmt_cost. */
658 1, /* vec_to_scalar_cost. */
659 1, /* scalar_to_vec_cost. */
660 1, /* vec_align_load_cost. */
661 2, /* vec_unalign_load_cost. */
662 1, /* vec_store_cost. */
663 3, /* cond_taken_branch_cost. */
664 1, /* cond_not_taken_branch_cost. */
665 };
666
667 static const
668 struct processor_costs k8_cost = {
669 COSTS_N_INSNS (1), /* cost of an add instruction */
670 COSTS_N_INSNS (2), /* cost of a lea instruction */
671 COSTS_N_INSNS (1), /* variable shift costs */
672 COSTS_N_INSNS (1), /* constant shift costs */
673 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
674 COSTS_N_INSNS (4), /* HI */
675 COSTS_N_INSNS (3), /* SI */
676 COSTS_N_INSNS (4), /* DI */
677 COSTS_N_INSNS (5)}, /* other */
678 0, /* cost of multiply per each bit set */
679 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
680 COSTS_N_INSNS (26), /* HI */
681 COSTS_N_INSNS (42), /* SI */
682 COSTS_N_INSNS (74), /* DI */
683 COSTS_N_INSNS (74)}, /* other */
684 COSTS_N_INSNS (1), /* cost of movsx */
685 COSTS_N_INSNS (1), /* cost of movzx */
686 8, /* "large" insn */
687 9, /* MOVE_RATIO */
688 4, /* cost for loading QImode using movzbl */
689 {3, 4, 3}, /* cost of loading integer registers
690 in QImode, HImode and SImode.
691 Relative to reg-reg move (2). */
692 {3, 4, 3}, /* cost of storing integer registers */
693 4, /* cost of reg,reg fld/fst */
694 {4, 4, 12}, /* cost of loading fp registers
695 in SFmode, DFmode and XFmode */
696 {6, 6, 8}, /* cost of storing fp registers
697 in SFmode, DFmode and XFmode */
698 2, /* cost of moving MMX register */
699 {3, 3}, /* cost of loading MMX registers
700 in SImode and DImode */
701 {4, 4}, /* cost of storing MMX registers
702 in SImode and DImode */
703 2, /* cost of moving SSE register */
704 {4, 3, 6}, /* cost of loading SSE registers
705 in SImode, DImode and TImode */
706 {4, 4, 5}, /* cost of storing SSE registers
707 in SImode, DImode and TImode */
708 5, /* MMX or SSE register to integer */
709 64, /* size of l1 cache. */
710 512, /* size of l2 cache. */
711 64, /* size of prefetch block */
712 /* New AMD processors never drop prefetches; if they cannot be performed
713 immediately, they are queued. We set the number of simultaneous prefetches
714 to a large constant to reflect this (it is probably not a good idea to
715 leave the number of prefetches completely unlimited, as their execution
716 also takes some time). */
717 100, /* number of parallel prefetches */
718 3, /* Branch cost */
719 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
720 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
721 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
722 COSTS_N_INSNS (2), /* cost of FABS instruction. */
723 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
724 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
725 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
726 small blocks it is better to use a loop. For large blocks, a libcall can
727 do nontemporal accesses and beat inline code considerably. */
728 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
729 {-1, rep_prefix_4_byte, false}}},
730 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
731 {-1, libcall, false}}}},
732 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
733 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
734 {libcall, {{48, unrolled_loop, false},
735 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
736 4, /* scalar_stmt_cost. */
737 2, /* scalar load_cost. */
738 2, /* scalar_store_cost. */
739 5, /* vec_stmt_cost. */
740 0, /* vec_to_scalar_cost. */
741 2, /* scalar_to_vec_cost. */
742 2, /* vec_align_load_cost. */
743 3, /* vec_unalign_load_cost. */
744 3, /* vec_store_cost. */
745 3, /* cond_taken_branch_cost. */
746 2, /* cond_not_taken_branch_cost. */
747 };
748
749 struct processor_costs amdfam10_cost = {
750 COSTS_N_INSNS (1), /* cost of an add instruction */
751 COSTS_N_INSNS (2), /* cost of a lea instruction */
752 COSTS_N_INSNS (1), /* variable shift costs */
753 COSTS_N_INSNS (1), /* constant shift costs */
754 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
755 COSTS_N_INSNS (4), /* HI */
756 COSTS_N_INSNS (3), /* SI */
757 COSTS_N_INSNS (4), /* DI */
758 COSTS_N_INSNS (5)}, /* other */
759 0, /* cost of multiply per each bit set */
760 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
761 COSTS_N_INSNS (35), /* HI */
762 COSTS_N_INSNS (51), /* SI */
763 COSTS_N_INSNS (83), /* DI */
764 COSTS_N_INSNS (83)}, /* other */
765 COSTS_N_INSNS (1), /* cost of movsx */
766 COSTS_N_INSNS (1), /* cost of movzx */
767 8, /* "large" insn */
768 9, /* MOVE_RATIO */
769 4, /* cost for loading QImode using movzbl */
770 {3, 4, 3}, /* cost of loading integer registers
771 in QImode, HImode and SImode.
772 Relative to reg-reg move (2). */
773 {3, 4, 3}, /* cost of storing integer registers */
774 4, /* cost of reg,reg fld/fst */
775 {4, 4, 12}, /* cost of loading fp registers
776 in SFmode, DFmode and XFmode */
777 {6, 6, 8}, /* cost of storing fp registers
778 in SFmode, DFmode and XFmode */
779 2, /* cost of moving MMX register */
780 {3, 3}, /* cost of loading MMX registers
781 in SImode and DImode */
782 {4, 4}, /* cost of storing MMX registers
783 in SImode and DImode */
784 2, /* cost of moving SSE register */
785 {4, 4, 3}, /* cost of loading SSE registers
786 in SImode, DImode and TImode */
787 {4, 4, 5}, /* cost of storing SSE registers
788 in SImode, DImode and TImode */
789 3, /* MMX or SSE register to integer */
790 /* On K8:
791 MOVD reg64, xmmreg Double FSTORE 4
792 MOVD reg32, xmmreg Double FSTORE 4
793 On AMDFAM10:
794 MOVD reg64, xmmreg Double FADD 3
795 1/1 1/1
796 MOVD reg32, xmmreg Double FADD 3
797 1/1 1/1 */
798 64, /* size of l1 cache. */
799 512, /* size of l2 cache. */
800 64, /* size of prefetch block */
801 /* New AMD processors never drop prefetches; if they cannot be performed
802 immediately, they are queued. We set the number of simultaneous prefetches
803 to a large constant to reflect this (it is probably not a good idea to
804 leave the number of prefetches completely unlimited, as their execution
805 also takes some time). */
806 100, /* number of parallel prefetches */
807 2, /* Branch cost */
808 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
809 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
810 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
811 COSTS_N_INSNS (2), /* cost of FABS instruction. */
812 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
813 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
814
815 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
816 very small blocks it is better to use a loop. For large blocks, a libcall can
817 do nontemporal accesses and beat inline code considerably. */
818 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
819 {-1, rep_prefix_4_byte, false}}},
820 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
821 {-1, libcall, false}}}},
822 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
823 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
824 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
825 {-1, libcall, false}}}},
826 4, /* scalar_stmt_cost. */
827 2, /* scalar load_cost. */
828 2, /* scalar_store_cost. */
829 6, /* vec_stmt_cost. */
830 0, /* vec_to_scalar_cost. */
831 2, /* scalar_to_vec_cost. */
832 2, /* vec_align_load_cost. */
833 2, /* vec_unalign_load_cost. */
834 2, /* vec_store_cost. */
835 2, /* cond_taken_branch_cost. */
836 1, /* cond_not_taken_branch_cost. */
837 };
838
839 struct processor_costs bdver1_cost = {
840 COSTS_N_INSNS (1), /* cost of an add instruction */
841 COSTS_N_INSNS (1), /* cost of a lea instruction */
842 COSTS_N_INSNS (1), /* variable shift costs */
843 COSTS_N_INSNS (1), /* constant shift costs */
844 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
845 COSTS_N_INSNS (4), /* HI */
846 COSTS_N_INSNS (4), /* SI */
847 COSTS_N_INSNS (6), /* DI */
848 COSTS_N_INSNS (6)}, /* other */
849 0, /* cost of multiply per each bit set */
850 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
851 COSTS_N_INSNS (35), /* HI */
852 COSTS_N_INSNS (51), /* SI */
853 COSTS_N_INSNS (83), /* DI */
854 COSTS_N_INSNS (83)}, /* other */
855 COSTS_N_INSNS (1), /* cost of movsx */
856 COSTS_N_INSNS (1), /* cost of movzx */
857 8, /* "large" insn */
858 9, /* MOVE_RATIO */
859 4, /* cost for loading QImode using movzbl */
860 {5, 5, 4}, /* cost of loading integer registers
861 in QImode, HImode and SImode.
862 Relative to reg-reg move (2). */
863 {4, 4, 4}, /* cost of storing integer registers */
864 2, /* cost of reg,reg fld/fst */
865 {5, 5, 12}, /* cost of loading fp registers
866 in SFmode, DFmode and XFmode */
867 {4, 4, 8}, /* cost of storing fp registers
868 in SFmode, DFmode and XFmode */
869 2, /* cost of moving MMX register */
870 {4, 4}, /* cost of loading MMX registers
871 in SImode and DImode */
872 {4, 4}, /* cost of storing MMX registers
873 in SImode and DImode */
874 2, /* cost of moving SSE register */
875 {4, 4, 4}, /* cost of loading SSE registers
876 in SImode, DImode and TImode */
877 {4, 4, 4}, /* cost of storing SSE registers
878 in SImode, DImode and TImode */
879 2, /* MMX or SSE register to integer */
880 /* On K8:
881 MOVD reg64, xmmreg Double FSTORE 4
882 MOVD reg32, xmmreg Double FSTORE 4
883 On AMDFAM10:
884 MOVD reg64, xmmreg Double FADD 3
885 1/1 1/1
886 MOVD reg32, xmmreg Double FADD 3
887 1/1 1/1 */
888 16, /* size of l1 cache. */
889 2048, /* size of l2 cache. */
890 64, /* size of prefetch block */
891 /* New AMD processors never drop prefetches; if they cannot be performed
892 immediately, they are queued. We set the number of simultaneous prefetches
893 to a large constant to reflect this (it is probably not a good idea to
894 leave the number of prefetches completely unlimited, as their execution
895 also takes some time). */
896 100, /* number of parallel prefetches */
897 2, /* Branch cost */
898 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
899 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
900 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
901 COSTS_N_INSNS (2), /* cost of FABS instruction. */
902 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
903 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
904
905 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
906 very small blocks it is better to use a loop. For large blocks, a libcall
907 can do nontemporal accesses and beat inline code considerably. */
908 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
909 {-1, rep_prefix_4_byte, false}}},
910 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
911 {-1, libcall, false}}}},
912 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
913 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
914 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
915 {-1, libcall, false}}}},
916 6, /* scalar_stmt_cost. */
917 4, /* scalar load_cost. */
918 4, /* scalar_store_cost. */
919 6, /* vec_stmt_cost. */
920 0, /* vec_to_scalar_cost. */
921 2, /* scalar_to_vec_cost. */
922 4, /* vec_align_load_cost. */
923 4, /* vec_unalign_load_cost. */
924 4, /* vec_store_cost. */
925 2, /* cond_taken_branch_cost. */
926 1, /* cond_not_taken_branch_cost. */
927 };
928
929 struct processor_costs bdver2_cost = {
930 COSTS_N_INSNS (1), /* cost of an add instruction */
931 COSTS_N_INSNS (1), /* cost of a lea instruction */
932 COSTS_N_INSNS (1), /* variable shift costs */
933 COSTS_N_INSNS (1), /* constant shift costs */
934 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
935 COSTS_N_INSNS (4), /* HI */
936 COSTS_N_INSNS (4), /* SI */
937 COSTS_N_INSNS (6), /* DI */
938 COSTS_N_INSNS (6)}, /* other */
939 0, /* cost of multiply per each bit set */
940 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
941 COSTS_N_INSNS (35), /* HI */
942 COSTS_N_INSNS (51), /* SI */
943 COSTS_N_INSNS (83), /* DI */
944 COSTS_N_INSNS (83)}, /* other */
945 COSTS_N_INSNS (1), /* cost of movsx */
946 COSTS_N_INSNS (1), /* cost of movzx */
947 8, /* "large" insn */
948 9, /* MOVE_RATIO */
949 4, /* cost for loading QImode using movzbl */
950 {5, 5, 4}, /* cost of loading integer registers
951 in QImode, HImode and SImode.
952 Relative to reg-reg move (2). */
953 {4, 4, 4}, /* cost of storing integer registers */
954 2, /* cost of reg,reg fld/fst */
955 {5, 5, 12}, /* cost of loading fp registers
956 in SFmode, DFmode and XFmode */
957 {4, 4, 8}, /* cost of storing fp registers
958 in SFmode, DFmode and XFmode */
959 2, /* cost of moving MMX register */
960 {4, 4}, /* cost of loading MMX registers
961 in SImode and DImode */
962 {4, 4}, /* cost of storing MMX registers
963 in SImode and DImode */
964 2, /* cost of moving SSE register */
965 {4, 4, 4}, /* cost of loading SSE registers
966 in SImode, DImode and TImode */
967 {4, 4, 4}, /* cost of storing SSE registers
968 in SImode, DImode and TImode */
969 2, /* MMX or SSE register to integer */
970 /* On K8:
971 MOVD reg64, xmmreg Double FSTORE 4
972 MOVD reg32, xmmreg Double FSTORE 4
973 On AMDFAM10:
974 MOVD reg64, xmmreg Double FADD 3
975 1/1 1/1
976 MOVD reg32, xmmreg Double FADD 3
977 1/1 1/1 */
978 16, /* size of l1 cache. */
979 2048, /* size of l2 cache. */
980 64, /* size of prefetch block */
981 /* New AMD processors never drop prefetches; if they cannot be performed
982 immediately, they are queued. We set the number of simultaneous prefetches
983 to a large constant to reflect this (it is probably not a good idea to
984 leave the number of prefetches completely unlimited, as their execution
985 also takes some time). */
986 100, /* number of parallel prefetches */
987 2, /* Branch cost */
988 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
989 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
990 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
991 COSTS_N_INSNS (2), /* cost of FABS instruction. */
992 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
993 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
994
995 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
996 very small blocks it is better to use a loop. For large blocks, a libcall
997 can do nontemporal accesses and beat inline code considerably. */
998 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
999 {-1, rep_prefix_4_byte, false}}},
1000 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1001 {-1, libcall, false}}}},
1002 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1003 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1004 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1005 {-1, libcall, false}}}},
1006 6, /* scalar_stmt_cost. */
1007 4, /* scalar load_cost. */
1008 4, /* scalar_store_cost. */
1009 6, /* vec_stmt_cost. */
1010 0, /* vec_to_scalar_cost. */
1011 2, /* scalar_to_vec_cost. */
1012 4, /* vec_align_load_cost. */
1013 4, /* vec_unalign_load_cost. */
1014 4, /* vec_store_cost. */
1015 2, /* cond_taken_branch_cost. */
1016 1, /* cond_not_taken_branch_cost. */
1017 };
1018
1019 struct processor_costs bdver3_cost = {
1020 COSTS_N_INSNS (1), /* cost of an add instruction */
1021 COSTS_N_INSNS (1), /* cost of a lea instruction */
1022 COSTS_N_INSNS (1), /* variable shift costs */
1023 COSTS_N_INSNS (1), /* constant shift costs */
1024 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1025 COSTS_N_INSNS (4), /* HI */
1026 COSTS_N_INSNS (4), /* SI */
1027 COSTS_N_INSNS (6), /* DI */
1028 COSTS_N_INSNS (6)}, /* other */
1029 0, /* cost of multiply per each bit set */
1030 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1031 COSTS_N_INSNS (35), /* HI */
1032 COSTS_N_INSNS (51), /* SI */
1033 COSTS_N_INSNS (83), /* DI */
1034 COSTS_N_INSNS (83)}, /* other */
1035 COSTS_N_INSNS (1), /* cost of movsx */
1036 COSTS_N_INSNS (1), /* cost of movzx */
1037 8, /* "large" insn */
1038 9, /* MOVE_RATIO */
1039 4, /* cost for loading QImode using movzbl */
1040 {5, 5, 4}, /* cost of loading integer registers
1041 in QImode, HImode and SImode.
1042 Relative to reg-reg move (2). */
1043 {4, 4, 4}, /* cost of storing integer registers */
1044 2, /* cost of reg,reg fld/fst */
1045 {5, 5, 12}, /* cost of loading fp registers
1046 in SFmode, DFmode and XFmode */
1047 {4, 4, 8}, /* cost of storing fp registers
1048 in SFmode, DFmode and XFmode */
1049 2, /* cost of moving MMX register */
1050 {4, 4}, /* cost of loading MMX registers
1051 in SImode and DImode */
1052 {4, 4}, /* cost of storing MMX registers
1053 in SImode and DImode */
1054 2, /* cost of moving SSE register */
1055 {4, 4, 4}, /* cost of loading SSE registers
1056 in SImode, DImode and TImode */
1057 {4, 4, 4}, /* cost of storing SSE registers
1058 in SImode, DImode and TImode */
1059 2, /* MMX or SSE register to integer */
1060 16, /* size of l1 cache. */
1061 2048, /* size of l2 cache. */
1062 64, /* size of prefetch block */
1063 /* New AMD processors never drop prefetches; if they cannot be performed
1064 immediately, they are queued. We set the number of simultaneous prefetches
1065 to a large constant to reflect this (it is probably not a good idea to
1066 leave the number of prefetches completely unlimited, as their execution
1067 also takes some time). */
1068 100, /* number of parallel prefetches */
1069 2, /* Branch cost */
1070 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1071 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1072 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1073 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1074 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1075 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1076
1077 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1078 very small blocks it is better to use a loop. For large blocks, a libcall
1079 can do nontemporal accesses and beat inline code considerably. */
1080 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1081 {-1, rep_prefix_4_byte, false}}},
1082 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1083 {-1, libcall, false}}}},
1084 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1085 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1086 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1087 {-1, libcall, false}}}},
1088 6, /* scalar_stmt_cost. */
1089 4, /* scalar load_cost. */
1090 4, /* scalar_store_cost. */
1091 6, /* vec_stmt_cost. */
1092 0, /* vec_to_scalar_cost. */
1093 2, /* scalar_to_vec_cost. */
1094 4, /* vec_align_load_cost. */
1095 4, /* vec_unalign_load_cost. */
1096 4, /* vec_store_cost. */
1097 2, /* cond_taken_branch_cost. */
1098 1, /* cond_not_taken_branch_cost. */
1099 };
1100
1101 struct processor_costs btver1_cost = {
1102 COSTS_N_INSNS (1), /* cost of an add instruction */
1103 COSTS_N_INSNS (2), /* cost of a lea instruction */
1104 COSTS_N_INSNS (1), /* variable shift costs */
1105 COSTS_N_INSNS (1), /* constant shift costs */
1106 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1107 COSTS_N_INSNS (4), /* HI */
1108 COSTS_N_INSNS (3), /* SI */
1109 COSTS_N_INSNS (4), /* DI */
1110 COSTS_N_INSNS (5)}, /* other */
1111 0, /* cost of multiply per each bit set */
1112 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1113 COSTS_N_INSNS (35), /* HI */
1114 COSTS_N_INSNS (51), /* SI */
1115 COSTS_N_INSNS (83), /* DI */
1116 COSTS_N_INSNS (83)}, /* other */
1117 COSTS_N_INSNS (1), /* cost of movsx */
1118 COSTS_N_INSNS (1), /* cost of movzx */
1119 8, /* "large" insn */
1120 9, /* MOVE_RATIO */
1121 4, /* cost for loading QImode using movzbl */
1122 {3, 4, 3}, /* cost of loading integer registers
1123 in QImode, HImode and SImode.
1124 Relative to reg-reg move (2). */
1125 {3, 4, 3}, /* cost of storing integer registers */
1126 4, /* cost of reg,reg fld/fst */
1127 {4, 4, 12}, /* cost of loading fp registers
1128 in SFmode, DFmode and XFmode */
1129 {6, 6, 8}, /* cost of storing fp registers
1130 in SFmode, DFmode and XFmode */
1131 2, /* cost of moving MMX register */
1132 {3, 3}, /* cost of loading MMX registers
1133 in SImode and DImode */
1134 {4, 4}, /* cost of storing MMX registers
1135 in SImode and DImode */
1136 2, /* cost of moving SSE register */
1137 {4, 4, 3}, /* cost of loading SSE registers
1138 in SImode, DImode and TImode */
1139 {4, 4, 5}, /* cost of storing SSE registers
1140 in SImode, DImode and TImode */
1141 3, /* MMX or SSE register to integer */
1142 /* On K8:
1143 MOVD reg64, xmmreg Double FSTORE 4
1144 MOVD reg32, xmmreg Double FSTORE 4
1145 On AMDFAM10:
1146 MOVD reg64, xmmreg Double FADD 3
1147 1/1 1/1
1148 MOVD reg32, xmmreg Double FADD 3
1149 1/1 1/1 */
1150 32, /* size of l1 cache. */
1151 512, /* size of l2 cache. */
1152 64, /* size of prefetch block */
1153 100, /* number of parallel prefetches */
1154 2, /* Branch cost */
1155 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1156 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1157 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1158 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1159 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1160 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1161
1162 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1163 very small blocks it is better to use a loop. For large blocks, a libcall can
1164 do nontemporal accesses and beat inline code considerably. */
1165 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1166 {-1, rep_prefix_4_byte, false}}},
1167 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1168 {-1, libcall, false}}}},
1169 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1170 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1171 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1172 {-1, libcall, false}}}},
1173 4, /* scalar_stmt_cost. */
1174 2, /* scalar load_cost. */
1175 2, /* scalar_store_cost. */
1176 6, /* vec_stmt_cost. */
1177 0, /* vec_to_scalar_cost. */
1178 2, /* scalar_to_vec_cost. */
1179 2, /* vec_align_load_cost. */
1180 2, /* vec_unalign_load_cost. */
1181 2, /* vec_store_cost. */
1182 2, /* cond_taken_branch_cost. */
1183 1, /* cond_not_taken_branch_cost. */
1184 };
1185
1186 struct processor_costs btver2_cost = {
1187 COSTS_N_INSNS (1), /* cost of an add instruction */
1188 COSTS_N_INSNS (2), /* cost of a lea instruction */
1189 COSTS_N_INSNS (1), /* variable shift costs */
1190 COSTS_N_INSNS (1), /* constant shift costs */
1191 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1192 COSTS_N_INSNS (4), /* HI */
1193 COSTS_N_INSNS (3), /* SI */
1194 COSTS_N_INSNS (4), /* DI */
1195 COSTS_N_INSNS (5)}, /* other */
1196 0, /* cost of multiply per each bit set */
1197 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1198 COSTS_N_INSNS (35), /* HI */
1199 COSTS_N_INSNS (51), /* SI */
1200 COSTS_N_INSNS (83), /* DI */
1201 COSTS_N_INSNS (83)}, /* other */
1202 COSTS_N_INSNS (1), /* cost of movsx */
1203 COSTS_N_INSNS (1), /* cost of movzx */
1204 8, /* "large" insn */
1205 9, /* MOVE_RATIO */
1206 4, /* cost for loading QImode using movzbl */
1207 {3, 4, 3}, /* cost of loading integer registers
1208 in QImode, HImode and SImode.
1209 Relative to reg-reg move (2). */
1210 {3, 4, 3}, /* cost of storing integer registers */
1211 4, /* cost of reg,reg fld/fst */
1212 {4, 4, 12}, /* cost of loading fp registers
1213 in SFmode, DFmode and XFmode */
1214 {6, 6, 8}, /* cost of storing fp registers
1215 in SFmode, DFmode and XFmode */
1216 2, /* cost of moving MMX register */
1217 {3, 3}, /* cost of loading MMX registers
1218 in SImode and DImode */
1219 {4, 4}, /* cost of storing MMX registers
1220 in SImode and DImode */
1221 2, /* cost of moving SSE register */
1222 {4, 4, 3}, /* cost of loading SSE registers
1223 in SImode, DImode and TImode */
1224 {4, 4, 5}, /* cost of storing SSE registers
1225 in SImode, DImode and TImode */
1226 3, /* MMX or SSE register to integer */
1227 /* On K8:
1228 MOVD reg64, xmmreg Double FSTORE 4
1229 MOVD reg32, xmmreg Double FSTORE 4
1230 On AMDFAM10:
1231 MOVD reg64, xmmreg Double FADD 3
1232 1/1 1/1
1233 MOVD reg32, xmmreg Double FADD 3
1234 1/1 1/1 */
1235 32, /* size of l1 cache. */
1236 2048, /* size of l2 cache. */
1237 64, /* size of prefetch block */
1238 100, /* number of parallel prefetches */
1239 2, /* Branch cost */
1240 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1241 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1242 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1243 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1244 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1245 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1246
1247 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1248 {-1, rep_prefix_4_byte, false}}},
1249 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1250 {-1, libcall, false}}}},
1251 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1252 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1253 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1254 {-1, libcall, false}}}},
1255 4, /* scalar_stmt_cost. */
1256 2, /* scalar load_cost. */
1257 2, /* scalar_store_cost. */
1258 6, /* vec_stmt_cost. */
1259 0, /* vec_to_scalar_cost. */
1260 2, /* scalar_to_vec_cost. */
1261 2, /* vec_align_load_cost. */
1262 2, /* vec_unalign_load_cost. */
1263 2, /* vec_store_cost. */
1264 2, /* cond_taken_branch_cost. */
1265 1, /* cond_not_taken_branch_cost. */
1266 };
1267
1268 static const
1269 struct processor_costs pentium4_cost = {
1270 COSTS_N_INSNS (1), /* cost of an add instruction */
1271 COSTS_N_INSNS (3), /* cost of a lea instruction */
1272 COSTS_N_INSNS (4), /* variable shift costs */
1273 COSTS_N_INSNS (4), /* constant shift costs */
1274 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1275 COSTS_N_INSNS (15), /* HI */
1276 COSTS_N_INSNS (15), /* SI */
1277 COSTS_N_INSNS (15), /* DI */
1278 COSTS_N_INSNS (15)}, /* other */
1279 0, /* cost of multiply per each bit set */
1280 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1281 COSTS_N_INSNS (56), /* HI */
1282 COSTS_N_INSNS (56), /* SI */
1283 COSTS_N_INSNS (56), /* DI */
1284 COSTS_N_INSNS (56)}, /* other */
1285 COSTS_N_INSNS (1), /* cost of movsx */
1286 COSTS_N_INSNS (1), /* cost of movzx */
1287 16, /* "large" insn */
1288 6, /* MOVE_RATIO */
1289 2, /* cost for loading QImode using movzbl */
1290 {4, 5, 4}, /* cost of loading integer registers
1291 in QImode, HImode and SImode.
1292 Relative to reg-reg move (2). */
1293 {2, 3, 2}, /* cost of storing integer registers */
1294 2, /* cost of reg,reg fld/fst */
1295 {2, 2, 6}, /* cost of loading fp registers
1296 in SFmode, DFmode and XFmode */
1297 {4, 4, 6}, /* cost of storing fp registers
1298 in SFmode, DFmode and XFmode */
1299 2, /* cost of moving MMX register */
1300 {2, 2}, /* cost of loading MMX registers
1301 in SImode and DImode */
1302 {2, 2}, /* cost of storing MMX registers
1303 in SImode and DImode */
1304 12, /* cost of moving SSE register */
1305 {12, 12, 12}, /* cost of loading SSE registers
1306 in SImode, DImode and TImode */
1307 {2, 2, 8}, /* cost of storing SSE registers
1308 in SImode, DImode and TImode */
1309 10, /* MMX or SSE register to integer */
1310 8, /* size of l1 cache. */
1311 256, /* size of l2 cache. */
1312 64, /* size of prefetch block */
1313 6, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1321 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1322 DUMMY_STRINGOP_ALGS},
1323 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1324 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1325 DUMMY_STRINGOP_ALGS},
1326 1, /* scalar_stmt_cost. */
1327 1, /* scalar load_cost. */
1328 1, /* scalar_store_cost. */
1329 1, /* vec_stmt_cost. */
1330 1, /* vec_to_scalar_cost. */
1331 1, /* scalar_to_vec_cost. */
1332 1, /* vec_align_load_cost. */
1333 2, /* vec_unalign_load_cost. */
1334 1, /* vec_store_cost. */
1335 3, /* cond_taken_branch_cost. */
1336 1, /* cond_not_taken_branch_cost. */
1337 };
1338
1339 static const
1340 struct processor_costs nocona_cost = {
1341 COSTS_N_INSNS (1), /* cost of an add instruction */
1342 COSTS_N_INSNS (1), /* cost of a lea instruction */
1343 COSTS_N_INSNS (1), /* variable shift costs */
1344 COSTS_N_INSNS (1), /* constant shift costs */
1345 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1346 COSTS_N_INSNS (10), /* HI */
1347 COSTS_N_INSNS (10), /* SI */
1348 COSTS_N_INSNS (10), /* DI */
1349 COSTS_N_INSNS (10)}, /* other */
1350 0, /* cost of multiply per each bit set */
1351 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1352 COSTS_N_INSNS (66), /* HI */
1353 COSTS_N_INSNS (66), /* SI */
1354 COSTS_N_INSNS (66), /* DI */
1355 COSTS_N_INSNS (66)}, /* other */
1356 COSTS_N_INSNS (1), /* cost of movsx */
1357 COSTS_N_INSNS (1), /* cost of movzx */
1358 16, /* "large" insn */
1359 17, /* MOVE_RATIO */
1360 4, /* cost for loading QImode using movzbl */
1361 {4, 4, 4}, /* cost of loading integer registers
1362 in QImode, HImode and SImode.
1363 Relative to reg-reg move (2). */
1364 {4, 4, 4}, /* cost of storing integer registers */
1365 3, /* cost of reg,reg fld/fst */
1366 {12, 12, 12}, /* cost of loading fp registers
1367 in SFmode, DFmode and XFmode */
1368 {4, 4, 4}, /* cost of storing fp registers
1369 in SFmode, DFmode and XFmode */
1370 6, /* cost of moving MMX register */
1371 {12, 12}, /* cost of loading MMX registers
1372 in SImode and DImode */
1373 {12, 12}, /* cost of storing MMX registers
1374 in SImode and DImode */
1375 6, /* cost of moving SSE register */
1376 {12, 12, 12}, /* cost of loading SSE registers
1377 in SImode, DImode and TImode */
1378 {12, 12, 12}, /* cost of storing SSE registers
1379 in SImode, DImode and TImode */
1380 8, /* MMX or SSE register to integer */
1381 8, /* size of l1 cache. */
1382 1024, /* size of l2 cache. */
1383 128, /* size of prefetch block */
1384 8, /* number of parallel prefetches */
1385 1, /* Branch cost */
1386 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1387 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1388 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1389 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1390 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1391 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1392 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1393 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1394 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1395 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1396 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1397 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1398 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1399 1, /* scalar_stmt_cost. */
1400 1, /* scalar load_cost. */
1401 1, /* scalar_store_cost. */
1402 1, /* vec_stmt_cost. */
1403 1, /* vec_to_scalar_cost. */
1404 1, /* scalar_to_vec_cost. */
1405 1, /* vec_align_load_cost. */
1406 2, /* vec_unalign_load_cost. */
1407 1, /* vec_store_cost. */
1408 3, /* cond_taken_branch_cost. */
1409 1, /* cond_not_taken_branch_cost. */
1410 };
1411
1412 static const
1413 struct processor_costs atom_cost = {
1414 COSTS_N_INSNS (1), /* cost of an add instruction */
1415 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1416 COSTS_N_INSNS (1), /* variable shift costs */
1417 COSTS_N_INSNS (1), /* constant shift costs */
1418 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1419 COSTS_N_INSNS (4), /* HI */
1420 COSTS_N_INSNS (3), /* SI */
1421 COSTS_N_INSNS (4), /* DI */
1422 COSTS_N_INSNS (2)}, /* other */
1423 0, /* cost of multiply per each bit set */
1424 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1425 COSTS_N_INSNS (26), /* HI */
1426 COSTS_N_INSNS (42), /* SI */
1427 COSTS_N_INSNS (74), /* DI */
1428 COSTS_N_INSNS (74)}, /* other */
1429 COSTS_N_INSNS (1), /* cost of movsx */
1430 COSTS_N_INSNS (1), /* cost of movzx */
1431 8, /* "large" insn */
1432 17, /* MOVE_RATIO */
1433 4, /* cost for loading QImode using movzbl */
1434 {4, 4, 4}, /* cost of loading integer registers
1435 in QImode, HImode and SImode.
1436 Relative to reg-reg move (2). */
1437 {4, 4, 4}, /* cost of storing integer registers */
1438 4, /* cost of reg,reg fld/fst */
1439 {12, 12, 12}, /* cost of loading fp registers
1440 in SFmode, DFmode and XFmode */
1441 {6, 6, 8}, /* cost of storing fp registers
1442 in SFmode, DFmode and XFmode */
1443 2, /* cost of moving MMX register */
1444 {8, 8}, /* cost of loading MMX registers
1445 in SImode and DImode */
1446 {8, 8}, /* cost of storing MMX registers
1447 in SImode and DImode */
1448 2, /* cost of moving SSE register */
1449 {8, 8, 8}, /* cost of loading SSE registers
1450 in SImode, DImode and TImode */
1451 {8, 8, 8}, /* cost of storing SSE registers
1452 in SImode, DImode and TImode */
1453 5, /* MMX or SSE register to integer */
1454 32, /* size of l1 cache. */
1455 256, /* size of l2 cache. */
1456 64, /* size of prefetch block */
1457 6, /* number of parallel prefetches */
1458 3, /* Branch cost */
1459 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1460 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1461 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1462 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1463 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1464 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1465 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1466 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1467 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1468 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1469 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1470 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1471 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1472 1, /* scalar_stmt_cost. */
1473 1, /* scalar load_cost. */
1474 1, /* scalar_store_cost. */
1475 1, /* vec_stmt_cost. */
1476 1, /* vec_to_scalar_cost. */
1477 1, /* scalar_to_vec_cost. */
1478 1, /* vec_align_load_cost. */
1479 2, /* vec_unalign_load_cost. */
1480 1, /* vec_store_cost. */
1481 3, /* cond_taken_branch_cost. */
1482 1, /* cond_not_taken_branch_cost. */
1483 };
1484
1485 static const
1486 struct processor_costs slm_cost = {
1487 COSTS_N_INSNS (1), /* cost of an add instruction */
1488 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1489 COSTS_N_INSNS (1), /* variable shift costs */
1490 COSTS_N_INSNS (1), /* constant shift costs */
1491 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1492 COSTS_N_INSNS (4), /* HI */
1493 COSTS_N_INSNS (3), /* SI */
1494 COSTS_N_INSNS (4), /* DI */
1495 COSTS_N_INSNS (2)}, /* other */
1496 0, /* cost of multiply per each bit set */
1497 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1498 COSTS_N_INSNS (26), /* HI */
1499 COSTS_N_INSNS (42), /* SI */
1500 COSTS_N_INSNS (74), /* DI */
1501 COSTS_N_INSNS (74)}, /* other */
1502 COSTS_N_INSNS (1), /* cost of movsx */
1503 COSTS_N_INSNS (1), /* cost of movzx */
1504 8, /* "large" insn */
1505 17, /* MOVE_RATIO */
1506 4, /* cost for loading QImode using movzbl */
1507 {4, 4, 4}, /* cost of loading integer registers
1508 in QImode, HImode and SImode.
1509 Relative to reg-reg move (2). */
1510 {4, 4, 4}, /* cost of storing integer registers */
1511 4, /* cost of reg,reg fld/fst */
1512 {12, 12, 12}, /* cost of loading fp registers
1513 in SFmode, DFmode and XFmode */
1514 {6, 6, 8}, /* cost of storing fp registers
1515 in SFmode, DFmode and XFmode */
1516 2, /* cost of moving MMX register */
1517 {8, 8}, /* cost of loading MMX registers
1518 in SImode and DImode */
1519 {8, 8}, /* cost of storing MMX registers
1520 in SImode and DImode */
1521 2, /* cost of moving SSE register */
1522 {8, 8, 8}, /* cost of loading SSE registers
1523 in SImode, DImode and TImode */
1524 {8, 8, 8}, /* cost of storing SSE registers
1525 in SImode, DImode and TImode */
1526 5, /* MMX or SSE register to integer */
1527 32, /* size of l1 cache. */
1528 256, /* size of l2 cache. */
1529 64, /* size of prefetch block */
1530 6, /* number of parallel prefetches */
1531 3, /* Branch cost */
1532 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1533 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1534 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1535 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1536 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1537 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1538 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1539 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1540 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1541 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1542 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1543 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1544 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1545 1, /* scalar_stmt_cost. */
1546 1, /* scalar load_cost. */
1547 1, /* scalar_store_cost. */
1548 1, /* vec_stmt_cost. */
1549 1, /* vec_to_scalar_cost. */
1550 1, /* scalar_to_vec_cost. */
1551 1, /* vec_align_load_cost. */
1552 2, /* vec_unalign_load_cost. */
1553 1, /* vec_store_cost. */
1554 3, /* cond_taken_branch_cost. */
1555 1, /* cond_not_taken_branch_cost. */
1556 };
1557
1558 /* Generic64 should produce code tuned for Nocona and K8. */
1559 static const
1560 struct processor_costs generic64_cost = {
1561 COSTS_N_INSNS (1), /* cost of an add instruction */
1562 /* On all chips taken into consideration, lea is 2 cycles or more. With
1563 this cost, however, our current implementation of synth_mult uses
1564 unnecessary temporary registers, causing regressions on several
1565 SPECfp benchmarks. */
1566 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1567 COSTS_N_INSNS (1), /* variable shift costs */
1568 COSTS_N_INSNS (1), /* constant shift costs */
1569 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1570 COSTS_N_INSNS (4), /* HI */
1571 COSTS_N_INSNS (3), /* SI */
1572 COSTS_N_INSNS (4), /* DI */
1573 COSTS_N_INSNS (2)}, /* other */
1574 0, /* cost of multiply per each bit set */
1575 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1576 COSTS_N_INSNS (26), /* HI */
1577 COSTS_N_INSNS (42), /* SI */
1578 COSTS_N_INSNS (74), /* DI */
1579 COSTS_N_INSNS (74)}, /* other */
1580 COSTS_N_INSNS (1), /* cost of movsx */
1581 COSTS_N_INSNS (1), /* cost of movzx */
1582 8, /* "large" insn */
1583 17, /* MOVE_RATIO */
1584 4, /* cost for loading QImode using movzbl */
1585 {4, 4, 4}, /* cost of loading integer registers
1586 in QImode, HImode and SImode.
1587 Relative to reg-reg move (2). */
1588 {4, 4, 4}, /* cost of storing integer registers */
1589 4, /* cost of reg,reg fld/fst */
1590 {12, 12, 12}, /* cost of loading fp registers
1591 in SFmode, DFmode and XFmode */
1592 {6, 6, 8}, /* cost of storing fp registers
1593 in SFmode, DFmode and XFmode */
1594 2, /* cost of moving MMX register */
1595 {8, 8}, /* cost of loading MMX registers
1596 in SImode and DImode */
1597 {8, 8}, /* cost of storing MMX registers
1598 in SImode and DImode */
1599 2, /* cost of moving SSE register */
1600 {8, 8, 8}, /* cost of loading SSE registers
1601 in SImode, DImode and TImode */
1602 {8, 8, 8}, /* cost of storing SSE registers
1603 in SImode, DImode and TImode */
1604 5, /* MMX or SSE register to integer */
1605 32, /* size of l1 cache. */
1606 512, /* size of l2 cache. */
1607 64, /* size of prefetch block */
1608 6, /* number of parallel prefetches */
1609 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1610 value is increased to the perhaps more appropriate value of 5. */
1611 3, /* Branch cost */
1612 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1613 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1614 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1615 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1616 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1617 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1618 {DUMMY_STRINGOP_ALGS,
1619 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1620 {-1, libcall, false}}}},
1621 {DUMMY_STRINGOP_ALGS,
1622 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1623 {-1, libcall, false}}}},
1624 1, /* scalar_stmt_cost. */
1625 1, /* scalar load_cost. */
1626 1, /* scalar_store_cost. */
1627 1, /* vec_stmt_cost. */
1628 1, /* vec_to_scalar_cost. */
1629 1, /* scalar_to_vec_cost. */
1630 1, /* vec_align_load_cost. */
1631 2, /* vec_unalign_load_cost. */
1632 1, /* vec_store_cost. */
1633 3, /* cond_taken_branch_cost. */
1634 1, /* cond_not_taken_branch_cost. */
1635 };
1636
1637 /* core_cost should produce code tuned for the Core family of CPUs. */
1638 static const
1639 struct processor_costs core_cost = {
1640 COSTS_N_INSNS (1), /* cost of an add instruction */
1641 /* On all chips taken into consideration, lea is 2 cycles or more. With
1642 this cost, however, our current implementation of synth_mult uses
1643 unnecessary temporary registers, causing regressions on several
1644 SPECfp benchmarks. */
1645 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1646 COSTS_N_INSNS (1), /* variable shift costs */
1647 COSTS_N_INSNS (1), /* constant shift costs */
1648 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1649 COSTS_N_INSNS (4), /* HI */
1650 COSTS_N_INSNS (3), /* SI */
1651 COSTS_N_INSNS (4), /* DI */
1652 COSTS_N_INSNS (2)}, /* other */
1653 0, /* cost of multiply per each bit set */
1654 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1655 COSTS_N_INSNS (26), /* HI */
1656 COSTS_N_INSNS (42), /* SI */
1657 COSTS_N_INSNS (74), /* DI */
1658 COSTS_N_INSNS (74)}, /* other */
1659 COSTS_N_INSNS (1), /* cost of movsx */
1660 COSTS_N_INSNS (1), /* cost of movzx */
1661 8, /* "large" insn */
1662 17, /* MOVE_RATIO */
1663 4, /* cost for loading QImode using movzbl */
1664 {4, 4, 4}, /* cost of loading integer registers
1665 in QImode, HImode and SImode.
1666 Relative to reg-reg move (2). */
1667 {4, 4, 4}, /* cost of storing integer registers */
1668 4, /* cost of reg,reg fld/fst */
1669 {12, 12, 12}, /* cost of loading fp registers
1670 in SFmode, DFmode and XFmode */
1671 {6, 6, 8}, /* cost of storing fp registers
1672 in SFmode, DFmode and XFmode */
1673 2, /* cost of moving MMX register */
1674 {8, 8}, /* cost of loading MMX registers
1675 in SImode and DImode */
1676 {8, 8}, /* cost of storing MMX registers
1677 in SImode and DImode */
1678 2, /* cost of moving SSE register */
1679 {8, 8, 8}, /* cost of loading SSE registers
1680 in SImode, DImode and TImode */
1681 {8, 8, 8}, /* cost of storing SSE registers
1682 in SImode, DImode and TImode */
1683 5, /* MMX or SSE register to integer */
1684 64, /* size of l1 cache. */
1685 512, /* size of l2 cache. */
1686 64, /* size of prefetch block */
1687 6, /* number of parallel prefetches */
1688 /* FIXME: perhaps a more appropriate value is 5. */
1689 3, /* Branch cost */
1690 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1691 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1692 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1693 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1694 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1695 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1696 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1697 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1698 {-1, libcall, false}}}},
1699 {{libcall, {{6, loop_1_byte, true},
1700 {24, loop, true},
1701 {8192, rep_prefix_4_byte, true},
1702 {-1, libcall, false}}},
1703 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1704 {-1, libcall, false}}}},
1705 1, /* scalar_stmt_cost. */
1706 1, /* scalar load_cost. */
1707 1, /* scalar_store_cost. */
1708 1, /* vec_stmt_cost. */
1709 1, /* vec_to_scalar_cost. */
1710 1, /* scalar_to_vec_cost. */
1711 1, /* vec_align_load_cost. */
1712 2, /* vec_unalign_load_cost. */
1713 1, /* vec_store_cost. */
1714 3, /* cond_taken_branch_cost. */
1715 1, /* cond_not_taken_branch_cost. */
1716 };
1717
1718 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1719 Athlon and K8. */
1720 static const
1721 struct processor_costs generic32_cost = {
1722 COSTS_N_INSNS (1), /* cost of an add instruction */
1723 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1724 COSTS_N_INSNS (1), /* variable shift costs */
1725 COSTS_N_INSNS (1), /* constant shift costs */
1726 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1727 COSTS_N_INSNS (4), /* HI */
1728 COSTS_N_INSNS (3), /* SI */
1729 COSTS_N_INSNS (4), /* DI */
1730 COSTS_N_INSNS (2)}, /* other */
1731 0, /* cost of multiply per each bit set */
1732 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1733 COSTS_N_INSNS (26), /* HI */
1734 COSTS_N_INSNS (42), /* SI */
1735 COSTS_N_INSNS (74), /* DI */
1736 COSTS_N_INSNS (74)}, /* other */
1737 COSTS_N_INSNS (1), /* cost of movsx */
1738 COSTS_N_INSNS (1), /* cost of movzx */
1739 8, /* "large" insn */
1740 17, /* MOVE_RATIO */
1741 4, /* cost for loading QImode using movzbl */
1742 {4, 4, 4}, /* cost of loading integer registers
1743 in QImode, HImode and SImode.
1744 Relative to reg-reg move (2). */
1745 {4, 4, 4}, /* cost of storing integer registers */
1746 4, /* cost of reg,reg fld/fst */
1747 {12, 12, 12}, /* cost of loading fp registers
1748 in SFmode, DFmode and XFmode */
1749 {6, 6, 8}, /* cost of storing fp registers
1750 in SFmode, DFmode and XFmode */
1751 2, /* cost of moving MMX register */
1752 {8, 8}, /* cost of loading MMX registers
1753 in SImode and DImode */
1754 {8, 8}, /* cost of storing MMX registers
1755 in SImode and DImode */
1756 2, /* cost of moving SSE register */
1757 {8, 8, 8}, /* cost of loading SSE registers
1758 in SImode, DImode and TImode */
1759 {8, 8, 8}, /* cost of storing SSE registers
1760 in SImode, DImode and TImode */
1761 5, /* MMX or SSE register to integer */
1762 32, /* size of l1 cache. */
1763 256, /* size of l2 cache. */
1764 64, /* size of prefetch block */
1765 6, /* number of parallel prefetches */
1766 3, /* Branch cost */
1767 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1768 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1769 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1770 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1771 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1772 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1773 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1774 {-1, libcall, false}}},
1775 DUMMY_STRINGOP_ALGS},
1776 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1777 {-1, libcall, false}}},
1778 DUMMY_STRINGOP_ALGS},
1779 1, /* scalar_stmt_cost. */
1780 1, /* scalar load_cost. */
1781 1, /* scalar_store_cost. */
1782 1, /* vec_stmt_cost. */
1783 1, /* vec_to_scalar_cost. */
1784 1, /* scalar_to_vec_cost. */
1785 1, /* vec_align_load_cost. */
1786 2, /* vec_unalign_load_cost. */
1787 1, /* vec_store_cost. */
1788 3, /* cond_taken_branch_cost. */
1789 1, /* cond_not_taken_branch_cost. */
1790 };
1791
1792 /* Set by -mtune. */
1793 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1794
1795 /* Set by -mtune or -Os. */
1796 const struct processor_costs *ix86_cost = &pentium_cost;
1797
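/* Illustrative sketch (not part of the original source): both pointers
   start out at &pentium_cost and are re-pointed during option handling.
   Roughly (a hypothetical condensation of what ix86_option_override_internal
   does later in this file):

     ix86_tune_cost = processor_target_table[ix86_tune].cost;
     ix86_cost = optimize_size ? &ix86_size_cost : ix86_tune_cost;

   so -Os uses the byte-oriented ix86_size_cost table while -mtune selects
   one of the cycle-oriented tables above.  */
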
1798 /* Processor feature/optimization bitmasks. */
1799 #define m_386 (1<<PROCESSOR_I386)
1800 #define m_486 (1<<PROCESSOR_I486)
1801 #define m_PENT (1<<PROCESSOR_PENTIUM)
1802 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1803 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1804 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1805 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1806 #define m_CORE2 (1<<PROCESSOR_CORE2)
1807 #define m_COREI7 (1<<PROCESSOR_COREI7)
1808 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1809 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1810 #define m_ATOM (1<<PROCESSOR_ATOM)
1811 #define m_SLM (1<<PROCESSOR_SLM)
1812
1813 #define m_GEODE (1<<PROCESSOR_GEODE)
1814 #define m_K6 (1<<PROCESSOR_K6)
1815 #define m_K6_GEODE (m_K6 | m_GEODE)
1816 #define m_K8 (1<<PROCESSOR_K8)
1817 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1818 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1819 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1820 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1821 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1822 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1823 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1824 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1825 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1826 #define m_BTVER (m_BTVER1 | m_BTVER2)
1827 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1828
1829 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1830 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1831
1832 /* Generic instruction choice should be a common subset of the supported CPUs
1833 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1834 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1835
1836 /* Feature tests against the various tunings. */
1837 unsigned char ix86_tune_features[X86_TUNE_LAST];
1838
1839 /* Feature tests against the various tunings used to create ix86_tune_features
1840 based on the processor mask. */
1841 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1842 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1843 negatively, so enabling it for Generic64 seems like a good code-size
1844 tradeoff. We can't enable it for 32-bit generic because it does not
1845 work well with PPro-based chips. */
1846 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1847
1848 /* X86_TUNE_PUSH_MEMORY */
1849 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1850
1851 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1852 m_486 | m_PENT,
1853
1854 /* X86_TUNE_UNROLL_STRLEN */
1855 m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1856
1857 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put into the P4 based
1858 on simulation results, but after the P4 was made no performance benefit
1859 was observed from branch hints, and they also increase code size.
1860 As a result, icc never generates branch hints. */
1861 0,
1862
1863 /* X86_TUNE_DOUBLE_WITH_ADD */
1864 ~m_386,
1865
1866 /* X86_TUNE_USE_SAHF */
1867 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1868
1869 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1870 partial dependencies. */
1871 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1872
1873 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1874 register stalls for the Generic32 compilation setting as well. However,
1875 in the current implementation partial register stalls are not eliminated
1876 very well - they can be introduced via subregs synthesized by combine
1877 and can happen in caller/callee saving sequences. Because this option
1878 pays back little on PPro-based chips and conflicts with the partial-register
1879 dependencies used by Athlon/P4-based chips, it is better to leave it off
1880 for generic32 for now. */
1881 m_PPRO,
1882
1883 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1884 m_CORE_ALL | m_GENERIC,
1885
1886 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1887 on 16-bit immediate moves into memory on Core2 and Corei7. */
1888 m_CORE_ALL | m_GENERIC,
1889
1890 /* X86_TUNE_USE_HIMODE_FIOP */
1891 m_386 | m_486 | m_K6_GEODE,
1892
1893 /* X86_TUNE_USE_SIMODE_FIOP */
1894 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC),
1895
1896 /* X86_TUNE_USE_MOV0 */
1897 m_K6,
1898
1899 /* X86_TUNE_USE_CLTD */
1900 ~(m_PENT | m_ATOM | m_SLM | m_K6),
1901
1902 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1903 m_PENT4,
1904
1905 /* X86_TUNE_SPLIT_LONG_MOVES */
1906 m_PPRO,
1907
1908 /* X86_TUNE_READ_MODIFY_WRITE */
1909 ~m_PENT,
1910
1911 /* X86_TUNE_READ_MODIFY */
1912 ~(m_PENT | m_PPRO),
1913
1914 /* X86_TUNE_PROMOTE_QIMODE */
1915 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1916
1917 /* X86_TUNE_FAST_PREFIX */
1918 ~(m_386 | m_486 | m_PENT),
1919
1920 /* X86_TUNE_SINGLE_STRINGOP */
1921 m_386 | m_P4_NOCONA,
1922
1923 /* X86_TUNE_QIMODE_MATH */
1924 ~0,
1925
1926 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1927 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1928 might be considered for Generic32 if our scheme for avoiding partial
1929 stalls were more effective. */
1930 ~m_PPRO,
1931
1932 /* X86_TUNE_PROMOTE_QI_REGS */
1933 0,
1934
1935 /* X86_TUNE_PROMOTE_HI_REGS */
1936 m_PPRO,
1937
1938 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1939 over esp addition. */
1940 m_386 | m_486 | m_PENT | m_PPRO,
1941
1942 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1943 over esp addition. */
1944 m_PENT,
1945
1946 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1947 over esp subtraction. */
1948 m_386 | m_486 | m_PENT | m_K6_GEODE,
1949
1950 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
1951 over esp subtraction. */
1952 m_PENT | m_K6_GEODE,
1953
1954 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1955 for DFmode copies */
1956 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1957
1958 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1959 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
1960
1961 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1962 conflict here between PPro/Pentium4-based chips that treat 128-bit
1963 SSE registers as single units and K8-based chips that divide SSE
1964 registers into two 64-bit halves. This knob promotes all store destinations
1965 to 128 bits to allow register renaming on 128-bit SSE units, but it usually
1966 results in one extra micro-op on 64-bit SSE units. Experimental results
1967 show that disabling this option on P4 brings over a 20% SPECfp regression,
1968 while enabling it on K8 brings roughly a 2.4% regression that can be partly
1969 masked by careful scheduling of moves. */
1970 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1971
1972 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1973 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM,
1974
1975 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1976 m_COREI7 | m_BDVER | m_SLM,
1977
1978 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1979 m_BDVER,
1980
1981 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1982 are resolved on SSE register parts instead of whole registers, so we may
1983 maintain just the lower part of scalar values in the proper format, leaving
1984 the upper part undefined. */
1985 m_ATHLON_K8,
1986
1987 /* X86_TUNE_SSE_TYPELESS_STORES */
1988 m_AMD_MULTIPLE,
1989
1990 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1991 m_PPRO | m_P4_NOCONA,
1992
1993 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1994 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
1995
1996 /* X86_TUNE_PROLOGUE_USING_MOVE */
1997 m_PPRO | m_ATHLON_K8,
1998
1999 /* X86_TUNE_EPILOGUE_USING_MOVE */
2000 m_PPRO | m_ATHLON_K8,
2001
2002 /* X86_TUNE_SHIFT1 */
2003 ~m_486,
2004
2005 /* X86_TUNE_USE_FFREEP */
2006 m_AMD_MULTIPLE,
2007
2008 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC */
2009 ~(m_AMD_MULTIPLE | m_GENERIC),
2010
2011 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC */
2012 ~m_ATHLON_K8,
2013
2014 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2015 ~(m_AMDFAM10 | m_BDVER),
2016
2017 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2018 than 4 branch instructions in a 16-byte window. */
2019 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
2020
2021 /* X86_TUNE_SCHEDULE */
2022 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2023
2024 /* X86_TUNE_USE_BT */
2025 m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
2026
2027 /* X86_TUNE_USE_INCDEC */
2028 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC),
2029
2030 /* X86_TUNE_PAD_RETURNS */
2031 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
2032
2033 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2034 m_ATOM,
2035
2036 /* X86_TUNE_EXT_80387_CONSTANTS */
2037 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2038
2039 /* X86_TUNE_AVOID_VECTOR_DECODE */
2040 m_CORE_ALL | m_K8 | m_GENERIC64,
2041
2042 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2043 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2044 ~(m_386 | m_486),
2045
2046 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2047 vector path on AMD machines. */
2048 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2049
2050 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2051 machines. */
2052 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2053
2054 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2055 than a MOV. */
2056 m_PENT,
2057
2058 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2059 but one byte longer. */
2060 m_PENT,
2061
2062 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2063 operand that cannot be represented using a modRM byte. The XOR
2064 replacement is long decoded, so this split helps here as well. */
2065 m_K6,
2066
2067 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2068 from FP to FP. */
2069 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
2070
2071 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2072 from integer to FP. */
2073 m_AMDFAM10,
2074
2075 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2076 with a subsequent conditional jump instruction into a single
2077 compare-and-branch uop. */
2078 m_BDVER,
2079
2080 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2081 will impact LEA instruction selection. */
2082 m_ATOM | m_SLM,
2083
2084 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2085 instructions. */
2086 ~m_ATOM,
2087
2088 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2089 at -O3. For the moment, the prefetching seems badly tuned for Intel
2090 chips. */
2091 m_K6_GEODE | m_AMD_MULTIPLE,
2092
2093 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2094 the auto-vectorizer. */
2095 m_BDVER | m_BTVER2,
2096
2097 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2098 during reassociation of integer computation. */
2099 m_ATOM,
2100
2101 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2102 during reassociation of fp computation. */
2103 m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2,
2104
2105 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2106 regs instead of memory. */
2107 m_CORE_ALL,
2108
2109 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2110 a conditional move. */
2111 m_ATOM
2112 };
2113
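/* Illustrative sketch (not part of the original source): the table above
   is indexed by X86_TUNE_* and each entry is a mask of m_* processor bits.
   ix86_tune_features is derived from it during option handling, roughly
   (a hypothetical condensation of code later in this file):

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   after which the TARGET_* tuning macros simply test ix86_tune_features.  */
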
2114 /* Feature tests against the various architecture variations. */
2115 unsigned char ix86_arch_features[X86_ARCH_LAST];
2116
2117 /* Feature tests against the various architecture variations, used to create
2118 ix86_arch_features based on the processor mask. */
2119 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2120 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2121 ~(m_386 | m_486 | m_PENT | m_K6),
2122
2123 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2124 ~m_386,
2125
2126 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2127 ~(m_386 | m_486),
2128
2129 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2130 ~m_386,
2131
2132 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2133 ~m_386,
2134 };
2135
2136 static const unsigned int x86_accumulate_outgoing_args
2137 = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2138
2139 static const unsigned int x86_arch_always_fancy_math_387
2140 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
2141
2142 static const unsigned int x86_avx256_split_unaligned_load
2143 = m_COREI7 | m_GENERIC;
2144
2145 static const unsigned int x86_avx256_split_unaligned_store
2146 = m_COREI7 | m_BDVER | m_GENERIC;
2147
2148 /* If the average insn count for a single function invocation is
2149 lower than this constant, emit fast (but longer) prologue and
2150 epilogue code. */
2151 #define FAST_PROLOGUE_INSN_COUNT 20
2152
2153 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2154 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2155 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2156 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2157
2158 /* Array of the smallest class containing reg number REGNO, indexed by
2159 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2160
2161 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2162 {
2163 /* ax, dx, cx, bx */
2164 AREG, DREG, CREG, BREG,
2165 /* si, di, bp, sp */
2166 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2167 /* FP registers */
2168 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2169 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2170 /* arg pointer */
2171 NON_Q_REGS,
2172 /* flags, fpsr, fpcr, frame */
2173 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2174 /* SSE registers */
2175 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2176 SSE_REGS, SSE_REGS,
2177 /* MMX registers */
2178 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2179 MMX_REGS, MMX_REGS,
2180 /* REX registers */
2181 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2182 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2183 /* SSE REX registers */
2184 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2185 SSE_REGS, SSE_REGS,
2186 };
2187
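/* Illustrative sketch (not part of the original source): i386.h is expected
   to map a hard register number to its class by indexing this table, along
   the lines of

     #define REGNO_REG_CLASS(REGNO)  (regclass_map[(REGNO)])

   so, for example, REGNO_REG_CLASS (AX_REG) is AREG and the x87 stack top
   register is FP_TOP_REG.  */
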
2188 /* The "default" register map used in 32bit mode. */
2189
2190 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2191 {
2192 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2193 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2194 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2195 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2196 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2198 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2199 };
2200
2201 /* The "default" register map used in 64bit mode. */
2202
2203 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2204 {
2205 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2206 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2207 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2208 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2209 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2210 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2211 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2212 };
2213
2214 /* Define the register numbers to be used in Dwarf debugging information.
2215 The SVR4 reference port C compiler uses the following register numbers
2216 in its Dwarf output code:
2217 0 for %eax (gcc regno = 0)
2218 1 for %ecx (gcc regno = 2)
2219 2 for %edx (gcc regno = 1)
2220 3 for %ebx (gcc regno = 3)
2221 4 for %esp (gcc regno = 7)
2222 5 for %ebp (gcc regno = 6)
2223 6 for %esi (gcc regno = 4)
2224 7 for %edi (gcc regno = 5)
2225 The following three DWARF register numbers are never generated by
2226 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2227 believes these numbers have these meanings.
2228 8 for %eip (no gcc equivalent)
2229 9 for %eflags (gcc regno = 17)
2230 10 for %trapno (no gcc equivalent)
2231 It is not at all clear how we should number the FP stack registers
2232 for the x86 architecture. If the version of SDB on x86/svr4 were
2233 a bit less brain dead with respect to floating-point then we would
2234 have a precedent to follow with respect to DWARF register numbers
2235 for x86 FP registers, but the SDB on x86/svr4 is so completely
2236 broken with respect to FP registers that it is hardly worth thinking
2237 of it as something to strive for compatibility with.
2238 The version of x86/svr4 SDB I have at the moment does (partially)
2239 seem to believe that DWARF register number 11 is associated with
2240 the x86 register %st(0), but that's about all. Higher DWARF
2241 register numbers don't seem to be associated with anything in
2242 particular, and even for DWARF regno 11, SDB only seems to under-
2243 stand that it should say that a variable lives in %st(0) (when
2244 asked via an `=' command) if we said it was in DWARF regno 11,
2245 but SDB still prints garbage when asked for the value of the
2246 variable in question (via a `/' command).
2247 (Also note that the labels SDB prints for various FP stack regs
2248 when doing an `x' command are all wrong.)
2249 Note that these problems generally don't affect the native SVR4
2250 C compiler because it doesn't allow the use of -O with -g and
2251 because when it is *not* optimizing, it allocates a memory
2252 location for each floating-point variable, and the memory
2253 location is what gets described in the DWARF AT_location
2254 attribute for the variable in question.
2255 Regardless of the severe mental illness of the x86/svr4 SDB, we
2256 do something sensible here and we use the following DWARF
2257 register numbers. Note that these are all stack-top-relative
2258 numbers.
2259 11 for %st(0) (gcc regno = 8)
2260 12 for %st(1) (gcc regno = 9)
2261 13 for %st(2) (gcc regno = 10)
2262 14 for %st(3) (gcc regno = 11)
2263 15 for %st(4) (gcc regno = 12)
2264 16 for %st(5) (gcc regno = 13)
2265 17 for %st(6) (gcc regno = 14)
2266 18 for %st(7) (gcc regno = 15)
2267 */
2268 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2269 {
2270 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2271 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2272 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2273 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2274 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2275 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2276 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2277 };
2278
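/* Illustrative sketch (not part of the original source): target headers are
   expected to pick one of the maps above when emitting debug info, along
   the lines of (hypothetical):

     #define DBX_REGISTER_NUMBER(N) \
       (TARGET_64BIT ? dbx64_register_map[(N)] : svr4_dbx_register_map[(N)])

   With the SVR4 map, GCC regno 7 (%esp) becomes DWARF register 4 and GCC
   regno 8 (%st(0)) becomes DWARF register 11, matching the numbering
   described in the comment above.  */
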
2279 /* Define parameter passing and return registers. */
2280
2281 static int const x86_64_int_parameter_registers[6] =
2282 {
2283 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2284 };
2285
2286 static int const x86_64_ms_abi_int_parameter_registers[4] =
2287 {
2288 CX_REG, DX_REG, R8_REG, R9_REG
2289 };
2290
2291 static int const x86_64_int_return_registers[4] =
2292 {
2293 AX_REG, DX_REG, DI_REG, SI_REG
2294 };
2295
2296 /* Additional registers that are clobbered by SYSV calls. */
2297
2298 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2299 {
2300 SI_REG, DI_REG,
2301 XMM6_REG, XMM7_REG,
2302 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2303 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2304 };
2305
2306 /* Define the structure for the machine field in struct function. */
2307
2308 struct GTY(()) stack_local_entry {
2309 unsigned short mode;
2310 unsigned short n;
2311 rtx rtl;
2312 struct stack_local_entry *next;
2313 };
2314
2315 /* Structure describing stack frame layout.
2316 Stack grows downward:
2317
2318 [arguments]
2319 <- ARG_POINTER
2320 saved pc
2321
2322 saved static chain if ix86_static_chain_on_stack
2323
2324 saved frame pointer if frame_pointer_needed
2325 <- HARD_FRAME_POINTER
2326 [saved regs]
2327 <- regs_save_offset
2328 [padding0]
2329
2330 [saved SSE regs]
2331 <- sse_regs_save_offset
2332 [padding1] |
2333 | <- FRAME_POINTER
2334 [va_arg registers] |
2335 |
2336 [frame] |
2337 |
2338 [padding2] | = to_allocate
2339 <- STACK_POINTER
2340 */
2341 struct ix86_frame
2342 {
2343 int nsseregs;
2344 int nregs;
2345 int va_arg_size;
2346 int red_zone_size;
2347 int outgoing_arguments_size;
2348
2349 /* The offsets relative to ARG_POINTER. */
2350 HOST_WIDE_INT frame_pointer_offset;
2351 HOST_WIDE_INT hard_frame_pointer_offset;
2352 HOST_WIDE_INT stack_pointer_offset;
2353 HOST_WIDE_INT hfp_save_offset;
2354 HOST_WIDE_INT reg_save_offset;
2355 HOST_WIDE_INT sse_reg_save_offset;
2356
2357 /* When save_regs_using_mov is set, emit prologue using
2358 move instead of push instructions. */
2359 bool save_regs_using_mov;
2360 };
2361
2362 /* Which CPU we are scheduling for. */
2363 enum attr_cpu ix86_schedule;
2364
2365 /* Which CPU we are optimizing for. */
2366 enum processor_type ix86_tune;
2367
2368 /* Which instruction set architecture to use. */
2369 enum processor_type ix86_arch;
2370
2371 /* True if processor has SSE prefetch instruction. */
2372 unsigned char x86_prefetch_sse;
2373
2374 /* -mstackrealign option */
2375 static const char ix86_force_align_arg_pointer_string[]
2376 = "force_align_arg_pointer";
2377
2378 static rtx (*ix86_gen_leave) (void);
2379 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2380 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2381 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2382 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2383 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2384 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2385 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2386 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2387 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2388 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2389 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2390
2391 /* Preferred alignment for stack boundary in bits. */
2392 unsigned int ix86_preferred_stack_boundary;
2393
2394 /* Alignment for incoming stack boundary in bits specified at
2395 command line. */
2396 static unsigned int ix86_user_incoming_stack_boundary;
2397
2398 /* Default alignment for incoming stack boundary in bits. */
2399 static unsigned int ix86_default_incoming_stack_boundary;
2400
2401 /* Alignment for incoming stack boundary in bits. */
2402 unsigned int ix86_incoming_stack_boundary;
2403
2404 /* Calling-ABI-specific va_list type nodes. */
2405 static GTY(()) tree sysv_va_list_type_node;
2406 static GTY(()) tree ms_va_list_type_node;
2407
2408 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2409 char internal_label_prefix[16];
2410 int internal_label_prefix_len;
2411
2412 /* Fence to use after loop using movnt. */
2413 tree x86_mfence;
2414
2415 /* Register class used for passing a given 64-bit part of the argument.
2416 These represent classes as documented by the psABI, with the exception
2417 of the SSESF and SSEDF classes, which are basically the SSE class; gcc will
2418 just use an SFmode or DFmode move instead of DImode to avoid reformatting
2419 penalties.
2420
2421 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2422 whenever possible (the upper half does contain padding). */
2422 enum x86_64_reg_class
2423 {
2424 X86_64_NO_CLASS,
2425 X86_64_INTEGER_CLASS,
2426 X86_64_INTEGERSI_CLASS,
2427 X86_64_SSE_CLASS,
2428 X86_64_SSESF_CLASS,
2429 X86_64_SSEDF_CLASS,
2430 X86_64_SSEUP_CLASS,
2431 X86_64_X87_CLASS,
2432 X86_64_X87UP_CLASS,
2433 X86_64_COMPLEX_X87_CLASS,
2434 X86_64_MEMORY_CLASS
2435 };
2436
2437 #define MAX_CLASSES 4
2438
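/* Illustrative example (not part of the original source): under the x86-64
   psABI each argument is classified in 8-byte chunks using the classes
   above.  For instance, assuming the usual classification rules,

     struct s { double d; long l; };   /* 16 bytes on x86-64 */

   would classify as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }: the first
   eightbyte travels in an SSE register (moved in DFmode, per the comment
   above) and the second in a general-purpose register.  At most MAX_CLASSES
   eightbytes are classified; anything larger becomes X86_64_MEMORY_CLASS
   and is passed on the stack.  */
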
2439 /* Table of constants used by fldpi, fldln2, etc.... */
2440 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2441 static bool ext_80387_constants_init = 0;
2442
2443 \f
2444 static struct machine_function * ix86_init_machine_status (void);
2445 static rtx ix86_function_value (const_tree, const_tree, bool);
2446 static bool ix86_function_value_regno_p (const unsigned int);
2447 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2448 const_tree);
2449 static rtx ix86_static_chain (const_tree, bool);
2450 static int ix86_function_regparm (const_tree, const_tree);
2451 static void ix86_compute_frame_layout (struct ix86_frame *);
2452 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2453 rtx, rtx, int);
2454 static void ix86_add_new_builtins (HOST_WIDE_INT);
2455 static tree ix86_canonical_va_list_type (tree);
2456 static void predict_jump (int);
2457 static unsigned int split_stack_prologue_scratch_regno (void);
2458 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2459
2460 enum ix86_function_specific_strings
2461 {
2462 IX86_FUNCTION_SPECIFIC_ARCH,
2463 IX86_FUNCTION_SPECIFIC_TUNE,
2464 IX86_FUNCTION_SPECIFIC_MAX
2465 };
2466
2467 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2468 const char *, enum fpmath_unit, bool);
2469 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2470 static void ix86_function_specific_save (struct cl_target_option *);
2471 static void ix86_function_specific_restore (struct cl_target_option *);
2472 static void ix86_function_specific_print (FILE *, int,
2473 struct cl_target_option *);
2474 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2475 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2476 struct gcc_options *);
2477 static bool ix86_can_inline_p (tree, tree);
2478 static void ix86_set_current_function (tree);
2479 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2480
2481 static enum calling_abi ix86_function_abi (const_tree);
2482
2483 \f
2484 #ifndef SUBTARGET32_DEFAULT_CPU
2485 #define SUBTARGET32_DEFAULT_CPU "i386"
2486 #endif
2487
2488 /* Whether -mtune= or -march= were specified */
2489 static int ix86_tune_defaulted;
2490 static int ix86_arch_specified;
2491
2492 /* Vectorization library interface and handlers. */
2493 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2494
2495 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2496 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2497
2498 /* Processor target table, indexed by processor number */
2499 struct ptt
2500 {
2501 const struct processor_costs *cost; /* Processor costs */
2502 const int align_loop; /* Default alignments. */
2503 const int align_loop_max_skip;
2504 const int align_jump;
2505 const int align_jump_max_skip;
2506 const int align_func;
2507 };
2508
2509 static const struct ptt processor_target_table[PROCESSOR_max] =
2510 {
2511 {&i386_cost, 4, 3, 4, 3, 4},
2512 {&i486_cost, 16, 15, 16, 15, 16},
2513 {&pentium_cost, 16, 7, 16, 7, 16},
2514 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2515 {&geode_cost, 0, 0, 0, 0, 0},
2516 {&k6_cost, 32, 7, 32, 7, 32},
2517 {&athlon_cost, 16, 7, 16, 7, 16},
2518 {&pentium4_cost, 0, 0, 0, 0, 0},
2519 {&k8_cost, 16, 7, 16, 7, 16},
2520 {&nocona_cost, 0, 0, 0, 0, 0},
2521 /* Core 2 */
2522 {&core_cost, 16, 10, 16, 10, 16},
2523 /* Core i7 */
2524 {&core_cost, 16, 10, 16, 10, 16},
2525 /* Core avx2 */
2526 {&core_cost, 16, 10, 16, 10, 16},
2527 {&generic32_cost, 16, 7, 16, 7, 16},
2528 {&generic64_cost, 16, 10, 16, 10, 16},
2529 {&amdfam10_cost, 32, 24, 32, 7, 32},
2530 {&bdver1_cost, 16, 10, 16, 7, 11},
2531 {&bdver2_cost, 16, 10, 16, 7, 11},
2532 {&bdver3_cost, 16, 10, 16, 7, 11},
2533 {&btver1_cost, 16, 10, 16, 7, 11},
2534 {&btver2_cost, 16, 10, 16, 7, 11},
2535 {&atom_cost, 16, 15, 16, 7, 16},
2536 {&slm_cost, 16, 15, 16, 7, 16}
2537 };
2538
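/* Illustrative sketch (not part of the original source): besides the cost
   table, each processor_target_table entry supplies default alignments.
   Option handling later in this file is expected to apply them roughly as

     if (align_loops == 0)
       {
         align_loops = processor_target_table[ix86_tune].align_loop;
         align_loops_max_skip
           = processor_target_table[ix86_tune].align_loop_max_skip;
       }

   (a hypothetical condensation; jump and function alignment are handled the
   same way), so an explicit -falign-loops= always wins.  */
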
2539 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2540 {
2541 "generic",
2542 "i386",
2543 "i486",
2544 "pentium",
2545 "pentium-mmx",
2546 "pentiumpro",
2547 "pentium2",
2548 "pentium3",
2549 "pentium4",
2550 "pentium-m",
2551 "prescott",
2552 "nocona",
2553 "core2",
2554 "corei7",
2555 "core-avx2",
2556 "atom",
2557 "slm",
2558 "geode",
2559 "k6",
2560 "k6-2",
2561 "k6-3",
2562 "athlon",
2563 "athlon-4",
2564 "k8",
2565 "amdfam10",
2566 "bdver1",
2567 "bdver2",
2568 "bdver3",
2569 "btver1",
2570 "btver2"
2571 };
2572 \f
2573 static bool
2574 gate_insert_vzeroupper (void)
2575 {
2576 return TARGET_VZEROUPPER;
2577 }
2578
2579 static unsigned int
2580 rest_of_handle_insert_vzeroupper (void)
2581 {
2582 int i;
2583
2584 /* vzeroupper instructions are inserted immediately after reload to
2585 account for possible spills from 256-bit registers. The pass
2586 reuses the mode switching infrastructure by re-running the mode insertion
2587 pass, so disable entities that have already been processed. */
2588 for (i = 0; i < MAX_386_ENTITIES; i++)
2589 ix86_optimize_mode_switching[i] = 0;
2590
2591 ix86_optimize_mode_switching[AVX_U128] = 1;
2592
2593 /* Call optimize_mode_switching. */
2594 pass_mode_switching.pass.execute ();
2595 return 0;
2596 }
2597
2598 struct rtl_opt_pass pass_insert_vzeroupper =
2599 {
2600 {
2601 RTL_PASS,
2602 "vzeroupper", /* name */
2603 OPTGROUP_NONE, /* optinfo_flags */
2604 gate_insert_vzeroupper, /* gate */
2605 rest_of_handle_insert_vzeroupper, /* execute */
2606 NULL, /* sub */
2607 NULL, /* next */
2608 0, /* static_pass_number */
2609 TV_NONE, /* tv_id */
2610 0, /* properties_required */
2611 0, /* properties_provided */
2612 0, /* properties_destroyed */
2613 0, /* todo_flags_start */
2614 TODO_df_finish | TODO_verify_rtl_sharing |
2615 0, /* todo_flags_finish */
2616 }
2617 };
2618
2619 /* Return true if a red-zone is in use. */
2620
2621 static inline bool
2622 ix86_using_red_zone (void)
2623 {
2624 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2625 }
2626 \f
2627 /* Return a string that documents the current -m options. The caller is
2628 responsible for freeing the string. */
2629
2630 static char *
2631 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2632 const char *tune, enum fpmath_unit fpmath,
2633 bool add_nl_p)
2634 {
2635 struct ix86_target_opts
2636 {
2637 const char *option; /* option string */
2638 HOST_WIDE_INT mask; /* isa mask options */
2639 };
2640
2641 /* This table is ordered so that options like -msse4.2 that imply
2642 preceding options will be matched first. */
2643 static struct ix86_target_opts isa_opts[] =
2644 {
2645 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2646 { "-mfma", OPTION_MASK_ISA_FMA },
2647 { "-mxop", OPTION_MASK_ISA_XOP },
2648 { "-mlwp", OPTION_MASK_ISA_LWP },
2649 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2650 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2651 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2652 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2653 { "-msse3", OPTION_MASK_ISA_SSE3 },
2654 { "-msse2", OPTION_MASK_ISA_SSE2 },
2655 { "-msse", OPTION_MASK_ISA_SSE },
2656 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2657 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2658 { "-mmmx", OPTION_MASK_ISA_MMX },
2659 { "-mabm", OPTION_MASK_ISA_ABM },
2660 { "-mbmi", OPTION_MASK_ISA_BMI },
2661 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2662 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2663 { "-mhle", OPTION_MASK_ISA_HLE },
2664 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2665 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2666 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2667 { "-madx", OPTION_MASK_ISA_ADX },
2668 { "-mtbm", OPTION_MASK_ISA_TBM },
2669 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2670 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2671 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2672 { "-maes", OPTION_MASK_ISA_AES },
2673 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2674 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2675 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2676 { "-mf16c", OPTION_MASK_ISA_F16C },
2677 { "-mrtm", OPTION_MASK_ISA_RTM },
2678 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2679 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2680 };
2681
2682 /* Flag options. */
2683 static struct ix86_target_opts flag_opts[] =
2684 {
2685 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2686 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2687 { "-m80387", MASK_80387 },
2688 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2689 { "-malign-double", MASK_ALIGN_DOUBLE },
2690 { "-mcld", MASK_CLD },
2691 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2692 { "-mieee-fp", MASK_IEEE_FP },
2693 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2694 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2695 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2696 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2697 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2698 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2699 { "-mno-red-zone", MASK_NO_RED_ZONE },
2700 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2701 { "-mrecip", MASK_RECIP },
2702 { "-mrtd", MASK_RTD },
2703 { "-msseregparm", MASK_SSEREGPARM },
2704 { "-mstack-arg-probe", MASK_STACK_PROBE },
2705 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2706 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2707 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2708 { "-mvzeroupper", MASK_VZEROUPPER },
2709 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2710 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2711 { "-mprefer-avx128", MASK_PREFER_AVX128},
2712 };
2713
2714 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2715
2716 char isa_other[40];
2717 char target_other[40];
2718 unsigned num = 0;
2719 unsigned i, j;
2720 char *ret;
2721 char *ptr;
2722 size_t len;
2723 size_t line_len;
2724 size_t sep_len;
2725 const char *abi;
2726
2727 memset (opts, '\0', sizeof (opts));
2728
2729 /* Add -march= option. */
2730 if (arch)
2731 {
2732 opts[num][0] = "-march=";
2733 opts[num++][1] = arch;
2734 }
2735
2736 /* Add -mtune= option. */
2737 if (tune)
2738 {
2739 opts[num][0] = "-mtune=";
2740 opts[num++][1] = tune;
2741 }
2742
2743 /* Add -m32/-m64/-mx32. */
2744 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2745 {
2746 if ((isa & OPTION_MASK_ABI_64) != 0)
2747 abi = "-m64";
2748 else
2749 abi = "-mx32";
2750 isa &= ~ (OPTION_MASK_ISA_64BIT
2751 | OPTION_MASK_ABI_64
2752 | OPTION_MASK_ABI_X32);
2753 }
2754 else
2755 abi = "-m32";
2756 opts[num++][0] = abi;
2757
2758 /* Pick out the options in isa options. */
2759 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2760 {
2761 if ((isa & isa_opts[i].mask) != 0)
2762 {
2763 opts[num++][0] = isa_opts[i].option;
2764 isa &= ~ isa_opts[i].mask;
2765 }
2766 }
2767
2768 if (isa && add_nl_p)
2769 {
2770 opts[num++][0] = isa_other;
2771 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2772 isa);
2773 }
2774
2775 /* Add flag options. */
2776 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2777 {
2778 if ((flags & flag_opts[i].mask) != 0)
2779 {
2780 opts[num++][0] = flag_opts[i].option;
2781 flags &= ~ flag_opts[i].mask;
2782 }
2783 }
2784
2785 if (flags && add_nl_p)
2786 {
2787 opts[num++][0] = target_other;
2788 sprintf (target_other, "(other flags: %#x)", flags);
2789 }
2790
2791 /* Add -fpmath= option. */
2792 if (fpmath)
2793 {
2794 opts[num][0] = "-mfpmath=";
2795 switch ((int) fpmath)
2796 {
2797 case FPMATH_387:
2798 opts[num++][1] = "387";
2799 break;
2800
2801 case FPMATH_SSE:
2802 opts[num++][1] = "sse";
2803 break;
2804
2805 case FPMATH_387 | FPMATH_SSE:
2806 opts[num++][1] = "sse+387";
2807 break;
2808
2809 default:
2810 gcc_unreachable ();
2811 }
2812 }
2813
2814 /* Any options? */
2815 if (num == 0)
2816 return NULL;
2817
2818 gcc_assert (num < ARRAY_SIZE (opts));
2819
2820 /* Size the string. */
2821 len = 0;
2822 sep_len = (add_nl_p) ? 3 : 1;
2823 for (i = 0; i < num; i++)
2824 {
2825 len += sep_len;
2826 for (j = 0; j < 2; j++)
2827 if (opts[i][j])
2828 len += strlen (opts[i][j]);
2829 }
2830
2831 /* Build the string. */
2832 ret = ptr = (char *) xmalloc (len);
2833 line_len = 0;
2834
2835 for (i = 0; i < num; i++)
2836 {
2837 size_t len2[2];
2838
2839 for (j = 0; j < 2; j++)
2840 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2841
2842 if (i != 0)
2843 {
2844 *ptr++ = ' ';
2845 line_len++;
2846
2847 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2848 {
2849 *ptr++ = '\\';
2850 *ptr++ = '\n';
2851 line_len = 0;
2852 }
2853 }
2854
2855 for (j = 0; j < 2; j++)
2856 if (opts[i][j])
2857 {
2858 memcpy (ptr, opts[i][j], len2[j]);
2859 ptr += len2[j];
2860 line_len += len2[j];
2861 }
2862 }
2863
2864 *ptr = '\0';
2865 gcc_assert (ret + len >= ptr);
2866
2867 return ret;
2868 }
2869
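/* Illustrative example (not part of the original source): a call such as

     ix86_target_string (OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64
                         | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2,
                         MASK_80387, "corei7", "generic", FPMATH_SSE, false)

   might return a string along the lines of

     "-march=corei7 -mtune=generic -m64 -msse2 -msse -m80387 -mfpmath=sse"

   with options emitted in the order built above (arch, tune, ABI, ISA
   flags, target flags, fpmath).  */
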
2870 /* Return true if profiling code should be emitted before the
2871 prologue, and false otherwise.
2872 Note: for x86 this is the case when -mfentry is in use. */
2873 static bool
2874 ix86_profile_before_prologue (void)
2875 {
2876 return flag_fentry != 0;
2877 }
2878
2879 /* Function that is callable from the debugger to print the current
2880 options. */
2881 void
2882 ix86_debug_options (void)
2883 {
2884 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2885 ix86_arch_string, ix86_tune_string,
2886 ix86_fpmath, true);
2887
2888 if (opts)
2889 {
2890 fprintf (stderr, "%s\n\n", opts);
2891 free (opts);
2892 }
2893 else
2894 fputs ("<no options>\n\n", stderr);
2895
2896 return;
2897 }
2898 \f
2899 /* Override various settings based on options. If MAIN_ARGS_P, the
2900 options are from the command line, otherwise they are from
2901 attributes. */
2902
2903 static void
2904 ix86_option_override_internal (bool main_args_p)
2905 {
2906 int i;
2907 unsigned int ix86_arch_mask, ix86_tune_mask;
2908 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2909 const char *prefix;
2910 const char *suffix;
2911 const char *sw;
2912
2913 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2914 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2915 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2916 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2917 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2918 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2919 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2920 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2921 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2922 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2923 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2924 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2925 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2926 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2927 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2928 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2929 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2930 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2931 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2932 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2933 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2934 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2935 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2936 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2937 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2938 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2939 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2940 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2941 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2942 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2943 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2944 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2945 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2946 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2947 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2948 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2949 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2950 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2951 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2952 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2953
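/* Illustrative sketch (not part of the original source): each PTA_* bit in
   processor_alias_table below is translated into the corresponding ISA
   option unless the user set that option explicitly, roughly (a hypothetical
   condensation of the -march= handling later in this function):

     if (processor_alias_table[i].flags & PTA_SSE2
         && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
       ix86_isa_flags |= OPTION_MASK_ISA_SSE2;

   repeated for each PTA_*/OPTION_MASK_ISA_* pair.  */
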
2954 /* If this reaches 64, we need to widen the struct pta flags below. */
2955
2956 static struct pta
2957 {
2958 const char *const name; /* processor name or nickname. */
2959 const enum processor_type processor;
2960 const enum attr_cpu schedule;
2961 const unsigned HOST_WIDE_INT flags;
2962 }
2963 const processor_alias_table[] =
2964 {
2965 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2966 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2967 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2968 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2969 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2970 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2971 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2972 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2973 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2974 PTA_MMX | PTA_SSE | PTA_FXSR},
2975 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2976 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2977 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2978 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2979 PTA_MMX | PTA_SSE | PTA_FXSR},
2980 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2981 PTA_MMX | PTA_SSE | PTA_FXSR},
2982 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2983 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2984 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2985 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2986 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2987 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2988 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2989 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2990 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2991 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2992 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2993 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2994 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2995 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2996 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2997 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
2998 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
2999 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
3000 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3001 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3002 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3003 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3004 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
3005 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3006 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3007 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3008 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3009 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3010 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3011 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3012 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3013 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3014 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3015 | PTA_XSAVEOPT},
3016 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3017 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3018 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3019 {"slm", PROCESSOR_SLM, CPU_SLM,
3020 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3021 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3022 | PTA_FXSR},
3023 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3024 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3025 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3026 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3027 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3028 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3029 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3030 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3031 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3032 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3033 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3034 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3035 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3036 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3037 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3038 {"x86-64", PROCESSOR_K8, CPU_K8,
3039 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3040 {"k8", PROCESSOR_K8, CPU_K8,
3041 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3042 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3043 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3044 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3045 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3046 {"opteron", PROCESSOR_K8, CPU_K8,
3047 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3048 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3049 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3050 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3051 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3052 {"athlon64", PROCESSOR_K8, CPU_K8,
3053 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3054 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3055 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3056 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3057 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3058 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3059 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3060 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3061 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3062 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3063 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3064 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3065 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3066 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3067 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3068 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3069 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3070 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3071 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3072 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3073 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3074 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3075 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3076 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3077 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3078 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3079 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3080 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3081 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3082 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3083 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3084 | PTA_XSAVEOPT | PTA_FSGSBASE},
3085 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3086 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3087 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3088 | PTA_FXSR | PTA_XSAVE},
3089 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3090 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3091 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3092 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3093 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3094 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3095
3096 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3097 PTA_HLE /* flags are only used for -march switch. */ },
3098 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3099 PTA_64BIT
3100 | PTA_HLE /* flags are only used for -march switch. */ },
3101 };
3102
3103 /* -mrecip options. */
3104 static struct
3105 {
3106 const char *string; /* option name */
3107 unsigned int mask; /* mask bits to set */
3108 }
3109 const recip_options[] =
3110 {
3111 { "all", RECIP_MASK_ALL },
3112 { "none", RECIP_MASK_NONE },
3113 { "div", RECIP_MASK_DIV },
3114 { "sqrt", RECIP_MASK_SQRT },
3115 { "vec-div", RECIP_MASK_VEC_DIV },
3116 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3117 };
3118
3119 int const pta_size = ARRAY_SIZE (processor_alias_table);
3120
3121 /* Set up prefix/suffix so the error messages refer to either the command
3122 line argument, or the attribute(target). */
3123 if (main_args_p)
3124 {
3125 prefix = "-m";
3126 suffix = "";
3127 sw = "switch";
3128 }
3129 else
3130 {
3131 prefix = "option(\"";
3132 suffix = "\")";
3133 sw = "attribute";
3134 }
3135
3136 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3137 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3138 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3139 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3140 #ifdef TARGET_BI_ARCH
3141 else
3142 {
3143 #if TARGET_BI_ARCH == 1
3144 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3145 is on and OPTION_MASK_ABI_X32 is off. We turn off
3146 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3147 -mx32. */
3148 if (TARGET_X32)
3149 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3150 #else
3151 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3152 on and OPTION_MASK_ABI_64 is off. We turn off
3153 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3154 -m64. */
3155 if (TARGET_LP64)
3156 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3157 #endif
3158 }
3159 #endif
3160
3161 if (TARGET_X32)
3162 {
3163 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3164 OPTION_MASK_ABI_64 for TARGET_X32. */
3165 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3166 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3167 }
3168 else if (TARGET_LP64)
3169 {
3170 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3171 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3172 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3173 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3174 }
3175
3176 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3177 SUBTARGET_OVERRIDE_OPTIONS;
3178 #endif
3179
3180 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3181 SUBSUBTARGET_OVERRIDE_OPTIONS;
3182 #endif
3183
3184 /* -fPIC is the default for 64-bit Mach-O (Darwin) targets. */
3185 if (TARGET_MACHO && TARGET_64BIT)
3186 flag_pic = 2;
3187
3188 /* Need to check -mtune=generic first. */
3189 if (ix86_tune_string)
3190 {
3191 if (!strcmp (ix86_tune_string, "generic")
3192 || !strcmp (ix86_tune_string, "i686")
3193 /* As special support for cross compilers we read -mtune=native
3194 as -mtune=generic. With native compilers we won't see the
3195 -mtune=native, as it was changed by the driver. */
3196 || !strcmp (ix86_tune_string, "native"))
3197 {
3198 if (TARGET_64BIT)
3199 ix86_tune_string = "generic64";
3200 else
3201 ix86_tune_string = "generic32";
3202 }
3203 /* If this call is for setting the option attribute, allow the
3204 generic32/generic64 that was previously set. */
3205 else if (!main_args_p
3206 && (!strcmp (ix86_tune_string, "generic32")
3207 || !strcmp (ix86_tune_string, "generic64")))
3208 ;
3209 else if (!strncmp (ix86_tune_string, "generic", 7))
3210 error ("bad value (%s) for %stune=%s %s",
3211 ix86_tune_string, prefix, suffix, sw);
3212 else if (!strcmp (ix86_tune_string, "x86-64"))
3213 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3214 "%stune=k8%s or %stune=generic%s instead as appropriate",
3215 prefix, suffix, prefix, suffix, prefix, suffix);
3216 }
3217 else
3218 {
3219 if (ix86_arch_string)
3220 ix86_tune_string = ix86_arch_string;
3221 if (!ix86_tune_string)
3222 {
3223 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3224 ix86_tune_defaulted = 1;
3225 }
3226
3227 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3228 need to use a sensible tune option. */
3229 if (!strcmp (ix86_tune_string, "generic")
3230 || !strcmp (ix86_tune_string, "x86-64")
3231 || !strcmp (ix86_tune_string, "i686"))
3232 {
3233 if (TARGET_64BIT)
3234 ix86_tune_string = "generic64";
3235 else
3236 ix86_tune_string = "generic32";
3237 }
3238 }
3239
3240 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3241 {
3242 /* rep; movq isn't available in 32-bit code. */
3243 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3244 ix86_stringop_alg = no_stringop;
3245 }
3246
3247 if (!ix86_arch_string)
3248 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3249 else
3250 ix86_arch_specified = 1;
3251
3252 if (global_options_set.x_ix86_pmode)
3253 {
3254 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3255 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3256 error ("address mode %qs not supported in the %s bit mode",
3257 TARGET_64BIT ? "short" : "long",
3258 TARGET_64BIT ? "64" : "32");
3259 }
3260 else
3261 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3262
3263 if (!global_options_set.x_ix86_abi)
3264 ix86_abi = DEFAULT_ABI;
3265
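/* Validate an explicit -mcmodel= choice against the selected ABI; PIC
   variants are substituted automatically when -fpic is in effect.  */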
3266 if (global_options_set.x_ix86_cmodel)
3267 {
3268 switch (ix86_cmodel)
3269 {
3270 case CM_SMALL:
3271 case CM_SMALL_PIC:
3272 if (flag_pic)
3273 ix86_cmodel = CM_SMALL_PIC;
3274 if (!TARGET_64BIT)
3275 error ("code model %qs not supported in the %s bit mode",
3276 "small", "32");
3277 break;
3278
3279 case CM_MEDIUM:
3280 case CM_MEDIUM_PIC:
3281 if (flag_pic)
3282 ix86_cmodel = CM_MEDIUM_PIC;
3283 if (!TARGET_64BIT)
3284 error ("code model %qs not supported in the %s bit mode",
3285 "medium", "32");
3286 else if (TARGET_X32)
3287 error ("code model %qs not supported in x32 mode",
3288 "medium");
3289 break;
3290
3291 case CM_LARGE:
3292 case CM_LARGE_PIC:
3293 if (flag_pic)
3294 ix86_cmodel = CM_LARGE_PIC;
3295 if (!TARGET_64BIT)
3296 error ("code model %qs not supported in the %s bit mode",
3297 "large", "32");
3298 else if (TARGET_X32)
3299 error ("code model %qs not supported in x32 mode",
3300 "large");
3301 break;
3302
3303 case CM_32:
3304 if (flag_pic)
3305 error ("code model %s does not support PIC mode", "32");
3306 if (TARGET_64BIT)
3307 error ("code model %qs not supported in the %s bit mode",
3308 "32", "64");
3309 break;
3310
3311 case CM_KERNEL:
3312 if (flag_pic)
3313 {
3314 error ("code model %s does not support PIC mode", "kernel");
3315 ix86_cmodel = CM_32;
3316 }
3317 if (!TARGET_64BIT)
3318 error ("code model %qs not supported in the %s bit mode",
3319 "kernel", "32");
3320 break;
3321
3322 default:
3323 gcc_unreachable ();
3324 }
3325 }
3326 else
3327 {
3328 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3329 use of rip-relative addressing. This eliminates fixups that
3330 would otherwise be needed if this object is to be placed in a
3331 DLL, and is essentially just as efficient as direct addressing. */
3332 if (TARGET_64BIT && (TARGET_RDOS || TARGET_PECOFF))
3333 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3334 else if (TARGET_64BIT)
3335 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3336 else
3337 ix86_cmodel = CM_32;
3338 }
3339 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3340 {
3341 error ("-masm=intel not supported in this configuration");
3342 ix86_asm_dialect = ASM_ATT;
3343 }
3344 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3345 sorry ("%i-bit mode not compiled in",
3346 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3347
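/* Look up -march= in the alias table; turn on every ISA flag the chosen
   CPU implies unless the user set that flag explicitly.  */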
3348 for (i = 0; i < pta_size; i++)
3349 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3350 {
3351 ix86_schedule = processor_alias_table[i].schedule;
3352 ix86_arch = processor_alias_table[i].processor;
3353 /* Default cpu tuning to the architecture. */
3354 ix86_tune = ix86_arch;
3355
3356 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3357 error ("CPU you selected does not support x86-64 "
3358 "instruction set");
3359
3360 if (processor_alias_table[i].flags & PTA_MMX
3361 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3362 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3363 if (processor_alias_table[i].flags & PTA_3DNOW
3364 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3365 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3366 if (processor_alias_table[i].flags & PTA_3DNOW_A
3367 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3368 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3369 if (processor_alias_table[i].flags & PTA_SSE
3370 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3371 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3372 if (processor_alias_table[i].flags & PTA_SSE2
3373 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3374 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3375 if (processor_alias_table[i].flags & PTA_SSE3
3376 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3377 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3378 if (processor_alias_table[i].flags & PTA_SSSE3
3379 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3380 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3381 if (processor_alias_table[i].flags & PTA_SSE4_1
3382 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3383 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3384 if (processor_alias_table[i].flags & PTA_SSE4_2
3385 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3386 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3387 if (processor_alias_table[i].flags & PTA_AVX
3388 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3389 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3390 if (processor_alias_table[i].flags & PTA_AVX2
3391 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3392 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3393 if (processor_alias_table[i].flags & PTA_FMA
3394 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3395 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3396 if (processor_alias_table[i].flags & PTA_SSE4A
3397 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3398 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3399 if (processor_alias_table[i].flags & PTA_FMA4
3400 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3401 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3402 if (processor_alias_table[i].flags & PTA_XOP
3403 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3404 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3405 if (processor_alias_table[i].flags & PTA_LWP
3406 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3407 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3408 if (processor_alias_table[i].flags & PTA_ABM
3409 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3410 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3411 if (processor_alias_table[i].flags & PTA_BMI
3412 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3413 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3414 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3415 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3416 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3417 if (processor_alias_table[i].flags & PTA_TBM
3418 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3419 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3420 if (processor_alias_table[i].flags & PTA_BMI2
3421 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3422 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3423 if (processor_alias_table[i].flags & PTA_CX16
3424 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3425 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3426 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3427 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3428 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3429 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3430 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3431 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3432 if (processor_alias_table[i].flags & PTA_MOVBE
3433 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3434 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3435 if (processor_alias_table[i].flags & PTA_AES
3436 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3437 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3438 if (processor_alias_table[i].flags & PTA_PCLMUL
3439 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3440 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3441 if (processor_alias_table[i].flags & PTA_FSGSBASE
3442 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3443 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3444 if (processor_alias_table[i].flags & PTA_RDRND
3445 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3446 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3447 if (processor_alias_table[i].flags & PTA_F16C
3448 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3449 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3450 if (processor_alias_table[i].flags & PTA_RTM
3451 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3452 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3453 if (processor_alias_table[i].flags & PTA_HLE
3454 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3455 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3456 if (processor_alias_table[i].flags & PTA_PRFCHW
3457 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3458 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3459 if (processor_alias_table[i].flags & PTA_RDSEED
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3461 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3462 if (processor_alias_table[i].flags & PTA_ADX
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3464 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3465 if (processor_alias_table[i].flags & PTA_FXSR
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3467 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3468 if (processor_alias_table[i].flags & PTA_XSAVE
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3470 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3471 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3473 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3474 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3475 x86_prefetch_sse = true;
3476
3477 break;
3478 }
3479
3480 if (!strcmp (ix86_arch_string, "generic"))
3481 error ("generic CPU can be used only for %stune=%s %s",
3482 prefix, suffix, sw);
3483 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3484 error ("bad value (%s) for %sarch=%s %s",
3485 ix86_arch_string, prefix, suffix, sw);
3486
3487 ix86_arch_mask = 1u << ix86_arch;
3488 for (i = 0; i < X86_ARCH_LAST; ++i)
3489 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3490
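/* Look up -mtune= in the same table to select the scheduling model,
   falling back to a sensible default for the current ABI.  */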
3491 for (i = 0; i < pta_size; i++)
3492 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3493 {
3494 ix86_schedule = processor_alias_table[i].schedule;
3495 ix86_tune = processor_alias_table[i].processor;
3496 if (TARGET_64BIT)
3497 {
3498 if (!(processor_alias_table[i].flags & PTA_64BIT))
3499 {
3500 if (ix86_tune_defaulted)
3501 {
3502 ix86_tune_string = "x86-64";
3503 for (i = 0; i < pta_size; i++)
3504 if (! strcmp (ix86_tune_string,
3505 processor_alias_table[i].name))
3506 break;
3507 ix86_schedule = processor_alias_table[i].schedule;
3508 ix86_tune = processor_alias_table[i].processor;
3509 }
3510 else
3511 error ("CPU you selected does not support x86-64 "
3512 "instruction set");
3513 }
3514 }
3515 else
3516 {
3517 /* Adjust tuning when compiling for 32-bit ABI. */
3518 switch (ix86_tune)
3519 {
3520 case PROCESSOR_GENERIC64:
3521 ix86_tune = PROCESSOR_GENERIC32;
3522 ix86_schedule = CPU_PENTIUMPRO;
3523 break;
3524
3525 default:
3526 break;
3527 }
3528 }
3529 /* Intel CPUs have always interpreted SSE prefetch instructions as
3530 NOPs; so, we can enable SSE prefetch instructions even when
3531 -mtune (rather than -march) points us to a processor that has them.
3532 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3533 higher processors. */
3534 if (TARGET_CMOV
3535 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3536 x86_prefetch_sse = true;
3537 break;
3538 }
3539
3540 if (ix86_tune_specified && i == pta_size)
3541 error ("bad value (%s) for %stune=%s %s",
3542 ix86_tune_string, prefix, suffix, sw);
3543
3544 ix86_tune_mask = 1u << ix86_tune;
3545 for (i = 0; i < X86_TUNE_LAST; ++i)
3546 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3547
3548 #ifndef USE_IX86_FRAME_POINTER
3549 #define USE_IX86_FRAME_POINTER 0
3550 #endif
3551
3552 #ifndef USE_X86_64_FRAME_POINTER
3553 #define USE_X86_64_FRAME_POINTER 0
3554 #endif
3555
3556 /* Set the default values for switches whose default depends on TARGET_64BIT
3557 in case they weren't overwritten by command line options. */
3558 if (TARGET_64BIT)
3559 {
3560 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3561 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3562 if (flag_asynchronous_unwind_tables == 2)
3563 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3564 if (flag_pcc_struct_return == 2)
3565 flag_pcc_struct_return = 0;
3566 }
3567 else
3568 {
3569 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3570 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3571 if (flag_asynchronous_unwind_tables == 2)
3572 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3573 if (flag_pcc_struct_return == 2)
3574 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3575 }
3576
3577 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3578 if (optimize_size)
3579 ix86_cost = &ix86_size_cost;
3580 else
3581 ix86_cost = ix86_tune_cost;
3582
3583 /* Arrange to set up i386_stack_locals for all functions. */
3584 init_machine_status = ix86_init_machine_status;
3585
3586 /* Validate -mregparm= value. */
3587 if (global_options_set.x_ix86_regparm)
3588 {
3589 if (TARGET_64BIT)
3590 warning (0, "-mregparm is ignored in 64-bit mode");
3591 if (ix86_regparm > REGPARM_MAX)
3592 {
3593 error ("-mregparm=%d is not between 0 and %d",
3594 ix86_regparm, REGPARM_MAX);
3595 ix86_regparm = 0;
3596 }
3597 }
3598 if (TARGET_64BIT)
3599 ix86_regparm = REGPARM_MAX;
3600
3601 /* Default align_* from the processor table. */
3602 if (align_loops == 0)
3603 {
3604 align_loops = processor_target_table[ix86_tune].align_loop;
3605 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3606 }
3607 if (align_jumps == 0)
3608 {
3609 align_jumps = processor_target_table[ix86_tune].align_jump;
3610 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3611 }
3612 if (align_functions == 0)
3613 {
3614 align_functions = processor_target_table[ix86_tune].align_func;
3615 }
3616
3617 /* Provide default for -mbranch-cost= value. */
3618 if (!global_options_set.x_ix86_branch_cost)
3619 ix86_branch_cost = ix86_cost->branch_cost;
3620
3621 if (TARGET_64BIT)
3622 {
3623 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3624
3625 /* Enable by default the SSE and MMX builtins. Do allow the user to
3626 explicitly disable any of these. In particular, disabling SSE and
3627 MMX for kernel code is extremely useful. */
3628 if (!ix86_arch_specified)
3629 ix86_isa_flags
3630 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3631 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3632
3633 if (TARGET_RTD)
3634 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3635 }
3636 else
3637 {
3638 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3639
3640 if (!ix86_arch_specified)
3641 ix86_isa_flags
3642 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3643
3644 /* The i386 ABI does not specify a red zone. It still makes sense to use one
3645 when the programmer takes care to keep the stack from being clobbered. */
3646 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3647 target_flags |= MASK_NO_RED_ZONE;
3648 }
3649
3650 /* Keep nonleaf frame pointers. */
3651 if (flag_omit_frame_pointer)
3652 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3653 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3654 flag_omit_frame_pointer = 1;
3655
3656 /* If we're doing fast math, we don't care about comparison order
3657 wrt NaNs. This lets us use a shorter comparison sequence. */
3658 if (flag_finite_math_only)
3659 target_flags &= ~MASK_IEEE_FP;
3660
3661 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3662 since the insns won't need emulation. */
3663 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3664 target_flags &= ~MASK_NO_FANCY_MATH_387;
3665
3666 /* Likewise, if the target doesn't have a 387, or we've specified
3667 software floating point, don't use 387 inline intrinsics. */
3668 if (!TARGET_80387)
3669 target_flags |= MASK_NO_FANCY_MATH_387;
3670
3671 /* Turn on MMX builtins for -msse. */
3672 if (TARGET_SSE)
3673 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3674
3675 /* Enable SSE prefetch. */
3676 if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW))
3677 x86_prefetch_sse = true;
3678
3679 /* Enable prefetch{,w} instructions for -m3dnow. */
3680 if (TARGET_3DNOW)
3681 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit;
3682
3683 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3684 if (TARGET_SSE4_2 || TARGET_ABM)
3685 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3686
3687 /* Enable lzcnt instruction for -mabm. */
3688 if (TARGET_ABM)
3689 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3690
3691 /* Validate -mpreferred-stack-boundary= value or default it to
3692 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3693 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3694 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3695 {
3696 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3697 int max = (TARGET_SEH ? 4 : 12);
3698
3699 if (ix86_preferred_stack_boundary_arg < min
3700 || ix86_preferred_stack_boundary_arg > max)
3701 {
3702 if (min == max)
3703 error ("-mpreferred-stack-boundary is not supported "
3704 "for this target");
3705 else
3706 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3707 ix86_preferred_stack_boundary_arg, min, max);
3708 }
3709 else
3710 ix86_preferred_stack_boundary
3711 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3712 }
3713
3714 /* Set the default value for -mstackrealign. */
3715 if (ix86_force_align_arg_pointer == -1)
3716 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3717
3718 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3719
3720 /* Validate -mincoming-stack-boundary= value or default it to
3721 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3722 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3723 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3724 {
3725 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3726 || ix86_incoming_stack_boundary_arg > 12)
3727 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3728 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3729 else
3730 {
3731 ix86_user_incoming_stack_boundary
3732 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3733 ix86_incoming_stack_boundary
3734 = ix86_user_incoming_stack_boundary;
3735 }
3736 }
3737
3738 /* Accept -msseregparm only if at least SSE support is enabled. */
3739 if (TARGET_SSEREGPARM
3740 && ! TARGET_SSE)
3741 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3742
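/* Sanity-check an explicit -mfpmath= selection against the instruction
   sets that are actually enabled.  */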
3743 if (global_options_set.x_ix86_fpmath)
3744 {
3745 if (ix86_fpmath & FPMATH_SSE)
3746 {
3747 if (!TARGET_SSE)
3748 {
3749 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3750 ix86_fpmath = FPMATH_387;
3751 }
3752 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3753 {
3754 warning (0, "387 instruction set disabled, using SSE arithmetics");
3755 ix86_fpmath = FPMATH_SSE;
3756 }
3757 }
3758 }
3759 else
3760 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3761
3762 /* If the i387 is disabled, then do not return values in it. */
3763 if (!TARGET_80387)
3764 target_flags &= ~MASK_FLOAT_RETURNS;
3765
3766 /* Use external vectorized library in vectorizing intrinsics. */
3767 if (global_options_set.x_ix86_veclibabi_type)
3768 switch (ix86_veclibabi_type)
3769 {
3770 case ix86_veclibabi_type_svml:
3771 ix86_veclib_handler = ix86_veclibabi_svml;
3772 break;
3773
3774 case ix86_veclibabi_type_acml:
3775 ix86_veclib_handler = ix86_veclibabi_acml;
3776 break;
3777
3778 default:
3779 gcc_unreachable ();
3780 }
3781
3782 if ((!USE_IX86_FRAME_POINTER
3783 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3784 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3785 && !optimize_size)
3786 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3787
3788 /* ??? Unwind info is not correct around the CFG unless either a frame
3789 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3790 unwind info generation to be aware of the CFG and propagating states
3791 around edges. */
3792 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3793 || flag_exceptions || flag_non_call_exceptions)
3794 && flag_omit_frame_pointer
3795 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3796 {
3797 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3798 warning (0, "unwind tables currently require either a frame pointer "
3799 "or %saccumulate-outgoing-args%s for correctness",
3800 prefix, suffix);
3801 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3802 }
3803
3804 /* If stack probes are required, the space used for large function
3805 arguments on the stack must also be probed, so enable
3806 -maccumulate-outgoing-args so this happens in the prologue. */
3807 if (TARGET_STACK_PROBE
3808 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3809 {
3810 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3811 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3812 "for correctness", prefix, suffix);
3813 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3814 }
3815
3816 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3817 {
3818 char *p;
3819 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3820 p = strchr (internal_label_prefix, 'X');
3821 internal_label_prefix_len = p - internal_label_prefix;
3822 *p = '\0';
3823 }
3824
3825 /* When a scheduling description is not available, disable the scheduler
3826 passes so they do not slow down compilation or make x87 code slower. */
3827 if (!TARGET_SCHEDULE)
3828 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3829
3830 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3831 ix86_tune_cost->simultaneous_prefetches,
3832 global_options.x_param_values,
3833 global_options_set.x_param_values);
3834 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3835 ix86_tune_cost->prefetch_block,
3836 global_options.x_param_values,
3837 global_options_set.x_param_values);
3838 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3839 ix86_tune_cost->l1_cache_size,
3840 global_options.x_param_values,
3841 global_options_set.x_param_values);
3842 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3843 ix86_tune_cost->l2_cache_size,
3844 global_options.x_param_values,
3845 global_options_set.x_param_values);
3846
3847 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3848 if (flag_prefetch_loop_arrays < 0
3849 && HAVE_prefetch
3850 && (optimize >= 3 || flag_profile_use)
3851 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3852 flag_prefetch_loop_arrays = 1;
3853
3854 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3855 can be optimized to ap = __builtin_next_arg (0). */
3856 if (!TARGET_64BIT && !flag_split_stack)
3857 targetm.expand_builtin_va_start = NULL;
3858
3859 if (TARGET_64BIT)
3860 {
3861 ix86_gen_leave = gen_leave_rex64;
3862 if (Pmode == DImode)
3863 {
3864 ix86_gen_monitor = gen_sse3_monitor64_di;
3865 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3866 ix86_gen_tls_local_dynamic_base_64
3867 = gen_tls_local_dynamic_base_64_di;
3868 }
3869 else
3870 {
3871 ix86_gen_monitor = gen_sse3_monitor64_si;
3872 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3873 ix86_gen_tls_local_dynamic_base_64
3874 = gen_tls_local_dynamic_base_64_si;
3875 }
3876 }
3877 else
3878 {
3879 ix86_gen_leave = gen_leave;
3880 ix86_gen_monitor = gen_sse3_monitor;
3881 }
3882
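/* Pick the DImode or SImode variants of the RTL generator helpers
   according to the pointer mode.  */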
3883 if (Pmode == DImode)
3884 {
3885 ix86_gen_add3 = gen_adddi3;
3886 ix86_gen_sub3 = gen_subdi3;
3887 ix86_gen_sub3_carry = gen_subdi3_carry;
3888 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3889 ix86_gen_andsp = gen_anddi3;
3890 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3891 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3892 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3893 }
3894 else
3895 {
3896 ix86_gen_add3 = gen_addsi3;
3897 ix86_gen_sub3 = gen_subsi3;
3898 ix86_gen_sub3_carry = gen_subsi3_carry;
3899 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3900 ix86_gen_andsp = gen_andsi3;
3901 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3902 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3903 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3904 }
3905
3906 #ifdef USE_IX86_CLD
3907 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3908 if (!TARGET_64BIT)
3909 target_flags |= MASK_CLD & ~target_flags_explicit;
3910 #endif
3911
3912 if (!TARGET_64BIT && flag_pic)
3913 {
3914 if (flag_fentry > 0)
3915 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3916 "with -fpic");
3917 flag_fentry = 0;
3918 }
3919 else if (TARGET_SEH)
3920 {
3921 if (flag_fentry == 0)
3922 sorry ("-mno-fentry isn%'t compatible with SEH");
3923 flag_fentry = 1;
3924 }
3925 else if (flag_fentry < 0)
3926 {
3927 #if defined(PROFILE_BEFORE_PROLOGUE)
3928 flag_fentry = 1;
3929 #else
3930 flag_fentry = 0;
3931 #endif
3932 }
3933
3934 if (TARGET_AVX)
3935 {
3936 /* When not optimizing for size, enable the vzeroupper optimization for
3937 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3938 AVX unaligned loads/stores. */
3939 if (!optimize_size)
3940 {
3941 if (flag_expensive_optimizations
3942 && !(target_flags_explicit & MASK_VZEROUPPER))
3943 target_flags |= MASK_VZEROUPPER;
3944 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3945 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3946 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3947 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3948 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3949 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3950 /* Enable 128-bit AVX instruction generation
3951 for the auto-vectorizer. */
3952 if (TARGET_AVX128_OPTIMAL
3953 && !(target_flags_explicit & MASK_PREFER_AVX128))
3954 target_flags |= MASK_PREFER_AVX128;
3955 }
3956 }
3957 else
3958 {
3959 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3960 target_flags &= ~MASK_VZEROUPPER;
3961 }
3962
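/* Parse -mrecip=<list>, e.g. "all,!sqrt": tokens are comma-separated,
   a leading '!' clears the named mask, and "default" means "all".  */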
3963 if (ix86_recip_name)
3964 {
3965 char *p = ASTRDUP (ix86_recip_name);
3966 char *q;
3967 unsigned int mask, i;
3968 bool invert;
3969
3970 while ((q = strtok (p, ",")) != NULL)
3971 {
3972 p = NULL;
3973 if (*q == '!')
3974 {
3975 invert = true;
3976 q++;
3977 }
3978 else
3979 invert = false;
3980
3981 if (!strcmp (q, "default"))
3982 mask = RECIP_MASK_ALL;
3983 else
3984 {
3985 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3986 if (!strcmp (q, recip_options[i].string))
3987 {
3988 mask = recip_options[i].mask;
3989 break;
3990 }
3991
3992 if (i == ARRAY_SIZE (recip_options))
3993 {
3994 error ("unknown option for -mrecip=%s", q);
3995 invert = false;
3996 mask = RECIP_MASK_NONE;
3997 }
3998 }
3999
4000 recip_mask_explicit |= mask;
4001 if (invert)
4002 recip_mask &= ~mask;
4003 else
4004 recip_mask |= mask;
4005 }
4006 }
4007
4008 if (TARGET_RECIP)
4009 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4010 else if (target_flags_explicit & MASK_RECIP)
4011 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4012
4013 /* Default long double to 64-bit for Bionic. */
4014 if (TARGET_HAS_BIONIC
4015 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4016 target_flags |= MASK_LONG_DOUBLE_64;
4017
4018 /* Save the initial options in case the user does function specific
4019 options. */
4020 if (main_args_p)
4021 target_option_default_node = target_option_current_node
4022 = build_target_option_node ();
4023
4024 /* Handle stack protector */
4025 if (!global_options_set.x_ix86_stack_protector_guard)
4026 ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4027 }
4028
4029 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4030
4031 static void
4032 ix86_option_override (void)
4033 {
4034 static struct register_pass_info insert_vzeroupper_info
4035 = { &pass_insert_vzeroupper.pass, "reload",
4036 1, PASS_POS_INSERT_AFTER
4037 };
4038
4039 ix86_option_override_internal (true);
4040
4041
4042 /* This needs to be done at start up. It's convenient to do it here. */
4043 register_pass (&insert_vzeroupper_info);
4044 }
4045
4046 /* Update register usage after having seen the compiler flags. */
4047
4048 static void
4049 ix86_conditional_register_usage (void)
4050 {
4051 int i, c_mask;
4052 unsigned int j;
4053
4054 /* The PIC register, if it exists, is fixed. */
4055 j = PIC_OFFSET_TABLE_REGNUM;
4056 if (j != INVALID_REGNUM)
4057 fixed_regs[j] = call_used_regs[j] = 1;
4058
4059 /* For 32-bit targets, squash the REX registers. */
4060 if (! TARGET_64BIT)
4061 {
4062 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4063 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4064 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4065 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4066 }
4067
4068 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4069 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4070 : TARGET_64BIT ? (1 << 2)
4071 : (1 << 1));
4072
4073 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4074
4075 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4076 {
4077 /* Set/reset conditionally defined registers from
4078 CALL_USED_REGISTERS initializer. */
4079 if (call_used_regs[i] > 1)
4080 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4081
4082 /* Calculate registers of CLOBBERED_REGS register set
4083 as call used registers from GENERAL_REGS register set. */
4084 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4085 && call_used_regs[i])
4086 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4087 }
4088
4089 /* If MMX is disabled, squash the registers. */
4090 if (! TARGET_MMX)
4091 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4092 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4093 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4094
4095 /* If SSE is disabled, squash the registers. */
4096 if (! TARGET_SSE)
4097 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4098 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4099 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4100
4101 /* If the FPU is disabled, squash the registers. */
4102 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4103 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4104 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4105 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4106 }
4107
4108 \f
4109 /* Save the current options */
4110
4111 static void
4112 ix86_function_specific_save (struct cl_target_option *ptr)
4113 {
4114 ptr->arch = ix86_arch;
4115 ptr->schedule = ix86_schedule;
4116 ptr->tune = ix86_tune;
4117 ptr->branch_cost = ix86_branch_cost;
4118 ptr->tune_defaulted = ix86_tune_defaulted;
4119 ptr->arch_specified = ix86_arch_specified;
4120 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4121 ptr->ix86_target_flags_explicit = target_flags_explicit;
4122 ptr->x_recip_mask_explicit = recip_mask_explicit;
4123
4124 /* The fields are char but the variables are not; make sure the
4125 values fit in the fields. */
4126 gcc_assert (ptr->arch == ix86_arch);
4127 gcc_assert (ptr->schedule == ix86_schedule);
4128 gcc_assert (ptr->tune == ix86_tune);
4129 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4130 }
4131
4132 /* Restore the current options */
4133
4134 static void
4135 ix86_function_specific_restore (struct cl_target_option *ptr)
4136 {
4137 enum processor_type old_tune = ix86_tune;
4138 enum processor_type old_arch = ix86_arch;
4139 unsigned int ix86_arch_mask, ix86_tune_mask;
4140 int i;
4141
4142 ix86_arch = (enum processor_type) ptr->arch;
4143 ix86_schedule = (enum attr_cpu) ptr->schedule;
4144 ix86_tune = (enum processor_type) ptr->tune;
4145 ix86_branch_cost = ptr->branch_cost;
4146 ix86_tune_defaulted = ptr->tune_defaulted;
4147 ix86_arch_specified = ptr->arch_specified;
4148 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4149 target_flags_explicit = ptr->ix86_target_flags_explicit;
4150 recip_mask_explicit = ptr->x_recip_mask_explicit;
4151
4152 /* Recreate the arch feature tests if the arch changed */
4153 if (old_arch != ix86_arch)
4154 {
4155 ix86_arch_mask = 1u << ix86_arch;
4156 for (i = 0; i < X86_ARCH_LAST; ++i)
4157 ix86_arch_features[i]
4158 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4159 }
4160
4161 /* Recreate the tune optimization tests */
4162 if (old_tune != ix86_tune)
4163 {
4164 ix86_tune_mask = 1u << ix86_tune;
4165 for (i = 0; i < X86_TUNE_LAST; ++i)
4166 ix86_tune_features[i]
4167 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4168 }
4169 }
4170
4171 /* Print the current options */
4172
4173 static void
4174 ix86_function_specific_print (FILE *file, int indent,
4175 struct cl_target_option *ptr)
4176 {
4177 char *target_string
4178 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4179 NULL, NULL, ptr->x_ix86_fpmath, false);
4180
4181 fprintf (file, "%*sarch = %d (%s)\n",
4182 indent, "",
4183 ptr->arch,
4184 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4185 ? cpu_names[ptr->arch]
4186 : "<unknown>"));
4187
4188 fprintf (file, "%*stune = %d (%s)\n",
4189 indent, "",
4190 ptr->tune,
4191 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4192 ? cpu_names[ptr->tune]
4193 : "<unknown>"));
4194
4195 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4196
4197 if (target_string)
4198 {
4199 fprintf (file, "%*s%s\n", indent, "", target_string);
4200 free (target_string);
4201 }
4202 }
4203
4204 \f
4205 /* Inner function to process the attribute((target(...))): take an argument
4206 and set the current options from it. If the argument is a list, recurse
4207 over the list. */
4208
4209 static bool
4210 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4211 struct gcc_options *enum_opts_set)
4212 {
4213 char *next_optstr;
4214 bool ret = true;
4215
4216 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4217 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4218 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4219 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4220 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4221
4222 enum ix86_opt_type
4223 {
4224 ix86_opt_unknown,
4225 ix86_opt_yes,
4226 ix86_opt_no,
4227 ix86_opt_str,
4228 ix86_opt_enum,
4229 ix86_opt_isa
4230 };
4231
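/* Map attribute((target("..."))) strings onto option handling: ISA
   switches, enum and string options, and simple target_flags masks.  */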
4232 static const struct
4233 {
4234 const char *string;
4235 size_t len;
4236 enum ix86_opt_type type;
4237 int opt;
4238 int mask;
4239 } attrs[] = {
4240 /* isa options */
4241 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4242 IX86_ATTR_ISA ("abm", OPT_mabm),
4243 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4244 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4245 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4246 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4247 IX86_ATTR_ISA ("aes", OPT_maes),
4248 IX86_ATTR_ISA ("avx", OPT_mavx),
4249 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4250 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4251 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4252 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4253 IX86_ATTR_ISA ("sse", OPT_msse),
4254 IX86_ATTR_ISA ("sse2", OPT_msse2),
4255 IX86_ATTR_ISA ("sse3", OPT_msse3),
4256 IX86_ATTR_ISA ("sse4", OPT_msse4),
4257 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4258 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4259 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4260 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4261 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4262 IX86_ATTR_ISA ("fma", OPT_mfma),
4263 IX86_ATTR_ISA ("xop", OPT_mxop),
4264 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4265 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4266 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4267 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4268 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4269 IX86_ATTR_ISA ("hle", OPT_mhle),
4270 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4271 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4272 IX86_ATTR_ISA ("adx", OPT_madx),
4273 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4274 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4275 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4276
4277 /* enum options */
4278 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4279
4280 /* string options */
4281 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4282 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4283
4284 /* flag options */
4285 IX86_ATTR_YES ("cld",
4286 OPT_mcld,
4287 MASK_CLD),
4288
4289 IX86_ATTR_NO ("fancy-math-387",
4290 OPT_mfancy_math_387,
4291 MASK_NO_FANCY_MATH_387),
4292
4293 IX86_ATTR_YES ("ieee-fp",
4294 OPT_mieee_fp,
4295 MASK_IEEE_FP),
4296
4297 IX86_ATTR_YES ("inline-all-stringops",
4298 OPT_minline_all_stringops,
4299 MASK_INLINE_ALL_STRINGOPS),
4300
4301 IX86_ATTR_YES ("inline-stringops-dynamically",
4302 OPT_minline_stringops_dynamically,
4303 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4304
4305 IX86_ATTR_NO ("align-stringops",
4306 OPT_mno_align_stringops,
4307 MASK_NO_ALIGN_STRINGOPS),
4308
4309 IX86_ATTR_YES ("recip",
4310 OPT_mrecip,
4311 MASK_RECIP),
4312
4313 };
4314
4315 /* If this is a list, recurse to get the options. */
4316 if (TREE_CODE (args) == TREE_LIST)
4317 {
4318 bool ret = true;
4319
4320 for (; args; args = TREE_CHAIN (args))
4321 if (TREE_VALUE (args)
4322 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4323 p_strings, enum_opts_set))
4324 ret = false;
4325
4326 return ret;
4327 }
4328
4329 else if (TREE_CODE (args) != STRING_CST)
4330 {
4331 error ("attribute %<target%> argument not a string");
4332 return false;
4333 }
4334
4335 /* Handle multiple arguments separated by commas. */
4336 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4337
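/* For example, attribute((target("sse4.2,no-avx,arch=atom"))) is handled
   one comma-separated token at a time; a "no-" prefix negates an option.  */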
4338 while (next_optstr && *next_optstr != '\0')
4339 {
4340 char *p = next_optstr;
4341 char *orig_p = p;
4342 char *comma = strchr (next_optstr, ',');
4343 const char *opt_string;
4344 size_t len, opt_len;
4345 int opt;
4346 bool opt_set_p;
4347 char ch;
4348 unsigned i;
4349 enum ix86_opt_type type = ix86_opt_unknown;
4350 int mask = 0;
4351
4352 if (comma)
4353 {
4354 *comma = '\0';
4355 len = comma - next_optstr;
4356 next_optstr = comma + 1;
4357 }
4358 else
4359 {
4360 len = strlen (p);
4361 next_optstr = NULL;
4362 }
4363
4364 /* Recognize no-xxx. */
4365 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4366 {
4367 opt_set_p = false;
4368 p += 3;
4369 len -= 3;
4370 }
4371 else
4372 opt_set_p = true;
4373
4374 /* Find the option. */
4375 ch = *p;
4376 opt = N_OPTS;
4377 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4378 {
4379 type = attrs[i].type;
4380 opt_len = attrs[i].len;
4381 if (ch == attrs[i].string[0]
4382 && ((type != ix86_opt_str && type != ix86_opt_enum)
4383 ? len == opt_len
4384 : len > opt_len)
4385 && memcmp (p, attrs[i].string, opt_len) == 0)
4386 {
4387 opt = attrs[i].opt;
4388 mask = attrs[i].mask;
4389 opt_string = attrs[i].string;
4390 break;
4391 }
4392 }
4393
4394 /* Process the option. */
4395 if (opt == N_OPTS)
4396 {
4397 error ("attribute(target(\"%s\")) is unknown", orig_p);
4398 ret = false;
4399 }
4400
4401 else if (type == ix86_opt_isa)
4402 {
4403 struct cl_decoded_option decoded;
4404
4405 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4406 ix86_handle_option (&global_options, &global_options_set,
4407 &decoded, input_location);
4408 }
4409
4410 else if (type == ix86_opt_yes || type == ix86_opt_no)
4411 {
4412 if (type == ix86_opt_no)
4413 opt_set_p = !opt_set_p;
4414
4415 if (opt_set_p)
4416 target_flags |= mask;
4417 else
4418 target_flags &= ~mask;
4419 }
4420
4421 else if (type == ix86_opt_str)
4422 {
4423 if (p_strings[opt])
4424 {
4425 error ("option(\"%s\") was already specified", opt_string);
4426 ret = false;
4427 }
4428 else
4429 p_strings[opt] = xstrdup (p + opt_len);
4430 }
4431
4432 else if (type == ix86_opt_enum)
4433 {
4434 bool arg_ok;
4435 int value;
4436
4437 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4438 if (arg_ok)
4439 set_option (&global_options, enum_opts_set, opt, value,
4440 p + opt_len, DK_UNSPECIFIED, input_location,
4441 global_dc);
4442 else
4443 {
4444 error ("attribute(target(\"%s\")) is unknown", orig_p);
4445 ret = false;
4446 }
4447 }
4448
4449 else
4450 gcc_unreachable ();
4451 }
4452
4453 return ret;
4454 }
4455
4456 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4457
4458 tree
4459 ix86_valid_target_attribute_tree (tree args)
4460 {
4461 const char *orig_arch_string = ix86_arch_string;
4462 const char *orig_tune_string = ix86_tune_string;
4463 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4464 int orig_tune_defaulted = ix86_tune_defaulted;
4465 int orig_arch_specified = ix86_arch_specified;
4466 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4467 tree t = NULL_TREE;
4468 int i;
4469 struct cl_target_option *def
4470 = TREE_TARGET_OPTION (target_option_default_node);
4471 struct gcc_options enum_opts_set;
4472
4473 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4474
4475 /* Process each of the options on the chain. */
4476 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4477 &enum_opts_set))
4478 return error_mark_node;
4479
4480 /* If the changed options are different from the default, rerun
4481 ix86_option_override_internal, and then save the options away.
4482 The string options are attribute options, and will be undone
4483 when we copy the save structure. */
4484 if (ix86_isa_flags != def->x_ix86_isa_flags
4485 || target_flags != def->x_target_flags
4486 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4487 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4488 || enum_opts_set.x_ix86_fpmath)
4489 {
4490 /* If we are using the default tune= or arch=, undo the string assigned,
4491 and use the default. */
4492 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4493 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4494 else if (!orig_arch_specified)
4495 ix86_arch_string = NULL;
4496
4497 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4498 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4499 else if (orig_tune_defaulted)
4500 ix86_tune_string = NULL;
4501
4502 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4503 if (enum_opts_set.x_ix86_fpmath)
4504 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4505 else if (!TARGET_64BIT && TARGET_SSE)
4506 {
4507 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4508 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4509 }
4510
4511 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4512 ix86_option_override_internal (false);
4513
4514 /* Add any builtin functions with the new isa if any. */
4515 ix86_add_new_builtins (ix86_isa_flags);
4516
4517 /* Save the current options unless we are validating options for
4518 #pragma. */
4519 t = build_target_option_node ();
4520
4521 ix86_arch_string = orig_arch_string;
4522 ix86_tune_string = orig_tune_string;
4523 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4524
4525 /* Free up memory allocated to hold the strings */
4526 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4527 free (option_strings[i]);
4528 }
4529
4530 return t;
4531 }
4532
4533 /* Hook to validate attribute((target("string"))). */
4534
4535 static bool
4536 ix86_valid_target_attribute_p (tree fndecl,
4537 tree ARG_UNUSED (name),
4538 tree args,
4539 int ARG_UNUSED (flags))
4540 {
4541 struct cl_target_option cur_target;
4542 bool ret = true;
4543
4544 /* attribute((target("default"))) does nothing, beyond
4545 affecting multi-versioning. */
4546 if (TREE_VALUE (args)
4547 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4548 && TREE_CHAIN (args) == NULL_TREE
4549 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4550 return true;
4551
4552 tree old_optimize = build_optimization_node ();
4553 tree new_target, new_optimize;
4554 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4555
4556 /* If the function changed the optimization levels as well as setting target
4557 options, start with the optimizations specified. */
4558 if (func_optimize && func_optimize != old_optimize)
4559 cl_optimization_restore (&global_options,
4560 TREE_OPTIMIZATION (func_optimize));
4561
4562 /* The target attributes may also change some optimization flags, so update
4563 the optimization options if necessary. */
4564 cl_target_option_save (&cur_target, &global_options);
4565 new_target = ix86_valid_target_attribute_tree (args);
4566 new_optimize = build_optimization_node ();
4567
4568 if (new_target == error_mark_node)
4569 ret = false;
4570
4571 else if (fndecl && new_target)
4572 {
4573 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4574
4575 if (old_optimize != new_optimize)
4576 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4577 }
4578
4579 cl_target_option_restore (&global_options, &cur_target);
4580
4581 if (old_optimize != new_optimize)
4582 cl_optimization_restore (&global_options,
4583 TREE_OPTIMIZATION (old_optimize));
4584
4585 return ret;
4586 }
4587
4588 \f
4589 /* Hook to determine if one function can safely inline another. */
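/* Illustrative sketch (not from the original source): inlining is allowed
   only when the callee's target options are compatible with the caller's,
   e.g.

     __attribute__ ((target ("sse4.2"))) void caller (void);
     __attribute__ ((target ("sse2")))   void callee (void);

   caller may inline callee (SSE2 is a subset of SSE4.2), but not the
   other way around.  */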
4590
4591 static bool
4592 ix86_can_inline_p (tree caller, tree callee)
4593 {
4594 bool ret = false;
4595 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4596 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4597
4598 /* If callee has no option attributes, then it is ok to inline. */
4599 if (!callee_tree)
4600 ret = true;
4601
4602 /* If caller has no option attributes, but callee does, then it is not ok
4603 to inline. */
4604 else if (!caller_tree)
4605 ret = false;
4606
4607 else
4608 {
4609 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4610 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4611
4612 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4613 function can inline an SSE2 function but an SSE2 function can't inline
4614 an SSE4 function. */
4615 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4616 != callee_opts->x_ix86_isa_flags)
4617 ret = false;
4618
4619 /* See if we have the same non-isa options. */
4620 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4621 ret = false;
4622
4623 /* See if arch, tune, etc. are the same. */
4624 else if (caller_opts->arch != callee_opts->arch)
4625 ret = false;
4626
4627 else if (caller_opts->tune != callee_opts->tune)
4628 ret = false;
4629
4630 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4631 ret = false;
4632
4633 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4634 ret = false;
4635
4636 else
4637 ret = true;
4638 }
4639
4640 return ret;
4641 }
4642
4643 \f
4644 /* Remember the last target of ix86_set_current_function. */
4645 static GTY(()) tree ix86_previous_fndecl;
4646
4647 /* Establish appropriate back-end context for processing the function
4648 FNDECL. The argument might be NULL to indicate processing at top
4649 level, outside of any function scope. */
4650 static void
4651 ix86_set_current_function (tree fndecl)
4652 {
4653 /* Only change the context if the function changes. This hook is called
4654 several times in the course of compiling a function, and we don't want to
4655 slow things down too much or call target_reinit when it isn't safe. */
4656 if (fndecl && fndecl != ix86_previous_fndecl)
4657 {
4658 tree old_tree = (ix86_previous_fndecl
4659 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4660 : NULL_TREE);
4661
4662 tree new_tree = (fndecl
4663 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4664 : NULL_TREE);
4665
4666 ix86_previous_fndecl = fndecl;
4667 if (old_tree == new_tree)
4668 ;
4669
4670 else if (new_tree)
4671 {
4672 cl_target_option_restore (&global_options,
4673 TREE_TARGET_OPTION (new_tree));
4674 target_reinit ();
4675 }
4676
4677 else if (old_tree)
4678 {
4679 struct cl_target_option *def
4680 = TREE_TARGET_OPTION (target_option_current_node);
4681
4682 cl_target_option_restore (&global_options, def);
4683 target_reinit ();
4684 }
4685 }
4686 }
4687
4688 \f
4689 /* Return true if this goes in large data/bss. */
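/* Illustrative example (an assumption, not from the original source): with
   -mcmodel=medium and the default -mlarge-data-threshold, a definition
   such as

     static char big_buffer[128 * 1024];

   exceeds the section threshold and is treated as large data (placed in
   .lbss), while small objects stay in the normal .data/.bss sections.  */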
4690
4691 static bool
4692 ix86_in_large_data_p (tree exp)
4693 {
4694 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4695 return false;
4696
4697 /* Functions are never large data. */
4698 if (TREE_CODE (exp) == FUNCTION_DECL)
4699 return false;
4700
4701 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4702 {
4703 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4704 if (strcmp (section, ".ldata") == 0
4705 || strcmp (section, ".lbss") == 0)
4706 return true;
4707 return false;
4708 }
4709 else
4710 {
4711 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4712
4713 /* If this is an incomplete type with size 0, then we can't put it
4714 in data because it might be too big when completed. */
4715 if (!size || size > ix86_section_threshold)
4716 return true;
4717 }
4718
4719 return false;
4720 }
4721
4722 /* Switch to the appropriate section for output of DECL.
4723 DECL is either a `VAR_DECL' node or a constant of some sort.
4724 RELOC indicates whether forming the initial value of DECL requires
4725 link-time relocations. */
4726
4727 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4728 ATTRIBUTE_UNUSED;
4729
4730 static section *
4731 x86_64_elf_select_section (tree decl, int reloc,
4732 unsigned HOST_WIDE_INT align)
4733 {
4734 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4735 && ix86_in_large_data_p (decl))
4736 {
4737 const char *sname = NULL;
4738 unsigned int flags = SECTION_WRITE;
4739 switch (categorize_decl_for_section (decl, reloc))
4740 {
4741 case SECCAT_DATA:
4742 sname = ".ldata";
4743 break;
4744 case SECCAT_DATA_REL:
4745 sname = ".ldata.rel";
4746 break;
4747 case SECCAT_DATA_REL_LOCAL:
4748 sname = ".ldata.rel.local";
4749 break;
4750 case SECCAT_DATA_REL_RO:
4751 sname = ".ldata.rel.ro";
4752 break;
4753 case SECCAT_DATA_REL_RO_LOCAL:
4754 sname = ".ldata.rel.ro.local";
4755 break;
4756 case SECCAT_BSS:
4757 sname = ".lbss";
4758 flags |= SECTION_BSS;
4759 break;
4760 case SECCAT_RODATA:
4761 case SECCAT_RODATA_MERGE_STR:
4762 case SECCAT_RODATA_MERGE_STR_INIT:
4763 case SECCAT_RODATA_MERGE_CONST:
4764 sname = ".lrodata";
4765 flags = 0;
4766 break;
4767 case SECCAT_SRODATA:
4768 case SECCAT_SDATA:
4769 case SECCAT_SBSS:
4770 gcc_unreachable ();
4771 case SECCAT_TEXT:
4772 case SECCAT_TDATA:
4773 case SECCAT_TBSS:
4774 /* We don't split these for the medium model. Place them into
4775 default sections and hope for the best. */
4776 break;
4777 }
4778 if (sname)
4779 {
4780 /* We might get called with string constants, but get_named_section
4781 doesn't like them as they are not DECLs. Also, we need to set
4782 flags in that case. */
4783 if (!DECL_P (decl))
4784 return get_section (sname, flags, NULL);
4785 return get_named_section (decl, sname, reloc);
4786 }
4787 }
4788 return default_elf_select_section (decl, reloc, align);
4789 }
4790
4791 /* Build up a unique section name, expressed as a
4792 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4793 RELOC indicates whether the initial value of EXP requires
4794 link-time relocations. */
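/* Illustrative example (assumption, not from the original source): for a
   read-only large-data object named "tbl" this produces a section name
   such as ".lrodata.tbl", or ".gnu.linkonce.lr.tbl" when one-only
   semantics are needed without COMDAT group support.  */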
4795
4796 static void ATTRIBUTE_UNUSED
4797 x86_64_elf_unique_section (tree decl, int reloc)
4798 {
4799 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4800 && ix86_in_large_data_p (decl))
4801 {
4802 const char *prefix = NULL;
4803 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4804 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4805
4806 switch (categorize_decl_for_section (decl, reloc))
4807 {
4808 case SECCAT_DATA:
4809 case SECCAT_DATA_REL:
4810 case SECCAT_DATA_REL_LOCAL:
4811 case SECCAT_DATA_REL_RO:
4812 case SECCAT_DATA_REL_RO_LOCAL:
4813 prefix = one_only ? ".ld" : ".ldata";
4814 break;
4815 case SECCAT_BSS:
4816 prefix = one_only ? ".lb" : ".lbss";
4817 break;
4818 case SECCAT_RODATA:
4819 case SECCAT_RODATA_MERGE_STR:
4820 case SECCAT_RODATA_MERGE_STR_INIT:
4821 case SECCAT_RODATA_MERGE_CONST:
4822 prefix = one_only ? ".lr" : ".lrodata";
4823 break;
4824 case SECCAT_SRODATA:
4825 case SECCAT_SDATA:
4826 case SECCAT_SBSS:
4827 gcc_unreachable ();
4828 case SECCAT_TEXT:
4829 case SECCAT_TDATA:
4830 case SECCAT_TBSS:
4831 /* We don't split these for the medium model. Place them into
4832 default sections and hope for the best. */
4833 break;
4834 }
4835 if (prefix)
4836 {
4837 const char *name, *linkonce;
4838 char *string;
4839
4840 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4841 name = targetm.strip_name_encoding (name);
4842
4843 /* If we're using one_only, then there needs to be a .gnu.linkonce
4844 prefix to the section name. */
4845 linkonce = one_only ? ".gnu.linkonce" : "";
4846
4847 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4848
4849 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4850 return;
4851 }
4852 }
4853 default_unique_section (decl, reloc);
4854 }
4855
4856 #ifdef COMMON_ASM_OP
4857 /* This says how to output assembler code to declare an
4858 uninitialized external linkage data object.
4859
4860 For medium-model x86-64 we need to use the .largecomm directive for
4861 large objects. */
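/* Sketch of the expected output (an assumption for illustration): for a
   64-bit medium-model object above the section threshold this emits
   something like

     .largecomm  big_array,131072,32

   whereas small objects keep using the ordinary .comm directive.  */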
4862 void
4863 x86_elf_aligned_common (FILE *file,
4864 const char *name, unsigned HOST_WIDE_INT size,
4865 int align)
4866 {
4867 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4868 && size > (unsigned int)ix86_section_threshold)
4869 fputs (".largecomm\t", file);
4870 else
4871 fputs (COMMON_ASM_OP, file);
4872 assemble_name (file, name);
4873 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4874 size, align / BITS_PER_UNIT);
4875 }
4876 #endif
4877
4878 /* Utility function for targets to use in implementing
4879 ASM_OUTPUT_ALIGNED_BSS. */
4880
4881 void
4882 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4883 const char *name, unsigned HOST_WIDE_INT size,
4884 int align)
4885 {
4886 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4887 && size > (unsigned int)ix86_section_threshold)
4888 switch_to_section (get_named_section (decl, ".lbss", 0));
4889 else
4890 switch_to_section (bss_section);
4891 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4892 #ifdef ASM_DECLARE_OBJECT_NAME
4893 last_assemble_variable_decl = decl;
4894 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4895 #else
4896 /* Standard thing is just output label for the object. */
4897 ASM_OUTPUT_LABEL (file, name);
4898 #endif /* ASM_DECLARE_OBJECT_NAME */
4899 ASM_OUTPUT_SKIP (file, size ? size : 1);
4900 }
4901 \f
4902 /* Decide whether we must probe the stack before any space allocation
4903 on this target. It's essentially TARGET_STACK_PROBE except when
4904 -fstack-check causes the stack to be already probed differently. */
4905
4906 bool
4907 ix86_target_stack_probe (void)
4908 {
4909 /* Do not probe the stack twice if static stack checking is enabled. */
4910 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4911 return false;
4912
4913 return TARGET_STACK_PROBE;
4914 }
4915 \f
4916 /* Decide whether we can make a sibling call to a function. DECL is the
4917 declaration of the function being targeted by the call and EXP is the
4918 CALL_EXPR representing the call. */
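/* Illustrative example (not from the original source): a call in tail
   position such as

     int f (int);
     int g (int x) { return f (x + 1); }

   can normally be turned into a sibling call, but the checks below reject
   it, e.g. when 32-bit PIC needs %ebx live for the PLT or when the return
   value ABIs of caller and callee differ.  */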
4919
4920 static bool
4921 ix86_function_ok_for_sibcall (tree decl, tree exp)
4922 {
4923 tree type, decl_or_type;
4924 rtx a, b;
4925
4926 /* If we are generating position-independent code, we cannot sibcall
4927 optimize any indirect call, or a direct call to a global function,
4928 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4929 if (!TARGET_MACHO
4930 && !TARGET_64BIT
4931 && flag_pic
4932 && (!decl || !targetm.binds_local_p (decl)))
4933 return false;
4934
4935 /* If we need to align the outgoing stack, then sibcalling would
4936 unalign the stack, which may break the called function. */
4937 if (ix86_minimum_incoming_stack_boundary (true)
4938 < PREFERRED_STACK_BOUNDARY)
4939 return false;
4940
4941 if (decl)
4942 {
4943 decl_or_type = decl;
4944 type = TREE_TYPE (decl);
4945 }
4946 else
4947 {
4948 /* We're looking at the CALL_EXPR, we need the type of the function. */
4949 type = CALL_EXPR_FN (exp); /* pointer expression */
4950 type = TREE_TYPE (type); /* pointer type */
4951 type = TREE_TYPE (type); /* function type */
4952 decl_or_type = type;
4953 }
4954
4955 /* Check that the return value locations are the same. For example,
4956 if we are returning floats on the 80387 register stack, we cannot
4957 make a sibcall from a function that doesn't return a float to a
4958 function that does or, conversely, from a function that does return
4959 a float to a function that doesn't; the necessary stack adjustment
4960 would not be executed. This is also the place we notice
4961 differences in the return value ABI. Note that it is ok for one
4962 of the functions to have void return type as long as the return
4963 value of the other is passed in a register. */
4964 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4965 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4966 cfun->decl, false);
4967 if (STACK_REG_P (a) || STACK_REG_P (b))
4968 {
4969 if (!rtx_equal_p (a, b))
4970 return false;
4971 }
4972 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4973 ;
4974 else if (!rtx_equal_p (a, b))
4975 return false;
4976
4977 if (TARGET_64BIT)
4978 {
4979 /* The SYSV ABI has more call-clobbered registers;
4980 disallow sibcalls from MS to SYSV. */
4981 if (cfun->machine->call_abi == MS_ABI
4982 && ix86_function_type_abi (type) == SYSV_ABI)
4983 return false;
4984 }
4985 else
4986 {
4987 /* If this call is indirect, we'll need to be able to use a
4988 call-clobbered register for the address of the target function.
4989 Make sure that all such registers are not used for passing
4990 parameters. Note that DLLIMPORT functions are indirect. */
4991 if (!decl
4992 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4993 {
4994 if (ix86_function_regparm (type, NULL) >= 3)
4995 {
4996 /* ??? Need to count the actual number of registers to be used,
4997 not the possible number of registers. Fix later. */
4998 return false;
4999 }
5000 }
5001 }
5002
5003 /* Otherwise okay. That also includes certain types of indirect calls. */
5004 return true;
5005 }
5006
5007 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5008 and "sseregparm" calling convention attributes;
5009 arguments as in struct attribute_spec.handler. */
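/* Illustrative usage (assumption, not from the original source):

     int __attribute__ ((fastcall)) f (int a, int b);      a, b in %ecx/%edx
     int __attribute__ ((regparm (3))) g (int, int, int);

   Combinations such as fastcall + regparm are rejected with the errors
   issued below.  */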
5010
5011 static tree
5012 ix86_handle_cconv_attribute (tree *node, tree name,
5013 tree args,
5014 int flags ATTRIBUTE_UNUSED,
5015 bool *no_add_attrs)
5016 {
5017 if (TREE_CODE (*node) != FUNCTION_TYPE
5018 && TREE_CODE (*node) != METHOD_TYPE
5019 && TREE_CODE (*node) != FIELD_DECL
5020 && TREE_CODE (*node) != TYPE_DECL)
5021 {
5022 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5023 name);
5024 *no_add_attrs = true;
5025 return NULL_TREE;
5026 }
5027
5028 /* Can combine regparm with all attributes but fastcall and thiscall. */
5029 if (is_attribute_p ("regparm", name))
5030 {
5031 tree cst;
5032
5033 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5034 {
5035 error ("fastcall and regparm attributes are not compatible");
5036 }
5037
5038 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5039 {
5040 error ("regparm and thiscall attributes are not compatible");
5041 }
5042
5043 cst = TREE_VALUE (args);
5044 if (TREE_CODE (cst) != INTEGER_CST)
5045 {
5046 warning (OPT_Wattributes,
5047 "%qE attribute requires an integer constant argument",
5048 name);
5049 *no_add_attrs = true;
5050 }
5051 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5052 {
5053 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5054 name, REGPARM_MAX);
5055 *no_add_attrs = true;
5056 }
5057
5058 return NULL_TREE;
5059 }
5060
5061 if (TARGET_64BIT)
5062 {
5063 /* Do not warn when emulating the MS ABI. */
5064 if ((TREE_CODE (*node) != FUNCTION_TYPE
5065 && TREE_CODE (*node) != METHOD_TYPE)
5066 || ix86_function_type_abi (*node) != MS_ABI)
5067 warning (OPT_Wattributes, "%qE attribute ignored",
5068 name);
5069 *no_add_attrs = true;
5070 return NULL_TREE;
5071 }
5072
5073 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5074 if (is_attribute_p ("fastcall", name))
5075 {
5076 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5077 {
5078 error ("fastcall and cdecl attributes are not compatible");
5079 }
5080 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5081 {
5082 error ("fastcall and stdcall attributes are not compatible");
5083 }
5084 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5085 {
5086 error ("fastcall and regparm attributes are not compatible");
5087 }
5088 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5089 {
5090 error ("fastcall and thiscall attributes are not compatible");
5091 }
5092 }
5093
5094 /* Can combine stdcall with fastcall (redundant), regparm and
5095 sseregparm. */
5096 else if (is_attribute_p ("stdcall", name))
5097 {
5098 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5099 {
5100 error ("stdcall and cdecl attributes are not compatible");
5101 }
5102 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5103 {
5104 error ("stdcall and fastcall attributes are not compatible");
5105 }
5106 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5107 {
5108 error ("stdcall and thiscall attributes are not compatible");
5109 }
5110 }
5111
5112 /* Can combine cdecl with regparm and sseregparm. */
5113 else if (is_attribute_p ("cdecl", name))
5114 {
5115 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5116 {
5117 error ("stdcall and cdecl attributes are not compatible");
5118 }
5119 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5120 {
5121 error ("fastcall and cdecl attributes are not compatible");
5122 }
5123 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5124 {
5125 error ("cdecl and thiscall attributes are not compatible");
5126 }
5127 }
5128 else if (is_attribute_p ("thiscall", name))
5129 {
5130 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5131 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5132 name);
5133 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5134 {
5135 error ("stdcall and thiscall attributes are not compatible");
5136 }
5137 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5138 {
5139 error ("fastcall and thiscall attributes are not compatible");
5140 }
5141 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5142 {
5143 error ("cdecl and thiscall attributes are not compatible");
5144 }
5145 }
5146
5147 /* Can combine sseregparm with all attributes. */
5148
5149 return NULL_TREE;
5150 }
5151
5152 /* The transactional memory builtins are implicitly regparm or fastcall
5153 depending on the ABI. Override the generic do-nothing attribute that
5154 these builtins were declared with, and replace it with one of the two
5155 attributes that we expect elsewhere. */
5156
5157 static tree
5158 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5159 tree args ATTRIBUTE_UNUSED,
5160 int flags ATTRIBUTE_UNUSED,
5161 bool *no_add_attrs)
5162 {
5163 tree alt;
5164
5165 /* In no case do we want to add the placeholder attribute. */
5166 *no_add_attrs = true;
5167
5168 /* The 64-bit ABI is unchanged for transactional memory. */
5169 if (TARGET_64BIT)
5170 return NULL_TREE;
5171
5172 /* ??? Is there a better way to validate 32-bit windows? We have
5173 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5174 if (CHECK_STACK_LIMIT > 0)
5175 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5176 else
5177 {
5178 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5179 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5180 }
5181 decl_attributes (node, alt, flags);
5182
5183 return NULL_TREE;
5184 }
5185
5186 /* This function determines from TYPE the calling-convention. */
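/* Rough sketch of the result (illustrative): the return value is a bitmask
   of IX86_CALLCVT_* flags, e.g. a 32-bit prototype declared
   __attribute__ ((stdcall, regparm (2))) yields
   IX86_CALLCVT_STDCALL | IX86_CALLCVT_REGPARM, while 64-bit code always
   gets IX86_CALLCVT_CDECL.  */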
5187
5188 unsigned int
5189 ix86_get_callcvt (const_tree type)
5190 {
5191 unsigned int ret = 0;
5192 bool is_stdarg;
5193 tree attrs;
5194
5195 if (TARGET_64BIT)
5196 return IX86_CALLCVT_CDECL;
5197
5198 attrs = TYPE_ATTRIBUTES (type);
5199 if (attrs != NULL_TREE)
5200 {
5201 if (lookup_attribute ("cdecl", attrs))
5202 ret |= IX86_CALLCVT_CDECL;
5203 else if (lookup_attribute ("stdcall", attrs))
5204 ret |= IX86_CALLCVT_STDCALL;
5205 else if (lookup_attribute ("fastcall", attrs))
5206 ret |= IX86_CALLCVT_FASTCALL;
5207 else if (lookup_attribute ("thiscall", attrs))
5208 ret |= IX86_CALLCVT_THISCALL;
5209
5210 /* Regparm isn't allowed for thiscall and fastcall. */
5211 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5212 {
5213 if (lookup_attribute ("regparm", attrs))
5214 ret |= IX86_CALLCVT_REGPARM;
5215 if (lookup_attribute ("sseregparm", attrs))
5216 ret |= IX86_CALLCVT_SSEREGPARM;
5217 }
5218
5219 if (IX86_BASE_CALLCVT(ret) != 0)
5220 return ret;
5221 }
5222
5223 is_stdarg = stdarg_p (type);
5224 if (TARGET_RTD && !is_stdarg)
5225 return IX86_CALLCVT_STDCALL | ret;
5226
5227 if (ret != 0
5228 || is_stdarg
5229 || TREE_CODE (type) != METHOD_TYPE
5230 || ix86_function_type_abi (type) != MS_ABI)
5231 return IX86_CALLCVT_CDECL | ret;
5232
5233 return IX86_CALLCVT_THISCALL;
5234 }
5235
5236 /* Return 0 if the attributes for two types are incompatible, 1 if they
5237 are compatible, and 2 if they are nearly compatible (which causes a
5238 warning to be generated). */
5239
5240 static int
5241 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5242 {
5243 unsigned int ccvt1, ccvt2;
5244
5245 if (TREE_CODE (type1) != FUNCTION_TYPE
5246 && TREE_CODE (type1) != METHOD_TYPE)
5247 return 1;
5248
5249 ccvt1 = ix86_get_callcvt (type1);
5250 ccvt2 = ix86_get_callcvt (type2);
5251 if (ccvt1 != ccvt2)
5252 return 0;
5253 if (ix86_function_regparm (type1, NULL)
5254 != ix86_function_regparm (type2, NULL))
5255 return 0;
5256
5257 return 1;
5258 }
5259 \f
5260 /* Return the regparm value for a function with the indicated TYPE and DECL.
5261 DECL may be NULL when calling function indirectly
5262 or considering a libcall. */
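/* Illustrative example (not from the original source): for

     __attribute__ ((regparm (2))) int f (int a, int b);

   this returns 2, so A and B are passed in %eax and %edx instead of on the
   stack; fastcall yields 2 and thiscall yields 1, as hard-coded below.  */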
5263
5264 static int
5265 ix86_function_regparm (const_tree type, const_tree decl)
5266 {
5267 tree attr;
5268 int regparm;
5269 unsigned int ccvt;
5270
5271 if (TARGET_64BIT)
5272 return (ix86_function_type_abi (type) == SYSV_ABI
5273 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5274 ccvt = ix86_get_callcvt (type);
5275 regparm = ix86_regparm;
5276
5277 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5278 {
5279 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5280 if (attr)
5281 {
5282 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5283 return regparm;
5284 }
5285 }
5286 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5287 return 2;
5288 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5289 return 1;
5290
5291 /* Use register calling convention for local functions when possible. */
5292 if (decl
5293 && TREE_CODE (decl) == FUNCTION_DECL
5294 && optimize
5295 && !(profile_flag && !flag_fentry))
5296 {
5297 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5298 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5299 if (i && i->local && i->can_change_signature)
5300 {
5301 int local_regparm, globals = 0, regno;
5302
5303 /* Make sure no regparm register is taken by a
5304 fixed register variable. */
5305 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5306 if (fixed_regs[local_regparm])
5307 break;
5308
5309 /* We don't want to use regparm(3) for nested functions as
5310 these use a static chain pointer in the third argument. */
5311 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5312 local_regparm = 2;
5313
5314 /* In 32-bit mode save a register for the split stack. */
5315 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5316 local_regparm = 2;
5317
5318 /* Each fixed register usage increases register pressure,
5319 so fewer registers should be used for argument passing.
5320 This functionality can be overridden by an explicit
5321 regparm value. */
5322 for (regno = AX_REG; regno <= DI_REG; regno++)
5323 if (fixed_regs[regno])
5324 globals++;
5325
5326 local_regparm
5327 = globals < local_regparm ? local_regparm - globals : 0;
5328
5329 if (local_regparm > regparm)
5330 regparm = local_regparm;
5331 }
5332 }
5333
5334 return regparm;
5335 }
5336
5337 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5338 DFmode (2) arguments in SSE registers for a function with the
5339 indicated TYPE and DECL. DECL may be NULL when calling function
5340 indirectly or considering a libcall. Otherwise return 0. */
5341
5342 static int
5343 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5344 {
5345 gcc_assert (!TARGET_64BIT);
5346
5347 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5348 by the sseregparm attribute. */
5349 if (TARGET_SSEREGPARM
5350 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5351 {
5352 if (!TARGET_SSE)
5353 {
5354 if (warn)
5355 {
5356 if (decl)
5357 error ("calling %qD with attribute sseregparm without "
5358 "SSE/SSE2 enabled", decl);
5359 else
5360 error ("calling %qT with attribute sseregparm without "
5361 "SSE/SSE2 enabled", type);
5362 }
5363 return 0;
5364 }
5365
5366 return 2;
5367 }
5368
5369 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5370 (and DFmode for SSE2) arguments in SSE registers. */
5371 if (decl && TARGET_SSE_MATH && optimize
5372 && !(profile_flag && !flag_fentry))
5373 {
5374 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5375 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5376 if (i && i->local && i->can_change_signature)
5377 return TARGET_SSE2 ? 2 : 1;
5378 }
5379
5380 return 0;
5381 }
5382
5383 /* Return true if EAX is live at the start of the function. Used by
5384 ix86_expand_prologue to determine if we need special help before
5385 calling allocate_stack_worker. */
5386
5387 static bool
5388 ix86_eax_live_at_start_p (void)
5389 {
5390 /* Cheat. Don't bother working forward from ix86_function_regparm
5391 to the function type to whether an actual argument is located in
5392 eax. Instead just look at cfg info, which is still close enough
5393 to correct at this point. This gives false positives for broken
5394 functions that might use uninitialized data that happens to be
5395 allocated in eax, but who cares? */
5396 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5397 }
5398
5399 static bool
5400 ix86_keep_aggregate_return_pointer (tree fntype)
5401 {
5402 tree attr;
5403
5404 if (!TARGET_64BIT)
5405 {
5406 attr = lookup_attribute ("callee_pop_aggregate_return",
5407 TYPE_ATTRIBUTES (fntype));
5408 if (attr)
5409 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5410
5411 /* For 32-bit MS-ABI the default is to keep aggregate
5412 return pointer. */
5413 if (ix86_function_type_abi (fntype) == MS_ABI)
5414 return true;
5415 }
5416 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5417 }
5418
5419 /* Value is the number of bytes of arguments automatically
5420 popped when returning from a subroutine call.
5421 FUNDECL is the declaration node of the function (as a tree),
5422 FUNTYPE is the data type of the function (as a tree),
5423 or for a library call it is an identifier node for the subroutine name.
5424 SIZE is the number of bytes of arguments passed on the stack.
5425
5426 On the 80386, the RTD insn may be used to pop them if the number
5427 of args is fixed, but if the number is variable then the caller
5428 must pop them all. RTD can't be used for library calls now
5429 because the library is compiled with the Unix compiler.
5430 Use of RTD is a selectable option, since it is incompatible with
5431 standard Unix calling sequences. If the option is not selected,
5432 the caller must always pop the args.
5433
5434 The attribute stdcall is equivalent to RTD on a per module basis. */
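/* Worked example (illustrative, not from the original source): a 32-bit
   function declared

     void __attribute__ ((stdcall)) f (int a, int b);

   has SIZE == 8 bytes of stack arguments, so this hook returns 8 and the
   callee pops them with "ret $8"; a varargs or 64-bit function returns 0.  */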
5435
5436 static int
5437 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5438 {
5439 unsigned int ccvt;
5440
5441 /* None of the 64-bit ABIs pop arguments. */
5442 if (TARGET_64BIT)
5443 return 0;
5444
5445 ccvt = ix86_get_callcvt (funtype);
5446
5447 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5448 | IX86_CALLCVT_THISCALL)) != 0
5449 && ! stdarg_p (funtype))
5450 return size;
5451
5452 /* Lose any fake structure return argument if it is passed on the stack. */
5453 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5454 && !ix86_keep_aggregate_return_pointer (funtype))
5455 {
5456 int nregs = ix86_function_regparm (funtype, fundecl);
5457 if (nregs == 0)
5458 return GET_MODE_SIZE (Pmode);
5459 }
5460
5461 return 0;
5462 }
5463
5464 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5465
5466 static bool
5467 ix86_legitimate_combined_insn (rtx insn)
5468 {
5469 /* Check operand constraints in case hard registers were propagated
5470 into insn pattern. This check prevents combine pass from
5471 generating insn patterns with invalid hard register operands.
5472 These invalid insns can eventually confuse reload to error out
5473 with a spill failure. See also PRs 46829 and 46843. */
5474 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5475 {
5476 int i;
5477
5478 extract_insn (insn);
5479 preprocess_constraints ();
5480
5481 for (i = 0; i < recog_data.n_operands; i++)
5482 {
5483 rtx op = recog_data.operand[i];
5484 enum machine_mode mode = GET_MODE (op);
5485 struct operand_alternative *op_alt;
5486 int offset = 0;
5487 bool win;
5488 int j;
5489
5490 /* A unary operator may be accepted by the predicate, but it
5491 is irrelevant for matching constraints. */
5492 if (UNARY_P (op))
5493 op = XEXP (op, 0);
5494
5495 if (GET_CODE (op) == SUBREG)
5496 {
5497 if (REG_P (SUBREG_REG (op))
5498 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5499 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5500 GET_MODE (SUBREG_REG (op)),
5501 SUBREG_BYTE (op),
5502 GET_MODE (op));
5503 op = SUBREG_REG (op);
5504 }
5505
5506 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5507 continue;
5508
5509 op_alt = recog_op_alt[i];
5510
5511 /* Operand has no constraints, anything is OK. */
5512 win = !recog_data.n_alternatives;
5513
5514 for (j = 0; j < recog_data.n_alternatives; j++)
5515 {
5516 if (op_alt[j].anything_ok
5517 || (op_alt[j].matches != -1
5518 && operands_match_p
5519 (recog_data.operand[i],
5520 recog_data.operand[op_alt[j].matches]))
5521 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5522 {
5523 win = true;
5524 break;
5525 }
5526 }
5527
5528 if (!win)
5529 return false;
5530 }
5531 }
5532
5533 return true;
5534 }
5535 \f
5536 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
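/* Rough sketch of how the offset is used (an assumption for illustration):
   AddressSanitizer computes the shadow address of a byte as

     shadow = (addr >> 3) + ix86_asan_shadow_offset ()

   so LP64 Linux uses 0x7fff8000 while targets with 32-bit pointers use
   1 << 29.  */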
5537
5538 static unsigned HOST_WIDE_INT
5539 ix86_asan_shadow_offset (void)
5540 {
5541 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5542 : HOST_WIDE_INT_C (0x7fff8000))
5543 : (HOST_WIDE_INT_1 << 29);
5544 }
5545 \f
5546 /* Argument support functions. */
5547
5548 /* Return true when register may be used to pass function parameters. */
5549 bool
5550 ix86_function_arg_regno_p (int regno)
5551 {
5552 int i;
5553 const int *parm_regs;
5554
5555 if (!TARGET_64BIT)
5556 {
5557 if (TARGET_MACHO)
5558 return (regno < REGPARM_MAX
5559 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5560 else
5561 return (regno < REGPARM_MAX
5562 || (TARGET_MMX && MMX_REGNO_P (regno)
5563 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5564 || (TARGET_SSE && SSE_REGNO_P (regno)
5565 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5566 }
5567
5568 if (TARGET_MACHO)
5569 {
5570 if (SSE_REGNO_P (regno) && TARGET_SSE)
5571 return true;
5572 }
5573 else
5574 {
5575 if (TARGET_SSE && SSE_REGNO_P (regno)
5576 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5577 return true;
5578 }
5579
5580 /* TODO: The function should depend on the current function's ABI, but
5581 builtins.c would then need updating. Therefore we use the
5582 default ABI. */
5583
5584 /* RAX is used as hidden argument to va_arg functions. */
5585 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5586 return true;
5587
5588 if (ix86_abi == MS_ABI)
5589 parm_regs = x86_64_ms_abi_int_parameter_registers;
5590 else
5591 parm_regs = x86_64_int_parameter_registers;
5592 for (i = 0; i < (ix86_abi == MS_ABI
5593 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5594 if (regno == parm_regs[i])
5595 return true;
5596 return false;
5597 }
5598
5599 /* Return true if we do not know how to pass TYPE solely in registers. */
5600
5601 static bool
5602 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5603 {
5604 if (must_pass_in_stack_var_size_or_pad (mode, type))
5605 return true;
5606
5607 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5608 The layout_type routine is crafty and tries to trick us into passing
5609 currently unsupported vector types on the stack by using TImode. */
5610 return (!TARGET_64BIT && mode == TImode
5611 && type && TREE_CODE (type) != VECTOR_TYPE);
5612 }
5613
5614 /* Return the size, in bytes, of the area reserved for arguments passed
5615 in registers for the function represented by FNDECL, depending on the
5616 ABI used. */
5617 int
5618 ix86_reg_parm_stack_space (const_tree fndecl)
5619 {
5620 enum calling_abi call_abi = SYSV_ABI;
5621 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5622 call_abi = ix86_function_abi (fndecl);
5623 else
5624 call_abi = ix86_function_type_abi (fndecl);
5625 if (TARGET_64BIT && call_abi == MS_ABI)
5626 return 32;
5627 return 0;
5628 }
5629
5630 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5631 call abi used. */
5632 enum calling_abi
5633 ix86_function_type_abi (const_tree fntype)
5634 {
5635 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5636 {
5637 enum calling_abi abi = ix86_abi;
5638 if (abi == SYSV_ABI)
5639 {
5640 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5641 abi = MS_ABI;
5642 }
5643 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5644 abi = SYSV_ABI;
5645 return abi;
5646 }
5647 return ix86_abi;
5648 }
5649
5650 static bool
5651 ix86_function_ms_hook_prologue (const_tree fn)
5652 {
5653 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5654 {
5655 if (decl_function_context (fn) != NULL_TREE)
5656 error_at (DECL_SOURCE_LOCATION (fn),
5657 "ms_hook_prologue is not compatible with nested function");
5658 else
5659 return true;
5660 }
5661 return false;
5662 }
5663
5664 static enum calling_abi
5665 ix86_function_abi (const_tree fndecl)
5666 {
5667 if (! fndecl)
5668 return ix86_abi;
5669 return ix86_function_type_abi (TREE_TYPE (fndecl));
5670 }
5671
5672 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5673 call abi used. */
5674 enum calling_abi
5675 ix86_cfun_abi (void)
5676 {
5677 if (! cfun)
5678 return ix86_abi;
5679 return cfun->machine->call_abi;
5680 }
5681
5682 /* Write the extra assembler code needed to declare a function properly. */
5683
5684 void
5685 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5686 tree decl)
5687 {
5688 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5689
5690 if (is_ms_hook)
5691 {
5692 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5693 unsigned int filler_cc = 0xcccccccc;
5694
5695 for (i = 0; i < filler_count; i += 4)
5696 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5697 }
5698
5699 #ifdef SUBTARGET_ASM_UNWIND_INIT
5700 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5701 #endif
5702
5703 ASM_OUTPUT_LABEL (asm_out_file, fname);
5704
5705 /* Output magic byte marker, if hot-patch attribute is set. */
5706 if (is_ms_hook)
5707 {
5708 if (TARGET_64BIT)
5709 {
5710 /* leaq [%rsp + 0], %rsp */
5711 asm_fprintf (asm_out_file, ASM_BYTE
5712 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5713 }
5714 else
5715 {
5716 /* movl.s %edi, %edi
5717 push %ebp
5718 movl.s %esp, %ebp */
5719 asm_fprintf (asm_out_file, ASM_BYTE
5720 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5721 }
5722 }
5723 }
5724
5725 /* regclass.c */
5726 extern void init_regs (void);
5727
5728 /* Implementation of call abi switching target hook. Specific to FNDECL
5729 the specific call register sets are set. See also
5730 ix86_conditional_register_usage for more details. */
5731 void
5732 ix86_call_abi_override (const_tree fndecl)
5733 {
5734 if (fndecl == NULL_TREE)
5735 cfun->machine->call_abi = ix86_abi;
5736 else
5737 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5738 }
5739
5740 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5741 Avoid expensive re-initialization of init_regs each time we switch function
5742 context, since this is needed only during RTL expansion. */
5743 static void
5744 ix86_maybe_switch_abi (void)
5745 {
5746 if (TARGET_64BIT &&
5747 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5748 reinit_regs ();
5749 }
5750
5751 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5752 for a call to a function whose data type is FNTYPE.
5753 For a library call, FNTYPE is 0. */
5754
5755 void
5756 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5757 tree fntype, /* tree ptr for function decl */
5758 rtx libname, /* SYMBOL_REF of library name or 0 */
5759 tree fndecl,
5760 int caller)
5761 {
5762 struct cgraph_local_info *i;
5763
5764 memset (cum, 0, sizeof (*cum));
5765
5766 if (fndecl)
5767 {
5768 i = cgraph_local_info (fndecl);
5769 cum->call_abi = ix86_function_abi (fndecl);
5770 }
5771 else
5772 {
5773 i = NULL;
5774 cum->call_abi = ix86_function_type_abi (fntype);
5775 }
5776
5777 cum->caller = caller;
5778
5779 /* Set up the number of registers to use for passing arguments. */
5780
5781 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5782 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5783 "or subtarget optimization implying it");
5784 cum->nregs = ix86_regparm;
5785 if (TARGET_64BIT)
5786 {
5787 cum->nregs = (cum->call_abi == SYSV_ABI
5788 ? X86_64_REGPARM_MAX
5789 : X86_64_MS_REGPARM_MAX);
5790 }
5791 if (TARGET_SSE)
5792 {
5793 cum->sse_nregs = SSE_REGPARM_MAX;
5794 if (TARGET_64BIT)
5795 {
5796 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5797 ? X86_64_SSE_REGPARM_MAX
5798 : X86_64_MS_SSE_REGPARM_MAX);
5799 }
5800 }
5801 if (TARGET_MMX)
5802 cum->mmx_nregs = MMX_REGPARM_MAX;
5803 cum->warn_avx = true;
5804 cum->warn_sse = true;
5805 cum->warn_mmx = true;
5806
5807 /* Because the type might mismatch between caller and callee, we need to
5808 use the actual type of the function for local calls.
5809 FIXME: cgraph_analyze can be told to actually record whether a function
5810 uses va_start, so for local functions maybe_vaarg can be made more
5811 aggressive, helping K&R code.
5812 FIXME: once the type system is fixed, we won't need this code anymore. */
5813 if (i && i->local && i->can_change_signature)
5814 fntype = TREE_TYPE (fndecl);
5815 cum->maybe_vaarg = (fntype
5816 ? (!prototype_p (fntype) || stdarg_p (fntype))
5817 : !libname);
5818
5819 if (!TARGET_64BIT)
5820 {
5821 /* If there are variable arguments, then we won't pass anything
5822 in registers in 32-bit mode. */
5823 if (stdarg_p (fntype))
5824 {
5825 cum->nregs = 0;
5826 cum->sse_nregs = 0;
5827 cum->mmx_nregs = 0;
5828 cum->warn_avx = 0;
5829 cum->warn_sse = 0;
5830 cum->warn_mmx = 0;
5831 return;
5832 }
5833
5834 /* Use ecx and edx registers if function has fastcall attribute,
5835 else look for regparm information. */
5836 if (fntype)
5837 {
5838 unsigned int ccvt = ix86_get_callcvt (fntype);
5839 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5840 {
5841 cum->nregs = 1;
5842 cum->fastcall = 1; /* Same first register as in fastcall. */
5843 }
5844 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5845 {
5846 cum->nregs = 2;
5847 cum->fastcall = 1;
5848 }
5849 else
5850 cum->nregs = ix86_function_regparm (fntype, fndecl);
5851 }
5852
5853 /* Set up the number of SSE registers used for passing SFmode
5854 and DFmode arguments. Warn for mismatching ABI. */
5855 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5856 }
5857 }
5858
5859 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5860 But in the case of vector types, it is some vector mode.
5861
5862 When we have only some of our vector isa extensions enabled, then there
5863 are some modes for which vector_mode_supported_p is false. For these
5864 modes, the generic vector support in gcc will choose some non-vector mode
5865 in order to implement the type. By computing the natural mode, we'll
5866 select the proper ABI location for the operand and not depend on whatever
5867 the middle-end decides to do with these vector types.
5868
5869 The middle-end can't deal with vector types > 16 bytes. In this
5870 case, we return the original mode and warn of the ABI change if CUM
5871 isn't NULL. */
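/* Illustrative example (not from the original source): with only generic
   vector support, a type such as

     typedef float v4f __attribute__ ((vector_size (16)));

   may have a non-vector TYPE_MODE; this function maps it back to the
   corresponding vector mode (here V4SFmode) so the ABI classification
   below sees a vector rather than a BLKmode aggregate.  */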
5872
5873 static enum machine_mode
5874 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5875 {
5876 enum machine_mode mode = TYPE_MODE (type);
5877
5878 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5879 {
5880 HOST_WIDE_INT size = int_size_in_bytes (type);
5881 if ((size == 8 || size == 16 || size == 32)
5882 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5883 && TYPE_VECTOR_SUBPARTS (type) > 1)
5884 {
5885 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5886
5887 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5888 mode = MIN_MODE_VECTOR_FLOAT;
5889 else
5890 mode = MIN_MODE_VECTOR_INT;
5891
5892 /* Get the mode which has this inner mode and number of units. */
5893 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5894 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5895 && GET_MODE_INNER (mode) == innermode)
5896 {
5897 if (size == 32 && !TARGET_AVX)
5898 {
5899 static bool warnedavx;
5900
5901 if (cum
5902 && !warnedavx
5903 && cum->warn_avx)
5904 {
5905 warnedavx = true;
5906 warning (0, "AVX vector argument without AVX "
5907 "enabled changes the ABI");
5908 }
5909 return TYPE_MODE (type);
5910 }
5911 else if ((size == 8 || size == 16) && !TARGET_SSE)
5912 {
5913 static bool warnedsse;
5914
5915 if (cum
5916 && !warnedsse
5917 && cum->warn_sse)
5918 {
5919 warnedsse = true;
5920 warning (0, "SSE vector argument without SSE "
5921 "enabled changes the ABI");
5922 }
5923 return mode;
5924 }
5925 else
5926 return mode;
5927 }
5928
5929 gcc_unreachable ();
5930 }
5931 }
5932
5933 return mode;
5934 }
5935
5936 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5937 this may not agree with the mode that the type system has chosen for the
5938 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5939 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5940
5941 static rtx
5942 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5943 unsigned int regno)
5944 {
5945 rtx tmp;
5946
5947 if (orig_mode != BLKmode)
5948 tmp = gen_rtx_REG (orig_mode, regno);
5949 else
5950 {
5951 tmp = gen_rtx_REG (mode, regno);
5952 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5953 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5954 }
5955
5956 return tmp;
5957 }
5958
5959 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5960 The goal of this code is to classify each eightbyte of the incoming argument
5961 by register class and assign registers accordingly. */
5962
5963 /* Return the union class of CLASS1 and CLASS2.
5964 See the x86-64 PS ABI for details. */
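/* Small worked example (illustrative): merging X86_64_INTEGER_CLASS with
   X86_64_SSE_CLASS yields X86_64_INTEGER_CLASS (rule #4), while merging
   X86_64_SSE_CLASS with X86_64_SSEUP_CLASS yields X86_64_SSE_CLASS
   (rule #6).  */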
5965
5966 static enum x86_64_reg_class
5967 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5968 {
5969 /* Rule #1: If both classes are equal, this is the resulting class. */
5970 if (class1 == class2)
5971 return class1;
5972
5973 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5974 the other class. */
5975 if (class1 == X86_64_NO_CLASS)
5976 return class2;
5977 if (class2 == X86_64_NO_CLASS)
5978 return class1;
5979
5980 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5981 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5982 return X86_64_MEMORY_CLASS;
5983
5984 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5985 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5986 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5987 return X86_64_INTEGERSI_CLASS;
5988 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5989 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5990 return X86_64_INTEGER_CLASS;
5991
5992 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5993 MEMORY is used. */
5994 if (class1 == X86_64_X87_CLASS
5995 || class1 == X86_64_X87UP_CLASS
5996 || class1 == X86_64_COMPLEX_X87_CLASS
5997 || class2 == X86_64_X87_CLASS
5998 || class2 == X86_64_X87UP_CLASS
5999 || class2 == X86_64_COMPLEX_X87_CLASS)
6000 return X86_64_MEMORY_CLASS;
6001
6002 /* Rule #6: Otherwise class SSE is used. */
6003 return X86_64_SSE_CLASS;
6004 }
6005
6006 /* Classify the argument of type TYPE and mode MODE.
6007 CLASSES will be filled by the register class used to pass each word
6008 of the operand. The number of words is returned. In case the parameter
6009 should be passed in memory, 0 is returned. As a special case for zero
6010 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6011
6012 BIT_OFFSET is used internally for handling records and specifies the
6013 offset in bits modulo 256 to avoid overflow cases.
6014
6015 See the x86-64 PS ABI for details.
6016 */
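/* Worked example (illustrative, not from the original source): for

     struct s { double d; int i; };

   the first eightbyte (D) is classified X86_64_SSEDF_CLASS and the second
   (I) X86_64_INTEGERSI_CLASS, so the struct is passed in one SSE and one
   integer register; an aggregate larger than 32 bytes returns 0 here and
   is passed in memory.  */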
6017
6018 static int
6019 classify_argument (enum machine_mode mode, const_tree type,
6020 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6021 {
6022 HOST_WIDE_INT bytes =
6023 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6024 int words
6025 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6026
6027 /* Variable sized entities are always passed/returned in memory. */
6028 if (bytes < 0)
6029 return 0;
6030
6031 if (mode != VOIDmode
6032 && targetm.calls.must_pass_in_stack (mode, type))
6033 return 0;
6034
6035 if (type && AGGREGATE_TYPE_P (type))
6036 {
6037 int i;
6038 tree field;
6039 enum x86_64_reg_class subclasses[MAX_CLASSES];
6040
6041 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6042 if (bytes > 32)
6043 return 0;
6044
6045 for (i = 0; i < words; i++)
6046 classes[i] = X86_64_NO_CLASS;
6047
6048 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6049 signal the memory class, so handle it as a special case. */
6050 if (!words)
6051 {
6052 classes[0] = X86_64_NO_CLASS;
6053 return 1;
6054 }
6055
6056 /* Classify each field of record and merge classes. */
6057 switch (TREE_CODE (type))
6058 {
6059 case RECORD_TYPE:
6060 /* And now merge the fields of structure. */
6061 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6062 {
6063 if (TREE_CODE (field) == FIELD_DECL)
6064 {
6065 int num;
6066
6067 if (TREE_TYPE (field) == error_mark_node)
6068 continue;
6069
6070 /* Bitfields are always classified as integer. Handle them
6071 early, since later code would consider them to be
6072 misaligned integers. */
6073 if (DECL_BIT_FIELD (field))
6074 {
6075 for (i = (int_bit_position (field)
6076 + (bit_offset % 64)) / 8 / 8;
6077 i < ((int_bit_position (field) + (bit_offset % 64))
6078 + tree_low_cst (DECL_SIZE (field), 0)
6079 + 63) / 8 / 8; i++)
6080 classes[i] =
6081 merge_classes (X86_64_INTEGER_CLASS,
6082 classes[i]);
6083 }
6084 else
6085 {
6086 int pos;
6087
6088 type = TREE_TYPE (field);
6089
6090 /* Flexible array member is ignored. */
6091 if (TYPE_MODE (type) == BLKmode
6092 && TREE_CODE (type) == ARRAY_TYPE
6093 && TYPE_SIZE (type) == NULL_TREE
6094 && TYPE_DOMAIN (type) != NULL_TREE
6095 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6096 == NULL_TREE))
6097 {
6098 static bool warned;
6099
6100 if (!warned && warn_psabi)
6101 {
6102 warned = true;
6103 inform (input_location,
6104 "the ABI of passing struct with"
6105 " a flexible array member has"
6106 " changed in GCC 4.4");
6107 }
6108 continue;
6109 }
6110 num = classify_argument (TYPE_MODE (type), type,
6111 subclasses,
6112 (int_bit_position (field)
6113 + bit_offset) % 256);
6114 if (!num)
6115 return 0;
6116 pos = (int_bit_position (field)
6117 + (bit_offset % 64)) / 8 / 8;
6118 for (i = 0; i < num && (i + pos) < words; i++)
6119 classes[i + pos] =
6120 merge_classes (subclasses[i], classes[i + pos]);
6121 }
6122 }
6123 }
6124 break;
6125
6126 case ARRAY_TYPE:
6127 /* Arrays are handled as small records. */
6128 {
6129 int num;
6130 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6131 TREE_TYPE (type), subclasses, bit_offset);
6132 if (!num)
6133 return 0;
6134
6135 /* The partial classes are now full classes. */
6136 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6137 subclasses[0] = X86_64_SSE_CLASS;
6138 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6139 && !((bit_offset % 64) == 0 && bytes == 4))
6140 subclasses[0] = X86_64_INTEGER_CLASS;
6141
6142 for (i = 0; i < words; i++)
6143 classes[i] = subclasses[i % num];
6144
6145 break;
6146 }
6147 case UNION_TYPE:
6148 case QUAL_UNION_TYPE:
6149 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6151 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6152 {
6153 if (TREE_CODE (field) == FIELD_DECL)
6154 {
6155 int num;
6156
6157 if (TREE_TYPE (field) == error_mark_node)
6158 continue;
6159
6160 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6161 TREE_TYPE (field), subclasses,
6162 bit_offset);
6163 if (!num)
6164 return 0;
6165 for (i = 0; i < num; i++)
6166 classes[i] = merge_classes (subclasses[i], classes[i]);
6167 }
6168 }
6169 break;
6170
6171 default:
6172 gcc_unreachable ();
6173 }
6174
6175 if (words > 2)
6176 {
6177 /* When size > 16 bytes, if the first one isn't
6178 X86_64_SSE_CLASS or any other ones aren't
6179 X86_64_SSEUP_CLASS, everything should be passed in
6180 memory. */
6181 if (classes[0] != X86_64_SSE_CLASS)
6182 return 0;
6183
6184 for (i = 1; i < words; i++)
6185 if (classes[i] != X86_64_SSEUP_CLASS)
6186 return 0;
6187 }
6188
6189 /* Final merger cleanup. */
6190 for (i = 0; i < words; i++)
6191 {
6192 /* If one class is MEMORY, everything should be passed in
6193 memory. */
6194 if (classes[i] == X86_64_MEMORY_CLASS)
6195 return 0;
6196
6197 /* The X86_64_SSEUP_CLASS should be always preceded by
6198 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6199 if (classes[i] == X86_64_SSEUP_CLASS
6200 && classes[i - 1] != X86_64_SSE_CLASS
6201 && classes[i - 1] != X86_64_SSEUP_CLASS)
6202 {
6203 /* The first one should never be X86_64_SSEUP_CLASS. */
6204 gcc_assert (i != 0);
6205 classes[i] = X86_64_SSE_CLASS;
6206 }
6207
6208 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6209 everything should be passed in memory. */
6210 if (classes[i] == X86_64_X87UP_CLASS
6211 && (classes[i - 1] != X86_64_X87_CLASS))
6212 {
6213 static bool warned;
6214
6215 /* The first one should never be X86_64_X87UP_CLASS. */
6216 gcc_assert (i != 0);
6217 if (!warned && warn_psabi)
6218 {
6219 warned = true;
6220 inform (input_location,
6221 "the ABI of passing union with long double"
6222 " has changed in GCC 4.4");
6223 }
6224 return 0;
6225 }
6226 }
6227 return words;
6228 }
6229
6230 /* Compute alignment needed. We align all types to natural boundaries with
6231 exception of XFmode that is aligned to 64bits. */
6232 if (mode != VOIDmode && mode != BLKmode)
6233 {
6234 int mode_alignment = GET_MODE_BITSIZE (mode);
6235
6236 if (mode == XFmode)
6237 mode_alignment = 128;
6238 else if (mode == XCmode)
6239 mode_alignment = 256;
6240 if (COMPLEX_MODE_P (mode))
6241 mode_alignment /= 2;
6242 /* Misaligned fields are always returned in memory. */
6243 if (bit_offset % mode_alignment)
6244 return 0;
6245 }
6246
6247 /* For V1xx modes, just use the base mode. */
6248 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6249 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6250 mode = GET_MODE_INNER (mode);
6251
6252 /* Classification of atomic types. */
6253 switch (mode)
6254 {
6255 case SDmode:
6256 case DDmode:
6257 classes[0] = X86_64_SSE_CLASS;
6258 return 1;
6259 case TDmode:
6260 classes[0] = X86_64_SSE_CLASS;
6261 classes[1] = X86_64_SSEUP_CLASS;
6262 return 2;
6263 case DImode:
6264 case SImode:
6265 case HImode:
6266 case QImode:
6267 case CSImode:
6268 case CHImode:
6269 case CQImode:
6270 {
6271 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6272
6273 if (size <= 32)
6274 {
6275 classes[0] = X86_64_INTEGERSI_CLASS;
6276 return 1;
6277 }
6278 else if (size <= 64)
6279 {
6280 classes[0] = X86_64_INTEGER_CLASS;
6281 return 1;
6282 }
6283 else if (size <= 64+32)
6284 {
6285 classes[0] = X86_64_INTEGER_CLASS;
6286 classes[1] = X86_64_INTEGERSI_CLASS;
6287 return 2;
6288 }
6289 else if (size <= 64+64)
6290 {
6291 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6292 return 2;
6293 }
6294 else
6295 gcc_unreachable ();
6296 }
6297 case CDImode:
6298 case TImode:
6299 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6300 return 2;
6301 case COImode:
6302 case OImode:
6303 /* OImode shouldn't be used directly. */
6304 gcc_unreachable ();
6305 case CTImode:
6306 return 0;
6307 case SFmode:
6308 if (!(bit_offset % 64))
6309 classes[0] = X86_64_SSESF_CLASS;
6310 else
6311 classes[0] = X86_64_SSE_CLASS;
6312 return 1;
6313 case DFmode:
6314 classes[0] = X86_64_SSEDF_CLASS;
6315 return 1;
6316 case XFmode:
6317 classes[0] = X86_64_X87_CLASS;
6318 classes[1] = X86_64_X87UP_CLASS;
6319 return 2;
6320 case TFmode:
6321 classes[0] = X86_64_SSE_CLASS;
6322 classes[1] = X86_64_SSEUP_CLASS;
6323 return 2;
6324 case SCmode:
6325 classes[0] = X86_64_SSE_CLASS;
6326 if (!(bit_offset % 64))
6327 return 1;
6328 else
6329 {
6330 static bool warned;
6331
6332 if (!warned && warn_psabi)
6333 {
6334 warned = true;
6335 inform (input_location,
6336 "the ABI of passing structure with complex float"
6337 " member has changed in GCC 4.4");
6338 }
6339 classes[1] = X86_64_SSESF_CLASS;
6340 return 2;
6341 }
6342 case DCmode:
6343 classes[0] = X86_64_SSEDF_CLASS;
6344 classes[1] = X86_64_SSEDF_CLASS;
6345 return 2;
6346 case XCmode:
6347 classes[0] = X86_64_COMPLEX_X87_CLASS;
6348 return 1;
6349 case TCmode:
6350 /* This mode is larger than 16 bytes. */
6351 return 0;
6352 case V8SFmode:
6353 case V8SImode:
6354 case V32QImode:
6355 case V16HImode:
6356 case V4DFmode:
6357 case V4DImode:
6358 classes[0] = X86_64_SSE_CLASS;
6359 classes[1] = X86_64_SSEUP_CLASS;
6360 classes[2] = X86_64_SSEUP_CLASS;
6361 classes[3] = X86_64_SSEUP_CLASS;
6362 return 4;
6363 case V4SFmode:
6364 case V4SImode:
6365 case V16QImode:
6366 case V8HImode:
6367 case V2DFmode:
6368 case V2DImode:
6369 classes[0] = X86_64_SSE_CLASS;
6370 classes[1] = X86_64_SSEUP_CLASS;
6371 return 2;
6372 case V1TImode:
6373 case V1DImode:
6374 case V2SFmode:
6375 case V2SImode:
6376 case V4HImode:
6377 case V8QImode:
6378 classes[0] = X86_64_SSE_CLASS;
6379 return 1;
6380 case BLKmode:
6381 case VOIDmode:
6382 return 0;
6383 default:
6384 gcc_assert (VECTOR_MODE_P (mode));
6385
6386 if (bytes > 16)
6387 return 0;
6388
6389 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6390
6391 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6392 classes[0] = X86_64_INTEGERSI_CLASS;
6393 else
6394 classes[0] = X86_64_INTEGER_CLASS;
6395 classes[1] = X86_64_INTEGER_CLASS;
6396 return 1 + (bytes > 8);
6397 }
6398 }
6399
6400 /* Examine the argument and set the number of registers required in each
6401 class. Return 0 iff the parameter should be passed in memory. */
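/* Illustrative follow-up (not from the original source): for the struct in
   the example above this sets *INT_NREGS = 1 and *SSE_NREGS = 1; for a
   "long double" argument it returns 0 unless IN_RETURN is set, since the
   x87 classes are only valid for return values.  */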
6402 static int
6403 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6404 int *int_nregs, int *sse_nregs)
6405 {
6406 enum x86_64_reg_class regclass[MAX_CLASSES];
6407 int n = classify_argument (mode, type, regclass, 0);
6408
6409 *int_nregs = 0;
6410 *sse_nregs = 0;
6411 if (!n)
6412 return 0;
6413 for (n--; n >= 0; n--)
6414 switch (regclass[n])
6415 {
6416 case X86_64_INTEGER_CLASS:
6417 case X86_64_INTEGERSI_CLASS:
6418 (*int_nregs)++;
6419 break;
6420 case X86_64_SSE_CLASS:
6421 case X86_64_SSESF_CLASS:
6422 case X86_64_SSEDF_CLASS:
6423 (*sse_nregs)++;
6424 break;
6425 case X86_64_NO_CLASS:
6426 case X86_64_SSEUP_CLASS:
6427 break;
6428 case X86_64_X87_CLASS:
6429 case X86_64_X87UP_CLASS:
6430 if (!in_return)
6431 return 0;
6432 break;
6433 case X86_64_COMPLEX_X87_CLASS:
6434 return in_return ? 2 : 0;
6435 case X86_64_MEMORY_CLASS:
6436 gcc_unreachable ();
6437 }
6438 return 1;
6439 }
6440
6441 /* Construct container for the argument used by GCC interface. See
6442 FUNCTION_ARG for the detailed description. */
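/* Sketch of the result (illustrative): for the two-eightbyte struct used
   in the examples above this builds a PARALLEL of two EXPR_LIST entries,
   roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:SI di) (const_int 8))])

   describing which piece of the argument lives in which register.  */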
6443
6444 static rtx
6445 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6446 const_tree type, int in_return, int nintregs, int nsseregs,
6447 const int *intreg, int sse_regno)
6448 {
6449 /* The following variables hold the static issued_error state. */
6450 static bool issued_sse_arg_error;
6451 static bool issued_sse_ret_error;
6452 static bool issued_x87_ret_error;
6453
6454 enum machine_mode tmpmode;
6455 int bytes =
6456 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6457 enum x86_64_reg_class regclass[MAX_CLASSES];
6458 int n;
6459 int i;
6460 int nexps = 0;
6461 int needed_sseregs, needed_intregs;
6462 rtx exp[MAX_CLASSES];
6463 rtx ret;
6464
6465 n = classify_argument (mode, type, regclass, 0);
6466 if (!n)
6467 return NULL;
6468 if (!examine_argument (mode, type, in_return, &needed_intregs,
6469 &needed_sseregs))
6470 return NULL;
6471 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6472 return NULL;
6473
6474 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6475 some less clueful developer tries to use floating-point anyway. */
6476 if (needed_sseregs && !TARGET_SSE)
6477 {
6478 if (in_return)
6479 {
6480 if (!issued_sse_ret_error)
6481 {
6482 error ("SSE register return with SSE disabled");
6483 issued_sse_ret_error = true;
6484 }
6485 }
6486 else if (!issued_sse_arg_error)
6487 {
6488 error ("SSE register argument with SSE disabled");
6489 issued_sse_arg_error = true;
6490 }
6491 return NULL;
6492 }
6493
6494 /* Likewise, error if the ABI requires us to return values in the
6495 x87 registers and the user specified -mno-80387. */
6496 if (!TARGET_80387 && in_return)
6497 for (i = 0; i < n; i++)
6498 if (regclass[i] == X86_64_X87_CLASS
6499 || regclass[i] == X86_64_X87UP_CLASS
6500 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6501 {
6502 if (!issued_x87_ret_error)
6503 {
6504 error ("x87 register return with x87 disabled");
6505 issued_x87_ret_error = true;
6506 }
6507 return NULL;
6508 }
6509
6510 /* First construct simple cases. Avoid SCmode, since we want to use a
6511 single register to pass this type. */
6512 if (n == 1 && mode != SCmode)
6513 switch (regclass[0])
6514 {
6515 case X86_64_INTEGER_CLASS:
6516 case X86_64_INTEGERSI_CLASS:
6517 return gen_rtx_REG (mode, intreg[0]);
6518 case X86_64_SSE_CLASS:
6519 case X86_64_SSESF_CLASS:
6520 case X86_64_SSEDF_CLASS:
6521 if (mode != BLKmode)
6522 return gen_reg_or_parallel (mode, orig_mode,
6523 SSE_REGNO (sse_regno));
6524 break;
6525 case X86_64_X87_CLASS:
6526 case X86_64_COMPLEX_X87_CLASS:
6527 return gen_rtx_REG (mode, FIRST_STACK_REG);
6528 case X86_64_NO_CLASS:
6529 /* Zero sized array, struct or class. */
6530 return NULL;
6531 default:
6532 gcc_unreachable ();
6533 }
6534 if (n == 2
6535 && regclass[0] == X86_64_SSE_CLASS
6536 && regclass[1] == X86_64_SSEUP_CLASS
6537 && mode != BLKmode)
6538 return gen_reg_or_parallel (mode, orig_mode,
6539 SSE_REGNO (sse_regno));
6540 if (n == 4
6541 && regclass[0] == X86_64_SSE_CLASS
6542 && regclass[1] == X86_64_SSEUP_CLASS
6543 && regclass[2] == X86_64_SSEUP_CLASS
6544 && regclass[3] == X86_64_SSEUP_CLASS
6545 && mode != BLKmode)
6546 return gen_reg_or_parallel (mode, orig_mode,
6547 SSE_REGNO (sse_regno));
6548 if (n == 2
6549 && regclass[0] == X86_64_X87_CLASS
6550 && regclass[1] == X86_64_X87UP_CLASS)
6551 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6552
6553 if (n == 2
6554 && regclass[0] == X86_64_INTEGER_CLASS
6555 && regclass[1] == X86_64_INTEGER_CLASS
6556 && (mode == CDImode || mode == TImode || mode == TFmode)
6557 && intreg[0] + 1 == intreg[1])
6558 return gen_rtx_REG (mode, intreg[0]);
6559
6560 /* Otherwise figure out the entries of the PARALLEL. */
6561 for (i = 0; i < n; i++)
6562 {
6563 int pos;
6564
6565 switch (regclass[i])
6566 {
6567 case X86_64_NO_CLASS:
6568 break;
6569 case X86_64_INTEGER_CLASS:
6570 case X86_64_INTEGERSI_CLASS:
6571 /* Merge TImodes on aligned occasions here too. */
6572 if (i * 8 + 8 > bytes)
6573 tmpmode
6574 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6575 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6576 tmpmode = SImode;
6577 else
6578 tmpmode = DImode;
6579 /* We've requested 24 bytes for which we don't
6580 have a mode. Use DImode. */
6581 if (tmpmode == BLKmode)
6582 tmpmode = DImode;
6583 exp [nexps++]
6584 = gen_rtx_EXPR_LIST (VOIDmode,
6585 gen_rtx_REG (tmpmode, *intreg),
6586 GEN_INT (i*8));
6587 intreg++;
6588 break;
6589 case X86_64_SSESF_CLASS:
6590 exp [nexps++]
6591 = gen_rtx_EXPR_LIST (VOIDmode,
6592 gen_rtx_REG (SFmode,
6593 SSE_REGNO (sse_regno)),
6594 GEN_INT (i*8));
6595 sse_regno++;
6596 break;
6597 case X86_64_SSEDF_CLASS:
6598 exp [nexps++]
6599 = gen_rtx_EXPR_LIST (VOIDmode,
6600 gen_rtx_REG (DFmode,
6601 SSE_REGNO (sse_regno)),
6602 GEN_INT (i*8));
6603 sse_regno++;
6604 break;
6605 case X86_64_SSE_CLASS:
6606 pos = i;
6607 switch (n)
6608 {
6609 case 1:
6610 tmpmode = DImode;
6611 break;
6612 case 2:
6613 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6614 {
6615 tmpmode = TImode;
6616 i++;
6617 }
6618 else
6619 tmpmode = DImode;
6620 break;
6621 case 4:
6622 gcc_assert (i == 0
6623 && regclass[1] == X86_64_SSEUP_CLASS
6624 && regclass[2] == X86_64_SSEUP_CLASS
6625 && regclass[3] == X86_64_SSEUP_CLASS);
6626 tmpmode = OImode;
6627 i += 3;
6628 break;
6629 default:
6630 gcc_unreachable ();
6631 }
6632 exp [nexps++]
6633 = gen_rtx_EXPR_LIST (VOIDmode,
6634 gen_rtx_REG (tmpmode,
6635 SSE_REGNO (sse_regno)),
6636 GEN_INT (pos*8));
6637 sse_regno++;
6638 break;
6639 default:
6640 gcc_unreachable ();
6641 }
6642 }
6643
6644 /* Empty aligned struct, union or class. */
6645 if (nexps == 0)
6646 return NULL;
6647
6648 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6649 for (i = 0; i < nexps; i++)
6650 XVECEXP (ret, 0, i) = exp [i];
6651 return ret;
6652 }
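/* Worked example (a sketch; the exact register numbers depend on which
   registers the caller still has free): for a first argument of type
   struct { double d; long l; } the loop above builds roughly
     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di) (const_int 8))])
   i.e. the SSE-class eightbyte at offset 0 and the INTEGER-class
   eightbyte at offset 8.  */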
6653
6654 /* Update the data in CUM to advance over an argument of mode MODE
6655 and data type TYPE. (TYPE is null for libcalls where that information
6656 may not be available.) */
6657
6658 static void
6659 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6660 const_tree type, HOST_WIDE_INT bytes,
6661 HOST_WIDE_INT words)
6662 {
6663 switch (mode)
6664 {
6665 default:
6666 break;
6667
6668 case BLKmode:
6669 if (bytes < 0)
6670 break;
6671 /* FALLTHRU */
6672
6673 case DImode:
6674 case SImode:
6675 case HImode:
6676 case QImode:
6677 cum->words += words;
6678 cum->nregs -= words;
6679 cum->regno += words;
6680
6681 if (cum->nregs <= 0)
6682 {
6683 cum->nregs = 0;
6684 cum->regno = 0;
6685 }
6686 break;
6687
6688 case OImode:
6689 /* OImode shouldn't be used directly. */
6690 gcc_unreachable ();
6691
6692 case DFmode:
6693 if (cum->float_in_sse < 2)
6694 break;
6695 case SFmode:
6696 if (cum->float_in_sse < 1)
6697 break;
6698 /* FALLTHRU */
6699
6700 case V8SFmode:
6701 case V8SImode:
6702 case V32QImode:
6703 case V16HImode:
6704 case V4DFmode:
6705 case V4DImode:
6706 case TImode:
6707 case V16QImode:
6708 case V8HImode:
6709 case V4SImode:
6710 case V2DImode:
6711 case V4SFmode:
6712 case V2DFmode:
6713 if (!type || !AGGREGATE_TYPE_P (type))
6714 {
6715 cum->sse_words += words;
6716 cum->sse_nregs -= 1;
6717 cum->sse_regno += 1;
6718 if (cum->sse_nregs <= 0)
6719 {
6720 cum->sse_nregs = 0;
6721 cum->sse_regno = 0;
6722 }
6723 }
6724 break;
6725
6726 case V8QImode:
6727 case V4HImode:
6728 case V2SImode:
6729 case V2SFmode:
6730 case V1TImode:
6731 case V1DImode:
6732 if (!type || !AGGREGATE_TYPE_P (type))
6733 {
6734 cum->mmx_words += words;
6735 cum->mmx_nregs -= 1;
6736 cum->mmx_regno += 1;
6737 if (cum->mmx_nregs <= 0)
6738 {
6739 cum->mmx_nregs = 0;
6740 cum->mmx_regno = 0;
6741 }
6742 }
6743 break;
6744 }
6745 }
6746
6747 static void
6748 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6749 const_tree type, HOST_WIDE_INT words, bool named)
6750 {
6751 int int_nregs, sse_nregs;
6752
6753 /* Unnamed 256bit vector mode parameters are passed on stack. */
6754 if (!named && VALID_AVX256_REG_MODE (mode))
6755 return;
6756
6757 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6758 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6759 {
6760 cum->nregs -= int_nregs;
6761 cum->sse_nregs -= sse_nregs;
6762 cum->regno += int_nregs;
6763 cum->sse_regno += sse_nregs;
6764 }
6765 else
6766 {
6767 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6768 cum->words = (cum->words + align - 1) & ~(align - 1);
6769 cum->words += words;
6770 }
6771 }
6772
6773 static void
6774 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6775 HOST_WIDE_INT words)
6776 {
6777 /* Otherwise, this should be passed indirectly. */
6778 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6779
6780 cum->words += words;
6781 if (cum->nregs > 0)
6782 {
6783 cum->nregs -= 1;
6784 cum->regno += 1;
6785 }
6786 }
6787
6788 /* Update the data in CUM to advance over an argument of mode MODE and
6789 data type TYPE. (TYPE is null for libcalls where that information
6790 may not be available.) */
6791
6792 static void
6793 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6794 const_tree type, bool named)
6795 {
6796 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6797 HOST_WIDE_INT bytes, words;
6798
6799 if (mode == BLKmode)
6800 bytes = int_size_in_bytes (type);
6801 else
6802 bytes = GET_MODE_SIZE (mode);
6803 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6804
6805 if (type)
6806 mode = type_natural_mode (type, NULL);
6807
6808 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6809 function_arg_advance_ms_64 (cum, bytes, words);
6810 else if (TARGET_64BIT)
6811 function_arg_advance_64 (cum, mode, type, words, named);
6812 else
6813 function_arg_advance_32 (cum, mode, type, bytes, words);
6814 }
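/* Example (a sketch, assuming the 64-bit SysV ABI and a hypothetical
   prototype void f (int i, double d, struct big s) with a 32-byte
   struct big): advancing over the three arguments leaves
   cum->regno == 1 and cum->sse_regno == 1, while the struct is too
   large for registers and is only accounted to cum->words.  */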
6815
6816 /* Define where to put the arguments to a function.
6817 Value is zero to push the argument on the stack,
6818 or a hard register in which to store the argument.
6819
6820 MODE is the argument's machine mode.
6821 TYPE is the data type of the argument (as a tree).
6822 This is null for libcalls where that information may
6823 not be available.
6824 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6825 the preceding args and about the function being called.
6826 NAMED is nonzero if this argument is a named parameter
6827 (otherwise it is an extra parameter matching an ellipsis). */
6828
6829 static rtx
6830 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6831 enum machine_mode orig_mode, const_tree type,
6832 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6833 {
6834 static bool warnedsse, warnedmmx;
6835
6836 /* Avoid the AL settings for the Unix64 ABI. */
6837 if (mode == VOIDmode)
6838 return constm1_rtx;
6839
6840 switch (mode)
6841 {
6842 default:
6843 break;
6844
6845 case BLKmode:
6846 if (bytes < 0)
6847 break;
6848 /* FALLTHRU */
6849 case DImode:
6850 case SImode:
6851 case HImode:
6852 case QImode:
6853 if (words <= cum->nregs)
6854 {
6855 int regno = cum->regno;
6856
6857 /* Fastcall allocates the first two DWORD (SImode) or
6858 smaller arguments to ECX and EDX if the argument
6859 isn't an aggregate type. */
6860 if (cum->fastcall)
6861 {
6862 if (mode == BLKmode
6863 || mode == DImode
6864 || (type && AGGREGATE_TYPE_P (type)))
6865 break;
6866
6867 /* ECX, not EAX, is the first allocated register. */
6868 if (regno == AX_REG)
6869 regno = CX_REG;
6870 }
6871 return gen_rtx_REG (mode, regno);
6872 }
6873 break;
6874
6875 case DFmode:
6876 if (cum->float_in_sse < 2)
6877 break;
6878 case SFmode:
6879 if (cum->float_in_sse < 1)
6880 break;
6881 /* FALLTHRU */
6882 case TImode:
6883 /* In 32bit, we pass TImode in xmm registers. */
6884 case V16QImode:
6885 case V8HImode:
6886 case V4SImode:
6887 case V2DImode:
6888 case V4SFmode:
6889 case V2DFmode:
6890 if (!type || !AGGREGATE_TYPE_P (type))
6891 {
6892 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6893 {
6894 warnedsse = true;
6895 warning (0, "SSE vector argument without SSE enabled "
6896 "changes the ABI");
6897 }
6898 if (cum->sse_nregs)
6899 return gen_reg_or_parallel (mode, orig_mode,
6900 cum->sse_regno + FIRST_SSE_REG);
6901 }
6902 break;
6903
6904 case OImode:
6905 /* OImode shouldn't be used directly. */
6906 gcc_unreachable ();
6907
6908 case V8SFmode:
6909 case V8SImode:
6910 case V32QImode:
6911 case V16HImode:
6912 case V4DFmode:
6913 case V4DImode:
6914 if (!type || !AGGREGATE_TYPE_P (type))
6915 {
6916 if (cum->sse_nregs)
6917 return gen_reg_or_parallel (mode, orig_mode,
6918 cum->sse_regno + FIRST_SSE_REG);
6919 }
6920 break;
6921
6922 case V8QImode:
6923 case V4HImode:
6924 case V2SImode:
6925 case V2SFmode:
6926 case V1TImode:
6927 case V1DImode:
6928 if (!type || !AGGREGATE_TYPE_P (type))
6929 {
6930 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6931 {
6932 warnedmmx = true;
6933 warning (0, "MMX vector argument without MMX enabled "
6934 "changes the ABI");
6935 }
6936 if (cum->mmx_nregs)
6937 return gen_reg_or_parallel (mode, orig_mode,
6938 cum->mmx_regno + FIRST_MMX_REG);
6939 }
6940 break;
6941 }
6942
6943 return NULL_RTX;
6944 }
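/* Example (a sketch with a hypothetical declaration): on a 32-bit
   target the fastcall handling above places the first two integer
   (SImode or smaller) arguments in ECX and EDX; further integer
   arguments no longer fit in cum->nregs and go to the stack.  */
#if 0
int __attribute__((fastcall)) f (int a, int b, int c);
/* a -> %ecx, b -> %edx, c -> stack */
#endif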
6945
6946 static rtx
6947 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6948 enum machine_mode orig_mode, const_tree type, bool named)
6949 {
6950 /* Handle a hidden AL argument containing number of registers
6951 for varargs x86-64 functions. */
6952 if (mode == VOIDmode)
6953 return GEN_INT (cum->maybe_vaarg
6954 ? (cum->sse_nregs < 0
6955 ? X86_64_SSE_REGPARM_MAX
6956 : cum->sse_regno)
6957 : -1);
6958
6959 switch (mode)
6960 {
6961 default:
6962 break;
6963
6964 case V8SFmode:
6965 case V8SImode:
6966 case V32QImode:
6967 case V16HImode:
6968 case V4DFmode:
6969 case V4DImode:
6970 /* Unnamed 256bit vector mode parameters are passed on stack. */
6971 if (!named)
6972 return NULL;
6973 break;
6974 }
6975
6976 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6977 cum->sse_nregs,
6978 &x86_64_int_parameter_registers [cum->regno],
6979 cum->sse_regno);
6980 }
6981
6982 static rtx
6983 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6984 enum machine_mode orig_mode, bool named,
6985 HOST_WIDE_INT bytes)
6986 {
6987 unsigned int regno;
6988
6989 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6990 We use the value -2 to specify that the current function call uses the MS ABI. */
6991 if (mode == VOIDmode)
6992 return GEN_INT (-2);
6993
6994 /* If we've run out of registers, it goes on the stack. */
6995 if (cum->nregs == 0)
6996 return NULL_RTX;
6997
6998 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6999
7000 /* Only floating point modes are passed in anything but integer regs. */
7001 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7002 {
7003 if (named)
7004 regno = cum->regno + FIRST_SSE_REG;
7005 else
7006 {
7007 rtx t1, t2;
7008
7009 /* Unnamed floating parameters are passed in both the
7010 SSE and integer registers. */
7011 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7012 t2 = gen_rtx_REG (mode, regno);
7013 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7014 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7015 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7016 }
7017 }
7018 /* Handle aggregate types passed in a register. */
7019 if (orig_mode == BLKmode)
7020 {
7021 if (bytes > 0 && bytes <= 8)
7022 mode = (bytes > 4 ? DImode : SImode);
7023 if (mode == BLKmode)
7024 mode = DImode;
7025 }
7026
7027 return gen_reg_or_parallel (mode, orig_mode, regno);
7028 }
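/* Example (a sketch of the Microsoft x64 convention implemented above):
   for a hypothetical call f (1, 2.0, 3, 4.0) the arguments land in
   RCX, XMM1, R8 and XMM3 respectively; the SSE register number always
   matches the argument position, which is why cum->regno is reused as
   the offset from FIRST_SSE_REG.  */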
7029
7030 /* Return where to put the arguments to a function.
7031 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7032
7033 MODE is the argument's machine mode. TYPE is the data type of the
7034 argument. It is null for libcalls where that information may not be
7035 available. CUM gives information about the preceding args and about
7036 the function being called. NAMED is nonzero if this argument is a
7037 named parameter (otherwise it is an extra parameter matching an
7038 ellipsis). */
7039
7040 static rtx
7041 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7042 const_tree type, bool named)
7043 {
7044 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7045 enum machine_mode mode = omode;
7046 HOST_WIDE_INT bytes, words;
7047 rtx arg;
7048
7049 if (mode == BLKmode)
7050 bytes = int_size_in_bytes (type);
7051 else
7052 bytes = GET_MODE_SIZE (mode);
7053 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7054
7055 /* To simplify the code below, represent vector types with a vector mode
7056 even if MMX/SSE are not active. */
7057 if (type && TREE_CODE (type) == VECTOR_TYPE)
7058 mode = type_natural_mode (type, cum);
7059
7060 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7061 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7062 else if (TARGET_64BIT)
7063 arg = function_arg_64 (cum, mode, omode, type, named);
7064 else
7065 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7066
7067 return arg;
7068 }
7069
7070 /* A C expression that indicates when an argument must be passed by
7071 reference. If nonzero for an argument, a copy of that argument is
7072 made in memory and a pointer to the argument is passed instead of
7073 the argument itself. The pointer is passed in whatever way is
7074 appropriate for passing a pointer to that type. */
7075
7076 static bool
7077 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7078 enum machine_mode mode ATTRIBUTE_UNUSED,
7079 const_tree type, bool named ATTRIBUTE_UNUSED)
7080 {
7081 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7082
7083 /* See Windows x64 Software Convention. */
7084 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7085 {
7086 int msize = (int) GET_MODE_SIZE (mode);
7087 if (type)
7088 {
7089 /* Arrays are passed by reference. */
7090 if (TREE_CODE (type) == ARRAY_TYPE)
7091 return true;
7092
7093 if (AGGREGATE_TYPE_P (type))
7094 {
7095 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7096 are passed by reference. */
7097 msize = int_size_in_bytes (type);
7098 }
7099 }
7100
7101 /* __m128 is passed by reference. */
7102 switch (msize) {
7103 case 1: case 2: case 4: case 8:
7104 break;
7105 default:
7106 return true;
7107 }
7108 }
7109 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7110 return 1;
7111
7112 return 0;
7113 }
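/* Example (a sketch): under the Microsoft x64 rules above, a 12-byte
   struct or a 16-byte __m128 value is passed by reference through a
   hidden pointer, while aggregates of exactly 1, 2, 4 or 8 bytes and
   ordinary scalars are passed by value.  */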
7114
7115 /* Return true when TYPE should be 128bit aligned for 32bit argument
7116 passing ABI. XXX: This function is obsolete and is only used for
7117 checking psABI compatibility with previous versions of GCC. */
7118
7119 static bool
7120 ix86_compat_aligned_value_p (const_tree type)
7121 {
7122 enum machine_mode mode = TYPE_MODE (type);
7123 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7124 || mode == TDmode
7125 || mode == TFmode
7126 || mode == TCmode)
7127 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7128 return true;
7129 if (TYPE_ALIGN (type) < 128)
7130 return false;
7131
7132 if (AGGREGATE_TYPE_P (type))
7133 {
7134 /* Walk the aggregates recursively. */
7135 switch (TREE_CODE (type))
7136 {
7137 case RECORD_TYPE:
7138 case UNION_TYPE:
7139 case QUAL_UNION_TYPE:
7140 {
7141 tree field;
7142
7143 /* Walk all the structure fields. */
7144 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7145 {
7146 if (TREE_CODE (field) == FIELD_DECL
7147 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7148 return true;
7149 }
7150 break;
7151 }
7152
7153 case ARRAY_TYPE:
7154 /* Just for use if some language passes arrays by value. */
7155 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7156 return true;
7157 break;
7158
7159 default:
7160 gcc_unreachable ();
7161 }
7162 }
7163 return false;
7164 }
7165
7166 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7167 XXX: This function is obsolete and is only used for checking psABI
7168 compatibility with previous versions of GCC. */
7169
7170 static unsigned int
7171 ix86_compat_function_arg_boundary (enum machine_mode mode,
7172 const_tree type, unsigned int align)
7173 {
7174 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7175 natural boundaries. */
7176 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7177 {
7178 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7179 make an exception for SSE modes since these require 128bit
7180 alignment.
7181
7182 The handling here differs from field_alignment. ICC aligns MMX
7183 arguments to 4 byte boundaries, while structure fields are aligned
7184 to 8 byte boundaries. */
7185 if (!type)
7186 {
7187 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7188 align = PARM_BOUNDARY;
7189 }
7190 else
7191 {
7192 if (!ix86_compat_aligned_value_p (type))
7193 align = PARM_BOUNDARY;
7194 }
7195 }
7196 if (align > BIGGEST_ALIGNMENT)
7197 align = BIGGEST_ALIGNMENT;
7198 return align;
7199 }
7200
7201 /* Return true when TYPE should be 128bit aligned for 32bit argument
7202 passing ABI. */
7203
7204 static bool
7205 ix86_contains_aligned_value_p (const_tree type)
7206 {
7207 enum machine_mode mode = TYPE_MODE (type);
7208
7209 if (mode == XFmode || mode == XCmode)
7210 return false;
7211
7212 if (TYPE_ALIGN (type) < 128)
7213 return false;
7214
7215 if (AGGREGATE_TYPE_P (type))
7216 {
7217 /* Walk the aggregates recursively. */
7218 switch (TREE_CODE (type))
7219 {
7220 case RECORD_TYPE:
7221 case UNION_TYPE:
7222 case QUAL_UNION_TYPE:
7223 {
7224 tree field;
7225
7226 /* Walk all the structure fields. */
7227 for (field = TYPE_FIELDS (type);
7228 field;
7229 field = DECL_CHAIN (field))
7230 {
7231 if (TREE_CODE (field) == FIELD_DECL
7232 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7233 return true;
7234 }
7235 break;
7236 }
7237
7238 case ARRAY_TYPE:
7239 /* Just for use if some language passes arrays by value. */
7240 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7241 return true;
7242 break;
7243
7244 default:
7245 gcc_unreachable ();
7246 }
7247 }
7248 else
7249 return TYPE_ALIGN (type) >= 128;
7250
7251 return false;
7252 }
7253
7254 /* Gives the alignment boundary, in bits, of an argument with the
7255 specified mode and type. */
7256
7257 static unsigned int
7258 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7259 {
7260 unsigned int align;
7261 if (type)
7262 {
7263 /* The main variant type is used for the call, so convert
7264 TYPE to its main variant. */
7265 type = TYPE_MAIN_VARIANT (type);
7266 align = TYPE_ALIGN (type);
7267 }
7268 else
7269 align = GET_MODE_ALIGNMENT (mode);
7270 if (align < PARM_BOUNDARY)
7271 align = PARM_BOUNDARY;
7272 else
7273 {
7274 static bool warned;
7275 unsigned int saved_align = align;
7276
7277 if (!TARGET_64BIT)
7278 {
7279 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7280 if (!type)
7281 {
7282 if (mode == XFmode || mode == XCmode)
7283 align = PARM_BOUNDARY;
7284 }
7285 else if (!ix86_contains_aligned_value_p (type))
7286 align = PARM_BOUNDARY;
7287
7288 if (align < 128)
7289 align = PARM_BOUNDARY;
7290 }
7291
7292 if (warn_psabi
7293 && !warned
7294 && align != ix86_compat_function_arg_boundary (mode, type,
7295 saved_align))
7296 {
7297 warned = true;
7298 inform (input_location,
7299 "The ABI for passing parameters with %d-byte"
7300 " alignment has changed in GCC 4.6",
7301 align / BITS_PER_UNIT);
7302 }
7303 }
7304
7305 return align;
7306 }
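/* Example (a sketch): on a 32-bit target an __m128 argument keeps its
   natural 128-bit stack alignment, while a long double (XFmode)
   argument is only aligned to PARM_BOUNDARY (4 bytes) per the i386
   ABI; on 64-bit targets the type's own alignment is used, raised to
   at least PARM_BOUNDARY.  */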
7307
7308 /* Return true if N is a possible register number of function value. */
7309
7310 static bool
7311 ix86_function_value_regno_p (const unsigned int regno)
7312 {
7313 switch (regno)
7314 {
7315 case AX_REG:
7316 return true;
7317
7318 case FIRST_FLOAT_REG:
7319 /* TODO: The function should depend on current function ABI but
7320 builtins.c would need updating then. Therefore we use the
7321 default ABI. */
7322 if (TARGET_64BIT && ix86_abi == MS_ABI)
7323 return false;
7324 return TARGET_FLOAT_RETURNS_IN_80387;
7325
7326 case FIRST_SSE_REG:
7327 return TARGET_SSE;
7328
7329 case FIRST_MMX_REG:
7330 if (TARGET_MACHO || TARGET_64BIT)
7331 return false;
7332 return TARGET_MMX;
7333 }
7334
7335 return false;
7336 }
7337
7338 /* Define how to find the value returned by a function.
7339 VALTYPE is the data type of the value (as a tree).
7340 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7341 otherwise, FUNC is 0. */
7342
7343 static rtx
7344 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7345 const_tree fntype, const_tree fn)
7346 {
7347 unsigned int regno;
7348
7349 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7350 we normally prevent this case when mmx is not available. However
7351 some ABIs may require the result to be returned like DImode. */
7352 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7353 regno = FIRST_MMX_REG;
7354
7355 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7356 we prevent this case when sse is not available. However some ABIs
7357 may require the result to be returned like integer TImode. */
7358 else if (mode == TImode
7359 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7360 regno = FIRST_SSE_REG;
7361
7362 /* 32-byte vector modes in %ymm0. */
7363 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7364 regno = FIRST_SSE_REG;
7365
7366 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7367 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7368 regno = FIRST_FLOAT_REG;
7369 else
7370 /* Most things go in %eax. */
7371 regno = AX_REG;
7372
7373 /* Override FP return register with %xmm0 for local functions when
7374 SSE math is enabled or for functions with sseregparm attribute. */
7375 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7376 {
7377 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7378 if ((sse_level >= 1 && mode == SFmode)
7379 || (sse_level == 2 && mode == DFmode))
7380 regno = FIRST_SSE_REG;
7381 }
7382
7383 /* OImode shouldn't be used directly. */
7384 gcc_assert (mode != OImode);
7385
7386 return gen_rtx_REG (orig_mode, regno);
7387 }
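/* Example (a sketch): on 32-bit targets a function returning float or
   double normally uses %st(0); for a local function compiled with SSE
   math, or one carrying the sseregparm attribute, the override above
   returns the value in %xmm0 instead.  */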
7388
7389 static rtx
7390 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7391 const_tree valtype)
7392 {
7393 rtx ret;
7394
7395 /* Handle libcalls, which don't provide a type node. */
7396 if (valtype == NULL)
7397 {
7398 unsigned int regno;
7399
7400 switch (mode)
7401 {
7402 case SFmode:
7403 case SCmode:
7404 case DFmode:
7405 case DCmode:
7406 case TFmode:
7407 case SDmode:
7408 case DDmode:
7409 case TDmode:
7410 regno = FIRST_SSE_REG;
7411 break;
7412 case XFmode:
7413 case XCmode:
7414 regno = FIRST_FLOAT_REG;
7415 break;
7416 case TCmode:
7417 return NULL;
7418 default:
7419 regno = AX_REG;
7420 }
7421
7422 return gen_rtx_REG (mode, regno);
7423 }
7424 else if (POINTER_TYPE_P (valtype))
7425 {
7426 /* Pointers are always returned in word_mode. */
7427 mode = word_mode;
7428 }
7429
7430 ret = construct_container (mode, orig_mode, valtype, 1,
7431 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7432 x86_64_int_return_registers, 0);
7433
7434 /* For zero sized structures, construct_container returns NULL, but we
7435 need to keep the rest of the compiler happy by returning a meaningful value. */
7436 if (!ret)
7437 ret = gen_rtx_REG (orig_mode, AX_REG);
7438
7439 return ret;
7440 }
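/* Example (a sketch of the SysV x86-64 return convention handled
   above): a libcall returning double uses XMM0, one returning
   long double (XFmode) uses %st(0), and integers come back in RAX;
   a zero sized struct still reports RAX so that the rest of the
   compiler has a register to talk about.  */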
7441
7442 static rtx
7443 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7444 const_tree valtype)
7445 {
7446 unsigned int regno = AX_REG;
7447
7448 if (TARGET_SSE)
7449 {
7450 switch (GET_MODE_SIZE (mode))
7451 {
7452 case 16:
7453 if (valtype != NULL_TREE
7454 && !VECTOR_INTEGER_TYPE_P (valtype)
7456 && !INTEGRAL_TYPE_P (valtype)
7457 && !VECTOR_FLOAT_TYPE_P (valtype))
7458 break;
7459 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7460 && !COMPLEX_MODE_P (mode))
7461 regno = FIRST_SSE_REG;
7462 break;
7463 case 8:
7464 case 4:
7465 if (mode == SFmode || mode == DFmode)
7466 regno = FIRST_SSE_REG;
7467 break;
7468 default:
7469 break;
7470 }
7471 }
7472 return gen_rtx_REG (orig_mode, regno);
7473 }
7474
7475 static rtx
7476 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7477 enum machine_mode orig_mode, enum machine_mode mode)
7478 {
7479 const_tree fn, fntype;
7480
7481 fn = NULL_TREE;
7482 if (fntype_or_decl && DECL_P (fntype_or_decl))
7483 fn = fntype_or_decl;
7484 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7485
7486 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7487 return function_value_ms_64 (orig_mode, mode, valtype);
7488 else if (TARGET_64BIT)
7489 return function_value_64 (orig_mode, mode, valtype);
7490 else
7491 return function_value_32 (orig_mode, mode, fntype, fn);
7492 }
7493
7494 static rtx
7495 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7496 bool outgoing ATTRIBUTE_UNUSED)
7497 {
7498 enum machine_mode mode, orig_mode;
7499
7500 orig_mode = TYPE_MODE (valtype);
7501 mode = type_natural_mode (valtype, NULL);
7502 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7503 }
7504
7505 /* Pointer function arguments and return values are promoted to
7506 word_mode. */
7507
7508 static enum machine_mode
7509 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7510 int *punsignedp, const_tree fntype,
7511 int for_return)
7512 {
7513 if (type != NULL_TREE && POINTER_TYPE_P (type))
7514 {
7515 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7516 return word_mode;
7517 }
7518 return default_promote_function_mode (type, mode, punsignedp, fntype,
7519 for_return);
7520 }
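/* Example (a sketch; mainly relevant for the x32 ABI where pointers
   are 32 bits wide): a char * parameter or return value is widened to
   word_mode (DImode) and, since POINTERS_EXTEND_UNSIGNED is nonzero
   for this target, zero-extended; non-pointer types fall back to the
   generic promotion rules.  */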
7521
7522 /* Return true if a structure, union or array with MODE containing FIELD
7523 should be accessed using BLKmode. */
7524
7525 static bool
7526 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7527 {
7528 /* Union with XFmode must be in BLKmode. */
7529 return (mode == XFmode
7530 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7531 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7532 }
7533
7534 rtx
7535 ix86_libcall_value (enum machine_mode mode)
7536 {
7537 return ix86_function_value_1 (NULL, NULL, mode, mode);
7538 }
7539
7540 /* Return true iff type is returned in memory. */
7541
7542 static bool ATTRIBUTE_UNUSED
7543 return_in_memory_32 (const_tree type, enum machine_mode mode)
7544 {
7545 HOST_WIDE_INT size;
7546
7547 if (mode == BLKmode)
7548 return true;
7549
7550 size = int_size_in_bytes (type);
7551
7552 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7553 return false;
7554
7555 if (VECTOR_MODE_P (mode) || mode == TImode)
7556 {
7557 /* User-created vectors small enough to fit in EAX. */
7558 if (size < 8)
7559 return false;
7560
7561 /* MMX/3dNow values are returned in MM0,
7562 except when it doesn't exist or the ABI prescribes otherwise. */
7563 if (size == 8)
7564 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7565
7566 /* SSE values are returned in XMM0, except when it doesn't exist. */
7567 if (size == 16)
7568 return !TARGET_SSE;
7569
7570 /* AVX values are returned in YMM0, except when it doesn't exist. */
7571 if (size == 32)
7572 return !TARGET_AVX;
7573 }
7574
7575 if (mode == XFmode)
7576 return false;
7577
7578 if (size > 12)
7579 return true;
7580
7581 /* OImode shouldn't be used directly. */
7582 gcc_assert (mode != OImode);
7583
7584 return false;
7585 }
7586
7587 static bool ATTRIBUTE_UNUSED
7588 return_in_memory_64 (const_tree type, enum machine_mode mode)
7589 {
7590 int needed_intregs, needed_sseregs;
7591 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7592 }
7593
7594 static bool ATTRIBUTE_UNUSED
7595 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7596 {
7597 HOST_WIDE_INT size = int_size_in_bytes (type);
7598
7599 /* __m128 is returned in xmm0. */
7600 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7601 || VECTOR_FLOAT_TYPE_P (type))
7602 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7603 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7604 return false;
7605
7606 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7607 return size != 1 && size != 2 && size != 4 && size != 8;
7608 }
7609
7610 static bool
7611 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7612 {
7613 #ifdef SUBTARGET_RETURN_IN_MEMORY
7614 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7615 #else
7616 const enum machine_mode mode = type_natural_mode (type, NULL);
7617
7618 if (TARGET_64BIT)
7619 {
7620 if (ix86_function_type_abi (fntype) == MS_ABI)
7621 return return_in_memory_ms_64 (type, mode);
7622 else
7623 return return_in_memory_64 (type, mode);
7624 }
7625 else
7626 return return_in_memory_32 (type, mode);
7627 #endif
7628 }
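/* Example (a sketch): with the SysV 64-bit rules above, a 16-byte
   struct of two longs is returned in RAX:RDX, a 32-byte struct goes to
   memory via a hidden pointer, and an __m128 comes back in XMM0; the
   MS ABI instead sends every aggregate whose size is not 1, 2, 4 or 8
   bytes to memory, except the 16-byte vector types.  */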
7629
7630 /* When returning SSE vector types, we have a choice of either
7631 (1) being abi incompatible with a -march switch, or
7632 (2) generating an error.
7633 Given no good solution, I think the safest thing is one warning.
7634 The user won't be able to use -Werror, but....
7635
7636 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7637 called in response to actually generating a caller or callee that
7638 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7639 via aggregate_value_p for general type probing from tree-ssa. */
7640
7641 static rtx
7642 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7643 {
7644 static bool warnedsse, warnedmmx;
7645
7646 if (!TARGET_64BIT && type)
7647 {
7648 /* Look at the return type of the function, not the function type. */
7649 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7650
7651 if (!TARGET_SSE && !warnedsse)
7652 {
7653 if (mode == TImode
7654 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7655 {
7656 warnedsse = true;
7657 warning (0, "SSE vector return without SSE enabled "
7658 "changes the ABI");
7659 }
7660 }
7661
7662 if (!TARGET_MMX && !warnedmmx)
7663 {
7664 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7665 {
7666 warnedmmx = true;
7667 warning (0, "MMX vector return without MMX enabled "
7668 "changes the ABI");
7669 }
7670 }
7671 }
7672
7673 return NULL;
7674 }
7675
7676 \f
7677 /* Create the va_list data type. */
7678
7679 /* Returns the calling convention specific va_list data type.
7680 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7681
7682 static tree
7683 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7684 {
7685 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7686
7687 /* For i386 we use a plain pointer to the argument area. */
7688 if (!TARGET_64BIT || abi == MS_ABI)
7689 return build_pointer_type (char_type_node);
7690
7691 record = lang_hooks.types.make_type (RECORD_TYPE);
7692 type_decl = build_decl (BUILTINS_LOCATION,
7693 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7694
7695 f_gpr = build_decl (BUILTINS_LOCATION,
7696 FIELD_DECL, get_identifier ("gp_offset"),
7697 unsigned_type_node);
7698 f_fpr = build_decl (BUILTINS_LOCATION,
7699 FIELD_DECL, get_identifier ("fp_offset"),
7700 unsigned_type_node);
7701 f_ovf = build_decl (BUILTINS_LOCATION,
7702 FIELD_DECL, get_identifier ("overflow_arg_area"),
7703 ptr_type_node);
7704 f_sav = build_decl (BUILTINS_LOCATION,
7705 FIELD_DECL, get_identifier ("reg_save_area"),
7706 ptr_type_node);
7707
7708 va_list_gpr_counter_field = f_gpr;
7709 va_list_fpr_counter_field = f_fpr;
7710
7711 DECL_FIELD_CONTEXT (f_gpr) = record;
7712 DECL_FIELD_CONTEXT (f_fpr) = record;
7713 DECL_FIELD_CONTEXT (f_ovf) = record;
7714 DECL_FIELD_CONTEXT (f_sav) = record;
7715
7716 TYPE_STUB_DECL (record) = type_decl;
7717 TYPE_NAME (record) = type_decl;
7718 TYPE_FIELDS (record) = f_gpr;
7719 DECL_CHAIN (f_gpr) = f_fpr;
7720 DECL_CHAIN (f_fpr) = f_ovf;
7721 DECL_CHAIN (f_ovf) = f_sav;
7722
7723 layout_type (record);
7724
7725 /* The correct type is an array type of one element. */
7726 return build_array_type (record, build_index_type (size_zero_node));
7727 }
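/* For reference, the record built above corresponds to the familiar
   SysV x86-64 declaration below (a sketch; the real type is created
   through the tree machinery above, not from source).  */
#if 0
typedef struct __va_list_tag
{
  unsigned int gp_offset;    /* byte offset of the next GPR in reg_save_area */
  unsigned int fp_offset;    /* byte offset of the next XMM in reg_save_area */
  void *overflow_arg_area;   /* next argument passed on the stack */
  void *reg_save_area;       /* start of the register save area */
} __builtin_va_list[1];
#endif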
7728
7729 /* Set up the builtin va_list data type and, for 64-bit, the additional
7730 calling convention specific va_list data types. */
7731
7732 static tree
7733 ix86_build_builtin_va_list (void)
7734 {
7735 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7736
7737 /* Initialize abi specific va_list builtin types. */
7738 if (TARGET_64BIT)
7739 {
7740 tree t;
7741 if (ix86_abi == MS_ABI)
7742 {
7743 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7744 if (TREE_CODE (t) != RECORD_TYPE)
7745 t = build_variant_type_copy (t);
7746 sysv_va_list_type_node = t;
7747 }
7748 else
7749 {
7750 t = ret;
7751 if (TREE_CODE (t) != RECORD_TYPE)
7752 t = build_variant_type_copy (t);
7753 sysv_va_list_type_node = t;
7754 }
7755 if (ix86_abi != MS_ABI)
7756 {
7757 t = ix86_build_builtin_va_list_abi (MS_ABI);
7758 if (TREE_CODE (t) != RECORD_TYPE)
7759 t = build_variant_type_copy (t);
7760 ms_va_list_type_node = t;
7761 }
7762 else
7763 {
7764 t = ret;
7765 if (TREE_CODE (t) != RECORD_TYPE)
7766 t = build_variant_type_copy (t);
7767 ms_va_list_type_node = t;
7768 }
7769 }
7770
7771 return ret;
7772 }
7773
7774 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7775
7776 static void
7777 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7778 {
7779 rtx save_area, mem;
7780 alias_set_type set;
7781 int i, max;
7782
7783 /* GPR size of varargs save area. */
7784 if (cfun->va_list_gpr_size)
7785 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7786 else
7787 ix86_varargs_gpr_size = 0;
7788
7789 /* FPR size of varargs save area. We don't need it if we don't pass
7790 anything in SSE registers. */
7791 if (TARGET_SSE && cfun->va_list_fpr_size)
7792 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7793 else
7794 ix86_varargs_fpr_size = 0;
7795
7796 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7797 return;
7798
7799 save_area = frame_pointer_rtx;
7800 set = get_varargs_alias_set ();
7801
7802 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7803 if (max > X86_64_REGPARM_MAX)
7804 max = X86_64_REGPARM_MAX;
7805
7806 for (i = cum->regno; i < max; i++)
7807 {
7808 mem = gen_rtx_MEM (word_mode,
7809 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7810 MEM_NOTRAP_P (mem) = 1;
7811 set_mem_alias_set (mem, set);
7812 emit_move_insn (mem,
7813 gen_rtx_REG (word_mode,
7814 x86_64_int_parameter_registers[i]));
7815 }
7816
7817 if (ix86_varargs_fpr_size)
7818 {
7819 enum machine_mode smode;
7820 rtx label, test;
7821
7822 /* Now emit code to save SSE registers. The AX parameter contains number
7823 of SSE parameter registers used to call this function, though all we
7824 actually check here is the zero/non-zero status. */
7825
7826 label = gen_label_rtx ();
7827 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7828 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7829 label));
7830
7831 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7832 we used movdqa (i.e. TImode) instead? Perhaps even better would
7833 be if we could determine the real mode of the data, via a hook
7834 into pass_stdarg. Ignore all that for now. */
7835 smode = V4SFmode;
7836 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7837 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7838
7839 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7840 if (max > X86_64_SSE_REGPARM_MAX)
7841 max = X86_64_SSE_REGPARM_MAX;
7842
7843 for (i = cum->sse_regno; i < max; ++i)
7844 {
7845 mem = plus_constant (Pmode, save_area,
7846 i * 16 + ix86_varargs_gpr_size);
7847 mem = gen_rtx_MEM (smode, mem);
7848 MEM_NOTRAP_P (mem) = 1;
7849 set_mem_alias_set (mem, set);
7850 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7851
7852 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7853 }
7854
7855 emit_label (label);
7856 }
7857 }
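/* Layout sketch of the register save area set up above (offsets are
   relative to reg_save_area when both parts are present):
     0 .. 47     the six integer argument registers, 8 bytes each
     48 .. 175   the eight SSE argument registers, 16 bytes each
   gp_offset and fp_offset in the va_list index into this block.  */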
7858
7859 static void
7860 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7861 {
7862 alias_set_type set = get_varargs_alias_set ();
7863 int i;
7864
7865 /* Reset to zero, as a SysV va_arg might have been
7866 used before. */
7867 ix86_varargs_gpr_size = 0;
7868 ix86_varargs_fpr_size = 0;
7869
7870 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7871 {
7872 rtx reg, mem;
7873
7874 mem = gen_rtx_MEM (Pmode,
7875 plus_constant (Pmode, virtual_incoming_args_rtx,
7876 i * UNITS_PER_WORD));
7877 MEM_NOTRAP_P (mem) = 1;
7878 set_mem_alias_set (mem, set);
7879
7880 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7881 emit_move_insn (mem, reg);
7882 }
7883 }
7884
7885 static void
7886 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7887 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7888 int no_rtl)
7889 {
7890 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7891 CUMULATIVE_ARGS next_cum;
7892 tree fntype;
7893
7894 /* This argument doesn't appear to be used anymore. Which is good,
7895 because the old code here didn't suppress rtl generation. */
7896 gcc_assert (!no_rtl);
7897
7898 if (!TARGET_64BIT)
7899 return;
7900
7901 fntype = TREE_TYPE (current_function_decl);
7902
7903 /* For varargs, we do not want to skip the dummy va_dcl argument.
7904 For stdargs, we do want to skip the last named argument. */
7905 next_cum = *cum;
7906 if (stdarg_p (fntype))
7907 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7908 true);
7909
7910 if (cum->call_abi == MS_ABI)
7911 setup_incoming_varargs_ms_64 (&next_cum);
7912 else
7913 setup_incoming_varargs_64 (&next_cum);
7914 }
7915
7916 /* Check whether TYPE is a va_list that is just a plain char pointer. */
7917
7918 static bool
7919 is_va_list_char_pointer (tree type)
7920 {
7921 tree canonic;
7922
7923 /* For 32-bit it is always true. */
7924 if (!TARGET_64BIT)
7925 return true;
7926 canonic = ix86_canonical_va_list_type (type);
7927 return (canonic == ms_va_list_type_node
7928 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7929 }
7930
7931 /* Implement va_start. */
7932
7933 static void
7934 ix86_va_start (tree valist, rtx nextarg)
7935 {
7936 HOST_WIDE_INT words, n_gpr, n_fpr;
7937 tree f_gpr, f_fpr, f_ovf, f_sav;
7938 tree gpr, fpr, ovf, sav, t;
7939 tree type;
7940 rtx ovf_rtx;
7941
7942 if (flag_split_stack
7943 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7944 {
7945 unsigned int scratch_regno;
7946
7947 /* When we are splitting the stack, we can't refer to the stack
7948 arguments using internal_arg_pointer, because they may be on
7949 the old stack. The split stack prologue will arrange to
7950 leave a pointer to the old stack arguments in a scratch
7951 register, which we here copy to a pseudo-register. The split
7952 stack prologue can't set the pseudo-register directly because
7953 it (the prologue) runs before any registers have been saved. */
7954
7955 scratch_regno = split_stack_prologue_scratch_regno ();
7956 if (scratch_regno != INVALID_REGNUM)
7957 {
7958 rtx reg, seq;
7959
7960 reg = gen_reg_rtx (Pmode);
7961 cfun->machine->split_stack_varargs_pointer = reg;
7962
7963 start_sequence ();
7964 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7965 seq = get_insns ();
7966 end_sequence ();
7967
7968 push_topmost_sequence ();
7969 emit_insn_after (seq, entry_of_function ());
7970 pop_topmost_sequence ();
7971 }
7972 }
7973
7974 /* Only 64bit target needs something special. */
7975 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7976 {
7977 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7978 std_expand_builtin_va_start (valist, nextarg);
7979 else
7980 {
7981 rtx va_r, next;
7982
7983 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7984 next = expand_binop (ptr_mode, add_optab,
7985 cfun->machine->split_stack_varargs_pointer,
7986 crtl->args.arg_offset_rtx,
7987 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7988 convert_move (va_r, next, 0);
7989 }
7990 return;
7991 }
7992
7993 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7994 f_fpr = DECL_CHAIN (f_gpr);
7995 f_ovf = DECL_CHAIN (f_fpr);
7996 f_sav = DECL_CHAIN (f_ovf);
7997
7998 valist = build_simple_mem_ref (valist);
7999 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8000 /* The following should be folded into the MEM_REF offset. */
8001 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8002 f_gpr, NULL_TREE);
8003 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8004 f_fpr, NULL_TREE);
8005 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8006 f_ovf, NULL_TREE);
8007 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8008 f_sav, NULL_TREE);
8009
8010 /* Count number of gp and fp argument registers used. */
8011 words = crtl->args.info.words;
8012 n_gpr = crtl->args.info.regno;
8013 n_fpr = crtl->args.info.sse_regno;
8014
8015 if (cfun->va_list_gpr_size)
8016 {
8017 type = TREE_TYPE (gpr);
8018 t = build2 (MODIFY_EXPR, type,
8019 gpr, build_int_cst (type, n_gpr * 8));
8020 TREE_SIDE_EFFECTS (t) = 1;
8021 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8022 }
8023
8024 if (TARGET_SSE && cfun->va_list_fpr_size)
8025 {
8026 type = TREE_TYPE (fpr);
8027 t = build2 (MODIFY_EXPR, type, fpr,
8028 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8029 TREE_SIDE_EFFECTS (t) = 1;
8030 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8031 }
8032
8033 /* Find the overflow area. */
8034 type = TREE_TYPE (ovf);
8035 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8036 ovf_rtx = crtl->args.internal_arg_pointer;
8037 else
8038 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8039 t = make_tree (type, ovf_rtx);
8040 if (words != 0)
8041 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8042 t = build2 (MODIFY_EXPR, type, ovf, t);
8043 TREE_SIDE_EFFECTS (t) = 1;
8044 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8045
8046 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8047 {
8048 /* Find the register save area.
8049 The function prologue saves it right above the stack frame. */
8050 type = TREE_TYPE (sav);
8051 t = make_tree (type, frame_pointer_rtx);
8052 if (!ix86_varargs_gpr_size)
8053 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8054 t = build2 (MODIFY_EXPR, type, sav, t);
8055 TREE_SIDE_EFFECTS (t) = 1;
8056 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8057 }
8058 }
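/* Example (a sketch): for a hypothetical void f (int a, ...) the code
   above seeds the va_list with gp_offset = 8 (one GPR consumed by A),
   fp_offset = 48 (no SSE registers consumed, i.e. just past the
   6 * 8 byte GPR part), and overflow_arg_area pointing just past the
   named arguments already on the stack.  */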
8059
8060 /* Implement va_arg. */
8061
8062 static tree
8063 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8064 gimple_seq *post_p)
8065 {
8066 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8067 tree f_gpr, f_fpr, f_ovf, f_sav;
8068 tree gpr, fpr, ovf, sav, t;
8069 int size, rsize;
8070 tree lab_false, lab_over = NULL_TREE;
8071 tree addr, t2;
8072 rtx container;
8073 int indirect_p = 0;
8074 tree ptrtype;
8075 enum machine_mode nat_mode;
8076 unsigned int arg_boundary;
8077
8078 /* Only 64bit target needs something special. */
8079 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8080 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8081
8082 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8083 f_fpr = DECL_CHAIN (f_gpr);
8084 f_ovf = DECL_CHAIN (f_fpr);
8085 f_sav = DECL_CHAIN (f_ovf);
8086
8087 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8088 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8089 valist = build_va_arg_indirect_ref (valist);
8090 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8091 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8092 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8093
8094 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8095 if (indirect_p)
8096 type = build_pointer_type (type);
8097 size = int_size_in_bytes (type);
8098 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8099
8100 nat_mode = type_natural_mode (type, NULL);
8101 switch (nat_mode)
8102 {
8103 case V8SFmode:
8104 case V8SImode:
8105 case V32QImode:
8106 case V16HImode:
8107 case V4DFmode:
8108 case V4DImode:
8109 /* Unnamed 256bit vector mode parameters are passed on stack. */
8110 if (!TARGET_64BIT_MS_ABI)
8111 {
8112 container = NULL;
8113 break;
8114 }
8115
8116 default:
8117 container = construct_container (nat_mode, TYPE_MODE (type),
8118 type, 0, X86_64_REGPARM_MAX,
8119 X86_64_SSE_REGPARM_MAX, intreg,
8120 0);
8121 break;
8122 }
8123
8124 /* Pull the value out of the saved registers. */
8125
8126 addr = create_tmp_var (ptr_type_node, "addr");
8127
8128 if (container)
8129 {
8130 int needed_intregs, needed_sseregs;
8131 bool need_temp;
8132 tree int_addr, sse_addr;
8133
8134 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8135 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8136
8137 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8138
8139 need_temp = (!REG_P (container)
8140 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8141 || TYPE_ALIGN (type) > 128));
8142
8143 /* If we are passing a structure, verify that it occupies a consecutive
8144 block in the register save area. If not, we need to do moves. */
8145 if (!need_temp && !REG_P (container))
8146 {
8147 /* Verify that all registers are strictly consecutive. */
8148 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8149 {
8150 int i;
8151
8152 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8153 {
8154 rtx slot = XVECEXP (container, 0, i);
8155 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8156 || INTVAL (XEXP (slot, 1)) != i * 16)
8157 need_temp = 1;
8158 }
8159 }
8160 else
8161 {
8162 int i;
8163
8164 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8165 {
8166 rtx slot = XVECEXP (container, 0, i);
8167 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8168 || INTVAL (XEXP (slot, 1)) != i * 8)
8169 need_temp = 1;
8170 }
8171 }
8172 }
8173 if (!need_temp)
8174 {
8175 int_addr = addr;
8176 sse_addr = addr;
8177 }
8178 else
8179 {
8180 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8181 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8182 }
8183
8184 /* First ensure that we fit completely in registers. */
8185 if (needed_intregs)
8186 {
8187 t = build_int_cst (TREE_TYPE (gpr),
8188 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8189 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8190 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8191 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8192 gimplify_and_add (t, pre_p);
8193 }
8194 if (needed_sseregs)
8195 {
8196 t = build_int_cst (TREE_TYPE (fpr),
8197 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8198 + X86_64_REGPARM_MAX * 8);
8199 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8200 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8201 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8202 gimplify_and_add (t, pre_p);
8203 }
8204
8205 /* Compute index to start of area used for integer regs. */
8206 if (needed_intregs)
8207 {
8208 /* int_addr = gpr + sav; */
8209 t = fold_build_pointer_plus (sav, gpr);
8210 gimplify_assign (int_addr, t, pre_p);
8211 }
8212 if (needed_sseregs)
8213 {
8214 /* sse_addr = fpr + sav; */
8215 t = fold_build_pointer_plus (sav, fpr);
8216 gimplify_assign (sse_addr, t, pre_p);
8217 }
8218 if (need_temp)
8219 {
8220 int i, prev_size = 0;
8221 tree temp = create_tmp_var (type, "va_arg_tmp");
8222
8223 /* addr = &temp; */
8224 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8225 gimplify_assign (addr, t, pre_p);
8226
8227 for (i = 0; i < XVECLEN (container, 0); i++)
8228 {
8229 rtx slot = XVECEXP (container, 0, i);
8230 rtx reg = XEXP (slot, 0);
8231 enum machine_mode mode = GET_MODE (reg);
8232 tree piece_type;
8233 tree addr_type;
8234 tree daddr_type;
8235 tree src_addr, src;
8236 int src_offset;
8237 tree dest_addr, dest;
8238 int cur_size = GET_MODE_SIZE (mode);
8239
8240 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8241 prev_size = INTVAL (XEXP (slot, 1));
8242 if (prev_size + cur_size > size)
8243 {
8244 cur_size = size - prev_size;
8245 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8246 if (mode == BLKmode)
8247 mode = QImode;
8248 }
8249 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8250 if (mode == GET_MODE (reg))
8251 addr_type = build_pointer_type (piece_type);
8252 else
8253 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8254 true);
8255 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8256 true);
8257
8258 if (SSE_REGNO_P (REGNO (reg)))
8259 {
8260 src_addr = sse_addr;
8261 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8262 }
8263 else
8264 {
8265 src_addr = int_addr;
8266 src_offset = REGNO (reg) * 8;
8267 }
8268 src_addr = fold_convert (addr_type, src_addr);
8269 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8270
8271 dest_addr = fold_convert (daddr_type, addr);
8272 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8273 if (cur_size == GET_MODE_SIZE (mode))
8274 {
8275 src = build_va_arg_indirect_ref (src_addr);
8276 dest = build_va_arg_indirect_ref (dest_addr);
8277
8278 gimplify_assign (dest, src, pre_p);
8279 }
8280 else
8281 {
8282 tree copy
8283 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8284 3, dest_addr, src_addr,
8285 size_int (cur_size));
8286 gimplify_and_add (copy, pre_p);
8287 }
8288 prev_size += cur_size;
8289 }
8290 }
8291
8292 if (needed_intregs)
8293 {
8294 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8295 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8296 gimplify_assign (gpr, t, pre_p);
8297 }
8298
8299 if (needed_sseregs)
8300 {
8301 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8302 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8303 gimplify_assign (fpr, t, pre_p);
8304 }
8305
8306 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8307
8308 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8309 }
8310
8311 /* ... otherwise out of the overflow area. */
8312
8313 /* When the caller aligns a parameter on the stack, any alignment
8314 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8315 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee with the
8316 caller here. */
8317 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8318 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8319 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8320
8321 /* Care for on-stack alignment if needed. */
8322 if (arg_boundary <= 64 || size == 0)
8323 t = ovf;
8324 else
8325 {
8326 HOST_WIDE_INT align = arg_boundary / 8;
8327 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8328 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8329 build_int_cst (TREE_TYPE (t), -align));
8330 }
8331
8332 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8333 gimplify_assign (addr, t, pre_p);
8334
8335 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8336 gimplify_assign (unshare_expr (ovf), t, pre_p);
8337
8338 if (container)
8339 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8340
8341 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8342 addr = fold_convert (ptrtype, addr);
8343
8344 if (indirect_p)
8345 addr = build_va_arg_indirect_ref (addr);
8346 return build_va_arg_indirect_ref (addr);
8347 }
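/* Example (a sketch): va_arg (ap, double) gimplifies to roughly
     if (ap->fp_offset < 48 + 8 * 16)
       { addr = ap->reg_save_area + ap->fp_offset; ap->fp_offset += 16; }
     else
       { addr = ap->overflow_arg_area; ap->overflow_arg_area += 8; }
   which is the register-versus-overflow split generated above.  */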
8348 \f
8349 /* Return true if OPNUM's MEM should be matched
8350 in movabs* patterns. */
8351
8352 bool
8353 ix86_check_movabs (rtx insn, int opnum)
8354 {
8355 rtx set, mem;
8356
8357 set = PATTERN (insn);
8358 if (GET_CODE (set) == PARALLEL)
8359 set = XVECEXP (set, 0, 0);
8360 gcc_assert (GET_CODE (set) == SET);
8361 mem = XEXP (set, opnum);
8362 while (GET_CODE (mem) == SUBREG)
8363 mem = SUBREG_REG (mem);
8364 gcc_assert (MEM_P (mem));
8365 return volatile_ok || !MEM_VOLATILE_P (mem);
8366 }
8367 \f
8368 /* Initialize the table of extra 80387 mathematical constants. */
8369
8370 static void
8371 init_ext_80387_constants (void)
8372 {
8373 static const char * cst[5] =
8374 {
8375 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8376 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8377 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8378 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8379 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8380 };
8381 int i;
8382
8383 for (i = 0; i < 5; i++)
8384 {
8385 real_from_string (&ext_80387_constants_table[i], cst[i]);
8386 /* Ensure each constant is rounded to XFmode precision. */
8387 real_convert (&ext_80387_constants_table[i],
8388 XFmode, &ext_80387_constants_table[i]);
8389 }
8390
8391 ext_80387_constants_init = 1;
8392 }
8393
8394 /* Return non-zero if the constant is something that
8395 can be loaded with a special instruction. */
8396
8397 int
8398 standard_80387_constant_p (rtx x)
8399 {
8400 enum machine_mode mode = GET_MODE (x);
8401
8402 REAL_VALUE_TYPE r;
8403
8404 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8405 return -1;
8406
8407 if (x == CONST0_RTX (mode))
8408 return 1;
8409 if (x == CONST1_RTX (mode))
8410 return 2;
8411
8412 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8413
8414 /* For XFmode constants, try to find a special 80387 instruction when
8415 optimizing for size or on those CPUs that benefit from them. */
8416 if (mode == XFmode
8417 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8418 {
8419 int i;
8420
8421 if (! ext_80387_constants_init)
8422 init_ext_80387_constants ();
8423
8424 for (i = 0; i < 5; i++)
8425 if (real_identical (&r, &ext_80387_constants_table[i]))
8426 return i + 3;
8427 }
8428
8429 /* A load of the constant -0.0 or -1.0 will be split into an
8430 fldz;fchs or fld1;fchs sequence. */
8431 if (real_isnegzero (&r))
8432 return 8;
8433 if (real_identical (&r, &dconstm1))
8434 return 9;
8435
8436 return 0;
8437 }
8438
8439 /* Return the opcode of the special instruction to be used to load
8440 the constant X. */
8441
8442 const char *
8443 standard_80387_constant_opcode (rtx x)
8444 {
8445 switch (standard_80387_constant_p (x))
8446 {
8447 case 1:
8448 return "fldz";
8449 case 2:
8450 return "fld1";
8451 case 3:
8452 return "fldlg2";
8453 case 4:
8454 return "fldln2";
8455 case 5:
8456 return "fldl2e";
8457 case 6:
8458 return "fldl2t";
8459 case 7:
8460 return "fldpi";
8461 case 8:
8462 case 9:
8463 return "#";
8464 default:
8465 gcc_unreachable ();
8466 }
8467 }
8468
8469 /* Return the CONST_DOUBLE representing the 80387 constant that is
8470 loaded by the specified special instruction. The argument IDX
8471 matches the return value from standard_80387_constant_p. */
8472
8473 rtx
8474 standard_80387_constant_rtx (int idx)
8475 {
8476 int i;
8477
8478 if (! ext_80387_constants_init)
8479 init_ext_80387_constants ();
8480
8481 switch (idx)
8482 {
8483 case 3:
8484 case 4:
8485 case 5:
8486 case 6:
8487 case 7:
8488 i = idx - 3;
8489 break;
8490
8491 default:
8492 gcc_unreachable ();
8493 }
8494
8495 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8496 XFmode);
8497 }
8498
8499 /* Return 1 if X is all 0s and 2 if X is all 1s
8500 in a supported SSE/AVX vector mode. */
8501
8502 int
8503 standard_sse_constant_p (rtx x)
8504 {
8505 enum machine_mode mode = GET_MODE (x);
8506
8507 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8508 return 1;
8509 if (vector_all_ones_operand (x, mode))
8510 switch (mode)
8511 {
8512 case V16QImode:
8513 case V8HImode:
8514 case V4SImode:
8515 case V2DImode:
8516 if (TARGET_SSE2)
8517 return 2;
8518 case V32QImode:
8519 case V16HImode:
8520 case V8SImode:
8521 case V4DImode:
8522 if (TARGET_AVX2)
8523 return 2;
8524 default:
8525 break;
8526 }
8527
8528 return 0;
8529 }
8530
8531 /* Return the opcode of the special instruction to be used to load
8532 the constant X. */
8533
8534 const char *
8535 standard_sse_constant_opcode (rtx insn, rtx x)
8536 {
8537 switch (standard_sse_constant_p (x))
8538 {
8539 case 1:
8540 switch (get_attr_mode (insn))
8541 {
8542 case MODE_TI:
8543 return "%vpxor\t%0, %d0";
8544 case MODE_V2DF:
8545 return "%vxorpd\t%0, %d0";
8546 case MODE_V4SF:
8547 return "%vxorps\t%0, %d0";
8548
8549 case MODE_OI:
8550 return "vpxor\t%x0, %x0, %x0";
8551 case MODE_V4DF:
8552 return "vxorpd\t%x0, %x0, %x0";
8553 case MODE_V8SF:
8554 return "vxorps\t%x0, %x0, %x0";
8555
8556 default:
8557 break;
8558 }
8559
8560 case 2:
8561 if (TARGET_AVX)
8562 return "vpcmpeqd\t%0, %0, %0";
8563 else
8564 return "pcmpeqd\t%0, %0";
8565
8566 default:
8567 break;
8568 }
8569 gcc_unreachable ();
8570 }
8571
8572 /* Returns true if OP contains a symbol reference */
8573
8574 bool
8575 symbolic_reference_mentioned_p (rtx op)
8576 {
8577 const char *fmt;
8578 int i;
8579
8580 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8581 return true;
8582
8583 fmt = GET_RTX_FORMAT (GET_CODE (op));
8584 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8585 {
8586 if (fmt[i] == 'E')
8587 {
8588 int j;
8589
8590 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8591 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8592 return true;
8593 }
8594
8595 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8596 return true;
8597 }
8598
8599 return false;
8600 }
8601
8602 /* Return true if it is appropriate to emit `ret' instructions in the
8603 body of a function. Do this only if the epilogue is simple, needing a
8604 couple of insns. Prior to reloading, we can't tell how many registers
8605 must be saved, so return false then. Return false if there is no frame
8606 marker to de-allocate. */
8607
8608 bool
8609 ix86_can_use_return_insn_p (void)
8610 {
8611 struct ix86_frame frame;
8612
8613 if (! reload_completed || frame_pointer_needed)
8614 return 0;
8615
8616 /* Don't allow more than 32k pop, since that's all we can do
8617 with one instruction. */
8618 if (crtl->args.pops_args && crtl->args.size >= 32768)
8619 return 0;
8620
8621 ix86_compute_frame_layout (&frame);
8622 return (frame.stack_pointer_offset == UNITS_PER_WORD
8623 && (frame.nregs + frame.nsseregs) == 0);
8624 }
8625 \f
8626 /* Value should be nonzero if functions must have frame pointers.
8627 Zero means the frame pointer need not be set up (and parms may
8628 be accessed via the stack pointer) in functions that seem suitable. */
8629
8630 static bool
8631 ix86_frame_pointer_required (void)
8632 {
8633 /* If we accessed previous frames, then the generated code expects
8634 to be able to access the saved ebp value in our frame. */
8635 if (cfun->machine->accesses_prev_frame)
8636 return true;
8637
8638 /* Several x86 OSes need a frame pointer for other reasons,
8639 usually pertaining to setjmp. */
8640 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8641 return true;
8642
8643 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8644 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8645 return true;
8646
8647 /* Under Win64 SEH, very large frames need a frame pointer, as the maximum
8648 stack allocation is 4GB. */
8649 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8650 return true;
8651
8652 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8653 turns off the frame pointer by default. Turn it back on now if
8654 we've not got a leaf function. */
8655 if (TARGET_OMIT_LEAF_FRAME_POINTER
8656 && (!crtl->is_leaf
8657 || ix86_current_function_calls_tls_descriptor))
8658 return true;
8659
8660 if (crtl->profile && !flag_fentry)
8661 return true;
8662
8663 return false;
8664 }
8665
8666 /* Record that the current function accesses previous call frames. */
8667
8668 void
8669 ix86_setup_frame_addresses (void)
8670 {
8671 cfun->machine->accesses_prev_frame = 1;
8672 }
8673 \f
8674 #ifndef USE_HIDDEN_LINKONCE
8675 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8676 # define USE_HIDDEN_LINKONCE 1
8677 # else
8678 # define USE_HIDDEN_LINKONCE 0
8679 # endif
8680 #endif
8681
8682 static int pic_labels_used;
8683
8684 /* Fills in the label name that should be used for a pc thunk for
8685 the given register. */
8686
8687 static void
8688 get_pc_thunk_name (char name[32], unsigned int regno)
8689 {
8690 gcc_assert (!TARGET_64BIT);
8691
8692 if (USE_HIDDEN_LINKONCE)
8693 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8694 else
8695 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8696 }
8697
8698
8699 /* This function generates the code for the -fpic thunks that load the
8700 chosen register with the return address of the caller and then return. */
8701
8702 static void
8703 ix86_code_end (void)
8704 {
8705 rtx xops[2];
8706 int regno;
8707
8708 for (regno = AX_REG; regno <= SP_REG; regno++)
8709 {
8710 char name[32];
8711 tree decl;
8712
8713 if (!(pic_labels_used & (1 << regno)))
8714 continue;
8715
8716 get_pc_thunk_name (name, regno);
8717
8718 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8719 get_identifier (name),
8720 build_function_type_list (void_type_node, NULL_TREE));
8721 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8722 NULL_TREE, void_type_node);
8723 TREE_PUBLIC (decl) = 1;
8724 TREE_STATIC (decl) = 1;
8725 DECL_IGNORED_P (decl) = 1;
8726
8727 #if TARGET_MACHO
8728 if (TARGET_MACHO)
8729 {
8730 switch_to_section (darwin_sections[text_coal_section]);
8731 fputs ("\t.weak_definition\t", asm_out_file);
8732 assemble_name (asm_out_file, name);
8733 fputs ("\n\t.private_extern\t", asm_out_file);
8734 assemble_name (asm_out_file, name);
8735 putc ('\n', asm_out_file);
8736 ASM_OUTPUT_LABEL (asm_out_file, name);
8737 DECL_WEAK (decl) = 1;
8738 }
8739 else
8740 #endif
8741 if (USE_HIDDEN_LINKONCE)
8742 {
8743 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8744
8745 targetm.asm_out.unique_section (decl, 0);
8746 switch_to_section (get_named_section (decl, NULL, 0));
8747
8748 targetm.asm_out.globalize_label (asm_out_file, name);
8749 fputs ("\t.hidden\t", asm_out_file);
8750 assemble_name (asm_out_file, name);
8751 putc ('\n', asm_out_file);
8752 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8753 }
8754 else
8755 {
8756 switch_to_section (text_section);
8757 ASM_OUTPUT_LABEL (asm_out_file, name);
8758 }
8759
8760 DECL_INITIAL (decl) = make_node (BLOCK);
8761 current_function_decl = decl;
8762 init_function_start (decl);
8763 first_function_block_is_cold = false;
8764 /* Make sure unwind info is emitted for the thunk if needed. */
8765 final_start_function (emit_barrier (), asm_out_file, 1);
8766
8767 /* Pad stack IP move with 4 instructions (two NOPs count
8768 as one instruction). */
8769 if (TARGET_PAD_SHORT_FUNCTION)
8770 {
8771 int i = 8;
8772
8773 while (i--)
8774 fputs ("\tnop\n", asm_out_file);
8775 }
8776
8777 xops[0] = gen_rtx_REG (Pmode, regno);
8778 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8779 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8780 fputs ("\tret\n", asm_out_file);
8781 final_end_function ();
8782 init_insn_lengths ();
8783 free_after_compilation (cfun);
8784 set_cfun (NULL);
8785 current_function_decl = NULL;
8786 }
8787
8788 if (flag_split_stack)
8789 file_end_indicate_split_stack ();
8790 }
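/* For reference, a sketch (not emitted verbatim by this file) of what one of
   the thunks built above looks like once assembled, here for %ebx:

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret

   i.e. the thunk copies its own return address -- the address of the
   instruction following the call -- into the requested register.  */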
8791
8792 /* Emit code for the SET_GOT patterns. */
8793
8794 const char *
8795 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8796 {
8797 rtx xops[3];
8798
8799 xops[0] = dest;
8800
8801 if (TARGET_VXWORKS_RTP && flag_pic)
8802 {
8803 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8804 xops[2] = gen_rtx_MEM (Pmode,
8805 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8806 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8807
8808 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8809 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8810 an unadorned address. */
8811 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8812 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8813 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8814 return "";
8815 }
8816
8817 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8818
8819 if (!flag_pic)
8820 {
8821 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8822
8823 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8824
8825 #if TARGET_MACHO
8826 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8827 is what will be referenced by the Mach-O PIC subsystem. */
8828 if (!label)
8829 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8830 #endif
8831
8832 targetm.asm_out.internal_label (asm_out_file, "L",
8833 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8834 }
8835 else
8836 {
8837 char name[32];
8838 get_pc_thunk_name (name, REGNO (dest));
8839 pic_labels_used |= 1 << REGNO (dest);
8840
8841 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8842 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8843 output_asm_insn ("call\t%X2", xops);
8844 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8845 is what will be referenced by the Mach-O PIC subsystem. */
8846 #if TARGET_MACHO
8847 if (!label)
8848 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8849 else
8850 targetm.asm_out.internal_label (asm_out_file, "L",
8851 CODE_LABEL_NUMBER (label));
8852 #endif
8853 }
8854
8855 if (!TARGET_MACHO)
8856 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8857
8858 return "";
8859 }
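/* A sketch of the usual ELF -fpic sequence produced above, assuming the PIC
   register is %ebx and the USE_HIDDEN_LINKONCE thunk naming:

       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   The thunk leaves the address of the add instruction in %ebx, and the
   _GLOBAL_OFFSET_TABLE_ operand is resolved PC-relatively, so after the add
   %ebx points at the GOT.  */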
8860
8861 /* Generate a "push" pattern for input ARG. */
8862
8863 static rtx
8864 gen_push (rtx arg)
8865 {
8866 struct machine_function *m = cfun->machine;
8867
8868 if (m->fs.cfa_reg == stack_pointer_rtx)
8869 m->fs.cfa_offset += UNITS_PER_WORD;
8870 m->fs.sp_offset += UNITS_PER_WORD;
8871
8872 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8873 arg = gen_rtx_REG (word_mode, REGNO (arg));
8874
8875 return gen_rtx_SET (VOIDmode,
8876 gen_rtx_MEM (word_mode,
8877 gen_rtx_PRE_DEC (Pmode,
8878 stack_pointer_rtx)),
8879 arg);
8880 }
8881
8882 /* Generate a "pop" pattern for input ARG. */
8883
8884 static rtx
8885 gen_pop (rtx arg)
8886 {
8887 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8888 arg = gen_rtx_REG (word_mode, REGNO (arg));
8889
8890 return gen_rtx_SET (VOIDmode,
8891 arg,
8892 gen_rtx_MEM (word_mode,
8893 gen_rtx_POST_INC (Pmode,
8894 stack_pointer_rtx)));
8895 }
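/* For reference, the patterns built by gen_push and gen_pop above have the
   following shapes (shown for 64-bit, where word_mode and Pmode are DImode):

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))    ;; push
       (set (reg:DI arg) (mem:DI (post_inc:DI (reg:DI sp))))   ;; pop

   The pre_dec/post_inc addresses make the stack-pointer adjustment implicit
   in a single insn, and the CFA/SP offsets tracked in cfun->machine->fs are
   updated by hand in gen_push.  */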
8896
8897 /* Return >= 0 if there is an unused call-clobbered register available
8898 for the entire function. */
8899
8900 static unsigned int
8901 ix86_select_alt_pic_regnum (void)
8902 {
8903 if (crtl->is_leaf
8904 && !crtl->profile
8905 && !ix86_current_function_calls_tls_descriptor)
8906 {
8907 int i, drap;
8908 /* Can't use the same register for both PIC and DRAP. */
8909 if (crtl->drap_reg)
8910 drap = REGNO (crtl->drap_reg);
8911 else
8912 drap = -1;
8913 for (i = 2; i >= 0; --i)
8914 if (i != drap && !df_regs_ever_live_p (i))
8915 return i;
8916 }
8917
8918 return INVALID_REGNUM;
8919 }
8920
8921 /* Return TRUE if we need to save REGNO. */
8922
8923 static bool
8924 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8925 {
8926 if (pic_offset_table_rtx
8927 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8928 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8929 || crtl->profile
8930 || crtl->calls_eh_return
8931 || crtl->uses_const_pool))
8932 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8933
8934 if (crtl->calls_eh_return && maybe_eh_return)
8935 {
8936 unsigned i;
8937 for (i = 0; ; i++)
8938 {
8939 unsigned test = EH_RETURN_DATA_REGNO (i);
8940 if (test == INVALID_REGNUM)
8941 break;
8942 if (test == regno)
8943 return true;
8944 }
8945 }
8946
8947 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8948 return true;
8949
8950 return (df_regs_ever_live_p (regno)
8951 && !call_used_regs[regno]
8952 && !fixed_regs[regno]
8953 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8954 }
8955
8956 /* Return the number of saved general purpose registers. */
8957
8958 static int
8959 ix86_nsaved_regs (void)
8960 {
8961 int nregs = 0;
8962 int regno;
8963
8964 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8965 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8966 nregs ++;
8967 return nregs;
8968 }
8969
8970 /* Return the number of saved SSE registers. */
8971
8972 static int
8973 ix86_nsaved_sseregs (void)
8974 {
8975 int nregs = 0;
8976 int regno;
8977
8978 if (!TARGET_64BIT_MS_ABI)
8979 return 0;
8980 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8981 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8982 nregs ++;
8983 return nregs;
8984 }
8985
8986 /* Given FROM and TO register numbers, say whether this elimination is
8987 allowed. If stack alignment is needed, we can only replace argument
8988 pointer with hard frame pointer, or replace frame pointer with stack
8989 pointer. Otherwise, frame pointer elimination is automatically
8990 handled and all other eliminations are valid. */
8991
8992 static bool
8993 ix86_can_eliminate (const int from, const int to)
8994 {
8995 if (stack_realign_fp)
8996 return ((from == ARG_POINTER_REGNUM
8997 && to == HARD_FRAME_POINTER_REGNUM)
8998 || (from == FRAME_POINTER_REGNUM
8999 && to == STACK_POINTER_REGNUM));
9000 else
9001 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9002 }
9003
9004 /* Return the offset between two registers, one to be eliminated, and the other
9005 its replacement, at the start of a routine. */
9006
9007 HOST_WIDE_INT
9008 ix86_initial_elimination_offset (int from, int to)
9009 {
9010 struct ix86_frame frame;
9011 ix86_compute_frame_layout (&frame);
9012
9013 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9014 return frame.hard_frame_pointer_offset;
9015 else if (from == FRAME_POINTER_REGNUM
9016 && to == HARD_FRAME_POINTER_REGNUM)
9017 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9018 else
9019 {
9020 gcc_assert (to == STACK_POINTER_REGNUM);
9021
9022 if (from == ARG_POINTER_REGNUM)
9023 return frame.stack_pointer_offset;
9024
9025 gcc_assert (from == FRAME_POINTER_REGNUM);
9026 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9027 }
9028 }
9029
9030 /* In a dynamically-aligned function, we can't know the offset from
9031 stack pointer to frame pointer, so we must ensure that setjmp
9032 eliminates fp against the hard fp (%ebp) rather than trying to
9033 index from %esp up to the top of the frame across a gap that is
9034 of unknown (at compile-time) size. */
9035 static rtx
9036 ix86_builtin_setjmp_frame_value (void)
9037 {
9038 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9039 }
9040
9041 /* When using -fsplit-stack, the allocation routines set a field in
9042 the TCB to the bottom of the stack plus this much space, measured
9043 in bytes. */
9044
9045 #define SPLIT_STACK_AVAILABLE 256
9046
9047 /* Fill structure ix86_frame about frame of currently computed function. */
9048
9049 static void
9050 ix86_compute_frame_layout (struct ix86_frame *frame)
9051 {
9052 unsigned HOST_WIDE_INT stack_alignment_needed;
9053 HOST_WIDE_INT offset;
9054 unsigned HOST_WIDE_INT preferred_alignment;
9055 HOST_WIDE_INT size = get_frame_size ();
9056 HOST_WIDE_INT to_allocate;
9057
9058 frame->nregs = ix86_nsaved_regs ();
9059 frame->nsseregs = ix86_nsaved_sseregs ();
9060
9061 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9062 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9063
9064 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9065 except in function prologues and leaf functions. */
9066 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9067 && (!crtl->is_leaf || cfun->calls_alloca != 0
9068 || ix86_current_function_calls_tls_descriptor))
9069 {
9070 preferred_alignment = 16;
9071 stack_alignment_needed = 16;
9072 crtl->preferred_stack_boundary = 128;
9073 crtl->stack_alignment_needed = 128;
9074 }
9075
9076 gcc_assert (!size || stack_alignment_needed);
9077 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9078 gcc_assert (preferred_alignment <= stack_alignment_needed);
9079
9080 /* For SEH we have to limit the amount of code movement into the prologue.
9081 At present we do this via a BLOCKAGE, at which point there's very little
9082 scheduling that can be done, which means that there's very little point
9083 in doing anything except PUSHs. */
9084 if (TARGET_SEH)
9085 cfun->machine->use_fast_prologue_epilogue = false;
9086
9087 /* During reload iteration the number of registers saved can change.
9088 Recompute the value as needed. Do not recompute when the number of
9089 registers didn't change, as reload does multiple calls to the function
9090 and does not expect the decision to change within a single iteration. */
9091 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9092 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9093 {
9094 int count = frame->nregs;
9095 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9096
9097 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9098
9099 /* The fast prologue uses move instead of push to save registers. This
9100 is significantly longer, but also executes faster as modern hardware
9101 can execute the moves in parallel, but can't do that for push/pop.
9102
9103 Be careful about choosing which prologue to emit: when the function takes
9104 many instructions to execute we may use the slow version, as well as when
9105 the function is known to be outside a hot spot (this is known with
9106 feedback only). Weight the size of the function by the number of registers
9107 to save, as it is cheap to use one or two push instructions but very
9108 slow to use many of them. */
9109 if (count)
9110 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9111 if (node->frequency < NODE_FREQUENCY_NORMAL
9112 || (flag_branch_probabilities
9113 && node->frequency < NODE_FREQUENCY_HOT))
9114 cfun->machine->use_fast_prologue_epilogue = false;
9115 else
9116 cfun->machine->use_fast_prologue_epilogue
9117 = !expensive_function_p (count);
9118 }
9119
9120 frame->save_regs_using_mov
9121 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9122 /* If static stack checking is enabled and done with probes,
9123 the registers need to be saved before allocating the frame. */
9124 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9125
9126 /* Skip return address. */
9127 offset = UNITS_PER_WORD;
9128
9129 /* Skip pushed static chain. */
9130 if (ix86_static_chain_on_stack)
9131 offset += UNITS_PER_WORD;
9132
9133 /* Skip saved base pointer. */
9134 if (frame_pointer_needed)
9135 offset += UNITS_PER_WORD;
9136 frame->hfp_save_offset = offset;
9137
9138 /* The traditional frame pointer location is at the top of the frame. */
9139 frame->hard_frame_pointer_offset = offset;
9140
9141 /* Register save area */
9142 offset += frame->nregs * UNITS_PER_WORD;
9143 frame->reg_save_offset = offset;
9144
9145 /* On SEH target, registers are pushed just before the frame pointer
9146 location. */
9147 if (TARGET_SEH)
9148 frame->hard_frame_pointer_offset = offset;
9149
9150 /* Align and set SSE register save area. */
9151 if (frame->nsseregs)
9152 {
9153 /* The only ABI that has saved SSE registers (Win64) also has a
9154 16-byte aligned default stack, and thus we don't need to be
9155 within the re-aligned local stack frame to save them. */
9156 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9157 offset = (offset + 16 - 1) & -16;
9158 offset += frame->nsseregs * 16;
9159 }
9160 frame->sse_reg_save_offset = offset;
9161
9162 /* The re-aligned stack starts here. Values before this point are not
9163 directly comparable with values below this point. In order to make
9164 sure that no value happens to be the same before and after, force
9165 the alignment computation below to add a non-zero value. */
9166 if (stack_realign_fp)
9167 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9168
9169 /* Va-arg area */
9170 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9171 offset += frame->va_arg_size;
9172
9173 /* Align start of frame for local function. */
9174 if (stack_realign_fp
9175 || offset != frame->sse_reg_save_offset
9176 || size != 0
9177 || !crtl->is_leaf
9178 || cfun->calls_alloca
9179 || ix86_current_function_calls_tls_descriptor)
9180 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9181
9182 /* Frame pointer points here. */
9183 frame->frame_pointer_offset = offset;
9184
9185 offset += size;
9186
9187 /* Add outgoing arguments area. Can be skipped if we eliminated
9188 all the function calls as dead code.
9189 Skipping is however impossible when function calls alloca. Alloca
9190 expander assumes that last crtl->outgoing_args_size
9191 of stack frame are unused. */
9192 if (ACCUMULATE_OUTGOING_ARGS
9193 && (!crtl->is_leaf || cfun->calls_alloca
9194 || ix86_current_function_calls_tls_descriptor))
9195 {
9196 offset += crtl->outgoing_args_size;
9197 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9198 }
9199 else
9200 frame->outgoing_arguments_size = 0;
9201
9202 /* Align stack boundary. Only needed if we're calling another function
9203 or using alloca. */
9204 if (!crtl->is_leaf || cfun->calls_alloca
9205 || ix86_current_function_calls_tls_descriptor)
9206 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9207
9208 /* We've reached end of stack frame. */
9209 frame->stack_pointer_offset = offset;
9210
9211 /* Size prologue needs to allocate. */
9212 to_allocate = offset - frame->sse_reg_save_offset;
9213
9214 if ((!to_allocate && frame->nregs <= 1)
9215 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9216 frame->save_regs_using_mov = false;
9217
9218 if (ix86_using_red_zone ()
9219 && crtl->sp_is_unchanging
9220 && crtl->is_leaf
9221 && !ix86_current_function_calls_tls_descriptor)
9222 {
9223 frame->red_zone_size = to_allocate;
9224 if (frame->save_regs_using_mov)
9225 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9226 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9227 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9228 }
9229 else
9230 frame->red_zone_size = 0;
9231 frame->stack_pointer_offset -= frame->red_zone_size;
9232
9233 /* The SEH frame pointer location is near the bottom of the frame.
9234 This is enforced by the fact that the difference between the
9235 stack pointer and the frame pointer is limited to 240 bytes in
9236 the unwind data structure. */
9237 if (TARGET_SEH)
9238 {
9239 HOST_WIDE_INT diff;
9240
9241 /* If we can leave the frame pointer where it is, do so. Also, returns
9242 the establisher frame for __builtin_frame_address (0). */
9243 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9244 if (diff <= SEH_MAX_FRAME_SIZE
9245 && (diff > 240 || (diff & 15) != 0)
9246 && !crtl->accesses_prior_frames)
9247 {
9248 /* Ideally we'd determine what portion of the local stack frame
9249 (within the constraint of the lowest 240) is most heavily used.
9250 But without that complication, simply bias the frame pointer
9251 by 128 bytes so as to maximize the amount of the local stack
9252 frame that is addressable with 8-bit offsets. */
9253 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9254 }
9255 }
9256 }
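/* Rough picture of the layout just computed, from higher to lower addresses
   (all offsets measured from the CFA):

       return address
       [pushed static chain]
       [saved frame pointer]            <- hard_frame_pointer_offset (non-SEH)
       saved general registers          <- reg_save_offset
       saved SSE registers (16-aligned) <- sse_reg_save_offset
       va_arg register save area
       local variables                  <- frame_pointer_offset
       outgoing argument area
                                        <- stack_pointer_offset (less red zone)  */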
9257
9258 /* This is semi-inlined memory_address_length, but simplified
9259 since we know that we're always dealing with reg+offset, and
9260 to avoid having to create and discard all that rtl. */
9261
9262 static inline int
9263 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9264 {
9265 int len = 4;
9266
9267 if (offset == 0)
9268 {
9269 /* EBP and R13 cannot be encoded without an offset. */
9270 len = (regno == BP_REG || regno == R13_REG);
9271 }
9272 else if (IN_RANGE (offset, -128, 127))
9273 len = 1;
9274
9275 /* ESP and R12 must be encoded with a SIB byte. */
9276 if (regno == SP_REG || regno == R12_REG)
9277 len++;
9278
9279 return len;
9280 }
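/* A few worked cases for the length computation above: (%eax) with offset 0
   needs 0 displacement bytes; 0(%ebp) needs 1, because EBP/R13 cannot be
   encoded without a displacement; -8(%esp) needs 2, one displacement byte
   plus the mandatory SIB byte; 512(%ecx) needs 4, a full 32-bit
   displacement.  */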
9281
9282 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9283 The valid base registers are taken from CFUN->MACHINE->FS. */
9284
9285 static rtx
9286 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9287 {
9288 const struct machine_function *m = cfun->machine;
9289 rtx base_reg = NULL;
9290 HOST_WIDE_INT base_offset = 0;
9291
9292 if (m->use_fast_prologue_epilogue)
9293 {
9294 /* Choose the base register most likely to allow the most scheduling
9295 opportunities. Generally FP is valid throughout the function,
9296 while DRAP must be reloaded within the epilogue. But choose either
9297 over the SP due to increased encoding size. */
9298
9299 if (m->fs.fp_valid)
9300 {
9301 base_reg = hard_frame_pointer_rtx;
9302 base_offset = m->fs.fp_offset - cfa_offset;
9303 }
9304 else if (m->fs.drap_valid)
9305 {
9306 base_reg = crtl->drap_reg;
9307 base_offset = 0 - cfa_offset;
9308 }
9309 else if (m->fs.sp_valid)
9310 {
9311 base_reg = stack_pointer_rtx;
9312 base_offset = m->fs.sp_offset - cfa_offset;
9313 }
9314 }
9315 else
9316 {
9317 HOST_WIDE_INT toffset;
9318 int len = 16, tlen;
9319
9320 /* Choose the base register with the smallest address encoding.
9321 With a tie, choose FP > DRAP > SP. */
9322 if (m->fs.sp_valid)
9323 {
9324 base_reg = stack_pointer_rtx;
9325 base_offset = m->fs.sp_offset - cfa_offset;
9326 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9327 }
9328 if (m->fs.drap_valid)
9329 {
9330 toffset = 0 - cfa_offset;
9331 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9332 if (tlen <= len)
9333 {
9334 base_reg = crtl->drap_reg;
9335 base_offset = toffset;
9336 len = tlen;
9337 }
9338 }
9339 if (m->fs.fp_valid)
9340 {
9341 toffset = m->fs.fp_offset - cfa_offset;
9342 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9343 if (tlen <= len)
9344 {
9345 base_reg = hard_frame_pointer_rtx;
9346 base_offset = toffset;
9347 len = tlen;
9348 }
9349 }
9350 }
9351 gcc_assert (base_reg != NULL);
9352
9353 return plus_constant (Pmode, base_reg, base_offset);
9354 }
9355
9356 /* Emit code to save registers in the prologue. */
9357
9358 static void
9359 ix86_emit_save_regs (void)
9360 {
9361 unsigned int regno;
9362 rtx insn;
9363
9364 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9365 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9366 {
9367 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9368 RTX_FRAME_RELATED_P (insn) = 1;
9369 }
9370 }
9371
9372 /* Emit a single register save at CFA - CFA_OFFSET. */
9373
9374 static void
9375 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9376 HOST_WIDE_INT cfa_offset)
9377 {
9378 struct machine_function *m = cfun->machine;
9379 rtx reg = gen_rtx_REG (mode, regno);
9380 rtx mem, addr, base, insn;
9381
9382 addr = choose_baseaddr (cfa_offset);
9383 mem = gen_frame_mem (mode, addr);
9384
9385 /* For SSE saves, we need to indicate the 128-bit alignment. */
9386 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9387
9388 insn = emit_move_insn (mem, reg);
9389 RTX_FRAME_RELATED_P (insn) = 1;
9390
9391 base = addr;
9392 if (GET_CODE (base) == PLUS)
9393 base = XEXP (base, 0);
9394 gcc_checking_assert (REG_P (base));
9395
9396 /* When saving registers into a re-aligned local stack frame, avoid
9397 any tricky guessing by dwarf2out. */
9398 if (m->fs.realigned)
9399 {
9400 gcc_checking_assert (stack_realign_drap);
9401
9402 if (regno == REGNO (crtl->drap_reg))
9403 {
9404 /* A bit of a hack. We force the DRAP register to be saved in
9405 the re-aligned stack frame, which provides us with a copy
9406 of the CFA that will last past the prologue. Install it. */
9407 gcc_checking_assert (cfun->machine->fs.fp_valid);
9408 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9409 cfun->machine->fs.fp_offset - cfa_offset);
9410 mem = gen_rtx_MEM (mode, addr);
9411 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9412 }
9413 else
9414 {
9415 /* The frame pointer is a stable reference within the
9416 aligned frame. Use it. */
9417 gcc_checking_assert (cfun->machine->fs.fp_valid);
9418 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9419 cfun->machine->fs.fp_offset - cfa_offset);
9420 mem = gen_rtx_MEM (mode, addr);
9421 add_reg_note (insn, REG_CFA_EXPRESSION,
9422 gen_rtx_SET (VOIDmode, mem, reg));
9423 }
9424 }
9425
9426 /* The memory may not be relative to the current CFA register,
9427 which means that we may need to generate a new pattern for
9428 use by the unwind info. */
9429 else if (base != m->fs.cfa_reg)
9430 {
9431 addr = plus_constant (Pmode, m->fs.cfa_reg,
9432 m->fs.cfa_offset - cfa_offset);
9433 mem = gen_rtx_MEM (mode, addr);
9434 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9435 }
9436 }
9437
9438 /* Emit code to save registers using MOV insns.
9439 First register is stored at CFA - CFA_OFFSET. */
9440 static void
9441 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9442 {
9443 unsigned int regno;
9444
9445 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9446 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9447 {
9448 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9449 cfa_offset -= UNITS_PER_WORD;
9450 }
9451 }
9452
9453 /* Emit code to save SSE registers using MOV insns.
9454 First register is stored at CFA - CFA_OFFSET. */
9455 static void
9456 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9457 {
9458 unsigned int regno;
9459
9460 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9461 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9462 {
9463 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9464 cfa_offset -= 16;
9465 }
9466 }
9467
9468 static GTY(()) rtx queued_cfa_restores;
9469
9470 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9471 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9472 Don't add the note if the previously saved value will be left untouched
9473 within the stack red zone until return, as unwinders can find the same value
9474 in the register and on the stack. */
9475
9476 static void
9477 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9478 {
9479 if (!crtl->shrink_wrapped
9480 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9481 return;
9482
9483 if (insn)
9484 {
9485 add_reg_note (insn, REG_CFA_RESTORE, reg);
9486 RTX_FRAME_RELATED_P (insn) = 1;
9487 }
9488 else
9489 queued_cfa_restores
9490 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9491 }
9492
9493 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9494
9495 static void
9496 ix86_add_queued_cfa_restore_notes (rtx insn)
9497 {
9498 rtx last;
9499 if (!queued_cfa_restores)
9500 return;
9501 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9502 ;
9503 XEXP (last, 1) = REG_NOTES (insn);
9504 REG_NOTES (insn) = queued_cfa_restores;
9505 queued_cfa_restores = NULL_RTX;
9506 RTX_FRAME_RELATED_P (insn) = 1;
9507 }
9508
9509 /* Expand prologue or epilogue stack adjustment.
9510 The pattern exists to put a dependency on all ebp-based memory accesses.
9511 STYLE should be negative if instructions should be marked as frame related,
9512 zero if %r11 register is live and cannot be freely used and positive
9513 otherwise. */
9514
9515 static void
9516 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9517 int style, bool set_cfa)
9518 {
9519 struct machine_function *m = cfun->machine;
9520 rtx insn;
9521 bool add_frame_related_expr = false;
9522
9523 if (Pmode == SImode)
9524 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9525 else if (x86_64_immediate_operand (offset, DImode))
9526 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9527 else
9528 {
9529 rtx tmp;
9530 /* r11 is used by indirect sibcall return as well, set before the
9531 epilogue and used after the epilogue. */
9532 if (style)
9533 tmp = gen_rtx_REG (DImode, R11_REG);
9534 else
9535 {
9536 gcc_assert (src != hard_frame_pointer_rtx
9537 && dest != hard_frame_pointer_rtx);
9538 tmp = hard_frame_pointer_rtx;
9539 }
9540 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9541 if (style < 0)
9542 add_frame_related_expr = true;
9543
9544 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9545 }
9546
9547 insn = emit_insn (insn);
9548 if (style >= 0)
9549 ix86_add_queued_cfa_restore_notes (insn);
9550
9551 if (set_cfa)
9552 {
9553 rtx r;
9554
9555 gcc_assert (m->fs.cfa_reg == src);
9556 m->fs.cfa_offset += INTVAL (offset);
9557 m->fs.cfa_reg = dest;
9558
9559 r = gen_rtx_PLUS (Pmode, src, offset);
9560 r = gen_rtx_SET (VOIDmode, dest, r);
9561 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9562 RTX_FRAME_RELATED_P (insn) = 1;
9563 }
9564 else if (style < 0)
9565 {
9566 RTX_FRAME_RELATED_P (insn) = 1;
9567 if (add_frame_related_expr)
9568 {
9569 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9570 r = gen_rtx_SET (VOIDmode, dest, r);
9571 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9572 }
9573 }
9574
9575 if (dest == stack_pointer_rtx)
9576 {
9577 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9578 bool valid = m->fs.sp_valid;
9579
9580 if (src == hard_frame_pointer_rtx)
9581 {
9582 valid = m->fs.fp_valid;
9583 ooffset = m->fs.fp_offset;
9584 }
9585 else if (src == crtl->drap_reg)
9586 {
9587 valid = m->fs.drap_valid;
9588 ooffset = 0;
9589 }
9590 else
9591 {
9592 /* Else there are two possibilities: SP itself, which we set
9593 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9594 taken care of by hand along the eh_return path. */
9595 gcc_checking_assert (src == stack_pointer_rtx
9596 || offset == const0_rtx);
9597 }
9598
9599 m->fs.sp_offset = ooffset - INTVAL (offset);
9600 m->fs.sp_valid = valid;
9601 }
9602 }
9603
9604 /* Find an available register to be used as the dynamic realign argument
9605 pointer register. Such a register will be written in the prologue and
9606 used at the beginning of the body, so it must not be
9607 1. a parameter passing register.
9608 2. the GOT pointer.
9609 We reuse the static-chain register if it is available. Otherwise, we
9610 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9611 shorter encoding.
9612
9613 Return: the regno of the chosen register. */
9614
9615 static unsigned int
9616 find_drap_reg (void)
9617 {
9618 tree decl = cfun->decl;
9619
9620 if (TARGET_64BIT)
9621 {
9622 /* Use R13 for a nested function or a function that needs a static chain.
9623 Since a function with a tail call may use any caller-saved
9624 register in the epilogue, DRAP must not use a caller-saved
9625 register in that case. */
9626 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9627 return R13_REG;
9628
9629 return R10_REG;
9630 }
9631 else
9632 {
9633 /* Use DI for a nested function or a function that needs a static chain.
9634 Since a function with a tail call may use any caller-saved
9635 register in the epilogue, DRAP must not use a caller-saved
9636 register in that case. */
9637 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9638 return DI_REG;
9639
9640 /* Reuse static chain register if it isn't used for parameter
9641 passing. */
9642 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9643 {
9644 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9645 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9646 return CX_REG;
9647 }
9648 return DI_REG;
9649 }
9650 }
9651
9652 /* Return minimum incoming stack alignment. */
9653
9654 static unsigned int
9655 ix86_minimum_incoming_stack_boundary (bool sibcall)
9656 {
9657 unsigned int incoming_stack_boundary;
9658
9659 /* Prefer the one specified at command line. */
9660 if (ix86_user_incoming_stack_boundary)
9661 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9662 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9663 if -mstackrealign is used, this is not the sibcall check, and the
9664 estimated stack alignment is 128 bits. */
9665 else if (!sibcall
9666 && !TARGET_64BIT
9667 && ix86_force_align_arg_pointer
9668 && crtl->stack_alignment_estimated == 128)
9669 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9670 else
9671 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9672
9673 /* Incoming stack alignment can be changed on individual functions
9674 via force_align_arg_pointer attribute. We use the smallest
9675 incoming stack boundary. */
9676 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9677 && lookup_attribute (ix86_force_align_arg_pointer_string,
9678 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9679 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9680
9681 /* The incoming stack frame has to be aligned at least at
9682 parm_stack_boundary. */
9683 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9684 incoming_stack_boundary = crtl->parm_stack_boundary;
9685
9686 /* Stack at entrance of main is aligned by runtime. We use the
9687 smallest incoming stack boundary. */
9688 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9689 && DECL_NAME (current_function_decl)
9690 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9691 && DECL_FILE_SCOPE_P (current_function_decl))
9692 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9693
9694 return incoming_stack_boundary;
9695 }
9696
9697 /* Update incoming stack boundary and estimated stack alignment. */
9698
9699 static void
9700 ix86_update_stack_boundary (void)
9701 {
9702 ix86_incoming_stack_boundary
9703 = ix86_minimum_incoming_stack_boundary (false);
9704
9705 /* x86_64 vararg needs 16byte stack alignment for register save
9706 area. */
9707 if (TARGET_64BIT
9708 && cfun->stdarg
9709 && crtl->stack_alignment_estimated < 128)
9710 crtl->stack_alignment_estimated = 128;
9711 }
9712
9713 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9714 needed or an rtx for DRAP otherwise. */
9715
9716 static rtx
9717 ix86_get_drap_rtx (void)
9718 {
9719 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9720 crtl->need_drap = true;
9721
9722 if (stack_realign_drap)
9723 {
9724 /* Assign DRAP to vDRAP and return vDRAP. */
9725 unsigned int regno = find_drap_reg ();
9726 rtx drap_vreg;
9727 rtx arg_ptr;
9728 rtx seq, insn;
9729
9730 arg_ptr = gen_rtx_REG (Pmode, regno);
9731 crtl->drap_reg = arg_ptr;
9732
9733 start_sequence ();
9734 drap_vreg = copy_to_reg (arg_ptr);
9735 seq = get_insns ();
9736 end_sequence ();
9737
9738 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9739 if (!optimize)
9740 {
9741 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9742 RTX_FRAME_RELATED_P (insn) = 1;
9743 }
9744 return drap_vreg;
9745 }
9746 else
9747 return NULL;
9748 }
9749
9750 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9751
9752 static rtx
9753 ix86_internal_arg_pointer (void)
9754 {
9755 return virtual_incoming_args_rtx;
9756 }
9757
9758 struct scratch_reg {
9759 rtx reg;
9760 bool saved;
9761 };
9762
9763 /* Return a short-lived scratch register for use on function entry.
9764 In 32-bit mode, it is valid only after the registers are saved
9765 in the prologue. This register must be released by means of
9766 release_scratch_register_on_entry once it is dead. */
9767
9768 static void
9769 get_scratch_register_on_entry (struct scratch_reg *sr)
9770 {
9771 int regno;
9772
9773 sr->saved = false;
9774
9775 if (TARGET_64BIT)
9776 {
9777 /* We always use R11 in 64-bit mode. */
9778 regno = R11_REG;
9779 }
9780 else
9781 {
9782 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9783 bool fastcall_p
9784 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9785 bool thiscall_p
9786 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9787 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9788 int regparm = ix86_function_regparm (fntype, decl);
9789 int drap_regno
9790 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9791
9792 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9793 for the static chain register. */
9794 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9795 && drap_regno != AX_REG)
9796 regno = AX_REG;
9797 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9798 for the static chain register. */
9799 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9800 regno = AX_REG;
9801 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9802 regno = DX_REG;
9803 /* ecx is the static chain register. */
9804 else if (regparm < 3 && !fastcall_p && !thiscall_p
9805 && !static_chain_p
9806 && drap_regno != CX_REG)
9807 regno = CX_REG;
9808 else if (ix86_save_reg (BX_REG, true))
9809 regno = BX_REG;
9810 /* esi is the static chain register. */
9811 else if (!(regparm == 3 && static_chain_p)
9812 && ix86_save_reg (SI_REG, true))
9813 regno = SI_REG;
9814 else if (ix86_save_reg (DI_REG, true))
9815 regno = DI_REG;
9816 else
9817 {
9818 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9819 sr->saved = true;
9820 }
9821 }
9822
9823 sr->reg = gen_rtx_REG (Pmode, regno);
9824 if (sr->saved)
9825 {
9826 rtx insn = emit_insn (gen_push (sr->reg));
9827 RTX_FRAME_RELATED_P (insn) = 1;
9828 }
9829 }
9830
9831 /* Release a scratch register obtained from the preceding function. */
9832
9833 static void
9834 release_scratch_register_on_entry (struct scratch_reg *sr)
9835 {
9836 if (sr->saved)
9837 {
9838 struct machine_function *m = cfun->machine;
9839 rtx x, insn = emit_insn (gen_pop (sr->reg));
9840
9841 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9842 RTX_FRAME_RELATED_P (insn) = 1;
9843 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9844 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9845 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9846 m->fs.sp_offset -= UNITS_PER_WORD;
9847 }
9848 }
9849
9850 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9851
9852 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9853
9854 static void
9855 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9856 {
9857 /* We skip the probe for the first interval + a small dope of 4 words and
9858 probe that many bytes past the specified size to maintain a protection
9859 area at the bottom of the stack. */
9860 const int dope = 4 * UNITS_PER_WORD;
9861 rtx size_rtx = GEN_INT (size), last;
9862
9863 /* See if we have a constant small number of probes to generate. If so,
9864 that's the easy case. The run-time loop is made up of 11 insns in the
9865 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9866 for n # of intervals. */
9867 if (size <= 5 * PROBE_INTERVAL)
9868 {
9869 HOST_WIDE_INT i, adjust;
9870 bool first_probe = true;
9871
9872 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9873 values of N from 1 until it exceeds SIZE. If only one probe is
9874 needed, this will not generate any code. Then adjust and probe
9875 to PROBE_INTERVAL + SIZE. */
9876 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9877 {
9878 if (first_probe)
9879 {
9880 adjust = 2 * PROBE_INTERVAL + dope;
9881 first_probe = false;
9882 }
9883 else
9884 adjust = PROBE_INTERVAL;
9885
9886 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9887 plus_constant (Pmode, stack_pointer_rtx,
9888 -adjust)));
9889 emit_stack_probe (stack_pointer_rtx);
9890 }
9891
9892 if (first_probe)
9893 adjust = size + PROBE_INTERVAL + dope;
9894 else
9895 adjust = size + PROBE_INTERVAL - i;
9896
9897 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9898 plus_constant (Pmode, stack_pointer_rtx,
9899 -adjust)));
9900 emit_stack_probe (stack_pointer_rtx);
9901
9902 /* Adjust back to account for the additional first interval. */
9903 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9904 plus_constant (Pmode, stack_pointer_rtx,
9905 PROBE_INTERVAL + dope)));
9906 }
9907
9908 /* Otherwise, do the same as above, but in a loop. Note that we must be
9909 extra careful with variables wrapping around because we might be at
9910 the very top (or the very bottom) of the address space and we have
9911 to be able to handle this case properly; in particular, we use an
9912 equality test for the loop condition. */
9913 else
9914 {
9915 HOST_WIDE_INT rounded_size;
9916 struct scratch_reg sr;
9917
9918 get_scratch_register_on_entry (&sr);
9919
9920
9921 /* Step 1: round SIZE to the previous multiple of the interval. */
9922
9923 rounded_size = size & -PROBE_INTERVAL;
9924
9925
9926 /* Step 2: compute initial and final value of the loop counter. */
9927
9928 /* SP = SP_0 + PROBE_INTERVAL. */
9929 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9930 plus_constant (Pmode, stack_pointer_rtx,
9931 - (PROBE_INTERVAL + dope))));
9932
9933 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9934 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9935 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9936 gen_rtx_PLUS (Pmode, sr.reg,
9937 stack_pointer_rtx)));
9938
9939
9940 /* Step 3: the loop
9941
9942 while (SP != LAST_ADDR)
9943 {
9944 SP = SP + PROBE_INTERVAL
9945 probe at SP
9946 }
9947
9948 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9949 values of N from 1 until it is equal to ROUNDED_SIZE. */
9950
9951 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9952
9953
9954 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9955 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9956
9957 if (size != rounded_size)
9958 {
9959 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9960 plus_constant (Pmode, stack_pointer_rtx,
9961 rounded_size - size)));
9962 emit_stack_probe (stack_pointer_rtx);
9963 }
9964
9965 /* Adjust back to account for the additional first interval. */
9966 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9967 plus_constant (Pmode, stack_pointer_rtx,
9968 PROBE_INTERVAL + dope)));
9969
9970 release_scratch_register_on_entry (&sr);
9971 }
9972
9973 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9974
9975 /* Even if the stack pointer isn't the CFA register, we need to correctly
9976 describe the adjustments made to it, in particular differentiate the
9977 frame-related ones from the frame-unrelated ones. */
9978 if (size > 0)
9979 {
9980 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9981 XVECEXP (expr, 0, 0)
9982 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9983 plus_constant (Pmode, stack_pointer_rtx, -size));
9984 XVECEXP (expr, 0, 1)
9985 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9986 plus_constant (Pmode, stack_pointer_rtx,
9987 PROBE_INTERVAL + dope + size));
9988 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9989 RTX_FRAME_RELATED_P (last) = 1;
9990
9991 cfun->machine->fs.sp_offset += size;
9992 }
9993
9994 /* Make sure nothing is scheduled before we are done. */
9995 emit_insn (gen_blockage ());
9996 }
9997
9998 /* Adjust the stack pointer up to REG while probing it. */
9999
10000 const char *
10001 output_adjust_stack_and_probe (rtx reg)
10002 {
10003 static int labelno = 0;
10004 char loop_lab[32], end_lab[32];
10005 rtx xops[2];
10006
10007 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10008 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10009
10010 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10011
10012 /* Jump to END_LAB if SP == LAST_ADDR. */
10013 xops[0] = stack_pointer_rtx;
10014 xops[1] = reg;
10015 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10016 fputs ("\tje\t", asm_out_file);
10017 assemble_name_raw (asm_out_file, end_lab);
10018 fputc ('\n', asm_out_file);
10019
10020 /* SP = SP + PROBE_INTERVAL. */
10021 xops[1] = GEN_INT (PROBE_INTERVAL);
10022 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10023
10024 /* Probe at SP. */
10025 xops[1] = const0_rtx;
10026 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10027
10028 fprintf (asm_out_file, "\tjmp\t");
10029 assemble_name_raw (asm_out_file, loop_lab);
10030 fputc ('\n', asm_out_file);
10031
10032 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10033
10034 return "";
10035 }
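/* A sketch of the 32-bit assembly emitted by the routine above (label names
   abbreviated):

   .LPSRL0:
           cmpl    %reg, %esp
           je      .LPSRE0
           subl    $PROBE_INTERVAL, %esp
           orl     $0, (%esp)
           jmp     .LPSRL0
   .LPSRE0:

   i.e. move the stack pointer down one interval at a time, touching the new
   top of stack after each step, until it reaches the precomputed end
   address.  */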
10036
10037 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10038 inclusive. These are offsets from the current stack pointer. */
10039
10040 static void
10041 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10042 {
10043 /* See if we have a constant small number of probes to generate. If so,
10044 that's the easy case. The run-time loop is made up of 7 insns in the
10045 generic case while the compile-time loop is made up of n insns for n #
10046 of intervals. */
10047 if (size <= 7 * PROBE_INTERVAL)
10048 {
10049 HOST_WIDE_INT i;
10050
10051 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10052 it exceeds SIZE. If only one probe is needed, this will not
10053 generate any code. Then probe at FIRST + SIZE. */
10054 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10055 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10056 -(first + i)));
10057
10058 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10059 -(first + size)));
10060 }
10061
10062 /* Otherwise, do the same as above, but in a loop. Note that we must be
10063 extra careful with variables wrapping around because we might be at
10064 the very top (or the very bottom) of the address space and we have
10065 to be able to handle this case properly; in particular, we use an
10066 equality test for the loop condition. */
10067 else
10068 {
10069 HOST_WIDE_INT rounded_size, last;
10070 struct scratch_reg sr;
10071
10072 get_scratch_register_on_entry (&sr);
10073
10074
10075 /* Step 1: round SIZE to the previous multiple of the interval. */
10076
10077 rounded_size = size & -PROBE_INTERVAL;
10078
10079
10080 /* Step 2: compute initial and final value of the loop counter. */
10081
10082 /* TEST_OFFSET = FIRST. */
10083 emit_move_insn (sr.reg, GEN_INT (-first));
10084
10085 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10086 last = first + rounded_size;
10087
10088
10089 /* Step 3: the loop
10090
10091 while (TEST_ADDR != LAST_ADDR)
10092 {
10093 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10094 probe at TEST_ADDR
10095 }
10096
10097 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10098 until it is equal to ROUNDED_SIZE. */
10099
10100 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10101
10102
10103 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10104 that SIZE is equal to ROUNDED_SIZE. */
10105
10106 if (size != rounded_size)
10107 emit_stack_probe (plus_constant (Pmode,
10108 gen_rtx_PLUS (Pmode,
10109 stack_pointer_rtx,
10110 sr.reg),
10111 rounded_size - size));
10112
10113 release_scratch_register_on_entry (&sr);
10114 }
10115
10116 /* Make sure nothing is scheduled before we are done. */
10117 emit_insn (gen_blockage ());
10118 }
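/* Illustrative sketch in plain C (not compiler code) of the probing scheme
   implemented above: SP is the incoming stack pointer, FIRST and SIZE are
   byte offsets below it, INTERVAL is PROBE_INTERVAL.  Touching one byte per
   interval guarantees the OS guard page is reached before any later access
   could land beyond it.  */
static void
probe_stack_range_sketch (volatile char *sp, long first, long size,
                          long interval)
{
  long i;

  /* Probe at FIRST + N * INTERVAL for N = 1, 2, ... while that stays below
     SIZE, then probe once more exactly at FIRST + SIZE.  */
  for (i = interval; i < size; i += interval)
    sp[-(first + i)] = 0;
  sp[-(first + size)] = 0;
}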
10119
10120 /* Probe a range of stack addresses from REG to END, inclusive. These are
10121 offsets from the current stack pointer. */
10122
10123 const char *
10124 output_probe_stack_range (rtx reg, rtx end)
10125 {
10126 static int labelno = 0;
10127 char loop_lab[32], end_lab[32];
10128 rtx xops[3];
10129
10130 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10131 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10132
10133 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10134
10135 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10136 xops[0] = reg;
10137 xops[1] = end;
10138 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10139 fputs ("\tje\t", asm_out_file);
10140 assemble_name_raw (asm_out_file, end_lab);
10141 fputc ('\n', asm_out_file);
10142
10143 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10144 xops[1] = GEN_INT (PROBE_INTERVAL);
10145 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10146
10147 /* Probe at TEST_ADDR. */
10148 xops[0] = stack_pointer_rtx;
10149 xops[1] = reg;
10150 xops[2] = const0_rtx;
10151 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10152
10153 fprintf (asm_out_file, "\tjmp\t");
10154 assemble_name_raw (asm_out_file, loop_lab);
10155 fputc ('\n', asm_out_file);
10156
10157 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10158
10159 return "";
10160 }
10161
10162 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10163 to be generated in correct form. */
10164 static void
10165 ix86_finalize_stack_realign_flags (void)
10166 {
10167 /* Check if stack realignment is really needed after reload, and
10168 store the result in cfun. */
10169 unsigned int incoming_stack_boundary
10170 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10171 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10172 unsigned int stack_realign = (incoming_stack_boundary
10173 < (crtl->is_leaf
10174 ? crtl->max_used_stack_slot_alignment
10175 : crtl->stack_alignment_needed));
10176
10177 if (crtl->stack_realign_finalized)
10178 {
10179 /* After stack_realign_needed is finalized, we can no longer
10180 change it. */
10181 gcc_assert (crtl->stack_realign_needed == stack_realign);
10182 return;
10183 }
10184
10185 /* If the only reason for frame_pointer_needed is that we conservatively
10186 assumed stack realignment might be needed, but in the end nothing that
10187 needed the stack alignment had been spilled, clear frame_pointer_needed
10188 and say we don't need stack realignment. */
10189 if (stack_realign
10190 && !crtl->need_drap
10191 && frame_pointer_needed
10192 && crtl->is_leaf
10193 && flag_omit_frame_pointer
10194 && crtl->sp_is_unchanging
10195 && !ix86_current_function_calls_tls_descriptor
10196 && !crtl->accesses_prior_frames
10197 && !cfun->calls_alloca
10198 && !crtl->calls_eh_return
10199 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10200 && !ix86_frame_pointer_required ()
10201 && get_frame_size () == 0
10202 && ix86_nsaved_sseregs () == 0
10203 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10204 {
10205 HARD_REG_SET set_up_by_prologue, prologue_used;
10206 basic_block bb;
10207
10208 CLEAR_HARD_REG_SET (prologue_used);
10209 CLEAR_HARD_REG_SET (set_up_by_prologue);
10210 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10211 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10212 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10213 HARD_FRAME_POINTER_REGNUM);
10214 FOR_EACH_BB (bb)
10215 {
10216 rtx insn;
10217 FOR_BB_INSNS (bb, insn)
10218 if (NONDEBUG_INSN_P (insn)
10219 && requires_stack_frame_p (insn, prologue_used,
10220 set_up_by_prologue))
10221 {
10222 crtl->stack_realign_needed = stack_realign;
10223 crtl->stack_realign_finalized = true;
10224 return;
10225 }
10226 }
10227
10228 frame_pointer_needed = false;
10229 stack_realign = false;
10230 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10231 crtl->stack_alignment_needed = incoming_stack_boundary;
10232 crtl->stack_alignment_estimated = incoming_stack_boundary;
10233 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10234 crtl->preferred_stack_boundary = incoming_stack_boundary;
10235 df_finish_pass (true);
10236 df_scan_alloc (NULL);
10237 df_scan_blocks ();
10238 df_compute_regs_ever_live (true);
10239 df_analyze ();
10240 }
10241
10242 crtl->stack_realign_needed = stack_realign;
10243 crtl->stack_realign_finalized = true;
10244 }
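/* A worked example of the decision above (the numbers are illustrative):
   with an incoming stack boundary of 128 bits and a leaf function whose
   largest used stack slot wants 256-bit alignment, STACK_REALIGN starts
   out true; if no insn in the body actually requires a stack frame, the
   block above then clears frame_pointer_needed, drops the realignment,
   and clamps the recorded alignments back to the incoming boundary.  */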
10245
10246 /* Expand the prologue into a bunch of separate insns. */
10247
10248 void
10249 ix86_expand_prologue (void)
10250 {
10251 struct machine_function *m = cfun->machine;
10252 rtx insn, t;
10253 bool pic_reg_used;
10254 struct ix86_frame frame;
10255 HOST_WIDE_INT allocate;
10256 bool int_registers_saved;
10257 bool sse_registers_saved;
10258
10259 ix86_finalize_stack_realign_flags ();
10260
10261 /* DRAP should not coexist with stack_realign_fp */
10262 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10263
10264 memset (&m->fs, 0, sizeof (m->fs));
10265
10266 /* Initialize CFA state for before the prologue. */
10267 m->fs.cfa_reg = stack_pointer_rtx;
10268 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10269
10270 /* Track SP offset to the CFA. We continue tracking this after we've
10271 swapped the CFA register away from SP. In the case of re-alignment
10272 this is fudged; we're interested in offsets within the local frame. */
10273 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10274 m->fs.sp_valid = true;
10275
10276 ix86_compute_frame_layout (&frame);
10277
10278 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10279 {
10280 /* We should have already generated an error for any use of
10281 ms_hook on a nested function. */
10282 gcc_checking_assert (!ix86_static_chain_on_stack);
10283
10284 /* Check whether profiling is active and we should use the
10285 profiling-before-prologue variant. If so, sorry. */
10286 if (crtl->profile && flag_fentry != 0)
10287 sorry ("ms_hook_prologue attribute isn%'t compatible "
10288 "with -mfentry for 32-bit");
10289
10290 /* In ix86_asm_output_function_label we emitted:
10291 8b ff movl.s %edi,%edi
10292 55 push %ebp
10293 8b ec movl.s %esp,%ebp
10294
10295 This matches the hookable function prologue in Win32 API
10296 functions in Microsoft Windows XP Service Pack 2 and newer.
10297 Wine uses this to enable Windows apps to hook the Win32 API
10298 functions provided by Wine.
10299
10300 What that means is that we've already set up the frame pointer. */
10301
10302 if (frame_pointer_needed
10303 && !(crtl->drap_reg && crtl->stack_realign_needed))
10304 {
10305 rtx push, mov;
10306
10307 /* We've decided to use the frame pointer already set up.
10308 Describe this to the unwinder by pretending that both
10309 push and mov insns happen right here.
10310
10311 Putting the unwind info here at the end of the ms_hook
10312 is done so that we can make absolutely certain we get
10313 the required byte sequence at the start of the function,
10314 rather than relying on an assembler that can produce
10315 the exact encoding required.
10316
10317 However it does mean (in the unpatched case) that we have
10318 a 1 insn window where the asynchronous unwind info is
10319 incorrect. However, if we placed the unwind info at
10320 its correct location we would have incorrect unwind info
10321 in the patched case. Which is probably all moot since
10322 I don't expect Wine generates dwarf2 unwind info for the
10323 system libraries that use this feature. */
10324
10325 insn = emit_insn (gen_blockage ());
10326
10327 push = gen_push (hard_frame_pointer_rtx);
10328 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10329 stack_pointer_rtx);
10330 RTX_FRAME_RELATED_P (push) = 1;
10331 RTX_FRAME_RELATED_P (mov) = 1;
10332
10333 RTX_FRAME_RELATED_P (insn) = 1;
10334 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10335 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10336
10337 /* Note that gen_push incremented m->fs.cfa_offset, even
10338 though we didn't emit the push insn here. */
10339 m->fs.cfa_reg = hard_frame_pointer_rtx;
10340 m->fs.fp_offset = m->fs.cfa_offset;
10341 m->fs.fp_valid = true;
10342 }
10343 else
10344 {
10345 /* The frame pointer is not needed so pop %ebp again.
10346 This leaves us with a pristine state. */
10347 emit_insn (gen_pop (hard_frame_pointer_rtx));
10348 }
10349 }
10350
10351 /* The first insn of a function that accepts its static chain on the
10352 stack is to push the register that would be filled in by a direct
10353 call. This insn will be skipped by the trampoline. */
10354 else if (ix86_static_chain_on_stack)
10355 {
10356 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10357 emit_insn (gen_blockage ());
10358
10359 /* We don't want to interpret this push insn as a register save,
10360 only as a stack adjustment. The real copy of the register as
10361 a save will be done later, if needed. */
10362 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10363 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10364 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10365 RTX_FRAME_RELATED_P (insn) = 1;
10366 }
10367
10368 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10369 DRAP is needed and stack realignment is really needed after reload. */
10370 if (stack_realign_drap)
10371 {
10372 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10373
10374 /* Only need to push parameter pointer reg if it is caller saved. */
10375 if (!call_used_regs[REGNO (crtl->drap_reg)])
10376 {
10377 /* Push arg pointer reg */
10378 insn = emit_insn (gen_push (crtl->drap_reg));
10379 RTX_FRAME_RELATED_P (insn) = 1;
10380 }
10381
10382 /* Grab the argument pointer. */
10383 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10384 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10385 RTX_FRAME_RELATED_P (insn) = 1;
10386 m->fs.cfa_reg = crtl->drap_reg;
10387 m->fs.cfa_offset = 0;
10388
10389 /* Align the stack. */
10390 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10391 stack_pointer_rtx,
10392 GEN_INT (-align_bytes)));
10393 RTX_FRAME_RELATED_P (insn) = 1;
10394
10395 /* Replicate the return address on the stack so that the return
10396 address can be reached via the (argp - 1) slot. This is needed
10397 to implement the RETURN_ADDR_RTX macro and the intrinsic function
10398 expand_builtin_return_addr, etc. */
10399 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10400 t = gen_frame_mem (word_mode, t);
10401 insn = emit_insn (gen_push (t));
10402 RTX_FRAME_RELATED_P (insn) = 1;
10403
10404 /* For the purposes of frame and register save area addressing,
10405 we've started over with a new frame. */
10406 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10407 m->fs.realigned = true;
10408 }
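/* Illustrative shape of the DRAP setup just emitted, assuming 64-bit
   code, %r10 as the DRAP register (call-clobbered, so not pushed), no
   static chain on the stack, and a 32-byte requested alignment:

	lea	8(%rsp), %r10		# capture the incoming argument pointer
	and	$-32, %rsp		# align the stack
	pushq	-8(%r10)		# replicate the return address

   The actual register, displacement and alignment all come from crtl.  */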
10409
10410 int_registers_saved = (frame.nregs == 0);
10411 sse_registers_saved = (frame.nsseregs == 0);
10412
10413 if (frame_pointer_needed && !m->fs.fp_valid)
10414 {
10415 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10416 slower on all targets. Also sdb doesn't like it. */
10417 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10418 RTX_FRAME_RELATED_P (insn) = 1;
10419
10420 /* Push registers now, before setting the frame pointer
10421 on SEH target. */
10422 if (!int_registers_saved
10423 && TARGET_SEH
10424 && !frame.save_regs_using_mov)
10425 {
10426 ix86_emit_save_regs ();
10427 int_registers_saved = true;
10428 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10429 }
10430
10431 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10432 {
10433 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10434 RTX_FRAME_RELATED_P (insn) = 1;
10435
10436 if (m->fs.cfa_reg == stack_pointer_rtx)
10437 m->fs.cfa_reg = hard_frame_pointer_rtx;
10438 m->fs.fp_offset = m->fs.sp_offset;
10439 m->fs.fp_valid = true;
10440 }
10441 }
10442
10443 if (!int_registers_saved)
10444 {
10445 /* If saving registers via PUSH, do so now. */
10446 if (!frame.save_regs_using_mov)
10447 {
10448 ix86_emit_save_regs ();
10449 int_registers_saved = true;
10450 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10451 }
10452
10453 /* When using the red zone we may start register saving before allocating
10454 the stack frame, saving one cycle of the prologue. However, avoid
10455 doing this if we have to probe the stack; at least on x86_64 the
10456 stack probe can turn into a call that clobbers a red zone location. */
10457 else if (ix86_using_red_zone ()
10458 && (! TARGET_STACK_PROBE
10459 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10460 {
10461 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10462 int_registers_saved = true;
10463 }
10464 }
10465
10466 if (stack_realign_fp)
10467 {
10468 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10469 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10470
10471 /* The computation of the size of the re-aligned stack frame means
10472 that we must allocate the size of the register save area before
10473 performing the actual alignment. Otherwise we cannot guarantee
10474 that there's enough storage above the realignment point. */
10475 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10476 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10477 GEN_INT (m->fs.sp_offset
10478 - frame.sse_reg_save_offset),
10479 -1, false);
10480
10481 /* Align the stack. */
10482 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10483 stack_pointer_rtx,
10484 GEN_INT (-align_bytes)));
10485
10486 /* For the purposes of register save area addressing, the stack
10487 pointer is no longer valid. As for the value of sp_offset,
10488 see ix86_compute_frame_layout, which we need to match in order
10489 to pass verification of stack_pointer_offset at the end. */
10490 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10491 m->fs.sp_valid = false;
10492 }
10493
10494 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10495
10496 if (flag_stack_usage_info)
10497 {
10498 /* We start to count from ARG_POINTER. */
10499 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10500
10501 /* If it was realigned, take into account the fake frame. */
10502 if (stack_realign_drap)
10503 {
10504 if (ix86_static_chain_on_stack)
10505 stack_size += UNITS_PER_WORD;
10506
10507 if (!call_used_regs[REGNO (crtl->drap_reg)])
10508 stack_size += UNITS_PER_WORD;
10509
10510 /* This over-estimates by 1 minimal-stack-alignment-unit but
10511 mitigates that by counting in the new return address slot. */
10512 current_function_dynamic_stack_size
10513 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10514 }
10515
10516 current_function_static_stack_size = stack_size;
10517 }
10518
10519 /* On the SEH target with a very large frame size, allocate an area to save
10520 SSE registers (as the very large allocation won't be described). */
10521 if (TARGET_SEH
10522 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10523 && !sse_registers_saved)
10524 {
10525 HOST_WIDE_INT sse_size =
10526 frame.sse_reg_save_offset - frame.reg_save_offset;
10527
10528 gcc_assert (int_registers_saved);
10529
10530 /* No need to do stack checking as the area will be immediately
10531 written. */
10532 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10533 GEN_INT (-sse_size), -1,
10534 m->fs.cfa_reg == stack_pointer_rtx);
10535 allocate -= sse_size;
10536 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10537 sse_registers_saved = true;
10538 }
10539
10540 /* The stack has already been decremented by the instruction calling us
10541 so probe if the size is non-negative to preserve the protection area. */
10542 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10543 {
10544 /* We expect the registers to be saved when probes are used. */
10545 gcc_assert (int_registers_saved);
10546
10547 if (STACK_CHECK_MOVING_SP)
10548 {
10549 ix86_adjust_stack_and_probe (allocate);
10550 allocate = 0;
10551 }
10552 else
10553 {
10554 HOST_WIDE_INT size = allocate;
10555
10556 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10557 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10558
10559 if (TARGET_STACK_PROBE)
10560 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10561 else
10562 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10563 }
10564 }
10565
10566 if (allocate == 0)
10567 ;
10568 else if (!ix86_target_stack_probe ()
10569 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10570 {
10571 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10572 GEN_INT (-allocate), -1,
10573 m->fs.cfa_reg == stack_pointer_rtx);
10574 }
10575 else
10576 {
10577 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10578 rtx r10 = NULL;
10579 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10580 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10581 bool eax_live = false;
10582 bool r10_live = false;
10583
10584 if (TARGET_64BIT)
10585 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10586 if (!TARGET_64BIT_MS_ABI)
10587 eax_live = ix86_eax_live_at_start_p ();
10588
10589 /* Note that SEH directives need to continue tracking the stack
10590 pointer even after the frame pointer has been set up. */
10591 if (eax_live)
10592 {
10593 insn = emit_insn (gen_push (eax));
10594 allocate -= UNITS_PER_WORD;
10595 if (sp_is_cfa_reg || TARGET_SEH)
10596 {
10597 if (sp_is_cfa_reg)
10598 m->fs.cfa_offset += UNITS_PER_WORD;
10599 RTX_FRAME_RELATED_P (insn) = 1;
10600 }
10601 }
10602
10603 if (r10_live)
10604 {
10605 r10 = gen_rtx_REG (Pmode, R10_REG);
10606 insn = emit_insn (gen_push (r10));
10607 allocate -= UNITS_PER_WORD;
10608 if (sp_is_cfa_reg || TARGET_SEH)
10609 {
10610 if (sp_is_cfa_reg)
10611 m->fs.cfa_offset += UNITS_PER_WORD;
10612 RTX_FRAME_RELATED_P (insn) = 1;
10613 }
10614 }
10615
10616 emit_move_insn (eax, GEN_INT (allocate));
10617 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10618
10619 /* Use the fact that AX still contains ALLOCATE. */
10620 adjust_stack_insn = (Pmode == DImode
10621 ? gen_pro_epilogue_adjust_stack_di_sub
10622 : gen_pro_epilogue_adjust_stack_si_sub);
10623
10624 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10625 stack_pointer_rtx, eax));
10626
10627 if (sp_is_cfa_reg || TARGET_SEH)
10628 {
10629 if (sp_is_cfa_reg)
10630 m->fs.cfa_offset += allocate;
10631 RTX_FRAME_RELATED_P (insn) = 1;
10632 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10633 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10634 plus_constant (Pmode, stack_pointer_rtx,
10635 -allocate)));
10636 }
10637 m->fs.sp_offset += allocate;
10638
10639 if (r10_live && eax_live)
10640 {
10641 t = choose_baseaddr (m->fs.sp_offset - allocate);
10642 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10643 gen_frame_mem (word_mode, t));
10644 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10645 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10646 gen_frame_mem (word_mode, t));
10647 }
10648 else if (eax_live || r10_live)
10649 {
10650 t = choose_baseaddr (m->fs.sp_offset - allocate);
10651 emit_move_insn (gen_rtx_REG (word_mode,
10652 (eax_live ? AX_REG : R10_REG)),
10653 gen_frame_mem (word_mode, t));
10654 }
10655 }
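/* Illustrative shape of the probed allocation above, assuming 64-bit
   Windows-style code where neither %rax nor %r10 is live on entry
   (otherwise they are pushed around the sequence and reloaded from the
   frame afterwards); the exact worker symbol depends on the target:

	mov	$ALLOCATE, %rax
	call	___chkstk_ms		# the target's allocate-stack worker
	sub	%rax, %rsp		# %rax still holds ALLOCATE
 */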
10656 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10657
10658 /* If we haven't already set up the frame pointer, do so now. */
10659 if (frame_pointer_needed && !m->fs.fp_valid)
10660 {
10661 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10662 GEN_INT (frame.stack_pointer_offset
10663 - frame.hard_frame_pointer_offset));
10664 insn = emit_insn (insn);
10665 RTX_FRAME_RELATED_P (insn) = 1;
10666 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10667
10668 if (m->fs.cfa_reg == stack_pointer_rtx)
10669 m->fs.cfa_reg = hard_frame_pointer_rtx;
10670 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10671 m->fs.fp_valid = true;
10672 }
10673
10674 if (!int_registers_saved)
10675 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10676 if (!sse_registers_saved)
10677 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10678
10679 pic_reg_used = false;
10680 /* We don't use pic-register for pe-coff target. */
10681 if (pic_offset_table_rtx
10682 && !TARGET_PECOFF
10683 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10684 || crtl->profile))
10685 {
10686 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10687
10688 if (alt_pic_reg_used != INVALID_REGNUM)
10689 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10690
10691 pic_reg_used = true;
10692 }
10693
10694 if (pic_reg_used)
10695 {
10696 if (TARGET_64BIT)
10697 {
10698 if (ix86_cmodel == CM_LARGE_PIC)
10699 {
10700 rtx label, tmp_reg;
10701
10702 gcc_assert (Pmode == DImode);
10703 label = gen_label_rtx ();
10704 emit_label (label);
10705 LABEL_PRESERVE_P (label) = 1;
10706 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10707 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10708 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10709 label));
10710 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10711 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10712 pic_offset_table_rtx, tmp_reg));
10713 }
10714 else
10715 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10716 }
10717 else
10718 {
10719 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10720 RTX_FRAME_RELATED_P (insn) = 1;
10721 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10722 }
10723 }
10724
10725 /* In the pic_reg_used case, make sure that the got load isn't deleted
10726 when mcount needs it. Blockage to avoid call movement across mcount
10727 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10728 note. */
10729 if (crtl->profile && !flag_fentry && pic_reg_used)
10730 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10731
10732 if (crtl->drap_reg && !crtl->stack_realign_needed)
10733 {
10734 /* The vDRAP is set up, but after reload it turns out stack realignment
10735 isn't necessary; here we emit prologue code to set up DRAP
10736 without the stack realignment adjustment. */
10737 t = choose_baseaddr (0);
10738 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10739 }
10740
10741 /* Prevent instructions from being scheduled into the register save push
10742 sequence when access to the redzone area is done through the frame pointer.
10743 The offset between the frame pointer and the stack pointer is calculated
10744 relative to the value of the stack pointer at the end of the function
10745 prologue, and moving instructions that access the redzone area via the frame
10746 pointer inside the push sequence violates this assumption. */
10747 if (frame_pointer_needed && frame.red_zone_size)
10748 emit_insn (gen_memory_blockage ());
10749
10750 /* Emit cld instruction if stringops are used in the function. */
10751 if (TARGET_CLD && ix86_current_function_needs_cld)
10752 emit_insn (gen_cld ());
10753
10754 /* SEH requires that the prologue end within 256 bytes of the start of
10755 the function. Prevent instruction schedules that would extend that.
10756 Further, prevent alloca modifications to the stack pointer from being
10757 combined with prologue modifications. */
10758 if (TARGET_SEH)
10759 emit_insn (gen_prologue_use (stack_pointer_rtx));
10760 }
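/* Putting the pieces together, a typical 64-bit prologue produced by the
   function above, for a frame-pointer-using function that saves one
   call-saved register with pushes, is roughly (illustrative only):

	push	%rbp
	mov	%rsp, %rbp
	push	%rbx
	sub	$FRAME_SIZE, %rsp

   DRAP, SEH, ms_hook and stack-probing variants add to or reorder this
   sequence as described in the comments above.  */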
10761
10762 /* Emit code to restore REG using a POP insn. */
10763
10764 static void
10765 ix86_emit_restore_reg_using_pop (rtx reg)
10766 {
10767 struct machine_function *m = cfun->machine;
10768 rtx insn = emit_insn (gen_pop (reg));
10769
10770 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10771 m->fs.sp_offset -= UNITS_PER_WORD;
10772
10773 if (m->fs.cfa_reg == crtl->drap_reg
10774 && REGNO (reg) == REGNO (crtl->drap_reg))
10775 {
10776 /* Previously we'd represented the CFA as an expression
10777 like *(%ebp - 8). We've just popped that value from
10778 the stack, which means we need to reset the CFA to
10779 the drap register. This will remain until we restore
10780 the stack pointer. */
10781 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10782 RTX_FRAME_RELATED_P (insn) = 1;
10783
10784 /* This means that the DRAP register is valid for addressing too. */
10785 m->fs.drap_valid = true;
10786 return;
10787 }
10788
10789 if (m->fs.cfa_reg == stack_pointer_rtx)
10790 {
10791 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10792 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10793 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10794 RTX_FRAME_RELATED_P (insn) = 1;
10795
10796 m->fs.cfa_offset -= UNITS_PER_WORD;
10797 }
10798
10799 /* When the frame pointer is the CFA, and we pop it, we are
10800 swapping back to the stack pointer as the CFA. This happens
10801 for stack frames that don't allocate other data, so we assume
10802 the stack pointer is now pointing at the return address, i.e.
10803 the function entry state, which makes the offset be 1 word. */
10804 if (reg == hard_frame_pointer_rtx)
10805 {
10806 m->fs.fp_valid = false;
10807 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10808 {
10809 m->fs.cfa_reg = stack_pointer_rtx;
10810 m->fs.cfa_offset -= UNITS_PER_WORD;
10811
10812 add_reg_note (insn, REG_CFA_DEF_CFA,
10813 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10814 GEN_INT (m->fs.cfa_offset)));
10815 RTX_FRAME_RELATED_P (insn) = 1;
10816 }
10817 }
10818 }
10819
10820 /* Emit code to restore saved registers using POP insns. */
10821
10822 static void
10823 ix86_emit_restore_regs_using_pop (void)
10824 {
10825 unsigned int regno;
10826
10827 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10828 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10829 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10830 }
10831
10832 /* Emit code and notes for the LEAVE instruction. */
10833
10834 static void
10835 ix86_emit_leave (void)
10836 {
10837 struct machine_function *m = cfun->machine;
10838 rtx insn = emit_insn (ix86_gen_leave ());
10839
10840 ix86_add_queued_cfa_restore_notes (insn);
10841
10842 gcc_assert (m->fs.fp_valid);
10843 m->fs.sp_valid = true;
10844 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10845 m->fs.fp_valid = false;
10846
10847 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10848 {
10849 m->fs.cfa_reg = stack_pointer_rtx;
10850 m->fs.cfa_offset = m->fs.sp_offset;
10851
10852 add_reg_note (insn, REG_CFA_DEF_CFA,
10853 plus_constant (Pmode, stack_pointer_rtx,
10854 m->fs.sp_offset));
10855 RTX_FRAME_RELATED_P (insn) = 1;
10856 }
10857 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10858 m->fs.fp_offset);
10859 }
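/* Reminder for ix86_emit_leave above: LEAVE is equivalent to
   "mov %ebp, %esp; pop %ebp" (or the 64-bit forms), which is why the
   stack pointer becomes valid again at fp_offset minus one word while
   the frame pointer ceases to be valid.  */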
10860
10861 /* Emit code to restore saved registers using MOV insns.
10862 First register is restored from CFA - CFA_OFFSET. */
10863 static void
10864 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10865 bool maybe_eh_return)
10866 {
10867 struct machine_function *m = cfun->machine;
10868 unsigned int regno;
10869
10870 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10871 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10872 {
10873 rtx reg = gen_rtx_REG (word_mode, regno);
10874 rtx insn, mem;
10875
10876 mem = choose_baseaddr (cfa_offset);
10877 mem = gen_frame_mem (word_mode, mem);
10878 insn = emit_move_insn (reg, mem);
10879
10880 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10881 {
10882 /* Previously we'd represented the CFA as an expression
10883 like *(%ebp - 8). We've just popped that value from
10884 the stack, which means we need to reset the CFA to
10885 the drap register. This will remain until we restore
10886 the stack pointer. */
10887 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10888 RTX_FRAME_RELATED_P (insn) = 1;
10889
10890 /* This means that the DRAP register is valid for addressing. */
10891 m->fs.drap_valid = true;
10892 }
10893 else
10894 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10895
10896 cfa_offset -= UNITS_PER_WORD;
10897 }
10898 }
10899
10900 /* Emit code to restore saved registers using MOV insns.
10901 First register is restored from CFA - CFA_OFFSET. */
10902 static void
10903 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10904 bool maybe_eh_return)
10905 {
10906 unsigned int regno;
10907
10908 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10909 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10910 {
10911 rtx reg = gen_rtx_REG (V4SFmode, regno);
10912 rtx mem;
10913
10914 mem = choose_baseaddr (cfa_offset);
10915 mem = gen_rtx_MEM (V4SFmode, mem);
10916 set_mem_align (mem, 128);
10917 emit_move_insn (reg, mem);
10918
10919 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10920
10921 cfa_offset -= 16;
10922 }
10923 }
10924
10925 /* Restore function stack, frame, and registers. */
10926
10927 void
10928 ix86_expand_epilogue (int style)
10929 {
10930 struct machine_function *m = cfun->machine;
10931 struct machine_frame_state frame_state_save = m->fs;
10932 struct ix86_frame frame;
10933 bool restore_regs_via_mov;
10934 bool using_drap;
10935
10936 ix86_finalize_stack_realign_flags ();
10937 ix86_compute_frame_layout (&frame);
10938
10939 m->fs.sp_valid = (!frame_pointer_needed
10940 || (crtl->sp_is_unchanging
10941 && !stack_realign_fp));
10942 gcc_assert (!m->fs.sp_valid
10943 || m->fs.sp_offset == frame.stack_pointer_offset);
10944
10945 /* The FP must be valid if the frame pointer is present. */
10946 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10947 gcc_assert (!m->fs.fp_valid
10948 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10949
10950 /* We must have *some* valid pointer to the stack frame. */
10951 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10952
10953 /* The DRAP is never valid at this point. */
10954 gcc_assert (!m->fs.drap_valid);
10955
10956 /* See the comment about red zone and frame
10957 pointer usage in ix86_expand_prologue. */
10958 if (frame_pointer_needed && frame.red_zone_size)
10959 emit_insn (gen_memory_blockage ());
10960
10961 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10962 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10963
10964 /* Determine the CFA offset of the end of the red-zone. */
10965 m->fs.red_zone_offset = 0;
10966 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10967 {
10968 /* The red-zone begins below the return address. */
10969 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10970
10971 /* When the register save area is in the aligned portion of
10972 the stack, determine the maximum runtime displacement that
10973 matches up with the aligned frame. */
10974 if (stack_realign_drap)
10975 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10976 + UNITS_PER_WORD);
10977 }
10978
10979 /* Special care must be taken for the normal return case of a function
10980 using eh_return: the eax and edx registers are marked as saved, but
10981 not restored along this path. Adjust the save location to match. */
10982 if (crtl->calls_eh_return && style != 2)
10983 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10984
10985 /* EH_RETURN requires the use of moves to function properly. */
10986 if (crtl->calls_eh_return)
10987 restore_regs_via_mov = true;
10988 /* SEH requires the use of pops to identify the epilogue. */
10989 else if (TARGET_SEH)
10990 restore_regs_via_mov = false;
10991 /* If we're only restoring one register and sp is not valid then
10992 use a move instruction to restore the register, since it's
10993 less work than reloading sp and popping the register. */
10994 else if (!m->fs.sp_valid && frame.nregs <= 1)
10995 restore_regs_via_mov = true;
10996 else if (TARGET_EPILOGUE_USING_MOVE
10997 && cfun->machine->use_fast_prologue_epilogue
10998 && (frame.nregs > 1
10999 || m->fs.sp_offset != frame.reg_save_offset))
11000 restore_regs_via_mov = true;
11001 else if (frame_pointer_needed
11002 && !frame.nregs
11003 && m->fs.sp_offset != frame.reg_save_offset)
11004 restore_regs_via_mov = true;
11005 else if (frame_pointer_needed
11006 && TARGET_USE_LEAVE
11007 && cfun->machine->use_fast_prologue_epilogue
11008 && frame.nregs == 1)
11009 restore_regs_via_mov = true;
11010 else
11011 restore_regs_via_mov = false;
11012
11013 if (restore_regs_via_mov || frame.nsseregs)
11014 {
11015 /* Ensure that the entire register save area is addressable via
11016 the stack pointer, if we will restore via sp. */
11017 if (TARGET_64BIT
11018 && m->fs.sp_offset > 0x7fffffff
11019 && !(m->fs.fp_valid || m->fs.drap_valid)
11020 && (frame.nsseregs + frame.nregs) != 0)
11021 {
11022 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11023 GEN_INT (m->fs.sp_offset
11024 - frame.sse_reg_save_offset),
11025 style,
11026 m->fs.cfa_reg == stack_pointer_rtx);
11027 }
11028 }
11029
11030 /* If there are any SSE registers to restore, then we have to do it
11031 via moves, since there's obviously no pop for SSE regs. */
11032 if (frame.nsseregs)
11033 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11034 style == 2);
11035
11036 if (restore_regs_via_mov)
11037 {
11038 rtx t;
11039
11040 if (frame.nregs)
11041 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11042
11043 /* eh_return epilogues need %ecx added to the stack pointer. */
11044 if (style == 2)
11045 {
11046 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11047
11048 /* Stack align doesn't work with eh_return. */
11049 gcc_assert (!stack_realign_drap);
11050 /* Neither do regparm nested functions. */
11051 gcc_assert (!ix86_static_chain_on_stack);
11052
11053 if (frame_pointer_needed)
11054 {
11055 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11056 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11057 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11058
11059 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11060 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11061
11062 /* Note that we use SA as a temporary CFA, as the return
11063 address is at the proper place relative to it. We
11064 pretend this happens at the FP restore insn because
11065 prior to this insn the FP would be stored at the wrong
11066 offset relative to SA, and after this insn we have no
11067 other reasonable register to use for the CFA. We don't
11068 bother resetting the CFA to the SP for the duration of
11069 the return insn. */
11070 add_reg_note (insn, REG_CFA_DEF_CFA,
11071 plus_constant (Pmode, sa, UNITS_PER_WORD));
11072 ix86_add_queued_cfa_restore_notes (insn);
11073 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11074 RTX_FRAME_RELATED_P (insn) = 1;
11075
11076 m->fs.cfa_reg = sa;
11077 m->fs.cfa_offset = UNITS_PER_WORD;
11078 m->fs.fp_valid = false;
11079
11080 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11081 const0_rtx, style, false);
11082 }
11083 else
11084 {
11085 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11086 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11087 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11088 ix86_add_queued_cfa_restore_notes (insn);
11089
11090 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11091 if (m->fs.cfa_offset != UNITS_PER_WORD)
11092 {
11093 m->fs.cfa_offset = UNITS_PER_WORD;
11094 add_reg_note (insn, REG_CFA_DEF_CFA,
11095 plus_constant (Pmode, stack_pointer_rtx,
11096 UNITS_PER_WORD));
11097 RTX_FRAME_RELATED_P (insn) = 1;
11098 }
11099 }
11100 m->fs.sp_offset = UNITS_PER_WORD;
11101 m->fs.sp_valid = true;
11102 }
11103 }
11104 else
11105 {
11106 /* SEH requires that the function end with (1) a stack adjustment
11107 if necessary, (2) a sequence of pops, and (3) a return or
11108 jump instruction. Prevent insns from the function body from
11109 being scheduled into this sequence. */
11110 if (TARGET_SEH)
11111 {
11112 /* Prevent a catch region from being adjacent to the standard
11113 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11114 several other flags that would be interesting to test are
11115 set up yet. */
11116 if (flag_non_call_exceptions)
11117 emit_insn (gen_nops (const1_rtx));
11118 else
11119 emit_insn (gen_blockage ());
11120 }
11121
11122 /* First step is to deallocate the stack frame so that we can
11123 pop the registers. Also do it on the SEH target for very large
11124 frames, as the emitted instructions aren't allowed by the ABI in
11125 epilogues. */
11126 if (!m->fs.sp_valid
11127 || (TARGET_SEH
11128 && (m->fs.sp_offset - frame.reg_save_offset
11129 >= SEH_MAX_FRAME_SIZE)))
11130 {
11131 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11132 GEN_INT (m->fs.fp_offset
11133 - frame.reg_save_offset),
11134 style, false);
11135 }
11136 else if (m->fs.sp_offset != frame.reg_save_offset)
11137 {
11138 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11139 GEN_INT (m->fs.sp_offset
11140 - frame.reg_save_offset),
11141 style,
11142 m->fs.cfa_reg == stack_pointer_rtx);
11143 }
11144
11145 ix86_emit_restore_regs_using_pop ();
11146 }
11147
11148 /* If we used a frame pointer and haven't already got rid of it,
11149 then do so now. */
11150 if (m->fs.fp_valid)
11151 {
11152 /* If the stack pointer is valid and pointing at the frame
11153 pointer store address, then we only need a pop. */
11154 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11155 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11156 /* Leave results in shorter dependency chains on CPUs that are
11157 able to grok it fast. */
11158 else if (TARGET_USE_LEAVE
11159 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11160 || !cfun->machine->use_fast_prologue_epilogue)
11161 ix86_emit_leave ();
11162 else
11163 {
11164 pro_epilogue_adjust_stack (stack_pointer_rtx,
11165 hard_frame_pointer_rtx,
11166 const0_rtx, style, !using_drap);
11167 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11168 }
11169 }
11170
11171 if (using_drap)
11172 {
11173 int param_ptr_offset = UNITS_PER_WORD;
11174 rtx insn;
11175
11176 gcc_assert (stack_realign_drap);
11177
11178 if (ix86_static_chain_on_stack)
11179 param_ptr_offset += UNITS_PER_WORD;
11180 if (!call_used_regs[REGNO (crtl->drap_reg)])
11181 param_ptr_offset += UNITS_PER_WORD;
11182
11183 insn = emit_insn (gen_rtx_SET
11184 (VOIDmode, stack_pointer_rtx,
11185 gen_rtx_PLUS (Pmode,
11186 crtl->drap_reg,
11187 GEN_INT (-param_ptr_offset))));
11188 m->fs.cfa_reg = stack_pointer_rtx;
11189 m->fs.cfa_offset = param_ptr_offset;
11190 m->fs.sp_offset = param_ptr_offset;
11191 m->fs.realigned = false;
11192
11193 add_reg_note (insn, REG_CFA_DEF_CFA,
11194 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11195 GEN_INT (param_ptr_offset)));
11196 RTX_FRAME_RELATED_P (insn) = 1;
11197
11198 if (!call_used_regs[REGNO (crtl->drap_reg)])
11199 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11200 }
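/* Illustrative shape of the DRAP tear-down above, assuming 64-bit code
   with %r10 as a call-clobbered DRAP register and no static chain on
   the stack (each extra saved word grows the displacement, and a
   call-saved DRAP register is additionally popped):

	lea	-8(%r10), %rsp		# %rsp now points at the return address

   after which the normal return sequence below runs.  */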
11201
11202 /* At this point the stack pointer must be valid, and we must have
11203 restored all of the registers. We may not have deallocated the
11204 entire stack frame. We've delayed this until now because it may
11205 be possible to merge the local stack deallocation with the
11206 deallocation forced by ix86_static_chain_on_stack. */
11207 gcc_assert (m->fs.sp_valid);
11208 gcc_assert (!m->fs.fp_valid);
11209 gcc_assert (!m->fs.realigned);
11210 if (m->fs.sp_offset != UNITS_PER_WORD)
11211 {
11212 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11213 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11214 style, true);
11215 }
11216 else
11217 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11218
11219 /* Sibcall epilogues don't want a return instruction. */
11220 if (style == 0)
11221 {
11222 m->fs = frame_state_save;
11223 return;
11224 }
11225
11226 if (crtl->args.pops_args && crtl->args.size)
11227 {
11228 rtx popc = GEN_INT (crtl->args.pops_args);
11229
11230 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11231 address, do an explicit add, and jump indirectly to the caller. */
11232
11233 if (crtl->args.pops_args >= 65536)
11234 {
11235 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11236 rtx insn;
11237
11238 /* There is no "pascal" calling convention in any 64-bit ABI. */
11239 gcc_assert (!TARGET_64BIT);
11240
11241 insn = emit_insn (gen_pop (ecx));
11242 m->fs.cfa_offset -= UNITS_PER_WORD;
11243 m->fs.sp_offset -= UNITS_PER_WORD;
11244
11245 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11246 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11247 add_reg_note (insn, REG_CFA_REGISTER,
11248 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11249 RTX_FRAME_RELATED_P (insn) = 1;
11250
11251 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11252 popc, -1, true);
11253 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11254 }
11255 else
11256 emit_jump_insn (gen_simple_return_pop_internal (popc));
11257 }
11258 else
11259 emit_jump_insn (gen_simple_return_internal ());
11260
11261 /* Restore the state back to the state from the prologue,
11262 so that it's correct for the next epilogue. */
11263 m->fs = frame_state_save;
11264 }
11265
11266 /* Reset from the function's potential modifications. */
11267
11268 static void
11269 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11270 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11271 {
11272 if (pic_offset_table_rtx)
11273 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11274 #if TARGET_MACHO
11275 /* Mach-O doesn't support labels at the end of objects, so if
11276 it looks like we might want one, insert a NOP. */
11277 {
11278 rtx insn = get_last_insn ();
11279 rtx deleted_debug_label = NULL_RTX;
11280 while (insn
11281 && NOTE_P (insn)
11282 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11283 {
11284 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11285 notes only, instead set their CODE_LABEL_NUMBER to -1,
11286 otherwise there would be code generation differences
11287 in between -g and -g0. */
11288 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11289 deleted_debug_label = insn;
11290 insn = PREV_INSN (insn);
11291 }
11292 if (insn
11293 && (LABEL_P (insn)
11294 || (NOTE_P (insn)
11295 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11296 fputs ("\tnop\n", file);
11297 else if (deleted_debug_label)
11298 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11299 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11300 CODE_LABEL_NUMBER (insn) = -1;
11301 }
11302 #endif
11303
11304 }
11305
11306 /* Return a scratch register to use in the split stack prologue. The
11307 split stack prologue is used for -fsplit-stack. It consists of the first
11308 instructions in the function, even before the regular prologue.
11309 The scratch register can be any caller-saved register which is not
11310 used for parameters or for the static chain. */
11311
11312 static unsigned int
11313 split_stack_prologue_scratch_regno (void)
11314 {
11315 if (TARGET_64BIT)
11316 return R11_REG;
11317 else
11318 {
11319 bool is_fastcall, is_thiscall;
11320 int regparm;
11321
11322 is_fastcall = (lookup_attribute ("fastcall",
11323 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11324 != NULL);
11325 is_thiscall = (lookup_attribute ("thiscall",
11326 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11327 != NULL);
11328 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11329
11330 if (is_fastcall)
11331 {
11332 if (DECL_STATIC_CHAIN (cfun->decl))
11333 {
11334 sorry ("-fsplit-stack does not support fastcall with "
11335 "nested function");
11336 return INVALID_REGNUM;
11337 }
11338 return AX_REG;
11339 }
11340 else if (is_thiscall)
11341 {
11342 if (!DECL_STATIC_CHAIN (cfun->decl))
11343 return DX_REG;
11344 return AX_REG;
11345 }
11346 else if (regparm < 3)
11347 {
11348 if (!DECL_STATIC_CHAIN (cfun->decl))
11349 return CX_REG;
11350 else
11351 {
11352 if (regparm >= 2)
11353 {
11354 sorry ("-fsplit-stack does not support 2 register "
11355 "parameters for a nested function");
11356 return INVALID_REGNUM;
11357 }
11358 return DX_REG;
11359 }
11360 }
11361 else
11362 {
11363 /* FIXME: We could make this work by pushing a register
11364 around the addition and comparison. */
11365 sorry ("-fsplit-stack does not support 3 register parameters");
11366 return INVALID_REGNUM;
11367 }
11368 }
11369 }
11370
11371 /* A SYMBOL_REF for the function which allocates new stackspace for
11372 -fsplit-stack. */
11373
11374 static GTY(()) rtx split_stack_fn;
11375
11376 /* A SYMBOL_REF for the more stack function when using the large
11377 model. */
11378
11379 static GTY(()) rtx split_stack_fn_large;
11380
11381 /* Handle -fsplit-stack. These are the first instructions in the
11382 function, even before the regular prologue. */
11383
11384 void
11385 ix86_expand_split_stack_prologue (void)
11386 {
11387 struct ix86_frame frame;
11388 HOST_WIDE_INT allocate;
11389 unsigned HOST_WIDE_INT args_size;
11390 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11391 rtx scratch_reg = NULL_RTX;
11392 rtx varargs_label = NULL_RTX;
11393 rtx fn;
11394
11395 gcc_assert (flag_split_stack && reload_completed);
11396
11397 ix86_finalize_stack_realign_flags ();
11398 ix86_compute_frame_layout (&frame);
11399 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11400
11401 /* This is the label we will branch to if we have enough stack
11402 space. We expect the basic block reordering pass to reverse this
11403 branch if optimizing, so that we branch in the unlikely case. */
11404 label = gen_label_rtx ();
11405
11406 /* We need to compare the stack pointer minus the frame size with
11407 the stack boundary in the TCB. The stack boundary always gives
11408 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11409 can compare directly. Otherwise we need to do an addition. */
11410
11411 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11412 UNSPEC_STACK_CHECK);
11413 limit = gen_rtx_CONST (Pmode, limit);
11414 limit = gen_rtx_MEM (Pmode, limit);
11415 if (allocate < SPLIT_STACK_AVAILABLE)
11416 current = stack_pointer_rtx;
11417 else
11418 {
11419 unsigned int scratch_regno;
11420 rtx offset;
11421
11422 /* We need a scratch register to hold the stack pointer minus
11423 the required frame size. Since this is the very start of the
11424 function, the scratch register can be any caller-saved
11425 register which is not used for parameters. */
11426 offset = GEN_INT (- allocate);
11427 scratch_regno = split_stack_prologue_scratch_regno ();
11428 if (scratch_regno == INVALID_REGNUM)
11429 return;
11430 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11431 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11432 {
11433 /* We don't use ix86_gen_add3 in this case because it will
11434 want to split to lea, but when not optimizing the insn
11435 will not be split after this point. */
11436 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11437 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11438 offset)));
11439 }
11440 else
11441 {
11442 emit_move_insn (scratch_reg, offset);
11443 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11444 stack_pointer_rtx));
11445 }
11446 current = scratch_reg;
11447 }
11448
11449 ix86_expand_branch (GEU, current, limit, label);
11450 jump_insn = get_last_insn ();
11451 JUMP_LABEL (jump_insn) = label;
11452
11453 /* Mark the jump as very likely to be taken. */
11454 add_reg_note (jump_insn, REG_BR_PROB,
11455 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11456
11457 if (split_stack_fn == NULL_RTX)
11458 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11459 fn = split_stack_fn;
11460
11461 /* Get more stack space. We pass in the desired stack space and the
11462 size of the arguments to copy to the new stack. In 32-bit mode
11463 we push the parameters; __morestack will return on a new stack
11464 anyhow. In 64-bit mode we pass the parameters in r10 and
11465 r11. */
11466 allocate_rtx = GEN_INT (allocate);
11467 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11468 call_fusage = NULL_RTX;
11469 if (TARGET_64BIT)
11470 {
11471 rtx reg10, reg11;
11472
11473 reg10 = gen_rtx_REG (Pmode, R10_REG);
11474 reg11 = gen_rtx_REG (Pmode, R11_REG);
11475
11476 /* If this function uses a static chain, it will be in %r10.
11477 Preserve it across the call to __morestack. */
11478 if (DECL_STATIC_CHAIN (cfun->decl))
11479 {
11480 rtx rax;
11481
11482 rax = gen_rtx_REG (word_mode, AX_REG);
11483 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11484 use_reg (&call_fusage, rax);
11485 }
11486
11487 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11488 && !TARGET_PECOFF)
11489 {
11490 HOST_WIDE_INT argval;
11491
11492 gcc_assert (Pmode == DImode);
11493 /* When using the large model we need to load the address
11494 into a register, and we've run out of registers. So we
11495 switch to a different calling convention, and we call a
11496 different function: __morestack_large. We pass the
11497 argument size in the upper 32 bits of r10 and pass the
11498 frame size in the lower 32 bits. */
11499 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11500 gcc_assert ((args_size & 0xffffffff) == args_size);
11501
11502 if (split_stack_fn_large == NULL_RTX)
11503 split_stack_fn_large =
11504 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11505
11506 if (ix86_cmodel == CM_LARGE_PIC)
11507 {
11508 rtx label, x;
11509
11510 label = gen_label_rtx ();
11511 emit_label (label);
11512 LABEL_PRESERVE_P (label) = 1;
11513 emit_insn (gen_set_rip_rex64 (reg10, label));
11514 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11515 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11516 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11517 UNSPEC_GOT);
11518 x = gen_rtx_CONST (Pmode, x);
11519 emit_move_insn (reg11, x);
11520 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11521 x = gen_const_mem (Pmode, x);
11522 emit_move_insn (reg11, x);
11523 }
11524 else
11525 emit_move_insn (reg11, split_stack_fn_large);
11526
11527 fn = reg11;
11528
11529 argval = ((args_size << 16) << 16) + allocate;
11530 emit_move_insn (reg10, GEN_INT (argval));
11531 }
11532 else
11533 {
11534 emit_move_insn (reg10, allocate_rtx);
11535 emit_move_insn (reg11, GEN_INT (args_size));
11536 use_reg (&call_fusage, reg11);
11537 }
11538
11539 use_reg (&call_fusage, reg10);
11540 }
11541 else
11542 {
11543 emit_insn (gen_push (GEN_INT (args_size)));
11544 emit_insn (gen_push (allocate_rtx));
11545 }
11546 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11547 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11548 NULL_RTX, false);
11549 add_function_usage_to (call_insn, call_fusage);
11550
11551 /* In order to make call/return prediction work right, we now need
11552 to execute a return instruction. See
11553 libgcc/config/i386/morestack.S for the details on how this works.
11554
11555 For flow purposes gcc must not see this as a return
11556 instruction--we need control flow to continue at the subsequent
11557 label. Therefore, we use an unspec. */
11558 gcc_assert (crtl->args.pops_args < 65536);
11559 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11560
11561 /* If we are in 64-bit mode and this function uses a static chain,
11562 we saved %r10 in %rax before calling __morestack. */
11563 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11564 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11565 gen_rtx_REG (word_mode, AX_REG));
11566
11567 /* If this function calls va_start, we need to store a pointer to
11568 the arguments on the old stack, because they may not have been
11569 all copied to the new stack. At this point the old stack can be
11570 found at the frame pointer value used by __morestack, because
11571 __morestack has set that up before calling back to us. Here we
11572 store that pointer in a scratch register, and in
11573 ix86_expand_prologue we store the scratch register in a stack
11574 slot. */
11575 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11576 {
11577 unsigned int scratch_regno;
11578 rtx frame_reg;
11579 int words;
11580
11581 scratch_regno = split_stack_prologue_scratch_regno ();
11582 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11583 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11584
11585 /* 64-bit:
11586 fp -> old fp value
11587 return address within this function
11588 return address of caller of this function
11589 stack arguments
11590 So we add three words to get to the stack arguments.
11591
11592 32-bit:
11593 fp -> old fp value
11594 return address within this function
11595 first argument to __morestack
11596 second argument to __morestack
11597 return address of caller of this function
11598 stack arguments
11599 So we add five words to get to the stack arguments.
11600 */
11601 words = TARGET_64BIT ? 3 : 5;
11602 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11603 gen_rtx_PLUS (Pmode, frame_reg,
11604 GEN_INT (words * UNITS_PER_WORD))));
11605
11606 varargs_label = gen_label_rtx ();
11607 emit_jump_insn (gen_jump (varargs_label));
11608 JUMP_LABEL (get_last_insn ()) = varargs_label;
11609
11610 emit_barrier ();
11611 }
11612
11613 emit_label (label);
11614 LABEL_NUSES (label) = 1;
11615
11616 /* If this function calls va_start, we now have to set the scratch
11617 register for the case where we do not call __morestack. In this
11618 case we need to set it based on the stack pointer. */
11619 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11620 {
11621 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11622 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11623 GEN_INT (UNITS_PER_WORD))));
11624
11625 emit_label (varargs_label);
11626 LABEL_NUSES (varargs_label) = 1;
11627 }
11628 }
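/* For illustration, assuming 64-bit code, a small frame, the small code
   model and no varargs, the split-stack prologue emitted above looks
   roughly like this (the %fs-relative TCB slot behind UNSPEC_STACK_CHECK
   is target-specific):

	cmp	%fs:OFFSET, %rsp
	jae	.Lenough
	mov	$FRAME_SIZE, %r10
	mov	$ARGS_SIZE, %r11
	call	__morestack
	ret
   .Lenough:
	...regular prologue follows...
 */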
11629
11630 /* We may have to tell the dataflow pass that the split stack prologue
11631 is initializing a scratch register. */
11632
11633 static void
11634 ix86_live_on_entry (bitmap regs)
11635 {
11636 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11637 {
11638 gcc_assert (flag_split_stack);
11639 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11640 }
11641 }
11642 \f
11643 /* Determine whether OP is a suitable SUBREG RTX for an address. */
11644
11645 static bool
11646 ix86_address_subreg_operand (rtx op)
11647 {
11648 enum machine_mode mode;
11649
11650 if (!REG_P (op))
11651 return false;
11652
11653 mode = GET_MODE (op);
11654
11655 if (GET_MODE_CLASS (mode) != MODE_INT)
11656 return false;
11657
11658 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11659 failures when the register is one word out of a two word structure. */
11660 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11661 return false;
11662
11663 /* Allow only SUBREGs of non-eliminable hard registers. */
11664 return register_no_elim_operand (op, mode);
11665 }
11666
11667 /* Extract the parts of an RTL expression that is a valid memory address
11668 for an instruction. Return 0 if the structure of the address is
11669 grossly off. Return -1 if the address contains ASHIFT, so it is not
11670 strictly valid, but still used for computing the length of the lea instruction. */
11671
11672 int
11673 ix86_decompose_address (rtx addr, struct ix86_address *out)
11674 {
11675 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11676 rtx base_reg, index_reg;
11677 HOST_WIDE_INT scale = 1;
11678 rtx scale_rtx = NULL_RTX;
11679 rtx tmp;
11680 int retval = 1;
11681 enum ix86_address_seg seg = SEG_DEFAULT;
11682
11683 /* Allow zero-extended SImode addresses,
11684 they will be emitted with addr32 prefix. */
11685 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11686 {
11687 if (GET_CODE (addr) == ZERO_EXTEND
11688 && GET_MODE (XEXP (addr, 0)) == SImode)
11689 {
11690 addr = XEXP (addr, 0);
11691 if (CONST_INT_P (addr))
11692 return 0;
11693 }
11694 else if (GET_CODE (addr) == AND
11695 && const_32bit_mask (XEXP (addr, 1), DImode))
11696 {
11697 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11698 if (addr == NULL_RTX)
11699 return 0;
11700
11701 if (CONST_INT_P (addr))
11702 return 0;
11703 }
11704 }
11705
11706 /* Allow SImode subregs of DImode addresses,
11707 they will be emitted with addr32 prefix. */
11708 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11709 {
11710 if (GET_CODE (addr) == SUBREG
11711 && GET_MODE (SUBREG_REG (addr)) == DImode)
11712 {
11713 addr = SUBREG_REG (addr);
11714 if (CONST_INT_P (addr))
11715 return 0;
11716 }
11717 }
11718
11719 if (REG_P (addr))
11720 base = addr;
11721 else if (GET_CODE (addr) == SUBREG)
11722 {
11723 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11724 base = addr;
11725 else
11726 return 0;
11727 }
11728 else if (GET_CODE (addr) == PLUS)
11729 {
11730 rtx addends[4], op;
11731 int n = 0, i;
11732
11733 op = addr;
11734 do
11735 {
11736 if (n >= 4)
11737 return 0;
11738 addends[n++] = XEXP (op, 1);
11739 op = XEXP (op, 0);
11740 }
11741 while (GET_CODE (op) == PLUS);
11742 if (n >= 4)
11743 return 0;
11744 addends[n] = op;
11745
11746 for (i = n; i >= 0; --i)
11747 {
11748 op = addends[i];
11749 switch (GET_CODE (op))
11750 {
11751 case MULT:
11752 if (index)
11753 return 0;
11754 index = XEXP (op, 0);
11755 scale_rtx = XEXP (op, 1);
11756 break;
11757
11758 case ASHIFT:
11759 if (index)
11760 return 0;
11761 index = XEXP (op, 0);
11762 tmp = XEXP (op, 1);
11763 if (!CONST_INT_P (tmp))
11764 return 0;
11765 scale = INTVAL (tmp);
11766 if ((unsigned HOST_WIDE_INT) scale > 3)
11767 return 0;
11768 scale = 1 << scale;
11769 break;
11770
11771 case ZERO_EXTEND:
11772 op = XEXP (op, 0);
11773 if (GET_CODE (op) != UNSPEC)
11774 return 0;
11775 /* FALLTHRU */
11776
11777 case UNSPEC:
11778 if (XINT (op, 1) == UNSPEC_TP
11779 && TARGET_TLS_DIRECT_SEG_REFS
11780 && seg == SEG_DEFAULT)
11781 seg = DEFAULT_TLS_SEG_REG;
11782 else
11783 return 0;
11784 break;
11785
11786 case SUBREG:
11787 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11788 return 0;
11789 /* FALLTHRU */
11790
11791 case REG:
11792 if (!base)
11793 base = op;
11794 else if (!index)
11795 index = op;
11796 else
11797 return 0;
11798 break;
11799
11800 case CONST:
11801 case CONST_INT:
11802 case SYMBOL_REF:
11803 case LABEL_REF:
11804 if (disp)
11805 return 0;
11806 disp = op;
11807 break;
11808
11809 default:
11810 return 0;
11811 }
11812 }
11813 }
11814 else if (GET_CODE (addr) == MULT)
11815 {
11816 index = XEXP (addr, 0); /* index*scale */
11817 scale_rtx = XEXP (addr, 1);
11818 }
11819 else if (GET_CODE (addr) == ASHIFT)
11820 {
11821 /* We're called for lea too, which implements ashift on occasion. */
11822 index = XEXP (addr, 0);
11823 tmp = XEXP (addr, 1);
11824 if (!CONST_INT_P (tmp))
11825 return 0;
11826 scale = INTVAL (tmp);
11827 if ((unsigned HOST_WIDE_INT) scale > 3)
11828 return 0;
11829 scale = 1 << scale;
11830 retval = -1;
11831 }
11832 else if (CONST_INT_P (addr))
11833 {
11834 if (!x86_64_immediate_operand (addr, VOIDmode))
11835 return 0;
11836
11837 /* Constant addresses are sign extended to 64 bits; we have to
11838 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11839 if (TARGET_X32
11840 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11841 return 0;
11842
11843 disp = addr;
11844 }
11845 else
11846 disp = addr; /* displacement */
11847
11848 if (index)
11849 {
11850 if (REG_P (index))
11851 ;
11852 else if (GET_CODE (index) == SUBREG
11853 && ix86_address_subreg_operand (SUBREG_REG (index)))
11854 ;
11855 else
11856 return 0;
11857 }
11858
11859 /* Address override works only on the (%reg) part of %fs:(%reg). */
11860 if (seg != SEG_DEFAULT
11861 && ((base && GET_MODE (base) != word_mode)
11862 || (index && GET_MODE (index) != word_mode)))
11863 return 0;
11864
11865 /* Extract the integral value of scale. */
11866 if (scale_rtx)
11867 {
11868 if (!CONST_INT_P (scale_rtx))
11869 return 0;
11870 scale = INTVAL (scale_rtx);
11871 }
11872
11873 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11874 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11875
11876 /* Avoid useless 0 displacement. */
11877 if (disp == const0_rtx && (base || index))
11878 disp = NULL_RTX;
11879
11880 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11881 if (base_reg && index_reg && scale == 1
11882 && (index_reg == arg_pointer_rtx
11883 || index_reg == frame_pointer_rtx
11884 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11885 {
11886 rtx tmp;
11887 tmp = base, base = index, index = tmp;
11888 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11889 }
11890
11891 /* Special case: %ebp cannot be encoded as a base without a displacement.
11892 Similarly %r13. */
11893 if (!disp
11894 && base_reg
11895 && (base_reg == hard_frame_pointer_rtx
11896 || base_reg == frame_pointer_rtx
11897 || base_reg == arg_pointer_rtx
11898 || (REG_P (base_reg)
11899 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11900 || REGNO (base_reg) == R13_REG))))
11901 disp = const0_rtx;
11902
11903 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11904 Avoid this by transforming it to [%esi+0].
11905 Reload calls address legitimization without cfun defined, so we need
11906 to test cfun for being non-NULL. */
11907 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11908 && base_reg && !index_reg && !disp
11909 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11910 disp = const0_rtx;
11911
11912 /* Special case: encode reg+reg instead of reg*2. */
11913 if (!base && index && scale == 2)
11914 base = index, base_reg = index_reg, scale = 1;
11915
11916 /* Special case: scaling cannot be encoded without base or displacement. */
11917 if (!base && !disp && index && scale != 1)
11918 disp = const0_rtx;
11919
11920 out->base = base;
11921 out->index = index;
11922 out->disp = disp;
11923 out->scale = scale;
11924 out->seg = seg;
11925
11926 return retval;
11927 }
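/* As a rough illustration (with arbitrary registers and constants), a few
   decompositions this routine produces:

     (reg:SI bx)                                -> base = %ebx
     (plus:SI (reg:SI bx) (const_int 8))        -> base = %ebx, disp = 8
     (mult:SI (reg:SI si) (const_int 4))        -> index = %esi, scale = 4,
                                                   disp = 0
     (plus:SI (plus:SI (mult:SI (reg:SI si) (const_int 4))
                       (reg:SI bx))
              (const_int 8))                    -> base = %ebx, index = %esi,
                                                   scale = 4, disp = 8

   The bare MULT gets a zero displacement because scaling cannot be encoded
   without a base or a displacement (see above).  */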
11928 \f
11929 /* Return cost of the memory address x.
11930 For i386, it is better to use a complex address than let gcc copy
11931 the address into a reg and make a new pseudo. But not if the address
11932 requires two regs - that would mean more pseudos with longer
11933 lifetimes. */
11934 static int
11935 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11936 addr_space_t as ATTRIBUTE_UNUSED,
11937 bool speed ATTRIBUTE_UNUSED)
11938 {
11939 struct ix86_address parts;
11940 int cost = 1;
11941 int ok = ix86_decompose_address (x, &parts);
11942
11943 gcc_assert (ok);
11944
11945 if (parts.base && GET_CODE (parts.base) == SUBREG)
11946 parts.base = SUBREG_REG (parts.base);
11947 if (parts.index && GET_CODE (parts.index) == SUBREG)
11948 parts.index = SUBREG_REG (parts.index);
11949
11950 /* Attempt to minimize number of registers in the address. */
11951 if ((parts.base
11952 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11953 || (parts.index
11954 && (!REG_P (parts.index)
11955 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11956 cost++;
11957
11958 if (parts.base
11959 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11960 && parts.index
11961 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11962 && parts.base != parts.index)
11963 cost++;
11964
11965 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11966 since its predecode logic can't detect the length of instructions
11967 and it degenerates to vector decoding. Increase the cost of such
11968 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11969 to split such addresses or even refuse them altogether.
11970
11971 The following addressing modes are affected:
11972 [base+scale*index]
11973 [scale*index+disp]
11974 [base+index]
11975
11976 The first and last cases may be avoidable by explicitly coding the zero
11977 into the memory address, but I don't have an AMD-K6 machine handy to
11978 check this theory. */
11979
11980 if (TARGET_K6
11981 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11982 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11983 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11984 cost += 10;
11985
11986 return cost;
11987 }
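/* A sketch of the resulting costs: an address built only from hard
   registers, e.g. 8(%ebx), costs 1; an address whose base or index is
   still a pseudo costs 2; an address using two distinct pseudos as base
   and index costs 3; the AMD-K6 cases above add a further 10.  */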
11988 \f
11989 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11990 this is used to form addresses to local data when -fPIC is in
11991 use. */
11992
11993 static bool
11994 darwin_local_data_pic (rtx disp)
11995 {
11996 return (GET_CODE (disp) == UNSPEC
11997 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11998 }
11999
12000 /* Determine if a given RTX is a valid constant. We already know this
12001 satisfies CONSTANT_P. */
12002
12003 static bool
12004 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12005 {
12006 switch (GET_CODE (x))
12007 {
12008 case CONST:
12009 x = XEXP (x, 0);
12010
12011 if (GET_CODE (x) == PLUS)
12012 {
12013 if (!CONST_INT_P (XEXP (x, 1)))
12014 return false;
12015 x = XEXP (x, 0);
12016 }
12017
12018 if (TARGET_MACHO && darwin_local_data_pic (x))
12019 return true;
12020
12021 /* Only some unspecs are valid as "constants". */
12022 if (GET_CODE (x) == UNSPEC)
12023 switch (XINT (x, 1))
12024 {
12025 case UNSPEC_GOT:
12026 case UNSPEC_GOTOFF:
12027 case UNSPEC_PLTOFF:
12028 return TARGET_64BIT;
12029 case UNSPEC_TPOFF:
12030 case UNSPEC_NTPOFF:
12031 x = XVECEXP (x, 0, 0);
12032 return (GET_CODE (x) == SYMBOL_REF
12033 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12034 case UNSPEC_DTPOFF:
12035 x = XVECEXP (x, 0, 0);
12036 return (GET_CODE (x) == SYMBOL_REF
12037 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12038 default:
12039 return false;
12040 }
12041
12042 /* We must have drilled down to a symbol. */
12043 if (GET_CODE (x) == LABEL_REF)
12044 return true;
12045 if (GET_CODE (x) != SYMBOL_REF)
12046 return false;
12047 /* FALLTHRU */
12048
12049 case SYMBOL_REF:
12050 /* TLS symbols are never valid. */
12051 if (SYMBOL_REF_TLS_MODEL (x))
12052 return false;
12053
12054 /* DLLIMPORT symbols are never valid. */
12055 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12056 && SYMBOL_REF_DLLIMPORT_P (x))
12057 return false;
12058
12059 #if TARGET_MACHO
12060 /* mdynamic-no-pic */
12061 if (MACHO_DYNAMIC_NO_PIC_P)
12062 return machopic_symbol_defined_p (x);
12063 #endif
12064 break;
12065
12066 case CONST_DOUBLE:
12067 if (GET_MODE (x) == TImode
12068 && x != CONST0_RTX (TImode)
12069 && !TARGET_64BIT)
12070 return false;
12071 break;
12072
12073 case CONST_VECTOR:
12074 if (!standard_sse_constant_p (x))
12075 return false;
12076
12077 default:
12078 break;
12079 }
12080
12081 /* Otherwise we handle everything else in the move patterns. */
12082 return true;
12083 }
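/* For example, a plain (symbol_ref "foo") (an arbitrary non-TLS symbol)
   is accepted here, while a SYMBOL_REF carrying a TLS model, or a
   dllimported symbol when TARGET_DLLIMPORT_DECL_ATTRIBUTES, is rejected
   and must be handled by other means.  */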
12084
12085 /* Determine if it's legal to put X into the constant pool. This
12086 is not possible for the address of thread-local symbols, which
12087 is checked above. */
12088
12089 static bool
12090 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12091 {
12092 /* We can always put integral constants and vectors in memory. */
12093 switch (GET_CODE (x))
12094 {
12095 case CONST_INT:
12096 case CONST_DOUBLE:
12097 case CONST_VECTOR:
12098 return false;
12099
12100 default:
12101 break;
12102 }
12103 return !ix86_legitimate_constant_p (mode, x);
12104 }
12105
12106 /* Return true if the symbol is marked as dllimport or as a stub-variable,
12107 otherwise false. */
12108
12109 static bool
12110 is_imported_p (rtx x)
12111 {
12112 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12113 || GET_CODE (x) != SYMBOL_REF)
12114 return false;
12115
12116 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12117 }
12118
12119
12120 /* Nonzero if the constant value X is a legitimate general operand
12121 when generating PIC code. It is given that flag_pic is on and
12122 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12123
12124 bool
12125 legitimate_pic_operand_p (rtx x)
12126 {
12127 rtx inner;
12128
12129 switch (GET_CODE (x))
12130 {
12131 case CONST:
12132 inner = XEXP (x, 0);
12133 if (GET_CODE (inner) == PLUS
12134 && CONST_INT_P (XEXP (inner, 1)))
12135 inner = XEXP (inner, 0);
12136
12137 /* Only some unspecs are valid as "constants". */
12138 if (GET_CODE (inner) == UNSPEC)
12139 switch (XINT (inner, 1))
12140 {
12141 case UNSPEC_GOT:
12142 case UNSPEC_GOTOFF:
12143 case UNSPEC_PLTOFF:
12144 return TARGET_64BIT;
12145 case UNSPEC_TPOFF:
12146 x = XVECEXP (inner, 0, 0);
12147 return (GET_CODE (x) == SYMBOL_REF
12148 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12149 case UNSPEC_MACHOPIC_OFFSET:
12150 return legitimate_pic_address_disp_p (x);
12151 default:
12152 return false;
12153 }
12154 /* FALLTHRU */
12155
12156 case SYMBOL_REF:
12157 case LABEL_REF:
12158 return legitimate_pic_address_disp_p (x);
12159
12160 default:
12161 return true;
12162 }
12163 }
12164
12165 /* Determine if a given CONST RTX is a valid memory displacement
12166 in PIC mode. */
12167
12168 bool
12169 legitimate_pic_address_disp_p (rtx disp)
12170 {
12171 bool saw_plus;
12172
12173 /* In 64bit mode we can allow direct addresses of symbols and labels
12174 when they are not dynamic symbols. */
12175 if (TARGET_64BIT)
12176 {
12177 rtx op0 = disp, op1;
12178
12179 switch (GET_CODE (disp))
12180 {
12181 case LABEL_REF:
12182 return true;
12183
12184 case CONST:
12185 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12186 break;
12187 op0 = XEXP (XEXP (disp, 0), 0);
12188 op1 = XEXP (XEXP (disp, 0), 1);
12189 if (!CONST_INT_P (op1)
12190 || INTVAL (op1) >= 16*1024*1024
12191 || INTVAL (op1) < -16*1024*1024)
12192 break;
12193 if (GET_CODE (op0) == LABEL_REF)
12194 return true;
12195 if (GET_CODE (op0) == CONST
12196 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12197 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12198 return true;
12199 if (GET_CODE (op0) == UNSPEC
12200 && XINT (op0, 1) == UNSPEC_PCREL)
12201 return true;
12202 if (GET_CODE (op0) != SYMBOL_REF)
12203 break;
12204 /* FALLTHRU */
12205
12206 case SYMBOL_REF:
12207 /* TLS references should always be enclosed in UNSPEC.
12208 A dllimported symbol always needs to be resolved. */
12209 if (SYMBOL_REF_TLS_MODEL (op0)
12210 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12211 return false;
12212
12213 if (TARGET_PECOFF)
12214 {
12215 if (is_imported_p (op0))
12216 return true;
12217
12218 if (SYMBOL_REF_FAR_ADDR_P (op0)
12219 || !SYMBOL_REF_LOCAL_P (op0))
12220 break;
12221
12222 /* Function symbols need to be resolved only for the
12223 large model.
12224 For the small model we don't need to resolve anything
12225 here. */
12226 if ((ix86_cmodel != CM_LARGE_PIC
12227 && SYMBOL_REF_FUNCTION_P (op0))
12228 || ix86_cmodel == CM_SMALL_PIC)
12229 return true;
12230 /* Non-external symbols don't need to be resolved for
12231 the large and medium models. */
12232 if ((ix86_cmodel == CM_LARGE_PIC
12233 || ix86_cmodel == CM_MEDIUM_PIC)
12234 && !SYMBOL_REF_EXTERNAL_P (op0))
12235 return true;
12236 }
12237 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12238 && SYMBOL_REF_LOCAL_P (op0)
12239 && ix86_cmodel != CM_LARGE_PIC)
12240 return true;
12241 break;
12242
12243 default:
12244 break;
12245 }
12246 }
12247 if (GET_CODE (disp) != CONST)
12248 return false;
12249 disp = XEXP (disp, 0);
12250
12251 if (TARGET_64BIT)
12252 {
12253 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
12254 of GOT tables. We should not need these anyway. */
12255 if (GET_CODE (disp) != UNSPEC
12256 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12257 && XINT (disp, 1) != UNSPEC_GOTOFF
12258 && XINT (disp, 1) != UNSPEC_PCREL
12259 && XINT (disp, 1) != UNSPEC_PLTOFF))
12260 return false;
12261
12262 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12263 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12264 return false;
12265 return true;
12266 }
12267
12268 saw_plus = false;
12269 if (GET_CODE (disp) == PLUS)
12270 {
12271 if (!CONST_INT_P (XEXP (disp, 1)))
12272 return false;
12273 disp = XEXP (disp, 0);
12274 saw_plus = true;
12275 }
12276
12277 if (TARGET_MACHO && darwin_local_data_pic (disp))
12278 return true;
12279
12280 if (GET_CODE (disp) != UNSPEC)
12281 return false;
12282
12283 switch (XINT (disp, 1))
12284 {
12285 case UNSPEC_GOT:
12286 if (saw_plus)
12287 return false;
12288 /* We need to check for both symbols and labels because VxWorks loads
12289 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12290 details. */
12291 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12292 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12293 case UNSPEC_GOTOFF:
12294 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12295 While the ABI also specifies a 32bit relocation, we don't produce it in
12296 the small PIC model at all. */
12297 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12298 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12299 && !TARGET_64BIT)
12300 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12301 return false;
12302 case UNSPEC_GOTTPOFF:
12303 case UNSPEC_GOTNTPOFF:
12304 case UNSPEC_INDNTPOFF:
12305 if (saw_plus)
12306 return false;
12307 disp = XVECEXP (disp, 0, 0);
12308 return (GET_CODE (disp) == SYMBOL_REF
12309 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12310 case UNSPEC_NTPOFF:
12311 disp = XVECEXP (disp, 0, 0);
12312 return (GET_CODE (disp) == SYMBOL_REF
12313 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12314 case UNSPEC_DTPOFF:
12315 disp = XVECEXP (disp, 0, 0);
12316 return (GET_CODE (disp) == SYMBOL_REF
12317 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12318 }
12319
12320 return false;
12321 }
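/* As a rough illustration for 32-bit code, displacements of the form

     (const (unspec [(symbol_ref "foo")] UNSPEC_GOT))         foo@GOT
     (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF))      foo@GOTOFF
     (const (plus (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)
                  (const_int 4)))                             foo@GOTOFF+4

   are accepted (subject to the gotoff_operand check), while a bare
   (symbol_ref "foo") is not, since in 32-bit PIC code a symbol must be
   reached through the GOT or as an offset from the PIC base.  "foo" is
   just an arbitrary example symbol.  */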
12322
12323 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12324 replace the input X, or the original X if no replacement is called for.
12325 The output parameter *WIN is 1 if the calling macro should goto WIN,
12326 0 if it should not. */
12327
12328 bool
12329 ix86_legitimize_reload_address (rtx x,
12330 enum machine_mode mode ATTRIBUTE_UNUSED,
12331 int opnum, int type,
12332 int ind_levels ATTRIBUTE_UNUSED)
12333 {
12334 /* Reload can generate:
12335
12336 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12337 (reg:DI 97))
12338 (reg:DI 2 cx))
12339
12340 This RTX is rejected by ix86_legitimate_address_p due to
12341 non-strictness of base register 97. Following this rejection,
12342 reload pushes all three components into separate registers,
12343 creating an invalid memory address RTX.
12344
12345 The following code reloads only the invalid part of the
12346 memory address RTX. */
12347
12348 if (GET_CODE (x) == PLUS
12349 && REG_P (XEXP (x, 1))
12350 && GET_CODE (XEXP (x, 0)) == PLUS
12351 && REG_P (XEXP (XEXP (x, 0), 1)))
12352 {
12353 rtx base, index;
12354 bool something_reloaded = false;
12355
12356 base = XEXP (XEXP (x, 0), 1);
12357 if (!REG_OK_FOR_BASE_STRICT_P (base))
12358 {
12359 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12360 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12361 opnum, (enum reload_type) type);
12362 something_reloaded = true;
12363 }
12364
12365 index = XEXP (x, 1);
12366 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12367 {
12368 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12369 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12370 opnum, (enum reload_type) type);
12371 something_reloaded = true;
12372 }
12373
12374 gcc_assert (something_reloaded);
12375 return true;
12376 }
12377
12378 return false;
12379 }
12380
12381 /* Recognizes RTL expressions that are valid memory addresses for an
12382 instruction. The MODE argument is the machine mode for the MEM
12383 expression that wants to use this address.
12384
12385 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12386 convert common non-canonical forms to canonical form so that they will
12387 be recognized. */
12388
12389 static bool
12390 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12391 rtx addr, bool strict)
12392 {
12393 struct ix86_address parts;
12394 rtx base, index, disp;
12395 HOST_WIDE_INT scale;
12396
12397 if (ix86_decompose_address (addr, &parts) <= 0)
12398 /* Decomposition failed. */
12399 return false;
12400
12401 base = parts.base;
12402 index = parts.index;
12403 disp = parts.disp;
12404 scale = parts.scale;
12405
12406 /* Validate base register. */
12407 if (base)
12408 {
12409 rtx reg;
12410
12411 if (REG_P (base))
12412 reg = base;
12413 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12414 reg = SUBREG_REG (base);
12415 else
12416 /* Base is not a register. */
12417 return false;
12418
12419 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12420 return false;
12421
12422 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12423 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12424 /* Base is not valid. */
12425 return false;
12426 }
12427
12428 /* Validate index register. */
12429 if (index)
12430 {
12431 rtx reg;
12432
12433 if (REG_P (index))
12434 reg = index;
12435 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12436 reg = SUBREG_REG (index);
12437 else
12438 /* Index is not a register. */
12439 return false;
12440
12441 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12442 return false;
12443
12444 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12445 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12446 /* Index is not valid. */
12447 return false;
12448 }
12449
12450 /* Index and base should have the same mode. */
12451 if (base && index
12452 && GET_MODE (base) != GET_MODE (index))
12453 return false;
12454
12455 /* Validate scale factor. */
12456 if (scale != 1)
12457 {
12458 if (!index)
12459 /* Scale without index. */
12460 return false;
12461
12462 if (scale != 2 && scale != 4 && scale != 8)
12463 /* Scale is not a valid multiplier. */
12464 return false;
12465 }
12466
12467 /* Validate displacement. */
12468 if (disp)
12469 {
12470 if (GET_CODE (disp) == CONST
12471 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12472 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12473 switch (XINT (XEXP (disp, 0), 1))
12474 {
12475 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12476 used. While the ABI also specifies 32bit relocations, we don't produce
12477 them at all and use IP-relative addressing instead. */
12478 case UNSPEC_GOT:
12479 case UNSPEC_GOTOFF:
12480 gcc_assert (flag_pic);
12481 if (!TARGET_64BIT)
12482 goto is_legitimate_pic;
12483
12484 /* 64bit address unspec. */
12485 return false;
12486
12487 case UNSPEC_GOTPCREL:
12488 case UNSPEC_PCREL:
12489 gcc_assert (flag_pic);
12490 goto is_legitimate_pic;
12491
12492 case UNSPEC_GOTTPOFF:
12493 case UNSPEC_GOTNTPOFF:
12494 case UNSPEC_INDNTPOFF:
12495 case UNSPEC_NTPOFF:
12496 case UNSPEC_DTPOFF:
12497 break;
12498
12499 case UNSPEC_STACK_CHECK:
12500 gcc_assert (flag_split_stack);
12501 break;
12502
12503 default:
12504 /* Invalid address unspec. */
12505 return false;
12506 }
12507
12508 else if (SYMBOLIC_CONST (disp)
12509 && (flag_pic
12510 || (TARGET_MACHO
12511 #if TARGET_MACHO
12512 && MACHOPIC_INDIRECT
12513 && !machopic_operand_p (disp)
12514 #endif
12515 )))
12516 {
12517
12518 is_legitimate_pic:
12519 if (TARGET_64BIT && (index || base))
12520 {
12521 /* foo@dtpoff(%rX) is ok. */
12522 if (GET_CODE (disp) != CONST
12523 || GET_CODE (XEXP (disp, 0)) != PLUS
12524 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12525 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12526 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12527 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12528 /* Non-constant pic memory reference. */
12529 return false;
12530 }
12531 else if ((!TARGET_MACHO || flag_pic)
12532 && ! legitimate_pic_address_disp_p (disp))
12533 /* Displacement is an invalid pic construct. */
12534 return false;
12535 #if TARGET_MACHO
12536 else if (MACHO_DYNAMIC_NO_PIC_P
12537 && !ix86_legitimate_constant_p (Pmode, disp))
12538 /* The displacement must be referenced via non_lazy_pointer. */
12539 return false;
12540 #endif
12541
12542 /* This code used to verify that a symbolic pic displacement
12543 includes the pic_offset_table_rtx register.
12544
12545 While this is a good idea, unfortunately these constructs may
12546 be created by the "adds using lea" optimization for incorrect
12547 code like:
12548
12549 int a;
12550 int foo(int i)
12551 {
12552 return *(&a+i);
12553 }
12554
12555 This code is nonsensical, but results in addressing the
12556 GOT table with a pic_offset_table_rtx base. We can't
12557 easily refuse it, since it gets matched by the
12558 "addsi3" pattern, which later gets split into an lea when
12559 the output register differs from the input. While this
12560 could be handled by a separate addsi pattern for this case
12561 that never results in an lea, disabling this test seems to
12562 be the easier and correct fix for the crash. */
12563 }
12564 else if (GET_CODE (disp) != LABEL_REF
12565 && !CONST_INT_P (disp)
12566 && (GET_CODE (disp) != CONST
12567 || !ix86_legitimate_constant_p (Pmode, disp))
12568 && (GET_CODE (disp) != SYMBOL_REF
12569 || !ix86_legitimate_constant_p (Pmode, disp)))
12570 /* Displacement is not constant. */
12571 return false;
12572 else if (TARGET_64BIT
12573 && !x86_64_immediate_operand (disp, VOIDmode))
12574 /* Displacement is out of range. */
12575 return false;
12576 }
12577
12578 /* Everything looks valid. */
12579 return true;
12580 }
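/* As a sketch, for 32-bit non-PIC code the checks above accept addresses
   such as

     (plus:SI (reg:SI bx) (const_int 8))                     8(%ebx)
     (plus:SI (mult:SI (reg:SI si) (const_int 4))
              (symbol_ref:SI "table"))                       table(,%esi,4)

   ("table" being an arbitrary symbol), and reject, for instance, a scale
   of 3 or a TLS SYMBOL_REF used as a bare displacement.  */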
12581
12582 /* Determine if a given RTX is a valid constant address. */
12583
12584 bool
12585 constant_address_p (rtx x)
12586 {
12587 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12588 }
12589 \f
12590 /* Return a unique alias set for the GOT. */
12591
12592 static alias_set_type
12593 ix86_GOT_alias_set (void)
12594 {
12595 static alias_set_type set = -1;
12596 if (set == -1)
12597 set = new_alias_set ();
12598 return set;
12599 }
12600
12601 /* Return a legitimate reference for ORIG (an address) using the
12602 register REG. If REG is 0, a new pseudo is generated.
12603
12604 There are two types of references that must be handled:
12605
12606 1. Global data references must load the address from the GOT, via
12607 the PIC reg. An insn is emitted to do this load, and the reg is
12608 returned.
12609
12610 2. Static data references, constant pool addresses, and code labels
12611 compute the address as an offset from the GOT, whose base is in
12612 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12613 differentiate them from global data objects. The returned
12614 address is the PIC reg + an unspec constant.
12615
12616 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12617 reg also appears in the address. */
12618
12619 static rtx
12620 legitimize_pic_address (rtx orig, rtx reg)
12621 {
12622 rtx addr = orig;
12623 rtx new_rtx = orig;
12624
12625 #if TARGET_MACHO
12626 if (TARGET_MACHO && !TARGET_64BIT)
12627 {
12628 if (reg == 0)
12629 reg = gen_reg_rtx (Pmode);
12630 /* Use the generic Mach-O PIC machinery. */
12631 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12632 }
12633 #endif
12634
12635 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12636 {
12637 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12638 if (tmp)
12639 return tmp;
12640 }
12641
12642 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12643 new_rtx = addr;
12644 else if (TARGET_64BIT && !TARGET_PECOFF
12645 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12646 {
12647 rtx tmpreg;
12648 /* This symbol may be referenced via a displacement from the PIC
12649 base address (@GOTOFF). */
12650
12651 if (reload_in_progress)
12652 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12653 if (GET_CODE (addr) == CONST)
12654 addr = XEXP (addr, 0);
12655 if (GET_CODE (addr) == PLUS)
12656 {
12657 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12658 UNSPEC_GOTOFF);
12659 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12660 }
12661 else
12662 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12663 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12664 if (!reg)
12665 tmpreg = gen_reg_rtx (Pmode);
12666 else
12667 tmpreg = reg;
12668 emit_move_insn (tmpreg, new_rtx);
12669
12670 if (reg != 0)
12671 {
12672 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12673 tmpreg, 1, OPTAB_DIRECT);
12674 new_rtx = reg;
12675 }
12676 else
12677 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12678 }
12679 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12680 {
12681 /* This symbol may be referenced via a displacement from the PIC
12682 base address (@GOTOFF). */
12683
12684 if (reload_in_progress)
12685 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12686 if (GET_CODE (addr) == CONST)
12687 addr = XEXP (addr, 0);
12688 if (GET_CODE (addr) == PLUS)
12689 {
12690 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12691 UNSPEC_GOTOFF);
12692 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12693 }
12694 else
12695 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12696 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12697 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12698
12699 if (reg != 0)
12700 {
12701 emit_move_insn (reg, new_rtx);
12702 new_rtx = reg;
12703 }
12704 }
12705 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12706 /* We can't use @GOTOFF for text labels on VxWorks;
12707 see gotoff_operand. */
12708 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12709 {
12710 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12711 if (tmp)
12712 return tmp;
12713
12714 /* For x64 PE-COFF there is no GOT table, so we use the address
12715 directly. */
12716 if (TARGET_64BIT && TARGET_PECOFF)
12717 {
12718 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12719 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12720
12721 if (reg == 0)
12722 reg = gen_reg_rtx (Pmode);
12723 emit_move_insn (reg, new_rtx);
12724 new_rtx = reg;
12725 }
12726 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12727 {
12728 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12729 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12730 new_rtx = gen_const_mem (Pmode, new_rtx);
12731 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12732
12733 if (reg == 0)
12734 reg = gen_reg_rtx (Pmode);
12735 /* Use gen_movsi directly, otherwise the address is loaded
12736 into a register for CSE. We don't want to CSE these addresses;
12737 instead we CSE addresses from the GOT table, so skip this. */
12738 emit_insn (gen_movsi (reg, new_rtx));
12739 new_rtx = reg;
12740 }
12741 else
12742 {
12743 /* This symbol must be referenced via a load from the
12744 Global Offset Table (@GOT). */
12745
12746 if (reload_in_progress)
12747 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12748 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12749 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12750 if (TARGET_64BIT)
12751 new_rtx = force_reg (Pmode, new_rtx);
12752 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12753 new_rtx = gen_const_mem (Pmode, new_rtx);
12754 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12755
12756 if (reg == 0)
12757 reg = gen_reg_rtx (Pmode);
12758 emit_move_insn (reg, new_rtx);
12759 new_rtx = reg;
12760 }
12761 }
12762 else
12763 {
12764 if (CONST_INT_P (addr)
12765 && !x86_64_immediate_operand (addr, VOIDmode))
12766 {
12767 if (reg)
12768 {
12769 emit_move_insn (reg, addr);
12770 new_rtx = reg;
12771 }
12772 else
12773 new_rtx = force_reg (Pmode, addr);
12774 }
12775 else if (GET_CODE (addr) == CONST)
12776 {
12777 addr = XEXP (addr, 0);
12778
12779 /* We must match stuff we generated before. Assume the only
12780 unspecs that can get here are ours. Not that we could do
12781 anything with them anyway.... */
12782 if (GET_CODE (addr) == UNSPEC
12783 || (GET_CODE (addr) == PLUS
12784 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12785 return orig;
12786 gcc_assert (GET_CODE (addr) == PLUS);
12787 }
12788 if (GET_CODE (addr) == PLUS)
12789 {
12790 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12791
12792 /* Check first to see if this is a constant offset from a @GOTOFF
12793 symbol reference. */
12794 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12795 && CONST_INT_P (op1))
12796 {
12797 if (!TARGET_64BIT)
12798 {
12799 if (reload_in_progress)
12800 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12801 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12802 UNSPEC_GOTOFF);
12803 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12804 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12805 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12806
12807 if (reg != 0)
12808 {
12809 emit_move_insn (reg, new_rtx);
12810 new_rtx = reg;
12811 }
12812 }
12813 else
12814 {
12815 if (INTVAL (op1) < -16*1024*1024
12816 || INTVAL (op1) >= 16*1024*1024)
12817 {
12818 if (!x86_64_immediate_operand (op1, Pmode))
12819 op1 = force_reg (Pmode, op1);
12820 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12821 }
12822 }
12823 }
12824 else
12825 {
12826 rtx base = legitimize_pic_address (op0, reg);
12827 enum machine_mode mode = GET_MODE (base);
12828 new_rtx
12829 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12830
12831 if (CONST_INT_P (new_rtx))
12832 {
12833 if (INTVAL (new_rtx) < -16*1024*1024
12834 || INTVAL (new_rtx) >= 16*1024*1024)
12835 {
12836 if (!x86_64_immediate_operand (new_rtx, mode))
12837 new_rtx = force_reg (mode, new_rtx);
12838 new_rtx
12839 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12840 }
12841 else
12842 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12843 }
12844 else
12845 {
12846 if (GET_CODE (new_rtx) == PLUS
12847 && CONSTANT_P (XEXP (new_rtx, 1)))
12848 {
12849 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12850 new_rtx = XEXP (new_rtx, 1);
12851 }
12852 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12853 }
12854 }
12855 }
12856 }
12857 return new_rtx;
12858 }
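/* For example (a sketch for 32-bit ELF, "foo" being an arbitrary global),
   a reference to foo is implemented by loading its address from the GOT:

     (mem:SI (plus:SI (reg:SI ebx)    ; pic_offset_table_rtx
                      (const (unspec [(symbol_ref "foo")] UNSPEC_GOT))))

   emitted into a fresh register, while a local symbol is instead addressed
   as the PIC base plus foo@GOTOFF, without any memory load.  */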
12859 \f
12860 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12861
12862 static rtx
12863 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12864 {
12865 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12866
12867 if (GET_MODE (tp) != tp_mode)
12868 {
12869 gcc_assert (GET_MODE (tp) == SImode);
12870 gcc_assert (tp_mode == DImode);
12871
12872 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12873 }
12874
12875 if (to_reg)
12876 tp = copy_to_mode_reg (tp_mode, tp);
12877
12878 return tp;
12879 }
12880
12881 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12882
12883 static GTY(()) rtx ix86_tls_symbol;
12884
12885 static rtx
12886 ix86_tls_get_addr (void)
12887 {
12888 if (!ix86_tls_symbol)
12889 {
12890 const char *sym
12891 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12892 ? "___tls_get_addr" : "__tls_get_addr");
12893
12894 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12895 }
12896
12897 return ix86_tls_symbol;
12898 }
12899
12900 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12901
12902 static GTY(()) rtx ix86_tls_module_base_symbol;
12903
12904 rtx
12905 ix86_tls_module_base (void)
12906 {
12907 if (!ix86_tls_module_base_symbol)
12908 {
12909 ix86_tls_module_base_symbol
12910 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12911
12912 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12913 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12914 }
12915
12916 return ix86_tls_module_base_symbol;
12917 }
12918
12919 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12920 false if we expect this to be used for a memory address and true if
12921 we expect to load the address into a register. */
12922
12923 static rtx
12924 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12925 {
12926 rtx dest, base, off;
12927 rtx pic = NULL_RTX, tp = NULL_RTX;
12928 enum machine_mode tp_mode = Pmode;
12929 int type;
12930
12931 switch (model)
12932 {
12933 case TLS_MODEL_GLOBAL_DYNAMIC:
12934 dest = gen_reg_rtx (Pmode);
12935
12936 if (!TARGET_64BIT)
12937 {
12938 if (flag_pic && !TARGET_PECOFF)
12939 pic = pic_offset_table_rtx;
12940 else
12941 {
12942 pic = gen_reg_rtx (Pmode);
12943 emit_insn (gen_set_got (pic));
12944 }
12945 }
12946
12947 if (TARGET_GNU2_TLS)
12948 {
12949 if (TARGET_64BIT)
12950 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12951 else
12952 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12953
12954 tp = get_thread_pointer (Pmode, true);
12955 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12956
12957 if (GET_MODE (x) != Pmode)
12958 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12959
12960 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12961 }
12962 else
12963 {
12964 rtx caddr = ix86_tls_get_addr ();
12965
12966 if (TARGET_64BIT)
12967 {
12968 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12969 rtx insns;
12970
12971 start_sequence ();
12972 emit_call_insn
12973 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12974 insns = get_insns ();
12975 end_sequence ();
12976
12977 if (GET_MODE (x) != Pmode)
12978 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12979
12980 RTL_CONST_CALL_P (insns) = 1;
12981 emit_libcall_block (insns, dest, rax, x);
12982 }
12983 else
12984 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12985 }
12986 break;
12987
12988 case TLS_MODEL_LOCAL_DYNAMIC:
12989 base = gen_reg_rtx (Pmode);
12990
12991 if (!TARGET_64BIT)
12992 {
12993 if (flag_pic)
12994 pic = pic_offset_table_rtx;
12995 else
12996 {
12997 pic = gen_reg_rtx (Pmode);
12998 emit_insn (gen_set_got (pic));
12999 }
13000 }
13001
13002 if (TARGET_GNU2_TLS)
13003 {
13004 rtx tmp = ix86_tls_module_base ();
13005
13006 if (TARGET_64BIT)
13007 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13008 else
13009 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13010
13011 tp = get_thread_pointer (Pmode, true);
13012 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13013 gen_rtx_MINUS (Pmode, tmp, tp));
13014 }
13015 else
13016 {
13017 rtx caddr = ix86_tls_get_addr ();
13018
13019 if (TARGET_64BIT)
13020 {
13021 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13022 rtx insns, eqv;
13023
13024 start_sequence ();
13025 emit_call_insn
13026 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13027 insns = get_insns ();
13028 end_sequence ();
13029
13030 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13031 share the LD_BASE result with other LD model accesses. */
13032 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13033 UNSPEC_TLS_LD_BASE);
13034
13035 RTL_CONST_CALL_P (insns) = 1;
13036 emit_libcall_block (insns, base, rax, eqv);
13037 }
13038 else
13039 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13040 }
13041
13042 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13043 off = gen_rtx_CONST (Pmode, off);
13044
13045 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13046
13047 if (TARGET_GNU2_TLS)
13048 {
13049 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13050
13051 if (GET_MODE (x) != Pmode)
13052 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13053
13054 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13055 }
13056 break;
13057
13058 case TLS_MODEL_INITIAL_EXEC:
13059 if (TARGET_64BIT)
13060 {
13061 if (TARGET_SUN_TLS && !TARGET_X32)
13062 {
13063 /* The Sun linker took the AMD64 TLS spec literally
13064 and can only handle %rax as the destination of the
13065 initial executable code sequence. */
13066
13067 dest = gen_reg_rtx (DImode);
13068 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13069 return dest;
13070 }
13071
13072 /* Generate DImode references to avoid %fs:(%reg32)
13073 problems and the linker IE->LE relaxation bug. */
13074 tp_mode = DImode;
13075 pic = NULL;
13076 type = UNSPEC_GOTNTPOFF;
13077 }
13078 else if (flag_pic)
13079 {
13080 if (reload_in_progress)
13081 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13082 pic = pic_offset_table_rtx;
13083 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13084 }
13085 else if (!TARGET_ANY_GNU_TLS)
13086 {
13087 pic = gen_reg_rtx (Pmode);
13088 emit_insn (gen_set_got (pic));
13089 type = UNSPEC_GOTTPOFF;
13090 }
13091 else
13092 {
13093 pic = NULL;
13094 type = UNSPEC_INDNTPOFF;
13095 }
13096
13097 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13098 off = gen_rtx_CONST (tp_mode, off);
13099 if (pic)
13100 off = gen_rtx_PLUS (tp_mode, pic, off);
13101 off = gen_const_mem (tp_mode, off);
13102 set_mem_alias_set (off, ix86_GOT_alias_set ());
13103
13104 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13105 {
13106 base = get_thread_pointer (tp_mode,
13107 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13108 off = force_reg (tp_mode, off);
13109 return gen_rtx_PLUS (tp_mode, base, off);
13110 }
13111 else
13112 {
13113 base = get_thread_pointer (Pmode, true);
13114 dest = gen_reg_rtx (Pmode);
13115 emit_insn (ix86_gen_sub3 (dest, base, off));
13116 }
13117 break;
13118
13119 case TLS_MODEL_LOCAL_EXEC:
13120 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13121 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13122 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13123 off = gen_rtx_CONST (Pmode, off);
13124
13125 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13126 {
13127 base = get_thread_pointer (Pmode,
13128 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13129 return gen_rtx_PLUS (Pmode, base, off);
13130 }
13131 else
13132 {
13133 base = get_thread_pointer (Pmode, true);
13134 dest = gen_reg_rtx (Pmode);
13135 emit_insn (ix86_gen_sub3 (dest, base, off));
13136 }
13137 break;
13138
13139 default:
13140 gcc_unreachable ();
13141 }
13142
13143 return dest;
13144 }
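/* As a rough illustration of the sequences built above (64-bit GNU TLS,
   "x" an arbitrary TLS variable): a local-exec access ends up
   %fs-relative, e.g.

     movq %fs:x@tpoff, %rax

   an initial-exec access first loads the offset from the GOT:

     movq x@gottpoff(%rip), %rcx
     movq %fs:(%rcx), %rax

   and the global- and local-dynamic models call __tls_get_addr (or use
   the TARGET_GNU2_TLS descriptor sequences instead).  */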
13145
13146 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13147 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13148 unique refptr-DECL symbol corresponding to symbol DECL. */
13149
13150 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13151 htab_t dllimport_map;
13152
13153 static tree
13154 get_dllimport_decl (tree decl, bool beimport)
13155 {
13156 struct tree_map *h, in;
13157 void **loc;
13158 const char *name;
13159 const char *prefix;
13160 size_t namelen, prefixlen;
13161 char *imp_name;
13162 tree to;
13163 rtx rtl;
13164
13165 if (!dllimport_map)
13166 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13167
13168 in.hash = htab_hash_pointer (decl);
13169 in.base.from = decl;
13170 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13171 h = (struct tree_map *) *loc;
13172 if (h)
13173 return h->to;
13174
13175 *loc = h = ggc_alloc_tree_map ();
13176 h->hash = in.hash;
13177 h->base.from = decl;
13178 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13179 VAR_DECL, NULL, ptr_type_node);
13180 DECL_ARTIFICIAL (to) = 1;
13181 DECL_IGNORED_P (to) = 1;
13182 DECL_EXTERNAL (to) = 1;
13183 TREE_READONLY (to) = 1;
13184
13185 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13186 name = targetm.strip_name_encoding (name);
13187 if (beimport)
13188 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13189 ? "*__imp_" : "*__imp__";
13190 else
13191 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13192 namelen = strlen (name);
13193 prefixlen = strlen (prefix);
13194 imp_name = (char *) alloca (namelen + prefixlen + 1);
13195 memcpy (imp_name, prefix, prefixlen);
13196 memcpy (imp_name + prefixlen, name, namelen + 1);
13197
13198 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13199 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13200 SET_SYMBOL_REF_DECL (rtl, to);
13201 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13202 if (!beimport)
13203 {
13204 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13205 #ifdef SUB_TARGET_RECORD_STUB
13206 SUB_TARGET_RECORD_STUB (name);
13207 #endif
13208 }
13209
13210 rtl = gen_const_mem (Pmode, rtl);
13211 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13212
13213 SET_DECL_RTL (to, rtl);
13214 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13215
13216 return to;
13217 }
13218
13219 /* Expand SYMBOL into its corresponding far-address symbol.
13220 WANT_REG is true if we require the result to be a register. */
13221
13222 static rtx
13223 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13224 {
13225 tree imp_decl;
13226 rtx x;
13227
13228 gcc_assert (SYMBOL_REF_DECL (symbol));
13229 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13230
13231 x = DECL_RTL (imp_decl);
13232 if (want_reg)
13233 x = force_reg (Pmode, x);
13234 return x;
13235 }
13236
13237 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13238 true if we require the result to be a register. */
13239
13240 static rtx
13241 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13242 {
13243 tree imp_decl;
13244 rtx x;
13245
13246 gcc_assert (SYMBOL_REF_DECL (symbol));
13247 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13248
13249 x = DECL_RTL (imp_decl);
13250 if (want_reg)
13251 x = force_reg (Pmode, x);
13252 return x;
13253 }
13254
13255 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13256 is true if we require the result to be a register. */
13257
13258 static rtx
13259 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13260 {
13261 if (!TARGET_PECOFF)
13262 return NULL_RTX;
13263
13264 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13265 {
13266 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13267 return legitimize_dllimport_symbol (addr, inreg);
13268 if (GET_CODE (addr) == CONST
13269 && GET_CODE (XEXP (addr, 0)) == PLUS
13270 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13271 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13272 {
13273 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13274 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13275 }
13276 }
13277
13278 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13279 return NULL_RTX;
13280 if (GET_CODE (addr) == SYMBOL_REF
13281 && !is_imported_p (addr)
13282 && SYMBOL_REF_EXTERNAL_P (addr)
13283 && SYMBOL_REF_DECL (addr))
13284 return legitimize_pe_coff_extern_decl (addr, inreg);
13285
13286 if (GET_CODE (addr) == CONST
13287 && GET_CODE (XEXP (addr, 0)) == PLUS
13288 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13289 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13290 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13291 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13292 {
13293 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13294 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13295 }
13296 return NULL_RTX;
13297 }
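/* A sketch of the effect on PE-COFF: a dllimported symbol is redirected
   through its "__imp_" indirection cell, and, for the medium and large
   PIC code models, an external symbol with a known decl is redirected
   through a locally emitted "refptr" stub variable, so that the final
   address is always loaded from a pointer-sized cell.  */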
13298
13299 /* Try machine-dependent ways of modifying an illegitimate address
13300 to be legitimate. If we find one, return the new, valid address.
13301 This macro is used in only one place: `memory_address' in explow.c.
13302
13303 OLDX is the address as it was before break_out_memory_refs was called.
13304 In some cases it is useful to look at this to decide what needs to be done.
13305
13306 It is always safe for this macro to do nothing. It exists to recognize
13307 opportunities to optimize the output.
13308
13309 For the 80386, we handle X+REG by loading X into a register R and
13310 using R+REG. R will go in a general reg and indexing will be used.
13311 However, if REG is a broken-out memory address or multiplication,
13312 nothing needs to be done because REG can certainly go in a general reg.
13313
13314 When -fpic is used, special handling is needed for symbolic references.
13315 See comments by legitimize_pic_address in i386.c for details. */
13316
13317 static rtx
13318 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13319 enum machine_mode mode)
13320 {
13321 int changed = 0;
13322 unsigned log;
13323
13324 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13325 if (log)
13326 return legitimize_tls_address (x, (enum tls_model) log, false);
13327 if (GET_CODE (x) == CONST
13328 && GET_CODE (XEXP (x, 0)) == PLUS
13329 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13330 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13331 {
13332 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13333 (enum tls_model) log, false);
13334 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13335 }
13336
13337 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13338 {
13339 rtx tmp = legitimize_pe_coff_symbol (x, true);
13340 if (tmp)
13341 return tmp;
13342 }
13343
13344 if (flag_pic && SYMBOLIC_CONST (x))
13345 return legitimize_pic_address (x, 0);
13346
13347 #if TARGET_MACHO
13348 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13349 return machopic_indirect_data_reference (x, 0);
13350 #endif
13351
13352 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13353 if (GET_CODE (x) == ASHIFT
13354 && CONST_INT_P (XEXP (x, 1))
13355 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13356 {
13357 changed = 1;
13358 log = INTVAL (XEXP (x, 1));
13359 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13360 GEN_INT (1 << log));
13361 }
13362
13363 if (GET_CODE (x) == PLUS)
13364 {
13365 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13366
13367 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13368 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13369 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13370 {
13371 changed = 1;
13372 log = INTVAL (XEXP (XEXP (x, 0), 1));
13373 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13374 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13375 GEN_INT (1 << log));
13376 }
13377
13378 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13379 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13380 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13381 {
13382 changed = 1;
13383 log = INTVAL (XEXP (XEXP (x, 1), 1));
13384 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13385 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13386 GEN_INT (1 << log));
13387 }
13388
13389 /* Put multiply first if it isn't already. */
13390 if (GET_CODE (XEXP (x, 1)) == MULT)
13391 {
13392 rtx tmp = XEXP (x, 0);
13393 XEXP (x, 0) = XEXP (x, 1);
13394 XEXP (x, 1) = tmp;
13395 changed = 1;
13396 }
13397
13398 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13399 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13400 created by virtual register instantiation, register elimination, and
13401 similar optimizations. */
13402 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13403 {
13404 changed = 1;
13405 x = gen_rtx_PLUS (Pmode,
13406 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13407 XEXP (XEXP (x, 1), 0)),
13408 XEXP (XEXP (x, 1), 1));
13409 }
13410
13411 /* Canonicalize
13412 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13413 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13414 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13415 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13416 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13417 && CONSTANT_P (XEXP (x, 1)))
13418 {
13419 rtx constant;
13420 rtx other = NULL_RTX;
13421
13422 if (CONST_INT_P (XEXP (x, 1)))
13423 {
13424 constant = XEXP (x, 1);
13425 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13426 }
13427 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13428 {
13429 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13430 other = XEXP (x, 1);
13431 }
13432 else
13433 constant = 0;
13434
13435 if (constant)
13436 {
13437 changed = 1;
13438 x = gen_rtx_PLUS (Pmode,
13439 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13440 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13441 plus_constant (Pmode, other,
13442 INTVAL (constant)));
13443 }
13444 }
13445
13446 if (changed && ix86_legitimate_address_p (mode, x, false))
13447 return x;
13448
13449 if (GET_CODE (XEXP (x, 0)) == MULT)
13450 {
13451 changed = 1;
13452 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13453 }
13454
13455 if (GET_CODE (XEXP (x, 1)) == MULT)
13456 {
13457 changed = 1;
13458 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13459 }
13460
13461 if (changed
13462 && REG_P (XEXP (x, 1))
13463 && REG_P (XEXP (x, 0)))
13464 return x;
13465
13466 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13467 {
13468 changed = 1;
13469 x = legitimize_pic_address (x, 0);
13470 }
13471
13472 if (changed && ix86_legitimate_address_p (mode, x, false))
13473 return x;
13474
13475 if (REG_P (XEXP (x, 0)))
13476 {
13477 rtx temp = gen_reg_rtx (Pmode);
13478 rtx val = force_operand (XEXP (x, 1), temp);
13479 if (val != temp)
13480 {
13481 val = convert_to_mode (Pmode, val, 1);
13482 emit_move_insn (temp, val);
13483 }
13484
13485 XEXP (x, 1) = temp;
13486 return x;
13487 }
13488
13489 else if (REG_P (XEXP (x, 1)))
13490 {
13491 rtx temp = gen_reg_rtx (Pmode);
13492 rtx val = force_operand (XEXP (x, 0), temp);
13493 if (val != temp)
13494 {
13495 val = convert_to_mode (Pmode, val, 1);
13496 emit_move_insn (temp, val);
13497 }
13498
13499 XEXP (x, 0) = temp;
13500 return x;
13501 }
13502 }
13503
13504 return x;
13505 }
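/* For example, the canonicalizations above rewrite

     (plus (ashift (reg) (const_int 2)) (reg))

   into

     (plus (mult (reg) (const_int 4)) (reg))

   so that the shifted operand can be encoded as a scaled index.  This is
   only a sketch; the exact rewrites applied depend on which
   sub-expressions are already legitimate.  */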
13506 \f
13507 /* Print an integer constant expression in assembler syntax. Addition
13508 and subtraction are the only arithmetic that may appear in these
13509 expressions. FILE is the stdio stream to write to, X is the rtx, and
13510 CODE is the operand print code from the output string. */
13511
13512 static void
13513 output_pic_addr_const (FILE *file, rtx x, int code)
13514 {
13515 char buf[256];
13516
13517 switch (GET_CODE (x))
13518 {
13519 case PC:
13520 gcc_assert (flag_pic);
13521 putc ('.', file);
13522 break;
13523
13524 case SYMBOL_REF:
13525 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13526 output_addr_const (file, x);
13527 else
13528 {
13529 const char *name = XSTR (x, 0);
13530
13531 /* Mark the decl as referenced so that cgraph will
13532 output the function. */
13533 if (SYMBOL_REF_DECL (x))
13534 mark_decl_referenced (SYMBOL_REF_DECL (x));
13535
13536 #if TARGET_MACHO
13537 if (MACHOPIC_INDIRECT
13538 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13539 name = machopic_indirection_name (x, /*stub_p=*/true);
13540 #endif
13541 assemble_name (file, name);
13542 }
13543 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13544 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13545 fputs ("@PLT", file);
13546 break;
13547
13548 case LABEL_REF:
13549 x = XEXP (x, 0);
13550 /* FALLTHRU */
13551 case CODE_LABEL:
13552 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13553 assemble_name (asm_out_file, buf);
13554 break;
13555
13556 case CONST_INT:
13557 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13558 break;
13559
13560 case CONST:
13561 /* This used to output parentheses around the expression,
13562 but that does not work on the 386 (either ATT or BSD assembler). */
13563 output_pic_addr_const (file, XEXP (x, 0), code);
13564 break;
13565
13566 case CONST_DOUBLE:
13567 if (GET_MODE (x) == VOIDmode)
13568 {
13569 /* We can use %d if the number is <32 bits and positive. */
13570 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13571 fprintf (file, "0x%lx%08lx",
13572 (unsigned long) CONST_DOUBLE_HIGH (x),
13573 (unsigned long) CONST_DOUBLE_LOW (x));
13574 else
13575 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13576 }
13577 else
13578 /* We can't handle floating point constants;
13579 TARGET_PRINT_OPERAND must handle them. */
13580 output_operand_lossage ("floating constant misused");
13581 break;
13582
13583 case PLUS:
13584 /* Some assemblers need integer constants to appear first. */
13585 if (CONST_INT_P (XEXP (x, 0)))
13586 {
13587 output_pic_addr_const (file, XEXP (x, 0), code);
13588 putc ('+', file);
13589 output_pic_addr_const (file, XEXP (x, 1), code);
13590 }
13591 else
13592 {
13593 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13594 output_pic_addr_const (file, XEXP (x, 1), code);
13595 putc ('+', file);
13596 output_pic_addr_const (file, XEXP (x, 0), code);
13597 }
13598 break;
13599
13600 case MINUS:
13601 if (!TARGET_MACHO)
13602 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13603 output_pic_addr_const (file, XEXP (x, 0), code);
13604 putc ('-', file);
13605 output_pic_addr_const (file, XEXP (x, 1), code);
13606 if (!TARGET_MACHO)
13607 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13608 break;
13609
13610 case UNSPEC:
13611 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13612 {
13613 bool f = i386_asm_output_addr_const_extra (file, x);
13614 gcc_assert (f);
13615 break;
13616 }
13617
13618 gcc_assert (XVECLEN (x, 0) == 1);
13619 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13620 switch (XINT (x, 1))
13621 {
13622 case UNSPEC_GOT:
13623 fputs ("@GOT", file);
13624 break;
13625 case UNSPEC_GOTOFF:
13626 fputs ("@GOTOFF", file);
13627 break;
13628 case UNSPEC_PLTOFF:
13629 fputs ("@PLTOFF", file);
13630 break;
13631 case UNSPEC_PCREL:
13632 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13633 "(%rip)" : "[rip]", file);
13634 break;
13635 case UNSPEC_GOTPCREL:
13636 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13637 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13638 break;
13639 case UNSPEC_GOTTPOFF:
13640 /* FIXME: This might be @TPOFF in Sun ld too. */
13641 fputs ("@gottpoff", file);
13642 break;
13643 case UNSPEC_TPOFF:
13644 fputs ("@tpoff", file);
13645 break;
13646 case UNSPEC_NTPOFF:
13647 if (TARGET_64BIT)
13648 fputs ("@tpoff", file);
13649 else
13650 fputs ("@ntpoff", file);
13651 break;
13652 case UNSPEC_DTPOFF:
13653 fputs ("@dtpoff", file);
13654 break;
13655 case UNSPEC_GOTNTPOFF:
13656 if (TARGET_64BIT)
13657 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13658 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13659 else
13660 fputs ("@gotntpoff", file);
13661 break;
13662 case UNSPEC_INDNTPOFF:
13663 fputs ("@indntpoff", file);
13664 break;
13665 #if TARGET_MACHO
13666 case UNSPEC_MACHOPIC_OFFSET:
13667 putc ('-', file);
13668 machopic_output_function_base_name (file);
13669 break;
13670 #endif
13671 default:
13672 output_operand_lossage ("invalid UNSPEC as operand");
13673 break;
13674 }
13675 break;
13676
13677 default:
13678 output_operand_lossage ("invalid expression as operand");
13679 }
13680 }
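/* For instance (AT&T syntax, "foo" an arbitrary symbol), this prints
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) as "foo@GOTOFF",
   and (const (plus (unspec [(symbol_ref "foo")] UNSPEC_GOT) (const_int 4)))
   as "4+foo@GOT", with the integer constant first as noted above.  */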
13681
13682 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13683 We need to emit DTP-relative relocations. */
13684
13685 static void ATTRIBUTE_UNUSED
13686 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13687 {
13688 fputs (ASM_LONG, file);
13689 output_addr_const (file, x);
13690 fputs ("@dtpoff", file);
13691 switch (size)
13692 {
13693 case 4:
13694 break;
13695 case 8:
13696 fputs (", 0", file);
13697 break;
13698 default:
13699 gcc_unreachable ();
13700 }
13701 }
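/* E.g. for a TLS variable "x" (an arbitrary name) this emits roughly
   ".long x@dtpoff" for SIZE == 4 and ".long x@dtpoff, 0" for SIZE == 8.  */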
13702
13703 /* Return true if X is a representation of the PIC register. This copes
13704 with calls from ix86_find_base_term, where the register might have
13705 been replaced by a cselib value. */
13706
13707 static bool
13708 ix86_pic_register_p (rtx x)
13709 {
13710 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13711 return (pic_offset_table_rtx
13712 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13713 else
13714 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13715 }
13716
13717 /* Helper function for ix86_delegitimize_address.
13718 Attempt to delegitimize TLS local-exec accesses. */
13719
13720 static rtx
13721 ix86_delegitimize_tls_address (rtx orig_x)
13722 {
13723 rtx x = orig_x, unspec;
13724 struct ix86_address addr;
13725
13726 if (!TARGET_TLS_DIRECT_SEG_REFS)
13727 return orig_x;
13728 if (MEM_P (x))
13729 x = XEXP (x, 0);
13730 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13731 return orig_x;
13732 if (ix86_decompose_address (x, &addr) == 0
13733 || addr.seg != DEFAULT_TLS_SEG_REG
13734 || addr.disp == NULL_RTX
13735 || GET_CODE (addr.disp) != CONST)
13736 return orig_x;
13737 unspec = XEXP (addr.disp, 0);
13738 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13739 unspec = XEXP (unspec, 0);
13740 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13741 return orig_x;
13742 x = XVECEXP (unspec, 0, 0);
13743 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13744 if (unspec != XEXP (addr.disp, 0))
13745 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13746 if (addr.index)
13747 {
13748 rtx idx = addr.index;
13749 if (addr.scale != 1)
13750 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13751 x = gen_rtx_PLUS (Pmode, idx, x);
13752 }
13753 if (addr.base)
13754 x = gen_rtx_PLUS (Pmode, addr.base, x);
13755 if (MEM_P (orig_x))
13756 x = replace_equiv_address_nv (orig_x, x);
13757 return x;
13758 }
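/* For instance, a local-exec access represented as

     (plus:DI (unspec:DI [(const_int 0)] UNSPEC_TP)
              (const:DI (unspec [(symbol_ref "x")] UNSPEC_NTPOFF)))

   (i.e. %fs:x@tpoff, "x" being an arbitrary TLS variable) is folded back
   to (symbol_ref "x"), with any base or index that was also present
   re-applied on top.  */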
13759
13760 /* In the name of slightly smaller debug output, and to cater to
13761 general assembler lossage, recognize PIC+GOTOFF and turn it back
13762 into a direct symbol reference.
13763
13764 On Darwin, this is necessary to avoid a crash, because Darwin
13765 has a different PIC label for each routine but the DWARF debugging
13766 information is not associated with any particular routine, so it's
13767 necessary to remove references to the PIC label from RTL stored by
13768 the DWARF output code. */
13769
13770 static rtx
13771 ix86_delegitimize_address (rtx x)
13772 {
13773 rtx orig_x = delegitimize_mem_from_attrs (x);
13774 /* addend is NULL or some rtx if x is something+GOTOFF where
13775 something doesn't include the PIC register. */
13776 rtx addend = NULL_RTX;
13777 /* reg_addend is NULL or a multiple of some register. */
13778 rtx reg_addend = NULL_RTX;
13779 /* const_addend is NULL or a const_int. */
13780 rtx const_addend = NULL_RTX;
13781 /* This is the result, or NULL. */
13782 rtx result = NULL_RTX;
13783
13784 x = orig_x;
13785
13786 if (MEM_P (x))
13787 x = XEXP (x, 0);
13788
13789 if (TARGET_64BIT)
13790 {
13791 if (GET_CODE (x) == CONST
13792 && GET_CODE (XEXP (x, 0)) == PLUS
13793 && GET_MODE (XEXP (x, 0)) == Pmode
13794 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13795 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13796 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13797 {
13798 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13799 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13800 if (MEM_P (orig_x))
13801 x = replace_equiv_address_nv (orig_x, x);
13802 return x;
13803 }
13804 if (GET_CODE (x) != CONST
13805 || GET_CODE (XEXP (x, 0)) != UNSPEC
13806 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13807 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13808 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13809 return ix86_delegitimize_tls_address (orig_x);
13810 x = XVECEXP (XEXP (x, 0), 0, 0);
13811 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13812 {
13813 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13814 GET_MODE (x), 0);
13815 if (x == NULL_RTX)
13816 return orig_x;
13817 }
13818 return x;
13819 }
13820
13821 if (GET_CODE (x) != PLUS
13822 || GET_CODE (XEXP (x, 1)) != CONST)
13823 return ix86_delegitimize_tls_address (orig_x);
13824
13825 if (ix86_pic_register_p (XEXP (x, 0)))
13826 /* %ebx + GOT/GOTOFF */
13827 ;
13828 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13829 {
13830 /* %ebx + %reg * scale + GOT/GOTOFF */
13831 reg_addend = XEXP (x, 0);
13832 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13833 reg_addend = XEXP (reg_addend, 1);
13834 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13835 reg_addend = XEXP (reg_addend, 0);
13836 else
13837 {
13838 reg_addend = NULL_RTX;
13839 addend = XEXP (x, 0);
13840 }
13841 }
13842 else
13843 addend = XEXP (x, 0);
13844
13845 x = XEXP (XEXP (x, 1), 0);
13846 if (GET_CODE (x) == PLUS
13847 && CONST_INT_P (XEXP (x, 1)))
13848 {
13849 const_addend = XEXP (x, 1);
13850 x = XEXP (x, 0);
13851 }
13852
13853 if (GET_CODE (x) == UNSPEC
13854 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13855 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13856 result = XVECEXP (x, 0, 0);
13857
13858 if (TARGET_MACHO && darwin_local_data_pic (x)
13859 && !MEM_P (orig_x))
13860 result = XVECEXP (x, 0, 0);
13861
13862 if (! result)
13863 return ix86_delegitimize_tls_address (orig_x);
13864
13865 if (const_addend)
13866 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13867 if (reg_addend)
13868 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13869 if (addend)
13870 {
13871 /* If the rest of original X doesn't involve the PIC register, add
13872 addend and subtract pic_offset_table_rtx. This can happen e.g.
13873 for code like:
13874 leal (%ebx, %ecx, 4), %ecx
13875 ...
13876 movl foo@GOTOFF(%ecx), %edx
13877 in which case we return (%ecx - %ebx) + foo. */
13878 if (pic_offset_table_rtx)
13879 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13880 pic_offset_table_rtx),
13881 result);
13882 else
13883 return orig_x;
13884 }
13885 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13886 {
13887 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13888 if (result == NULL_RTX)
13889 return orig_x;
13890 }
13891 return result;
13892 }
13893
13894 /* If X is a machine specific address (i.e. a symbol or label being
13895 referenced as a displacement from the GOT implemented using an
13896 UNSPEC), then return the base term. Otherwise return X. */
13897
13898 rtx
13899 ix86_find_base_term (rtx x)
13900 {
13901 rtx term;
13902
13903 if (TARGET_64BIT)
13904 {
13905 if (GET_CODE (x) != CONST)
13906 return x;
13907 term = XEXP (x, 0);
13908 if (GET_CODE (term) == PLUS
13909 && (CONST_INT_P (XEXP (term, 1))
13910 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13911 term = XEXP (term, 0);
13912 if (GET_CODE (term) != UNSPEC
13913 || (XINT (term, 1) != UNSPEC_GOTPCREL
13914 && XINT (term, 1) != UNSPEC_PCREL))
13915 return x;
13916
13917 return XVECEXP (term, 0, 0);
13918 }
13919
13920 return ix86_delegitimize_address (x);
13921 }
13922 \f
13923 static void
13924 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13925 bool fp, FILE *file)
13926 {
13927 const char *suffix;
13928
13929 if (mode == CCFPmode || mode == CCFPUmode)
13930 {
13931 code = ix86_fp_compare_code_to_integer (code);
13932 mode = CCmode;
13933 }
13934 if (reverse)
13935 code = reverse_condition (code);
13936
13937 switch (code)
13938 {
13939 case EQ:
13940 switch (mode)
13941 {
13942 case CCAmode:
13943 suffix = "a";
13944 break;
13945
13946 case CCCmode:
13947 suffix = "c";
13948 break;
13949
13950 case CCOmode:
13951 suffix = "o";
13952 break;
13953
13954 case CCSmode:
13955 suffix = "s";
13956 break;
13957
13958 default:
13959 suffix = "e";
13960 }
13961 break;
13962 case NE:
13963 switch (mode)
13964 {
13965 case CCAmode:
13966 suffix = "na";
13967 break;
13968
13969 case CCCmode:
13970 suffix = "nc";
13971 break;
13972
13973 case CCOmode:
13974 suffix = "no";
13975 break;
13976
13977 case CCSmode:
13978 suffix = "ns";
13979 break;
13980
13981 default:
13982 suffix = "ne";
13983 }
13984 break;
13985 case GT:
13986 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13987 suffix = "g";
13988 break;
13989 case GTU:
13990 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13991 Those same assemblers have the same but opposite lossage on cmov. */
13992 if (mode == CCmode)
13993 suffix = fp ? "nbe" : "a";
13994 else if (mode == CCCmode)
13995 suffix = "b";
13996 else
13997 gcc_unreachable ();
13998 break;
13999 case LT:
14000 switch (mode)
14001 {
14002 case CCNOmode:
14003 case CCGOCmode:
14004 suffix = "s";
14005 break;
14006
14007 case CCmode:
14008 case CCGCmode:
14009 suffix = "l";
14010 break;
14011
14012 default:
14013 gcc_unreachable ();
14014 }
14015 break;
14016 case LTU:
14017 gcc_assert (mode == CCmode || mode == CCCmode);
14018 suffix = "b";
14019 break;
14020 case GE:
14021 switch (mode)
14022 {
14023 case CCNOmode:
14024 case CCGOCmode:
14025 suffix = "ns";
14026 break;
14027
14028 case CCmode:
14029 case CCGCmode:
14030 suffix = "ge";
14031 break;
14032
14033 default:
14034 gcc_unreachable ();
14035 }
14036 break;
14037 case GEU:
14038 /* ??? As above. */
14039 gcc_assert (mode == CCmode || mode == CCCmode);
14040 suffix = fp ? "nb" : "ae";
14041 break;
14042 case LE:
14043 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14044 suffix = "le";
14045 break;
14046 case LEU:
14047 /* ??? As above. */
14048 if (mode == CCmode)
14049 suffix = "be";
14050 else if (mode == CCCmode)
14051 suffix = fp ? "nb" : "ae";
14052 else
14053 gcc_unreachable ();
14054 break;
14055 case UNORDERED:
14056 suffix = fp ? "u" : "p";
14057 break;
14058 case ORDERED:
14059 suffix = fp ? "nu" : "np";
14060 break;
14061 default:
14062 gcc_unreachable ();
14063 }
14064 fputs (suffix, file);
14065 }
14066
14067 /* Print the name of register X to FILE based on its machine mode and number.
14068 If CODE is 'w', pretend the mode is HImode.
14069 If CODE is 'b', pretend the mode is QImode.
14070 If CODE is 'k', pretend the mode is SImode.
14071 If CODE is 'q', pretend the mode is DImode.
14072 If CODE is 'x', pretend the mode is V4SFmode.
14073 If CODE is 't', pretend the mode is V8SFmode.
14074 If CODE is 'h', pretend the reg is the 'high' byte register.
14075 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack operand.
14076 If CODE is 'd', duplicate the operand for AVX instruction.
14077 */
14078
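/* For example, given hard register 0 (the %rax/%eax family), code 'b'
   prints "al", 'w' prints "ax", 'k' prints "eax" and 'q' prints "rax";
   in AT&T syntax each name is preceded by '%'.  (Illustrative only; the
   exact spellings come from the qi/hi register name tables.)  */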
14079 void
14080 print_reg (rtx x, int code, FILE *file)
14081 {
14082 const char *reg;
14083 unsigned int regno;
14084 bool duplicated = code == 'd' && TARGET_AVX;
14085
14086 if (ASSEMBLER_DIALECT == ASM_ATT)
14087 putc ('%', file);
14088
14089 if (x == pc_rtx)
14090 {
14091 gcc_assert (TARGET_64BIT);
14092 fputs ("rip", file);
14093 return;
14094 }
14095
14096 regno = true_regnum (x);
14097 gcc_assert (regno != ARG_POINTER_REGNUM
14098 && regno != FRAME_POINTER_REGNUM
14099 && regno != FLAGS_REG
14100 && regno != FPSR_REG
14101 && regno != FPCR_REG);
14102
14103 if (code == 'w' || MMX_REG_P (x))
14104 code = 2;
14105 else if (code == 'b')
14106 code = 1;
14107 else if (code == 'k')
14108 code = 4;
14109 else if (code == 'q')
14110 code = 8;
14111 else if (code == 'y')
14112 code = 3;
14113 else if (code == 'h')
14114 code = 0;
14115 else if (code == 'x')
14116 code = 16;
14117 else if (code == 't')
14118 code = 32;
14119 else
14120 code = GET_MODE_SIZE (GET_MODE (x));
14121
14122 /* Irritatingly, AMD extended registers use a different naming convention
14123 from the normal registers: "r%d[bwd]". */
14124 if (REX_INT_REGNO_P (regno))
14125 {
14126 gcc_assert (TARGET_64BIT);
14127 putc ('r', file);
14128 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14129 switch (code)
14130 {
14131 case 0:
14132 error ("extended registers have no high halves");
14133 break;
14134 case 1:
14135 putc ('b', file);
14136 break;
14137 case 2:
14138 putc ('w', file);
14139 break;
14140 case 4:
14141 putc ('d', file);
14142 break;
14143 case 8:
14144 /* no suffix */
14145 break;
14146 default:
14147 error ("unsupported operand size for extended register");
14148 break;
14149 }
14150 return;
14151 }
14152
14153 reg = NULL;
14154 switch (code)
14155 {
14156 case 3:
14157 if (STACK_TOP_P (x))
14158 {
14159 reg = "st(0)";
14160 break;
14161 }
14162 /* FALLTHRU */
14163 case 8:
14164 case 4:
14165 case 12:
14166 if (! ANY_FP_REG_P (x))
14167 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14168 /* FALLTHRU */
14169 case 16:
14170 case 2:
14171 normal:
14172 reg = hi_reg_name[regno];
14173 break;
14174 case 1:
14175 if (regno >= ARRAY_SIZE (qi_reg_name))
14176 goto normal;
14177 reg = qi_reg_name[regno];
14178 break;
14179 case 0:
14180 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14181 goto normal;
14182 reg = qi_high_reg_name[regno];
14183 break;
14184 case 32:
14185 if (SSE_REG_P (x))
14186 {
14187 gcc_assert (!duplicated);
14188 putc ('y', file);
14189 fputs (hi_reg_name[regno] + 1, file);
14190 return;
14191 }
14192 break;
14193 default:
14194 gcc_unreachable ();
14195 }
14196
14197 fputs (reg, file);
14198 if (duplicated)
14199 {
14200 if (ASSEMBLER_DIALECT == ASM_ATT)
14201 fprintf (file, ", %%%s", reg);
14202 else
14203 fprintf (file, ", %s", reg);
14204 }
14205 }
14206
14207 /* Locate some local-dynamic symbol still in use by this function
14208 so that we can print its name in some tls_local_dynamic_base
14209 pattern. */
14210
14211 static int
14212 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14213 {
14214 rtx x = *px;
14215
14216 if (GET_CODE (x) == SYMBOL_REF
14217 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14218 {
14219 cfun->machine->some_ld_name = XSTR (x, 0);
14220 return 1;
14221 }
14222
14223 return 0;
14224 }
14225
14226 static const char *
14227 get_some_local_dynamic_name (void)
14228 {
14229 rtx insn;
14230
14231 if (cfun->machine->some_ld_name)
14232 return cfun->machine->some_ld_name;
14233
14234 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14235 if (NONDEBUG_INSN_P (insn)
14236 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14237 return cfun->machine->some_ld_name;
14238
14239 return NULL;
14240 }
14241
14242 /* Meaning of CODE:
14243 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14244 C -- print opcode suffix for set/cmov insn.
14245 c -- like C, but print reversed condition
14246 F,f -- likewise, but for floating-point.
14247 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14248 otherwise nothing
14249 R -- print the prefix for register names.
14250 z -- print the opcode suffix for the size of the current operand.
14251 Z -- likewise, with special suffixes for x87 instructions.
14252 * -- print a star (in certain assembler syntax)
14253 A -- print an absolute memory reference.
14254 E -- print address with DImode register names if TARGET_64BIT.
14255 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14256 s -- print a shift double count, followed by the assembler's argument
14257 delimiter.
14258 b -- print the QImode name of the register for the indicated operand.
14259 %b0 would print %al if operands[0] is reg 0.
14260 w -- likewise, print the HImode name of the register.
14261 k -- likewise, print the SImode name of the register.
14262 q -- likewise, print the DImode name of the register.
14263 x -- likewise, print the V4SFmode name of the register.
14264 t -- likewise, print the V8SFmode name of the register.
14265 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14266 y -- print "st(0)" instead of "st" as a register.
14267 d -- print duplicated register operand for AVX instruction.
14268 D -- print condition for SSE cmp instruction.
14269 P -- if PIC, print an @PLT suffix.
14270 p -- print raw symbol name.
14271 X -- don't print any sort of PIC '@' suffix for a symbol.
14272 & -- print some in-use local-dynamic symbol name.
14273 H -- print a memory address offset by 8; used for sse high-parts
14274 Y -- print condition for XOP pcom* instruction.
14275 + -- print a branch hint as 'cs' or 'ds' prefix
14276 ; -- print a semicolon (after prefixes due to bug in older gas).
14277 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14278 @ -- print a segment register of thread base pointer load
14279 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14280 */
14281
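/* Illustrative sketch of how these codes are used in insn templates
   (the template text here is hypothetical): something like
   "add%z0\t{%1, %0|%0, %1}" prints "addl" for an SImode operand in AT&T
   syntax (the Intel dialect drops the suffix), while "%k1" forces the
   SImode register name regardless of the operand's actual mode.  */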
14282 void
14283 ix86_print_operand (FILE *file, rtx x, int code)
14284 {
14285 if (code)
14286 {
14287 switch (code)
14288 {
14289 case 'A':
14290 switch (ASSEMBLER_DIALECT)
14291 {
14292 case ASM_ATT:
14293 putc ('*', file);
14294 break;
14295
14296 case ASM_INTEL:
14297 /* Intel syntax. For absolute addresses, registers should not
14298 be surrounded by braces. */
14299 if (!REG_P (x))
14300 {
14301 putc ('[', file);
14302 ix86_print_operand (file, x, 0);
14303 putc (']', file);
14304 return;
14305 }
14306 break;
14307
14308 default:
14309 gcc_unreachable ();
14310 }
14311
14312 ix86_print_operand (file, x, 0);
14313 return;
14314
14315 case 'E':
14316 /* Wrap address in an UNSPEC to declare special handling. */
14317 if (TARGET_64BIT)
14318 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14319
14320 output_address (x);
14321 return;
14322
14323 case 'L':
14324 if (ASSEMBLER_DIALECT == ASM_ATT)
14325 putc ('l', file);
14326 return;
14327
14328 case 'W':
14329 if (ASSEMBLER_DIALECT == ASM_ATT)
14330 putc ('w', file);
14331 return;
14332
14333 case 'B':
14334 if (ASSEMBLER_DIALECT == ASM_ATT)
14335 putc ('b', file);
14336 return;
14337
14338 case 'Q':
14339 if (ASSEMBLER_DIALECT == ASM_ATT)
14340 putc ('l', file);
14341 return;
14342
14343 case 'S':
14344 if (ASSEMBLER_DIALECT == ASM_ATT)
14345 putc ('s', file);
14346 return;
14347
14348 case 'T':
14349 if (ASSEMBLER_DIALECT == ASM_ATT)
14350 putc ('t', file);
14351 return;
14352
14353 case 'O':
14354 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14355 if (ASSEMBLER_DIALECT != ASM_ATT)
14356 return;
14357
14358 switch (GET_MODE_SIZE (GET_MODE (x)))
14359 {
14360 case 2:
14361 putc ('w', file);
14362 break;
14363
14364 case 4:
14365 putc ('l', file);
14366 break;
14367
14368 case 8:
14369 putc ('q', file);
14370 break;
14371
14372 default:
14373 output_operand_lossage
14374 ("invalid operand size for operand code 'O'");
14375 return;
14376 }
14377
14378 putc ('.', file);
14379 #endif
14380 return;
14381
14382 case 'z':
14383 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14384 {
14385 /* Opcodes don't get size suffixes if using Intel opcodes. */
14386 if (ASSEMBLER_DIALECT == ASM_INTEL)
14387 return;
14388
14389 switch (GET_MODE_SIZE (GET_MODE (x)))
14390 {
14391 case 1:
14392 putc ('b', file);
14393 return;
14394
14395 case 2:
14396 putc ('w', file);
14397 return;
14398
14399 case 4:
14400 putc ('l', file);
14401 return;
14402
14403 case 8:
14404 putc ('q', file);
14405 return;
14406
14407 default:
14408 output_operand_lossage
14409 ("invalid operand size for operand code 'z'");
14410 return;
14411 }
14412 }
14413
14414 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14415 warning
14416 (0, "non-integer operand used with operand code 'z'");
14417 /* FALLTHRU */
14418
14419 case 'Z':
14420 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14421 if (ASSEMBLER_DIALECT == ASM_INTEL)
14422 return;
14423
14424 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14425 {
14426 switch (GET_MODE_SIZE (GET_MODE (x)))
14427 {
14428 case 2:
14429 #ifdef HAVE_AS_IX86_FILDS
14430 putc ('s', file);
14431 #endif
14432 return;
14433
14434 case 4:
14435 putc ('l', file);
14436 return;
14437
14438 case 8:
14439 #ifdef HAVE_AS_IX86_FILDQ
14440 putc ('q', file);
14441 #else
14442 fputs ("ll", file);
14443 #endif
14444 return;
14445
14446 default:
14447 break;
14448 }
14449 }
14450 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14451 {
14452 /* 387 opcodes don't get size suffixes
14453 if the operands are registers. */
14454 if (STACK_REG_P (x))
14455 return;
14456
14457 switch (GET_MODE_SIZE (GET_MODE (x)))
14458 {
14459 case 4:
14460 putc ('s', file);
14461 return;
14462
14463 case 8:
14464 putc ('l', file);
14465 return;
14466
14467 case 12:
14468 case 16:
14469 putc ('t', file);
14470 return;
14471
14472 default:
14473 break;
14474 }
14475 }
14476 else
14477 {
14478 output_operand_lossage
14479 ("invalid operand type used with operand code 'Z'");
14480 return;
14481 }
14482
14483 output_operand_lossage
14484 ("invalid operand size for operand code 'Z'");
14485 return;
14486
14487 case 'd':
14488 case 'b':
14489 case 'w':
14490 case 'k':
14491 case 'q':
14492 case 'h':
14493 case 't':
14494 case 'y':
14495 case 'x':
14496 case 'X':
14497 case 'P':
14498 case 'p':
14499 break;
14500
14501 case 's':
14502 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14503 {
14504 ix86_print_operand (file, x, 0);
14505 fputs (", ", file);
14506 }
14507 return;
14508
14509 case 'Y':
14510 switch (GET_CODE (x))
14511 {
14512 case NE:
14513 fputs ("neq", file);
14514 break;
14515 case EQ:
14516 fputs ("eq", file);
14517 break;
14518 case GE:
14519 case GEU:
14520 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14521 break;
14522 case GT:
14523 case GTU:
14524 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14525 break;
14526 case LE:
14527 case LEU:
14528 fputs ("le", file);
14529 break;
14530 case LT:
14531 case LTU:
14532 fputs ("lt", file);
14533 break;
14534 case UNORDERED:
14535 fputs ("unord", file);
14536 break;
14537 case ORDERED:
14538 fputs ("ord", file);
14539 break;
14540 case UNEQ:
14541 fputs ("ueq", file);
14542 break;
14543 case UNGE:
14544 fputs ("nlt", file);
14545 break;
14546 case UNGT:
14547 fputs ("nle", file);
14548 break;
14549 case UNLE:
14550 fputs ("ule", file);
14551 break;
14552 case UNLT:
14553 fputs ("ult", file);
14554 break;
14555 case LTGT:
14556 fputs ("une", file);
14557 break;
14558 default:
14559 output_operand_lossage ("operand is not a condition code, "
14560 "invalid operand code 'Y'");
14561 return;
14562 }
14563 return;
14564
14565 case 'D':
14566 /* A little bit of braindamage here. The SSE compare instructions
14567 use completely different names for the comparisons than the fp
14568 conditional moves do. */
14569 switch (GET_CODE (x))
14570 {
14571 case UNEQ:
14572 if (TARGET_AVX)
14573 {
14574 fputs ("eq_us", file);
14575 break;
14576 }
14577 case EQ:
14578 fputs ("eq", file);
14579 break;
14580 case UNLT:
14581 if (TARGET_AVX)
14582 {
14583 fputs ("nge", file);
14584 break;
14585 }
14586 case LT:
14587 fputs ("lt", file);
14588 break;
14589 case UNLE:
14590 if (TARGET_AVX)
14591 {
14592 fputs ("ngt", file);
14593 break;
14594 }
14595 case LE:
14596 fputs ("le", file);
14597 break;
14598 case UNORDERED:
14599 fputs ("unord", file);
14600 break;
14601 case LTGT:
14602 if (TARGET_AVX)
14603 {
14604 fputs ("neq_oq", file);
14605 break;
14606 }
14607 case NE:
14608 fputs ("neq", file);
14609 break;
14610 case GE:
14611 if (TARGET_AVX)
14612 {
14613 fputs ("ge", file);
14614 break;
14615 }
14616 case UNGE:
14617 fputs ("nlt", file);
14618 break;
14619 case GT:
14620 if (TARGET_AVX)
14621 {
14622 fputs ("gt", file);
14623 break;
14624 }
14625 case UNGT:
14626 fputs ("nle", file);
14627 break;
14628 case ORDERED:
14629 fputs ("ord", file);
14630 break;
14631 default:
14632 output_operand_lossage ("operand is not a condition code, "
14633 "invalid operand code 'D'");
14634 return;
14635 }
14636 return;
14637
14638 case 'F':
14639 case 'f':
14640 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14641 if (ASSEMBLER_DIALECT == ASM_ATT)
14642 putc ('.', file);
14643 #endif
14644
14645 case 'C':
14646 case 'c':
14647 if (!COMPARISON_P (x))
14648 {
14649 output_operand_lossage ("operand is not a condition code, "
14650 "invalid operand code '%c'", code);
14651 return;
14652 }
14653 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14654 code == 'c' || code == 'f',
14655 code == 'F' || code == 'f',
14656 file);
14657 return;
14658
14659 case 'H':
14660 if (!offsettable_memref_p (x))
14661 {
14662 output_operand_lossage ("operand is not an offsettable memory "
14663 "reference, invalid operand code 'H'");
14664 return;
14665 }
14666 /* It doesn't actually matter what mode we use here, as we're
14667 only going to use this for printing. */
14668 x = adjust_address_nv (x, DImode, 8);
14669 break;
14670
14671 case 'K':
14672 gcc_assert (CONST_INT_P (x));
14673
14674 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14675 #ifdef HAVE_AS_IX86_HLE
14676 fputs ("xacquire ", file);
14677 #else
14678 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14679 #endif
14680 else if (INTVAL (x) & IX86_HLE_RELEASE)
14681 #ifdef HAVE_AS_IX86_HLE
14682 fputs ("xrelease ", file);
14683 #else
14684 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14685 #endif
14686 /* We do not want to print the value of the operand. */
14687 return;
14688
14689 case '*':
14690 if (ASSEMBLER_DIALECT == ASM_ATT)
14691 putc ('*', file);
14692 return;
14693
14694 case '&':
14695 {
14696 const char *name = get_some_local_dynamic_name ();
14697 if (name == NULL)
14698 output_operand_lossage ("'%%&' used without any "
14699 "local dynamic TLS references");
14700 else
14701 assemble_name (file, name);
14702 return;
14703 }
14704
14705 case '+':
14706 {
14707 rtx x;
14708
14709 if (!optimize
14710 || optimize_function_for_size_p (cfun)
14711 || !TARGET_BRANCH_PREDICTION_HINTS)
14712 return;
14713
14714 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14715 if (x)
14716 {
14717 int pred_val = INTVAL (XEXP (x, 0));
14718
14719 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14720 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14721 {
14722 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14723 bool cputaken
14724 = final_forward_branch_p (current_output_insn) == 0;
14725
14726 /* Emit hints only in the case the default branch prediction
14727 heuristics would fail. */
14728 if (taken != cputaken)
14729 {
14730 /* We use 3e (DS) prefix for taken branches and
14731 2e (CS) prefix for not taken branches. */
14732 if (taken)
14733 fputs ("ds ; ", file);
14734 else
14735 fputs ("cs ; ", file);
14736 }
14737 }
14738 }
14739 return;
14740 }
14741
14742 case ';':
14743 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14744 putc (';', file);
14745 #endif
14746 return;
14747
14748 case '@':
14749 if (ASSEMBLER_DIALECT == ASM_ATT)
14750 putc ('%', file);
14751
14752 /* The kernel uses a different segment register for performance
14753 reasons; a system call would not have to trash the userspace
14754 segment register, which would be expensive. */
14755 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14756 fputs ("fs", file);
14757 else
14758 fputs ("gs", file);
14759 return;
14760
14761 case '~':
14762 putc (TARGET_AVX2 ? 'i' : 'f', file);
14763 return;
14764
14765 case '^':
14766 if (TARGET_64BIT && Pmode != word_mode)
14767 fputs ("addr32 ", file);
14768 return;
14769
14770 default:
14771 output_operand_lossage ("invalid operand code '%c'", code);
14772 }
14773 }
14774
14775 if (REG_P (x))
14776 print_reg (x, code, file);
14777
14778 else if (MEM_P (x))
14779 {
14780 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14781 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14782 && GET_MODE (x) != BLKmode)
14783 {
14784 const char * size;
14785 switch (GET_MODE_SIZE (GET_MODE (x)))
14786 {
14787 case 1: size = "BYTE"; break;
14788 case 2: size = "WORD"; break;
14789 case 4: size = "DWORD"; break;
14790 case 8: size = "QWORD"; break;
14791 case 12: size = "TBYTE"; break;
14792 case 16:
14793 if (GET_MODE (x) == XFmode)
14794 size = "TBYTE";
14795 else
14796 size = "XMMWORD";
14797 break;
14798 case 32: size = "YMMWORD"; break;
14799 default:
14800 gcc_unreachable ();
14801 }
14802
14803 /* Check for explicit size override (codes 'b', 'w', 'k',
14804 'q' and 'x') */
14805 if (code == 'b')
14806 size = "BYTE";
14807 else if (code == 'w')
14808 size = "WORD";
14809 else if (code == 'k')
14810 size = "DWORD";
14811 else if (code == 'q')
14812 size = "QWORD";
14813 else if (code == 'x')
14814 size = "XMMWORD";
14815
14816 fputs (size, file);
14817 fputs (" PTR ", file);
14818 }
14819
14820 x = XEXP (x, 0);
14821 /* Avoid (%rip) for call operands. */
14822 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14823 && !CONST_INT_P (x))
14824 output_addr_const (file, x);
14825 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14826 output_operand_lossage ("invalid constraints for operand");
14827 else
14828 output_address (x);
14829 }
14830
14831 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14832 {
14833 REAL_VALUE_TYPE r;
14834 long l;
14835
14836 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14837 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14838
14839 if (ASSEMBLER_DIALECT == ASM_ATT)
14840 putc ('$', file);
14841 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14842 if (code == 'q')
14843 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14844 (unsigned long long) (int) l);
14845 else
14846 fprintf (file, "0x%08x", (unsigned int) l);
14847 }
14848
14849 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14850 {
14851 REAL_VALUE_TYPE r;
14852 long l[2];
14853
14854 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14855 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14856
14857 if (ASSEMBLER_DIALECT == ASM_ATT)
14858 putc ('$', file);
14859 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14860 }
14861
14862 /* These float cases don't actually occur as immediate operands. */
14863 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14864 {
14865 char dstr[30];
14866
14867 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14868 fputs (dstr, file);
14869 }
14870
14871 else
14872 {
14873 /* We have patterns that allow zero sets of memory, for instance.
14874 In 64-bit mode, we should probably support all 8-byte vectors,
14875 since we can in fact encode that into an immediate. */
14876 if (GET_CODE (x) == CONST_VECTOR)
14877 {
14878 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14879 x = const0_rtx;
14880 }
14881
14882 if (code != 'P' && code != 'p')
14883 {
14884 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14885 {
14886 if (ASSEMBLER_DIALECT == ASM_ATT)
14887 putc ('$', file);
14888 }
14889 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14890 || GET_CODE (x) == LABEL_REF)
14891 {
14892 if (ASSEMBLER_DIALECT == ASM_ATT)
14893 putc ('$', file);
14894 else
14895 fputs ("OFFSET FLAT:", file);
14896 }
14897 }
14898 if (CONST_INT_P (x))
14899 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14900 else if (flag_pic || MACHOPIC_INDIRECT)
14901 output_pic_addr_const (file, x, code);
14902 else
14903 output_addr_const (file, x);
14904 }
14905 }
14906
14907 static bool
14908 ix86_print_operand_punct_valid_p (unsigned char code)
14909 {
14910 return (code == '@' || code == '*' || code == '+' || code == '&'
14911 || code == ';' || code == '~' || code == '^');
14912 }
14913 \f
14914 /* Print a memory operand whose address is ADDR. */
14915
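/* Illustrative example: for base %eax, index %ebx, scale 4 and a symbolic
   displacement "foo", the AT&T dialect prints "foo(%eax,%ebx,4)" while the
   Intel dialect prints roughly "foo[eax+ebx*4]".  */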
14916 static void
14917 ix86_print_operand_address (FILE *file, rtx addr)
14918 {
14919 struct ix86_address parts;
14920 rtx base, index, disp;
14921 int scale;
14922 int ok;
14923 bool vsib = false;
14924 int code = 0;
14925
14926 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14927 {
14928 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14929 gcc_assert (parts.index == NULL_RTX);
14930 parts.index = XVECEXP (addr, 0, 1);
14931 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14932 addr = XVECEXP (addr, 0, 0);
14933 vsib = true;
14934 }
14935 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14936 {
14937 gcc_assert (TARGET_64BIT);
14938 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14939 code = 'q';
14940 }
14941 else
14942 ok = ix86_decompose_address (addr, &parts);
14943
14944 gcc_assert (ok);
14945
14946 base = parts.base;
14947 index = parts.index;
14948 disp = parts.disp;
14949 scale = parts.scale;
14950
14951 switch (parts.seg)
14952 {
14953 case SEG_DEFAULT:
14954 break;
14955 case SEG_FS:
14956 case SEG_GS:
14957 if (ASSEMBLER_DIALECT == ASM_ATT)
14958 putc ('%', file);
14959 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14960 break;
14961 default:
14962 gcc_unreachable ();
14963 }
14964
14965 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14966 if (TARGET_64BIT && !base && !index)
14967 {
14968 rtx symbol = disp;
14969
14970 if (GET_CODE (disp) == CONST
14971 && GET_CODE (XEXP (disp, 0)) == PLUS
14972 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14973 symbol = XEXP (XEXP (disp, 0), 0);
14974
14975 if (GET_CODE (symbol) == LABEL_REF
14976 || (GET_CODE (symbol) == SYMBOL_REF
14977 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14978 base = pc_rtx;
14979 }
14980 if (!base && !index)
14981 {
14982 /* Displacement only requires special attention. */
14983
14984 if (CONST_INT_P (disp))
14985 {
14986 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14987 fputs ("ds:", file);
14988 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14989 }
14990 else if (flag_pic)
14991 output_pic_addr_const (file, disp, 0);
14992 else
14993 output_addr_const (file, disp);
14994 }
14995 else
14996 {
14997 /* Print SImode register names to force addr32 prefix. */
14998 if (SImode_address_operand (addr, VOIDmode))
14999 {
15000 #ifdef ENABLE_CHECKING
15001 gcc_assert (TARGET_64BIT);
15002 switch (GET_CODE (addr))
15003 {
15004 case SUBREG:
15005 gcc_assert (GET_MODE (addr) == SImode);
15006 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15007 break;
15008 case ZERO_EXTEND:
15009 case AND:
15010 gcc_assert (GET_MODE (addr) == DImode);
15011 break;
15012 default:
15013 gcc_unreachable ();
15014 }
15015 #endif
15016 gcc_assert (!code);
15017 code = 'k';
15018 }
15019 else if (code == 0
15020 && TARGET_X32
15021 && disp
15022 && CONST_INT_P (disp)
15023 && INTVAL (disp) < -16*1024*1024)
15024 {
15025 /* X32 runs in 64-bit mode, where displacement, DISP, in
15026 address DISP(%r64), is encoded as 32-bit immediate sign-
15027 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15028 address is %r64 + 0xffffffffbffffd00. When %r64 <
15029 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15030 which is invalid for x32. The correct address is %r64
15031 - 0x40000300 == 0xf7ffdd64. To properly encode
15032 -0x40000300(%r64) for x32, we zero-extend negative
15033 displacement by forcing addr32 prefix which truncates
15034 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15035 zero-extend all negative displacements, including -1(%rsp).
15036 However, for small negative displacements, sign-extension
15037 won't cause overflow. We only zero-extend negative
15038 displacements if they are < -16*1024*1024, which is also the bound used
15039 to check legitimate address displacements for PIC. */
15040 code = 'k';
15041 }
15042
15043 if (ASSEMBLER_DIALECT == ASM_ATT)
15044 {
15045 if (disp)
15046 {
15047 if (flag_pic)
15048 output_pic_addr_const (file, disp, 0);
15049 else if (GET_CODE (disp) == LABEL_REF)
15050 output_asm_label (disp);
15051 else
15052 output_addr_const (file, disp);
15053 }
15054
15055 putc ('(', file);
15056 if (base)
15057 print_reg (base, code, file);
15058 if (index)
15059 {
15060 putc (',', file);
15061 print_reg (index, vsib ? 0 : code, file);
15062 if (scale != 1 || vsib)
15063 fprintf (file, ",%d", scale);
15064 }
15065 putc (')', file);
15066 }
15067 else
15068 {
15069 rtx offset = NULL_RTX;
15070
15071 if (disp)
15072 {
15073 /* Pull out the offset of a symbol; print any symbol itself. */
15074 if (GET_CODE (disp) == CONST
15075 && GET_CODE (XEXP (disp, 0)) == PLUS
15076 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15077 {
15078 offset = XEXP (XEXP (disp, 0), 1);
15079 disp = gen_rtx_CONST (VOIDmode,
15080 XEXP (XEXP (disp, 0), 0));
15081 }
15082
15083 if (flag_pic)
15084 output_pic_addr_const (file, disp, 0);
15085 else if (GET_CODE (disp) == LABEL_REF)
15086 output_asm_label (disp);
15087 else if (CONST_INT_P (disp))
15088 offset = disp;
15089 else
15090 output_addr_const (file, disp);
15091 }
15092
15093 putc ('[', file);
15094 if (base)
15095 {
15096 print_reg (base, code, file);
15097 if (offset)
15098 {
15099 if (INTVAL (offset) >= 0)
15100 putc ('+', file);
15101 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15102 }
15103 }
15104 else if (offset)
15105 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15106 else
15107 putc ('0', file);
15108
15109 if (index)
15110 {
15111 putc ('+', file);
15112 print_reg (index, vsib ? 0 : code, file);
15113 if (scale != 1 || vsib)
15114 fprintf (file, "*%d", scale);
15115 }
15116 putc (']', file);
15117 }
15118 }
15119 }
15120
15121 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15122
15123 static bool
15124 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15125 {
15126 rtx op;
15127
15128 if (GET_CODE (x) != UNSPEC)
15129 return false;
15130
15131 op = XVECEXP (x, 0, 0);
15132 switch (XINT (x, 1))
15133 {
15134 case UNSPEC_GOTTPOFF:
15135 output_addr_const (file, op);
15136 /* FIXME: This might be @TPOFF in Sun ld. */
15137 fputs ("@gottpoff", file);
15138 break;
15139 case UNSPEC_TPOFF:
15140 output_addr_const (file, op);
15141 fputs ("@tpoff", file);
15142 break;
15143 case UNSPEC_NTPOFF:
15144 output_addr_const (file, op);
15145 if (TARGET_64BIT)
15146 fputs ("@tpoff", file);
15147 else
15148 fputs ("@ntpoff", file);
15149 break;
15150 case UNSPEC_DTPOFF:
15151 output_addr_const (file, op);
15152 fputs ("@dtpoff", file);
15153 break;
15154 case UNSPEC_GOTNTPOFF:
15155 output_addr_const (file, op);
15156 if (TARGET_64BIT)
15157 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15158 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15159 else
15160 fputs ("@gotntpoff", file);
15161 break;
15162 case UNSPEC_INDNTPOFF:
15163 output_addr_const (file, op);
15164 fputs ("@indntpoff", file);
15165 break;
15166 #if TARGET_MACHO
15167 case UNSPEC_MACHOPIC_OFFSET:
15168 output_addr_const (file, op);
15169 putc ('-', file);
15170 machopic_output_function_base_name (file);
15171 break;
15172 #endif
15173
15174 case UNSPEC_STACK_CHECK:
15175 {
15176 int offset;
15177
15178 gcc_assert (flag_split_stack);
15179
15180 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15181 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15182 #else
15183 gcc_unreachable ();
15184 #endif
15185
15186 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15187 }
15188 break;
15189
15190 default:
15191 return false;
15192 }
15193
15194 return true;
15195 }
15196 \f
15197 /* Split one or more double-mode RTL references into pairs of half-mode
15198 references. The RTL can be REG, offsettable MEM, integer constant, or
15199 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15200 split and "num" is its length. lo_half and hi_half are output arrays
15201 that parallel "operands". */
15202
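/* For example, splitting a DImode MEM yields two SImode MEMs at byte
   offsets 0 and 4, and splitting a TImode register yields its two DImode
   subregs.  (Illustrative description of the code below.)  */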
15203 void
15204 split_double_mode (enum machine_mode mode, rtx operands[],
15205 int num, rtx lo_half[], rtx hi_half[])
15206 {
15207 enum machine_mode half_mode;
15208 unsigned int byte;
15209
15210 switch (mode)
15211 {
15212 case TImode:
15213 half_mode = DImode;
15214 break;
15215 case DImode:
15216 half_mode = SImode;
15217 break;
15218 default:
15219 gcc_unreachable ();
15220 }
15221
15222 byte = GET_MODE_SIZE (half_mode);
15223
15224 while (num--)
15225 {
15226 rtx op = operands[num];
15227
15228 /* simplify_subreg refuses to split volatile memory addresses,
15229 but we still have to handle them. */
15230 if (MEM_P (op))
15231 {
15232 lo_half[num] = adjust_address (op, half_mode, 0);
15233 hi_half[num] = adjust_address (op, half_mode, byte);
15234 }
15235 else
15236 {
15237 lo_half[num] = simplify_gen_subreg (half_mode, op,
15238 GET_MODE (op) == VOIDmode
15239 ? mode : GET_MODE (op), 0);
15240 hi_half[num] = simplify_gen_subreg (half_mode, op,
15241 GET_MODE (op) == VOIDmode
15242 ? mode : GET_MODE (op), byte);
15243 }
15244 }
15245 }
15246 \f
15247 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15248 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15249 is the expression of the binary operation. The output may either be
15250 emitted here, or returned to the caller, like all output_* functions.
15251
15252 There is no guarantee that the operands are the same mode, as they
15253 might be within FLOAT or FLOAT_EXTEND expressions. */
15254
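/* The returned templates use the "{att|intel}" dual-dialect syntax.  As a
   sketch taken from the SSE path below, a DFmode add with AVX enabled is
   emitted as "vaddsd\t{%2, %1, %0|%0, %1, %2}", while the non-AVX form
   drops the 'v' prefix and the extra source operand.  */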
15255 #ifndef SYSV386_COMPAT
15256 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15257 wants to fix the assemblers because that causes incompatibility
15258 with gcc. No-one wants to fix gcc because that causes
15259 incompatibility with assemblers... You can use the option of
15260 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15261 #define SYSV386_COMPAT 1
15262 #endif
15263
15264 const char *
15265 output_387_binary_op (rtx insn, rtx *operands)
15266 {
15267 static char buf[40];
15268 const char *p;
15269 const char *ssep;
15270 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15271
15272 #ifdef ENABLE_CHECKING
15273 /* Even if we do not want to check the inputs, this documents the input
15274 constraints, which helps in understanding the following code. */
15275 if (STACK_REG_P (operands[0])
15276 && ((REG_P (operands[1])
15277 && REGNO (operands[0]) == REGNO (operands[1])
15278 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15279 || (REG_P (operands[2])
15280 && REGNO (operands[0]) == REGNO (operands[2])
15281 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15282 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15283 ; /* ok */
15284 else
15285 gcc_assert (is_sse);
15286 #endif
15287
15288 switch (GET_CODE (operands[3]))
15289 {
15290 case PLUS:
15291 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15292 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15293 p = "fiadd";
15294 else
15295 p = "fadd";
15296 ssep = "vadd";
15297 break;
15298
15299 case MINUS:
15300 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15301 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15302 p = "fisub";
15303 else
15304 p = "fsub";
15305 ssep = "vsub";
15306 break;
15307
15308 case MULT:
15309 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15310 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15311 p = "fimul";
15312 else
15313 p = "fmul";
15314 ssep = "vmul";
15315 break;
15316
15317 case DIV:
15318 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15319 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15320 p = "fidiv";
15321 else
15322 p = "fdiv";
15323 ssep = "vdiv";
15324 break;
15325
15326 default:
15327 gcc_unreachable ();
15328 }
15329
15330 if (is_sse)
15331 {
15332 if (TARGET_AVX)
15333 {
15334 strcpy (buf, ssep);
15335 if (GET_MODE (operands[0]) == SFmode)
15336 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15337 else
15338 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15339 }
15340 else
15341 {
15342 strcpy (buf, ssep + 1);
15343 if (GET_MODE (operands[0]) == SFmode)
15344 strcat (buf, "ss\t{%2, %0|%0, %2}");
15345 else
15346 strcat (buf, "sd\t{%2, %0|%0, %2}");
15347 }
15348 return buf;
15349 }
15350 strcpy (buf, p);
15351
15352 switch (GET_CODE (operands[3]))
15353 {
15354 case MULT:
15355 case PLUS:
15356 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15357 {
15358 rtx temp = operands[2];
15359 operands[2] = operands[1];
15360 operands[1] = temp;
15361 }
15362
15363 /* We know operands[0] == operands[1]. */
15364
15365 if (MEM_P (operands[2]))
15366 {
15367 p = "%Z2\t%2";
15368 break;
15369 }
15370
15371 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15372 {
15373 if (STACK_TOP_P (operands[0]))
15374 /* How is it that we are storing to a dead operand[2]?
15375 Well, presumably operands[1] is dead too. We can't
15376 store the result to st(0) as st(0) gets popped on this
15377 instruction. Instead store to operands[2] (which I
15378 think has to be st(1)). st(1) will be popped later.
15379 gcc <= 2.8.1 didn't have this check and generated
15380 assembly code that the Unixware assembler rejected. */
15381 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15382 else
15383 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15384 break;
15385 }
15386
15387 if (STACK_TOP_P (operands[0]))
15388 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15389 else
15390 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15391 break;
15392
15393 case MINUS:
15394 case DIV:
15395 if (MEM_P (operands[1]))
15396 {
15397 p = "r%Z1\t%1";
15398 break;
15399 }
15400
15401 if (MEM_P (operands[2]))
15402 {
15403 p = "%Z2\t%2";
15404 break;
15405 }
15406
15407 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15408 {
15409 #if SYSV386_COMPAT
15410 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15411 derived assemblers, confusingly reverse the direction of
15412 the operation for fsub{r} and fdiv{r} when the
15413 destination register is not st(0). The Intel assembler
15414 doesn't have this brain damage. Read !SYSV386_COMPAT to
15415 figure out what the hardware really does. */
15416 if (STACK_TOP_P (operands[0]))
15417 p = "{p\t%0, %2|rp\t%2, %0}";
15418 else
15419 p = "{rp\t%2, %0|p\t%0, %2}";
15420 #else
15421 if (STACK_TOP_P (operands[0]))
15422 /* As above for fmul/fadd, we can't store to st(0). */
15423 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15424 else
15425 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15426 #endif
15427 break;
15428 }
15429
15430 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15431 {
15432 #if SYSV386_COMPAT
15433 if (STACK_TOP_P (operands[0]))
15434 p = "{rp\t%0, %1|p\t%1, %0}";
15435 else
15436 p = "{p\t%1, %0|rp\t%0, %1}";
15437 #else
15438 if (STACK_TOP_P (operands[0]))
15439 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15440 else
15441 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15442 #endif
15443 break;
15444 }
15445
15446 if (STACK_TOP_P (operands[0]))
15447 {
15448 if (STACK_TOP_P (operands[1]))
15449 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15450 else
15451 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15452 break;
15453 }
15454 else if (STACK_TOP_P (operands[1]))
15455 {
15456 #if SYSV386_COMPAT
15457 p = "{\t%1, %0|r\t%0, %1}";
15458 #else
15459 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15460 #endif
15461 }
15462 else
15463 {
15464 #if SYSV386_COMPAT
15465 p = "{r\t%2, %0|\t%0, %2}";
15466 #else
15467 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15468 #endif
15469 }
15470 break;
15471
15472 default:
15473 gcc_unreachable ();
15474 }
15475
15476 strcat (buf, p);
15477 return buf;
15478 }
15479
15480 /* Check if a 256bit AVX register is referenced inside of EXP. */
15481
15482 static int
15483 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15484 {
15485 rtx exp = *pexp;
15486
15487 if (GET_CODE (exp) == SUBREG)
15488 exp = SUBREG_REG (exp);
15489
15490 if (REG_P (exp)
15491 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15492 return 1;
15493
15494 return 0;
15495 }
15496
15497 /* Return needed mode for entity in optimize_mode_switching pass. */
15498
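/* Roughly, the AVX_U128 states used below mean: AVX_U128_CLEAN -- the
   upper 128 bits of the ymm registers are known to be zero;
   AVX_U128_DIRTY -- they may hold live data, so a vzeroupper is needed
   before code that must see them clean; AVX_U128_ANY -- no requirement.  */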
15499 static int
15500 ix86_avx_u128_mode_needed (rtx insn)
15501 {
15502 if (CALL_P (insn))
15503 {
15504 rtx link;
15505
15506 /* Needed mode is set to AVX_U128_CLEAN if there are
15507 no 256bit modes used in function arguments. */
15508 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15509 link;
15510 link = XEXP (link, 1))
15511 {
15512 if (GET_CODE (XEXP (link, 0)) == USE)
15513 {
15514 rtx arg = XEXP (XEXP (link, 0), 0);
15515
15516 if (ix86_check_avx256_register (&arg, NULL))
15517 return AVX_U128_ANY;
15518 }
15519 }
15520
15521 return AVX_U128_CLEAN;
15522 }
15523
15524 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15525 changes state only when a 256bit register is written to, but we need
15526 to prevent the compiler from moving the optimal insertion point above
15527 an eventual read from a 256bit register. */
15528 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15529 return AVX_U128_DIRTY;
15530
15531 return AVX_U128_ANY;
15532 }
15533
15534 /* Return mode that i387 must be switched into
15535 prior to the execution of insn. */
15536
15537 static int
15538 ix86_i387_mode_needed (int entity, rtx insn)
15539 {
15540 enum attr_i387_cw mode;
15541
15542 /* The mode UNINITIALIZED is used to store the control word after a
15543 function call or ASM pattern. The mode ANY specifies that the function
15544 has no requirements on the control word and makes no changes in the
15545 bits we are interested in. */
15546
15547 if (CALL_P (insn)
15548 || (NONJUMP_INSN_P (insn)
15549 && (asm_noperands (PATTERN (insn)) >= 0
15550 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15551 return I387_CW_UNINITIALIZED;
15552
15553 if (recog_memoized (insn) < 0)
15554 return I387_CW_ANY;
15555
15556 mode = get_attr_i387_cw (insn);
15557
15558 switch (entity)
15559 {
15560 case I387_TRUNC:
15561 if (mode == I387_CW_TRUNC)
15562 return mode;
15563 break;
15564
15565 case I387_FLOOR:
15566 if (mode == I387_CW_FLOOR)
15567 return mode;
15568 break;
15569
15570 case I387_CEIL:
15571 if (mode == I387_CW_CEIL)
15572 return mode;
15573 break;
15574
15575 case I387_MASK_PM:
15576 if (mode == I387_CW_MASK_PM)
15577 return mode;
15578 break;
15579
15580 default:
15581 gcc_unreachable ();
15582 }
15583
15584 return I387_CW_ANY;
15585 }
15586
15587 /* Return mode that entity must be switched into
15588 prior to the execution of insn. */
15589
15590 int
15591 ix86_mode_needed (int entity, rtx insn)
15592 {
15593 switch (entity)
15594 {
15595 case AVX_U128:
15596 return ix86_avx_u128_mode_needed (insn);
15597 case I387_TRUNC:
15598 case I387_FLOOR:
15599 case I387_CEIL:
15600 case I387_MASK_PM:
15601 return ix86_i387_mode_needed (entity, insn);
15602 default:
15603 gcc_unreachable ();
15604 }
15605 return 0;
15606 }
15607
15608 /* Check if a 256bit AVX register is referenced in stores. */
15609
15610 static void
15611 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15612 {
15613 if (ix86_check_avx256_register (&dest, NULL))
15614 {
15615 bool *used = (bool *) data;
15616 *used = true;
15617 }
15618 }
15619
15620 /* Calculate mode of upper 128bit AVX registers after the insn. */
15621
15622 static int
15623 ix86_avx_u128_mode_after (int mode, rtx insn)
15624 {
15625 rtx pat = PATTERN (insn);
15626
15627 if (vzeroupper_operation (pat, VOIDmode)
15628 || vzeroall_operation (pat, VOIDmode))
15629 return AVX_U128_CLEAN;
15630
15631 /* We know that the state is clean after a CALL insn if the function
15632 return value does not live in a 256bit register. */
15633 if (CALL_P (insn))
15634 {
15635 bool avx_reg256_found = false;
15636 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15637 if (!avx_reg256_found)
15638 return AVX_U128_CLEAN;
15639 }
15640
15641 /* Otherwise, return current mode. Remember that if insn
15642 references AVX 256bit registers, the mode was already changed
15643 to DIRTY from MODE_NEEDED. */
15644 return mode;
15645 }
15646
15647 /* Return the mode that an insn results in. */
15648
15649 int
15650 ix86_mode_after (int entity, int mode, rtx insn)
15651 {
15652 switch (entity)
15653 {
15654 case AVX_U128:
15655 return ix86_avx_u128_mode_after (mode, insn);
15656 case I387_TRUNC:
15657 case I387_FLOOR:
15658 case I387_CEIL:
15659 case I387_MASK_PM:
15660 return mode;
15661 default:
15662 gcc_unreachable ();
15663 }
15664 }
15665
15666 static int
15667 ix86_avx_u128_mode_entry (void)
15668 {
15669 tree arg;
15670
15671 /* Entry mode is set to AVX_U128_DIRTY if there are
15672 256bit modes used in function arguments. */
15673 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15674 arg = TREE_CHAIN (arg))
15675 {
15676 rtx incoming = DECL_INCOMING_RTL (arg);
15677
15678 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15679 return AVX_U128_DIRTY;
15680 }
15681
15682 return AVX_U128_CLEAN;
15683 }
15684
15685 /* Return a mode that ENTITY is assumed to be
15686 switched to at function entry. */
15687
15688 int
15689 ix86_mode_entry (int entity)
15690 {
15691 switch (entity)
15692 {
15693 case AVX_U128:
15694 return ix86_avx_u128_mode_entry ();
15695 case I387_TRUNC:
15696 case I387_FLOOR:
15697 case I387_CEIL:
15698 case I387_MASK_PM:
15699 return I387_CW_ANY;
15700 default:
15701 gcc_unreachable ();
15702 }
15703 }
15704
15705 static int
15706 ix86_avx_u128_mode_exit (void)
15707 {
15708 rtx reg = crtl->return_rtx;
15709
15710 /* Exit mode is set to AVX_U128_DIRTY if there are
15711 256bit modes used in the function return register. */
15712 if (reg && ix86_check_avx256_register (&reg, NULL))
15713 return AVX_U128_DIRTY;
15714
15715 return AVX_U128_CLEAN;
15716 }
15717
15718 /* Return a mode that ENTITY is assumed to be
15719 switched to at function exit. */
15720
15721 int
15722 ix86_mode_exit (int entity)
15723 {
15724 switch (entity)
15725 {
15726 case AVX_U128:
15727 return ix86_avx_u128_mode_exit ();
15728 case I387_TRUNC:
15729 case I387_FLOOR:
15730 case I387_CEIL:
15731 case I387_MASK_PM:
15732 return I387_CW_ANY;
15733 default:
15734 gcc_unreachable ();
15735 }
15736 }
15737
15738 /* Output code to initialize control word copies used by trunc?f?i and
15739 rounding patterns. The current control word is saved to one stack
15740 slot, and a copy modified for MODE is stored in another. */
15741
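/* Background for the constants used below (standard x87 control word
   layout): bits 10-11 select rounding -- 00 nearest, 01 down (floor),
   10 up (ceil), 11 toward zero (trunc) -- hence the 0x0400, 0x0800 and
   0x0c00 masks; bit 5 (0x0020) masks the precision exception for
   nearbyint.  */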
15742 static void
15743 emit_i387_cw_initialization (int mode)
15744 {
15745 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15746 rtx new_mode;
15747
15748 enum ix86_stack_slot slot;
15749
15750 rtx reg = gen_reg_rtx (HImode);
15751
15752 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15753 emit_move_insn (reg, copy_rtx (stored_mode));
15754
15755 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15756 || optimize_insn_for_size_p ())
15757 {
15758 switch (mode)
15759 {
15760 case I387_CW_TRUNC:
15761 /* round toward zero (truncate) */
15762 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15763 slot = SLOT_CW_TRUNC;
15764 break;
15765
15766 case I387_CW_FLOOR:
15767 /* round down toward -oo */
15768 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15769 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15770 slot = SLOT_CW_FLOOR;
15771 break;
15772
15773 case I387_CW_CEIL:
15774 /* round up toward +oo */
15775 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15776 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15777 slot = SLOT_CW_CEIL;
15778 break;
15779
15780 case I387_CW_MASK_PM:
15781 /* mask precision exception for nearbyint() */
15782 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15783 slot = SLOT_CW_MASK_PM;
15784 break;
15785
15786 default:
15787 gcc_unreachable ();
15788 }
15789 }
15790 else
15791 {
15792 switch (mode)
15793 {
15794 case I387_CW_TRUNC:
15795 /* round toward zero (truncate) */
15796 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15797 slot = SLOT_CW_TRUNC;
15798 break;
15799
15800 case I387_CW_FLOOR:
15801 /* round down toward -oo */
15802 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15803 slot = SLOT_CW_FLOOR;
15804 break;
15805
15806 case I387_CW_CEIL:
15807 /* round up toward +oo */
15808 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15809 slot = SLOT_CW_CEIL;
15810 break;
15811
15812 case I387_CW_MASK_PM:
15813 /* mask precision exception for nearbyint() */
15814 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15815 slot = SLOT_CW_MASK_PM;
15816 break;
15817
15818 default:
15819 gcc_unreachable ();
15820 }
15821 }
15822
15823 gcc_assert (slot < MAX_386_STACK_LOCALS);
15824
15825 new_mode = assign_386_stack_local (HImode, slot);
15826 emit_move_insn (new_mode, reg);
15827 }
15828
15829 /* Emit vzeroupper. */
15830
15831 void
15832 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15833 {
15834 int i;
15835
15836 /* Cancel automatic vzeroupper insertion if there are
15837 live call-saved SSE registers at the insertion point. */
15838
15839 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15840 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15841 return;
15842
15843 if (TARGET_64BIT)
15844 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15845 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15846 return;
15847
15848 emit_insn (gen_avx_vzeroupper ());
15849 }
15850
15851 /* Generate one or more insns to set ENTITY to MODE. */
15852
15853 void
15854 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15855 {
15856 switch (entity)
15857 {
15858 case AVX_U128:
15859 if (mode == AVX_U128_CLEAN)
15860 ix86_avx_emit_vzeroupper (regs_live);
15861 break;
15862 case I387_TRUNC:
15863 case I387_FLOOR:
15864 case I387_CEIL:
15865 case I387_MASK_PM:
15866 if (mode != I387_CW_ANY
15867 && mode != I387_CW_UNINITIALIZED)
15868 emit_i387_cw_initialization (mode);
15869 break;
15870 default:
15871 gcc_unreachable ();
15872 }
15873 }
15874
15875 /* Output code for INSN to convert a float to a signed int. OPERANDS
15876 are the insn operands. The output may be [HSD]Imode and the input
15877 operand may be [SDX]Fmode. */
15878
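/* A sketch of the typical output: with a non-default rounding mode and no
   fisttp, the emitted sequence is roughly
     fldcw  %3      ; load the truncating control word
     fistpl %0      ; convert, store and pop
     fldcw  %2      ; restore the original control word
   with operand numbers as used in the corresponding md patterns.  */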
15879 const char *
15880 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15881 {
15882 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15883 int dimode_p = GET_MODE (operands[0]) == DImode;
15884 int round_mode = get_attr_i387_cw (insn);
15885
15886 /* Jump through a hoop or two for DImode, since the hardware has no
15887 non-popping instruction. We used to do this a different way, but
15888 that was somewhat fragile and broke with post-reload splitters. */
15889 if ((dimode_p || fisttp) && !stack_top_dies)
15890 output_asm_insn ("fld\t%y1", operands);
15891
15892 gcc_assert (STACK_TOP_P (operands[1]));
15893 gcc_assert (MEM_P (operands[0]));
15894 gcc_assert (GET_MODE (operands[1]) != TFmode);
15895
15896 if (fisttp)
15897 output_asm_insn ("fisttp%Z0\t%0", operands);
15898 else
15899 {
15900 if (round_mode != I387_CW_ANY)
15901 output_asm_insn ("fldcw\t%3", operands);
15902 if (stack_top_dies || dimode_p)
15903 output_asm_insn ("fistp%Z0\t%0", operands);
15904 else
15905 output_asm_insn ("fist%Z0\t%0", operands);
15906 if (round_mode != I387_CW_ANY)
15907 output_asm_insn ("fldcw\t%2", operands);
15908 }
15909
15910 return "";
15911 }
15912
15913 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15914 have the values zero or one, indicates the ffreep insn's operand
15915 from the OPERANDS array. */
15916
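/* Note on the raw encoding used when the assembler lacks ffreep support:
   "ffreep %st(N)" encodes as the two bytes DF C0+N, which is what emitting
   the 16-bit value 0xc<N>df via ASM_SHORT spells out byte-wise on a
   little-endian target.  */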
15917 static const char *
15918 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15919 {
15920 if (TARGET_USE_FFREEP)
15921 #ifdef HAVE_AS_IX86_FFREEP
15922 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15923 #else
15924 {
15925 static char retval[32];
15926 int regno = REGNO (operands[opno]);
15927
15928 gcc_assert (STACK_REGNO_P (regno));
15929
15930 regno -= FIRST_STACK_REG;
15931
15932 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15933 return retval;
15934 }
15935 #endif
15936
15937 return opno ? "fstp\t%y1" : "fstp\t%y0";
15938 }
15939
15940
15941 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15942 should be used. UNORDERED_P is true when fucom should be used. */
15943
15944 const char *
15945 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15946 {
15947 int stack_top_dies;
15948 rtx cmp_op0, cmp_op1;
15949 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15950
15951 if (eflags_p)
15952 {
15953 cmp_op0 = operands[0];
15954 cmp_op1 = operands[1];
15955 }
15956 else
15957 {
15958 cmp_op0 = operands[1];
15959 cmp_op1 = operands[2];
15960 }
15961
15962 if (is_sse)
15963 {
15964 if (GET_MODE (operands[0]) == SFmode)
15965 if (unordered_p)
15966 return "%vucomiss\t{%1, %0|%0, %1}";
15967 else
15968 return "%vcomiss\t{%1, %0|%0, %1}";
15969 else
15970 if (unordered_p)
15971 return "%vucomisd\t{%1, %0|%0, %1}";
15972 else
15973 return "%vcomisd\t{%1, %0|%0, %1}";
15974 }
15975
15976 gcc_assert (STACK_TOP_P (cmp_op0));
15977
15978 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15979
15980 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15981 {
15982 if (stack_top_dies)
15983 {
15984 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15985 return output_387_ffreep (operands, 1);
15986 }
15987 else
15988 return "ftst\n\tfnstsw\t%0";
15989 }
15990
15991 if (STACK_REG_P (cmp_op1)
15992 && stack_top_dies
15993 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15994 && REGNO (cmp_op1) != FIRST_STACK_REG)
15995 {
15996 /* If both the top of the 387 stack and the other operand (also a
15997 stack register) die, then this must be a
15998 `fcompp' float compare. */
15999
16000 if (eflags_p)
16001 {
16002 /* There is no double popping fcomi variant. Fortunately,
16003 eflags is immune from the fstp's cc clobbering. */
16004 if (unordered_p)
16005 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16006 else
16007 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16008 return output_387_ffreep (operands, 0);
16009 }
16010 else
16011 {
16012 if (unordered_p)
16013 return "fucompp\n\tfnstsw\t%0";
16014 else
16015 return "fcompp\n\tfnstsw\t%0";
16016 }
16017 }
16018 else
16019 {
16020 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
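/* That is, the index into the table below is
   (eflags_p << 3) | (integer-mode operand << 2) | (unordered_p << 1)
   | stack_top_dies, matching the mask computation that follows. */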
16021
16022 static const char * const alt[16] =
16023 {
16024 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16025 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16026 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16027 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16028
16029 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16030 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16031 NULL,
16032 NULL,
16033
16034 "fcomi\t{%y1, %0|%0, %y1}",
16035 "fcomip\t{%y1, %0|%0, %y1}",
16036 "fucomi\t{%y1, %0|%0, %y1}",
16037 "fucomip\t{%y1, %0|%0, %y1}",
16038
16039 NULL,
16040 NULL,
16041 NULL,
16042 NULL
16043 };
16044
16045 int mask;
16046 const char *ret;
16047
16048 mask = eflags_p << 3;
16049 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16050 mask |= unordered_p << 1;
16051 mask |= stack_top_dies;
16052
16053 gcc_assert (mask < 16);
16054 ret = alt[mask];
16055 gcc_assert (ret);
16056
16057 return ret;
16058 }
16059 }
16060
16061 void
16062 ix86_output_addr_vec_elt (FILE *file, int value)
16063 {
16064 const char *directive = ASM_LONG;
16065
16066 #ifdef ASM_QUAD
16067 if (TARGET_LP64)
16068 directive = ASM_QUAD;
16069 #else
16070 gcc_assert (!TARGET_64BIT);
16071 #endif
16072
16073 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16074 }
16075
16076 void
16077 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16078 {
16079 const char *directive = ASM_LONG;
16080
16081 #ifdef ASM_QUAD
16082 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16083 directive = ASM_QUAD;
16084 #else
16085 gcc_assert (!TARGET_64BIT);
16086 #endif
16087 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16088 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16089 fprintf (file, "%s%s%d-%s%d\n",
16090 directive, LPREFIX, value, LPREFIX, rel);
16091 else if (HAVE_AS_GOTOFF_IN_DATA)
16092 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16093 #if TARGET_MACHO
16094 else if (TARGET_MACHO)
16095 {
16096 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16097 machopic_output_function_base_name (file);
16098 putc ('\n', file);
16099 }
16100 #endif
16101 else
16102 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16103 GOT_SYMBOL_NAME, LPREFIX, value);
16104 }
16105 \f
16106 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16107 for the target. */
16108
16109 void
16110 ix86_expand_clear (rtx dest)
16111 {
16112 rtx tmp;
16113
16114 /* We play register width games, which are only valid after reload. */
16115 gcc_assert (reload_completed);
16116
16117 /* Avoid HImode and its attendant prefix byte. */
16118 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16119 dest = gen_rtx_REG (SImode, REGNO (dest));
16120 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16121
16122 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16123 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16124 {
16125 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16126 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16127 }
16128
16129 emit_insn (tmp);
16130 }
16131
16132 /* X is an unchanging MEM. If it is a constant pool reference, return
16133 the constant pool rtx, else NULL. */
16134
16135 rtx
16136 maybe_get_pool_constant (rtx x)
16137 {
16138 x = ix86_delegitimize_address (XEXP (x, 0));
16139
16140 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16141 return get_pool_constant (x);
16142
16143 return NULL_RTX;
16144 }
16145
16146 void
16147 ix86_expand_move (enum machine_mode mode, rtx operands[])
16148 {
16149 rtx op0, op1;
16150 enum tls_model model;
16151
16152 op0 = operands[0];
16153 op1 = operands[1];
16154
16155 if (GET_CODE (op1) == SYMBOL_REF)
16156 {
16157 rtx tmp;
16158
16159 model = SYMBOL_REF_TLS_MODEL (op1);
16160 if (model)
16161 {
16162 op1 = legitimize_tls_address (op1, model, true);
16163 op1 = force_operand (op1, op0);
16164 if (op1 == op0)
16165 return;
16166 op1 = convert_to_mode (mode, op1, 1);
16167 }
16168 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16169 op1 = tmp;
16170 }
16171 else if (GET_CODE (op1) == CONST
16172 && GET_CODE (XEXP (op1, 0)) == PLUS
16173 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16174 {
16175 rtx addend = XEXP (XEXP (op1, 0), 1);
16176 rtx symbol = XEXP (XEXP (op1, 0), 0);
16177 rtx tmp;
16178
16179 model = SYMBOL_REF_TLS_MODEL (symbol);
16180 if (model)
16181 tmp = legitimize_tls_address (symbol, model, true);
16182 else
16183 tmp = legitimize_pe_coff_symbol (symbol, true);
16184
16185 if (tmp)
16186 {
16187 tmp = force_operand (tmp, NULL);
16188 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16189 op0, 1, OPTAB_DIRECT);
16190 if (tmp == op0)
16191 return;
16192 op1 = convert_to_mode (mode, tmp, 1);
16193 }
16194 }
16195
16196 if ((flag_pic || MACHOPIC_INDIRECT)
16197 && symbolic_operand (op1, mode))
16198 {
16199 if (TARGET_MACHO && !TARGET_64BIT)
16200 {
16201 #if TARGET_MACHO
16202 /* dynamic-no-pic */
16203 if (MACHOPIC_INDIRECT)
16204 {
16205 rtx temp = ((reload_in_progress
16206 || ((op0 && REG_P (op0))
16207 && mode == Pmode))
16208 ? op0 : gen_reg_rtx (Pmode));
16209 op1 = machopic_indirect_data_reference (op1, temp);
16210 if (MACHOPIC_PURE)
16211 op1 = machopic_legitimize_pic_address (op1, mode,
16212 temp == op1 ? 0 : temp);
16213 }
16214 if (op0 != op1 && GET_CODE (op0) != MEM)
16215 {
16216 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16217 emit_insn (insn);
16218 return;
16219 }
16220 if (GET_CODE (op0) == MEM)
16221 op1 = force_reg (Pmode, op1);
16222 else
16223 {
16224 rtx temp = op0;
16225 if (GET_CODE (temp) != REG)
16226 temp = gen_reg_rtx (Pmode);
16227 temp = legitimize_pic_address (op1, temp);
16228 if (temp == op0)
16229 return;
16230 op1 = temp;
16231 }
16232 /* dynamic-no-pic */
16233 #endif
16234 }
16235 else
16236 {
16237 if (MEM_P (op0))
16238 op1 = force_reg (mode, op1);
16239 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16240 {
16241 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16242 op1 = legitimize_pic_address (op1, reg);
16243 if (op0 == op1)
16244 return;
16245 op1 = convert_to_mode (mode, op1, 1);
16246 }
16247 }
16248 }
16249 else
16250 {
16251 if (MEM_P (op0)
16252 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16253 || !push_operand (op0, mode))
16254 && MEM_P (op1))
16255 op1 = force_reg (mode, op1);
16256
16257 if (push_operand (op0, mode)
16258 && ! general_no_elim_operand (op1, mode))
16259 op1 = copy_to_mode_reg (mode, op1);
16260
16261 /* Force large constants in 64-bit compilation into a register
16262 to get them CSEed. */
16263 if (can_create_pseudo_p ()
16264 && (mode == DImode) && TARGET_64BIT
16265 && immediate_operand (op1, mode)
16266 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16267 && !register_operand (op0, mode)
16268 && optimize)
16269 op1 = copy_to_mode_reg (mode, op1);
16270
16271 if (can_create_pseudo_p ()
16272 && FLOAT_MODE_P (mode)
16273 && GET_CODE (op1) == CONST_DOUBLE)
16274 {
16275 /* If we are loading a floating point constant to a register,
16276 force the value to memory now, since we'll get better code
16277 out of the back end. */
16278
16279 op1 = validize_mem (force_const_mem (mode, op1));
16280 if (!register_operand (op0, mode))
16281 {
16282 rtx temp = gen_reg_rtx (mode);
16283 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16284 emit_move_insn (op0, temp);
16285 return;
16286 }
16287 }
16288 }
16289
16290 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16291 }
16292
16293 void
16294 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16295 {
16296 rtx op0 = operands[0], op1 = operands[1];
16297 unsigned int align = GET_MODE_ALIGNMENT (mode);
16298
16299 /* Force constants other than zero into memory. We do not know how
16300 the instructions used to build constants modify the upper 64 bits
16301 of the register; once we have that information we may be able
16302 to handle some of them more efficiently. */
16303 if (can_create_pseudo_p ()
16304 && register_operand (op0, mode)
16305 && (CONSTANT_P (op1)
16306 || (GET_CODE (op1) == SUBREG
16307 && CONSTANT_P (SUBREG_REG (op1))))
16308 && !standard_sse_constant_p (op1))
16309 op1 = validize_mem (force_const_mem (mode, op1));
16310
16311 /* We need to check memory alignment for SSE mode since attributes
16312 can make operands unaligned. */
16313 if (can_create_pseudo_p ()
16314 && SSE_REG_MODE_P (mode)
16315 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16316 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16317 {
16318 rtx tmp[2];
16319
16320 /* ix86_expand_vector_move_misalign() does not like constants ... */
16321 if (CONSTANT_P (op1)
16322 || (GET_CODE (op1) == SUBREG
16323 && CONSTANT_P (SUBREG_REG (op1))))
16324 op1 = validize_mem (force_const_mem (mode, op1));
16325
16326 /* ... nor both arguments in memory. */
16327 if (!register_operand (op0, mode)
16328 && !register_operand (op1, mode))
16329 op1 = force_reg (mode, op1);
16330
16331 tmp[0] = op0; tmp[1] = op1;
16332 ix86_expand_vector_move_misalign (mode, tmp);
16333 return;
16334 }
16335
16336 /* Make operand1 a register if it isn't already. */
16337 if (can_create_pseudo_p ()
16338 && !register_operand (op0, mode)
16339 && !register_operand (op1, mode))
16340 {
16341 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16342 return;
16343 }
16344
16345 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16346 }
16347
16348 /* Split 32-byte AVX unaligned load and store if needed. */
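/* When splitting is enabled, an unaligned 32-byte load is done as a 16-byte
   load of the low half plus a VEC_CONCAT that folds in the high half, and an
   unaligned 32-byte store is done as two vextractf128 stores of the low and
   high halves. */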
16349
16350 static void
16351 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16352 {
16353 rtx m;
16354 rtx (*extract) (rtx, rtx, rtx);
16355 rtx (*load_unaligned) (rtx, rtx);
16356 rtx (*store_unaligned) (rtx, rtx);
16357 enum machine_mode mode;
16358
16359 switch (GET_MODE (op0))
16360 {
16361 default:
16362 gcc_unreachable ();
16363 case V32QImode:
16364 extract = gen_avx_vextractf128v32qi;
16365 load_unaligned = gen_avx_loaddqu256;
16366 store_unaligned = gen_avx_storedqu256;
16367 mode = V16QImode;
16368 break;
16369 case V8SFmode:
16370 extract = gen_avx_vextractf128v8sf;
16371 load_unaligned = gen_avx_loadups256;
16372 store_unaligned = gen_avx_storeups256;
16373 mode = V4SFmode;
16374 break;
16375 case V4DFmode:
16376 extract = gen_avx_vextractf128v4df;
16377 load_unaligned = gen_avx_loadupd256;
16378 store_unaligned = gen_avx_storeupd256;
16379 mode = V2DFmode;
16380 break;
16381 }
16382
16383 if (MEM_P (op1))
16384 {
16385 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16386 {
16387 rtx r = gen_reg_rtx (mode);
16388 m = adjust_address (op1, mode, 0);
16389 emit_move_insn (r, m);
16390 m = adjust_address (op1, mode, 16);
16391 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16392 emit_move_insn (op0, r);
16393 }
16394 else
16395 emit_insn (load_unaligned (op0, op1));
16396 }
16397 else if (MEM_P (op0))
16398 {
16399 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16400 {
16401 m = adjust_address (op0, mode, 0);
16402 emit_insn (extract (m, op1, const0_rtx));
16403 m = adjust_address (op0, mode, 16);
16404 emit_insn (extract (m, op1, const1_rtx));
16405 }
16406 else
16407 emit_insn (store_unaligned (op0, op1));
16408 }
16409 else
16410 gcc_unreachable ();
16411 }
16412
16413 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16414 straight to ix86_expand_vector_move. */
16415 /* Code generation for scalar reg-reg moves of single and double precision data:
16416 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16417 movaps reg, reg
16418 else
16419 movss reg, reg
16420 if (x86_sse_partial_reg_dependency == true)
16421 movapd reg, reg
16422 else
16423 movsd reg, reg
16424
16425 Code generation for scalar loads of double precision data:
16426 if (x86_sse_split_regs == true)
16427 movlpd mem, reg (gas syntax)
16428 else
16429 movsd mem, reg
16430
16431 Code generation for unaligned packed loads of single precision data
16432 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16433 if (x86_sse_unaligned_move_optimal)
16434 movups mem, reg
16435
16436 if (x86_sse_partial_reg_dependency == true)
16437 {
16438 xorps reg, reg
16439 movlps mem, reg
16440 movhps mem+8, reg
16441 }
16442 else
16443 {
16444 movlps mem, reg
16445 movhps mem+8, reg
16446 }
16447
16448 Code generation for unaligned packed loads of double precision data
16449 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16450 if (x86_sse_unaligned_move_optimal)
16451 movupd mem, reg
16452
16453 if (x86_sse_split_regs == true)
16454 {
16455 movlpd mem, reg
16456 movhpd mem+8, reg
16457 }
16458 else
16459 {
16460 movsd mem, reg
16461 movhpd mem+8, reg
16462 }
16463 */
16464
16465 void
16466 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16467 {
16468 rtx op0, op1, m;
16469
16470 op0 = operands[0];
16471 op1 = operands[1];
16472
16473 if (TARGET_AVX
16474 && GET_MODE_SIZE (mode) == 32)
16475 {
16476 switch (GET_MODE_CLASS (mode))
16477 {
16478 case MODE_VECTOR_INT:
16479 case MODE_INT:
16480 op0 = gen_lowpart (V32QImode, op0);
16481 op1 = gen_lowpart (V32QImode, op1);
16482 /* FALLTHRU */
16483
16484 case MODE_VECTOR_FLOAT:
16485 ix86_avx256_split_vector_move_misalign (op0, op1);
16486 break;
16487
16488 default:
16489 gcc_unreachable ();
16490 }
16491
16492 return;
16493 }
16494
16495 if (MEM_P (op1))
16496 {
16497 /* ??? If we have typed data, then it would appear that using
16498 movdqu is the only way to get unaligned data loaded with
16499 integer type. */
16500 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16501 {
16502 op0 = gen_lowpart (V16QImode, op0);
16503 op1 = gen_lowpart (V16QImode, op1);
16504 /* We will eventually emit movups based on insn attributes. */
16505 emit_insn (gen_sse2_loaddqu (op0, op1));
16506 }
16507 else if (TARGET_SSE2 && mode == V2DFmode)
16508 {
16509 rtx zero;
16510
16511 if (TARGET_AVX
16512 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16513 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16514 || optimize_insn_for_size_p ())
16515 {
16516 /* We will eventually emit movups based on insn attributes. */
16517 emit_insn (gen_sse2_loadupd (op0, op1));
16518 return;
16519 }
16520
16521 /* When SSE registers are split into halves, we can avoid
16522 writing to the top half twice. */
16523 if (TARGET_SSE_SPLIT_REGS)
16524 {
16525 emit_clobber (op0);
16526 zero = op0;
16527 }
16528 else
16529 {
16530 /* ??? Not sure about the best option for the Intel chips.
16531 The following would seem to satisfy; the register is
16532 entirely cleared, breaking the dependency chain. We
16533 then store to the upper half, with a dependency depth
16534 of one. A rumor has it that Intel recommends two movsd
16535 followed by an unpacklpd, but this is unconfirmed. And
16536 given that the dependency depth of the unpacklpd would
16537 still be one, I'm not sure why this would be better. */
16538 zero = CONST0_RTX (V2DFmode);
16539 }
16540
16541 m = adjust_address (op1, DFmode, 0);
16542 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16543 m = adjust_address (op1, DFmode, 8);
16544 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16545 }
16546 else
16547 {
16548 if (TARGET_AVX
16549 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16550 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16551 || optimize_insn_for_size_p ())
16552 {
16553 op0 = gen_lowpart (V4SFmode, op0);
16554 op1 = gen_lowpart (V4SFmode, op1);
16555 emit_insn (gen_sse_loadups (op0, op1));
16556 return;
16557 }
16558
16559 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16560 emit_move_insn (op0, CONST0_RTX (mode));
16561 else
16562 emit_clobber (op0);
16563
16564 if (mode != V4SFmode)
16565 op0 = gen_lowpart (V4SFmode, op0);
16566
16567 m = adjust_address (op1, V2SFmode, 0);
16568 emit_insn (gen_sse_loadlps (op0, op0, m));
16569 m = adjust_address (op1, V2SFmode, 8);
16570 emit_insn (gen_sse_loadhps (op0, op0, m));
16571 }
16572 }
16573 else if (MEM_P (op0))
16574 {
16575 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16576 {
16577 op0 = gen_lowpart (V16QImode, op0);
16578 op1 = gen_lowpart (V16QImode, op1);
16579 /* We will eventually emit movups based on insn attributes. */
16580 emit_insn (gen_sse2_storedqu (op0, op1));
16581 }
16582 else if (TARGET_SSE2 && mode == V2DFmode)
16583 {
16584 if (TARGET_AVX
16585 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16586 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16587 || optimize_insn_for_size_p ())
16588 /* We will eventually emit movups based on insn attributes. */
16589 emit_insn (gen_sse2_storeupd (op0, op1));
16590 else
16591 {
16592 m = adjust_address (op0, DFmode, 0);
16593 emit_insn (gen_sse2_storelpd (m, op1));
16594 m = adjust_address (op0, DFmode, 8);
16595 emit_insn (gen_sse2_storehpd (m, op1));
16596 }
16597 }
16598 else
16599 {
16600 if (mode != V4SFmode)
16601 op1 = gen_lowpart (V4SFmode, op1);
16602
16603 if (TARGET_AVX
16604 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16605 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16606 || optimize_insn_for_size_p ())
16607 {
16608 op0 = gen_lowpart (V4SFmode, op0);
16609 emit_insn (gen_sse_storeups (op0, op1));
16610 }
16611 else
16612 {
16613 m = adjust_address (op0, V2SFmode, 0);
16614 emit_insn (gen_sse_storelps (m, op1));
16615 m = adjust_address (op0, V2SFmode, 8);
16616 emit_insn (gen_sse_storehps (m, op1));
16617 }
16618 }
16619 }
16620 else
16621 gcc_unreachable ();
16622 }
16623
16624 /* Expand a push in MODE. This is some mode for which we do not support
16625 proper push instructions, at least from the registers that we expect
16626 the value to live in. */
16627
16628 void
16629 ix86_expand_push (enum machine_mode mode, rtx x)
16630 {
16631 rtx tmp;
16632
16633 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16634 GEN_INT (-GET_MODE_SIZE (mode)),
16635 stack_pointer_rtx, 1, OPTAB_DIRECT);
16636 if (tmp != stack_pointer_rtx)
16637 emit_move_insn (stack_pointer_rtx, tmp);
16638
16639 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16640
16641 /* When we push an operand onto the stack, it has to be aligned at least
16642 at the function argument boundary. However, since we don't have
16643 the argument type, we can't determine the actual argument
16644 boundary. */
16645 emit_move_insn (tmp, x);
16646 }
16647
16648 /* Helper function of ix86_fixup_binary_operands to canonicalize
16649 operand order. Returns true if the operands should be swapped. */
16650
16651 static bool
16652 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16653 rtx operands[])
16654 {
16655 rtx dst = operands[0];
16656 rtx src1 = operands[1];
16657 rtx src2 = operands[2];
16658
16659 /* If the operation is not commutative, we can't do anything. */
16660 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16661 return false;
16662
16663 /* Highest priority is that src1 should match dst. */
16664 if (rtx_equal_p (dst, src1))
16665 return false;
16666 if (rtx_equal_p (dst, src2))
16667 return true;
16668
16669 /* Next highest priority is that immediate constants come second. */
16670 if (immediate_operand (src2, mode))
16671 return false;
16672 if (immediate_operand (src1, mode))
16673 return true;
16674
16675 /* Lowest priority is that memory references should come second. */
16676 if (MEM_P (src2))
16677 return false;
16678 if (MEM_P (src1))
16679 return true;
16680
16681 return false;
16682 }
16683
16684
16685 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16686 destination to use for the operation. If different from the true
16687 destination in operands[0], a copy operation will be required. */
16688
16689 rtx
16690 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16691 rtx operands[])
16692 {
16693 rtx dst = operands[0];
16694 rtx src1 = operands[1];
16695 rtx src2 = operands[2];
16696
16697 /* Canonicalize operand order. */
16698 if (ix86_swap_binary_operands_p (code, mode, operands))
16699 {
16700 rtx temp;
16701
16702 /* It is invalid to swap operands of different modes. */
16703 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16704
16705 temp = src1;
16706 src1 = src2;
16707 src2 = temp;
16708 }
16709
16710 /* Both source operands cannot be in memory. */
16711 if (MEM_P (src1) && MEM_P (src2))
16712 {
16713 /* Optimization: Only read from memory once. */
16714 if (rtx_equal_p (src1, src2))
16715 {
16716 src2 = force_reg (mode, src2);
16717 src1 = src2;
16718 }
16719 else
16720 src2 = force_reg (mode, src2);
16721 }
16722
16723 /* If the destination is memory, and we do not have matching source
16724 operands, do things in registers. */
16725 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16726 dst = gen_reg_rtx (mode);
16727
16728 /* Source 1 cannot be a constant. */
16729 if (CONSTANT_P (src1))
16730 src1 = force_reg (mode, src1);
16731
16732 /* Source 1 cannot be a non-matching memory. */
16733 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16734 src1 = force_reg (mode, src1);
16735
16736 /* Improve address combine. */
16737 if (code == PLUS
16738 && GET_MODE_CLASS (mode) == MODE_INT
16739 && MEM_P (src2))
16740 src2 = force_reg (mode, src2);
16741
16742 operands[1] = src1;
16743 operands[2] = src2;
16744 return dst;
16745 }
16746
16747 /* Similarly, but assume that the destination has already been
16748 set up properly. */
16749
16750 void
16751 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16752 enum machine_mode mode, rtx operands[])
16753 {
16754 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16755 gcc_assert (dst == operands[0]);
16756 }
16757
16758 /* Attempt to expand a binary operator. Make the expansion closer to the
16759 actual machine than just general_operand, which would allow 3 separate
16760 memory references (one output, two inputs) in a single insn. */
16761
16762 void
16763 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16764 rtx operands[])
16765 {
16766 rtx src1, src2, dst, op, clob;
16767
16768 dst = ix86_fixup_binary_operands (code, mode, operands);
16769 src1 = operands[1];
16770 src2 = operands[2];
16771
16772 /* Emit the instruction. */
16773
16774 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16775 if (reload_in_progress)
16776 {
16777 /* Reload doesn't know about the flags register, and doesn't know that
16778 it doesn't want to clobber it. We can only do this with PLUS. */
16779 gcc_assert (code == PLUS);
16780 emit_insn (op);
16781 }
16782 else if (reload_completed
16783 && code == PLUS
16784 && !rtx_equal_p (dst, src1))
16785 {
16786 /* This is going to be an LEA; avoid splitting it later. */
16787 emit_insn (op);
16788 }
16789 else
16790 {
16791 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16792 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16793 }
16794
16795 /* Fix up the destination if needed. */
16796 if (dst != operands[0])
16797 emit_move_insn (operands[0], dst);
16798 }
16799
16800 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16801 the given OPERANDS. */
16802
16803 void
16804 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16805 rtx operands[])
16806 {
16807 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16808 if (GET_CODE (operands[1]) == SUBREG)
16809 {
16810 op1 = operands[1];
16811 op2 = operands[2];
16812 }
16813 else if (GET_CODE (operands[2]) == SUBREG)
16814 {
16815 op1 = operands[2];
16816 op2 = operands[1];
16817 }
16818 /* Optimize (__m128i) d | (__m128i) e and similar code
16819 when d and e are float vectors into a float vector logical
16820 insn. In C/C++ without using intrinsics there is no other way
16821 to express a vector logical operation on float vectors than
16822 to cast them temporarily to integer vectors. */
16823 if (op1
16824 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16825 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16826 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16827 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16828 && SUBREG_BYTE (op1) == 0
16829 && (GET_CODE (op2) == CONST_VECTOR
16830 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16831 && SUBREG_BYTE (op2) == 0))
16832 && can_create_pseudo_p ())
16833 {
16834 rtx dst;
16835 switch (GET_MODE (SUBREG_REG (op1)))
16836 {
16837 case V4SFmode:
16838 case V8SFmode:
16839 case V2DFmode:
16840 case V4DFmode:
16841 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16842 if (GET_CODE (op2) == CONST_VECTOR)
16843 {
16844 op2 = gen_lowpart (GET_MODE (dst), op2);
16845 op2 = force_reg (GET_MODE (dst), op2);
16846 }
16847 else
16848 {
16849 op1 = operands[1];
16850 op2 = SUBREG_REG (operands[2]);
16851 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16852 op2 = force_reg (GET_MODE (dst), op2);
16853 }
16854 op1 = SUBREG_REG (op1);
16855 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16856 op1 = force_reg (GET_MODE (dst), op1);
16857 emit_insn (gen_rtx_SET (VOIDmode, dst,
16858 gen_rtx_fmt_ee (code, GET_MODE (dst),
16859 op1, op2)));
16860 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16861 return;
16862 default:
16863 break;
16864 }
16865 }
16866 if (!nonimmediate_operand (operands[1], mode))
16867 operands[1] = force_reg (mode, operands[1]);
16868 if (!nonimmediate_operand (operands[2], mode))
16869 operands[2] = force_reg (mode, operands[2]);
16870 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16871 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16872 gen_rtx_fmt_ee (code, mode, operands[1],
16873 operands[2])));
16874 }
16875
16876 /* Return TRUE or FALSE depending on whether the binary operator meets the
16877 appropriate constraints. */
16878
16879 bool
16880 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16881 rtx operands[3])
16882 {
16883 rtx dst = operands[0];
16884 rtx src1 = operands[1];
16885 rtx src2 = operands[2];
16886
16887 /* Both source operands cannot be in memory. */
16888 if (MEM_P (src1) && MEM_P (src2))
16889 return false;
16890
16891 /* Canonicalize operand order for commutative operators. */
16892 if (ix86_swap_binary_operands_p (code, mode, operands))
16893 {
16894 rtx temp = src1;
16895 src1 = src2;
16896 src2 = temp;
16897 }
16898
16899 /* If the destination is memory, we must have a matching source operand. */
16900 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16901 return false;
16902
16903 /* Source 1 cannot be a constant. */
16904 if (CONSTANT_P (src1))
16905 return false;
16906
16907 /* Source 1 cannot be a non-matching memory. */
16908 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16909 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16910 return (code == AND
16911 && (mode == HImode
16912 || mode == SImode
16913 || (TARGET_64BIT && mode == DImode))
16914 && satisfies_constraint_L (src2));
16915
16916 return true;
16917 }
16918
16919 /* Attempt to expand a unary operator. Make the expansion closer to the
16920 actual machine than just general_operand, which would allow 2 separate
16921 memory references (one output, one input) in a single insn. */
16922
16923 void
16924 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16925 rtx operands[])
16926 {
16927 int matching_memory;
16928 rtx src, dst, op, clob;
16929
16930 dst = operands[0];
16931 src = operands[1];
16932
16933 /* If the destination is memory, and we do not have matching source
16934 operands, do things in registers. */
16935 matching_memory = 0;
16936 if (MEM_P (dst))
16937 {
16938 if (rtx_equal_p (dst, src))
16939 matching_memory = 1;
16940 else
16941 dst = gen_reg_rtx (mode);
16942 }
16943
16944 /* When the source operand is memory, the destination must match. */
16945 if (MEM_P (src) && !matching_memory)
16946 src = force_reg (mode, src);
16947
16948 /* Emit the instruction. */
16949
16950 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16951 if (reload_in_progress || code == NOT)
16952 {
16953 /* Reload doesn't know about the flags register, and doesn't know that
16954 it doesn't want to clobber it. */
16955 gcc_assert (code == NOT);
16956 emit_insn (op);
16957 }
16958 else
16959 {
16960 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16961 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16962 }
16963
16964 /* Fix up the destination if needed. */
16965 if (dst != operands[0])
16966 emit_move_insn (operands[0], dst);
16967 }
16968
16969 /* Split a 32-bit/64-bit divmod with an 8-bit unsigned divmod if the dividend
16970 and divisor are both within the range [0-255]. */
16971
16972 void
16973 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16974 bool signed_p)
16975 {
16976 rtx end_label, qimode_label;
16977 rtx insn, div, mod;
16978 rtx scratch, tmp0, tmp1, tmp2;
16979 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16980 rtx (*gen_zero_extend) (rtx, rtx);
16981 rtx (*gen_test_ccno_1) (rtx, rtx);
16982
16983 switch (mode)
16984 {
16985 case SImode:
16986 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16987 gen_test_ccno_1 = gen_testsi_ccno_1;
16988 gen_zero_extend = gen_zero_extendqisi2;
16989 break;
16990 case DImode:
16991 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16992 gen_test_ccno_1 = gen_testdi_ccno_1;
16993 gen_zero_extend = gen_zero_extendqidi2;
16994 break;
16995 default:
16996 gcc_unreachable ();
16997 }
16998
16999 end_label = gen_label_rtx ();
17000 qimode_label = gen_label_rtx ();
17001
17002 scratch = gen_reg_rtx (mode);
17003
17004 /* Use 8-bit unsigned divmod if the dividend and divisor are within
17005 the range [0-255]. */
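/* IOR the dividend and divisor together; if no bits above bit 7 are set
   in the result, both values fit in [0, 255] and the cheap 8-bit divide
   below can be used. */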
17006 emit_move_insn (scratch, operands[2]);
17007 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17008 scratch, 1, OPTAB_DIRECT);
17009 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17010 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17011 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17012 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17013 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17014 pc_rtx);
17015 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17016 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17017 JUMP_LABEL (insn) = qimode_label;
17018
17019 /* Generate the original signed/unsigned divmod. */
17020 div = gen_divmod4_1 (operands[0], operands[1],
17021 operands[2], operands[3]);
17022 emit_insn (div);
17023
17024 /* Branch to the end. */
17025 emit_jump_insn (gen_jump (end_label));
17026 emit_barrier ();
17027
17028 /* Generate the 8-bit unsigned divide. */
17029 emit_label (qimode_label);
17030 /* Don't use operands[0] for the result of the 8-bit divide since not all
17031 registers support QImode ZERO_EXTRACT. */
17032 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17033 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17034 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17035 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
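/* The HImode-by-QImode divide leaves the quotient in the low byte and the
   remainder in the high byte of TMP0, which are picked apart below. */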
17036
17037 if (signed_p)
17038 {
17039 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17040 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17041 }
17042 else
17043 {
17044 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17045 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17046 }
17047
17048 /* Extract remainder from AH. */
17049 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17050 if (REG_P (operands[1]))
17051 insn = emit_move_insn (operands[1], tmp1);
17052 else
17053 {
17054 /* Need a new scratch register since the old one has result
17055 of 8bit divide. */
17056 scratch = gen_reg_rtx (mode);
17057 emit_move_insn (scratch, tmp1);
17058 insn = emit_move_insn (operands[1], scratch);
17059 }
17060 set_unique_reg_note (insn, REG_EQUAL, mod);
17061
17062 /* Zero extend quotient from AL. */
17063 tmp1 = gen_lowpart (QImode, tmp0);
17064 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17065 set_unique_reg_note (insn, REG_EQUAL, div);
17066
17067 emit_label (end_label);
17068 }
17069
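/* Assumed maximum AGU stall in cycles, and the corresponding search window
   (in half-cycles) used by the distance functions below. */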
17070 #define LEA_MAX_STALL (3)
17071 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17072
17073 /* Increase the given DISTANCE in half-cycles according to
17074 dependencies between the PREV and NEXT instructions.
17075 Add 1 half-cycle if there is no dependency and
17076 go to the next cycle if there is some dependency. */
17077
17078 static unsigned int
17079 increase_distance (rtx prev, rtx next, unsigned int distance)
17080 {
17081 df_ref *use_rec;
17082 df_ref *def_rec;
17083
17084 if (!prev || !next)
17085 return distance + (distance & 1) + 2;
17086
17087 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17088 return distance + 1;
17089
17090 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17091 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17092 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17093 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17094 return distance + (distance & 1) + 2;
17095
17096 return distance + 1;
17097 }
17098
17099 /* Return true if instruction INSN defines register number
17100 REGNO1 or REGNO2. */
17101
17102 static bool
17103 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17104 rtx insn)
17105 {
17106 df_ref *def_rec;
17107
17108 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17109 if (DF_REF_REG_DEF_P (*def_rec)
17110 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17111 && (regno1 == DF_REF_REGNO (*def_rec)
17112 || regno2 == DF_REF_REGNO (*def_rec)))
17113 {
17114 return true;
17115 }
17116
17117 return false;
17118 }
17119
17120 /* Return true if instruction INSN uses register number
17121 REGNO as part of an address expression. */
17122
17123 static bool
17124 insn_uses_reg_mem (unsigned int regno, rtx insn)
17125 {
17126 df_ref *use_rec;
17127
17128 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17129 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17130 return true;
17131
17132 return false;
17133 }
17134
17135 /* Search backward for non-agu definition of register number REGNO1
17136 or register number REGNO2 in basic block starting from instruction
17137 START up to head of basic block or instruction INSN.
17138
17139 Set *FOUND to true if a definition was found and to false
17140 otherwise.
17141
17142 Distance in half-cycles between START and found instruction or head
17143 of BB is added to DISTANCE and returned. */
17144
17145 static int
17146 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17147 rtx insn, int distance,
17148 rtx start, bool *found)
17149 {
17150 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17151 rtx prev = start;
17152 rtx next = NULL;
17153
17154 *found = false;
17155
17156 while (prev
17157 && prev != insn
17158 && distance < LEA_SEARCH_THRESHOLD)
17159 {
17160 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17161 {
17162 distance = increase_distance (prev, next, distance);
17163 if (insn_defines_reg (regno1, regno2, prev))
17164 {
17165 if (recog_memoized (prev) < 0
17166 || get_attr_type (prev) != TYPE_LEA)
17167 {
17168 *found = true;
17169 return distance;
17170 }
17171 }
17172
17173 next = prev;
17174 }
17175 if (prev == BB_HEAD (bb))
17176 break;
17177
17178 prev = PREV_INSN (prev);
17179 }
17180
17181 return distance;
17182 }
17183
17184 /* Search backward for non-agu definition of register number REGNO1
17185 or register number REGNO2 in INSN's basic block until
17186 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17187 2. Reach a neighbouring BB's boundary, or
17188 3. Reach agu definition.
17189 Returns the distance between the non-agu definition point and INSN.
17190 If no definition point, returns -1. */
17191
17192 static int
17193 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17194 rtx insn)
17195 {
17196 basic_block bb = BLOCK_FOR_INSN (insn);
17197 int distance = 0;
17198 bool found = false;
17199
17200 if (insn != BB_HEAD (bb))
17201 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17202 distance, PREV_INSN (insn),
17203 &found);
17204
17205 if (!found && distance < LEA_SEARCH_THRESHOLD)
17206 {
17207 edge e;
17208 edge_iterator ei;
17209 bool simple_loop = false;
17210
17211 FOR_EACH_EDGE (e, ei, bb->preds)
17212 if (e->src == bb)
17213 {
17214 simple_loop = true;
17215 break;
17216 }
17217
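/* A predecessor edge from the block to itself means INSN sits in a
   single-block loop; continue the backward search from the block's own
   end, as if wrapping around the loop. */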
17218 if (simple_loop)
17219 distance = distance_non_agu_define_in_bb (regno1, regno2,
17220 insn, distance,
17221 BB_END (bb), &found);
17222 else
17223 {
17224 int shortest_dist = -1;
17225 bool found_in_bb = false;
17226
17227 FOR_EACH_EDGE (e, ei, bb->preds)
17228 {
17229 int bb_dist
17230 = distance_non_agu_define_in_bb (regno1, regno2,
17231 insn, distance,
17232 BB_END (e->src),
17233 &found_in_bb);
17234 if (found_in_bb)
17235 {
17236 if (shortest_dist < 0)
17237 shortest_dist = bb_dist;
17238 else if (bb_dist > 0)
17239 shortest_dist = MIN (bb_dist, shortest_dist);
17240
17241 found = true;
17242 }
17243 }
17244
17245 distance = shortest_dist;
17246 }
17247 }
17248
17249 /* get_attr_type may modify recog data. We want to make sure
17250 that recog data is valid for instruction INSN, on which
17251 distance_non_agu_define is called. INSN is unchanged here. */
17252 extract_insn_cached (insn);
17253
17254 if (!found)
17255 return -1;
17256
17257 return distance >> 1;
17258 }
17259
17260 /* Return the distance in half-cycles between INSN and the next
17261 insn that uses register number REGNO in a memory address, added
17262 to DISTANCE. Return -1 if REGNO is set.
17263 
17264 Set *FOUND to true if a register usage was found and to false
17265 otherwise.
17266 Set *REDEFINED to true if a register redefinition was found and
17267 to false otherwise. */
17268
17269 static int
17270 distance_agu_use_in_bb (unsigned int regno,
17271 rtx insn, int distance, rtx start,
17272 bool *found, bool *redefined)
17273 {
17274 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17275 rtx next = start;
17276 rtx prev = NULL;
17277
17278 *found = false;
17279 *redefined = false;
17280
17281 while (next
17282 && next != insn
17283 && distance < LEA_SEARCH_THRESHOLD)
17284 {
17285 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17286 {
17287 distance = increase_distance (prev, next, distance);
17288 if (insn_uses_reg_mem (regno, next))
17289 {
17290 /* Return DISTANCE if OP0 is used in a memory
17291 address in NEXT. */
17292 *found = true;
17293 return distance;
17294 }
17295
17296 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17297 {
17298 /* Return -1 if OP0 is set in NEXT. */
17299 *redefined = true;
17300 return -1;
17301 }
17302
17303 prev = next;
17304 }
17305
17306 if (next == BB_END (bb))
17307 break;
17308
17309 next = NEXT_INSN (next);
17310 }
17311
17312 return distance;
17313 }
17314
17315 /* Return the distance between INSN and the next insn that uses
17316 register number REGNO0 in a memory address. Return -1 if no such
17317 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17318
17319 static int
17320 distance_agu_use (unsigned int regno0, rtx insn)
17321 {
17322 basic_block bb = BLOCK_FOR_INSN (insn);
17323 int distance = 0;
17324 bool found = false;
17325 bool redefined = false;
17326
17327 if (insn != BB_END (bb))
17328 distance = distance_agu_use_in_bb (regno0, insn, distance,
17329 NEXT_INSN (insn),
17330 &found, &redefined);
17331
17332 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17333 {
17334 edge e;
17335 edge_iterator ei;
17336 bool simple_loop = false;
17337
17338 FOR_EACH_EDGE (e, ei, bb->succs)
17339 if (e->dest == bb)
17340 {
17341 simple_loop = true;
17342 break;
17343 }
17344
17345 if (simple_loop)
17346 distance = distance_agu_use_in_bb (regno0, insn,
17347 distance, BB_HEAD (bb),
17348 &found, &redefined);
17349 else
17350 {
17351 int shortest_dist = -1;
17352 bool found_in_bb = false;
17353 bool redefined_in_bb = false;
17354
17355 FOR_EACH_EDGE (e, ei, bb->succs)
17356 {
17357 int bb_dist
17358 = distance_agu_use_in_bb (regno0, insn,
17359 distance, BB_HEAD (e->dest),
17360 &found_in_bb, &redefined_in_bb);
17361 if (found_in_bb)
17362 {
17363 if (shortest_dist < 0)
17364 shortest_dist = bb_dist;
17365 else if (bb_dist > 0)
17366 shortest_dist = MIN (bb_dist, shortest_dist);
17367
17368 found = true;
17369 }
17370 }
17371
17372 distance = shortest_dist;
17373 }
17374 }
17375
17376 if (!found || redefined)
17377 return -1;
17378
17379 return distance >> 1;
17380 }
17381
17382 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
17383 there is a dilemma between choosing LEA or ADD.
17384 Negative value: ADD is preferred over LEA
17385 Zero: Neutral
17386 Positive value: LEA is preferred over ADD */
17387 #define IX86_LEA_PRIORITY 0
17388
17389 /* Return true if using the lea INSN has a performance advantage
17390 over a sequence of instructions. The instruction sequence has
17391 SPLIT_COST cycles higher latency than the lea itself. */
17392
17393 static bool
17394 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17395 unsigned int regno2, int split_cost)
17396 {
17397 int dist_define, dist_use;
17398
17399 dist_define = distance_non_agu_define (regno1, regno2, insn);
17400 dist_use = distance_agu_use (regno0, insn);
17401
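/* A negative DIST_DEFINE means no non-AGU definition of the sources was
   found within the search window; a value of LEA_MAX_STALL or more means
   the definition is far enough back that no AGU stall is expected. */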
17402 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17403 {
17404 /* If there is no non-AGU operand definition, no AGU
17405 operand usage, and the split cost is 0, then both the lea
17406 and non-lea variants have the same priority. Currently
17407 we prefer lea for 64-bit code and non-lea for 32-bit
17408 code. */
17409 if (dist_use < 0 && split_cost == 0)
17410 return TARGET_64BIT || IX86_LEA_PRIORITY;
17411 else
17412 return true;
17413 }
17414
17415 /* The longer the definition distance, the more preferable lea becomes.
17416 Adjust it here to take into account the splitting cost and
17417 lea priority. */
17418 dist_define += split_cost + IX86_LEA_PRIORITY;
17419
17420 /* If there is no use in a memory address then we just check
17421 that the split cost exceeds the AGU stall. */
17422 if (dist_use < 0)
17423 return dist_define > LEA_MAX_STALL;
17424
17425 /* If this insn has both a backward non-agu dependence and a forward
17426 agu dependence, the one with the shorter distance takes effect. */
17427 return dist_define >= dist_use;
17428 }
17429
17430 /* Return true if it is legal for INSN to clobber flags and
17431 false otherwise. */
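/* The check scans forward from INSN to the end of its basic block: if
   FLAGS_REG is used before being set again, clobbering it is not safe;
   if it is set first, clobbering is safe; if neither happens, fall back
   to the block's live-out set. */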
17432
17433 static bool
17434 ix86_ok_to_clobber_flags (rtx insn)
17435 {
17436 basic_block bb = BLOCK_FOR_INSN (insn);
17437 df_ref *use;
17438 bitmap live;
17439
17440 while (insn)
17441 {
17442 if (NONDEBUG_INSN_P (insn))
17443 {
17444 for (use = DF_INSN_USES (insn); *use; use++)
17445 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17446 return false;
17447
17448 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17449 return true;
17450 }
17451
17452 if (insn == BB_END (bb))
17453 break;
17454
17455 insn = NEXT_INSN (insn);
17456 }
17457
17458 live = df_get_live_out (bb);
17459 return !REGNO_REG_SET_P (live, FLAGS_REG);
17460 }
17461
17462 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17463 a move and an add to avoid AGU stalls. */
17464
17465 bool
17466 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17467 {
17468 unsigned int regno0, regno1, regno2;
17469
17470 /* Check if we need to optimize. */
17471 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17472 return false;
17473
17474 /* Check it is correct to split here. */
17475 if (!ix86_ok_to_clobber_flags (insn))
17476 return false;
17477
17478 regno0 = true_regnum (operands[0]);
17479 regno1 = true_regnum (operands[1]);
17480 regno2 = true_regnum (operands[2]);
17481
17482 /* We only need to split adds with a non-destructive
17483 destination operand. */
17484 if (regno0 == regno1 || regno0 == regno2)
17485 return false;
17486 else
17487 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17488 }
17489
17490 /* Return true if we should emit an lea instruction instead of a mov
17491 instruction. */
17492
17493 bool
17494 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17495 {
17496 unsigned int regno0, regno1;
17497
17498 /* Check if we need to optimize. */
17499 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17500 return false;
17501
17502 /* Use lea for reg to reg moves only. */
17503 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17504 return false;
17505
17506 regno0 = true_regnum (operands[0]);
17507 regno1 = true_regnum (operands[1]);
17508
17509 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17510 }
17511
17512 /* Return true if we need to split lea into a sequence of
17513 instructions to avoid AGU stalls. */
17514
17515 bool
17516 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17517 {
17518 unsigned int regno0, regno1, regno2;
17519 int split_cost;
17520 struct ix86_address parts;
17521 int ok;
17522
17523 /* Check we need to optimize. */
17524 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17525 return false;
17526
17527 /* Check it is correct to split here. */
17528 if (!ix86_ok_to_clobber_flags (insn))
17529 return false;
17530
17531 ok = ix86_decompose_address (operands[1], &parts);
17532 gcc_assert (ok);
17533
17534 /* There should be at least two components in the address. */
17535 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17536 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17537 return false;
17538
17539 /* We should not split into an add if a non-legitimate PIC
17540 operand is used as the displacement. */
17541 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17542 return false;
17543
17544 regno0 = true_regnum (operands[0]);
17545 regno1 = INVALID_REGNUM;
17546 regno2 = INVALID_REGNUM;
17547
17548 if (parts.base)
17549 regno1 = true_regnum (parts.base);
17550 if (parts.index)
17551 regno2 = true_regnum (parts.index);
17552
17553 split_cost = 0;
17554
17555 /* Compute how many cycles we will add to the execution time
17556 if we split the lea into a sequence of instructions. */
17557 if (parts.base || parts.index)
17558 {
17559 /* Have to use a mov instruction if the non-destructive
17560 destination form is used. */
17561 if (regno1 != regno0 && regno2 != regno0)
17562 split_cost += 1;
17563
17564 /* Have to add index to base if both exist. */
17565 if (parts.base && parts.index)
17566 split_cost += 1;
17567
17568 /* Have to use a shift and adds if the scale is 2 or greater. */
17569 if (parts.scale > 1)
17570 {
17571 if (regno0 != regno1)
17572 split_cost += 1;
17573 else if (regno2 == regno0)
17574 split_cost += 4;
17575 else
17576 split_cost += parts.scale;
17577 }
17578
17579 /* Have to use an add instruction with an immediate if
17580 disp is non-zero. */
17581 if (parts.disp && parts.disp != const0_rtx)
17582 split_cost += 1;
17583
17584 /* Subtract the price of lea. */
17585 split_cost -= 1;
17586 }
17587
17588 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17589 }
17590
17591 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
17592 matches the destination. The RTX includes a clobber of FLAGS_REG. */
17593
17594 static void
17595 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17596 rtx dst, rtx src)
17597 {
17598 rtx op, clob;
17599
17600 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17601 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17602
17603 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17604 }
17605
17606 /* Return true if the definition of REGNO1 is nearest to INSN. */
17607
17608 static bool
17609 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17610 {
17611 rtx prev = insn;
17612 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17613
17614 if (insn == start)
17615 return false;
17616 while (prev && prev != start)
17617 {
17618 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17619 {
17620 prev = PREV_INSN (prev);
17621 continue;
17622 }
17623 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17624 return true;
17625 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17626 return false;
17627 prev = PREV_INSN (prev);
17628 }
17629
17630 /* None of the regs is defined in the bb. */
17631 return false;
17632 }
17633
17634 /* Split lea instructions into a sequence of instructions
17635 which are executed on the ALU to avoid AGU stalls.
17636 It is assumed that it is allowed to clobber the flags register
17637 at the lea position. */
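/* For example (illustrative only; the exact sequence depends on register
   overlap), an lea such as

       leal 4(%ebx,%ecx,2), %eax

   would be rewritten roughly as

       movl %ecx, %eax
       sall $1, %eax
       addl %ebx, %eax
       addl $4, %eax  */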
17638
17639 void
17640 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17641 {
17642 unsigned int regno0, regno1, regno2;
17643 struct ix86_address parts;
17644 rtx target, tmp;
17645 int ok, adds;
17646
17647 ok = ix86_decompose_address (operands[1], &parts);
17648 gcc_assert (ok);
17649
17650 target = gen_lowpart (mode, operands[0]);
17651
17652 regno0 = true_regnum (target);
17653 regno1 = INVALID_REGNUM;
17654 regno2 = INVALID_REGNUM;
17655
17656 if (parts.base)
17657 {
17658 parts.base = gen_lowpart (mode, parts.base);
17659 regno1 = true_regnum (parts.base);
17660 }
17661
17662 if (parts.index)
17663 {
17664 parts.index = gen_lowpart (mode, parts.index);
17665 regno2 = true_regnum (parts.index);
17666 }
17667
17668 if (parts.disp)
17669 parts.disp = gen_lowpart (mode, parts.disp);
17670
17671 if (parts.scale > 1)
17672 {
17673 /* Case r1 = r1 + ... */
17674 if (regno1 == regno0)
17675 {
17676 /* If we have a case r1 = r1 + C * r1 then we
17677 would have to use multiplication, which is very
17678 expensive. Assume the cost model is wrong if we
17679 see such a case here. */
17680 gcc_assert (regno2 != regno0);
17681
17682 for (adds = parts.scale; adds > 0; adds--)
17683 ix86_emit_binop (PLUS, mode, target, parts.index);
17684 }
17685 else
17686 {
17687 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17688 if (regno0 != regno2)
17689 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17690
17691 /* Use shift for scaling. */
17692 ix86_emit_binop (ASHIFT, mode, target,
17693 GEN_INT (exact_log2 (parts.scale)));
17694
17695 if (parts.base)
17696 ix86_emit_binop (PLUS, mode, target, parts.base);
17697
17698 if (parts.disp && parts.disp != const0_rtx)
17699 ix86_emit_binop (PLUS, mode, target, parts.disp);
17700 }
17701 }
17702 else if (!parts.base && !parts.index)
17703 {
17704 gcc_assert (parts.disp);
17705 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17706 }
17707 else
17708 {
17709 if (!parts.base)
17710 {
17711 if (regno0 != regno2)
17712 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17713 }
17714 else if (!parts.index)
17715 {
17716 if (regno0 != regno1)
17717 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17718 }
17719 else
17720 {
17721 if (regno0 == regno1)
17722 tmp = parts.index;
17723 else if (regno0 == regno2)
17724 tmp = parts.base;
17725 else
17726 {
17727 rtx tmp1;
17728
17729 /* Find the better operand for the SET instruction, depending
17730 on which definition is farther from the insn. */
17731 if (find_nearest_reg_def (insn, regno1, regno2))
17732 tmp = parts.index, tmp1 = parts.base;
17733 else
17734 tmp = parts.base, tmp1 = parts.index;
17735
17736 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17737
17738 if (parts.disp && parts.disp != const0_rtx)
17739 ix86_emit_binop (PLUS, mode, target, parts.disp);
17740
17741 ix86_emit_binop (PLUS, mode, target, tmp1);
17742 return;
17743 }
17744
17745 ix86_emit_binop (PLUS, mode, target, tmp);
17746 }
17747
17748 if (parts.disp && parts.disp != const0_rtx)
17749 ix86_emit_binop (PLUS, mode, target, parts.disp);
17750 }
17751 }
17752
17753 /* Return true if it is ok to optimize an ADD operation to an LEA
17754 operation to avoid flag register consumption. For most processors,
17755 ADD is faster than LEA. For processors like Atom, if the
17756 destination register of the LEA holds an actual address which will be
17757 used soon, LEA is better; otherwise ADD is better. */
17758
17759 bool
17760 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17761 {
17762 unsigned int regno0 = true_regnum (operands[0]);
17763 unsigned int regno1 = true_regnum (operands[1]);
17764 unsigned int regno2 = true_regnum (operands[2]);
17765
17766 /* If a = b + c with a != b and a != c, we must use the lea form. */
17767 if (regno0 != regno1 && regno0 != regno2)
17768 return true;
17769
17770 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17771 return false;
17772
17773 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17774 }
17775
17776 /* Return true if destination reg of SET_BODY is shift count of
17777 USE_BODY. */
17778
17779 static bool
17780 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17781 {
17782 rtx set_dest;
17783 rtx shift_rtx;
17784 int i;
17785
17786 /* Retrieve destination of SET_BODY. */
17787 switch (GET_CODE (set_body))
17788 {
17789 case SET:
17790 set_dest = SET_DEST (set_body);
17791 if (!set_dest || !REG_P (set_dest))
17792 return false;
17793 break;
17794 case PARALLEL:
17795 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17796 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17797 use_body))
17798 return true;
17799 default:
17800 return false;
17801 break;
17802 }
17803
17804 /* Retrieve shift count of USE_BODY. */
17805 switch (GET_CODE (use_body))
17806 {
17807 case SET:
17808 shift_rtx = XEXP (use_body, 1);
17809 break;
17810 case PARALLEL:
17811 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17812 if (ix86_dep_by_shift_count_body (set_body,
17813 XVECEXP (use_body, 0, i)))
17814 return true;
17815 default:
17816 return false;
17817 break;
17818 }
17819
17820 if (shift_rtx
17821 && (GET_CODE (shift_rtx) == ASHIFT
17822 || GET_CODE (shift_rtx) == LSHIFTRT
17823 || GET_CODE (shift_rtx) == ASHIFTRT
17824 || GET_CODE (shift_rtx) == ROTATE
17825 || GET_CODE (shift_rtx) == ROTATERT))
17826 {
17827 rtx shift_count = XEXP (shift_rtx, 1);
17828
17829 /* Return true if shift count is dest of SET_BODY. */
17830 if (REG_P (shift_count))
17831 {
17832 /* Add a check since this can be invoked before register
17833 allocation by the pre-reload scheduler. */
17834 if (reload_completed
17835 && true_regnum (set_dest) == true_regnum (shift_count))
17836 return true;
17837 else if (REGNO (set_dest) == REGNO (shift_count))
17838 return true;
17839 }
17840 }
17841
17842 return false;
17843 }
17844
17845 /* Return true if destination reg of SET_INSN is shift count of
17846 USE_INSN. */
17847
17848 bool
17849 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17850 {
17851 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17852 PATTERN (use_insn));
17853 }
17854
17855 /* Return TRUE or FALSE depending on whether the unary operator meets the
17856 appropriate constraints. */
17857
17858 bool
17859 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17860 enum machine_mode mode ATTRIBUTE_UNUSED,
17861 rtx operands[2] ATTRIBUTE_UNUSED)
17862 {
17863 /* If one of the operands is memory, the source and destination must match. */
17864 if ((MEM_P (operands[0])
17865 || MEM_P (operands[1]))
17866 && ! rtx_equal_p (operands[0], operands[1]))
17867 return false;
17868 return true;
17869 }
17870
17871 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17872 are ok, keeping in mind the possible movddup alternative. */
17873
17874 bool
17875 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17876 {
17877 if (MEM_P (operands[0]))
17878 return rtx_equal_p (operands[0], operands[1 + high]);
17879 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17880 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17881 return true;
17882 }
17883
17884 /* Post-reload splitter for converting an SF or DFmode value in an
17885 SSE register into an unsigned SImode. */
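/* The sequence below implements: if VALUE >= 2^31, subtract 2^31 before
   the signed conversion and then flip bit 31 of the result (via the
   final XOR with a lane-wise 0x80000000 mask); values below 2^31 are
   converted directly. */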
17886
17887 void
17888 ix86_split_convert_uns_si_sse (rtx operands[])
17889 {
17890 enum machine_mode vecmode;
17891 rtx value, large, zero_or_two31, input, two31, x;
17892
17893 large = operands[1];
17894 zero_or_two31 = operands[2];
17895 input = operands[3];
17896 two31 = operands[4];
17897 vecmode = GET_MODE (large);
17898 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17899
17900 /* Load up the value into the low element. We must ensure that the other
17901 elements are valid floats -- zero is the easiest such value. */
17902 if (MEM_P (input))
17903 {
17904 if (vecmode == V4SFmode)
17905 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17906 else
17907 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17908 }
17909 else
17910 {
17911 input = gen_rtx_REG (vecmode, REGNO (input));
17912 emit_move_insn (value, CONST0_RTX (vecmode));
17913 if (vecmode == V4SFmode)
17914 emit_insn (gen_sse_movss (value, value, input));
17915 else
17916 emit_insn (gen_sse2_movsd (value, value, input));
17917 }
17918
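/* Compute a mask of the elements that are >= 2^31, subtract 2^31 from
exactly those elements so the signed conversion below is exact, and
finally xor the sign bit back into the integer result. */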
17919 emit_move_insn (large, two31);
17920 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17921
17922 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17923 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17924
17925 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17926 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17927
17928 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17929 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17930
17931 large = gen_rtx_REG (V4SImode, REGNO (large));
17932 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17933
17934 x = gen_rtx_REG (V4SImode, REGNO (value));
17935 if (vecmode == V4SFmode)
17936 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17937 else
17938 emit_insn (gen_sse2_cvttpd2dq (x, value));
17939 value = x;
17940
17941 emit_insn (gen_xorv4si3 (value, value, large));
17942 }
17943
17944 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17945 Expects the 64-bit DImode to be supplied in a pair of integral
17946 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17947 -mfpmath=sse, !optimize_size only. */
17948
17949 void
17950 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17951 {
17952 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17953 rtx int_xmm, fp_xmm;
17954 rtx biases, exponents;
17955 rtx x;
17956
17957 int_xmm = gen_reg_rtx (V4SImode);
17958 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
17959 emit_insn (gen_movdi_to_sse (int_xmm, input));
17960 else if (TARGET_SSE_SPLIT_REGS)
17961 {
17962 emit_clobber (int_xmm);
17963 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17964 }
17965 else
17966 {
17967 x = gen_reg_rtx (V2DImode);
17968 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17969 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17970 }
17971
17972 x = gen_rtx_CONST_VECTOR (V4SImode,
17973 gen_rtvec (4, GEN_INT (0x43300000UL),
17974 GEN_INT (0x45300000UL),
17975 const0_rtx, const0_rtx));
17976 exponents = validize_mem (force_const_mem (V4SImode, x));
17977
17978 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17979 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17980
17981 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
17982 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17983 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17984 (0x1.0p84 + double(fp_value_hi_xmm)).
17985 Note these exponents differ by 32. */
17986
17987 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17988
17989 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17990 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17991 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17992 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17993 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17994 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17995 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17996 biases = validize_mem (force_const_mem (V2DFmode, biases));
17997 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17998
17999 /* Add the upper and lower DFmode values together. */
18000 if (TARGET_SSE3)
18001 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18002 else
18003 {
18004 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18005 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18006 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18007 }
18008
18009 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18010 }
18011
18012 /* Not used, but eases macroization of patterns. */
18013 void
18014 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18015 rtx input ATTRIBUTE_UNUSED)
18016 {
18017 gcc_unreachable ();
18018 }
18019
18020 /* Convert an unsigned SImode value into a DFmode. Only currently used
18021 for SSE, but applicable anywhere. */
18022
18023 void
18024 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18025 {
18026 REAL_VALUE_TYPE TWO31r;
18027 rtx x, fp;
18028
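/* Flip the sign bit (i.e. subtract 2^31 modulo 2^32) so the value fits
in signed SImode, convert that to DFmode, then add 2^31.0 back to
recover the original unsigned value. */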
18029 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18030 NULL, 1, OPTAB_DIRECT);
18031
18032 fp = gen_reg_rtx (DFmode);
18033 emit_insn (gen_floatsidf2 (fp, x));
18034
18035 real_ldexp (&TWO31r, &dconst1, 31);
18036 x = const_double_from_real_value (TWO31r, DFmode);
18037
18038 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18039 if (x != target)
18040 emit_move_insn (target, x);
18041 }
18042
18043 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18044 32-bit mode; otherwise we have a direct convert instruction. */
18045
18046 void
18047 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18048 {
18049 REAL_VALUE_TYPE TWO32r;
18050 rtx fp_lo, fp_hi, x;
18051
18052 fp_lo = gen_reg_rtx (DFmode);
18053 fp_hi = gen_reg_rtx (DFmode);
18054
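/* Compute (double) (signed) high part * 2^32 + (double) (unsigned) low part. */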
18055 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18056
18057 real_ldexp (&TWO32r, &dconst1, 32);
18058 x = const_double_from_real_value (TWO32r, DFmode);
18059 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18060
18061 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18062
18063 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18064 0, OPTAB_DIRECT);
18065 if (x != target)
18066 emit_move_insn (target, x);
18067 }
18068
18069 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18070 For x86_32, -mfpmath=sse, !optimize_size only. */
18071 void
18072 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18073 {
18074 REAL_VALUE_TYPE ONE16r;
18075 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18076
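/* Split the input into 16-bit halves; each half converts to SFmode
exactly, so the result is (float) hi * 2^16 + (float) lo. */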
18077 real_ldexp (&ONE16r, &dconst1, 16);
18078 x = const_double_from_real_value (ONE16r, SFmode);
18079 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18080 NULL, 0, OPTAB_DIRECT);
18081 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18082 NULL, 0, OPTAB_DIRECT);
18083 fp_hi = gen_reg_rtx (SFmode);
18084 fp_lo = gen_reg_rtx (SFmode);
18085 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18086 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18087 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18088 0, OPTAB_DIRECT);
18089 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18090 0, OPTAB_DIRECT);
18091 if (!rtx_equal_p (target, fp_hi))
18092 emit_move_insn (target, fp_hi);
18093 }
18094
18095 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18096 a vector of unsigned ints VAL to vector of floats TARGET. */
18097
18098 void
18099 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18100 {
18101 rtx tmp[8];
18102 REAL_VALUE_TYPE TWO16r;
18103 enum machine_mode intmode = GET_MODE (val);
18104 enum machine_mode fltmode = GET_MODE (target);
18105 rtx (*cvt) (rtx, rtx);
18106
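/* As in the scalar case, split each element into 16-bit halves and
compute (float) hi * 2^16 + (float) lo using exact conversions. */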
18107 if (intmode == V4SImode)
18108 cvt = gen_floatv4siv4sf2;
18109 else
18110 cvt = gen_floatv8siv8sf2;
18111 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18112 tmp[0] = force_reg (intmode, tmp[0]);
18113 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18114 OPTAB_DIRECT);
18115 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18116 NULL_RTX, 1, OPTAB_DIRECT);
18117 tmp[3] = gen_reg_rtx (fltmode);
18118 emit_insn (cvt (tmp[3], tmp[1]));
18119 tmp[4] = gen_reg_rtx (fltmode);
18120 emit_insn (cvt (tmp[4], tmp[2]));
18121 real_ldexp (&TWO16r, &dconst1, 16);
18122 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18123 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18124 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18125 OPTAB_DIRECT);
18126 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18127 OPTAB_DIRECT);
18128 if (tmp[7] != target)
18129 emit_move_insn (target, tmp[7]);
18130 }
18131
18132 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18133 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18134 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18135 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18136
18137 rtx
18138 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18139 {
18140 REAL_VALUE_TYPE TWO31r;
18141 rtx two31r, tmp[4];
18142 enum machine_mode mode = GET_MODE (val);
18143 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18144 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18145 rtx (*cmp) (rtx, rtx, rtx, rtx);
18146 int i;
18147
18148 for (i = 0; i < 3; i++)
18149 tmp[i] = gen_reg_rtx (mode);
18150 real_ldexp (&TWO31r, &dconst1, 31);
18151 two31r = const_double_from_real_value (TWO31r, scalarmode);
18152 two31r = ix86_build_const_vector (mode, 1, two31r);
18153 two31r = force_reg (mode, two31r);
18154 switch (mode)
18155 {
18156 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18157 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18158 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18159 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18160 default: gcc_unreachable ();
18161 }
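/* tmp[0] = per-element mask, all ones where VAL >= 2^31. */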
18162 tmp[3] = gen_rtx_LE (mode, two31r, val);
18163 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18164 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18165 0, OPTAB_DIRECT);
18166 if (intmode == V4SImode || TARGET_AVX2)
18167 *xorp = expand_simple_binop (intmode, ASHIFT,
18168 gen_lowpart (intmode, tmp[0]),
18169 GEN_INT (31), NULL_RTX, 0,
18170 OPTAB_DIRECT);
18171 else
18172 {
18173 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18174 two31 = ix86_build_const_vector (intmode, 1, two31);
18175 *xorp = expand_simple_binop (intmode, AND,
18176 gen_lowpart (intmode, tmp[0]),
18177 two31, NULL_RTX, 0,
18178 OPTAB_DIRECT);
18179 }
18180 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18181 0, OPTAB_DIRECT);
18182 }
18183
18184 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18185 then replicate the value for all elements of the vector
18186 register. */
18187
18188 rtx
18189 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18190 {
18191 int i, n_elt;
18192 rtvec v;
18193 enum machine_mode scalar_mode;
18194
18195 switch (mode)
18196 {
18197 case V32QImode:
18198 case V16QImode:
18199 case V16HImode:
18200 case V8HImode:
18201 case V8SImode:
18202 case V4SImode:
18203 case V4DImode:
18204 case V2DImode:
18205 gcc_assert (vect);
18206 case V8SFmode:
18207 case V4SFmode:
18208 case V4DFmode:
18209 case V2DFmode:
18210 n_elt = GET_MODE_NUNITS (mode);
18211 v = rtvec_alloc (n_elt);
18212 scalar_mode = GET_MODE_INNER (mode);
18213
18214 RTVEC_ELT (v, 0) = value;
18215
18216 for (i = 1; i < n_elt; ++i)
18217 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18218
18219 return gen_rtx_CONST_VECTOR (mode, v);
18220
18221 default:
18222 gcc_unreachable ();
18223 }
18224 }
18225
18226 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18227 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18228 for an SSE register. If VECT is true, then replicate the mask for
18229 all elements of the vector register. If INVERT is true, then create
18230 a mask excluding the sign bit. */
18231
18232 rtx
18233 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18234 {
18235 enum machine_mode vec_mode, imode;
18236 HOST_WIDE_INT hi, lo;
18237 int shift = 63;
18238 rtx v;
18239 rtx mask;
18240
18241 /* Find the sign bit, sign extended to 2*HWI. */
18242 switch (mode)
18243 {
18244 case V8SImode:
18245 case V4SImode:
18246 case V8SFmode:
18247 case V4SFmode:
18248 vec_mode = mode;
18249 mode = GET_MODE_INNER (mode);
18250 imode = SImode;
18251 lo = 0x80000000, hi = lo < 0;
18252 break;
18253
18254 case V4DImode:
18255 case V2DImode:
18256 case V4DFmode:
18257 case V2DFmode:
18258 vec_mode = mode;
18259 mode = GET_MODE_INNER (mode);
18260 imode = DImode;
18261 if (HOST_BITS_PER_WIDE_INT >= 64)
18262 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18263 else
18264 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18265 break;
18266
18267 case TImode:
18268 case TFmode:
18269 vec_mode = VOIDmode;
18270 if (HOST_BITS_PER_WIDE_INT >= 64)
18271 {
18272 imode = TImode;
18273 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18274 }
18275 else
18276 {
18277 rtvec vec;
18278
18279 imode = DImode;
18280 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18281
18282 if (invert)
18283 {
18284 lo = ~lo, hi = ~hi;
18285 v = constm1_rtx;
18286 }
18287 else
18288 v = const0_rtx;
18289
18290 mask = immed_double_const (lo, hi, imode);
18291
18292 vec = gen_rtvec (2, v, mask);
18293 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18294 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18295
18296 return v;
18297 }
18298 break;
18299
18300 default:
18301 gcc_unreachable ();
18302 }
18303
18304 if (invert)
18305 lo = ~lo, hi = ~hi;
18306
18307 /* Force this value into the low part of a fp vector constant. */
18308 mask = immed_double_const (lo, hi, imode);
18309 mask = gen_lowpart (mode, mask);
18310
18311 if (vec_mode == VOIDmode)
18312 return force_reg (mode, mask);
18313
18314 v = ix86_build_const_vector (vec_mode, vect, mask);
18315 return force_reg (vec_mode, v);
18316 }
18317
18318 /* Generate code for floating point ABS or NEG. */
18319
18320 void
18321 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18322 rtx operands[])
18323 {
18324 rtx mask, set, dst, src;
18325 bool use_sse = false;
18326 bool vector_mode = VECTOR_MODE_P (mode);
18327 enum machine_mode vmode = mode;
18328
18329 if (vector_mode)
18330 use_sse = true;
18331 else if (mode == TFmode)
18332 use_sse = true;
18333 else if (TARGET_SSE_MATH)
18334 {
18335 use_sse = SSE_FLOAT_MODE_P (mode);
18336 if (mode == SFmode)
18337 vmode = V4SFmode;
18338 else if (mode == DFmode)
18339 vmode = V2DFmode;
18340 }
18341
18342 /* NEG and ABS performed with SSE use bitwise mask operations.
18343 Create the appropriate mask now. */
18344 if (use_sse)
18345 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18346 else
18347 mask = NULL_RTX;
18348
18349 dst = operands[0];
18350 src = operands[1];
18351
18352 set = gen_rtx_fmt_e (code, mode, src);
18353 set = gen_rtx_SET (VOIDmode, dst, set);
18354
18355 if (mask)
18356 {
18357 rtx use, clob;
18358 rtvec par;
18359
18360 use = gen_rtx_USE (VOIDmode, mask);
18361 if (vector_mode)
18362 par = gen_rtvec (2, set, use);
18363 else
18364 {
18365 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18366 par = gen_rtvec (3, set, use, clob);
18367 }
18368 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18369 }
18370 else
18371 emit_insn (set);
18372 }
18373
18374 /* Expand a copysign operation. Special case operand 0 being a constant. */
18375
18376 void
18377 ix86_expand_copysign (rtx operands[])
18378 {
18379 enum machine_mode mode, vmode;
18380 rtx dest, op0, op1, mask, nmask;
18381
18382 dest = operands[0];
18383 op0 = operands[1];
18384 op1 = operands[2];
18385
18386 mode = GET_MODE (dest);
18387
18388 if (mode == SFmode)
18389 vmode = V4SFmode;
18390 else if (mode == DFmode)
18391 vmode = V2DFmode;
18392 else
18393 vmode = mode;
18394
18395 if (GET_CODE (op0) == CONST_DOUBLE)
18396 {
18397 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18398
18399 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18400 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18401
18402 if (mode == SFmode || mode == DFmode)
18403 {
18404 if (op0 == CONST0_RTX (mode))
18405 op0 = CONST0_RTX (vmode);
18406 else
18407 {
18408 rtx v = ix86_build_const_vector (vmode, false, op0);
18409
18410 op0 = force_reg (vmode, v);
18411 }
18412 }
18413 else if (op0 != CONST0_RTX (mode))
18414 op0 = force_reg (mode, op0);
18415
18416 mask = ix86_build_signbit_mask (vmode, 0, 0);
18417
18418 if (mode == SFmode)
18419 copysign_insn = gen_copysignsf3_const;
18420 else if (mode == DFmode)
18421 copysign_insn = gen_copysigndf3_const;
18422 else
18423 copysign_insn = gen_copysigntf3_const;
18424
18425 emit_insn (copysign_insn (dest, op0, op1, mask));
18426 }
18427 else
18428 {
18429 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18430
18431 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18432 mask = ix86_build_signbit_mask (vmode, 0, 0);
18433
18434 if (mode == SFmode)
18435 copysign_insn = gen_copysignsf3_var;
18436 else if (mode == DFmode)
18437 copysign_insn = gen_copysigndf3_var;
18438 else
18439 copysign_insn = gen_copysigntf3_var;
18440
18441 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18442 }
18443 }
18444
18445 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18446 be a constant, and so has already been expanded into a vector constant. */
18447
18448 void
18449 ix86_split_copysign_const (rtx operands[])
18450 {
18451 enum machine_mode mode, vmode;
18452 rtx dest, op0, mask, x;
18453
18454 dest = operands[0];
18455 op0 = operands[1];
18456 mask = operands[3];
18457
18458 mode = GET_MODE (dest);
18459 vmode = GET_MODE (mask);
18460
18461 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18462 x = gen_rtx_AND (vmode, dest, mask);
18463 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18464
18465 if (op0 != CONST0_RTX (vmode))
18466 {
18467 x = gen_rtx_IOR (vmode, dest, op0);
18468 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18469 }
18470 }
18471
18472 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18473 so we have to do two masks. */
18474
18475 void
18476 ix86_split_copysign_var (rtx operands[])
18477 {
18478 enum machine_mode mode, vmode;
18479 rtx dest, scratch, op0, op1, mask, nmask, x;
18480
18481 dest = operands[0];
18482 scratch = operands[1];
18483 op0 = operands[2];
18484 op1 = operands[3];
18485 nmask = operands[4];
18486 mask = operands[5];
18487
18488 mode = GET_MODE (dest);
18489 vmode = GET_MODE (mask);
18490
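/* The result is (op0 & nmask) | (op1 & mask), i.e. the magnitude of op0
combined with the sign bit of op1; the alternatives below differ only
in which input shares a register with DEST or SCRATCH. */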
18491 if (rtx_equal_p (op0, op1))
18492 {
18493 /* Shouldn't happen often (it's useless, obviously), but when it does
18494 we'd generate incorrect code if we continue below. */
18495 emit_move_insn (dest, op0);
18496 return;
18497 }
18498
18499 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18500 {
18501 gcc_assert (REGNO (op1) == REGNO (scratch));
18502
18503 x = gen_rtx_AND (vmode, scratch, mask);
18504 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18505
18506 dest = mask;
18507 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18508 x = gen_rtx_NOT (vmode, dest);
18509 x = gen_rtx_AND (vmode, x, op0);
18510 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18511 }
18512 else
18513 {
18514 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18515 {
18516 x = gen_rtx_AND (vmode, scratch, mask);
18517 }
18518 else /* alternative 2,4 */
18519 {
18520 gcc_assert (REGNO (mask) == REGNO (scratch));
18521 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18522 x = gen_rtx_AND (vmode, scratch, op1);
18523 }
18524 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18525
18526 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18527 {
18528 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18529 x = gen_rtx_AND (vmode, dest, nmask);
18530 }
18531 else /* alternative 3,4 */
18532 {
18533 gcc_assert (REGNO (nmask) == REGNO (dest));
18534 dest = nmask;
18535 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18536 x = gen_rtx_AND (vmode, dest, op0);
18537 }
18538 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18539 }
18540
18541 x = gen_rtx_IOR (vmode, dest, scratch);
18542 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18543 }
18544
18545 /* Return TRUE or FALSE depending on whether the first SET in INSN
18546 has source and destination with matching CC modes, and whether the
18547 CC mode is at least as constrained as REQ_MODE. */
18548
18549 bool
18550 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18551 {
18552 rtx set;
18553 enum machine_mode set_mode;
18554
18555 set = PATTERN (insn);
18556 if (GET_CODE (set) == PARALLEL)
18557 set = XVECEXP (set, 0, 0);
18558 gcc_assert (GET_CODE (set) == SET);
18559 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18560
18561 set_mode = GET_MODE (SET_DEST (set));
18562 switch (set_mode)
18563 {
18564 case CCNOmode:
18565 if (req_mode != CCNOmode
18566 && (req_mode != CCmode
18567 || XEXP (SET_SRC (set), 1) != const0_rtx))
18568 return false;
18569 break;
18570 case CCmode:
18571 if (req_mode == CCGCmode)
18572 return false;
18573 /* FALLTHRU */
18574 case CCGCmode:
18575 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18576 return false;
18577 /* FALLTHRU */
18578 case CCGOCmode:
18579 if (req_mode == CCZmode)
18580 return false;
18581 /* FALLTHRU */
18582 case CCZmode:
18583 break;
18584
18585 case CCAmode:
18586 case CCCmode:
18587 case CCOmode:
18588 case CCSmode:
18589 if (set_mode != req_mode)
18590 return false;
18591 break;
18592
18593 default:
18594 gcc_unreachable ();
18595 }
18596
18597 return GET_MODE (SET_SRC (set)) == set_mode;
18598 }
18599
18600 /* Generate insn patterns to do an integer compare of OPERANDS. */
18601
18602 static rtx
18603 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18604 {
18605 enum machine_mode cmpmode;
18606 rtx tmp, flags;
18607
18608 cmpmode = SELECT_CC_MODE (code, op0, op1);
18609 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18610
18611 /* This is very simple, but making the interface the same as in the
18612 FP case makes the rest of the code easier. */
18613 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18614 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18615
18616 /* Return the test that should be put into the flags user, i.e.
18617 the bcc, scc, or cmov instruction. */
18618 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18619 }
18620
18621 /* Figure out whether to use ordered or unordered fp comparisons.
18622 Return the appropriate mode to use. */
18623
18624 enum machine_mode
18625 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18626 {
18627 /* ??? In order to make all comparisons reversible, we do all comparisons
18628 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18629 all forms of trapping and nontrapping comparisons, we can make inequality
18630 comparisons trapping again, since it results in better code when using
18631 FCOM based compares. */
18632 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18633 }
18634
18635 enum machine_mode
18636 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18637 {
18638 enum machine_mode mode = GET_MODE (op0);
18639
18640 if (SCALAR_FLOAT_MODE_P (mode))
18641 {
18642 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18643 return ix86_fp_compare_mode (code);
18644 }
18645
18646 switch (code)
18647 {
18648 /* Only zero flag is needed. */
18649 case EQ: /* ZF=0 */
18650 case NE: /* ZF!=0 */
18651 return CCZmode;
18652 /* Codes needing carry flag. */
18653 case GEU: /* CF=0 */
18654 case LTU: /* CF=1 */
18655 /* Detect overflow checks. They need just the carry flag. */
18656 if (GET_CODE (op0) == PLUS
18657 && rtx_equal_p (op1, XEXP (op0, 0)))
18658 return CCCmode;
18659 else
18660 return CCmode;
18661 case GTU: /* CF=0 & ZF=0 */
18662 case LEU: /* CF=1 | ZF=1 */
18663 /* Detect overflow checks. They need just the carry flag. */
18664 if (GET_CODE (op0) == MINUS
18665 && rtx_equal_p (op1, XEXP (op0, 0)))
18666 return CCCmode;
18667 else
18668 return CCmode;
18669 /* Codes possibly doable only with sign flag when
18670 comparing against zero. */
18671 case GE: /* SF=OF or SF=0 */
18672 case LT: /* SF<>OF or SF=1 */
18673 if (op1 == const0_rtx)
18674 return CCGOCmode;
18675 else
18676 /* For other cases Carry flag is not required. */
18677 return CCGCmode;
18678 /* Codes doable only with the sign flag when comparing
18679 against zero, but we lack a jump instruction for it,
18680 so we need to use relational tests against the overflow
18681 flag, which thus needs to be zero. */
18682 case GT: /* ZF=0 & SF=OF */
18683 case LE: /* ZF=1 | SF<>OF */
18684 if (op1 == const0_rtx)
18685 return CCNOmode;
18686 else
18687 return CCGCmode;
18688 /* The strcmp pattern does (use flags) and combine may ask us for the
18689 proper mode. */
18690 case USE:
18691 return CCmode;
18692 default:
18693 gcc_unreachable ();
18694 }
18695 }
18696
18697 /* Return the fixed registers used for condition codes. */
18698
18699 static bool
18700 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18701 {
18702 *p1 = FLAGS_REG;
18703 *p2 = FPSR_REG;
18704 return true;
18705 }
18706
18707 /* If two condition code modes are compatible, return a condition code
18708 mode which is compatible with both. Otherwise, return
18709 VOIDmode. */
18710
18711 static enum machine_mode
18712 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18713 {
18714 if (m1 == m2)
18715 return m1;
18716
18717 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18718 return VOIDmode;
18719
18720 if ((m1 == CCGCmode && m2 == CCGOCmode)
18721 || (m1 == CCGOCmode && m2 == CCGCmode))
18722 return CCGCmode;
18723
18724 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18725 return m2;
18726 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18727 return m1;
18728
18729 switch (m1)
18730 {
18731 default:
18732 gcc_unreachable ();
18733
18734 case CCmode:
18735 case CCGCmode:
18736 case CCGOCmode:
18737 case CCNOmode:
18738 case CCAmode:
18739 case CCCmode:
18740 case CCOmode:
18741 case CCSmode:
18742 case CCZmode:
18743 switch (m2)
18744 {
18745 default:
18746 return VOIDmode;
18747
18748 case CCmode:
18749 case CCGCmode:
18750 case CCGOCmode:
18751 case CCNOmode:
18752 case CCAmode:
18753 case CCCmode:
18754 case CCOmode:
18755 case CCSmode:
18756 case CCZmode:
18757 return CCmode;
18758 }
18759
18760 case CCFPmode:
18761 case CCFPUmode:
18762 /* These are only compatible with themselves, which we already
18763 checked above. */
18764 return VOIDmode;
18765 }
18766 }
18767
18768
18769 /* Return a comparison we can do that is equivalent to
18770 swap_condition (code), apart possibly from orderedness.
18771 But never change orderedness if TARGET_IEEE_FP, returning
18772 UNKNOWN in that case if necessary. */
18773
18774 static enum rtx_code
18775 ix86_fp_swap_condition (enum rtx_code code)
18776 {
18777 switch (code)
18778 {
18779 case GT: /* GTU - CF=0 & ZF=0 */
18780 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18781 case GE: /* GEU - CF=0 */
18782 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18783 case UNLT: /* LTU - CF=1 */
18784 return TARGET_IEEE_FP ? UNKNOWN : GT;
18785 case UNLE: /* LEU - CF=1 | ZF=1 */
18786 return TARGET_IEEE_FP ? UNKNOWN : GE;
18787 default:
18788 return swap_condition (code);
18789 }
18790 }
18791
18792 /* Return the cost of comparison CODE using the best strategy for performance.
18793 All following functions use the number of instructions as a cost metric.
18794 In the future this should be tweaked to compute bytes for optimize_size and
18795 take into account the performance of various instructions on various CPUs. */
18796
18797 static int
18798 ix86_fp_comparison_cost (enum rtx_code code)
18799 {
18800 int arith_cost;
18801
18802 /* The cost of code using bit-twiddling on %ah. */
18803 switch (code)
18804 {
18805 case UNLE:
18806 case UNLT:
18807 case LTGT:
18808 case GT:
18809 case GE:
18810 case UNORDERED:
18811 case ORDERED:
18812 case UNEQ:
18813 arith_cost = 4;
18814 break;
18815 case LT:
18816 case NE:
18817 case EQ:
18818 case UNGE:
18819 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18820 break;
18821 case LE:
18822 case UNGT:
18823 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18824 break;
18825 default:
18826 gcc_unreachable ();
18827 }
18828
18829 switch (ix86_fp_comparison_strategy (code))
18830 {
18831 case IX86_FPCMP_COMI:
18832 return arith_cost > 4 ? 3 : 2;
18833 case IX86_FPCMP_SAHF:
18834 return arith_cost > 4 ? 4 : 3;
18835 default:
18836 return arith_cost;
18837 }
18838 }
18839
18840 /* Return the strategy to use for floating-point comparisons. We assume that
18841 fcomi is always preferable where available, since that is also true when
18842 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18843
18844 enum ix86_fpcmp_strategy
18845 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18846 {
18847 /* Do fcomi/sahf based test when profitable. */
18848
18849 if (TARGET_CMOVE)
18850 return IX86_FPCMP_COMI;
18851
18852 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
18853 return IX86_FPCMP_SAHF;
18854
18855 return IX86_FPCMP_ARITH;
18856 }
18857
18858 /* Swap, force into registers, or otherwise massage the two operands
18859 to a fp comparison. The operands are updated in place; the new
18860 comparison code is returned. */
18861
18862 static enum rtx_code
18863 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18864 {
18865 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18866 rtx op0 = *pop0, op1 = *pop1;
18867 enum machine_mode op_mode = GET_MODE (op0);
18868 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18869
18870 /* All of the unordered compare instructions only work on registers.
18871 The same is true of the fcomi compare instructions. The XFmode
18872 compare instructions require registers except when comparing
18873 against zero or when converting operand 1 from fixed point to
18874 floating point. */
18875
18876 if (!is_sse
18877 && (fpcmp_mode == CCFPUmode
18878 || (op_mode == XFmode
18879 && ! (standard_80387_constant_p (op0) == 1
18880 || standard_80387_constant_p (op1) == 1)
18881 && GET_CODE (op1) != FLOAT)
18882 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18883 {
18884 op0 = force_reg (op_mode, op0);
18885 op1 = force_reg (op_mode, op1);
18886 }
18887 else
18888 {
18889 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18890 things around if they appear profitable, otherwise force op0
18891 into a register. */
18892
18893 if (standard_80387_constant_p (op0) == 0
18894 || (MEM_P (op0)
18895 && ! (standard_80387_constant_p (op1) == 0
18896 || MEM_P (op1))))
18897 {
18898 enum rtx_code new_code = ix86_fp_swap_condition (code);
18899 if (new_code != UNKNOWN)
18900 {
18901 rtx tmp;
18902 tmp = op0, op0 = op1, op1 = tmp;
18903 code = new_code;
18904 }
18905 }
18906
18907 if (!REG_P (op0))
18908 op0 = force_reg (op_mode, op0);
18909
18910 if (CONSTANT_P (op1))
18911 {
18912 int tmp = standard_80387_constant_p (op1);
18913 if (tmp == 0)
18914 op1 = validize_mem (force_const_mem (op_mode, op1));
18915 else if (tmp == 1)
18916 {
18917 if (TARGET_CMOVE)
18918 op1 = force_reg (op_mode, op1);
18919 }
18920 else
18921 op1 = force_reg (op_mode, op1);
18922 }
18923 }
18924
18925 /* Try to rearrange the comparison to make it cheaper. */
18926 if (ix86_fp_comparison_cost (code)
18927 > ix86_fp_comparison_cost (swap_condition (code))
18928 && (REG_P (op1) || can_create_pseudo_p ()))
18929 {
18930 rtx tmp;
18931 tmp = op0, op0 = op1, op1 = tmp;
18932 code = swap_condition (code);
18933 if (!REG_P (op0))
18934 op0 = force_reg (op_mode, op0);
18935 }
18936
18937 *pop0 = op0;
18938 *pop1 = op1;
18939 return code;
18940 }
18941
18942 /* Convert comparison codes we use to represent FP comparison to integer
18943 code that will result in proper branch. Return UNKNOWN if no such code
18944 is available. */
18945
18946 enum rtx_code
18947 ix86_fp_compare_code_to_integer (enum rtx_code code)
18948 {
18949 switch (code)
18950 {
18951 case GT:
18952 return GTU;
18953 case GE:
18954 return GEU;
18955 case ORDERED:
18956 case UNORDERED:
18957 return code;
18958 break;
18959 case UNEQ:
18960 return EQ;
18961 break;
18962 case UNLT:
18963 return LTU;
18964 break;
18965 case UNLE:
18966 return LEU;
18967 break;
18968 case LTGT:
18969 return NE;
18970 break;
18971 default:
18972 return UNKNOWN;
18973 }
18974 }
18975
18976 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18977
18978 static rtx
18979 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18980 {
18981 enum machine_mode fpcmp_mode, intcmp_mode;
18982 rtx tmp, tmp2;
18983
18984 fpcmp_mode = ix86_fp_compare_mode (code);
18985 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18986
18987 /* Do fcomi/sahf based test when profitable. */
18988 switch (ix86_fp_comparison_strategy (code))
18989 {
18990 case IX86_FPCMP_COMI:
18991 intcmp_mode = fpcmp_mode;
18992 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18993 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18994 tmp);
18995 emit_insn (tmp);
18996 break;
18997
18998 case IX86_FPCMP_SAHF:
18999 intcmp_mode = fpcmp_mode;
19000 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19001 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19002 tmp);
19003
19004 if (!scratch)
19005 scratch = gen_reg_rtx (HImode);
19006 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19007 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19008 break;
19009
19010 case IX86_FPCMP_ARITH:
19011 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19012 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19013 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19014 if (!scratch)
19015 scratch = gen_reg_rtx (HImode);
19016 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19017
19018 /* In the unordered case, we have to check C2 for NaNs, which
19019 doesn't happen to work out to anything nice combination-wise.
19020 So do some bit twiddling on the value we've got in AH to come
19021 up with an appropriate set of condition codes. */
19022
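/* After fnstsw, AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3
in bit 6 (0x40); 0x45 tests all three at once. The x87 compare sets
C3,C2,C0 to 0,0,0 / 0,0,1 / 1,0,0 for >, <, == and 1,1,1 when unordered. */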
19023 intcmp_mode = CCNOmode;
19024 switch (code)
19025 {
19026 case GT:
19027 case UNGT:
19028 if (code == GT || !TARGET_IEEE_FP)
19029 {
19030 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19031 code = EQ;
19032 }
19033 else
19034 {
19035 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19036 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19037 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19038 intcmp_mode = CCmode;
19039 code = GEU;
19040 }
19041 break;
19042 case LT:
19043 case UNLT:
19044 if (code == LT && TARGET_IEEE_FP)
19045 {
19046 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19047 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19048 intcmp_mode = CCmode;
19049 code = EQ;
19050 }
19051 else
19052 {
19053 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19054 code = NE;
19055 }
19056 break;
19057 case GE:
19058 case UNGE:
19059 if (code == GE || !TARGET_IEEE_FP)
19060 {
19061 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19062 code = EQ;
19063 }
19064 else
19065 {
19066 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19067 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19068 code = NE;
19069 }
19070 break;
19071 case LE:
19072 case UNLE:
19073 if (code == LE && TARGET_IEEE_FP)
19074 {
19075 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19076 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19077 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19078 intcmp_mode = CCmode;
19079 code = LTU;
19080 }
19081 else
19082 {
19083 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19084 code = NE;
19085 }
19086 break;
19087 case EQ:
19088 case UNEQ:
19089 if (code == EQ && TARGET_IEEE_FP)
19090 {
19091 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19092 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19093 intcmp_mode = CCmode;
19094 code = EQ;
19095 }
19096 else
19097 {
19098 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19099 code = NE;
19100 }
19101 break;
19102 case NE:
19103 case LTGT:
19104 if (code == NE && TARGET_IEEE_FP)
19105 {
19106 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19107 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19108 GEN_INT (0x40)));
19109 code = NE;
19110 }
19111 else
19112 {
19113 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19114 code = EQ;
19115 }
19116 break;
19117
19118 case UNORDERED:
19119 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19120 code = NE;
19121 break;
19122 case ORDERED:
19123 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19124 code = EQ;
19125 break;
19126
19127 default:
19128 gcc_unreachable ();
19129 }
19130 break;
19131
19132 default:
19133 gcc_unreachable();
19134 }
19135
19136 /* Return the test that should be put into the flags user, i.e.
19137 the bcc, scc, or cmov instruction. */
19138 return gen_rtx_fmt_ee (code, VOIDmode,
19139 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19140 const0_rtx);
19141 }
19142
19143 static rtx
19144 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19145 {
19146 rtx ret;
19147
19148 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19149 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19150
19151 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19152 {
19153 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19154 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19155 }
19156 else
19157 ret = ix86_expand_int_compare (code, op0, op1);
19158
19159 return ret;
19160 }
19161
19162 void
19163 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19164 {
19165 enum machine_mode mode = GET_MODE (op0);
19166 rtx tmp;
19167
19168 switch (mode)
19169 {
19170 case SFmode:
19171 case DFmode:
19172 case XFmode:
19173 case QImode:
19174 case HImode:
19175 case SImode:
19176 simple:
19177 tmp = ix86_expand_compare (code, op0, op1);
19178 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19179 gen_rtx_LABEL_REF (VOIDmode, label),
19180 pc_rtx);
19181 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19182 return;
19183
19184 case DImode:
19185 if (TARGET_64BIT)
19186 goto simple;
19187 case TImode:
19188 /* Expand DImode branch into multiple compare+branch. */
19189 {
19190 rtx lo[2], hi[2], label2;
19191 enum rtx_code code1, code2, code3;
19192 enum machine_mode submode;
19193
19194 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19195 {
19196 tmp = op0, op0 = op1, op1 = tmp;
19197 code = swap_condition (code);
19198 }
19199
19200 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19201 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19202
19203 submode = mode == DImode ? SImode : DImode;
19204
19205 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19206 avoid two branches. This costs one extra insn, so disable when
19207 optimizing for size. */
19208
19209 if ((code == EQ || code == NE)
19210 && (!optimize_insn_for_size_p ()
19211 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19212 {
19213 rtx xor0, xor1;
19214
19215 xor1 = hi[0];
19216 if (hi[1] != const0_rtx)
19217 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19218 NULL_RTX, 0, OPTAB_WIDEN);
19219
19220 xor0 = lo[0];
19221 if (lo[1] != const0_rtx)
19222 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19223 NULL_RTX, 0, OPTAB_WIDEN);
19224
19225 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19226 NULL_RTX, 0, OPTAB_WIDEN);
19227
19228 ix86_expand_branch (code, tmp, const0_rtx, label);
19229 return;
19230 }
19231
19232 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19233 op1 is a constant and the low word is zero, then we can just
19234 examine the high word. Similarly for low word -1 and
19235 less-or-equal-than or greater-than. */
19236
19237 if (CONST_INT_P (hi[1]))
19238 switch (code)
19239 {
19240 case LT: case LTU: case GE: case GEU:
19241 if (lo[1] == const0_rtx)
19242 {
19243 ix86_expand_branch (code, hi[0], hi[1], label);
19244 return;
19245 }
19246 break;
19247 case LE: case LEU: case GT: case GTU:
19248 if (lo[1] == constm1_rtx)
19249 {
19250 ix86_expand_branch (code, hi[0], hi[1], label);
19251 return;
19252 }
19253 break;
19254 default:
19255 break;
19256 }
19257
19258 /* Otherwise, we need two or three jumps. */
19259
19260 label2 = gen_label_rtx ();
19261
19262 code1 = code;
19263 code2 = swap_condition (code);
19264 code3 = unsigned_condition (code);
19265
19266 switch (code)
19267 {
19268 case LT: case GT: case LTU: case GTU:
19269 break;
19270
19271 case LE: code1 = LT; code2 = GT; break;
19272 case GE: code1 = GT; code2 = LT; break;
19273 case LEU: code1 = LTU; code2 = GTU; break;
19274 case GEU: code1 = GTU; code2 = LTU; break;
19275
19276 case EQ: code1 = UNKNOWN; code2 = NE; break;
19277 case NE: code2 = UNKNOWN; break;
19278
19279 default:
19280 gcc_unreachable ();
19281 }
19282
19283 /*
19284 * a < b =>
19285 * if (hi(a) < hi(b)) goto true;
19286 * if (hi(a) > hi(b)) goto false;
19287 * if (lo(a) < lo(b)) goto true;
19288 * false:
19289 */
19290
19291 if (code1 != UNKNOWN)
19292 ix86_expand_branch (code1, hi[0], hi[1], label);
19293 if (code2 != UNKNOWN)
19294 ix86_expand_branch (code2, hi[0], hi[1], label2);
19295
19296 ix86_expand_branch (code3, lo[0], lo[1], label);
19297
19298 if (code2 != UNKNOWN)
19299 emit_label (label2);
19300 return;
19301 }
19302
19303 default:
19304 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19305 goto simple;
19306 }
19307 }
19308
19309 /* Split branch based on floating point condition. */
19310 void
19311 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19312 rtx target1, rtx target2, rtx tmp, rtx pushed)
19313 {
19314 rtx condition;
19315 rtx i;
19316
19317 if (target2 != pc_rtx)
19318 {
19319 rtx tmp = target2;
19320 code = reverse_condition_maybe_unordered (code);
19321 target2 = target1;
19322 target1 = tmp;
19323 }
19324
19325 condition = ix86_expand_fp_compare (code, op1, op2,
19326 tmp);
19327
19328 /* Remove pushed operand from stack. */
19329 if (pushed)
19330 ix86_free_from_memory (GET_MODE (pushed));
19331
19332 i = emit_jump_insn (gen_rtx_SET
19333 (VOIDmode, pc_rtx,
19334 gen_rtx_IF_THEN_ELSE (VOIDmode,
19335 condition, target1, target2)));
19336 if (split_branch_probability >= 0)
19337 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19338 }
19339
19340 void
19341 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19342 {
19343 rtx ret;
19344
19345 gcc_assert (GET_MODE (dest) == QImode);
19346
19347 ret = ix86_expand_compare (code, op0, op1);
19348 PUT_MODE (ret, QImode);
19349 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19350 }
19351
19352 /* Expand comparison setting or clearing carry flag. Return true when
19353 successful and set pop for the operation. */
19354 static bool
19355 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19356 {
19357 enum machine_mode mode =
19358 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19359
19360 /* Do not handle double-mode compares that go through special path. */
19361 if (mode == (TARGET_64BIT ? TImode : DImode))
19362 return false;
19363
19364 if (SCALAR_FLOAT_MODE_P (mode))
19365 {
19366 rtx compare_op, compare_seq;
19367
19368 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19369
19370 /* Shortcut: following common codes never translate
19371 into carry flag compares. */
19372 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19373 || code == ORDERED || code == UNORDERED)
19374 return false;
19375
19376 /* These comparisons require zero flag; swap operands so they won't. */
19377 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19378 && !TARGET_IEEE_FP)
19379 {
19380 rtx tmp = op0;
19381 op0 = op1;
19382 op1 = tmp;
19383 code = swap_condition (code);
19384 }
19385
19386 /* Try to expand the comparison and verify that we end up with a
19387 carry flag based comparison. This fails only when we decide
19388 to expand the comparison using arithmetic, which is not a
19389 common scenario. */
19390 start_sequence ();
19391 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19392 compare_seq = get_insns ();
19393 end_sequence ();
19394
19395 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19396 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19397 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19398 else
19399 code = GET_CODE (compare_op);
19400
19401 if (code != LTU && code != GEU)
19402 return false;
19403
19404 emit_insn (compare_seq);
19405 *pop = compare_op;
19406 return true;
19407 }
19408
19409 if (!INTEGRAL_MODE_P (mode))
19410 return false;
19411
19412 switch (code)
19413 {
19414 case LTU:
19415 case GEU:
19416 break;
19417
19418 /* Convert a==0 into (unsigned)a<1. */
19419 case EQ:
19420 case NE:
19421 if (op1 != const0_rtx)
19422 return false;
19423 op1 = const1_rtx;
19424 code = (code == EQ ? LTU : GEU);
19425 break;
19426
19427 /* Convert a>b into b<a or a>=b+1. */
19428 case GTU:
19429 case LEU:
19430 if (CONST_INT_P (op1))
19431 {
19432 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19433 /* Bail out on overflow. We can still swap the operands, but that
19434 would force loading of the constant into a register. */
19435 if (op1 == const0_rtx
19436 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19437 return false;
19438 code = (code == GTU ? GEU : LTU);
19439 }
19440 else
19441 {
19442 rtx tmp = op1;
19443 op1 = op0;
19444 op0 = tmp;
19445 code = (code == GTU ? LTU : GEU);
19446 }
19447 break;
19448
19449 /* Convert a>=0 into (unsigned)a<0x80000000. */
19450 case LT:
19451 case GE:
19452 if (mode == DImode || op1 != const0_rtx)
19453 return false;
19454 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19455 code = (code == LT ? GEU : LTU);
19456 break;
19457 case LE:
19458 case GT:
19459 if (mode == DImode || op1 != constm1_rtx)
19460 return false;
19461 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19462 code = (code == LE ? GEU : LTU);
19463 break;
19464
19465 default:
19466 return false;
19467 }
19468 /* Swapping operands may cause a constant to appear as the first operand. */
19469 if (!nonimmediate_operand (op0, VOIDmode))
19470 {
19471 if (!can_create_pseudo_p ())
19472 return false;
19473 op0 = force_reg (mode, op0);
19474 }
19475 *pop = ix86_expand_compare (code, op0, op1);
19476 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19477 return true;
19478 }
19479
19480 bool
19481 ix86_expand_int_movcc (rtx operands[])
19482 {
19483 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19484 rtx compare_seq, compare_op;
19485 enum machine_mode mode = GET_MODE (operands[0]);
19486 bool sign_bit_compare_p = false;
19487 rtx op0 = XEXP (operands[1], 0);
19488 rtx op1 = XEXP (operands[1], 1);
19489
19490 if (GET_MODE (op0) == TImode
19491 || (GET_MODE (op0) == DImode
19492 && !TARGET_64BIT))
19493 return false;
19494
19495 start_sequence ();
19496 compare_op = ix86_expand_compare (code, op0, op1);
19497 compare_seq = get_insns ();
19498 end_sequence ();
19499
19500 compare_code = GET_CODE (compare_op);
19501
19502 if ((op1 == const0_rtx && (code == GE || code == LT))
19503 || (op1 == constm1_rtx && (code == GT || code == LE)))
19504 sign_bit_compare_p = true;
19505
19506 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19507 HImode insns, we'd be swallowed in word prefix ops. */
19508
19509 if ((mode != HImode || TARGET_FAST_PREFIX)
19510 && (mode != (TARGET_64BIT ? TImode : DImode))
19511 && CONST_INT_P (operands[2])
19512 && CONST_INT_P (operands[3]))
19513 {
19514 rtx out = operands[0];
19515 HOST_WIDE_INT ct = INTVAL (operands[2]);
19516 HOST_WIDE_INT cf = INTVAL (operands[3]);
19517 HOST_WIDE_INT diff;
19518
19519 diff = ct - cf;
19520 /* Sign bit compares are better done using shifts than by using
19521 sbb. */
19522 if (sign_bit_compare_p
19523 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19524 {
19525 /* Detect overlap between destination and compare sources. */
19526 rtx tmp = out;
19527
19528 if (!sign_bit_compare_p)
19529 {
19530 rtx flags;
19531 bool fpcmp = false;
19532
19533 compare_code = GET_CODE (compare_op);
19534
19535 flags = XEXP (compare_op, 0);
19536
19537 if (GET_MODE (flags) == CCFPmode
19538 || GET_MODE (flags) == CCFPUmode)
19539 {
19540 fpcmp = true;
19541 compare_code
19542 = ix86_fp_compare_code_to_integer (compare_code);
19543 }
19544
19545 /* To simplify the rest of the code, restrict to the GEU case. */
19546 if (compare_code == LTU)
19547 {
19548 HOST_WIDE_INT tmp = ct;
19549 ct = cf;
19550 cf = tmp;
19551 compare_code = reverse_condition (compare_code);
19552 code = reverse_condition (code);
19553 }
19554 else
19555 {
19556 if (fpcmp)
19557 PUT_CODE (compare_op,
19558 reverse_condition_maybe_unordered
19559 (GET_CODE (compare_op)));
19560 else
19561 PUT_CODE (compare_op,
19562 reverse_condition (GET_CODE (compare_op)));
19563 }
19564 diff = ct - cf;
19565
19566 if (reg_overlap_mentioned_p (out, op0)
19567 || reg_overlap_mentioned_p (out, op1))
19568 tmp = gen_reg_rtx (mode);
19569
19570 if (mode == DImode)
19571 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19572 else
19573 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19574 flags, compare_op));
19575 }
19576 else
19577 {
19578 if (code == GT || code == GE)
19579 code = reverse_condition (code);
19580 else
19581 {
19582 HOST_WIDE_INT tmp = ct;
19583 ct = cf;
19584 cf = tmp;
19585 diff = ct - cf;
19586 }
19587 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19588 }
19589
19590 if (diff == 1)
19591 {
19592 /*
19593 * cmpl op0,op1
19594 * sbbl dest,dest
19595 * [addl dest, ct]
19596 *
19597 * Size 5 - 8.
19598 */
19599 if (ct)
19600 tmp = expand_simple_binop (mode, PLUS,
19601 tmp, GEN_INT (ct),
19602 copy_rtx (tmp), 1, OPTAB_DIRECT);
19603 }
19604 else if (cf == -1)
19605 {
19606 /*
19607 * cmpl op0,op1
19608 * sbbl dest,dest
19609 * orl $ct, dest
19610 *
19611 * Size 8.
19612 */
19613 tmp = expand_simple_binop (mode, IOR,
19614 tmp, GEN_INT (ct),
19615 copy_rtx (tmp), 1, OPTAB_DIRECT);
19616 }
19617 else if (diff == -1 && ct)
19618 {
19619 /*
19620 * cmpl op0,op1
19621 * sbbl dest,dest
19622 * notl dest
19623 * [addl dest, cf]
19624 *
19625 * Size 8 - 11.
19626 */
19627 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19628 if (cf)
19629 tmp = expand_simple_binop (mode, PLUS,
19630 copy_rtx (tmp), GEN_INT (cf),
19631 copy_rtx (tmp), 1, OPTAB_DIRECT);
19632 }
19633 else
19634 {
19635 /*
19636 * cmpl op0,op1
19637 * sbbl dest,dest
19638 * [notl dest]
19639 * andl cf - ct, dest
19640 * [addl dest, ct]
19641 *
19642 * Size 8 - 11.
19643 */
19644
19645 if (cf == 0)
19646 {
19647 cf = ct;
19648 ct = 0;
19649 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19650 }
19651
19652 tmp = expand_simple_binop (mode, AND,
19653 copy_rtx (tmp),
19654 gen_int_mode (cf - ct, mode),
19655 copy_rtx (tmp), 1, OPTAB_DIRECT);
19656 if (ct)
19657 tmp = expand_simple_binop (mode, PLUS,
19658 copy_rtx (tmp), GEN_INT (ct),
19659 copy_rtx (tmp), 1, OPTAB_DIRECT);
19660 }
19661
19662 if (!rtx_equal_p (tmp, out))
19663 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19664
19665 return true;
19666 }
19667
19668 if (diff < 0)
19669 {
19670 enum machine_mode cmp_mode = GET_MODE (op0);
19671
19672 HOST_WIDE_INT tmp;
19673 tmp = ct, ct = cf, cf = tmp;
19674 diff = -diff;
19675
19676 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19677 {
19678 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19679
19680 /* We may be reversing an unordered compare to a normal compare, which
19681 is not valid in general (we may convert a non-trapping condition
19682 to a trapping one); however, on i386 we currently emit all
19683 comparisons unordered. */
19684 compare_code = reverse_condition_maybe_unordered (compare_code);
19685 code = reverse_condition_maybe_unordered (code);
19686 }
19687 else
19688 {
19689 compare_code = reverse_condition (compare_code);
19690 code = reverse_condition (code);
19691 }
19692 }
19693
19694 compare_code = UNKNOWN;
19695 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19696 && CONST_INT_P (op1))
19697 {
19698 if (op1 == const0_rtx
19699 && (code == LT || code == GE))
19700 compare_code = code;
19701 else if (op1 == constm1_rtx)
19702 {
19703 if (code == LE)
19704 compare_code = LT;
19705 else if (code == GT)
19706 compare_code = GE;
19707 }
19708 }
19709
19710 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19711 if (compare_code != UNKNOWN
19712 && GET_MODE (op0) == GET_MODE (out)
19713 && (cf == -1 || ct == -1))
19714 {
19715 /* If lea code below could be used, only optimize
19716 if it results in a 2 insn sequence. */
19717
19718 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19719 || diff == 3 || diff == 5 || diff == 9)
19720 || (compare_code == LT && ct == -1)
19721 || (compare_code == GE && cf == -1))
19722 {
19723 /*
19724 * notl op1 (if necessary)
19725 * sarl $31, op1
19726 * orl cf, op1
19727 */
19728 if (ct != -1)
19729 {
19730 cf = ct;
19731 ct = -1;
19732 code = reverse_condition (code);
19733 }
19734
19735 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19736
19737 out = expand_simple_binop (mode, IOR,
19738 out, GEN_INT (cf),
19739 out, 1, OPTAB_DIRECT);
19740 if (out != operands[0])
19741 emit_move_insn (operands[0], out);
19742
19743 return true;
19744 }
19745 }
19746
19747
19748 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19749 || diff == 3 || diff == 5 || diff == 9)
19750 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19751 && (mode != DImode
19752 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19753 {
19754 /*
19755 * xorl dest,dest
19756 * cmpl op1,op2
19757 * setcc dest
19758 * lea cf(dest*(ct-cf)),dest
19759 *
19760 * Size 14.
19761 *
19762 * This also catches the degenerate setcc-only case.
19763 */
19764
19765 rtx tmp;
19766 int nops;
19767
19768 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19769
19770 nops = 0;
19771 /* On x86_64 the lea instruction operates on Pmode, so we need
19772 to get the arithmetic done in the proper mode to match. */
19773 if (diff == 1)
19774 tmp = copy_rtx (out);
19775 else
19776 {
19777 rtx out1;
19778 out1 = copy_rtx (out);
19779 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19780 nops++;
19781 if (diff & 1)
19782 {
19783 tmp = gen_rtx_PLUS (mode, tmp, out1);
19784 nops++;
19785 }
19786 }
19787 if (cf != 0)
19788 {
19789 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19790 nops++;
19791 }
19792 if (!rtx_equal_p (tmp, out))
19793 {
19794 if (nops == 1)
19795 out = force_operand (tmp, copy_rtx (out));
19796 else
19797 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19798 }
19799 if (!rtx_equal_p (out, operands[0]))
19800 emit_move_insn (operands[0], copy_rtx (out));
19801
19802 return true;
19803 }
19804
19805 /*
19806 * General case: Jumpful:
19807 * xorl dest,dest cmpl op1, op2
19808 * cmpl op1, op2 movl ct, dest
19809 * setcc dest jcc 1f
19810 * decl dest movl cf, dest
19811 * andl (cf-ct),dest 1:
19812 * addl ct,dest
19813 *
19814 * Size 20. Size 14.
19815 *
19816 * This is reasonably steep, but branch mispredict costs are
19817 * high on modern cpus, so consider failing only if optimizing
19818 * for space.
19819 */
19820
19821 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19822 && BRANCH_COST (optimize_insn_for_speed_p (),
19823 false) >= 2)
19824 {
19825 if (cf == 0)
19826 {
19827 enum machine_mode cmp_mode = GET_MODE (op0);
19828
19829 cf = ct;
19830 ct = 0;
19831
19832 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19833 {
19834 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19835
19836 /* We may be reversing an unordered compare to a normal compare,
19837 which is not valid in general (we may convert a non-trapping
19838 condition to a trapping one); however, on i386 we currently
19839 emit all comparisons unordered. */
19840 code = reverse_condition_maybe_unordered (code);
19841 }
19842 else
19843 {
19844 code = reverse_condition (code);
19845 if (compare_code != UNKNOWN)
19846 compare_code = reverse_condition (compare_code);
19847 }
19848 }
19849
19850 if (compare_code != UNKNOWN)
19851 {
19852 /* notl op1 (if needed)
19853 sarl $31, op1
19854 andl (cf-ct), op1
19855 addl ct, op1
19856
19857 For x < 0 (resp. x <= -1) there will be no notl,
19858 so if possible swap the constants to get rid of the
19859 complement.
19860 True/false will be -1/0 while code below (store flag
19861 followed by decrement) is 0/-1, so the constants need
19862 to be exchanged once more. */
19863
19864 if (compare_code == GE || !cf)
19865 {
19866 code = reverse_condition (code);
19867 compare_code = LT;
19868 }
19869 else
19870 {
19871 HOST_WIDE_INT tmp = cf;
19872 cf = ct;
19873 ct = tmp;
19874 }
19875
19876 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19877 }
19878 else
19879 {
19880 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19881
19882 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19883 constm1_rtx,
19884 copy_rtx (out), 1, OPTAB_DIRECT);
19885 }
19886
19887 out = expand_simple_binop (mode, AND, copy_rtx (out),
19888 gen_int_mode (cf - ct, mode),
19889 copy_rtx (out), 1, OPTAB_DIRECT);
19890 if (ct)
19891 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19892 copy_rtx (out), 1, OPTAB_DIRECT);
19893 if (!rtx_equal_p (out, operands[0]))
19894 emit_move_insn (operands[0], copy_rtx (out));
19895
19896 return true;
19897 }
19898 }
19899
19900 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19901 {
19902 /* Try a few things more with specific constants and a variable. */
19903
19904 optab op;
19905 rtx var, orig_out, out, tmp;
19906
19907 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19908 return false;
19909
19910 /* If one of the two operands is an interesting constant, load a
19911 constant with the above and mask it in with a logical operation. */
19912
19913 if (CONST_INT_P (operands[2]))
19914 {
19915 var = operands[3];
19916 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19917 operands[3] = constm1_rtx, op = and_optab;
19918 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19919 operands[3] = const0_rtx, op = ior_optab;
19920 else
19921 return false;
19922 }
19923 else if (CONST_INT_P (operands[3]))
19924 {
19925 var = operands[2];
19926 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19927 operands[2] = constm1_rtx, op = and_optab;
19928 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19929 operands[2] = const0_rtx, op = ior_optab;
19930 else
19931 return false;
19932 }
19933 else
19934 return false;
19935
19936 orig_out = operands[0];
19937 tmp = gen_reg_rtx (mode);
19938 operands[0] = tmp;
19939
19940 /* Recurse to get the constant loaded. */
19941 if (ix86_expand_int_movcc (operands) == 0)
19942 return false;
19943
19944 /* Mask in the interesting variable. */
19945 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19946 OPTAB_WIDEN);
19947 if (!rtx_equal_p (out, orig_out))
19948 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19949
19950 return true;
19951 }
19952
19953 /*
19954 * For comparison with above,
19955 *
19956 * movl cf,dest
19957 * movl ct,tmp
19958 * cmpl op1,op2
19959 * cmovcc tmp,dest
19960 *
19961 * Size 15.
19962 */
19963
19964 if (! nonimmediate_operand (operands[2], mode))
19965 operands[2] = force_reg (mode, operands[2]);
19966 if (! nonimmediate_operand (operands[3], mode))
19967 operands[3] = force_reg (mode, operands[3]);
19968
19969 if (! register_operand (operands[2], VOIDmode)
19970 && (mode == QImode
19971 || ! register_operand (operands[3], VOIDmode)))
19972 operands[2] = force_reg (mode, operands[2]);
19973
19974 if (mode == QImode
19975 && ! register_operand (operands[3], VOIDmode))
19976 operands[3] = force_reg (mode, operands[3]);
19977
19978 emit_insn (compare_seq);
19979 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19980 gen_rtx_IF_THEN_ELSE (mode,
19981 compare_op, operands[2],
19982 operands[3])));
19983 return true;
19984 }
19985
19986 /* Swap, force into registers, or otherwise massage the two operands
19987 to an sse comparison with a mask result. Thus we differ a bit from
19988 ix86_prepare_fp_compare_args which expects to produce a flags result.
19989
19990 The DEST operand exists to help determine whether to commute commutative
19991 operators. The POP0/POP1 operands are updated in place. The new
19992 comparison code is returned, or UNKNOWN if not implementable. */
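/* For instance, before AVX there is no direct CMPPS/CMPPD predicate
 for GT, so (a > b) is handled by swapping the operands and using LT,
 i.e. (b < a). This is only an illustrative reading of the
 swap_condition path below. */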
19993
19994 static enum rtx_code
19995 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19996 rtx *pop0, rtx *pop1)
19997 {
19998 rtx tmp;
19999
20000 switch (code)
20001 {
20002 case LTGT:
20003 case UNEQ:
20004 /* AVX supports all the needed comparisons. */
20005 if (TARGET_AVX)
20006 break;
20007 /* We have no LTGT as an operator. We could implement it with
20008 NE & ORDERED, but this requires an extra temporary. It's
20009 not clear that it's worth it. */
20010 return UNKNOWN;
20011
20012 case LT:
20013 case LE:
20014 case UNGT:
20015 case UNGE:
20016 /* These are supported directly. */
20017 break;
20018
20019 case EQ:
20020 case NE:
20021 case UNORDERED:
20022 case ORDERED:
20023 /* AVX has 3 operand comparisons, no need to swap anything. */
20024 if (TARGET_AVX)
20025 break;
20026 /* For commutative operators, try to canonicalize the destination
20027 operand to be first in the comparison - this helps reload to
20028 avoid extra moves. */
20029 if (!dest || !rtx_equal_p (dest, *pop1))
20030 break;
20031 /* FALLTHRU */
20032
20033 case GE:
20034 case GT:
20035 case UNLE:
20036 case UNLT:
20037 /* These are not supported directly before AVX, and furthermore
20038 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20039 comparison operands to transform into something that is
20040 supported. */
20041 tmp = *pop0;
20042 *pop0 = *pop1;
20043 *pop1 = tmp;
20044 code = swap_condition (code);
20045 break;
20046
20047 default:
20048 gcc_unreachable ();
20049 }
20050
20051 return code;
20052 }
20053
20054 /* Detect conditional moves that exactly match min/max operational
20055 semantics. Note that this is IEEE safe, as long as we don't
20056 interchange the operands.
20057
20058 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20059 and TRUE if the operation is successful and instructions are emitted. */
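/* Illustrative example: for dest = (a < b) ? a : b the comparison code
 is LT with if_true == cmp_op0 and if_false == cmp_op1, so this is
 recognized as a MIN. With both -ffinite-math-only and
 -funsafe-math-optimizations in effect it becomes a plain SMIN
 (e.g. minss/minps); otherwise the UNSPEC_IEEE_MIN form below is used
 to preserve the IEEE operand-order semantics of the instruction. */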
20060
20061 static bool
20062 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20063 rtx cmp_op1, rtx if_true, rtx if_false)
20064 {
20065 enum machine_mode mode;
20066 bool is_min;
20067 rtx tmp;
20068
20069 if (code == LT)
20070 ;
20071 else if (code == UNGE)
20072 {
20073 tmp = if_true;
20074 if_true = if_false;
20075 if_false = tmp;
20076 }
20077 else
20078 return false;
20079
20080 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20081 is_min = true;
20082 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20083 is_min = false;
20084 else
20085 return false;
20086
20087 mode = GET_MODE (dest);
20088
20089 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20090 but MODE may be a vector mode and thus not appropriate. */
20091 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20092 {
20093 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20094 rtvec v;
20095
20096 if_true = force_reg (mode, if_true);
20097 v = gen_rtvec (2, if_true, if_false);
20098 tmp = gen_rtx_UNSPEC (mode, v, u);
20099 }
20100 else
20101 {
20102 code = is_min ? SMIN : SMAX;
20103 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20104 }
20105
20106 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20107 return true;
20108 }
20109
20110 /* Expand an SSE vector comparison. Return the register with the result. */
20111
20112 static rtx
20113 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20114 rtx op_true, rtx op_false)
20115 {
20116 enum machine_mode mode = GET_MODE (dest);
20117 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20118 rtx x;
20119
20120 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20121 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20122 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20123
20124 if (optimize
20125 || reg_overlap_mentioned_p (dest, op_true)
20126 || reg_overlap_mentioned_p (dest, op_false))
20127 dest = gen_reg_rtx (mode);
20128
20129 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20130 if (cmp_mode != mode)
20131 {
20132 x = force_reg (cmp_mode, x);
20133 convert_move (dest, x, false);
20134 }
20135 else
20136 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20137
20138 return dest;
20139 }
20140
20141 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20142 operations. This is used for both scalar and vector conditional moves. */
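/* In the general fallback at the end of this function the select is
 computed as the classic mask merge, roughly
 dest = (cmp & op_true) | (~cmp & op_false)
 assuming CMP is an all-ones/all-zeros per-element mask; the earlier
 special cases and the SSE4.1/AVX blend paths merely avoid some of
 these logical operations. */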
20143
20144 static void
20145 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20146 {
20147 enum machine_mode mode = GET_MODE (dest);
20148 rtx t2, t3, x;
20149
20150 if (vector_all_ones_operand (op_true, mode)
20151 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20152 {
20153 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20154 }
20155 else if (op_false == CONST0_RTX (mode))
20156 {
20157 op_true = force_reg (mode, op_true);
20158 x = gen_rtx_AND (mode, cmp, op_true);
20159 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20160 }
20161 else if (op_true == CONST0_RTX (mode))
20162 {
20163 op_false = force_reg (mode, op_false);
20164 x = gen_rtx_NOT (mode, cmp);
20165 x = gen_rtx_AND (mode, x, op_false);
20166 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20167 }
20168 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20169 {
20170 op_false = force_reg (mode, op_false);
20171 x = gen_rtx_IOR (mode, cmp, op_false);
20172 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20173 }
20174 else if (TARGET_XOP)
20175 {
20176 op_true = force_reg (mode, op_true);
20177
20178 if (!nonimmediate_operand (op_false, mode))
20179 op_false = force_reg (mode, op_false);
20180
20181 emit_insn (gen_rtx_SET (mode, dest,
20182 gen_rtx_IF_THEN_ELSE (mode, cmp,
20183 op_true,
20184 op_false)));
20185 }
20186 else
20187 {
20188 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20189
20190 if (!nonimmediate_operand (op_true, mode))
20191 op_true = force_reg (mode, op_true);
20192
20193 op_false = force_reg (mode, op_false);
20194
20195 switch (mode)
20196 {
20197 case V4SFmode:
20198 if (TARGET_SSE4_1)
20199 gen = gen_sse4_1_blendvps;
20200 break;
20201 case V2DFmode:
20202 if (TARGET_SSE4_1)
20203 gen = gen_sse4_1_blendvpd;
20204 break;
20205 case V16QImode:
20206 case V8HImode:
20207 case V4SImode:
20208 case V2DImode:
20209 if (TARGET_SSE4_1)
20210 {
20211 gen = gen_sse4_1_pblendvb;
20212 dest = gen_lowpart (V16QImode, dest);
20213 op_false = gen_lowpart (V16QImode, op_false);
20214 op_true = gen_lowpart (V16QImode, op_true);
20215 cmp = gen_lowpart (V16QImode, cmp);
20216 }
20217 break;
20218 case V8SFmode:
20219 if (TARGET_AVX)
20220 gen = gen_avx_blendvps256;
20221 break;
20222 case V4DFmode:
20223 if (TARGET_AVX)
20224 gen = gen_avx_blendvpd256;
20225 break;
20226 case V32QImode:
20227 case V16HImode:
20228 case V8SImode:
20229 case V4DImode:
20230 if (TARGET_AVX2)
20231 {
20232 gen = gen_avx2_pblendvb;
20233 dest = gen_lowpart (V32QImode, dest);
20234 op_false = gen_lowpart (V32QImode, op_false);
20235 op_true = gen_lowpart (V32QImode, op_true);
20236 cmp = gen_lowpart (V32QImode, cmp);
20237 }
20238 break;
20239 default:
20240 break;
20241 }
20242
20243 if (gen != NULL)
20244 emit_insn (gen (dest, op_false, op_true, cmp));
20245 else
20246 {
20247 op_true = force_reg (mode, op_true);
20248
20249 t2 = gen_reg_rtx (mode);
20250 if (optimize)
20251 t3 = gen_reg_rtx (mode);
20252 else
20253 t3 = dest;
20254
20255 x = gen_rtx_AND (mode, op_true, cmp);
20256 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20257
20258 x = gen_rtx_NOT (mode, cmp);
20259 x = gen_rtx_AND (mode, x, op_false);
20260 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20261
20262 x = gen_rtx_IOR (mode, t3, t2);
20263 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20264 }
20265 }
20266 }
20267
20268 /* Expand a floating-point conditional move. Return true if successful. */
20269
20270 bool
20271 ix86_expand_fp_movcc (rtx operands[])
20272 {
20273 enum machine_mode mode = GET_MODE (operands[0]);
20274 enum rtx_code code = GET_CODE (operands[1]);
20275 rtx tmp, compare_op;
20276 rtx op0 = XEXP (operands[1], 0);
20277 rtx op1 = XEXP (operands[1], 1);
20278
20279 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20280 {
20281 enum machine_mode cmode;
20282
20283 /* Since we have no cmove for SSE registers, don't force bad register
20284 allocation just to gain access to it. Deny movcc when the
20285 comparison mode doesn't match the move mode. */
20286 cmode = GET_MODE (op0);
20287 if (cmode == VOIDmode)
20288 cmode = GET_MODE (op1);
20289 if (cmode != mode)
20290 return false;
20291
20292 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20293 if (code == UNKNOWN)
20294 return false;
20295
20296 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20297 operands[2], operands[3]))
20298 return true;
20299
20300 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20301 operands[2], operands[3]);
20302 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20303 return true;
20304 }
20305
20306 if (GET_MODE (op0) == TImode
20307 || (GET_MODE (op0) == DImode
20308 && !TARGET_64BIT))
20309 return false;
20310
20311 /* The floating point conditional move instructions don't directly
20312 support conditions resulting from a signed integer comparison. */
20313
20314 compare_op = ix86_expand_compare (code, op0, op1);
20315 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20316 {
20317 tmp = gen_reg_rtx (QImode);
20318 ix86_expand_setcc (tmp, code, op0, op1);
20319
20320 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20321 }
20322
20323 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20324 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20325 operands[2], operands[3])));
20326
20327 return true;
20328 }
20329
20330 /* Expand a floating-point vector conditional move; a vcond operation
20331 rather than a movcc operation. */
20332
20333 bool
20334 ix86_expand_fp_vcond (rtx operands[])
20335 {
20336 enum rtx_code code = GET_CODE (operands[3]);
20337 rtx cmp;
20338
20339 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20340 &operands[4], &operands[5]);
20341 if (code == UNKNOWN)
20342 {
20343 rtx temp;
20344 switch (GET_CODE (operands[3]))
20345 {
20346 case LTGT:
20347 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20348 operands[5], operands[0], operands[0]);
20349 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20350 operands[5], operands[1], operands[2]);
20351 code = AND;
20352 break;
20353 case UNEQ:
20354 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20355 operands[5], operands[0], operands[0]);
20356 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20357 operands[5], operands[1], operands[2]);
20358 code = IOR;
20359 break;
20360 default:
20361 gcc_unreachable ();
20362 }
20363 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20364 OPTAB_DIRECT);
20365 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20366 return true;
20367 }
20368
20369 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20370 operands[5], operands[1], operands[2]))
20371 return true;
20372
20373 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20374 operands[1], operands[2]);
20375 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20376 return true;
20377 }
20378
20379 /* Expand a signed/unsigned integral vector conditional move. */
20380
20381 bool
20382 ix86_expand_int_vcond (rtx operands[])
20383 {
20384 enum machine_mode data_mode = GET_MODE (operands[0]);
20385 enum machine_mode mode = GET_MODE (operands[4]);
20386 enum rtx_code code = GET_CODE (operands[3]);
20387 bool negate = false;
20388 rtx x, cop0, cop1;
20389
20390 cop0 = operands[4];
20391 cop1 = operands[5];
20392
20393 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20394 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20395 if ((code == LT || code == GE)
20396 && data_mode == mode
20397 && cop1 == CONST0_RTX (mode)
20398 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20399 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20400 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20401 && (GET_MODE_SIZE (data_mode) == 16
20402 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20403 {
20404 rtx negop = operands[2 - (code == LT)];
20405 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20406 if (negop == CONST1_RTX (data_mode))
20407 {
20408 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20409 operands[0], 1, OPTAB_DIRECT);
20410 if (res != operands[0])
20411 emit_move_insn (operands[0], res);
20412 return true;
20413 }
20414 else if (GET_MODE_INNER (data_mode) != DImode
20415 && vector_all_ones_operand (negop, data_mode))
20416 {
20417 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20418 operands[0], 0, OPTAB_DIRECT);
20419 if (res != operands[0])
20420 emit_move_insn (operands[0], res);
20421 return true;
20422 }
20423 }
20424
20425 if (!nonimmediate_operand (cop1, mode))
20426 cop1 = force_reg (mode, cop1);
20427 if (!general_operand (operands[1], data_mode))
20428 operands[1] = force_reg (data_mode, operands[1]);
20429 if (!general_operand (operands[2], data_mode))
20430 operands[2] = force_reg (data_mode, operands[2]);
20431
20432 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20433 if (TARGET_XOP
20434 && (mode == V16QImode || mode == V8HImode
20435 || mode == V4SImode || mode == V2DImode))
20436 ;
20437 else
20438 {
20439 /* Canonicalize the comparison to EQ, GT, GTU. */
20440 switch (code)
20441 {
20442 case EQ:
20443 case GT:
20444 case GTU:
20445 break;
20446
20447 case NE:
20448 case LE:
20449 case LEU:
20450 code = reverse_condition (code);
20451 negate = true;
20452 break;
20453
20454 case GE:
20455 case GEU:
20456 code = reverse_condition (code);
20457 negate = true;
20458 /* FALLTHRU */
20459
20460 case LT:
20461 case LTU:
20462 code = swap_condition (code);
20463 x = cop0, cop0 = cop1, cop1 = x;
20464 break;
20465
20466 default:
20467 gcc_unreachable ();
20468 }
20469
20470 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20471 if (mode == V2DImode)
20472 {
20473 switch (code)
20474 {
20475 case EQ:
20476 /* SSE4.1 supports EQ. */
20477 if (!TARGET_SSE4_1)
20478 return false;
20479 break;
20480
20481 case GT:
20482 case GTU:
20483 /* SSE4.2 supports GT/GTU. */
20484 if (!TARGET_SSE4_2)
20485 return false;
20486 break;
20487
20488 default:
20489 gcc_unreachable ();
20490 }
20491 }
20492
20493 /* Unsigned parallel compare is not supported by the hardware.
20494 Play some tricks to turn this into a signed comparison
20495 the hardware can do. */
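/* Worked example of the bias trick used below for the dword/qword
 case (32-bit values, purely illustrative): unsigned a > b holds iff
 signed (a - 0x80000000) > (b - 0x80000000), since subtracting the
 sign bit maps unsigned order onto signed order. For bytes and words
 the saturating-subtract path instead checks whether (a -us b) is
 nonzero, using an EQ compare against zero and flipping the sense. */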
20496 if (code == GTU)
20497 {
20498 cop0 = force_reg (mode, cop0);
20499
20500 switch (mode)
20501 {
20502 case V8SImode:
20503 case V4DImode:
20504 case V4SImode:
20505 case V2DImode:
20506 {
20507 rtx t1, t2, mask;
20508 rtx (*gen_sub3) (rtx, rtx, rtx);
20509
20510 switch (mode)
20511 {
20512 case V8SImode: gen_sub3 = gen_subv8si3; break;
20513 case V4DImode: gen_sub3 = gen_subv4di3; break;
20514 case V4SImode: gen_sub3 = gen_subv4si3; break;
20515 case V2DImode: gen_sub3 = gen_subv2di3; break;
20516 default:
20517 gcc_unreachable ();
20518 }
20519 /* Subtract (-(INT MAX) - 1) from both operands to make
20520 them signed. */
20521 mask = ix86_build_signbit_mask (mode, true, false);
20522 t1 = gen_reg_rtx (mode);
20523 emit_insn (gen_sub3 (t1, cop0, mask));
20524
20525 t2 = gen_reg_rtx (mode);
20526 emit_insn (gen_sub3 (t2, cop1, mask));
20527
20528 cop0 = t1;
20529 cop1 = t2;
20530 code = GT;
20531 }
20532 break;
20533
20534 case V32QImode:
20535 case V16HImode:
20536 case V16QImode:
20537 case V8HImode:
20538 /* Perform a parallel unsigned saturating subtraction. */
20539 x = gen_reg_rtx (mode);
20540 emit_insn (gen_rtx_SET (VOIDmode, x,
20541 gen_rtx_US_MINUS (mode, cop0, cop1)));
20542
20543 cop0 = x;
20544 cop1 = CONST0_RTX (mode);
20545 code = EQ;
20546 negate = !negate;
20547 break;
20548
20549 default:
20550 gcc_unreachable ();
20551 }
20552 }
20553 }
20554
20555 /* Allow the comparison to be done in one mode, but the movcc to
20556 happen in another mode. */
20557 if (data_mode == mode)
20558 {
20559 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20560 operands[1+negate], operands[2-negate]);
20561 }
20562 else
20563 {
20564 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20565 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20566 code, cop0, cop1,
20567 operands[1+negate], operands[2-negate]);
20568 x = gen_lowpart (data_mode, x);
20569 }
20570
20571 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20572 operands[2-negate]);
20573 return true;
20574 }
20575
20576 /* Expand a variable vector permutation. */
20577
20578 void
20579 ix86_expand_vec_perm (rtx operands[])
20580 {
20581 rtx target = operands[0];
20582 rtx op0 = operands[1];
20583 rtx op1 = operands[2];
20584 rtx mask = operands[3];
20585 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20586 enum machine_mode mode = GET_MODE (op0);
20587 enum machine_mode maskmode = GET_MODE (mask);
20588 int w, e, i;
20589 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20590
20591 /* Number of elements in the vector. */
20592 w = GET_MODE_NUNITS (mode);
20593 e = GET_MODE_UNIT_SIZE (mode);
20594 gcc_assert (w <= 32);
20595
20596 if (TARGET_AVX2)
20597 {
20598 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20599 {
20600 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20601 a constant shuffle operand. With a tiny bit of effort we can
20602 use VPERMD instead. A re-interpretation stall for V4DFmode is
20603 unfortunate but there's no avoiding it.
20604 Similarly, for V16HImode we don't have instructions for variable
20605 shuffling, while for V32QImode we can use vpshufb; vpshufb;
20606 vpermq; vpor after preparing suitable masks. */
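/* Illustrative sketch of the V4DImode -> V8SImode rewrite below: a
 qword index j selects 64 bits, which as dword indices are 2*j and
 2*j + 1, so the mask { A B C D } is first replicated to
 { A A B B C C D D }, doubled, and then 1 is added to the odd
 positions, giving { 2A 2A+1 2B 2B+1 ... } as a vpermd control. */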
20607
20608 if (mode == V16HImode)
20609 {
20610 maskmode = mode = V32QImode;
20611 w = 32;
20612 e = 1;
20613 }
20614 else
20615 {
20616 maskmode = mode = V8SImode;
20617 w = 8;
20618 e = 4;
20619 }
20620 t1 = gen_reg_rtx (maskmode);
20621
20622 /* Replicate the low bits of the V4DImode mask into V8SImode:
20623 mask = { A B C D }
20624 t1 = { A A B B C C D D }. */
20625 for (i = 0; i < w / 2; ++i)
20626 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20627 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20628 vt = force_reg (maskmode, vt);
20629 mask = gen_lowpart (maskmode, mask);
20630 if (maskmode == V8SImode)
20631 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20632 else
20633 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20634
20635 /* Multiply the shuffle indices by two. */
20636 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20637 OPTAB_DIRECT);
20638
20639 /* Add one to the odd shuffle indices:
20640 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20641 for (i = 0; i < w / 2; ++i)
20642 {
20643 vec[i * 2] = const0_rtx;
20644 vec[i * 2 + 1] = const1_rtx;
20645 }
20646 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20647 vt = validize_mem (force_const_mem (maskmode, vt));
20648 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20649 OPTAB_DIRECT);
20650
20651 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20652 operands[3] = mask = t1;
20653 target = gen_lowpart (mode, target);
20654 op0 = gen_lowpart (mode, op0);
20655 op1 = gen_lowpart (mode, op1);
20656 }
20657
20658 switch (mode)
20659 {
20660 case V8SImode:
20661 /* The VPERMD and VPERMPS instructions already properly ignore
20662 the high bits of the shuffle elements. No need for us to
20663 perform an AND ourselves. */
20664 if (one_operand_shuffle)
20665 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20666 else
20667 {
20668 t1 = gen_reg_rtx (V8SImode);
20669 t2 = gen_reg_rtx (V8SImode);
20670 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20671 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20672 goto merge_two;
20673 }
20674 return;
20675
20676 case V8SFmode:
20677 mask = gen_lowpart (V8SFmode, mask);
20678 if (one_operand_shuffle)
20679 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20680 else
20681 {
20682 t1 = gen_reg_rtx (V8SFmode);
20683 t2 = gen_reg_rtx (V8SFmode);
20684 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20685 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20686 goto merge_two;
20687 }
20688 return;
20689
20690 case V4SImode:
20691 /* By combining the two 128-bit input vectors into one 256-bit
20692 input vector, we can use VPERMD and VPERMPS for the full
20693 two-operand shuffle. */
20694 t1 = gen_reg_rtx (V8SImode);
20695 t2 = gen_reg_rtx (V8SImode);
20696 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20697 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20698 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20699 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20700 return;
20701
20702 case V4SFmode:
20703 t1 = gen_reg_rtx (V8SFmode);
20704 t2 = gen_reg_rtx (V8SImode);
20705 mask = gen_lowpart (V4SImode, mask);
20706 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20707 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20708 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20709 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20710 return;
20711
20712 case V32QImode:
20713 t1 = gen_reg_rtx (V32QImode);
20714 t2 = gen_reg_rtx (V32QImode);
20715 t3 = gen_reg_rtx (V32QImode);
20716 vt2 = GEN_INT (128);
20717 for (i = 0; i < 32; i++)
20718 vec[i] = vt2;
20719 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20720 vt = force_reg (V32QImode, vt);
20721 for (i = 0; i < 32; i++)
20722 vec[i] = i < 16 ? vt2 : const0_rtx;
20723 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20724 vt2 = force_reg (V32QImode, vt2);
20725 /* From mask create two adjusted masks, which contain the same
20726 bits as mask in the low 7 bits of each vector element.
20727 The first mask will have the most significant bit clear
20728 if it requests element from the same 128-bit lane
20729 and MSB set if it requests element from the other 128-bit lane.
20730 The second mask will have the opposite values of the MSB,
20731 and additionally will have its 128-bit lanes swapped.
20732 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20733 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20734 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20735 stands for the other 12 bytes. */
20736 /* The bit that tells whether an element is from the same lane or the
20737 other lane is bit 4, so shift it up by 3 to the MSB position. */
20738 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20739 gen_lowpart (V4DImode, mask),
20740 GEN_INT (3)));
20741 /* Clear MSB bits from the mask just in case it had them set. */
20742 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20743 /* After this, t1 will have the MSB set for elements from the other lane. */
20744 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20745 /* Clear bits other than MSB. */
20746 emit_insn (gen_andv32qi3 (t1, t1, vt));
20747 /* Or in the lower bits from mask into t3. */
20748 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20749 /* And invert MSB bits in t1, so MSB is set for elements from the same
20750 lane. */
20751 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20752 /* Swap 128-bit lanes in t3. */
20753 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20754 gen_lowpart (V4DImode, t3),
20755 const2_rtx, GEN_INT (3),
20756 const0_rtx, const1_rtx));
20757 /* And or in the lower bits from mask into t1. */
20758 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20759 if (one_operand_shuffle)
20760 {
20761 /* Each of these shuffles will put 0s in places where an
20762 element from the other 128-bit lane is needed, and otherwise
20763 will shuffle in the requested value. */
20764 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20765 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20766 /* For t3 the 128-bit lanes are swapped again. */
20767 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20768 gen_lowpart (V4DImode, t3),
20769 const2_rtx, GEN_INT (3),
20770 const0_rtx, const1_rtx));
20771 /* And ORing both together yields the result. */
20772 emit_insn (gen_iorv32qi3 (target, t1, t3));
20773 return;
20774 }
20775
20776 t4 = gen_reg_rtx (V32QImode);
20777 /* Similar to the one_operand_shuffle code above, just
20778 repeated twice, once for each operand. The merge_two:
20779 code below will merge the two results together. */
20780 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20781 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20782 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20783 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20784 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20785 gen_lowpart (V4DImode, t4),
20786 const2_rtx, GEN_INT (3),
20787 const0_rtx, const1_rtx));
20788 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20789 gen_lowpart (V4DImode, t3),
20790 const2_rtx, GEN_INT (3),
20791 const0_rtx, const1_rtx));
20792 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20793 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20794 t1 = t4;
20795 t2 = t3;
20796 goto merge_two;
20797
20798 default:
20799 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20800 break;
20801 }
20802 }
20803
20804 if (TARGET_XOP)
20805 {
20806 /* The XOP VPPERM insn supports three inputs. By ignoring the
20807 one_operand_shuffle special case, we avoid creating another
20808 set of constant vectors in memory. */
20809 one_operand_shuffle = false;
20810
20811 /* mask = mask & {2*w-1, ...} */
20812 vt = GEN_INT (2*w - 1);
20813 }
20814 else
20815 {
20816 /* mask = mask & {w-1, ...} */
20817 vt = GEN_INT (w - 1);
20818 }
20819
20820 for (i = 0; i < w; i++)
20821 vec[i] = vt;
20822 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20823 mask = expand_simple_binop (maskmode, AND, mask, vt,
20824 NULL_RTX, 0, OPTAB_DIRECT);
20825
20826 /* For non-QImode operations, convert the word permutation control
20827 into a byte permutation control. */
20828 if (mode != V16QImode)
20829 {
20830 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20831 GEN_INT (exact_log2 (e)),
20832 NULL_RTX, 0, OPTAB_DIRECT);
20833
20834 /* Convert mask to vector of chars. */
20835 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20836
20837 /* Replicate each of the input bytes into byte positions:
20838 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20839 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20840 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20841 for (i = 0; i < 16; ++i)
20842 vec[i] = GEN_INT (i/e * e);
20843 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20844 vt = validize_mem (force_const_mem (V16QImode, vt));
20845 if (TARGET_XOP)
20846 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20847 else
20848 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20849
20850 /* Convert it into the byte positions by doing
20851 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20852 for (i = 0; i < 16; ++i)
20853 vec[i] = GEN_INT (i % e);
20854 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20855 vt = validize_mem (force_const_mem (V16QImode, vt));
20856 emit_insn (gen_addv16qi3 (mask, mask, vt));
20857 }
20858
20859 /* The actual shuffle operations all operate on V16QImode. */
20860 op0 = gen_lowpart (V16QImode, op0);
20861 op1 = gen_lowpart (V16QImode, op1);
20862 target = gen_lowpart (V16QImode, target);
20863
20864 if (TARGET_XOP)
20865 {
20866 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20867 }
20868 else if (one_operand_shuffle)
20869 {
20870 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20871 }
20872 else
20873 {
20874 rtx xops[6];
20875 bool ok;
20876
20877 /* Shuffle the two input vectors independently. */
20878 t1 = gen_reg_rtx (V16QImode);
20879 t2 = gen_reg_rtx (V16QImode);
20880 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20881 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20882
20883 merge_two:
20884 /* Then merge them together. The key is whether any given control
20885 element contained a bit set that indicates the second word. */
20886 mask = operands[3];
20887 vt = GEN_INT (w);
20888 if (maskmode == V2DImode && !TARGET_SSE4_1)
20889 {
20890 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20891 more shuffle to convert the V2DI input mask into a V4SI
20892 input mask. At that point the masking that expand_int_vcond
20893 does will work as desired. */
20894 rtx t3 = gen_reg_rtx (V4SImode);
20895 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20896 const0_rtx, const0_rtx,
20897 const2_rtx, const2_rtx));
20898 mask = t3;
20899 maskmode = V4SImode;
20900 e = w = 4;
20901 }
20902
20903 for (i = 0; i < w; i++)
20904 vec[i] = vt;
20905 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20906 vt = force_reg (maskmode, vt);
20907 mask = expand_simple_binop (maskmode, AND, mask, vt,
20908 NULL_RTX, 0, OPTAB_DIRECT);
20909
20910 xops[0] = gen_lowpart (mode, operands[0]);
20911 xops[1] = gen_lowpart (mode, t2);
20912 xops[2] = gen_lowpart (mode, t1);
20913 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20914 xops[4] = mask;
20915 xops[5] = vt;
20916 ok = ix86_expand_int_vcond (xops);
20917 gcc_assert (ok);
20918 }
20919 }
20920
20921 /* Unpack SRC into the next wider integer vector type in DEST. UNSIGNED_P is
20922 true if we should do zero extension, else sign extension. HIGH_P is
20923 true if we want the N/2 high elements, else the low elements. */
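/* A rough illustration of the paths below: for a V8HImode SRC with
 UNSIGNED_P and !HIGH_P, the SSE4.1 path emits pmovzxwd, widening the
 four low HImode elements into a V4SImode DEST; the HIGH_P case first
 shifts the upper half down (psrldq by 8 bytes) and then widens the
 same way. Without SSE4.1, the interleave path instead pairs SRC with
 a zero vector (or with a sign mask computed by pcmpgt) using
 punpcklwd/punpckhwd. */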
20924
20925 void
20926 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20927 {
20928 enum machine_mode imode = GET_MODE (src);
20929 rtx tmp;
20930
20931 if (TARGET_SSE4_1)
20932 {
20933 rtx (*unpack)(rtx, rtx);
20934 rtx (*extract)(rtx, rtx) = NULL;
20935 enum machine_mode halfmode = BLKmode;
20936
20937 switch (imode)
20938 {
20939 case V32QImode:
20940 if (unsigned_p)
20941 unpack = gen_avx2_zero_extendv16qiv16hi2;
20942 else
20943 unpack = gen_avx2_sign_extendv16qiv16hi2;
20944 halfmode = V16QImode;
20945 extract
20946 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20947 break;
20948 case V16HImode:
20949 if (unsigned_p)
20950 unpack = gen_avx2_zero_extendv8hiv8si2;
20951 else
20952 unpack = gen_avx2_sign_extendv8hiv8si2;
20953 halfmode = V8HImode;
20954 extract
20955 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20956 break;
20957 case V8SImode:
20958 if (unsigned_p)
20959 unpack = gen_avx2_zero_extendv4siv4di2;
20960 else
20961 unpack = gen_avx2_sign_extendv4siv4di2;
20962 halfmode = V4SImode;
20963 extract
20964 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20965 break;
20966 case V16QImode:
20967 if (unsigned_p)
20968 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20969 else
20970 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20971 break;
20972 case V8HImode:
20973 if (unsigned_p)
20974 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20975 else
20976 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20977 break;
20978 case V4SImode:
20979 if (unsigned_p)
20980 unpack = gen_sse4_1_zero_extendv2siv2di2;
20981 else
20982 unpack = gen_sse4_1_sign_extendv2siv2di2;
20983 break;
20984 default:
20985 gcc_unreachable ();
20986 }
20987
20988 if (GET_MODE_SIZE (imode) == 32)
20989 {
20990 tmp = gen_reg_rtx (halfmode);
20991 emit_insn (extract (tmp, src));
20992 }
20993 else if (high_p)
20994 {
20995 /* Shift higher 8 bytes to lower 8 bytes. */
20996 tmp = gen_reg_rtx (imode);
20997 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20998 gen_lowpart (V1TImode, src),
20999 GEN_INT (64)));
21000 }
21001 else
21002 tmp = src;
21003
21004 emit_insn (unpack (dest, tmp));
21005 }
21006 else
21007 {
21008 rtx (*unpack)(rtx, rtx, rtx);
21009
21010 switch (imode)
21011 {
21012 case V16QImode:
21013 if (high_p)
21014 unpack = gen_vec_interleave_highv16qi;
21015 else
21016 unpack = gen_vec_interleave_lowv16qi;
21017 break;
21018 case V8HImode:
21019 if (high_p)
21020 unpack = gen_vec_interleave_highv8hi;
21021 else
21022 unpack = gen_vec_interleave_lowv8hi;
21023 break;
21024 case V4SImode:
21025 if (high_p)
21026 unpack = gen_vec_interleave_highv4si;
21027 else
21028 unpack = gen_vec_interleave_lowv4si;
21029 break;
21030 default:
21031 gcc_unreachable ();
21032 }
21033
21034 if (unsigned_p)
21035 tmp = force_reg (imode, CONST0_RTX (imode));
21036 else
21037 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21038 src, pc_rtx, pc_rtx);
21039
21040 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
21041 }
21042 }
21043
21044 /* Expand conditional increment or decrement using adc/sbb instructions.
21045 The default case using setcc followed by the conditional move can be
21046 done by generic code. */
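/* Rough illustration of the intent (registers and condition purely
 illustrative): for r = (a < b) ? r + 1 : r with an unsigned compare,
 the expansion is essentially
 cmpl %ebx, %eax ; carry set iff a < b (unsigned)
 adcl $0, %ecx ; r += carry
 and the decrement variant uses sbb in the same way. */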
21047 bool
21048 ix86_expand_int_addcc (rtx operands[])
21049 {
21050 enum rtx_code code = GET_CODE (operands[1]);
21051 rtx flags;
21052 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21053 rtx compare_op;
21054 rtx val = const0_rtx;
21055 bool fpcmp = false;
21056 enum machine_mode mode;
21057 rtx op0 = XEXP (operands[1], 0);
21058 rtx op1 = XEXP (operands[1], 1);
21059
21060 if (operands[3] != const1_rtx
21061 && operands[3] != constm1_rtx)
21062 return false;
21063 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21064 return false;
21065 code = GET_CODE (compare_op);
21066
21067 flags = XEXP (compare_op, 0);
21068
21069 if (GET_MODE (flags) == CCFPmode
21070 || GET_MODE (flags) == CCFPUmode)
21071 {
21072 fpcmp = true;
21073 code = ix86_fp_compare_code_to_integer (code);
21074 }
21075
21076 if (code != LTU)
21077 {
21078 val = constm1_rtx;
21079 if (fpcmp)
21080 PUT_CODE (compare_op,
21081 reverse_condition_maybe_unordered
21082 (GET_CODE (compare_op)));
21083 else
21084 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21085 }
21086
21087 mode = GET_MODE (operands[0]);
21088
21089 /* Construct either adc or sbb insn. */
21090 if ((code == LTU) == (operands[3] == constm1_rtx))
21091 {
21092 switch (mode)
21093 {
21094 case QImode:
21095 insn = gen_subqi3_carry;
21096 break;
21097 case HImode:
21098 insn = gen_subhi3_carry;
21099 break;
21100 case SImode:
21101 insn = gen_subsi3_carry;
21102 break;
21103 case DImode:
21104 insn = gen_subdi3_carry;
21105 break;
21106 default:
21107 gcc_unreachable ();
21108 }
21109 }
21110 else
21111 {
21112 switch (mode)
21113 {
21114 case QImode:
21115 insn = gen_addqi3_carry;
21116 break;
21117 case HImode:
21118 insn = gen_addhi3_carry;
21119 break;
21120 case SImode:
21121 insn = gen_addsi3_carry;
21122 break;
21123 case DImode:
21124 insn = gen_adddi3_carry;
21125 break;
21126 default:
21127 gcc_unreachable ();
21128 }
21129 }
21130 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21131
21132 return true;
21133 }
21134
21135
21136 /* Split OPERAND into integer parts, storing them in PARTS. Similar to
21137 split_double_mode, but works for floating-point values and non-offsettable
21138 memories. For pushes, it returns just stack offsets; the values will be
21139 saved in the right order. At most four parts are generated. */
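/* For instance, on a 32-bit target a DFmode operand splits into two
 SImode parts, XFmode into three and TFmode into four; on a 64-bit
 target XFmode and TFmode split into a DImode part plus an SImode or
 DImode upper part. These counts simply restate what the size
 computation below produces. */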
21140
21141 static int
21142 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21143 {
21144 int size;
21145
21146 if (!TARGET_64BIT)
21147 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21148 else
21149 size = (GET_MODE_SIZE (mode) + 4) / 8;
21150
21151 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21152 gcc_assert (size >= 2 && size <= 4);
21153
21154 /* Optimize constant pool references to immediates. This is used by fp
21155 moves, which force all constants to memory to allow combining. */
21156 if (MEM_P (operand) && MEM_READONLY_P (operand))
21157 {
21158 rtx tmp = maybe_get_pool_constant (operand);
21159 if (tmp)
21160 operand = tmp;
21161 }
21162
21163 if (MEM_P (operand) && !offsettable_memref_p (operand))
21164 {
21165 /* The only non-offsettable memories we handle are pushes. */
21166 int ok = push_operand (operand, VOIDmode);
21167
21168 gcc_assert (ok);
21169
21170 operand = copy_rtx (operand);
21171 PUT_MODE (operand, word_mode);
21172 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21173 return size;
21174 }
21175
21176 if (GET_CODE (operand) == CONST_VECTOR)
21177 {
21178 enum machine_mode imode = int_mode_for_mode (mode);
21179 /* Caution: if we looked through a constant pool memory above,
21180 the operand may actually have a different mode now. That's
21181 ok, since we want to pun this all the way back to an integer. */
21182 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21183 gcc_assert (operand != NULL);
21184 mode = imode;
21185 }
21186
21187 if (!TARGET_64BIT)
21188 {
21189 if (mode == DImode)
21190 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21191 else
21192 {
21193 int i;
21194
21195 if (REG_P (operand))
21196 {
21197 gcc_assert (reload_completed);
21198 for (i = 0; i < size; i++)
21199 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21200 }
21201 else if (offsettable_memref_p (operand))
21202 {
21203 operand = adjust_address (operand, SImode, 0);
21204 parts[0] = operand;
21205 for (i = 1; i < size; i++)
21206 parts[i] = adjust_address (operand, SImode, 4 * i);
21207 }
21208 else if (GET_CODE (operand) == CONST_DOUBLE)
21209 {
21210 REAL_VALUE_TYPE r;
21211 long l[4];
21212
21213 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21214 switch (mode)
21215 {
21216 case TFmode:
21217 real_to_target (l, &r, mode);
21218 parts[3] = gen_int_mode (l[3], SImode);
21219 parts[2] = gen_int_mode (l[2], SImode);
21220 break;
21221 case XFmode:
21222 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21223 long double may not be 80-bit. */
21224 real_to_target (l, &r, mode);
21225 parts[2] = gen_int_mode (l[2], SImode);
21226 break;
21227 case DFmode:
21228 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21229 break;
21230 default:
21231 gcc_unreachable ();
21232 }
21233 parts[1] = gen_int_mode (l[1], SImode);
21234 parts[0] = gen_int_mode (l[0], SImode);
21235 }
21236 else
21237 gcc_unreachable ();
21238 }
21239 }
21240 else
21241 {
21242 if (mode == TImode)
21243 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21244 if (mode == XFmode || mode == TFmode)
21245 {
21246 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21247 if (REG_P (operand))
21248 {
21249 gcc_assert (reload_completed);
21250 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21251 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21252 }
21253 else if (offsettable_memref_p (operand))
21254 {
21255 operand = adjust_address (operand, DImode, 0);
21256 parts[0] = operand;
21257 parts[1] = adjust_address (operand, upper_mode, 8);
21258 }
21259 else if (GET_CODE (operand) == CONST_DOUBLE)
21260 {
21261 REAL_VALUE_TYPE r;
21262 long l[4];
21263
21264 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21265 real_to_target (l, &r, mode);
21266
21267 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21268 if (HOST_BITS_PER_WIDE_INT >= 64)
21269 parts[0]
21270 = gen_int_mode
21271 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21272 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21273 DImode);
21274 else
21275 parts[0] = immed_double_const (l[0], l[1], DImode);
21276
21277 if (upper_mode == SImode)
21278 parts[1] = gen_int_mode (l[2], SImode);
21279 else if (HOST_BITS_PER_WIDE_INT >= 64)
21280 parts[1]
21281 = gen_int_mode
21282 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21283 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21284 DImode);
21285 else
21286 parts[1] = immed_double_const (l[2], l[3], DImode);
21287 }
21288 else
21289 gcc_unreachable ();
21290 }
21291 }
21292
21293 return size;
21294 }
21295
21296 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21297 The value is split into word-sized parts by ix86_split_to_parts and
21298 the parts are then moved or pushed in an order that avoids clobbering
21299 a source part before it has been copied. */
21300
21301 void
21302 ix86_split_long_move (rtx operands[])
21303 {
21304 rtx part[2][4];
21305 int nparts, i, j;
21306 int push = 0;
21307 int collisions = 0;
21308 enum machine_mode mode = GET_MODE (operands[0]);
21309 bool collisionparts[4];
21310
21311 /* The DFmode expanders may ask us to move a double.
21312 For a 64-bit target this is a single move. By hiding the fact
21313 here we simplify the i386.md splitters. */
21314 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21315 {
21316 /* Optimize constant pool references to immediates. This is used by
21317 fp moves, which force all constants to memory to allow combining. */
21318
21319 if (MEM_P (operands[1])
21320 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21321 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21322 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21323 if (push_operand (operands[0], VOIDmode))
21324 {
21325 operands[0] = copy_rtx (operands[0]);
21326 PUT_MODE (operands[0], word_mode);
21327 }
21328 else
21329 operands[0] = gen_lowpart (DImode, operands[0]);
21330 operands[1] = gen_lowpart (DImode, operands[1]);
21331 emit_move_insn (operands[0], operands[1]);
21332 return;
21333 }
21334
21335 /* The only non-offsettable memory we handle is a push. */
21336 if (push_operand (operands[0], VOIDmode))
21337 push = 1;
21338 else
21339 gcc_assert (!MEM_P (operands[0])
21340 || offsettable_memref_p (operands[0]));
21341
21342 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21343 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21344
21345 /* When emitting a push, take care of source operands on the stack. */
21346 if (push && MEM_P (operands[1])
21347 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21348 {
21349 rtx src_base = XEXP (part[1][nparts - 1], 0);
21350
21351 /* Compensate for the stack decrement by 4. */
21352 if (!TARGET_64BIT && nparts == 3
21353 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21354 src_base = plus_constant (Pmode, src_base, 4);
21355
21356 /* src_base refers to the stack pointer and is
21357 automatically decreased by the emitted pushes. */
21358 for (i = 0; i < nparts; i++)
21359 part[1][i] = change_address (part[1][i],
21360 GET_MODE (part[1][i]), src_base);
21361 }
21362
21363 /* We need to do the copy in the right order in case an address register
21364 of the source overlaps the destination. */
21365 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21366 {
21367 rtx tmp;
21368
21369 for (i = 0; i < nparts; i++)
21370 {
21371 collisionparts[i]
21372 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21373 if (collisionparts[i])
21374 collisions++;
21375 }
21376
21377 /* Collision in the middle part can be handled by reordering. */
21378 if (collisions == 1 && nparts == 3 && collisionparts [1])
21379 {
21380 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21381 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21382 }
21383 else if (collisions == 1
21384 && nparts == 4
21385 && (collisionparts [1] || collisionparts [2]))
21386 {
21387 if (collisionparts [1])
21388 {
21389 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21390 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21391 }
21392 else
21393 {
21394 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21395 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21396 }
21397 }
21398
21399 /* If there are more collisions, we can't handle them by reordering.
21400 Do an lea to the last part and use only one colliding move. */
21401 else if (collisions > 1)
21402 {
21403 rtx base;
21404
21405 collisions = 1;
21406
21407 base = part[0][nparts - 1];
21408
21409 /* Handle the case when the last part isn't valid for lea.
21410 Happens in 64-bit mode storing the 12-byte XFmode. */
21411 if (GET_MODE (base) != Pmode)
21412 base = gen_rtx_REG (Pmode, REGNO (base));
21413
21414 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21415 part[1][0] = replace_equiv_address (part[1][0], base);
21416 for (i = 1; i < nparts; i++)
21417 {
21418 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21419 part[1][i] = replace_equiv_address (part[1][i], tmp);
21420 }
21421 }
21422 }
21423
21424 if (push)
21425 {
21426 if (!TARGET_64BIT)
21427 {
21428 if (nparts == 3)
21429 {
21430 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21431 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21432 stack_pointer_rtx, GEN_INT (-4)));
21433 emit_move_insn (part[0][2], part[1][2]);
21434 }
21435 else if (nparts == 4)
21436 {
21437 emit_move_insn (part[0][3], part[1][3]);
21438 emit_move_insn (part[0][2], part[1][2]);
21439 }
21440 }
21441 else
21442 {
21443 /* In 64-bit mode we don't have a 32-bit push available. If this is
21444 a register, it is OK - we will just use the larger counterpart.
21445 We also retype memories - these come from the attempt to avoid a
21446 REX prefix on moving the second half of a TFmode value. */
21447 if (GET_MODE (part[1][1]) == SImode)
21448 {
21449 switch (GET_CODE (part[1][1]))
21450 {
21451 case MEM:
21452 part[1][1] = adjust_address (part[1][1], DImode, 0);
21453 break;
21454
21455 case REG:
21456 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21457 break;
21458
21459 default:
21460 gcc_unreachable ();
21461 }
21462
21463 if (GET_MODE (part[1][0]) == SImode)
21464 part[1][0] = part[1][1];
21465 }
21466 }
21467 emit_move_insn (part[0][1], part[1][1]);
21468 emit_move_insn (part[0][0], part[1][0]);
21469 return;
21470 }
21471
21472 /* Choose the correct order so as not to overwrite the source before it is copied. */
21473 if ((REG_P (part[0][0])
21474 && REG_P (part[1][1])
21475 && (REGNO (part[0][0]) == REGNO (part[1][1])
21476 || (nparts == 3
21477 && REGNO (part[0][0]) == REGNO (part[1][2]))
21478 || (nparts == 4
21479 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21480 || (collisions > 0
21481 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21482 {
21483 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21484 {
21485 operands[2 + i] = part[0][j];
21486 operands[6 + i] = part[1][j];
21487 }
21488 }
21489 else
21490 {
21491 for (i = 0; i < nparts; i++)
21492 {
21493 operands[2 + i] = part[0][i];
21494 operands[6 + i] = part[1][i];
21495 }
21496 }
21497
21498 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21499 if (optimize_insn_for_size_p ())
21500 {
21501 for (j = 0; j < nparts - 1; j++)
21502 if (CONST_INT_P (operands[6 + j])
21503 && operands[6 + j] != const0_rtx
21504 && REG_P (operands[2 + j]))
21505 for (i = j; i < nparts - 1; i++)
21506 if (CONST_INT_P (operands[7 + i])
21507 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21508 operands[7 + i] = operands[2 + j];
21509 }
21510
21511 for (i = 0; i < nparts; i++)
21512 emit_move_insn (operands[2 + i], operands[6 + i]);
21513
21514 return;
21515 }
21516
21517 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21518 left shift by a constant, either using a single shift or
21519 a sequence of add instructions. */
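/* Illustrative only: a left shift by 2 is emitted as two
 "add reg,reg" instructions when twice the add cost does not exceed
 the constant-shift cost and we are not optimizing for size;
 otherwise a single "shl $2, reg" is used. A shift by 1 is always
 the single add. */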
21520
21521 static void
21522 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21523 {
21524 rtx (*insn)(rtx, rtx, rtx);
21525
21526 if (count == 1
21527 || (count * ix86_cost->add <= ix86_cost->shift_const
21528 && !optimize_insn_for_size_p ()))
21529 {
21530 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21531 while (count-- > 0)
21532 emit_insn (insn (operand, operand, operand));
21533 }
21534 else
21535 {
21536 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21537 emit_insn (insn (operand, operand, GEN_INT (count)));
21538 }
21539 }
21540
21541 void
21542 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21543 {
21544 rtx (*gen_ashl3)(rtx, rtx, rtx);
21545 rtx (*gen_shld)(rtx, rtx, rtx);
21546 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21547
21548 rtx low[2], high[2];
21549 int count;
21550
21551 if (CONST_INT_P (operands[2]))
21552 {
21553 split_double_mode (mode, operands, 2, low, high);
21554 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21555
21556 if (count >= half_width)
21557 {
21558 emit_move_insn (high[0], low[1]);
21559 emit_move_insn (low[0], const0_rtx);
21560
21561 if (count > half_width)
21562 ix86_expand_ashl_const (high[0], count - half_width, mode);
21563 }
21564 else
21565 {
21566 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21567
21568 if (!rtx_equal_p (operands[0], operands[1]))
21569 emit_move_insn (operands[0], operands[1]);
21570
21571 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21572 ix86_expand_ashl_const (low[0], count, mode);
21573 }
21574 return;
21575 }
21576
21577 split_double_mode (mode, operands, 1, low, high);
21578
21579 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21580
21581 if (operands[1] == const1_rtx)
21582 {
21583 /* Assuming we've chosen QImode-capable registers, then 1 << N
21584 can be done with two 32/64-bit shifts, no branches, no cmoves. */
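/* Sketch of the branch-free sequence this produces (DImode result on
 a 32-bit target, registers illustrative):
 xorl %eax, %eax ; low = 0
 xorl %edx, %edx ; high = 0
 testb $32, %cl ; does the count reach the high half?
 sete %al ; low = (count & 32) == 0
 setne %dl ; high = (count & 32) != 0
 shll %cl, %eax ; hardware masks the count to 0..31
 shll %cl, %edx
 E.g. a count of 35 sets high = 1, low = 0 and shifts the high word
 by 35 & 31 = 3, giving 1 << 35. */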
21585 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21586 {
21587 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21588
21589 ix86_expand_clear (low[0]);
21590 ix86_expand_clear (high[0]);
21591 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21592
21593 d = gen_lowpart (QImode, low[0]);
21594 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21595 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21596 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21597
21598 d = gen_lowpart (QImode, high[0]);
21599 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21600 s = gen_rtx_NE (QImode, flags, const0_rtx);
21601 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21602 }
21603
21604 /* Otherwise, we can get the same results by manually performing
21605 a bit extract operation on bit 5/6, and then performing the two
21606 shifts. The two methods of getting 0/1 into low/high are exactly
21607 the same size. Avoiding the shift in the bit extract case helps
21608 pentium4 a bit; no one else seems to care much either way. */
21609 else
21610 {
21611 enum machine_mode half_mode;
21612 rtx (*gen_lshr3)(rtx, rtx, rtx);
21613 rtx (*gen_and3)(rtx, rtx, rtx);
21614 rtx (*gen_xor3)(rtx, rtx, rtx);
21615 HOST_WIDE_INT bits;
21616 rtx x;
21617
21618 if (mode == DImode)
21619 {
21620 half_mode = SImode;
21621 gen_lshr3 = gen_lshrsi3;
21622 gen_and3 = gen_andsi3;
21623 gen_xor3 = gen_xorsi3;
21624 bits = 5;
21625 }
21626 else
21627 {
21628 half_mode = DImode;
21629 gen_lshr3 = gen_lshrdi3;
21630 gen_and3 = gen_anddi3;
21631 gen_xor3 = gen_xordi3;
21632 bits = 6;
21633 }
21634
21635 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21636 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21637 else
21638 x = gen_lowpart (half_mode, operands[2]);
21639 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21640
21641 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21642 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21643 emit_move_insn (low[0], high[0]);
21644 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21645 }
21646
21647 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21648 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21649 return;
21650 }
21651
21652 if (operands[1] == constm1_rtx)
21653 {
21654 /* For -1 << N, we can avoid the shld instruction, because we
21655 know that we're shifting 0...31/63 ones into a -1. */
21656 emit_move_insn (low[0], constm1_rtx);
21657 if (optimize_insn_for_size_p ())
21658 emit_move_insn (high[0], low[0]);
21659 else
21660 emit_move_insn (high[0], constm1_rtx);
21661 }
21662 else
21663 {
21664 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21665
21666 if (!rtx_equal_p (operands[0], operands[1]))
21667 emit_move_insn (operands[0], operands[1]);
21668
21669 split_double_mode (mode, operands, 1, low, high);
21670 emit_insn (gen_shld (high[0], low[0], operands[2]));
21671 }
21672
21673 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21674
21675 if (TARGET_CMOVE && scratch)
21676 {
21677 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21678 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21679
21680 ix86_expand_clear (scratch);
21681 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21682 }
21683 else
21684 {
21685 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21686 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21687
21688 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21689 }
21690 }
21691
21692 void
21693 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21694 {
21695 rtx (*gen_ashr3)(rtx, rtx, rtx)
21696 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21697 rtx (*gen_shrd)(rtx, rtx, rtx);
21698 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21699
21700 rtx low[2], high[2];
21701 int count;
21702
21703 if (CONST_INT_P (operands[2]))
21704 {
21705 split_double_mode (mode, operands, 2, low, high);
21706 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21707
21708 if (count == GET_MODE_BITSIZE (mode) - 1)
21709 {
21710 emit_move_insn (high[0], high[1]);
21711 emit_insn (gen_ashr3 (high[0], high[0],
21712 GEN_INT (half_width - 1)));
21713 emit_move_insn (low[0], high[0]);
21714
21715 }
21716 else if (count >= half_width)
21717 {
21718 emit_move_insn (low[0], high[1]);
21719 emit_move_insn (high[0], low[0]);
21720 emit_insn (gen_ashr3 (high[0], high[0],
21721 GEN_INT (half_width - 1)));
21722
21723 if (count > half_width)
21724 emit_insn (gen_ashr3 (low[0], low[0],
21725 GEN_INT (count - half_width)));
21726 }
21727 else
21728 {
21729 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21730
21731 if (!rtx_equal_p (operands[0], operands[1]))
21732 emit_move_insn (operands[0], operands[1]);
21733
21734 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21735 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21736 }
21737 }
21738 else
21739 {
21740 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21741
21742 if (!rtx_equal_p (operands[0], operands[1]))
21743 emit_move_insn (operands[0], operands[1]);
21744
21745 split_double_mode (mode, operands, 1, low, high);
21746
21747 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21748 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21749
21750 if (TARGET_CMOVE && scratch)
21751 {
21752 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21753 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21754
21755 emit_move_insn (scratch, high[0]);
21756 emit_insn (gen_ashr3 (scratch, scratch,
21757 GEN_INT (half_width - 1)));
21758 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21759 scratch));
21760 }
21761 else
21762 {
21763 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21764 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21765
21766 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21767 }
21768 }
21769 }
21770
21771 void
21772 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21773 {
21774 rtx (*gen_lshr3)(rtx, rtx, rtx)
21775 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21776 rtx (*gen_shrd)(rtx, rtx, rtx);
21777 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21778
21779 rtx low[2], high[2];
21780 int count;
21781
21782 if (CONST_INT_P (operands[2]))
21783 {
21784 split_double_mode (mode, operands, 2, low, high);
21785 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21786
21787 if (count >= half_width)
21788 {
21789 emit_move_insn (low[0], high[1]);
21790 ix86_expand_clear (high[0]);
21791
21792 if (count > half_width)
21793 emit_insn (gen_lshr3 (low[0], low[0],
21794 GEN_INT (count - half_width)));
21795 }
21796 else
21797 {
21798 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21799
21800 if (!rtx_equal_p (operands[0], operands[1]))
21801 emit_move_insn (operands[0], operands[1]);
21802
21803 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21804 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21805 }
21806 }
21807 else
21808 {
21809 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21810
21811 if (!rtx_equal_p (operands[0], operands[1]))
21812 emit_move_insn (operands[0], operands[1]);
21813
21814 split_double_mode (mode, operands, 1, low, high);
21815
21816 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21817 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21818
21819 if (TARGET_CMOVE && scratch)
21820 {
21821 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21822 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21823
21824 ix86_expand_clear (scratch);
21825 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21826 scratch));
21827 }
21828 else
21829 {
21830 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21831 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21832
21833 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21834 }
21835 }
21836 }
21837
21838 /* Predict just emitted jump instruction to be taken with probability PROB. */
21839 static void
21840 predict_jump (int prob)
21841 {
21842 rtx insn = get_last_insn ();
21843 gcc_assert (JUMP_P (insn));
21844 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21845 }
21846
21847 /* Helper function for the string operations below.  Test whether VARIABLE
21848 is aligned to VALUE bytes; if it is, jump to the returned label. */
21849 static rtx
21850 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21851 {
21852 rtx label = gen_label_rtx ();
21853 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21854 if (GET_MODE (variable) == DImode)
21855 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21856 else
21857 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21858 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21859 1, label);
21860 if (epilogue)
21861 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21862 else
21863 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21864 return label;
21865 }
21866
21867 /* Decrease COUNTREG by VALUE. */
21868 static void
21869 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21870 {
21871 rtx (*gen_add)(rtx, rtx, rtx)
21872 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21873
21874 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21875 }
21876
21877 /* Zero extend EXP, which may be in SImode, to a Pmode register. */
21878 rtx
21879 ix86_zero_extend_to_Pmode (rtx exp)
21880 {
21881 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21882 }
21883
21884 /* Divide COUNTREG by SCALE. */
21885 static rtx
21886 scale_counter (rtx countreg, int scale)
21887 {
21888 rtx sc;
21889
21890 if (scale == 1)
21891 return countreg;
21892 if (CONST_INT_P (countreg))
21893 return GEN_INT (INTVAL (countreg) / scale);
21894 gcc_assert (REG_P (countreg));
21895
21896 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21897 GEN_INT (exact_log2 (scale)),
21898 NULL, 1, OPTAB_DIRECT);
21899 return sc;
21900 }
21901
21902 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21903 DImode for constant loop counts. */
21904
21905 static enum machine_mode
21906 counter_mode (rtx count_exp)
21907 {
21908 if (GET_MODE (count_exp) != VOIDmode)
21909 return GET_MODE (count_exp);
21910 if (!CONST_INT_P (count_exp))
21911 return Pmode;
21912 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21913 return DImode;
21914 return SImode;
21915 }
21916
21917 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21918 to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21919 the overall size is COUNT, specified in bytes.  When SRCPTR is NULL,
21920 output the equivalent loop to set memory to VALUE (expected to be in MODE).
21921
21922 The size is rounded down to a whole number of chunks moved at once.
21923 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
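/* Hedged sketch (not in the original sources) of the emitted loop for the
   move case with UNROLL == 1, ignoring the initial zero-size guard:

     size = count & ~(piece_size - 1);
     iter = 0;
   top:
     *(dest + iter) = *(src + iter);     // one MODE-sized chunk
     iter += piece_size;
     if (iter < size) goto top;
     dest += iter; src += iter;

   The memset variant stores VALUE instead of loading from SRC.  */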
21924
21925
21926 static void
21927 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21928 rtx destptr, rtx srcptr, rtx value,
21929 rtx count, enum machine_mode mode, int unroll,
21930 int expected_size)
21931 {
21932 rtx out_label, top_label, iter, tmp;
21933 enum machine_mode iter_mode = counter_mode (count);
21934 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21935 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21936 rtx size;
21937 rtx x_addr;
21938 rtx y_addr;
21939 int i;
21940
21941 top_label = gen_label_rtx ();
21942 out_label = gen_label_rtx ();
21943 iter = gen_reg_rtx (iter_mode);
21944
21945 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21946 NULL, 1, OPTAB_DIRECT);
21947 /* Those two should combine. */
21948 if (piece_size == const1_rtx)
21949 {
21950 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21951 true, out_label);
21952 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21953 }
21954 emit_move_insn (iter, const0_rtx);
21955
21956 emit_label (top_label);
21957
21958 tmp = convert_modes (Pmode, iter_mode, iter, true);
21959 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21960 destmem = change_address (destmem, mode, x_addr);
21961
21962 if (srcmem)
21963 {
21964 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21965 srcmem = change_address (srcmem, mode, y_addr);
21966
21967 /* When unrolling for chips that reorder memory reads and writes,
21968 we can save registers by using a single temporary.
21969 Also, using 4 temporaries is overkill in 32-bit mode. */
21970 if (!TARGET_64BIT && 0)
21971 {
21972 for (i = 0; i < unroll; i++)
21973 {
21974 if (i)
21975 {
21976 destmem =
21977 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21978 srcmem =
21979 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21980 }
21981 emit_move_insn (destmem, srcmem);
21982 }
21983 }
21984 else
21985 {
21986 rtx tmpreg[4];
21987 gcc_assert (unroll <= 4);
21988 for (i = 0; i < unroll; i++)
21989 {
21990 tmpreg[i] = gen_reg_rtx (mode);
21991 if (i)
21992 {
21993 srcmem =
21994 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21995 }
21996 emit_move_insn (tmpreg[i], srcmem);
21997 }
21998 for (i = 0; i < unroll; i++)
21999 {
22000 if (i)
22001 {
22002 destmem =
22003 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22004 }
22005 emit_move_insn (destmem, tmpreg[i]);
22006 }
22007 }
22008 }
22009 else
22010 for (i = 0; i < unroll; i++)
22011 {
22012 if (i)
22013 destmem =
22014 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22015 emit_move_insn (destmem, value);
22016 }
22017
22018 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22019 true, OPTAB_LIB_WIDEN);
22020 if (tmp != iter)
22021 emit_move_insn (iter, tmp);
22022
22023 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22024 true, top_label);
22025 if (expected_size != -1)
22026 {
22027 expected_size /= GET_MODE_SIZE (mode) * unroll;
22028 if (expected_size == 0)
22029 predict_jump (0);
22030 else if (expected_size > REG_BR_PROB_BASE)
22031 predict_jump (REG_BR_PROB_BASE - 1);
22032 else
22033 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22034 }
22035 else
22036 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22037 iter = ix86_zero_extend_to_Pmode (iter);
22038 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22039 true, OPTAB_LIB_WIDEN);
22040 if (tmp != destptr)
22041 emit_move_insn (destptr, tmp);
22042 if (srcptr)
22043 {
22044 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22045 true, OPTAB_LIB_WIDEN);
22046 if (tmp != srcptr)
22047 emit_move_insn (srcptr, tmp);
22048 }
22049 emit_label (out_label);
22050 }
22051
22052 /* Output a "rep; mov" instruction.
22053 Arguments have the same meaning as for the previous function. */
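/* For example (illustrative only), with MODE == SImode this boils down to
   loading COUNT / 4 into the count register and issuing "rep movsl";
   DESTEXP and SRCEXP describe the final pointer values
   (dest + (countreg << 2) and src + (countreg << 2)) for the RTL pattern.  */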
22054 static void
22055 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
22056 rtx destptr, rtx srcptr,
22057 rtx count,
22058 enum machine_mode mode)
22059 {
22060 rtx destexp;
22061 rtx srcexp;
22062 rtx countreg;
22063 HOST_WIDE_INT rounded_count;
22064
22065 /* If the size is known, it is shorter to use rep movs. */
22066 if (mode == QImode && CONST_INT_P (count)
22067 && !(INTVAL (count) & 3))
22068 mode = SImode;
22069
22070 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22071 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22072 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22073 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22074 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22075 if (mode != QImode)
22076 {
22077 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22078 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22079 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22080 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22081 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22082 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22083 }
22084 else
22085 {
22086 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22087 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22088 }
22089 if (CONST_INT_P (count))
22090 {
22091 rounded_count = (INTVAL (count)
22092 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22093 destmem = shallow_copy_rtx (destmem);
22094 srcmem = shallow_copy_rtx (srcmem);
22095 set_mem_size (destmem, rounded_count);
22096 set_mem_size (srcmem, rounded_count);
22097 }
22098 else
22099 {
22100 if (MEM_SIZE_KNOWN_P (destmem))
22101 clear_mem_size (destmem);
22102 if (MEM_SIZE_KNOWN_P (srcmem))
22103 clear_mem_size (srcmem);
22104 }
22105 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22106 destexp, srcexp));
22107 }
22108
22109 /* Output a "rep; stos" instruction.
22110 Arguments have the same meaning as for the previous function. */
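/* Illustratively (not part of the original comment), for MODE == SImode this
   emits the equivalent of loading COUNT / 4 into the count register and the
   promoted VALUE into the accumulator, then issuing "rep stosl"; DESTEXP
   again describes the final destination pointer for the RTL pattern.  */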
22111 static void
22112 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
22113 rtx count, enum machine_mode mode,
22114 rtx orig_value)
22115 {
22116 rtx destexp;
22117 rtx countreg;
22118 HOST_WIDE_INT rounded_count;
22119
22120 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22121 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22122 value = force_reg (mode, gen_lowpart (mode, value));
22123 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
22124 if (mode != QImode)
22125 {
22126 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22127 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22128 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22129 }
22130 else
22131 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22132 if (orig_value == const0_rtx && CONST_INT_P (count))
22133 {
22134 rounded_count = (INTVAL (count)
22135 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22136 destmem = shallow_copy_rtx (destmem);
22137 set_mem_size (destmem, rounded_count);
22138 }
22139 else if (MEM_SIZE_KNOWN_P (destmem))
22140 clear_mem_size (destmem);
22141 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22142 }
22143
22144 static void
22145 emit_strmov (rtx destmem, rtx srcmem,
22146 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
22147 {
22148 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
22149 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
22150 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22151 }
22152
22153 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
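/* For a constant COUNT the tail is expanded as straight-line moves; e.g.
   (illustrative only) a residual count of 7 on a 32-bit target becomes one
   SImode, one HImode and one QImode move at offsets 0, 4 and 6.  */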
22154 static void
22155 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22156 rtx destptr, rtx srcptr, rtx count, int max_size)
22157 {
22158 rtx src, dest;
22159 if (CONST_INT_P (count))
22160 {
22161 HOST_WIDE_INT countval = INTVAL (count);
22162 int offset = 0;
22163
22164 if ((countval & 0x10) && max_size > 16)
22165 {
22166 if (TARGET_64BIT)
22167 {
22168 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
22169 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
22170 }
22171 else
22172 gcc_unreachable ();
22173 offset += 16;
22174 }
22175 if ((countval & 0x08) && max_size > 8)
22176 {
22177 if (TARGET_64BIT)
22178 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
22179 else
22180 {
22181 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
22182 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
22183 }
22184 offset += 8;
22185 }
22186 if ((countval & 0x04) && max_size > 4)
22187 {
22188 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
22189 offset += 4;
22190 }
22191 if ((countval & 0x02) && max_size > 2)
22192 {
22193 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
22194 offset += 2;
22195 }
22196 if ((countval & 0x01) && max_size > 1)
22197 {
22198 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
22199 offset += 1;
22200 }
22201 return;
22202 }
22203 if (max_size > 8)
22204 {
22205 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22206 count, 1, OPTAB_DIRECT);
22207 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22208 count, QImode, 1, 4);
22209 return;
22210 }
22211
22212 /* When single-instruction stringops are available, we can cheaply advance
22213 the dest and src pointers.  Otherwise we save code size by maintaining an
22214 offset (zero is readily available from the preceding rep operation) and
22215 using x86 addressing modes. */
22216 if (TARGET_SINGLE_STRINGOP)
22217 {
22218 if (max_size > 4)
22219 {
22220 rtx label = ix86_expand_aligntest (count, 4, true);
22221 src = change_address (srcmem, SImode, srcptr);
22222 dest = change_address (destmem, SImode, destptr);
22223 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22224 emit_label (label);
22225 LABEL_NUSES (label) = 1;
22226 }
22227 if (max_size > 2)
22228 {
22229 rtx label = ix86_expand_aligntest (count, 2, true);
22230 src = change_address (srcmem, HImode, srcptr);
22231 dest = change_address (destmem, HImode, destptr);
22232 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22233 emit_label (label);
22234 LABEL_NUSES (label) = 1;
22235 }
22236 if (max_size > 1)
22237 {
22238 rtx label = ix86_expand_aligntest (count, 1, true);
22239 src = change_address (srcmem, QImode, srcptr);
22240 dest = change_address (destmem, QImode, destptr);
22241 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22242 emit_label (label);
22243 LABEL_NUSES (label) = 1;
22244 }
22245 }
22246 else
22247 {
22248 rtx offset = force_reg (Pmode, const0_rtx);
22249 rtx tmp;
22250
22251 if (max_size > 4)
22252 {
22253 rtx label = ix86_expand_aligntest (count, 4, true);
22254 src = change_address (srcmem, SImode, srcptr);
22255 dest = change_address (destmem, SImode, destptr);
22256 emit_move_insn (dest, src);
22257 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22258 true, OPTAB_LIB_WIDEN);
22259 if (tmp != offset)
22260 emit_move_insn (offset, tmp);
22261 emit_label (label);
22262 LABEL_NUSES (label) = 1;
22263 }
22264 if (max_size > 2)
22265 {
22266 rtx label = ix86_expand_aligntest (count, 2, true);
22267 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22268 src = change_address (srcmem, HImode, tmp);
22269 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22270 dest = change_address (destmem, HImode, tmp);
22271 emit_move_insn (dest, src);
22272 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22273 true, OPTAB_LIB_WIDEN);
22274 if (tmp != offset)
22275 emit_move_insn (offset, tmp);
22276 emit_label (label);
22277 LABEL_NUSES (label) = 1;
22278 }
22279 if (max_size > 1)
22280 {
22281 rtx label = ix86_expand_aligntest (count, 1, true);
22282 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22283 src = change_address (srcmem, QImode, tmp);
22284 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22285 dest = change_address (destmem, QImode, tmp);
22286 emit_move_insn (dest, src);
22287 emit_label (label);
22288 LABEL_NUSES (label) = 1;
22289 }
22290 }
22291 }
22292
22293 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22294 static void
22295 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22296 rtx count, int max_size)
22297 {
22298 count =
22299 expand_simple_binop (counter_mode (count), AND, count,
22300 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22301 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22302 gen_lowpart (QImode, value), count, QImode,
22303 1, max_size / 2);
22304 }
22305
22306 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22307 static void
22308 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22309 {
22310 rtx dest;
22311
22312 if (CONST_INT_P (count))
22313 {
22314 HOST_WIDE_INT countval = INTVAL (count);
22315 int offset = 0;
22316
22317 if ((countval & 0x10) && max_size > 16)
22318 {
22319 if (TARGET_64BIT)
22320 {
22321 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22322 emit_insn (gen_strset (destptr, dest, value));
22323 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22324 emit_insn (gen_strset (destptr, dest, value));
22325 }
22326 else
22327 gcc_unreachable ();
22328 offset += 16;
22329 }
22330 if ((countval & 0x08) && max_size > 8)
22331 {
22332 if (TARGET_64BIT)
22333 {
22334 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22335 emit_insn (gen_strset (destptr, dest, value));
22336 }
22337 else
22338 {
22339 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22340 emit_insn (gen_strset (destptr, dest, value));
22341 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22342 emit_insn (gen_strset (destptr, dest, value));
22343 }
22344 offset += 8;
22345 }
22346 if ((countval & 0x04) && max_size > 4)
22347 {
22348 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22349 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22350 offset += 4;
22351 }
22352 if ((countval & 0x02) && max_size > 2)
22353 {
22354 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22355 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22356 offset += 2;
22357 }
22358 if ((countval & 0x01) && max_size > 1)
22359 {
22360 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22361 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22362 offset += 1;
22363 }
22364 return;
22365 }
22366 if (max_size > 32)
22367 {
22368 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22369 return;
22370 }
22371 if (max_size > 16)
22372 {
22373 rtx label = ix86_expand_aligntest (count, 16, true);
22374 if (TARGET_64BIT)
22375 {
22376 dest = change_address (destmem, DImode, destptr);
22377 emit_insn (gen_strset (destptr, dest, value));
22378 emit_insn (gen_strset (destptr, dest, value));
22379 }
22380 else
22381 {
22382 dest = change_address (destmem, SImode, destptr);
22383 emit_insn (gen_strset (destptr, dest, value));
22384 emit_insn (gen_strset (destptr, dest, value));
22385 emit_insn (gen_strset (destptr, dest, value));
22386 emit_insn (gen_strset (destptr, dest, value));
22387 }
22388 emit_label (label);
22389 LABEL_NUSES (label) = 1;
22390 }
22391 if (max_size > 8)
22392 {
22393 rtx label = ix86_expand_aligntest (count, 8, true);
22394 if (TARGET_64BIT)
22395 {
22396 dest = change_address (destmem, DImode, destptr);
22397 emit_insn (gen_strset (destptr, dest, value));
22398 }
22399 else
22400 {
22401 dest = change_address (destmem, SImode, destptr);
22402 emit_insn (gen_strset (destptr, dest, value));
22403 emit_insn (gen_strset (destptr, dest, value));
22404 }
22405 emit_label (label);
22406 LABEL_NUSES (label) = 1;
22407 }
22408 if (max_size > 4)
22409 {
22410 rtx label = ix86_expand_aligntest (count, 4, true);
22411 dest = change_address (destmem, SImode, destptr);
22412 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22413 emit_label (label);
22414 LABEL_NUSES (label) = 1;
22415 }
22416 if (max_size > 2)
22417 {
22418 rtx label = ix86_expand_aligntest (count, 2, true);
22419 dest = change_address (destmem, HImode, destptr);
22420 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22421 emit_label (label);
22422 LABEL_NUSES (label) = 1;
22423 }
22424 if (max_size > 1)
22425 {
22426 rtx label = ix86_expand_aligntest (count, 1, true);
22427 dest = change_address (destmem, QImode, destptr);
22428 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22429 emit_label (label);
22430 LABEL_NUSES (label) = 1;
22431 }
22432 }
22433
22434 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22435 to DESIRED_ALIGNMENT. */
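/* A hedged example: with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits
   up to three conditional copies, testing bits 0, 1 and 2 of DESTPTR and
   copying 1, 2 and then 4 bytes so that DESTPTR ends up 8-byte aligned,
   decrementing COUNT accordingly.  */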
22436 static void
22437 expand_movmem_prologue (rtx destmem, rtx srcmem,
22438 rtx destptr, rtx srcptr, rtx count,
22439 int align, int desired_alignment)
22440 {
22441 if (align <= 1 && desired_alignment > 1)
22442 {
22443 rtx label = ix86_expand_aligntest (destptr, 1, false);
22444 srcmem = change_address (srcmem, QImode, srcptr);
22445 destmem = change_address (destmem, QImode, destptr);
22446 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22447 ix86_adjust_counter (count, 1);
22448 emit_label (label);
22449 LABEL_NUSES (label) = 1;
22450 }
22451 if (align <= 2 && desired_alignment > 2)
22452 {
22453 rtx label = ix86_expand_aligntest (destptr, 2, false);
22454 srcmem = change_address (srcmem, HImode, srcptr);
22455 destmem = change_address (destmem, HImode, destptr);
22456 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22457 ix86_adjust_counter (count, 2);
22458 emit_label (label);
22459 LABEL_NUSES (label) = 1;
22460 }
22461 if (align <= 4 && desired_alignment > 4)
22462 {
22463 rtx label = ix86_expand_aligntest (destptr, 4, false);
22464 srcmem = change_address (srcmem, SImode, srcptr);
22465 destmem = change_address (destmem, SImode, destptr);
22466 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22467 ix86_adjust_counter (count, 4);
22468 emit_label (label);
22469 LABEL_NUSES (label) = 1;
22470 }
22471 gcc_assert (desired_alignment <= 8);
22472 }
22473
22474 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22475 ALIGN_BYTES is how many bytes need to be copied. */
22476 static rtx
22477 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22478 int desired_align, int align_bytes)
22479 {
22480 rtx src = *srcp;
22481 rtx orig_dst = dst;
22482 rtx orig_src = src;
22483 int off = 0;
22484 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22485 if (src_align_bytes >= 0)
22486 src_align_bytes = desired_align - src_align_bytes;
22487 if (align_bytes & 1)
22488 {
22489 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22490 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22491 off = 1;
22492 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22493 }
22494 if (align_bytes & 2)
22495 {
22496 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22497 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22498 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22499 set_mem_align (dst, 2 * BITS_PER_UNIT);
22500 if (src_align_bytes >= 0
22501 && (src_align_bytes & 1) == (align_bytes & 1)
22502 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22503 set_mem_align (src, 2 * BITS_PER_UNIT);
22504 off = 2;
22505 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22506 }
22507 if (align_bytes & 4)
22508 {
22509 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22510 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22511 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22512 set_mem_align (dst, 4 * BITS_PER_UNIT);
22513 if (src_align_bytes >= 0)
22514 {
22515 unsigned int src_align = 0;
22516 if ((src_align_bytes & 3) == (align_bytes & 3))
22517 src_align = 4;
22518 else if ((src_align_bytes & 1) == (align_bytes & 1))
22519 src_align = 2;
22520 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22521 set_mem_align (src, src_align * BITS_PER_UNIT);
22522 }
22523 off = 4;
22524 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22525 }
22526 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22527 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22528 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22529 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22530 if (src_align_bytes >= 0)
22531 {
22532 unsigned int src_align = 0;
22533 if ((src_align_bytes & 7) == (align_bytes & 7))
22534 src_align = 8;
22535 else if ((src_align_bytes & 3) == (align_bytes & 3))
22536 src_align = 4;
22537 else if ((src_align_bytes & 1) == (align_bytes & 1))
22538 src_align = 2;
22539 if (src_align > (unsigned int) desired_align)
22540 src_align = desired_align;
22541 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22542 set_mem_align (src, src_align * BITS_PER_UNIT);
22543 }
22544 if (MEM_SIZE_KNOWN_P (orig_dst))
22545 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22546 if (MEM_SIZE_KNOWN_P (orig_src))
22547 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22548 *srcp = src;
22549 return dst;
22550 }
22551
22552 /* Store enough into DEST to align DEST, known to be aligned by ALIGN,
22553 to DESIRED_ALIGNMENT. */
22554 static void
22555 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22556 int align, int desired_alignment)
22557 {
22558 if (align <= 1 && desired_alignment > 1)
22559 {
22560 rtx label = ix86_expand_aligntest (destptr, 1, false);
22561 destmem = change_address (destmem, QImode, destptr);
22562 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22563 ix86_adjust_counter (count, 1);
22564 emit_label (label);
22565 LABEL_NUSES (label) = 1;
22566 }
22567 if (align <= 2 && desired_alignment > 2)
22568 {
22569 rtx label = ix86_expand_aligntest (destptr, 2, false);
22570 destmem = change_address (destmem, HImode, destptr);
22571 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22572 ix86_adjust_counter (count, 2);
22573 emit_label (label);
22574 LABEL_NUSES (label) = 1;
22575 }
22576 if (align <= 4 && desired_alignment > 4)
22577 {
22578 rtx label = ix86_expand_aligntest (destptr, 4, false);
22579 destmem = change_address (destmem, SImode, destptr);
22580 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22581 ix86_adjust_counter (count, 4);
22582 emit_label (label);
22583 LABEL_NUSES (label) = 1;
22584 }
22585 gcc_assert (desired_alignment <= 8);
22586 }
22587
22588 /* Store enough into DST to align DST, known to be aligned by ALIGN,
22589 to DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored. */
22590 static rtx
22591 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22592 int desired_align, int align_bytes)
22593 {
22594 int off = 0;
22595 rtx orig_dst = dst;
22596 if (align_bytes & 1)
22597 {
22598 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22599 off = 1;
22600 emit_insn (gen_strset (destreg, dst,
22601 gen_lowpart (QImode, value)));
22602 }
22603 if (align_bytes & 2)
22604 {
22605 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22606 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22607 set_mem_align (dst, 2 * BITS_PER_UNIT);
22608 off = 2;
22609 emit_insn (gen_strset (destreg, dst,
22610 gen_lowpart (HImode, value)));
22611 }
22612 if (align_bytes & 4)
22613 {
22614 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22615 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22616 set_mem_align (dst, 4 * BITS_PER_UNIT);
22617 off = 4;
22618 emit_insn (gen_strset (destreg, dst,
22619 gen_lowpart (SImode, value)));
22620 }
22621 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22622 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22623 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22624 if (MEM_SIZE_KNOWN_P (orig_dst))
22625 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22626 return dst;
22627 }
22628
22629 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
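/* Informal sketch (not an exhaustive description): the per-processor
   stringop_algs table (cost->memset[] or cost->memcpy[]) is scanned for the
   first entry whose size bound covers EXPECTED_SIZE and whose algorithm is
   usable; ix86_stringop_alg, when set, overrides the table, and rep-prefix
   algorithms are skipped if the registers they need are fixed.  */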
22630 static enum stringop_alg
22631 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22632 int *dynamic_check, bool *noalign)
22633 {
22634 const struct stringop_algs * algs;
22635 bool optimize_for_speed;
22636 /* Algorithms using the rep prefix want at least edi and ecx;
22637 additionally, memset wants eax and memcpy wants esi. Don't
22638 consider such algorithms if the user has appropriated those
22639 registers for their own purposes. */
22640 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22641 || (memset
22642 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22643 *noalign = false;
22644
22645 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22646 || (alg != rep_prefix_1_byte \
22647 && alg != rep_prefix_4_byte \
22648 && alg != rep_prefix_8_byte))
22649 const struct processor_costs *cost;
22650
22651 /* Even if the string operation call is cold, we still might spend a lot
22652 of time processing large blocks. */
22653 if (optimize_function_for_size_p (cfun)
22654 || (optimize_insn_for_size_p ()
22655 && expected_size != -1 && expected_size < 256))
22656 optimize_for_speed = false;
22657 else
22658 optimize_for_speed = true;
22659
22660 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22661
22662 *dynamic_check = -1;
22663 if (memset)
22664 algs = &cost->memset[TARGET_64BIT != 0];
22665 else
22666 algs = &cost->memcpy[TARGET_64BIT != 0];
22667 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22668 return ix86_stringop_alg;
22669 /* rep; movq or rep; movl is the smallest variant. */
22670 else if (!optimize_for_speed)
22671 {
22672 if (!count || (count & 3))
22673 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22674 else
22675 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22676 }
22677 /* Very tiny blocks are best handled via the loop; REP is expensive to
22678 set up. */
22679 else if (expected_size != -1 && expected_size < 4)
22680 return loop_1_byte;
22681 else if (expected_size != -1)
22682 {
22683 unsigned int i;
22684 enum stringop_alg alg = libcall;
22685 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22686 {
22687 /* We get here if the algorithms that were not libcall-based
22688 were rep-prefix based and we are unable to use rep prefixes
22689 based on global register usage. Break out of the loop and
22690 use the heuristic below. */
22691 if (algs->size[i].max == 0)
22692 break;
22693 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22694 {
22695 enum stringop_alg candidate = algs->size[i].alg;
22696
22697 if (candidate != libcall && ALG_USABLE_P (candidate))
22698 alg = candidate;
22699 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22700 last non-libcall inline algorithm. */
22701 if (TARGET_INLINE_ALL_STRINGOPS)
22702 {
22703 /* When the current size is best copied by a libcall, but we
22704 are still forced to inline, run the heuristic below that
22705 will pick code for medium-sized blocks. */
22706 if (alg != libcall)
22707 return alg;
22708 break;
22709 }
22710 else if (ALG_USABLE_P (candidate))
22711 {
22712 *noalign = algs->size[i].noalign;
22713 return candidate;
22714 }
22715 }
22716 }
22717 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22718 }
22719 /* When asked to inline the call anyway, try to pick a meaningful choice.
22720 We look for the maximal size of block that is faster to copy by hand and
22721 take blocks of at most that size, guessing that the average size will
22722 be roughly half of the block.
22723
22724 If this turns out to be bad, we might simply specify the preferred
22725 choice in ix86_costs. */
22726 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22727 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22728 {
22729 int max = -1;
22730 enum stringop_alg alg;
22731 int i;
22732 bool any_alg_usable_p = true;
22733
22734 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22735 {
22736 enum stringop_alg candidate = algs->size[i].alg;
22737 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22738
22739 if (candidate != libcall && candidate
22740 && ALG_USABLE_P (candidate))
22741 max = algs->size[i].max;
22742 }
22743 /* If there aren't any usable algorithms, then recursing on
22744 smaller sizes isn't going to find anything. Just return the
22745 simple byte-at-a-time copy loop. */
22746 if (!any_alg_usable_p)
22747 {
22748 /* Pick something reasonable. */
22749 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22750 *dynamic_check = 128;
22751 return loop_1_byte;
22752 }
22753 if (max == -1)
22754 max = 4096;
22755 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22756 gcc_assert (*dynamic_check == -1);
22757 gcc_assert (alg != libcall);
22758 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22759 *dynamic_check = max;
22760 return alg;
22761 }
22762 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22763 #undef ALG_USABLE_P
22764 }
22765
22766 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22767 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22768 static int
22769 decide_alignment (int align,
22770 enum stringop_alg alg,
22771 int expected_size)
22772 {
22773 int desired_align = 0;
22774 switch (alg)
22775 {
22776 case no_stringop:
22777 gcc_unreachable ();
22778 case loop:
22779 case unrolled_loop:
22780 desired_align = GET_MODE_SIZE (Pmode);
22781 break;
22782 case rep_prefix_8_byte:
22783 desired_align = 8;
22784 break;
22785 case rep_prefix_4_byte:
22786 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22787 copying a whole cache line at once. */
22788 if (TARGET_PENTIUMPRO)
22789 desired_align = 8;
22790 else
22791 desired_align = 4;
22792 break;
22793 case rep_prefix_1_byte:
22794 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
22795 copying a whole cache line at once. */
22796 if (TARGET_PENTIUMPRO)
22797 desired_align = 8;
22798 else
22799 desired_align = 1;
22800 break;
22801 case loop_1_byte:
22802 desired_align = 1;
22803 break;
22804 case libcall:
22805 return 0;
22806 }
22807
22808 if (optimize_size)
22809 desired_align = 1;
22810 if (desired_align < align)
22811 desired_align = align;
22812 if (expected_size != -1 && expected_size < 4)
22813 desired_align = align;
22814 return desired_align;
22815 }
22816
22817 /* Return the smallest power of 2 greater than VAL. */
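/* E.g. smallest_pow2_greater_than (4) == 8 and
   smallest_pow2_greater_than (7) == 8; the result is always strictly
   greater than VAL.  */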
22818 static int
22819 smallest_pow2_greater_than (int val)
22820 {
22821 int ret = 1;
22822 while (ret <= val)
22823 ret <<= 1;
22824 return ret;
22825 }
22826
22827 /* Expand string move (memcpy) operation. Use i386 string operations
22828 when profitable. expand_setmem contains similar code. The code
22829 depends upon architecture, block size and alignment, but always has
22830 the same overall structure:
22831
22832 1) Prologue guard: a conditional that jumps to the epilogue for small
22833 blocks that can be handled by the epilogue alone.  This is faster,
22834 but also needed for correctness, since the prologue assumes the block
22835 is larger than the desired alignment.
22836
22837 An optional dynamic size check and a libcall for large blocks
22838 are emitted here too, with -minline-stringops-dynamically.
22839
22840 2) Prologue: copy the first few bytes in order to get the destination
22841 aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
22842 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22843 copied.  We emit either a jump tree of power-of-two sized
22844 blocks, or a byte loop.
22845
22846 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22847 with specified algorithm.
22848
22849 4) Epilogue: code copying the tail of the block that is too small to be
22850 handled by the main body (or up to the size guarded by the prologue guard). */
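/* As an illustrative (non-normative) example: memcpy (dst, src, n) with
   unknown N, the unrolled_loop algorithm and DESIRED_ALIGN == 8 expands
   roughly to

     if (n < epilogue_size_needed) goto epilogue;   // step 1
     copy 1/2/4 bytes until dst is 8-byte aligned;  // step 2
     copy SIZE_NEEDED-byte chunks in a loop;        // step 3
   epilogue:
     copy the remaining n & (epilogue_size_needed - 1) bytes;  // step 4  */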
22851
22852 bool
22853 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22854 rtx expected_align_exp, rtx expected_size_exp)
22855 {
22856 rtx destreg;
22857 rtx srcreg;
22858 rtx label = NULL;
22859 rtx tmp;
22860 rtx jump_around_label = NULL;
22861 HOST_WIDE_INT align = 1;
22862 unsigned HOST_WIDE_INT count = 0;
22863 HOST_WIDE_INT expected_size = -1;
22864 int size_needed = 0, epilogue_size_needed;
22865 int desired_align = 0, align_bytes = 0;
22866 enum stringop_alg alg;
22867 int dynamic_check;
22868 bool need_zero_guard = false;
22869 bool noalign;
22870
22871 if (CONST_INT_P (align_exp))
22872 align = INTVAL (align_exp);
22873 /* i386 can do misaligned accesses at reasonably increased cost. */
22874 if (CONST_INT_P (expected_align_exp)
22875 && INTVAL (expected_align_exp) > align)
22876 align = INTVAL (expected_align_exp);
22877 /* ALIGN is the minimum of destination and source alignment, but we care here
22878 just about destination alignment. */
22879 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22880 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22881
22882 if (CONST_INT_P (count_exp))
22883 count = expected_size = INTVAL (count_exp);
22884 if (CONST_INT_P (expected_size_exp) && count == 0)
22885 expected_size = INTVAL (expected_size_exp);
22886
22887 /* Make sure we don't need to care about overflow later on. */
22888 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22889 return false;
22890
22891 /* Step 0: Decide on preferred algorithm, desired alignment and
22892 size of chunks to be copied by main loop. */
22893
22894 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22895 desired_align = decide_alignment (align, alg, expected_size);
22896
22897 if (!TARGET_ALIGN_STRINGOPS || noalign)
22898 align = desired_align;
22899
22900 if (alg == libcall)
22901 return false;
22902 gcc_assert (alg != no_stringop);
22903 if (!count)
22904 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22905 destreg = copy_addr_to_reg (XEXP (dst, 0));
22906 srcreg = copy_addr_to_reg (XEXP (src, 0));
22907 switch (alg)
22908 {
22909 case libcall:
22910 case no_stringop:
22911 gcc_unreachable ();
22912 case loop:
22913 need_zero_guard = true;
22914 size_needed = GET_MODE_SIZE (word_mode);
22915 break;
22916 case unrolled_loop:
22917 need_zero_guard = true;
22918 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22919 break;
22920 case rep_prefix_8_byte:
22921 size_needed = 8;
22922 break;
22923 case rep_prefix_4_byte:
22924 size_needed = 4;
22925 break;
22926 case rep_prefix_1_byte:
22927 size_needed = 1;
22928 break;
22929 case loop_1_byte:
22930 need_zero_guard = true;
22931 size_needed = 1;
22932 break;
22933 }
22934
22935 epilogue_size_needed = size_needed;
22936
22937 /* Step 1: Prologue guard. */
22938
22939 /* Alignment code needs count to be in register. */
22940 if (CONST_INT_P (count_exp) && desired_align > align)
22941 {
22942 if (INTVAL (count_exp) > desired_align
22943 && INTVAL (count_exp) > size_needed)
22944 {
22945 align_bytes
22946 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22947 if (align_bytes <= 0)
22948 align_bytes = 0;
22949 else
22950 align_bytes = desired_align - align_bytes;
22951 }
22952 if (align_bytes == 0)
22953 count_exp = force_reg (counter_mode (count_exp), count_exp);
22954 }
22955 gcc_assert (desired_align >= 1 && align >= 1);
22956
22957 /* Ensure that alignment prologue won't copy past end of block. */
22958 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22959 {
22960 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22961 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22962 Make sure it is a power of 2. */
22963 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22964
22965 if (count)
22966 {
22967 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22968 {
22969 /* If main algorithm works on QImode, no epilogue is needed.
22970 For small sizes just don't align anything. */
22971 if (size_needed == 1)
22972 desired_align = align;
22973 else
22974 goto epilogue;
22975 }
22976 }
22977 else
22978 {
22979 label = gen_label_rtx ();
22980 emit_cmp_and_jump_insns (count_exp,
22981 GEN_INT (epilogue_size_needed),
22982 LTU, 0, counter_mode (count_exp), 1, label);
22983 if (expected_size == -1 || expected_size < epilogue_size_needed)
22984 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22985 else
22986 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22987 }
22988 }
22989
22990 /* Emit code to decide at runtime whether a library call or inline code
22991 should be used. */
22992 if (dynamic_check != -1)
22993 {
22994 if (CONST_INT_P (count_exp))
22995 {
22996 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22997 {
22998 emit_block_move_via_libcall (dst, src, count_exp, false);
22999 count_exp = const0_rtx;
23000 goto epilogue;
23001 }
23002 }
23003 else
23004 {
23005 rtx hot_label = gen_label_rtx ();
23006 jump_around_label = gen_label_rtx ();
23007 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23008 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23009 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23010 emit_block_move_via_libcall (dst, src, count_exp, false);
23011 emit_jump (jump_around_label);
23012 emit_label (hot_label);
23013 }
23014 }
23015
23016 /* Step 2: Alignment prologue. */
23017
23018 if (desired_align > align)
23019 {
23020 if (align_bytes == 0)
23021 {
23022 /* Except for the first move in the epilogue, we no longer know
23023 the constant offset in the aliasing info.  It doesn't seem worth
23024 the pain to maintain it for the first move, so throw away
23025 the info early. */
23026 src = change_address (src, BLKmode, srcreg);
23027 dst = change_address (dst, BLKmode, destreg);
23028 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
23029 desired_align);
23030 }
23031 else
23032 {
23033 /* If we know how many bytes need to be copied before dst is
23034 sufficiently aligned, maintain aliasing info accurately. */
23035 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
23036 desired_align, align_bytes);
23037 count_exp = plus_constant (counter_mode (count_exp),
23038 count_exp, -align_bytes);
23039 count -= align_bytes;
23040 }
23041 if (need_zero_guard
23042 && (count < (unsigned HOST_WIDE_INT) size_needed
23043 || (align_bytes == 0
23044 && count < ((unsigned HOST_WIDE_INT) size_needed
23045 + desired_align - align))))
23046 {
23047 /* It is possible that we copied enough so the main loop will not
23048 execute. */
23049 gcc_assert (size_needed > 1);
23050 if (label == NULL_RTX)
23051 label = gen_label_rtx ();
23052 emit_cmp_and_jump_insns (count_exp,
23053 GEN_INT (size_needed),
23054 LTU, 0, counter_mode (count_exp), 1, label);
23055 if (expected_size == -1
23056 || expected_size < (desired_align - align) / 2 + size_needed)
23057 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23058 else
23059 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23060 }
23061 }
23062 if (label && size_needed == 1)
23063 {
23064 emit_label (label);
23065 LABEL_NUSES (label) = 1;
23066 label = NULL;
23067 epilogue_size_needed = 1;
23068 }
23069 else if (label == NULL_RTX)
23070 epilogue_size_needed = size_needed;
23071
23072 /* Step 3: Main loop. */
23073
23074 switch (alg)
23075 {
23076 case libcall:
23077 case no_stringop:
23078 gcc_unreachable ();
23079 case loop_1_byte:
23080 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23081 count_exp, QImode, 1, expected_size);
23082 break;
23083 case loop:
23084 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23085 count_exp, word_mode, 1, expected_size);
23086 break;
23087 case unrolled_loop:
23088 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
23089 registers for 4 temporaries anyway. */
23090 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
23091 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
23092 expected_size);
23093 break;
23094 case rep_prefix_8_byte:
23095 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23096 DImode);
23097 break;
23098 case rep_prefix_4_byte:
23099 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23100 SImode);
23101 break;
23102 case rep_prefix_1_byte:
23103 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
23104 QImode);
23105 break;
23106 }
23107 /* Properly adjust the offsets of the src and dest memory for aliasing. */
23108 if (CONST_INT_P (count_exp))
23109 {
23110 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23111 (count / size_needed) * size_needed);
23112 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23113 (count / size_needed) * size_needed);
23114 }
23115 else
23116 {
23117 src = change_address (src, BLKmode, srcreg);
23118 dst = change_address (dst, BLKmode, destreg);
23119 }
23120
23121 /* Step 4: Epilogue to copy the remaining bytes. */
23122 epilogue:
23123 if (label)
23124 {
23125 /* When the main loop is done, COUNT_EXP might hold the original count,
23126 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23127 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23128 bytes. Compensate if needed. */
23129
23130 if (size_needed < epilogue_size_needed)
23131 {
23132 tmp =
23133 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23134 GEN_INT (size_needed - 1), count_exp, 1,
23135 OPTAB_DIRECT);
23136 if (tmp != count_exp)
23137 emit_move_insn (count_exp, tmp);
23138 }
23139 emit_label (label);
23140 LABEL_NUSES (label) = 1;
23141 }
23142
23143 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23144 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
23145 epilogue_size_needed);
23146 if (jump_around_label)
23147 emit_label (jump_around_label);
23148 return true;
23149 }
23150
23151 /* Helper function for memset.  For the QImode value 0xXY produce
23152 0xXYXYXYXY of the width specified by MODE.  This is essentially
23153 a * 0x01010101, but we can do slightly better than
23154 synth_mult by unwinding the sequence by hand on CPUs with
23155 slow multiply. */
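/* For a non-constant SImode VAL the shift/or path below is, in effect,
   the following sketch (assumptions: QImode input, zero high bits):

     reg = (unsigned int) val;   // zero-extended QImode value
     reg |= reg << 8;
     reg |= reg << 16;

   with one more "reg |= reg << 32" step for DImode.  */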
23156 static rtx
23157 promote_duplicated_reg (enum machine_mode mode, rtx val)
23158 {
23159 enum machine_mode valmode = GET_MODE (val);
23160 rtx tmp;
23161 int nops = mode == DImode ? 3 : 2;
23162
23163 gcc_assert (mode == SImode || mode == DImode);
23164 if (val == const0_rtx)
23165 return copy_to_mode_reg (mode, const0_rtx);
23166 if (CONST_INT_P (val))
23167 {
23168 HOST_WIDE_INT v = INTVAL (val) & 255;
23169
23170 v |= v << 8;
23171 v |= v << 16;
23172 if (mode == DImode)
23173 v |= (v << 16) << 16;
23174 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23175 }
23176
23177 if (valmode == VOIDmode)
23178 valmode = QImode;
23179 if (valmode != QImode)
23180 val = gen_lowpart (QImode, val);
23181 if (mode == QImode)
23182 return val;
23183 if (!TARGET_PARTIAL_REG_STALL)
23184 nops--;
23185 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23186 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23187 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23188 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23189 {
23190 rtx reg = convert_modes (mode, QImode, val, true);
23191 tmp = promote_duplicated_reg (mode, const1_rtx);
23192 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23193 OPTAB_DIRECT);
23194 }
23195 else
23196 {
23197 rtx reg = convert_modes (mode, QImode, val, true);
23198
23199 if (!TARGET_PARTIAL_REG_STALL)
23200 if (mode == SImode)
23201 emit_insn (gen_movsi_insv_1 (reg, reg));
23202 else
23203 emit_insn (gen_movdi_insv_1 (reg, reg));
23204 else
23205 {
23206 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23207 NULL, 1, OPTAB_DIRECT);
23208 reg =
23209 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23210 }
23211 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23212 NULL, 1, OPTAB_DIRECT);
23213 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23214 if (mode == SImode)
23215 return reg;
23216 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23217 NULL, 1, OPTAB_DIRECT);
23218 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23219 return reg;
23220 }
23221 }
23222
23223 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
23224 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
23225 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
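/* E.g. (illustrative): on a 64-bit target with SIZE_NEEDED == 8 a QImode
   VAL of 0xab is widened to the DImode constant 0xabababababababab, so
   that both the main loop and the alignment prologue can reuse it.  */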
23226 static rtx
23227 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23228 {
23229 rtx promoted_val;
23230
23231 if (TARGET_64BIT
23232 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23233 promoted_val = promote_duplicated_reg (DImode, val);
23234 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23235 promoted_val = promote_duplicated_reg (SImode, val);
23236 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23237 promoted_val = promote_duplicated_reg (HImode, val);
23238 else
23239 promoted_val = val;
23240
23241 return promoted_val;
23242 }
23243
23244 /* Expand string set operation (memset / bzero).  Use i386 string operations
23245 when profitable.  See the ix86_expand_movmem comment for an explanation of
23246 the individual steps performed. */
23247 bool
23248 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23249 rtx expected_align_exp, rtx expected_size_exp)
23250 {
23251 rtx destreg;
23252 rtx label = NULL;
23253 rtx tmp;
23254 rtx jump_around_label = NULL;
23255 HOST_WIDE_INT align = 1;
23256 unsigned HOST_WIDE_INT count = 0;
23257 HOST_WIDE_INT expected_size = -1;
23258 int size_needed = 0, epilogue_size_needed;
23259 int desired_align = 0, align_bytes = 0;
23260 enum stringop_alg alg;
23261 rtx promoted_val = NULL;
23262 bool force_loopy_epilogue = false;
23263 int dynamic_check;
23264 bool need_zero_guard = false;
23265 bool noalign;
23266
23267 if (CONST_INT_P (align_exp))
23268 align = INTVAL (align_exp);
23269 /* i386 can do misaligned accesses at reasonably increased cost. */
23270 if (CONST_INT_P (expected_align_exp)
23271 && INTVAL (expected_align_exp) > align)
23272 align = INTVAL (expected_align_exp);
23273 if (CONST_INT_P (count_exp))
23274 count = expected_size = INTVAL (count_exp);
23275 if (CONST_INT_P (expected_size_exp) && count == 0)
23276 expected_size = INTVAL (expected_size_exp);
23277
23278 /* Make sure we don't need to care about overflow later on. */
23279 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23280 return false;
23281
23282 /* Step 0: Decide on preferred algorithm, desired alignment and
23283 size of chunks to be copied by main loop. */
23284
23285 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23286 desired_align = decide_alignment (align, alg, expected_size);
23287
23288 if (!TARGET_ALIGN_STRINGOPS || noalign)
23289 align = desired_align;
23290
23291 if (alg == libcall)
23292 return false;
23293 gcc_assert (alg != no_stringop);
23294 if (!count)
23295 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23296 destreg = copy_addr_to_reg (XEXP (dst, 0));
23297 switch (alg)
23298 {
23299 case libcall:
23300 case no_stringop:
23301 gcc_unreachable ();
23302 case loop:
23303 need_zero_guard = true;
23304 size_needed = GET_MODE_SIZE (word_mode);
23305 break;
23306 case unrolled_loop:
23307 need_zero_guard = true;
23308 size_needed = GET_MODE_SIZE (word_mode) * 4;
23309 break;
23310 case rep_prefix_8_byte:
23311 size_needed = 8;
23312 break;
23313 case rep_prefix_4_byte:
23314 size_needed = 4;
23315 break;
23316 case rep_prefix_1_byte:
23317 size_needed = 1;
23318 break;
23319 case loop_1_byte:
23320 need_zero_guard = true;
23321 size_needed = 1;
23322 break;
23323 }
23324 epilogue_size_needed = size_needed;
23325
23326 /* Step 1: Prologue guard. */
23327
23328 /* Alignment code needs count to be in register. */
23329 if (CONST_INT_P (count_exp) && desired_align > align)
23330 {
23331 if (INTVAL (count_exp) > desired_align
23332 && INTVAL (count_exp) > size_needed)
23333 {
23334 align_bytes
23335 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23336 if (align_bytes <= 0)
23337 align_bytes = 0;
23338 else
23339 align_bytes = desired_align - align_bytes;
23340 }
23341 if (align_bytes == 0)
23342 {
23343 enum machine_mode mode = SImode;
23344 if (TARGET_64BIT && (count & ~0xffffffff))
23345 mode = DImode;
23346 count_exp = force_reg (mode, count_exp);
23347 }
23348 }
23349 /* Do the cheap promotion to allow better CSE across the
23350 main loop and epilogue (i.e. one load of the big constant in
23351 front of all the code). */
23352 if (CONST_INT_P (val_exp))
23353 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23354 desired_align, align);
23355 /* Ensure that alignment prologue won't copy past end of block. */
23356 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23357 {
23358 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23359 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23360 Make sure it is a power of 2. */
23361 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23362
23363 /* To improve performance of small blocks, we jump around the VAL
23364 promotion.  This means that if the promoted VAL is not constant,
23365 we might not use it in the epilogue and have to fall back to the byte
23366 loop variant. */
23367 if (epilogue_size_needed > 2 && !promoted_val)
23368 force_loopy_epilogue = true;
23369 if (count)
23370 {
23371 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23372 {
23373 /* If main algorithm works on QImode, no epilogue is needed.
23374 For small sizes just don't align anything. */
23375 if (size_needed == 1)
23376 desired_align = align;
23377 else
23378 goto epilogue;
23379 }
23380 }
23381 else
23382 {
23383 label = gen_label_rtx ();
23384 emit_cmp_and_jump_insns (count_exp,
23385 GEN_INT (epilogue_size_needed),
23386 LTU, 0, counter_mode (count_exp), 1, label);
23387 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23388 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23389 else
23390 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23391 }
23392 }
23393 if (dynamic_check != -1)
23394 {
23395 rtx hot_label = gen_label_rtx ();
23396 jump_around_label = gen_label_rtx ();
23397 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23398 LEU, 0, counter_mode (count_exp), 1, hot_label);
23399 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23400 set_storage_via_libcall (dst, count_exp, val_exp, false);
23401 emit_jump (jump_around_label);
23402 emit_label (hot_label);
23403 }
23404
23405 /* Step 2: Alignment prologue. */
23406
23407 /* Do the expensive promotion once we have branched off the small blocks. */
23408 if (!promoted_val)
23409 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23410 desired_align, align);
23411 gcc_assert (desired_align >= 1 && align >= 1);
23412
23413 if (desired_align > align)
23414 {
23415 if (align_bytes == 0)
23416 {
23417 /* Except for the first move in the epilogue, we no longer know
23418 the constant offset in the aliasing info. It doesn't seem worth
23419 the pain to maintain it for the first move, so throw away
23420 the info early. */
23421 dst = change_address (dst, BLKmode, destreg);
23422 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23423 desired_align);
23424 }
23425 else
23426 {
23427 /* If we know how many bytes need to be stored before dst is
23428 sufficiently aligned, maintain aliasing info accurately. */
23429 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23430 desired_align, align_bytes);
23431 count_exp = plus_constant (counter_mode (count_exp),
23432 count_exp, -align_bytes);
23433 count -= align_bytes;
23434 }
23435 if (need_zero_guard
23436 && (count < (unsigned HOST_WIDE_INT) size_needed
23437 || (align_bytes == 0
23438 && count < ((unsigned HOST_WIDE_INT) size_needed
23439 + desired_align - align))))
23440 {
23441 /* It is possible that we copied enough so the main loop will not
23442 execute. */
23443 gcc_assert (size_needed > 1);
23444 if (label == NULL_RTX)
23445 label = gen_label_rtx ();
23446 emit_cmp_and_jump_insns (count_exp,
23447 GEN_INT (size_needed),
23448 LTU, 0, counter_mode (count_exp), 1, label);
23449 if (expected_size == -1
23450 || expected_size < (desired_align - align) / 2 + size_needed)
23451 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23452 else
23453 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23454 }
23455 }
23456 if (label && size_needed == 1)
23457 {
23458 emit_label (label);
23459 LABEL_NUSES (label) = 1;
23460 label = NULL;
23461 promoted_val = val_exp;
23462 epilogue_size_needed = 1;
23463 }
23464 else if (label == NULL_RTX)
23465 epilogue_size_needed = size_needed;
23466
23467 /* Step 3: Main loop. */
23468
23469 switch (alg)
23470 {
23471 case libcall:
23472 case no_stringop:
23473 gcc_unreachable ();
23474 case loop_1_byte:
23475 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23476 count_exp, QImode, 1, expected_size);
23477 break;
23478 case loop:
23479 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23480 count_exp, word_mode, 1, expected_size);
23481 break;
23482 case unrolled_loop:
23483 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23484 count_exp, word_mode, 4, expected_size);
23485 break;
23486 case rep_prefix_8_byte:
23487 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23488 DImode, val_exp);
23489 break;
23490 case rep_prefix_4_byte:
23491 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23492 SImode, val_exp);
23493 break;
23494 case rep_prefix_1_byte:
23495 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23496 QImode, val_exp);
23497 break;
23498 }
23499 /* Properly adjust the offset of the destination memory for aliasing. */
23500 if (CONST_INT_P (count_exp))
23501 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23502 (count / size_needed) * size_needed);
23503 else
23504 dst = change_address (dst, BLKmode, destreg);
23505
23506 /* Step 4: Epilogue to copy the remaining bytes. */
23507
23508 if (label)
23509 {
23510 /* When the main loop is done, COUNT_EXP might hold the original count,
23511 while we want to store only the remaining COUNT_EXP & (SIZE_NEEDED - 1)
23512 bytes. Epilogue code will actually store COUNT_EXP &
23513 (EPILOGUE_SIZE_NEEDED - 1) bytes. Compensate if needed. */
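/* A hypothetical example: if SIZE_NEEDED is 4 and EPILOGUE_SIZE_NEEDED
   is 8, the main loop has already stored all multiples of 4, so only
   COUNT & 3 bytes remain; mask the count down before the epilogue.  */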
23514
23515 if (size_needed < epilogue_size_needed)
23516 {
23517 tmp =
23518 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23519 GEN_INT (size_needed - 1), count_exp, 1,
23520 OPTAB_DIRECT);
23521 if (tmp != count_exp)
23522 emit_move_insn (count_exp, tmp);
23523 }
23524 emit_label (label);
23525 LABEL_NUSES (label) = 1;
23526 }
23527 epilogue:
23528 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23529 {
23530 if (force_loopy_epilogue)
23531 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23532 epilogue_size_needed);
23533 else
23534 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23535 epilogue_size_needed);
23536 }
23537 if (jump_around_label)
23538 emit_label (jump_around_label);
23539 return true;
23540 }
23541
23542 /* Expand the appropriate insns for doing strlen if not just doing
23543 repnz; scasb
23544
23545 out = result, initialized with the start address
23546 align_rtx = alignment of the address.
23547 scratch = scratch register, initialized with the start address when
23548 not aligned, otherwise undefined
23549
23550 This is just the body. It needs the initializations mentioned above and
23551 some address computation at the end. These things are done in i386.md. */
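/* Sketch of the algorithm as implemented below: compare up to three
   leading bytes one at a time until OUT is 4-byte aligned, then scan a
   word at a time using the (word - 0x01010101) & ~word & 0x80808080
   zero-byte test, and finally adjust OUT to point at the terminating
   zero byte within the last word.  */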
23552
23553 static void
23554 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23555 {
23556 int align;
23557 rtx tmp;
23558 rtx align_2_label = NULL_RTX;
23559 rtx align_3_label = NULL_RTX;
23560 rtx align_4_label = gen_label_rtx ();
23561 rtx end_0_label = gen_label_rtx ();
23562 rtx mem;
23563 rtx tmpreg = gen_reg_rtx (SImode);
23564 rtx scratch = gen_reg_rtx (SImode);
23565 rtx cmp;
23566
23567 align = 0;
23568 if (CONST_INT_P (align_rtx))
23569 align = INTVAL (align_rtx);
23570
23571 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23572
23573 /* Is there a known alignment and is it less than 4? */
23574 if (align < 4)
23575 {
23576 rtx scratch1 = gen_reg_rtx (Pmode);
23577 emit_move_insn (scratch1, out);
23578 /* Is there a known alignment and is it not 2? */
23579 if (align != 2)
23580 {
23581 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23582 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23583
23584 /* Leave just the 3 lower bits. */
23585 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23586 NULL_RTX, 0, OPTAB_WIDEN);
23587
23588 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23589 Pmode, 1, align_4_label);
23590 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23591 Pmode, 1, align_2_label);
23592 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23593 Pmode, 1, align_3_label);
23594 }
23595 else
23596 {
23597 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23598 check whether it is aligned to a 4-byte boundary. */
23599
23600 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23601 NULL_RTX, 0, OPTAB_WIDEN);
23602
23603 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23604 Pmode, 1, align_4_label);
23605 }
23606
23607 mem = change_address (src, QImode, out);
23608
23609 /* Now compare the bytes. */
23610
23611 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23612 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23613 QImode, 1, end_0_label);
23614
23615 /* Increment the address. */
23616 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23617
23618 /* Not needed with an alignment of 2. */
23619 if (align != 2)
23620 {
23621 emit_label (align_2_label);
23622
23623 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23624 end_0_label);
23625
23626 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23627
23628 emit_label (align_3_label);
23629 }
23630
23631 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23632 end_0_label);
23633
23634 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23635 }
23636
23637 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23638 align this loop: doing so only enlarges the code and does not help
23639 speed. */
23640 emit_label (align_4_label);
23641
23642 mem = change_address (src, SImode, out);
23643 emit_move_insn (scratch, mem);
23644 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23645
23646 /* This formula yields a nonzero result iff one of the bytes is zero.
23647 This saves three branches inside the loop and many cycles. */
23648
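/* A hypothetical worked example: for scratch == 0x12003456,
   (scratch - 0x01010101) & ~scratch & 0x80808080 == 0x00800000, flagging
   the zero byte; for scratch == 0x01020304 the result is 0.  */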
23649 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23650 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23651 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23652 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23653 gen_int_mode (0x80808080, SImode)));
23654 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23655 align_4_label);
23656
23657 if (TARGET_CMOVE)
23658 {
23659 rtx reg = gen_reg_rtx (SImode);
23660 rtx reg2 = gen_reg_rtx (Pmode);
23661 emit_move_insn (reg, tmpreg);
23662 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23663
23664 /* If zero is not in the first two bytes, move two bytes forward. */
23665 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23666 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23667 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23668 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23669 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23670 reg,
23671 tmpreg)));
23672 /* Emit lea manually to avoid clobbering of flags. */
23673 emit_insn (gen_rtx_SET (SImode, reg2,
23674 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23675
23676 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23677 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23678 emit_insn (gen_rtx_SET (VOIDmode, out,
23679 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23680 reg2,
23681 out)));
23682 }
23683 else
23684 {
23685 rtx end_2_label = gen_label_rtx ();
23686 /* Is zero in the first two bytes? */
23687
23688 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23689 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23690 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23691 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23692 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23693 pc_rtx);
23694 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23695 JUMP_LABEL (tmp) = end_2_label;
23696
23697 /* Not in the first two. Move two bytes forward. */
23698 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23699 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23700
23701 emit_label (end_2_label);
23702
23703 }
23704
23705 /* Avoid branch in fixing the byte. */
23706 tmpreg = gen_lowpart (QImode, tmpreg);
23707 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23708 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23709 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23710 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23711
23712 emit_label (end_0_label);
23713 }
23714
23715 /* Expand strlen. */
23716
23717 bool
23718 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23719 {
23720 rtx addr, scratch1, scratch2, scratch3, scratch4;
23721
23722 /* The generic case of the strlen expander is long. Avoid
23723 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
23724
23725 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23726 && !TARGET_INLINE_ALL_STRINGOPS
23727 && !optimize_insn_for_size_p ()
23728 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23729 return false;
23730
23731 addr = force_reg (Pmode, XEXP (src, 0));
23732 scratch1 = gen_reg_rtx (Pmode);
23733
23734 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23735 && !optimize_insn_for_size_p ())
23736 {
23737 /* It seems that some optimizer does not combine a call like
23738 foo(strlen(bar), strlen(bar));
23739 when the move and the subtraction are done here. It does calculate
23740 the length just once when these instructions are done inside of
23741 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
23742 and this uses one fewer register for the lifetime of
23743 output_strlen_unroll(), this is better. */
23744
23745 emit_move_insn (out, addr);
23746
23747 ix86_expand_strlensi_unroll_1 (out, src, align);
23748
23749 /* strlensi_unroll_1 returns the address of the zero at the end of
23750 the string, like memchr(), so compute the length by subtracting
23751 the start address. */
23752 emit_insn (ix86_gen_sub3 (out, out, addr));
23753 }
23754 else
23755 {
23756 rtx unspec;
23757
23758 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23759 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23760 return false;
23761
23762 scratch2 = gen_reg_rtx (Pmode);
23763 scratch3 = gen_reg_rtx (Pmode);
23764 scratch4 = force_reg (Pmode, constm1_rtx);
23765
23766 emit_move_insn (scratch3, addr);
23767 eoschar = force_reg (QImode, eoschar);
23768
23769 src = replace_equiv_address_nv (src, scratch3);
23770
23771 /* If .md starts supporting :P, this can be done in .md. */
23772 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23773 scratch4), UNSPEC_SCAS);
23774 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23775 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23776 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23777 }
23778 return true;
23779 }
23780
23781 /* For a given symbol (function), construct code to compute the address
23782 of its PLT entry in the large x86-64 PIC model. */
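/* A rough sketch of the intended sequence (assuming the usual large-PIC
   code generation): load the symbol's @PLTOFF constant into a temporary
   register and add the PIC base register to it.  */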
23783 static rtx
23784 construct_plt_address (rtx symbol)
23785 {
23786 rtx tmp, unspec;
23787
23788 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23789 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
23790 gcc_assert (Pmode == DImode);
23791
23792 tmp = gen_reg_rtx (Pmode);
23793 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23794
23795 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23796 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23797 return tmp;
23798 }
23799
23800 rtx
23801 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23802 rtx callarg2,
23803 rtx pop, bool sibcall)
23804 {
23805 unsigned int const cregs_size
23806 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
23807 rtx vec[3 + cregs_size];
23808 rtx use = NULL, call;
23809 unsigned int vec_len = 0;
23810
23811 if (pop == const0_rtx)
23812 pop = NULL;
23813 gcc_assert (!TARGET_64BIT || !pop);
23814
23815 if (TARGET_MACHO && !TARGET_64BIT)
23816 {
23817 #if TARGET_MACHO
23818 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23819 fnaddr = machopic_indirect_call_target (fnaddr);
23820 #endif
23821 }
23822 else
23823 {
23824 /* Static functions and indirect calls don't need the pic register. */
23825 if (flag_pic
23826 && (!TARGET_64BIT
23827 || (ix86_cmodel == CM_LARGE_PIC
23828 && DEFAULT_ABI != MS_ABI))
23829 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23830 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23831 use_reg (&use, pic_offset_table_rtx);
23832 }
23833
23834 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23835 {
23836 rtx al = gen_rtx_REG (QImode, AX_REG);
23837 emit_move_insn (al, callarg2);
23838 use_reg (&use, al);
23839 }
23840
23841 if (ix86_cmodel == CM_LARGE_PIC
23842 && !TARGET_PECOFF
23843 && MEM_P (fnaddr)
23844 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23845 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23846 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23847 else if (sibcall
23848 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23849 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23850 {
23851 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23852 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23853 }
23854
23855 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23856 if (retval)
23857 call = gen_rtx_SET (VOIDmode, retval, call);
23858 vec[vec_len++] = call;
23859
23860 if (pop)
23861 {
23862 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23863 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23864 vec[vec_len++] = pop;
23865 }
23866
23867 if (TARGET_64BIT_MS_ABI
23868 && (!callarg2 || INTVAL (callarg2) != -2))
23869 {
23870 unsigned i;
23871
23872 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23873 UNSPEC_MS_TO_SYSV_CALL);
23874
23875 for (i = 0; i < cregs_size; i++)
23876 {
23877 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
23878 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
23879
23880 vec[vec_len++]
23881 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
23882 }
23883 }
23884
23885 if (vec_len > 1)
23886 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23887 call = emit_call_insn (call);
23888 if (use)
23889 CALL_INSN_FUNCTION_USAGE (call) = use;
23890
23891 return call;
23892 }
23893
23894 /* Output the assembly for a call instruction. */
23895
23896 const char *
23897 ix86_output_call_insn (rtx insn, rtx call_op)
23898 {
23899 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23900 bool seh_nop_p = false;
23901 const char *xasm;
23902
23903 if (SIBLING_CALL_P (insn))
23904 {
23905 if (direct_p)
23906 xasm = "jmp\t%P0";
23907 /* SEH epilogue detection requires the indirect branch case
23908 to include REX.W. */
23909 else if (TARGET_SEH)
23910 xasm = "rex.W jmp %A0";
23911 else
23912 xasm = "jmp\t%A0";
23913
23914 output_asm_insn (xasm, &call_op);
23915 return "";
23916 }
23917
23918 /* SEH unwinding can require an extra nop to be emitted in several
23919 circumstances. Determine if we have one of those. */
23920 if (TARGET_SEH)
23921 {
23922 rtx i;
23923
23924 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23925 {
23926 /* If we get to another real insn, we don't need the nop. */
23927 if (INSN_P (i))
23928 break;
23929
23930 /* If we get to the epilogue note, prevent a catch region from
23931 being adjacent to the standard epilogue sequence. With non-call
23932 exceptions, we'll have done this during epilogue emission. */
23933 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23934 && !flag_non_call_exceptions
23935 && !can_throw_internal (insn))
23936 {
23937 seh_nop_p = true;
23938 break;
23939 }
23940 }
23941
23942 /* If we didn't find a real insn following the call, prevent the
23943 unwinder from looking into the next function. */
23944 if (i == NULL)
23945 seh_nop_p = true;
23946 }
23947
23948 if (direct_p)
23949 xasm = "call\t%P0";
23950 else
23951 xasm = "call\t%A0";
23952
23953 output_asm_insn (xasm, &call_op);
23954
23955 if (seh_nop_p)
23956 return "nop";
23957
23958 return "";
23959 }
23960 \f
23961 /* Clear stack slot assignments remembered from previous functions.
23962 This is called from INIT_EXPANDERS once before RTL is emitted for each
23963 function. */
23964
23965 static struct machine_function *
23966 ix86_init_machine_status (void)
23967 {
23968 struct machine_function *f;
23969
23970 f = ggc_alloc_cleared_machine_function ();
23971 f->use_fast_prologue_epilogue_nregs = -1;
23972 f->call_abi = ix86_abi;
23973
23974 return f;
23975 }
23976
23977 /* Return a MEM corresponding to a stack slot with mode MODE.
23978 Allocate a new slot if necessary.
23979
23980 The RTL for a function can have several slots available: N is
23981 which slot to use. */
23982
23983 rtx
23984 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23985 {
23986 struct stack_local_entry *s;
23987
23988 gcc_assert (n < MAX_386_STACK_LOCALS);
23989
23990 for (s = ix86_stack_locals; s; s = s->next)
23991 if (s->mode == mode && s->n == n)
23992 return validize_mem (copy_rtx (s->rtl));
23993
23994 s = ggc_alloc_stack_local_entry ();
23995 s->n = n;
23996 s->mode = mode;
23997 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23998
23999 s->next = ix86_stack_locals;
24000 ix86_stack_locals = s;
24001 return validize_mem (s->rtl);
24002 }
24003
24004 static void
24005 ix86_instantiate_decls (void)
24006 {
24007 struct stack_local_entry *s;
24008
24009 for (s = ix86_stack_locals; s; s = s->next)
24010 if (s->rtl != NULL_RTX)
24011 instantiate_decl_rtl (s->rtl);
24012 }
24013 \f
24014 /* Calculate the length of the memory address in the instruction encoding.
24015 Includes the addr32 prefix, but does not include the one-byte modrm,
24016 opcode, or other prefixes. We never generate an addr32 prefix for the LEA insn. */
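/* A hypothetical example: for the 32-bit address 16(%esp,%ebx,4) this
   returns 2, counting the SIB byte (required by the index) plus the
   one-byte displacement, but not the modrm byte itself.  */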
24017
24018 int
24019 memory_address_length (rtx addr, bool lea)
24020 {
24021 struct ix86_address parts;
24022 rtx base, index, disp;
24023 int len;
24024 int ok;
24025
24026 if (GET_CODE (addr) == PRE_DEC
24027 || GET_CODE (addr) == POST_INC
24028 || GET_CODE (addr) == PRE_MODIFY
24029 || GET_CODE (addr) == POST_MODIFY)
24030 return 0;
24031
24032 ok = ix86_decompose_address (addr, &parts);
24033 gcc_assert (ok);
24034
24035 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24036
24037 /* If this is not LEA instruction, add the length of addr32 prefix. */
24038 if (TARGET_64BIT && !lea
24039 && (SImode_address_operand (addr, VOIDmode)
24040 || (parts.base && GET_MODE (parts.base) == SImode)
24041 || (parts.index && GET_MODE (parts.index) == SImode)))
24042 len++;
24043
24044 base = parts.base;
24045 index = parts.index;
24046 disp = parts.disp;
24047
24048 if (base && GET_CODE (base) == SUBREG)
24049 base = SUBREG_REG (base);
24050 if (index && GET_CODE (index) == SUBREG)
24051 index = SUBREG_REG (index);
24052
24053 gcc_assert (base == NULL_RTX || REG_P (base));
24054 gcc_assert (index == NULL_RTX || REG_P (index));
24055
24056 /* Rule of thumb:
24057 - esp as the base always wants an index,
24058 - ebp as the base always wants a displacement,
24059 - r12 as the base always wants an index,
24060 - r13 as the base always wants a displacement. */
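/* For instance, a plain (%esp) reference needs a SIB byte and a plain
   (%ebp) reference needs a one-byte displacement of 0, so both cost one
   byte more than, say, (%eax).  */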
24061
24062 /* Register Indirect. */
24063 if (base && !index && !disp)
24064 {
24065 /* esp (for its index) and ebp (for its displacement) need
24066 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24067 code. */
24068 if (base == arg_pointer_rtx
24069 || base == frame_pointer_rtx
24070 || REGNO (base) == SP_REG
24071 || REGNO (base) == BP_REG
24072 || REGNO (base) == R12_REG
24073 || REGNO (base) == R13_REG)
24074 len++;
24075 }
24076
24077 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24078 is not disp32, but disp32(%rip), so for disp32 a
24079 SIB byte is needed, unless print_operand_address
24080 optimizes it into disp32(%rip) or (%rip) is implied
24081 by the UNSPEC. */
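/* A hypothetical example: in 64-bit mode a constant absolute address such
   as movl 0x1234, %eax needs the extra SIB byte for its disp32, whereas a
   symbolic reference emitted as foo(%rip) does not; the len++ below only
   covers the former cases.  */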
24082 else if (disp && !base && !index)
24083 {
24084 len += 4;
24085 if (TARGET_64BIT)
24086 {
24087 rtx symbol = disp;
24088
24089 if (GET_CODE (disp) == CONST)
24090 symbol = XEXP (disp, 0);
24091 if (GET_CODE (symbol) == PLUS
24092 && CONST_INT_P (XEXP (symbol, 1)))
24093 symbol = XEXP (symbol, 0);
24094
24095 if (GET_CODE (symbol) != LABEL_REF
24096 && (GET_CODE (symbol) != SYMBOL_REF
24097 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24098 && (GET_CODE (symbol) != UNSPEC
24099 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24100 && XINT (symbol, 1) != UNSPEC_PCREL
24101 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24102 len++;
24103 }
24104 }
24105 else
24106 {
24107 /* Find the length of the displacement constant. */
24108 if (disp)
24109 {
24110 if (base && satisfies_constraint_K (disp))
24111 len += 1;
24112 else
24113 len += 4;
24114 }
24115 /* ebp always wants a displacement. Similarly r13. */
24116 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24117 len++;
24118
24119 /* An index requires the two-byte modrm form.... */
24120 if (index
24121 /* ...like esp (or r12), which always wants an index. */
24122 || base == arg_pointer_rtx
24123 || base == frame_pointer_rtx
24124 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24125 len++;
24126 }
24127
24128 return len;
24129 }
24130
24131 /* Compute the default value for the "length_immediate" attribute. When
24132 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
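/* For example (hypothetical), "add $5, %eax" can use the sign-extended
   8-bit immediate encoding, so an immediate in [-128, 127] contributes
   one byte instead of four when SHORTFORM is set.  */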
24133 int
24134 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24135 {
24136 int len = 0;
24137 int i;
24138 extract_insn_cached (insn);
24139 for (i = recog_data.n_operands - 1; i >= 0; --i)
24140 if (CONSTANT_P (recog_data.operand[i]))
24141 {
24142 enum attr_mode mode = get_attr_mode (insn);
24143
24144 gcc_assert (!len);
24145 if (shortform && CONST_INT_P (recog_data.operand[i]))
24146 {
24147 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24148 switch (mode)
24149 {
24150 case MODE_QI:
24151 len = 1;
24152 continue;
24153 case MODE_HI:
24154 ival = trunc_int_for_mode (ival, HImode);
24155 break;
24156 case MODE_SI:
24157 ival = trunc_int_for_mode (ival, SImode);
24158 break;
24159 default:
24160 break;
24161 }
24162 if (IN_RANGE (ival, -128, 127))
24163 {
24164 len = 1;
24165 continue;
24166 }
24167 }
24168 switch (mode)
24169 {
24170 case MODE_QI:
24171 len = 1;
24172 break;
24173 case MODE_HI:
24174 len = 2;
24175 break;
24176 case MODE_SI:
24177 len = 4;
24178 break;
24179 /* Immediates for DImode instructions are encoded
24180 as 32-bit sign-extended values. */
24181 case MODE_DI:
24182 len = 4;
24183 break;
24184 default:
24185 fatal_insn ("unknown insn mode", insn);
24186 }
24187 }
24188 return len;
24189 }
24190
24191 /* Compute default value for "length_address" attribute. */
24192 int
24193 ix86_attr_length_address_default (rtx insn)
24194 {
24195 int i;
24196
24197 if (get_attr_type (insn) == TYPE_LEA)
24198 {
24199 rtx set = PATTERN (insn), addr;
24200
24201 if (GET_CODE (set) == PARALLEL)
24202 set = XVECEXP (set, 0, 0);
24203
24204 gcc_assert (GET_CODE (set) == SET);
24205
24206 addr = SET_SRC (set);
24207
24208 return memory_address_length (addr, true);
24209 }
24210
24211 extract_insn_cached (insn);
24212 for (i = recog_data.n_operands - 1; i >= 0; --i)
24213 if (MEM_P (recog_data.operand[i]))
24214 {
24215 constrain_operands_cached (reload_completed);
24216 if (which_alternative != -1)
24217 {
24218 const char *constraints = recog_data.constraints[i];
24219 int alt = which_alternative;
24220
24221 while (*constraints == '=' || *constraints == '+')
24222 constraints++;
24223 while (alt-- > 0)
24224 while (*constraints++ != ',')
24225 ;
24226 /* Skip ignored operands. */
24227 if (*constraints == 'X')
24228 continue;
24229 }
24230 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24231 }
24232 return 0;
24233 }
24234
24235 /* Compute the default value for the "length_vex" attribute. It includes
24236 the 2- or 3-byte VEX prefix and 1 opcode byte. */
24237
24238 int
24239 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24240 {
24241 int i;
24242
24243 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W
24244 bit requires the 3-byte VEX prefix. */
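/* As a hypothetical example, a 0f-map instruction such as
   vaddps %xmm1, %xmm2, %xmm3 fits in the 2-byte (C5) prefix, while an
   instruction whose r/m or index operand uses an extended register
   (and therefore needs REX.X or REX.B) requires the 3-byte (C4) form.  */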
24245 if (!has_0f_opcode || has_vex_w)
24246 return 3 + 1;
24247
24248 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24249 if (!TARGET_64BIT)
24250 return 2 + 1;
24251
24252 extract_insn_cached (insn);
24253
24254 for (i = recog_data.n_operands - 1; i >= 0; --i)
24255 if (REG_P (recog_data.operand[i]))
24256 {
24257 /* The REX.W bit requires the 3-byte VEX prefix. */
24258 if (GET_MODE (recog_data.operand[i]) == DImode
24259 && GENERAL_REG_P (recog_data.operand[i]))
24260 return 3 + 1;
24261 }
24262 else
24263 {
24264 /* The REX.X or REX.B bits require the 3-byte VEX prefix. */
24265 if (MEM_P (recog_data.operand[i])
24266 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24267 return 3 + 1;
24268 }
24269
24270 return 2 + 1;
24271 }
24272 \f
24273 /* Return the maximum number of instructions a cpu can issue. */
24274
24275 static int
24276 ix86_issue_rate (void)
24277 {
24278 switch (ix86_tune)
24279 {
24280 case PROCESSOR_PENTIUM:
24281 case PROCESSOR_ATOM:
24282 case PROCESSOR_SLM:
24283 case PROCESSOR_K6:
24284 case PROCESSOR_BTVER2:
24285 return 2;
24286
24287 case PROCESSOR_PENTIUMPRO:
24288 case PROCESSOR_PENTIUM4:
24289 case PROCESSOR_CORE2:
24290 case PROCESSOR_COREI7:
24291 case PROCESSOR_HASWELL:
24292 case PROCESSOR_ATHLON:
24293 case PROCESSOR_K8:
24294 case PROCESSOR_AMDFAM10:
24295 case PROCESSOR_NOCONA:
24296 case PROCESSOR_GENERIC32:
24297 case PROCESSOR_GENERIC64:
24298 case PROCESSOR_BDVER1:
24299 case PROCESSOR_BDVER2:
24300 case PROCESSOR_BDVER3:
24301 case PROCESSOR_BTVER1:
24302 return 3;
24303
24304 default:
24305 return 1;
24306 }
24307 }
24308
24309 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24310 set by DEP_INSN and nothing else that DEP_INSN sets. */
24311
24312 static bool
24313 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24314 {
24315 rtx set, set2;
24316
24317 /* Simplify the test for uninteresting insns. */
24318 if (insn_type != TYPE_SETCC
24319 && insn_type != TYPE_ICMOV
24320 && insn_type != TYPE_FCMOV
24321 && insn_type != TYPE_IBR)
24322 return false;
24323
24324 if ((set = single_set (dep_insn)) != 0)
24325 {
24326 set = SET_DEST (set);
24327 set2 = NULL_RTX;
24328 }
24329 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24330 && XVECLEN (PATTERN (dep_insn), 0) == 2
24331 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24332 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24333 {
24334 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24335 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24336 }
24337 else
24338 return false;
24339
24340 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24341 return false;
24342
24343 /* This test is true if the dependent insn reads the flags but
24344 not any other potentially set register. */
24345 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24346 return false;
24347
24348 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24349 return false;
24350
24351 return true;
24352 }
24353
24354 /* Return true iff USE_INSN has a memory address with operands set by
24355 SET_INSN. */
24356
24357 bool
24358 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24359 {
24360 int i;
24361 extract_insn_cached (use_insn);
24362 for (i = recog_data.n_operands - 1; i >= 0; --i)
24363 if (MEM_P (recog_data.operand[i]))
24364 {
24365 rtx addr = XEXP (recog_data.operand[i], 0);
24366 return modified_in_p (addr, set_insn) != 0;
24367 }
24368 return false;
24369 }
24370
24371 static int
24372 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24373 {
24374 enum attr_type insn_type, dep_insn_type;
24375 enum attr_memory memory;
24376 rtx set, set2;
24377 int dep_insn_code_number;
24378
24379 /* Anti and output dependencies have zero cost on all CPUs. */
24380 if (REG_NOTE_KIND (link) != 0)
24381 return 0;
24382
24383 dep_insn_code_number = recog_memoized (dep_insn);
24384
24385 /* If we can't recognize the insns, we can't really do anything. */
24386 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24387 return cost;
24388
24389 insn_type = get_attr_type (insn);
24390 dep_insn_type = get_attr_type (dep_insn);
24391
24392 switch (ix86_tune)
24393 {
24394 case PROCESSOR_PENTIUM:
24395 /* Address Generation Interlock adds a cycle of latency. */
24396 if (insn_type == TYPE_LEA)
24397 {
24398 rtx addr = PATTERN (insn);
24399
24400 if (GET_CODE (addr) == PARALLEL)
24401 addr = XVECEXP (addr, 0, 0);
24402
24403 gcc_assert (GET_CODE (addr) == SET);
24404
24405 addr = SET_SRC (addr);
24406 if (modified_in_p (addr, dep_insn))
24407 cost += 1;
24408 }
24409 else if (ix86_agi_dependent (dep_insn, insn))
24410 cost += 1;
24411
24412 /* ??? Compares pair with jump/setcc. */
24413 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24414 cost = 0;
24415
24416 /* Floating point stores require value to be ready one cycle earlier. */
24417 if (insn_type == TYPE_FMOV
24418 && get_attr_memory (insn) == MEMORY_STORE
24419 && !ix86_agi_dependent (dep_insn, insn))
24420 cost += 1;
24421 break;
24422
24423 case PROCESSOR_PENTIUMPRO:
24424 memory = get_attr_memory (insn);
24425
24426 /* INT->FP conversion is expensive. */
24427 if (get_attr_fp_int_src (dep_insn))
24428 cost += 5;
24429
24430 /* There is one cycle extra latency between an FP op and a store. */
24431 if (insn_type == TYPE_FMOV
24432 && (set = single_set (dep_insn)) != NULL_RTX
24433 && (set2 = single_set (insn)) != NULL_RTX
24434 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24435 && MEM_P (SET_DEST (set2)))
24436 cost += 1;
24437
24438 /* Show the ability of the reorder buffer to hide the latency of a load
24439 by executing it in parallel with the previous instruction, in case the
24440 previous instruction is not needed to compute the address. */
24441 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24442 && !ix86_agi_dependent (dep_insn, insn))
24443 {
24444 /* Claim moves take one cycle, as the core can issue one load
24445 at a time and the next load can start a cycle later. */
24446 if (dep_insn_type == TYPE_IMOV
24447 || dep_insn_type == TYPE_FMOV)
24448 cost = 1;
24449 else if (cost > 1)
24450 cost--;
24451 }
24452 break;
24453
24454 case PROCESSOR_K6:
24455 memory = get_attr_memory (insn);
24456
24457 /* The esp dependency is resolved before the instruction is really
24458 finished. */
24459 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24460 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24461 return 1;
24462
24463 /* INT->FP conversion is expensive. */
24464 if (get_attr_fp_int_src (dep_insn))
24465 cost += 5;
24466
24467 /* Show the ability of the reorder buffer to hide the latency of a load
24468 by executing it in parallel with the previous instruction, in case the
24469 previous instruction is not needed to compute the address. */
24470 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24471 && !ix86_agi_dependent (dep_insn, insn))
24472 {
24473 /* Claim moves take one cycle, as the core can issue one load
24474 at a time and the next load can start a cycle later. */
24475 if (dep_insn_type == TYPE_IMOV
24476 || dep_insn_type == TYPE_FMOV)
24477 cost = 1;
24478 else if (cost > 2)
24479 cost -= 2;
24480 else
24481 cost = 1;
24482 }
24483 break;
24484
24485 case PROCESSOR_ATHLON:
24486 case PROCESSOR_K8:
24487 case PROCESSOR_AMDFAM10:
24488 case PROCESSOR_BDVER1:
24489 case PROCESSOR_BDVER2:
24490 case PROCESSOR_BDVER3:
24491 case PROCESSOR_BTVER1:
24492 case PROCESSOR_BTVER2:
24493 case PROCESSOR_ATOM:
24494 case PROCESSOR_GENERIC32:
24495 case PROCESSOR_GENERIC64:
24496 memory = get_attr_memory (insn);
24497
24498 /* Show the ability of the reorder buffer to hide the latency of a load
24499 by executing it in parallel with the previous instruction, in case the
24500 previous instruction is not needed to compute the address. */
24501 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24502 && !ix86_agi_dependent (dep_insn, insn))
24503 {
24504 enum attr_unit unit = get_attr_unit (insn);
24505 int loadcost = 3;
24506
24507 /* Because of the difference between the length of the integer and
24508 floating unit pipeline preparation stages, the memory operands
24509 for floating point are cheaper.
24510
24511 ??? For Athlon the difference is most probably 2. */
24512 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24513 loadcost = 3;
24514 else
24515 loadcost = TARGET_ATHLON ? 2 : 0;
24516
24517 if (cost >= loadcost)
24518 cost -= loadcost;
24519 else
24520 cost = 0;
24521 }
24522
24523 default:
24524 break;
24525 }
24526
24527 return cost;
24528 }
24529
24530 /* How many alternative schedules to try. This should be as wide as the
24531 scheduling freedom in the DFA, but no wider. Making this value too
24532 large results in extra work for the scheduler. */
24533
24534 static int
24535 ia32_multipass_dfa_lookahead (void)
24536 {
24537 switch (ix86_tune)
24538 {
24539 case PROCESSOR_PENTIUM:
24540 return 2;
24541
24542 case PROCESSOR_PENTIUMPRO:
24543 case PROCESSOR_K6:
24544 return 1;
24545
24546 case PROCESSOR_CORE2:
24547 case PROCESSOR_COREI7:
24548 case PROCESSOR_HASWELL:
24549 case PROCESSOR_ATOM:
24550 case PROCESSOR_SLM:
24551 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24552 as the number of instructions that can be executed in one cycle,
24553 i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */
24554 if (reload_completed)
24555 return ix86_issue_rate ();
24556 /* Don't use lookahead for pre-reload schedule to save compile time. */
24557 return 0;
24558
24559 default:
24560 return 0;
24561 }
24562 }
24563
24564 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24565 execution. It is applied if
24566 (1) an IMUL instruction is at the top of the list;
24567 (2) there is exactly one producer of an independent IMUL instruction in
24568 the ready list;
24569 in which case (3) the found producer is put at the top of the ready list.
24570 Returns the issue rate. */
24571
24572 static int
24573 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24574 int clock_var ATTRIBUTE_UNUSED)
24575 {
24576 static int issue_rate = -1;
24577 int n_ready = *pn_ready;
24578 rtx insn, insn1, insn2;
24579 int i;
24580 sd_iterator_def sd_it;
24581 dep_t dep;
24582 int index = -1;
24583
24584 /* Set up issue rate. */
24585 issue_rate = ix86_issue_rate();
24586
24587 /* Do reordering for Atom only. */
24588 if (ix86_tune != PROCESSOR_ATOM)
24589 return issue_rate;
24590 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24591 if (!reload_completed)
24592 return issue_rate;
24593 /* Nothing to do if ready list contains only 1 instruction. */
24594 if (n_ready <= 1)
24595 return issue_rate;
24596
24597 /* Check that IMUL instruction is on the top of ready list. */
24598 insn = ready[n_ready - 1];
24599 if (!NONDEBUG_INSN_P (insn))
24600 return issue_rate;
24601 insn = PATTERN (insn);
24602 if (GET_CODE (insn) == PARALLEL)
24603 insn = XVECEXP (insn, 0, 0);
24604 if (GET_CODE (insn) != SET)
24605 return issue_rate;
24606 if (!(GET_CODE (SET_SRC (insn)) == MULT
24607 && GET_MODE (SET_SRC (insn)) == SImode))
24608 return issue_rate;
24609
24610 /* Search for the producer of an independent IMUL instruction. */
24611 for (i = n_ready - 2; i >= 0; i--)
24612 {
24613 insn = ready[i];
24614 if (!NONDEBUG_INSN_P (insn))
24615 continue;
24616 /* Skip IMUL instruction. */
24617 insn2 = PATTERN (insn);
24618 if (GET_CODE (insn2) == PARALLEL)
24619 insn2 = XVECEXP (insn2, 0, 0);
24620 if (GET_CODE (insn2) == SET
24621 && GET_CODE (SET_SRC (insn2)) == MULT
24622 && GET_MODE (SET_SRC (insn2)) == SImode)
24623 continue;
24624
24625 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24626 {
24627 rtx con;
24628 con = DEP_CON (dep);
24629 if (!NONDEBUG_INSN_P (con))
24630 continue;
24631 insn1 = PATTERN (con);
24632 if (GET_CODE (insn1) == PARALLEL)
24633 insn1 = XVECEXP (insn1, 0, 0);
24634
24635 if (GET_CODE (insn1) == SET
24636 && GET_CODE (SET_SRC (insn1)) == MULT
24637 && GET_MODE (SET_SRC (insn1)) == SImode)
24638 {
24639 sd_iterator_def sd_it1;
24640 dep_t dep1;
24641 /* Check that INSN is the only producer of the IMUL. */
24642 index = i;
24643 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24644 {
24645 rtx pro;
24646 pro = DEP_PRO (dep1);
24647 if (!NONDEBUG_INSN_P (pro))
24648 continue;
24649 if (pro != insn)
24650 index = -1;
24651 }
24652 if (index >= 0)
24653 break;
24654 }
24655 }
24656 if (index >= 0)
24657 break;
24658 }
24659 if (index < 0)
24660 return issue_rate; /* Didn't find IMUL producer. */
24661
24662 if (sched_verbose > 1)
24663 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24664 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24665
24666 /* Put IMUL producer (ready[index]) at the top of ready list. */
24667 insn1 = ready[index];
24668 for (i = index; i < n_ready - 1; i++)
24669 ready[i] = ready[i + 1];
24670 ready[n_ready - 1] = insn1;
24671
24672 return issue_rate;
24673 }
24674
24675 static bool
24676 ix86_class_likely_spilled_p (reg_class_t);
24677
24678 /* Return true if the LHS of INSN is a HW function argument register; set
24679 IS_SPILLED to true if it is a likely-spilled HW register. */
24680 static bool
24681 insn_is_function_arg (rtx insn, bool* is_spilled)
24682 {
24683 rtx dst;
24684
24685 if (!NONDEBUG_INSN_P (insn))
24686 return false;
24687 /* Call instructions are not movable; ignore them. */
24688 if (CALL_P (insn))
24689 return false;
24690 insn = PATTERN (insn);
24691 if (GET_CODE (insn) == PARALLEL)
24692 insn = XVECEXP (insn, 0, 0);
24693 if (GET_CODE (insn) != SET)
24694 return false;
24695 dst = SET_DEST (insn);
24696 if (REG_P (dst) && HARD_REGISTER_P (dst)
24697 && ix86_function_arg_regno_p (REGNO (dst)))
24698 {
24699 /* Is it likely spilled HW register? */
24700 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24701 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24702 *is_spilled = true;
24703 return true;
24704 }
24705 return false;
24706 }
24707
24708 /* Add output dependencies for a chain of adjacent function arguments, but
24709 only if there is a move to a likely-spilled HW register. Return the first
24710 argument if at least one dependence was added, or NULL otherwise. */
24711 static rtx
24712 add_parameter_dependencies (rtx call, rtx head)
24713 {
24714 rtx insn;
24715 rtx last = call;
24716 rtx first_arg = NULL;
24717 bool is_spilled = false;
24718
24719 head = PREV_INSN (head);
24720
24721 /* Find the argument-passing instruction nearest to the call. */
24722 while (true)
24723 {
24724 last = PREV_INSN (last);
24725 if (last == head)
24726 return NULL;
24727 if (!NONDEBUG_INSN_P (last))
24728 continue;
24729 if (insn_is_function_arg (last, &is_spilled))
24730 break;
24731 return NULL;
24732 }
24733
24734 first_arg = last;
24735 while (true)
24736 {
24737 insn = PREV_INSN (last);
24738 if (!INSN_P (insn))
24739 break;
24740 if (insn == head)
24741 break;
24742 if (!NONDEBUG_INSN_P (insn))
24743 {
24744 last = insn;
24745 continue;
24746 }
24747 if (insn_is_function_arg (insn, &is_spilled))
24748 {
24749 /* Add an output dependence between two function arguments if the chain
24750 of output arguments contains likely-spilled HW registers. */
24751 if (is_spilled)
24752 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24753 first_arg = last = insn;
24754 }
24755 else
24756 break;
24757 }
24758 if (!is_spilled)
24759 return NULL;
24760 return first_arg;
24761 }
24762
24763 /* Add output or anti dependency from insn to first_arg to restrict its code
24764 motion. */
24765 static void
24766 avoid_func_arg_motion (rtx first_arg, rtx insn)
24767 {
24768 rtx set;
24769 rtx tmp;
24770
24771 set = single_set (insn);
24772 if (!set)
24773 return;
24774 tmp = SET_DEST (set);
24775 if (REG_P (tmp))
24776 {
24777 /* Add output dependency to the first function argument. */
24778 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24779 return;
24780 }
24781 /* Add anti dependency. */
24782 add_dependence (first_arg, insn, REG_DEP_ANTI);
24783 }
24784
24785 /* Avoid cross-block motion of a function argument by adding a dependency
24786 from the first non-jump instruction in BB. */
24787 static void
24788 add_dependee_for_func_arg (rtx arg, basic_block bb)
24789 {
24790 rtx insn = BB_END (bb);
24791
24792 while (insn)
24793 {
24794 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24795 {
24796 rtx set = single_set (insn);
24797 if (set)
24798 {
24799 avoid_func_arg_motion (arg, insn);
24800 return;
24801 }
24802 }
24803 if (insn == BB_HEAD (bb))
24804 return;
24805 insn = PREV_INSN (insn);
24806 }
24807 }
24808
24809 /* Hook for pre-reload schedule - avoid motion of function arguments
24810 passed in likely spilled HW registers. */
24811 static void
24812 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24813 {
24814 rtx insn;
24815 rtx first_arg = NULL;
24816 if (reload_completed)
24817 return;
24818 while (head != tail && DEBUG_INSN_P (head))
24819 head = NEXT_INSN (head);
24820 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24821 if (INSN_P (insn) && CALL_P (insn))
24822 {
24823 first_arg = add_parameter_dependencies (insn, head);
24824 if (first_arg)
24825 {
24826 /* Add a dependee for the first argument to predecessors, but only if the
24827 region contains more than one block. */
24828 basic_block bb = BLOCK_FOR_INSN (insn);
24829 int rgn = CONTAINING_RGN (bb->index);
24830 int nr_blks = RGN_NR_BLOCKS (rgn);
24831 /* Skip trivial regions and region head blocks that can have
24832 predecessors outside of region. */
24833 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24834 {
24835 edge e;
24836 edge_iterator ei;
24837 /* Assume that region is SCC, i.e. all immediate predecessors
24838 of non-head block are in the same region. */
24839 FOR_EACH_EDGE (e, ei, bb->preds)
24840 {
24841 /* Avoid creating loop-carried dependencies by
24842 using the topological ordering in the region. */
24843 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24844 add_dependee_for_func_arg (first_arg, e->src);
24845 }
24846 }
24847 insn = first_arg;
24848 if (insn == head)
24849 break;
24850 }
24851 }
24852 else if (first_arg)
24853 avoid_func_arg_motion (first_arg, insn);
24854 }
24855
24856 /* Hook for the pre-reload schedule - set the priority of moves from likely-
24857 spilled HW registers to the maximum, to schedule them as soon as possible.
24858 These are moves from function argument registers at the top of the function
24859 entry and moves from function return value registers after a call. */
24860 static int
24861 ix86_adjust_priority (rtx insn, int priority)
24862 {
24863 rtx set;
24864
24865 if (reload_completed)
24866 return priority;
24867
24868 if (!NONDEBUG_INSN_P (insn))
24869 return priority;
24870
24871 set = single_set (insn);
24872 if (set)
24873 {
24874 rtx tmp = SET_SRC (set);
24875 if (REG_P (tmp)
24876 && HARD_REGISTER_P (tmp)
24877 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24878 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24879 return current_sched_info->sched_max_insns_priority;
24880 }
24881
24882 return priority;
24883 }
24884
24885 /* Model the decoder of Core 2/i7.
24886 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
24887 track the instruction fetch block boundaries and make sure that long
24888 (9+ byte) instructions are assigned to D0. */
24889
24890 /* Maximum length of an insn that can be handled by
24891 a secondary decoder unit. '8' for Core 2/i7. */
24892 static int core2i7_secondary_decoder_max_insn_size;
24893
24894 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24895 '16' for Core 2/i7. */
24896 static int core2i7_ifetch_block_size;
24897
24898 /* Maximum number of instructions decoder can handle per cycle.
24899 '6' for Core 2/i7. */
24900 static int core2i7_ifetch_block_max_insns;
24901
24902 typedef struct ix86_first_cycle_multipass_data_ *
24903 ix86_first_cycle_multipass_data_t;
24904 typedef const struct ix86_first_cycle_multipass_data_ *
24905 const_ix86_first_cycle_multipass_data_t;
24906
24907 /* A variable to store target state across calls to max_issue within
24908 one cycle. */
24909 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24910 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24911
24912 /* Initialize DATA. */
24913 static void
24914 core2i7_first_cycle_multipass_init (void *_data)
24915 {
24916 ix86_first_cycle_multipass_data_t data
24917 = (ix86_first_cycle_multipass_data_t) _data;
24918
24919 data->ifetch_block_len = 0;
24920 data->ifetch_block_n_insns = 0;
24921 data->ready_try_change = NULL;
24922 data->ready_try_change_size = 0;
24923 }
24924
24925 /* Advancing the cycle; reset ifetch block counts. */
24926 static void
24927 core2i7_dfa_post_advance_cycle (void)
24928 {
24929 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24930
24931 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24932
24933 data->ifetch_block_len = 0;
24934 data->ifetch_block_n_insns = 0;
24935 }
24936
24937 static int min_insn_size (rtx);
24938
24939 /* Filter out insns from ready_try that the core will not be able to issue
24940 on current cycle due to decoder. */
24941 static void
24942 core2i7_first_cycle_multipass_filter_ready_try
24943 (const_ix86_first_cycle_multipass_data_t data,
24944 char *ready_try, int n_ready, bool first_cycle_insn_p)
24945 {
24946 while (n_ready--)
24947 {
24948 rtx insn;
24949 int insn_size;
24950
24951 if (ready_try[n_ready])
24952 continue;
24953
24954 insn = get_ready_element (n_ready);
24955 insn_size = min_insn_size (insn);
24956
24957 if (/* If this is too long an insn for a secondary decoder ... */
24958 (!first_cycle_insn_p
24959 && insn_size > core2i7_secondary_decoder_max_insn_size)
24960 /* ... or it would not fit into the ifetch block ... */
24961 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24962 /* ... or the decoder is full already ... */
24963 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24964 /* ... mask the insn out. */
24965 {
24966 ready_try[n_ready] = 1;
24967
24968 if (data->ready_try_change)
24969 bitmap_set_bit (data->ready_try_change, n_ready);
24970 }
24971 }
24972 }
24973
24974 /* Prepare for a new round of multipass lookahead scheduling. */
24975 static void
24976 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24977 bool first_cycle_insn_p)
24978 {
24979 ix86_first_cycle_multipass_data_t data
24980 = (ix86_first_cycle_multipass_data_t) _data;
24981 const_ix86_first_cycle_multipass_data_t prev_data
24982 = ix86_first_cycle_multipass_data;
24983
24984 /* Restore the state from the end of the previous round. */
24985 data->ifetch_block_len = prev_data->ifetch_block_len;
24986 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24987
24988 /* Filter instructions that cannot be issued on current cycle due to
24989 decoder restrictions. */
24990 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24991 first_cycle_insn_p);
24992 }
24993
24994 /* INSN is being issued in current solution. Account for its impact on
24995 the decoder model. */
24996 static void
24997 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24998 rtx insn, const void *_prev_data)
24999 {
25000 ix86_first_cycle_multipass_data_t data
25001 = (ix86_first_cycle_multipass_data_t) _data;
25002 const_ix86_first_cycle_multipass_data_t prev_data
25003 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25004
25005 int insn_size = min_insn_size (insn);
25006
25007 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25008 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25009 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25010 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25011
25012 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25013 if (!data->ready_try_change)
25014 {
25015 data->ready_try_change = sbitmap_alloc (n_ready);
25016 data->ready_try_change_size = n_ready;
25017 }
25018 else if (data->ready_try_change_size < n_ready)
25019 {
25020 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25021 n_ready, 0);
25022 data->ready_try_change_size = n_ready;
25023 }
25024 bitmap_clear (data->ready_try_change);
25025
25026 /* Filter out insns from ready_try that the core will not be able to issue
25027 on current cycle due to decoder. */
25028 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25029 false);
25030 }
25031
25032 /* Revert the effect on ready_try. */
25033 static void
25034 core2i7_first_cycle_multipass_backtrack (const void *_data,
25035 char *ready_try,
25036 int n_ready ATTRIBUTE_UNUSED)
25037 {
25038 const_ix86_first_cycle_multipass_data_t data
25039 = (const_ix86_first_cycle_multipass_data_t) _data;
25040 unsigned int i = 0;
25041 sbitmap_iterator sbi;
25042
25043 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25044 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25045 {
25046 ready_try[i] = 0;
25047 }
25048 }
25049
25050 /* Save the result of multipass lookahead scheduling for the next round. */
25051 static void
25052 core2i7_first_cycle_multipass_end (const void *_data)
25053 {
25054 const_ix86_first_cycle_multipass_data_t data
25055 = (const_ix86_first_cycle_multipass_data_t) _data;
25056 ix86_first_cycle_multipass_data_t next_data
25057 = ix86_first_cycle_multipass_data;
25058
25059 if (data != NULL)
25060 {
25061 next_data->ifetch_block_len = data->ifetch_block_len;
25062 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25063 }
25064 }
25065
25066 /* Deallocate target data. */
25067 static void
25068 core2i7_first_cycle_multipass_fini (void *_data)
25069 {
25070 ix86_first_cycle_multipass_data_t data
25071 = (ix86_first_cycle_multipass_data_t) _data;
25072
25073 if (data->ready_try_change)
25074 {
25075 sbitmap_free (data->ready_try_change);
25076 data->ready_try_change = NULL;
25077 data->ready_try_change_size = 0;
25078 }
25079 }
25080
25081 /* Prepare for scheduling pass. */
25082 static void
25083 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25084 int verbose ATTRIBUTE_UNUSED,
25085 int max_uid ATTRIBUTE_UNUSED)
25086 {
25087 /* Install scheduling hooks for current CPU. Some of these hooks are used
25088 in time-critical parts of the scheduler, so we only set them up when
25089 they are actually used. */
25090 switch (ix86_tune)
25091 {
25092 case PROCESSOR_CORE2:
25093 case PROCESSOR_COREI7:
25094 case PROCESSOR_HASWELL:
25095 /* Do not perform multipass scheduling for pre-reload schedule
25096 to save compile time. */
25097 if (reload_completed)
25098 {
25099 targetm.sched.dfa_post_advance_cycle
25100 = core2i7_dfa_post_advance_cycle;
25101 targetm.sched.first_cycle_multipass_init
25102 = core2i7_first_cycle_multipass_init;
25103 targetm.sched.first_cycle_multipass_begin
25104 = core2i7_first_cycle_multipass_begin;
25105 targetm.sched.first_cycle_multipass_issue
25106 = core2i7_first_cycle_multipass_issue;
25107 targetm.sched.first_cycle_multipass_backtrack
25108 = core2i7_first_cycle_multipass_backtrack;
25109 targetm.sched.first_cycle_multipass_end
25110 = core2i7_first_cycle_multipass_end;
25111 targetm.sched.first_cycle_multipass_fini
25112 = core2i7_first_cycle_multipass_fini;
25113
25114 /* Set decoder parameters. */
25115 core2i7_secondary_decoder_max_insn_size = 8;
25116 core2i7_ifetch_block_size = 16;
25117 core2i7_ifetch_block_max_insns = 6;
25118 break;
25119 }
25120 /* ... Fall through ... */
25121 default:
25122 targetm.sched.dfa_post_advance_cycle = NULL;
25123 targetm.sched.first_cycle_multipass_init = NULL;
25124 targetm.sched.first_cycle_multipass_begin = NULL;
25125 targetm.sched.first_cycle_multipass_issue = NULL;
25126 targetm.sched.first_cycle_multipass_backtrack = NULL;
25127 targetm.sched.first_cycle_multipass_end = NULL;
25128 targetm.sched.first_cycle_multipass_fini = NULL;
25129 break;
25130 }
25131 }
25132
25133 \f
25134 /* Compute the alignment given to a constant that is being placed in memory.
25135 EXP is the constant and ALIGN is the alignment that the object would
25136 ordinarily have.
25137 The value of this function is used instead of that alignment to align
25138 the object. */
25139
25140 int
25141 ix86_constant_alignment (tree exp, int align)
25142 {
25143 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25144 || TREE_CODE (exp) == INTEGER_CST)
25145 {
25146 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25147 return 64;
25148 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25149 return 128;
25150 }
25151 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25152 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25153 return BITS_PER_WORD;
25154
25155 return align;
25156 }
25157
25158 /* Compute the alignment for a static variable.
25159 TYPE is the data type, and ALIGN is the alignment that
25160 the object would ordinarily have. The value of this function is used
25161 instead of that alignment to align the object. */
25162
25163 int
25164 ix86_data_alignment (tree type, int align)
25165 {
25166 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25167
25168 if (AGGREGATE_TYPE_P (type)
25169 && TYPE_SIZE (type)
25170 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25171 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25172 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25173 && align < max_align)
25174 align = max_align;
25175
25176 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
25177 to a 16-byte boundary. */
25178 if (TARGET_64BIT)
25179 {
25180 if (AGGREGATE_TYPE_P (type)
25181 && TYPE_SIZE (type)
25182 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25183 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25184 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25185 return 128;
25186 }
25187
25188 if (TREE_CODE (type) == ARRAY_TYPE)
25189 {
25190 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25191 return 64;
25192 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25193 return 128;
25194 }
25195 else if (TREE_CODE (type) == COMPLEX_TYPE)
25196 {
25197
25198 if (TYPE_MODE (type) == DCmode && align < 64)
25199 return 64;
25200 if ((TYPE_MODE (type) == XCmode
25201 || TYPE_MODE (type) == TCmode) && align < 128)
25202 return 128;
25203 }
25204 else if ((TREE_CODE (type) == RECORD_TYPE
25205 || TREE_CODE (type) == UNION_TYPE
25206 || TREE_CODE (type) == QUAL_UNION_TYPE)
25207 && TYPE_FIELDS (type))
25208 {
25209 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25210 return 64;
25211 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25212 return 128;
25213 }
25214 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25215 || TREE_CODE (type) == INTEGER_TYPE)
25216 {
25217 if (TYPE_MODE (type) == DFmode && align < 64)
25218 return 64;
25219 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25220 return 128;
25221 }
25222
25223 return align;
25224 }
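/* Editorial note -- an illustrative sketch, not part of the original
   sources: for static storage, ix86_data_alignment above raises the
   alignment of sufficiently large aggregates so that aligned SSE accesses
   are possible.  Assuming an x86-64 target with default options:

     static int big[64];      // 256 bytes >= 16: aligned to at least
                              // 128 bits per the ABI rule cited above.
     static double scalar;    // DFmode, REAL_TYPE branch: at least
                              // 64-bit alignment.  */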
25225
25226 /* Compute the alignment for a local variable or a stack slot. EXP is
25227 the data type or decl itself, MODE is the widest mode available and
25228 ALIGN is the alignment that the object would ordinarily have. The
25229 value of this macro is used instead of that alignment to align the
25230 object. */
25231
25232 unsigned int
25233 ix86_local_alignment (tree exp, enum machine_mode mode,
25234 unsigned int align)
25235 {
25236 tree type, decl;
25237
25238 if (exp && DECL_P (exp))
25239 {
25240 type = TREE_TYPE (exp);
25241 decl = exp;
25242 }
25243 else
25244 {
25245 type = exp;
25246 decl = NULL;
25247 }
25248
25249 /* Don't do dynamic stack realignment for long long objects with
25250 -mpreferred-stack-boundary=2. */
25251 if (!TARGET_64BIT
25252 && align == 64
25253 && ix86_preferred_stack_boundary < 64
25254 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25255 && (!type || !TYPE_USER_ALIGN (type))
25256 && (!decl || !DECL_USER_ALIGN (decl)))
25257 align = 32;
25258
25259 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
25260 register in MODE. We will return the larger of the XF and DF
25261 alignments. */
25262 if (!type)
25263 {
25264 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25265 align = GET_MODE_ALIGNMENT (DFmode);
25266 return align;
25267 }
25268
25269 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
25270 to a 16-byte boundary. The exact wording is:
25271
25272 An array uses the same alignment as its elements, except that a local or
25273 global array variable of length at least 16 bytes or
25274 a C99 variable-length array variable always has alignment of at least 16 bytes.
25275
25276 This was added to allow the use of aligned SSE instructions on arrays. The
25277 rule is meant for static storage (where the compiler cannot do the analysis
25278 by itself). We follow it for automatic variables only when convenient.
25279 We fully control everything in the function being compiled, and functions
25280 from other units cannot rely on the alignment.
25281
25282 Exclude the va_list type. It is the common case of a local array where
25283 we cannot benefit from the alignment.
25284
25285 TODO: Probably one should optimize for size only when the variable does not escape. */
25286 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25287 && TARGET_SSE)
25288 {
25289 if (AGGREGATE_TYPE_P (type)
25290 && (va_list_type_node == NULL_TREE
25291 || (TYPE_MAIN_VARIANT (type)
25292 != TYPE_MAIN_VARIANT (va_list_type_node)))
25293 && TYPE_SIZE (type)
25294 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25295 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25296 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25297 return 128;
25298 }
25299 if (TREE_CODE (type) == ARRAY_TYPE)
25300 {
25301 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25302 return 64;
25303 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25304 return 128;
25305 }
25306 else if (TREE_CODE (type) == COMPLEX_TYPE)
25307 {
25308 if (TYPE_MODE (type) == DCmode && align < 64)
25309 return 64;
25310 if ((TYPE_MODE (type) == XCmode
25311 || TYPE_MODE (type) == TCmode) && align < 128)
25312 return 128;
25313 }
25314 else if ((TREE_CODE (type) == RECORD_TYPE
25315 || TREE_CODE (type) == UNION_TYPE
25316 || TREE_CODE (type) == QUAL_UNION_TYPE)
25317 && TYPE_FIELDS (type))
25318 {
25319 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25320 return 64;
25321 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25322 return 128;
25323 }
25324 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25325 || TREE_CODE (type) == INTEGER_TYPE)
25326 {
25327
25328 if (TYPE_MODE (type) == DFmode && align < 64)
25329 return 64;
25330 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25331 return 128;
25332 }
25333 return align;
25334 }
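/* Editorial note -- an illustrative sketch, not part of the original
   sources: ix86_local_alignment above applies the 16-byte rule to
   automatic variables too, but only when compiling for speed with SSE
   enabled, and it deliberately excludes va_list.  For example, on an
   x86-64 target at -O2:

     void f (void)
     {
       char buf[32];     // aggregate of >= 16 bytes: stack slot aligned
                         // to 128 bits so aligned SSE accesses are safe.
       (void) buf;
     }

   On 32-bit targets with -mpreferred-stack-boundary=2, a 'long long'
   local is instead dropped from 64-bit to 32-bit alignment so that no
   dynamic stack realignment is forced.  */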
25335
25336 /* Compute the minimum required alignment for dynamic stack realignment
25337 purposes for a local variable, parameter or a stack slot. EXP is
25338 the data type or decl itself, MODE is its mode and ALIGN is the
25339 alignment that the object would ordinarily have. */
25340
25341 unsigned int
25342 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25343 unsigned int align)
25344 {
25345 tree type, decl;
25346
25347 if (exp && DECL_P (exp))
25348 {
25349 type = TREE_TYPE (exp);
25350 decl = exp;
25351 }
25352 else
25353 {
25354 type = exp;
25355 decl = NULL;
25356 }
25357
25358 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25359 return align;
25360
25361 /* Don't do dynamic stack realignment for long long objects with
25362 -mpreferred-stack-boundary=2. */
25363 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25364 && (!type || !TYPE_USER_ALIGN (type))
25365 && (!decl || !DECL_USER_ALIGN (decl)))
25366 return 32;
25367
25368 return align;
25369 }
25370 \f
25371 /* Find a location for the static chain incoming to a nested function.
25372 This is a register, unless all free registers are used by arguments. */
25373
25374 static rtx
25375 ix86_static_chain (const_tree fndecl, bool incoming_p)
25376 {
25377 unsigned regno;
25378
25379 if (!DECL_STATIC_CHAIN (fndecl))
25380 return NULL;
25381
25382 if (TARGET_64BIT)
25383 {
25384 /* We always use R10 in 64-bit mode. */
25385 regno = R10_REG;
25386 }
25387 else
25388 {
25389 tree fntype;
25390 unsigned int ccvt;
25391
25392 /* By default in 32-bit mode we use ECX to pass the static chain. */
25393 regno = CX_REG;
25394
25395 fntype = TREE_TYPE (fndecl);
25396 ccvt = ix86_get_callcvt (fntype);
25397 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25398 {
25399 /* Fastcall functions use ecx/edx for arguments, which leaves
25400 us with EAX for the static chain.
25401 Thiscall functions use ecx for arguments, which also
25402 leaves us with EAX for the static chain. */
25403 regno = AX_REG;
25404 }
25405 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25406 {
25407 /* Thiscall functions use ecx for arguments, which leaves
25408 us with EAX and EDX for the static chain.
25409 For ABI compatibility we use EAX. */
25410 regno = AX_REG;
25411 }
25412 else if (ix86_function_regparm (fntype, fndecl) == 3)
25413 {
25414 /* For regparm 3, we have no free call-clobbered registers in
25415 which to store the static chain. In order to implement this,
25416 we have the trampoline push the static chain to the stack.
25417 However, we can't push a value below the return address when
25418 we call the nested function directly, so we have to use an
25419 alternate entry point. For this we use ESI, and have the
25420 alternate entry point push ESI, so that things appear the
25421 same once we're executing the nested function. */
25422 if (incoming_p)
25423 {
25424 if (fndecl == current_function_decl)
25425 ix86_static_chain_on_stack = true;
25426 return gen_frame_mem (SImode,
25427 plus_constant (Pmode,
25428 arg_pointer_rtx, -8));
25429 }
25430 regno = SI_REG;
25431 }
25432 }
25433
25434 return gen_rtx_REG (Pmode, regno);
25435 }
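/* Editorial note -- an illustrative summary, not part of the original
   sources: the static-chain location chosen by ix86_static_chain above is,
   in short:

     64-bit targets                        -> %r10
     32-bit, default calling convention    -> %ecx
     32-bit, fastcall or thiscall          -> %eax
     32-bit, regparm(3)                    -> a stack slot at the argument
                                              pointer minus 8, reached via
                                              the ESI-pushing alternate
                                              entry point.

   A GNU C nested function is the typical consumer:

     void outer (void)
     {
       int v = 0;
       void inner (void) { v++; }   // needs OUTER's frame: passed in the
       inner ();                    // register (or slot) listed above.
     }
   */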
25436
25437 /* Emit RTL insns to initialize the variable parts of a trampoline.
25438 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25439 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25440 to be passed to the target function. */
25441
25442 static void
25443 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25444 {
25445 rtx mem, fnaddr;
25446 int opcode;
25447 int offset = 0;
25448
25449 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25450
25451 if (TARGET_64BIT)
25452 {
25453 int size;
25454
25455 /* Load the function address into r11. Try to load the address using
25456 the shorter movl instead of movabs. We may want to support
25457 movq for kernel mode, but the kernel does not use trampolines at
25458 the moment. FNADDR is a 32-bit address and may not be in
25459 DImode when ptr_mode == SImode. Always use movl in this
25460 case. */
25461 if (ptr_mode == SImode
25462 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25463 {
25464 fnaddr = copy_addr_to_reg (fnaddr);
25465
25466 mem = adjust_address (m_tramp, HImode, offset);
25467 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25468
25469 mem = adjust_address (m_tramp, SImode, offset + 2);
25470 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25471 offset += 6;
25472 }
25473 else
25474 {
25475 mem = adjust_address (m_tramp, HImode, offset);
25476 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25477
25478 mem = adjust_address (m_tramp, DImode, offset + 2);
25479 emit_move_insn (mem, fnaddr);
25480 offset += 10;
25481 }
25482
25483 /* Load the static chain into r10 using movabs. Use the shorter movl
25484 instead of movabs when ptr_mode == SImode. */
25485 if (ptr_mode == SImode)
25486 {
25487 opcode = 0xba41;
25488 size = 6;
25489 }
25490 else
25491 {
25492 opcode = 0xba49;
25493 size = 10;
25494 }
25495
25496 mem = adjust_address (m_tramp, HImode, offset);
25497 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25498
25499 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25500 emit_move_insn (mem, chain_value);
25501 offset += size;
25502
25503 /* Jump to r11; the last (unused) byte is a nop, only there to
25504 pad the write out to a single 32-bit store. */
25505 mem = adjust_address (m_tramp, SImode, offset);
25506 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25507 offset += 4;
25508 }
25509 else
25510 {
25511 rtx disp, chain;
25512
25513 /* Depending on the static chain location, either load a register
25514 with a constant, or push the constant to the stack. All of the
25515 instructions are the same size. */
25516 chain = ix86_static_chain (fndecl, true);
25517 if (REG_P (chain))
25518 {
25519 switch (REGNO (chain))
25520 {
25521 case AX_REG:
25522 opcode = 0xb8; break;
25523 case CX_REG:
25524 opcode = 0xb9; break;
25525 default:
25526 gcc_unreachable ();
25527 }
25528 }
25529 else
25530 opcode = 0x68;
25531
25532 mem = adjust_address (m_tramp, QImode, offset);
25533 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25534
25535 mem = adjust_address (m_tramp, SImode, offset + 1);
25536 emit_move_insn (mem, chain_value);
25537 offset += 5;
25538
25539 mem = adjust_address (m_tramp, QImode, offset);
25540 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25541
25542 mem = adjust_address (m_tramp, SImode, offset + 1);
25543
25544 /* Compute the offset from the end of the jmp to the target function.
25545 When the trampoline stores the static chain on the stack, we need
25546 to skip the first insn of the target, which pushes the (call-saved)
25547 register static chain; this push is 1 byte. */
25548 offset += 5;
25549 disp = expand_binop (SImode, sub_optab, fnaddr,
25550 plus_constant (Pmode, XEXP (m_tramp, 0),
25551 offset - (MEM_P (chain) ? 1 : 0)),
25552 NULL_RTX, 1, OPTAB_DIRECT);
25553 emit_move_insn (mem, disp);
25554 }
25555
25556 gcc_assert (offset <= TRAMPOLINE_SIZE);
25557
25558 #ifdef HAVE_ENABLE_EXECUTE_STACK
25559 #ifdef CHECK_EXECUTE_STACK_ENABLED
25560 if (CHECK_EXECUTE_STACK_ENABLED)
25561 #endif
25562 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25563 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25564 #endif
25565 }
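/* Editorial note -- an illustrative sketch, not part of the original
   sources: in the common LP64 case where movabs is needed, the 64-bit
   trampoline written by ix86_trampoline_init above is these 24 bytes:

     49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
     49 ba <8-byte chain>    movabs $chain,  %r10
     49 ff e3                jmp    *%r11
     90                      nop            ; pads the final 32-bit store

   When FNADDR fits in 32 bits (or ptr_mode == SImode), the movabs forms
   are replaced by the 6-byte "41 bb/ba imm32" movl encodings.  The 32-bit
   trampoline is "mov $chain, %eax|%ecx" (b8/b9 imm32) or "push $chain"
   (68 imm32) followed by "jmp rel32" (e9 imm32).  */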
25566 \f
25567 /* The following file contains several enumerations and data structures
25568 built from the definitions in i386-builtin-types.def. */
25569
25570 #include "i386-builtin-types.inc"
25571
25572 /* Table for the ix86 builtin non-function types. */
25573 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25574
25575 /* Retrieve an element from the above table, building some of
25576 the types lazily. */
25577
25578 static tree
25579 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25580 {
25581 unsigned int index;
25582 tree type, itype;
25583
25584 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25585
25586 type = ix86_builtin_type_tab[(int) tcode];
25587 if (type != NULL)
25588 return type;
25589
25590 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25591 if (tcode <= IX86_BT_LAST_VECT)
25592 {
25593 enum machine_mode mode;
25594
25595 index = tcode - IX86_BT_LAST_PRIM - 1;
25596 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25597 mode = ix86_builtin_type_vect_mode[index];
25598
25599 type = build_vector_type_for_mode (itype, mode);
25600 }
25601 else
25602 {
25603 int quals;
25604
25605 index = tcode - IX86_BT_LAST_VECT - 1;
25606 if (tcode <= IX86_BT_LAST_PTR)
25607 quals = TYPE_UNQUALIFIED;
25608 else
25609 quals = TYPE_QUAL_CONST;
25610
25611 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25612 if (quals != TYPE_UNQUALIFIED)
25613 itype = build_qualified_type (itype, quals);
25614
25615 type = build_pointer_type (itype);
25616 }
25617
25618 ix86_builtin_type_tab[(int) tcode] = type;
25619 return type;
25620 }
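/* Editorial note -- an illustrative sketch, not part of the original
   sources: ix86_get_builtin_type above (and ix86_get_builtin_func_type
   below) follow the same lazy build-and-cache pattern:

     tree cached = table[tcode];
     if (cached != NULL)
       return cached;                  // built on an earlier call
     cached = build_from_base (tcode); // vector, pointer or function type
     table[tcode] = cached;            // memoize for later callers
     return cached;

   "build_from_base" is only a placeholder for the mode/base lookups done
   through the tables generated from i386-builtin-types.def; the enumerator
   values themselves come from the generated i386-builtin-types.inc, which
   is not shown here.  */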
25621
25622 /* Table for the ix86 builtin function types. */
25623 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25624
25625 /* Retrieve an element from the above table, building some of
25626 the types lazily. */
25627
25628 static tree
25629 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25630 {
25631 tree type;
25632
25633 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25634
25635 type = ix86_builtin_func_type_tab[(int) tcode];
25636 if (type != NULL)
25637 return type;
25638
25639 if (tcode <= IX86_BT_LAST_FUNC)
25640 {
25641 unsigned start = ix86_builtin_func_start[(int) tcode];
25642 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25643 tree rtype, atype, args = void_list_node;
25644 unsigned i;
25645
25646 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25647 for (i = after - 1; i > start; --i)
25648 {
25649 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25650 args = tree_cons (NULL, atype, args);
25651 }
25652
25653 type = build_function_type (rtype, args);
25654 }
25655 else
25656 {
25657 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25658 enum ix86_builtin_func_type icode;
25659
25660 icode = ix86_builtin_func_alias_base[index];
25661 type = ix86_get_builtin_func_type (icode);
25662 }
25663
25664 ix86_builtin_func_type_tab[(int) tcode] = type;
25665 return type;
25666 }
25667
25668
25669 /* Codes for all the SSE/MMX builtins. */
25670 enum ix86_builtins
25671 {
25672 IX86_BUILTIN_ADDPS,
25673 IX86_BUILTIN_ADDSS,
25674 IX86_BUILTIN_DIVPS,
25675 IX86_BUILTIN_DIVSS,
25676 IX86_BUILTIN_MULPS,
25677 IX86_BUILTIN_MULSS,
25678 IX86_BUILTIN_SUBPS,
25679 IX86_BUILTIN_SUBSS,
25680
25681 IX86_BUILTIN_CMPEQPS,
25682 IX86_BUILTIN_CMPLTPS,
25683 IX86_BUILTIN_CMPLEPS,
25684 IX86_BUILTIN_CMPGTPS,
25685 IX86_BUILTIN_CMPGEPS,
25686 IX86_BUILTIN_CMPNEQPS,
25687 IX86_BUILTIN_CMPNLTPS,
25688 IX86_BUILTIN_CMPNLEPS,
25689 IX86_BUILTIN_CMPNGTPS,
25690 IX86_BUILTIN_CMPNGEPS,
25691 IX86_BUILTIN_CMPORDPS,
25692 IX86_BUILTIN_CMPUNORDPS,
25693 IX86_BUILTIN_CMPEQSS,
25694 IX86_BUILTIN_CMPLTSS,
25695 IX86_BUILTIN_CMPLESS,
25696 IX86_BUILTIN_CMPNEQSS,
25697 IX86_BUILTIN_CMPNLTSS,
25698 IX86_BUILTIN_CMPNLESS,
25699 IX86_BUILTIN_CMPNGTSS,
25700 IX86_BUILTIN_CMPNGESS,
25701 IX86_BUILTIN_CMPORDSS,
25702 IX86_BUILTIN_CMPUNORDSS,
25703
25704 IX86_BUILTIN_COMIEQSS,
25705 IX86_BUILTIN_COMILTSS,
25706 IX86_BUILTIN_COMILESS,
25707 IX86_BUILTIN_COMIGTSS,
25708 IX86_BUILTIN_COMIGESS,
25709 IX86_BUILTIN_COMINEQSS,
25710 IX86_BUILTIN_UCOMIEQSS,
25711 IX86_BUILTIN_UCOMILTSS,
25712 IX86_BUILTIN_UCOMILESS,
25713 IX86_BUILTIN_UCOMIGTSS,
25714 IX86_BUILTIN_UCOMIGESS,
25715 IX86_BUILTIN_UCOMINEQSS,
25716
25717 IX86_BUILTIN_CVTPI2PS,
25718 IX86_BUILTIN_CVTPS2PI,
25719 IX86_BUILTIN_CVTSI2SS,
25720 IX86_BUILTIN_CVTSI642SS,
25721 IX86_BUILTIN_CVTSS2SI,
25722 IX86_BUILTIN_CVTSS2SI64,
25723 IX86_BUILTIN_CVTTPS2PI,
25724 IX86_BUILTIN_CVTTSS2SI,
25725 IX86_BUILTIN_CVTTSS2SI64,
25726
25727 IX86_BUILTIN_MAXPS,
25728 IX86_BUILTIN_MAXSS,
25729 IX86_BUILTIN_MINPS,
25730 IX86_BUILTIN_MINSS,
25731
25732 IX86_BUILTIN_LOADUPS,
25733 IX86_BUILTIN_STOREUPS,
25734 IX86_BUILTIN_MOVSS,
25735
25736 IX86_BUILTIN_MOVHLPS,
25737 IX86_BUILTIN_MOVLHPS,
25738 IX86_BUILTIN_LOADHPS,
25739 IX86_BUILTIN_LOADLPS,
25740 IX86_BUILTIN_STOREHPS,
25741 IX86_BUILTIN_STORELPS,
25742
25743 IX86_BUILTIN_MASKMOVQ,
25744 IX86_BUILTIN_MOVMSKPS,
25745 IX86_BUILTIN_PMOVMSKB,
25746
25747 IX86_BUILTIN_MOVNTPS,
25748 IX86_BUILTIN_MOVNTQ,
25749
25750 IX86_BUILTIN_LOADDQU,
25751 IX86_BUILTIN_STOREDQU,
25752
25753 IX86_BUILTIN_PACKSSWB,
25754 IX86_BUILTIN_PACKSSDW,
25755 IX86_BUILTIN_PACKUSWB,
25756
25757 IX86_BUILTIN_PADDB,
25758 IX86_BUILTIN_PADDW,
25759 IX86_BUILTIN_PADDD,
25760 IX86_BUILTIN_PADDQ,
25761 IX86_BUILTIN_PADDSB,
25762 IX86_BUILTIN_PADDSW,
25763 IX86_BUILTIN_PADDUSB,
25764 IX86_BUILTIN_PADDUSW,
25765 IX86_BUILTIN_PSUBB,
25766 IX86_BUILTIN_PSUBW,
25767 IX86_BUILTIN_PSUBD,
25768 IX86_BUILTIN_PSUBQ,
25769 IX86_BUILTIN_PSUBSB,
25770 IX86_BUILTIN_PSUBSW,
25771 IX86_BUILTIN_PSUBUSB,
25772 IX86_BUILTIN_PSUBUSW,
25773
25774 IX86_BUILTIN_PAND,
25775 IX86_BUILTIN_PANDN,
25776 IX86_BUILTIN_POR,
25777 IX86_BUILTIN_PXOR,
25778
25779 IX86_BUILTIN_PAVGB,
25780 IX86_BUILTIN_PAVGW,
25781
25782 IX86_BUILTIN_PCMPEQB,
25783 IX86_BUILTIN_PCMPEQW,
25784 IX86_BUILTIN_PCMPEQD,
25785 IX86_BUILTIN_PCMPGTB,
25786 IX86_BUILTIN_PCMPGTW,
25787 IX86_BUILTIN_PCMPGTD,
25788
25789 IX86_BUILTIN_PMADDWD,
25790
25791 IX86_BUILTIN_PMAXSW,
25792 IX86_BUILTIN_PMAXUB,
25793 IX86_BUILTIN_PMINSW,
25794 IX86_BUILTIN_PMINUB,
25795
25796 IX86_BUILTIN_PMULHUW,
25797 IX86_BUILTIN_PMULHW,
25798 IX86_BUILTIN_PMULLW,
25799
25800 IX86_BUILTIN_PSADBW,
25801 IX86_BUILTIN_PSHUFW,
25802
25803 IX86_BUILTIN_PSLLW,
25804 IX86_BUILTIN_PSLLD,
25805 IX86_BUILTIN_PSLLQ,
25806 IX86_BUILTIN_PSRAW,
25807 IX86_BUILTIN_PSRAD,
25808 IX86_BUILTIN_PSRLW,
25809 IX86_BUILTIN_PSRLD,
25810 IX86_BUILTIN_PSRLQ,
25811 IX86_BUILTIN_PSLLWI,
25812 IX86_BUILTIN_PSLLDI,
25813 IX86_BUILTIN_PSLLQI,
25814 IX86_BUILTIN_PSRAWI,
25815 IX86_BUILTIN_PSRADI,
25816 IX86_BUILTIN_PSRLWI,
25817 IX86_BUILTIN_PSRLDI,
25818 IX86_BUILTIN_PSRLQI,
25819
25820 IX86_BUILTIN_PUNPCKHBW,
25821 IX86_BUILTIN_PUNPCKHWD,
25822 IX86_BUILTIN_PUNPCKHDQ,
25823 IX86_BUILTIN_PUNPCKLBW,
25824 IX86_BUILTIN_PUNPCKLWD,
25825 IX86_BUILTIN_PUNPCKLDQ,
25826
25827 IX86_BUILTIN_SHUFPS,
25828
25829 IX86_BUILTIN_RCPPS,
25830 IX86_BUILTIN_RCPSS,
25831 IX86_BUILTIN_RSQRTPS,
25832 IX86_BUILTIN_RSQRTPS_NR,
25833 IX86_BUILTIN_RSQRTSS,
25834 IX86_BUILTIN_RSQRTF,
25835 IX86_BUILTIN_SQRTPS,
25836 IX86_BUILTIN_SQRTPS_NR,
25837 IX86_BUILTIN_SQRTSS,
25838
25839 IX86_BUILTIN_UNPCKHPS,
25840 IX86_BUILTIN_UNPCKLPS,
25841
25842 IX86_BUILTIN_ANDPS,
25843 IX86_BUILTIN_ANDNPS,
25844 IX86_BUILTIN_ORPS,
25845 IX86_BUILTIN_XORPS,
25846
25847 IX86_BUILTIN_EMMS,
25848 IX86_BUILTIN_LDMXCSR,
25849 IX86_BUILTIN_STMXCSR,
25850 IX86_BUILTIN_SFENCE,
25851
25852 IX86_BUILTIN_FXSAVE,
25853 IX86_BUILTIN_FXRSTOR,
25854 IX86_BUILTIN_FXSAVE64,
25855 IX86_BUILTIN_FXRSTOR64,
25856
25857 IX86_BUILTIN_XSAVE,
25858 IX86_BUILTIN_XRSTOR,
25859 IX86_BUILTIN_XSAVE64,
25860 IX86_BUILTIN_XRSTOR64,
25861
25862 IX86_BUILTIN_XSAVEOPT,
25863 IX86_BUILTIN_XSAVEOPT64,
25864
25865 /* 3DNow! Original */
25866 IX86_BUILTIN_FEMMS,
25867 IX86_BUILTIN_PAVGUSB,
25868 IX86_BUILTIN_PF2ID,
25869 IX86_BUILTIN_PFACC,
25870 IX86_BUILTIN_PFADD,
25871 IX86_BUILTIN_PFCMPEQ,
25872 IX86_BUILTIN_PFCMPGE,
25873 IX86_BUILTIN_PFCMPGT,
25874 IX86_BUILTIN_PFMAX,
25875 IX86_BUILTIN_PFMIN,
25876 IX86_BUILTIN_PFMUL,
25877 IX86_BUILTIN_PFRCP,
25878 IX86_BUILTIN_PFRCPIT1,
25879 IX86_BUILTIN_PFRCPIT2,
25880 IX86_BUILTIN_PFRSQIT1,
25881 IX86_BUILTIN_PFRSQRT,
25882 IX86_BUILTIN_PFSUB,
25883 IX86_BUILTIN_PFSUBR,
25884 IX86_BUILTIN_PI2FD,
25885 IX86_BUILTIN_PMULHRW,
25886
25887 /* 3DNow! Athlon Extensions */
25888 IX86_BUILTIN_PF2IW,
25889 IX86_BUILTIN_PFNACC,
25890 IX86_BUILTIN_PFPNACC,
25891 IX86_BUILTIN_PI2FW,
25892 IX86_BUILTIN_PSWAPDSI,
25893 IX86_BUILTIN_PSWAPDSF,
25894
25895 /* SSE2 */
25896 IX86_BUILTIN_ADDPD,
25897 IX86_BUILTIN_ADDSD,
25898 IX86_BUILTIN_DIVPD,
25899 IX86_BUILTIN_DIVSD,
25900 IX86_BUILTIN_MULPD,
25901 IX86_BUILTIN_MULSD,
25902 IX86_BUILTIN_SUBPD,
25903 IX86_BUILTIN_SUBSD,
25904
25905 IX86_BUILTIN_CMPEQPD,
25906 IX86_BUILTIN_CMPLTPD,
25907 IX86_BUILTIN_CMPLEPD,
25908 IX86_BUILTIN_CMPGTPD,
25909 IX86_BUILTIN_CMPGEPD,
25910 IX86_BUILTIN_CMPNEQPD,
25911 IX86_BUILTIN_CMPNLTPD,
25912 IX86_BUILTIN_CMPNLEPD,
25913 IX86_BUILTIN_CMPNGTPD,
25914 IX86_BUILTIN_CMPNGEPD,
25915 IX86_BUILTIN_CMPORDPD,
25916 IX86_BUILTIN_CMPUNORDPD,
25917 IX86_BUILTIN_CMPEQSD,
25918 IX86_BUILTIN_CMPLTSD,
25919 IX86_BUILTIN_CMPLESD,
25920 IX86_BUILTIN_CMPNEQSD,
25921 IX86_BUILTIN_CMPNLTSD,
25922 IX86_BUILTIN_CMPNLESD,
25923 IX86_BUILTIN_CMPORDSD,
25924 IX86_BUILTIN_CMPUNORDSD,
25925
25926 IX86_BUILTIN_COMIEQSD,
25927 IX86_BUILTIN_COMILTSD,
25928 IX86_BUILTIN_COMILESD,
25929 IX86_BUILTIN_COMIGTSD,
25930 IX86_BUILTIN_COMIGESD,
25931 IX86_BUILTIN_COMINEQSD,
25932 IX86_BUILTIN_UCOMIEQSD,
25933 IX86_BUILTIN_UCOMILTSD,
25934 IX86_BUILTIN_UCOMILESD,
25935 IX86_BUILTIN_UCOMIGTSD,
25936 IX86_BUILTIN_UCOMIGESD,
25937 IX86_BUILTIN_UCOMINEQSD,
25938
25939 IX86_BUILTIN_MAXPD,
25940 IX86_BUILTIN_MAXSD,
25941 IX86_BUILTIN_MINPD,
25942 IX86_BUILTIN_MINSD,
25943
25944 IX86_BUILTIN_ANDPD,
25945 IX86_BUILTIN_ANDNPD,
25946 IX86_BUILTIN_ORPD,
25947 IX86_BUILTIN_XORPD,
25948
25949 IX86_BUILTIN_SQRTPD,
25950 IX86_BUILTIN_SQRTSD,
25951
25952 IX86_BUILTIN_UNPCKHPD,
25953 IX86_BUILTIN_UNPCKLPD,
25954
25955 IX86_BUILTIN_SHUFPD,
25956
25957 IX86_BUILTIN_LOADUPD,
25958 IX86_BUILTIN_STOREUPD,
25959 IX86_BUILTIN_MOVSD,
25960
25961 IX86_BUILTIN_LOADHPD,
25962 IX86_BUILTIN_LOADLPD,
25963
25964 IX86_BUILTIN_CVTDQ2PD,
25965 IX86_BUILTIN_CVTDQ2PS,
25966
25967 IX86_BUILTIN_CVTPD2DQ,
25968 IX86_BUILTIN_CVTPD2PI,
25969 IX86_BUILTIN_CVTPD2PS,
25970 IX86_BUILTIN_CVTTPD2DQ,
25971 IX86_BUILTIN_CVTTPD2PI,
25972
25973 IX86_BUILTIN_CVTPI2PD,
25974 IX86_BUILTIN_CVTSI2SD,
25975 IX86_BUILTIN_CVTSI642SD,
25976
25977 IX86_BUILTIN_CVTSD2SI,
25978 IX86_BUILTIN_CVTSD2SI64,
25979 IX86_BUILTIN_CVTSD2SS,
25980 IX86_BUILTIN_CVTSS2SD,
25981 IX86_BUILTIN_CVTTSD2SI,
25982 IX86_BUILTIN_CVTTSD2SI64,
25983
25984 IX86_BUILTIN_CVTPS2DQ,
25985 IX86_BUILTIN_CVTPS2PD,
25986 IX86_BUILTIN_CVTTPS2DQ,
25987
25988 IX86_BUILTIN_MOVNTI,
25989 IX86_BUILTIN_MOVNTI64,
25990 IX86_BUILTIN_MOVNTPD,
25991 IX86_BUILTIN_MOVNTDQ,
25992
25993 IX86_BUILTIN_MOVQ128,
25994
25995 /* SSE2 MMX */
25996 IX86_BUILTIN_MASKMOVDQU,
25997 IX86_BUILTIN_MOVMSKPD,
25998 IX86_BUILTIN_PMOVMSKB128,
25999
26000 IX86_BUILTIN_PACKSSWB128,
26001 IX86_BUILTIN_PACKSSDW128,
26002 IX86_BUILTIN_PACKUSWB128,
26003
26004 IX86_BUILTIN_PADDB128,
26005 IX86_BUILTIN_PADDW128,
26006 IX86_BUILTIN_PADDD128,
26007 IX86_BUILTIN_PADDQ128,
26008 IX86_BUILTIN_PADDSB128,
26009 IX86_BUILTIN_PADDSW128,
26010 IX86_BUILTIN_PADDUSB128,
26011 IX86_BUILTIN_PADDUSW128,
26012 IX86_BUILTIN_PSUBB128,
26013 IX86_BUILTIN_PSUBW128,
26014 IX86_BUILTIN_PSUBD128,
26015 IX86_BUILTIN_PSUBQ128,
26016 IX86_BUILTIN_PSUBSB128,
26017 IX86_BUILTIN_PSUBSW128,
26018 IX86_BUILTIN_PSUBUSB128,
26019 IX86_BUILTIN_PSUBUSW128,
26020
26021 IX86_BUILTIN_PAND128,
26022 IX86_BUILTIN_PANDN128,
26023 IX86_BUILTIN_POR128,
26024 IX86_BUILTIN_PXOR128,
26025
26026 IX86_BUILTIN_PAVGB128,
26027 IX86_BUILTIN_PAVGW128,
26028
26029 IX86_BUILTIN_PCMPEQB128,
26030 IX86_BUILTIN_PCMPEQW128,
26031 IX86_BUILTIN_PCMPEQD128,
26032 IX86_BUILTIN_PCMPGTB128,
26033 IX86_BUILTIN_PCMPGTW128,
26034 IX86_BUILTIN_PCMPGTD128,
26035
26036 IX86_BUILTIN_PMADDWD128,
26037
26038 IX86_BUILTIN_PMAXSW128,
26039 IX86_BUILTIN_PMAXUB128,
26040 IX86_BUILTIN_PMINSW128,
26041 IX86_BUILTIN_PMINUB128,
26042
26043 IX86_BUILTIN_PMULUDQ,
26044 IX86_BUILTIN_PMULUDQ128,
26045 IX86_BUILTIN_PMULHUW128,
26046 IX86_BUILTIN_PMULHW128,
26047 IX86_BUILTIN_PMULLW128,
26048
26049 IX86_BUILTIN_PSADBW128,
26050 IX86_BUILTIN_PSHUFHW,
26051 IX86_BUILTIN_PSHUFLW,
26052 IX86_BUILTIN_PSHUFD,
26053
26054 IX86_BUILTIN_PSLLDQI128,
26055 IX86_BUILTIN_PSLLWI128,
26056 IX86_BUILTIN_PSLLDI128,
26057 IX86_BUILTIN_PSLLQI128,
26058 IX86_BUILTIN_PSRAWI128,
26059 IX86_BUILTIN_PSRADI128,
26060 IX86_BUILTIN_PSRLDQI128,
26061 IX86_BUILTIN_PSRLWI128,
26062 IX86_BUILTIN_PSRLDI128,
26063 IX86_BUILTIN_PSRLQI128,
26064
26065 IX86_BUILTIN_PSLLDQ128,
26066 IX86_BUILTIN_PSLLW128,
26067 IX86_BUILTIN_PSLLD128,
26068 IX86_BUILTIN_PSLLQ128,
26069 IX86_BUILTIN_PSRAW128,
26070 IX86_BUILTIN_PSRAD128,
26071 IX86_BUILTIN_PSRLW128,
26072 IX86_BUILTIN_PSRLD128,
26073 IX86_BUILTIN_PSRLQ128,
26074
26075 IX86_BUILTIN_PUNPCKHBW128,
26076 IX86_BUILTIN_PUNPCKHWD128,
26077 IX86_BUILTIN_PUNPCKHDQ128,
26078 IX86_BUILTIN_PUNPCKHQDQ128,
26079 IX86_BUILTIN_PUNPCKLBW128,
26080 IX86_BUILTIN_PUNPCKLWD128,
26081 IX86_BUILTIN_PUNPCKLDQ128,
26082 IX86_BUILTIN_PUNPCKLQDQ128,
26083
26084 IX86_BUILTIN_CLFLUSH,
26085 IX86_BUILTIN_MFENCE,
26086 IX86_BUILTIN_LFENCE,
26087 IX86_BUILTIN_PAUSE,
26088
26089 IX86_BUILTIN_BSRSI,
26090 IX86_BUILTIN_BSRDI,
26091 IX86_BUILTIN_RDPMC,
26092 IX86_BUILTIN_RDTSC,
26093 IX86_BUILTIN_RDTSCP,
26094 IX86_BUILTIN_ROLQI,
26095 IX86_BUILTIN_ROLHI,
26096 IX86_BUILTIN_RORQI,
26097 IX86_BUILTIN_RORHI,
26098
26099 /* SSE3. */
26100 IX86_BUILTIN_ADDSUBPS,
26101 IX86_BUILTIN_HADDPS,
26102 IX86_BUILTIN_HSUBPS,
26103 IX86_BUILTIN_MOVSHDUP,
26104 IX86_BUILTIN_MOVSLDUP,
26105 IX86_BUILTIN_ADDSUBPD,
26106 IX86_BUILTIN_HADDPD,
26107 IX86_BUILTIN_HSUBPD,
26108 IX86_BUILTIN_LDDQU,
26109
26110 IX86_BUILTIN_MONITOR,
26111 IX86_BUILTIN_MWAIT,
26112
26113 /* SSSE3. */
26114 IX86_BUILTIN_PHADDW,
26115 IX86_BUILTIN_PHADDD,
26116 IX86_BUILTIN_PHADDSW,
26117 IX86_BUILTIN_PHSUBW,
26118 IX86_BUILTIN_PHSUBD,
26119 IX86_BUILTIN_PHSUBSW,
26120 IX86_BUILTIN_PMADDUBSW,
26121 IX86_BUILTIN_PMULHRSW,
26122 IX86_BUILTIN_PSHUFB,
26123 IX86_BUILTIN_PSIGNB,
26124 IX86_BUILTIN_PSIGNW,
26125 IX86_BUILTIN_PSIGND,
26126 IX86_BUILTIN_PALIGNR,
26127 IX86_BUILTIN_PABSB,
26128 IX86_BUILTIN_PABSW,
26129 IX86_BUILTIN_PABSD,
26130
26131 IX86_BUILTIN_PHADDW128,
26132 IX86_BUILTIN_PHADDD128,
26133 IX86_BUILTIN_PHADDSW128,
26134 IX86_BUILTIN_PHSUBW128,
26135 IX86_BUILTIN_PHSUBD128,
26136 IX86_BUILTIN_PHSUBSW128,
26137 IX86_BUILTIN_PMADDUBSW128,
26138 IX86_BUILTIN_PMULHRSW128,
26139 IX86_BUILTIN_PSHUFB128,
26140 IX86_BUILTIN_PSIGNB128,
26141 IX86_BUILTIN_PSIGNW128,
26142 IX86_BUILTIN_PSIGND128,
26143 IX86_BUILTIN_PALIGNR128,
26144 IX86_BUILTIN_PABSB128,
26145 IX86_BUILTIN_PABSW128,
26146 IX86_BUILTIN_PABSD128,
26147
26148 /* AMDFAM10 - SSE4A New Instructions. */
26149 IX86_BUILTIN_MOVNTSD,
26150 IX86_BUILTIN_MOVNTSS,
26151 IX86_BUILTIN_EXTRQI,
26152 IX86_BUILTIN_EXTRQ,
26153 IX86_BUILTIN_INSERTQI,
26154 IX86_BUILTIN_INSERTQ,
26155
26156 /* SSE4.1. */
26157 IX86_BUILTIN_BLENDPD,
26158 IX86_BUILTIN_BLENDPS,
26159 IX86_BUILTIN_BLENDVPD,
26160 IX86_BUILTIN_BLENDVPS,
26161 IX86_BUILTIN_PBLENDVB128,
26162 IX86_BUILTIN_PBLENDW128,
26163
26164 IX86_BUILTIN_DPPD,
26165 IX86_BUILTIN_DPPS,
26166
26167 IX86_BUILTIN_INSERTPS128,
26168
26169 IX86_BUILTIN_MOVNTDQA,
26170 IX86_BUILTIN_MPSADBW128,
26171 IX86_BUILTIN_PACKUSDW128,
26172 IX86_BUILTIN_PCMPEQQ,
26173 IX86_BUILTIN_PHMINPOSUW128,
26174
26175 IX86_BUILTIN_PMAXSB128,
26176 IX86_BUILTIN_PMAXSD128,
26177 IX86_BUILTIN_PMAXUD128,
26178 IX86_BUILTIN_PMAXUW128,
26179
26180 IX86_BUILTIN_PMINSB128,
26181 IX86_BUILTIN_PMINSD128,
26182 IX86_BUILTIN_PMINUD128,
26183 IX86_BUILTIN_PMINUW128,
26184
26185 IX86_BUILTIN_PMOVSXBW128,
26186 IX86_BUILTIN_PMOVSXBD128,
26187 IX86_BUILTIN_PMOVSXBQ128,
26188 IX86_BUILTIN_PMOVSXWD128,
26189 IX86_BUILTIN_PMOVSXWQ128,
26190 IX86_BUILTIN_PMOVSXDQ128,
26191
26192 IX86_BUILTIN_PMOVZXBW128,
26193 IX86_BUILTIN_PMOVZXBD128,
26194 IX86_BUILTIN_PMOVZXBQ128,
26195 IX86_BUILTIN_PMOVZXWD128,
26196 IX86_BUILTIN_PMOVZXWQ128,
26197 IX86_BUILTIN_PMOVZXDQ128,
26198
26199 IX86_BUILTIN_PMULDQ128,
26200 IX86_BUILTIN_PMULLD128,
26201
26202 IX86_BUILTIN_ROUNDSD,
26203 IX86_BUILTIN_ROUNDSS,
26204
26205 IX86_BUILTIN_ROUNDPD,
26206 IX86_BUILTIN_ROUNDPS,
26207
26208 IX86_BUILTIN_FLOORPD,
26209 IX86_BUILTIN_CEILPD,
26210 IX86_BUILTIN_TRUNCPD,
26211 IX86_BUILTIN_RINTPD,
26212 IX86_BUILTIN_ROUNDPD_AZ,
26213
26214 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26215 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26216 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26217
26218 IX86_BUILTIN_FLOORPS,
26219 IX86_BUILTIN_CEILPS,
26220 IX86_BUILTIN_TRUNCPS,
26221 IX86_BUILTIN_RINTPS,
26222 IX86_BUILTIN_ROUNDPS_AZ,
26223
26224 IX86_BUILTIN_FLOORPS_SFIX,
26225 IX86_BUILTIN_CEILPS_SFIX,
26226 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26227
26228 IX86_BUILTIN_PTESTZ,
26229 IX86_BUILTIN_PTESTC,
26230 IX86_BUILTIN_PTESTNZC,
26231
26232 IX86_BUILTIN_VEC_INIT_V2SI,
26233 IX86_BUILTIN_VEC_INIT_V4HI,
26234 IX86_BUILTIN_VEC_INIT_V8QI,
26235 IX86_BUILTIN_VEC_EXT_V2DF,
26236 IX86_BUILTIN_VEC_EXT_V2DI,
26237 IX86_BUILTIN_VEC_EXT_V4SF,
26238 IX86_BUILTIN_VEC_EXT_V4SI,
26239 IX86_BUILTIN_VEC_EXT_V8HI,
26240 IX86_BUILTIN_VEC_EXT_V2SI,
26241 IX86_BUILTIN_VEC_EXT_V4HI,
26242 IX86_BUILTIN_VEC_EXT_V16QI,
26243 IX86_BUILTIN_VEC_SET_V2DI,
26244 IX86_BUILTIN_VEC_SET_V4SF,
26245 IX86_BUILTIN_VEC_SET_V4SI,
26246 IX86_BUILTIN_VEC_SET_V8HI,
26247 IX86_BUILTIN_VEC_SET_V4HI,
26248 IX86_BUILTIN_VEC_SET_V16QI,
26249
26250 IX86_BUILTIN_VEC_PACK_SFIX,
26251 IX86_BUILTIN_VEC_PACK_SFIX256,
26252
26253 /* SSE4.2. */
26254 IX86_BUILTIN_CRC32QI,
26255 IX86_BUILTIN_CRC32HI,
26256 IX86_BUILTIN_CRC32SI,
26257 IX86_BUILTIN_CRC32DI,
26258
26259 IX86_BUILTIN_PCMPESTRI128,
26260 IX86_BUILTIN_PCMPESTRM128,
26261 IX86_BUILTIN_PCMPESTRA128,
26262 IX86_BUILTIN_PCMPESTRC128,
26263 IX86_BUILTIN_PCMPESTRO128,
26264 IX86_BUILTIN_PCMPESTRS128,
26265 IX86_BUILTIN_PCMPESTRZ128,
26266 IX86_BUILTIN_PCMPISTRI128,
26267 IX86_BUILTIN_PCMPISTRM128,
26268 IX86_BUILTIN_PCMPISTRA128,
26269 IX86_BUILTIN_PCMPISTRC128,
26270 IX86_BUILTIN_PCMPISTRO128,
26271 IX86_BUILTIN_PCMPISTRS128,
26272 IX86_BUILTIN_PCMPISTRZ128,
26273
26274 IX86_BUILTIN_PCMPGTQ,
26275
26276 /* AES instructions */
26277 IX86_BUILTIN_AESENC128,
26278 IX86_BUILTIN_AESENCLAST128,
26279 IX86_BUILTIN_AESDEC128,
26280 IX86_BUILTIN_AESDECLAST128,
26281 IX86_BUILTIN_AESIMC128,
26282 IX86_BUILTIN_AESKEYGENASSIST128,
26283
26284 /* PCLMUL instruction */
26285 IX86_BUILTIN_PCLMULQDQ128,
26286
26287 /* AVX */
26288 IX86_BUILTIN_ADDPD256,
26289 IX86_BUILTIN_ADDPS256,
26290 IX86_BUILTIN_ADDSUBPD256,
26291 IX86_BUILTIN_ADDSUBPS256,
26292 IX86_BUILTIN_ANDPD256,
26293 IX86_BUILTIN_ANDPS256,
26294 IX86_BUILTIN_ANDNPD256,
26295 IX86_BUILTIN_ANDNPS256,
26296 IX86_BUILTIN_BLENDPD256,
26297 IX86_BUILTIN_BLENDPS256,
26298 IX86_BUILTIN_BLENDVPD256,
26299 IX86_BUILTIN_BLENDVPS256,
26300 IX86_BUILTIN_DIVPD256,
26301 IX86_BUILTIN_DIVPS256,
26302 IX86_BUILTIN_DPPS256,
26303 IX86_BUILTIN_HADDPD256,
26304 IX86_BUILTIN_HADDPS256,
26305 IX86_BUILTIN_HSUBPD256,
26306 IX86_BUILTIN_HSUBPS256,
26307 IX86_BUILTIN_MAXPD256,
26308 IX86_BUILTIN_MAXPS256,
26309 IX86_BUILTIN_MINPD256,
26310 IX86_BUILTIN_MINPS256,
26311 IX86_BUILTIN_MULPD256,
26312 IX86_BUILTIN_MULPS256,
26313 IX86_BUILTIN_ORPD256,
26314 IX86_BUILTIN_ORPS256,
26315 IX86_BUILTIN_SHUFPD256,
26316 IX86_BUILTIN_SHUFPS256,
26317 IX86_BUILTIN_SUBPD256,
26318 IX86_BUILTIN_SUBPS256,
26319 IX86_BUILTIN_XORPD256,
26320 IX86_BUILTIN_XORPS256,
26321 IX86_BUILTIN_CMPSD,
26322 IX86_BUILTIN_CMPSS,
26323 IX86_BUILTIN_CMPPD,
26324 IX86_BUILTIN_CMPPS,
26325 IX86_BUILTIN_CMPPD256,
26326 IX86_BUILTIN_CMPPS256,
26327 IX86_BUILTIN_CVTDQ2PD256,
26328 IX86_BUILTIN_CVTDQ2PS256,
26329 IX86_BUILTIN_CVTPD2PS256,
26330 IX86_BUILTIN_CVTPS2DQ256,
26331 IX86_BUILTIN_CVTPS2PD256,
26332 IX86_BUILTIN_CVTTPD2DQ256,
26333 IX86_BUILTIN_CVTPD2DQ256,
26334 IX86_BUILTIN_CVTTPS2DQ256,
26335 IX86_BUILTIN_EXTRACTF128PD256,
26336 IX86_BUILTIN_EXTRACTF128PS256,
26337 IX86_BUILTIN_EXTRACTF128SI256,
26338 IX86_BUILTIN_VZEROALL,
26339 IX86_BUILTIN_VZEROUPPER,
26340 IX86_BUILTIN_VPERMILVARPD,
26341 IX86_BUILTIN_VPERMILVARPS,
26342 IX86_BUILTIN_VPERMILVARPD256,
26343 IX86_BUILTIN_VPERMILVARPS256,
26344 IX86_BUILTIN_VPERMILPD,
26345 IX86_BUILTIN_VPERMILPS,
26346 IX86_BUILTIN_VPERMILPD256,
26347 IX86_BUILTIN_VPERMILPS256,
26348 IX86_BUILTIN_VPERMIL2PD,
26349 IX86_BUILTIN_VPERMIL2PS,
26350 IX86_BUILTIN_VPERMIL2PD256,
26351 IX86_BUILTIN_VPERMIL2PS256,
26352 IX86_BUILTIN_VPERM2F128PD256,
26353 IX86_BUILTIN_VPERM2F128PS256,
26354 IX86_BUILTIN_VPERM2F128SI256,
26355 IX86_BUILTIN_VBROADCASTSS,
26356 IX86_BUILTIN_VBROADCASTSD256,
26357 IX86_BUILTIN_VBROADCASTSS256,
26358 IX86_BUILTIN_VBROADCASTPD256,
26359 IX86_BUILTIN_VBROADCASTPS256,
26360 IX86_BUILTIN_VINSERTF128PD256,
26361 IX86_BUILTIN_VINSERTF128PS256,
26362 IX86_BUILTIN_VINSERTF128SI256,
26363 IX86_BUILTIN_LOADUPD256,
26364 IX86_BUILTIN_LOADUPS256,
26365 IX86_BUILTIN_STOREUPD256,
26366 IX86_BUILTIN_STOREUPS256,
26367 IX86_BUILTIN_LDDQU256,
26368 IX86_BUILTIN_MOVNTDQ256,
26369 IX86_BUILTIN_MOVNTPD256,
26370 IX86_BUILTIN_MOVNTPS256,
26371 IX86_BUILTIN_LOADDQU256,
26372 IX86_BUILTIN_STOREDQU256,
26373 IX86_BUILTIN_MASKLOADPD,
26374 IX86_BUILTIN_MASKLOADPS,
26375 IX86_BUILTIN_MASKSTOREPD,
26376 IX86_BUILTIN_MASKSTOREPS,
26377 IX86_BUILTIN_MASKLOADPD256,
26378 IX86_BUILTIN_MASKLOADPS256,
26379 IX86_BUILTIN_MASKSTOREPD256,
26380 IX86_BUILTIN_MASKSTOREPS256,
26381 IX86_BUILTIN_MOVSHDUP256,
26382 IX86_BUILTIN_MOVSLDUP256,
26383 IX86_BUILTIN_MOVDDUP256,
26384
26385 IX86_BUILTIN_SQRTPD256,
26386 IX86_BUILTIN_SQRTPS256,
26387 IX86_BUILTIN_SQRTPS_NR256,
26388 IX86_BUILTIN_RSQRTPS256,
26389 IX86_BUILTIN_RSQRTPS_NR256,
26390
26391 IX86_BUILTIN_RCPPS256,
26392
26393 IX86_BUILTIN_ROUNDPD256,
26394 IX86_BUILTIN_ROUNDPS256,
26395
26396 IX86_BUILTIN_FLOORPD256,
26397 IX86_BUILTIN_CEILPD256,
26398 IX86_BUILTIN_TRUNCPD256,
26399 IX86_BUILTIN_RINTPD256,
26400 IX86_BUILTIN_ROUNDPD_AZ256,
26401
26402 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26403 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26404 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26405
26406 IX86_BUILTIN_FLOORPS256,
26407 IX86_BUILTIN_CEILPS256,
26408 IX86_BUILTIN_TRUNCPS256,
26409 IX86_BUILTIN_RINTPS256,
26410 IX86_BUILTIN_ROUNDPS_AZ256,
26411
26412 IX86_BUILTIN_FLOORPS_SFIX256,
26413 IX86_BUILTIN_CEILPS_SFIX256,
26414 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26415
26416 IX86_BUILTIN_UNPCKHPD256,
26417 IX86_BUILTIN_UNPCKLPD256,
26418 IX86_BUILTIN_UNPCKHPS256,
26419 IX86_BUILTIN_UNPCKLPS256,
26420
26421 IX86_BUILTIN_SI256_SI,
26422 IX86_BUILTIN_PS256_PS,
26423 IX86_BUILTIN_PD256_PD,
26424 IX86_BUILTIN_SI_SI256,
26425 IX86_BUILTIN_PS_PS256,
26426 IX86_BUILTIN_PD_PD256,
26427
26428 IX86_BUILTIN_VTESTZPD,
26429 IX86_BUILTIN_VTESTCPD,
26430 IX86_BUILTIN_VTESTNZCPD,
26431 IX86_BUILTIN_VTESTZPS,
26432 IX86_BUILTIN_VTESTCPS,
26433 IX86_BUILTIN_VTESTNZCPS,
26434 IX86_BUILTIN_VTESTZPD256,
26435 IX86_BUILTIN_VTESTCPD256,
26436 IX86_BUILTIN_VTESTNZCPD256,
26437 IX86_BUILTIN_VTESTZPS256,
26438 IX86_BUILTIN_VTESTCPS256,
26439 IX86_BUILTIN_VTESTNZCPS256,
26440 IX86_BUILTIN_PTESTZ256,
26441 IX86_BUILTIN_PTESTC256,
26442 IX86_BUILTIN_PTESTNZC256,
26443
26444 IX86_BUILTIN_MOVMSKPD256,
26445 IX86_BUILTIN_MOVMSKPS256,
26446
26447 /* AVX2 */
26448 IX86_BUILTIN_MPSADBW256,
26449 IX86_BUILTIN_PABSB256,
26450 IX86_BUILTIN_PABSW256,
26451 IX86_BUILTIN_PABSD256,
26452 IX86_BUILTIN_PACKSSDW256,
26453 IX86_BUILTIN_PACKSSWB256,
26454 IX86_BUILTIN_PACKUSDW256,
26455 IX86_BUILTIN_PACKUSWB256,
26456 IX86_BUILTIN_PADDB256,
26457 IX86_BUILTIN_PADDW256,
26458 IX86_BUILTIN_PADDD256,
26459 IX86_BUILTIN_PADDQ256,
26460 IX86_BUILTIN_PADDSB256,
26461 IX86_BUILTIN_PADDSW256,
26462 IX86_BUILTIN_PADDUSB256,
26463 IX86_BUILTIN_PADDUSW256,
26464 IX86_BUILTIN_PALIGNR256,
26465 IX86_BUILTIN_AND256I,
26466 IX86_BUILTIN_ANDNOT256I,
26467 IX86_BUILTIN_PAVGB256,
26468 IX86_BUILTIN_PAVGW256,
26469 IX86_BUILTIN_PBLENDVB256,
26470 IX86_BUILTIN_PBLENDVW256,
26471 IX86_BUILTIN_PCMPEQB256,
26472 IX86_BUILTIN_PCMPEQW256,
26473 IX86_BUILTIN_PCMPEQD256,
26474 IX86_BUILTIN_PCMPEQQ256,
26475 IX86_BUILTIN_PCMPGTB256,
26476 IX86_BUILTIN_PCMPGTW256,
26477 IX86_BUILTIN_PCMPGTD256,
26478 IX86_BUILTIN_PCMPGTQ256,
26479 IX86_BUILTIN_PHADDW256,
26480 IX86_BUILTIN_PHADDD256,
26481 IX86_BUILTIN_PHADDSW256,
26482 IX86_BUILTIN_PHSUBW256,
26483 IX86_BUILTIN_PHSUBD256,
26484 IX86_BUILTIN_PHSUBSW256,
26485 IX86_BUILTIN_PMADDUBSW256,
26486 IX86_BUILTIN_PMADDWD256,
26487 IX86_BUILTIN_PMAXSB256,
26488 IX86_BUILTIN_PMAXSW256,
26489 IX86_BUILTIN_PMAXSD256,
26490 IX86_BUILTIN_PMAXUB256,
26491 IX86_BUILTIN_PMAXUW256,
26492 IX86_BUILTIN_PMAXUD256,
26493 IX86_BUILTIN_PMINSB256,
26494 IX86_BUILTIN_PMINSW256,
26495 IX86_BUILTIN_PMINSD256,
26496 IX86_BUILTIN_PMINUB256,
26497 IX86_BUILTIN_PMINUW256,
26498 IX86_BUILTIN_PMINUD256,
26499 IX86_BUILTIN_PMOVMSKB256,
26500 IX86_BUILTIN_PMOVSXBW256,
26501 IX86_BUILTIN_PMOVSXBD256,
26502 IX86_BUILTIN_PMOVSXBQ256,
26503 IX86_BUILTIN_PMOVSXWD256,
26504 IX86_BUILTIN_PMOVSXWQ256,
26505 IX86_BUILTIN_PMOVSXDQ256,
26506 IX86_BUILTIN_PMOVZXBW256,
26507 IX86_BUILTIN_PMOVZXBD256,
26508 IX86_BUILTIN_PMOVZXBQ256,
26509 IX86_BUILTIN_PMOVZXWD256,
26510 IX86_BUILTIN_PMOVZXWQ256,
26511 IX86_BUILTIN_PMOVZXDQ256,
26512 IX86_BUILTIN_PMULDQ256,
26513 IX86_BUILTIN_PMULHRSW256,
26514 IX86_BUILTIN_PMULHUW256,
26515 IX86_BUILTIN_PMULHW256,
26516 IX86_BUILTIN_PMULLW256,
26517 IX86_BUILTIN_PMULLD256,
26518 IX86_BUILTIN_PMULUDQ256,
26519 IX86_BUILTIN_POR256,
26520 IX86_BUILTIN_PSADBW256,
26521 IX86_BUILTIN_PSHUFB256,
26522 IX86_BUILTIN_PSHUFD256,
26523 IX86_BUILTIN_PSHUFHW256,
26524 IX86_BUILTIN_PSHUFLW256,
26525 IX86_BUILTIN_PSIGNB256,
26526 IX86_BUILTIN_PSIGNW256,
26527 IX86_BUILTIN_PSIGND256,
26528 IX86_BUILTIN_PSLLDQI256,
26529 IX86_BUILTIN_PSLLWI256,
26530 IX86_BUILTIN_PSLLW256,
26531 IX86_BUILTIN_PSLLDI256,
26532 IX86_BUILTIN_PSLLD256,
26533 IX86_BUILTIN_PSLLQI256,
26534 IX86_BUILTIN_PSLLQ256,
26535 IX86_BUILTIN_PSRAWI256,
26536 IX86_BUILTIN_PSRAW256,
26537 IX86_BUILTIN_PSRADI256,
26538 IX86_BUILTIN_PSRAD256,
26539 IX86_BUILTIN_PSRLDQI256,
26540 IX86_BUILTIN_PSRLWI256,
26541 IX86_BUILTIN_PSRLW256,
26542 IX86_BUILTIN_PSRLDI256,
26543 IX86_BUILTIN_PSRLD256,
26544 IX86_BUILTIN_PSRLQI256,
26545 IX86_BUILTIN_PSRLQ256,
26546 IX86_BUILTIN_PSUBB256,
26547 IX86_BUILTIN_PSUBW256,
26548 IX86_BUILTIN_PSUBD256,
26549 IX86_BUILTIN_PSUBQ256,
26550 IX86_BUILTIN_PSUBSB256,
26551 IX86_BUILTIN_PSUBSW256,
26552 IX86_BUILTIN_PSUBUSB256,
26553 IX86_BUILTIN_PSUBUSW256,
26554 IX86_BUILTIN_PUNPCKHBW256,
26555 IX86_BUILTIN_PUNPCKHWD256,
26556 IX86_BUILTIN_PUNPCKHDQ256,
26557 IX86_BUILTIN_PUNPCKHQDQ256,
26558 IX86_BUILTIN_PUNPCKLBW256,
26559 IX86_BUILTIN_PUNPCKLWD256,
26560 IX86_BUILTIN_PUNPCKLDQ256,
26561 IX86_BUILTIN_PUNPCKLQDQ256,
26562 IX86_BUILTIN_PXOR256,
26563 IX86_BUILTIN_MOVNTDQA256,
26564 IX86_BUILTIN_VBROADCASTSS_PS,
26565 IX86_BUILTIN_VBROADCASTSS_PS256,
26566 IX86_BUILTIN_VBROADCASTSD_PD256,
26567 IX86_BUILTIN_VBROADCASTSI256,
26568 IX86_BUILTIN_PBLENDD256,
26569 IX86_BUILTIN_PBLENDD128,
26570 IX86_BUILTIN_PBROADCASTB256,
26571 IX86_BUILTIN_PBROADCASTW256,
26572 IX86_BUILTIN_PBROADCASTD256,
26573 IX86_BUILTIN_PBROADCASTQ256,
26574 IX86_BUILTIN_PBROADCASTB128,
26575 IX86_BUILTIN_PBROADCASTW128,
26576 IX86_BUILTIN_PBROADCASTD128,
26577 IX86_BUILTIN_PBROADCASTQ128,
26578 IX86_BUILTIN_VPERMVARSI256,
26579 IX86_BUILTIN_VPERMDF256,
26580 IX86_BUILTIN_VPERMVARSF256,
26581 IX86_BUILTIN_VPERMDI256,
26582 IX86_BUILTIN_VPERMTI256,
26583 IX86_BUILTIN_VEXTRACT128I256,
26584 IX86_BUILTIN_VINSERT128I256,
26585 IX86_BUILTIN_MASKLOADD,
26586 IX86_BUILTIN_MASKLOADQ,
26587 IX86_BUILTIN_MASKLOADD256,
26588 IX86_BUILTIN_MASKLOADQ256,
26589 IX86_BUILTIN_MASKSTORED,
26590 IX86_BUILTIN_MASKSTOREQ,
26591 IX86_BUILTIN_MASKSTORED256,
26592 IX86_BUILTIN_MASKSTOREQ256,
26593 IX86_BUILTIN_PSLLVV4DI,
26594 IX86_BUILTIN_PSLLVV2DI,
26595 IX86_BUILTIN_PSLLVV8SI,
26596 IX86_BUILTIN_PSLLVV4SI,
26597 IX86_BUILTIN_PSRAVV8SI,
26598 IX86_BUILTIN_PSRAVV4SI,
26599 IX86_BUILTIN_PSRLVV4DI,
26600 IX86_BUILTIN_PSRLVV2DI,
26601 IX86_BUILTIN_PSRLVV8SI,
26602 IX86_BUILTIN_PSRLVV4SI,
26603
26604 IX86_BUILTIN_GATHERSIV2DF,
26605 IX86_BUILTIN_GATHERSIV4DF,
26606 IX86_BUILTIN_GATHERDIV2DF,
26607 IX86_BUILTIN_GATHERDIV4DF,
26608 IX86_BUILTIN_GATHERSIV4SF,
26609 IX86_BUILTIN_GATHERSIV8SF,
26610 IX86_BUILTIN_GATHERDIV4SF,
26611 IX86_BUILTIN_GATHERDIV8SF,
26612 IX86_BUILTIN_GATHERSIV2DI,
26613 IX86_BUILTIN_GATHERSIV4DI,
26614 IX86_BUILTIN_GATHERDIV2DI,
26615 IX86_BUILTIN_GATHERDIV4DI,
26616 IX86_BUILTIN_GATHERSIV4SI,
26617 IX86_BUILTIN_GATHERSIV8SI,
26618 IX86_BUILTIN_GATHERDIV4SI,
26619 IX86_BUILTIN_GATHERDIV8SI,
26620
26621 /* Alternate 4-element gather builtins for the vectorizer, where
26622 all operands are 32 bytes wide. */
26623 IX86_BUILTIN_GATHERALTSIV4DF,
26624 IX86_BUILTIN_GATHERALTDIV8SF,
26625 IX86_BUILTIN_GATHERALTSIV4DI,
26626 IX86_BUILTIN_GATHERALTDIV8SI,
26627
26628 /* TFmode support builtins. */
26629 IX86_BUILTIN_INFQ,
26630 IX86_BUILTIN_HUGE_VALQ,
26631 IX86_BUILTIN_FABSQ,
26632 IX86_BUILTIN_COPYSIGNQ,
26633
26634 /* Vectorizer support builtins. */
26635 IX86_BUILTIN_CPYSGNPS,
26636 IX86_BUILTIN_CPYSGNPD,
26637 IX86_BUILTIN_CPYSGNPS256,
26638 IX86_BUILTIN_CPYSGNPD256,
26639
26640 /* FMA4 instructions. */
26641 IX86_BUILTIN_VFMADDSS,
26642 IX86_BUILTIN_VFMADDSD,
26643 IX86_BUILTIN_VFMADDPS,
26644 IX86_BUILTIN_VFMADDPD,
26645 IX86_BUILTIN_VFMADDPS256,
26646 IX86_BUILTIN_VFMADDPD256,
26647 IX86_BUILTIN_VFMADDSUBPS,
26648 IX86_BUILTIN_VFMADDSUBPD,
26649 IX86_BUILTIN_VFMADDSUBPS256,
26650 IX86_BUILTIN_VFMADDSUBPD256,
26651
26652 /* FMA3 instructions. */
26653 IX86_BUILTIN_VFMADDSS3,
26654 IX86_BUILTIN_VFMADDSD3,
26655
26656 /* XOP instructions. */
26657 IX86_BUILTIN_VPCMOV,
26658 IX86_BUILTIN_VPCMOV_V2DI,
26659 IX86_BUILTIN_VPCMOV_V4SI,
26660 IX86_BUILTIN_VPCMOV_V8HI,
26661 IX86_BUILTIN_VPCMOV_V16QI,
26662 IX86_BUILTIN_VPCMOV_V4SF,
26663 IX86_BUILTIN_VPCMOV_V2DF,
26664 IX86_BUILTIN_VPCMOV256,
26665 IX86_BUILTIN_VPCMOV_V4DI256,
26666 IX86_BUILTIN_VPCMOV_V8SI256,
26667 IX86_BUILTIN_VPCMOV_V16HI256,
26668 IX86_BUILTIN_VPCMOV_V32QI256,
26669 IX86_BUILTIN_VPCMOV_V8SF256,
26670 IX86_BUILTIN_VPCMOV_V4DF256,
26671
26672 IX86_BUILTIN_VPPERM,
26673
26674 IX86_BUILTIN_VPMACSSWW,
26675 IX86_BUILTIN_VPMACSWW,
26676 IX86_BUILTIN_VPMACSSWD,
26677 IX86_BUILTIN_VPMACSWD,
26678 IX86_BUILTIN_VPMACSSDD,
26679 IX86_BUILTIN_VPMACSDD,
26680 IX86_BUILTIN_VPMACSSDQL,
26681 IX86_BUILTIN_VPMACSSDQH,
26682 IX86_BUILTIN_VPMACSDQL,
26683 IX86_BUILTIN_VPMACSDQH,
26684 IX86_BUILTIN_VPMADCSSWD,
26685 IX86_BUILTIN_VPMADCSWD,
26686
26687 IX86_BUILTIN_VPHADDBW,
26688 IX86_BUILTIN_VPHADDBD,
26689 IX86_BUILTIN_VPHADDBQ,
26690 IX86_BUILTIN_VPHADDWD,
26691 IX86_BUILTIN_VPHADDWQ,
26692 IX86_BUILTIN_VPHADDDQ,
26693 IX86_BUILTIN_VPHADDUBW,
26694 IX86_BUILTIN_VPHADDUBD,
26695 IX86_BUILTIN_VPHADDUBQ,
26696 IX86_BUILTIN_VPHADDUWD,
26697 IX86_BUILTIN_VPHADDUWQ,
26698 IX86_BUILTIN_VPHADDUDQ,
26699 IX86_BUILTIN_VPHSUBBW,
26700 IX86_BUILTIN_VPHSUBWD,
26701 IX86_BUILTIN_VPHSUBDQ,
26702
26703 IX86_BUILTIN_VPROTB,
26704 IX86_BUILTIN_VPROTW,
26705 IX86_BUILTIN_VPROTD,
26706 IX86_BUILTIN_VPROTQ,
26707 IX86_BUILTIN_VPROTB_IMM,
26708 IX86_BUILTIN_VPROTW_IMM,
26709 IX86_BUILTIN_VPROTD_IMM,
26710 IX86_BUILTIN_VPROTQ_IMM,
26711
26712 IX86_BUILTIN_VPSHLB,
26713 IX86_BUILTIN_VPSHLW,
26714 IX86_BUILTIN_VPSHLD,
26715 IX86_BUILTIN_VPSHLQ,
26716 IX86_BUILTIN_VPSHAB,
26717 IX86_BUILTIN_VPSHAW,
26718 IX86_BUILTIN_VPSHAD,
26719 IX86_BUILTIN_VPSHAQ,
26720
26721 IX86_BUILTIN_VFRCZSS,
26722 IX86_BUILTIN_VFRCZSD,
26723 IX86_BUILTIN_VFRCZPS,
26724 IX86_BUILTIN_VFRCZPD,
26725 IX86_BUILTIN_VFRCZPS256,
26726 IX86_BUILTIN_VFRCZPD256,
26727
26728 IX86_BUILTIN_VPCOMEQUB,
26729 IX86_BUILTIN_VPCOMNEUB,
26730 IX86_BUILTIN_VPCOMLTUB,
26731 IX86_BUILTIN_VPCOMLEUB,
26732 IX86_BUILTIN_VPCOMGTUB,
26733 IX86_BUILTIN_VPCOMGEUB,
26734 IX86_BUILTIN_VPCOMFALSEUB,
26735 IX86_BUILTIN_VPCOMTRUEUB,
26736
26737 IX86_BUILTIN_VPCOMEQUW,
26738 IX86_BUILTIN_VPCOMNEUW,
26739 IX86_BUILTIN_VPCOMLTUW,
26740 IX86_BUILTIN_VPCOMLEUW,
26741 IX86_BUILTIN_VPCOMGTUW,
26742 IX86_BUILTIN_VPCOMGEUW,
26743 IX86_BUILTIN_VPCOMFALSEUW,
26744 IX86_BUILTIN_VPCOMTRUEUW,
26745
26746 IX86_BUILTIN_VPCOMEQUD,
26747 IX86_BUILTIN_VPCOMNEUD,
26748 IX86_BUILTIN_VPCOMLTUD,
26749 IX86_BUILTIN_VPCOMLEUD,
26750 IX86_BUILTIN_VPCOMGTUD,
26751 IX86_BUILTIN_VPCOMGEUD,
26752 IX86_BUILTIN_VPCOMFALSEUD,
26753 IX86_BUILTIN_VPCOMTRUEUD,
26754
26755 IX86_BUILTIN_VPCOMEQUQ,
26756 IX86_BUILTIN_VPCOMNEUQ,
26757 IX86_BUILTIN_VPCOMLTUQ,
26758 IX86_BUILTIN_VPCOMLEUQ,
26759 IX86_BUILTIN_VPCOMGTUQ,
26760 IX86_BUILTIN_VPCOMGEUQ,
26761 IX86_BUILTIN_VPCOMFALSEUQ,
26762 IX86_BUILTIN_VPCOMTRUEUQ,
26763
26764 IX86_BUILTIN_VPCOMEQB,
26765 IX86_BUILTIN_VPCOMNEB,
26766 IX86_BUILTIN_VPCOMLTB,
26767 IX86_BUILTIN_VPCOMLEB,
26768 IX86_BUILTIN_VPCOMGTB,
26769 IX86_BUILTIN_VPCOMGEB,
26770 IX86_BUILTIN_VPCOMFALSEB,
26771 IX86_BUILTIN_VPCOMTRUEB,
26772
26773 IX86_BUILTIN_VPCOMEQW,
26774 IX86_BUILTIN_VPCOMNEW,
26775 IX86_BUILTIN_VPCOMLTW,
26776 IX86_BUILTIN_VPCOMLEW,
26777 IX86_BUILTIN_VPCOMGTW,
26778 IX86_BUILTIN_VPCOMGEW,
26779 IX86_BUILTIN_VPCOMFALSEW,
26780 IX86_BUILTIN_VPCOMTRUEW,
26781
26782 IX86_BUILTIN_VPCOMEQD,
26783 IX86_BUILTIN_VPCOMNED,
26784 IX86_BUILTIN_VPCOMLTD,
26785 IX86_BUILTIN_VPCOMLED,
26786 IX86_BUILTIN_VPCOMGTD,
26787 IX86_BUILTIN_VPCOMGED,
26788 IX86_BUILTIN_VPCOMFALSED,
26789 IX86_BUILTIN_VPCOMTRUED,
26790
26791 IX86_BUILTIN_VPCOMEQQ,
26792 IX86_BUILTIN_VPCOMNEQ,
26793 IX86_BUILTIN_VPCOMLTQ,
26794 IX86_BUILTIN_VPCOMLEQ,
26795 IX86_BUILTIN_VPCOMGTQ,
26796 IX86_BUILTIN_VPCOMGEQ,
26797 IX86_BUILTIN_VPCOMFALSEQ,
26798 IX86_BUILTIN_VPCOMTRUEQ,
26799
26800 /* LWP instructions. */
26801 IX86_BUILTIN_LLWPCB,
26802 IX86_BUILTIN_SLWPCB,
26803 IX86_BUILTIN_LWPVAL32,
26804 IX86_BUILTIN_LWPVAL64,
26805 IX86_BUILTIN_LWPINS32,
26806 IX86_BUILTIN_LWPINS64,
26807
26808 IX86_BUILTIN_CLZS,
26809
26810 /* RTM */
26811 IX86_BUILTIN_XBEGIN,
26812 IX86_BUILTIN_XEND,
26813 IX86_BUILTIN_XABORT,
26814 IX86_BUILTIN_XTEST,
26815
26816 /* BMI instructions. */
26817 IX86_BUILTIN_BEXTR32,
26818 IX86_BUILTIN_BEXTR64,
26819 IX86_BUILTIN_CTZS,
26820
26821 /* TBM instructions. */
26822 IX86_BUILTIN_BEXTRI32,
26823 IX86_BUILTIN_BEXTRI64,
26824
26825 /* BMI2 instructions. */
26826 IX86_BUILTIN_BZHI32,
26827 IX86_BUILTIN_BZHI64,
26828 IX86_BUILTIN_PDEP32,
26829 IX86_BUILTIN_PDEP64,
26830 IX86_BUILTIN_PEXT32,
26831 IX86_BUILTIN_PEXT64,
26832
26833 /* ADX instructions. */
26834 IX86_BUILTIN_ADDCARRYX32,
26835 IX86_BUILTIN_ADDCARRYX64,
26836
26837 /* FSGSBASE instructions. */
26838 IX86_BUILTIN_RDFSBASE32,
26839 IX86_BUILTIN_RDFSBASE64,
26840 IX86_BUILTIN_RDGSBASE32,
26841 IX86_BUILTIN_RDGSBASE64,
26842 IX86_BUILTIN_WRFSBASE32,
26843 IX86_BUILTIN_WRFSBASE64,
26844 IX86_BUILTIN_WRGSBASE32,
26845 IX86_BUILTIN_WRGSBASE64,
26846
26847 /* RDRND instructions. */
26848 IX86_BUILTIN_RDRAND16_STEP,
26849 IX86_BUILTIN_RDRAND32_STEP,
26850 IX86_BUILTIN_RDRAND64_STEP,
26851
26852 /* RDSEED instructions. */
26853 IX86_BUILTIN_RDSEED16_STEP,
26854 IX86_BUILTIN_RDSEED32_STEP,
26855 IX86_BUILTIN_RDSEED64_STEP,
26856
26857 /* F16C instructions. */
26858 IX86_BUILTIN_CVTPH2PS,
26859 IX86_BUILTIN_CVTPH2PS256,
26860 IX86_BUILTIN_CVTPS2PH,
26861 IX86_BUILTIN_CVTPS2PH256,
26862
26863 /* CFString built-in for darwin */
26864 IX86_BUILTIN_CFSTRING,
26865
26866 /* Builtins to get CPU type and supported features. */
26867 IX86_BUILTIN_CPU_INIT,
26868 IX86_BUILTIN_CPU_IS,
26869 IX86_BUILTIN_CPU_SUPPORTS,
26870
26871 IX86_BUILTIN_MAX
26872 };
26873
26874 /* Table for the ix86 builtin decls. */
26875 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26876
26877 /* Table of all of the builtin functions that are possible with different ISAs
26878 but are waiting to be built until a function is declared to use that
26879 ISA. */
26880 struct builtin_isa {
26881 const char *name; /* function name */
26882 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26883 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26884 bool const_p; /* true if the declaration is constant */
26885 bool set_and_not_built_p;
26886 };
26887
26888 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26889
26890
26891 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26892 of isa_flags to use in the ix86_builtins_isa array. Stores the
26893 function decl in the ix86_builtins array. Returns the function decl, or
26894 NULL_TREE if the builtin was not added.
26895
26896 If the front end has a special hook for builtin functions, delay adding
26897 builtin functions that aren't in the current ISA until the ISA is changed
26898 with function-specific optimization. Doing so can save about 300K for the
26899 default compiler. When the builtin is expanded, check at that time whether
26900 it is valid.
26901
26902 If the front end doesn't have a special hook, record all builtins, even if
26903 they aren't in the current ISA, in case the user uses function-specific
26904 options for a different ISA; this way we don't get scope errors if a
26905 builtin is added in the middle of a function scope. */
26906
26907 static inline tree
26908 def_builtin (HOST_WIDE_INT mask, const char *name,
26909 enum ix86_builtin_func_type tcode,
26910 enum ix86_builtins code)
26911 {
26912 tree decl = NULL_TREE;
26913
26914 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26915 {
26916 ix86_builtins_isa[(int) code].isa = mask;
26917
26918 mask &= ~OPTION_MASK_ISA_64BIT;
26919 if (mask == 0
26920 || (mask & ix86_isa_flags) != 0
26921 || (lang_hooks.builtin_function
26922 == lang_hooks.builtin_function_ext_scope))
26923
26924 {
26925 tree type = ix86_get_builtin_func_type (tcode);
26926 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26927 NULL, NULL_TREE);
26928 ix86_builtins[(int) code] = decl;
26929 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26930 }
26931 else
26932 {
26933 ix86_builtins[(int) code] = NULL_TREE;
26934 ix86_builtins_isa[(int) code].tcode = tcode;
26935 ix86_builtins_isa[(int) code].name = name;
26936 ix86_builtins_isa[(int) code].const_p = false;
26937 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26938 }
26939 }
26940
26941 return decl;
26942 }
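/* Editorial note -- an illustrative sketch, not part of the original
   sources: def_builtin above creates the decl immediately only when the
   builtin's ISA is already enabled (or the front end supports
   out-of-scope registration); otherwise the name and type code are parked
   in ix86_builtins_isa and the decl is created later by
   ix86_add_new_builtins.  A hypothetical registration -- the type code
   and builtin code below are made-up placeholders -- would look like:

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                  V2DF_FTYPE_V2DF_V2DF, IX86_BUILTIN_EXAMPLE);

   With -msse2 the decl is built on the spot; without it, only the table
   entry is filled in.  */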
26943
26944 /* Like def_builtin, but also marks the function decl "const". */
26945
26946 static inline tree
26947 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26948 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26949 {
26950 tree decl = def_builtin (mask, name, tcode, code);
26951 if (decl)
26952 TREE_READONLY (decl) = 1;
26953 else
26954 ix86_builtins_isa[(int) code].const_p = true;
26955
26956 return decl;
26957 }
26958
26959 /* Add any new builtin functions for a given ISA that may not have been
26960 declared. This saves a bit of space compared to adding all of the
26961 declarations to the tree, even when they are not used. */
26962
26963 static void
26964 ix86_add_new_builtins (HOST_WIDE_INT isa)
26965 {
26966 int i;
26967
26968 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26969 {
26970 if ((ix86_builtins_isa[i].isa & isa) != 0
26971 && ix86_builtins_isa[i].set_and_not_built_p)
26972 {
26973 tree decl, type;
26974
26975 /* Don't define the builtin again. */
26976 ix86_builtins_isa[i].set_and_not_built_p = false;
26977
26978 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26979 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26980 type, i, BUILT_IN_MD, NULL,
26981 NULL_TREE);
26982
26983 ix86_builtins[i] = decl;
26984 if (ix86_builtins_isa[i].const_p)
26985 TREE_READONLY (decl) = 1;
26986 }
26987 }
26988 }
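/* Editorial note -- an illustrative sketch, not part of the original
   sources: ix86_add_new_builtins above is what makes the deferred entries
   appear once the ISA set grows after option processing, typically via
   the target attribute.  Assuming a compilation without -mavx2 that
   includes <immintrin.h>:

     __attribute__((target("avx2")))
     __m256i g (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);  // the underlying __builtin_ia32_*
     }                                  // decl is only created when the
                                        // avx2 ISA bit is switched on here.

   The intrinsic name is shown only as an assumed example of an AVX2
   builtin user.  */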
26989
26990 /* Bits for builtin_description.flag. */
26991
26992 /* Set when we don't support the comparison natively, and should
26993 swap_comparison in order to support it. */
26994 #define BUILTIN_DESC_SWAP_OPERANDS 1
26995
26996 struct builtin_description
26997 {
26998 const HOST_WIDE_INT mask;
26999 const enum insn_code icode;
27000 const char *const name;
27001 const enum ix86_builtins code;
27002 const enum rtx_code comparison;
27003 const int flag;
27004 };
27005
27006 static const struct builtin_description bdesc_comi[] =
27007 {
27008 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27009 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27010 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27011 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27012 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27013 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27014 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27015 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27016 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27017 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27018 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27019 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27021 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27022 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27023 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27024 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27025 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27026 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27027 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27028 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27029 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27030 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27031 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27032 };
27033
27034 static const struct builtin_description bdesc_pcmpestr[] =
27035 {
27036 /* SSE4.2 */
27037 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27038 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27039 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27040 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27041 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27042 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27043 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27044 };
27045
27046 static const struct builtin_description bdesc_pcmpistr[] =
27047 {
27048 /* SSE4.2 */
27049 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27050 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27051 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27052 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27053 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27054 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27055 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27056 };
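/* bdesc_pcmpistr mirrors bdesc_pcmpestr for the implicit-length string
   forms; the flag field is interpreted the same way, presumably by
   ix86_expand_sse_pcmpistr.  */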
27057
27058 /* Special builtins with variable number of arguments. */
27059 static const struct builtin_description bdesc_special_args[] =
27060 {
27061 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27062 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27063 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27064
27065 /* MMX */
27066 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27067
27068 /* 3DNow! */
27069 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27070
27071 /* FXSR, XSAVE and XSAVEOPT */
27072 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27073 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27074 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27075 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27076 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27077
27078 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27079 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27080 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27081 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27082 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27083
27084 /* SSE */
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27088
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27093
27094 /* SSE or 3DNow!A */
27095 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27096 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27097
27098 /* SSE2 */
27099 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27100 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27101 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27102 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27103 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27104 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27105 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27106 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27107 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27108 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27109
27110 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27111 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27112
27113 /* SSE3 */
27114 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27115
27116 /* SSE4.1 */
27117 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27118
27119 /* SSE4A */
27120 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27121 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27122
27123 /* AVX */
27124 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27125 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27126
27127 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27128 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27129 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27130 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27131 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27132
27133 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27134 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27135 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27136 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27137 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27138 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27139 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27140
27141 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27142 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27143 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27144
27145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27146 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27147 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27148 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27149 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27150 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27151 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27152 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27153
27154 /* AVX2 */
27155 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27156 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27157 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27158 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27159 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27160 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27161 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27162 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27163 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27164
27165 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27166 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27167 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27168 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27169 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27170 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27171
27172 /* FSGSBASE */
27173 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27174 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27175 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27176 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27177 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27178 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27179 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27180 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27181
27182 /* RTM */
27183 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27184 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27185 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
27186 };
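/* For reference, these description tables are walked later in this file
   (ix86_init_mmx_sse_builtins) and each named entry is registered as a GCC
   builtin; the loop below is only a rough sketch of that registration.  */
#if 0
  const struct builtin_description *d;
  size_t i;

  for (i = 0, d = bdesc_special_args;
       i < ARRAY_SIZE (bdesc_special_args);
       i++, d++)
    if (d->name)
      def_builtin (d->mask, d->name,
                   (enum ix86_builtin_func_type) d->flag, d->code);
#endif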
27187
27188 /* Builtins with variable number of arguments. */
27189 static const struct builtin_description bdesc_args[] =
27190 {
27191 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27192 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27193 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27194 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27195 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27196 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27197 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27198
27199 /* MMX */
27200 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27201 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27202 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27203 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27204 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27205 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27206
27207 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27208 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27209 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27210 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27211 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27212 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27213 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27214 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27215
27216 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27217 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27218
27219 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27220 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27221 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27222 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27223
27224 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27225 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27226 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27227 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27228 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27229 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27230
27231 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27232 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27233 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27234 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27235 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27236 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27237
27238 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27239 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27240 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27241
27242 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27243
27244 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27245 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27246 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27247 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27248 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27249 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27250
27251 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27252 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27253 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27254 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27255 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27256 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27257
27258 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27259 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27260 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27261 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
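/* In the shift entries above, the _COUNT suffix on the prototype marks the
   last operand as a shift count: a plain integer for the psllwi/pslldi/psllqi
   style forms and an MMX register operand for the others; the argument
   expander is expected to accept either form of count.  */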
27262
27263 /* 3DNow! */
27264 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27265 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27266 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27267 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27268
27269 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27270 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27271 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27272 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27273 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27274 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27275 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27276 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27277 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27278 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27279 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27280 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27281 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27282 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27283 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27284
27285 /* 3DNow!A */
27286 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27287 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27288 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27289 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27290 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27291 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27292
27293 /* SSE */
27294 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27295 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27296 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27297 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27298 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27299 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27300 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27301 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27302 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27303 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27304 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27305 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27306
27307 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27308
27309 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27310 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27311 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27312 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27313 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27314 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27315 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27316 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27317
27318 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27319 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27320 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27321 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27322 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27323 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27324 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27325 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27326 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27327 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27328 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27329 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27330 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27331 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27332 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27333 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27334 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27335 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27336 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27337 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27338 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27339 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
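/* A _SWAP suffix on the prototype here indicates that the two vector
   operands are exchanged before expansion, which is how the cmpgt/cmpge
   (and cmpngt/cmpnge) builtins are expressed via the lt/le and unge/ungt
   entries above.  */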
27340
27341 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27342 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27345
27346 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27347 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27348 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27349 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27350
27351 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27352
27353 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27354 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27355 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27356 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27357 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27358
27359 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27360 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27361 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27362
27363 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27364
27365 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27366 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27367 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27368
27369 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27370 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27371
27372 /* SSE MMX or 3DNow!A */
27373 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27374 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27375 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27376
27377 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27378 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27379 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27380 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27381
27382 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27383 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27384
27385 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27386
27387 /* SSE2 */
27388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27389
27390 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27393 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27394 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27395
27396 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27397 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27399 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27400 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27401
27402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27403
27404 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27405 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27406 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27407 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27408
27409 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27411 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27412
27413 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27414 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27415 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27416 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27417 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27418 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27420 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27421
27422 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27423 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27426 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27429 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27430 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27431 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27435 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27437 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27439 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27442
27443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27444 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27445 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27446 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27447
27448 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27449 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27450 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27451 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27452
27453 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27454
27455 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27456 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27457 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27458
27459 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27460
27461 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27462 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27463 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27464 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27465 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27466 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27467 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27468 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27469
27470 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27471 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27472 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27473 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27474 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27475 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27476 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27478
27479 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27480 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27481
27482 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27483 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27484 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27485 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27486
27487 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27488 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27489
27490 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27491 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27492 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27493 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27494 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27495 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27496
27497 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27498 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27499 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27500 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27501
27502 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27503 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27504 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27505 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27506 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27507 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27508 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27509 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27510
27511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27514
27515 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27516 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27517
27518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27519 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27520
27521 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27522
27523 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27524 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27525 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27526 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27527
27528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27529 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27530 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27531 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27532 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27533 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27534 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27535
27536 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27537 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27538 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27539 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27540 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27541 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27542 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27543
27544 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27545 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27546 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27547 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27548
27549 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27550 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27551 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27552
27553 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27554
27555 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27556
27557 /* SSE2 MMX */
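/* 64-bit (MMX register) paddq/psubq.  Although they operate on __m64
   values, these instructions were introduced with SSE2, hence the
   OPTION_MASK_ISA_SSE2 gate.  */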
27558 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27559 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27560
27561 /* SSE3 */
27562 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
27563 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27564
27565 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27566 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27567 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27568 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27569 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27570 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27571
27572 /* SSSE3 */
27573 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27574 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27575 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27576 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27577 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27578 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27579
27580 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27581 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27582 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27583 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27584 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27585 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27586 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27587 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27588 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27589 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27590 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27591 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27592 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27593 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27594 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27595 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27596 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27597 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27598 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27599 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27600 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27601 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27602 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27603 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27604
27605 /* SSSE3. */
27606 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27607 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27608
27609 /* SSE4.1 */
27610 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27611 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27612 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27613 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27614 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27615 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27616 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27617 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27618 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27619 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27620
27621 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27622 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27623 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27624 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27625 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27626 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27627 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27628 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27629 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27630 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27631 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27632 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27633 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27634
27635 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27636 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27637 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27638 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27639 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27640 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27641 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27642 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27643 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27644 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27645 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27646 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27647
27648 /* SSE4.1 */
27649 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27650 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27651 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27652 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27653
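/* For the floor/ceil/trunc/rint variants below, the rtx_code slot is reused
   to carry the ROUND_* constant (ROUND_FLOOR, ROUND_CEIL, ...) that the
   builtin expander emits as the rounding-control immediate of the
   roundpd/roundps patterns (see the ix86_expand_sse_round helpers elsewhere
   in this file).  */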
27654 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27655 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27656 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27657 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27658
27659 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27660 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27661
27662 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27663 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27664
27665 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27666 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27667 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27668 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27669
27670 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27671 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27672
27673 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27674 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27675
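/* For the ptest entries the rtx_code slot selects which PTEST flag result
   the builtin returns: EQ reads ZF (ptestz), LTU reads CF (ptestc), and GTU
   means neither flag is set (ptestnzc).  */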
27676 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27677 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27678 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27679
27680 /* SSE4.2 */
27681 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27682 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27683 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27684 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27685 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27686
27687 /* SSE4A */
27688 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27689 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27690 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27691 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27692
27693 /* AES */
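/* The name slot is 0 for the AES and PCLMUL entries below: they are
   registered separately, under the AES/PCLMUL ISA flags in addition to SSE2,
   elsewhere in this file, so the generic registration loop over this table
   skips them.  */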
27694 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27695 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27696
27697 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27698 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27699 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27700 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27701
27702 /* PCLMUL */
27703 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27704
27705 /* AVX */
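/* These are the builtins behind the <immintrin.h> AVX intrinsics; for
   example, _mm256_add_pd expands to __builtin_ia32_addpd256 below.  A rough
   sketch of that mapping (not part of this table):

     __m256d _mm256_add_pd (__m256d a, __m256d b)
     { return (__m256d) __builtin_ia32_addpd256 ((__v4df) a, (__v4df) b); }  */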
27706 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27707 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27709 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27710 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27711 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27712 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27713 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27714 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27715 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27716 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27717 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27718 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27719 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27720 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27721 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27722 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27723 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27724 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27725 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27726 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27727 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27728 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27729 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27730 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27731 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27732
27733 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27734 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27735 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27736 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27737
27738 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27739 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27740 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27742 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27743 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27748 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27753 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27754 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27755 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27757 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27758 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27759 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27760 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27761 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27763 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27764 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27765 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27767 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27768 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27769 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27770 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27771 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27772
27773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27774 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27775 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27776
27777 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27779 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27781 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27782
27783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27784
27785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27786 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27787
27788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27792
27793 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27794 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27795
27796 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27797 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27798
27799 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27800 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27802 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27803
27804 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27805 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27806
27807 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27808 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27809
27810 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27811 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27812 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27813 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27814
27815 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27816 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27817 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27818 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27819 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27820 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27821
27822 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27823 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27824 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27825 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27826 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27827 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27830 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27831 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27832 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27835 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27836 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27837
27838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27840
27841 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27842 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27843
27844 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27845
27846 /* AVX2 */
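/* The AVX2 entries largely mirror the 128-bit SSE2/SSSE3/SSE4.1 integer
   builtins above, widened to 256-bit vector modes and with a "256" suffix
   on the builtin name.  */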
27847 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27848 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27849 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27850 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27851 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27852 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27853 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27854 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27855 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27856 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27857 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27858 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27862 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27863 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27864 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27866 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27867 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27868 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27872 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27873 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27874 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27875 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27876 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27877 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27879 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27886 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27887 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27888 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27889 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27890 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27891 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27892 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27893 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27894 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27895 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27896 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27897 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27911 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27913 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27914 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27915 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27916 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27917 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27918 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27920 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27921 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27922 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27923 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27924 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27925 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27926 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27927 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27928 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27929 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27930 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27931 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27932 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27933 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27934 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27935 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27936 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27937 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27938 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27939 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27940 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27941 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27942 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27943 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27944 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27945 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27946 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27947 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27948 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27953 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27957 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27958 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27959 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27961 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27972 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27973 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27974 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27975 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27976 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27977 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27978 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27979 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27980 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27981 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27982 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27983 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27984 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27985 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27986 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27987 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27988 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27989 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27990 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27991 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27993
27994 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27995
27996 /* BMI */
27997 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27998 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27999 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28000
28001 /* TBM */
28002 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28003 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28004
28005 /* F16C */
28006 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28007 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28008 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28009 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28010
28011 /* BMI2 */
28012 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28013 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28014 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28015 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28016 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28017 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28018 };
28019
28020 /* FMA4 and XOP. */
28021 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28022 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28023 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28024 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28025 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28026 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28027 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28028 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28029 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28030 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28031 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28032 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28033 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28034 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28035 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28036 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28037 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28038 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28039 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28040 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28041 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28042 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28043 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28044 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28045 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28046 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28047 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28048 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28049 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28050 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28051 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28052 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28053 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28054 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28055 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28056 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28057 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28058 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28059 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28060 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28061 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28062 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28063 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28064 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28065 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28066 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28067 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28068 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28069 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28070 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28071 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28072 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
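
/* The MULTI_ARG_* macros above are simply aliases for ix86_builtin_func_type
values, named by operand shape: e.g. MULTI_ARG_3_SF is
V4SF_FTYPE_V4SF_V4SF_V4SF, three V4SF operands yielding a V4SF result, so
the table entries below can be read by argument count and element type. */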
28073
28074 static const struct builtin_description bdesc_multi_arg[] =
28075 {
28076 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28077 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28078 UNKNOWN, (int)MULTI_ARG_3_SF },
28079 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28080 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28081 UNKNOWN, (int)MULTI_ARG_3_DF },
28082
28083 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28084 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28085 UNKNOWN, (int)MULTI_ARG_3_SF },
28086 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28087 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28088 UNKNOWN, (int)MULTI_ARG_3_DF },
28089
28090 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28091 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28092 UNKNOWN, (int)MULTI_ARG_3_SF },
28093 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28094 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28095 UNKNOWN, (int)MULTI_ARG_3_DF },
28096 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28097 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28098 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28099 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28100 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28101 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28102
28103 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28104 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28105 UNKNOWN, (int)MULTI_ARG_3_SF },
28106 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28107 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28108 UNKNOWN, (int)MULTI_ARG_3_DF },
28109 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28110 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28111 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28112 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28113 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28114 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28115
28116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28123
28124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28128 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28131
28132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28133
28134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28144 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28146
28147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28151 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28152 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28160 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28163
28164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28168 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28170
28171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28177 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28186
28187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28191 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28192 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28193 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28194
28195 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28196 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28197 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28198 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28199 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28200 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28201 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28202
28203 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28204 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28210
28211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28212 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28218
28219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28220 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28222 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28226
28227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28234
28235 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28242
28243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28250
28251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28252 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28259
28260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28268
28269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28273
28274 };
28275 \f
28276 /* TM vector builtins. */
28277
28278 /* Reuse the existing x86-specific `struct builtin_description' because
28279 we're lazy.  Add casts to make them fit. */
28280 static const struct builtin_description bdesc_tm[] =
28281 {
28282 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28283 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28284 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28285 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28286 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28287 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28288 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28289
28290 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28291 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28292 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28293 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28294 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28295 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28296 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28297
28298 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28299 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28300 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28301 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28302 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28303 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28304 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28305
28306 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28307 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28308 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28309 };
28310
28311 /* TM callbacks. */
28312
28313 /* Return the builtin decl needed to load a vector of TYPE. */
28314
28315 static tree
28316 ix86_builtin_tm_load (tree type)
28317 {
28318 if (TREE_CODE (type) == VECTOR_TYPE)
28319 {
28320 switch (tree_low_cst (TYPE_SIZE (type), 1))
28321 {
28322 case 64:
28323 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28324 case 128:
28325 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28326 case 256:
28327 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28328 }
28329 }
28330 return NULL_TREE;
28331 }
28332
28333 /* Return the builtin decl needed to store a vector of TYPE. */
28334
28335 static tree
28336 ix86_builtin_tm_store (tree type)
28337 {
28338 if (TREE_CODE (type) == VECTOR_TYPE)
28339 {
28340 switch (tree_low_cst (TYPE_SIZE (type), 1))
28341 {
28342 case 64:
28343 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28344 case 128:
28345 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28346 case 256:
28347 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28348 }
28349 }
28350 return NULL_TREE;
28351 }
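
/* As an illustration of these two callbacks: for a 128-bit vector type such
as V4SF, ix86_builtin_tm_load and ix86_builtin_tm_store return the decls
registered below for __builtin__ITM_RM128 and __builtin__ITM_WM128
respectively, while 64- and 256-bit vectors map to the M64 and M256
variants. */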
28352 \f
28353 /* Initialize the transactional memory vector load/store builtins. */
28354
28355 static void
28356 ix86_init_tm_builtins (void)
28357 {
28358 enum ix86_builtin_func_type ftype;
28359 const struct builtin_description *d;
28360 size_t i;
28361 tree decl;
28362 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28363 tree attrs_log, attrs_type_log;
28364
28365 if (!flag_tm)
28366 return;
28367
28368 /* If there are no builtins defined, we must be compiling in a
28369 language without trans-mem support. */
28370 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28371 return;
28372
28373 /* Use whatever attributes a normal TM load has. */
28374 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28375 attrs_load = DECL_ATTRIBUTES (decl);
28376 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28377 /* Use whatever attributes a normal TM store has. */
28378 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28379 attrs_store = DECL_ATTRIBUTES (decl);
28380 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28381 /* Use whatever attributes a normal TM log has. */
28382 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28383 attrs_log = DECL_ATTRIBUTES (decl);
28384 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28385
28386 for (i = 0, d = bdesc_tm;
28387 i < ARRAY_SIZE (bdesc_tm);
28388 i++, d++)
28389 {
28390 if ((d->mask & ix86_isa_flags) != 0
28391 || (lang_hooks.builtin_function
28392 == lang_hooks.builtin_function_ext_scope))
28393 {
28394 tree type, attrs, attrs_type;
28395 enum built_in_function code = (enum built_in_function) d->code;
28396
28397 ftype = (enum ix86_builtin_func_type) d->flag;
28398 type = ix86_get_builtin_func_type (ftype);
28399
28400 if (BUILTIN_TM_LOAD_P (code))
28401 {
28402 attrs = attrs_load;
28403 attrs_type = attrs_type_load;
28404 }
28405 else if (BUILTIN_TM_STORE_P (code))
28406 {
28407 attrs = attrs_store;
28408 attrs_type = attrs_type_store;
28409 }
28410 else
28411 {
28412 attrs = attrs_log;
28413 attrs_type = attrs_type_log;
28414 }
28415 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28416 /* The builtin without the prefix for
28417 calling it directly. */
28418 d->name + strlen ("__builtin_"),
28419 attrs);
28420 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28421 set the TYPE_ATTRIBUTES. */
28422 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28423
28424 set_builtin_decl (code, decl, false);
28425 }
28426 }
28427 }
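
/* Note that add_builtin_function is passed d->name + strlen ("__builtin_"),
so each TM builtin is also callable directly under its library name, e.g.
__builtin__ITM_RM128 is usable as _ITM_RM128. */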
28428
28429 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28430 not in the current target ISA, so that the user can compile particular
28431 modules with target-specific options that differ from the command-line
28432 options. */
28433 static void
28434 ix86_init_mmx_sse_builtins (void)
28435 {
28436 const struct builtin_description * d;
28437 enum ix86_builtin_func_type ftype;
28438 size_t i;
28439
28440 /* Add all special builtins with variable number of operands. */
28441 for (i = 0, d = bdesc_special_args;
28442 i < ARRAY_SIZE (bdesc_special_args);
28443 i++, d++)
28444 {
28445 if (d->name == 0)
28446 continue;
28447
28448 ftype = (enum ix86_builtin_func_type) d->flag;
28449 def_builtin (d->mask, d->name, ftype, d->code);
28450 }
28451
28452 /* Add all builtins with variable number of operands. */
28453 for (i = 0, d = bdesc_args;
28454 i < ARRAY_SIZE (bdesc_args);
28455 i++, d++)
28456 {
28457 if (d->name == 0)
28458 continue;
28459
28460 ftype = (enum ix86_builtin_func_type) d->flag;
28461 def_builtin_const (d->mask, d->name, ftype, d->code);
28462 }
28463
28464 /* pcmpestr[im] insns. */
28465 for (i = 0, d = bdesc_pcmpestr;
28466 i < ARRAY_SIZE (bdesc_pcmpestr);
28467 i++, d++)
28468 {
28469 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28470 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28471 else
28472 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28473 def_builtin_const (d->mask, d->name, ftype, d->code);
28474 }
28475
28476 /* pcmpistr[im] insns. */
28477 for (i = 0, d = bdesc_pcmpistr;
28478 i < ARRAY_SIZE (bdesc_pcmpistr);
28479 i++, d++)
28480 {
28481 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28482 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28483 else
28484 ftype = INT_FTYPE_V16QI_V16QI_INT;
28485 def_builtin_const (d->mask, d->name, ftype, d->code);
28486 }
28487
28488 /* comi/ucomi insns. */
28489 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28490 {
28491 if (d->mask == OPTION_MASK_ISA_SSE2)
28492 ftype = INT_FTYPE_V2DF_V2DF;
28493 else
28494 ftype = INT_FTYPE_V4SF_V4SF;
28495 def_builtin_const (d->mask, d->name, ftype, d->code);
28496 }
28497
28498 /* SSE */
28499 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28500 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28501 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28502 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28503
28504 /* SSE or 3DNow!A */
28505 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28506 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28507 IX86_BUILTIN_MASKMOVQ);
28508
28509 /* SSE2 */
28510 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28511 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28512
28513 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28514 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28515 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28516 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28517
28518 /* SSE3. */
28519 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28520 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28521 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28522 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28523
28524 /* AES */
28525 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28526 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28527 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28528 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28529 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28530 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28531 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28532 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28533 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28534 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28535 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28536 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28537
28538 /* PCLMUL */
28539 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28540 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28541
28542 /* RDRND */
28543 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28544 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28545 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28546 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28547 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28548 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28549 IX86_BUILTIN_RDRAND64_STEP);
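
/* Illustrative use of the rdrand step builtins: they return nonzero when the
hardware produced a random value, matching the INT_FTYPE_PUNSIGNED
signature above, e.g.

     unsigned int r;
     if (__builtin_ia32_rdrand32_step (&r))
       ... use the random value in r ...  */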
28550
28551 /* AVX2 */
28552 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28553 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28554 IX86_BUILTIN_GATHERSIV2DF);
28555
28556 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28557 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28558 IX86_BUILTIN_GATHERSIV4DF);
28559
28560 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28561 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28562 IX86_BUILTIN_GATHERDIV2DF);
28563
28564 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28565 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28566 IX86_BUILTIN_GATHERDIV4DF);
28567
28568 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28569 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28570 IX86_BUILTIN_GATHERSIV4SF);
28571
28572 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28573 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28574 IX86_BUILTIN_GATHERSIV8SF);
28575
28576 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28577 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28578 IX86_BUILTIN_GATHERDIV4SF);
28579
28580 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28581 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28582 IX86_BUILTIN_GATHERDIV8SF);
28583
28584 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28585 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28586 IX86_BUILTIN_GATHERSIV2DI);
28587
28588 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28589 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28590 IX86_BUILTIN_GATHERSIV4DI);
28591
28592 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28593 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28594 IX86_BUILTIN_GATHERDIV2DI);
28595
28596 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28597 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28598 IX86_BUILTIN_GATHERDIV4DI);
28599
28600 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28601 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28602 IX86_BUILTIN_GATHERSIV4SI);
28603
28604 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28605 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28606 IX86_BUILTIN_GATHERSIV8SI);
28607
28608 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28609 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28610 IX86_BUILTIN_GATHERDIV4SI);
28611
28612 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28613 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28614 IX86_BUILTIN_GATHERDIV8SI);
28615
28616 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
28617 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28618 IX86_BUILTIN_GATHERALTSIV4DF);
28619
28620 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
28621 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28622 IX86_BUILTIN_GATHERALTDIV8SF);
28623
28624 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
28625 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28626 IX86_BUILTIN_GATHERALTSIV4DI);
28627
28628 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
28629 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28630 IX86_BUILTIN_GATHERALTDIV8SI);
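
/* Reading the FTYPE strings above, each gather builtin takes its operands
in the order (src, base pointer, index vector, mask, scale); e.g.
__builtin_ia32_gathersiv2df is V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT. */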
28631
28632 /* RTM. */
28633 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28634 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28635
28636 /* MMX access to the vec_init patterns. */
28637 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28638 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28639
28640 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28641 V4HI_FTYPE_HI_HI_HI_HI,
28642 IX86_BUILTIN_VEC_INIT_V4HI);
28643
28644 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28645 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28646 IX86_BUILTIN_VEC_INIT_V8QI);
28647
28648 /* Access to the vec_extract patterns. */
28649 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28650 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28651 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28652 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28653 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28654 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28655 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28656 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28657 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28658 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28659
28660 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28661 "__builtin_ia32_vec_ext_v4hi",
28662 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28663
28664 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28665 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28666
28667 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28668 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28669
28670 /* Access to the vec_set patterns. */
28671 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28672 "__builtin_ia32_vec_set_v2di",
28673 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28674
28675 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28676 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28677
28678 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28679 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28680
28681 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28682 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28683
28684 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28685 "__builtin_ia32_vec_set_v4hi",
28686 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28687
28688 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28689 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28690
28691 /* RDSEED */
28692 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28693 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28694 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28695 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28696 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28697 "__builtin_ia32_rdseed_di_step",
28698 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28699
28700 /* ADCX */
28701 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28702 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28703 def_builtin (OPTION_MASK_ISA_64BIT,
28704 "__builtin_ia32_addcarryx_u64",
28705 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28706 IX86_BUILTIN_ADDCARRYX64);
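
/* Illustrative use of the add-with-carry builtins: the carry is threaded
through the unsigned char argument and return value, matching
UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED above, e.g.

     unsigned int sum;
     unsigned char c = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);  */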
28707
28708 /* Add the multi-arg builtins (FMA4, FMA and XOP). */
28709 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28710 {
28711 if (d->name == 0)
28712 continue;
28713
28714 ftype = (enum ix86_builtin_func_type) d->flag;
28715 def_builtin_const (d->mask, d->name, ftype, d->code);
28716 }
28717 }
28718
28719 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28720 to return a pointer to VERSION_DECL if the outcome of the expression
28721 formed by PREDICATE_CHAIN is true. This function will be called during
28722 version dispatch to decide which function version to execute. It returns
28723 the basic block at the end, to which more conditions can be added. */
28724
28725 static basic_block
28726 add_condition_to_bb (tree function_decl, tree version_decl,
28727 tree predicate_chain, basic_block new_bb)
28728 {
28729 gimple return_stmt;
28730 tree convert_expr, result_var;
28731 gimple convert_stmt;
28732 gimple call_cond_stmt;
28733 gimple if_else_stmt;
28734
28735 basic_block bb1, bb2, bb3;
28736 edge e12, e23;
28737
28738 tree cond_var, and_expr_var = NULL_TREE;
28739 gimple_seq gseq;
28740
28741 tree predicate_decl, predicate_arg;
28742
28743 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28744
28745 gcc_assert (new_bb != NULL);
28746 gseq = bb_seq (new_bb);
28747
28748
28749 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28750 build_fold_addr_expr (version_decl));
28751 result_var = create_tmp_var (ptr_type_node, NULL);
28752 convert_stmt = gimple_build_assign (result_var, convert_expr);
28753 return_stmt = gimple_build_return (result_var);
28754
28755 if (predicate_chain == NULL_TREE)
28756 {
28757 gimple_seq_add_stmt (&gseq, convert_stmt);
28758 gimple_seq_add_stmt (&gseq, return_stmt);
28759 set_bb_seq (new_bb, gseq);
28760 gimple_set_bb (convert_stmt, new_bb);
28761 gimple_set_bb (return_stmt, new_bb);
28762 pop_cfun ();
28763 return new_bb;
28764 }
28765
28766 while (predicate_chain != NULL)
28767 {
28768 cond_var = create_tmp_var (integer_type_node, NULL);
28769 predicate_decl = TREE_PURPOSE (predicate_chain);
28770 predicate_arg = TREE_VALUE (predicate_chain);
28771 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28772 gimple_call_set_lhs (call_cond_stmt, cond_var);
28773
28774 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28775 gimple_set_bb (call_cond_stmt, new_bb);
28776 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28777
28778 predicate_chain = TREE_CHAIN (predicate_chain);
28779
28780 if (and_expr_var == NULL)
28781 and_expr_var = cond_var;
28782 else
28783 {
28784 gimple assign_stmt;
28785 /* Use MIN_EXPR to check whether any of the condition values is zero:
28786 and_expr_var = min_expr <cond_var, and_expr_var>. */
28787 assign_stmt = gimple_build_assign (and_expr_var,
28788 build2 (MIN_EXPR, integer_type_node,
28789 cond_var, and_expr_var));
28790
28791 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28792 gimple_set_bb (assign_stmt, new_bb);
28793 gimple_seq_add_stmt (&gseq, assign_stmt);
28794 }
28795 }
28796
28797 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28798 integer_zero_node,
28799 NULL_TREE, NULL_TREE);
28800 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28801 gimple_set_bb (if_else_stmt, new_bb);
28802 gimple_seq_add_stmt (&gseq, if_else_stmt);
28803
28804 gimple_seq_add_stmt (&gseq, convert_stmt);
28805 gimple_seq_add_stmt (&gseq, return_stmt);
28806 set_bb_seq (new_bb, gseq);
28807
28808 bb1 = new_bb;
28809 e12 = split_block (bb1, if_else_stmt);
28810 bb2 = e12->dest;
28811 e12->flags &= ~EDGE_FALLTHRU;
28812 e12->flags |= EDGE_TRUE_VALUE;
28813
28814 e23 = split_block (bb2, return_stmt);
28815
28816 gimple_set_bb (convert_stmt, bb2);
28817 gimple_set_bb (return_stmt, bb2);
28818
28819 bb3 = e23->dest;
28820 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28821
28822 remove_edge (e23);
28823 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28824
28825 pop_cfun ();
28826
28827 return bb3;
28828 }
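
/* Roughly, the GIMPLE emitted into NEW_BB by the function above looks like:

     cond_1 = predicate_1 (arg_1);
     ...
     and_tmp = MIN (cond_N, and_tmp);
     if (and_tmp > 0)
       return (void *) &version_decl;

with the false edge of the condition leading to the returned block, where
the next version's condition will be added. */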
28829
28830 /* This parses the attribute arguments to target in DECL and determines
28831 the right builtin to use to match the platform specification.
28832 It returns the priority value for this version decl. If PREDICATE_LIST
28833 is not NULL, it stores the list of cpu features that need to be checked
28834 before dispatching this function. */
28835
28836 static unsigned int
28837 get_builtin_code_for_version (tree decl, tree *predicate_list)
28838 {
28839 tree attrs;
28840 struct cl_target_option cur_target;
28841 tree target_node;
28842 struct cl_target_option *new_target;
28843 const char *arg_str = NULL;
28844 const char *attrs_str = NULL;
28845 char *tok_str = NULL;
28846 char *token;
28847
28848 /* Priority of i386 features, greater value is higher priority. This is
28849 used to decide the order in which function dispatch must happen. For
28850 instance, a version specialized for SSE4.2 should be checked for dispatch
28851 before a version for SSE3, as SSE4.2 implies SSE3. */
28852 enum feature_priority
28853 {
28854 P_ZERO = 0,
28855 P_MMX,
28856 P_SSE,
28857 P_SSE2,
28858 P_SSE3,
28859 P_SSSE3,
28860 P_PROC_SSSE3,
28861 P_SSE4_a,
28862 P_PROC_SSE4_a,
28863 P_SSE4_1,
28864 P_SSE4_2,
28865 P_PROC_SSE4_2,
28866 P_POPCNT,
28867 P_AVX,
28868 P_AVX2,
28869 P_FMA,
28870 P_PROC_FMA
28871 };
28872
28873 enum feature_priority priority = P_ZERO;
28874
28875 /* These are the target attribute strings for which a dispatcher is
28876 available, from fold_builtin_cpu. */
28877
28878 static struct _feature_list
28879 {
28880 const char *const name;
28881 const enum feature_priority priority;
28882 }
28883 const feature_list[] =
28884 {
28885 {"mmx", P_MMX},
28886 {"sse", P_SSE},
28887 {"sse2", P_SSE2},
28888 {"sse3", P_SSE3},
28889 {"ssse3", P_SSSE3},
28890 {"sse4.1", P_SSE4_1},
28891 {"sse4.2", P_SSE4_2},
28892 {"popcnt", P_POPCNT},
28893 {"avx", P_AVX},
28894 {"avx2", P_AVX2}
28895 };
28896
28897
28898 static unsigned int NUM_FEATURES
28899 = sizeof (feature_list) / sizeof (struct _feature_list);
28900
28901 unsigned int i;
28902
28903 tree predicate_chain = NULL_TREE;
28904 tree predicate_decl, predicate_arg;
28905
28906 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28907 gcc_assert (attrs != NULL);
28908
28909 attrs = TREE_VALUE (TREE_VALUE (attrs));
28910
28911 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28912 attrs_str = TREE_STRING_POINTER (attrs);
28913
28914 /* Return priority zero for default function. */
28915 if (strcmp (attrs_str, "default") == 0)
28916 return 0;
28917
28918 /* Handle arch= if specified. For priority, set it to be 1 more than
28919 the best instruction set the processor can handle. For instance, if
28920 there is a version for atom and a version for ssse3 (the highest ISA
28921 priority for atom), the atom version must be checked for dispatch
28922 before the ssse3 version. */
28923 if (strstr (attrs_str, "arch=") != NULL)
28924 {
28925 cl_target_option_save (&cur_target, &global_options);
28926 target_node = ix86_valid_target_attribute_tree (attrs);
28927
28928 gcc_assert (target_node);
28929 new_target = TREE_TARGET_OPTION (target_node);
28930 gcc_assert (new_target);
28931
28932 if (new_target->arch_specified && new_target->arch > 0)
28933 {
28934 switch (new_target->arch)
28935 {
28936 case PROCESSOR_CORE2:
28937 arg_str = "core2";
28938 priority = P_PROC_SSSE3;
28939 break;
28940 case PROCESSOR_COREI7:
28941 arg_str = "corei7";
28942 priority = P_PROC_SSE4_2;
28943 break;
28944 case PROCESSOR_ATOM:
28945 arg_str = "atom";
28946 priority = P_PROC_SSSE3;
28947 break;
28948 case PROCESSOR_AMDFAM10:
28949 arg_str = "amdfam10h";
28950 priority = P_PROC_SSE4_a;
28951 break;
28952 case PROCESSOR_BDVER1:
28953 arg_str = "bdver1";
28954 priority = P_PROC_FMA;
28955 break;
28956 case PROCESSOR_BDVER2:
28957 arg_str = "bdver2";
28958 priority = P_PROC_FMA;
28959 break;
28960 }
28961 }
28962
28963 cl_target_option_restore (&global_options, &cur_target);
28964
28965 if (predicate_list && arg_str == NULL)
28966 {
28967 error_at (DECL_SOURCE_LOCATION (decl),
28968 "No dispatcher found for the versioning attributes");
28969 return 0;
28970 }
28971
28972 if (predicate_list)
28973 {
28974 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28975 /* For a C string literal the length includes the trailing NULL. */
28976 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28977 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28978 predicate_chain);
28979 }
28980 }
28981
28982 /* Process feature name. */
28983 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28984 strcpy (tok_str, attrs_str);
28985 token = strtok (tok_str, ",");
28986 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28987
28988 while (token != NULL)
28989 {
28990 /* Do not process "arch=" */
28991 if (strncmp (token, "arch=", 5) == 0)
28992 {
28993 token = strtok (NULL, ",");
28994 continue;
28995 }
28996 for (i = 0; i < NUM_FEATURES; ++i)
28997 {
28998 if (strcmp (token, feature_list[i].name) == 0)
28999 {
29000 if (predicate_list)
29001 {
29002 predicate_arg = build_string_literal (
29003 strlen (feature_list[i].name) + 1,
29004 feature_list[i].name);
29005 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29006 predicate_chain);
29007 }
29008 /* Find the maximum priority feature. */
29009 if (feature_list[i].priority > priority)
29010 priority = feature_list[i].priority;
29011
29012 break;
29013 }
29014 }
29015 if (predicate_list && i == NUM_FEATURES)
29016 {
29017 error_at (DECL_SOURCE_LOCATION (decl),
29018 "No dispatcher found for %s", token);
29019 return 0;
29020 }
29021 token = strtok (NULL, ",");
29022 }
29023 free (tok_str);
29024
29025 if (predicate_list && predicate_chain == NULL_TREE)
29026 {
29027 error_at (DECL_SOURCE_LOCATION (decl),
29028 "No dispatcher found for the versioning attributes : %s",
29029 attrs_str);
29030 return 0;
29031 }
29032 else if (predicate_list)
29033 {
29034 predicate_chain = nreverse (predicate_chain);
29035 *predicate_list = predicate_chain;
29036 }
29037
29038 return priority;
29039 }
29040
29041 /* This compares the priority of target features in function DECL1
29042 and DECL2. It returns positive value if DECL1 is higher priority,
29043 negative value if DECL2 is higher priority and 0 if they are the
29044 same. */
29045
29046 static int
29047 ix86_compare_version_priority (tree decl1, tree decl2)
29048 {
29049 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29050 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29051
29052 return (int)priority1 - (int)priority2;
29053 }
29054
29055 /* V1 and V2 point to function versions with different priorities
29056 based on the target ISA. This function compares their priorities. */
29057
29058 static int
29059 feature_compare (const void *v1, const void *v2)
29060 {
29061 typedef struct _function_version_info
29062 {
29063 tree version_decl;
29064 tree predicate_chain;
29065 unsigned int dispatch_priority;
29066 } function_version_info;
29067
29068 const function_version_info c1 = *(const function_version_info *)v1;
29069 const function_version_info c2 = *(const function_version_info *)v2;
29070 return (c2.dispatch_priority - c1.dispatch_priority);
29071 }
29072
29073 /* This function generates the dispatch function for
29074 multi-versioned functions. DISPATCH_DECL is the function which will
29075 contain the dispatch logic. FNDECLS are the function choices for
29076 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29077 in DISPATCH_DECL in which the dispatch code is generated. */
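/* Illustrative sketch only (the names foo_avx2/foo_default are made up):
   for a function foo with an "avx2" version and a "default" version, the
   dispatch code built here behaves roughly like

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         return foo_avx2;
       return foo_default;
     }

   with higher-priority versions tested first and the default tried last.  */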
29078
29079 static int
29080 dispatch_function_versions (tree dispatch_decl,
29081 void *fndecls_p,
29082 basic_block *empty_bb)
29083 {
29084 tree default_decl;
29085 gimple ifunc_cpu_init_stmt;
29086 gimple_seq gseq;
29087 int ix;
29088 tree ele;
29089 vec<tree> *fndecls;
29090 unsigned int num_versions = 0;
29091 unsigned int actual_versions = 0;
29092 unsigned int i;
29093
29094 struct _function_version_info
29095 {
29096 tree version_decl;
29097 tree predicate_chain;
29098 unsigned int dispatch_priority;
29099 } *function_version_info;
29100
29101 gcc_assert (dispatch_decl != NULL
29102 && fndecls_p != NULL
29103 && empty_bb != NULL);
29104
29105 /* fndecls_p is actually a vector.  */
29106 fndecls = static_cast<vec<tree> *> (fndecls_p);
29107
29108 /* At least one more version other than the default. */
29109 num_versions = fndecls->length ();
29110 gcc_assert (num_versions >= 2);
29111
29112 function_version_info = (struct _function_version_info *)
29113 XNEWVEC (struct _function_version_info, (num_versions - 1));
29114
29115 /* The first version in the vector is the default decl. */
29116 default_decl = (*fndecls)[0];
29117
29118 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29119
29120 gseq = bb_seq (*empty_bb);
29121 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29122 constructors, so explicitly call __builtin_cpu_init here. */
29123 ifunc_cpu_init_stmt = gimple_build_call_vec (
29124 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29125 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29126 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29127 set_bb_seq (*empty_bb, gseq);
29128
29129 pop_cfun ();
29130
29131
29132 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29133 {
29134 tree version_decl = ele;
29135 tree predicate_chain = NULL_TREE;
29136 unsigned int priority;
29137 /* Get attribute string, parse it and find the right predicate decl.
29138 The predicate function could be a lengthy combination of many
29139 features, like arch-type and various isa-variants. */
29140 priority = get_builtin_code_for_version (version_decl,
29141 &predicate_chain);
29142
29143 if (predicate_chain == NULL_TREE)
29144 continue;
29145
29146 actual_versions++;
29147 function_version_info [ix - 1].version_decl = version_decl;
29148 function_version_info [ix - 1].predicate_chain = predicate_chain;
29149 function_version_info [ix - 1].dispatch_priority = priority;
29150 }
29151
29152 /* Sort the versions according to descending order of dispatch priority. The
29153 priority is based on the ISA. This is not a perfect solution. There
29154 could still be ambiguity. If more than one function version is suitable
29155 to execute, which one should be dispatched? In future, allow the user
29156 to specify a dispatch priority next to the version. */
29157 qsort (function_version_info, actual_versions,
29158 sizeof (struct _function_version_info), feature_compare);
29159
29160 for (i = 0; i < actual_versions; ++i)
29161 *empty_bb = add_condition_to_bb (dispatch_decl,
29162 function_version_info[i].version_decl,
29163 function_version_info[i].predicate_chain,
29164 *empty_bb);
29165
29166 /* Dispatch the default version at the end. */
29167 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29168 NULL, *empty_bb);
29169
29170 free (function_version_info);
29171 return 0;
29172 }
29173
29174 /* Comparator function to be used in qsort routine to sort attribute
29175 specification strings to "target". */
29176
29177 static int
29178 attr_strcmp (const void *v1, const void *v2)
29179 {
29180 const char *c1 = *(char *const*)v1;
29181 const char *c2 = *(char *const*)v2;
29182 return strcmp (c1, c2);
29183 }
29184
29185 /* ARGLIST is the argument to target attribute. This function tokenizes
29186 the comma separated arguments, sorts them and returns a string which
29187 is a unique identifier for the comma separated arguments. It also
29188 replaces non-identifier characters "=,-" with "_". */
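/* For example, the argument string "sse4.2,arch=corei7" becomes the tokens
   "sse4.2" and "arch_corei7", which sort and join to "arch_corei7_sse4.2".  */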
29189
29190 static char *
29191 sorted_attr_string (tree arglist)
29192 {
29193 tree arg;
29194 size_t str_len_sum = 0;
29195 char **args = NULL;
29196 char *attr_str, *ret_str;
29197 char *attr = NULL;
29198 unsigned int argnum = 1;
29199 unsigned int i;
29200
29201 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29202 {
29203 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29204 size_t len = strlen (str);
29205 str_len_sum += len + 1;
29206 if (arg != arglist)
29207 argnum++;
29208 for (i = 0; i < strlen (str); i++)
29209 if (str[i] == ',')
29210 argnum++;
29211 }
29212
29213 attr_str = XNEWVEC (char, str_len_sum);
29214 str_len_sum = 0;
29215 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29216 {
29217 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29218 size_t len = strlen (str);
29219 memcpy (attr_str + str_len_sum, str, len);
29220 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29221 str_len_sum += len + 1;
29222 }
29223
29224 /* Replace "=,-" with "_". */
29225 for (i = 0; i < strlen (attr_str); i++)
29226 if (attr_str[i] == '=' || attr_str[i]== '-')
29227 attr_str[i] = '_';
29228
29229 if (argnum == 1)
29230 return attr_str;
29231
29232 args = XNEWVEC (char *, argnum);
29233
29234 i = 0;
29235 attr = strtok (attr_str, ",");
29236 while (attr != NULL)
29237 {
29238 args[i] = attr;
29239 i++;
29240 attr = strtok (NULL, ",");
29241 }
29242
29243 qsort (args, argnum, sizeof (char *), attr_strcmp);
29244
29245 ret_str = XNEWVEC (char, str_len_sum);
29246 str_len_sum = 0;
29247 for (i = 0; i < argnum; i++)
29248 {
29249 size_t len = strlen (args[i]);
29250 memcpy (ret_str + str_len_sum, args[i], len);
29251 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29252 str_len_sum += len + 1;
29253 }
29254
29255 XDELETEVEC (args);
29256 XDELETEVEC (attr_str);
29257 return ret_str;
29258 }
29259
29260 /* This function changes the assembler name for functions that are
29261 versions. If DECL is a function version and has a "target"
29262 attribute, it appends the attribute string to its assembler name. */
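/* For instance (illustrative), a version of foo declared with
   __attribute__ ((target ("avx2"))) gets the assembler name "foo.avx2".  */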
29263
29264 static tree
29265 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29266 {
29267 tree version_attr;
29268 const char *orig_name, *version_string;
29269 char *attr_str, *assembler_name;
29270
29271 if (DECL_DECLARED_INLINE_P (decl)
29272 && lookup_attribute ("gnu_inline",
29273 DECL_ATTRIBUTES (decl)))
29274 error_at (DECL_SOURCE_LOCATION (decl),
29275 "Function versions cannot be marked as gnu_inline,"
29276 " bodies have to be generated");
29277
29278 if (DECL_VIRTUAL_P (decl)
29279 || DECL_VINDEX (decl))
29280 sorry ("Virtual function multiversioning not supported");
29281
29282 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29283
29284 /* target attribute string cannot be NULL. */
29285 gcc_assert (version_attr != NULL_TREE);
29286
29287 orig_name = IDENTIFIER_POINTER (id);
29288 version_string
29289 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29290
29291 if (strcmp (version_string, "default") == 0)
29292 return id;
29293
29294 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29295 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29296
29297 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29298
29299 /* Allow assembler name to be modified if already set. */
29300 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29301 SET_DECL_RTL (decl, NULL);
29302
29303 tree ret = get_identifier (assembler_name);
29304 XDELETEVEC (attr_str);
29305 XDELETEVEC (assembler_name);
29306 return ret;
29307 }
29308
29309 /* This function returns true if FN1 and FN2 are versions of the same function,
29310 that is, the target strings of the function decls are different. This assumes
29311 that FN1 and FN2 have the same signature. */
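/* E.g., a foo declared with target ("avx") and another declared with
   target ("default") have different sorted target strings and therefore
   count as versions of each other.  */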
29312
29313 static bool
29314 ix86_function_versions (tree fn1, tree fn2)
29315 {
29316 tree attr1, attr2;
29317 char *target1, *target2;
29318 bool result;
29319
29320 if (TREE_CODE (fn1) != FUNCTION_DECL
29321 || TREE_CODE (fn2) != FUNCTION_DECL)
29322 return false;
29323
29324 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29325 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29326
29327 /* At least one function decl should have the target attribute specified. */
29328 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29329 return false;
29330
29331 /* Diagnose missing target attribute if one of the decls is already
29332 multi-versioned. */
29333 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29334 {
29335 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29336 {
29337 if (attr2 != NULL_TREE)
29338 {
29339 tree tem = fn1;
29340 fn1 = fn2;
29341 fn2 = tem;
29342 attr1 = attr2;
29343 }
29344 error_at (DECL_SOURCE_LOCATION (fn2),
29345 "missing %<target%> attribute for multi-versioned %D",
29346 fn2);
29347 error_at (DECL_SOURCE_LOCATION (fn1),
29348 "previous declaration of %D", fn1);
29349 /* Prevent diagnosing of the same error multiple times. */
29350 DECL_ATTRIBUTES (fn2)
29351 = tree_cons (get_identifier ("target"),
29352 copy_node (TREE_VALUE (attr1)),
29353 DECL_ATTRIBUTES (fn2));
29354 }
29355 return false;
29356 }
29357
29358 target1 = sorted_attr_string (TREE_VALUE (attr1));
29359 target2 = sorted_attr_string (TREE_VALUE (attr2));
29360
29361 /* The sorted target strings must be different for fn1 and fn2
29362 to be versions. */
29363 if (strcmp (target1, target2) == 0)
29364 result = false;
29365 else
29366 result = true;
29367
29368 XDELETEVEC (target1);
29369 XDELETEVEC (target2);
29370
29371 return result;
29372 }
29373
29374 static tree
29375 ix86_mangle_decl_assembler_name (tree decl, tree id)
29376 {
29377 /* For function version, add the target suffix to the assembler name. */
29378 if (TREE_CODE (decl) == FUNCTION_DECL
29379 && DECL_FUNCTION_VERSIONED (decl))
29380 id = ix86_mangle_function_version_assembler_name (decl, id);
29381 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29382 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29383 #endif
29384
29385 return id;
29386 }
29387
29388 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29389 is true, append the full path name of the source file. */
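/* Illustrative only: make_name (foo, "ifunc", false) yields "foo.ifunc";
   when MAKE_UNIQUE is true a file-scope unique string is spliced in,
   e.g. "foo.<unique>.ifunc".  */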
29390
29391 static char *
29392 make_name (tree decl, const char *suffix, bool make_unique)
29393 {
29394 char *global_var_name;
29395 int name_len;
29396 const char *name;
29397 const char *unique_name = NULL;
29398
29399 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29400
29401 /* Get a unique name that can be used globally without any chance
29402 of collision at link time. */
29403 if (make_unique)
29404 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29405
29406 name_len = strlen (name) + strlen (suffix) + 2;
29407
29408 if (make_unique)
29409 name_len += strlen (unique_name) + 1;
29410 global_var_name = XNEWVEC (char, name_len);
29411
29412 /* Use '.' to concatenate names as it is demangler friendly. */
29413 if (make_unique)
29414 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29415 suffix);
29416 else
29417 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29418
29419 return global_var_name;
29420 }
29421
29422 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29423
29424 /* Make a dispatcher declaration for the multi-versioned function DECL.
29425 Calls to DECL function will be replaced with calls to the dispatcher
29426 by the front-end. Return the decl created. */
29427
29428 static tree
29429 make_dispatcher_decl (const tree decl)
29430 {
29431 tree func_decl;
29432 char *func_name;
29433 tree fn_type, func_type;
29434 bool is_uniq = false;
29435
29436 if (TREE_PUBLIC (decl) == 0)
29437 is_uniq = true;
29438
29439 func_name = make_name (decl, "ifunc", is_uniq);
29440
29441 fn_type = TREE_TYPE (decl);
29442 func_type = build_function_type (TREE_TYPE (fn_type),
29443 TYPE_ARG_TYPES (fn_type));
29444
29445 func_decl = build_fn_decl (func_name, func_type);
29446 XDELETEVEC (func_name);
29447 TREE_USED (func_decl) = 1;
29448 DECL_CONTEXT (func_decl) = NULL_TREE;
29449 DECL_INITIAL (func_decl) = error_mark_node;
29450 DECL_ARTIFICIAL (func_decl) = 1;
29451 /* Mark this func as external, the resolver will flip it again if
29452 it gets generated. */
29453 DECL_EXTERNAL (func_decl) = 1;
29454 /* IFUNCs have to be externally visible, so make this decl public. */
29455 TREE_PUBLIC (func_decl) = 1;
29456
29457 return func_decl;
29458 }
29459
29460 #endif
29461
29462 /* Returns true if DECL is multi-versioned and is the default function,
29463 that is, it is not tagged with a target-specific optimization. */
29464
29465 static bool
29466 is_function_default_version (const tree decl)
29467 {
29468 if (TREE_CODE (decl) != FUNCTION_DECL
29469 || !DECL_FUNCTION_VERSIONED (decl))
29470 return false;
29471 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29472 gcc_assert (attr);
29473 attr = TREE_VALUE (TREE_VALUE (attr));
29474 return (TREE_CODE (attr) == STRING_CST
29475 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29476 }
29477
29478 /* Make a dispatcher declaration for the multi-versioned function DECL.
29479 Calls to DECL function will be replaced with calls to the dispatcher
29480 by the front-end. Returns the decl of the dispatcher function. */
29481
29482 static tree
29483 ix86_get_function_versions_dispatcher (void *decl)
29484 {
29485 tree fn = (tree) decl;
29486 struct cgraph_node *node = NULL;
29487 struct cgraph_node *default_node = NULL;
29488 struct cgraph_function_version_info *node_v = NULL;
29489 struct cgraph_function_version_info *first_v = NULL;
29490
29491 tree dispatch_decl = NULL;
29492
29493 struct cgraph_function_version_info *default_version_info = NULL;
29494
29495 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29496
29497 node = cgraph_get_node (fn);
29498 gcc_assert (node != NULL);
29499
29500 node_v = get_cgraph_node_version (node);
29501 gcc_assert (node_v != NULL);
29502
29503 if (node_v->dispatcher_resolver != NULL)
29504 return node_v->dispatcher_resolver;
29505
29506 /* Find the default version and make it the first node. */
29507 first_v = node_v;
29508 /* Go to the beginning of the chain. */
29509 while (first_v->prev != NULL)
29510 first_v = first_v->prev;
29511 default_version_info = first_v;
29512 while (default_version_info != NULL)
29513 {
29514 if (is_function_default_version
29515 (default_version_info->this_node->symbol.decl))
29516 break;
29517 default_version_info = default_version_info->next;
29518 }
29519
29520 /* If there is no default node, just return NULL. */
29521 if (default_version_info == NULL)
29522 return NULL;
29523
29524 /* Make default info the first node. */
29525 if (first_v != default_version_info)
29526 {
29527 default_version_info->prev->next = default_version_info->next;
29528 if (default_version_info->next)
29529 default_version_info->next->prev = default_version_info->prev;
29530 first_v->prev = default_version_info;
29531 default_version_info->next = first_v;
29532 default_version_info->prev = NULL;
29533 }
29534
29535 default_node = default_version_info->this_node;
29536
29537 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
29538 if (targetm.has_ifunc_p ())
29539 {
29540 struct cgraph_function_version_info *it_v = NULL;
29541 struct cgraph_node *dispatcher_node = NULL;
29542 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29543
29544 /* Right now, the dispatching is done via ifunc. */
29545 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29546
29547 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29548 gcc_assert (dispatcher_node != NULL);
29549 dispatcher_node->dispatcher_function = 1;
29550 dispatcher_version_info
29551 = insert_new_cgraph_node_version (dispatcher_node);
29552 dispatcher_version_info->next = default_version_info;
29553 dispatcher_node->symbol.definition = 1;
29554
29555 /* Set the dispatcher for all the versions. */
29556 it_v = default_version_info;
29557 while (it_v != NULL)
29558 {
29559 it_v->dispatcher_resolver = dispatch_decl;
29560 it_v = it_v->next;
29561 }
29562 }
29563 else
29564 #endif
29565 {
29566 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29567 "multiversioning needs ifunc which is not supported "
29568 "on this target");
29569 }
29570
29571 return dispatch_decl;
29572 }
29573
29574 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29575 it to CHAIN. */
29576
29577 static tree
29578 make_attribute (const char *name, const char *arg_name, tree chain)
29579 {
29580 tree attr_name;
29581 tree attr_arg_name;
29582 tree attr_args;
29583 tree attr;
29584
29585 attr_name = get_identifier (name);
29586 attr_arg_name = build_string (strlen (arg_name), arg_name);
29587 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29588 attr = tree_cons (attr_name, attr_args, chain);
29589 return attr;
29590 }
29591
29592 /* Make the resolver function decl to dispatch the versions of
29593 a multi-versioned function, DEFAULT_DECL. Create an
29594 empty basic block in the resolver and store the pointer in
29595 EMPTY_BB. Return the decl of the resolver function. */
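/* With ELF IFUNCs, the dynamic linker runs the resolver once at load time
   and binds all calls to whatever function pointer it returns, which is
   why the resolver must be externally visible and must not rely on
   constructors having run.  */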
29596
29597 static tree
29598 make_resolver_func (const tree default_decl,
29599 const tree dispatch_decl,
29600 basic_block *empty_bb)
29601 {
29602 char *resolver_name;
29603 tree decl, type, decl_name, t;
29604 bool is_uniq = false;
29605
29606 /* IFUNCs have to be globally visible. So, if the default_decl is
29607 not, then the name of the IFUNC should be made unique. */
29608 if (TREE_PUBLIC (default_decl) == 0)
29609 is_uniq = true;
29610
29611 /* Append the filename to the resolver function if the versions are
29612 not externally visible. This is because the resolver function has
29613 to be externally visible for the loader to find it. So, appending
29614 the filename will prevent conflicts with a resolver function from
29615 another module which is based on the same version name. */
29616 resolver_name = make_name (default_decl, "resolver", is_uniq);
29617
29618 /* The resolver function should return a (void *). */
29619 type = build_function_type_list (ptr_type_node, NULL_TREE);
29620
29621 decl = build_fn_decl (resolver_name, type);
29622 decl_name = get_identifier (resolver_name);
29623 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29624
29625 DECL_NAME (decl) = decl_name;
29626 TREE_USED (decl) = 1;
29627 DECL_ARTIFICIAL (decl) = 1;
29628 DECL_IGNORED_P (decl) = 0;
29629 /* IFUNC resolvers have to be externally visible. */
29630 TREE_PUBLIC (decl) = 1;
29631 DECL_UNINLINABLE (decl) = 0;
29632
29633 /* Resolver is not external, body is generated. */
29634 DECL_EXTERNAL (decl) = 0;
29635 DECL_EXTERNAL (dispatch_decl) = 0;
29636
29637 DECL_CONTEXT (decl) = NULL_TREE;
29638 DECL_INITIAL (decl) = make_node (BLOCK);
29639 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29640
29641 if (DECL_COMDAT_GROUP (default_decl)
29642 || TREE_PUBLIC (default_decl))
29643 {
29644 /* In this case, each translation unit with a call to this
29645 versioned function will put out a resolver. Ensure it
29646 is comdat to keep just one copy. */
29647 DECL_COMDAT (decl) = 1;
29648 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29649 }
29650 /* Build result decl and add to function_decl. */
29651 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29652 DECL_ARTIFICIAL (t) = 1;
29653 DECL_IGNORED_P (t) = 1;
29654 DECL_RESULT (decl) = t;
29655
29656 gimplify_function_tree (decl);
29657 push_cfun (DECL_STRUCT_FUNCTION (decl));
29658 *empty_bb = init_lowered_empty_function (decl, false);
29659
29660 cgraph_add_new_function (decl, true);
29661 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29662
29663 pop_cfun ();
29664
29665 gcc_assert (dispatch_decl != NULL);
29666 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29667 DECL_ATTRIBUTES (dispatch_decl)
29668 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29669
29670 /* Create the alias for dispatch to resolver here. */
29671 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29672 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29673 XDELETEVEC (resolver_name);
29674 return decl;
29675 }
29676
29677 /* Generate the dispatching code body to dispatch multi-versioned function
29678 DECL. The target hook is called to process the "target" attributes and
29679 provide the code to dispatch the right function at run-time. NODE points
29680 to the dispatcher decl whose body will be created. */
29681
29682 static tree
29683 ix86_generate_version_dispatcher_body (void *node_p)
29684 {
29685 tree resolver_decl;
29686 basic_block empty_bb;
29687 vec<tree> fn_ver_vec = vNULL;
29688 tree default_ver_decl;
29689 struct cgraph_node *versn;
29690 struct cgraph_node *node;
29691
29692 struct cgraph_function_version_info *node_version_info = NULL;
29693 struct cgraph_function_version_info *versn_info = NULL;
29694
29695 node = (cgraph_node *)node_p;
29696
29697 node_version_info = get_cgraph_node_version (node);
29698 gcc_assert (node->dispatcher_function
29699 && node_version_info != NULL);
29700
29701 if (node_version_info->dispatcher_resolver)
29702 return node_version_info->dispatcher_resolver;
29703
29704 /* The first version in the chain corresponds to the default version. */
29705 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29706
29707 /* node is going to be an alias, so remove the finalized bit. */
29708 node->symbol.definition = false;
29709
29710 resolver_decl = make_resolver_func (default_ver_decl,
29711 node->symbol.decl, &empty_bb);
29712
29713 node_version_info->dispatcher_resolver = resolver_decl;
29714
29715 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29716
29717 fn_ver_vec.create (2);
29718
29719 for (versn_info = node_version_info->next; versn_info;
29720 versn_info = versn_info->next)
29721 {
29722 versn = versn_info->this_node;
29723 /* Check for virtual functions here again, as by this time it should
29724 have been determined if this function needs a vtable index or
29725 not. This happens for methods in derived classes that override
29726 virtual methods in base classes but are not explicitly marked as
29727 virtual. */
29728 if (DECL_VINDEX (versn->symbol.decl))
29729 sorry ("Virtual function multiversioning not supported");
29730
29731 fn_ver_vec.safe_push (versn->symbol.decl);
29732 }
29733
29734 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29735 fn_ver_vec.release ();
29736 rebuild_cgraph_edges ();
29737 pop_cfun ();
29738 return resolver_decl;
29739 }
29740 /* This builds the processor_model struct type defined in
29741 libgcc/config/i386/cpuinfo.c */
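/* The struct mirrored here is, roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */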
29742
29743 static tree
29744 build_processor_model_struct (void)
29745 {
29746 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29747 "__cpu_features"};
29748 tree field = NULL_TREE, field_chain = NULL_TREE;
29749 int i;
29750 tree type = make_node (RECORD_TYPE);
29751
29752 /* The first 3 fields are unsigned int. */
29753 for (i = 0; i < 3; ++i)
29754 {
29755 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29756 get_identifier (field_name[i]), unsigned_type_node);
29757 if (field_chain != NULL_TREE)
29758 DECL_CHAIN (field) = field_chain;
29759 field_chain = field;
29760 }
29761
29762 /* The last field is an array of unsigned integers of size one. */
29763 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29764 get_identifier (field_name[3]),
29765 build_array_type (unsigned_type_node,
29766 build_index_type (size_one_node)));
29767 if (field_chain != NULL_TREE)
29768 DECL_CHAIN (field) = field_chain;
29769 field_chain = field;
29770
29771 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29772 return type;
29773 }
29774
29775 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29776
29777 static tree
29778 make_var_decl (tree type, const char *name)
29779 {
29780 tree new_decl;
29781
29782 new_decl = build_decl (UNKNOWN_LOCATION,
29783 VAR_DECL,
29784 get_identifier(name),
29785 type);
29786
29787 DECL_EXTERNAL (new_decl) = 1;
29788 TREE_STATIC (new_decl) = 1;
29789 TREE_PUBLIC (new_decl) = 1;
29790 DECL_INITIAL (new_decl) = 0;
29791 DECL_ARTIFICIAL (new_decl) = 0;
29792 DECL_PRESERVE_P (new_decl) = 1;
29793
29794 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29795 assemble_variable (new_decl, 0, 0, 0);
29796
29797 return new_decl;
29798 }
29799
29800 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29801 into an integer defined in libgcc/config/i386/cpuinfo.c */
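/* For instance, __builtin_cpu_is ("corei7") folds to a comparison of
   __cpu_model.__cpu_type against M_INTEL_COREI7 - M_CPU_TYPE_START, and
   __builtin_cpu_supports ("avx2") folds to
   __cpu_model.__cpu_features[0] & (1 << F_AVX2).  */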
29802
29803 static tree
29804 fold_builtin_cpu (tree fndecl, tree *args)
29805 {
29806 unsigned int i;
29807 enum ix86_builtins fn_code = (enum ix86_builtins)
29808 DECL_FUNCTION_CODE (fndecl);
29809 tree param_string_cst = NULL;
29810
29811 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29812 enum processor_features
29813 {
29814 F_CMOV = 0,
29815 F_MMX,
29816 F_POPCNT,
29817 F_SSE,
29818 F_SSE2,
29819 F_SSE3,
29820 F_SSSE3,
29821 F_SSE4_1,
29822 F_SSE4_2,
29823 F_AVX,
29824 F_AVX2,
29825 F_MAX
29826 };
29827
29828 /* These are the values for vendor types and cpu types and subtypes
29829 in cpuinfo.c. CPU types and subtypes should have the corresponding
29830 start value subtracted. */
29831 enum processor_model
29832 {
29833 M_INTEL = 1,
29834 M_AMD,
29835 M_CPU_TYPE_START,
29836 M_INTEL_ATOM,
29837 M_INTEL_SLM,
29838 M_INTEL_CORE2,
29839 M_INTEL_COREI7,
29840 M_AMDFAM10H,
29841 M_AMDFAM15H,
29842 M_CPU_SUBTYPE_START,
29843 M_INTEL_COREI7_NEHALEM,
29844 M_INTEL_COREI7_WESTMERE,
29845 M_INTEL_COREI7_SANDYBRIDGE,
29846 M_AMDFAM10H_BARCELONA,
29847 M_AMDFAM10H_SHANGHAI,
29848 M_AMDFAM10H_ISTANBUL,
29849 M_AMDFAM15H_BDVER1,
29850 M_AMDFAM15H_BDVER2,
29851 M_AMDFAM15H_BDVER3
29852 };
29853
29854 static struct _arch_names_table
29855 {
29856 const char *const name;
29857 const enum processor_model model;
29858 }
29859 const arch_names_table[] =
29860 {
29861 {"amd", M_AMD},
29862 {"intel", M_INTEL},
29863 {"atom", M_INTEL_ATOM},
29864 {"slm", M_INTEL_SLM},
29865 {"core2", M_INTEL_CORE2},
29866 {"corei7", M_INTEL_COREI7},
29867 {"nehalem", M_INTEL_COREI7_NEHALEM},
29868 {"westmere", M_INTEL_COREI7_WESTMERE},
29869 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29870 {"amdfam10h", M_AMDFAM10H},
29871 {"barcelona", M_AMDFAM10H_BARCELONA},
29872 {"shanghai", M_AMDFAM10H_SHANGHAI},
29873 {"istanbul", M_AMDFAM10H_ISTANBUL},
29874 {"amdfam15h", M_AMDFAM15H},
29875 {"bdver1", M_AMDFAM15H_BDVER1},
29876 {"bdver2", M_AMDFAM15H_BDVER2},
29877 {"bdver3", M_AMDFAM15H_BDVER3},
29878 };
29879
29880 static struct _isa_names_table
29881 {
29882 const char *const name;
29883 const enum processor_features feature;
29884 }
29885 const isa_names_table[] =
29886 {
29887 {"cmov", F_CMOV},
29888 {"mmx", F_MMX},
29889 {"popcnt", F_POPCNT},
29890 {"sse", F_SSE},
29891 {"sse2", F_SSE2},
29892 {"sse3", F_SSE3},
29893 {"ssse3", F_SSSE3},
29894 {"sse4.1", F_SSE4_1},
29895 {"sse4.2", F_SSE4_2},
29896 {"avx", F_AVX},
29897 {"avx2", F_AVX2}
29898 };
29899
29900 tree __processor_model_type = build_processor_model_struct ();
29901 tree __cpu_model_var = make_var_decl (__processor_model_type,
29902 "__cpu_model");
29903
29904
29905 varpool_add_new_variable (__cpu_model_var);
29906
29907 gcc_assert ((args != NULL) && (*args != NULL));
29908
29909 param_string_cst = *args;
29910 while (param_string_cst
29911 && TREE_CODE (param_string_cst) != STRING_CST)
29912 {
29913 /* *args must be an expr that can contain other EXPRs leading to a
29914 STRING_CST. */
29915 if (!EXPR_P (param_string_cst))
29916 {
29917 error ("Parameter to builtin must be a string constant or literal");
29918 return integer_zero_node;
29919 }
29920 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29921 }
29922
29923 gcc_assert (param_string_cst);
29924
29925 if (fn_code == IX86_BUILTIN_CPU_IS)
29926 {
29927 tree ref;
29928 tree field;
29929 tree final;
29930
29931 unsigned int field_val = 0;
29932 unsigned int NUM_ARCH_NAMES
29933 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29934
29935 for (i = 0; i < NUM_ARCH_NAMES; i++)
29936 if (strcmp (arch_names_table[i].name,
29937 TREE_STRING_POINTER (param_string_cst)) == 0)
29938 break;
29939
29940 if (i == NUM_ARCH_NAMES)
29941 {
29942 error ("Parameter to builtin not valid: %s",
29943 TREE_STRING_POINTER (param_string_cst));
29944 return integer_zero_node;
29945 }
29946
29947 field = TYPE_FIELDS (__processor_model_type);
29948 field_val = arch_names_table[i].model;
29949
29950 /* CPU types are stored in the next field. */
29951 if (field_val > M_CPU_TYPE_START
29952 && field_val < M_CPU_SUBTYPE_START)
29953 {
29954 field = DECL_CHAIN (field);
29955 field_val -= M_CPU_TYPE_START;
29956 }
29957
29958 /* CPU subtypes are stored in the next field. */
29959 if (field_val > M_CPU_SUBTYPE_START)
29960 {
29961 field = DECL_CHAIN (DECL_CHAIN (field));
29962 field_val -= M_CPU_SUBTYPE_START;
29963 }
29964
29965 /* Get the appropriate field in __cpu_model. */
29966 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29967 field, NULL_TREE);
29968
29969 /* Check the value. */
29970 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29971 build_int_cstu (unsigned_type_node, field_val));
29972 return build1 (CONVERT_EXPR, integer_type_node, final);
29973 }
29974 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29975 {
29976 tree ref;
29977 tree array_elt;
29978 tree field;
29979 tree final;
29980
29981 unsigned int field_val = 0;
29982 unsigned int NUM_ISA_NAMES
29983 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29984
29985 for (i = 0; i < NUM_ISA_NAMES; i++)
29986 if (strcmp (isa_names_table[i].name,
29987 TREE_STRING_POINTER (param_string_cst)) == 0)
29988 break;
29989
29990 if (i == NUM_ISA_NAMES)
29991 {
29992 error ("Parameter to builtin not valid: %s",
29993 TREE_STRING_POINTER (param_string_cst));
29994 return integer_zero_node;
29995 }
29996
29997 field = TYPE_FIELDS (__processor_model_type);
29998 /* Get the last field, which is __cpu_features. */
29999 while (DECL_CHAIN (field))
30000 field = DECL_CHAIN (field);
30001
30002 /* Get the appropriate field: __cpu_model.__cpu_features */
30003 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30004 field, NULL_TREE);
30005
30006 /* Access the 0th element of __cpu_features array. */
30007 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30008 integer_zero_node, NULL_TREE, NULL_TREE);
30009
30010 field_val = (1 << isa_names_table[i].feature);
30011 /* Return __cpu_model.__cpu_features[0] & field_val */
30012 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30013 build_int_cstu (unsigned_type_node, field_val));
30014 return build1 (CONVERT_EXPR, integer_type_node, final);
30015 }
30016 gcc_unreachable ();
30017 }
30018
30019 static tree
30020 ix86_fold_builtin (tree fndecl, int n_args,
30021 tree *args, bool ignore ATTRIBUTE_UNUSED)
30022 {
30023 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30024 {
30025 enum ix86_builtins fn_code = (enum ix86_builtins)
30026 DECL_FUNCTION_CODE (fndecl);
30027 if (fn_code == IX86_BUILTIN_CPU_IS
30028 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30029 {
30030 gcc_assert (n_args == 1);
30031 return fold_builtin_cpu (fndecl, args);
30032 }
30033 }
30034
30035 #ifdef SUBTARGET_FOLD_BUILTIN
30036 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30037 #endif
30038
30039 return NULL_TREE;
30040 }
30041
30042 /* Make builtins to detect cpu type and features supported. NAME is
30043 the builtin name, CODE is the builtin code, and FTYPE is the function
30044 type of the builtin. */
30045
30046 static void
30047 make_cpu_type_builtin (const char* name, int code,
30048 enum ix86_builtin_func_type ftype, bool is_const)
30049 {
30050 tree decl;
30051 tree type;
30052
30053 type = ix86_get_builtin_func_type (ftype);
30054 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30055 NULL, NULL_TREE);
30056 gcc_assert (decl != NULL_TREE);
30057 ix86_builtins[(int) code] = decl;
30058 TREE_READONLY (decl) = is_const;
30059 }
30060
30061 /* Make builtins to get CPU type and features supported. The created
30062 builtins are:
30063
30064 __builtin_cpu_init (), to detect cpu type and features,
30065 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30066 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30067 */
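/* Typical use of these builtins (illustrative):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       ...
     if (__builtin_cpu_supports ("avx2"))
       ...
 */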
30068
30069 static void
30070 ix86_init_platform_type_builtins (void)
30071 {
30072 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30073 INT_FTYPE_VOID, false);
30074 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30075 INT_FTYPE_PCCHAR, true);
30076 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30077 INT_FTYPE_PCCHAR, true);
30078 }
30079
30080 /* Internal method for ix86_init_builtins. */
30081
30082 static void
30083 ix86_init_builtins_va_builtins_abi (void)
30084 {
30085 tree ms_va_ref, sysv_va_ref;
30086 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30087 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30088 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30089 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30090
30091 if (!TARGET_64BIT)
30092 return;
30093 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30094 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30095 ms_va_ref = build_reference_type (ms_va_list_type_node);
30096 sysv_va_ref =
30097 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30098
30099 fnvoid_va_end_ms =
30100 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30101 fnvoid_va_start_ms =
30102 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30103 fnvoid_va_end_sysv =
30104 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30105 fnvoid_va_start_sysv =
30106 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30107 NULL_TREE);
30108 fnvoid_va_copy_ms =
30109 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30110 NULL_TREE);
30111 fnvoid_va_copy_sysv =
30112 build_function_type_list (void_type_node, sysv_va_ref,
30113 sysv_va_ref, NULL_TREE);
30114
30115 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30116 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30117 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30118 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30119 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30120 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30121 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30122 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30123 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30124 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30125 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30126 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30127 }
30128
30129 static void
30130 ix86_init_builtin_types (void)
30131 {
30132 tree float128_type_node, float80_type_node;
30133
30134 /* The __float80 type. */
30135 float80_type_node = long_double_type_node;
30136 if (TYPE_MODE (float80_type_node) != XFmode)
30137 {
30138 /* The __float80 type. */
30139 float80_type_node = make_node (REAL_TYPE);
30140
30141 TYPE_PRECISION (float80_type_node) = 80;
30142 layout_type (float80_type_node);
30143 }
30144 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30145
30146 /* The __float128 type. */
30147 float128_type_node = make_node (REAL_TYPE);
30148 TYPE_PRECISION (float128_type_node) = 128;
30149 layout_type (float128_type_node);
30150 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30151
30152 /* This macro is built by i386-builtin-types.awk. */
30153 DEFINE_BUILTIN_PRIMITIVE_TYPES;
30154 }
30155
30156 static void
30157 ix86_init_builtins (void)
30158 {
30159 tree t;
30160
30161 ix86_init_builtin_types ();
30162
30163 /* Builtins to get CPU type and features. */
30164 ix86_init_platform_type_builtins ();
30165
30166 /* TFmode support builtins. */
30167 def_builtin_const (0, "__builtin_infq",
30168 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30169 def_builtin_const (0, "__builtin_huge_valq",
30170 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
30171
30172 /* We will expand them to a normal call if SSE isn't available since
30173 they are used by libgcc. */
30174 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30175 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30176 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30177 TREE_READONLY (t) = 1;
30178 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30179
30180 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30181 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30182 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30183 TREE_READONLY (t) = 1;
30184 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30185
30186 ix86_init_tm_builtins ();
30187 ix86_init_mmx_sse_builtins ();
30188
30189 if (TARGET_LP64)
30190 ix86_init_builtins_va_builtins_abi ();
30191
30192 #ifdef SUBTARGET_INIT_BUILTINS
30193 SUBTARGET_INIT_BUILTINS;
30194 #endif
30195 }
30196
30197 /* Return the ix86 builtin for CODE. */
30198
30199 static tree
30200 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30201 {
30202 if (code >= IX86_BUILTIN_MAX)
30203 return error_mark_node;
30204
30205 return ix86_builtins[code];
30206 }
30207
30208 /* Errors in the source file can cause expand_expr to return const0_rtx
30209 where we expect a vector. To avoid crashing, use one of the vector
30210 clear instructions. */
30211 static rtx
30212 safe_vector_operand (rtx x, enum machine_mode mode)
30213 {
30214 if (x == const0_rtx)
30215 x = CONST0_RTX (mode);
30216 return x;
30217 }
30218
30219 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30220
30221 static rtx
30222 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30223 {
30224 rtx pat;
30225 tree arg0 = CALL_EXPR_ARG (exp, 0);
30226 tree arg1 = CALL_EXPR_ARG (exp, 1);
30227 rtx op0 = expand_normal (arg0);
30228 rtx op1 = expand_normal (arg1);
30229 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30230 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30231 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30232
30233 if (VECTOR_MODE_P (mode0))
30234 op0 = safe_vector_operand (op0, mode0);
30235 if (VECTOR_MODE_P (mode1))
30236 op1 = safe_vector_operand (op1, mode1);
30237
30238 if (optimize || !target
30239 || GET_MODE (target) != tmode
30240 || !insn_data[icode].operand[0].predicate (target, tmode))
30241 target = gen_reg_rtx (tmode);
30242
30243 if (GET_MODE (op1) == SImode && mode1 == TImode)
30244 {
30245 rtx x = gen_reg_rtx (V4SImode);
30246 emit_insn (gen_sse2_loadd (x, op1));
30247 op1 = gen_lowpart (TImode, x);
30248 }
30249
30250 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30251 op0 = copy_to_mode_reg (mode0, op0);
30252 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30253 op1 = copy_to_mode_reg (mode1, op1);
30254
30255 pat = GEN_FCN (icode) (target, op0, op1);
30256 if (! pat)
30257 return 0;
30258
30259 emit_insn (pat);
30260
30261 return target;
30262 }
30263
30264 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30265
30266 static rtx
30267 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30268 enum ix86_builtin_func_type m_type,
30269 enum rtx_code sub_code)
30270 {
30271 rtx pat;
30272 int i;
30273 int nargs;
30274 bool comparison_p = false;
30275 bool tf_p = false;
30276 bool last_arg_constant = false;
30277 int num_memory = 0;
30278 struct {
30279 rtx op;
30280 enum machine_mode mode;
30281 } args[4];
30282
30283 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30284
30285 switch (m_type)
30286 {
30287 case MULTI_ARG_4_DF2_DI_I:
30288 case MULTI_ARG_4_DF2_DI_I1:
30289 case MULTI_ARG_4_SF2_SI_I:
30290 case MULTI_ARG_4_SF2_SI_I1:
30291 nargs = 4;
30292 last_arg_constant = true;
30293 break;
30294
30295 case MULTI_ARG_3_SF:
30296 case MULTI_ARG_3_DF:
30297 case MULTI_ARG_3_SF2:
30298 case MULTI_ARG_3_DF2:
30299 case MULTI_ARG_3_DI:
30300 case MULTI_ARG_3_SI:
30301 case MULTI_ARG_3_SI_DI:
30302 case MULTI_ARG_3_HI:
30303 case MULTI_ARG_3_HI_SI:
30304 case MULTI_ARG_3_QI:
30305 case MULTI_ARG_3_DI2:
30306 case MULTI_ARG_3_SI2:
30307 case MULTI_ARG_3_HI2:
30308 case MULTI_ARG_3_QI2:
30309 nargs = 3;
30310 break;
30311
30312 case MULTI_ARG_2_SF:
30313 case MULTI_ARG_2_DF:
30314 case MULTI_ARG_2_DI:
30315 case MULTI_ARG_2_SI:
30316 case MULTI_ARG_2_HI:
30317 case MULTI_ARG_2_QI:
30318 nargs = 2;
30319 break;
30320
30321 case MULTI_ARG_2_DI_IMM:
30322 case MULTI_ARG_2_SI_IMM:
30323 case MULTI_ARG_2_HI_IMM:
30324 case MULTI_ARG_2_QI_IMM:
30325 nargs = 2;
30326 last_arg_constant = true;
30327 break;
30328
30329 case MULTI_ARG_1_SF:
30330 case MULTI_ARG_1_DF:
30331 case MULTI_ARG_1_SF2:
30332 case MULTI_ARG_1_DF2:
30333 case MULTI_ARG_1_DI:
30334 case MULTI_ARG_1_SI:
30335 case MULTI_ARG_1_HI:
30336 case MULTI_ARG_1_QI:
30337 case MULTI_ARG_1_SI_DI:
30338 case MULTI_ARG_1_HI_DI:
30339 case MULTI_ARG_1_HI_SI:
30340 case MULTI_ARG_1_QI_DI:
30341 case MULTI_ARG_1_QI_SI:
30342 case MULTI_ARG_1_QI_HI:
30343 nargs = 1;
30344 break;
30345
30346 case MULTI_ARG_2_DI_CMP:
30347 case MULTI_ARG_2_SI_CMP:
30348 case MULTI_ARG_2_HI_CMP:
30349 case MULTI_ARG_2_QI_CMP:
30350 nargs = 2;
30351 comparison_p = true;
30352 break;
30353
30354 case MULTI_ARG_2_SF_TF:
30355 case MULTI_ARG_2_DF_TF:
30356 case MULTI_ARG_2_DI_TF:
30357 case MULTI_ARG_2_SI_TF:
30358 case MULTI_ARG_2_HI_TF:
30359 case MULTI_ARG_2_QI_TF:
30360 nargs = 2;
30361 tf_p = true;
30362 break;
30363
30364 default:
30365 gcc_unreachable ();
30366 }
30367
30368 if (optimize || !target
30369 || GET_MODE (target) != tmode
30370 || !insn_data[icode].operand[0].predicate (target, tmode))
30371 target = gen_reg_rtx (tmode);
30372
30373 gcc_assert (nargs <= 4);
30374
30375 for (i = 0; i < nargs; i++)
30376 {
30377 tree arg = CALL_EXPR_ARG (exp, i);
30378 rtx op = expand_normal (arg);
30379 int adjust = (comparison_p) ? 1 : 0;
30380 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30381
30382 if (last_arg_constant && i == nargs - 1)
30383 {
30384 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30385 {
30386 enum insn_code new_icode = icode;
30387 switch (icode)
30388 {
30389 case CODE_FOR_xop_vpermil2v2df3:
30390 case CODE_FOR_xop_vpermil2v4sf3:
30391 case CODE_FOR_xop_vpermil2v4df3:
30392 case CODE_FOR_xop_vpermil2v8sf3:
30393 error ("the last argument must be a 2-bit immediate");
30394 return gen_reg_rtx (tmode);
30395 case CODE_FOR_xop_rotlv2di3:
30396 new_icode = CODE_FOR_rotlv2di3;
30397 goto xop_rotl;
30398 case CODE_FOR_xop_rotlv4si3:
30399 new_icode = CODE_FOR_rotlv4si3;
30400 goto xop_rotl;
30401 case CODE_FOR_xop_rotlv8hi3:
30402 new_icode = CODE_FOR_rotlv8hi3;
30403 goto xop_rotl;
30404 case CODE_FOR_xop_rotlv16qi3:
30405 new_icode = CODE_FOR_rotlv16qi3;
30406 xop_rotl:
30407 if (CONST_INT_P (op))
30408 {
30409 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30410 op = GEN_INT (INTVAL (op) & mask);
30411 gcc_checking_assert
30412 (insn_data[icode].operand[i + 1].predicate (op, mode));
30413 }
30414 else
30415 {
30416 gcc_checking_assert
30417 (nargs == 2
30418 && insn_data[new_icode].operand[0].mode == tmode
30419 && insn_data[new_icode].operand[1].mode == tmode
30420 && insn_data[new_icode].operand[2].mode == mode
30421 && insn_data[new_icode].operand[0].predicate
30422 == insn_data[icode].operand[0].predicate
30423 && insn_data[new_icode].operand[1].predicate
30424 == insn_data[icode].operand[1].predicate);
30425 icode = new_icode;
30426 goto non_constant;
30427 }
30428 break;
30429 default:
30430 gcc_unreachable ();
30431 }
30432 }
30433 }
30434 else
30435 {
30436 non_constant:
30437 if (VECTOR_MODE_P (mode))
30438 op = safe_vector_operand (op, mode);
30439
30440 /* If we aren't optimizing, only allow one memory operand to be
30441 generated. */
30442 if (memory_operand (op, mode))
30443 num_memory++;
30444
30445 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30446
30447 if (optimize
30448 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30449 || num_memory > 1)
30450 op = force_reg (mode, op);
30451 }
30452
30453 args[i].op = op;
30454 args[i].mode = mode;
30455 }
30456
30457 switch (nargs)
30458 {
30459 case 1:
30460 pat = GEN_FCN (icode) (target, args[0].op);
30461 break;
30462
30463 case 2:
30464 if (tf_p)
30465 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30466 GEN_INT ((int)sub_code));
30467 else if (! comparison_p)
30468 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30469 else
30470 {
30471 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30472 args[0].op,
30473 args[1].op);
30474
30475 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30476 }
30477 break;
30478
30479 case 3:
30480 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30481 break;
30482
30483 case 4:
30484 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30485 break;
30486
30487 default:
30488 gcc_unreachable ();
30489 }
30490
30491 if (! pat)
30492 return 0;
30493
30494 emit_insn (pat);
30495 return target;
30496 }
30497
30498 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30499 insns with vec_merge. */
30500
30501 static rtx
30502 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30503 rtx target)
30504 {
30505 rtx pat;
30506 tree arg0 = CALL_EXPR_ARG (exp, 0);
30507 rtx op1, op0 = expand_normal (arg0);
30508 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30509 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30510
30511 if (optimize || !target
30512 || GET_MODE (target) != tmode
30513 || !insn_data[icode].operand[0].predicate (target, tmode))
30514 target = gen_reg_rtx (tmode);
30515
30516 if (VECTOR_MODE_P (mode0))
30517 op0 = safe_vector_operand (op0, mode0);
30518
30519 if ((optimize && !register_operand (op0, mode0))
30520 || !insn_data[icode].operand[1].predicate (op0, mode0))
30521 op0 = copy_to_mode_reg (mode0, op0);
30522
30523 op1 = op0;
30524 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30525 op1 = copy_to_mode_reg (mode0, op1);
30526
30527 pat = GEN_FCN (icode) (target, op0, op1);
30528 if (! pat)
30529 return 0;
30530 emit_insn (pat);
30531 return target;
30532 }
30533
30534 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30535
30536 static rtx
30537 ix86_expand_sse_compare (const struct builtin_description *d,
30538 tree exp, rtx target, bool swap)
30539 {
30540 rtx pat;
30541 tree arg0 = CALL_EXPR_ARG (exp, 0);
30542 tree arg1 = CALL_EXPR_ARG (exp, 1);
30543 rtx op0 = expand_normal (arg0);
30544 rtx op1 = expand_normal (arg1);
30545 rtx op2;
30546 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30547 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30548 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30549 enum rtx_code comparison = d->comparison;
30550
30551 if (VECTOR_MODE_P (mode0))
30552 op0 = safe_vector_operand (op0, mode0);
30553 if (VECTOR_MODE_P (mode1))
30554 op1 = safe_vector_operand (op1, mode1);
30555
30556 /* Swap operands if we have a comparison that isn't available in
30557 hardware. */
30558 if (swap)
30559 {
30560 rtx tmp = gen_reg_rtx (mode1);
30561 emit_move_insn (tmp, op1);
30562 op1 = op0;
30563 op0 = tmp;
30564 }
30565
30566 if (optimize || !target
30567 || GET_MODE (target) != tmode
30568 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30569 target = gen_reg_rtx (tmode);
30570
30571 if ((optimize && !register_operand (op0, mode0))
30572 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30573 op0 = copy_to_mode_reg (mode0, op0);
30574 if ((optimize && !register_operand (op1, mode1))
30575 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30576 op1 = copy_to_mode_reg (mode1, op1);
30577
30578 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30579 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30580 if (! pat)
30581 return 0;
30582 emit_insn (pat);
30583 return target;
30584 }
30585
30586 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30587
30588 static rtx
30589 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30590 rtx target)
30591 {
30592 rtx pat;
30593 tree arg0 = CALL_EXPR_ARG (exp, 0);
30594 tree arg1 = CALL_EXPR_ARG (exp, 1);
30595 rtx op0 = expand_normal (arg0);
30596 rtx op1 = expand_normal (arg1);
30597 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30598 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30599 enum rtx_code comparison = d->comparison;
30600
30601 if (VECTOR_MODE_P (mode0))
30602 op0 = safe_vector_operand (op0, mode0);
30603 if (VECTOR_MODE_P (mode1))
30604 op1 = safe_vector_operand (op1, mode1);
30605
30606 /* Swap operands if we have a comparison that isn't available in
30607 hardware. */
30608 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30609 {
30610 rtx tmp = op1;
30611 op1 = op0;
30612 op0 = tmp;
30613 }
30614
30615 target = gen_reg_rtx (SImode);
30616 emit_move_insn (target, const0_rtx);
30617 target = gen_rtx_SUBREG (QImode, target, 0);
30618
30619 if ((optimize && !register_operand (op0, mode0))
30620 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30621 op0 = copy_to_mode_reg (mode0, op0);
30622 if ((optimize && !register_operand (op1, mode1))
30623 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30624 op1 = copy_to_mode_reg (mode1, op1);
30625
30626 pat = GEN_FCN (d->icode) (op0, op1);
30627 if (! pat)
30628 return 0;
30629 emit_insn (pat);
30630 emit_insn (gen_rtx_SET (VOIDmode,
30631 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30632 gen_rtx_fmt_ee (comparison, QImode,
30633 SET_DEST (pat),
30634 const0_rtx)));
30635
30636 return SUBREG_REG (target);
30637 }
30638
30639 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30640
30641 static rtx
30642 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30643 rtx target)
30644 {
30645 rtx pat;
30646 tree arg0 = CALL_EXPR_ARG (exp, 0);
30647 rtx op1, op0 = expand_normal (arg0);
30648 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30649 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30650
30651 if (optimize || target == 0
30652 || GET_MODE (target) != tmode
30653 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30654 target = gen_reg_rtx (tmode);
30655
30656 if (VECTOR_MODE_P (mode0))
30657 op0 = safe_vector_operand (op0, mode0);
30658
30659 if ((optimize && !register_operand (op0, mode0))
30660 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30661 op0 = copy_to_mode_reg (mode0, op0);
30662
30663 op1 = GEN_INT (d->comparison);
30664
30665 pat = GEN_FCN (d->icode) (target, op0, op1);
30666 if (! pat)
30667 return 0;
30668 emit_insn (pat);
30669 return target;
30670 }
30671
30672 static rtx
30673 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30674 tree exp, rtx target)
30675 {
30676 rtx pat;
30677 tree arg0 = CALL_EXPR_ARG (exp, 0);
30678 tree arg1 = CALL_EXPR_ARG (exp, 1);
30679 rtx op0 = expand_normal (arg0);
30680 rtx op1 = expand_normal (arg1);
30681 rtx op2;
30682 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30683 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30684 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30685
30686 if (optimize || target == 0
30687 || GET_MODE (target) != tmode
30688 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30689 target = gen_reg_rtx (tmode);
30690
30691 op0 = safe_vector_operand (op0, mode0);
30692 op1 = safe_vector_operand (op1, mode1);
30693
30694 if ((optimize && !register_operand (op0, mode0))
30695 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30696 op0 = copy_to_mode_reg (mode0, op0);
30697 if ((optimize && !register_operand (op1, mode1))
30698 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30699 op1 = copy_to_mode_reg (mode1, op1);
30700
30701 op2 = GEN_INT (d->comparison);
30702
30703 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30704 if (! pat)
30705 return 0;
30706 emit_insn (pat);
30707 return target;
30708 }
30709
30710 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30711
30712 static rtx
30713 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30714 rtx target)
30715 {
30716 rtx pat;
30717 tree arg0 = CALL_EXPR_ARG (exp, 0);
30718 tree arg1 = CALL_EXPR_ARG (exp, 1);
30719 rtx op0 = expand_normal (arg0);
30720 rtx op1 = expand_normal (arg1);
30721 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30722 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30723 enum rtx_code comparison = d->comparison;
30724
30725 if (VECTOR_MODE_P (mode0))
30726 op0 = safe_vector_operand (op0, mode0);
30727 if (VECTOR_MODE_P (mode1))
30728 op1 = safe_vector_operand (op1, mode1);
30729
30730 target = gen_reg_rtx (SImode);
30731 emit_move_insn (target, const0_rtx);
30732 target = gen_rtx_SUBREG (QImode, target, 0);
30733
30734 if ((optimize && !register_operand (op0, mode0))
30735 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30736 op0 = copy_to_mode_reg (mode0, op0);
30737 if ((optimize && !register_operand (op1, mode1))
30738 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30739 op1 = copy_to_mode_reg (mode1, op1);
30740
30741 pat = GEN_FCN (d->icode) (op0, op1);
30742 if (! pat)
30743 return 0;
30744 emit_insn (pat);
30745 emit_insn (gen_rtx_SET (VOIDmode,
30746 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30747 gen_rtx_fmt_ee (comparison, QImode,
30748 SET_DEST (pat),
30749 const0_rtx)));
30750
30751 return SUBREG_REG (target);
30752 }
30753
30754 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30755
30756 static rtx
30757 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30758 tree exp, rtx target)
30759 {
30760 rtx pat;
30761 tree arg0 = CALL_EXPR_ARG (exp, 0);
30762 tree arg1 = CALL_EXPR_ARG (exp, 1);
30763 tree arg2 = CALL_EXPR_ARG (exp, 2);
30764 tree arg3 = CALL_EXPR_ARG (exp, 3);
30765 tree arg4 = CALL_EXPR_ARG (exp, 4);
30766 rtx scratch0, scratch1;
30767 rtx op0 = expand_normal (arg0);
30768 rtx op1 = expand_normal (arg1);
30769 rtx op2 = expand_normal (arg2);
30770 rtx op3 = expand_normal (arg3);
30771 rtx op4 = expand_normal (arg4);
30772 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30773
30774 tmode0 = insn_data[d->icode].operand[0].mode;
30775 tmode1 = insn_data[d->icode].operand[1].mode;
30776 modev2 = insn_data[d->icode].operand[2].mode;
30777 modei3 = insn_data[d->icode].operand[3].mode;
30778 modev4 = insn_data[d->icode].operand[4].mode;
30779 modei5 = insn_data[d->icode].operand[5].mode;
30780 modeimm = insn_data[d->icode].operand[6].mode;
30781
30782 if (VECTOR_MODE_P (modev2))
30783 op0 = safe_vector_operand (op0, modev2);
30784 if (VECTOR_MODE_P (modev4))
30785 op2 = safe_vector_operand (op2, modev4);
30786
30787 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30788 op0 = copy_to_mode_reg (modev2, op0);
30789 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30790 op1 = copy_to_mode_reg (modei3, op1);
30791 if ((optimize && !register_operand (op2, modev4))
30792 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30793 op2 = copy_to_mode_reg (modev4, op2);
30794 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30795 op3 = copy_to_mode_reg (modei5, op3);
30796
30797 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30798 {
30799 error ("the fifth argument must be an 8-bit immediate");
30800 return const0_rtx;
30801 }
30802
30803 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30804 {
30805 if (optimize || !target
30806 || GET_MODE (target) != tmode0
30807 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30808 target = gen_reg_rtx (tmode0);
30809
30810 scratch1 = gen_reg_rtx (tmode1);
30811
30812 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30813 }
30814 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30815 {
30816 if (optimize || !target
30817 || GET_MODE (target) != tmode1
30818 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30819 target = gen_reg_rtx (tmode1);
30820
30821 scratch0 = gen_reg_rtx (tmode0);
30822
30823 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30824 }
30825 else
30826 {
30827 gcc_assert (d->flag);
30828
30829 scratch0 = gen_reg_rtx (tmode0);
30830 scratch1 = gen_reg_rtx (tmode1);
30831
30832 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30833 }
30834
30835 if (! pat)
30836 return 0;
30837
30838 emit_insn (pat);
30839
30840 if (d->flag)
30841 {
30842 target = gen_reg_rtx (SImode);
30843 emit_move_insn (target, const0_rtx);
30844 target = gen_rtx_SUBREG (QImode, target, 0);
30845
30846 emit_insn
30847 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30848 gen_rtx_fmt_ee (EQ, QImode,
30849 gen_rtx_REG ((enum machine_mode) d->flag,
30850 FLAGS_REG),
30851 const0_rtx)));
30852 return SUBREG_REG (target);
30853 }
30854 else
30855 return target;
30856 }
30857
30858
30859 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30860
30861 static rtx
30862 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30863 tree exp, rtx target)
30864 {
30865 rtx pat;
30866 tree arg0 = CALL_EXPR_ARG (exp, 0);
30867 tree arg1 = CALL_EXPR_ARG (exp, 1);
30868 tree arg2 = CALL_EXPR_ARG (exp, 2);
30869 rtx scratch0, scratch1;
30870 rtx op0 = expand_normal (arg0);
30871 rtx op1 = expand_normal (arg1);
30872 rtx op2 = expand_normal (arg2);
30873 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30874
30875 tmode0 = insn_data[d->icode].operand[0].mode;
30876 tmode1 = insn_data[d->icode].operand[1].mode;
30877 modev2 = insn_data[d->icode].operand[2].mode;
30878 modev3 = insn_data[d->icode].operand[3].mode;
30879 modeimm = insn_data[d->icode].operand[4].mode;
30880
30881 if (VECTOR_MODE_P (modev2))
30882 op0 = safe_vector_operand (op0, modev2);
30883 if (VECTOR_MODE_P (modev3))
30884 op1 = safe_vector_operand (op1, modev3);
30885
30886 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30887 op0 = copy_to_mode_reg (modev2, op0);
30888 if ((optimize && !register_operand (op1, modev3))
30889 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30890 op1 = copy_to_mode_reg (modev3, op1);
30891
30892 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30893 {
30894 error ("the third argument must be an 8-bit immediate");
30895 return const0_rtx;
30896 }
30897
30898 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30899 {
30900 if (optimize || !target
30901 || GET_MODE (target) != tmode0
30902 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30903 target = gen_reg_rtx (tmode0);
30904
30905 scratch1 = gen_reg_rtx (tmode1);
30906
30907 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30908 }
30909 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30910 {
30911 if (optimize || !target
30912 || GET_MODE (target) != tmode1
30913 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30914 target = gen_reg_rtx (tmode1);
30915
30916 scratch0 = gen_reg_rtx (tmode0);
30917
30918 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30919 }
30920 else
30921 {
30922 gcc_assert (d->flag);
30923
30924 scratch0 = gen_reg_rtx (tmode0);
30925 scratch1 = gen_reg_rtx (tmode1);
30926
30927 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30928 }
30929
30930 if (! pat)
30931 return 0;
30932
30933 emit_insn (pat);
30934
30935 if (d->flag)
30936 {
30937 target = gen_reg_rtx (SImode);
30938 emit_move_insn (target, const0_rtx);
30939 target = gen_rtx_SUBREG (QImode, target, 0);
30940
30941 emit_insn
30942 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30943 gen_rtx_fmt_ee (EQ, QImode,
30944 gen_rtx_REG ((enum machine_mode) d->flag,
30945 FLAGS_REG),
30946 const0_rtx)));
30947 return SUBREG_REG (target);
30948 }
30949 else
30950 return target;
30951 }
30952
30953 /* Subroutine of ix86_expand_builtin to take care of insns with a
30954 variable number of operands. */
30955
30956 static rtx
30957 ix86_expand_args_builtin (const struct builtin_description *d,
30958 tree exp, rtx target)
30959 {
30960 rtx pat, real_target;
30961 unsigned int i, nargs;
30962 unsigned int nargs_constant = 0;
30963 int num_memory = 0;
30964 struct
30965 {
30966 rtx op;
30967 enum machine_mode mode;
30968 } args[4];
30969 bool last_arg_count = false;
30970 enum insn_code icode = d->icode;
30971 const struct insn_data_d *insn_p = &insn_data[icode];
30972 enum machine_mode tmode = insn_p->operand[0].mode;
30973 enum machine_mode rmode = VOIDmode;
30974 bool swap = false;
30975 enum rtx_code comparison = d->comparison;
30976
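/* For these builtins, d->flag encodes the ix86_builtin_func_type; use it
   to determine the operand count, how many trailing operands must be
   constants, and which cases are handled by dedicated helpers.  */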
30977 switch ((enum ix86_builtin_func_type) d->flag)
30978 {
30979 case V2DF_FTYPE_V2DF_ROUND:
30980 case V4DF_FTYPE_V4DF_ROUND:
30981 case V4SF_FTYPE_V4SF_ROUND:
30982 case V8SF_FTYPE_V8SF_ROUND:
30983 case V4SI_FTYPE_V4SF_ROUND:
30984 case V8SI_FTYPE_V8SF_ROUND:
30985 return ix86_expand_sse_round (d, exp, target);
30986 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30987 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30988 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30989 case INT_FTYPE_V8SF_V8SF_PTEST:
30990 case INT_FTYPE_V4DI_V4DI_PTEST:
30991 case INT_FTYPE_V4DF_V4DF_PTEST:
30992 case INT_FTYPE_V4SF_V4SF_PTEST:
30993 case INT_FTYPE_V2DI_V2DI_PTEST:
30994 case INT_FTYPE_V2DF_V2DF_PTEST:
30995 return ix86_expand_sse_ptest (d, exp, target);
30996 case FLOAT128_FTYPE_FLOAT128:
30997 case FLOAT_FTYPE_FLOAT:
30998 case INT_FTYPE_INT:
30999 case UINT64_FTYPE_INT:
31000 case UINT16_FTYPE_UINT16:
31001 case INT64_FTYPE_INT64:
31002 case INT64_FTYPE_V4SF:
31003 case INT64_FTYPE_V2DF:
31004 case INT_FTYPE_V16QI:
31005 case INT_FTYPE_V8QI:
31006 case INT_FTYPE_V8SF:
31007 case INT_FTYPE_V4DF:
31008 case INT_FTYPE_V4SF:
31009 case INT_FTYPE_V2DF:
31010 case INT_FTYPE_V32QI:
31011 case V16QI_FTYPE_V16QI:
31012 case V8SI_FTYPE_V8SF:
31013 case V8SI_FTYPE_V4SI:
31014 case V8HI_FTYPE_V8HI:
31015 case V8HI_FTYPE_V16QI:
31016 case V8QI_FTYPE_V8QI:
31017 case V8SF_FTYPE_V8SF:
31018 case V8SF_FTYPE_V8SI:
31019 case V8SF_FTYPE_V4SF:
31020 case V8SF_FTYPE_V8HI:
31021 case V4SI_FTYPE_V4SI:
31022 case V4SI_FTYPE_V16QI:
31023 case V4SI_FTYPE_V4SF:
31024 case V4SI_FTYPE_V8SI:
31025 case V4SI_FTYPE_V8HI:
31026 case V4SI_FTYPE_V4DF:
31027 case V4SI_FTYPE_V2DF:
31028 case V4HI_FTYPE_V4HI:
31029 case V4DF_FTYPE_V4DF:
31030 case V4DF_FTYPE_V4SI:
31031 case V4DF_FTYPE_V4SF:
31032 case V4DF_FTYPE_V2DF:
31033 case V4SF_FTYPE_V4SF:
31034 case V4SF_FTYPE_V4SI:
31035 case V4SF_FTYPE_V8SF:
31036 case V4SF_FTYPE_V4DF:
31037 case V4SF_FTYPE_V8HI:
31038 case V4SF_FTYPE_V2DF:
31039 case V2DI_FTYPE_V2DI:
31040 case V2DI_FTYPE_V16QI:
31041 case V2DI_FTYPE_V8HI:
31042 case V2DI_FTYPE_V4SI:
31043 case V2DF_FTYPE_V2DF:
31044 case V2DF_FTYPE_V4SI:
31045 case V2DF_FTYPE_V4DF:
31046 case V2DF_FTYPE_V4SF:
31047 case V2DF_FTYPE_V2SI:
31048 case V2SI_FTYPE_V2SI:
31049 case V2SI_FTYPE_V4SF:
31050 case V2SI_FTYPE_V2SF:
31051 case V2SI_FTYPE_V2DF:
31052 case V2SF_FTYPE_V2SF:
31053 case V2SF_FTYPE_V2SI:
31054 case V32QI_FTYPE_V32QI:
31055 case V32QI_FTYPE_V16QI:
31056 case V16HI_FTYPE_V16HI:
31057 case V16HI_FTYPE_V8HI:
31058 case V8SI_FTYPE_V8SI:
31059 case V16HI_FTYPE_V16QI:
31060 case V8SI_FTYPE_V16QI:
31061 case V4DI_FTYPE_V16QI:
31062 case V8SI_FTYPE_V8HI:
31063 case V4DI_FTYPE_V8HI:
31064 case V4DI_FTYPE_V4SI:
31065 case V4DI_FTYPE_V2DI:
31066 nargs = 1;
31067 break;
31068 case V4SF_FTYPE_V4SF_VEC_MERGE:
31069 case V2DF_FTYPE_V2DF_VEC_MERGE:
31070 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31071 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31072 case V16QI_FTYPE_V16QI_V16QI:
31073 case V16QI_FTYPE_V8HI_V8HI:
31074 case V8QI_FTYPE_V8QI_V8QI:
31075 case V8QI_FTYPE_V4HI_V4HI:
31076 case V8HI_FTYPE_V8HI_V8HI:
31077 case V8HI_FTYPE_V16QI_V16QI:
31078 case V8HI_FTYPE_V4SI_V4SI:
31079 case V8SF_FTYPE_V8SF_V8SF:
31080 case V8SF_FTYPE_V8SF_V8SI:
31081 case V4SI_FTYPE_V4SI_V4SI:
31082 case V4SI_FTYPE_V8HI_V8HI:
31083 case V4SI_FTYPE_V4SF_V4SF:
31084 case V4SI_FTYPE_V2DF_V2DF:
31085 case V4HI_FTYPE_V4HI_V4HI:
31086 case V4HI_FTYPE_V8QI_V8QI:
31087 case V4HI_FTYPE_V2SI_V2SI:
31088 case V4DF_FTYPE_V4DF_V4DF:
31089 case V4DF_FTYPE_V4DF_V4DI:
31090 case V4SF_FTYPE_V4SF_V4SF:
31091 case V4SF_FTYPE_V4SF_V4SI:
31092 case V4SF_FTYPE_V4SF_V2SI:
31093 case V4SF_FTYPE_V4SF_V2DF:
31094 case V4SF_FTYPE_V4SF_DI:
31095 case V4SF_FTYPE_V4SF_SI:
31096 case V2DI_FTYPE_V2DI_V2DI:
31097 case V2DI_FTYPE_V16QI_V16QI:
31098 case V2DI_FTYPE_V4SI_V4SI:
31099 case V2UDI_FTYPE_V4USI_V4USI:
31100 case V2DI_FTYPE_V2DI_V16QI:
31101 case V2DI_FTYPE_V2DF_V2DF:
31102 case V2SI_FTYPE_V2SI_V2SI:
31103 case V2SI_FTYPE_V4HI_V4HI:
31104 case V2SI_FTYPE_V2SF_V2SF:
31105 case V2DF_FTYPE_V2DF_V2DF:
31106 case V2DF_FTYPE_V2DF_V4SF:
31107 case V2DF_FTYPE_V2DF_V2DI:
31108 case V2DF_FTYPE_V2DF_DI:
31109 case V2DF_FTYPE_V2DF_SI:
31110 case V2SF_FTYPE_V2SF_V2SF:
31111 case V1DI_FTYPE_V1DI_V1DI:
31112 case V1DI_FTYPE_V8QI_V8QI:
31113 case V1DI_FTYPE_V2SI_V2SI:
31114 case V32QI_FTYPE_V16HI_V16HI:
31115 case V16HI_FTYPE_V8SI_V8SI:
31116 case V32QI_FTYPE_V32QI_V32QI:
31117 case V16HI_FTYPE_V32QI_V32QI:
31118 case V16HI_FTYPE_V16HI_V16HI:
31119 case V8SI_FTYPE_V4DF_V4DF:
31120 case V8SI_FTYPE_V8SI_V8SI:
31121 case V8SI_FTYPE_V16HI_V16HI:
31122 case V4DI_FTYPE_V4DI_V4DI:
31123 case V4DI_FTYPE_V8SI_V8SI:
31124 case V4UDI_FTYPE_V8USI_V8USI:
31125 if (comparison == UNKNOWN)
31126 return ix86_expand_binop_builtin (icode, exp, target);
31127 nargs = 2;
31128 break;
31129 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31130 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31131 gcc_assert (comparison != UNKNOWN);
31132 nargs = 2;
31133 swap = true;
31134 break;
31135 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31136 case V16HI_FTYPE_V16HI_SI_COUNT:
31137 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31138 case V8SI_FTYPE_V8SI_SI_COUNT:
31139 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31140 case V4DI_FTYPE_V4DI_INT_COUNT:
31141 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31142 case V8HI_FTYPE_V8HI_SI_COUNT:
31143 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31144 case V4SI_FTYPE_V4SI_SI_COUNT:
31145 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31146 case V4HI_FTYPE_V4HI_SI_COUNT:
31147 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31148 case V2DI_FTYPE_V2DI_SI_COUNT:
31149 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31150 case V2SI_FTYPE_V2SI_SI_COUNT:
31151 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31152 case V1DI_FTYPE_V1DI_SI_COUNT:
31153 nargs = 2;
31154 last_arg_count = true;
31155 break;
31156 case UINT64_FTYPE_UINT64_UINT64:
31157 case UINT_FTYPE_UINT_UINT:
31158 case UINT_FTYPE_UINT_USHORT:
31159 case UINT_FTYPE_UINT_UCHAR:
31160 case UINT16_FTYPE_UINT16_INT:
31161 case UINT8_FTYPE_UINT8_INT:
31162 nargs = 2;
31163 break;
31164 case V2DI_FTYPE_V2DI_INT_CONVERT:
31165 nargs = 2;
31166 rmode = V1TImode;
31167 nargs_constant = 1;
31168 break;
31169 case V4DI_FTYPE_V4DI_INT_CONVERT:
31170 nargs = 2;
31171 rmode = V2TImode;
31172 nargs_constant = 1;
31173 break;
31174 case V8HI_FTYPE_V8HI_INT:
31175 case V8HI_FTYPE_V8SF_INT:
31176 case V8HI_FTYPE_V4SF_INT:
31177 case V8SF_FTYPE_V8SF_INT:
31178 case V4SI_FTYPE_V4SI_INT:
31179 case V4SI_FTYPE_V8SI_INT:
31180 case V4HI_FTYPE_V4HI_INT:
31181 case V4DF_FTYPE_V4DF_INT:
31182 case V4SF_FTYPE_V4SF_INT:
31183 case V4SF_FTYPE_V8SF_INT:
31184 case V2DI_FTYPE_V2DI_INT:
31185 case V2DF_FTYPE_V2DF_INT:
31186 case V2DF_FTYPE_V4DF_INT:
31187 case V16HI_FTYPE_V16HI_INT:
31188 case V8SI_FTYPE_V8SI_INT:
31189 case V4DI_FTYPE_V4DI_INT:
31190 case V2DI_FTYPE_V4DI_INT:
31191 nargs = 2;
31192 nargs_constant = 1;
31193 break;
31194 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31195 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31196 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31197 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31198 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31199 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31200 nargs = 3;
31201 break;
31202 case V32QI_FTYPE_V32QI_V32QI_INT:
31203 case V16HI_FTYPE_V16HI_V16HI_INT:
31204 case V16QI_FTYPE_V16QI_V16QI_INT:
31205 case V4DI_FTYPE_V4DI_V4DI_INT:
31206 case V8HI_FTYPE_V8HI_V8HI_INT:
31207 case V8SI_FTYPE_V8SI_V8SI_INT:
31208 case V8SI_FTYPE_V8SI_V4SI_INT:
31209 case V8SF_FTYPE_V8SF_V8SF_INT:
31210 case V8SF_FTYPE_V8SF_V4SF_INT:
31211 case V4SI_FTYPE_V4SI_V4SI_INT:
31212 case V4DF_FTYPE_V4DF_V4DF_INT:
31213 case V4DF_FTYPE_V4DF_V2DF_INT:
31214 case V4SF_FTYPE_V4SF_V4SF_INT:
31215 case V2DI_FTYPE_V2DI_V2DI_INT:
31216 case V4DI_FTYPE_V4DI_V2DI_INT:
31217 case V2DF_FTYPE_V2DF_V2DF_INT:
31218 nargs = 3;
31219 nargs_constant = 1;
31220 break;
31221 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31222 nargs = 3;
31223 rmode = V4DImode;
31224 nargs_constant = 1;
31225 break;
31226 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31227 nargs = 3;
31228 rmode = V2DImode;
31229 nargs_constant = 1;
31230 break;
31231 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31232 nargs = 3;
31233 rmode = DImode;
31234 nargs_constant = 1;
31235 break;
31236 case V2DI_FTYPE_V2DI_UINT_UINT:
31237 nargs = 3;
31238 nargs_constant = 2;
31239 break;
31240 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31241 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31242 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31243 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31244 nargs = 4;
31245 nargs_constant = 1;
31246 break;
31247 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31248 nargs = 4;
31249 nargs_constant = 2;
31250 break;
31251 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31252 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31253 nargs = 4;
31254 break;
31255 default:
31256 gcc_unreachable ();
31257 }
31258
31259 gcc_assert (nargs <= ARRAY_SIZE (args));
31260
31261 if (comparison != UNKNOWN)
31262 {
31263 gcc_assert (nargs == 2);
31264 return ix86_expand_sse_compare (d, exp, target, swap);
31265 }
31266
31267 if (rmode == VOIDmode || rmode == tmode)
31268 {
31269 if (optimize
31270 || target == 0
31271 || GET_MODE (target) != tmode
31272 || !insn_p->operand[0].predicate (target, tmode))
31273 target = gen_reg_rtx (tmode);
31274 real_target = target;
31275 }
31276 else
31277 {
31278 target = gen_reg_rtx (rmode);
31279 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31280 }
31281
31282 for (i = 0; i < nargs; i++)
31283 {
31284 tree arg = CALL_EXPR_ARG (exp, i);
31285 rtx op = expand_normal (arg);
31286 enum machine_mode mode = insn_p->operand[i + 1].mode;
31287 bool match = insn_p->operand[i + 1].predicate (op, mode);
31288
31289 if (last_arg_count && (i + 1) == nargs)
31290 {
31291 /* SIMD shift insns take either an 8-bit immediate or a
31292 register as the count, but the builtin functions take an int
31293 as the count. If the count doesn't match, put it in a register. */
31294 if (!match)
31295 {
31296 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31297 if (!insn_p->operand[i + 1].predicate (op, mode))
31298 op = copy_to_reg (op);
31299 }
31300 }
31301 else if ((nargs - i) <= nargs_constant)
31302 {
31303 if (!match)
31304 switch (icode)
31305 {
31306 case CODE_FOR_avx2_inserti128:
31307 case CODE_FOR_avx2_extracti128:
31308 error ("the last argument must be a 1-bit immediate");
31309 return const0_rtx;
31310
31311 case CODE_FOR_sse4_1_roundsd:
31312 case CODE_FOR_sse4_1_roundss:
31313
31314 case CODE_FOR_sse4_1_roundpd:
31315 case CODE_FOR_sse4_1_roundps:
31316 case CODE_FOR_avx_roundpd256:
31317 case CODE_FOR_avx_roundps256:
31318
31319 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31320 case CODE_FOR_sse4_1_roundps_sfix:
31321 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31322 case CODE_FOR_avx_roundps_sfix256:
31323
31324 case CODE_FOR_sse4_1_blendps:
31325 case CODE_FOR_avx_blendpd256:
31326 case CODE_FOR_avx_vpermilv4df:
31327 error ("the last argument must be a 4-bit immediate");
31328 return const0_rtx;
31329
31330 case CODE_FOR_sse4_1_blendpd:
31331 case CODE_FOR_avx_vpermilv2df:
31332 case CODE_FOR_xop_vpermil2v2df3:
31333 case CODE_FOR_xop_vpermil2v4sf3:
31334 case CODE_FOR_xop_vpermil2v4df3:
31335 case CODE_FOR_xop_vpermil2v8sf3:
31336 error ("the last argument must be a 2-bit immediate");
31337 return const0_rtx;
31338
31339 case CODE_FOR_avx_vextractf128v4df:
31340 case CODE_FOR_avx_vextractf128v8sf:
31341 case CODE_FOR_avx_vextractf128v8si:
31342 case CODE_FOR_avx_vinsertf128v4df:
31343 case CODE_FOR_avx_vinsertf128v8sf:
31344 case CODE_FOR_avx_vinsertf128v8si:
31345 error ("the last argument must be a 1-bit immediate");
31346 return const0_rtx;
31347
31348 case CODE_FOR_avx_vmcmpv2df3:
31349 case CODE_FOR_avx_vmcmpv4sf3:
31350 case CODE_FOR_avx_cmpv2df3:
31351 case CODE_FOR_avx_cmpv4sf3:
31352 case CODE_FOR_avx_cmpv4df3:
31353 case CODE_FOR_avx_cmpv8sf3:
31354 error ("the last argument must be a 5-bit immediate");
31355 return const0_rtx;
31356
31357 default:
31358 switch (nargs_constant)
31359 {
31360 case 2:
31361 if ((nargs - i) == nargs_constant)
31362 {
31363 error ("the next to last argument must be an 8-bit immediate");
31364 break;
31365 }
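/* FALLTHRU */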
31366 case 1:
31367 error ("the last argument must be an 8-bit immediate");
31368 break;
31369 default:
31370 gcc_unreachable ();
31371 }
31372 return const0_rtx;
31373 }
31374 }
31375 else
31376 {
31377 if (VECTOR_MODE_P (mode))
31378 op = safe_vector_operand (op, mode);
31379
31380 /* If we aren't optimizing, only allow one memory operand to
31381 be generated. */
31382 if (memory_operand (op, mode))
31383 num_memory++;
31384
31385 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31386 {
31387 if (optimize || !match || num_memory > 1)
31388 op = copy_to_mode_reg (mode, op);
31389 }
31390 else
31391 {
31392 op = copy_to_reg (op);
31393 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31394 }
31395 }
31396
31397 args[i].op = op;
31398 args[i].mode = mode;
31399 }
31400
31401 switch (nargs)
31402 {
31403 case 1:
31404 pat = GEN_FCN (icode) (real_target, args[0].op);
31405 break;
31406 case 2:
31407 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31408 break;
31409 case 3:
31410 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31411 args[2].op);
31412 break;
31413 case 4:
31414 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31415 args[2].op, args[3].op);
31416 break;
31417 default:
31418 gcc_unreachable ();
31419 }
31420
31421 if (! pat)
31422 return 0;
31423
31424 emit_insn (pat);
31425 return target;
31426 }
31427
31428 /* Subroutine of ix86_expand_builtin to take care of special insns
31429 with a variable number of operands. */
31430
31431 static rtx
31432 ix86_expand_special_args_builtin (const struct builtin_description *d,
31433 tree exp, rtx target)
31434 {
31435 tree arg;
31436 rtx pat, op;
31437 unsigned int i, nargs, arg_adjust, memory;
31438 struct
31439 {
31440 rtx op;
31441 enum machine_mode mode;
31442 } args[3];
31443 enum insn_code icode = d->icode;
31444 bool last_arg_constant = false;
31445 const struct insn_data_d *insn_p = &insn_data[icode];
31446 enum machine_mode tmode = insn_p->operand[0].mode;
31447 enum { load, store } klass;
31448
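/* KLASS says whether the builtin loads a value into TARGET or stores
   through its first pointer argument; MEMORY is the index of the argument
   (if any) that must be expanded as a memory reference, with
   ARRAY_SIZE (args) meaning none of the arguments is one.  */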
31449 switch ((enum ix86_builtin_func_type) d->flag)
31450 {
31451 case VOID_FTYPE_VOID:
31452 emit_insn (GEN_FCN (icode) (target));
31453 return 0;
31454 case VOID_FTYPE_UINT64:
31455 case VOID_FTYPE_UNSIGNED:
31456 nargs = 0;
31457 klass = store;
31458 memory = 0;
31459 break;
31460
31461 case INT_FTYPE_VOID:
31462 case UINT64_FTYPE_VOID:
31463 case UNSIGNED_FTYPE_VOID:
31464 nargs = 0;
31465 klass = load;
31466 memory = 0;
31467 break;
31468 case UINT64_FTYPE_PUNSIGNED:
31469 case V2DI_FTYPE_PV2DI:
31470 case V4DI_FTYPE_PV4DI:
31471 case V32QI_FTYPE_PCCHAR:
31472 case V16QI_FTYPE_PCCHAR:
31473 case V8SF_FTYPE_PCV4SF:
31474 case V8SF_FTYPE_PCFLOAT:
31475 case V4SF_FTYPE_PCFLOAT:
31476 case V4DF_FTYPE_PCV2DF:
31477 case V4DF_FTYPE_PCDOUBLE:
31478 case V2DF_FTYPE_PCDOUBLE:
31479 case VOID_FTYPE_PVOID:
31480 nargs = 1;
31481 klass = load;
31482 memory = 0;
31483 break;
31484 case VOID_FTYPE_PV2SF_V4SF:
31485 case VOID_FTYPE_PV4DI_V4DI:
31486 case VOID_FTYPE_PV2DI_V2DI:
31487 case VOID_FTYPE_PCHAR_V32QI:
31488 case VOID_FTYPE_PCHAR_V16QI:
31489 case VOID_FTYPE_PFLOAT_V8SF:
31490 case VOID_FTYPE_PFLOAT_V4SF:
31491 case VOID_FTYPE_PDOUBLE_V4DF:
31492 case VOID_FTYPE_PDOUBLE_V2DF:
31493 case VOID_FTYPE_PLONGLONG_LONGLONG:
31494 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31495 case VOID_FTYPE_PINT_INT:
31496 nargs = 1;
31497 klass = store;
31498 /* Reserve memory operand for target. */
31499 memory = ARRAY_SIZE (args);
31500 break;
31501 case V4SF_FTYPE_V4SF_PCV2SF:
31502 case V2DF_FTYPE_V2DF_PCDOUBLE:
31503 nargs = 2;
31504 klass = load;
31505 memory = 1;
31506 break;
31507 case V8SF_FTYPE_PCV8SF_V8SI:
31508 case V4DF_FTYPE_PCV4DF_V4DI:
31509 case V4SF_FTYPE_PCV4SF_V4SI:
31510 case V2DF_FTYPE_PCV2DF_V2DI:
31511 case V8SI_FTYPE_PCV8SI_V8SI:
31512 case V4DI_FTYPE_PCV4DI_V4DI:
31513 case V4SI_FTYPE_PCV4SI_V4SI:
31514 case V2DI_FTYPE_PCV2DI_V2DI:
31515 nargs = 2;
31516 klass = load;
31517 memory = 0;
31518 break;
31519 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31520 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31521 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31522 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31523 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31524 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31525 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31526 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31527 nargs = 2;
31528 klass = store;
31529 /* Reserve memory operand for target. */
31530 memory = ARRAY_SIZE (args);
31531 break;
31532 case VOID_FTYPE_UINT_UINT_UINT:
31533 case VOID_FTYPE_UINT64_UINT_UINT:
31534 case UCHAR_FTYPE_UINT_UINT_UINT:
31535 case UCHAR_FTYPE_UINT64_UINT_UINT:
31536 nargs = 3;
31537 klass = load;
31538 memory = ARRAY_SIZE (args);
31539 last_arg_constant = true;
31540 break;
31541 default:
31542 gcc_unreachable ();
31543 }
31544
31545 gcc_assert (nargs <= ARRAY_SIZE (args));
31546
31547 if (klass == store)
31548 {
31549 arg = CALL_EXPR_ARG (exp, 0);
31550 op = expand_normal (arg);
31551 gcc_assert (target == 0);
31552 if (memory)
31553 {
31554 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31555 target = gen_rtx_MEM (tmode, op);
31556 }
31557 else
31558 target = force_reg (tmode, op);
31559 arg_adjust = 1;
31560 }
31561 else
31562 {
31563 arg_adjust = 0;
31564 if (optimize
31565 || target == 0
31566 || !register_operand (target, tmode)
31567 || GET_MODE (target) != tmode)
31568 target = gen_reg_rtx (tmode);
31569 }
31570
31571 for (i = 0; i < nargs; i++)
31572 {
31573 enum machine_mode mode = insn_p->operand[i + 1].mode;
31574 bool match;
31575
31576 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31577 op = expand_normal (arg);
31578 match = insn_p->operand[i + 1].predicate (op, mode);
31579
31580 if (last_arg_constant && (i + 1) == nargs)
31581 {
31582 if (!match)
31583 {
31584 if (icode == CODE_FOR_lwp_lwpvalsi3
31585 || icode == CODE_FOR_lwp_lwpinssi3
31586 || icode == CODE_FOR_lwp_lwpvaldi3
31587 || icode == CODE_FOR_lwp_lwpinsdi3)
31588 error ("the last argument must be a 32-bit immediate");
31589 else
31590 error ("the last argument must be an 8-bit immediate");
31591 return const0_rtx;
31592 }
31593 }
31594 else
31595 {
31596 if (i == memory)
31597 {
31598 /* This must be the memory operand. */
31599 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31600 op = gen_rtx_MEM (mode, op);
31601 gcc_assert (GET_MODE (op) == mode
31602 || GET_MODE (op) == VOIDmode);
31603 }
31604 else
31605 {
31606 /* This must be a register. */
31607 if (VECTOR_MODE_P (mode))
31608 op = safe_vector_operand (op, mode);
31609
31610 gcc_assert (GET_MODE (op) == mode
31611 || GET_MODE (op) == VOIDmode);
31612 op = copy_to_mode_reg (mode, op);
31613 }
31614 }
31615
31616 args[i].op = op;
31617 args[i].mode = mode;
31618 }
31619
31620 switch (nargs)
31621 {
31622 case 0:
31623 pat = GEN_FCN (icode) (target);
31624 break;
31625 case 1:
31626 pat = GEN_FCN (icode) (target, args[0].op);
31627 break;
31628 case 2:
31629 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31630 break;
31631 case 3:
31632 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31633 break;
31634 default:
31635 gcc_unreachable ();
31636 }
31637
31638 if (! pat)
31639 return 0;
31640 emit_insn (pat);
31641 return klass == store ? 0 : target;
31642 }
31643
31644 /* Return the integer constant in ARG. Constrain it to be in the range
31645 of the subparts of VEC_TYPE; issue an error if not. */
31646
31647 static int
31648 get_element_number (tree vec_type, tree arg)
31649 {
31650 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31651
31652 if (!host_integerp (arg, 1)
31653 || (elt = tree_low_cst (arg, 1), elt > max))
31654 {
31655 error ("selector must be an integer constant in the range 0..%wi", max);
31656 return 0;
31657 }
31658
31659 return elt;
31660 }
31661
31662 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31663 ix86_expand_vector_init. We DO have language-level syntax for this, in
31664 the form of (type){ init-list }. Except that since we can't place emms
31665 instructions from inside the compiler, we can't allow the use of MMX
31666 registers unless the user explicitly asks for it. So we do *not* define
31667 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31668 we have builtins invoked by mmintrin.h that give us license to emit
31669 these sorts of instructions. */
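/* For illustration only (a sketch, not part of this file's logic):
   mmintrin.h is expected to reach these builtins with wrappers along
   the lines of

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   and the IX86_BUILTIN_VEC_INIT_V2SI case in ix86_expand_builtin then
   dispatches to ix86_expand_vec_init_builtin below.  */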
31670
31671 static rtx
31672 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31673 {
31674 enum machine_mode tmode = TYPE_MODE (type);
31675 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31676 int i, n_elt = GET_MODE_NUNITS (tmode);
31677 rtvec v = rtvec_alloc (n_elt);
31678
31679 gcc_assert (VECTOR_MODE_P (tmode));
31680 gcc_assert (call_expr_nargs (exp) == n_elt);
31681
31682 for (i = 0; i < n_elt; ++i)
31683 {
31684 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31685 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31686 }
31687
31688 if (!target || !register_operand (target, tmode))
31689 target = gen_reg_rtx (tmode);
31690
31691 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31692 return target;
31693 }
31694
31695 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31696 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31697 had a language-level syntax for referencing vector elements. */
31698
31699 static rtx
31700 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31701 {
31702 enum machine_mode tmode, mode0;
31703 tree arg0, arg1;
31704 int elt;
31705 rtx op0;
31706
31707 arg0 = CALL_EXPR_ARG (exp, 0);
31708 arg1 = CALL_EXPR_ARG (exp, 1);
31709
31710 op0 = expand_normal (arg0);
31711 elt = get_element_number (TREE_TYPE (arg0), arg1);
31712
31713 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31714 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31715 gcc_assert (VECTOR_MODE_P (mode0));
31716
31717 op0 = force_reg (mode0, op0);
31718
31719 if (optimize || !target || !register_operand (target, tmode))
31720 target = gen_reg_rtx (tmode);
31721
31722 ix86_expand_vector_extract (true, target, op0, elt);
31723
31724 return target;
31725 }
31726
31727 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31728 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31729 a language-level syntax for referencing vector elements. */
31730
31731 static rtx
31732 ix86_expand_vec_set_builtin (tree exp)
31733 {
31734 enum machine_mode tmode, mode1;
31735 tree arg0, arg1, arg2;
31736 int elt;
31737 rtx op0, op1, target;
31738
31739 arg0 = CALL_EXPR_ARG (exp, 0);
31740 arg1 = CALL_EXPR_ARG (exp, 1);
31741 arg2 = CALL_EXPR_ARG (exp, 2);
31742
31743 tmode = TYPE_MODE (TREE_TYPE (arg0));
31744 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31745 gcc_assert (VECTOR_MODE_P (tmode));
31746
31747 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31748 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31749 elt = get_element_number (TREE_TYPE (arg0), arg2);
31750
31751 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31752 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31753
31754 op0 = force_reg (tmode, op0);
31755 op1 = force_reg (mode1, op1);
31756
31757 /* OP0 is the source of these builtin functions and shouldn't be
31758 modified. Create a copy, use it and return it as target. */
31759 target = gen_reg_rtx (tmode);
31760 emit_move_insn (target, op0);
31761 ix86_expand_vector_set (true, target, op1, elt);
31762
31763 return target;
31764 }
31765
31766 /* Expand an expression EXP that calls a built-in function,
31767 with result going to TARGET if that's convenient
31768 (and in mode MODE if that's convenient).
31769 SUBTARGET may be used as the target for computing one of EXP's operands.
31770 IGNORE is nonzero if the value is to be ignored. */
31771
31772 static rtx
31773 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31774 enum machine_mode mode ATTRIBUTE_UNUSED,
31775 int ignore ATTRIBUTE_UNUSED)
31776 {
31777 const struct builtin_description *d;
31778 size_t i;
31779 enum insn_code icode;
31780 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31781 tree arg0, arg1, arg2, arg3, arg4;
31782 rtx op0, op1, op2, op3, op4, pat, insn;
31783 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31784 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31785
31786 /* For CPU builtins that can be folded, fold first and expand the fold. */
31787 switch (fcode)
31788 {
31789 case IX86_BUILTIN_CPU_INIT:
31790 {
31791 /* Make it call __cpu_indicator_init in libgcc. */
31792 tree call_expr, fndecl, type;
31793 type = build_function_type_list (integer_type_node, NULL_TREE);
31794 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31795 call_expr = build_call_expr (fndecl, 0);
31796 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31797 }
31798 case IX86_BUILTIN_CPU_IS:
31799 case IX86_BUILTIN_CPU_SUPPORTS:
31800 {
31801 tree arg0 = CALL_EXPR_ARG (exp, 0);
31802 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31803 gcc_assert (fold_expr != NULL_TREE);
31804 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31805 }
31806 }
31807
31808 /* Determine whether the builtin function is available under the current ISA.
31809 Originally the builtin was not created if it wasn't applicable to the
31810 current ISA based on the command line switches. With function specific
31811 options, we need to check in the context of the function making the call
31812 whether it is supported. */
31813 if (ix86_builtins_isa[fcode].isa
31814 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31815 {
31816 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31817 NULL, (enum fpmath_unit) 0, false);
31818
31819 if (!opts)
31820 error ("%qE needs unknown isa option", fndecl);
31821 else
31822 {
31823 gcc_assert (opts != NULL);
31824 error ("%qE needs isa option %s", fndecl, opts);
31825 free (opts);
31826 }
31827 return const0_rtx;
31828 }
31829
31830 switch (fcode)
31831 {
31832 case IX86_BUILTIN_MASKMOVQ:
31833 case IX86_BUILTIN_MASKMOVDQU:
31834 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31835 ? CODE_FOR_mmx_maskmovq
31836 : CODE_FOR_sse2_maskmovdqu);
31837 /* Note the arg order is different from the operand order. */
31838 arg1 = CALL_EXPR_ARG (exp, 0);
31839 arg2 = CALL_EXPR_ARG (exp, 1);
31840 arg0 = CALL_EXPR_ARG (exp, 2);
31841 op0 = expand_normal (arg0);
31842 op1 = expand_normal (arg1);
31843 op2 = expand_normal (arg2);
31844 mode0 = insn_data[icode].operand[0].mode;
31845 mode1 = insn_data[icode].operand[1].mode;
31846 mode2 = insn_data[icode].operand[2].mode;
31847
31848 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31849 op0 = gen_rtx_MEM (mode1, op0);
31850
31851 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31852 op0 = copy_to_mode_reg (mode0, op0);
31853 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31854 op1 = copy_to_mode_reg (mode1, op1);
31855 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31856 op2 = copy_to_mode_reg (mode2, op2);
31857 pat = GEN_FCN (icode) (op0, op1, op2);
31858 if (! pat)
31859 return 0;
31860 emit_insn (pat);
31861 return 0;
31862
31863 case IX86_BUILTIN_LDMXCSR:
31864 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31865 target = assign_386_stack_local (SImode, SLOT_TEMP);
31866 emit_move_insn (target, op0);
31867 emit_insn (gen_sse_ldmxcsr (target));
31868 return 0;
31869
31870 case IX86_BUILTIN_STMXCSR:
31871 target = assign_386_stack_local (SImode, SLOT_TEMP);
31872 emit_insn (gen_sse_stmxcsr (target));
31873 return copy_to_mode_reg (SImode, target);
31874
31875 case IX86_BUILTIN_CLFLUSH:
31876 arg0 = CALL_EXPR_ARG (exp, 0);
31877 op0 = expand_normal (arg0);
31878 icode = CODE_FOR_sse2_clflush;
31879 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31880 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31881
31882 emit_insn (gen_sse2_clflush (op0));
31883 return 0;
31884
31885 case IX86_BUILTIN_MONITOR:
31886 arg0 = CALL_EXPR_ARG (exp, 0);
31887 arg1 = CALL_EXPR_ARG (exp, 1);
31888 arg2 = CALL_EXPR_ARG (exp, 2);
31889 op0 = expand_normal (arg0);
31890 op1 = expand_normal (arg1);
31891 op2 = expand_normal (arg2);
31892 if (!REG_P (op0))
31893 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31894 if (!REG_P (op1))
31895 op1 = copy_to_mode_reg (SImode, op1);
31896 if (!REG_P (op2))
31897 op2 = copy_to_mode_reg (SImode, op2);
31898 emit_insn (ix86_gen_monitor (op0, op1, op2));
31899 return 0;
31900
31901 case IX86_BUILTIN_MWAIT:
31902 arg0 = CALL_EXPR_ARG (exp, 0);
31903 arg1 = CALL_EXPR_ARG (exp, 1);
31904 op0 = expand_normal (arg0);
31905 op1 = expand_normal (arg1);
31906 if (!REG_P (op0))
31907 op0 = copy_to_mode_reg (SImode, op0);
31908 if (!REG_P (op1))
31909 op1 = copy_to_mode_reg (SImode, op1);
31910 emit_insn (gen_sse3_mwait (op0, op1));
31911 return 0;
31912
31913 case IX86_BUILTIN_VEC_INIT_V2SI:
31914 case IX86_BUILTIN_VEC_INIT_V4HI:
31915 case IX86_BUILTIN_VEC_INIT_V8QI:
31916 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31917
31918 case IX86_BUILTIN_VEC_EXT_V2DF:
31919 case IX86_BUILTIN_VEC_EXT_V2DI:
31920 case IX86_BUILTIN_VEC_EXT_V4SF:
31921 case IX86_BUILTIN_VEC_EXT_V4SI:
31922 case IX86_BUILTIN_VEC_EXT_V8HI:
31923 case IX86_BUILTIN_VEC_EXT_V2SI:
31924 case IX86_BUILTIN_VEC_EXT_V4HI:
31925 case IX86_BUILTIN_VEC_EXT_V16QI:
31926 return ix86_expand_vec_ext_builtin (exp, target);
31927
31928 case IX86_BUILTIN_VEC_SET_V2DI:
31929 case IX86_BUILTIN_VEC_SET_V4SF:
31930 case IX86_BUILTIN_VEC_SET_V4SI:
31931 case IX86_BUILTIN_VEC_SET_V8HI:
31932 case IX86_BUILTIN_VEC_SET_V4HI:
31933 case IX86_BUILTIN_VEC_SET_V16QI:
31934 return ix86_expand_vec_set_builtin (exp);
31935
31936 case IX86_BUILTIN_INFQ:
31937 case IX86_BUILTIN_HUGE_VALQ:
31938 {
31939 REAL_VALUE_TYPE inf;
31940 rtx tmp;
31941
31942 real_inf (&inf);
31943 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31944
31945 tmp = validize_mem (force_const_mem (mode, tmp));
31946
31947 if (target == 0)
31948 target = gen_reg_rtx (mode);
31949
31950 emit_move_insn (target, tmp);
31951 return target;
31952 }
31953
31954 case IX86_BUILTIN_RDPMC:
31955 case IX86_BUILTIN_RDTSC:
31956 case IX86_BUILTIN_RDTSCP:
31957
31958 op0 = gen_reg_rtx (DImode);
31959 op1 = gen_reg_rtx (DImode);
31960
31961 if (fcode == IX86_BUILTIN_RDPMC)
31962 {
31963 arg0 = CALL_EXPR_ARG (exp, 0);
31964 op2 = expand_normal (arg0);
31965 if (!register_operand (op2, SImode))
31966 op2 = copy_to_mode_reg (SImode, op2);
31967
31968 insn = (TARGET_64BIT
31969 ? gen_rdpmc_rex64 (op0, op1, op2)
31970 : gen_rdpmc (op0, op2));
31971 emit_insn (insn);
31972 }
31973 else if (fcode == IX86_BUILTIN_RDTSC)
31974 {
31975 insn = (TARGET_64BIT
31976 ? gen_rdtsc_rex64 (op0, op1)
31977 : gen_rdtsc (op0));
31978 emit_insn (insn);
31979 }
31980 else
31981 {
31982 op2 = gen_reg_rtx (SImode);
31983
31984 insn = (TARGET_64BIT
31985 ? gen_rdtscp_rex64 (op0, op1, op2)
31986 : gen_rdtscp (op0, op2));
31987 emit_insn (insn);
31988
31989 arg0 = CALL_EXPR_ARG (exp, 0);
31990 op4 = expand_normal (arg0);
31991 if (!address_operand (op4, VOIDmode))
31992 {
31993 op4 = convert_memory_address (Pmode, op4);
31994 op4 = copy_addr_to_reg (op4);
31995 }
31996 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31997 }
31998
31999 if (target == 0)
32000 target = gen_reg_rtx (mode);
32001
32002 if (TARGET_64BIT)
32003 {
32004 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32005 op1, 1, OPTAB_DIRECT);
32006 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32007 op0, 1, OPTAB_DIRECT);
32008 }
32009
32010 emit_move_insn (target, op0);
32011 return target;
32012
32013 case IX86_BUILTIN_FXSAVE:
32014 case IX86_BUILTIN_FXRSTOR:
32015 case IX86_BUILTIN_FXSAVE64:
32016 case IX86_BUILTIN_FXRSTOR64:
32017 switch (fcode)
32018 {
32019 case IX86_BUILTIN_FXSAVE:
32020 icode = CODE_FOR_fxsave;
32021 break;
32022 case IX86_BUILTIN_FXRSTOR:
32023 icode = CODE_FOR_fxrstor;
32024 break;
32025 case IX86_BUILTIN_FXSAVE64:
32026 icode = CODE_FOR_fxsave64;
32027 break;
32028 case IX86_BUILTIN_FXRSTOR64:
32029 icode = CODE_FOR_fxrstor64;
32030 break;
32031 default:
32032 gcc_unreachable ();
32033 }
32034
32035 arg0 = CALL_EXPR_ARG (exp, 0);
32036 op0 = expand_normal (arg0);
32037
32038 if (!address_operand (op0, VOIDmode))
32039 {
32040 op0 = convert_memory_address (Pmode, op0);
32041 op0 = copy_addr_to_reg (op0);
32042 }
32043 op0 = gen_rtx_MEM (BLKmode, op0);
32044
32045 pat = GEN_FCN (icode) (op0);
32046 if (pat)
32047 emit_insn (pat);
32048 return 0;
32049
32050 case IX86_BUILTIN_XSAVE:
32051 case IX86_BUILTIN_XRSTOR:
32052 case IX86_BUILTIN_XSAVE64:
32053 case IX86_BUILTIN_XRSTOR64:
32054 case IX86_BUILTIN_XSAVEOPT:
32055 case IX86_BUILTIN_XSAVEOPT64:
32056 arg0 = CALL_EXPR_ARG (exp, 0);
32057 arg1 = CALL_EXPR_ARG (exp, 1);
32058 op0 = expand_normal (arg0);
32059 op1 = expand_normal (arg1);
32060
32061 if (!address_operand (op0, VOIDmode))
32062 {
32063 op0 = convert_memory_address (Pmode, op0);
32064 op0 = copy_addr_to_reg (op0);
32065 }
32066 op0 = gen_rtx_MEM (BLKmode, op0);
32067
32068 op1 = force_reg (DImode, op1);
32069
32070 if (TARGET_64BIT)
32071 {
32072 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32073 NULL, 1, OPTAB_DIRECT);
32074 switch (fcode)
32075 {
32076 case IX86_BUILTIN_XSAVE:
32077 icode = CODE_FOR_xsave_rex64;
32078 break;
32079 case IX86_BUILTIN_XRSTOR:
32080 icode = CODE_FOR_xrstor_rex64;
32081 break;
32082 case IX86_BUILTIN_XSAVE64:
32083 icode = CODE_FOR_xsave64;
32084 break;
32085 case IX86_BUILTIN_XRSTOR64:
32086 icode = CODE_FOR_xrstor64;
32087 break;
32088 case IX86_BUILTIN_XSAVEOPT:
32089 icode = CODE_FOR_xsaveopt_rex64;
32090 break;
32091 case IX86_BUILTIN_XSAVEOPT64:
32092 icode = CODE_FOR_xsaveopt64;
32093 break;
32094 default:
32095 gcc_unreachable ();
32096 }
32097
32098 op2 = gen_lowpart (SImode, op2);
32099 op1 = gen_lowpart (SImode, op1);
32100 pat = GEN_FCN (icode) (op0, op1, op2);
32101 }
32102 else
32103 {
32104 switch (fcode)
32105 {
32106 case IX86_BUILTIN_XSAVE:
32107 icode = CODE_FOR_xsave;
32108 break;
32109 case IX86_BUILTIN_XRSTOR:
32110 icode = CODE_FOR_xrstor;
32111 break;
32112 case IX86_BUILTIN_XSAVEOPT:
32113 icode = CODE_FOR_xsaveopt;
32114 break;
32115 default:
32116 gcc_unreachable ();
32117 }
32118 pat = GEN_FCN (icode) (op0, op1);
32119 }
32120
32121 if (pat)
32122 emit_insn (pat);
32123 return 0;
32124
32125 case IX86_BUILTIN_LLWPCB:
32126 arg0 = CALL_EXPR_ARG (exp, 0);
32127 op0 = expand_normal (arg0);
32128 icode = CODE_FOR_lwp_llwpcb;
32129 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32130 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32131 emit_insn (gen_lwp_llwpcb (op0));
32132 return 0;
32133
32134 case IX86_BUILTIN_SLWPCB:
32135 icode = CODE_FOR_lwp_slwpcb;
32136 if (!target
32137 || !insn_data[icode].operand[0].predicate (target, Pmode))
32138 target = gen_reg_rtx (Pmode);
32139 emit_insn (gen_lwp_slwpcb (target));
32140 return target;
32141
32142 case IX86_BUILTIN_BEXTRI32:
32143 case IX86_BUILTIN_BEXTRI64:
32144 arg0 = CALL_EXPR_ARG (exp, 0);
32145 arg1 = CALL_EXPR_ARG (exp, 1);
32146 op0 = expand_normal (arg0);
32147 op1 = expand_normal (arg1);
32148 icode = (fcode == IX86_BUILTIN_BEXTRI32
32149 ? CODE_FOR_tbm_bextri_si
32150 : CODE_FOR_tbm_bextri_di);
32151 if (!CONST_INT_P (op1))
32152 {
32153 error ("the last argument must be an immediate");
32154 return const0_rtx;
32155 }
32156 else
32157 {
32158 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32159 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32160 op1 = GEN_INT (length);
32161 op2 = GEN_INT (lsb_index);
32162 pat = GEN_FCN (icode) (target, op0, op1, op2);
32163 if (pat)
32164 emit_insn (pat);
32165 return target;
32166 }
32167
32168 case IX86_BUILTIN_RDRAND16_STEP:
32169 icode = CODE_FOR_rdrandhi_1;
32170 mode0 = HImode;
32171 goto rdrand_step;
32172
32173 case IX86_BUILTIN_RDRAND32_STEP:
32174 icode = CODE_FOR_rdrandsi_1;
32175 mode0 = SImode;
32176 goto rdrand_step;
32177
32178 case IX86_BUILTIN_RDRAND64_STEP:
32179 icode = CODE_FOR_rdranddi_1;
32180 mode0 = DImode;
32181
32182 rdrand_step:
32183 op0 = gen_reg_rtx (mode0);
32184 emit_insn (GEN_FCN (icode) (op0));
32185
32186 arg0 = CALL_EXPR_ARG (exp, 0);
32187 op1 = expand_normal (arg0);
32188 if (!address_operand (op1, VOIDmode))
32189 {
32190 op1 = convert_memory_address (Pmode, op1);
32191 op1 = copy_addr_to_reg (op1);
32192 }
32193 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32194
32195 op1 = gen_reg_rtx (SImode);
32196 emit_move_insn (op1, CONST1_RTX (SImode));
32197
32198 /* Emit SImode conditional move. */
32199 if (mode0 == HImode)
32200 {
32201 op2 = gen_reg_rtx (SImode);
32202 emit_insn (gen_zero_extendhisi2 (op2, op0));
32203 }
32204 else if (mode0 == SImode)
32205 op2 = op0;
32206 else
32207 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32208
32209 if (target == 0)
32210 target = gen_reg_rtx (SImode);
32211
32212 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32213 const0_rtx);
32214 emit_insn (gen_rtx_SET (VOIDmode, target,
32215 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32216 return target;
32217
32218 case IX86_BUILTIN_RDSEED16_STEP:
32219 icode = CODE_FOR_rdseedhi_1;
32220 mode0 = HImode;
32221 goto rdseed_step;
32222
32223 case IX86_BUILTIN_RDSEED32_STEP:
32224 icode = CODE_FOR_rdseedsi_1;
32225 mode0 = SImode;
32226 goto rdseed_step;
32227
32228 case IX86_BUILTIN_RDSEED64_STEP:
32229 icode = CODE_FOR_rdseeddi_1;
32230 mode0 = DImode;
32231
32232 rdseed_step:
32233 op0 = gen_reg_rtx (mode0);
32234 emit_insn (GEN_FCN (icode) (op0));
32235
32236 arg0 = CALL_EXPR_ARG (exp, 0);
32237 op1 = expand_normal (arg0);
32238 if (!address_operand (op1, VOIDmode))
32239 {
32240 op1 = convert_memory_address (Pmode, op1);
32241 op1 = copy_addr_to_reg (op1);
32242 }
32243 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32244
32245 op2 = gen_reg_rtx (QImode);
32246
32247 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32248 const0_rtx);
32249 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32250
32251 if (target == 0)
32252 target = gen_reg_rtx (SImode);
32253
32254 emit_insn (gen_zero_extendqisi2 (target, op2));
32255 return target;
32256
32257 case IX86_BUILTIN_ADDCARRYX32:
32258 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32259 mode0 = SImode;
32260 goto addcarryx;
32261
32262 case IX86_BUILTIN_ADDCARRYX64:
32263 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32264 mode0 = DImode;
32265
32266 addcarryx:
32267 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32268 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32269 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32270 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32271
32272 op0 = gen_reg_rtx (QImode);
32273
32274 /* Generate CF from input operand. */
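/* Adding -1 (0xff in QImode) to the incoming byte sets the carry flag
   exactly when that byte is nonzero, recreating C_IN in CF.  */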
32275 op1 = expand_normal (arg0);
32276 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32277 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
32278
32279 /* Generate an ADCX (or ADC) instruction to compute X + Y + CF. */
32280 op2 = expand_normal (arg1);
32281 op3 = expand_normal (arg2);
32282
32283 if (!REG_P (op2))
32284 op2 = copy_to_mode_reg (mode0, op2);
32285 if (!REG_P (op3))
32286 op3 = copy_to_mode_reg (mode0, op3);
32287
32288 op0 = gen_reg_rtx (mode0);
32289
32290 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32291 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32292 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32293
32294 /* Store the result. */
32295 op4 = expand_normal (arg3);
32296 if (!address_operand (op4, VOIDmode))
32297 {
32298 op4 = convert_memory_address (Pmode, op4);
32299 op4 = copy_addr_to_reg (op4);
32300 }
32301 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32302
32303 /* Return current CF value. */
32304 if (target == 0)
32305 target = gen_reg_rtx (QImode);
32306
32307 PUT_MODE (pat, QImode);
32308 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32309 return target;
32310
32311 case IX86_BUILTIN_GATHERSIV2DF:
32312 icode = CODE_FOR_avx2_gathersiv2df;
32313 goto gather_gen;
32314 case IX86_BUILTIN_GATHERSIV4DF:
32315 icode = CODE_FOR_avx2_gathersiv4df;
32316 goto gather_gen;
32317 case IX86_BUILTIN_GATHERDIV2DF:
32318 icode = CODE_FOR_avx2_gatherdiv2df;
32319 goto gather_gen;
32320 case IX86_BUILTIN_GATHERDIV4DF:
32321 icode = CODE_FOR_avx2_gatherdiv4df;
32322 goto gather_gen;
32323 case IX86_BUILTIN_GATHERSIV4SF:
32324 icode = CODE_FOR_avx2_gathersiv4sf;
32325 goto gather_gen;
32326 case IX86_BUILTIN_GATHERSIV8SF:
32327 icode = CODE_FOR_avx2_gathersiv8sf;
32328 goto gather_gen;
32329 case IX86_BUILTIN_GATHERDIV4SF:
32330 icode = CODE_FOR_avx2_gatherdiv4sf;
32331 goto gather_gen;
32332 case IX86_BUILTIN_GATHERDIV8SF:
32333 icode = CODE_FOR_avx2_gatherdiv8sf;
32334 goto gather_gen;
32335 case IX86_BUILTIN_GATHERSIV2DI:
32336 icode = CODE_FOR_avx2_gathersiv2di;
32337 goto gather_gen;
32338 case IX86_BUILTIN_GATHERSIV4DI:
32339 icode = CODE_FOR_avx2_gathersiv4di;
32340 goto gather_gen;
32341 case IX86_BUILTIN_GATHERDIV2DI:
32342 icode = CODE_FOR_avx2_gatherdiv2di;
32343 goto gather_gen;
32344 case IX86_BUILTIN_GATHERDIV4DI:
32345 icode = CODE_FOR_avx2_gatherdiv4di;
32346 goto gather_gen;
32347 case IX86_BUILTIN_GATHERSIV4SI:
32348 icode = CODE_FOR_avx2_gathersiv4si;
32349 goto gather_gen;
32350 case IX86_BUILTIN_GATHERSIV8SI:
32351 icode = CODE_FOR_avx2_gathersiv8si;
32352 goto gather_gen;
32353 case IX86_BUILTIN_GATHERDIV4SI:
32354 icode = CODE_FOR_avx2_gatherdiv4si;
32355 goto gather_gen;
32356 case IX86_BUILTIN_GATHERDIV8SI:
32357 icode = CODE_FOR_avx2_gatherdiv8si;
32358 goto gather_gen;
32359 case IX86_BUILTIN_GATHERALTSIV4DF:
32360 icode = CODE_FOR_avx2_gathersiv4df;
32361 goto gather_gen;
32362 case IX86_BUILTIN_GATHERALTDIV8SF:
32363 icode = CODE_FOR_avx2_gatherdiv8sf;
32364 goto gather_gen;
32365 case IX86_BUILTIN_GATHERALTSIV4DI:
32366 icode = CODE_FOR_avx2_gathersiv4di;
32367 goto gather_gen;
32368 case IX86_BUILTIN_GATHERALTDIV8SI:
32369 icode = CODE_FOR_avx2_gatherdiv8si;
32370 goto gather_gen;
32371
32372 gather_gen:
32373 arg0 = CALL_EXPR_ARG (exp, 0);
32374 arg1 = CALL_EXPR_ARG (exp, 1);
32375 arg2 = CALL_EXPR_ARG (exp, 2);
32376 arg3 = CALL_EXPR_ARG (exp, 3);
32377 arg4 = CALL_EXPR_ARG (exp, 4);
32378 op0 = expand_normal (arg0);
32379 op1 = expand_normal (arg1);
32380 op2 = expand_normal (arg2);
32381 op3 = expand_normal (arg3);
32382 op4 = expand_normal (arg4);
32383 /* Note the arg order is different from the operand order. */
32384 mode0 = insn_data[icode].operand[1].mode;
32385 mode2 = insn_data[icode].operand[3].mode;
32386 mode3 = insn_data[icode].operand[4].mode;
32387 mode4 = insn_data[icode].operand[5].mode;
32388
32389 if (target == NULL_RTX
32390 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32391 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32392 else
32393 subtarget = target;
32394
32395 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32396 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32397 {
32398 rtx half = gen_reg_rtx (V4SImode);
32399 if (!nonimmediate_operand (op2, V8SImode))
32400 op2 = copy_to_mode_reg (V8SImode, op2);
32401 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32402 op2 = half;
32403 }
32404 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32405 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32406 {
32407 rtx (*gen) (rtx, rtx);
32408 rtx half = gen_reg_rtx (mode0);
32409 if (mode0 == V4SFmode)
32410 gen = gen_vec_extract_lo_v8sf;
32411 else
32412 gen = gen_vec_extract_lo_v8si;
32413 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32414 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32415 emit_insn (gen (half, op0));
32416 op0 = half;
32417 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32418 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32419 emit_insn (gen (half, op3));
32420 op3 = half;
32421 }
32422
32423 /* Force the memory operand to use only a base register here; we
32424 don't want to do this to the memory operands of other builtin
32425 functions. */
32426 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32427
32428 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32429 op0 = copy_to_mode_reg (mode0, op0);
32430 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32431 op1 = copy_to_mode_reg (Pmode, op1);
32432 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32433 op2 = copy_to_mode_reg (mode2, op2);
32434 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32435 op3 = copy_to_mode_reg (mode3, op3);
32436 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32437 {
32438 error ("the last argument must be scale 1, 2, 4 or 8");
32439 return const0_rtx;
32440 }
32441
32442 /* Optimize. If mask is known to have all high bits set,
32443 replace op0 with pc_rtx to signal that the instruction
32444 overwrites the whole destination and doesn't use its
32445 previous contents. */
32446 if (optimize)
32447 {
32448 if (TREE_CODE (arg3) == VECTOR_CST)
32449 {
32450 unsigned int negative = 0;
32451 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32452 {
32453 tree cst = VECTOR_CST_ELT (arg3, i);
32454 if (TREE_CODE (cst) == INTEGER_CST
32455 && tree_int_cst_sign_bit (cst))
32456 negative++;
32457 else if (TREE_CODE (cst) == REAL_CST
32458 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32459 negative++;
32460 }
32461 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32462 op0 = pc_rtx;
32463 }
32464 else if (TREE_CODE (arg3) == SSA_NAME)
32465 {
32466 /* Recognize also when mask is like:
32467 __v2df src = _mm_setzero_pd ();
32468 __v2df mask = _mm_cmpeq_pd (src, src);
32469 or
32470 __v8sf src = _mm256_setzero_ps ();
32471 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32472 as that is a cheaper way to load all ones into
32473 a register than having to load a constant from
32474 memory. */
32475 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32476 if (is_gimple_call (def_stmt))
32477 {
32478 tree fndecl = gimple_call_fndecl (def_stmt);
32479 if (fndecl
32480 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32481 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32482 {
32483 case IX86_BUILTIN_CMPPD:
32484 case IX86_BUILTIN_CMPPS:
32485 case IX86_BUILTIN_CMPPD256:
32486 case IX86_BUILTIN_CMPPS256:
32487 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32488 break;
32489 /* FALLTHRU */
32490 case IX86_BUILTIN_CMPEQPD:
32491 case IX86_BUILTIN_CMPEQPS:
32492 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32493 && initializer_zerop (gimple_call_arg (def_stmt,
32494 1)))
32495 op0 = pc_rtx;
32496 break;
32497 default:
32498 break;
32499 }
32500 }
32501 }
32502 }
32503
32504 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32505 if (! pat)
32506 return const0_rtx;
32507 emit_insn (pat);
32508
32509 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32510 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32511 {
32512 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32513 ? V4SFmode : V4SImode;
32514 if (target == NULL_RTX)
32515 target = gen_reg_rtx (tmode);
32516 if (tmode == V4SFmode)
32517 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32518 else
32519 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32520 }
32521 else
32522 target = subtarget;
32523
32524 return target;
32525
32526 case IX86_BUILTIN_XABORT:
32527 icode = CODE_FOR_xabort;
32528 arg0 = CALL_EXPR_ARG (exp, 0);
32529 op0 = expand_normal (arg0);
32530 mode0 = insn_data[icode].operand[0].mode;
32531 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32532 {
32533 error ("the argument to xabort must be an 8-bit immediate");
32534 return const0_rtx;
32535 }
32536 emit_insn (gen_xabort (op0));
32537 return 0;
32538
32539 default:
32540 break;
32541 }
32542
32543 for (i = 0, d = bdesc_special_args;
32544 i < ARRAY_SIZE (bdesc_special_args);
32545 i++, d++)
32546 if (d->code == fcode)
32547 return ix86_expand_special_args_builtin (d, exp, target);
32548
32549 for (i = 0, d = bdesc_args;
32550 i < ARRAY_SIZE (bdesc_args);
32551 i++, d++)
32552 if (d->code == fcode)
32553 switch (fcode)
32554 {
32555 case IX86_BUILTIN_FABSQ:
32556 case IX86_BUILTIN_COPYSIGNQ:
32557 if (!TARGET_SSE)
32558 /* Emit a normal call if SSE isn't available. */
32559 return expand_call (exp, target, ignore);
32560 default:
32561 return ix86_expand_args_builtin (d, exp, target);
32562 }
32563
32564 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32565 if (d->code == fcode)
32566 return ix86_expand_sse_comi (d, exp, target);
32567
32568 for (i = 0, d = bdesc_pcmpestr;
32569 i < ARRAY_SIZE (bdesc_pcmpestr);
32570 i++, d++)
32571 if (d->code == fcode)
32572 return ix86_expand_sse_pcmpestr (d, exp, target);
32573
32574 for (i = 0, d = bdesc_pcmpistr;
32575 i < ARRAY_SIZE (bdesc_pcmpistr);
32576 i++, d++)
32577 if (d->code == fcode)
32578 return ix86_expand_sse_pcmpistr (d, exp, target);
32579
32580 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32581 if (d->code == fcode)
32582 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32583 (enum ix86_builtin_func_type)
32584 d->flag, d->comparison);
32585
32586 gcc_unreachable ();
32587 }
32588
32589 /* Returns a function decl for a vectorized version of the builtin function
32590 FNDECL, with result vector type TYPE_OUT and argument vector type TYPE_IN,
32591 or NULL_TREE if it is not available. */
32592
32593 static tree
32594 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32595 tree type_in)
32596 {
32597 enum machine_mode in_mode, out_mode;
32598 int in_n, out_n;
32599 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32600
32601 if (TREE_CODE (type_out) != VECTOR_TYPE
32602 || TREE_CODE (type_in) != VECTOR_TYPE
32603 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32604 return NULL_TREE;
32605
32606 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32607 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32608 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32609 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32610
32611 switch (fn)
32612 {
32613 case BUILT_IN_SQRT:
32614 if (out_mode == DFmode && in_mode == DFmode)
32615 {
32616 if (out_n == 2 && in_n == 2)
32617 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32618 else if (out_n == 4 && in_n == 4)
32619 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32620 }
32621 break;
32622
32623 case BUILT_IN_SQRTF:
32624 if (out_mode == SFmode && in_mode == SFmode)
32625 {
32626 if (out_n == 4 && in_n == 4)
32627 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32628 else if (out_n == 8 && in_n == 8)
32629 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32630 }
32631 break;
32632
32633 case BUILT_IN_IFLOOR:
32634 case BUILT_IN_LFLOOR:
32635 case BUILT_IN_LLFLOOR:
32636 /* The round insn does not trap on denormals. */
32637 if (flag_trapping_math || !TARGET_ROUND)
32638 break;
32639
32640 if (out_mode == SImode && in_mode == DFmode)
32641 {
32642 if (out_n == 4 && in_n == 2)
32643 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32644 else if (out_n == 8 && in_n == 4)
32645 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32646 }
32647 break;
32648
32649 case BUILT_IN_IFLOORF:
32650 case BUILT_IN_LFLOORF:
32651 case BUILT_IN_LLFLOORF:
32652 /* The round insn does not trap on denormals. */
32653 if (flag_trapping_math || !TARGET_ROUND)
32654 break;
32655
32656 if (out_mode == SImode && in_mode == SFmode)
32657 {
32658 if (out_n == 4 && in_n == 4)
32659 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32660 else if (out_n == 8 && in_n == 8)
32661 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32662 }
32663 break;
32664
32665 case BUILT_IN_ICEIL:
32666 case BUILT_IN_LCEIL:
32667 case BUILT_IN_LLCEIL:
32668 /* The round insn does not trap on denormals. */
32669 if (flag_trapping_math || !TARGET_ROUND)
32670 break;
32671
32672 if (out_mode == SImode && in_mode == DFmode)
32673 {
32674 if (out_n == 4 && in_n == 2)
32675 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32676 else if (out_n == 8 && in_n == 4)
32677 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32678 }
32679 break;
32680
32681 case BUILT_IN_ICEILF:
32682 case BUILT_IN_LCEILF:
32683 case BUILT_IN_LLCEILF:
32684 /* The round insn does not trap on denormals. */
32685 if (flag_trapping_math || !TARGET_ROUND)
32686 break;
32687
32688 if (out_mode == SImode && in_mode == SFmode)
32689 {
32690 if (out_n == 4 && in_n == 4)
32691 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32692 else if (out_n == 8 && in_n == 8)
32693 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32694 }
32695 break;
32696
32697 case BUILT_IN_IRINT:
32698 case BUILT_IN_LRINT:
32699 case BUILT_IN_LLRINT:
32700 if (out_mode == SImode && in_mode == DFmode)
32701 {
32702 if (out_n == 4 && in_n == 2)
32703 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32704 else if (out_n == 8 && in_n == 4)
32705 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32706 }
32707 break;
32708
32709 case BUILT_IN_IRINTF:
32710 case BUILT_IN_LRINTF:
32711 case BUILT_IN_LLRINTF:
32712 if (out_mode == SImode && in_mode == SFmode)
32713 {
32714 if (out_n == 4 && in_n == 4)
32715 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32716 else if (out_n == 8 && in_n == 8)
32717 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32718 }
32719 break;
32720
32721 case BUILT_IN_IROUND:
32722 case BUILT_IN_LROUND:
32723 case BUILT_IN_LLROUND:
32724 /* The round insn does not trap on denormals. */
32725 if (flag_trapping_math || !TARGET_ROUND)
32726 break;
32727
32728 if (out_mode == SImode && in_mode == DFmode)
32729 {
32730 if (out_n == 4 && in_n == 2)
32731 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32732 else if (out_n == 8 && in_n == 4)
32733 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32734 }
32735 break;
32736
32737 case BUILT_IN_IROUNDF:
32738 case BUILT_IN_LROUNDF:
32739 case BUILT_IN_LLROUNDF:
32740 /* The round insn does not trap on denormals. */
32741 if (flag_trapping_math || !TARGET_ROUND)
32742 break;
32743
32744 if (out_mode == SImode && in_mode == SFmode)
32745 {
32746 if (out_n == 4 && in_n == 4)
32747 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32748 else if (out_n == 8 && in_n == 8)
32749 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32750 }
32751 break;
32752
32753 case BUILT_IN_COPYSIGN:
32754 if (out_mode == DFmode && in_mode == DFmode)
32755 {
32756 if (out_n == 2 && in_n == 2)
32757 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32758 else if (out_n == 4 && in_n == 4)
32759 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32760 }
32761 break;
32762
32763 case BUILT_IN_COPYSIGNF:
32764 if (out_mode == SFmode && in_mode == SFmode)
32765 {
32766 if (out_n == 4 && in_n == 4)
32767 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32768 else if (out_n == 8 && in_n == 8)
32769 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32770 }
32771 break;
32772
32773 case BUILT_IN_FLOOR:
32774 /* The round insn does not trap on denormals. */
32775 if (flag_trapping_math || !TARGET_ROUND)
32776 break;
32777
32778 if (out_mode == DFmode && in_mode == DFmode)
32779 {
32780 if (out_n == 2 && in_n == 2)
32781 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32782 else if (out_n == 4 && in_n == 4)
32783 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32784 }
32785 break;
32786
32787 case BUILT_IN_FLOORF:
32788 /* The round insn does not trap on denormals. */
32789 if (flag_trapping_math || !TARGET_ROUND)
32790 break;
32791
32792 if (out_mode == SFmode && in_mode == SFmode)
32793 {
32794 if (out_n == 4 && in_n == 4)
32795 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32796 else if (out_n == 8 && in_n == 8)
32797 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32798 }
32799 break;
32800
32801 case BUILT_IN_CEIL:
32802 /* The round insn does not trap on denormals. */
32803 if (flag_trapping_math || !TARGET_ROUND)
32804 break;
32805
32806 if (out_mode == DFmode && in_mode == DFmode)
32807 {
32808 if (out_n == 2 && in_n == 2)
32809 return ix86_builtins[IX86_BUILTIN_CEILPD];
32810 else if (out_n == 4 && in_n == 4)
32811 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32812 }
32813 break;
32814
32815 case BUILT_IN_CEILF:
32816 /* The round insn does not trap on denormals. */
32817 if (flag_trapping_math || !TARGET_ROUND)
32818 break;
32819
32820 if (out_mode == SFmode && in_mode == SFmode)
32821 {
32822 if (out_n == 4 && in_n == 4)
32823 return ix86_builtins[IX86_BUILTIN_CEILPS];
32824 else if (out_n == 8 && in_n == 8)
32825 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32826 }
32827 break;
32828
32829 case BUILT_IN_TRUNC:
32830 /* The round insn does not trap on denormals. */
32831 if (flag_trapping_math || !TARGET_ROUND)
32832 break;
32833
32834 if (out_mode == DFmode && in_mode == DFmode)
32835 {
32836 if (out_n == 2 && in_n == 2)
32837 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32838 else if (out_n == 4 && in_n == 4)
32839 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32840 }
32841 break;
32842
32843 case BUILT_IN_TRUNCF:
32844 /* The round insn does not trap on denormals. */
32845 if (flag_trapping_math || !TARGET_ROUND)
32846 break;
32847
32848 if (out_mode == SFmode && in_mode == SFmode)
32849 {
32850 if (out_n == 4 && in_n == 4)
32851 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32852 else if (out_n == 8 && in_n == 8)
32853 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32854 }
32855 break;
32856
32857 case BUILT_IN_RINT:
32858 /* The round insn does not trap on denormals. */
32859 if (flag_trapping_math || !TARGET_ROUND)
32860 break;
32861
32862 if (out_mode == DFmode && in_mode == DFmode)
32863 {
32864 if (out_n == 2 && in_n == 2)
32865 return ix86_builtins[IX86_BUILTIN_RINTPD];
32866 else if (out_n == 4 && in_n == 4)
32867 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32868 }
32869 break;
32870
32871 case BUILT_IN_RINTF:
32872 /* The round insn does not trap on denormals. */
32873 if (flag_trapping_math || !TARGET_ROUND)
32874 break;
32875
32876 if (out_mode == SFmode && in_mode == SFmode)
32877 {
32878 if (out_n == 4 && in_n == 4)
32879 return ix86_builtins[IX86_BUILTIN_RINTPS];
32880 else if (out_n == 8 && in_n == 8)
32881 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32882 }
32883 break;
32884
32885 case BUILT_IN_ROUND:
32886 /* The round insn does not trap on denormals. */
32887 if (flag_trapping_math || !TARGET_ROUND)
32888 break;
32889
32890 if (out_mode == DFmode && in_mode == DFmode)
32891 {
32892 if (out_n == 2 && in_n == 2)
32893 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32894 else if (out_n == 4 && in_n == 4)
32895 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32896 }
32897 break;
32898
32899 case BUILT_IN_ROUNDF:
32900 /* The round insn does not trap on denormals. */
32901 if (flag_trapping_math || !TARGET_ROUND)
32902 break;
32903
32904 if (out_mode == SFmode && in_mode == SFmode)
32905 {
32906 if (out_n == 4 && in_n == 4)
32907 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32908 else if (out_n == 8 && in_n == 8)
32909 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32910 }
32911 break;
32912
32913 case BUILT_IN_FMA:
32914 if (out_mode == DFmode && in_mode == DFmode)
32915 {
32916 if (out_n == 2 && in_n == 2)
32917 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32918 if (out_n == 4 && in_n == 4)
32919 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32920 }
32921 break;
32922
32923 case BUILT_IN_FMAF:
32924 if (out_mode == SFmode && in_mode == SFmode)
32925 {
32926 if (out_n == 4 && in_n == 4)
32927 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32928 if (out_n == 8 && in_n == 8)
32929 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32930 }
32931 break;
32932
32933 default:
32934 break;
32935 }
32936
32937 /* Dispatch to a handler for a vectorization library. */
32938 if (ix86_veclib_handler)
32939 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32940 type_in);
32941
32942 return NULL_TREE;
32943 }
32944
32945 /* Handler for an SVML-style interface to
32946 a library with vectorized intrinsics. */
32947
32948 static tree
32949 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32950 {
32951 char name[20];
32952 tree fntype, new_fndecl, args;
32953 unsigned arity;
32954 const char *bname;
32955 enum machine_mode el_mode, in_mode;
32956 int n, in_n;
32957
32958 /* The SVML is suitable for unsafe math only. */
32959 if (!flag_unsafe_math_optimizations)
32960 return NULL_TREE;
32961
32962 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32963 n = TYPE_VECTOR_SUBPARTS (type_out);
32964 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32965 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32966 if (el_mode != in_mode
32967 || n != in_n)
32968 return NULL_TREE;
32969
32970 switch (fn)
32971 {
32972 case BUILT_IN_EXP:
32973 case BUILT_IN_LOG:
32974 case BUILT_IN_LOG10:
32975 case BUILT_IN_POW:
32976 case BUILT_IN_TANH:
32977 case BUILT_IN_TAN:
32978 case BUILT_IN_ATAN:
32979 case BUILT_IN_ATAN2:
32980 case BUILT_IN_ATANH:
32981 case BUILT_IN_CBRT:
32982 case BUILT_IN_SINH:
32983 case BUILT_IN_SIN:
32984 case BUILT_IN_ASINH:
32985 case BUILT_IN_ASIN:
32986 case BUILT_IN_COSH:
32987 case BUILT_IN_COS:
32988 case BUILT_IN_ACOSH:
32989 case BUILT_IN_ACOS:
32990 if (el_mode != DFmode || n != 2)
32991 return NULL_TREE;
32992 break;
32993
32994 case BUILT_IN_EXPF:
32995 case BUILT_IN_LOGF:
32996 case BUILT_IN_LOG10F:
32997 case BUILT_IN_POWF:
32998 case BUILT_IN_TANHF:
32999 case BUILT_IN_TANF:
33000 case BUILT_IN_ATANF:
33001 case BUILT_IN_ATAN2F:
33002 case BUILT_IN_ATANHF:
33003 case BUILT_IN_CBRTF:
33004 case BUILT_IN_SINHF:
33005 case BUILT_IN_SINF:
33006 case BUILT_IN_ASINHF:
33007 case BUILT_IN_ASINF:
33008 case BUILT_IN_COSHF:
33009 case BUILT_IN_COSF:
33010 case BUILT_IN_ACOSHF:
33011 case BUILT_IN_ACOSF:
33012 if (el_mode != SFmode || n != 4)
33013 return NULL_TREE;
33014 break;
33015
33016 default:
33017 return NULL_TREE;
33018 }
33019
33020 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33021
33022 if (fn == BUILT_IN_LOGF)
33023 strcpy (name, "vmlsLn4");
33024 else if (fn == BUILT_IN_LOG)
33025 strcpy (name, "vmldLn2");
33026 else if (n == 4)
33027 {
33028 sprintf (name, "vmls%s", bname+10);
33029 name[strlen (name)-1] = '4';
33030 }
33031 else
33032 sprintf (name, "vmld%s2", bname+10);
33033
33034 /* Convert to uppercase. */
33035 name[4] &= ~0x20;
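/* For example, BUILT_IN_SINF ("__builtin_sinf") becomes "vmlsSin4" and
BUILT_IN_SIN ("__builtin_sin") becomes "vmldSin2", while BUILT_IN_LOGF and
BUILT_IN_LOG are special-cased above as "vmlsLn4" and "vmldLn2". */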
33036
33037 arity = 0;
33038 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33039 args;
33040 args = TREE_CHAIN (args))
33041 arity++;
33042
33043 if (arity == 1)
33044 fntype = build_function_type_list (type_out, type_in, NULL);
33045 else
33046 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33047
33048 /* Build a function declaration for the vectorized function. */
33049 new_fndecl = build_decl (BUILTINS_LOCATION,
33050 FUNCTION_DECL, get_identifier (name), fntype);
33051 TREE_PUBLIC (new_fndecl) = 1;
33052 DECL_EXTERNAL (new_fndecl) = 1;
33053 DECL_IS_NOVOPS (new_fndecl) = 1;
33054 TREE_READONLY (new_fndecl) = 1;
33055
33056 return new_fndecl;
33057 }
33058
33059 /* Handler for an ACML-style interface to
33060 a library with vectorized intrinsics. */
33061
33062 static tree
33063 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33064 {
33065 char name[20] = "__vr.._";
33066 tree fntype, new_fndecl, args;
33067 unsigned arity;
33068 const char *bname;
33069 enum machine_mode el_mode, in_mode;
33070 int n, in_n;
33071
33072 /* The ACML is 64-bit only and suitable for unsafe math only, as
33073 it does not correctly support parts of IEEE with the required
33074 precision such as denormals. */
33075 if (!TARGET_64BIT
33076 || !flag_unsafe_math_optimizations)
33077 return NULL_TREE;
33078
33079 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33080 n = TYPE_VECTOR_SUBPARTS (type_out);
33081 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33082 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33083 if (el_mode != in_mode
33084 || n != in_n)
33085 return NULL_TREE;
33086
33087 switch (fn)
33088 {
33089 case BUILT_IN_SIN:
33090 case BUILT_IN_COS:
33091 case BUILT_IN_EXP:
33092 case BUILT_IN_LOG:
33093 case BUILT_IN_LOG2:
33094 case BUILT_IN_LOG10:
33095 name[4] = 'd';
33096 name[5] = '2';
33097 if (el_mode != DFmode
33098 || n != 2)
33099 return NULL_TREE;
33100 break;
33101
33102 case BUILT_IN_SINF:
33103 case BUILT_IN_COSF:
33104 case BUILT_IN_EXPF:
33105 case BUILT_IN_POWF:
33106 case BUILT_IN_LOGF:
33107 case BUILT_IN_LOG2F:
33108 case BUILT_IN_LOG10F:
33109 name[4] = 's';
33110 name[5] = '4';
33111 if (el_mode != SFmode
33112 || n != 4)
33113 return NULL_TREE;
33114 break;
33115
33116 default:
33117 return NULL_TREE;
33118 }
33119
33120 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33121 sprintf (name + 7, "%s", bname+10);
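/* For example, BUILT_IN_SIN ("__builtin_sin") yields "__vrd2_sin" and
BUILT_IN_LOG10F ("__builtin_log10f") yields "__vrs4_log10f". */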
33122
33123 arity = 0;
33124 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33125 args;
33126 args = TREE_CHAIN (args))
33127 arity++;
33128
33129 if (arity == 1)
33130 fntype = build_function_type_list (type_out, type_in, NULL);
33131 else
33132 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33133
33134 /* Build a function declaration for the vectorized function. */
33135 new_fndecl = build_decl (BUILTINS_LOCATION,
33136 FUNCTION_DECL, get_identifier (name), fntype);
33137 TREE_PUBLIC (new_fndecl) = 1;
33138 DECL_EXTERNAL (new_fndecl) = 1;
33139 DECL_IS_NOVOPS (new_fndecl) = 1;
33140 TREE_READONLY (new_fndecl) = 1;
33141
33142 return new_fndecl;
33143 }
33144
33145 /* Returns a decl of a function that implements a gather load with
33146 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale factor
33147 SCALE. Returns NULL_TREE if it is not available. */
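/* For example, gathering V2DF elements with a 32-bit index type and a valid
scale selects IX86_BUILTIN_GATHERSIV2DF in the switch below. */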
33148
33149 static tree
33150 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33151 const_tree index_type, int scale)
33152 {
33153 bool si;
33154 enum ix86_builtins code;
33155
33156 if (! TARGET_AVX2)
33157 return NULL_TREE;
33158
33159 if ((TREE_CODE (index_type) != INTEGER_TYPE
33160 && !POINTER_TYPE_P (index_type))
33161 || (TYPE_MODE (index_type) != SImode
33162 && TYPE_MODE (index_type) != DImode))
33163 return NULL_TREE;
33164
33165 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33166 return NULL_TREE;
33167
33168 /* v*gather* insn sign extends index to pointer mode. */
33169 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33170 && TYPE_UNSIGNED (index_type))
33171 return NULL_TREE;
33172
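/* The gather instructions only accept scale factors of 1, 2, 4 or 8, so
reject anything non-positive, larger than 8, or not a power of two. */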
33173 if (scale <= 0
33174 || scale > 8
33175 || (scale & (scale - 1)) != 0)
33176 return NULL_TREE;
33177
33178 si = TYPE_MODE (index_type) == SImode;
33179 switch (TYPE_MODE (mem_vectype))
33180 {
33181 case V2DFmode:
33182 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33183 break;
33184 case V4DFmode:
33185 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33186 break;
33187 case V2DImode:
33188 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33189 break;
33190 case V4DImode:
33191 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33192 break;
33193 case V4SFmode:
33194 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33195 break;
33196 case V8SFmode:
33197 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33198 break;
33199 case V4SImode:
33200 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33201 break;
33202 case V8SImode:
33203 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33204 break;
33205 default:
33206 return NULL_TREE;
33207 }
33208
33209 return ix86_builtins[code];
33210 }
33211
33212 /* Returns a decl of a target-specific builtin that implements the
33213 reciprocal of the function FN, or NULL_TREE if not available. */
33214
33215 static tree
33216 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33217 bool sqrt ATTRIBUTE_UNUSED)
33218 {
33219 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33220 && flag_finite_math_only && !flag_trapping_math
33221 && flag_unsafe_math_optimizations))
33222 return NULL_TREE;
33223
33224 if (md_fn)
33225 /* Machine dependent builtins. */
33226 switch (fn)
33227 {
33228 /* Vectorized version of sqrt to rsqrt conversion. */
33229 case IX86_BUILTIN_SQRTPS_NR:
33230 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33231
33232 case IX86_BUILTIN_SQRTPS_NR256:
33233 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33234
33235 default:
33236 return NULL_TREE;
33237 }
33238 else
33239 /* Normal builtins. */
33240 switch (fn)
33241 {
33242 /* Sqrt to rsqrt conversion. */
33243 case BUILT_IN_SQRTF:
33244 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33245
33246 default:
33247 return NULL_TREE;
33248 }
33249 }
33250 \f
33251 /* Helper for avx_vpermilps256_operand et al. This is also used by
33252 the expansion functions to turn the parallel back into a mask.
33253 The return value is 0 for no match and the imm8+1 for a match. */
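/* For example, in V4SFmode each element uses two bits of the immediate, so
the identity parallel (0 1 2 3) encodes as 0xe4 and this function returns
0xe4 + 1. */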
33254
33255 int
33256 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33257 {
33258 unsigned i, nelt = GET_MODE_NUNITS (mode);
33259 unsigned mask = 0;
33260 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33261
33262 if (XVECLEN (par, 0) != (int) nelt)
33263 return 0;
33264
33265 /* Validate that all of the elements are constants, and not totally
33266 out of range. Copy the data into an integral array to make the
33267 subsequent checks easier. */
33268 for (i = 0; i < nelt; ++i)
33269 {
33270 rtx er = XVECEXP (par, 0, i);
33271 unsigned HOST_WIDE_INT ei;
33272
33273 if (!CONST_INT_P (er))
33274 return 0;
33275 ei = INTVAL (er);
33276 if (ei >= nelt)
33277 return 0;
33278 ipar[i] = ei;
33279 }
33280
33281 switch (mode)
33282 {
33283 case V4DFmode:
33284 /* In the 256-bit DFmode case, we can only move elements within
33285 a 128-bit lane. */
33286 for (i = 0; i < 2; ++i)
33287 {
33288 if (ipar[i] >= 2)
33289 return 0;
33290 mask |= ipar[i] << i;
33291 }
33292 for (i = 2; i < 4; ++i)
33293 {
33294 if (ipar[i] < 2)
33295 return 0;
33296 mask |= (ipar[i] - 2) << i;
33297 }
33298 break;
33299
33300 case V8SFmode:
33301 /* In the 256-bit SFmode case, we have full freedom of movement
33302 within the low 128-bit lane, but the high 128-bit lane must
33303 mirror the exact same pattern. */
33304 for (i = 0; i < 4; ++i)
33305 if (ipar[i] + 4 != ipar[i + 4])
33306 return 0;
33307 nelt = 4;
33308 /* FALLTHRU */
33309
33310 case V2DFmode:
33311 case V4SFmode:
33312 /* In the 128-bit case, we've full freedom in the placement of
33313 the elements from the source operand. */
33314 for (i = 0; i < nelt; ++i)
33315 mask |= ipar[i] << (i * (nelt / 2));
33316 break;
33317
33318 default:
33319 gcc_unreachable ();
33320 }
33321
33322 /* Make sure success has a non-zero value by adding one. */
33323 return mask + 1;
33324 }
33325
33326 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33327 the expansion functions to turn the parallel back into a mask.
33328 The return value is 0 for no match and the imm8+1 for a match. */
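/* Each selector occupies one nibble of the immediate and picks one of the
four 128-bit source halves. For example, in V8SFmode the parallel
(8 9 10 11 0 1 2 3) encodes as 0x02 and this function returns 0x02 + 1. */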
33329
33330 int
33331 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33332 {
33333 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33334 unsigned mask = 0;
33335 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33336
33337 if (XVECLEN (par, 0) != (int) nelt)
33338 return 0;
33339
33340 /* Validate that all of the elements are constants, and not totally
33341 out of range. Copy the data into an integral array to make the
33342 subsequent checks easier. */
33343 for (i = 0; i < nelt; ++i)
33344 {
33345 rtx er = XVECEXP (par, 0, i);
33346 unsigned HOST_WIDE_INT ei;
33347
33348 if (!CONST_INT_P (er))
33349 return 0;
33350 ei = INTVAL (er);
33351 if (ei >= 2 * nelt)
33352 return 0;
33353 ipar[i] = ei;
33354 }
33355
33356 /* Validate that the halves of the permute are halves. */
33357 for (i = 0; i < nelt2 - 1; ++i)
33358 if (ipar[i] + 1 != ipar[i + 1])
33359 return 0;
33360 for (i = nelt2; i < nelt - 1; ++i)
33361 if (ipar[i] + 1 != ipar[i + 1])
33362 return 0;
33363
33364 /* Reconstruct the mask. */
33365 for (i = 0; i < 2; ++i)
33366 {
33367 unsigned e = ipar[i * nelt2];
33368 if (e % nelt2)
33369 return 0;
33370 e /= nelt2;
33371 mask |= e << (i * 4);
33372 }
33373
33374 /* Make sure success has a non-zero value by adding one. */
33375 return mask + 1;
33376 }
33377 \f
33378 /* Store OPERAND to the memory after reload is completed. This means
33379 that we can't easily use assign_stack_local. */
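/* The value is written either into the red zone below the stack pointer or
pushed with a pre-decrement store; a MEM referring to that location is
returned. */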
33380 rtx
33381 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33382 {
33383 rtx result;
33384
33385 gcc_assert (reload_completed);
33386 if (ix86_using_red_zone ())
33387 {
33388 result = gen_rtx_MEM (mode,
33389 gen_rtx_PLUS (Pmode,
33390 stack_pointer_rtx,
33391 GEN_INT (-RED_ZONE_SIZE)));
33392 emit_move_insn (result, operand);
33393 }
33394 else if (TARGET_64BIT)
33395 {
33396 switch (mode)
33397 {
33398 case HImode:
33399 case SImode:
33400 operand = gen_lowpart (DImode, operand);
33401 /* FALLTHRU */
33402 case DImode:
33403 emit_insn (
33404 gen_rtx_SET (VOIDmode,
33405 gen_rtx_MEM (DImode,
33406 gen_rtx_PRE_DEC (DImode,
33407 stack_pointer_rtx)),
33408 operand));
33409 break;
33410 default:
33411 gcc_unreachable ();
33412 }
33413 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33414 }
33415 else
33416 {
33417 switch (mode)
33418 {
33419 case DImode:
33420 {
33421 rtx operands[2];
33422 split_double_mode (mode, &operand, 1, operands, operands + 1);
33423 emit_insn (
33424 gen_rtx_SET (VOIDmode,
33425 gen_rtx_MEM (SImode,
33426 gen_rtx_PRE_DEC (Pmode,
33427 stack_pointer_rtx)),
33428 operands[1]));
33429 emit_insn (
33430 gen_rtx_SET (VOIDmode,
33431 gen_rtx_MEM (SImode,
33432 gen_rtx_PRE_DEC (Pmode,
33433 stack_pointer_rtx)),
33434 operands[0]));
33435 }
33436 break;
33437 case HImode:
33438 /* Store HImodes as SImodes. */
33439 operand = gen_lowpart (SImode, operand);
33440 /* FALLTHRU */
33441 case SImode:
33442 emit_insn (
33443 gen_rtx_SET (VOIDmode,
33444 gen_rtx_MEM (GET_MODE (operand),
33445 gen_rtx_PRE_DEC (SImode,
33446 stack_pointer_rtx)),
33447 operand));
33448 break;
33449 default:
33450 gcc_unreachable ();
33451 }
33452 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33453 }
33454 return result;
33455 }
33456
33457 /* Free operand from the memory. */
33458 void
33459 ix86_free_from_memory (enum machine_mode mode)
33460 {
33461 if (!ix86_using_red_zone ())
33462 {
33463 int size;
33464
33465 if (mode == DImode || TARGET_64BIT)
33466 size = 8;
33467 else
33468 size = 4;
33469 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33470 to pop or add instruction if registers are available. */
33471 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33472 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33473 GEN_INT (size))));
33474 }
33475 }
33476
33477 /* Return a register priority for hard reg REGNO. */
33478 static int
33479 ix86_register_priority (int hard_regno)
33480 {
33481 /* ebp and r13 as the base always want a displacement, and r12 as the
33482 base always wants an index. So discourage their usage in an
33483 address. */
33484 if (hard_regno == R12_REG || hard_regno == R13_REG)
33485 return 0;
33486 if (hard_regno == BP_REG)
33487 return 1;
33488 /* New x86-64 int registers result in bigger code size. Discourage
33489 them. */
33490 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33491 return 2;
33492 /* New x86-64 SSE registers result in bigger code size. Discourage
33493 them. */
33494 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33495 return 2;
33496 /* Usage of AX register results in smaller code. Prefer it. */
33497 if (hard_regno == 0)
33498 return 4;
33499 return 3;
33500 }
33501
33502 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33503
33504 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33505 QImode must go into class Q_REGS.
33506 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33507 movdf to do mem-to-mem moves through integer regs. */
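/* For example, when TARGET_80387 is set, an FP CONST_DOUBLE recognized by
standard_80387_constant_p is kept in a float-only class, while other FP
constants are rejected (NO_REGS) and end up in memory. */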
33508
33509 static reg_class_t
33510 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33511 {
33512 enum machine_mode mode = GET_MODE (x);
33513
33514 /* We're only allowed to return a subclass of CLASS. Many of the
33515 following checks fail for NO_REGS, so eliminate that early. */
33516 if (regclass == NO_REGS)
33517 return NO_REGS;
33518
33519 /* All classes can load zeros. */
33520 if (x == CONST0_RTX (mode))
33521 return regclass;
33522
33523 /* Force constants into memory if we are loading a (nonzero) constant into
33524 an MMX or SSE register. This is because there are no MMX/SSE instructions
33525 to load from a constant. */
33526 if (CONSTANT_P (x)
33527 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33528 return NO_REGS;
33529
33530 /* Prefer SSE regs only, if we can use them for math. */
33531 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33532 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33533
33534 /* Floating-point constants need more complex checks. */
33535 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33536 {
33537 /* General regs can load everything. */
33538 if (reg_class_subset_p (regclass, GENERAL_REGS))
33539 return regclass;
33540
33541 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33542 zero above. We only want to wind up preferring 80387 registers if
33543 we plan on doing computation with them. */
33544 if (TARGET_80387
33545 && standard_80387_constant_p (x) > 0)
33546 {
33547 /* Limit class to non-sse. */
33548 if (regclass == FLOAT_SSE_REGS)
33549 return FLOAT_REGS;
33550 if (regclass == FP_TOP_SSE_REGS)
33551 return FP_TOP_REG;
33552 if (regclass == FP_SECOND_SSE_REGS)
33553 return FP_SECOND_REG;
33554 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33555 return regclass;
33556 }
33557
33558 return NO_REGS;
33559 }
33560
33561 /* Generally when we see PLUS here, it's the function invariant
33562 (plus soft-fp const_int). Which can only be computed into general
33563 regs. */
33564 if (GET_CODE (x) == PLUS)
33565 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33566
33567 /* QImode constants are easy to load, but non-constant QImode data
33568 must go into Q_REGS. */
33569 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33570 {
33571 if (reg_class_subset_p (regclass, Q_REGS))
33572 return regclass;
33573 if (reg_class_subset_p (Q_REGS, regclass))
33574 return Q_REGS;
33575 return NO_REGS;
33576 }
33577
33578 return regclass;
33579 }
33580
33581 /* Discourage putting floating-point values in SSE registers unless
33582 SSE math is being used, and likewise for the 387 registers. */
33583 static reg_class_t
33584 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33585 {
33586 enum machine_mode mode = GET_MODE (x);
33587
33588 /* Restrict the output reload class to the register bank that we are doing
33589 math on. If we would like not to return a subset of CLASS, reject this
33590 alternative: if reload cannot do this, it will still use its choice. */
33591 mode = GET_MODE (x);
33592 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33593 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33594
33595 if (X87_FLOAT_MODE_P (mode))
33596 {
33597 if (regclass == FP_TOP_SSE_REGS)
33598 return FP_TOP_REG;
33599 else if (regclass == FP_SECOND_SSE_REGS)
33600 return FP_SECOND_REG;
33601 else
33602 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33603 }
33604
33605 return regclass;
33606 }
33607
33608 static reg_class_t
33609 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33610 enum machine_mode mode, secondary_reload_info *sri)
33611 {
33612 /* Double-word spills from general registers to non-offsettable memory
33613 references (zero-extended addresses) require special handling. */
33614 if (TARGET_64BIT
33615 && MEM_P (x)
33616 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33617 && rclass == GENERAL_REGS
33618 && !offsettable_memref_p (x))
33619 {
33620 sri->icode = (in_p
33621 ? CODE_FOR_reload_noff_load
33622 : CODE_FOR_reload_noff_store);
33623 /* Add the cost of moving address to a temporary. */
33624 sri->extra_cost = 1;
33625
33626 return NO_REGS;
33627 }
33628
33629 /* QImode spills from non-QI registers require
33630 intermediate register on 32bit targets. */
33631 if (!TARGET_64BIT
33632 && !in_p && mode == QImode
33633 && (rclass == GENERAL_REGS
33634 || rclass == LEGACY_REGS
33635 || rclass == NON_Q_REGS
33636 || rclass == SIREG
33637 || rclass == DIREG
33638 || rclass == INDEX_REGS))
33639 {
33640 int regno;
33641
33642 if (REG_P (x))
33643 regno = REGNO (x);
33644 else
33645 regno = -1;
33646
33647 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33648 regno = true_regnum (x);
33649
33650 /* Return Q_REGS if the operand is in memory. */
33651 if (regno == -1)
33652 return Q_REGS;
33653 }
33654
33655 /* This condition handles corner case where an expression involving
33656 pointers gets vectorized. We're trying to use the address of a
33657 stack slot as a vector initializer.
33658
33659 (set (reg:V2DI 74 [ vect_cst_.2 ])
33660 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33661
33662 Eventually frame gets turned into sp+offset like this:
33663
33664 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33665 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33666 (const_int 392 [0x188]))))
33667
33668 That later gets turned into:
33669
33670 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33671 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33672 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33673
33674 We'll have the following reload recorded:
33675
33676 Reload 0: reload_in (DI) =
33677 (plus:DI (reg/f:DI 7 sp)
33678 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33679 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33680 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33681 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33682 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33683 reload_reg_rtx: (reg:V2DI 22 xmm1)
33684
33685 Which isn't going to work since SSE instructions can't handle scalar
33686 additions. Returning GENERAL_REGS forces the addition into integer
33687 register and reload can handle subsequent reloads without problems. */
33688
33689 if (in_p && GET_CODE (x) == PLUS
33690 && SSE_CLASS_P (rclass)
33691 && SCALAR_INT_MODE_P (mode))
33692 return GENERAL_REGS;
33693
33694 return NO_REGS;
33695 }
33696
33697 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33698
33699 static bool
33700 ix86_class_likely_spilled_p (reg_class_t rclass)
33701 {
33702 switch (rclass)
33703 {
33704 case AREG:
33705 case DREG:
33706 case CREG:
33707 case BREG:
33708 case AD_REGS:
33709 case SIREG:
33710 case DIREG:
33711 case SSE_FIRST_REG:
33712 case FP_TOP_REG:
33713 case FP_SECOND_REG:
33714 return true;
33715
33716 default:
33717 break;
33718 }
33719
33720 return false;
33721 }
33722
33723 /* If we are copying between general and FP registers, we need a memory
33724 location. The same is true for SSE and MMX registers.
33725
33726 To optimize register_move_cost performance, allow inline variant.
33727
33728 The macro can't work reliably when one of the CLASSES is class containing
33729 registers from multiple units (SSE, MMX, integer). We avoid this by never
33730 combining those units in single alternative in the machine description.
33731 Ensure that this constraint holds to avoid unexpected surprises.
33732
33733 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33734 enforce these sanity checks. */
33735
33736 static inline bool
33737 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33738 enum machine_mode mode, int strict)
33739 {
33740 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33741 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33742 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33743 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33744 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33745 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33746 {
33747 gcc_assert (!strict || lra_in_progress);
33748 return true;
33749 }
33750
33751 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33752 return true;
33753
33754 /* ??? This is a lie. We do have moves between mmx/general and between
33755 mmx/sse2. But by saying we need secondary memory we discourage the
33756 register allocator from using the mmx registers unless needed. */
33757 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33758 return true;
33759
33760 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33761 {
33762 /* SSE1 doesn't have any direct moves from other classes. */
33763 if (!TARGET_SSE2)
33764 return true;
33765
33766 /* If the target says that inter-unit moves are more expensive
33767 than moving through memory, then don't generate them. */
33768 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
33769 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
33770 return true;
33771
33772 /* Between SSE and general, we have moves no larger than word size. */
33773 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33774 return true;
33775 }
33776
33777 return false;
33778 }
33779
33780 bool
33781 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33782 enum machine_mode mode, int strict)
33783 {
33784 return inline_secondary_memory_needed (class1, class2, mode, strict);
33785 }
33786
33787 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33788
33789 On the 80386, this is the size of MODE in words,
33790 except in the FP regs, where a single reg is always enough. */
33791
33792 static unsigned char
33793 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33794 {
33795 if (MAYBE_INTEGER_CLASS_P (rclass))
33796 {
33797 if (mode == XFmode)
33798 return (TARGET_64BIT ? 2 : 3);
33799 else if (mode == XCmode)
33800 return (TARGET_64BIT ? 4 : 6);
33801 else
33802 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33803 }
33804 else
33805 {
33806 if (COMPLEX_MODE_P (mode))
33807 return 2;
33808 else
33809 return 1;
33810 }
33811 }
33812
33813 /* Return true if the registers in CLASS cannot represent the change from
33814 modes FROM to TO. */
33815
33816 bool
33817 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33818 enum reg_class regclass)
33819 {
33820 if (from == to)
33821 return false;
33822
33823 /* x87 registers can't do subreg at all, as all values are reformatted
33824 to extended precision. */
33825 if (MAYBE_FLOAT_CLASS_P (regclass))
33826 return true;
33827
33828 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33829 {
33830 /* Vector registers do not support QI or HImode loads. If we don't
33831 disallow a change to these modes, reload will assume it's ok to
33832 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33833 the vec_dupv4hi pattern. */
33834 if (GET_MODE_SIZE (from) < 4)
33835 return true;
33836
33837 /* Vector registers do not support subreg with nonzero offsets, which
33838 are otherwise valid for integer registers. Since we can't see
33839 whether we have a nonzero offset from here, prohibit all
33840 nonparadoxical subregs changing size. */
33841 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33842 return true;
33843 }
33844
33845 return false;
33846 }
33847
33848 /* Return the cost of moving data of mode M between a
33849 register and memory. A value of 2 is the default; this cost is
33850 relative to those in `REGISTER_MOVE_COST'.
33851
33852 This function is used extensively by register_move_cost that is used to
33853 build tables at startup. Make it inline in this case.
33854 When IN is 2, return maximum of in and out move cost.
33855
33856 If moving between registers and memory is more expensive than
33857 between two registers, you should define this macro to express the
33858 relative cost.
33859
33860 Also model the increased cost of moving QImode registers in non
33861 Q_REGS classes.
33862 */
33863 static inline int
33864 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33865 int in)
33866 {
33867 int cost;
33868 if (FLOAT_CLASS_P (regclass))
33869 {
33870 int index;
33871 switch (mode)
33872 {
33873 case SFmode:
33874 index = 0;
33875 break;
33876 case DFmode:
33877 index = 1;
33878 break;
33879 case XFmode:
33880 index = 2;
33881 break;
33882 default:
33883 return 100;
33884 }
33885 if (in == 2)
33886 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33887 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33888 }
33889 if (SSE_CLASS_P (regclass))
33890 {
33891 int index;
33892 switch (GET_MODE_SIZE (mode))
33893 {
33894 case 4:
33895 index = 0;
33896 break;
33897 case 8:
33898 index = 1;
33899 break;
33900 case 16:
33901 index = 2;
33902 break;
33903 default:
33904 return 100;
33905 }
33906 if (in == 2)
33907 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33908 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33909 }
33910 if (MMX_CLASS_P (regclass))
33911 {
33912 int index;
33913 switch (GET_MODE_SIZE (mode))
33914 {
33915 case 4:
33916 index = 0;
33917 break;
33918 case 8:
33919 index = 1;
33920 break;
33921 default:
33922 return 100;
33923 }
33924 if (in == 2)
33925 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33926 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33927 }
33928 switch (GET_MODE_SIZE (mode))
33929 {
33930 case 1:
33931 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33932 {
33933 if (!in)
33934 return ix86_cost->int_store[0];
33935 if (TARGET_PARTIAL_REG_DEPENDENCY
33936 && optimize_function_for_speed_p (cfun))
33937 cost = ix86_cost->movzbl_load;
33938 else
33939 cost = ix86_cost->int_load[0];
33940 if (in == 2)
33941 return MAX (cost, ix86_cost->int_store[0]);
33942 return cost;
33943 }
33944 else
33945 {
33946 if (in == 2)
33947 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33948 if (in)
33949 return ix86_cost->movzbl_load;
33950 else
33951 return ix86_cost->int_store[0] + 4;
33952 }
33953 break;
33954 case 2:
33955 if (in == 2)
33956 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33957 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33958 default:
33959 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33960 if (mode == TFmode)
33961 mode = XFmode;
33962 if (in == 2)
33963 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33964 else if (in)
33965 cost = ix86_cost->int_load[2];
33966 else
33967 cost = ix86_cost->int_store[2];
33968 return (cost * (((int) GET_MODE_SIZE (mode)
33969 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33970 }
33971 }
33972
33973 static int
33974 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33975 bool in)
33976 {
33977 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33978 }
33979
33980
33981 /* Return the cost of moving data from a register in class CLASS1 to
33982 one in class CLASS2.
33983
33984 It is not required that the cost always equal 2 when FROM is the same as TO;
33985 on some machines it is expensive to move between registers if they are not
33986 general registers. */
33987
33988 static int
33989 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33990 reg_class_t class2_i)
33991 {
33992 enum reg_class class1 = (enum reg_class) class1_i;
33993 enum reg_class class2 = (enum reg_class) class2_i;
33994
33995 /* In case we require secondary memory, compute cost of the store followed
33996 by load. In order to avoid bad register allocation choices, we need
33997 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33998
33999 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34000 {
34001 int cost = 1;
34002
34003 cost += inline_memory_move_cost (mode, class1, 2);
34004 cost += inline_memory_move_cost (mode, class2, 2);
34005
34006 /* In case of copying from general_purpose_register we may emit multiple
34007 stores followed by single load causing memory size mismatch stall.
34008 Count this as arbitrarily high cost of 20. */
34009 if (targetm.class_max_nregs (class1, mode)
34010 > targetm.class_max_nregs (class2, mode))
34011 cost += 20;
34012
34013 /* In the case of FP/MMX moves, the registers actually overlap, and we
34014 have to switch modes in order to treat them differently. */
34015 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34016 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34017 cost += 20;
34018
34019 return cost;
34020 }
34021
34022 /* Moves between SSE/MMX and integer unit are expensive. */
34023 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34024 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34025
34026 /* ??? By keeping returned value relatively high, we limit the number
34027 of moves between integer and MMX/SSE registers for all targets.
34028 Additionally, high value prevents problem with x86_modes_tieable_p(),
34029 where integer modes in MMX/SSE registers are not tieable
34030 because of missing QImode and HImode moves to, from or between
34031 MMX/SSE registers. */
34032 return MAX (8, ix86_cost->mmxsse_to_integer);
34033
34034 if (MAYBE_FLOAT_CLASS_P (class1))
34035 return ix86_cost->fp_move;
34036 if (MAYBE_SSE_CLASS_P (class1))
34037 return ix86_cost->sse_move;
34038 if (MAYBE_MMX_CLASS_P (class1))
34039 return ix86_cost->mmx_move;
34040 return 2;
34041 }
34042
34043 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34044 MODE. */
34045
34046 bool
34047 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34048 {
34049 /* Flags and only flags can only hold CCmode values. */
34050 if (CC_REGNO_P (regno))
34051 return GET_MODE_CLASS (mode) == MODE_CC;
34052 if (GET_MODE_CLASS (mode) == MODE_CC
34053 || GET_MODE_CLASS (mode) == MODE_RANDOM
34054 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34055 return false;
34056 if (STACK_REGNO_P (regno))
34057 return VALID_FP_MODE_P (mode);
34058 if (SSE_REGNO_P (regno))
34059 {
34060 /* We implement the move patterns for all vector modes into and
34061 out of SSE registers, even when no operation instructions
34062 are available. OImode move is available only when AVX is
34063 enabled. */
34064 return ((TARGET_AVX && mode == OImode)
34065 || VALID_AVX256_REG_MODE (mode)
34066 || VALID_SSE_REG_MODE (mode)
34067 || VALID_SSE2_REG_MODE (mode)
34068 || VALID_MMX_REG_MODE (mode)
34069 || VALID_MMX_REG_MODE_3DNOW (mode));
34070 }
34071 if (MMX_REGNO_P (regno))
34072 {
34073 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34074 so if the register is available at all, then we can move data of
34075 the given mode into or out of it. */
34076 return (VALID_MMX_REG_MODE (mode)
34077 || VALID_MMX_REG_MODE_3DNOW (mode));
34078 }
34079
34080 if (mode == QImode)
34081 {
34082 /* Take care for QImode values - they can be in non-QI regs,
34083 but then they do cause partial register stalls. */
34084 if (ANY_QI_REGNO_P (regno))
34085 return true;
34086 if (!TARGET_PARTIAL_REG_STALL)
34087 return true;
34088 /* LRA checks if the hard register is OK for the given mode.
34089 QImode values can live in non-QI regs, so we allow all
34090 registers here. */
34091 if (lra_in_progress)
34092 return true;
34093 return !can_create_pseudo_p ();
34094 }
34095 /* We handle both integer and floats in the general purpose registers. */
34096 else if (VALID_INT_MODE_P (mode))
34097 return true;
34098 else if (VALID_FP_MODE_P (mode))
34099 return true;
34100 else if (VALID_DFP_MODE_P (mode))
34101 return true;
34102 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34103 on to use that value in smaller contexts, this can easily force a
34104 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34105 supporting DImode, allow it. */
34106 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34107 return true;
34108
34109 return false;
34110 }
34111
34112 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34113 tieable integer mode. */
34114
34115 static bool
34116 ix86_tieable_integer_mode_p (enum machine_mode mode)
34117 {
34118 switch (mode)
34119 {
34120 case HImode:
34121 case SImode:
34122 return true;
34123
34124 case QImode:
34125 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34126
34127 case DImode:
34128 return TARGET_64BIT;
34129
34130 default:
34131 return false;
34132 }
34133 }
34134
34135 /* Return true if MODE1 is accessible in a register that can hold MODE2
34136 without copying. That is, all register classes that can hold MODE2
34137 can also hold MODE1. */
34138
34139 bool
34140 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34141 {
34142 if (mode1 == mode2)
34143 return true;
34144
34145 if (ix86_tieable_integer_mode_p (mode1)
34146 && ix86_tieable_integer_mode_p (mode2))
34147 return true;
34148
34149 /* MODE2 being XFmode implies fp stack or general regs, which means we
34150 can tie any smaller floating point modes to it. Note that we do not
34151 tie this with TFmode. */
34152 if (mode2 == XFmode)
34153 return mode1 == SFmode || mode1 == DFmode;
34154
34155 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34156 that we can tie it with SFmode. */
34157 if (mode2 == DFmode)
34158 return mode1 == SFmode;
34159
34160 /* If MODE2 is only appropriate for an SSE register, then tie with
34161 any other mode acceptable to SSE registers. */
34162 if (GET_MODE_SIZE (mode2) == 32
34163 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34164 return (GET_MODE_SIZE (mode1) == 32
34165 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34166 if (GET_MODE_SIZE (mode2) == 16
34167 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34168 return (GET_MODE_SIZE (mode1) == 16
34169 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34170
34171 /* If MODE2 is appropriate for an MMX register, then tie
34172 with any other mode acceptable to MMX registers. */
34173 if (GET_MODE_SIZE (mode2) == 8
34174 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34175 return (GET_MODE_SIZE (mode1) == 8
34176 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34177
34178 return false;
34179 }
34180
34181 /* Return the cost of moving between two registers of mode MODE. */
34182
34183 static int
34184 ix86_set_reg_reg_cost (enum machine_mode mode)
34185 {
34186 unsigned int units = UNITS_PER_WORD;
34187
34188 switch (GET_MODE_CLASS (mode))
34189 {
34190 default:
34191 break;
34192
34193 case MODE_CC:
34194 units = GET_MODE_SIZE (CCmode);
34195 break;
34196
34197 case MODE_FLOAT:
34198 if ((TARGET_SSE && mode == TFmode)
34199 || (TARGET_80387 && mode == XFmode)
34200 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34201 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34202 units = GET_MODE_SIZE (mode);
34203 break;
34204
34205 case MODE_COMPLEX_FLOAT:
34206 if ((TARGET_SSE && mode == TCmode)
34207 || (TARGET_80387 && mode == XCmode)
34208 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34209 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34210 units = GET_MODE_SIZE (mode);
34211 break;
34212
34213 case MODE_VECTOR_INT:
34214 case MODE_VECTOR_FLOAT:
34215 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34216 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34217 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34218 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34219 units = GET_MODE_SIZE (mode);
34220 }
34221
34222 /* Return the cost of moving between two registers of mode MODE,
34223 assuming that the move will be in pieces of at most UNITS bytes. */
34224 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
34225 }
34226
34227 /* Compute a (partial) cost for rtx X. Return true if the complete
34228 cost has been computed, and false if subexpressions should be
34229 scanned. In either case, *TOTAL contains the cost result. */
34230
34231 static bool
34232 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34233 bool speed)
34234 {
34235 enum rtx_code code = (enum rtx_code) code_i;
34236 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34237 enum machine_mode mode = GET_MODE (x);
34238 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34239
34240 switch (code)
34241 {
34242 case SET:
34243 if (register_operand (SET_DEST (x), VOIDmode)
34244 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34245 {
34246 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34247 return true;
34248 }
34249 return false;
34250
34251 case CONST_INT:
34252 case CONST:
34253 case LABEL_REF:
34254 case SYMBOL_REF:
34255 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34256 *total = 3;
34257 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34258 *total = 2;
34259 else if (flag_pic && SYMBOLIC_CONST (x)
34260 && (!TARGET_64BIT
34261 || (GET_CODE (x) != LABEL_REF
34262 && (GET_CODE (x) != SYMBOL_REF
34263 || !SYMBOL_REF_LOCAL_P (x)))))
34264 *total = 1;
34265 else
34266 *total = 0;
34267 return true;
34268
34269 case CONST_DOUBLE:
34270 if (mode == VOIDmode)
34271 {
34272 *total = 0;
34273 return true;
34274 }
34275 switch (standard_80387_constant_p (x))
34276 {
34277 case 1: /* 0.0 */
34278 *total = 1;
34279 return true;
34280 default: /* Other constants */
34281 *total = 2;
34282 return true;
34283 case 0:
34284 case -1:
34285 break;
34286 }
34287 if (SSE_FLOAT_MODE_P (mode))
34288 {
34289 case CONST_VECTOR:
34290 switch (standard_sse_constant_p (x))
34291 {
34292 case 0:
34293 break;
34294 case 1: /* 0: xor eliminates false dependency */
34295 *total = 0;
34296 return true;
34297 default: /* -1: cmp contains false dependency */
34298 *total = 1;
34299 return true;
34300 }
34301 }
34302 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34303 it'll probably end up. Add a penalty for size. */
34304 *total = (COSTS_N_INSNS (1)
34305 + (flag_pic != 0 && !TARGET_64BIT)
34306 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34307 return true;
34308
34309 case ZERO_EXTEND:
34310 /* The zero extension is often completely free on x86_64, so make
34311 it as cheap as possible. */
34312 if (TARGET_64BIT && mode == DImode
34313 && GET_MODE (XEXP (x, 0)) == SImode)
34314 *total = 1;
34315 else if (TARGET_ZERO_EXTEND_WITH_AND)
34316 *total = cost->add;
34317 else
34318 *total = cost->movzx;
34319 return false;
34320
34321 case SIGN_EXTEND:
34322 *total = cost->movsx;
34323 return false;
34324
34325 case ASHIFT:
34326 if (SCALAR_INT_MODE_P (mode)
34327 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34328 && CONST_INT_P (XEXP (x, 1)))
34329 {
34330 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34331 if (value == 1)
34332 {
34333 *total = cost->add;
34334 return false;
34335 }
34336 if ((value == 2 || value == 3)
34337 && cost->lea <= cost->shift_const)
34338 {
34339 *total = cost->lea;
34340 return false;
34341 }
34342 }
34343 /* FALLTHRU */
34344
34345 case ROTATE:
34346 case ASHIFTRT:
34347 case LSHIFTRT:
34348 case ROTATERT:
34349 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34350 {
34351 /* ??? Should be SSE vector operation cost. */
34352 /* At least for published AMD latencies, this really is the same
34353 as the latency for a simple fpu operation like fabs. */
34354 /* V*QImode is emulated with 1-11 insns. */
34355 if (mode == V16QImode || mode == V32QImode)
34356 {
34357 int count = 11;
34358 if (TARGET_XOP && mode == V16QImode)
34359 {
34360 /* For XOP we use vpshab, which requires a broadcast of the
34361 value to the variable shift insn. For constants this
34362 means a V16Q const in mem; even when we can perform the
34363 shift with one insn set the cost to prefer paddb. */
34364 if (CONSTANT_P (XEXP (x, 1)))
34365 {
34366 *total = (cost->fabs
34367 + rtx_cost (XEXP (x, 0), code, 0, speed)
34368 + (speed ? 2 : COSTS_N_BYTES (16)));
34369 return true;
34370 }
34371 count = 3;
34372 }
34373 else if (TARGET_SSSE3)
34374 count = 7;
34375 *total = cost->fabs * count;
34376 }
34377 else
34378 *total = cost->fabs;
34379 }
34380 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34381 {
34382 if (CONST_INT_P (XEXP (x, 1)))
34383 {
34384 if (INTVAL (XEXP (x, 1)) > 32)
34385 *total = cost->shift_const + COSTS_N_INSNS (2);
34386 else
34387 *total = cost->shift_const * 2;
34388 }
34389 else
34390 {
34391 if (GET_CODE (XEXP (x, 1)) == AND)
34392 *total = cost->shift_var * 2;
34393 else
34394 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34395 }
34396 }
34397 else
34398 {
34399 if (CONST_INT_P (XEXP (x, 1)))
34400 *total = cost->shift_const;
34401 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34402 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34403 {
34404 /* Return the cost after shift-and truncation. */
34405 *total = cost->shift_var;
34406 return true;
34407 }
34408 else
34409 *total = cost->shift_var;
34410 }
34411 return false;
34412
34413 case FMA:
34414 {
34415 rtx sub;
34416
34417 gcc_assert (FLOAT_MODE_P (mode));
34418 gcc_assert (TARGET_FMA || TARGET_FMA4);
34419
34420 /* ??? SSE scalar/vector cost should be used here. */
34421 /* ??? Bald assumption that fma has the same cost as fmul. */
34422 *total = cost->fmul;
34423 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34424
34425 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34426 sub = XEXP (x, 0);
34427 if (GET_CODE (sub) == NEG)
34428 sub = XEXP (sub, 0);
34429 *total += rtx_cost (sub, FMA, 0, speed);
34430
34431 sub = XEXP (x, 2);
34432 if (GET_CODE (sub) == NEG)
34433 sub = XEXP (sub, 0);
34434 *total += rtx_cost (sub, FMA, 2, speed);
34435 return true;
34436 }
34437
34438 case MULT:
34439 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34440 {
34441 /* ??? SSE scalar cost should be used here. */
34442 *total = cost->fmul;
34443 return false;
34444 }
34445 else if (X87_FLOAT_MODE_P (mode))
34446 {
34447 *total = cost->fmul;
34448 return false;
34449 }
34450 else if (FLOAT_MODE_P (mode))
34451 {
34452 /* ??? SSE vector cost should be used here. */
34453 *total = cost->fmul;
34454 return false;
34455 }
34456 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34457 {
34458 /* V*QImode is emulated with 7-13 insns. */
34459 if (mode == V16QImode || mode == V32QImode)
34460 {
34461 int extra = 11;
34462 if (TARGET_XOP && mode == V16QImode)
34463 extra = 5;
34464 else if (TARGET_SSSE3)
34465 extra = 6;
34466 *total = cost->fmul * 2 + cost->fabs * extra;
34467 }
34468 /* V*DImode is emulated with 5-8 insns. */
34469 else if (mode == V2DImode || mode == V4DImode)
34470 {
34471 if (TARGET_XOP && mode == V2DImode)
34472 *total = cost->fmul * 2 + cost->fabs * 3;
34473 else
34474 *total = cost->fmul * 3 + cost->fabs * 5;
34475 }
34476 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34477 insns, including two PMULUDQ. */
34478 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34479 *total = cost->fmul * 2 + cost->fabs * 5;
34480 else
34481 *total = cost->fmul;
34482 return false;
34483 }
34484 else
34485 {
34486 rtx op0 = XEXP (x, 0);
34487 rtx op1 = XEXP (x, 1);
34488 int nbits;
34489 if (CONST_INT_P (XEXP (x, 1)))
34490 {
34491 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
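/* Count the set bits of the constant multiplier (clearing the lowest set
   bit each iteration); each set bit contributes mult_bit to the cost. */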
34492 for (nbits = 0; value != 0; value &= value - 1)
34493 nbits++;
34494 }
34495 else
34496 /* This is arbitrary. */
34497 nbits = 7;
34498
34499 /* Compute costs correctly for widening multiplication. */
34500 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34501 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34502 == GET_MODE_SIZE (mode))
34503 {
34504 int is_mulwiden = 0;
34505 enum machine_mode inner_mode = GET_MODE (op0);
34506
34507 if (GET_CODE (op0) == GET_CODE (op1))
34508 is_mulwiden = 1, op1 = XEXP (op1, 0);
34509 else if (CONST_INT_P (op1))
34510 {
34511 if (GET_CODE (op0) == SIGN_EXTEND)
34512 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34513 == INTVAL (op1);
34514 else
34515 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34516 }
34517
34518 if (is_mulwiden)
34519 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34520 }
34521
34522 *total = (cost->mult_init[MODE_INDEX (mode)]
34523 + nbits * cost->mult_bit
34524 + rtx_cost (op0, outer_code, opno, speed)
34525 + rtx_cost (op1, outer_code, opno, speed));
34526
34527 return true;
34528 }
34529
34530 case DIV:
34531 case UDIV:
34532 case MOD:
34533 case UMOD:
34534 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34535 /* ??? SSE cost should be used here. */
34536 *total = cost->fdiv;
34537 else if (X87_FLOAT_MODE_P (mode))
34538 *total = cost->fdiv;
34539 else if (FLOAT_MODE_P (mode))
34540 /* ??? SSE vector cost should be used here. */
34541 *total = cost->fdiv;
34542 else
34543 *total = cost->divide[MODE_INDEX (mode)];
34544 return false;
34545
34546 case PLUS:
34547 if (GET_MODE_CLASS (mode) == MODE_INT
34548 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34549 {
34550 if (GET_CODE (XEXP (x, 0)) == PLUS
34551 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34552 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34553 && CONSTANT_P (XEXP (x, 1)))
34554 {
34555 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34556 if (val == 2 || val == 4 || val == 8)
34557 {
34558 *total = cost->lea;
34559 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34560 outer_code, opno, speed);
34561 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34562 outer_code, opno, speed);
34563 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34564 return true;
34565 }
34566 }
34567 else if (GET_CODE (XEXP (x, 0)) == MULT
34568 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34569 {
34570 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34571 if (val == 2 || val == 4 || val == 8)
34572 {
34573 *total = cost->lea;
34574 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34575 outer_code, opno, speed);
34576 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34577 return true;
34578 }
34579 }
34580 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34581 {
34582 *total = cost->lea;
34583 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34584 outer_code, opno, speed);
34585 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34586 outer_code, opno, speed);
34587 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34588 return true;
34589 }
34590 }
34591 /* FALLTHRU */
34592
34593 case MINUS:
34594 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34595 {
34596 /* ??? SSE cost should be used here. */
34597 *total = cost->fadd;
34598 return false;
34599 }
34600 else if (X87_FLOAT_MODE_P (mode))
34601 {
34602 *total = cost->fadd;
34603 return false;
34604 }
34605 else if (FLOAT_MODE_P (mode))
34606 {
34607 /* ??? SSE vector cost should be used here. */
34608 *total = cost->fadd;
34609 return false;
34610 }
34611 /* FALLTHRU */
34612
34613 case AND:
34614 case IOR:
34615 case XOR:
34616 if (GET_MODE_CLASS (mode) == MODE_INT
34617 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34618 {
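/* Logical operation on a value wider than a word: two word-sized insns.
   Each operand's cost is doubled unless the operand is already DImode. */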
34619 *total = (cost->add * 2
34620 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34621 << (GET_MODE (XEXP (x, 0)) != DImode))
34622 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34623 << (GET_MODE (XEXP (x, 1)) != DImode)));
34624 return true;
34625 }
34626 /* FALLTHRU */
34627
34628 case NEG:
34629 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34630 {
34631 /* ??? SSE cost should be used here. */
34632 *total = cost->fchs;
34633 return false;
34634 }
34635 else if (X87_FLOAT_MODE_P (mode))
34636 {
34637 *total = cost->fchs;
34638 return false;
34639 }
34640 else if (FLOAT_MODE_P (mode))
34641 {
34642 /* ??? SSE vector cost should be used here. */
34643 *total = cost->fchs;
34644 return false;
34645 }
34646 /* FALLTHRU */
34647
34648 case NOT:
34649 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34650 {
34651 /* ??? Should be SSE vector operation cost. */
34652 /* At least for published AMD latencies, this really is the same
34653 as the latency for a simple fpu operation like fabs. */
34654 *total = cost->fabs;
34655 }
34656 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34657 *total = cost->add * 2;
34658 else
34659 *total = cost->add;
34660 return false;
34661
34662 case COMPARE:
34663 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34664 && XEXP (XEXP (x, 0), 1) == const1_rtx
34665 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34666 && XEXP (x, 1) == const0_rtx)
34667 {
34668 /* This kind of construct is implemented using test[bwl].
34669 Treat it as if we had an AND. */
34670 *total = (cost->add
34671 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34672 + rtx_cost (const1_rtx, outer_code, opno, speed));
34673 return true;
34674 }
34675 return false;
34676
34677 case FLOAT_EXTEND:
34678 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34679 *total = 0;
34680 return false;
34681
34682 case ABS:
34683 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34684 /* ??? SSE cost should be used here. */
34685 *total = cost->fabs;
34686 else if (X87_FLOAT_MODE_P (mode))
34687 *total = cost->fabs;
34688 else if (FLOAT_MODE_P (mode))
34689 /* ??? SSE vector cost should be used here. */
34690 *total = cost->fabs;
34691 return false;
34692
34693 case SQRT:
34694 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34695 /* ??? SSE cost should be used here. */
34696 *total = cost->fsqrt;
34697 else if (X87_FLOAT_MODE_P (mode))
34698 *total = cost->fsqrt;
34699 else if (FLOAT_MODE_P (mode))
34700 /* ??? SSE vector cost should be used here. */
34701 *total = cost->fsqrt;
34702 return false;
34703
34704 case UNSPEC:
34705 if (XINT (x, 1) == UNSPEC_TP)
34706 *total = 0;
34707 return false;
34708
34709 case VEC_SELECT:
34710 case VEC_CONCAT:
34711 case VEC_MERGE:
34712 case VEC_DUPLICATE:
34713 /* ??? Assume all of these vector manipulation patterns are
34714 recognizable, in which case they all pretty much have the
34715 same cost. */
34716 *total = cost->fabs;
34717 return true;
34718
34719 default:
34720 return false;
34721 }
34722 }
34723
34724 #if TARGET_MACHO
34725
34726 static int current_machopic_label_num;
34727
34728 /* Given a symbol name and its associated stub, write out the
34729 definition of the stub. */
34730
34731 void
34732 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34733 {
34734 unsigned int length;
34735 char *binder_name, *symbol_name, lazy_ptr_name[32];
34736 int label = ++current_machopic_label_num;
34737
34738 /* For 64-bit we shouldn't get here. */
34739 gcc_assert (!TARGET_64BIT);
34740
34741 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34742 symb = targetm.strip_name_encoding (symb);
34743
34744 length = strlen (stub);
34745 binder_name = XALLOCAVEC (char, length + 32);
34746 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34747
34748 length = strlen (symb);
34749 symbol_name = XALLOCAVEC (char, length + 32);
34750 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34751
34752 sprintf (lazy_ptr_name, "L%d$lz", label);
34753
34754 if (MACHOPIC_ATT_STUB)
34755 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34756 else if (MACHOPIC_PURE)
34757 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34758 else
34759 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34760
34761 fprintf (file, "%s:\n", stub);
34762 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34763
34764 if (MACHOPIC_ATT_STUB)
34765 {
34766 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34767 }
34768 else if (MACHOPIC_PURE)
34769 {
34770 /* PIC stub. */
34771 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34772 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34773 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34774 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34775 label, lazy_ptr_name, label);
34776 fprintf (file, "\tjmp\t*%%ecx\n");
34777 }
34778 else
34779 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34780
34781 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34782 it needs no stub-binding-helper. */
34783 if (MACHOPIC_ATT_STUB)
34784 return;
34785
34786 fprintf (file, "%s:\n", binder_name);
34787
34788 if (MACHOPIC_PURE)
34789 {
34790 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34791 fprintf (file, "\tpushl\t%%ecx\n");
34792 }
34793 else
34794 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34795
34796 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34797
34798 /* N.B. Keep the correspondence of these
34799 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34800 old-pic/new-pic/non-pic stubs; altering this will break
34801 compatibility with existing dylibs. */
34802 if (MACHOPIC_PURE)
34803 {
34804 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34805 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34806 }
34807 else
34808 /* 16-byte -mdynamic-no-pic stub. */
34809 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34810
34811 fprintf (file, "%s:\n", lazy_ptr_name);
34812 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34813 fprintf (file, ASM_LONG "%s\n", binder_name);
34814 }
34815 #endif /* TARGET_MACHO */
34816
34817 /* Order the registers for the register allocator. */
34818
34819 void
34820 x86_order_regs_for_local_alloc (void)
34821 {
34822 int pos = 0;
34823 int i;
34824
34825 /* First allocate the local general purpose registers. */
34826 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34827 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34828 reg_alloc_order [pos++] = i;
34829
34830 /* Global general purpose registers. */
34831 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34832 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34833 reg_alloc_order [pos++] = i;
34834
34835 /* x87 registers come first in case we are doing FP math
34836 using them. */
34837 if (!TARGET_SSE_MATH)
34838 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34839 reg_alloc_order [pos++] = i;
34840
34841 /* SSE registers. */
34842 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34843 reg_alloc_order [pos++] = i;
34844 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34845 reg_alloc_order [pos++] = i;
34846
34847 /* x87 registers. */
34848 if (TARGET_SSE_MATH)
34849 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34850 reg_alloc_order [pos++] = i;
34851
34852 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34853 reg_alloc_order [pos++] = i;
34854
34855 /* Initialize the rest of the array, as we do not allocate some registers
34856 at all. */
34857 while (pos < FIRST_PSEUDO_REGISTER)
34858 reg_alloc_order [pos++] = 0;
34859 }
34860
34861 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34862 in struct attribute_spec.handler. */
34863 static tree
34864 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34865 tree args,
34866 int flags ATTRIBUTE_UNUSED,
34867 bool *no_add_attrs)
34868 {
34869 if (TREE_CODE (*node) != FUNCTION_TYPE
34870 && TREE_CODE (*node) != METHOD_TYPE
34871 && TREE_CODE (*node) != FIELD_DECL
34872 && TREE_CODE (*node) != TYPE_DECL)
34873 {
34874 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34875 name);
34876 *no_add_attrs = true;
34877 return NULL_TREE;
34878 }
34879 if (TARGET_64BIT)
34880 {
34881 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34882 name);
34883 *no_add_attrs = true;
34884 return NULL_TREE;
34885 }
34886 if (is_attribute_p ("callee_pop_aggregate_return", name))
34887 {
34888 tree cst;
34889
34890 cst = TREE_VALUE (args);
34891 if (TREE_CODE (cst) != INTEGER_CST)
34892 {
34893 warning (OPT_Wattributes,
34894 "%qE attribute requires an integer constant argument",
34895 name);
34896 *no_add_attrs = true;
34897 }
34898 else if (compare_tree_int (cst, 0) != 0
34899 && compare_tree_int (cst, 1) != 0)
34900 {
34901 warning (OPT_Wattributes,
34902 "argument to %qE attribute is neither zero, nor one",
34903 name);
34904 *no_add_attrs = true;
34905 }
34906
34907 return NULL_TREE;
34908 }
34909
34910 return NULL_TREE;
34911 }
34912
34913 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34914 struct attribute_spec.handler. */
34915 static tree
34916 ix86_handle_abi_attribute (tree *node, tree name,
34917 tree args ATTRIBUTE_UNUSED,
34918 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34919 {
34920 if (TREE_CODE (*node) != FUNCTION_TYPE
34921 && TREE_CODE (*node) != METHOD_TYPE
34922 && TREE_CODE (*node) != FIELD_DECL
34923 && TREE_CODE (*node) != TYPE_DECL)
34924 {
34925 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34926 name);
34927 *no_add_attrs = true;
34928 return NULL_TREE;
34929 }
34930
34931 /* Can combine regparm with all attributes but fastcall. */
34932 if (is_attribute_p ("ms_abi", name))
34933 {
34934 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34935 {
34936 error ("ms_abi and sysv_abi attributes are not compatible");
34937 }
34938
34939 return NULL_TREE;
34940 }
34941 else if (is_attribute_p ("sysv_abi", name))
34942 {
34943 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34944 {
34945 error ("ms_abi and sysv_abi attributes are not compatible");
34946 }
34947
34948 return NULL_TREE;
34949 }
34950
34951 return NULL_TREE;
34952 }
34953
34954 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34955 struct attribute_spec.handler. */
34956 static tree
34957 ix86_handle_struct_attribute (tree *node, tree name,
34958 tree args ATTRIBUTE_UNUSED,
34959 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34960 {
34961 tree *type = NULL;
34962 if (DECL_P (*node))
34963 {
34964 if (TREE_CODE (*node) == TYPE_DECL)
34965 type = &TREE_TYPE (*node);
34966 }
34967 else
34968 type = node;
34969
34970 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34971 {
34972 warning (OPT_Wattributes, "%qE attribute ignored",
34973 name);
34974 *no_add_attrs = true;
34975 }
34976
34977 else if ((is_attribute_p ("ms_struct", name)
34978 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34979 || ((is_attribute_p ("gcc_struct", name)
34980 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34981 {
34982 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34983 name);
34984 *no_add_attrs = true;
34985 }
34986
34987 return NULL_TREE;
34988 }
34989
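/* Handle an attribute that is only valid on a FUNCTION_DECL; arguments as
   in struct attribute_spec.handler. */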
34990 static tree
34991 ix86_handle_fndecl_attribute (tree *node, tree name,
34992 tree args ATTRIBUTE_UNUSED,
34993 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34994 {
34995 if (TREE_CODE (*node) != FUNCTION_DECL)
34996 {
34997 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34998 name);
34999 *no_add_attrs = true;
35000 }
35001 return NULL_TREE;
35002 }
35003
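/* Return true if bitfields in RECORD_TYPE should be laid out following the
   MS bitfield rules: either -mms-bitfields is in effect and the type does
   not carry the gcc_struct attribute, or the type carries ms_struct. */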
35004 static bool
35005 ix86_ms_bitfield_layout_p (const_tree record_type)
35006 {
35007 return ((TARGET_MS_BITFIELD_LAYOUT
35008 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35009 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35010 }
35011
35012 /* Returns an expression indicating where the this parameter is
35013 located on entry to the FUNCTION. */
35014
35015 static rtx
35016 x86_this_parameter (tree function)
35017 {
35018 tree type = TREE_TYPE (function);
35019 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35020 int nregs;
35021
35022 if (TARGET_64BIT)
35023 {
35024 const int *parm_regs;
35025
35026 if (ix86_function_type_abi (type) == MS_ABI)
35027 parm_regs = x86_64_ms_abi_int_parameter_registers;
35028 else
35029 parm_regs = x86_64_int_parameter_registers;
35030 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35031 }
35032
35033 nregs = ix86_function_regparm (type, function);
35034
35035 if (nregs > 0 && !stdarg_p (type))
35036 {
35037 int regno;
35038 unsigned int ccvt = ix86_get_callcvt (type);
35039
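/* fastcall passes `this' in %ecx, or in %edx when the return value is an
   aggregate returned in memory.  thiscall uses %ecx, but with an aggregate
   return the `this' pointer lives on the stack.  Otherwise regparm uses
   %eax, or %edx (or the stack, if only one register is available) when
   there is an aggregate return pointer. */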
35040 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35041 regno = aggr ? DX_REG : CX_REG;
35042 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35043 {
35044 regno = CX_REG;
35045 if (aggr)
35046 return gen_rtx_MEM (SImode,
35047 plus_constant (Pmode, stack_pointer_rtx, 4));
35048 }
35049 else
35050 {
35051 regno = AX_REG;
35052 if (aggr)
35053 {
35054 regno = DX_REG;
35055 if (nregs == 1)
35056 return gen_rtx_MEM (SImode,
35057 plus_constant (Pmode,
35058 stack_pointer_rtx, 4));
35059 }
35060 }
35061 return gen_rtx_REG (SImode, regno);
35062 }
35063
35064 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35065 aggr ? 8 : 4));
35066 }
35067
35068 /* Determine whether x86_output_mi_thunk can succeed. */
35069
35070 static bool
35071 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35072 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35073 HOST_WIDE_INT vcall_offset, const_tree function)
35074 {
35075 /* 64-bit can handle anything. */
35076 if (TARGET_64BIT)
35077 return true;
35078
35079 /* For 32-bit, everything's fine if we have one free register. */
35080 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35081 return true;
35082
35083 /* Need a free register for vcall_offset. */
35084 if (vcall_offset)
35085 return false;
35086
35087 /* Need a free register for GOT references. */
35088 if (flag_pic && !targetm.binds_local_p (function))
35089 return false;
35090
35091 /* Otherwise ok. */
35092 return true;
35093 }
35094
35095 /* Output the assembler code for a thunk function. THUNK_DECL is the
35096 declaration for the thunk function itself, FUNCTION is the decl for
35097 the target function. DELTA is an immediate constant offset to be
35098 added to THIS. If VCALL_OFFSET is nonzero, the word at
35099 *(*this + vcall_offset) should be added to THIS. */
35100
35101 static void
35102 x86_output_mi_thunk (FILE *file,
35103 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35104 HOST_WIDE_INT vcall_offset, tree function)
35105 {
35106 rtx this_param = x86_this_parameter (function);
35107 rtx this_reg, tmp, fnaddr;
35108 unsigned int tmp_regno;
35109
35110 if (TARGET_64BIT)
35111 tmp_regno = R10_REG;
35112 else
35113 {
35114 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35115 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35116 tmp_regno = AX_REG;
35117 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35118 tmp_regno = DX_REG;
35119 else
35120 tmp_regno = CX_REG;
35121 }
35122
35123 emit_note (NOTE_INSN_PROLOGUE_END);
35124
35125 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35126 pull it in now and let DELTA benefit. */
35127 if (REG_P (this_param))
35128 this_reg = this_param;
35129 else if (vcall_offset)
35130 {
35131 /* Put the this parameter into %eax. */
35132 this_reg = gen_rtx_REG (Pmode, AX_REG);
35133 emit_move_insn (this_reg, this_param);
35134 }
35135 else
35136 this_reg = NULL_RTX;
35137
35138 /* Adjust the this parameter by a fixed constant. */
35139 if (delta)
35140 {
35141 rtx delta_rtx = GEN_INT (delta);
35142 rtx delta_dst = this_reg ? this_reg : this_param;
35143
35144 if (TARGET_64BIT)
35145 {
35146 if (!x86_64_general_operand (delta_rtx, Pmode))
35147 {
35148 tmp = gen_rtx_REG (Pmode, tmp_regno);
35149 emit_move_insn (tmp, delta_rtx);
35150 delta_rtx = tmp;
35151 }
35152 }
35153
35154 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35155 }
35156
35157 /* Adjust the this parameter by a value stored in the vtable. */
35158 if (vcall_offset)
35159 {
35160 rtx vcall_addr, vcall_mem, this_mem;
35161
35162 tmp = gen_rtx_REG (Pmode, tmp_regno);
35163
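/* Load the vtable pointer *this (a ptr_mode value); when Pmode is wider
   than ptr_mode (e.g. for x32), zero-extend it so it can be used as an
   address below. */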
35164 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35165 if (Pmode != ptr_mode)
35166 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35167 emit_move_insn (tmp, this_mem);
35168
35169 /* Adjust the this parameter. */
35170 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35171 if (TARGET_64BIT
35172 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35173 {
35174 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35175 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35176 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35177 }
35178
35179 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35180 if (Pmode != ptr_mode)
35181 emit_insn (gen_addsi_1_zext (this_reg,
35182 gen_rtx_REG (ptr_mode,
35183 REGNO (this_reg)),
35184 vcall_mem));
35185 else
35186 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35187 }
35188
35189 /* If necessary, drop THIS back to its stack slot. */
35190 if (this_reg && this_reg != this_param)
35191 emit_move_insn (this_param, this_reg);
35192
35193 fnaddr = XEXP (DECL_RTL (function), 0);
35194 if (TARGET_64BIT)
35195 {
35196 if (!flag_pic || targetm.binds_local_p (function)
35197 || TARGET_PECOFF)
35198 ;
35199 else
35200 {
35201 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35202 tmp = gen_rtx_CONST (Pmode, tmp);
35203 fnaddr = gen_rtx_MEM (Pmode, tmp);
35204 }
35205 }
35206 else
35207 {
35208 if (!flag_pic || targetm.binds_local_p (function))
35209 ;
35210 #if TARGET_MACHO
35211 else if (TARGET_MACHO)
35212 {
35213 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35214 fnaddr = XEXP (fnaddr, 0);
35215 }
35216 #endif /* TARGET_MACHO */
35217 else
35218 {
35219 tmp = gen_rtx_REG (Pmode, CX_REG);
35220 output_set_got (tmp, NULL_RTX);
35221
35222 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35223 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
35224 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
35225 }
35226 }
35227
35228 /* Our sibling call patterns do not allow memories, because we have no
35229 predicate that can distinguish between frame and non-frame memory.
35230 For our purposes here, we can get away with (ab)using a jump pattern,
35231 because we're going to do no optimization. */
35232 if (MEM_P (fnaddr))
35233 emit_jump_insn (gen_indirect_jump (fnaddr));
35234 else
35235 {
35236 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35237 fnaddr = legitimize_pic_address (fnaddr,
35238 gen_rtx_REG (Pmode, tmp_regno));
35239
35240 if (!sibcall_insn_operand (fnaddr, word_mode))
35241 {
35242 tmp = gen_rtx_REG (word_mode, tmp_regno);
35243 if (GET_MODE (fnaddr) != word_mode)
35244 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35245 emit_move_insn (tmp, fnaddr);
35246 fnaddr = tmp;
35247 }
35248
35249 tmp = gen_rtx_MEM (QImode, fnaddr);
35250 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35251 tmp = emit_call_insn (tmp);
35252 SIBLING_CALL_P (tmp) = 1;
35253 }
35254 emit_barrier ();
35255
35256 /* Emit just enough of rest_of_compilation to get the insns emitted.
35257 Note that use_thunk calls assemble_start_function et al. */
35258 tmp = get_insns ();
35259 shorten_branches (tmp);
35260 final_start_function (tmp, file, 1);
35261 final (tmp, file, 1);
35262 final_end_function ();
35263 }
35264
35265 static void
35266 x86_file_start (void)
35267 {
35268 default_file_start ();
35269 #if TARGET_MACHO
35270 darwin_file_start ();
35271 #endif
35272 if (X86_FILE_START_VERSION_DIRECTIVE)
35273 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35274 if (X86_FILE_START_FLTUSED)
35275 fputs ("\t.global\t__fltused\n", asm_out_file);
35276 if (ix86_asm_dialect == ASM_INTEL)
35277 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35278 }
35279
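/* Implement the traditional 32-bit psABI rule that caps the alignment of
   double, long long and similar fields at 32 bits, unless -malign-double
   is in effect or we are compiling for 64-bit. */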
35280 int
35281 x86_field_alignment (tree field, int computed)
35282 {
35283 enum machine_mode mode;
35284 tree type = TREE_TYPE (field);
35285
35286 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35287 return computed;
35288 mode = TYPE_MODE (strip_array_types (type));
35289 if (mode == DFmode || mode == DCmode
35290 || GET_MODE_CLASS (mode) == MODE_INT
35291 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35292 return MIN (32, computed);
35293 return computed;
35294 }
35295
35296 /* Output assembler code to FILE to increment profiler label # LABELNO
35297 for profiling a function entry. */
35298 void
35299 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35300 {
35301 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35302 : MCOUNT_NAME);
35303
35304 if (TARGET_64BIT)
35305 {
35306 #ifndef NO_PROFILE_COUNTERS
35307 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35308 #endif
35309
35310 if (!TARGET_PECOFF && flag_pic)
35311 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35312 else
35313 fprintf (file, "\tcall\t%s\n", mcount_name);
35314 }
35315 else if (flag_pic)
35316 {
35317 #ifndef NO_PROFILE_COUNTERS
35318 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35319 LPREFIX, labelno);
35320 #endif
35321 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35322 }
35323 else
35324 {
35325 #ifndef NO_PROFILE_COUNTERS
35326 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35327 LPREFIX, labelno);
35328 #endif
35329 fprintf (file, "\tcall\t%s\n", mcount_name);
35330 }
35331 }
35332
35333 /* We don't have exact information about the insn sizes, but we may assume
35334 quite safely that we are informed about all 1 byte insns and memory
35335 address sizes. This is enough to eliminate unnecessary padding in
35336 99% of cases. */
35337
35338 static int
35339 min_insn_size (rtx insn)
35340 {
35341 int l = 0, len;
35342
35343 if (!INSN_P (insn) || !active_insn_p (insn))
35344 return 0;
35345
35346 /* Discard alignments we've emitted and jump instructions. */
35347 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35348 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35349 return 0;
35350
35351 /* Important case - calls are always 5 bytes.
35352 It is common to have many calls in a row. */
35353 if (CALL_P (insn)
35354 && symbolic_reference_mentioned_p (PATTERN (insn))
35355 && !SIBLING_CALL_P (insn))
35356 return 5;
35357 len = get_attr_length (insn);
35358 if (len <= 1)
35359 return 1;
35360
35361 /* For normal instructions we rely on get_attr_length being exact,
35362 with a few exceptions. */
35363 if (!JUMP_P (insn))
35364 {
35365 enum attr_type type = get_attr_type (insn);
35366
35367 switch (type)
35368 {
35369 case TYPE_MULTI:
35370 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35371 || asm_noperands (PATTERN (insn)) >= 0)
35372 return 0;
35373 break;
35374 case TYPE_OTHER:
35375 case TYPE_FCMP:
35376 break;
35377 default:
35378 /* Otherwise trust get_attr_length. */
35379 return len;
35380 }
35381
35382 l = get_attr_length_address (insn);
35383 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35384 l = 4;
35385 }
35386 if (l)
35387 return 1+l;
35388 else
35389 return 2;
35390 }
35391
35392 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35393
35394 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a
35395 16 byte window. */
35396
35397 static void
35398 ix86_avoid_jump_mispredicts (void)
35399 {
35400 rtx insn, start = get_insns ();
35401 int nbytes = 0, njumps = 0;
35402 int isjump = 0;
35403
35404 /* Look for all minimal intervals of instructions containing 4 jumps.
35405 The intervals are bounded by START and INSN. NBYTES is the total
35406 size of instructions in the interval including INSN and not including
35407 START. When NBYTES is smaller than 16 bytes, it is possible
35408 that the end of START and INSN ends up in the same 16 byte page.
35409
35410 The smallest offset in the page at which INSN can start is the case
35411 where START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35412 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
35413 */
35414 for (insn = start; insn; insn = NEXT_INSN (insn))
35415 {
35416 int min_size;
35417
35418 if (LABEL_P (insn))
35419 {
35420 int align = label_to_alignment (insn);
35421 int max_skip = label_to_max_skip (insn);
35422
35423 if (max_skip > 15)
35424 max_skip = 15;
35425 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35426 already in the current 16 byte page, because otherwise
35427 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35428 bytes to reach 16 byte boundary. */
35429 if (align <= 0
35430 || (align <= 3 && max_skip != (1 << align) - 1))
35431 max_skip = 0;
35432 if (dump_file)
35433 fprintf (dump_file, "Label %i with max_skip %i\n",
35434 INSN_UID (insn), max_skip);
35435 if (max_skip)
35436 {
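/* The label may be padded by up to MAX_SKIP bytes, so shrink the window
   from the START side until the accumulated size plus the possible skip
   again fits in 16 bytes. */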
35437 while (nbytes + max_skip >= 16)
35438 {
35439 start = NEXT_INSN (start);
35440 if (JUMP_P (start) || CALL_P (start))
35441 njumps--, isjump = 1;
35442 else
35443 isjump = 0;
35444 nbytes -= min_insn_size (start);
35445 }
35446 }
35447 continue;
35448 }
35449
35450 min_size = min_insn_size (insn);
35451 nbytes += min_size;
35452 if (dump_file)
35453 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35454 INSN_UID (insn), min_size);
35455 if (JUMP_P (insn) || CALL_P (insn))
35456 njumps++;
35457 else
35458 continue;
35459
35460 while (njumps > 3)
35461 {
35462 start = NEXT_INSN (start);
35463 if (JUMP_P (start) || CALL_P (start))
35464 njumps--, isjump = 1;
35465 else
35466 isjump = 0;
35467 nbytes -= min_insn_size (start);
35468 }
35469 gcc_assert (njumps >= 0);
35470 if (dump_file)
35471 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35472 INSN_UID (start), INSN_UID (insn), nbytes);
35473
35474 if (njumps == 3 && isjump && nbytes < 16)
35475 {
35476 int padsize = 15 - nbytes + min_insn_size (insn);
35477
35478 if (dump_file)
35479 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35480 INSN_UID (insn), padsize);
35481 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35482 }
35483 }
35484 }
35485 #endif
35486
35487 /* AMD Athlon works faster
35488 when RET is not the destination of a conditional jump or directly preceded
35489 by another jump instruction. We avoid the penalty by inserting a NOP just
35490 before the RET instruction in such cases. */
35491 static void
35492 ix86_pad_returns (void)
35493 {
35494 edge e;
35495 edge_iterator ei;
35496
35497 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35498 {
35499 basic_block bb = e->src;
35500 rtx ret = BB_END (bb);
35501 rtx prev;
35502 bool replace = false;
35503
35504 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35505 || optimize_bb_for_size_p (bb))
35506 continue;
35507 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35508 if (active_insn_p (prev) || LABEL_P (prev))
35509 break;
35510 if (prev && LABEL_P (prev))
35511 {
35512 edge e;
35513 edge_iterator ei;
35514
35515 FOR_EACH_EDGE (e, ei, bb->preds)
35516 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35517 && !(e->flags & EDGE_FALLTHRU))
35518 replace = true;
35519 }
35520 if (!replace)
35521 {
35522 prev = prev_active_insn (ret);
35523 if (prev
35524 && ((JUMP_P (prev) && any_condjump_p (prev))
35525 || CALL_P (prev)))
35526 replace = true;
35527 /* Empty functions get a branch mispredict even when
35528 the jump destination is not visible to us. */
35529 if (!prev && !optimize_function_for_size_p (cfun))
35530 replace = true;
35531 }
35532 if (replace)
35533 {
35534 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35535 delete_insn (ret);
35536 }
35537 }
35538 }
35539
35540 /* Count the minimum number of instructions in BB. Return 4 if the
35541 number of instructions >= 4. */
35542
35543 static int
35544 ix86_count_insn_bb (basic_block bb)
35545 {
35546 rtx insn;
35547 int insn_count = 0;
35548
35549 /* Count number of instructions in this block. Return 4 if the number
35550 of instructions >= 4. */
35551 FOR_BB_INSNS (bb, insn)
35552 {
35553 /* This only happens in exit blocks. */
35554 if (JUMP_P (insn)
35555 && ANY_RETURN_P (PATTERN (insn)))
35556 break;
35557
35558 if (NONDEBUG_INSN_P (insn)
35559 && GET_CODE (PATTERN (insn)) != USE
35560 && GET_CODE (PATTERN (insn)) != CLOBBER)
35561 {
35562 insn_count++;
35563 if (insn_count >= 4)
35564 return insn_count;
35565 }
35566 }
35567
35568 return insn_count;
35569 }
35570
35571
35572 /* Count the minimum number of instructions in code path in BB.
35573 Return 4 if the number of instructions >= 4. */
35574
35575 static int
35576 ix86_count_insn (basic_block bb)
35577 {
35578 edge e;
35579 edge_iterator ei;
35580 int min_prev_count;
35581
35582 /* Only bother counting instructions along paths with no
35583 more than 2 basic blocks between entry and exit. Given
35584 that BB has an edge to exit, determine if a predecessor
35585 of BB has an edge from entry. If so, compute the number
35586 of instructions in the predecessor block. If there
35587 happen to be multiple such blocks, compute the minimum. */
35588 min_prev_count = 4;
35589 FOR_EACH_EDGE (e, ei, bb->preds)
35590 {
35591 edge prev_e;
35592 edge_iterator prev_ei;
35593
35594 if (e->src == ENTRY_BLOCK_PTR)
35595 {
35596 min_prev_count = 0;
35597 break;
35598 }
35599 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35600 {
35601 if (prev_e->src == ENTRY_BLOCK_PTR)
35602 {
35603 int count = ix86_count_insn_bb (e->src);
35604 if (count < min_prev_count)
35605 min_prev_count = count;
35606 break;
35607 }
35608 }
35609 }
35610
35611 if (min_prev_count < 4)
35612 min_prev_count += ix86_count_insn_bb (bb);
35613
35614 return min_prev_count;
35615 }
35616
35617 /* Pad a short function to 4 instructions. */
35618
35619 static void
35620 ix86_pad_short_function (void)
35621 {
35622 edge e;
35623 edge_iterator ei;
35624
35625 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35626 {
35627 rtx ret = BB_END (e->src);
35628 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35629 {
35630 int insn_count = ix86_count_insn (e->src);
35631
35632 /* Pad short function. */
35633 if (insn_count < 4)
35634 {
35635 rtx insn = ret;
35636
35637 /* Find epilogue. */
35638 while (insn
35639 && (!NOTE_P (insn)
35640 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35641 insn = PREV_INSN (insn);
35642
35643 if (!insn)
35644 insn = ret;
35645
35646 /* Two NOPs count as one instruction. */
35647 insn_count = 2 * (4 - insn_count);
35648 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35649 }
35650 }
35651 }
35652 }
35653
35654 /* Fix up a Windows system unwinder issue. If an EH region falls through into
35655 the epilogue, the Windows system unwinder will apply epilogue logic and
35656 produce incorrect offsets. This can be avoided by adding a nop between
35657 the last insn that can throw and the first insn of the epilogue. */
35658
35659 static void
35660 ix86_seh_fixup_eh_fallthru (void)
35661 {
35662 edge e;
35663 edge_iterator ei;
35664
35665 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35666 {
35667 rtx insn, next;
35668
35669 /* Find the beginning of the epilogue. */
35670 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
35671 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
35672 break;
35673 if (insn == NULL)
35674 continue;
35675
35676 /* We only care about preceding insns that can throw. */
35677 insn = prev_active_insn (insn);
35678 if (insn == NULL || !can_throw_internal (insn))
35679 continue;
35680
35681 /* Do not separate calls from their debug information. */
35682 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
35683 if (NOTE_P (next)
35684 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
35685 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
35686 insn = next;
35687 else
35688 break;
35689
35690 emit_insn_after (gen_nops (const1_rtx), insn);
35691 }
35692 }
35693
35694 /* Implement machine specific optimizations. We implement padding of returns
35695 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
35696 static void
35697 ix86_reorg (void)
35698 {
35699 /* We are freeing block_for_insn in the toplev to keep compatibility
35700 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35701 compute_bb_for_insn ();
35702
35703 if (TARGET_SEH && current_function_has_exception_handlers ())
35704 ix86_seh_fixup_eh_fallthru ();
35705
35706 if (optimize && optimize_function_for_speed_p (cfun))
35707 {
35708 if (TARGET_PAD_SHORT_FUNCTION)
35709 ix86_pad_short_function ();
35710 else if (TARGET_PAD_RETURNS)
35711 ix86_pad_returns ();
35712 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35713 if (TARGET_FOUR_JUMP_LIMIT)
35714 ix86_avoid_jump_mispredicts ();
35715 #endif
35716 }
35717 }
35718
35719 /* Return nonzero when a QImode register that must be represented via a REX
35720 prefix is used. */
35721 bool
35722 x86_extended_QIreg_mentioned_p (rtx insn)
35723 {
35724 int i;
35725 extract_insn_cached (insn);
35726 for (i = 0; i < recog_data.n_operands; i++)
35727 if (GENERAL_REG_P (recog_data.operand[i])
35728 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35729 return true;
35730 return false;
35731 }
35732
35733 /* Return nonzero when P points to a register encoded via a REX prefix.
35734 Called via for_each_rtx. */
35735 static int
35736 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35737 {
35738 unsigned int regno;
35739 if (!REG_P (*p))
35740 return 0;
35741 regno = REGNO (*p);
35742 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35743 }
35744
35745 /* Return true when INSN mentions a register that must be encoded using a
35746 REX prefix. */
35747 bool
35748 x86_extended_reg_mentioned_p (rtx insn)
35749 {
35750 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35751 extended_reg_mentioned_1, NULL);
35752 }
35753
35754 /* If profitable, negate (without causing overflow) integer constant
35755 of mode MODE at location LOC. Return true in this case. */
35756 bool
35757 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35758 {
35759 HOST_WIDE_INT val;
35760
35761 if (!CONST_INT_P (*loc))
35762 return false;
35763
35764 switch (mode)
35765 {
35766 case DImode:
35767 /* DImode x86_64 constants must fit in 32 bits. */
35768 gcc_assert (x86_64_immediate_operand (*loc, mode));
35769
35770 mode = SImode;
35771 break;
35772
35773 case SImode:
35774 case HImode:
35775 case QImode:
35776 break;
35777
35778 default:
35779 gcc_unreachable ();
35780 }
35781
35782 /* Avoid overflows. */
35783 if (mode_signbit_p (mode, *loc))
35784 return false;
35785
35786 val = INTVAL (*loc);
35787
35788 /* Make things pretty and emit `subl $4,%eax' rather than `addl $-4,%eax'.
35789 Exception: -128 encodes smaller than 128, so swap the sign and the operation. */
35790 if ((val < 0 && val != -128)
35791 || val == 128)
35792 {
35793 *loc = GEN_INT (-val);
35794 return true;
35795 }
35796
35797 return false;
35798 }
35799
35800 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35801 optabs would emit if we didn't have TFmode patterns. */
35802
35803 void
35804 x86_emit_floatuns (rtx operands[2])
35805 {
35806 rtx neglab, donelab, i0, i1, f0, in, out;
35807 enum machine_mode mode, inmode;
35808
35809 inmode = GET_MODE (operands[1]);
35810 gcc_assert (inmode == SImode || inmode == DImode);
35811
35812 out = operands[0];
35813 in = force_reg (inmode, operands[1]);
35814 mode = GET_MODE (out);
35815 neglab = gen_label_rtx ();
35816 donelab = gen_label_rtx ();
35817 f0 = gen_reg_rtx (mode);
35818
35819 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35820
35821 expand_float (out, in, 0);
35822
35823 emit_jump_insn (gen_jump (donelab));
35824 emit_barrier ();
35825
35826 emit_label (neglab);
35827
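/* The input has its top bit set.  Halve it, rounding to odd so the
   discarded low bit is not lost, convert the halved value as signed,
   then double the result. */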
35828 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35829 1, OPTAB_DIRECT);
35830 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35831 1, OPTAB_DIRECT);
35832 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35833
35834 expand_float (f0, i0, 0);
35835
35836 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35837
35838 emit_label (donelab);
35839 }
35840 \f
35841 /* AVX2 does support 32-byte integer vector operations,
35842 thus the longest vector we are faced with is V32QImode. */
35843 #define MAX_VECT_LEN 32
35844
35845 struct expand_vec_perm_d
35846 {
35847 rtx target, op0, op1;
35848 unsigned char perm[MAX_VECT_LEN];
35849 enum machine_mode vmode;
35850 unsigned char nelt;
35851 bool one_operand_p;
35852 bool testing_p;
35853 };
35854
35855 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35856 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35857 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35858
35859 /* Get a vector mode of the same size as the original but with elements
35860 twice as wide. This is only guaranteed to apply to integral vectors. */
35861
35862 static inline enum machine_mode
35863 get_mode_wider_vector (enum machine_mode o)
35864 {
35865 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35866 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35867 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35868 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35869 return n;
35870 }
35871
35872 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35873 with all elements equal to VAR. Return true if successful. */
35874
35875 static bool
35876 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35877 rtx target, rtx val)
35878 {
35879 bool ok;
35880
35881 switch (mode)
35882 {
35883 case V2SImode:
35884 case V2SFmode:
35885 if (!mmx_ok)
35886 return false;
35887 /* FALLTHRU */
35888
35889 case V4DFmode:
35890 case V4DImode:
35891 case V8SFmode:
35892 case V8SImode:
35893 case V2DFmode:
35894 case V2DImode:
35895 case V4SFmode:
35896 case V4SImode:
35897 {
35898 rtx insn, dup;
35899
35900 /* First attempt to recognize VAL as-is. */
35901 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35902 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35903 if (recog_memoized (insn) < 0)
35904 {
35905 rtx seq;
35906 /* If that fails, force VAL into a register. */
35907
35908 start_sequence ();
35909 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35910 seq = get_insns ();
35911 end_sequence ();
35912 if (seq)
35913 emit_insn_before (seq, insn);
35914
35915 ok = recog_memoized (insn) >= 0;
35916 gcc_assert (ok);
35917 }
35918 }
35919 return true;
35920
35921 case V4HImode:
35922 if (!mmx_ok)
35923 return false;
35924 if (TARGET_SSE || TARGET_3DNOW_A)
35925 {
35926 rtx x;
35927
35928 val = gen_lowpart (SImode, val);
35929 x = gen_rtx_TRUNCATE (HImode, val);
35930 x = gen_rtx_VEC_DUPLICATE (mode, x);
35931 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35932 return true;
35933 }
35934 goto widen;
35935
35936 case V8QImode:
35937 if (!mmx_ok)
35938 return false;
35939 goto widen;
35940
35941 case V8HImode:
35942 if (TARGET_SSE2)
35943 {
35944 struct expand_vec_perm_d dperm;
35945 rtx tmp1, tmp2;
35946
35947 permute:
35948 memset (&dperm, 0, sizeof (dperm));
35949 dperm.target = target;
35950 dperm.vmode = mode;
35951 dperm.nelt = GET_MODE_NUNITS (mode);
35952 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35953 dperm.one_operand_p = true;
35954
35955 /* Extend to SImode using a paradoxical SUBREG. */
35956 tmp1 = gen_reg_rtx (SImode);
35957 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35958
35959 /* Insert the SImode value as low element of a V4SImode vector. */
35960 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35961 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35962
35963 ok = (expand_vec_perm_1 (&dperm)
35964 || expand_vec_perm_broadcast_1 (&dperm));
35965 gcc_assert (ok);
35966 return ok;
35967 }
35968 goto widen;
35969
35970 case V16QImode:
35971 if (TARGET_SSE2)
35972 goto permute;
35973 goto widen;
35974
35975 widen:
35976 /* Replicate the value once into the next wider mode and recurse. */
35977 {
35978 enum machine_mode smode, wsmode, wvmode;
35979 rtx x;
35980
35981 smode = GET_MODE_INNER (mode);
35982 wvmode = get_mode_wider_vector (mode);
35983 wsmode = GET_MODE_INNER (wvmode);
35984
35985 val = convert_modes (wsmode, smode, val, true);
35986 x = expand_simple_binop (wsmode, ASHIFT, val,
35987 GEN_INT (GET_MODE_BITSIZE (smode)),
35988 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35989 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35990
35991 x = gen_lowpart (wvmode, target);
35992 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35993 gcc_assert (ok);
35994 return ok;
35995 }
35996
35997 case V16HImode:
35998 case V32QImode:
35999 {
36000 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36001 rtx x = gen_reg_rtx (hvmode);
36002
36003 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36004 gcc_assert (ok);
36005
36006 x = gen_rtx_VEC_CONCAT (mode, x, x);
36007 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36008 }
36009 return true;
36010
36011 default:
36012 return false;
36013 }
36014 }
36015
36016 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36017 whose ONE_VAR element is VAR, and other elements are zero. Return true
36018 if successful. */
36019
36020 static bool
36021 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36022 rtx target, rtx var, int one_var)
36023 {
36024 enum machine_mode vsimode;
36025 rtx new_target;
36026 rtx x, tmp;
36027 bool use_vector_set = false;
36028
36029 switch (mode)
36030 {
36031 case V2DImode:
36032 /* For SSE4.1, we normally use vector set. But if the second
36033 element is zero and inter-unit moves are OK, we use movq
36034 instead. */
36035 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36036 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36037 && one_var == 0));
36038 break;
36039 case V16QImode:
36040 case V4SImode:
36041 case V4SFmode:
36042 use_vector_set = TARGET_SSE4_1;
36043 break;
36044 case V8HImode:
36045 use_vector_set = TARGET_SSE2;
36046 break;
36047 case V4HImode:
36048 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36049 break;
36050 case V32QImode:
36051 case V16HImode:
36052 case V8SImode:
36053 case V8SFmode:
36054 case V4DFmode:
36055 use_vector_set = TARGET_AVX;
36056 break;
36057 case V4DImode:
36058 /* Use ix86_expand_vector_set in 64bit mode only. */
36059 use_vector_set = TARGET_AVX && TARGET_64BIT;
36060 break;
36061 default:
36062 break;
36063 }
36064
36065 if (use_vector_set)
36066 {
36067 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36068 var = force_reg (GET_MODE_INNER (mode), var);
36069 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36070 return true;
36071 }
36072
36073 switch (mode)
36074 {
36075 case V2SFmode:
36076 case V2SImode:
36077 if (!mmx_ok)
36078 return false;
36079 /* FALLTHRU */
36080
36081 case V2DFmode:
36082 case V2DImode:
36083 if (one_var != 0)
36084 return false;
36085 var = force_reg (GET_MODE_INNER (mode), var);
36086 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
36087 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36088 return true;
36089
36090 case V4SFmode:
36091 case V4SImode:
36092 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
36093 new_target = gen_reg_rtx (mode);
36094 else
36095 new_target = target;
36096 var = force_reg (GET_MODE_INNER (mode), var);
36097 x = gen_rtx_VEC_DUPLICATE (mode, var);
36098 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
36099 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
36100 if (one_var != 0)
36101 {
36102 /* We need to shuffle the value to the correct position, so
36103 create a new pseudo to store the intermediate result. */
36104
36105 /* With SSE2, we can use the integer shuffle insns. */
36106 if (mode != V4SFmode && TARGET_SSE2)
36107 {
36108 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
36109 const1_rtx,
36110 GEN_INT (one_var == 1 ? 0 : 1),
36111 GEN_INT (one_var == 2 ? 0 : 1),
36112 GEN_INT (one_var == 3 ? 0 : 1)));
36113 if (target != new_target)
36114 emit_move_insn (target, new_target);
36115 return true;
36116 }
36117
36118 /* Otherwise convert the intermediate result to V4SFmode and
36119 use the SSE1 shuffle instructions. */
36120 if (mode != V4SFmode)
36121 {
36122 tmp = gen_reg_rtx (V4SFmode);
36123 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36124 }
36125 else
36126 tmp = new_target;
36127
36128 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36129 const1_rtx,
36130 GEN_INT (one_var == 1 ? 0 : 1),
36131 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36132 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36133
36134 if (mode != V4SFmode)
36135 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36136 else if (tmp != target)
36137 emit_move_insn (target, tmp);
36138 }
36139 else if (target != new_target)
36140 emit_move_insn (target, new_target);
36141 return true;
36142
36143 case V8HImode:
36144 case V16QImode:
36145 vsimode = V4SImode;
36146 goto widen;
36147 case V4HImode:
36148 case V8QImode:
36149 if (!mmx_ok)
36150 return false;
36151 vsimode = V2SImode;
36152 goto widen;
36153 widen:
36154 if (one_var != 0)
36155 return false;
36156
36157 /* Zero extend the variable element to SImode and recurse. */
36158 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36159
36160 x = gen_reg_rtx (vsimode);
36161 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36162 var, one_var))
36163 gcc_unreachable ();
36164
36165 emit_move_insn (target, gen_lowpart (mode, x));
36166 return true;
36167
36168 default:
36169 return false;
36170 }
36171 }
36172
36173 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36174 consisting of the values in VALS. It is known that all elements
36175 except ONE_VAR are constants. Return true if successful. */
36176
36177 static bool
36178 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36179 rtx target, rtx vals, int one_var)
36180 {
36181 rtx var = XVECEXP (vals, 0, one_var);
36182 enum machine_mode wmode;
36183 rtx const_vec, x;
36184
36185 const_vec = copy_rtx (vals);
36186 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36187 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36188
36189 switch (mode)
36190 {
36191 case V2DFmode:
36192 case V2DImode:
36193 case V2SFmode:
36194 case V2SImode:
36195 /* For the two element vectors, it's just as easy to use
36196 the general case. */
36197 return false;
36198
36199 case V4DImode:
36200 /* Use ix86_expand_vector_set in 64bit mode only. */
36201 if (!TARGET_64BIT)
36202 return false;
36203 case V4DFmode:
36204 case V8SFmode:
36205 case V8SImode:
36206 case V16HImode:
36207 case V32QImode:
36208 case V4SFmode:
36209 case V4SImode:
36210 case V8HImode:
36211 case V4HImode:
36212 break;
36213
36214 case V16QImode:
36215 if (TARGET_SSE4_1)
36216 break;
36217 wmode = V8HImode;
36218 goto widen;
36219 case V8QImode:
36220 wmode = V4HImode;
36221 goto widen;
36222 widen:
36223 /* There's no way to set one QImode entry easily. Combine
36224 the variable value with its adjacent constant value, and
36225 promote to an HImode set. */
36226 x = XVECEXP (vals, 0, one_var ^ 1);
36227 if (one_var & 1)
36228 {
36229 var = convert_modes (HImode, QImode, var, true);
36230 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36231 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36232 x = GEN_INT (INTVAL (x) & 0xff);
36233 }
36234 else
36235 {
36236 var = convert_modes (HImode, QImode, var, true);
36237 x = gen_int_mode (INTVAL (x) << 8, HImode);
36238 }
36239 if (x != const0_rtx)
36240 var = expand_simple_binop (HImode, IOR, var, x, var,
36241 1, OPTAB_LIB_WIDEN);
36242
36243 x = gen_reg_rtx (wmode);
36244 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36245 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36246
36247 emit_move_insn (target, gen_lowpart (mode, x));
36248 return true;
36249
36250 default:
36251 return false;
36252 }
36253
36254 emit_move_insn (target, const_vec);
36255 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36256 return true;
36257 }
36258
36259 /* A subroutine of ix86_expand_vector_init_general. Use vector
36260 concatenate to handle the most general case: all values variable,
36261 and none identical. */
36262
36263 static void
36264 ix86_expand_vector_init_concat (enum machine_mode mode,
36265 rtx target, rtx *ops, int n)
36266 {
36267 enum machine_mode cmode, hmode = VOIDmode;
36268 rtx first[8], second[4];
36269 rtvec v;
36270 int i, j;
36271
36272 switch (n)
36273 {
36274 case 2:
36275 switch (mode)
36276 {
36277 case V8SImode:
36278 cmode = V4SImode;
36279 break;
36280 case V8SFmode:
36281 cmode = V4SFmode;
36282 break;
36283 case V4DImode:
36284 cmode = V2DImode;
36285 break;
36286 case V4DFmode:
36287 cmode = V2DFmode;
36288 break;
36289 case V4SImode:
36290 cmode = V2SImode;
36291 break;
36292 case V4SFmode:
36293 cmode = V2SFmode;
36294 break;
36295 case V2DImode:
36296 cmode = DImode;
36297 break;
36298 case V2SImode:
36299 cmode = SImode;
36300 break;
36301 case V2DFmode:
36302 cmode = DFmode;
36303 break;
36304 case V2SFmode:
36305 cmode = SFmode;
36306 break;
36307 default:
36308 gcc_unreachable ();
36309 }
36310
36311 if (!register_operand (ops[1], cmode))
36312 ops[1] = force_reg (cmode, ops[1]);
36313 if (!register_operand (ops[0], cmode))
36314 ops[0] = force_reg (cmode, ops[0]);
36315 emit_insn (gen_rtx_SET (VOIDmode, target,
36316 gen_rtx_VEC_CONCAT (mode, ops[0],
36317 ops[1])));
36318 break;
36319
36320 case 4:
36321 switch (mode)
36322 {
36323 case V4DImode:
36324 cmode = V2DImode;
36325 break;
36326 case V4DFmode:
36327 cmode = V2DFmode;
36328 break;
36329 case V4SImode:
36330 cmode = V2SImode;
36331 break;
36332 case V4SFmode:
36333 cmode = V2SFmode;
36334 break;
36335 default:
36336 gcc_unreachable ();
36337 }
36338 goto half;
36339
36340 case 8:
36341 switch (mode)
36342 {
36343 case V8SImode:
36344 cmode = V2SImode;
36345 hmode = V4SImode;
36346 break;
36347 case V8SFmode:
36348 cmode = V2SFmode;
36349 hmode = V4SFmode;
36350 break;
36351 default:
36352 gcc_unreachable ();
36353 }
36354 goto half;
36355
36356 half:
36357 /* FIXME: We process inputs backward to help RA. PR 36222. */
36358 i = n - 1;
36359 j = (n >> 1) - 1;
36360 for (; i > 0; i -= 2, j--)
36361 {
36362 first[j] = gen_reg_rtx (cmode);
36363 v = gen_rtvec (2, ops[i - 1], ops[i]);
36364 ix86_expand_vector_init (false, first[j],
36365 gen_rtx_PARALLEL (cmode, v));
36366 }
36367
36368 n >>= 1;
36369 if (n > 2)
36370 {
36371 gcc_assert (hmode != VOIDmode);
36372 for (i = j = 0; i < n; i += 2, j++)
36373 {
36374 second[j] = gen_reg_rtx (hmode);
36375 ix86_expand_vector_init_concat (hmode, second [j],
36376 &first [i], 2);
36377 }
36378 n >>= 1;
36379 ix86_expand_vector_init_concat (mode, target, second, n);
36380 }
36381 else
36382 ix86_expand_vector_init_concat (mode, target, first, n);
36383 break;
36384
36385 default:
36386 gcc_unreachable ();
36387 }
36388 }
36389
36390 /* A subroutine of ix86_expand_vector_init_general. Use vector
36391 interleave to handle the most general case: all values variable,
36392 and none identical. */
36393
36394 static void
36395 ix86_expand_vector_init_interleave (enum machine_mode mode,
36396 rtx target, rtx *ops, int n)
36397 {
36398 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36399 int i, j;
36400 rtx op0, op1;
36401 rtx (*gen_load_even) (rtx, rtx, rtx);
36402 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36403 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36404
36405 switch (mode)
36406 {
36407 case V8HImode:
36408 gen_load_even = gen_vec_setv8hi;
36409 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36410 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36411 inner_mode = HImode;
36412 first_imode = V4SImode;
36413 second_imode = V2DImode;
36414 third_imode = VOIDmode;
36415 break;
36416 case V16QImode:
36417 gen_load_even = gen_vec_setv16qi;
36418 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36419 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36420 inner_mode = QImode;
36421 first_imode = V8HImode;
36422 second_imode = V4SImode;
36423 third_imode = V2DImode;
36424 break;
36425 default:
36426 gcc_unreachable ();
36427 }
36428
36429 for (i = 0; i < n; i++)
36430 {
36431 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36432 op0 = gen_reg_rtx (SImode);
36433 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36434
36435 /* Insert the SImode value as low element of V4SImode vector. */
36436 op1 = gen_reg_rtx (V4SImode);
36437 op0 = gen_rtx_VEC_MERGE (V4SImode,
36438 gen_rtx_VEC_DUPLICATE (V4SImode,
36439 op0),
36440 CONST0_RTX (V4SImode),
36441 const1_rtx);
36442 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36443
36444 /* Cast the V4SImode vector back to a vector in the original mode. */
36445 op0 = gen_reg_rtx (mode);
36446 emit_move_insn (op0, gen_lowpart (mode, op1));
36447
36448 /* Load even elements into the second position. */
36449 emit_insn (gen_load_even (op0,
36450 force_reg (inner_mode,
36451 ops [i + i + 1]),
36452 const1_rtx));
36453
36454 /* Cast vector to FIRST_IMODE vector. */
36455 ops[i] = gen_reg_rtx (first_imode);
36456 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36457 }
36458
36459 /* Interleave low FIRST_IMODE vectors. */
36460 for (i = j = 0; i < n; i += 2, j++)
36461 {
36462 op0 = gen_reg_rtx (first_imode);
36463 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36464
36465 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36466 ops[j] = gen_reg_rtx (second_imode);
36467 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36468 }
36469
36470 /* Interleave low SECOND_IMODE vectors. */
36471 switch (second_imode)
36472 {
36473 case V4SImode:
36474 for (i = j = 0; i < n / 2; i += 2, j++)
36475 {
36476 op0 = gen_reg_rtx (second_imode);
36477 emit_insn (gen_interleave_second_low (op0, ops[i],
36478 ops[i + 1]));
36479
36480 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36481 vector. */
36482 ops[j] = gen_reg_rtx (third_imode);
36483 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36484 }
36485 second_imode = V2DImode;
36486 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36487 /* FALLTHRU */
36488
36489 case V2DImode:
36490 op0 = gen_reg_rtx (second_imode);
36491 emit_insn (gen_interleave_second_low (op0, ops[0],
36492 ops[1]));
36493
36494 /* Cast the SECOND_IMODE vector back to a vector in the original
36495 mode. */
36496 emit_insn (gen_rtx_SET (VOIDmode, target,
36497 gen_lowpart (mode, op0)));
36498 break;
36499
36500 default:
36501 gcc_unreachable ();
36502 }
36503 }
36504
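/* For example, for MODE == V8HImode with elements h0 ... h7 the loop
   above packs each pair (h[2i], h[2i+1]) into the low 32 bits of a
   fresh vector, viewed as V4SImode.  The first interleave pass
   (vec_interleave_lowv4si) then combines two such vectors so that four
   elements occupy the low half, and the final V2DImode interleave
   merges the two halves into the complete eight-element result.  */
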
36505 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36506 all values variable, and none identical. */
36507
36508 static void
36509 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36510 rtx target, rtx vals)
36511 {
36512 rtx ops[32], op0, op1;
36513 enum machine_mode half_mode = VOIDmode;
36514 int n, i;
36515
36516 switch (mode)
36517 {
36518 case V2SFmode:
36519 case V2SImode:
36520 if (!mmx_ok && !TARGET_SSE)
36521 break;
36522 /* FALLTHRU */
36523
36524 case V8SFmode:
36525 case V8SImode:
36526 case V4DFmode:
36527 case V4DImode:
36528 case V4SFmode:
36529 case V4SImode:
36530 case V2DFmode:
36531 case V2DImode:
36532 n = GET_MODE_NUNITS (mode);
36533 for (i = 0; i < n; i++)
36534 ops[i] = XVECEXP (vals, 0, i);
36535 ix86_expand_vector_init_concat (mode, target, ops, n);
36536 return;
36537
36538 case V32QImode:
36539 half_mode = V16QImode;
36540 goto half;
36541
36542 case V16HImode:
36543 half_mode = V8HImode;
36544 goto half;
36545
36546 half:
36547 n = GET_MODE_NUNITS (mode);
36548 for (i = 0; i < n; i++)
36549 ops[i] = XVECEXP (vals, 0, i);
36550 op0 = gen_reg_rtx (half_mode);
36551 op1 = gen_reg_rtx (half_mode);
36552 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36553 n >> 2);
36554 ix86_expand_vector_init_interleave (half_mode, op1,
36555 &ops [n >> 1], n >> 2);
36556 emit_insn (gen_rtx_SET (VOIDmode, target,
36557 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36558 return;
36559
36560 case V16QImode:
36561 if (!TARGET_SSE4_1)
36562 break;
36563 /* FALLTHRU */
36564
36565 case V8HImode:
36566 if (!TARGET_SSE2)
36567 break;
36568
36569 /* Don't use ix86_expand_vector_init_interleave if we can't
36570 move from GPR to SSE register directly. */
36571 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
36572 break;
36573
36574 n = GET_MODE_NUNITS (mode);
36575 for (i = 0; i < n; i++)
36576 ops[i] = XVECEXP (vals, 0, i);
36577 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36578 return;
36579
36580 case V4HImode:
36581 case V8QImode:
36582 break;
36583
36584 default:
36585 gcc_unreachable ();
36586 }
36587
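/* Fallback for the remaining 64-bit modes: assemble each word-sized
   chunk of the vector in an integer register with shifts and IORs,
   then transfer the word(s) into the vector register -- directly for
   one word, via the low/high parts for two, or by recursing on a
   V4SImode initializer for four.  */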
36588 {
36589 int i, j, n_elts, n_words, n_elt_per_word;
36590 enum machine_mode inner_mode;
36591 rtx words[4], shift;
36592
36593 inner_mode = GET_MODE_INNER (mode);
36594 n_elts = GET_MODE_NUNITS (mode);
36595 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36596 n_elt_per_word = n_elts / n_words;
36597 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36598
36599 for (i = 0; i < n_words; ++i)
36600 {
36601 rtx word = NULL_RTX;
36602
36603 for (j = 0; j < n_elt_per_word; ++j)
36604 {
36605 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36606 elt = convert_modes (word_mode, inner_mode, elt, true);
36607
36608 if (j == 0)
36609 word = elt;
36610 else
36611 {
36612 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36613 word, 1, OPTAB_LIB_WIDEN);
36614 word = expand_simple_binop (word_mode, IOR, word, elt,
36615 word, 1, OPTAB_LIB_WIDEN);
36616 }
36617 }
36618
36619 words[i] = word;
36620 }
36621
36622 if (n_words == 1)
36623 emit_move_insn (target, gen_lowpart (mode, words[0]));
36624 else if (n_words == 2)
36625 {
36626 rtx tmp = gen_reg_rtx (mode);
36627 emit_clobber (tmp);
36628 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36629 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36630 emit_move_insn (target, tmp);
36631 }
36632 else if (n_words == 4)
36633 {
36634 rtx tmp = gen_reg_rtx (V4SImode);
36635 gcc_assert (word_mode == SImode);
36636 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36637 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36638 emit_move_insn (target, gen_lowpart (mode, tmp));
36639 }
36640 else
36641 gcc_unreachable ();
36642 }
36643 }
36644
36645 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36646 instructions unless MMX_OK is true. */
36647
36648 void
36649 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36650 {
36651 enum machine_mode mode = GET_MODE (target);
36652 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36653 int n_elts = GET_MODE_NUNITS (mode);
36654 int n_var = 0, one_var = -1;
36655 bool all_same = true, all_const_zero = true;
36656 int i;
36657 rtx x;
36658
36659 for (i = 0; i < n_elts; ++i)
36660 {
36661 x = XVECEXP (vals, 0, i);
36662 if (!(CONST_INT_P (x)
36663 || GET_CODE (x) == CONST_DOUBLE
36664 || GET_CODE (x) == CONST_FIXED))
36665 n_var++, one_var = i;
36666 else if (x != CONST0_RTX (inner_mode))
36667 all_const_zero = false;
36668 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36669 all_same = false;
36670 }
36671
36672 /* Constants are best loaded from the constant pool. */
36673 if (n_var == 0)
36674 {
36675 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36676 return;
36677 }
36678
36679 /* If all values are identical, broadcast the value. */
36680 if (all_same
36681 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36682 XVECEXP (vals, 0, 0)))
36683 return;
36684
36685 /* Values where only one field is non-constant are best loaded from
36686 the pool and overwritten via move later. */
36687 if (n_var == 1)
36688 {
36689 if (all_const_zero
36690 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36691 XVECEXP (vals, 0, one_var),
36692 one_var))
36693 return;
36694
36695 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36696 return;
36697 }
36698
36699 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36700 }
36701
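/* Store the scalar value VAL into element ELT of vector TARGET.
   Suppress the use of MMX instructions unless MMX_OK is true.  */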
36702 void
36703 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36704 {
36705 enum machine_mode mode = GET_MODE (target);
36706 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36707 enum machine_mode half_mode;
36708 bool use_vec_merge = false;
36709 rtx tmp;
36710 static rtx (*gen_extract[6][2]) (rtx, rtx)
36711 = {
36712 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36713 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36714 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36715 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36716 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36717 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36718 };
36719 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36720 = {
36721 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36722 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36723 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36724 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36725 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36726 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36727 };
36728 int i, j, n;
36729
36730 switch (mode)
36731 {
36732 case V2SFmode:
36733 case V2SImode:
36734 if (mmx_ok)
36735 {
36736 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36737 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36738 if (elt == 0)
36739 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36740 else
36741 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36742 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36743 return;
36744 }
36745 break;
36746
36747 case V2DImode:
36748 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36749 if (use_vec_merge)
36750 break;
36751
36752 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36753 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36754 if (elt == 0)
36755 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36756 else
36757 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36758 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36759 return;
36760
36761 case V2DFmode:
36762 {
36763 rtx op0, op1;
36764
36765 /* For the two element vectors, we implement a VEC_CONCAT with
36766 the extraction of the other element. */
36767
36768 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36769 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36770
36771 if (elt == 0)
36772 op0 = val, op1 = tmp;
36773 else
36774 op0 = tmp, op1 = val;
36775
36776 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36777 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36778 }
36779 return;
36780
36781 case V4SFmode:
36782 use_vec_merge = TARGET_SSE4_1;
36783 if (use_vec_merge)
36784 break;
36785
36786 switch (elt)
36787 {
36788 case 0:
36789 use_vec_merge = true;
36790 break;
36791
36792 case 1:
36793 /* tmp = target = A B C D */
36794 tmp = copy_to_reg (target);
36795 /* target = A A B B */
36796 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36797 /* target = X A B B */
36798 ix86_expand_vector_set (false, target, val, 0);
36799 /* target = A X C D */
36800 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36801 const1_rtx, const0_rtx,
36802 GEN_INT (2+4), GEN_INT (3+4)));
36803 return;
36804
36805 case 2:
36806 /* tmp = target = A B C D */
36807 tmp = copy_to_reg (target);
36808 /* tmp = X B C D */
36809 ix86_expand_vector_set (false, tmp, val, 0);
36810 /* target = A B X D */
36811 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36812 const0_rtx, const1_rtx,
36813 GEN_INT (0+4), GEN_INT (3+4)));
36814 return;
36815
36816 case 3:
36817 /* tmp = target = A B C D */
36818 tmp = copy_to_reg (target);
36819 /* tmp = X B C D */
36820 ix86_expand_vector_set (false, tmp, val, 0);
36821 /* target = A B C X */
36822 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36823 const0_rtx, const1_rtx,
36824 GEN_INT (2+4), GEN_INT (0+4)));
36825 return;
36826
36827 default:
36828 gcc_unreachable ();
36829 }
36830 break;
36831
36832 case V4SImode:
36833 use_vec_merge = TARGET_SSE4_1;
36834 if (use_vec_merge)
36835 break;
36836
36837 /* Element 0 handled by vec_merge below. */
36838 if (elt == 0)
36839 {
36840 use_vec_merge = true;
36841 break;
36842 }
36843
36844 if (TARGET_SSE2)
36845 {
36846 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36847 store into element 0, then shuffle them back. */
36848
36849 rtx order[4];
36850
36851 order[0] = GEN_INT (elt);
36852 order[1] = const1_rtx;
36853 order[2] = const2_rtx;
36854 order[3] = GEN_INT (3);
36855 order[elt] = const0_rtx;
36856
36857 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36858 order[1], order[2], order[3]));
36859
36860 ix86_expand_vector_set (false, target, val, 0);
36861
36862 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36863 order[1], order[2], order[3]));
36864 }
36865 else
36866 {
36867 /* For SSE1, we have to reuse the V4SF code. */
36868 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36869 gen_lowpart (SFmode, val), elt);
36870 }
36871 return;
36872
36873 case V8HImode:
36874 use_vec_merge = TARGET_SSE2;
36875 break;
36876 case V4HImode:
36877 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36878 break;
36879
36880 case V16QImode:
36881 use_vec_merge = TARGET_SSE4_1;
36882 break;
36883
36884 case V8QImode:
36885 break;
36886
36887 case V32QImode:
36888 half_mode = V16QImode;
36889 j = 0;
36890 n = 16;
36891 goto half;
36892
36893 case V16HImode:
36894 half_mode = V8HImode;
36895 j = 1;
36896 n = 8;
36897 goto half;
36898
36899 case V8SImode:
36900 half_mode = V4SImode;
36901 j = 2;
36902 n = 4;
36903 goto half;
36904
36905 case V4DImode:
36906 half_mode = V2DImode;
36907 j = 3;
36908 n = 2;
36909 goto half;
36910
36911 case V8SFmode:
36912 half_mode = V4SFmode;
36913 j = 4;
36914 n = 4;
36915 goto half;
36916
36917 case V4DFmode:
36918 half_mode = V2DFmode;
36919 j = 5;
36920 n = 2;
36921 goto half;
36922
36923 half:
36924 /* Compute offset. */
36925 i = elt / n;
36926 elt %= n;
36927
36928 gcc_assert (i <= 1);
36929
36930 /* Extract the half. */
36931 tmp = gen_reg_rtx (half_mode);
36932 emit_insn (gen_extract[j][i] (tmp, target));
36933
36934 /* Put val in tmp at elt. */
36935 ix86_expand_vector_set (false, tmp, val, elt);
36936
36937 /* Put it back. */
36938 emit_insn (gen_insert[j][i] (target, target, tmp));
36939 return;
36940
36941 default:
36942 break;
36943 }
36944
36945 if (use_vec_merge)
36946 {
36947 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36948 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36949 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36950 }
36951 else
36952 {
36953 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36954
36955 emit_move_insn (mem, target);
36956
36957 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36958 emit_move_insn (tmp, val);
36959
36960 emit_move_insn (target, mem);
36961 }
36962 }
36963
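/* Extract element ELT of vector VEC into scalar TARGET.  Suppress the
   use of MMX instructions unless MMX_OK is true.  */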
36964 void
36965 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36966 {
36967 enum machine_mode mode = GET_MODE (vec);
36968 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36969 bool use_vec_extr = false;
36970 rtx tmp;
36971
36972 switch (mode)
36973 {
36974 case V2SImode:
36975 case V2SFmode:
36976 if (!mmx_ok)
36977 break;
36978 /* FALLTHRU */
36979
36980 case V2DFmode:
36981 case V2DImode:
36982 use_vec_extr = true;
36983 break;
36984
36985 case V4SFmode:
36986 use_vec_extr = TARGET_SSE4_1;
36987 if (use_vec_extr)
36988 break;
36989
36990 switch (elt)
36991 {
36992 case 0:
36993 tmp = vec;
36994 break;
36995
36996 case 1:
36997 case 3:
36998 tmp = gen_reg_rtx (mode);
36999 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37000 GEN_INT (elt), GEN_INT (elt),
37001 GEN_INT (elt+4), GEN_INT (elt+4)));
37002 break;
37003
37004 case 2:
37005 tmp = gen_reg_rtx (mode);
37006 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37007 break;
37008
37009 default:
37010 gcc_unreachable ();
37011 }
37012 vec = tmp;
37013 use_vec_extr = true;
37014 elt = 0;
37015 break;
37016
37017 case V4SImode:
37018 use_vec_extr = TARGET_SSE4_1;
37019 if (use_vec_extr)
37020 break;
37021
37022 if (TARGET_SSE2)
37023 {
37024 switch (elt)
37025 {
37026 case 0:
37027 tmp = vec;
37028 break;
37029
37030 case 1:
37031 case 3:
37032 tmp = gen_reg_rtx (mode);
37033 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37034 GEN_INT (elt), GEN_INT (elt),
37035 GEN_INT (elt), GEN_INT (elt)));
37036 break;
37037
37038 case 2:
37039 tmp = gen_reg_rtx (mode);
37040 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37041 break;
37042
37043 default:
37044 gcc_unreachable ();
37045 }
37046 vec = tmp;
37047 use_vec_extr = true;
37048 elt = 0;
37049 }
37050 else
37051 {
37052 /* For SSE1, we have to reuse the V4SF code. */
37053 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37054 gen_lowpart (V4SFmode, vec), elt);
37055 return;
37056 }
37057 break;
37058
37059 case V8HImode:
37060 use_vec_extr = TARGET_SSE2;
37061 break;
37062 case V4HImode:
37063 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37064 break;
37065
37066 case V16QImode:
37067 use_vec_extr = TARGET_SSE4_1;
37068 break;
37069
37070 case V8SFmode:
37071 if (TARGET_AVX)
37072 {
37073 tmp = gen_reg_rtx (V4SFmode);
37074 if (elt < 4)
37075 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37076 else
37077 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37078 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37079 return;
37080 }
37081 break;
37082
37083 case V4DFmode:
37084 if (TARGET_AVX)
37085 {
37086 tmp = gen_reg_rtx (V2DFmode);
37087 if (elt < 2)
37088 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
37089 else
37090 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
37091 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37092 return;
37093 }
37094 break;
37095
37096 case V32QImode:
37097 if (TARGET_AVX)
37098 {
37099 tmp = gen_reg_rtx (V16QImode);
37100 if (elt < 16)
37101 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
37102 else
37103 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
37104 ix86_expand_vector_extract (false, target, tmp, elt & 15);
37105 return;
37106 }
37107 break;
37108
37109 case V16HImode:
37110 if (TARGET_AVX)
37111 {
37112 tmp = gen_reg_rtx (V8HImode);
37113 if (elt < 8)
37114 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
37115 else
37116 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
37117 ix86_expand_vector_extract (false, target, tmp, elt & 7);
37118 return;
37119 }
37120 break;
37121
37122 case V8SImode:
37123 if (TARGET_AVX)
37124 {
37125 tmp = gen_reg_rtx (V4SImode);
37126 if (elt < 4)
37127 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37128 else
37129 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37130 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37131 return;
37132 }
37133 break;
37134
37135 case V4DImode:
37136 if (TARGET_AVX)
37137 {
37138 tmp = gen_reg_rtx (V2DImode);
37139 if (elt < 2)
37140 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37141 else
37142 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37143 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37144 return;
37145 }
37146 break;
37147
37148 case V8QImode:
37149 /* ??? Could extract the appropriate HImode element and shift. */
37150 default:
37151 break;
37152 }
37153
37154 if (use_vec_extr)
37155 {
37156 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37157 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37158
37159 /* Let the rtl optimizers know about the zero extension performed. */
37160 if (inner_mode == QImode || inner_mode == HImode)
37161 {
37162 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37163 target = gen_lowpart (SImode, target);
37164 }
37165
37166 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37167 }
37168 else
37169 {
37170 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37171
37172 emit_move_insn (mem, vec);
37173
37174 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37175 emit_move_insn (target, tmp);
37176 }
37177 }
37178
37179 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37180 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37181 The upper bits of DEST are undefined, though they shouldn't cause
37182 exceptions (some bits from src or all zeros are ok). */
37183
37184 static void
37185 emit_reduc_half (rtx dest, rtx src, int i)
37186 {
37187 rtx tem;
37188 switch (GET_MODE (src))
37189 {
37190 case V4SFmode:
37191 if (i == 128)
37192 tem = gen_sse_movhlps (dest, src, src);
37193 else
37194 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37195 GEN_INT (1 + 4), GEN_INT (1 + 4));
37196 break;
37197 case V2DFmode:
37198 tem = gen_vec_interleave_highv2df (dest, src, src);
37199 break;
37200 case V16QImode:
37201 case V8HImode:
37202 case V4SImode:
37203 case V2DImode:
37204 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37205 gen_lowpart (V1TImode, src),
37206 GEN_INT (i / 2));
37207 break;
37208 case V8SFmode:
37209 if (i == 256)
37210 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37211 else
37212 tem = gen_avx_shufps256 (dest, src, src,
37213 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37214 break;
37215 case V4DFmode:
37216 if (i == 256)
37217 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37218 else
37219 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37220 break;
37221 case V32QImode:
37222 case V16HImode:
37223 case V8SImode:
37224 case V4DImode:
37225 if (i == 256)
37226 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37227 gen_lowpart (V4DImode, src),
37228 gen_lowpart (V4DImode, src),
37229 const1_rtx);
37230 else
37231 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37232 gen_lowpart (V2TImode, src),
37233 GEN_INT (i / 2));
37234 break;
37235 default:
37236 gcc_unreachable ();
37237 }
37238 emit_insn (tem);
37239 }
37240
37241 /* Expand a vector reduction. FN is the binary pattern to reduce;
37242 DEST is the destination; IN is the input vector. */
37243
37244 void
37245 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37246 {
37247 rtx half, dst, vec = in;
37248 enum machine_mode mode = GET_MODE (in);
37249 int i;
37250
37251 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37252 if (TARGET_SSE4_1
37253 && mode == V8HImode
37254 && fn == gen_uminv8hi3)
37255 {
37256 emit_insn (gen_sse4_1_phminposuw (dest, in));
37257 return;
37258 }
37259
37260 for (i = GET_MODE_BITSIZE (mode);
37261 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37262 i >>= 1)
37263 {
37264 half = gen_reg_rtx (mode);
37265 emit_reduc_half (half, vec, i);
37266 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37267 dst = dest;
37268 else
37269 dst = gen_reg_rtx (mode);
37270 emit_insn (fn (dst, half, vec));
37271 vec = dst;
37272 }
37273 }
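
/* For example, reducing a V4SImode vector { v0, v1, v2, v3 } with FN
   being a maximum pattern takes two passes of the loop above:

     i == 128:  half = { v2, v3, --, -- }
                dst  = { max(v0,v2), max(v1,v3), --, -- }
     i == 64:   half = { max(v1,v3), --, --, -- }
                dest = { max(v0,v1,v2,v3), --, --, -- }

   so the reduction ends up in element 0 of DEST and the remaining
   elements are unspecified.  */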
37274 \f
37275 /* Target hook for scalar_mode_supported_p. */
37276 static bool
37277 ix86_scalar_mode_supported_p (enum machine_mode mode)
37278 {
37279 if (DECIMAL_FLOAT_MODE_P (mode))
37280 return default_decimal_float_supported_p ();
37281 else if (mode == TFmode)
37282 return true;
37283 else
37284 return default_scalar_mode_supported_p (mode);
37285 }
37286
37287 /* Implements target hook vector_mode_supported_p. */
37288 static bool
37289 ix86_vector_mode_supported_p (enum machine_mode mode)
37290 {
37291 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37292 return true;
37293 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37294 return true;
37295 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37296 return true;
37297 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37298 return true;
37299 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37300 return true;
37301 return false;
37302 }
37303
37304 /* Target hook for c_mode_for_suffix. */
37305 static enum machine_mode
37306 ix86_c_mode_for_suffix (char suffix)
37307 {
37308 if (suffix == 'q')
37309 return TFmode;
37310 if (suffix == 'w')
37311 return XFmode;
37312
37313 return VOIDmode;
37314 }
37315
37316 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37317
37318 We do this in the new i386 backend to maintain source compatibility
37319 with the old cc0-based compiler. */
37320
37321 static tree
37322 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37323 tree inputs ATTRIBUTE_UNUSED,
37324 tree clobbers)
37325 {
37326 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37327 clobbers);
37328 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37329 clobbers);
37330 return clobbers;
37331 }
37332
37333 /* Implements target vector targetm.asm.encode_section_info. */
37334
37335 static void ATTRIBUTE_UNUSED
37336 ix86_encode_section_info (tree decl, rtx rtl, int first)
37337 {
37338 default_encode_section_info (decl, rtl, first);
37339
37340 if (TREE_CODE (decl) == VAR_DECL
37341 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37342 && ix86_in_large_data_p (decl))
37343 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37344 }
37345
37346 /* Worker function for REVERSE_CONDITION. */
37347
37348 enum rtx_code
37349 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37350 {
37351 return (mode != CCFPmode && mode != CCFPUmode
37352 ? reverse_condition (code)
37353 : reverse_condition_maybe_unordered (code));
37354 }
37355
37356 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37357 to OPERANDS[0]. */
37358
37359 const char *
37360 output_387_reg_move (rtx insn, rtx *operands)
37361 {
37362 if (REG_P (operands[0]))
37363 {
37364 if (REG_P (operands[1])
37365 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37366 {
37367 if (REGNO (operands[0]) == FIRST_STACK_REG)
37368 return output_387_ffreep (operands, 0);
37369 return "fstp\t%y0";
37370 }
37371 if (STACK_TOP_P (operands[0]))
37372 return "fld%Z1\t%y1";
37373 return "fst\t%y0";
37374 }
37375 else if (MEM_P (operands[0]))
37376 {
37377 gcc_assert (REG_P (operands[1]));
37378 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37379 return "fstp%Z0\t%y0";
37380 else
37381 {
37382 /* There is no non-popping store to memory for XFmode.
37383 So if we need one, follow the store with a load. */
37384 if (GET_MODE (operands[0]) == XFmode)
37385 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37386 else
37387 return "fst%Z0\t%y0";
37388 }
37389 }
37390 else
37391 gcc_unreachable();
37392 }
37393
37394 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37395 FP status register is set. */
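/* The status word is fetched with fnstsw.  When SAHF is available and
   profitable it is copied into the CPU flags and the unordered
   condition is tested; otherwise the high byte of the status word is
   tested against 0x04, the bit corresponding to C2.  */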
37396
37397 void
37398 ix86_emit_fp_unordered_jump (rtx label)
37399 {
37400 rtx reg = gen_reg_rtx (HImode);
37401 rtx temp;
37402
37403 emit_insn (gen_x86_fnstsw_1 (reg));
37404
37405 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37406 {
37407 emit_insn (gen_x86_sahf_1 (reg));
37408
37409 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37410 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37411 }
37412 else
37413 {
37414 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37415
37416 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37417 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37418 }
37419
37420 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37421 gen_rtx_LABEL_REF (VOIDmode, label),
37422 pc_rtx);
37423 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37424
37425 emit_jump_insn (temp);
37426 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37427 }
37428
37429 /* Output code to perform a log1p XFmode calculation. */
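/* The expansion computes log1p (x) = ln (1 + x) with the i387
   logarithm instructions.  For |x| below 1 - sqrt(2)/2 (roughly
   0.29289, the constant tested below) it uses fyl2xp1, which is the
   accurate form for small arguments, giving ln2 * log2 (1 + x);
   otherwise it forms 1 + x explicitly and uses fyl2x.  */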
37430
37431 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37432 {
37433 rtx label1 = gen_label_rtx ();
37434 rtx label2 = gen_label_rtx ();
37435
37436 rtx tmp = gen_reg_rtx (XFmode);
37437 rtx tmp2 = gen_reg_rtx (XFmode);
37438 rtx test;
37439
37440 emit_insn (gen_absxf2 (tmp, op1));
37441 test = gen_rtx_GE (VOIDmode, tmp,
37442 CONST_DOUBLE_FROM_REAL_VALUE (
37443 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37444 XFmode));
37445 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37446
37447 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37448 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37449 emit_jump (label2);
37450
37451 emit_label (label1);
37452 emit_move_insn (tmp, CONST1_RTX (XFmode));
37453 emit_insn (gen_addxf3 (tmp, op1, tmp));
37454 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37455 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37456
37457 emit_label (label2);
37458 }
37459
37460 /* Emit code for round calculation. */
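/* The sequence below computes round (a) = sgn (a) * floor (|a| + 0.5)
   on the i387: fxam records the sign of OP1 in the status word,
   floor (|OP1| + 0.5) is computed with a floor-mode frndint (or lfloor
   for the integer output modes), and the result is negated at the end
   when the saved sign bit was set.  */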
37461 void ix86_emit_i387_round (rtx op0, rtx op1)
37462 {
37463 enum machine_mode inmode = GET_MODE (op1);
37464 enum machine_mode outmode = GET_MODE (op0);
37465 rtx e1, e2, res, tmp, tmp1, half;
37466 rtx scratch = gen_reg_rtx (HImode);
37467 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37468 rtx jump_label = gen_label_rtx ();
37469 rtx insn;
37470 rtx (*gen_abs) (rtx, rtx);
37471 rtx (*gen_neg) (rtx, rtx);
37472
37473 switch (inmode)
37474 {
37475 case SFmode:
37476 gen_abs = gen_abssf2;
37477 break;
37478 case DFmode:
37479 gen_abs = gen_absdf2;
37480 break;
37481 case XFmode:
37482 gen_abs = gen_absxf2;
37483 break;
37484 default:
37485 gcc_unreachable ();
37486 }
37487
37488 switch (outmode)
37489 {
37490 case SFmode:
37491 gen_neg = gen_negsf2;
37492 break;
37493 case DFmode:
37494 gen_neg = gen_negdf2;
37495 break;
37496 case XFmode:
37497 gen_neg = gen_negxf2;
37498 break;
37499 case HImode:
37500 gen_neg = gen_neghi2;
37501 break;
37502 case SImode:
37503 gen_neg = gen_negsi2;
37504 break;
37505 case DImode:
37506 gen_neg = gen_negdi2;
37507 break;
37508 default:
37509 gcc_unreachable ();
37510 }
37511
37512 e1 = gen_reg_rtx (inmode);
37513 e2 = gen_reg_rtx (inmode);
37514 res = gen_reg_rtx (outmode);
37515
37516 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37517
37518 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37519
37520 /* scratch = fxam(op1) */
37521 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37522 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37523 UNSPEC_FXAM)));
37524 /* e1 = fabs(op1) */
37525 emit_insn (gen_abs (e1, op1));
37526
37527 /* e2 = e1 + 0.5 */
37528 half = force_reg (inmode, half);
37529 emit_insn (gen_rtx_SET (VOIDmode, e2,
37530 gen_rtx_PLUS (inmode, e1, half)));
37531
37532 /* res = floor(e2) */
37533 if (inmode != XFmode)
37534 {
37535 tmp1 = gen_reg_rtx (XFmode);
37536
37537 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37538 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37539 }
37540 else
37541 tmp1 = e2;
37542
37543 switch (outmode)
37544 {
37545 case SFmode:
37546 case DFmode:
37547 {
37548 rtx tmp0 = gen_reg_rtx (XFmode);
37549
37550 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37551
37552 emit_insn (gen_rtx_SET (VOIDmode, res,
37553 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37554 UNSPEC_TRUNC_NOOP)));
37555 }
37556 break;
37557 case XFmode:
37558 emit_insn (gen_frndintxf2_floor (res, tmp1));
37559 break;
37560 case HImode:
37561 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37562 break;
37563 case SImode:
37564 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37565 break;
37566 case DImode:
37567 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37568 break;
37569 default:
37570 gcc_unreachable ();
37571 }
37572
37573 /* flags = signbit(a) */
37574 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37575
37576 /* if (flags) then res = -res */
37577 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37578 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37579 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37580 pc_rtx);
37581 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37582 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37583 JUMP_LABEL (insn) = jump_label;
37584
37585 emit_insn (gen_neg (res, res));
37586
37587 emit_label (jump_label);
37588 LABEL_NUSES (jump_label) = 1;
37589
37590 emit_move_insn (op0, res);
37591 }
37592
37593 /* Output code to perform a Newton-Raphson approximation of a single precision
37594 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
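/* One Newton-Raphson step refines a reciprocal estimate x0 of 1/b into
   x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, which is the
   (rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b)) form used below; the
   quotient is then a * x1.  The initial estimate x0 comes from the
   hardware rcpss/rcpps approximation (UNSPEC_RCP).  */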
37595
37596 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37597 {
37598 rtx x0, x1, e0, e1;
37599
37600 x0 = gen_reg_rtx (mode);
37601 e0 = gen_reg_rtx (mode);
37602 e1 = gen_reg_rtx (mode);
37603 x1 = gen_reg_rtx (mode);
37604
37605 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37606
37607 b = force_reg (mode, b);
37608
37609 /* x0 = rcp(b) estimate */
37610 emit_insn (gen_rtx_SET (VOIDmode, x0,
37611 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37612 UNSPEC_RCP)));
37613 /* e0 = x0 * b */
37614 emit_insn (gen_rtx_SET (VOIDmode, e0,
37615 gen_rtx_MULT (mode, x0, b)));
37616
37617 /* e0 = x0 * e0 */
37618 emit_insn (gen_rtx_SET (VOIDmode, e0,
37619 gen_rtx_MULT (mode, x0, e0)));
37620
37621 /* e1 = x0 + x0 */
37622 emit_insn (gen_rtx_SET (VOIDmode, e1,
37623 gen_rtx_PLUS (mode, x0, x0)));
37624
37625 /* x1 = e1 - e0 */
37626 emit_insn (gen_rtx_SET (VOIDmode, x1,
37627 gen_rtx_MINUS (mode, e1, e0)));
37628
37629 /* res = a * x1 */
37630 emit_insn (gen_rtx_SET (VOIDmode, res,
37631 gen_rtx_MULT (mode, a, x1)));
37632 }
37633
37634 /* Output code to perform a Newton-Raphson approximation of a
37635 single precision floating point [reciprocal] square root. */
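/* One Newton-Raphson step refines an estimate x0 of 1/sqrt(a) into
   x1 = 0.5 * x0 * (3 - a * x0 * x0), rewritten below as
   -0.5 * x0 * (a * x0 * x0 - 3).  Using a * x0 in place of x0 as the
   outer factor gives sqrt(a) = a * rsqrt(a) instead.  The initial
   estimate comes from the hardware rsqrtss/rsqrtps approximation
   (UNSPEC_RSQRT).  */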
37636
37637 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37638 bool recip)
37639 {
37640 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37641 REAL_VALUE_TYPE r;
37642
37643 x0 = gen_reg_rtx (mode);
37644 e0 = gen_reg_rtx (mode);
37645 e1 = gen_reg_rtx (mode);
37646 e2 = gen_reg_rtx (mode);
37647 e3 = gen_reg_rtx (mode);
37648
37649 real_from_integer (&r, VOIDmode, -3, -1, 0);
37650 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37651
37652 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37653 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37654
37655 if (VECTOR_MODE_P (mode))
37656 {
37657 mthree = ix86_build_const_vector (mode, true, mthree);
37658 mhalf = ix86_build_const_vector (mode, true, mhalf);
37659 }
37660
37661 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37662 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37663
37664 a = force_reg (mode, a);
37665
37666 /* x0 = rsqrt(a) estimate */
37667 emit_insn (gen_rtx_SET (VOIDmode, x0,
37668 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37669 UNSPEC_RSQRT)));
37670
37671 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
37672 if (!recip)
37673 {
37674 rtx zero, mask;
37675
37676 zero = gen_reg_rtx (mode);
37677 mask = gen_reg_rtx (mode);
37678
37679 zero = force_reg (mode, CONST0_RTX(mode));
37680 emit_insn (gen_rtx_SET (VOIDmode, mask,
37681 gen_rtx_NE (mode, zero, a)));
37682
37683 emit_insn (gen_rtx_SET (VOIDmode, x0,
37684 gen_rtx_AND (mode, x0, mask)));
37685 }
37686
37687 /* e0 = x0 * a */
37688 emit_insn (gen_rtx_SET (VOIDmode, e0,
37689 gen_rtx_MULT (mode, x0, a)));
37690 /* e1 = e0 * x0 */
37691 emit_insn (gen_rtx_SET (VOIDmode, e1,
37692 gen_rtx_MULT (mode, e0, x0)));
37693
37694 /* e2 = e1 - 3. */
37695 mthree = force_reg (mode, mthree);
37696 emit_insn (gen_rtx_SET (VOIDmode, e2,
37697 gen_rtx_PLUS (mode, e1, mthree)));
37698
37699 mhalf = force_reg (mode, mhalf);
37700 if (recip)
37701 /* e3 = -.5 * x0 */
37702 emit_insn (gen_rtx_SET (VOIDmode, e3,
37703 gen_rtx_MULT (mode, x0, mhalf)));
37704 else
37705 /* e3 = -.5 * e0 */
37706 emit_insn (gen_rtx_SET (VOIDmode, e3,
37707 gen_rtx_MULT (mode, e0, mhalf)));
37708 /* ret = e2 * e3 */
37709 emit_insn (gen_rtx_SET (VOIDmode, res,
37710 gen_rtx_MULT (mode, e2, e3)));
37711 }
37712
37713 #ifdef TARGET_SOLARIS
37714 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37715
37716 static void
37717 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37718 tree decl)
37719 {
37720 /* With Binutils 2.15, the "@unwind" marker must be specified on
37721 every occurrence of the ".eh_frame" section, not just the first
37722 one. */
37723 if (TARGET_64BIT
37724 && strcmp (name, ".eh_frame") == 0)
37725 {
37726 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37727 flags & SECTION_WRITE ? "aw" : "a");
37728 return;
37729 }
37730
37731 #ifndef USE_GAS
37732 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37733 {
37734 solaris_elf_asm_comdat_section (name, flags, decl);
37735 return;
37736 }
37737 #endif
37738
37739 default_elf_asm_named_section (name, flags, decl);
37740 }
37741 #endif /* TARGET_SOLARIS */
37742
37743 /* Return the mangling of TYPE if it is an extended fundamental type. */
37744
37745 static const char *
37746 ix86_mangle_type (const_tree type)
37747 {
37748 type = TYPE_MAIN_VARIANT (type);
37749
37750 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37751 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37752 return NULL;
37753
37754 switch (TYPE_MODE (type))
37755 {
37756 case TFmode:
37757 /* __float128 is "g". */
37758 return "g";
37759 case XFmode:
37760 /* "long double" or __float80 is "e". */
37761 return "e";
37762 default:
37763 return NULL;
37764 }
37765 }
37766
37767 /* For 32-bit code we can save PIC register setup by using
37768 __stack_chk_fail_local hidden function instead of calling
37769 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
37770 register, so it is better to call __stack_chk_fail directly. */
37771
37772 static tree ATTRIBUTE_UNUSED
37773 ix86_stack_protect_fail (void)
37774 {
37775 return TARGET_64BIT
37776 ? default_external_stack_protect_fail ()
37777 : default_hidden_stack_protect_fail ();
37778 }
37779
37780 /* Select a format to encode pointers in exception handling data. CODE
37781 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37782 true if the symbol may be affected by dynamic relocations.
37783
37784 ??? All x86 object file formats are capable of representing this.
37785 After all, the relocation needed is the same as for the call insn.
37786 Whether or not a particular assembler allows us to enter such, I
37787 guess we'll have to see. */
37788 int
37789 asm_preferred_eh_data_format (int code, int global)
37790 {
37791 if (flag_pic)
37792 {
37793 int type = DW_EH_PE_sdata8;
37794 if (!TARGET_64BIT
37795 || ix86_cmodel == CM_SMALL_PIC
37796 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37797 type = DW_EH_PE_sdata4;
37798 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37799 }
37800 if (ix86_cmodel == CM_SMALL
37801 || (ix86_cmodel == CM_MEDIUM && code))
37802 return DW_EH_PE_udata4;
37803 return DW_EH_PE_absptr;
37804 }
37805 \f
37806 /* Expand copysign from SIGN to the positive value ABS_VALUE
37807 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37808 the sign-bit. */
37809 static void
37810 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37811 {
37812 enum machine_mode mode = GET_MODE (sign);
37813 rtx sgn = gen_reg_rtx (mode);
37814 if (mask == NULL_RTX)
37815 {
37816 enum machine_mode vmode;
37817
37818 if (mode == SFmode)
37819 vmode = V4SFmode;
37820 else if (mode == DFmode)
37821 vmode = V2DFmode;
37822 else
37823 vmode = mode;
37824
37825 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37826 if (!VECTOR_MODE_P (mode))
37827 {
37828 /* We need to generate a scalar mode mask in this case. */
37829 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37830 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37831 mask = gen_reg_rtx (mode);
37832 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37833 }
37834 }
37835 else
37836 mask = gen_rtx_NOT (mode, mask);
37837 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37838 gen_rtx_AND (mode, mask, sign)));
37839 emit_insn (gen_rtx_SET (VOIDmode, result,
37840 gen_rtx_IOR (mode, abs_value, sgn)));
37841 }
37842
37843 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37844 mask for masking out the sign-bit is stored in *SMASK, if that is
37845 non-null. */
37846 static rtx
37847 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37848 {
37849 enum machine_mode vmode, mode = GET_MODE (op0);
37850 rtx xa, mask;
37851
37852 xa = gen_reg_rtx (mode);
37853 if (mode == SFmode)
37854 vmode = V4SFmode;
37855 else if (mode == DFmode)
37856 vmode = V2DFmode;
37857 else
37858 vmode = mode;
37859 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37860 if (!VECTOR_MODE_P (mode))
37861 {
37862 /* We need to generate a scalar mode mask in this case. */
37863 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37864 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37865 mask = gen_reg_rtx (mode);
37866 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37867 }
37868 emit_insn (gen_rtx_SET (VOIDmode, xa,
37869 gen_rtx_AND (mode, op0, mask)));
37870
37871 if (smask)
37872 *smask = mask;
37873
37874 return xa;
37875 }
37876
37877 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37878 swapping the operands if SWAP_OPERANDS is true. The expanded
37879 code is a forward jump to a newly created label in case the
37880 comparison is true. The generated label rtx is returned. */
37881 static rtx
37882 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37883 bool swap_operands)
37884 {
37885 rtx label, tmp;
37886
37887 if (swap_operands)
37888 {
37889 tmp = op0;
37890 op0 = op1;
37891 op1 = tmp;
37892 }
37893
37894 label = gen_label_rtx ();
37895 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37896 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37897 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37898 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37899 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37900 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37901 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37902 JUMP_LABEL (tmp) = label;
37903
37904 return label;
37905 }
37906
37907 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37908 using comparison code CODE. Operands are swapped for the comparison if
37909 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37910 static rtx
37911 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37912 bool swap_operands)
37913 {
37914 rtx (*insn)(rtx, rtx, rtx, rtx);
37915 enum machine_mode mode = GET_MODE (op0);
37916 rtx mask = gen_reg_rtx (mode);
37917
37918 if (swap_operands)
37919 {
37920 rtx tmp = op0;
37921 op0 = op1;
37922 op1 = tmp;
37923 }
37924
37925 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37926
37927 emit_insn (insn (mask, op0, op1,
37928 gen_rtx_fmt_ee (code, mode, op0, op1)));
37929 return mask;
37930 }
37931
37932 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37933 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
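/* Adding this constant to a nonnegative value below it and then
   subtracting it again leaves the value rounded to an integer in the
   current rounding mode: once the sum lands in [2**52, 2**53) (or
   [2**23, 2**24) for SFmode) the format has no fractional bits left.
   The SSE rounding expanders below rely on this trick.  */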
37934 static rtx
37935 ix86_gen_TWO52 (enum machine_mode mode)
37936 {
37937 REAL_VALUE_TYPE TWO52r;
37938 rtx TWO52;
37939
37940 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37941 TWO52 = const_double_from_real_value (TWO52r, mode);
37942 TWO52 = force_reg (mode, TWO52);
37943
37944 return TWO52;
37945 }
37946
37947 /* Expand SSE sequence for computing lround from OP1 storing
37948 into OP0. */
37949 void
37950 ix86_expand_lround (rtx op0, rtx op1)
37951 {
37952 /* C code for the stuff we're doing below:
37953 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37954 return (long)tmp;
37955 */
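/* nextafter (0.5, 0.0) -- the largest representable value strictly
   below 0.5, computed here as 0.5 - 2**(-p-1) -- is used instead of
   0.5 itself so that an input just below a halfway point is not
   pushed up to the next integer by rounding in the addition.  */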
37956 enum machine_mode mode = GET_MODE (op1);
37957 const struct real_format *fmt;
37958 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37959 rtx adj;
37960
37961 /* load nextafter (0.5, 0.0) */
37962 fmt = REAL_MODE_FORMAT (mode);
37963 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37964 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37965
37966 /* adj = copysign (0.5, op1) */
37967 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37968 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37969
37970 /* adj = op1 + adj */
37971 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37972
37973 /* op0 = (imode)adj */
37974 expand_fix (op0, adj, 0);
37975 }
37976
37977 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
37978 into OP0. */
37979 void
37980 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37981 {
37982 /* C code for the stuff we're doing below (for do_floor):
37983 xi = (long)op1;
37984 xi -= (double)xi > op1 ? 1 : 0;
37985 return xi;
37986 */
37987 enum machine_mode fmode = GET_MODE (op1);
37988 enum machine_mode imode = GET_MODE (op0);
37989 rtx ireg, freg, label, tmp;
37990
37991 /* reg = (long)op1 */
37992 ireg = gen_reg_rtx (imode);
37993 expand_fix (ireg, op1, 0);
37994
37995 /* freg = (double)reg */
37996 freg = gen_reg_rtx (fmode);
37997 expand_float (freg, ireg, 0);
37998
37999 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38000 label = ix86_expand_sse_compare_and_jump (UNLE,
38001 freg, op1, !do_floor);
38002 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38003 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38004 emit_move_insn (ireg, tmp);
38005
38006 emit_label (label);
38007 LABEL_NUSES (label) = 1;
38008
38009 emit_move_insn (op0, ireg);
38010 }
38011
38012 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38013 result in OPERAND0. */
38014 void
38015 ix86_expand_rint (rtx operand0, rtx operand1)
38016 {
38017 /* C code for the stuff we're doing below:
38018 xa = fabs (operand1);
38019 if (!isless (xa, 2**52))
38020 return operand1;
38021 xa = xa + 2**52 - 2**52;
38022 return copysign (xa, operand1);
38023 */
38024 enum machine_mode mode = GET_MODE (operand0);
38025 rtx res, xa, label, TWO52, mask;
38026
38027 res = gen_reg_rtx (mode);
38028 emit_move_insn (res, operand1);
38029
38030 /* xa = abs (operand1) */
38031 xa = ix86_expand_sse_fabs (res, &mask);
38032
38033 /* if (!isless (xa, TWO52)) goto label; */
38034 TWO52 = ix86_gen_TWO52 (mode);
38035 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38036
38037 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38038 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38039
38040 ix86_sse_copysign_to_positive (res, xa, res, mask);
38041
38042 emit_label (label);
38043 LABEL_NUSES (label) = 1;
38044
38045 emit_move_insn (operand0, res);
38046 }
38047
38048 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38049 into OPERAND0. */
38050 void
38051 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38052 {
38053 /* C code for the stuff we expand below.
38054 double xa = fabs (x), x2;
38055 if (!isless (xa, TWO52))
38056 return x;
38057 xa = xa + TWO52 - TWO52;
38058 x2 = copysign (xa, x);
38059 Compensate. Floor:
38060 if (x2 > x)
38061 x2 -= 1;
38062 Compensate. Ceil:
38063 if (x2 < x)
38064 x2 -= -1;
38065 return x2;
38066 */
38067 enum machine_mode mode = GET_MODE (operand0);
38068 rtx xa, TWO52, tmp, label, one, res, mask;
38069
38070 TWO52 = ix86_gen_TWO52 (mode);
38071
38072 /* Temporary for holding the result, initialized to the input
38073 operand to ease control flow. */
38074 res = gen_reg_rtx (mode);
38075 emit_move_insn (res, operand1);
38076
38077 /* xa = abs (operand1) */
38078 xa = ix86_expand_sse_fabs (res, &mask);
38079
38080 /* if (!isless (xa, TWO52)) goto label; */
38081 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38082
38083 /* xa = xa + TWO52 - TWO52; */
38084 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38085 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38086
38087 /* xa = copysign (xa, operand1) */
38088 ix86_sse_copysign_to_positive (xa, xa, res, mask);
38089
38090 /* generate 1.0 or -1.0 */
38091 one = force_reg (mode,
38092 const_double_from_real_value (do_floor
38093 ? dconst1 : dconstm1, mode));
38094
38095 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38096 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38097 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38098 gen_rtx_AND (mode, one, tmp)));
38099 /* We always need to subtract here to preserve signed zero. */
38100 tmp = expand_simple_binop (mode, MINUS,
38101 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38102 emit_move_insn (res, tmp);
38103
38104 emit_label (label);
38105 LABEL_NUSES (label) = 1;
38106
38107 emit_move_insn (operand0, res);
38108 }
38109
38110 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38111 into OPERAND0. */
38112 void
38113 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
38114 {
38115 /* C code for the stuff we expand below.
38116 double xa = fabs (x), x2;
38117 if (!isless (xa, TWO52))
38118 return x;
38119 x2 = (double)(long)x;
38120 Compensate. Floor:
38121 if (x2 > x)
38122 x2 -= 1;
38123 Compensate. Ceil:
38124 if (x2 < x)
38125 x2 += 1;
38126 if (HONOR_SIGNED_ZEROS (mode))
38127 return copysign (x2, x);
38128 return x2;
38129 */
38130 enum machine_mode mode = GET_MODE (operand0);
38131 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38132
38133 TWO52 = ix86_gen_TWO52 (mode);
38134
38135 /* Temporary for holding the result, initialized to the input
38136 operand to ease control flow. */
38137 res = gen_reg_rtx (mode);
38138 emit_move_insn (res, operand1);
38139
38140 /* xa = abs (operand1) */
38141 xa = ix86_expand_sse_fabs (res, &mask);
38142
38143 /* if (!isless (xa, TWO52)) goto label; */
38144 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38145
38146 /* xa = (double)(long)x */
38147 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38148 expand_fix (xi, res, 0);
38149 expand_float (xa, xi, 0);
38150
38151 /* generate 1.0 */
38152 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38153
38154 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38155 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38156 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38157 gen_rtx_AND (mode, one, tmp)));
38158 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38159 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38160 emit_move_insn (res, tmp);
38161
38162 if (HONOR_SIGNED_ZEROS (mode))
38163 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38164
38165 emit_label (label);
38166 LABEL_NUSES (label) = 1;
38167
38168 emit_move_insn (operand0, res);
38169 }
38170
38171 /* Expand SSE sequence for computing round from OPERAND1 storing
38172 into OPERAND0. Sequence that works without relying on DImode truncation
38173 via cvttsd2siq that is only available on 64bit targets. */
38174 void
38175 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38176 {
38177 /* C code for the stuff we expand below.
38178 double xa = fabs (x), xa2, x2;
38179 if (!isless (xa, TWO52))
38180 return x;
38181 Using the absolute value and copying back sign makes
38182 -0.0 -> -0.0 correct.
38183 xa2 = xa + TWO52 - TWO52;
38184 Compensate.
38185 dxa = xa2 - xa;
38186 if (dxa <= -0.5)
38187 xa2 += 1;
38188 else if (dxa > 0.5)
38189 xa2 -= 1;
38190 x2 = copysign (xa2, x);
38191 return x2;
38192 */
38193 enum machine_mode mode = GET_MODE (operand0);
38194 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38195
38196 TWO52 = ix86_gen_TWO52 (mode);
38197
38198 /* Temporary for holding the result, initialized to the input
38199 operand to ease control flow. */
38200 res = gen_reg_rtx (mode);
38201 emit_move_insn (res, operand1);
38202
38203 /* xa = abs (operand1) */
38204 xa = ix86_expand_sse_fabs (res, &mask);
38205
38206 /* if (!isless (xa, TWO52)) goto label; */
38207 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38208
38209 /* xa2 = xa + TWO52 - TWO52; */
38210 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38211 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38212
38213 /* dxa = xa2 - xa; */
38214 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38215
38216 /* generate 0.5, 1.0 and -0.5 */
38217 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38218 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38219 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38220 0, OPTAB_DIRECT);
38221
38222 /* Compensate. */
38223 tmp = gen_reg_rtx (mode);
38224 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38225 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38226 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38227 gen_rtx_AND (mode, one, tmp)));
38228 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38229 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38230 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38231 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38232 gen_rtx_AND (mode, one, tmp)));
38233 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38234
38235 /* res = copysign (xa2, operand1) */
38236 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38237
38238 emit_label (label);
38239 LABEL_NUSES (label) = 1;
38240
38241 emit_move_insn (operand0, res);
38242 }
38243
38244 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38245 into OPERAND0. */
38246 void
38247 ix86_expand_trunc (rtx operand0, rtx operand1)
38248 {
38249 /* C code for SSE variant we expand below.
38250 double xa = fabs (x), x2;
38251 if (!isless (xa, TWO52))
38252 return x;
38253 x2 = (double)(long)x;
38254 if (HONOR_SIGNED_ZEROS (mode))
38255 return copysign (x2, x);
38256 return x2;
38257 */
38258 enum machine_mode mode = GET_MODE (operand0);
38259 rtx xa, xi, TWO52, label, res, mask;
38260
38261 TWO52 = ix86_gen_TWO52 (mode);
38262
38263 /* Temporary for holding the result, initialized to the input
38264 operand to ease control flow. */
38265 res = gen_reg_rtx (mode);
38266 emit_move_insn (res, operand1);
38267
38268 /* xa = abs (operand1) */
38269 xa = ix86_expand_sse_fabs (res, &mask);
38270
38271 /* if (!isless (xa, TWO52)) goto label; */
38272 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38273
38274 /* x = (double)(long)x */
38275 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38276 expand_fix (xi, res, 0);
38277 expand_float (res, xi, 0);
38278
38279 if (HONOR_SIGNED_ZEROS (mode))
38280 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38281
38282 emit_label (label);
38283 LABEL_NUSES (label) = 1;
38284
38285 emit_move_insn (operand0, res);
38286 }
38287
38288 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38289 into OPERAND0. Sequence that works without relying on DImode truncation via cvttsd2siq that is only available on 64bit targets. */
38290 void
38291 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38292 {
38293 enum machine_mode mode = GET_MODE (operand0);
38294 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38295
38296 /* C code for SSE variant we expand below.
38297 double xa = fabs (x), xa2, x2;
38298 if (!isless (xa, TWO52))
38299 return x;
38300 xa2 = xa + TWO52 - TWO52;
38301 Compensate:
38302 if (xa2 > xa)
38303 xa2 -= 1.0;
38304 x2 = copysign (xa2, x);
38305 return x2;
38306 */
38307
38308 TWO52 = ix86_gen_TWO52 (mode);
38309
38310 /* Temporary for holding the result, initialized to the input
38311 operand to ease control flow. */
38312 res = gen_reg_rtx (mode);
38313 emit_move_insn (res, operand1);
38314
38315 /* xa = abs (operand1) */
38316 xa = ix86_expand_sse_fabs (res, &smask);
38317
38318 /* if (!isless (xa, TWO52)) goto label; */
38319 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38320
38321 /* res = xa + TWO52 - TWO52; */
38322 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38323 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38324 emit_move_insn (res, tmp);
38325
38326 /* generate 1.0 */
38327 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38328
38329 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38330 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38331 emit_insn (gen_rtx_SET (VOIDmode, mask,
38332 gen_rtx_AND (mode, mask, one)));
38333 tmp = expand_simple_binop (mode, MINUS,
38334 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38335 emit_move_insn (res, tmp);
38336
38337 /* res = copysign (res, operand1) */
38338 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38339
38340 emit_label (label);
38341 LABEL_NUSES (label) = 1;
38342
38343 emit_move_insn (operand0, res);
38344 }
38345
38346 /* Expand SSE sequence for computing round from OPERAND1 storing
38347 into OPERAND0. */
38348 void
38349 ix86_expand_round (rtx operand0, rtx operand1)
38350 {
38351 /* C code for the stuff we're doing below:
38352 double xa = fabs (x);
38353 if (!isless (xa, TWO52))
38354 return x;
38355 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38356 return copysign (xa, x);
38357 */
38358 enum machine_mode mode = GET_MODE (operand0);
38359 rtx res, TWO52, xa, label, xi, half, mask;
38360 const struct real_format *fmt;
38361 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38362
38363 /* Temporary for holding the result, initialized to the input
38364 operand to ease control flow. */
38365 res = gen_reg_rtx (mode);
38366 emit_move_insn (res, operand1);
38367
38368 TWO52 = ix86_gen_TWO52 (mode);
38369 xa = ix86_expand_sse_fabs (res, &mask);
38370 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38371
38372 /* load nextafter (0.5, 0.0) */
38373 fmt = REAL_MODE_FORMAT (mode);
38374 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38375 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
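   /* pred_half is the largest value representable in MODE that is strictly
      below 0.5, i.e. 0.5 - 2**(-p-1) == nextafter (0.5, 0.0).  Adding it
      instead of 0.5 keeps values just under a half-integer from being
      rounded up: e.g. for the largest double below 0.5, adding 0.5 would
      round up to 1.0 and truncate to 1, whereas adding pred_half stays
      below 1.0 and truncates to 0, which is what round should return.  */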
38376
38377 /* xa = xa + 0.5 */
38378 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38379 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38380
38381 /* xa = (double)(int64_t)xa */
38382 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38383 expand_fix (xi, xa, 0);
38384 expand_float (xa, xi, 0);
38385
38386 /* res = copysign (xa, operand1) */
38387 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38388
38389 emit_label (label);
38390 LABEL_NUSES (label) = 1;
38391
38392 emit_move_insn (operand0, res);
38393 }
38394
38395 /* Expand SSE sequence for computing round
38396 from OP1 storing into OP0 using sse4 round insn. */
38397 void
38398 ix86_expand_round_sse4 (rtx op0, rtx op1)
38399 {
38400 enum machine_mode mode = GET_MODE (op0);
38401 rtx e1, e2, res, half;
38402 const struct real_format *fmt;
38403 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38404 rtx (*gen_copysign) (rtx, rtx, rtx);
38405 rtx (*gen_round) (rtx, rtx, rtx);
38406
38407 switch (mode)
38408 {
38409 case SFmode:
38410 gen_copysign = gen_copysignsf3;
38411 gen_round = gen_sse4_1_roundsf2;
38412 break;
38413 case DFmode:
38414 gen_copysign = gen_copysigndf3;
38415 gen_round = gen_sse4_1_rounddf2;
38416 break;
38417 default:
38418 gcc_unreachable ();
38419 }
38420
38421 /* round (a) = trunc (a + copysign (0.5, a)) */
38422
38423 /* load nextafter (0.5, 0.0) */
38424 fmt = REAL_MODE_FORMAT (mode);
38425 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38426 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38427 half = const_double_from_real_value (pred_half, mode);
38428
38429 /* e1 = copysign (0.5, op1) */
38430 e1 = gen_reg_rtx (mode);
38431 emit_insn (gen_copysign (e1, half, op1));
38432
38433 /* e2 = op1 + e1 */
38434 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
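   /* Although e1 is slightly smaller in magnitude than 0.5, exact halfway
      cases still round away from zero: e.g. for op1 == 2.5 the sum
      2.5 + 0.49999999999999994 is not representable and the addition itself
      rounds to exactly 3.0, so the truncation below yields 3.  */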
38435
38436 /* res = trunc (e2) */
38437 res = gen_reg_rtx (mode);
38438 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38439
38440 emit_move_insn (op0, res);
38441 }
38442 \f
38443
38444 /* Table of valid machine attributes. */
38445 static const struct attribute_spec ix86_attribute_table[] =
38446 {
38447 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38448 affects_type_identity } */
38449 /* Stdcall attribute says callee is responsible for popping arguments
38450 if they are not variable. */
38451 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38452 true },
38453 /* Fastcall attribute says callee is responsible for popping arguments
38454 if they are not variable. */
38455 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38456 true },
38457 /* Thiscall attribute says callee is responsible for popping arguments
38458 if they are not variable. */
38459 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38460 true },
38461 /* Cdecl attribute says the callee is a normal C declaration */
38462 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38463 true },
38464 /* Regparm attribute specifies how many integer arguments are to be
38465 passed in registers. */
38466 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38467 true },
38468 /* Sseregparm attribute says we are using x86_64 calling conventions
38469 for FP arguments. */
38470 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38471 true },
38472 /* The transactional memory builtins are implicitly regparm or fastcall
38473 depending on the ABI. Override the generic do-nothing attribute that
38474 these builtins were declared with. */
38475 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38476 true },
38477 /* force_align_arg_pointer says this function realigns the stack at entry. */
38478 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38479 false, true, true, ix86_handle_cconv_attribute, false },
38480 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38481 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38482 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38483 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38484 false },
38485 #endif
38486 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38487 false },
38488 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38489 false },
38490 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38491 SUBTARGET_ATTRIBUTE_TABLE,
38492 #endif
38493 /* ms_abi and sysv_abi calling convention function attributes. */
38494 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38495 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38496 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38497 false },
38498 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38499 ix86_handle_callee_pop_aggregate_return, true },
38500 /* End element. */
38501 { NULL, 0, 0, false, false, false, NULL, false }
38502 };
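/* As an illustration of the calling-convention attributes in the table
   above (the function name below is purely hypothetical):

     int f (int, int, int) __attribute__ ((regparm (3)));

   passes the first three integer arguments in EAX, EDX and ECX instead of
   on the stack, while adding __attribute__ ((stdcall)) would also make the
   callee pop its own non-variadic arguments on return.  */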
38503
38504 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38505 static int
38506 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38507 tree vectype,
38508 int misalign ATTRIBUTE_UNUSED)
38509 {
38510 unsigned elements;
38511
38512 switch (type_of_cost)
38513 {
38514 case scalar_stmt:
38515 return ix86_cost->scalar_stmt_cost;
38516
38517 case scalar_load:
38518 return ix86_cost->scalar_load_cost;
38519
38520 case scalar_store:
38521 return ix86_cost->scalar_store_cost;
38522
38523 case vector_stmt:
38524 return ix86_cost->vec_stmt_cost;
38525
38526 case vector_load:
38527 return ix86_cost->vec_align_load_cost;
38528
38529 case vector_store:
38530 return ix86_cost->vec_store_cost;
38531
38532 case vec_to_scalar:
38533 return ix86_cost->vec_to_scalar_cost;
38534
38535 case scalar_to_vec:
38536 return ix86_cost->scalar_to_vec_cost;
38537
38538 case unaligned_load:
38539 case unaligned_store:
38540 return ix86_cost->vec_unalign_load_cost;
38541
38542 case cond_branch_taken:
38543 return ix86_cost->cond_taken_branch_cost;
38544
38545 case cond_branch_not_taken:
38546 return ix86_cost->cond_not_taken_branch_cost;
38547
38548 case vec_perm:
38549 case vec_promote_demote:
38550 return ix86_cost->vec_stmt_cost;
38551
38552 case vec_construct:
38553 elements = TYPE_VECTOR_SUBPARTS (vectype);
38554 return elements / 2 + 1;
38555
38556 default:
38557 gcc_unreachable ();
38558 }
38559 }
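/* For instance, with the vec_construct entry above a V4SF constructor is
   costed at 4 / 2 + 1 == 3; the remaining cases simply forward the
   per-tuning costs from ix86_cost.  */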
38560
38561 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38562 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38563 insn every time. */
38564
38565 static GTY(()) rtx vselect_insn;
38566
38567 /* Initialize vselect_insn. */
38568
38569 static void
38570 init_vselect_insn (void)
38571 {
38572 unsigned i;
38573 rtx x;
38574
38575 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38576 for (i = 0; i < MAX_VECT_LEN; ++i)
38577 XVECEXP (x, 0, i) = const0_rtx;
38578 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38579 const0_rtx), x);
38580 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38581 start_sequence ();
38582 vselect_insn = emit_insn (x);
38583 end_sequence ();
38584 }
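/* The start_sequence / end_sequence pair above keeps the placeholder insn
   out of the current insn stream; expand_vselect below only splices real
   operands into it temporarily, calls recog_memoized, and then restores
   the placeholders.  */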
38585
38586 /* Construct (set target (vec_select op0 (parallel perm))) and
38587 return true if that's a valid instruction in the active ISA. */
38588
38589 static bool
38590 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38591 unsigned nelt, bool testing_p)
38592 {
38593 unsigned int i;
38594 rtx x, save_vconcat;
38595 int icode;
38596
38597 if (vselect_insn == NULL_RTX)
38598 init_vselect_insn ();
38599
38600 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38601 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38602 for (i = 0; i < nelt; ++i)
38603 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38604 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38605 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38606 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38607 SET_DEST (PATTERN (vselect_insn)) = target;
38608 icode = recog_memoized (vselect_insn);
38609
38610 if (icode >= 0 && !testing_p)
38611 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38612
38613 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38614 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38615 INSN_CODE (vselect_insn) = -1;
38616
38617 return icode >= 0;
38618 }
38619
38620 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38621
38622 static bool
38623 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38624 const unsigned char *perm, unsigned nelt,
38625 bool testing_p)
38626 {
38627 enum machine_mode v2mode;
38628 rtx x;
38629 bool ok;
38630
38631 if (vselect_insn == NULL_RTX)
38632 init_vselect_insn ();
38633
38634 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38635 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38636 PUT_MODE (x, v2mode);
38637 XEXP (x, 0) = op0;
38638 XEXP (x, 1) = op1;
38639 ok = expand_vselect (target, x, perm, nelt, testing_p);
38640 XEXP (x, 0) = const0_rtx;
38641 XEXP (x, 1) = const0_rtx;
38642 return ok;
38643 }
38644
38645 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38646 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38647
38648 static bool
38649 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38650 {
38651 enum machine_mode vmode = d->vmode;
38652 unsigned i, mask, nelt = d->nelt;
38653 rtx target, op0, op1, x;
38654 rtx rperm[32], vperm;
38655
38656 if (d->one_operand_p)
38657 return false;
38658 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38659 ;
38660 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38661 ;
38662 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38663 ;
38664 else
38665 return false;
38666
38667 /* This is a blend, not a permute. Elements must stay in their
38668 respective lanes. */
38669 for (i = 0; i < nelt; ++i)
38670 {
38671 unsigned e = d->perm[i];
38672 if (!(e == i || e == i + nelt))
38673 return false;
38674 }
38675
38676 if (d->testing_p)
38677 return true;
38678
38679 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38680 decision should be extracted elsewhere, so that we only try that
38681 sequence once all budget==3 options have been tried. */
38682 target = d->target;
38683 op0 = d->op0;
38684 op1 = d->op1;
38685 mask = 0;
38686
38687 switch (vmode)
38688 {
38689 case V4DFmode:
38690 case V8SFmode:
38691 case V2DFmode:
38692 case V4SFmode:
38693 case V8HImode:
38694 case V8SImode:
38695 for (i = 0; i < nelt; ++i)
38696 mask |= (d->perm[i] >= nelt) << i;
38697 break;
38698
38699 case V2DImode:
38700 for (i = 0; i < 2; ++i)
38701 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38702 vmode = V8HImode;
38703 goto do_subreg;
38704
38705 case V4SImode:
38706 for (i = 0; i < 4; ++i)
38707 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38708 vmode = V8HImode;
38709 goto do_subreg;
38710
38711 case V16QImode:
38712 /* See if bytes move in pairs so we can use pblendw with
38713 an immediate argument, rather than pblendvb with a vector
38714 argument. */
38715 for (i = 0; i < 16; i += 2)
38716 if (d->perm[i] + 1 != d->perm[i + 1])
38717 {
38718 use_pblendvb:
38719 for (i = 0; i < nelt; ++i)
38720 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38721
38722 finish_pblendvb:
38723 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38724 vperm = force_reg (vmode, vperm);
38725
38726 if (GET_MODE_SIZE (vmode) == 16)
38727 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38728 else
38729 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38730 return true;
38731 }
38732
38733 for (i = 0; i < 8; ++i)
38734 mask |= (d->perm[i * 2] >= 16) << i;
38735 vmode = V8HImode;
38736 /* FALLTHRU */
38737
38738 do_subreg:
38739 target = gen_lowpart (vmode, target);
38740 op0 = gen_lowpart (vmode, op0);
38741 op1 = gen_lowpart (vmode, op1);
38742 break;
38743
38744 case V32QImode:
38745 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38746 for (i = 0; i < 32; i += 2)
38747 if (d->perm[i] + 1 != d->perm[i + 1])
38748 goto use_pblendvb;
38749 /* See if bytes move in quadruplets. If yes, vpblendd
38750 with immediate can be used. */
38751 for (i = 0; i < 32; i += 4)
38752 if (d->perm[i] + 2 != d->perm[i + 2])
38753 break;
38754 if (i < 32)
38755 {
38756 /* See if bytes move the same in both lanes. If yes,
38757 vpblendw with immediate can be used. */
38758 for (i = 0; i < 16; i += 2)
38759 if (d->perm[i] + 16 != d->perm[i + 16])
38760 goto use_pblendvb;
38761
38762 /* Use vpblendw. */
38763 for (i = 0; i < 16; ++i)
38764 mask |= (d->perm[i * 2] >= 32) << i;
38765 vmode = V16HImode;
38766 goto do_subreg;
38767 }
38768
38769 /* Use vpblendd. */
38770 for (i = 0; i < 8; ++i)
38771 mask |= (d->perm[i * 4] >= 32) << i;
38772 vmode = V8SImode;
38773 goto do_subreg;
38774
38775 case V16HImode:
38776 /* See if words move in pairs. If yes, vpblendd can be used. */
38777 for (i = 0; i < 16; i += 2)
38778 if (d->perm[i] + 1 != d->perm[i + 1])
38779 break;
38780 if (i < 16)
38781 {
38782 /* See if words move the same in both lanes. If not,
38783 vpblendvb must be used. */
38784 for (i = 0; i < 8; i++)
38785 if (d->perm[i] + 8 != d->perm[i + 8])
38786 {
38787 /* Use vpblendvb. */
38788 for (i = 0; i < 32; ++i)
38789 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38790
38791 vmode = V32QImode;
38792 nelt = 32;
38793 target = gen_lowpart (vmode, target);
38794 op0 = gen_lowpart (vmode, op0);
38795 op1 = gen_lowpart (vmode, op1);
38796 goto finish_pblendvb;
38797 }
38798
38799 /* Use vpblendw. */
38800 for (i = 0; i < 16; ++i)
38801 mask |= (d->perm[i] >= 16) << i;
38802 break;
38803 }
38804
38805 /* Use vpblendd. */
38806 for (i = 0; i < 8; ++i)
38807 mask |= (d->perm[i * 2] >= 16) << i;
38808 vmode = V8SImode;
38809 goto do_subreg;
38810
38811 case V4DImode:
38812 /* Use vpblendd. */
38813 for (i = 0; i < 4; ++i)
38814 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38815 vmode = V8SImode;
38816 goto do_subreg;
38817
38818 default:
38819 gcc_unreachable ();
38820 }
38821
38822 /* This matches five different patterns with the different modes. */
38823 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38824 x = gen_rtx_SET (VOIDmode, target, x);
38825 emit_insn (x);
38826
38827 return true;
38828 }
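/* Worked example for the mask construction above: a V4SImode blend with
   perm == { 0, 5, 2, 7 } takes elements 1 and 3 from op1.  Since there is
   no dword blend with an immediate before AVX2, the operands are viewed as
   V8HImode and each selected dword contributes two mask bits, giving
   mask == (3 << 2) | (3 << 6) == 0xcc for pblendw.  */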
38829
38830 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38831 in terms of the variable form of vpermilps.
38832
38833 Note that we will have already failed the immediate input vpermilps,
38834 which requires that the high and low part shuffle be identical; the
38835 variable form doesn't require that. */
38836
38837 static bool
38838 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38839 {
38840 rtx rperm[8], vperm;
38841 unsigned i;
38842
38843 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38844 return false;
38845
38846 /* We can only permute within the 128-bit lane. */
38847 for (i = 0; i < 8; ++i)
38848 {
38849 unsigned e = d->perm[i];
38850 if (i < 4 ? e >= 4 : e < 4)
38851 return false;
38852 }
38853
38854 if (d->testing_p)
38855 return true;
38856
38857 for (i = 0; i < 8; ++i)
38858 {
38859 unsigned e = d->perm[i];
38860
38861 /* Within each 128-bit lane, the elements of op0 are numbered
38862 from 0 and the elements of op1 are numbered from 4. */
38863 if (e >= 8 + 4)
38864 e -= 8;
38865 else if (e >= 4)
38866 e -= 4;
38867
38868 rperm[i] = GEN_INT (e);
38869 }
38870
38871 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38872 vperm = force_reg (V8SImode, vperm);
38873 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38874
38875 return true;
38876 }
38877
38878 /* Return true if permutation D can be performed as VMODE permutation
38879 instead. */
38880
38881 static bool
38882 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38883 {
38884 unsigned int i, j, chunk;
38885
38886 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38887 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38888 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38889 return false;
38890
38891 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38892 return true;
38893
38894 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38895 for (i = 0; i < d->nelt; i += chunk)
38896 if (d->perm[i] & (chunk - 1))
38897 return false;
38898 else
38899 for (j = 1; j < chunk; ++j)
38900 if (d->perm[i] + j != d->perm[i + j])
38901 return false;
38902
38903 return true;
38904 }
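/* E.g. the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
   8, 9, 10, 11 } is accepted for VMODE == V4SImode, since each chunk of
   four byte indices starts at a multiple of four and is consecutive; it is
   equivalent to the V4SImode permutation { 1, 0, 3, 2 }.  */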
38905
38906 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38907 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38908
38909 static bool
38910 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38911 {
38912 unsigned i, nelt, eltsz, mask;
38913 unsigned char perm[32];
38914 enum machine_mode vmode = V16QImode;
38915 rtx rperm[32], vperm, target, op0, op1;
38916
38917 nelt = d->nelt;
38918
38919 if (!d->one_operand_p)
38920 {
38921 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38922 {
38923 if (TARGET_AVX2
38924 && valid_perm_using_mode_p (V2TImode, d))
38925 {
38926 if (d->testing_p)
38927 return true;
38928
38929 /* Use vperm2i128 insn. The pattern uses
38930 V4DImode instead of V2TImode. */
38931 target = gen_lowpart (V4DImode, d->target);
38932 op0 = gen_lowpart (V4DImode, d->op0);
38933 op1 = gen_lowpart (V4DImode, d->op1);
38934 rperm[0]
38935 = GEN_INT ((d->perm[0] / (nelt / 2))
38936 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38937 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38938 return true;
38939 }
38940 return false;
38941 }
38942 }
38943 else
38944 {
38945 if (GET_MODE_SIZE (d->vmode) == 16)
38946 {
38947 if (!TARGET_SSSE3)
38948 return false;
38949 }
38950 else if (GET_MODE_SIZE (d->vmode) == 32)
38951 {
38952 if (!TARGET_AVX2)
38953 return false;
38954
38955 /* V4DImode should be already handled through
38956 expand_vselect by vpermq instruction. */
38957 gcc_assert (d->vmode != V4DImode);
38958
38959 vmode = V32QImode;
38960 if (d->vmode == V8SImode
38961 || d->vmode == V16HImode
38962 || d->vmode == V32QImode)
38963 {
38964 /* First see if vpermq can be used for
38965 V8SImode/V16HImode/V32QImode. */
38966 if (valid_perm_using_mode_p (V4DImode, d))
38967 {
38968 for (i = 0; i < 4; i++)
38969 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38970 if (d->testing_p)
38971 return true;
38972 return expand_vselect (gen_lowpart (V4DImode, d->target),
38973 gen_lowpart (V4DImode, d->op0),
38974 perm, 4, false);
38975 }
38976
38977 /* Next see if vpermd can be used. */
38978 if (valid_perm_using_mode_p (V8SImode, d))
38979 vmode = V8SImode;
38980 }
38981 /* Or if vpermps can be used. */
38982 else if (d->vmode == V8SFmode)
38983 vmode = V8SImode;
38984
38985 if (vmode == V32QImode)
38986 {
38987 /* vpshufb only works intra lanes, it is not
38988 possible to shuffle bytes in between the lanes. */
38989 for (i = 0; i < nelt; ++i)
38990 if ((d->perm[i] ^ i) & (nelt / 2))
38991 return false;
38992 }
38993 }
38994 else
38995 return false;
38996 }
38997
38998 if (d->testing_p)
38999 return true;
39000
39001 if (vmode == V8SImode)
39002 for (i = 0; i < 8; ++i)
39003 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39004 else
39005 {
39006 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39007 if (!d->one_operand_p)
39008 mask = 2 * nelt - 1;
39009 else if (vmode == V16QImode)
39010 mask = nelt - 1;
39011 else
39012 mask = nelt / 2 - 1;
39013
39014 for (i = 0; i < nelt; ++i)
39015 {
39016 unsigned j, e = d->perm[i] & mask;
39017 for (j = 0; j < eltsz; ++j)
39018 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39019 }
39020 }
39021
39022 vperm = gen_rtx_CONST_VECTOR (vmode,
39023 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39024 vperm = force_reg (vmode, vperm);
39025
39026 target = gen_lowpart (vmode, d->target);
39027 op0 = gen_lowpart (vmode, d->op0);
39028 if (d->one_operand_p)
39029 {
39030 if (vmode == V16QImode)
39031 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39032 else if (vmode == V32QImode)
39033 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39034 else if (vmode == V8SFmode)
39035 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39036 else
39037 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39038 }
39039 else
39040 {
39041 op1 = gen_lowpart (vmode, d->op1);
39042 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39043 }
39044
39045 return true;
39046 }
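/* Example of the control vector construction above: a one operand V8HImode
   shuffle { 3, 3, 3, 3, 3, 3, 3, 3 } expanded this way has eltsz == 2 and
   mask == nelt - 1 == 7, so every pair of pshufb control bytes becomes
   { 6, 7 } and the V16QImode constant { 6, 7, 6, 7, ... } broadcasts word 3
   of op0.  */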
39047
39048 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39049 in a single instruction. */
39050
39051 static bool
39052 expand_vec_perm_1 (struct expand_vec_perm_d *d)
39053 {
39054 unsigned i, nelt = d->nelt;
39055 unsigned char perm2[MAX_VECT_LEN];
39056
39057 /* Check plain VEC_SELECT first, because AVX has instructions that could
39058 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
39059 input where SEL+CONCAT may not. */
39060 if (d->one_operand_p)
39061 {
39062 int mask = nelt - 1;
39063 bool identity_perm = true;
39064 bool broadcast_perm = true;
39065
39066 for (i = 0; i < nelt; i++)
39067 {
39068 perm2[i] = d->perm[i] & mask;
39069 if (perm2[i] != i)
39070 identity_perm = false;
39071 if (perm2[i])
39072 broadcast_perm = false;
39073 }
39074
39075 if (identity_perm)
39076 {
39077 if (!d->testing_p)
39078 emit_move_insn (d->target, d->op0);
39079 return true;
39080 }
39081 else if (broadcast_perm && TARGET_AVX2)
39082 {
39083 /* Use vpbroadcast{b,w,d}. */
39084 rtx (*gen) (rtx, rtx) = NULL;
39085 switch (d->vmode)
39086 {
39087 case V32QImode:
39088 gen = gen_avx2_pbroadcastv32qi_1;
39089 break;
39090 case V16HImode:
39091 gen = gen_avx2_pbroadcastv16hi_1;
39092 break;
39093 case V8SImode:
39094 gen = gen_avx2_pbroadcastv8si_1;
39095 break;
39096 case V16QImode:
39097 gen = gen_avx2_pbroadcastv16qi;
39098 break;
39099 case V8HImode:
39100 gen = gen_avx2_pbroadcastv8hi;
39101 break;
39102 case V8SFmode:
39103 gen = gen_avx2_vec_dupv8sf_1;
39104 break;
39105 /* For other modes prefer other shuffles this function creates. */
39106 default: break;
39107 }
39108 if (gen != NULL)
39109 {
39110 if (!d->testing_p)
39111 emit_insn (gen (d->target, d->op0));
39112 return true;
39113 }
39114 }
39115
39116 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
39117 return true;
39118
39119 /* There are plenty of patterns in sse.md that are written for
39120 SEL+CONCAT and are not replicated for a single op. Perhaps
39121 that should be changed, to avoid the nastiness here. */
39122
39123 /* Recognize interleave style patterns, which means incrementing
39124 every other permutation operand. */
39125 for (i = 0; i < nelt; i += 2)
39126 {
39127 perm2[i] = d->perm[i] & mask;
39128 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39129 }
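   /* E.g. a V4SFmode duplicate permutation { 0, 0, 1, 1 } becomes perm2 ==
      { 0, 4, 1, 5 } over the (op0, op0) concatenation, i.e. exactly the
      unpcklps pattern.  */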
39130 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39131 d->testing_p))
39132 return true;
39133
39134 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39135 if (nelt >= 4)
39136 {
39137 for (i = 0; i < nelt; i += 4)
39138 {
39139 perm2[i + 0] = d->perm[i + 0] & mask;
39140 perm2[i + 1] = d->perm[i + 1] & mask;
39141 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39142 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39143 }
39144
39145 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39146 d->testing_p))
39147 return true;
39148 }
39149 }
39150
39151 /* Finally, try the fully general two operand permute. */
39152 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39153 d->testing_p))
39154 return true;
39155
39156 /* Recognize interleave style patterns with reversed operands. */
39157 if (!d->one_operand_p)
39158 {
39159 for (i = 0; i < nelt; ++i)
39160 {
39161 unsigned e = d->perm[i];
39162 if (e >= nelt)
39163 e -= nelt;
39164 else
39165 e += nelt;
39166 perm2[i] = e;
39167 }
39168
39169 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39170 d->testing_p))
39171 return true;
39172 }
39173
39174 /* Try the SSE4.1 blend variable merge instructions. */
39175 if (expand_vec_perm_blend (d))
39176 return true;
39177
39178 /* Try one of the AVX vpermil variable permutations. */
39179 if (expand_vec_perm_vpermil (d))
39180 return true;
39181
39182 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39183 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39184 if (expand_vec_perm_pshufb (d))
39185 return true;
39186
39187 return false;
39188 }
39189
39190 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39191 in terms of a pair of pshuflw + pshufhw instructions. */
39192
39193 static bool
39194 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39195 {
39196 unsigned char perm2[MAX_VECT_LEN];
39197 unsigned i;
39198 bool ok;
39199
39200 if (d->vmode != V8HImode || !d->one_operand_p)
39201 return false;
39202
39203 /* The two permutations only operate in 64-bit lanes. */
39204 for (i = 0; i < 4; ++i)
39205 if (d->perm[i] >= 4)
39206 return false;
39207 for (i = 4; i < 8; ++i)
39208 if (d->perm[i] < 4)
39209 return false;
39210
39211 if (d->testing_p)
39212 return true;
39213
39214 /* Emit the pshuflw. */
39215 memcpy (perm2, d->perm, 4);
39216 for (i = 4; i < 8; ++i)
39217 perm2[i] = i;
39218 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39219 gcc_assert (ok);
39220
39221 /* Emit the pshufhw. */
39222 memcpy (perm2 + 4, d->perm + 4, 4);
39223 for (i = 0; i < 4; ++i)
39224 perm2[i] = i;
39225 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39226 gcc_assert (ok);
39227
39228 return true;
39229 }
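/* For instance, the V8HImode permutation { 2, 1, 3, 0, 5, 7, 4, 6 } is
   split above into a pshuflw with { 2, 1, 3, 0, 4, 5, 6, 7 } followed by a
   pshufhw with { 0, 1, 2, 3, 5, 7, 4, 6 }.  */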
39230
39231 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39232 the permutation using the SSSE3 palignr instruction. This succeeds
39233 when all of the elements in PERM fit within one vector and we merely
39234 need to shift them down so that a single vector permutation has a
39235 chance to succeed. */
39236
39237 static bool
39238 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39239 {
39240 unsigned i, nelt = d->nelt;
39241 unsigned min, max;
39242 bool in_order, ok;
39243 rtx shift;
39244
39245 /* Even with AVX, palignr only operates on 128-bit vectors. */
39246 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39247 return false;
39248
39249 min = nelt, max = 0;
39250 for (i = 0; i < nelt; ++i)
39251 {
39252 unsigned e = d->perm[i];
39253 if (e < min)
39254 min = e;
39255 if (e > max)
39256 max = e;
39257 }
39258 if (min == 0 || max - min >= nelt)
39259 return false;
39260
39261 /* Given that we have SSSE3, we know we'll be able to implement the
39262 single operand permutation after the palignr with pshufb. */
39263 if (d->testing_p)
39264 return true;
39265
39266 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39267 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39268 gen_lowpart (TImode, d->op1),
39269 gen_lowpart (TImode, d->op0), shift));
39270
39271 d->op0 = d->op1 = d->target;
39272 d->one_operand_p = true;
39273
39274 in_order = true;
39275 for (i = 0; i < nelt; ++i)
39276 {
39277 unsigned e = d->perm[i] - min;
39278 if (e != i)
39279 in_order = false;
39280 d->perm[i] = e;
39281 }
39282
39283 /* Test for the degenerate case where the alignment by itself
39284 produces the desired permutation. */
39285 if (in_order)
39286 return true;
39287
39288 ok = expand_vec_perm_1 (d);
39289 gcc_assert (ok);
39290
39291 return ok;
39292 }
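/* As an example, a two operand V16QImode permutation whose indices all lie
   in [5, 20] is handled above by shifting the op1:op0 double-width value
   right by 5 bytes with palignr; the remaining one operand permutation
   (the original indices minus 5) is then left to expand_vec_perm_1, which
   with SSSE3 can always fall back to pshufb.  */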
39293
39294 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39295
39296 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39297 a two vector permutation into a single vector permutation by using
39298 an interleave operation to merge the vectors. */
39299
39300 static bool
39301 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39302 {
39303 struct expand_vec_perm_d dremap, dfinal;
39304 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39305 unsigned HOST_WIDE_INT contents;
39306 unsigned char remap[2 * MAX_VECT_LEN];
39307 rtx seq;
39308 bool ok, same_halves = false;
39309
39310 if (GET_MODE_SIZE (d->vmode) == 16)
39311 {
39312 if (d->one_operand_p)
39313 return false;
39314 }
39315 else if (GET_MODE_SIZE (d->vmode) == 32)
39316 {
39317 if (!TARGET_AVX)
39318 return false;
39319 /* For 32-byte modes allow even d->one_operand_p.
39320 The lack of cross-lane shuffling in some instructions
39321 might prevent a single insn shuffle. */
39322 dfinal = *d;
39323 dfinal.testing_p = true;
39324 /* If expand_vec_perm_interleave3 can expand this into
39325 a 3 insn sequence, give up and let it be expanded that
39326 way instead. While that is one insn longer, it doesn't
39327 need a memory operand, and in the common case where the
39328 interleave low and interleave high permutations of the
39329 same operands are adjacent, the pair needs only 4 insns
39330 after CSE. */
39331 if (expand_vec_perm_interleave3 (&dfinal))
39332 return false;
39333 }
39334 else
39335 return false;
39336
39337 /* Examine from whence the elements come. */
39338 contents = 0;
39339 for (i = 0; i < nelt; ++i)
39340 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39341
39342 memset (remap, 0xff, sizeof (remap));
39343 dremap = *d;
39344
39345 if (GET_MODE_SIZE (d->vmode) == 16)
39346 {
39347 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39348
39349 /* Split the two input vectors into 4 halves. */
39350 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39351 h2 = h1 << nelt2;
39352 h3 = h2 << nelt2;
39353 h4 = h3 << nelt2;
39354
39355 /* If the elements are all from the low halves, use interleave low, and
39356 similarly interleave high when all are from the high halves. If the
39357 elements come from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39358 if ((contents & (h1 | h3)) == contents)
39359 {
39360 /* punpckl* */
39361 for (i = 0; i < nelt2; ++i)
39362 {
39363 remap[i] = i * 2;
39364 remap[i + nelt] = i * 2 + 1;
39365 dremap.perm[i * 2] = i;
39366 dremap.perm[i * 2 + 1] = i + nelt;
39367 }
39368 if (!TARGET_SSE2 && d->vmode == V4SImode)
39369 dremap.vmode = V4SFmode;
39370 }
39371 else if ((contents & (h2 | h4)) == contents)
39372 {
39373 /* punpckh* */
39374 for (i = 0; i < nelt2; ++i)
39375 {
39376 remap[i + nelt2] = i * 2;
39377 remap[i + nelt + nelt2] = i * 2 + 1;
39378 dremap.perm[i * 2] = i + nelt2;
39379 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39380 }
39381 if (!TARGET_SSE2 && d->vmode == V4SImode)
39382 dremap.vmode = V4SFmode;
39383 }
39384 else if ((contents & (h1 | h4)) == contents)
39385 {
39386 /* shufps */
39387 for (i = 0; i < nelt2; ++i)
39388 {
39389 remap[i] = i;
39390 remap[i + nelt + nelt2] = i + nelt2;
39391 dremap.perm[i] = i;
39392 dremap.perm[i + nelt2] = i + nelt + nelt2;
39393 }
39394 if (nelt != 4)
39395 {
39396 /* shufpd */
39397 dremap.vmode = V2DImode;
39398 dremap.nelt = 2;
39399 dremap.perm[0] = 0;
39400 dremap.perm[1] = 3;
39401 }
39402 }
39403 else if ((contents & (h2 | h3)) == contents)
39404 {
39405 /* shufps */
39406 for (i = 0; i < nelt2; ++i)
39407 {
39408 remap[i + nelt2] = i;
39409 remap[i + nelt] = i + nelt2;
39410 dremap.perm[i] = i + nelt2;
39411 dremap.perm[i + nelt2] = i + nelt;
39412 }
39413 if (nelt != 4)
39414 {
39415 /* shufpd */
39416 dremap.vmode = V2DImode;
39417 dremap.nelt = 2;
39418 dremap.perm[0] = 1;
39419 dremap.perm[1] = 2;
39420 }
39421 }
39422 else
39423 return false;
39424 }
39425 else
39426 {
39427 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39428 unsigned HOST_WIDE_INT q[8];
39429 unsigned int nonzero_halves[4];
39430
39431 /* Split the two input vectors into 8 quarters. */
39432 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39433 for (i = 1; i < 8; ++i)
39434 q[i] = q[0] << (nelt4 * i);
39435 for (i = 0; i < 4; ++i)
39436 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39437 {
39438 nonzero_halves[nzcnt] = i;
39439 ++nzcnt;
39440 }
39441
39442 if (nzcnt == 1)
39443 {
39444 gcc_assert (d->one_operand_p);
39445 nonzero_halves[1] = nonzero_halves[0];
39446 same_halves = true;
39447 }
39448 else if (d->one_operand_p)
39449 {
39450 gcc_assert (nonzero_halves[0] == 0);
39451 gcc_assert (nonzero_halves[1] == 1);
39452 }
39453
39454 if (nzcnt <= 2)
39455 {
39456 if (d->perm[0] / nelt2 == nonzero_halves[1])
39457 {
39458 /* Attempt to increase the likelihood that dfinal
39459 shuffle will be intra-lane. */
39460 char tmph = nonzero_halves[0];
39461 nonzero_halves[0] = nonzero_halves[1];
39462 nonzero_halves[1] = tmph;
39463 }
39464
39465 /* vperm2f128 or vperm2i128. */
39466 for (i = 0; i < nelt2; ++i)
39467 {
39468 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39469 remap[i + nonzero_halves[0] * nelt2] = i;
39470 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39471 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39472 }
39473
39474 if (d->vmode != V8SFmode
39475 && d->vmode != V4DFmode
39476 && d->vmode != V8SImode)
39477 {
39478 dremap.vmode = V8SImode;
39479 dremap.nelt = 8;
39480 for (i = 0; i < 4; ++i)
39481 {
39482 dremap.perm[i] = i + nonzero_halves[0] * 4;
39483 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39484 }
39485 }
39486 }
39487 else if (d->one_operand_p)
39488 return false;
39489 else if (TARGET_AVX2
39490 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39491 {
39492 /* vpunpckl* */
39493 for (i = 0; i < nelt4; ++i)
39494 {
39495 remap[i] = i * 2;
39496 remap[i + nelt] = i * 2 + 1;
39497 remap[i + nelt2] = i * 2 + nelt2;
39498 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39499 dremap.perm[i * 2] = i;
39500 dremap.perm[i * 2 + 1] = i + nelt;
39501 dremap.perm[i * 2 + nelt2] = i + nelt2;
39502 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39503 }
39504 }
39505 else if (TARGET_AVX2
39506 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39507 {
39508 /* vpunpckh* */
39509 for (i = 0; i < nelt4; ++i)
39510 {
39511 remap[i + nelt4] = i * 2;
39512 remap[i + nelt + nelt4] = i * 2 + 1;
39513 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39514 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39515 dremap.perm[i * 2] = i + nelt4;
39516 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39517 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39518 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39519 }
39520 }
39521 else
39522 return false;
39523 }
39524
39525 /* Use the remapping array set up above to move the elements from their
39526 swizzled locations into their final destinations. */
39527 dfinal = *d;
39528 for (i = 0; i < nelt; ++i)
39529 {
39530 unsigned e = remap[d->perm[i]];
39531 gcc_assert (e < nelt);
39532 /* If same_halves is true, both halves of the remapped vector are the
39533 same. Avoid cross-lane accesses if possible. */
39534 if (same_halves && i >= nelt2)
39535 {
39536 gcc_assert (e < nelt2);
39537 dfinal.perm[i] = e + nelt2;
39538 }
39539 else
39540 dfinal.perm[i] = e;
39541 }
39542 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39543 dfinal.op1 = dfinal.op0;
39544 dfinal.one_operand_p = true;
39545 dremap.target = dfinal.op0;
39546
39547 /* Test if the final remap can be done with a single insn. For V4SFmode or
39548 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39549 start_sequence ();
39550 ok = expand_vec_perm_1 (&dfinal);
39551 seq = get_insns ();
39552 end_sequence ();
39553
39554 if (!ok)
39555 return false;
39556
39557 if (d->testing_p)
39558 return true;
39559
39560 if (dremap.vmode != dfinal.vmode)
39561 {
39562 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39563 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39564 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39565 }
39566
39567 ok = expand_vec_perm_1 (&dremap);
39568 gcc_assert (ok);
39569
39570 emit_insn (seq);
39571 return true;
39572 }
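/* Worked example for the 16-byte case above: the V4SImode permutation
   { 0, 1, 4, 5 } uses only the low halves of both operands, so dremap
   becomes the punpckldq interleave { 0, 4, 1, 5 } and dfinal the one
   operand shuffle { 0, 2, 1, 3 } (a pshufd) applied to its result.  */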
39573
39574 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39575 a single vector cross-lane permutation into vpermq followed
39576 by any of the single insn permutations. */
39577
39578 static bool
39579 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39580 {
39581 struct expand_vec_perm_d dremap, dfinal;
39582 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39583 unsigned contents[2];
39584 bool ok;
39585
39586 if (!(TARGET_AVX2
39587 && (d->vmode == V32QImode || d->vmode == V16HImode)
39588 && d->one_operand_p))
39589 return false;
39590
39591 contents[0] = 0;
39592 contents[1] = 0;
39593 for (i = 0; i < nelt2; ++i)
39594 {
39595 contents[0] |= 1u << (d->perm[i] / nelt4);
39596 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39597 }
39598
39599 for (i = 0; i < 2; ++i)
39600 {
39601 unsigned int cnt = 0;
39602 for (j = 0; j < 4; ++j)
39603 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39604 return false;
39605 }
39606
39607 if (d->testing_p)
39608 return true;
39609
39610 dremap = *d;
39611 dremap.vmode = V4DImode;
39612 dremap.nelt = 4;
39613 dremap.target = gen_reg_rtx (V4DImode);
39614 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39615 dremap.op1 = dremap.op0;
39616 dremap.one_operand_p = true;
39617 for (i = 0; i < 2; ++i)
39618 {
39619 unsigned int cnt = 0;
39620 for (j = 0; j < 4; ++j)
39621 if ((contents[i] & (1u << j)) != 0)
39622 dremap.perm[2 * i + cnt++] = j;
39623 for (; cnt < 2; ++cnt)
39624 dremap.perm[2 * i + cnt] = 0;
39625 }
39626
39627 dfinal = *d;
39628 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39629 dfinal.op1 = dfinal.op0;
39630 dfinal.one_operand_p = true;
39631 for (i = 0, j = 0; i < nelt; ++i)
39632 {
39633 if (i == nelt2)
39634 j = 2;
39635 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39636 if ((d->perm[i] / nelt4) == dremap.perm[j])
39637 ;
39638 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39639 dfinal.perm[i] |= nelt4;
39640 else
39641 gcc_unreachable ();
39642 }
39643
39644 ok = expand_vec_perm_1 (&dremap);
39645 gcc_assert (ok);
39646
39647 ok = expand_vec_perm_1 (&dfinal);
39648 gcc_assert (ok);
39649
39650 return true;
39651 }
39652
39653 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39654 a vector permutation using two instructions, vperm2f128 resp.
39655 vperm2i128 followed by any single in-lane permutation. */
39656
39657 static bool
39658 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39659 {
39660 struct expand_vec_perm_d dfirst, dsecond;
39661 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39662 bool ok;
39663
39664 if (!TARGET_AVX
39665 || GET_MODE_SIZE (d->vmode) != 32
39666 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39667 return false;
39668
39669 dsecond = *d;
39670 dsecond.one_operand_p = false;
39671 dsecond.testing_p = true;
39672
39673 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39674 immediate. For perm < 16 the second permutation uses
39675 d->op0 as first operand, for perm >= 16 it uses d->op1
39676 as first operand. The second operand is the result of
39677 vperm2[fi]128. */
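  /* For instance, perm == 6 selects lane 2 (the low half of d->op1) for the
     low result lane and lane 1 (the high half of d->op0) for the high result
     lane, i.e. the immediate ((6 << 2) | 6) & 0x33 == 0x12.  */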
39678 for (perm = 0; perm < 32; perm++)
39679 {
39680 /* Ignore permutations which do not move anything cross-lane. */
39681 if (perm < 16)
39682 {
39683 /* The second shuffle for e.g. V4DFmode has
39684 0123 and ABCD operands.
39685 Ignore AB23, as 23 is already in the second lane
39686 of the first operand. */
39687 if ((perm & 0xc) == (1 << 2)) continue;
39688 /* And 01CD, as 01 is in the first lane of the first
39689 operand. */
39690 if ((perm & 3) == 0) continue;
39691 /* And 4567, as then the vperm2[fi]128 doesn't change
39692 anything on the original 4567 second operand. */
39693 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39694 }
39695 else
39696 {
39697 /* The second shuffle for e.g. V4DFmode has
39698 4567 and ABCD operands.
39699 Ignore AB67, as 67 is already in the second lane
39700 of the first operand. */
39701 if ((perm & 0xc) == (3 << 2)) continue;
39702 /* And 45CD, as 45 is in the first lane of the first
39703 operand. */
39704 if ((perm & 3) == 2) continue;
39705 /* And 0123, as then the vperm2[fi]128 doesn't change
39706 anything on the original 0123 first operand. */
39707 if ((perm & 0xf) == (1 << 2)) continue;
39708 }
39709
39710 for (i = 0; i < nelt; i++)
39711 {
39712 j = d->perm[i] / nelt2;
39713 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39714 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39715 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39716 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39717 else
39718 break;
39719 }
39720
39721 if (i == nelt)
39722 {
39723 start_sequence ();
39724 ok = expand_vec_perm_1 (&dsecond);
39725 end_sequence ();
39726 }
39727 else
39728 ok = false;
39729
39730 if (ok)
39731 {
39732 if (d->testing_p)
39733 return true;
39734
39735 /* Found a usable second shuffle. dfirst will be
39736 vperm2f128 on d->op0 and d->op1. */
39737 dsecond.testing_p = false;
39738 dfirst = *d;
39739 dfirst.target = gen_reg_rtx (d->vmode);
39740 for (i = 0; i < nelt; i++)
39741 dfirst.perm[i] = (i & (nelt2 - 1))
39742 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39743
39744 ok = expand_vec_perm_1 (&dfirst);
39745 gcc_assert (ok);
39746
39747 /* And dsecond is some single insn shuffle, taking
39748 d->op0 and result of vperm2f128 (if perm < 16) or
39749 d->op1 and result of vperm2f128 (otherwise). */
39750 dsecond.op1 = dfirst.target;
39751 if (perm >= 16)
39752 dsecond.op0 = dfirst.op1;
39753
39754 ok = expand_vec_perm_1 (&dsecond);
39755 gcc_assert (ok);
39756
39757 return true;
39758 }
39759
39760 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39761 if (d->one_operand_p)
39762 return false;
39763 }
39764
39765 return false;
39766 }
39767
39768 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39769 a two vector permutation using 2 intra-lane interleave insns
39770 and cross-lane shuffle for 32-byte vectors. */
39771
39772 static bool
39773 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39774 {
39775 unsigned i, nelt;
39776 rtx (*gen) (rtx, rtx, rtx);
39777
39778 if (d->one_operand_p)
39779 return false;
39780 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39781 ;
39782 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39783 ;
39784 else
39785 return false;
39786
39787 nelt = d->nelt;
39788 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39789 return false;
39790 for (i = 0; i < nelt; i += 2)
39791 if (d->perm[i] != d->perm[0] + i / 2
39792 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39793 return false;
39794
39795 if (d->testing_p)
39796 return true;
39797
39798 switch (d->vmode)
39799 {
39800 case V32QImode:
39801 if (d->perm[0])
39802 gen = gen_vec_interleave_highv32qi;
39803 else
39804 gen = gen_vec_interleave_lowv32qi;
39805 break;
39806 case V16HImode:
39807 if (d->perm[0])
39808 gen = gen_vec_interleave_highv16hi;
39809 else
39810 gen = gen_vec_interleave_lowv16hi;
39811 break;
39812 case V8SImode:
39813 if (d->perm[0])
39814 gen = gen_vec_interleave_highv8si;
39815 else
39816 gen = gen_vec_interleave_lowv8si;
39817 break;
39818 case V4DImode:
39819 if (d->perm[0])
39820 gen = gen_vec_interleave_highv4di;
39821 else
39822 gen = gen_vec_interleave_lowv4di;
39823 break;
39824 case V8SFmode:
39825 if (d->perm[0])
39826 gen = gen_vec_interleave_highv8sf;
39827 else
39828 gen = gen_vec_interleave_lowv8sf;
39829 break;
39830 case V4DFmode:
39831 if (d->perm[0])
39832 gen = gen_vec_interleave_highv4df;
39833 else
39834 gen = gen_vec_interleave_lowv4df;
39835 break;
39836 default:
39837 gcc_unreachable ();
39838 }
39839
39840 emit_insn (gen (d->target, d->op0, d->op1));
39841 return true;
39842 }
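/* E.g. for V8SImode the two accepted permutations are
   { 0, 8, 1, 9, 2, 10, 3, 11 } (interleave low) and
   { 4, 12, 5, 13, 6, 14, 7, 15 } (interleave high).  */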
39843
39844 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39845 a single vector permutation using a single intra-lane vector
39846 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39847 the non-swapped and swapped vectors together. */
39848
39849 static bool
39850 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39851 {
39852 struct expand_vec_perm_d dfirst, dsecond;
39853 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39854 rtx seq;
39855 bool ok;
39856 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39857
39858 if (!TARGET_AVX
39859 || TARGET_AVX2
39860 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39861 || !d->one_operand_p)
39862 return false;
39863
39864 dfirst = *d;
39865 for (i = 0; i < nelt; i++)
39866 dfirst.perm[i] = 0xff;
39867 for (i = 0, msk = 0; i < nelt; i++)
39868 {
39869 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39870 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39871 return false;
39872 dfirst.perm[j] = d->perm[i];
39873 if (j != i)
39874 msk |= (1 << i);
39875 }
39876 for (i = 0; i < nelt; i++)
39877 if (dfirst.perm[i] == 0xff)
39878 dfirst.perm[i] = i;
39879
39880 if (!d->testing_p)
39881 dfirst.target = gen_reg_rtx (dfirst.vmode);
39882
39883 start_sequence ();
39884 ok = expand_vec_perm_1 (&dfirst);
39885 seq = get_insns ();
39886 end_sequence ();
39887
39888 if (!ok)
39889 return false;
39890
39891 if (d->testing_p)
39892 return true;
39893
39894 emit_insn (seq);
39895
39896 dsecond = *d;
39897 dsecond.op0 = dfirst.target;
39898 dsecond.op1 = dfirst.target;
39899 dsecond.one_operand_p = true;
39900 dsecond.target = gen_reg_rtx (dsecond.vmode);
39901 for (i = 0; i < nelt; i++)
39902 dsecond.perm[i] = i ^ nelt2;
39903
39904 ok = expand_vec_perm_1 (&dsecond);
39905 gcc_assert (ok);
39906
39907 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39908 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39909 return true;
39910 }
39911
39912 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39913 permutation using two vperm2f128, followed by a vshufpd insn blending
39914 the two vectors together. */
39915
39916 static bool
39917 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39918 {
39919 struct expand_vec_perm_d dfirst, dsecond, dthird;
39920 bool ok;
39921
39922 if (!TARGET_AVX || (d->vmode != V4DFmode))
39923 return false;
39924
39925 if (d->testing_p)
39926 return true;
39927
39928 dfirst = *d;
39929 dsecond = *d;
39930 dthird = *d;
39931
39932 dfirst.perm[0] = (d->perm[0] & ~1);
39933 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39934 dfirst.perm[2] = (d->perm[2] & ~1);
39935 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39936 dsecond.perm[0] = (d->perm[1] & ~1);
39937 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39938 dsecond.perm[2] = (d->perm[3] & ~1);
39939 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39940 dthird.perm[0] = (d->perm[0] % 2);
39941 dthird.perm[1] = (d->perm[1] % 2) + 4;
39942 dthird.perm[2] = (d->perm[2] % 2) + 2;
39943 dthird.perm[3] = (d->perm[3] % 2) + 6;
39944
39945 dfirst.target = gen_reg_rtx (dfirst.vmode);
39946 dsecond.target = gen_reg_rtx (dsecond.vmode);
39947 dthird.op0 = dfirst.target;
39948 dthird.op1 = dsecond.target;
39949 dthird.one_operand_p = false;
39950
39951 canonicalize_perm (&dfirst);
39952 canonicalize_perm (&dsecond);
39953
39954 ok = expand_vec_perm_1 (&dfirst)
39955 && expand_vec_perm_1 (&dsecond)
39956 && expand_vec_perm_1 (&dthird);
39957
39958 gcc_assert (ok);
39959
39960 return true;
39961 }
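/* For example, the V4DFmode permutation { 1, 6, 3, 4 } is decomposed above
   into dfirst == { 0, 1, 2, 3 } and dsecond == { 6, 7, 4, 5 } (lane-granular
   rearrangements of the inputs) plus dthird == { 1, 4, 3, 6 }, which picks
   alternately from the two intermediate results in the way vshufpd does.  */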
39962
39963 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39964 permutation with two pshufb insns and an ior. We should have already
39965 failed all two instruction sequences. */
39966
39967 static bool
39968 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39969 {
39970 rtx rperm[2][16], vperm, l, h, op, m128;
39971 unsigned int i, nelt, eltsz;
39972
39973 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39974 return false;
39975 gcc_assert (!d->one_operand_p);
39976
39977 nelt = d->nelt;
39978 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39979
39980 /* Generate two permutation masks. If the required element is within
39981 the given vector it is shuffled into the proper lane. If the required
39982 element is in the other vector, force a zero into the lane by setting
39983 bit 7 in the permutation mask. */
39984 m128 = GEN_INT (-128);
39985 for (i = 0; i < nelt; ++i)
39986 {
39987 unsigned j, e = d->perm[i];
39988 unsigned which = (e >= nelt);
39989 if (e >= nelt)
39990 e -= nelt;
39991
39992 for (j = 0; j < eltsz; ++j)
39993 {
39994 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39995 rperm[1-which][i*eltsz + j] = m128;
39996 }
39997 }
39998
39999 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40000 vperm = force_reg (V16QImode, vperm);
40001
40002 l = gen_reg_rtx (V16QImode);
40003 op = gen_lowpart (V16QImode, d->op0);
40004 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40005
40006 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40007 vperm = force_reg (V16QImode, vperm);
40008
40009 h = gen_reg_rtx (V16QImode);
40010 op = gen_lowpart (V16QImode, d->op1);
40011 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40012
40013 op = gen_lowpart (V16QImode, d->target);
40014 emit_insn (gen_iorv16qi3 (op, l, h));
40015
40016 return true;
40017 }
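/* E.g. for an even extract from two V8HImode operands (perm
   { 0, 2, 4, 6, 8, 10, 12, 14 }) the first byte mask built above is
   { 0, 1, 4, 5, 8, 9, 12, 13, -128 x 8 } and the second is
   { -128 x 8, 0, 1, 4, 5, 8, 9, 12, 13 }; the ior of the two pshufb
   results is the desired vector.  */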
40018
40019 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
40020 with two vpshufb insns, vpermq and vpor. We should have already failed
40021 all two or three instruction sequences. */
40022
40023 static bool
40024 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40025 {
40026 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40027 unsigned int i, nelt, eltsz;
40028
40029 if (!TARGET_AVX2
40030 || !d->one_operand_p
40031 || (d->vmode != V32QImode && d->vmode != V16HImode))
40032 return false;
40033
40034 if (d->testing_p)
40035 return true;
40036
40037 nelt = d->nelt;
40038 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40039
40040 /* Generate two permutation masks. If the required element is within
40041 the same lane, it is shuffled in. If the required element is from the
40042 other lane, force a zero by setting bit 7 in the permutation mask.
40043 The other mask has non-negative entries wherever an element is
40044 requested from the other lane; those entries are also moved to the
40045 other lane, so that the result of vpshufb can have its two V2TImode
40046 halves swapped. */
40047 m128 = GEN_INT (-128);
40048 for (i = 0; i < nelt; ++i)
40049 {
40050 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40051 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40052
40053 for (j = 0; j < eltsz; ++j)
40054 {
40055 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
40056 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
40057 }
40058 }
40059
40060 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40061 vperm = force_reg (V32QImode, vperm);
40062
40063 h = gen_reg_rtx (V32QImode);
40064 op = gen_lowpart (V32QImode, d->op0);
40065 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40066
40067 /* Swap the 128-bit lanes of h into hp. */
40068 hp = gen_reg_rtx (V4DImode);
40069 op = gen_lowpart (V4DImode, h);
40070 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
40071 const1_rtx));
40072
40073 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40074 vperm = force_reg (V32QImode, vperm);
40075
40076 l = gen_reg_rtx (V32QImode);
40077 op = gen_lowpart (V32QImode, d->op0);
40078 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40079
40080 op = gen_lowpart (V32QImode, d->target);
40081 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
40082
40083 return true;
40084 }
40085
40086 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
40087 and extract-odd permutations of two V32QImode or V16HImode operands
40088 with two vpshufb insns, vpor and vpermq. We should have already
40089 failed all two or three instruction sequences. */
40090
40091 static bool
40092 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
40093 {
40094 rtx rperm[2][32], vperm, l, h, ior, op, m128;
40095 unsigned int i, nelt, eltsz;
40096
40097 if (!TARGET_AVX2
40098 || d->one_operand_p
40099 || (d->vmode != V32QImode && d->vmode != V16HImode))
40100 return false;
40101
40102 for (i = 0; i < d->nelt; ++i)
40103 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
40104 return false;
40105
40106 if (d->testing_p)
40107 return true;
40108
40109 nelt = d->nelt;
40110 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40111
40112 /* Generate two permutation masks. In the first permutation mask
40113 the first quarter will contain indexes for the first half
40114 of the op0, the second quarter will contain bit 7 set, third quarter
40115 will contain indexes for the second half of the op0 and the
40116 last quarter bit 7 set. In the second permutation mask
40117 the first quarter will contain bit 7 set, the second quarter
40118 indexes for the first half of the op1, the third quarter bit 7 set
40119 and last quarter indexes for the second half of the op1.
40120 I.e. the first mask e.g. for V32QImode extract even will be:
40121 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40122 (all values masked with 0xf except for -128) and second mask
40123 for extract even will be
40124 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40125 m128 = GEN_INT (-128);
40126 for (i = 0; i < nelt; ++i)
40127 {
40128 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40129 unsigned which = d->perm[i] >= nelt;
40130 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40131
40132 for (j = 0; j < eltsz; ++j)
40133 {
40134 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40135 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40136 }
40137 }
40138
40139 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40140 vperm = force_reg (V32QImode, vperm);
40141
40142 l = gen_reg_rtx (V32QImode);
40143 op = gen_lowpart (V32QImode, d->op0);
40144 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40145
40146 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40147 vperm = force_reg (V32QImode, vperm);
40148
40149 h = gen_reg_rtx (V32QImode);
40150 op = gen_lowpart (V32QImode, d->op1);
40151 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40152
40153 ior = gen_reg_rtx (V32QImode);
40154 emit_insn (gen_iorv32qi3 (ior, l, h));
40155
40156 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40157 op = gen_lowpart (V4DImode, d->target);
40158 ior = gen_lowpart (V4DImode, ior);
40159 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40160 const1_rtx, GEN_INT (3)));
40161
40162 return true;
40163 }
40164
40165 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40166 and extract-odd permutations. */
40167
40168 static bool
40169 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40170 {
40171 rtx t1, t2, t3;
40172
40173 switch (d->vmode)
40174 {
40175 case V4DFmode:
40176 t1 = gen_reg_rtx (V4DFmode);
40177 t2 = gen_reg_rtx (V4DFmode);
40178
40179 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40180 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40181 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40182
40183 /* Now an unpck[lh]pd will produce the result required. */
40184 if (odd)
40185 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40186 else
40187 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40188 emit_insn (t3);
40189 break;
40190
40191 case V8SFmode:
40192 {
40193 int mask = odd ? 0xdd : 0x88;
40194
40195 t1 = gen_reg_rtx (V8SFmode);
40196 t2 = gen_reg_rtx (V8SFmode);
40197 t3 = gen_reg_rtx (V8SFmode);
40198
40199 /* Shuffle within the 128-bit lanes to produce:
40200 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40201 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40202 GEN_INT (mask)));
40203
40204 /* Shuffle the lanes around to produce:
40205 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40206 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40207 GEN_INT (0x3)));
40208
40209 /* Shuffle within the 128-bit lanes to produce:
40210 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40211 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40212
40213 /* Shuffle within the 128-bit lanes to produce:
40214 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40215 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40216
40217 /* Shuffle the lanes around to produce:
40218 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40219 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40220 GEN_INT (0x20)));
40221 }
40222 break;
40223
40224 case V2DFmode:
40225 case V4SFmode:
40226 case V2DImode:
40227 case V4SImode:
40228 /* These are always directly implementable by expand_vec_perm_1. */
40229 gcc_unreachable ();
40230
40231 case V8HImode:
40232 if (TARGET_SSSE3)
40233 return expand_vec_perm_pshufb2 (d);
40234 else
40235 {
40236 /* We need 2*log2(N)-1 operations to achieve odd/even
40237 with interleave. */
40238 t1 = gen_reg_rtx (V8HImode);
40239 t2 = gen_reg_rtx (V8HImode);
40240 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40241 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40242 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40243 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40244 if (odd)
40245 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40246 else
40247 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40248 emit_insn (t3);
40249 }
40250 break;
40251
40252 case V16QImode:
40253 if (TARGET_SSSE3)
40254 return expand_vec_perm_pshufb2 (d);
40255 else
40256 {
40257 t1 = gen_reg_rtx (V16QImode);
40258 t2 = gen_reg_rtx (V16QImode);
40259 t3 = gen_reg_rtx (V16QImode);
40260 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40261 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40262 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40263 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40264 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40265 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40266 if (odd)
40267 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40268 else
40269 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40270 emit_insn (t3);
40271 }
40272 break;
40273
40274 case V16HImode:
40275 case V32QImode:
40276 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40277
40278 case V4DImode:
40279 if (!TARGET_AVX2)
40280 {
40281 struct expand_vec_perm_d d_copy = *d;
40282 d_copy.vmode = V4DFmode;
40283 d_copy.target = gen_lowpart (V4DFmode, d->target);
40284 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40285 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40286 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40287 }
40288
40289 t1 = gen_reg_rtx (V4DImode);
40290 t2 = gen_reg_rtx (V4DImode);
40291
40292 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40293 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40294 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40295
40296 /* Now a vpunpck[lh]qdq will produce the result required. */
40297 if (odd)
40298 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40299 else
40300 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40301 emit_insn (t3);
40302 break;
40303
40304 case V8SImode:
40305 if (!TARGET_AVX2)
40306 {
40307 struct expand_vec_perm_d d_copy = *d;
40308 d_copy.vmode = V8SFmode;
40309 d_copy.target = gen_lowpart (V8SFmode, d->target);
40310 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40311 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40312 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40313 }
40314
40315 t1 = gen_reg_rtx (V8SImode);
40316 t2 = gen_reg_rtx (V8SImode);
40317
40318 /* Shuffle the lanes around into
40319 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40320 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40321 gen_lowpart (V4DImode, d->op0),
40322 gen_lowpart (V4DImode, d->op1),
40323 GEN_INT (0x20)));
40324 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40325 gen_lowpart (V4DImode, d->op0),
40326 gen_lowpart (V4DImode, d->op1),
40327 GEN_INT (0x31)));
40328
40329 /* Swap the 2nd and 3rd position in each lane into
40330 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40331 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40332 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40333 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40334 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
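  /* 2 * 4 + 1 * 16 + 3 * 64 == 0xd8, the pshufd selector for the per-lane
     element order { 0, 2, 1, 3 } (two bits per destination element, low
     element in the low bits).  */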
40335
40336 /* Now a vpunpck[lh]qdq will produce
40337 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40338 if (odd)
40339 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40340 gen_lowpart (V4DImode, t1),
40341 gen_lowpart (V4DImode, t2));
40342 else
40343 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40344 gen_lowpart (V4DImode, t1),
40345 gen_lowpart (V4DImode, t2));
40346 emit_insn (t3);
40347 break;
40348
40349 default:
40350 gcc_unreachable ();
40351 }
40352
40353 return true;
40354 }
40355
40356 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40357 extract-even and extract-odd permutations. */
40358
40359 static bool
40360 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40361 {
40362 unsigned i, odd, nelt = d->nelt;
40363
40364 odd = d->perm[0];
40365 if (odd != 0 && odd != 1)
40366 return false;
40367
40368 for (i = 1; i < nelt; ++i)
40369 if (d->perm[i] != 2 * i + odd)
40370 return false;
40371
40372 return expand_vec_perm_even_odd_1 (d, odd);
40373 }
40374
40375 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40376 permutations. We assume that expand_vec_perm_1 has already failed. */
40377
40378 static bool
40379 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40380 {
40381 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40382 enum machine_mode vmode = d->vmode;
40383 unsigned char perm2[4];
40384 rtx op0 = d->op0;
40385 bool ok;
40386
40387 switch (vmode)
40388 {
40389 case V4DFmode:
40390 case V8SFmode:
40391 /* These are special-cased in sse.md so that we can optionally
40392 use the vbroadcast instruction. They expand to two insns
40393 if the input happens to be in a register. */
40394 gcc_unreachable ();
40395
40396 case V2DFmode:
40397 case V2DImode:
40398 case V4SFmode:
40399 case V4SImode:
40400 /* These are always implementable using standard shuffle patterns. */
40401 gcc_unreachable ();
40402
40403 case V8HImode:
40404 case V16QImode:
40405 /* These can be implemented via interleave. We save one insn by
40406 stopping once we have promoted to V4SImode and then use pshufd. */
40407 do
40408 {
40409 rtx dest;
40410 rtx (*gen) (rtx, rtx, rtx)
40411 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40412 : gen_vec_interleave_lowv8hi;
40413
40414 if (elt >= nelt2)
40415 {
40416 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40417 : gen_vec_interleave_highv8hi;
40418 elt -= nelt2;
40419 }
40420 nelt2 /= 2;
40421
40422 dest = gen_reg_rtx (vmode);
40423 emit_insn (gen (dest, op0, op0));
40424 vmode = get_mode_wider_vector (vmode);
40425 op0 = gen_lowpart (vmode, dest);
40426 }
40427 while (vmode != V4SImode);
40428
40429 memset (perm2, elt, 4);
40430 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40431 d->testing_p);
40432 gcc_assert (ok);
40433 return true;
40434
40435 case V32QImode:
40436 case V16HImode:
40437 case V8SImode:
40438 case V4DImode:
40439 /* For AVX2 broadcasts of the first element vpbroadcast* or
40440 vpermq should be used by expand_vec_perm_1. */
40441 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40442 return false;
40443
40444 default:
40445 gcc_unreachable ();
40446 }
40447 }
40448
40449 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40450 broadcast permutations. */
40451
40452 static bool
40453 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40454 {
40455 unsigned i, elt, nelt = d->nelt;
40456
40457 if (!d->one_operand_p)
40458 return false;
40459
40460 elt = d->perm[0];
40461 for (i = 1; i < nelt; ++i)
40462 if (d->perm[i] != elt)
40463 return false;
40464
40465 return expand_vec_perm_broadcast_1 (d);
40466 }
40467
40468 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
40469 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40470 all the shorter instruction sequences. */
40471
40472 static bool
40473 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40474 {
40475 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40476 unsigned int i, nelt, eltsz;
40477 bool used[4];
40478
40479 if (!TARGET_AVX2
40480 || d->one_operand_p
40481 || (d->vmode != V32QImode && d->vmode != V16HImode))
40482 return false;
40483
40484 if (d->testing_p)
40485 return true;
40486
40487 nelt = d->nelt;
40488 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40489
40490 /* Generate 4 permutation masks. If the required element is within
40491 the same lane, it is shuffled in. If the required element is from the
40492 other lane, force a zero by setting bit 7 in the permutation mask.
40493 The other mask has non-negative elements when the element is requested
40494 from the other lane but is also moved to the other lane, so that the
40495 result of vpshufb can have its two V2TImode halves swapped. */
40497 m128 = GEN_INT (-128);
40498 for (i = 0; i < 32; ++i)
40499 {
40500 rperm[0][i] = m128;
40501 rperm[1][i] = m128;
40502 rperm[2][i] = m128;
40503 rperm[3][i] = m128;
40504 }
40505 used[0] = false;
40506 used[1] = false;
40507 used[2] = false;
40508 used[3] = false;
40509 for (i = 0; i < nelt; ++i)
40510 {
40511 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40512 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40513 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
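      /* Bit 1 of WHICH selects the source operand (op1 vs. op0); bit 0 is
	 set when the element has to cross the 128-bit lane boundary.  */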
40514
40515 for (j = 0; j < eltsz; ++j)
40516 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40517 used[which] = true;
40518 }
40519
40520 for (i = 0; i < 2; ++i)
40521 {
40522 if (!used[2 * i + 1])
40523 {
40524 h[i] = NULL_RTX;
40525 continue;
40526 }
40527 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40528 gen_rtvec_v (32, rperm[2 * i + 1]));
40529 vperm = force_reg (V32QImode, vperm);
40530 h[i] = gen_reg_rtx (V32QImode);
40531 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40532 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40533 }
40534
40535 /* Swap the 128-bit lanes of h[X]. */
40536 for (i = 0; i < 2; ++i)
40537 {
40538 if (h[i] == NULL_RTX)
40539 continue;
40540 op = gen_reg_rtx (V4DImode);
40541 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40542 const2_rtx, GEN_INT (3), const0_rtx,
40543 const1_rtx));
40544 h[i] = gen_lowpart (V32QImode, op);
40545 }
40546
40547 for (i = 0; i < 2; ++i)
40548 {
40549 if (!used[2 * i])
40550 {
40551 l[i] = NULL_RTX;
40552 continue;
40553 }
40554 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40555 vperm = force_reg (V32QImode, vperm);
40556 l[i] = gen_reg_rtx (V32QImode);
40557 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40558 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40559 }
40560
40561 for (i = 0; i < 2; ++i)
40562 {
40563 if (h[i] && l[i])
40564 {
40565 op = gen_reg_rtx (V32QImode);
40566 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40567 l[i] = op;
40568 }
40569 else if (h[i])
40570 l[i] = h[i];
40571 }
40572
40573 gcc_assert (l[0] && l[1]);
40574 op = gen_lowpart (V32QImode, d->target);
40575 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40576 return true;
40577 }
40578
40579 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40580 With all of the interface bits taken care of, perform the expansion
40581 in D and return true on success. */
40582
40583 static bool
40584 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40585 {
40586 /* Try a single instruction expansion. */
40587 if (expand_vec_perm_1 (d))
40588 return true;
40589
40590 /* Try sequences of two instructions. */
40591
40592 if (expand_vec_perm_pshuflw_pshufhw (d))
40593 return true;
40594
40595 if (expand_vec_perm_palignr (d))
40596 return true;
40597
40598 if (expand_vec_perm_interleave2 (d))
40599 return true;
40600
40601 if (expand_vec_perm_broadcast (d))
40602 return true;
40603
40604 if (expand_vec_perm_vpermq_perm_1 (d))
40605 return true;
40606
40607 if (expand_vec_perm_vperm2f128 (d))
40608 return true;
40609
40610 /* Try sequences of three instructions. */
40611
40612 if (expand_vec_perm_2vperm2f128_vshuf (d))
40613 return true;
40614
40615 if (expand_vec_perm_pshufb2 (d))
40616 return true;
40617
40618 if (expand_vec_perm_interleave3 (d))
40619 return true;
40620
40621 if (expand_vec_perm_vperm2f128_vblend (d))
40622 return true;
40623
40624 /* Try sequences of four instructions. */
40625
40626 if (expand_vec_perm_vpshufb2_vpermq (d))
40627 return true;
40628
40629 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40630 return true;
40631
40632 /* ??? Look for narrow permutations whose element orderings would
40633 allow the promotion to a wider mode. */
40634
40635 /* ??? Look for sequences of interleave or a wider permute that place
40636 the data into the correct lanes for a half-vector shuffle like
40637 pshuf[lh]w or vpermilps. */
40638
40639 /* ??? Look for sequences of interleave that produce the desired results.
40640 The combinatorics of punpck[lh] get pretty ugly... */
40641
40642 if (expand_vec_perm_even_odd (d))
40643 return true;
40644
40645 /* Even longer sequences. */
40646 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40647 return true;
40648
40649 return false;
40650 }
40651
40652 /* If a permutation only uses one operand, make it clear. Returns true
40653 if the permutation references both operands. */
40654
40655 static bool
40656 canonicalize_perm (struct expand_vec_perm_d *d)
40657 {
40658 int i, which, nelt = d->nelt;
40659
40660 for (i = which = 0; i < nelt; ++i)
40661 which |= (d->perm[i] < nelt ? 1 : 2);
40662
40663 d->one_operand_p = true;
40664 switch (which)
40665 {
40666 default:
40667 gcc_unreachable ();
40668
40669 case 3:
40670 if (!rtx_equal_p (d->op0, d->op1))
40671 {
40672 d->one_operand_p = false;
40673 break;
40674 }
40675 /* The elements of PERM do not suggest that only the first operand
40676 is used, but both operands are identical. Allow easier matching
40677 of the permutation by folding the permutation into the single
40678 input vector. */
40679 /* FALLTHRU */
40680
40681 case 2:
40682 for (i = 0; i < nelt; ++i)
40683 d->perm[i] &= nelt - 1;
40684 d->op0 = d->op1;
40685 break;
40686
40687 case 1:
40688 d->op1 = d->op0;
40689 break;
40690 }
40691
40692 return (which == 3);
40693 }
40694
40695 bool
40696 ix86_expand_vec_perm_const (rtx operands[4])
40697 {
40698 struct expand_vec_perm_d d;
40699 unsigned char perm[MAX_VECT_LEN];
40700 int i, nelt;
40701 bool two_args;
40702 rtx sel;
40703
40704 d.target = operands[0];
40705 d.op0 = operands[1];
40706 d.op1 = operands[2];
40707 sel = operands[3];
40708
40709 d.vmode = GET_MODE (d.target);
40710 gcc_assert (VECTOR_MODE_P (d.vmode));
40711 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40712 d.testing_p = false;
40713
40714 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40715 gcc_assert (XVECLEN (sel, 0) == nelt);
40716 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40717
40718 for (i = 0; i < nelt; ++i)
40719 {
40720 rtx e = XVECEXP (sel, 0, i);
40721 int ei = INTVAL (e) & (2 * nelt - 1);
40722 d.perm[i] = ei;
40723 perm[i] = ei;
40724 }
40725
40726 two_args = canonicalize_perm (&d);
40727
40728 if (ix86_expand_vec_perm_const_1 (&d))
40729 return true;
40730
40731 /* If the selector says both arguments are needed, but the operands are the
40732 same, the above tried to expand with one_operand_p and flattened selector.
40733 If that didn't work, retry without one_operand_p; we succeeded with that
40734 during testing. */
40735 if (two_args && d.one_operand_p)
40736 {
40737 d.one_operand_p = false;
40738 memcpy (d.perm, perm, sizeof (perm));
40739 return ix86_expand_vec_perm_const_1 (&d);
40740 }
40741
40742 return false;
40743 }
40744
40745 /* Implement targetm.vectorize.vec_perm_const_ok. */
40746
40747 static bool
40748 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40749 const unsigned char *sel)
40750 {
40751 struct expand_vec_perm_d d;
40752 unsigned int i, nelt, which;
40753 bool ret;
40754
40755 d.vmode = vmode;
40756 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40757 d.testing_p = true;
40758
40759 /* Given sufficient ISA support we can just return true here
40760 for selected vector modes. */
40761 if (GET_MODE_SIZE (d.vmode) == 16)
40762 {
40763 /* All implementable with a single vpperm insn. */
40764 if (TARGET_XOP)
40765 return true;
40766 /* All implementable with 2 pshufb + 1 ior. */
40767 if (TARGET_SSSE3)
40768 return true;
40769 /* All implementable with shufpd or unpck[lh]pd. */
40770 if (d.nelt == 2)
40771 return true;
40772 }
40773
40774 /* Extract the values from the vector CST into the permutation
40775 array in D. */
40776 memcpy (d.perm, sel, nelt);
40777 for (i = which = 0; i < nelt; ++i)
40778 {
40779 unsigned char e = d.perm[i];
40780 gcc_assert (e < 2 * nelt);
40781 which |= (e < nelt ? 1 : 2);
40782 }
40783
40784 /* If all elements are from the second vector, fold them to the first. */
40785 if (which == 2)
40786 for (i = 0; i < nelt; ++i)
40787 d.perm[i] -= nelt;
40788
40789 /* Check whether the mask can be applied to the vector type. */
40790 d.one_operand_p = (which != 3);
40791
40792 /* Implementable with shufps or pshufd. */
40793 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40794 return true;
40795
40796 /* Otherwise we have to go through the motions and see if we can
40797 figure out how to generate the requested permutation. */
40798 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40799 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40800 if (!d.one_operand_p)
40801 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40802
40803 start_sequence ();
40804 ret = ix86_expand_vec_perm_const_1 (&d);
40805 end_sequence ();
40806
40807 return ret;
40808 }
40809
40810 void
40811 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40812 {
40813 struct expand_vec_perm_d d;
40814 unsigned i, nelt;
40815
40816 d.target = targ;
40817 d.op0 = op0;
40818 d.op1 = op1;
40819 d.vmode = GET_MODE (targ);
40820 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40821 d.one_operand_p = false;
40822 d.testing_p = false;
40823
40824 for (i = 0; i < nelt; ++i)
40825 d.perm[i] = i * 2 + odd;
40826
40827 /* We'll either be able to implement the permutation directly... */
40828 if (expand_vec_perm_1 (&d))
40829 return;
40830
40831 /* ... or we use the special-case patterns. */
40832 expand_vec_perm_even_odd_1 (&d, odd);
40833 }
40834
40835 static void
40836 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40837 {
40838 struct expand_vec_perm_d d;
40839 unsigned i, nelt, base;
40840 bool ok;
40841
40842 d.target = targ;
40843 d.op0 = op0;
40844 d.op1 = op1;
40845 d.vmode = GET_MODE (targ);
40846 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40847 d.one_operand_p = false;
40848 d.testing_p = false;
40849
40850 base = high_p ? nelt / 2 : 0;
40851 for (i = 0; i < nelt / 2; ++i)
40852 {
40853 d.perm[i * 2] = i + base;
40854 d.perm[i * 2 + 1] = i + base + nelt;
40855 }
40856
40857 /* Note that for AVX this isn't one instruction. */
40858 ok = ix86_expand_vec_perm_const_1 (&d);
40859 gcc_assert (ok);
40860 }
40861
40862
40863 /* Expand a vector operation CODE for a V*QImode in terms of the
40864 same operation on V*HImode. */
40865
40866 void
40867 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40868 {
40869 enum machine_mode qimode = GET_MODE (dest);
40870 enum machine_mode himode;
40871 rtx (*gen_il) (rtx, rtx, rtx);
40872 rtx (*gen_ih) (rtx, rtx, rtx);
40873 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40874 struct expand_vec_perm_d d;
40875 bool ok, full_interleave;
40876 bool uns_p = false;
40877 int i;
40878
40879 switch (qimode)
40880 {
40881 case V16QImode:
40882 himode = V8HImode;
40883 gen_il = gen_vec_interleave_lowv16qi;
40884 gen_ih = gen_vec_interleave_highv16qi;
40885 break;
40886 case V32QImode:
40887 himode = V16HImode;
40888 gen_il = gen_avx2_interleave_lowv32qi;
40889 gen_ih = gen_avx2_interleave_highv32qi;
40890 break;
40891 default:
40892 gcc_unreachable ();
40893 }
40894
40895 op2_l = op2_h = op2;
40896 switch (code)
40897 {
40898 case MULT:
40899 /* Unpack data such that we've got a source byte in each low byte of
40900 each word. We don't care what goes into the high byte of each word.
40901 Rather than trying to get zero in there, most convenient is to let
40902 it be a copy of the low byte. */
40903 op2_l = gen_reg_rtx (qimode);
40904 op2_h = gen_reg_rtx (qimode);
40905 emit_insn (gen_il (op2_l, op2, op2));
40906 emit_insn (gen_ih (op2_h, op2, op2));
40907 /* FALLTHRU */
40908
40909 op1_l = gen_reg_rtx (qimode);
40910 op1_h = gen_reg_rtx (qimode);
40911 emit_insn (gen_il (op1_l, op1, op1));
40912 emit_insn (gen_ih (op1_h, op1, op1));
40913 full_interleave = qimode == V16QImode;
40914 break;
40915
40916 case ASHIFT:
40917 case LSHIFTRT:
40918 uns_p = true;
40919 /* FALLTHRU */
40920 case ASHIFTRT:
40921 op1_l = gen_reg_rtx (himode);
40922 op1_h = gen_reg_rtx (himode);
40923 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40924 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40925 full_interleave = true;
40926 break;
40927 default:
40928 gcc_unreachable ();
40929 }
40930
40931 /* Perform the operation. */
40932 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40933 1, OPTAB_DIRECT);
40934 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40935 1, OPTAB_DIRECT);
40936 gcc_assert (res_l && res_h);
40937
40938 /* Merge the data back into the right place. */
40939 d.target = dest;
40940 d.op0 = gen_lowpart (qimode, res_l);
40941 d.op1 = gen_lowpart (qimode, res_h);
40942 d.vmode = qimode;
40943 d.nelt = GET_MODE_NUNITS (qimode);
40944 d.one_operand_p = false;
40945 d.testing_p = false;
40946
40947 if (full_interleave)
40948 {
40949 /* For SSE2, we used a full interleave, so the desired
40950 results are in the even elements. */
40951 for (i = 0; i < 32; ++i)
40952 d.perm[i] = i * 2;
40953 }
40954 else
40955 {
40956 /* For AVX, the interleave used above was not cross-lane. So the
40957 extraction is of the even elements, with the second and third quarters
40958 swapped. Happily, that is even one insn shorter than even extraction. */
40959 for (i = 0; i < 32; ++i)
40960 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
40961 }
40962
40963 ok = ix86_expand_vec_perm_const_1 (&d);
40964 gcc_assert (ok);
40965
40966 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40967 gen_rtx_fmt_ee (code, qimode, op1, op2));
40968 }
40969
40970 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
40971 if op is CONST_VECTOR with all odd elements equal to their
40972 preceding element. */
40973
40974 static bool
40975 const_vector_equal_evenodd_p (rtx op)
40976 {
40977 enum machine_mode mode = GET_MODE (op);
40978 int i, nunits = GET_MODE_NUNITS (mode);
40979 if (GET_CODE (op) != CONST_VECTOR
40980 || nunits != CONST_VECTOR_NUNITS (op))
40981 return false;
40982 for (i = 0; i < nunits; i += 2)
40983 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
40984 return false;
40985 return true;
40986 }
40987
40988 void
40989 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40990 bool uns_p, bool odd_p)
40991 {
40992 enum machine_mode mode = GET_MODE (op1);
40993 enum machine_mode wmode = GET_MODE (dest);
40994 rtx x;
40995 rtx orig_op1 = op1, orig_op2 = op2;
40996
40997 if (!nonimmediate_operand (op1, mode))
40998 op1 = force_reg (mode, op1);
40999 if (!nonimmediate_operand (op2, mode))
41000 op2 = force_reg (mode, op2);
41001
41002 /* We only play even/odd games with vectors of SImode. */
41003 gcc_assert (mode == V4SImode || mode == V8SImode);
41004
41005 /* If we're looking for the odd results, shift those members down to
41006 the even slots. For some cpus this is faster than a PSHUFD. */
41007 if (odd_p)
41008 {
41009 /* For XOP use vpmacsdqh, but only for smult, as it is only
41010 signed. */
41011 if (TARGET_XOP && mode == V4SImode && !uns_p)
41012 {
41013 x = force_reg (wmode, CONST0_RTX (wmode));
41014 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41015 return;
41016 }
41017
41018 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
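      /* The unit size of V4SImode/V8SImode is 32 bits, so each DImode half
	 of the wide mode is shifted right by one SImode element, moving the
	 odd elements into the even positions.  */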
41019 if (!const_vector_equal_evenodd_p (orig_op1))
41020 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
41021 x, NULL, 1, OPTAB_DIRECT);
41022 if (!const_vector_equal_evenodd_p (orig_op2))
41023 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
41024 x, NULL, 1, OPTAB_DIRECT);
41025 op1 = gen_lowpart (mode, op1);
41026 op2 = gen_lowpart (mode, op2);
41027 }
41028
41029 if (mode == V8SImode)
41030 {
41031 if (uns_p)
41032 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
41033 else
41034 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
41035 }
41036 else if (uns_p)
41037 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
41038 else if (TARGET_SSE4_1)
41039 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
41040 else
41041 {
41042 rtx s1, s2, t0, t1, t2;
41043
41044 /* The easiest way to implement this without PMULDQ is to go through
41045 the motions as if we are performing a full 64-bit multiply, except
41046 that we need to do less shuffling of the elements. */
41047
41048 /* Compute the sign-extension, aka highparts, of the two operands. */
41049 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41050 op1, pc_rtx, pc_rtx);
41051 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41052 op2, pc_rtx, pc_rtx);
41053
41054 /* Multiply LO(A) * HI(B), and vice-versa. */
41055 t1 = gen_reg_rtx (wmode);
41056 t2 = gen_reg_rtx (wmode);
41057 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
41058 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
41059
41060 /* Multiply LO(A) * LO(B). */
41061 t0 = gen_reg_rtx (wmode);
41062 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
41063
41064 /* Combine and shift the highparts into place. */
41065 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
41066 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
41067 1, OPTAB_DIRECT);
41068
41069 /* Combine high and low parts. */
41070 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
41071 return;
41072 }
41073 emit_insn (x);
41074 }
41075
41076 void
41077 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
41078 bool uns_p, bool high_p)
41079 {
41080 enum machine_mode wmode = GET_MODE (dest);
41081 enum machine_mode mode = GET_MODE (op1);
41082 rtx t1, t2, t3, t4, mask;
41083
41084 switch (mode)
41085 {
41086 case V4SImode:
41087 t1 = gen_reg_rtx (mode);
41088 t2 = gen_reg_rtx (mode);
41089 if (TARGET_XOP && !uns_p)
41090 {
41091 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
41092 shuffle the elements once so that all elements are in the right
41093 place for immediate use: { A C B D }. */
41094 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
41095 const1_rtx, GEN_INT (3)));
41096 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
41097 const1_rtx, GEN_INT (3)));
41098 }
41099 else
41100 {
41101 /* Put the elements into place for the multiply. */
41102 ix86_expand_vec_interleave (t1, op1, op1, high_p);
41103 ix86_expand_vec_interleave (t2, op2, op2, high_p);
41104 high_p = false;
41105 }
41106 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
41107 break;
41108
41109 case V8SImode:
41110 /* Shuffle the elements between the lanes. After this we
41111 have { A B E F | C D G H } for each operand. */
41112 t1 = gen_reg_rtx (V4DImode);
41113 t2 = gen_reg_rtx (V4DImode);
41114 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
41115 const0_rtx, const2_rtx,
41116 const1_rtx, GEN_INT (3)));
41117 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
41118 const0_rtx, const2_rtx,
41119 const1_rtx, GEN_INT (3)));
41120
41121 /* Shuffle the elements within the lanes. After this we
41122 have { A A B B | C C D D } or { E E F F | G G H H }. */
41123 t3 = gen_reg_rtx (V8SImode);
41124 t4 = gen_reg_rtx (V8SImode);
41125 mask = GEN_INT (high_p
41126 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41127 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
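      /* As pshufd selectors these are { 2, 2, 3, 3 } for the high half and
	 { 0, 0, 1, 1 } for the low half, duplicating each element within its
	 128-bit lane.  */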
41128 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41129 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41130
41131 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41132 break;
41133
41134 case V8HImode:
41135 case V16HImode:
41136 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41137 uns_p, OPTAB_DIRECT);
41138 t2 = expand_binop (mode,
41139 uns_p ? umul_highpart_optab : smul_highpart_optab,
41140 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41141 gcc_assert (t1 && t2);
41142
41143 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41144 break;
41145
41146 case V16QImode:
41147 case V32QImode:
41148 t1 = gen_reg_rtx (wmode);
41149 t2 = gen_reg_rtx (wmode);
41150 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41151 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41152
41153 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41154 break;
41155
41156 default:
41157 gcc_unreachable ();
41158 }
41159 }
41160
41161 void
41162 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41163 {
41164 rtx res_1, res_2;
41165
41166 res_1 = gen_reg_rtx (V4SImode);
41167 res_2 = gen_reg_rtx (V4SImode);
41168 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41169 op1, op2, true, false);
41170 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41171 op1, op2, true, true);
41172
41173 /* Move the results in element 2 down to element 1; we don't care
41174 what goes in elements 2 and 3. Then we can merge the parts
41175 back together with an interleave.
41176
41177 Note that two other sequences were tried:
41178 (1) Use interleaves at the start instead of psrldq, which allows
41179 us to use a single shufps to merge things back at the end.
41180 (2) Use shufps here to combine the two vectors, then pshufd to
41181 put the elements in the correct order.
41182 In both cases the cost of the reformatting stall was too high
41183 and the overall sequence slower. */
41184
41185 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41186 const0_rtx, const0_rtx));
41187 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41188 const0_rtx, const0_rtx));
41189 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41190
41191 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41192 }
41193
41194 void
41195 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41196 {
41197 enum machine_mode mode = GET_MODE (op0);
41198 rtx t1, t2, t3, t4, t5, t6;
41199
41200 if (TARGET_XOP && mode == V2DImode)
41201 {
41202 /* op1: A,B,C,D, op2: E,F,G,H */
41203 op1 = gen_lowpart (V4SImode, op1);
41204 op2 = gen_lowpart (V4SImode, op2);
41205
41206 t1 = gen_reg_rtx (V4SImode);
41207 t2 = gen_reg_rtx (V4SImode);
41208 t3 = gen_reg_rtx (V2DImode);
41209 t4 = gen_reg_rtx (V2DImode);
41210
41211 /* t1: B,A,D,C */
41212 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41213 GEN_INT (1),
41214 GEN_INT (0),
41215 GEN_INT (3),
41216 GEN_INT (2)));
41217
41218 /* t2: (B*E),(A*F),(D*G),(C*H) */
41219 emit_insn (gen_mulv4si3 (t2, t1, op2));
41220
41221 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41222 emit_insn (gen_xop_phadddq (t3, t2));
41223
41224 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41225 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41226
41227 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41228 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41229 }
41230 else
41231 {
41232 enum machine_mode nmode;
41233 rtx (*umul) (rtx, rtx, rtx);
41234
41235 if (mode == V2DImode)
41236 {
41237 umul = gen_vec_widen_umult_even_v4si;
41238 nmode = V4SImode;
41239 }
41240 else if (mode == V4DImode)
41241 {
41242 umul = gen_vec_widen_umult_even_v8si;
41243 nmode = V8SImode;
41244 }
41245 else
41246 gcc_unreachable ();
41247
41248
41249 /* Multiply low parts. */
41250 t1 = gen_reg_rtx (mode);
41251 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41252
41253 /* Shift input vectors right 32 bits so we can multiply high parts. */
41254 t6 = GEN_INT (32);
41255 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41256 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41257
41258 /* Multiply high parts by low parts. */
41259 t4 = gen_reg_rtx (mode);
41260 t5 = gen_reg_rtx (mode);
41261 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41262 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41263
41264 /* Combine and shift the highparts back. */
41265 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41266 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41267
41268 /* Combine high and low parts. */
41269 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41270 }
41271
41272 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41273 gen_rtx_MULT (mode, op1, op2));
41274 }
41275
41276 /* Expand an insert into a vector register through pinsr insn.
41277 Return true if successful. */
41278
41279 bool
41280 ix86_expand_pinsr (rtx *operands)
41281 {
41282 rtx dst = operands[0];
41283 rtx src = operands[3];
41284
41285 unsigned int size = INTVAL (operands[1]);
41286 unsigned int pos = INTVAL (operands[2]);
41287
41288 if (GET_CODE (dst) == SUBREG)
41289 {
41290 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41291 dst = SUBREG_REG (dst);
41292 }
41293
41294 if (GET_CODE (src) == SUBREG)
41295 src = SUBREG_REG (src);
41296
41297 switch (GET_MODE (dst))
41298 {
41299 case V16QImode:
41300 case V8HImode:
41301 case V4SImode:
41302 case V2DImode:
41303 {
41304 enum machine_mode srcmode, dstmode;
41305 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41306
41307 srcmode = mode_for_size (size, MODE_INT, 0);
41308
41309 switch (srcmode)
41310 {
41311 case QImode:
41312 if (!TARGET_SSE4_1)
41313 return false;
41314 dstmode = V16QImode;
41315 pinsr = gen_sse4_1_pinsrb;
41316 break;
41317
41318 case HImode:
41319 if (!TARGET_SSE2)
41320 return false;
41321 dstmode = V8HImode;
41322 pinsr = gen_sse2_pinsrw;
41323 break;
41324
41325 case SImode:
41326 if (!TARGET_SSE4_1)
41327 return false;
41328 dstmode = V4SImode;
41329 pinsr = gen_sse4_1_pinsrd;
41330 break;
41331
41332 case DImode:
41333 gcc_assert (TARGET_64BIT);
41334 if (!TARGET_SSE4_1)
41335 return false;
41336 dstmode = V2DImode;
41337 pinsr = gen_sse4_1_pinsrq;
41338 break;
41339
41340 default:
41341 return false;
41342 }
41343
41344 dst = gen_lowpart (dstmode, dst);
41345 src = gen_lowpart (srcmode, src);
41346
41347 pos /= size;
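	/* POS was a bit offset; convert it to an element index in DSTMODE.  */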
41348
41349 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41350 return true;
41351 }
41352
41353 default:
41354 return false;
41355 }
41356 }
41357 \f
41358 /* This function returns the calling abi specific va_list type node.
41359 It returns the FNDECL specific va_list type. */
41360
41361 static tree
41362 ix86_fn_abi_va_list (tree fndecl)
41363 {
41364 if (!TARGET_64BIT)
41365 return va_list_type_node;
41366 gcc_assert (fndecl != NULL_TREE);
41367
41368 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41369 return ms_va_list_type_node;
41370 else
41371 return sysv_va_list_type_node;
41372 }
41373
41374 /* Returns the canonical va_list type specified by TYPE. If there
41375 is no valid TYPE provided, it returns NULL_TREE. */
41376
41377 static tree
41378 ix86_canonical_va_list_type (tree type)
41379 {
41380 tree wtype, htype;
41381
41382 /* Resolve references and pointers to va_list type. */
41383 if (TREE_CODE (type) == MEM_REF)
41384 type = TREE_TYPE (type);
41385 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41386 type = TREE_TYPE (type);
41387 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41388 type = TREE_TYPE (type);
41389
41390 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41391 {
41392 wtype = va_list_type_node;
41393 gcc_assert (wtype != NULL_TREE);
41394 htype = type;
41395 if (TREE_CODE (wtype) == ARRAY_TYPE)
41396 {
41397 /* If va_list is an array type, the argument may have decayed
41398 to a pointer type, e.g. by being passed to another function.
41399 In that case, unwrap both types so that we can compare the
41400 underlying records. */
41401 if (TREE_CODE (htype) == ARRAY_TYPE
41402 || POINTER_TYPE_P (htype))
41403 {
41404 wtype = TREE_TYPE (wtype);
41405 htype = TREE_TYPE (htype);
41406 }
41407 }
41408 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41409 return va_list_type_node;
41410 wtype = sysv_va_list_type_node;
41411 gcc_assert (wtype != NULL_TREE);
41412 htype = type;
41413 if (TREE_CODE (wtype) == ARRAY_TYPE)
41414 {
41415 /* If va_list is an array type, the argument may have decayed
41416 to a pointer type, e.g. by being passed to another function.
41417 In that case, unwrap both types so that we can compare the
41418 underlying records. */
41419 if (TREE_CODE (htype) == ARRAY_TYPE
41420 || POINTER_TYPE_P (htype))
41421 {
41422 wtype = TREE_TYPE (wtype);
41423 htype = TREE_TYPE (htype);
41424 }
41425 }
41426 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41427 return sysv_va_list_type_node;
41428 wtype = ms_va_list_type_node;
41429 gcc_assert (wtype != NULL_TREE);
41430 htype = type;
41431 if (TREE_CODE (wtype) == ARRAY_TYPE)
41432 {
41433 /* If va_list is an array type, the argument may have decayed
41434 to a pointer type, e.g. by being passed to another function.
41435 In that case, unwrap both types so that we can compare the
41436 underlying records. */
41437 if (TREE_CODE (htype) == ARRAY_TYPE
41438 || POINTER_TYPE_P (htype))
41439 {
41440 wtype = TREE_TYPE (wtype);
41441 htype = TREE_TYPE (htype);
41442 }
41443 }
41444 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41445 return ms_va_list_type_node;
41446 return NULL_TREE;
41447 }
41448 return std_canonical_va_list_type (type);
41449 }
41450
41451 /* Iterate through the target-specific builtin types for va_list.
41452 IDX denotes the iterator, *PTREE is set to the result type of
41453 the va_list builtin, and *PNAME to its internal type.
41454 Returns zero if there is no element for this index, otherwise
41455 IDX should be increased upon the next call.
41456 Note, do not iterate a base builtin's name like __builtin_va_list.
41457 Used from c_common_nodes_and_builtins. */
41458
41459 static int
41460 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41461 {
41462 if (TARGET_64BIT)
41463 {
41464 switch (idx)
41465 {
41466 default:
41467 break;
41468
41469 case 0:
41470 *ptree = ms_va_list_type_node;
41471 *pname = "__builtin_ms_va_list";
41472 return 1;
41473
41474 case 1:
41475 *ptree = sysv_va_list_type_node;
41476 *pname = "__builtin_sysv_va_list";
41477 return 1;
41478 }
41479 }
41480
41481 return 0;
41482 }
41483
41484 #undef TARGET_SCHED_DISPATCH
41485 #define TARGET_SCHED_DISPATCH has_dispatch
41486 #undef TARGET_SCHED_DISPATCH_DO
41487 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41488 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41489 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41490 #undef TARGET_SCHED_REORDER
41491 #define TARGET_SCHED_REORDER ix86_sched_reorder
41492 #undef TARGET_SCHED_ADJUST_PRIORITY
41493 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41494 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41495 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
41496 ix86_dependencies_evaluation_hook
41497
41498 /* The size of the dispatch window is the total number of bytes of
41499 object code allowed in a window. */
41500 #define DISPATCH_WINDOW_SIZE 16
41501
41502 /* Number of dispatch windows considered for scheduling. */
41503 #define MAX_DISPATCH_WINDOWS 3
41504
41505 /* Maximum number of instructions in a window. */
41506 #define MAX_INSN 4
41507
41508 /* Maximum number of immediate operands in a window. */
41509 #define MAX_IMM 4
41510
41511 /* Maximum number of immediate bits allowed in a window. */
41512 #define MAX_IMM_SIZE 128
41513
41514 /* Maximum number of 32 bit immediates allowed in a window. */
41515 #define MAX_IMM_32 4
41516
41517 /* Maximum number of 64 bit immediates allowed in a window. */
41518 #define MAX_IMM_64 2
41519
41520 /* Maximum total of loads or prefetches allowed in a window. */
41521 #define MAX_LOAD 2
41522
41523 /* Maximum total of stores allowed in a window. */
41524 #define MAX_STORE 1
41525
41526 #undef BIG
41527 #define BIG 100
41528
41529
41530 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41531 enum dispatch_group {
41532 disp_no_group = 0,
41533 disp_load,
41534 disp_store,
41535 disp_load_store,
41536 disp_prefetch,
41537 disp_imm,
41538 disp_imm_32,
41539 disp_imm_64,
41540 disp_branch,
41541 disp_cmp,
41542 disp_jcc,
41543 disp_last
41544 };
41545
41546 /* Number of allowable groups in a dispatch window. It is an array
41547 indexed by dispatch_group enum. 100 is used as a big number,
41548 because the number of these kinds of operations does not have any
41549 effect in a dispatch window, but we need them for other reasons in
41550 the table. */
41551 static unsigned int num_allowable_groups[disp_last] = {
41552 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41553 };
41554
41555 char group_name[disp_last + 1][16] = {
41556 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41557 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41558 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41559 };
41560
41561 /* Instruction path. */
41562 enum insn_path {
41563 no_path = 0,
41564 path_single, /* Single micro op. */
41565 path_double, /* Double micro op. */
41566 path_multi, /* Instructions with more than 2 micro ops. */
41567 last_path
41568 };
41569
41570 /* sched_insn_info defines a window to the instructions scheduled in
41571 the basic block. It contains a pointer to the insn_info table and
41572 the instruction scheduled.
41573
41574 Windows are allocated for each basic block and are linked
41575 together. */
41576 typedef struct sched_insn_info_s {
41577 rtx insn;
41578 enum dispatch_group group;
41579 enum insn_path path;
41580 int byte_len;
41581 int imm_bytes;
41582 } sched_insn_info;
41583
41584 /* Linked list of dispatch windows. This is a two way list of
41585 dispatch windows of a basic block. It contains information about
41586 the number of uops in the window and the total number of
41587 instructions and of bytes in the object code for this dispatch
41588 window. */
41589 typedef struct dispatch_windows_s {
41590 int num_insn; /* Number of insn in the window. */
41591 int num_uops; /* Number of uops in the window. */
41592 int window_size; /* Number of bytes in the window. */
41593 int window_num; /* Window number, 0 or 1. */
41594 int num_imm; /* Number of immediates in an insn. */
41595 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41596 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41597 int imm_size; /* Total immediates in the window. */
41598 int num_loads; /* Total memory loads in the window. */
41599 int num_stores; /* Total memory stores in the window. */
41600 int violation; /* Violation exists in window. */
41601 sched_insn_info *window; /* Pointer to the window. */
41602 struct dispatch_windows_s *next;
41603 struct dispatch_windows_s *prev;
41604 } dispatch_windows;
41605
41606 /* Immediate values used in an insn. */
41607 typedef struct imm_info_s
41608 {
41609 int imm;
41610 int imm32;
41611 int imm64;
41612 } imm_info;
41613
41614 static dispatch_windows *dispatch_window_list;
41615 static dispatch_windows *dispatch_window_list1;
41616
41617 /* Get dispatch group of insn. */
41618
41619 static enum dispatch_group
41620 get_mem_group (rtx insn)
41621 {
41622 enum attr_memory memory;
41623
41624 if (INSN_CODE (insn) < 0)
41625 return disp_no_group;
41626 memory = get_attr_memory (insn);
41627 if (memory == MEMORY_STORE)
41628 return disp_store;
41629
41630 if (memory == MEMORY_LOAD)
41631 return disp_load;
41632
41633 if (memory == MEMORY_BOTH)
41634 return disp_load_store;
41635
41636 return disp_no_group;
41637 }
41638
41639 /* Return true if insn is a compare instruction. */
41640
41641 static bool
41642 is_cmp (rtx insn)
41643 {
41644 enum attr_type type;
41645
41646 type = get_attr_type (insn);
41647 return (type == TYPE_TEST
41648 || type == TYPE_ICMP
41649 || type == TYPE_FCMP
41650 || GET_CODE (PATTERN (insn)) == COMPARE);
41651 }
41652
41653 /* Return true if a dispatch violation encountered. */
41654
41655 static bool
41656 dispatch_violation (void)
41657 {
41658 if (dispatch_window_list->next)
41659 return dispatch_window_list->next->violation;
41660 return dispatch_window_list->violation;
41661 }
41662
41663 /* Return true if insn is a branch instruction. */
41664
41665 static bool
41666 is_branch (rtx insn)
41667 {
41668 return (CALL_P (insn) || JUMP_P (insn));
41669 }
41670
41671 /* Return true if insn is a prefetch instruction. */
41672
41673 static bool
41674 is_prefetch (rtx insn)
41675 {
41676 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41677 }
41678
41679 /* This function initializes a dispatch window and the list container holding a
41680 pointer to the window. */
41681
41682 static void
41683 init_window (int window_num)
41684 {
41685 int i;
41686 dispatch_windows *new_list;
41687
41688 if (window_num == 0)
41689 new_list = dispatch_window_list;
41690 else
41691 new_list = dispatch_window_list1;
41692
41693 new_list->num_insn = 0;
41694 new_list->num_uops = 0;
41695 new_list->window_size = 0;
41696 new_list->next = NULL;
41697 new_list->prev = NULL;
41698 new_list->window_num = window_num;
41699 new_list->num_imm = 0;
41700 new_list->num_imm_32 = 0;
41701 new_list->num_imm_64 = 0;
41702 new_list->imm_size = 0;
41703 new_list->num_loads = 0;
41704 new_list->num_stores = 0;
41705 new_list->violation = false;
41706
41707 for (i = 0; i < MAX_INSN; i++)
41708 {
41709 new_list->window[i].insn = NULL;
41710 new_list->window[i].group = disp_no_group;
41711 new_list->window[i].path = no_path;
41712 new_list->window[i].byte_len = 0;
41713 new_list->window[i].imm_bytes = 0;
41714 }
41715 return;
41716 }
41717
41718 /* This function allocates and initializes a dispatch window and the
41719 list container holding a pointer to the window. */
41720
41721 static dispatch_windows *
41722 allocate_window (void)
41723 {
41724 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41725 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41726
41727 return new_list;
41728 }
41729
41730 /* This routine initializes the dispatch scheduling information. It
41731 initiates building dispatch scheduler tables and constructs the
41732 first dispatch window. */
41733
41734 static void
41735 init_dispatch_sched (void)
41736 {
41737 /* Allocate a dispatch list and a window. */
41738 dispatch_window_list = allocate_window ();
41739 dispatch_window_list1 = allocate_window ();
41740 init_window (0);
41741 init_window (1);
41742 }
41743
41744 /* This function returns true if a branch is detected. End of a basic block
41745 does not have to be a branch, but here we assume only branches end a
41746 window. */
41747
41748 static bool
41749 is_end_basic_block (enum dispatch_group group)
41750 {
41751 return group == disp_branch;
41752 }
41753
41754 /* This function is called when the end of a window processing is reached. */
41755
41756 static void
41757 process_end_window (void)
41758 {
41759 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41760 if (dispatch_window_list->next)
41761 {
41762 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41763 gcc_assert (dispatch_window_list->window_size
41764 + dispatch_window_list1->window_size <= 48);
41765 init_window (1);
41766 }
41767 init_window (0);
41768 }
41769
41770 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41771 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41772 for 48 bytes of instructions. Note that these windows are not dispatch
41773 windows whose size is DISPATCH_WINDOW_SIZE. */
41774
41775 static dispatch_windows *
41776 allocate_next_window (int window_num)
41777 {
41778 if (window_num == 0)
41779 {
41780 if (dispatch_window_list->next)
41781 init_window (1);
41782 init_window (0);
41783 return dispatch_window_list;
41784 }
41785
41786 dispatch_window_list->next = dispatch_window_list1;
41787 dispatch_window_list1->prev = dispatch_window_list;
41788
41789 return dispatch_window_list1;
41790 }
41791
41792 /* Increment the number of immediate operands of an instruction. */
41793
41794 static int
41795 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41796 {
41797 if (*in_rtx == 0)
41798 return 0;
41799
41800 switch (GET_CODE (*in_rtx))
41801 {
41802 case CONST:
41803 case SYMBOL_REF:
41804 case CONST_INT:
41805 (imm_values->imm)++;
41806 if (x86_64_immediate_operand (*in_rtx, SImode))
41807 (imm_values->imm32)++;
41808 else
41809 (imm_values->imm64)++;
41810 break;
41811
41812 case CONST_DOUBLE:
41813 (imm_values->imm)++;
41814 (imm_values->imm64)++;
41815 break;
41816
41817 case CODE_LABEL:
41818 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41819 {
41820 (imm_values->imm)++;
41821 (imm_values->imm32)++;
41822 }
41823 break;
41824
41825 default:
41826 break;
41827 }
41828
41829 return 0;
41830 }
41831
41832 /* Compute number of immediate operands of an instruction. */
41833
41834 static void
41835 find_constant (rtx in_rtx, imm_info *imm_values)
41836 {
41837 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41838 (rtx_function) find_constant_1, (void *) imm_values);
41839 }
41840
41841 /* Return total size of immediate operands of an instruction along with number
41842 of corresponding immediate-operands. It initializes its parameters to zero
41843 before calling FIND_CONSTANT.
41844 INSN is the input instruction. IMM is the total of immediates.
41845 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41846 bit immediates. */
41847
41848 static int
41849 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41850 {
41851 imm_info imm_values = {0, 0, 0};
41852
41853 find_constant (insn, &imm_values);
41854 *imm = imm_values.imm;
41855 *imm32 = imm_values.imm32;
41856 *imm64 = imm_values.imm64;
41857 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
41858 }
41859
41860 /* This function indicates whether an instruction has any immediate
41861 operand. */
41862
41863 static bool
41864 has_immediate (rtx insn)
41865 {
41866 int num_imm_operand;
41867 int num_imm32_operand;
41868 int num_imm64_operand;
41869
41870 if (insn)
41871 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41872 &num_imm64_operand);
41873 return false;
41874 }
41875
41876 /* Return single or double path for instructions. */
41877
41878 static enum insn_path
41879 get_insn_path (rtx insn)
41880 {
41881 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41882
41883 if ((int)path == 0)
41884 return path_single;
41885
41886 if ((int)path == 1)
41887 return path_double;
41888
41889 return path_multi;
41890 }
41891
41892 /* Return insn dispatch group. */
41893
41894 static enum dispatch_group
41895 get_insn_group (rtx insn)
41896 {
41897 enum dispatch_group group = get_mem_group (insn);
41898 if (group)
41899 return group;
41900
41901 if (is_branch (insn))
41902 return disp_branch;
41903
41904 if (is_cmp (insn))
41905 return disp_cmp;
41906
41907 if (has_immediate (insn))
41908 return disp_imm;
41909
41910 if (is_prefetch (insn))
41911 return disp_prefetch;
41912
41913 return disp_no_group;
41914 }
41915
41916 /* Count number of GROUP restricted instructions in a dispatch
41917 window WINDOW_LIST. */
41918
41919 static int
41920 count_num_restricted (rtx insn, dispatch_windows *window_list)
41921 {
41922 enum dispatch_group group = get_insn_group (insn);
41923 int imm_size;
41924 int num_imm_operand;
41925 int num_imm32_operand;
41926 int num_imm64_operand;
41927
41928 if (group == disp_no_group)
41929 return 0;
41930
41931 if (group == disp_imm)
41932 {
41933 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41934 &num_imm64_operand);
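      /* A window may hold at most MAX_IMM immediate operands whose combined
	 size must not exceed MAX_IMM_SIZE; in the 32/64-bit counts below a
	 64-bit immediate occupies two 32-bit slots.  */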
41935 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41936 || num_imm_operand + window_list->num_imm > MAX_IMM
41937 || (num_imm32_operand > 0
41938 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41939 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41940 || (num_imm64_operand > 0
41941 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41942 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41943 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41944 && num_imm64_operand > 0
41945 && ((window_list->num_imm_64 > 0
41946 && window_list->num_insn >= 2)
41947 || window_list->num_insn >= 3)))
41948 return BIG;
41949
41950 return 1;
41951 }
41952
41953 if ((group == disp_load_store
41954 && (window_list->num_loads >= MAX_LOAD
41955 || window_list->num_stores >= MAX_STORE))
41956 || ((group == disp_load
41957 || group == disp_prefetch)
41958 && window_list->num_loads >= MAX_LOAD)
41959 || (group == disp_store
41960 && window_list->num_stores >= MAX_STORE))
41961 return BIG;
41962
41963 return 1;
41964 }
41965
41966 /* This function returns true if insn satisfies dispatch rules on the
41967 last window scheduled. */
41968
41969 static bool
41970 fits_dispatch_window (rtx insn)
41971 {
41972 dispatch_windows *window_list = dispatch_window_list;
41973 dispatch_windows *window_list_next = dispatch_window_list->next;
41974 unsigned int num_restrict;
41975 enum dispatch_group group = get_insn_group (insn);
41976 enum insn_path path = get_insn_path (insn);
41977 int sum;
41978
41979 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41980 instructions should be given the lowest priority in the
41981 scheduling process in the Haifa scheduler to make sure they will be
41982 scheduled in the same dispatch window as the reference to them. */
41983 if (group == disp_jcc || group == disp_cmp)
41984 return false;
41985
41986 /* Check nonrestricted. */
41987 if (group == disp_no_group || group == disp_branch)
41988 return true;
41989
41990 /* Get last dispatch window. */
41991 if (window_list_next)
41992 window_list = window_list_next;
41993
41994 if (window_list->window_num == 1)
41995 {
41996 sum = window_list->prev->window_size + window_list->window_size;
41997
41998 if (sum == 32
41999 || (min_insn_size (insn) + sum) >= 48)
42000 /* Window 1 is full. Go for next window. */
42001 return true;
42002 }
42003
42004 num_restrict = count_num_restricted (insn, window_list);
42005
42006 if (num_restrict > num_allowable_groups[group])
42007 return false;
42008
42009 /* See if it fits in the first window. */
42010 if (window_list->window_num == 0)
42011 {
42012 /* The first window should have only single and double path
42013 uops. */
42014 if (path == path_double
42015 && (window_list->num_uops + 2) > MAX_INSN)
42016 return false;
42017 else if (path != path_single)
42018 return false;
42019 }
42020 return true;
42021 }
42022
42023 /* Add an instruction INSN with NUM_UOPS micro-operations to the
42024 dispatch window WINDOW_LIST. */
42025
42026 static void
42027 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
42028 {
42029 int byte_len = min_insn_size (insn);
42030 int num_insn = window_list->num_insn;
42031 int imm_size;
42032 sched_insn_info *window = window_list->window;
42033 enum dispatch_group group = get_insn_group (insn);
42034 enum insn_path path = get_insn_path (insn);
42035 int num_imm_operand;
42036 int num_imm32_operand;
42037 int num_imm64_operand;
42038
42039 if (!window_list->violation && group != disp_cmp
42040 && !fits_dispatch_window (insn))
42041 window_list->violation = true;
42042
42043 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42044 &num_imm64_operand);
42045
42046 /* Initialize window with new instruction. */
42047 window[num_insn].insn = insn;
42048 window[num_insn].byte_len = byte_len;
42049 window[num_insn].group = group;
42050 window[num_insn].path = path;
42051 window[num_insn].imm_bytes = imm_size;
42052
42053 window_list->window_size += byte_len;
42054 window_list->num_insn = num_insn + 1;
42055 window_list->num_uops = window_list->num_uops + num_uops;
42056 window_list->imm_size += imm_size;
42057 window_list->num_imm += num_imm_operand;
42058 window_list->num_imm_32 += num_imm32_operand;
42059 window_list->num_imm_64 += num_imm64_operand;
42060
42061 if (group == disp_store)
42062 window_list->num_stores += 1;
42063 else if (group == disp_load
42064 || group == disp_prefetch)
42065 window_list->num_loads += 1;
42066 else if (group == disp_load_store)
42067 {
42068 window_list->num_stores += 1;
42069 window_list->num_loads += 1;
42070 }
42071 }
42072
42073 /* Add a scheduled instruction, INSN, to the current dispatch window.
42074 If the total bytes of instructions or the number of instructions in
42075 the window exceeds the allowable limit, a new window is allocated. */
42076
42077 static void
42078 add_to_dispatch_window (rtx insn)
42079 {
42080 int byte_len;
42081 dispatch_windows *window_list;
42082 dispatch_windows *next_list;
42083 dispatch_windows *window0_list;
42084 enum insn_path path;
42085 enum dispatch_group insn_group;
42086 bool insn_fits;
42087 int num_insn;
42088 int num_uops;
42089 int window_num;
42090 int insn_num_uops;
42091 int sum;
42092
42093 if (INSN_CODE (insn) < 0)
42094 return;
42095
42096 byte_len = min_insn_size (insn);
42097 window_list = dispatch_window_list;
42098 next_list = window_list->next;
42099 path = get_insn_path (insn);
42100 insn_group = get_insn_group (insn);
42101
42102 /* Get the last dispatch window. */
42103 if (next_list)
42104 window_list = dispatch_window_list->next;
42105
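/* Number of uops this insn contributes to the window: one for a
   single-path insn, two for a double-path insn; for any other path the
   insn_path enumerator value itself is used as the uop count.  */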
42106 if (path == path_single)
42107 insn_num_uops = 1;
42108 else if (path == path_double)
42109 insn_num_uops = 2;
42110 else
42111 insn_num_uops = (int) path;
42112
42113 /* If the current window is full, get a new window.
42114 Window number zero is full if MAX_INSN uops are scheduled in it.
42115 Window number one is full if window zero's bytes plus window
42116 one's bytes equal 32, if adding the new instruction's bytes would
42117 bring the total to 48 or more, or if it already holds MAX_INSN
42118 instructions. */
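/* Worked example (illustrative only): if window zero already holds 20
   bytes and window one holds 12 bytes, their sum is 32 and window one
   is full; likewise a 10-byte insn added to a 40-byte total would reach
   48 bytes or more, so a new window pair must be started.  */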
42119 num_insn = window_list->num_insn;
42120 num_uops = window_list->num_uops;
42121 window_num = window_list->window_num;
42122 insn_fits = fits_dispatch_window (insn);
42123
42124 if (num_insn >= MAX_INSN
42125 || num_uops + insn_num_uops > MAX_INSN
42126 || !(insn_fits))
42127 {
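/* The current window cannot accept this insn: toggle between window
   numbers 0 and 1 and start filling the other window.  */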
42128 window_num = ~window_num & 1;
42129 window_list = allocate_next_window (window_num);
42130 }
42131
42132 if (window_num == 0)
42133 {
42134 add_insn_window (insn, window_list, insn_num_uops);
42135 if (window_list->num_insn >= MAX_INSN
42136 && insn_group == disp_branch)
42137 {
42138 process_end_window ();
42139 return;
42140 }
42141 }
42142 else if (window_num == 1)
42143 {
42144 window0_list = window_list->prev;
42145 sum = window0_list->window_size + window_list->window_size;
42146 if (sum == 32
42147 || (byte_len + sum) >= 48)
42148 {
42149 process_end_window ();
42150 window_list = dispatch_window_list;
42151 }
42152
42153 add_insn_window (insn, window_list, insn_num_uops);
42154 }
42155 else
42156 gcc_unreachable ();
42157
42158 if (is_end_basic_block (insn_group))
42159 {
42160 /* The end of the basic block is reached; do end-of-basic-block processing. */
42161 process_end_window ();
42162 return;
42163 }
42164 }
42165
42166 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42167
42168 DEBUG_FUNCTION static void
42169 debug_dispatch_window_file (FILE *file, int window_num)
42170 {
42171 dispatch_windows *list;
42172 int i;
42173
42174 if (window_num == 0)
42175 list = dispatch_window_list;
42176 else
42177 list = dispatch_window_list1;
42178
42179 fprintf (file, "Window #%d:\n", list->window_num);
42180 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42181 list->num_insn, list->num_uops, list->window_size);
42182 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42183 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42184
42185 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42186 list->num_stores);
42187 fprintf (file, " insn info:\n");
42188
42189 for (i = 0; i < MAX_INSN; i++)
42190 {
42191 if (!list->window[i].insn)
42192 break;
42193 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42194 i, group_name[list->window[i].group],
42195 i, (void *)list->window[i].insn,
42196 i, list->window[i].path,
42197 i, list->window[i].byte_len,
42198 i, list->window[i].imm_bytes);
42199 }
42200 }
42201
42202 /* Print dispatch window WINDOW_NUM to stdout. */
42203
42204 DEBUG_FUNCTION void
42205 debug_dispatch_window (int window_num)
42206 {
42207 debug_dispatch_window_file (stdout, window_num);
42208 }
42209
42210 /* Print INSN dispatch information to FILE. */
42211
42212 DEBUG_FUNCTION static void
42213 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42214 {
42215 int byte_len;
42216 enum insn_path path;
42217 enum dispatch_group group;
42218 int imm_size;
42219 int num_imm_operand;
42220 int num_imm32_operand;
42221 int num_imm64_operand;
42222
42223 if (INSN_CODE (insn) < 0)
42224 return;
42225
42226 byte_len = min_insn_size (insn);
42227 path = get_insn_path (insn);
42228 group = get_insn_group (insn);
42229 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42230 &num_imm64_operand);
42231
42232 fprintf (file, " insn info:\n");
42233 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42234 group_name[group], path, byte_len);
42235 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42236 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42237 }
42238
42239 /* Print to STDOUT the status of the ready list with respect to
42240 dispatch windows. */
42241
42242 DEBUG_FUNCTION void
42243 debug_ready_dispatch (void)
42244 {
42245 int i;
42246 int no_ready = number_in_ready ();
42247
42248 fprintf (stdout, "Number of ready: %d\n", no_ready);
42249
42250 for (i = 0; i < no_ready; i++)
42251 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42252 }
42253
42254 /* This routine is the driver of the dispatch scheduler. */
42255
42256 static void
42257 do_dispatch (rtx insn, int mode)
42258 {
42259 if (mode == DISPATCH_INIT)
42260 init_dispatch_sched ();
42261 else if (mode == ADD_TO_DISPATCH_WINDOW)
42262 add_to_dispatch_window (insn);
42263 }
42264
42265 /* Answer the dispatch-scheduling query ACTION for INSN; return FALSE if dispatch scheduling is not enabled. */
42266
42267 static bool
42268 has_dispatch (rtx insn, int action)
42269 {
42270 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42271 && flag_dispatch_scheduler)
42272 switch (action)
42273 {
42274 default:
42275 return false;
42276
42277 case IS_DISPATCH_ON:
42278 return true;
42280
42281 case IS_CMP:
42282 return is_cmp (insn);
42283
42284 case DISPATCH_VIOLATION:
42285 return dispatch_violation ();
42286
42287 case FITS_DISPATCH_WINDOW:
42288 return fits_dispatch_window (insn);
42289 }
42290
42291 return false;
42292 }
42293
42294 /* Implementation of the reassociation_width target hook, used by
42295 the reassoc phase to identify the parallelism level of a
42296 reassociated tree. The statement's tree_code is passed in OPC;
42297 the type of its arguments is passed in MODE.
42298
42299 Currently parallel reassociation is enabled only for Atom
42300 processors, and the reassociation width is set to 2 because
42301 Atom may issue up to two instructions per cycle.
42302
42303 The return value should be revisited if parallel reassociation
42304 is enabled for other processors. */
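/* Illustrative example (not part of the original comment): with a
   width of 2 the reassociation pass can rewrite ((a + b) + c) + d as
   (a + b) + (c + d), exposing two independent additions that can
   issue in parallel.  */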
42305
42306 static int
42307 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42308 enum machine_mode mode)
42309 {
42310 int res = 1;
42311
42312 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42313 res = 2;
42314 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42315 res = 2;
42316
42317 return res;
42318 }
42319
42320 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42321 place emms and femms instructions. */
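/* For example (illustrative, derived from the switch below): with only
   SSE2 enabled an SImode element maps to V4SImode (four ints in an xmm
   register), while with AVX and without -mprefer-avx128 it maps to
   V8SImode (eight ints in a ymm register).  */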
42322
42323 static enum machine_mode
42324 ix86_preferred_simd_mode (enum machine_mode mode)
42325 {
42326 if (!TARGET_SSE)
42327 return word_mode;
42328
42329 switch (mode)
42330 {
42331 case QImode:
42332 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42333 case HImode:
42334 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42335 case SImode:
42336 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42337 case DImode:
42338 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42339
42340 case SFmode:
42341 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42342 return V8SFmode;
42343 else
42344 return V4SFmode;
42345
42346 case DFmode:
42347 if (!TARGET_VECTORIZE_DOUBLE)
42348 return word_mode;
42349 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42350 return V4DFmode;
42351 else if (TARGET_SSE2)
42352 return V2DFmode;
42353 /* FALLTHRU */
42354
42355 default:
42356 return word_mode;
42357 }
42358 }
42359
42360 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
42361 vectors. */
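/* The returned value is a bit mask of candidate vector sizes in bytes:
   32 | 16 asks the vectorizer to try 256-bit vectors and then fall back
   to 128-bit ones, while 0 means only the preferred SIMD mode is used
   (my reading of the autovectorize_vector_sizes hook, not stated in
   this file).  */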
42362
42363 static unsigned int
42364 ix86_autovectorize_vector_sizes (void)
42365 {
42366 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42367 }
42368
42369 \f
42370
42371 /* Return the class of registers that can be used to spill a pseudo of
42372 MODE and class RCLASS instead of spilling to memory. Return NO_REGS
42373 if that is not possible or not profitable. */
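/* For instance, when TARGET_GENERAL_REGS_SSE_SPILL is enabled by the
   active tuning, an SImode (or DImode on 64-bit) pseudo of an integer
   class may be spilled to an SSE register instead of to the stack.  */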
42374 static reg_class_t
42375 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42376 {
42377 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42378 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42379 && INTEGER_CLASS_P (rclass))
42380 return SSE_REGS;
42381 return NO_REGS;
42382 }
42383
42384 /* Implement targetm.vectorize.init_cost. */
42385
42386 static void *
42387 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42388 {
42389 unsigned *cost = XNEWVEC (unsigned, 3);
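/* One accumulator for each vect_cost_model_location: prologue, body
   and epilogue.  */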
42390 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42391 return cost;
42392 }
42393
42394 /* Implement targetm.vectorize.add_stmt_cost. */
42395
42396 static unsigned
42397 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42398 struct _stmt_vec_info *stmt_info, int misalign,
42399 enum vect_cost_model_location where)
42400 {
42401 unsigned *cost = (unsigned *) data;
42402 unsigned retval = 0;
42403
42404 if (flag_vect_cost_model)
42405 {
42406 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42407 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42408
42409 /* Statements in an inner loop relative to the loop being
42410 vectorized are weighted more heavily. The value here is
42411 arbitrary and could potentially be improved with analysis. */
42412 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42413 count *= 50; /* FIXME. */
42414
42415 retval = (unsigned) (count * stmt_cost);
42416 cost[where] += retval;
42417 }
42418
42419 return retval;
42420 }
42421
42422 /* Implement targetm.vectorize.finish_cost. */
42423
42424 static void
42425 ix86_finish_cost (void *data, unsigned *prologue_cost,
42426 unsigned *body_cost, unsigned *epilogue_cost)
42427 {
42428 unsigned *cost = (unsigned *) data;
42429 *prologue_cost = cost[vect_prologue];
42430 *body_cost = cost[vect_body];
42431 *epilogue_cost = cost[vect_epilogue];
42432 }
42433
42434 /* Implement targetm.vectorize.destroy_cost_data. */
42435
42436 static void
42437 ix86_destroy_cost_data (void *data)
42438 {
42439 free (data);
42440 }
42441
42442 /* Validate target specific memory model bits in VAL. */
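/* Illustrative use (hypothetical caller, not from this file): a TSX
   target user might write
     __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);
   and the IX86_HLE_ACQUIRE/IX86_HLE_RELEASE bits then reach this hook
   OR-ed into VAL alongside the MEMMODEL_MASK bits.  */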
42443
42444 static unsigned HOST_WIDE_INT
42445 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42446 {
42447 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42448 bool strong;
42449
42450 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42451 |MEMMODEL_MASK)
42452 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42453 {
42454 warning (OPT_Winvalid_memory_model,
42455 "Unknown architecture specific memory model");
42456 return MEMMODEL_SEQ_CST;
42457 }
42458 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42459 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42460 {
42461 warning (OPT_Winvalid_memory_model,
42462 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42463 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42464 }
42465 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42466 {
42467 warning (OPT_Winvalid_memory_model,
42468 "HLE_RELEASE not used with RELEASE or stronger memory model");
42469 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42470 }
42471 return val;
42472 }
42473
42474 /* Initialize the GCC target structure. */
42475 #undef TARGET_RETURN_IN_MEMORY
42476 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42477
42478 #undef TARGET_LEGITIMIZE_ADDRESS
42479 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42480
42481 #undef TARGET_ATTRIBUTE_TABLE
42482 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42483 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42484 # undef TARGET_MERGE_DECL_ATTRIBUTES
42485 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42486 #endif
42487
42488 #undef TARGET_COMP_TYPE_ATTRIBUTES
42489 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42490
42491 #undef TARGET_INIT_BUILTINS
42492 #define TARGET_INIT_BUILTINS ix86_init_builtins
42493 #undef TARGET_BUILTIN_DECL
42494 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42495 #undef TARGET_EXPAND_BUILTIN
42496 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42497
42498 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42499 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42500 ix86_builtin_vectorized_function
42501
42502 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42503 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42504
42505 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42506 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42507
42508 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42509 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42510
42511 #undef TARGET_BUILTIN_RECIPROCAL
42512 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42513
42514 #undef TARGET_ASM_FUNCTION_EPILOGUE
42515 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42516
42517 #undef TARGET_ENCODE_SECTION_INFO
42518 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42519 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42520 #else
42521 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42522 #endif
42523
42524 #undef TARGET_ASM_OPEN_PAREN
42525 #define TARGET_ASM_OPEN_PAREN ""
42526 #undef TARGET_ASM_CLOSE_PAREN
42527 #define TARGET_ASM_CLOSE_PAREN ""
42528
42529 #undef TARGET_ASM_BYTE_OP
42530 #define TARGET_ASM_BYTE_OP ASM_BYTE
42531
42532 #undef TARGET_ASM_ALIGNED_HI_OP
42533 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42534 #undef TARGET_ASM_ALIGNED_SI_OP
42535 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42536 #ifdef ASM_QUAD
42537 #undef TARGET_ASM_ALIGNED_DI_OP
42538 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42539 #endif
42540
42541 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42542 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42543
42544 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42545 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42546
42547 #undef TARGET_ASM_UNALIGNED_HI_OP
42548 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42549 #undef TARGET_ASM_UNALIGNED_SI_OP
42550 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42551 #undef TARGET_ASM_UNALIGNED_DI_OP
42552 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42553
42554 #undef TARGET_PRINT_OPERAND
42555 #define TARGET_PRINT_OPERAND ix86_print_operand
42556 #undef TARGET_PRINT_OPERAND_ADDRESS
42557 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42558 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42559 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42560 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42561 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42562
42563 #undef TARGET_SCHED_INIT_GLOBAL
42564 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42565 #undef TARGET_SCHED_ADJUST_COST
42566 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42567 #undef TARGET_SCHED_ISSUE_RATE
42568 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42569 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42570 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42571 ia32_multipass_dfa_lookahead
42572
42573 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42574 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42575
42576 #undef TARGET_MEMMODEL_CHECK
42577 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42578
42579 #ifdef HAVE_AS_TLS
42580 #undef TARGET_HAVE_TLS
42581 #define TARGET_HAVE_TLS true
42582 #endif
42583 #undef TARGET_CANNOT_FORCE_CONST_MEM
42584 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42585 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42586 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42587
42588 #undef TARGET_DELEGITIMIZE_ADDRESS
42589 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42590
42591 #undef TARGET_MS_BITFIELD_LAYOUT_P
42592 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42593
42594 #if TARGET_MACHO
42595 #undef TARGET_BINDS_LOCAL_P
42596 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42597 #endif
42598 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42599 #undef TARGET_BINDS_LOCAL_P
42600 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42601 #endif
42602
42603 #undef TARGET_ASM_OUTPUT_MI_THUNK
42604 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42605 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42606 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42607
42608 #undef TARGET_ASM_FILE_START
42609 #define TARGET_ASM_FILE_START x86_file_start
42610
42611 #undef TARGET_OPTION_OVERRIDE
42612 #define TARGET_OPTION_OVERRIDE ix86_option_override
42613
42614 #undef TARGET_REGISTER_MOVE_COST
42615 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
42616 #undef TARGET_MEMORY_MOVE_COST
42617 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
42618 #undef TARGET_RTX_COSTS
42619 #define TARGET_RTX_COSTS ix86_rtx_costs
42620 #undef TARGET_ADDRESS_COST
42621 #define TARGET_ADDRESS_COST ix86_address_cost
42622
42623 #undef TARGET_FIXED_CONDITION_CODE_REGS
42624 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
42625 #undef TARGET_CC_MODES_COMPATIBLE
42626 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
42627
42628 #undef TARGET_MACHINE_DEPENDENT_REORG
42629 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
42630
42631 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
42632 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
42633
42634 #undef TARGET_BUILD_BUILTIN_VA_LIST
42635 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
42636
42637 #undef TARGET_FOLD_BUILTIN
42638 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
42639
42640 #undef TARGET_COMPARE_VERSION_PRIORITY
42641 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
42642
42643 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
42644 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
42645 ix86_generate_version_dispatcher_body
42646
42647 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
42648 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
42649 ix86_get_function_versions_dispatcher
42650
42651 #undef TARGET_ENUM_VA_LIST_P
42652 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
42653
42654 #undef TARGET_FN_ABI_VA_LIST
42655 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
42656
42657 #undef TARGET_CANONICAL_VA_LIST_TYPE
42658 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
42659
42660 #undef TARGET_EXPAND_BUILTIN_VA_START
42661 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
42662
42663 #undef TARGET_MD_ASM_CLOBBERS
42664 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
42665
42666 #undef TARGET_PROMOTE_PROTOTYPES
42667 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
42668 #undef TARGET_STRUCT_VALUE_RTX
42669 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
42670 #undef TARGET_SETUP_INCOMING_VARARGS
42671 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
42672 #undef TARGET_MUST_PASS_IN_STACK
42673 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
42674 #undef TARGET_FUNCTION_ARG_ADVANCE
42675 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
42676 #undef TARGET_FUNCTION_ARG
42677 #define TARGET_FUNCTION_ARG ix86_function_arg
42678 #undef TARGET_FUNCTION_ARG_BOUNDARY
42679 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
42680 #undef TARGET_PASS_BY_REFERENCE
42681 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
42682 #undef TARGET_INTERNAL_ARG_POINTER
42683 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
42684 #undef TARGET_UPDATE_STACK_BOUNDARY
42685 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
42686 #undef TARGET_GET_DRAP_RTX
42687 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
42688 #undef TARGET_STRICT_ARGUMENT_NAMING
42689 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
42690 #undef TARGET_STATIC_CHAIN
42691 #define TARGET_STATIC_CHAIN ix86_static_chain
42692 #undef TARGET_TRAMPOLINE_INIT
42693 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
42694 #undef TARGET_RETURN_POPS_ARGS
42695 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
42696
42697 #undef TARGET_LEGITIMATE_COMBINED_INSN
42698 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
42699
42700 #undef TARGET_ASAN_SHADOW_OFFSET
42701 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
42702
42703 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
42704 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
42705
42706 #undef TARGET_SCALAR_MODE_SUPPORTED_P
42707 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
42708
42709 #undef TARGET_VECTOR_MODE_SUPPORTED_P
42710 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
42711
42712 #undef TARGET_C_MODE_FOR_SUFFIX
42713 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
42714
42715 #ifdef HAVE_AS_TLS
42716 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
42717 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
42718 #endif
42719
42720 #ifdef SUBTARGET_INSERT_ATTRIBUTES
42721 #undef TARGET_INSERT_ATTRIBUTES
42722 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
42723 #endif
42724
42725 #undef TARGET_MANGLE_TYPE
42726 #define TARGET_MANGLE_TYPE ix86_mangle_type
42727
42728 #if !TARGET_MACHO
42729 #undef TARGET_STACK_PROTECT_FAIL
42730 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
42731 #endif
42732
42733 #undef TARGET_FUNCTION_VALUE
42734 #define TARGET_FUNCTION_VALUE ix86_function_value
42735
42736 #undef TARGET_FUNCTION_VALUE_REGNO_P
42737 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
42738
42739 #undef TARGET_PROMOTE_FUNCTION_MODE
42740 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
42741
42742 #undef TARGET_MEMBER_TYPE_FORCES_BLK
42743 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
42744
42745 #undef TARGET_INSTANTIATE_DECLS
42746 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
42747
42748 #undef TARGET_SECONDARY_RELOAD
42749 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
42750
42751 #undef TARGET_CLASS_MAX_NREGS
42752 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
42753
42754 #undef TARGET_PREFERRED_RELOAD_CLASS
42755 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
42756 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
42757 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
42758 #undef TARGET_CLASS_LIKELY_SPILLED_P
42759 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
42760
42761 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
42762 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
42763 ix86_builtin_vectorization_cost
42764 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
42765 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
42766 ix86_vectorize_vec_perm_const_ok
42767 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
42768 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
42769 ix86_preferred_simd_mode
42770 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
42771 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
42772 ix86_autovectorize_vector_sizes
42773 #undef TARGET_VECTORIZE_INIT_COST
42774 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
42775 #undef TARGET_VECTORIZE_ADD_STMT_COST
42776 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
42777 #undef TARGET_VECTORIZE_FINISH_COST
42778 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
42779 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
42780 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
42781
42782 #undef TARGET_SET_CURRENT_FUNCTION
42783 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
42784
42785 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
42786 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
42787
42788 #undef TARGET_OPTION_SAVE
42789 #define TARGET_OPTION_SAVE ix86_function_specific_save
42790
42791 #undef TARGET_OPTION_RESTORE
42792 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
42793
42794 #undef TARGET_OPTION_PRINT
42795 #define TARGET_OPTION_PRINT ix86_function_specific_print
42796
42797 #undef TARGET_OPTION_FUNCTION_VERSIONS
42798 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
42799
42800 #undef TARGET_CAN_INLINE_P
42801 #define TARGET_CAN_INLINE_P ix86_can_inline_p
42802
42803 #undef TARGET_EXPAND_TO_RTL_HOOK
42804 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
42805
42806 #undef TARGET_LEGITIMATE_ADDRESS_P
42807 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
42808
42809 #undef TARGET_LRA_P
42810 #define TARGET_LRA_P hook_bool_void_true
42811
42812 #undef TARGET_REGISTER_PRIORITY
42813 #define TARGET_REGISTER_PRIORITY ix86_register_priority
42814
42815 #undef TARGET_LEGITIMATE_CONSTANT_P
42816 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
42817
42818 #undef TARGET_FRAME_POINTER_REQUIRED
42819 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
42820
42821 #undef TARGET_CAN_ELIMINATE
42822 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
42823
42824 #undef TARGET_EXTRA_LIVE_ON_ENTRY
42825 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
42826
42827 #undef TARGET_ASM_CODE_END
42828 #define TARGET_ASM_CODE_END ix86_code_end
42829
42830 #undef TARGET_CONDITIONAL_REGISTER_USAGE
42831 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
42832
42833 #if TARGET_MACHO
42834 #undef TARGET_INIT_LIBFUNCS
42835 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
42836 #endif
42837
42838 #undef TARGET_SPILL_CLASS
42839 #define TARGET_SPILL_CLASS ix86_spill_class
42840
42841 struct gcc_target targetm = TARGET_INITIALIZER;
42842 \f
42843 #include "gt-i386.h"