1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
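/* For example, once move_or_delete_vzeroupper below has called
   alloc_aux_for_blocks (sizeof (struct block_info_def)), the per-block
   data can be accessed as, e.g.,

     BLOCK_INFO (bb)->state = unused;

   since BLOCK_INFO simply casts the aux pointer of the block.  */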
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
96   /* Callee neither returns nor passes a 256bit AVX register, or no
97      256bit AVX register is used in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
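/* A rough sketch of how these values travel, mirroring the scan in
   move_or_delete_vzeroupper_2 below: the vzeroupper insn carries one of
   the call_avx256_state values as the operand of its UNSPEC_VOLATILE,
   so the pass recovers it with

     rtx pat = PATTERN (insn);
     if (GET_CODE (pat) == UNSPEC_VOLATILE
         && XINT (pat, 1) == UNSPECV_VZEROUPPER)
       avx256 = INTVAL (XVECEXP (pat, 0, 0));

   and then dispatches on the enum above.  */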
102
103 /* Check if a 256bit AVX register is referenced as the destination or the source of a store (used as a note_stores callback).  */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122    unused.  If it isn't deleted, move it to just before a jump or call insn.
123 
124    STATE is the state of the upper 128bits of the AVX registers at entry.  */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238       /* Since the upper 128bits are cleared, no 256bit AVX register
239          can be passed to the callee here.  We only need to check
240          whether the callee returns a 256bit AVX register.  */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359    move it to just before a jump or call insn.  */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
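  /* At this point bb_order[B->index] gives B's position in the reverse
     completion order computed above, so keying the fibheaps on it makes
     the worklist below pop basic blocks in that order.  */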
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
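/* For example, MODE_INDEX (SImode) is 2, which selects the SImode entry
   (the tables are ordered QImode, HImode, SImode, DImode, other) in the
   multiply and divide cost arrays of the processor_costs tables below.  */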
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
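/* Worked example under that assumption: a 2-byte add gets
   COSTS_N_BYTES (2) == 4, the same value COSTS_N_INSNS (1) gives it in
   the speed-tuned tables, so byte counts and insn counts are kept on a
   comparable scale.  */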
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
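/* How to read the stringop_algs initializers below (a sketch, assuming
   the stringop_algs layout declared in i386.h): the first field is the
   algorithm for blocks of unknown size, followed by {max_size, algorithm}
   pairs where -1 means no upper bound.  For instance

     {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}

   means: unknown size -> libcall, blocks up to 256 bytes -> rep-prefixed
   4-byte moves, anything larger -> libcall.  */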
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
848      (we ensure the alignment).  For small blocks an inline loop is still a
849      noticeable win; for bigger blocks either rep movsl or rep movsb is the
850      way to go.  Rep movsb apparently has a more expensive startup time in the
851      CPU, but after 4K the difference is down in the noise.  */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068   /* For some reason, Athlon deals better with the REP prefix (relative to
1069      loops) than K8 does.  Alignment becomes important after 8 bytes for
1070      memcpy and 128 bytes for memset.  */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133   /* New AMD processors never drop prefetches; if they cannot be performed
1134      immediately, they are queued.  We set the number of simultaneous prefetches
1135      to a large constant to reflect this (it is probably not a good idea to
1136      leave the number of prefetches entirely unlimited, as their execution also
1137      takes some time).  */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146   /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1147      small blocks it is better to use a loop.  For large blocks, a libcall can
1148      do nontemporal accesses and beat inline code considerably.  */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219   /* New AMD processors never drop prefetches; if they cannot be performed
1220      immediately, they are queued.  We set the number of simultaneous prefetches
1221      to a large constant to reflect this (it is probably not a good idea to
1222      leave the number of prefetches entirely unlimited, as their execution also
1223      takes some time).  */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
1233   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1234      very small blocks it is better to use a loop.  For large blocks, a libcall
1235      can do nontemporal accesses and beat inline code considerably.  */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306   /* New AMD processors never drop prefetches; if they cannot be performed
1307      immediately, they are queued.  We set the number of simultaneous prefetches
1308      to a large constant to reflect this (it is probably not a good idea to
1309      leave the number of prefetches entirely unlimited, as their execution also
1310      takes some time).  */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1321      very small blocks it is better to use a loop.  For large blocks, a libcall
1322      can do nontemporal accesses and beat inline code considerably.  */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393   /* New AMD processors never drop prefetches; if they cannot be performed
1394      immediately, they are queued.  We set the number of simultaneous prefetches
1395      to a large constant to reflect this (it is probably not a good idea to
1396      leave the number of prefetches entirely unlimited, as their execution also
1397      takes some time).  */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
1407   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1408      very small blocks it is better to use a loop.  For large blocks, a libcall
1409      can do nontemporal accesses and beat inline code considerably.  */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
1489   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1490      very small blocks it is better to use a loop.  For large blocks, a libcall
1491      can do nontemporal accesses and beat inline code considerably.  */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be a common subset of the supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code-size
1924 tradeoff. We can't enable it for 32bit generic because it does not
1925 work well with PPro-based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results. But after P4 was released, no performance benefit
1939 was observed from branch hints, and they also increase code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls in the Generic32 compilation setting as well. However,
1955 in the current implementation the partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee-saving sequences. Because this option
1958 pays back little on PPro-based chips and conflicts with the partial register
1959 dependencies used by Athlon/P4-based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls was more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4-based chips, which treat 128bit
2039 SSE registers as single units, and K8-based chips, which divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra microop on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over a 20% SPECfp regression,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
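/* Editorial sketch (not the actual option-override code): how the two tables
   above are expected to be turned into per-feature booleans once ix86_tune
   and ix86_arch (declared further down in this file) have been selected.
   The local names mirror the masks used later in ix86_option_override.  */
#if 0
static void
sketch_init_feature_arrays (void)
{
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  unsigned int ix86_arch_mask = 1u << ix86_arch;
  unsigned int i;

  /* One boolean per X86_TUNE_* entry, keyed by the -mtune processor bit.  */
  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

  /* Likewise for the X86_ARCH_* entries, keyed by the -march processor.  */
  for (i = 0; i < X86_ARCH_LAST; i++)
    ix86_arch_features[i]
      = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
}
#endif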
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* If the average insn count for a single function invocation is
2217 lower than this constant, emit a fast (but longer) prologue and
2218 epilogue. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
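/* Editorial note (a sketch of intent, not the exact code): the frame layout
   code further down is expected to compare a rough count of the insns in the
   function body against this threshold and, for short functions, prefer the
   fast prologue/epilogue sequence even though it is longer.  */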
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
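/* Editorial note: i386.h is expected to consult this table directly, along
   the lines of the illustrative (hypothetical) definition below.  */
#if 0
#define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])
/* e.g. regclass_map[AX_REG] == AREG, regclass_map[SP_REG] == NON_Q_REGS.  */
#endif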
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register classes used for passing a given 64bit part of the argument.
2473 These represent classes as documented by the psABI, with the exception
2474 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2475 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2476
2477 Similarly we play games with the INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half then contains padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
2493
2494 #define MAX_CLASSES 4
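/* Editorial example (per the SysV x86-64 psABI, for illustration only):
   a 16-byte aggregate such as

       struct s { double d; long l; };

   is split into two eightbytes that classify as
   { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }, so D is passed in an SSE
   register and L in a general-purpose register.  Aggregates that cannot be
   classified into at most MAX_CLASSES eightbytes fall back to
   X86_64_MEMORY_CLASS and are passed in memory.  */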
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
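/* Editorial sketch (simplified; the real logic lives in
   ix86_option_override_internal): the table above is indexed by the selected
   tuning, providing both the cost model and the default alignment
   parameters.  */
#if 0
static void
sketch_apply_target_table (void)
{
  const struct ptt *ptt = &processor_target_table[ix86_tune];

  ix86_cost = ptt->cost;	/* cost model used by rtx cost hooks  */
  if (align_loops == 0)		/* only when not set on the command line  */
    align_loops = ptt->align_loop;
  if (align_jumps == 0)
    align_jumps = ptt->align_jump;
  if (align_functions == 0)
    align_functions = ptt->align_func;
}
#endif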
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2, which imply
2654 preceding options, are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the options in isa options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the
2859 prologue, and false otherwise. For x86 this is the case when the
2860 "hotfix"-friendly -mfentry instrumentation is used. */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* if this reaches 64, need to widen struct pta flags below */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX |PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
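/* Editorial sketch (simplified from the arch handling further down in this
   function): the alias table above is scanned for the -march= string, and
   the matching entry seeds the scheduling model, the architecture, and the
   implied ISA option masks.  */
#if 0
  for (i = 0; i < pta_size; i++)
    if (!strcmp (ix86_arch_string, processor_alias_table[i].name))
      {
	ix86_schedule = processor_alias_table[i].schedule;
	ix86_arch = processor_alias_table[i].processor;

	/* Each PTA_* bit implies the corresponding -m<isa> option unless it
	   was explicitly overridden on the command line.  */
	if ((processor_alias_table[i].flags & PTA_MMX)
	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
	  ix86_isa_flags |= OPTION_MASK_ISA_MMX;
	if ((processor_alias_table[i].flags & PTA_SSE2)
	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
	  ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
	/* ... and so on for the remaining PTA_* flags.  */
	break;
      }
#endif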
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* -fPIC is the default for x86_64. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
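/* Illustrative note on the defaults chosen above (not part of the original
   logic): a plain 64-bit compile with no -mcmodel and no -fpic ends up with
   ix86_cmodel == CM_SMALL; adding -fpic switches it to CM_SMALL_PIC; for the
   64-bit MS ABI, PIC is forced on so that rip-relative addressing is used;
   and any 32-bit compile without an explicit -mcmodel gets CM_32.  */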
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
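/* Worked example of the table-driven ISA selection above (illustrative only,
   assuming no explicit -m/-mno- ISA flags were given): -march=bdver1 matches
   the "bdver1" entry, so its PTA_* bits enable MMX, SSE through SSE4.2
   (including SSSE3), SSE4A, AES, PCLMUL, AVX, FMA4, XOP, LWP, CX16 and ABM;
   the ABM bit additionally implies POPCNT and LZCNT via the combined
   PTA_POPCNT|PTA_ABM and PTA_LZCNT|PTA_ABM tests, and x86_prefetch_sse is
   set because the entry includes PTA_SSE.  */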
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when only
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize > 1 && !global_options_set.x_flag_zee)
3453 flag_zee = 1;
3454 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3455 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3456 if (flag_asynchronous_unwind_tables == 2)
3457 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3458 if (flag_pcc_struct_return == 2)
3459 flag_pcc_struct_return = 0;
3460 }
3461 else
3462 {
3463 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3464 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3465 if (flag_asynchronous_unwind_tables == 2)
3466 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3467 if (flag_pcc_struct_return == 2)
3468 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3469 }
3470
3471 if (optimize_size)
3472 ix86_cost = &ix86_size_cost;
3473 else
3474 ix86_cost = processor_target_table[ix86_tune].cost;
3475
3476 /* Arrange to set up i386_stack_locals for all functions. */
3477 init_machine_status = ix86_init_machine_status;
3478
3479 /* Validate -mregparm= value. */
3480 if (global_options_set.x_ix86_regparm)
3481 {
3482 if (TARGET_64BIT)
3483 warning (0, "-mregparm is ignored in 64-bit mode");
3484 if (ix86_regparm > REGPARM_MAX)
3485 {
3486 error ("-mregparm=%d is not between 0 and %d",
3487 ix86_regparm, REGPARM_MAX);
3488 ix86_regparm = 0;
3489 }
3490 }
3491 if (TARGET_64BIT)
3492 ix86_regparm = REGPARM_MAX;
3493
3494 /* Default align_* from the processor table. */
3495 if (align_loops == 0)
3496 {
3497 align_loops = processor_target_table[ix86_tune].align_loop;
3498 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3499 }
3500 if (align_jumps == 0)
3501 {
3502 align_jumps = processor_target_table[ix86_tune].align_jump;
3503 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3504 }
3505 if (align_functions == 0)
3506 {
3507 align_functions = processor_target_table[ix86_tune].align_func;
3508 }
3509
3510 /* Provide default for -mbranch-cost= value. */
3511 if (!global_options_set.x_ix86_branch_cost)
3512 ix86_branch_cost = ix86_cost->branch_cost;
3513
3514 if (TARGET_64BIT)
3515 {
3516 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3517
3518 /* Enable by default the SSE and MMX builtins. Do allow the user to
3519 explicitly disable any of these. In particular, disabling SSE and
3520 MMX for kernel code is extremely useful. */
3521 if (!ix86_arch_specified)
3522 ix86_isa_flags
3523 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3524 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3525
3526 if (TARGET_RTD)
3527 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3528 }
3529 else
3530 {
3531 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3532
3533 if (!ix86_arch_specified)
3534 ix86_isa_flags
3535 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3536
3537 /* The i386 ABI does not specify a red zone. It still makes sense to use
3538 one when the programmer takes care to keep the stack from being destroyed. */
3539 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3540 target_flags |= MASK_NO_RED_ZONE;
3541 }
3542
3543 /* Keep nonleaf frame pointers. */
3544 if (flag_omit_frame_pointer)
3545 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3546 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3547 flag_omit_frame_pointer = 1;
3548
3549 /* If we're doing fast math, we don't care about comparison order
3550 wrt NaNs. This lets us use a shorter comparison sequence. */
3551 if (flag_finite_math_only)
3552 target_flags &= ~MASK_IEEE_FP;
3553
3554 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3555 since the insns won't need emulation. */
3556 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3557 target_flags &= ~MASK_NO_FANCY_MATH_387;
3558
3559 /* Likewise, if the target doesn't have a 387, or we've specified
3560 software floating point, don't use 387 inline intrinsics. */
3561 if (!TARGET_80387)
3562 target_flags |= MASK_NO_FANCY_MATH_387;
3563
3564 /* Turn on MMX builtins for -msse. */
3565 if (TARGET_SSE)
3566 {
3567 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3568 x86_prefetch_sse = true;
3569 }
3570
3571 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3572 if (TARGET_SSE4_2 || TARGET_ABM)
3573 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3574
3575 /* Turn on lzcnt instruction for -mabm. */
3576 if (TARGET_ABM)
3577 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3578
3579 /* Validate -mpreferred-stack-boundary= value or default it to
3580 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3581 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3582 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3583 {
3584 int min = (TARGET_64BIT ? 4 : 2);
3585 int max = (TARGET_SEH ? 4 : 12);
3586
3587 if (ix86_preferred_stack_boundary_arg < min
3588 || ix86_preferred_stack_boundary_arg > max)
3589 {
3590 if (min == max)
3591 error ("-mpreferred-stack-boundary is not supported "
3592 "for this target");
3593 else
3594 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3595 ix86_preferred_stack_boundary_arg, min, max);
3596 }
3597 else
3598 ix86_preferred_stack_boundary
3599 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3600 }
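/* Worked example for the check above (illustrative, assuming BITS_PER_UNIT
   is 8): -mpreferred-stack-boundary=4 passes the range check and yields
   ix86_preferred_stack_boundary = (1 << 4) * 8 = 128 bits, i.e. the usual
   16-byte stack alignment; =2 would request 4-byte alignment and is only
   accepted for 32-bit code, since min is 4 when TARGET_64BIT.  */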
3601
3602 /* Set the default value for -mstackrealign. */
3603 if (ix86_force_align_arg_pointer == -1)
3604 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3605
3606 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3607
3608 /* Validate -mincoming-stack-boundary= value or default it to
3609 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3610 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3611 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3612 {
3613 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3614 || ix86_incoming_stack_boundary_arg > 12)
3615 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3616 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3617 else
3618 {
3619 ix86_user_incoming_stack_boundary
3620 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3621 ix86_incoming_stack_boundary
3622 = ix86_user_incoming_stack_boundary;
3623 }
3624 }
3625
3626 /* Accept -msseregparm only if at least SSE support is enabled. */
3627 if (TARGET_SSEREGPARM
3628 && ! TARGET_SSE)
3629 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3630
3631 if (global_options_set.x_ix86_fpmath)
3632 {
3633 if (ix86_fpmath & FPMATH_SSE)
3634 {
3635 if (!TARGET_SSE)
3636 {
3637 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3638 ix86_fpmath = FPMATH_387;
3639 }
3640 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3641 {
3642 warning (0, "387 instruction set disabled, using SSE arithmetics");
3643 ix86_fpmath = FPMATH_SSE;
3644 }
3645 }
3646 }
3647 else
3648 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3649
3650 /* If the i387 is disabled, then do not return values in it. */
3651 if (!TARGET_80387)
3652 target_flags &= ~MASK_FLOAT_RETURNS;
3653
3654 /* Use external vectorized library in vectorizing intrinsics. */
3655 if (global_options_set.x_ix86_veclibabi_type)
3656 switch (ix86_veclibabi_type)
3657 {
3658 case ix86_veclibabi_type_svml:
3659 ix86_veclib_handler = ix86_veclibabi_svml;
3660 break;
3661
3662 case ix86_veclibabi_type_acml:
3663 ix86_veclib_handler = ix86_veclibabi_acml;
3664 break;
3665
3666 default:
3667 gcc_unreachable ();
3668 }
3669
3670 if ((!USE_IX86_FRAME_POINTER
3671 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3672 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3673 && !optimize_size)
3674 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3675
3676 /* ??? Unwind info is not correct around the CFG unless either a frame
3677 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3678 unwind info generation to be aware of the CFG and propagating states
3679 around edges. */
3680 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3681 || flag_exceptions || flag_non_call_exceptions)
3682 && flag_omit_frame_pointer
3683 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 {
3685 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3686 warning (0, "unwind tables currently require either a frame pointer "
3687 "or %saccumulate-outgoing-args%s for correctness",
3688 prefix, suffix);
3689 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 }
3691
3692 /* If stack probes are required, the space used for large function
3693 arguments on the stack must also be probed, so enable
3694 -maccumulate-outgoing-args so this happens in the prologue. */
3695 if (TARGET_STACK_PROBE
3696 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3697 {
3698 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3699 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3700 "for correctness", prefix, suffix);
3701 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3702 }
3703
3704 /* For sane SSE instruction set generation we need fcomi instruction.
3705 It is safe to enable all CMOVE instructions. Also, RDRAND intrinsic
3706 expands to a sequence that includes conditional move. */
3707 if (TARGET_SSE || TARGET_RDRND)
3708 TARGET_CMOVE = 1;
3709
3710 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3711 {
3712 char *p;
3713 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714 p = strchr (internal_label_prefix, 'X');
3715 internal_label_prefix_len = p - internal_label_prefix;
3716 *p = '\0';
3717 }
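/* Illustrative note: the string computed above depends entirely on the
   target's ASM_GENERATE_INTERNAL_LABEL.  If, for example, a target formats
   internal labels as "*.LX0" (an assumed format, not taken from this file),
   the 'X' is located and overwritten with '\0', leaving "*.L" as
   internal_label_prefix with internal_label_prefix_len == 3.  */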
3718
3719 /* When scheduling description is not available, disable scheduler pass
3720 so it won't slow down the compilation and make x87 code slower. */
3721 if (!TARGET_SCHEDULE)
3722 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723
3724 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 ix86_cost->simultaneous_prefetches,
3726 global_options.x_param_values,
3727 global_options_set.x_param_values);
3728 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 global_options.x_param_values,
3730 global_options_set.x_param_values);
3731 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 global_options.x_param_values,
3736 global_options_set.x_param_values);
3737
3738 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3739 if (flag_prefetch_loop_arrays < 0
3740 && HAVE_prefetch
3741 && optimize >= 3
3742 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743 flag_prefetch_loop_arrays = 1;
3744
3745 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746 can be optimized to ap = __builtin_next_arg (0). */
3747 if (!TARGET_64BIT && !flag_split_stack)
3748 targetm.expand_builtin_va_start = NULL;
3749
3750 if (TARGET_64BIT)
3751 {
3752 ix86_gen_leave = gen_leave_rex64;
3753 ix86_gen_add3 = gen_adddi3;
3754 ix86_gen_sub3 = gen_subdi3;
3755 ix86_gen_sub3_carry = gen_subdi3_carry;
3756 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757 ix86_gen_monitor = gen_sse3_monitor64;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 }
3763 else
3764 {
3765 ix86_gen_leave = gen_leave;
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_monitor = gen_sse3_monitor;
3771 ix86_gen_andsp = gen_andsi3;
3772 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775 }
3776
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782
3783 if (!TARGET_64BIT && flag_pic)
3784 {
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3789 }
3790 else if (TARGET_SEH)
3791 {
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3795 }
3796 else if (flag_fentry < 0)
3797 {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3803 }
3804
3805 if (TARGET_AVX)
3806 {
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3809 AVX unaligned loads/stores. */
3810 if (!optimize_size)
3811 {
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3822 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 target_flags |= MASK_PREFER_AVX128;
3824 }
3825 }
3826 else
3827 {
3828 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3829 target_flags &= ~MASK_VZEROUPPER;
3830 }
3831
3832 if (ix86_recip_name)
3833 {
3834 char *p = ASTRDUP (ix86_recip_name);
3835 char *q;
3836 unsigned int mask, i;
3837 bool invert;
3838
3839 while ((q = strtok (p, ",")) != NULL)
3840 {
3841 p = NULL;
3842 if (*q == '!')
3843 {
3844 invert = true;
3845 q++;
3846 }
3847 else
3848 invert = false;
3849
3850 if (!strcmp (q, "default"))
3851 mask = RECIP_MASK_ALL;
3852 else
3853 {
3854 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 if (!strcmp (q, recip_options[i].string))
3856 {
3857 mask = recip_options[i].mask;
3858 break;
3859 }
3860
3861 if (i == ARRAY_SIZE (recip_options))
3862 {
3863 error ("unknown option for -mrecip=%s", q);
3864 invert = false;
3865 mask = RECIP_MASK_NONE;
3866 }
3867 }
3868
3869 recip_mask_explicit |= mask;
3870 if (invert)
3871 recip_mask &= ~mask;
3872 else
3873 recip_mask |= mask;
3874 }
3875 }
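/* Worked example for the -mrecip= parser above (illustrative only):
   -mrecip=all,!sqrt is tokenized into "all" and "!sqrt"; "all" ORs
   RECIP_MASK_ALL into recip_mask, then the '!' entry clears
   RECIP_MASK_SQRT, so the scalar square-root approximation stays disabled
   while the division and vector variants remain enabled.  Both tokens'
   masks are also recorded in recip_mask_explicit so the TARGET_RECIP
   defaults applied below do not override them.  */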
3876
3877 if (TARGET_RECIP)
3878 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879 else if (target_flags_explicit & MASK_RECIP)
3880 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3881
3882 /* Save the initial options in case the user does function specific
3883 options. */
3884 if (main_args_p)
3885 target_option_default_node = target_option_current_node
3886 = build_target_option_node ();
3887 }
3888
3889 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode. */
3890
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894 if (!val)
3895 return false;
3896
3897 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898 return true;
3899
3900 if (GET_CODE (val) == PARALLEL)
3901 {
3902 int i;
3903 rtx r;
3904
3905 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 {
3907 r = XVECEXP (val, 0, i);
3908 if (GET_CODE (r) == EXPR_LIST
3909 && XEXP (r, 0)
3910 && REG_P (XEXP (r, 0))
3911 && (GET_MODE (XEXP (r, 0)) == OImode
3912 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 return true;
3914 }
3915 }
3916
3917 return false;
3918 }
3919
3920 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3921
3922 static void
3923 ix86_option_override (void)
3924 {
3925 ix86_option_override_internal (true);
3926 }
3927
3928 /* Update register usage after having seen the compiler flags. */
3929
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933 int i;
3934 unsigned int j;
3935
3936 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937 {
3938 if (fixed_regs[i] > 1)
3939 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 if (call_used_regs[i] > 1)
3941 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942 }
3943
3944 /* The PIC register, if it exists, is fixed. */
3945 j = PIC_OFFSET_TABLE_REGNUM;
3946 if (j != INVALID_REGNUM)
3947 fixed_regs[j] = call_used_regs[j] = 1;
3948
3949 /* The 64-bit MS_ABI changes the set of call-used registers. */
3950 if (TARGET_64BIT_MS_ABI)
3951 {
3952 call_used_regs[SI_REG] = 0;
3953 call_used_regs[DI_REG] = 0;
3954 call_used_regs[XMM6_REG] = 0;
3955 call_used_regs[XMM7_REG] = 0;
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 call_used_regs[i] = 0;
3958 }
3959
3960 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961 other call-clobbered regs for 64-bit. */
3962 if (TARGET_64BIT)
3963 {
3964 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965
3966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 && call_used_regs[i])
3969 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970 }
3971
3972 /* If MMX is disabled, squash the registers. */
3973 if (! TARGET_MMX)
3974 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977
3978 /* If SSE is disabled, squash the registers. */
3979 if (! TARGET_SSE)
3980 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983
3984 /* If the FPU is disabled, squash the registers. */
3985 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989
3990 /* If 32-bit, squash the 64-bit registers. */
3991 if (! TARGET_64BIT)
3992 {
3993 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 reg_names[i] = "";
3995 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 reg_names[i] = "";
3997 }
3998 }
3999
4000 \f
4001 /* Save the current options */
4002
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4015
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023
4024 /* Restore the current options */
4025
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4033
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4043
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4046 {
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051 }
4052
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4055 {
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060 }
4061 }
4062
4063 /* Print the current options */
4064
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4068 {
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4072
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4079
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4086
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088
4089 if (target_string)
4090 {
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4093 }
4094 }
4095
4096 \f
4097 /* Inner function to process the attribute((target(...))), take an argument and
4098 set the current options from the argument. If we have a list, recursively go
4099 over the list. */
4100
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4104 {
4105 char *next_optstr;
4106 bool ret = true;
4107
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4113
4114 enum ix86_opt_type
4115 {
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4122 };
4123
4124 static const struct
4125 {
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160
4161 /* enum options */
4162 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4163
4164 /* string options */
4165 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4166 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4167
4168 /* flag options */
4169 IX86_ATTR_YES ("cld",
4170 OPT_mcld,
4171 MASK_CLD),
4172
4173 IX86_ATTR_NO ("fancy-math-387",
4174 OPT_mfancy_math_387,
4175 MASK_NO_FANCY_MATH_387),
4176
4177 IX86_ATTR_YES ("ieee-fp",
4178 OPT_mieee_fp,
4179 MASK_IEEE_FP),
4180
4181 IX86_ATTR_YES ("inline-all-stringops",
4182 OPT_minline_all_stringops,
4183 MASK_INLINE_ALL_STRINGOPS),
4184
4185 IX86_ATTR_YES ("inline-stringops-dynamically",
4186 OPT_minline_stringops_dynamically,
4187 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188
4189 IX86_ATTR_NO ("align-stringops",
4190 OPT_mno_align_stringops,
4191 MASK_NO_ALIGN_STRINGOPS),
4192
4193 IX86_ATTR_YES ("recip",
4194 OPT_mrecip,
4195 MASK_RECIP),
4196
4197 };
4198
4199 /* If this is a list, recurse to get the options. */
4200 if (TREE_CODE (args) == TREE_LIST)
4201 {
4202 bool ret = true;
4203
4204 for (; args; args = TREE_CHAIN (args))
4205 if (TREE_VALUE (args)
4206 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 p_strings, enum_opts_set))
4208 ret = false;
4209
4210 return ret;
4211 }
4212
4213 else if (TREE_CODE (args) != STRING_CST)
4214 gcc_unreachable ();
4215
4216 /* Handle multiple arguments separated by commas. */
4217 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218
4219 while (next_optstr && *next_optstr != '\0')
4220 {
4221 char *p = next_optstr;
4222 char *orig_p = p;
4223 char *comma = strchr (next_optstr, ',');
4224 const char *opt_string;
4225 size_t len, opt_len;
4226 int opt;
4227 bool opt_set_p;
4228 char ch;
4229 unsigned i;
4230 enum ix86_opt_type type = ix86_opt_unknown;
4231 int mask = 0;
4232
4233 if (comma)
4234 {
4235 *comma = '\0';
4236 len = comma - next_optstr;
4237 next_optstr = comma + 1;
4238 }
4239 else
4240 {
4241 len = strlen (p);
4242 next_optstr = NULL;
4243 }
4244
4245 /* Recognize no-xxx. */
4246 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 {
4248 opt_set_p = false;
4249 p += 3;
4250 len -= 3;
4251 }
4252 else
4253 opt_set_p = true;
4254
4255 /* Find the option. */
4256 ch = *p;
4257 opt = N_OPTS;
4258 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 {
4260 type = attrs[i].type;
4261 opt_len = attrs[i].len;
4262 if (ch == attrs[i].string[0]
4263 && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 ? len == opt_len
4265 : len > opt_len)
4266 && memcmp (p, attrs[i].string, opt_len) == 0)
4267 {
4268 opt = attrs[i].opt;
4269 mask = attrs[i].mask;
4270 opt_string = attrs[i].string;
4271 break;
4272 }
4273 }
4274
4275 /* Process the option. */
4276 if (opt == N_OPTS)
4277 {
4278 error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 ret = false;
4280 }
4281
4282 else if (type == ix86_opt_isa)
4283 {
4284 struct cl_decoded_option decoded;
4285
4286 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 ix86_handle_option (&global_options, &global_options_set,
4288 &decoded, input_location);
4289 }
4290
4291 else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 {
4293 if (type == ix86_opt_no)
4294 opt_set_p = !opt_set_p;
4295
4296 if (opt_set_p)
4297 target_flags |= mask;
4298 else
4299 target_flags &= ~mask;
4300 }
4301
4302 else if (type == ix86_opt_str)
4303 {
4304 if (p_strings[opt])
4305 {
4306 error ("option(\"%s\") was already specified", opt_string);
4307 ret = false;
4308 }
4309 else
4310 p_strings[opt] = xstrdup (p + opt_len);
4311 }
4312
4313 else if (type == ix86_opt_enum)
4314 {
4315 bool arg_ok;
4316 int value;
4317
4318 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 if (arg_ok)
4320 set_option (&global_options, enum_opts_set, opt, value,
4321 p + opt_len, DK_UNSPECIFIED, input_location,
4322 global_dc);
4323 else
4324 {
4325 error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 ret = false;
4327 }
4328 }
4329
4330 else
4331 gcc_unreachable ();
4332 }
4333
4334 return ret;
4335 }
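/* Usage sketch for the parser above (illustrative; the declaration is
   hypothetical):

     int foo (int) __attribute__ ((target ("avx,no-sse4a,fpmath=sse")));

   reaches this function as the string "avx,no-sse4a,fpmath=sse".  "avx"
   matches an IX86_ATTR_ISA entry and is routed through ix86_handle_option;
   "no-sse4a" has its "no-" prefix stripped and clears the corresponding ISA
   flag; "fpmath=sse" is an ix86_opt_enum entry whose value is applied with
   set_option.  A token such as "arch=core2" would instead be stashed in
   p_strings[] for ix86_valid_target_attribute_tree to apply later.  */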
4336
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4338
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342 const char *orig_arch_string = ix86_arch_string;
4343 const char *orig_tune_string = ix86_tune_string;
4344 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345 int orig_tune_defaulted = ix86_tune_defaulted;
4346 int orig_arch_specified = ix86_arch_specified;
4347 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348 tree t = NULL_TREE;
4349 int i;
4350 struct cl_target_option *def
4351 = TREE_TARGET_OPTION (target_option_default_node);
4352 struct gcc_options enum_opts_set;
4353
4354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355
4356 /* Process each of the options on the chain. */
4357 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 &enum_opts_set))
4359 return NULL_TREE;
4360
4361 /* If the changed options are different from the default, rerun
4362 ix86_option_override_internal, and then save the options away.
4363 The string options are attribute options, and will be undone
4364 when we copy the save structure. */
4365 if (ix86_isa_flags != def->x_ix86_isa_flags
4366 || target_flags != def->x_target_flags
4367 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369 || enum_opts_set.x_ix86_fpmath)
4370 {
4371 /* If we are using the default tune= or arch=, undo the string assigned,
4372 and use the default. */
4373 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375 else if (!orig_arch_specified)
4376 ix86_arch_string = NULL;
4377
4378 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380 else if (orig_tune_defaulted)
4381 ix86_tune_string = NULL;
4382
4383 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4384 if (enum_opts_set.x_ix86_fpmath)
4385 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386 else if (!TARGET_64BIT && TARGET_SSE)
4387 {
4388 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 }
4391
4392 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4393 ix86_option_override_internal (false);
4394
4395 /* Add any builtin functions with the new isa if any. */
4396 ix86_add_new_builtins (ix86_isa_flags);
4397
4398 /* Save the current options unless we are validating options for
4399 #pragma. */
4400 t = build_target_option_node ();
4401
4402 ix86_arch_string = orig_arch_string;
4403 ix86_tune_string = orig_tune_string;
4404 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405
4406 /* Free up memory allocated to hold the strings */
4407 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 free (option_strings[i]);
4409 }
4410
4411 return t;
4412 }
4413
4414 /* Hook to validate attribute((target("string"))). */
4415
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 tree ARG_UNUSED (name),
4419 tree args,
4420 int ARG_UNUSED (flags))
4421 {
4422 struct cl_target_option cur_target;
4423 bool ret = true;
4424 tree old_optimize = build_optimization_node ();
4425 tree new_target, new_optimize;
4426 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427
4428 /* If the function changed the optimization levels as well as setting target
4429 options, start with the optimizations specified. */
4430 if (func_optimize && func_optimize != old_optimize)
4431 cl_optimization_restore (&global_options,
4432 TREE_OPTIMIZATION (func_optimize));
4433
4434 /* The target attributes may also change some optimization flags, so update
4435 the optimization options if necessary. */
4436 cl_target_option_save (&cur_target, &global_options);
4437 new_target = ix86_valid_target_attribute_tree (args);
4438 new_optimize = build_optimization_node ();
4439
4440 if (!new_target)
4441 ret = false;
4442
4443 else if (fndecl)
4444 {
4445 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446
4447 if (old_optimize != new_optimize)
4448 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449 }
4450
4451 cl_target_option_restore (&global_options, &cur_target);
4452
4453 if (old_optimize != new_optimize)
4454 cl_optimization_restore (&global_options,
4455 TREE_OPTIMIZATION (old_optimize));
4456
4457 return ret;
4458 }
4459
4460 \f
4461 /* Hook to determine if one function can safely inline another. */
4462
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466 bool ret = false;
4467 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469
4470 /* If callee has no option attributes, then it is ok to inline. */
4471 if (!callee_tree)
4472 ret = true;
4473
4474 /* If caller has no option attributes, but callee does then it is not ok to
4475 inline. */
4476 else if (!caller_tree)
4477 ret = false;
4478
4479 else
4480 {
4481 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483
4484 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4485 function can inline an SSE2 function but an SSE2 function can't inline
4486 an SSE4 function. */
4487 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 != callee_opts->x_ix86_isa_flags)
4489 ret = false;
4490
4491 /* See if we have the same non-isa options. */
4492 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 ret = false;
4494
4495 /* See if arch, tune, etc. are the same. */
4496 else if (caller_opts->arch != callee_opts->arch)
4497 ret = false;
4498
4499 else if (caller_opts->tune != callee_opts->tune)
4500 ret = false;
4501
4502 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 ret = false;
4504
4505 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 ret = false;
4507
4508 else
4509 ret = true;
4510 }
4511
4512 return ret;
4513 }
4514
4515 \f
4516 /* Remember the last target of ix86_set_current_function. */
4517 static GTY(()) tree ix86_previous_fndecl;
4518
4519 /* Establish appropriate back-end context for processing the function
4520 FNDECL. The argument might be NULL to indicate processing at top
4521 level, outside of any function scope. */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525 /* Only change the context if the function changes. This hook is called
4526 several times in the course of compiling a function, and we don't want to
4527 slow things down too much or call target_reinit when it isn't safe. */
4528 if (fndecl && fndecl != ix86_previous_fndecl)
4529 {
4530 tree old_tree = (ix86_previous_fndecl
4531 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 : NULL_TREE);
4533
4534 tree new_tree = (fndecl
4535 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 : NULL_TREE);
4537
4538 ix86_previous_fndecl = fndecl;
4539 if (old_tree == new_tree)
4540 ;
4541
4542 else if (new_tree)
4543 {
4544 cl_target_option_restore (&global_options,
4545 TREE_TARGET_OPTION (new_tree));
4546 target_reinit ();
4547 }
4548
4549 else if (old_tree)
4550 {
4551 struct cl_target_option *def
4552 = TREE_TARGET_OPTION (target_option_current_node);
4553
4554 cl_target_option_restore (&global_options, def);
4555 target_reinit ();
4556 }
4557 }
4558 }
4559
4560 \f
4561 /* Return true if this goes in large data/bss. */
4562
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567 return false;
4568
4569 /* Functions are never large data. */
4570 if (TREE_CODE (exp) == FUNCTION_DECL)
4571 return false;
4572
4573 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574 {
4575 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576 if (strcmp (section, ".ldata") == 0
4577 || strcmp (section, ".lbss") == 0)
4578 return true;
4579 return false;
4580 }
4581 else
4582 {
4583 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584
4585 /* If this is an incomplete type with size 0, then we can't put it
4586 in data because it might be too big when completed. */
4587 if (!size || size > ix86_section_threshold)
4588 return true;
4589 }
4590
4591 return false;
4592 }
4593
4594 /* Switch to the appropriate section for output of DECL.
4595 DECL is either a `VAR_DECL' node or a constant of some sort.
4596 RELOC indicates whether forming the initial value of DECL requires
4597 link-time relocations. */
4598
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 ATTRIBUTE_UNUSED;
4601
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 unsigned HOST_WIDE_INT align)
4605 {
4606 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607 && ix86_in_large_data_p (decl))
4608 {
4609 const char *sname = NULL;
4610 unsigned int flags = SECTION_WRITE;
4611 switch (categorize_decl_for_section (decl, reloc))
4612 {
4613 case SECCAT_DATA:
4614 sname = ".ldata";
4615 break;
4616 case SECCAT_DATA_REL:
4617 sname = ".ldata.rel";
4618 break;
4619 case SECCAT_DATA_REL_LOCAL:
4620 sname = ".ldata.rel.local";
4621 break;
4622 case SECCAT_DATA_REL_RO:
4623 sname = ".ldata.rel.ro";
4624 break;
4625 case SECCAT_DATA_REL_RO_LOCAL:
4626 sname = ".ldata.rel.ro.local";
4627 break;
4628 case SECCAT_BSS:
4629 sname = ".lbss";
4630 flags |= SECTION_BSS;
4631 break;
4632 case SECCAT_RODATA:
4633 case SECCAT_RODATA_MERGE_STR:
4634 case SECCAT_RODATA_MERGE_STR_INIT:
4635 case SECCAT_RODATA_MERGE_CONST:
4636 sname = ".lrodata";
4637 flags = 0;
4638 break;
4639 case SECCAT_SRODATA:
4640 case SECCAT_SDATA:
4641 case SECCAT_SBSS:
4642 gcc_unreachable ();
4643 case SECCAT_TEXT:
4644 case SECCAT_TDATA:
4645 case SECCAT_TBSS:
4646 /* We don't split these for the medium model. Place them into
4647 default sections and hope for the best. */
4648 break;
4649 }
4650 if (sname)
4651 {
4652 /* We might get called with string constants, but get_named_section
4653 doesn't like them as they are not DECLs. Also, we need to set
4654 flags in that case. */
4655 if (!DECL_P (decl))
4656 return get_section (sname, flags, NULL);
4657 return get_named_section (decl, sname, reloc);
4658 }
4659 }
4660 return default_elf_select_section (decl, reloc, align);
4661 }
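/* Illustrative example for the section selection above (assumed sizes, not
   from this file): under -mcmodel=medium, a writable initialized object
   larger than ix86_section_threshold is flagged by ix86_in_large_data_p,
   is typically categorized as SECCAT_DATA (or one of its _REL variants)
   here, and therefore lands in ".ldata" rather than ".data"; uninitialized
   objects of the same size go to ".lbss" and read-only ones to ".lrodata".
   Objects at or below the threshold fall through to
   default_elf_select_section.  */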
4662
4663 /* Build up a unique section name, expressed as a
4664 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665 RELOC indicates whether the initial value of EXP requires
4666 link-time relocations. */
4667
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672 && ix86_in_large_data_p (decl))
4673 {
4674 const char *prefix = NULL;
4675 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4676 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677
4678 switch (categorize_decl_for_section (decl, reloc))
4679 {
4680 case SECCAT_DATA:
4681 case SECCAT_DATA_REL:
4682 case SECCAT_DATA_REL_LOCAL:
4683 case SECCAT_DATA_REL_RO:
4684 case SECCAT_DATA_REL_RO_LOCAL:
4685 prefix = one_only ? ".ld" : ".ldata";
4686 break;
4687 case SECCAT_BSS:
4688 prefix = one_only ? ".lb" : ".lbss";
4689 break;
4690 case SECCAT_RODATA:
4691 case SECCAT_RODATA_MERGE_STR:
4692 case SECCAT_RODATA_MERGE_STR_INIT:
4693 case SECCAT_RODATA_MERGE_CONST:
4694 prefix = one_only ? ".lr" : ".lrodata";
4695 break;
4696 case SECCAT_SRODATA:
4697 case SECCAT_SDATA:
4698 case SECCAT_SBSS:
4699 gcc_unreachable ();
4700 case SECCAT_TEXT:
4701 case SECCAT_TDATA:
4702 case SECCAT_TBSS:
4703 /* We don't split these for the medium model. Place them into
4704 default sections and hope for the best. */
4705 break;
4706 }
4707 if (prefix)
4708 {
4709 const char *name, *linkonce;
4710 char *string;
4711
4712 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 name = targetm.strip_name_encoding (name);
4714
4715 /* If we're using one_only, then there needs to be a .gnu.linkonce
4716 prefix to the section name. */
4717 linkonce = one_only ? ".gnu.linkonce" : "";
4718
4719 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720
4721 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 return;
4723 }
4724 }
4725 default_unique_section (decl, reloc);
4726 }
4727
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730 uninitialized external linkage data object.
4731
4732 For medium model x86-64 we need to use the .largecomm directive for
4733 large objects. */
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 const char *name, unsigned HOST_WIDE_INT size,
4737 int align)
4738 {
4739 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740 && size > (unsigned int)ix86_section_threshold)
4741 fputs (".largecomm\t", file);
4742 else
4743 fputs (COMMON_ASM_OP, file);
4744 assemble_name (file, name);
4745 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 size, align / BITS_PER_UNIT);
4747 }
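/* Example of the emitted directive (illustrative; "big_buf" is a made-up
   symbol and COMMON_ASM_OP is assumed to expand to a tab-separated ".comm"):
   a 100000-byte common symbol aligned to 256 bits under -mcmodel=medium
   would be printed as

       .largecomm	big_buf,100000,32

   since align / BITS_PER_UNIT = 256 / 8 = 32, while the same object at or
   below ix86_section_threshold would use the ordinary .comm form.  */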
4748 #endif
4749
4750 /* Utility function for targets to use in implementing
4751 ASM_OUTPUT_ALIGNED_BSS. */
4752
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 const char *name, unsigned HOST_WIDE_INT size,
4756 int align)
4757 {
4758 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759 && size > (unsigned int)ix86_section_threshold)
4760 switch_to_section (get_named_section (decl, ".lbss", 0));
4761 else
4762 switch_to_section (bss_section);
4763 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765 last_assemble_variable_decl = decl;
4766 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768 /* The standard thing is to just output a label for the object. */
4769 ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771 ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 \f
4774 /* Decide whether we must probe the stack before any space allocation
4775 on this target. It's essentially TARGET_STACK_PROBE except when
4776 -fstack-check causes the stack to be already probed differently. */
4777
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781 /* Do not probe the stack twice if static stack checking is enabled. */
4782 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783 return false;
4784
4785 return TARGET_STACK_PROBE;
4786 }
4787 \f
4788 /* Decide whether we can make a sibling call to a function. DECL is the
4789 declaration of the function being targeted by the call and EXP is the
4790 CALL_EXPR representing the call. */
4791
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795 tree type, decl_or_type;
4796 rtx a, b;
4797
4798 /* If we are generating position-independent code, we cannot sibcall
4799 optimize any indirect call, or a direct call to a global function,
4800 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4801 if (!TARGET_MACHO
4802 && !TARGET_64BIT
4803 && flag_pic
4804 && (!decl || !targetm.binds_local_p (decl)))
4805 return false;
4806
4807 /* If we need to align the outgoing stack, then sibcalling would
4808 unalign the stack, which may break the called function. */
4809 if (ix86_minimum_incoming_stack_boundary (true)
4810 < PREFERRED_STACK_BOUNDARY)
4811 return false;
4812
4813 if (decl)
4814 {
4815 decl_or_type = decl;
4816 type = TREE_TYPE (decl);
4817 }
4818 else
4819 {
4820 /* We're looking at the CALL_EXPR, we need the type of the function. */
4821 type = CALL_EXPR_FN (exp); /* pointer expression */
4822 type = TREE_TYPE (type); /* pointer type */
4823 type = TREE_TYPE (type); /* function type */
4824 decl_or_type = type;
4825 }
4826
4827 /* Check that the return value locations are the same. For example,
4828 if we are returning floats on the 80387 register stack, we cannot
4829 make a sibcall from a function that doesn't return a float to a
4830 function that does or, conversely, from a function that does return
4831 a float to a function that doesn't; the necessary stack adjustment
4832 would not be executed. This is also the place we notice
4833 differences in the return value ABI. Note that it is ok for one
4834 of the functions to have void return type as long as the return
4835 value of the other is passed in a register. */
4836 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 cfun->decl, false);
4839 if (STACK_REG_P (a) || STACK_REG_P (b))
4840 {
4841 if (!rtx_equal_p (a, b))
4842 return false;
4843 }
4844 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845 {
4846 /* Disable sibcall if we need to generate vzeroupper after
4847 callee returns. */
4848 if (TARGET_VZEROUPPER
4849 && cfun->machine->callee_return_avx256_p
4850 && !cfun->machine->caller_return_avx256_p)
4851 return false;
4852 }
4853 else if (!rtx_equal_p (a, b))
4854 return false;
4855
4856 if (TARGET_64BIT)
4857 {
4858 /* The SYSV ABI has more call-clobbered registers;
4859 disallow sibcalls from MS to SYSV. */
4860 if (cfun->machine->call_abi == MS_ABI
4861 && ix86_function_type_abi (type) == SYSV_ABI)
4862 return false;
4863 }
4864 else
4865 {
4866 /* If this call is indirect, we'll need to be able to use a
4867 call-clobbered register for the address of the target function.
4868 Make sure that all such registers are not used for passing
4869 parameters. Note that DLLIMPORT functions are indirect. */
4870 if (!decl
4871 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 {
4873 if (ix86_function_regparm (type, NULL) >= 3)
4874 {
4875 /* ??? Need to count the actual number of registers to be used,
4876 not the possible number of registers. Fix later. */
4877 return false;
4878 }
4879 }
4880 }
4881
4882 /* Otherwise okay. That also includes certain types of indirect calls. */
4883 return true;
4884 }
4885
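/* Illustrative example, not part of the compiler sources: with
   -m32 -fpic -O2, a call in tail position such as

       extern int helper (int);
       int wrapper (int x) { return helper (x); }

   is not turned into a sibcall when HELPER does not bind locally,
   because the PLT call needs %ebx live; the same code built without
   -fpic (or with HELPER made static) normally is.  The outcome also
   depends on the other checks above (stack alignment, return value
   location, regparm usage), so this is only a sketch.  */
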
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887 and "sseregparm" calling convention attributes;
4888 arguments as in struct attribute_spec.handler. */
4889
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 tree args,
4893 int flags ATTRIBUTE_UNUSED,
4894 bool *no_add_attrs)
4895 {
4896 if (TREE_CODE (*node) != FUNCTION_TYPE
4897 && TREE_CODE (*node) != METHOD_TYPE
4898 && TREE_CODE (*node) != FIELD_DECL
4899 && TREE_CODE (*node) != TYPE_DECL)
4900 {
4901 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 name);
4903 *no_add_attrs = true;
4904 return NULL_TREE;
4905 }
4906
4907 /* Can combine regparm with all attributes but fastcall and thiscall. */
4908 if (is_attribute_p ("regparm", name))
4909 {
4910 tree cst;
4911
4912 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913 {
4914 error ("fastcall and regparm attributes are not compatible");
4915 }
4916
4917 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 {
4919 error ("regparam and thiscall attributes are not compatible");
4920 }
4921
4922 cst = TREE_VALUE (args);
4923 if (TREE_CODE (cst) != INTEGER_CST)
4924 {
4925 warning (OPT_Wattributes,
4926 "%qE attribute requires an integer constant argument",
4927 name);
4928 *no_add_attrs = true;
4929 }
4930 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 {
4932 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 name, REGPARM_MAX);
4934 *no_add_attrs = true;
4935 }
4936
4937 return NULL_TREE;
4938 }
4939
4940 if (TARGET_64BIT)
4941 {
4942 /* Do not warn when emulating the MS ABI. */
4943 if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 && TREE_CODE (*node) != METHOD_TYPE)
4945 || ix86_function_type_abi (*node) != MS_ABI)
4946 warning (OPT_Wattributes, "%qE attribute ignored",
4947 name);
4948 *no_add_attrs = true;
4949 return NULL_TREE;
4950 }
4951
4952 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4953 if (is_attribute_p ("fastcall", name))
4954 {
4955 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956 {
4957 error ("fastcall and cdecl attributes are not compatible");
4958 }
4959 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960 {
4961 error ("fastcall and stdcall attributes are not compatible");
4962 }
4963 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964 {
4965 error ("fastcall and regparm attributes are not compatible");
4966 }
4967 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 {
4969 error ("fastcall and thiscall attributes are not compatible");
4970 }
4971 }
4972
4973 /* Can combine stdcall with fastcall (redundant), regparm and
4974 sseregparm. */
4975 else if (is_attribute_p ("stdcall", name))
4976 {
4977 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978 {
4979 error ("stdcall and cdecl attributes are not compatible");
4980 }
4981 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982 {
4983 error ("stdcall and fastcall attributes are not compatible");
4984 }
4985 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 {
4987 error ("stdcall and thiscall attributes are not compatible");
4988 }
4989 }
4990
4991 /* Can combine cdecl with regparm and sseregparm. */
4992 else if (is_attribute_p ("cdecl", name))
4993 {
4994 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995 {
4996 error ("stdcall and cdecl attributes are not compatible");
4997 }
4998 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999 {
5000 error ("fastcall and cdecl attributes are not compatible");
5001 }
5002 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 {
5004 error ("cdecl and thiscall attributes are not compatible");
5005 }
5006 }
5007 else if (is_attribute_p ("thiscall", name))
5008 {
5009 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011 name);
5012 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 {
5014 error ("stdcall and thiscall attributes are not compatible");
5015 }
5016 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 {
5018 error ("fastcall and thiscall attributes are not compatible");
5019 }
5020 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 {
5022 error ("cdecl and thiscall attributes are not compatible");
5023 }
5024 }
5025
5026 /* Can combine sseregparm with all attributes. */
5027
5028 return NULL_TREE;
5029 }
5030
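/* For illustration only (not compiler code): the checks above accept

       int __attribute__((regparm (2), sseregparm)) f (int, double);

   but diagnose conflicting conventions such as

       int __attribute__((fastcall, regparm (2))) g (int, int);

   with "fastcall and regparm attributes are not compatible", and on
   x86-64 targets they merely warn that these 32-bit attributes are
   ignored.  */
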
5031 /* The transactional memory builtins are implicitly regparm or fastcall
5032 depending on the ABI. Override the generic do-nothing attribute that
5033 these builtins were declared with, and replace it with one of the two
5034 attributes that we expect elsewhere. */
5035
5036 static tree
5037 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5038 tree args ATTRIBUTE_UNUSED,
5039 int flags ATTRIBUTE_UNUSED,
5040 bool *no_add_attrs)
5041 {
5042 tree alt;
5043
5044 /* In no case do we want to add the placeholder attribute. */
5045 *no_add_attrs = true;
5046
5047 /* The 64-bit ABI is unchanged for transactional memory. */
5048 if (TARGET_64BIT)
5049 return NULL_TREE;
5050
5051 /* ??? Is there a better way to validate 32-bit Windows? We have
5052 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5053 if (CHECK_STACK_LIMIT > 0)
5054 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5055 else
5056 {
5057 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5058 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5059 }
5060 decl_attributes (node, alt, flags);
5061
5062 return NULL_TREE;
5063 }
5064
5065 /* This function determines from TYPE the calling-convention. */
5066
5067 unsigned int
5068 ix86_get_callcvt (const_tree type)
5069 {
5070 unsigned int ret = 0;
5071 bool is_stdarg;
5072 tree attrs;
5073
5074 if (TARGET_64BIT)
5075 return IX86_CALLCVT_CDECL;
5076
5077 attrs = TYPE_ATTRIBUTES (type);
5078 if (attrs != NULL_TREE)
5079 {
5080 if (lookup_attribute ("cdecl", attrs))
5081 ret |= IX86_CALLCVT_CDECL;
5082 else if (lookup_attribute ("stdcall", attrs))
5083 ret |= IX86_CALLCVT_STDCALL;
5084 else if (lookup_attribute ("fastcall", attrs))
5085 ret |= IX86_CALLCVT_FASTCALL;
5086 else if (lookup_attribute ("thiscall", attrs))
5087 ret |= IX86_CALLCVT_THISCALL;
5088
5089 /* Regparm isn't allowed for thiscall and fastcall. */
5090 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5091 {
5092 if (lookup_attribute ("regparm", attrs))
5093 ret |= IX86_CALLCVT_REGPARM;
5094 if (lookup_attribute ("sseregparm", attrs))
5095 ret |= IX86_CALLCVT_SSEREGPARM;
5096 }
5097
5098 if (IX86_BASE_CALLCVT(ret) != 0)
5099 return ret;
5100 }
5101
5102 is_stdarg = stdarg_p (type);
5103 if (TARGET_RTD && !is_stdarg)
5104 return IX86_CALLCVT_STDCALL | ret;
5105
5106 if (ret != 0
5107 || is_stdarg
5108 || TREE_CODE (type) != METHOD_TYPE
5109 || ix86_function_type_abi (type) != MS_ABI)
5110 return IX86_CALLCVT_CDECL | ret;
5111
5112 return IX86_CALLCVT_THISCALL;
5113 }
5114
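/* A sketch of the bits computed above for a few 32-bit declarations
   (illustrative; assumes neither -mrtd nor conflicting attributes):

       void f (int);                               -- IX86_CALLCVT_CDECL
       void __attribute__((stdcall)) g (int);      -- IX86_CALLCVT_STDCALL
       void __attribute__((fastcall)) h (int);     -- IX86_CALLCVT_FASTCALL
       void __attribute__((regparm (3))) k (int);  -- IX86_CALLCVT_CDECL
                                                      | IX86_CALLCVT_REGPARM

   With -mrtd, a prototyped non-variadic function gets
   IX86_CALLCVT_STDCALL instead of the cdecl default.  */
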
5115 /* Return 0 if the attributes for two types are incompatible, 1 if they
5116 are compatible, and 2 if they are nearly compatible (which causes a
5117 warning to be generated). */
5118
5119 static int
5120 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5121 {
5122 unsigned int ccvt1, ccvt2;
5123
5124 if (TREE_CODE (type1) != FUNCTION_TYPE
5125 && TREE_CODE (type1) != METHOD_TYPE)
5126 return 1;
5127
5128 ccvt1 = ix86_get_callcvt (type1);
5129 ccvt2 = ix86_get_callcvt (type2);
5130 if (ccvt1 != ccvt2)
5131 return 0;
5132 if (ix86_function_regparm (type1, NULL)
5133 != ix86_function_regparm (type2, NULL))
5134 return 0;
5135
5136 return 1;
5137 }
5138 \f
5139 /* Return the regparm value for a function with the indicated TYPE and DECL.
5140 DECL may be NULL when the function is called indirectly
5141 or considering a libcall. */
5142
5143 static int
5144 ix86_function_regparm (const_tree type, const_tree decl)
5145 {
5146 tree attr;
5147 int regparm;
5148 unsigned int ccvt;
5149
5150 if (TARGET_64BIT)
5151 return (ix86_function_type_abi (type) == SYSV_ABI
5152 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5153 ccvt = ix86_get_callcvt (type);
5154 regparm = ix86_regparm;
5155
5156 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5157 {
5158 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5159 if (attr)
5160 {
5161 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5162 return regparm;
5163 }
5164 }
5165 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5166 return 2;
5167 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5168 return 1;
5169
5170 /* Use register calling convention for local functions when possible. */
5171 if (decl
5172 && TREE_CODE (decl) == FUNCTION_DECL
5173 && optimize
5174 && !(profile_flag && !flag_fentry))
5175 {
5176 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5177 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5178 if (i && i->local && i->can_change_signature)
5179 {
5180 int local_regparm, globals = 0, regno;
5181
5182 /* Make sure no regparm register is taken by a
5183 fixed register variable. */
5184 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5185 if (fixed_regs[local_regparm])
5186 break;
5187
5188 /* We don't want to use regparm(3) for nested functions as
5189 these use a static chain pointer in the third argument. */
5190 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5191 local_regparm = 2;
5192
5193 /* In 32-bit mode save a register for the split stack. */
5194 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5195 local_regparm = 2;
5196
5197 /* Each fixed register usage increases register pressure,
5198 so fewer registers should be used for argument passing.
5199 This functionality can be overridden by an explicit
5200 regparm value. */
5201 for (regno = 0; regno <= DI_REG; regno++)
5202 if (fixed_regs[regno])
5203 globals++;
5204
5205 local_regparm
5206 = globals < local_regparm ? local_regparm - globals : 0;
5207
5208 if (local_regparm > regparm)
5209 regparm = local_regparm;
5210 }
5211 }
5212
5213 return regparm;
5214 }
5215
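/* Illustrative sketch, not compiler code, of what the function above
   computes in 32-bit mode:

       int __attribute__((regparm (3))) f (int a, int b, int c);
           -- regparm == 3: A, B and C arrive in %eax, %edx and %ecx
       int __attribute__((fastcall)) g (int a, int b);
           -- regparm == 2: A and B arrive in %ecx and %edx

   For an optimized local (non-exported) function the cgraph path above
   may raise regparm on its own, subject to fixed registers, nested
   functions and -fsplit-stack.  */
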
5216 /* Return 1 or 2 if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5217 DFmode (2) arguments in SSE registers for a function with the
5218 indicated TYPE and DECL. DECL may be NULL when calling function
5219 indirectly or considering a libcall. Otherwise return 0. */
5220
5221 static int
5222 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5223 {
5224 gcc_assert (!TARGET_64BIT);
5225
5226 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5227 by the sseregparm attribute. */
5228 if (TARGET_SSEREGPARM
5229 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5230 {
5231 if (!TARGET_SSE)
5232 {
5233 if (warn)
5234 {
5235 if (decl)
5236 error ("calling %qD with attribute sseregparm without "
5237 "SSE/SSE2 enabled", decl);
5238 else
5239 error ("calling %qT with attribute sseregparm without "
5240 "SSE/SSE2 enabled", type);
5241 }
5242 return 0;
5243 }
5244
5245 return 2;
5246 }
5247
5248 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5249 (and DFmode for SSE2) arguments in SSE registers. */
5250 if (decl && TARGET_SSE_MATH && optimize
5251 && !(profile_flag && !flag_fentry))
5252 {
5253 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5254 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5255 if (i && i->local && i->can_change_signature)
5256 return TARGET_SSE2 ? 2 : 1;
5257 }
5258
5259 return 0;
5260 }
5261
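/* Illustration only, not compiler code: in 32-bit mode

       double __attribute__((sseregparm)) f (double x, double y);

   makes X and Y arrive in %xmm0 and %xmm1 provided SSE is enabled;
   without SSE the error above is issued instead.  The same promotion
   happens silently for optimized local functions when SSE math is in
   use (-mfpmath=sse).  */
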
5262 /* Return true if EAX is live at the start of the function. Used by
5263 ix86_expand_prologue to determine if we need special help before
5264 calling allocate_stack_worker. */
5265
5266 static bool
5267 ix86_eax_live_at_start_p (void)
5268 {
5269 /* Cheat. Don't bother working forward from ix86_function_regparm
5270 to the function type to whether an actual argument is located in
5271 eax. Instead just look at cfg info, which is still close enough
5272 to correct at this point. This gives false positives for broken
5273 functions that might use uninitialized data that happens to be
5274 allocated in eax, but who cares? */
5275 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5276 }
5277
5278 static bool
5279 ix86_keep_aggregate_return_pointer (tree fntype)
5280 {
5281 tree attr;
5282
5283 if (!TARGET_64BIT)
5284 {
5285 attr = lookup_attribute ("callee_pop_aggregate_return",
5286 TYPE_ATTRIBUTES (fntype));
5287 if (attr)
5288 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5289
5290 /* For 32-bit MS-ABI the default is to keep aggregate
5291 return pointer. */
5292 if (ix86_function_type_abi (fntype) == MS_ABI)
5293 return true;
5294 }
5295 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5296 }
5297
5298 /* Value is the number of bytes of arguments automatically
5299 popped when returning from a subroutine call.
5300 FUNDECL is the declaration node of the function (as a tree),
5301 FUNTYPE is the data type of the function (as a tree),
5302 or for a library call it is an identifier node for the subroutine name.
5303 SIZE is the number of bytes of arguments passed on the stack.
5304
5305 On the 80386, the RTD insn may be used to pop them if the number
5306 of args is fixed, but if the number is variable then the caller
5307 must pop them all. RTD can't be used for library calls now
5308 because the library is compiled with the Unix compiler.
5309 Use of RTD is a selectable option, since it is incompatible with
5310 standard Unix calling sequences. If the option is not selected,
5311 the caller must always pop the args.
5312
5313 The attribute stdcall is equivalent to RTD on a per module basis. */
5314
5315 static int
5316 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5317 {
5318 unsigned int ccvt;
5319
5320 /* None of the 64-bit ABIs pop arguments. */
5321 if (TARGET_64BIT)
5322 return 0;
5323
5324 ccvt = ix86_get_callcvt (funtype);
5325
5326 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5327 | IX86_CALLCVT_THISCALL)) != 0
5328 && ! stdarg_p (funtype))
5329 return size;
5330
5331 /* Lose any fake structure return argument if it is passed on the stack. */
5332 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5333 && !ix86_keep_aggregate_return_pointer (funtype))
5334 {
5335 int nregs = ix86_function_regparm (funtype, fundecl);
5336 if (nregs == 0)
5337 return GET_MODE_SIZE (Pmode);
5338 }
5339
5340 return 0;
5341 }
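
/* For illustration (not part of the compiler): given

       int __attribute__((stdcall)) f (int a, int b);

   the callee returns with "ret $8" and this hook reports 8 to the
   caller, whereas a plain cdecl function returns with "ret" and the
   caller adjusts %esp itself.  The aggregate-return case above covers
   the hidden result pointer pushed for "struct S f (void);" when no
   register slot is available for it.  */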
5342 \f
5343 /* Argument support functions. */
5344
5345 /* Return true when REGNO may be used to pass function parameters. */
5346 bool
5347 ix86_function_arg_regno_p (int regno)
5348 {
5349 int i;
5350 const int *parm_regs;
5351
5352 if (!TARGET_64BIT)
5353 {
5354 if (TARGET_MACHO)
5355 return (regno < REGPARM_MAX
5356 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5357 else
5358 return (regno < REGPARM_MAX
5359 || (TARGET_MMX && MMX_REGNO_P (regno)
5360 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5361 || (TARGET_SSE && SSE_REGNO_P (regno)
5362 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5363 }
5364
5365 if (TARGET_MACHO)
5366 {
5367 if (SSE_REGNO_P (regno) && TARGET_SSE)
5368 return true;
5369 }
5370 else
5371 {
5372 if (TARGET_SSE && SSE_REGNO_P (regno)
5373 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5374 return true;
5375 }
5376
5377 /* TODO: The function should depend on current function ABI but
5378 builtins.c would need updating then. Therefore we use the
5379 default ABI. */
5380
5381 /* RAX is used as hidden argument to va_arg functions. */
5382 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5383 return true;
5384
5385 if (ix86_abi == MS_ABI)
5386 parm_regs = x86_64_ms_abi_int_parameter_registers;
5387 else
5388 parm_regs = x86_64_int_parameter_registers;
5389 for (i = 0; i < (ix86_abi == MS_ABI
5390 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5391 if (regno == parm_regs[i])
5392 return true;
5393 return false;
5394 }
5395
5396 /* Return true if we do not know how to pass TYPE solely in registers. */
5397
5398 static bool
5399 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5400 {
5401 if (must_pass_in_stack_var_size_or_pad (mode, type))
5402 return true;
5403
5404 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5405 The layout_type routine is crafty and tries to trick us into passing
5406 currently unsupported vector types on the stack by using TImode. */
5407 return (!TARGET_64BIT && mode == TImode
5408 && type && TREE_CODE (type) != VECTOR_TYPE);
5409 }
5410
5411 /* Return the size, in bytes, of the area reserved for arguments passed
5412 in registers for the function represented by FNDECL, depending on the
5413 ABI in use. */
5414 int
5415 ix86_reg_parm_stack_space (const_tree fndecl)
5416 {
5417 enum calling_abi call_abi = SYSV_ABI;
5418 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5419 call_abi = ix86_function_abi (fndecl);
5420 else
5421 call_abi = ix86_function_type_abi (fndecl);
5422 if (TARGET_64BIT && call_abi == MS_ABI)
5423 return 32;
5424 return 0;
5425 }
5426
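/* Sketch of the effect (illustrative): for a 64-bit MS-ABI callee such as

       __attribute__((ms_abi)) void f (int a, int b);

   the caller reserves 32 bytes of "home" (shadow) stack space below the
   return address even though A and B travel in registers; SYSV-ABI
   callees get no such area, hence the 0 above.  */
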
5427 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5428 call ABI used. */
5429 enum calling_abi
5430 ix86_function_type_abi (const_tree fntype)
5431 {
5432 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5433 {
5434 enum calling_abi abi = ix86_abi;
5435 if (abi == SYSV_ABI)
5436 {
5437 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5438 abi = MS_ABI;
5439 }
5440 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5441 abi = SYSV_ABI;
5442 return abi;
5443 }
5444 return ix86_abi;
5445 }
5446
5447 static bool
5448 ix86_function_ms_hook_prologue (const_tree fn)
5449 {
5450 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5451 {
5452 if (decl_function_context (fn) != NULL_TREE)
5453 error_at (DECL_SOURCE_LOCATION (fn),
5454 "ms_hook_prologue is not compatible with nested function");
5455 else
5456 return true;
5457 }
5458 return false;
5459 }
5460
5461 static enum calling_abi
5462 ix86_function_abi (const_tree fndecl)
5463 {
5464 if (! fndecl)
5465 return ix86_abi;
5466 return ix86_function_type_abi (TREE_TYPE (fndecl));
5467 }
5468
5469 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5470 call ABI used. */
5471 enum calling_abi
5472 ix86_cfun_abi (void)
5473 {
5474 if (! cfun)
5475 return ix86_abi;
5476 return cfun->machine->call_abi;
5477 }
5478
5479 /* Write the extra assembler code needed to declare a function properly. */
5480
5481 void
5482 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5483 tree decl)
5484 {
5485 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5486
5487 if (is_ms_hook)
5488 {
5489 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5490 unsigned int filler_cc = 0xcccccccc;
5491
5492 for (i = 0; i < filler_count; i += 4)
5493 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5494 }
5495
5496 #ifdef SUBTARGET_ASM_UNWIND_INIT
5497 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5498 #endif
5499
5500 ASM_OUTPUT_LABEL (asm_out_file, fname);
5501
5502 /* Output magic byte marker, if hot-patch attribute is set. */
5503 if (is_ms_hook)
5504 {
5505 if (TARGET_64BIT)
5506 {
5507 /* leaq [%rsp + 0], %rsp */
5508 asm_fprintf (asm_out_file, ASM_BYTE
5509 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5510 }
5511 else
5512 {
5513 /* movl.s %edi, %edi
5514 push %ebp
5515 movl.s %esp, %ebp */
5516 asm_fprintf (asm_out_file, ASM_BYTE
5517 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5518 }
5519 }
5520 }
5521
5522 /* regclass.c */
5523 extern void init_regs (void);
5524
5525 /* Implementation of the call ABI switching target hook. The call-used
5526 register set specific to FNDECL is selected. See also
5527 ix86_conditional_register_usage for more details. */
5528 void
5529 ix86_call_abi_override (const_tree fndecl)
5530 {
5531 if (fndecl == NULL_TREE)
5532 cfun->machine->call_abi = ix86_abi;
5533 else
5534 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5535 }
5536
5537 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5538 expensive re-initialization of init_regs each time we switch function context
5539 since this is needed only during RTL expansion. */
5540 static void
5541 ix86_maybe_switch_abi (void)
5542 {
5543 if (TARGET_64BIT &&
5544 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5545 reinit_regs ();
5546 }
5547
5548 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5549 for a call to a function whose data type is FNTYPE.
5550 For a library call, FNTYPE is 0. */
5551
5552 void
5553 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5554 tree fntype, /* tree ptr for function type */
5555 rtx libname, /* SYMBOL_REF of library name or 0 */
5556 tree fndecl,
5557 int caller)
5558 {
5559 struct cgraph_local_info *i;
5560 tree fnret_type;
5561
5562 memset (cum, 0, sizeof (*cum));
5563
5564 /* Initialize for the current callee. */
5565 if (caller)
5566 {
5567 cfun->machine->callee_pass_avx256_p = false;
5568 cfun->machine->callee_return_avx256_p = false;
5569 }
5570
5571 if (fndecl)
5572 {
5573 i = cgraph_local_info (fndecl);
5574 cum->call_abi = ix86_function_abi (fndecl);
5575 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5576 }
5577 else
5578 {
5579 i = NULL;
5580 cum->call_abi = ix86_function_type_abi (fntype);
5581 if (fntype)
5582 fnret_type = TREE_TYPE (fntype);
5583 else
5584 fnret_type = NULL;
5585 }
5586
5587 if (TARGET_VZEROUPPER && fnret_type)
5588 {
5589 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5590 false);
5591 if (function_pass_avx256_p (fnret_value))
5592 {
5593 /* The return value of this function uses 256bit AVX modes. */
5594 if (caller)
5595 cfun->machine->callee_return_avx256_p = true;
5596 else
5597 cfun->machine->caller_return_avx256_p = true;
5598 }
5599 }
5600
5601 cum->caller = caller;
5602
5603 /* Set up the number of registers to use for passing arguments. */
5604
5605 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5606 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5607 "or subtarget optimization implying it");
5608 cum->nregs = ix86_regparm;
5609 if (TARGET_64BIT)
5610 {
5611 cum->nregs = (cum->call_abi == SYSV_ABI
5612 ? X86_64_REGPARM_MAX
5613 : X86_64_MS_REGPARM_MAX);
5614 }
5615 if (TARGET_SSE)
5616 {
5617 cum->sse_nregs = SSE_REGPARM_MAX;
5618 if (TARGET_64BIT)
5619 {
5620 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5621 ? X86_64_SSE_REGPARM_MAX
5622 : X86_64_MS_SSE_REGPARM_MAX);
5623 }
5624 }
5625 if (TARGET_MMX)
5626 cum->mmx_nregs = MMX_REGPARM_MAX;
5627 cum->warn_avx = true;
5628 cum->warn_sse = true;
5629 cum->warn_mmx = true;
5630
5631 /* Because the type might mismatch between caller and callee, we need to
5632 use the actual type of the function for local calls.
5633 FIXME: cgraph_analyze can be told to actually record if function uses
5634 va_start so for local functions maybe_vaarg can be made aggressive
5635 helping K&R code.
5636 FIXME: once the type system is fixed, we won't need this code anymore. */
5637 if (i && i->local && i->can_change_signature)
5638 fntype = TREE_TYPE (fndecl);
5639 cum->maybe_vaarg = (fntype
5640 ? (!prototype_p (fntype) || stdarg_p (fntype))
5641 : !libname);
5642
5643 if (!TARGET_64BIT)
5644 {
5645 /* If there are variable arguments, then we won't pass anything
5646 in registers in 32-bit mode. */
5647 if (stdarg_p (fntype))
5648 {
5649 cum->nregs = 0;
5650 cum->sse_nregs = 0;
5651 cum->mmx_nregs = 0;
5652 cum->warn_avx = 0;
5653 cum->warn_sse = 0;
5654 cum->warn_mmx = 0;
5655 return;
5656 }
5657
5658 /* Use ecx and edx registers if function has fastcall attribute,
5659 else look for regparm information. */
5660 if (fntype)
5661 {
5662 unsigned int ccvt = ix86_get_callcvt (fntype);
5663 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5664 {
5665 cum->nregs = 1;
5666 cum->fastcall = 1; /* Same first register as in fastcall. */
5667 }
5668 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5669 {
5670 cum->nregs = 2;
5671 cum->fastcall = 1;
5672 }
5673 else
5674 cum->nregs = ix86_function_regparm (fntype, fndecl);
5675 }
5676
5677 /* Set up the number of SSE registers used for passing SFmode
5678 and DFmode arguments. Warn for mismatching ABI. */
5679 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5680 }
5681 }
5682
5683 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5684 But in the case of vector types, it is some vector mode.
5685
5686 When we have only some of our vector isa extensions enabled, then there
5687 are some modes for which vector_mode_supported_p is false. For these
5688 modes, the generic vector support in gcc will choose some non-vector mode
5689 in order to implement the type. By computing the natural mode, we'll
5690 select the proper ABI location for the operand and not depend on whatever
5691 the middle-end decides to do with these vector types.
5692
5693 The middle-end can't deal with vector types > 16 bytes. In this
5694 case, we return the original mode and warn about the ABI change if
5695 CUM isn't NULL. */
5696
5697 static enum machine_mode
5698 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5699 {
5700 enum machine_mode mode = TYPE_MODE (type);
5701
5702 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5703 {
5704 HOST_WIDE_INT size = int_size_in_bytes (type);
5705 if ((size == 8 || size == 16 || size == 32)
5706 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5707 && TYPE_VECTOR_SUBPARTS (type) > 1)
5708 {
5709 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5710
5711 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5712 mode = MIN_MODE_VECTOR_FLOAT;
5713 else
5714 mode = MIN_MODE_VECTOR_INT;
5715
5716 /* Get the mode which has this inner mode and number of units. */
5717 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5718 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5719 && GET_MODE_INNER (mode) == innermode)
5720 {
5721 if (size == 32 && !TARGET_AVX)
5722 {
5723 static bool warnedavx;
5724
5725 if (cum
5726 && !warnedavx
5727 && cum->warn_avx)
5728 {
5729 warnedavx = true;
5730 warning (0, "AVX vector argument without AVX "
5731 "enabled changes the ABI");
5732 }
5733 return TYPE_MODE (type);
5734 }
5735 else
5736 return mode;
5737 }
5738
5739 gcc_unreachable ();
5740 }
5741 }
5742
5743 return mode;
5744 }
5745
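/* Two illustrative cases for the function above (not compiler code):

       typedef int v4si __attribute__((vector_size (16)));
       typedef int v8si __attribute__((vector_size (32)));

   Without -msse the middle-end may lower V4SI to a non-vector mode, yet
   the routine still reports V4SImode so the ABI slot of such an
   argument does not depend on the ISA flags; without -mavx the 32-byte
   V8SI case instead keeps TYPE_MODE and triggers the one-time "changes
   the ABI" warning above.  */
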
5746 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5747 this may not agree with the mode that the type system has chosen for the
5748 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5749 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5750
5751 static rtx
5752 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5753 unsigned int regno)
5754 {
5755 rtx tmp;
5756
5757 if (orig_mode != BLKmode)
5758 tmp = gen_rtx_REG (orig_mode, regno);
5759 else
5760 {
5761 tmp = gen_rtx_REG (mode, regno);
5762 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5763 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5764 }
5765
5766 return tmp;
5767 }
5768
5769 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5770 The goal of this code is to classify each 8 bytes of an incoming argument
5771 by register class and assign registers accordingly. */
5772
5773 /* Return the union class of CLASS1 and CLASS2.
5774 See the x86-64 PS ABI for details. */
5775
5776 static enum x86_64_reg_class
5777 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5778 {
5779 /* Rule #1: If both classes are equal, this is the resulting class. */
5780 if (class1 == class2)
5781 return class1;
5782
5783 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5784 the other class. */
5785 if (class1 == X86_64_NO_CLASS)
5786 return class2;
5787 if (class2 == X86_64_NO_CLASS)
5788 return class1;
5789
5790 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5791 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5792 return X86_64_MEMORY_CLASS;
5793
5794 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5795 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5796 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5797 return X86_64_INTEGERSI_CLASS;
5798 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5799 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5800 return X86_64_INTEGER_CLASS;
5801
5802 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5803 MEMORY is used. */
5804 if (class1 == X86_64_X87_CLASS
5805 || class1 == X86_64_X87UP_CLASS
5806 || class1 == X86_64_COMPLEX_X87_CLASS
5807 || class2 == X86_64_X87_CLASS
5808 || class2 == X86_64_X87UP_CLASS
5809 || class2 == X86_64_COMPLEX_X87_CLASS)
5810 return X86_64_MEMORY_CLASS;
5811
5812 /* Rule #6: Otherwise class SSE is used. */
5813 return X86_64_SSE_CLASS;
5814 }
5815
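/* Worked example (illustrative): for

       struct s1 { int i; float f; };    -- one eightbyte: INTEGER + SSE
       struct s2 { float a, b; };        -- one eightbyte: SSE + SSE

   the per-field classes of S1 merge to an integer class by rule #4, so
   the whole object is passed in a general-purpose register, while S2
   stays in the SSE class and is passed in an XMM register.  */
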
5816 /* Classify the argument of type TYPE and mode MODE.
5817 CLASSES will be filled by the register class used to pass each word
5818 of the operand. The number of words is returned. In case the parameter
5819 should be passed in memory, 0 is returned. As a special case for zero
5820 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5821
5822 BIT_OFFSET is used internally for handling records and specifies the
5823 offset in bits modulo 256 to avoid overflow cases.
5824
5825 See the x86-64 PS ABI for details.
5826 */
5827
5828 static int
5829 classify_argument (enum machine_mode mode, const_tree type,
5830 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5831 {
5832 HOST_WIDE_INT bytes =
5833 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5834 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5835
5836 /* Variable sized entities are always passed/returned in memory. */
5837 if (bytes < 0)
5838 return 0;
5839
5840 if (mode != VOIDmode
5841 && targetm.calls.must_pass_in_stack (mode, type))
5842 return 0;
5843
5844 if (type && AGGREGATE_TYPE_P (type))
5845 {
5846 int i;
5847 tree field;
5848 enum x86_64_reg_class subclasses[MAX_CLASSES];
5849
5850 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5851 if (bytes > 32)
5852 return 0;
5853
5854 for (i = 0; i < words; i++)
5855 classes[i] = X86_64_NO_CLASS;
5856
5857 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5858 signal the memory class, so handle this as a special case. */
5859 if (!words)
5860 {
5861 classes[0] = X86_64_NO_CLASS;
5862 return 1;
5863 }
5864
5865 /* Classify each field of record and merge classes. */
5866 switch (TREE_CODE (type))
5867 {
5868 case RECORD_TYPE:
5869 /* And now merge the fields of structure. */
5870 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5871 {
5872 if (TREE_CODE (field) == FIELD_DECL)
5873 {
5874 int num;
5875
5876 if (TREE_TYPE (field) == error_mark_node)
5877 continue;
5878
5879 /* Bitfields are always classified as integer. Handle them
5880 early, since later code would consider them to be
5881 misaligned integers. */
5882 if (DECL_BIT_FIELD (field))
5883 {
5884 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5885 i < ((int_bit_position (field) + (bit_offset % 64))
5886 + tree_low_cst (DECL_SIZE (field), 0)
5887 + 63) / 8 / 8; i++)
5888 classes[i] =
5889 merge_classes (X86_64_INTEGER_CLASS,
5890 classes[i]);
5891 }
5892 else
5893 {
5894 int pos;
5895
5896 type = TREE_TYPE (field);
5897
5898 /* Flexible array member is ignored. */
5899 if (TYPE_MODE (type) == BLKmode
5900 && TREE_CODE (type) == ARRAY_TYPE
5901 && TYPE_SIZE (type) == NULL_TREE
5902 && TYPE_DOMAIN (type) != NULL_TREE
5903 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5904 == NULL_TREE))
5905 {
5906 static bool warned;
5907
5908 if (!warned && warn_psabi)
5909 {
5910 warned = true;
5911 inform (input_location,
5912 "the ABI of passing struct with"
5913 " a flexible array member has"
5914 " changed in GCC 4.4");
5915 }
5916 continue;
5917 }
5918 num = classify_argument (TYPE_MODE (type), type,
5919 subclasses,
5920 (int_bit_position (field)
5921 + bit_offset) % 256);
5922 if (!num)
5923 return 0;
5924 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5925 for (i = 0; i < num && (i + pos) < words; i++)
5926 classes[i + pos] =
5927 merge_classes (subclasses[i], classes[i + pos]);
5928 }
5929 }
5930 }
5931 break;
5932
5933 case ARRAY_TYPE:
5934 /* Arrays are handled as small records. */
5935 {
5936 int num;
5937 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5938 TREE_TYPE (type), subclasses, bit_offset);
5939 if (!num)
5940 return 0;
5941
5942 /* The partial classes are now full classes. */
5943 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5944 subclasses[0] = X86_64_SSE_CLASS;
5945 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5946 && !((bit_offset % 64) == 0 && bytes == 4))
5947 subclasses[0] = X86_64_INTEGER_CLASS;
5948
5949 for (i = 0; i < words; i++)
5950 classes[i] = subclasses[i % num];
5951
5952 break;
5953 }
5954 case UNION_TYPE:
5955 case QUAL_UNION_TYPE:
5956 /* Unions are similar to RECORD_TYPE but offset is always 0.
5957 */
5958 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5959 {
5960 if (TREE_CODE (field) == FIELD_DECL)
5961 {
5962 int num;
5963
5964 if (TREE_TYPE (field) == error_mark_node)
5965 continue;
5966
5967 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5968 TREE_TYPE (field), subclasses,
5969 bit_offset);
5970 if (!num)
5971 return 0;
5972 for (i = 0; i < num; i++)
5973 classes[i] = merge_classes (subclasses[i], classes[i]);
5974 }
5975 }
5976 break;
5977
5978 default:
5979 gcc_unreachable ();
5980 }
5981
5982 if (words > 2)
5983 {
5984 /* When the size is > 16 bytes, if the first class isn't
5985 X86_64_SSE_CLASS or any of the others isn't
5986 X86_64_SSEUP_CLASS, everything should be passed in
5987 memory. */
5988 if (classes[0] != X86_64_SSE_CLASS)
5989 return 0;
5990
5991 for (i = 1; i < words; i++)
5992 if (classes[i] != X86_64_SSEUP_CLASS)
5993 return 0;
5994 }
5995
5996 /* Final merger cleanup. */
5997 for (i = 0; i < words; i++)
5998 {
5999 /* If one class is MEMORY, everything should be passed in
6000 memory. */
6001 if (classes[i] == X86_64_MEMORY_CLASS)
6002 return 0;
6003
6004 /* The X86_64_SSEUP_CLASS should always be preceded by
6005 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6006 if (classes[i] == X86_64_SSEUP_CLASS
6007 && classes[i - 1] != X86_64_SSE_CLASS
6008 && classes[i - 1] != X86_64_SSEUP_CLASS)
6009 {
6010 /* The first one should never be X86_64_SSEUP_CLASS. */
6011 gcc_assert (i != 0);
6012 classes[i] = X86_64_SSE_CLASS;
6013 }
6014
6015 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6016 everything should be passed in memory. */
6017 if (classes[i] == X86_64_X87UP_CLASS
6018 && (classes[i - 1] != X86_64_X87_CLASS))
6019 {
6020 static bool warned;
6021
6022 /* The first one should never be X86_64_X87UP_CLASS. */
6023 gcc_assert (i != 0);
6024 if (!warned && warn_psabi)
6025 {
6026 warned = true;
6027 inform (input_location,
6028 "the ABI of passing union with long double"
6029 " has changed in GCC 4.4");
6030 }
6031 return 0;
6032 }
6033 }
6034 return words;
6035 }
6036
6037 /* Compute the alignment needed. We align all types to their natural
6038 boundaries, with the exception of XFmode, which is aligned to 128 bits. */
6039 if (mode != VOIDmode && mode != BLKmode)
6040 {
6041 int mode_alignment = GET_MODE_BITSIZE (mode);
6042
6043 if (mode == XFmode)
6044 mode_alignment = 128;
6045 else if (mode == XCmode)
6046 mode_alignment = 256;
6047 if (COMPLEX_MODE_P (mode))
6048 mode_alignment /= 2;
6049 /* Misaligned fields are always returned in memory. */
6050 if (bit_offset % mode_alignment)
6051 return 0;
6052 }
6053
6054 /* For V1xx modes, just use the base mode. */
6055 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6056 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6057 mode = GET_MODE_INNER (mode);
6058
6059 /* Classification of atomic types. */
6060 switch (mode)
6061 {
6062 case SDmode:
6063 case DDmode:
6064 classes[0] = X86_64_SSE_CLASS;
6065 return 1;
6066 case TDmode:
6067 classes[0] = X86_64_SSE_CLASS;
6068 classes[1] = X86_64_SSEUP_CLASS;
6069 return 2;
6070 case DImode:
6071 case SImode:
6072 case HImode:
6073 case QImode:
6074 case CSImode:
6075 case CHImode:
6076 case CQImode:
6077 {
6078 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6079
6080 if (size <= 32)
6081 {
6082 classes[0] = X86_64_INTEGERSI_CLASS;
6083 return 1;
6084 }
6085 else if (size <= 64)
6086 {
6087 classes[0] = X86_64_INTEGER_CLASS;
6088 return 1;
6089 }
6090 else if (size <= 64+32)
6091 {
6092 classes[0] = X86_64_INTEGER_CLASS;
6093 classes[1] = X86_64_INTEGERSI_CLASS;
6094 return 2;
6095 }
6096 else if (size <= 64+64)
6097 {
6098 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6099 return 2;
6100 }
6101 else
6102 gcc_unreachable ();
6103 }
6104 case CDImode:
6105 case TImode:
6106 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6107 return 2;
6108 case COImode:
6109 case OImode:
6110 /* OImode shouldn't be used directly. */
6111 gcc_unreachable ();
6112 case CTImode:
6113 return 0;
6114 case SFmode:
6115 if (!(bit_offset % 64))
6116 classes[0] = X86_64_SSESF_CLASS;
6117 else
6118 classes[0] = X86_64_SSE_CLASS;
6119 return 1;
6120 case DFmode:
6121 classes[0] = X86_64_SSEDF_CLASS;
6122 return 1;
6123 case XFmode:
6124 classes[0] = X86_64_X87_CLASS;
6125 classes[1] = X86_64_X87UP_CLASS;
6126 return 2;
6127 case TFmode:
6128 classes[0] = X86_64_SSE_CLASS;
6129 classes[1] = X86_64_SSEUP_CLASS;
6130 return 2;
6131 case SCmode:
6132 classes[0] = X86_64_SSE_CLASS;
6133 if (!(bit_offset % 64))
6134 return 1;
6135 else
6136 {
6137 static bool warned;
6138
6139 if (!warned && warn_psabi)
6140 {
6141 warned = true;
6142 inform (input_location,
6143 "the ABI of passing structure with complex float"
6144 " member has changed in GCC 4.4");
6145 }
6146 classes[1] = X86_64_SSESF_CLASS;
6147 return 2;
6148 }
6149 case DCmode:
6150 classes[0] = X86_64_SSEDF_CLASS;
6151 classes[1] = X86_64_SSEDF_CLASS;
6152 return 2;
6153 case XCmode:
6154 classes[0] = X86_64_COMPLEX_X87_CLASS;
6155 return 1;
6156 case TCmode:
6157 /* This mode is larger than 16 bytes. */
6158 return 0;
6159 case V8SFmode:
6160 case V8SImode:
6161 case V32QImode:
6162 case V16HImode:
6163 case V4DFmode:
6164 case V4DImode:
6165 classes[0] = X86_64_SSE_CLASS;
6166 classes[1] = X86_64_SSEUP_CLASS;
6167 classes[2] = X86_64_SSEUP_CLASS;
6168 classes[3] = X86_64_SSEUP_CLASS;
6169 return 4;
6170 case V4SFmode:
6171 case V4SImode:
6172 case V16QImode:
6173 case V8HImode:
6174 case V2DFmode:
6175 case V2DImode:
6176 classes[0] = X86_64_SSE_CLASS;
6177 classes[1] = X86_64_SSEUP_CLASS;
6178 return 2;
6179 case V1TImode:
6180 case V1DImode:
6181 case V2SFmode:
6182 case V2SImode:
6183 case V4HImode:
6184 case V8QImode:
6185 classes[0] = X86_64_SSE_CLASS;
6186 return 1;
6187 case BLKmode:
6188 case VOIDmode:
6189 return 0;
6190 default:
6191 gcc_assert (VECTOR_MODE_P (mode));
6192
6193 if (bytes > 16)
6194 return 0;
6195
6196 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6197
6198 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6199 classes[0] = X86_64_INTEGERSI_CLASS;
6200 else
6201 classes[0] = X86_64_INTEGER_CLASS;
6202 classes[1] = X86_64_INTEGER_CLASS;
6203 return 1 + (bytes > 8);
6204 }
6205 }
6206
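/* Illustrative classifications produced by the routine above for a few
   SYSV x86-64 arguments (assuming default options):

       long                          -> 1 word : INTEGER
       double                        -> 1 word : SSEDF
       struct { long a; double d; }  -> 2 words: INTEGER, SSEDF
       long double                   -> 2 words: X87, X87UP
       struct { char c[40]; }        -> 0      : passed in memory

   The 0 return is what makes callers such as examine_argument and
   construct_container fall back to the stack.  */
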
6207 /* Examine the argument and set the number of registers required in each
6208 class. Return 0 iff the parameter should be passed in memory. */
6209 static int
6210 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6211 int *int_nregs, int *sse_nregs)
6212 {
6213 enum x86_64_reg_class regclass[MAX_CLASSES];
6214 int n = classify_argument (mode, type, regclass, 0);
6215
6216 *int_nregs = 0;
6217 *sse_nregs = 0;
6218 if (!n)
6219 return 0;
6220 for (n--; n >= 0; n--)
6221 switch (regclass[n])
6222 {
6223 case X86_64_INTEGER_CLASS:
6224 case X86_64_INTEGERSI_CLASS:
6225 (*int_nregs)++;
6226 break;
6227 case X86_64_SSE_CLASS:
6228 case X86_64_SSESF_CLASS:
6229 case X86_64_SSEDF_CLASS:
6230 (*sse_nregs)++;
6231 break;
6232 case X86_64_NO_CLASS:
6233 case X86_64_SSEUP_CLASS:
6234 break;
6235 case X86_64_X87_CLASS:
6236 case X86_64_X87UP_CLASS:
6237 if (!in_return)
6238 return 0;
6239 break;
6240 case X86_64_COMPLEX_X87_CLASS:
6241 return in_return ? 2 : 0;
6242 case X86_64_MEMORY_CLASS:
6243 gcc_unreachable ();
6244 }
6245 return 1;
6246 }
6247
6248 /* Construct container for the argument used by GCC interface. See
6249 FUNCTION_ARG for the detailed description. */
6250
6251 static rtx
6252 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6253 const_tree type, int in_return, int nintregs, int nsseregs,
6254 const int *intreg, int sse_regno)
6255 {
6256 /* The following variables hold the static issued_error state. */
6257 static bool issued_sse_arg_error;
6258 static bool issued_sse_ret_error;
6259 static bool issued_x87_ret_error;
6260
6261 enum machine_mode tmpmode;
6262 int bytes =
6263 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6264 enum x86_64_reg_class regclass[MAX_CLASSES];
6265 int n;
6266 int i;
6267 int nexps = 0;
6268 int needed_sseregs, needed_intregs;
6269 rtx exp[MAX_CLASSES];
6270 rtx ret;
6271
6272 n = classify_argument (mode, type, regclass, 0);
6273 if (!n)
6274 return NULL;
6275 if (!examine_argument (mode, type, in_return, &needed_intregs,
6276 &needed_sseregs))
6277 return NULL;
6278 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6279 return NULL;
6280
6281 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6282 some less clueful developer tries to use floating-point anyway. */
6283 if (needed_sseregs && !TARGET_SSE)
6284 {
6285 if (in_return)
6286 {
6287 if (!issued_sse_ret_error)
6288 {
6289 error ("SSE register return with SSE disabled");
6290 issued_sse_ret_error = true;
6291 }
6292 }
6293 else if (!issued_sse_arg_error)
6294 {
6295 error ("SSE register argument with SSE disabled");
6296 issued_sse_arg_error = true;
6297 }
6298 return NULL;
6299 }
6300
6301 /* Likewise, error if the ABI requires us to return values in the
6302 x87 registers and the user specified -mno-80387. */
6303 if (!TARGET_80387 && in_return)
6304 for (i = 0; i < n; i++)
6305 if (regclass[i] == X86_64_X87_CLASS
6306 || regclass[i] == X86_64_X87UP_CLASS
6307 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6308 {
6309 if (!issued_x87_ret_error)
6310 {
6311 error ("x87 register return with x87 disabled");
6312 issued_x87_ret_error = true;
6313 }
6314 return NULL;
6315 }
6316
6317 /* First construct simple cases. Avoid SCmode, since we want to use
6318 a single register to pass this type. */
6319 if (n == 1 && mode != SCmode)
6320 switch (regclass[0])
6321 {
6322 case X86_64_INTEGER_CLASS:
6323 case X86_64_INTEGERSI_CLASS:
6324 return gen_rtx_REG (mode, intreg[0]);
6325 case X86_64_SSE_CLASS:
6326 case X86_64_SSESF_CLASS:
6327 case X86_64_SSEDF_CLASS:
6328 if (mode != BLKmode)
6329 return gen_reg_or_parallel (mode, orig_mode,
6330 SSE_REGNO (sse_regno));
6331 break;
6332 case X86_64_X87_CLASS:
6333 case X86_64_COMPLEX_X87_CLASS:
6334 return gen_rtx_REG (mode, FIRST_STACK_REG);
6335 case X86_64_NO_CLASS:
6336 /* Zero sized array, struct or class. */
6337 return NULL;
6338 default:
6339 gcc_unreachable ();
6340 }
6341 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6342 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6343 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6344 if (n == 4
6345 && regclass[0] == X86_64_SSE_CLASS
6346 && regclass[1] == X86_64_SSEUP_CLASS
6347 && regclass[2] == X86_64_SSEUP_CLASS
6348 && regclass[3] == X86_64_SSEUP_CLASS
6349 && mode != BLKmode)
6350 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6351
6352 if (n == 2
6353 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6354 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6355 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6356 && regclass[1] == X86_64_INTEGER_CLASS
6357 && (mode == CDImode || mode == TImode || mode == TFmode)
6358 && intreg[0] + 1 == intreg[1])
6359 return gen_rtx_REG (mode, intreg[0]);
6360
6361 /* Otherwise figure out the entries of the PARALLEL. */
6362 for (i = 0; i < n; i++)
6363 {
6364 int pos;
6365
6366 switch (regclass[i])
6367 {
6368 case X86_64_NO_CLASS:
6369 break;
6370 case X86_64_INTEGER_CLASS:
6371 case X86_64_INTEGERSI_CLASS:
6372 /* Merge TImodes on aligned occasions here too. */
6373 if (i * 8 + 8 > bytes)
6374 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6375 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6376 tmpmode = SImode;
6377 else
6378 tmpmode = DImode;
6379 /* We've requested 24 bytes we don't have a mode for. Use DImode. */
6380 if (tmpmode == BLKmode)
6381 tmpmode = DImode;
6382 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6383 gen_rtx_REG (tmpmode, *intreg),
6384 GEN_INT (i*8));
6385 intreg++;
6386 break;
6387 case X86_64_SSESF_CLASS:
6388 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6389 gen_rtx_REG (SFmode,
6390 SSE_REGNO (sse_regno)),
6391 GEN_INT (i*8));
6392 sse_regno++;
6393 break;
6394 case X86_64_SSEDF_CLASS:
6395 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6396 gen_rtx_REG (DFmode,
6397 SSE_REGNO (sse_regno)),
6398 GEN_INT (i*8));
6399 sse_regno++;
6400 break;
6401 case X86_64_SSE_CLASS:
6402 pos = i;
6403 switch (n)
6404 {
6405 case 1:
6406 tmpmode = DImode;
6407 break;
6408 case 2:
6409 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6410 {
6411 tmpmode = TImode;
6412 i++;
6413 }
6414 else
6415 tmpmode = DImode;
6416 break;
6417 case 4:
6418 gcc_assert (i == 0
6419 && regclass[1] == X86_64_SSEUP_CLASS
6420 && regclass[2] == X86_64_SSEUP_CLASS
6421 && regclass[3] == X86_64_SSEUP_CLASS);
6422 tmpmode = OImode;
6423 i += 3;
6424 break;
6425 default:
6426 gcc_unreachable ();
6427 }
6428 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6429 gen_rtx_REG (tmpmode,
6430 SSE_REGNO (sse_regno)),
6431 GEN_INT (pos*8));
6432 sse_regno++;
6433 break;
6434 default:
6435 gcc_unreachable ();
6436 }
6437 }
6438
6439 /* Empty aligned struct, union or class. */
6440 if (nexps == 0)
6441 return NULL;
6442
6443 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6444 for (i = 0; i < nexps; i++)
6445 XVECEXP (ret, 0, i) = exp [i];
6446 return ret;
6447 }
6448
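/* Sketch of the RTL handed back by the function above (illustrative; the
   exact registers depend on how many argument slots are already used).
   For "struct { long a; double d; }" passed as the first argument the
   result is roughly

       (parallel [(expr_list (reg:DI di) (const_int 0))
                  (expr_list (reg:DF xmm0) (const_int 8))])

   i.e. the first eightbyte in %rdi and the second in %xmm0, while a
   plain DImode argument takes the simple (reg:DI di) path earlier in
   the function.  */
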
6449 /* Update the data in CUM to advance over an argument of mode MODE
6450 and data type TYPE. (TYPE is null for libcalls where that information
6451 may not be available.) */
6452
6453 static void
6454 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6455 const_tree type, HOST_WIDE_INT bytes,
6456 HOST_WIDE_INT words)
6457 {
6458 switch (mode)
6459 {
6460 default:
6461 break;
6462
6463 case BLKmode:
6464 if (bytes < 0)
6465 break;
6466 /* FALLTHRU */
6467
6468 case DImode:
6469 case SImode:
6470 case HImode:
6471 case QImode:
6472 cum->words += words;
6473 cum->nregs -= words;
6474 cum->regno += words;
6475
6476 if (cum->nregs <= 0)
6477 {
6478 cum->nregs = 0;
6479 cum->regno = 0;
6480 }
6481 break;
6482
6483 case OImode:
6484 /* OImode shouldn't be used directly. */
6485 gcc_unreachable ();
6486
6487 case DFmode:
6488 if (cum->float_in_sse < 2)
6489 break;
6490 case SFmode:
6491 if (cum->float_in_sse < 1)
6492 break;
6493 /* FALLTHRU */
6494
6495 case V8SFmode:
6496 case V8SImode:
6497 case V32QImode:
6498 case V16HImode:
6499 case V4DFmode:
6500 case V4DImode:
6501 case TImode:
6502 case V16QImode:
6503 case V8HImode:
6504 case V4SImode:
6505 case V2DImode:
6506 case V4SFmode:
6507 case V2DFmode:
6508 if (!type || !AGGREGATE_TYPE_P (type))
6509 {
6510 cum->sse_words += words;
6511 cum->sse_nregs -= 1;
6512 cum->sse_regno += 1;
6513 if (cum->sse_nregs <= 0)
6514 {
6515 cum->sse_nregs = 0;
6516 cum->sse_regno = 0;
6517 }
6518 }
6519 break;
6520
6521 case V8QImode:
6522 case V4HImode:
6523 case V2SImode:
6524 case V2SFmode:
6525 case V1TImode:
6526 case V1DImode:
6527 if (!type || !AGGREGATE_TYPE_P (type))
6528 {
6529 cum->mmx_words += words;
6530 cum->mmx_nregs -= 1;
6531 cum->mmx_regno += 1;
6532 if (cum->mmx_nregs <= 0)
6533 {
6534 cum->mmx_nregs = 0;
6535 cum->mmx_regno = 0;
6536 }
6537 }
6538 break;
6539 }
6540 }
6541
6542 static void
6543 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6544 const_tree type, HOST_WIDE_INT words, bool named)
6545 {
6546 int int_nregs, sse_nregs;
6547
6548 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6549 if (!named && VALID_AVX256_REG_MODE (mode))
6550 return;
6551
6552 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6553 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6554 {
6555 cum->nregs -= int_nregs;
6556 cum->sse_nregs -= sse_nregs;
6557 cum->regno += int_nregs;
6558 cum->sse_regno += sse_nregs;
6559 }
6560 else
6561 {
6562 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6563 cum->words = (cum->words + align - 1) & ~(align - 1);
6564 cum->words += words;
6565 }
6566 }
6567
6568 static void
6569 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6570 HOST_WIDE_INT words)
6571 {
6572 /* Otherwise, this should be passed indirectly. */
6573 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6574
6575 cum->words += words;
6576 if (cum->nregs > 0)
6577 {
6578 cum->nregs -= 1;
6579 cum->regno += 1;
6580 }
6581 }
6582
6583 /* Update the data in CUM to advance over an argument of mode MODE and
6584 data type TYPE. (TYPE is null for libcalls where that information
6585 may not be available.) */
6586
6587 static void
6588 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6589 const_tree type, bool named)
6590 {
6591 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6592 HOST_WIDE_INT bytes, words;
6593
6594 if (mode == BLKmode)
6595 bytes = int_size_in_bytes (type);
6596 else
6597 bytes = GET_MODE_SIZE (mode);
6598 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6599
6600 if (type)
6601 mode = type_natural_mode (type, NULL);
6602
6603 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6604 function_arg_advance_ms_64 (cum, bytes, words);
6605 else if (TARGET_64BIT)
6606 function_arg_advance_64 (cum, mode, type, words, named);
6607 else
6608 function_arg_advance_32 (cum, mode, type, bytes, words);
6609 }
6610
6611 /* Define where to put the arguments to a function.
6612 Value is zero to push the argument on the stack,
6613 or a hard register in which to store the argument.
6614
6615 MODE is the argument's machine mode.
6616 TYPE is the data type of the argument (as a tree).
6617 This is null for libcalls where that information may
6618 not be available.
6619 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6620 the preceding args and about the function being called.
6621 NAMED is nonzero if this argument is a named parameter
6622 (otherwise it is an extra parameter matching an ellipsis). */
6623
6624 static rtx
6625 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6626 enum machine_mode orig_mode, const_tree type,
6627 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6628 {
6629 static bool warnedsse, warnedmmx;
6630
6631 /* Avoid the AL settings for the Unix64 ABI. */
6632 if (mode == VOIDmode)
6633 return constm1_rtx;
6634
6635 switch (mode)
6636 {
6637 default:
6638 break;
6639
6640 case BLKmode:
6641 if (bytes < 0)
6642 break;
6643 /* FALLTHRU */
6644 case DImode:
6645 case SImode:
6646 case HImode:
6647 case QImode:
6648 if (words <= cum->nregs)
6649 {
6650 int regno = cum->regno;
6651
6652 /* Fastcall allocates the first two DWORD (SImode) or
6653 smaller arguments to ECX and EDX if they aren't
6654 aggregate types. */
6655 if (cum->fastcall)
6656 {
6657 if (mode == BLKmode
6658 || mode == DImode
6659 || (type && AGGREGATE_TYPE_P (type)))
6660 break;
6661
6662 /* ECX, not EAX, is the first allocated register. */
6663 if (regno == AX_REG)
6664 regno = CX_REG;
6665 }
6666 return gen_rtx_REG (mode, regno);
6667 }
6668 break;
6669
6670 case DFmode:
6671 if (cum->float_in_sse < 2)
6672 break;
6673 case SFmode:
6674 if (cum->float_in_sse < 1)
6675 break;
6676 /* FALLTHRU */
6677 case TImode:
6678 /* In 32-bit mode, we pass TImode in xmm registers. */
6679 case V16QImode:
6680 case V8HImode:
6681 case V4SImode:
6682 case V2DImode:
6683 case V4SFmode:
6684 case V2DFmode:
6685 if (!type || !AGGREGATE_TYPE_P (type))
6686 {
6687 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6688 {
6689 warnedsse = true;
6690 warning (0, "SSE vector argument without SSE enabled "
6691 "changes the ABI");
6692 }
6693 if (cum->sse_nregs)
6694 return gen_reg_or_parallel (mode, orig_mode,
6695 cum->sse_regno + FIRST_SSE_REG);
6696 }
6697 break;
6698
6699 case OImode:
6700 /* OImode shouldn't be used directly. */
6701 gcc_unreachable ();
6702
6703 case V8SFmode:
6704 case V8SImode:
6705 case V32QImode:
6706 case V16HImode:
6707 case V4DFmode:
6708 case V4DImode:
6709 if (!type || !AGGREGATE_TYPE_P (type))
6710 {
6711 if (cum->sse_nregs)
6712 return gen_reg_or_parallel (mode, orig_mode,
6713 cum->sse_regno + FIRST_SSE_REG);
6714 }
6715 break;
6716
6717 case V8QImode:
6718 case V4HImode:
6719 case V2SImode:
6720 case V2SFmode:
6721 case V1TImode:
6722 case V1DImode:
6723 if (!type || !AGGREGATE_TYPE_P (type))
6724 {
6725 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6726 {
6727 warnedmmx = true;
6728 warning (0, "MMX vector argument without MMX enabled "
6729 "changes the ABI");
6730 }
6731 if (cum->mmx_nregs)
6732 return gen_reg_or_parallel (mode, orig_mode,
6733 cum->mmx_regno + FIRST_MMX_REG);
6734 }
6735 break;
6736 }
6737
6738 return NULL_RTX;
6739 }
6740
6741 static rtx
6742 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6743 enum machine_mode orig_mode, const_tree type, bool named)
6744 {
6745 /* Handle the hidden AL argument containing the number of SSE registers
6746 used by a varargs x86-64 function call. */
6747 if (mode == VOIDmode)
6748 return GEN_INT (cum->maybe_vaarg
6749 ? (cum->sse_nregs < 0
6750 ? X86_64_SSE_REGPARM_MAX
6751 : cum->sse_regno)
6752 : -1);
6753
6754 switch (mode)
6755 {
6756 default:
6757 break;
6758
6759 case V8SFmode:
6760 case V8SImode:
6761 case V32QImode:
6762 case V16HImode:
6763 case V4DFmode:
6764 case V4DImode:
6765 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6766 if (!named)
6767 return NULL;
6768 break;
6769 }
6770
6771 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6772 cum->sse_nregs,
6773 &x86_64_int_parameter_registers [cum->regno],
6774 cum->sse_regno);
6775 }
6776
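/* For illustration: the hidden AL argument handled above is what makes
   a SysV x86-64 varargs call set %al to the number of vector registers
   used, e.g. printf ("%f\n", 1.0) is emitted roughly as

       movsd  .LC0(%rip), %xmm0
       movl   $1, %eax          # one SSE register carries arguments
       call   printf

   so the callee's prologue knows whether the XMM part of the register
   save area must be spilled at all.  */
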
6777 static rtx
6778 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6779 enum machine_mode orig_mode, bool named,
6780 HOST_WIDE_INT bytes)
6781 {
6782 unsigned int regno;
6783
6784 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6785 We use the value -2 to specify that the current function call is MS ABI. */
6786 if (mode == VOIDmode)
6787 return GEN_INT (-2);
6788
6789 /* If we've run out of registers, it goes on the stack. */
6790 if (cum->nregs == 0)
6791 return NULL_RTX;
6792
6793 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6794
6795 /* Only floating point modes are passed in anything but integer regs. */
6796 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6797 {
6798 if (named)
6799 regno = cum->regno + FIRST_SSE_REG;
6800 else
6801 {
6802 rtx t1, t2;
6803
6804 /* Unnamed floating parameters are passed in both the
6805 SSE and integer registers. */
6806 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6807 t2 = gen_rtx_REG (mode, regno);
6808 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6809 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6810 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6811 }
6812 }
6813 /* Handle aggregate types passed in registers. */
6814 if (orig_mode == BLKmode)
6815 {
6816 if (bytes > 0 && bytes <= 8)
6817 mode = (bytes > 4 ? DImode : SImode);
6818 if (mode == BLKmode)
6819 mode = DImode;
6820 }
6821
6822 return gen_reg_or_parallel (mode, orig_mode, regno);
6823 }
6824
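/* Sketch of the Microsoft x64 convention handled above: the first four
   arguments occupy RCX, RDX, R8 and R9 by position, with XMM0-XMM3 used
   instead for named float/double arguments in those slots.  Unnamed
   (variadic) floating arguments come back as a PARALLEL so they are
   passed in both the SSE and the integer register, letting a varargs
   callee fetch them from the integer save area.  Aggregates of 1, 2, 4
   or 8 bytes travel by value in an integer register; everything else
   arrives by reference (see ix86_pass_by_reference below).  */
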
6825 /* Return where to put the arguments to a function.
6826 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6827
6828 MODE is the argument's machine mode. TYPE is the data type of the
6829 argument. It is null for libcalls where that information may not be
6830 available. CUM gives information about the preceding args and about
6831 the function being called. NAMED is nonzero if this argument is a
6832 named parameter (otherwise it is an extra parameter matching an
6833 ellipsis). */
6834
6835 static rtx
6836 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6837 const_tree type, bool named)
6838 {
6839 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6840 enum machine_mode mode = omode;
6841 HOST_WIDE_INT bytes, words;
6842 rtx arg;
6843
6844 if (mode == BLKmode)
6845 bytes = int_size_in_bytes (type);
6846 else
6847 bytes = GET_MODE_SIZE (mode);
6848 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6849
6850 /* To simplify the code below, represent vector types with a vector mode
6851 even if MMX/SSE are not active. */
6852 if (type && TREE_CODE (type) == VECTOR_TYPE)
6853 mode = type_natural_mode (type, cum);
6854
6855 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6856 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6857 else if (TARGET_64BIT)
6858 arg = function_arg_64 (cum, mode, omode, type, named);
6859 else
6860 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6861
6862 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6863 {
6864 /* This argument uses 256bit AVX modes. */
6865 if (cum->caller)
6866 cfun->machine->callee_pass_avx256_p = true;
6867 else
6868 cfun->machine->caller_pass_avx256_p = true;
6869 }
6870
6871 return arg;
6872 }
6873
6874 /* A C expression that indicates when an argument must be passed by
6875 reference. If nonzero for an argument, a copy of that argument is
6876 made in memory and a pointer to the argument is passed instead of
6877 the argument itself. The pointer is passed in whatever way is
6878 appropriate for passing a pointer to that type. */
6879
6880 static bool
6881 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6882 enum machine_mode mode ATTRIBUTE_UNUSED,
6883 const_tree type, bool named ATTRIBUTE_UNUSED)
6884 {
6885 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6886
6887 /* See Windows x64 Software Convention. */
6888 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6889 {
6890 int msize = (int) GET_MODE_SIZE (mode);
6891 if (type)
6892 {
6893 /* Arrays are passed by reference. */
6894 if (TREE_CODE (type) == ARRAY_TYPE)
6895 return true;
6896
6897 if (AGGREGATE_TYPE_P (type))
6898 {
6899 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6900 are passed by reference. */
6901 msize = int_size_in_bytes (type);
6902 }
6903 }
6904
6905 /* __m128 is passed by reference. */
6906 switch (msize) {
6907 case 1: case 2: case 4: case 8:
6908 break;
6909 default:
6910 return true;
6911 }
6912 }
6913 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6914 return 1;
6915
6916 return 0;
6917 }
6918
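/* Illustrative consequences of the size check above under the MS x64
   ABI (sizes in bytes):

     struct { char c; }         size 1   -> passed by value
     struct { int a; int b; }   size 8   -> passed by value
     struct { char c[3]; }      size 3   -> passed by reference
     struct { int a[4]; }       size 16  -> passed by reference
     __m128                     size 16  -> passed by reference

   The SysV x86-64 path only forces by-reference passing for types of
   variable size (int_size_in_bytes == -1); fixed-size aggregates are
   classified by construct_container instead.  */
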
6919 /* Return true when TYPE should be 128bit aligned for 32bit argument
6920 passing ABI. XXX: This function is obsolete and is only used for
6921 checking psABI compatibility with previous versions of GCC. */
6922
6923 static bool
6924 ix86_compat_aligned_value_p (const_tree type)
6925 {
6926 enum machine_mode mode = TYPE_MODE (type);
6927 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6928 || mode == TDmode
6929 || mode == TFmode
6930 || mode == TCmode)
6931 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6932 return true;
6933 if (TYPE_ALIGN (type) < 128)
6934 return false;
6935
6936 if (AGGREGATE_TYPE_P (type))
6937 {
6938 /* Walk the aggregates recursively. */
6939 switch (TREE_CODE (type))
6940 {
6941 case RECORD_TYPE:
6942 case UNION_TYPE:
6943 case QUAL_UNION_TYPE:
6944 {
6945 tree field;
6946
6947 /* Walk all the structure fields. */
6948 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6949 {
6950 if (TREE_CODE (field) == FIELD_DECL
6951 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6952 return true;
6953 }
6954 break;
6955 }
6956
6957 case ARRAY_TYPE:
6958 /* Just for use if some languages pass arrays by value. */
6959 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6960 return true;
6961 break;
6962
6963 default:
6964 gcc_unreachable ();
6965 }
6966 }
6967 return false;
6968 }
6969
6970 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6971 XXX: This function is obsolete and is only used for checking psABI
6972 compatibility with previous versions of GCC. */
6973
6974 static unsigned int
6975 ix86_compat_function_arg_boundary (enum machine_mode mode,
6976 const_tree type, unsigned int align)
6977 {
6978 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6979 natural boundaries. */
6980 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6981 {
6982 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6983 make an exception for SSE modes since these require 128bit
6984 alignment.
6985
6986 The handling here differs from field_alignment. ICC aligns MMX
6987 arguments to 4 byte boundaries, while structure fields are aligned
6988 to 8 byte boundaries. */
6989 if (!type)
6990 {
6991 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6992 align = PARM_BOUNDARY;
6993 }
6994 else
6995 {
6996 if (!ix86_compat_aligned_value_p (type))
6997 align = PARM_BOUNDARY;
6998 }
6999 }
7000 if (align > BIGGEST_ALIGNMENT)
7001 align = BIGGEST_ALIGNMENT;
7002 return align;
7003 }
7004
7005 /* Return true when TYPE should be 128bit aligned for 32bit argument
7006 passing ABI. */
7007
7008 static bool
7009 ix86_contains_aligned_value_p (const_tree type)
7010 {
7011 enum machine_mode mode = TYPE_MODE (type);
7012
7013 if (mode == XFmode || mode == XCmode)
7014 return false;
7015
7016 if (TYPE_ALIGN (type) < 128)
7017 return false;
7018
7019 if (AGGREGATE_TYPE_P (type))
7020 {
7021 /* Walk the aggregates recursively. */
7022 switch (TREE_CODE (type))
7023 {
7024 case RECORD_TYPE:
7025 case UNION_TYPE:
7026 case QUAL_UNION_TYPE:
7027 {
7028 tree field;
7029
7030 /* Walk all the structure fields. */
7031 for (field = TYPE_FIELDS (type);
7032 field;
7033 field = DECL_CHAIN (field))
7034 {
7035 if (TREE_CODE (field) == FIELD_DECL
7036 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7037 return true;
7038 }
7039 break;
7040 }
7041
7042 case ARRAY_TYPE:
7043 /* Just for use if some languages pass arrays by value. */
7044 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7045 return true;
7046 break;
7047
7048 default:
7049 gcc_unreachable ();
7050 }
7051 }
7052 else
7053 return TYPE_ALIGN (type) >= 128;
7054
7055 return false;
7056 }
7057
7058 /* Gives the alignment boundary, in bits, of an argument with the
7059 specified mode and type. */
7060
7061 static unsigned int
7062 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7063 {
7064 unsigned int align;
7065 if (type)
7066 {
7067 /* The main variant type is used for the call, so convert TYPE to
7068 its main variant. */
7069 type = TYPE_MAIN_VARIANT (type);
7070 align = TYPE_ALIGN (type);
7071 }
7072 else
7073 align = GET_MODE_ALIGNMENT (mode);
7074 if (align < PARM_BOUNDARY)
7075 align = PARM_BOUNDARY;
7076 else
7077 {
7078 static bool warned;
7079 unsigned int saved_align = align;
7080
7081 if (!TARGET_64BIT)
7082 {
7083 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7084 if (!type)
7085 {
7086 if (mode == XFmode || mode == XCmode)
7087 align = PARM_BOUNDARY;
7088 }
7089 else if (!ix86_contains_aligned_value_p (type))
7090 align = PARM_BOUNDARY;
7091
7092 if (align < 128)
7093 align = PARM_BOUNDARY;
7094 }
7095
7096 if (warn_psabi
7097 && !warned
7098 && align != ix86_compat_function_arg_boundary (mode, type,
7099 saved_align))
7100 {
7101 warned = true;
7102 inform (input_location,
7103 "The ABI for passing parameters with %d-byte"
7104 " alignment has changed in GCC 4.6",
7105 align / BITS_PER_UNIT);
7106 }
7107 }
7108
7109 return align;
7110 }
7111
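/* Worked examples of the boundary computed above, in bits, assuming the
   usual PARM_BOUNDARY of 32 on ia32 and 64 on x86-64:

     int      on ia32    -> 32   (bumped up to PARM_BOUNDARY)
     double   on ia32    -> 32   (the i386 ABI keeps 4-byte stack slots)
     __m128   on ia32    -> 128  (contains a 128-bit aligned value)
     __m256   on x86-64  -> 256  (its natural alignment)

   When the result differs from the pre-GCC 4.6 computation in
   ix86_compat_function_arg_boundary, the -Wpsabi note above is emitted
   once per translation unit.  */
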
7112 /* Return true if N is a possible register number of function value. */
7113
7114 static bool
7115 ix86_function_value_regno_p (const unsigned int regno)
7116 {
7117 switch (regno)
7118 {
7119 case AX_REG:
7120 return true;
7121
7122 case FIRST_FLOAT_REG:
7123 /* TODO: The function should depend on the current function's ABI, but
7124 builtins.c would then need updating. Therefore we use the
7125 default ABI. */
7126 if (TARGET_64BIT && ix86_abi == MS_ABI)
7127 return false;
7128 return TARGET_FLOAT_RETURNS_IN_80387;
7129
7130 case FIRST_SSE_REG:
7131 return TARGET_SSE;
7132
7133 case FIRST_MMX_REG:
7134 if (TARGET_MACHO || TARGET_64BIT)
7135 return false;
7136 return TARGET_MMX;
7137 }
7138
7139 return false;
7140 }
7141
7142 /* Define how to find the value returned by a function.
7143 VALTYPE is the data type of the value (as a tree).
7144 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7145 otherwise, FUNC is 0. */
7146
7147 static rtx
7148 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7149 const_tree fntype, const_tree fn)
7150 {
7151 unsigned int regno;
7152
7153 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7154 we normally prevent this case when mmx is not available. However
7155 some ABIs may require the result to be returned like DImode. */
7156 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7157 regno = FIRST_MMX_REG;
7158
7159 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7160 we prevent this case when sse is not available. However some ABIs
7161 may require the result to be returned like integer TImode. */
7162 else if (mode == TImode
7163 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7164 regno = FIRST_SSE_REG;
7165
7166 /* 32-byte vector modes in %ymm0. */
7167 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7168 regno = FIRST_SSE_REG;
7169
7170 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7171 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7172 regno = FIRST_FLOAT_REG;
7173 else
7174 /* Most things go in %eax. */
7175 regno = AX_REG;
7176
7177 /* Override FP return register with %xmm0 for local functions when
7178 SSE math is enabled or for functions with sseregparm attribute. */
7179 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7180 {
7181 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7182 if ((sse_level >= 1 && mode == SFmode)
7183 || (sse_level == 2 && mode == DFmode))
7184 regno = FIRST_SSE_REG;
7185 }
7186
7187 /* OImode shouldn't be used directly. */
7188 gcc_assert (mode != OImode);
7189
7190 return gen_rtx_REG (orig_mode, regno);
7191 }
7192
7193 static rtx
7194 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7195 const_tree valtype)
7196 {
7197 rtx ret;
7198
7199 /* Handle libcalls, which don't provide a type node. */
7200 if (valtype == NULL)
7201 {
7202 unsigned int regno;
7203
7204 switch (mode)
7205 {
7206 case SFmode:
7207 case SCmode:
7208 case DFmode:
7209 case DCmode:
7210 case TFmode:
7211 case SDmode:
7212 case DDmode:
7213 case TDmode:
7214 regno = FIRST_SSE_REG;
7215 break;
7216 case XFmode:
7217 case XCmode:
7218 regno = FIRST_FLOAT_REG;
7219 break;
7220 case TCmode:
7221 return NULL;
7222 default:
7223 regno = AX_REG;
7224 }
7225
7226 return gen_rtx_REG (mode, regno);
7227 }
7228 else if (POINTER_TYPE_P (valtype))
7229 {
7230 /* Pointers are always returned in Pmode. */
7231 mode = Pmode;
7232 }
7233
7234 ret = construct_container (mode, orig_mode, valtype, 1,
7235 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7236 x86_64_int_return_registers, 0);
7237
7238 /* For zero-sized structures, construct_container returns NULL, but we
7239 need to keep the rest of the compiler happy by returning a meaningful value. */
7240 if (!ret)
7241 ret = gen_rtx_REG (orig_mode, AX_REG);
7242
7243 return ret;
7244 }
7245
7246 static rtx
7247 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7248 {
7249 unsigned int regno = AX_REG;
7250
7251 if (TARGET_SSE)
7252 {
7253 switch (GET_MODE_SIZE (mode))
7254 {
7255 case 16:
7256 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7257 && !COMPLEX_MODE_P (mode))
7258 regno = FIRST_SSE_REG;
7259 break;
7260 case 8:
7261 case 4:
7262 if (mode == SFmode || mode == DFmode)
7263 regno = FIRST_SSE_REG;
7264 break;
7265 default:
7266 break;
7267 }
7268 }
7269 return gen_rtx_REG (orig_mode, regno);
7270 }
7271
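/* Summary, for orientation, of where the three helpers above place
   return values:

     function_value_32:    %eax for integers, %st(0) for x87 floats,
                           %xmm0/%ymm0 for SSE/AVX vectors, %mm0 for
                           8-byte vectors when MMX returns are in use.
     function_value_64:    classified by construct_container; scalars
                           typically land in %rax, floats in %xmm0,
                           long double in %st(0), and small structs may
                           be split across %rax:%rdx or %xmm0:%xmm1.
     function_value_ms_64: %rax, except float/double and 16-byte
                           non-complex scalar/vector values, which use
                           %xmm0 when SSE is enabled.  */
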
7272 static rtx
7273 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7274 enum machine_mode orig_mode, enum machine_mode mode)
7275 {
7276 const_tree fn, fntype;
7277
7278 fn = NULL_TREE;
7279 if (fntype_or_decl && DECL_P (fntype_or_decl))
7280 fn = fntype_or_decl;
7281 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7282
7283 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7284 return function_value_ms_64 (orig_mode, mode);
7285 else if (TARGET_64BIT)
7286 return function_value_64 (orig_mode, mode, valtype);
7287 else
7288 return function_value_32 (orig_mode, mode, fntype, fn);
7289 }
7290
7291 static rtx
7292 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7293 bool outgoing ATTRIBUTE_UNUSED)
7294 {
7295 enum machine_mode mode, orig_mode;
7296
7297 orig_mode = TYPE_MODE (valtype);
7298 mode = type_natural_mode (valtype, NULL);
7299 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7300 }
7301
7302 /* Pointer function arguments and return values are promoted to Pmode. */
7303
7304 static enum machine_mode
7305 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7306 int *punsignedp, const_tree fntype,
7307 int for_return)
7308 {
7309 if (type != NULL_TREE && POINTER_TYPE_P (type))
7310 {
7311 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7312 return Pmode;
7313 }
7314 return default_promote_function_mode (type, mode, punsignedp, fntype,
7315 for_return);
7316 }
7317
7318 rtx
7319 ix86_libcall_value (enum machine_mode mode)
7320 {
7321 return ix86_function_value_1 (NULL, NULL, mode, mode);
7322 }
7323
7324 /* Return true iff type is returned in memory. */
7325
7326 static bool ATTRIBUTE_UNUSED
7327 return_in_memory_32 (const_tree type, enum machine_mode mode)
7328 {
7329 HOST_WIDE_INT size;
7330
7331 if (mode == BLKmode)
7332 return true;
7333
7334 size = int_size_in_bytes (type);
7335
7336 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7337 return false;
7338
7339 if (VECTOR_MODE_P (mode) || mode == TImode)
7340 {
7341 /* User-created vectors small enough to fit in EAX. */
7342 if (size < 8)
7343 return false;
7344
7345 /* MMX/3dNow values are returned in MM0,
7346 except when it doesn't exist or the ABI prescribes otherwise. */
7347 if (size == 8)
7348 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7349
7350 /* SSE values are returned in XMM0, except when it doesn't exist. */
7351 if (size == 16)
7352 return !TARGET_SSE;
7353
7354 /* AVX values are returned in YMM0, except when it doesn't exist. */
7355 if (size == 32)
7356 return !TARGET_AVX;
7357 }
7358
7359 if (mode == XFmode)
7360 return false;
7361
7362 if (size > 12)
7363 return true;
7364
7365 /* OImode shouldn't be used directly. */
7366 gcc_assert (mode != OImode);
7367
7368 return false;
7369 }
7370
7371 static bool ATTRIBUTE_UNUSED
7372 return_in_memory_64 (const_tree type, enum machine_mode mode)
7373 {
7374 int needed_intregs, needed_sseregs;
7375 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7376 }
7377
7378 static bool ATTRIBUTE_UNUSED
7379 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7380 {
7381 HOST_WIDE_INT size = int_size_in_bytes (type);
7382
7383 /* __m128 is returned in xmm0. */
7384 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7385 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7386 return false;
7387
7388 /* Otherwise, the size must be exactly in [1248]. */
7389 return size != 1 && size != 2 && size != 4 && size != 8;
7390 }
7391
7392 static bool
7393 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7394 {
7395 #ifdef SUBTARGET_RETURN_IN_MEMORY
7396 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7397 #else
7398 const enum machine_mode mode = type_natural_mode (type, NULL);
7399
7400 if (TARGET_64BIT)
7401 {
7402 if (ix86_function_type_abi (fntype) == MS_ABI)
7403 return return_in_memory_ms_64 (type, mode);
7404 else
7405 return return_in_memory_64 (type, mode);
7406 }
7407 else
7408 return return_in_memory_32 (type, mode);
7409 #endif
7410 }
7411
7412 /* When returning SSE vector types, we have a choice of either
7413 (1) being abi incompatible with a -march switch, or
7414 (2) generating an error.
7415 Given no good solution, I think the safest thing is one warning.
7416 The user won't be able to use -Werror, but....
7417
7418 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7419 called in response to actually generating a caller or callee that
7420 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7421 via aggregate_value_p for general type probing from tree-ssa. */
7422
7423 static rtx
7424 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7425 {
7426 static bool warnedsse, warnedmmx;
7427
7428 if (!TARGET_64BIT && type)
7429 {
7430 /* Look at the return type of the function, not the function type. */
7431 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7432
7433 if (!TARGET_SSE && !warnedsse)
7434 {
7435 if (mode == TImode
7436 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7437 {
7438 warnedsse = true;
7439 warning (0, "SSE vector return without SSE enabled "
7440 "changes the ABI");
7441 }
7442 }
7443
7444 if (!TARGET_MMX && !warnedmmx)
7445 {
7446 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7447 {
7448 warnedmmx = true;
7449 warning (0, "MMX vector return without MMX enabled "
7450 "changes the ABI");
7451 }
7452 }
7453 }
7454
7455 return NULL;
7456 }
7457
7458 \f
7459 /* Create the va_list data type. */
7460
7461 /* Returns the calling convention specific va_list data type.
7462 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7463
7464 static tree
7465 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7466 {
7467 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7468
7469 /* For i386 we use plain pointer to argument area. */
7470 if (!TARGET_64BIT || abi == MS_ABI)
7471 return build_pointer_type (char_type_node);
7472
7473 record = lang_hooks.types.make_type (RECORD_TYPE);
7474 type_decl = build_decl (BUILTINS_LOCATION,
7475 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7476
7477 f_gpr = build_decl (BUILTINS_LOCATION,
7478 FIELD_DECL, get_identifier ("gp_offset"),
7479 unsigned_type_node);
7480 f_fpr = build_decl (BUILTINS_LOCATION,
7481 FIELD_DECL, get_identifier ("fp_offset"),
7482 unsigned_type_node);
7483 f_ovf = build_decl (BUILTINS_LOCATION,
7484 FIELD_DECL, get_identifier ("overflow_arg_area"),
7485 ptr_type_node);
7486 f_sav = build_decl (BUILTINS_LOCATION,
7487 FIELD_DECL, get_identifier ("reg_save_area"),
7488 ptr_type_node);
7489
7490 va_list_gpr_counter_field = f_gpr;
7491 va_list_fpr_counter_field = f_fpr;
7492
7493 DECL_FIELD_CONTEXT (f_gpr) = record;
7494 DECL_FIELD_CONTEXT (f_fpr) = record;
7495 DECL_FIELD_CONTEXT (f_ovf) = record;
7496 DECL_FIELD_CONTEXT (f_sav) = record;
7497
7498 TYPE_STUB_DECL (record) = type_decl;
7499 TYPE_NAME (record) = type_decl;
7500 TYPE_FIELDS (record) = f_gpr;
7501 DECL_CHAIN (f_gpr) = f_fpr;
7502 DECL_CHAIN (f_fpr) = f_ovf;
7503 DECL_CHAIN (f_ovf) = f_sav;
7504
7505 layout_type (record);
7506
7507 /* The correct type is an array type of one element. */
7508 return build_array_type (record, build_index_type (size_zero_node));
7509 }
7510
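/* The record built above corresponds to the user-visible SysV x86-64
   va_list, roughly:

     typedef struct __va_list_tag {
       unsigned int gp_offset;    -- bytes into reg_save_area for GPRs
       unsigned int fp_offset;    -- bytes into reg_save_area for XMMs
       void *overflow_arg_area;   -- next argument passed on the stack
       void *reg_save_area;       -- base of the register save area
     } va_list[1];

   For 32-bit targets and for the MS ABI, va_list is just a char *.  */
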
7511 /* Setup the builtin va_list data type and for 64-bit the additional
7512 calling convention specific va_list data types. */
7513
7514 static tree
7515 ix86_build_builtin_va_list (void)
7516 {
7517 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7518
7519 /* Initialize abi specific va_list builtin types. */
7520 if (TARGET_64BIT)
7521 {
7522 tree t;
7523 if (ix86_abi == MS_ABI)
7524 {
7525 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7526 if (TREE_CODE (t) != RECORD_TYPE)
7527 t = build_variant_type_copy (t);
7528 sysv_va_list_type_node = t;
7529 }
7530 else
7531 {
7532 t = ret;
7533 if (TREE_CODE (t) != RECORD_TYPE)
7534 t = build_variant_type_copy (t);
7535 sysv_va_list_type_node = t;
7536 }
7537 if (ix86_abi != MS_ABI)
7538 {
7539 t = ix86_build_builtin_va_list_abi (MS_ABI);
7540 if (TREE_CODE (t) != RECORD_TYPE)
7541 t = build_variant_type_copy (t);
7542 ms_va_list_type_node = t;
7543 }
7544 else
7545 {
7546 t = ret;
7547 if (TREE_CODE (t) != RECORD_TYPE)
7548 t = build_variant_type_copy (t);
7549 ms_va_list_type_node = t;
7550 }
7551 }
7552
7553 return ret;
7554 }
7555
7556 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7557
7558 static void
7559 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7560 {
7561 rtx save_area, mem;
7562 alias_set_type set;
7563 int i, max;
7564
7565 /* GPR size of varargs save area. */
7566 if (cfun->va_list_gpr_size)
7567 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7568 else
7569 ix86_varargs_gpr_size = 0;
7570
7571 /* FPR size of varargs save area. We don't need it if we don't pass
7572 anything in SSE registers. */
7573 if (TARGET_SSE && cfun->va_list_fpr_size)
7574 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7575 else
7576 ix86_varargs_fpr_size = 0;
7577
7578 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7579 return;
7580
7581 save_area = frame_pointer_rtx;
7582 set = get_varargs_alias_set ();
7583
7584 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7585 if (max > X86_64_REGPARM_MAX)
7586 max = X86_64_REGPARM_MAX;
7587
7588 for (i = cum->regno; i < max; i++)
7589 {
7590 mem = gen_rtx_MEM (Pmode,
7591 plus_constant (save_area, i * UNITS_PER_WORD));
7592 MEM_NOTRAP_P (mem) = 1;
7593 set_mem_alias_set (mem, set);
7594 emit_move_insn (mem, gen_rtx_REG (Pmode,
7595 x86_64_int_parameter_registers[i]));
7596 }
7597
7598 if (ix86_varargs_fpr_size)
7599 {
7600 enum machine_mode smode;
7601 rtx label, test;
7602
7603 /* Now emit code to save SSE registers. The AX parameter contains number
7604 of SSE parameter registers used to call this function, though all we
7605 actually check here is the zero/non-zero status. */
7606
7607 label = gen_label_rtx ();
7608 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7609 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7610 label));
7611
7612 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7613 we used movdqa (i.e. TImode) instead? Perhaps even better would
7614 be if we could determine the real mode of the data, via a hook
7615 into pass_stdarg. Ignore all that for now. */
7616 smode = V4SFmode;
7617 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7618 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7619
7620 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7621 if (max > X86_64_SSE_REGPARM_MAX)
7622 max = X86_64_SSE_REGPARM_MAX;
7623
7624 for (i = cum->sse_regno; i < max; ++i)
7625 {
7626 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7627 mem = gen_rtx_MEM (smode, mem);
7628 MEM_NOTRAP_P (mem) = 1;
7629 set_mem_alias_set (mem, set);
7630 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7631
7632 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7633 }
7634
7635 emit_label (label);
7636 }
7637 }
7638
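/* Layout of the varargs register save area established above, relative
   to reg_save_area:

       0 ..  47   %rdi, %rsi, %rdx, %rcx, %r8, %r9   (8 bytes each)
      48 .. 175   %xmm0 .. %xmm7                      (16 bytes each)

   so gp_offset ranges over 0..48 and fp_offset over 48..176.  The XMM
   half is only stored when %al was non-zero at the call, which is what
   the conditional branch on AX_REG above tests.  */
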
7639 static void
7640 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7641 {
7642 alias_set_type set = get_varargs_alias_set ();
7643 int i;
7644
7645 /* Reset to zero, as a SysV va_arg might have been used
7646 before. */
7647 ix86_varargs_gpr_size = 0;
7648 ix86_varargs_fpr_size = 0;
7649
7650 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7651 {
7652 rtx reg, mem;
7653
7654 mem = gen_rtx_MEM (Pmode,
7655 plus_constant (virtual_incoming_args_rtx,
7656 i * UNITS_PER_WORD));
7657 MEM_NOTRAP_P (mem) = 1;
7658 set_mem_alias_set (mem, set);
7659
7660 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7661 emit_move_insn (mem, reg);
7662 }
7663 }
7664
7665 static void
7666 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7667 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7668 int no_rtl)
7669 {
7670 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7671 CUMULATIVE_ARGS next_cum;
7672 tree fntype;
7673
7674 /* This argument doesn't appear to be used anymore, which is good,
7675 because the old code here didn't suppress rtl generation. */
7676 gcc_assert (!no_rtl);
7677
7678 if (!TARGET_64BIT)
7679 return;
7680
7681 fntype = TREE_TYPE (current_function_decl);
7682
7683 /* For varargs, we do not want to skip the dummy va_dcl argument.
7684 For stdargs, we do want to skip the last named argument. */
7685 next_cum = *cum;
7686 if (stdarg_p (fntype))
7687 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7688 true);
7689
7690 if (cum->call_abi == MS_ABI)
7691 setup_incoming_varargs_ms_64 (&next_cum);
7692 else
7693 setup_incoming_varargs_64 (&next_cum);
7694 }
7695
7696 /* Checks if TYPE is of kind va_list char *. */
7697
7698 static bool
7699 is_va_list_char_pointer (tree type)
7700 {
7701 tree canonic;
7702
7703 /* For 32-bit it is always true. */
7704 if (!TARGET_64BIT)
7705 return true;
7706 canonic = ix86_canonical_va_list_type (type);
7707 return (canonic == ms_va_list_type_node
7708 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7709 }
7710
7711 /* Implement va_start. */
7712
7713 static void
7714 ix86_va_start (tree valist, rtx nextarg)
7715 {
7716 HOST_WIDE_INT words, n_gpr, n_fpr;
7717 tree f_gpr, f_fpr, f_ovf, f_sav;
7718 tree gpr, fpr, ovf, sav, t;
7719 tree type;
7720 rtx ovf_rtx;
7721
7722 if (flag_split_stack
7723 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7724 {
7725 unsigned int scratch_regno;
7726
7727 /* When we are splitting the stack, we can't refer to the stack
7728 arguments using internal_arg_pointer, because they may be on
7729 the old stack. The split stack prologue will arrange to
7730 leave a pointer to the old stack arguments in a scratch
7731 register, which we here copy to a pseudo-register. The split
7732 stack prologue can't set the pseudo-register directly because
7733 it (the prologue) runs before any registers have been saved. */
7734
7735 scratch_regno = split_stack_prologue_scratch_regno ();
7736 if (scratch_regno != INVALID_REGNUM)
7737 {
7738 rtx reg, seq;
7739
7740 reg = gen_reg_rtx (Pmode);
7741 cfun->machine->split_stack_varargs_pointer = reg;
7742
7743 start_sequence ();
7744 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7745 seq = get_insns ();
7746 end_sequence ();
7747
7748 push_topmost_sequence ();
7749 emit_insn_after (seq, entry_of_function ());
7750 pop_topmost_sequence ();
7751 }
7752 }
7753
7754 /* Only 64bit target needs something special. */
7755 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7756 {
7757 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7758 std_expand_builtin_va_start (valist, nextarg);
7759 else
7760 {
7761 rtx va_r, next;
7762
7763 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7764 next = expand_binop (ptr_mode, add_optab,
7765 cfun->machine->split_stack_varargs_pointer,
7766 crtl->args.arg_offset_rtx,
7767 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7768 convert_move (va_r, next, 0);
7769 }
7770 return;
7771 }
7772
7773 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7774 f_fpr = DECL_CHAIN (f_gpr);
7775 f_ovf = DECL_CHAIN (f_fpr);
7776 f_sav = DECL_CHAIN (f_ovf);
7777
7778 valist = build_simple_mem_ref (valist);
7779 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7780 /* The following should be folded into the MEM_REF offset. */
7781 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7782 f_gpr, NULL_TREE);
7783 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7784 f_fpr, NULL_TREE);
7785 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7786 f_ovf, NULL_TREE);
7787 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7788 f_sav, NULL_TREE);
7789
7790 /* Count number of gp and fp argument registers used. */
7791 words = crtl->args.info.words;
7792 n_gpr = crtl->args.info.regno;
7793 n_fpr = crtl->args.info.sse_regno;
7794
7795 if (cfun->va_list_gpr_size)
7796 {
7797 type = TREE_TYPE (gpr);
7798 t = build2 (MODIFY_EXPR, type,
7799 gpr, build_int_cst (type, n_gpr * 8));
7800 TREE_SIDE_EFFECTS (t) = 1;
7801 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7802 }
7803
7804 if (TARGET_SSE && cfun->va_list_fpr_size)
7805 {
7806 type = TREE_TYPE (fpr);
7807 t = build2 (MODIFY_EXPR, type, fpr,
7808 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7809 TREE_SIDE_EFFECTS (t) = 1;
7810 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7811 }
7812
7813 /* Find the overflow area. */
7814 type = TREE_TYPE (ovf);
7815 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7816 ovf_rtx = crtl->args.internal_arg_pointer;
7817 else
7818 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7819 t = make_tree (type, ovf_rtx);
7820 if (words != 0)
7821 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7822 t = build2 (MODIFY_EXPR, type, ovf, t);
7823 TREE_SIDE_EFFECTS (t) = 1;
7824 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7825
7826 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7827 {
7828 /* Find the register save area.
7829 The function prologue saves it right above the stack frame. */
7830 type = TREE_TYPE (sav);
7831 t = make_tree (type, frame_pointer_rtx);
7832 if (!ix86_varargs_gpr_size)
7833 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7834 t = build2 (MODIFY_EXPR, type, sav, t);
7835 TREE_SIDE_EFFECTS (t) = 1;
7836 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7837 }
7838 }
7839
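/* After the expansion above, va_start (ap, last) on SysV x86-64 has in
   effect performed the following (a sketch, not literal code; n_gpr and
   n_fpr are the named register arguments consumed before the ellipsis):

     ap->gp_offset = n_gpr * 8;
     ap->fp_offset = 48 + n_fpr * 16;
     ap->overflow_arg_area = first stack-passed argument;
     ap->reg_save_area = area written by the prologue (see above).  */
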
7840 /* Implement va_arg. */
7841
7842 static tree
7843 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7844 gimple_seq *post_p)
7845 {
7846 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7847 tree f_gpr, f_fpr, f_ovf, f_sav;
7848 tree gpr, fpr, ovf, sav, t;
7849 int size, rsize;
7850 tree lab_false, lab_over = NULL_TREE;
7851 tree addr, t2;
7852 rtx container;
7853 int indirect_p = 0;
7854 tree ptrtype;
7855 enum machine_mode nat_mode;
7856 unsigned int arg_boundary;
7857
7858 /* Only 64bit target needs something special. */
7859 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7860 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7861
7862 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7863 f_fpr = DECL_CHAIN (f_gpr);
7864 f_ovf = DECL_CHAIN (f_fpr);
7865 f_sav = DECL_CHAIN (f_ovf);
7866
7867 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7868 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7869 valist = build_va_arg_indirect_ref (valist);
7870 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7871 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7872 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7873
7874 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7875 if (indirect_p)
7876 type = build_pointer_type (type);
7877 size = int_size_in_bytes (type);
7878 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7879
7880 nat_mode = type_natural_mode (type, NULL);
7881 switch (nat_mode)
7882 {
7883 case V8SFmode:
7884 case V8SImode:
7885 case V32QImode:
7886 case V16HImode:
7887 case V4DFmode:
7888 case V4DImode:
7889 /* Unnamed 256bit vector mode parameters are passed on stack. */
7890 if (!TARGET_64BIT_MS_ABI)
7891 {
7892 container = NULL;
7893 break;
7894 }
7895
7896 default:
7897 container = construct_container (nat_mode, TYPE_MODE (type),
7898 type, 0, X86_64_REGPARM_MAX,
7899 X86_64_SSE_REGPARM_MAX, intreg,
7900 0);
7901 break;
7902 }
7903
7904 /* Pull the value out of the saved registers. */
7905
7906 addr = create_tmp_var (ptr_type_node, "addr");
7907
7908 if (container)
7909 {
7910 int needed_intregs, needed_sseregs;
7911 bool need_temp;
7912 tree int_addr, sse_addr;
7913
7914 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7915 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7916
7917 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7918
7919 need_temp = (!REG_P (container)
7920 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7921 || TYPE_ALIGN (type) > 128));
7922
7923 /* In case we are passing a structure, verify that it is a consecutive
7924 block in the register save area. If not, we need to do moves. */
7925 if (!need_temp && !REG_P (container))
7926 {
7927 /* Verify that all registers are strictly consecutive */
7928 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7929 {
7930 int i;
7931
7932 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7933 {
7934 rtx slot = XVECEXP (container, 0, i);
7935 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7936 || INTVAL (XEXP (slot, 1)) != i * 16)
7937 need_temp = 1;
7938 }
7939 }
7940 else
7941 {
7942 int i;
7943
7944 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7945 {
7946 rtx slot = XVECEXP (container, 0, i);
7947 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7948 || INTVAL (XEXP (slot, 1)) != i * 8)
7949 need_temp = 1;
7950 }
7951 }
7952 }
7953 if (!need_temp)
7954 {
7955 int_addr = addr;
7956 sse_addr = addr;
7957 }
7958 else
7959 {
7960 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7961 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7962 }
7963
7964 /* First ensure that we fit completely in registers. */
7965 if (needed_intregs)
7966 {
7967 t = build_int_cst (TREE_TYPE (gpr),
7968 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7969 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7970 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7971 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7972 gimplify_and_add (t, pre_p);
7973 }
7974 if (needed_sseregs)
7975 {
7976 t = build_int_cst (TREE_TYPE (fpr),
7977 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7978 + X86_64_REGPARM_MAX * 8);
7979 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7980 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7981 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7982 gimplify_and_add (t, pre_p);
7983 }
7984
7985 /* Compute index to start of area used for integer regs. */
7986 if (needed_intregs)
7987 {
7988 /* int_addr = gpr + sav; */
7989 t = fold_build_pointer_plus (sav, gpr);
7990 gimplify_assign (int_addr, t, pre_p);
7991 }
7992 if (needed_sseregs)
7993 {
7994 /* sse_addr = fpr + sav; */
7995 t = fold_build_pointer_plus (sav, fpr);
7996 gimplify_assign (sse_addr, t, pre_p);
7997 }
7998 if (need_temp)
7999 {
8000 int i, prev_size = 0;
8001 tree temp = create_tmp_var (type, "va_arg_tmp");
8002
8003 /* addr = &temp; */
8004 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8005 gimplify_assign (addr, t, pre_p);
8006
8007 for (i = 0; i < XVECLEN (container, 0); i++)
8008 {
8009 rtx slot = XVECEXP (container, 0, i);
8010 rtx reg = XEXP (slot, 0);
8011 enum machine_mode mode = GET_MODE (reg);
8012 tree piece_type;
8013 tree addr_type;
8014 tree daddr_type;
8015 tree src_addr, src;
8016 int src_offset;
8017 tree dest_addr, dest;
8018 int cur_size = GET_MODE_SIZE (mode);
8019
8020 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8021 prev_size = INTVAL (XEXP (slot, 1));
8022 if (prev_size + cur_size > size)
8023 {
8024 cur_size = size - prev_size;
8025 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8026 if (mode == BLKmode)
8027 mode = QImode;
8028 }
8029 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8030 if (mode == GET_MODE (reg))
8031 addr_type = build_pointer_type (piece_type);
8032 else
8033 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8034 true);
8035 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8036 true);
8037
8038 if (SSE_REGNO_P (REGNO (reg)))
8039 {
8040 src_addr = sse_addr;
8041 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8042 }
8043 else
8044 {
8045 src_addr = int_addr;
8046 src_offset = REGNO (reg) * 8;
8047 }
8048 src_addr = fold_convert (addr_type, src_addr);
8049 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8050
8051 dest_addr = fold_convert (daddr_type, addr);
8052 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8053 if (cur_size == GET_MODE_SIZE (mode))
8054 {
8055 src = build_va_arg_indirect_ref (src_addr);
8056 dest = build_va_arg_indirect_ref (dest_addr);
8057
8058 gimplify_assign (dest, src, pre_p);
8059 }
8060 else
8061 {
8062 tree copy
8063 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8064 3, dest_addr, src_addr,
8065 size_int (cur_size));
8066 gimplify_and_add (copy, pre_p);
8067 }
8068 prev_size += cur_size;
8069 }
8070 }
8071
8072 if (needed_intregs)
8073 {
8074 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8075 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8076 gimplify_assign (gpr, t, pre_p);
8077 }
8078
8079 if (needed_sseregs)
8080 {
8081 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8082 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8083 gimplify_assign (fpr, t, pre_p);
8084 }
8085
8086 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8087
8088 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8089 }
8090
8091 /* ... otherwise out of the overflow area. */
8092
8093 /* When we align a parameter on the stack for the caller, a parameter
8094 whose alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT will be
8095 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
8096 with the caller. */
8097 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8098 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8099 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8100
8101 /* Care for on-stack alignment if needed. */
8102 if (arg_boundary <= 64 || size == 0)
8103 t = ovf;
8104 else
8105 {
8106 HOST_WIDE_INT align = arg_boundary / 8;
8107 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8108 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8109 build_int_cst (TREE_TYPE (t), -align));
8110 }
8111
8112 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8113 gimplify_assign (addr, t, pre_p);
8114
8115 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8116 gimplify_assign (unshare_expr (ovf), t, pre_p);
8117
8118 if (container)
8119 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8120
8121 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8122 addr = fold_convert (ptrtype, addr);
8123
8124 if (indirect_p)
8125 addr = build_va_arg_indirect_ref (addr);
8126 return build_va_arg_indirect_ref (addr);
8127 }
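/* The gimple emitted above for a register-classified type behaves
   roughly like this pseudo-C (a sketch only; the real sequence also
   handles mixed INTEGER/SSE containers and over-aligned types):

     if (gp_offset >= 48 - 8 * (needed_intregs - 1))
       goto overflow;                     -- not enough GPRs left
     addr = reg_save_area + gp_offset;
     gp_offset += 8 * needed_intregs;
     goto done;
   overflow:
     addr = align (overflow_arg_area, arg_boundary);
     overflow_arg_area = addr + rsize * UNITS_PER_WORD;
   done:
     return *(type *) addr;  */
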
8128 \f
8129 /* Return true if OPNUM's MEM should be matched
8130 in movabs* patterns. */
8131
8132 bool
8133 ix86_check_movabs (rtx insn, int opnum)
8134 {
8135 rtx set, mem;
8136
8137 set = PATTERN (insn);
8138 if (GET_CODE (set) == PARALLEL)
8139 set = XVECEXP (set, 0, 0);
8140 gcc_assert (GET_CODE (set) == SET);
8141 mem = XEXP (set, opnum);
8142 while (GET_CODE (mem) == SUBREG)
8143 mem = SUBREG_REG (mem);
8144 gcc_assert (MEM_P (mem));
8145 return volatile_ok || !MEM_VOLATILE_P (mem);
8146 }
8147 \f
8148 /* Initialize the table of extra 80387 mathematical constants. */
8149
8150 static void
8151 init_ext_80387_constants (void)
8152 {
8153 static const char * cst[5] =
8154 {
8155 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8156 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8157 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8158 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8159 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8160 };
8161 int i;
8162
8163 for (i = 0; i < 5; i++)
8164 {
8165 real_from_string (&ext_80387_constants_table[i], cst[i]);
8166 /* Ensure each constant is rounded to XFmode precision. */
8167 real_convert (&ext_80387_constants_table[i],
8168 XFmode, &ext_80387_constants_table[i]);
8169 }
8170
8171 ext_80387_constants_init = 1;
8172 }
8173
8174 /* Return non-zero if the constant is something that
8175 can be loaded with a special instruction. */
8176
8177 int
8178 standard_80387_constant_p (rtx x)
8179 {
8180 enum machine_mode mode = GET_MODE (x);
8181
8182 REAL_VALUE_TYPE r;
8183
8184 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8185 return -1;
8186
8187 if (x == CONST0_RTX (mode))
8188 return 1;
8189 if (x == CONST1_RTX (mode))
8190 return 2;
8191
8192 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8193
8194 /* For XFmode constants, try to find a special 80387 instruction when
8195 optimizing for size or on those CPUs that benefit from them. */
8196 if (mode == XFmode
8197 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8198 {
8199 int i;
8200
8201 if (! ext_80387_constants_init)
8202 init_ext_80387_constants ();
8203
8204 for (i = 0; i < 5; i++)
8205 if (real_identical (&r, &ext_80387_constants_table[i]))
8206 return i + 3;
8207 }
8208
8209 /* Load of the constant -0.0 or -1.0 will be split as
8210 fldz;fchs or fld1;fchs sequence. */
8211 if (real_isnegzero (&r))
8212 return 8;
8213 if (real_identical (&r, &dconstm1))
8214 return 9;
8215
8216 return 0;
8217 }
8218
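/* Index values returned by standard_80387_constant_p, for reference:

     1  +0.0      fldz        6  log2(10)  fldl2t
     2  +1.0      fld1        7  pi        fldpi
     3  log10(2)  fldlg2      8  -0.0      fldz; fchs
     4  ln(2)     fldln2      9  -1.0      fld1; fchs
     5  log2(e)   fldl2e

   standard_80387_constant_opcode below maps these back to mnemonics;
   cases 8 and 9 are split into two instructions by the move patterns.  */
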
8219 /* Return the opcode of the special instruction to be used to load
8220 the constant X. */
8221
8222 const char *
8223 standard_80387_constant_opcode (rtx x)
8224 {
8225 switch (standard_80387_constant_p (x))
8226 {
8227 case 1:
8228 return "fldz";
8229 case 2:
8230 return "fld1";
8231 case 3:
8232 return "fldlg2";
8233 case 4:
8234 return "fldln2";
8235 case 5:
8236 return "fldl2e";
8237 case 6:
8238 return "fldl2t";
8239 case 7:
8240 return "fldpi";
8241 case 8:
8242 case 9:
8243 return "#";
8244 default:
8245 gcc_unreachable ();
8246 }
8247 }
8248
8249 /* Return the CONST_DOUBLE representing the 80387 constant that is
8250 loaded by the specified special instruction. The argument IDX
8251 matches the return value from standard_80387_constant_p. */
8252
8253 rtx
8254 standard_80387_constant_rtx (int idx)
8255 {
8256 int i;
8257
8258 if (! ext_80387_constants_init)
8259 init_ext_80387_constants ();
8260
8261 switch (idx)
8262 {
8263 case 3:
8264 case 4:
8265 case 5:
8266 case 6:
8267 case 7:
8268 i = idx - 3;
8269 break;
8270
8271 default:
8272 gcc_unreachable ();
8273 }
8274
8275 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8276 XFmode);
8277 }
8278
8279 /* Return 1 if X is all 0s and 2 if X is all 1s
8280 in a supported SSE/AVX vector mode. */
8281
8282 int
8283 standard_sse_constant_p (rtx x)
8284 {
8285 enum machine_mode mode = GET_MODE (x);
8286
8287 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8288 return 1;
8289 if (vector_all_ones_operand (x, mode))
8290 switch (mode)
8291 {
8292 case V16QImode:
8293 case V8HImode:
8294 case V4SImode:
8295 case V2DImode:
8296 if (TARGET_SSE2)
8297 return 2;
8298 case V32QImode:
8299 case V16HImode:
8300 case V8SImode:
8301 case V4DImode:
8302 if (TARGET_AVX2)
8303 return 2;
8304 default:
8305 break;
8306 }
8307
8308 return 0;
8309 }
8310
8311 /* Return the opcode of the special instruction to be used to load
8312 the constant X. */
8313
8314 const char *
8315 standard_sse_constant_opcode (rtx insn, rtx x)
8316 {
8317 switch (standard_sse_constant_p (x))
8318 {
8319 case 1:
8320 switch (get_attr_mode (insn))
8321 {
8322 case MODE_TI:
8323 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8324 return "%vpxor\t%0, %d0";
8325 case MODE_V2DF:
8326 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8327 return "%vxorpd\t%0, %d0";
8328 case MODE_V4SF:
8329 return "%vxorps\t%0, %d0";
8330
8331 case MODE_OI:
8332 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8333 return "vpxor\t%x0, %x0, %x0";
8334 case MODE_V4DF:
8335 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8336 return "vxorpd\t%x0, %x0, %x0";
8337 case MODE_V8SF:
8338 return "vxorps\t%x0, %x0, %x0";
8339
8340 default:
8341 break;
8342 }
8343
8344 case 2:
8345 if (TARGET_AVX)
8346 return "vpcmpeqd\t%0, %0, %0";
8347 else
8348 return "pcmpeqd\t%0, %0";
8349
8350 default:
8351 break;
8352 }
8353 gcc_unreachable ();
8354 }
8355
8356 /* Returns true if OP contains a symbol reference. */
8357
8358 bool
8359 symbolic_reference_mentioned_p (rtx op)
8360 {
8361 const char *fmt;
8362 int i;
8363
8364 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8365 return true;
8366
8367 fmt = GET_RTX_FORMAT (GET_CODE (op));
8368 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8369 {
8370 if (fmt[i] == 'E')
8371 {
8372 int j;
8373
8374 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8375 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8376 return true;
8377 }
8378
8379 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8380 return true;
8381 }
8382
8383 return false;
8384 }
8385
8386 /* Return true if it is appropriate to emit `ret' instructions in the
8387 body of a function. Do this only if the epilogue is simple, needing a
8388 couple of insns. Prior to reloading, we can't tell how many registers
8389 must be saved, so return false then. Return false if there is no frame
8390 marker to de-allocate. */
8391
8392 bool
8393 ix86_can_use_return_insn_p (void)
8394 {
8395 struct ix86_frame frame;
8396
8397 if (! reload_completed || frame_pointer_needed)
8398 return 0;
8399
8400 /* Don't allow more than 32k pop, since that's all we can do
8401 with one instruction. */
8402 if (crtl->args.pops_args && crtl->args.size >= 32768)
8403 return 0;
8404
8405 ix86_compute_frame_layout (&frame);
8406 return (frame.stack_pointer_offset == UNITS_PER_WORD
8407 && (frame.nregs + frame.nsseregs) == 0);
8408 }
8409 \f
8410 /* Value should be nonzero if functions must have frame pointers.
8411 Zero means the frame pointer need not be set up (and parms may
8412 be accessed via the stack pointer) in functions that seem suitable. */
8413
8414 static bool
8415 ix86_frame_pointer_required (void)
8416 {
8417 /* If we accessed previous frames, then the generated code expects
8418 to be able to access the saved ebp value in our frame. */
8419 if (cfun->machine->accesses_prev_frame)
8420 return true;
8421
8422 /* Several x86 OSes need a frame pointer for other reasons,
8423 usually pertaining to setjmp. */
8424 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8425 return true;
8426
8427 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8428 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8429 return true;
8430
8431 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8432 turns off the frame pointer by default. Turn it back on now if
8433 we've not got a leaf function. */
8434 if (TARGET_OMIT_LEAF_FRAME_POINTER
8435 && (!current_function_is_leaf
8436 || ix86_current_function_calls_tls_descriptor))
8437 return true;
8438
8439 if (crtl->profile && !flag_fentry)
8440 return true;
8441
8442 return false;
8443 }
8444
8445 /* Record that the current function accesses previous call frames. */
8446
8447 void
8448 ix86_setup_frame_addresses (void)
8449 {
8450 cfun->machine->accesses_prev_frame = 1;
8451 }
8452 \f
8453 #ifndef USE_HIDDEN_LINKONCE
8454 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8455 # define USE_HIDDEN_LINKONCE 1
8456 # else
8457 # define USE_HIDDEN_LINKONCE 0
8458 # endif
8459 #endif
8460
8461 static int pic_labels_used;
8462
8463 /* Fills in the label name that should be used for a pc thunk for
8464 the given register. */
8465
8466 static void
8467 get_pc_thunk_name (char name[32], unsigned int regno)
8468 {
8469 gcc_assert (!TARGET_64BIT);
8470
8471 if (USE_HIDDEN_LINKONCE)
8472 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8473 else
8474 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8475 }
8476
8477
8478 /* This function generates the pc thunks used for -fpic; each thunk
8479 loads its register with the return address of the caller and then returns. */
8480
8481 static void
8482 ix86_code_end (void)
8483 {
8484 rtx xops[2];
8485 int regno;
8486
8487 for (regno = AX_REG; regno <= SP_REG; regno++)
8488 {
8489 char name[32];
8490 tree decl;
8491
8492 if (!(pic_labels_used & (1 << regno)))
8493 continue;
8494
8495 get_pc_thunk_name (name, regno);
8496
8497 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8498 get_identifier (name),
8499 build_function_type_list (void_type_node, NULL_TREE));
8500 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8501 NULL_TREE, void_type_node);
8502 TREE_PUBLIC (decl) = 1;
8503 TREE_STATIC (decl) = 1;
8504
8505 #if TARGET_MACHO
8506 if (TARGET_MACHO)
8507 {
8508 switch_to_section (darwin_sections[text_coal_section]);
8509 fputs ("\t.weak_definition\t", asm_out_file);
8510 assemble_name (asm_out_file, name);
8511 fputs ("\n\t.private_extern\t", asm_out_file);
8512 assemble_name (asm_out_file, name);
8513 putc ('\n', asm_out_file);
8514 ASM_OUTPUT_LABEL (asm_out_file, name);
8515 DECL_WEAK (decl) = 1;
8516 }
8517 else
8518 #endif
8519 if (USE_HIDDEN_LINKONCE)
8520 {
8521 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8522
8523 targetm.asm_out.unique_section (decl, 0);
8524 switch_to_section (get_named_section (decl, NULL, 0));
8525
8526 targetm.asm_out.globalize_label (asm_out_file, name);
8527 fputs ("\t.hidden\t", asm_out_file);
8528 assemble_name (asm_out_file, name);
8529 putc ('\n', asm_out_file);
8530 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8531 }
8532 else
8533 {
8534 switch_to_section (text_section);
8535 ASM_OUTPUT_LABEL (asm_out_file, name);
8536 }
8537
8538 DECL_INITIAL (decl) = make_node (BLOCK);
8539 current_function_decl = decl;
8540 init_function_start (decl);
8541 first_function_block_is_cold = false;
8542 /* Make sure unwind info is emitted for the thunk if needed. */
8543 final_start_function (emit_barrier (), asm_out_file, 1);
8544
8545 /* Pad stack IP move with 4 instructions (two NOPs count
8546 as one instruction). */
8547 if (TARGET_PAD_SHORT_FUNCTION)
8548 {
8549 int i = 8;
8550
8551 while (i--)
8552 fputs ("\tnop\n", asm_out_file);
8553 }
8554
8555 xops[0] = gen_rtx_REG (Pmode, regno);
8556 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8557 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8558 fputs ("\tret\n", asm_out_file);
8559 final_end_function ();
8560 init_insn_lengths ();
8561 free_after_compilation (cfun);
8562 set_cfun (NULL);
8563 current_function_decl = NULL;
8564 }
8565
8566 if (flag_split_stack)
8567 file_end_indicate_split_stack ();
8568 }
8569
8570 /* Emit code for the SET_GOT patterns. */
8571
8572 const char *
8573 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8574 {
8575 rtx xops[3];
8576
8577 xops[0] = dest;
8578
8579 if (TARGET_VXWORKS_RTP && flag_pic)
8580 {
8581 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8582 xops[2] = gen_rtx_MEM (Pmode,
8583 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8584 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8585
8586 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8587 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8588 an unadorned address. */
8589 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8590 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8591 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8592 return "";
8593 }
8594
8595 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8596
8597 if (!flag_pic)
8598 {
8599 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8600
8601 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8602
8603 #if TARGET_MACHO
8604 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8605 is what will be referenced by the Mach-O PIC subsystem. */
8606 if (!label)
8607 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8608 #endif
8609
8610 targetm.asm_out.internal_label (asm_out_file, "L",
8611 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8612 }
8613 else
8614 {
8615 char name[32];
8616 get_pc_thunk_name (name, REGNO (dest));
8617 pic_labels_used |= 1 << REGNO (dest);
8618
8619 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8620 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8621 output_asm_insn ("call\t%X2", xops);
8622 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8623 is what will be referenced by the Mach-O PIC subsystem. */
8624 #if TARGET_MACHO
8625 if (!label)
8626 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8627 else
8628 targetm.asm_out.internal_label (asm_out_file, "L",
8629 CODE_LABEL_NUMBER (label));
8630 #endif
8631 }
8632
8633 if (!TARGET_MACHO)
8634 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8635
8636 return "";
8637 }
8638
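/* A typical 32-bit PIC prologue produced by the SET_GOT pattern above
   together with the thunk emitted by ix86_code_end (register choice may
   vary; this is only the usual -fpic shape):

       call   __x86.get_pc_thunk.bx
       addl   $_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk body is simply

   __x86.get_pc_thunk.bx:
       movl   (%esp), %ebx
       ret                                                              */
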
8639 /* Generate a "push" pattern for input ARG. */
8640
8641 static rtx
8642 gen_push (rtx arg)
8643 {
8644 struct machine_function *m = cfun->machine;
8645
8646 if (m->fs.cfa_reg == stack_pointer_rtx)
8647 m->fs.cfa_offset += UNITS_PER_WORD;
8648 m->fs.sp_offset += UNITS_PER_WORD;
8649
8650 return gen_rtx_SET (VOIDmode,
8651 gen_rtx_MEM (Pmode,
8652 gen_rtx_PRE_DEC (Pmode,
8653 stack_pointer_rtx)),
8654 arg);
8655 }
8656
8657 /* Generate a "pop" pattern for input ARG. */
8658
8659 static rtx
8660 gen_pop (rtx arg)
8661 {
8662 return gen_rtx_SET (VOIDmode,
8663 arg,
8664 gen_rtx_MEM (Pmode,
8665 gen_rtx_POST_INC (Pmode,
8666 stack_pointer_rtx)));
8667 }
8668
8669 /* Return >= 0 if there is an unused call-clobbered register available
8670 for the entire function. */
8671
8672 static unsigned int
8673 ix86_select_alt_pic_regnum (void)
8674 {
8675 if (current_function_is_leaf
8676 && !crtl->profile
8677 && !ix86_current_function_calls_tls_descriptor)
8678 {
8679 int i, drap;
8680 /* Can't use the same register for both PIC and DRAP. */
8681 if (crtl->drap_reg)
8682 drap = REGNO (crtl->drap_reg);
8683 else
8684 drap = -1;
8685 for (i = 2; i >= 0; --i)
8686 if (i != drap && !df_regs_ever_live_p (i))
8687 return i;
8688 }
8689
8690 return INVALID_REGNUM;
8691 }
8692
8693 /* Return TRUE if we need to save REGNO. */
8694
8695 static bool
8696 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8697 {
8698 if (pic_offset_table_rtx
8699 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8700 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8701 || crtl->profile
8702 || crtl->calls_eh_return
8703 || crtl->uses_const_pool))
8704 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8705
8706 if (crtl->calls_eh_return && maybe_eh_return)
8707 {
8708 unsigned i;
8709 for (i = 0; ; i++)
8710 {
8711 unsigned test = EH_RETURN_DATA_REGNO (i);
8712 if (test == INVALID_REGNUM)
8713 break;
8714 if (test == regno)
8715 return true;
8716 }
8717 }
8718
8719 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8720 return true;
8721
8722 return (df_regs_ever_live_p (regno)
8723 && !call_used_regs[regno]
8724 && !fixed_regs[regno]
8725 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8726 }
8727
8728 /* Return the number of saved general purpose registers. */
8729
8730 static int
8731 ix86_nsaved_regs (void)
8732 {
8733 int nregs = 0;
8734 int regno;
8735
8736 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8737 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8738 nregs ++;
8739 return nregs;
8740 }
8741
8742 /* Return the number of saved SSE registers. */
8743
8744 static int
8745 ix86_nsaved_sseregs (void)
8746 {
8747 int nregs = 0;
8748 int regno;
8749
8750 if (!TARGET_64BIT_MS_ABI)
8751 return 0;
8752 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8753 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8754 nregs ++;
8755 return nregs;
8756 }
8757
8758 /* Given FROM and TO register numbers, say whether this elimination is
8759 allowed. If stack alignment is needed, we can only replace argument
8760 pointer with hard frame pointer, or replace frame pointer with stack
8761 pointer. Otherwise, frame pointer elimination is automatically
8762 handled and all other eliminations are valid. */
8763
8764 static bool
8765 ix86_can_eliminate (const int from, const int to)
8766 {
8767 if (stack_realign_fp)
8768 return ((from == ARG_POINTER_REGNUM
8769 && to == HARD_FRAME_POINTER_REGNUM)
8770 || (from == FRAME_POINTER_REGNUM
8771 && to == STACK_POINTER_REGNUM));
8772 else
8773 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8774 }
8775
8776 /* Return the offset between two registers, one to be eliminated, and the other
8777 its replacement, at the start of a routine. */
8778
8779 HOST_WIDE_INT
8780 ix86_initial_elimination_offset (int from, int to)
8781 {
8782 struct ix86_frame frame;
8783 ix86_compute_frame_layout (&frame);
8784
8785 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8786 return frame.hard_frame_pointer_offset;
8787 else if (from == FRAME_POINTER_REGNUM
8788 && to == HARD_FRAME_POINTER_REGNUM)
8789 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8790 else
8791 {
8792 gcc_assert (to == STACK_POINTER_REGNUM);
8793
8794 if (from == ARG_POINTER_REGNUM)
8795 return frame.stack_pointer_offset;
8796
8797 gcc_assert (from == FRAME_POINTER_REGNUM);
8798 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8799 }
8800 }
8801
8802 /* In a dynamically-aligned function, we can't know the offset from
8803 stack pointer to frame pointer, so we must ensure that setjmp
8804 eliminates fp against the hard fp (%ebp) rather than trying to
8805 index from %esp up to the top of the frame across a gap that is
8806 of unknown (at compile-time) size. */
8807 static rtx
8808 ix86_builtin_setjmp_frame_value (void)
8809 {
8810 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8811 }
8812
8813 /* When using -fsplit-stack, the allocation routines set a field in
8814 the TCB to the bottom of the stack plus this much space, measured
8815 in bytes. */
8816
8817 #define SPLIT_STACK_AVAILABLE 256
8818
8819 /* Fill structure ix86_frame about frame of currently computed function. */
8820
8821 static void
8822 ix86_compute_frame_layout (struct ix86_frame *frame)
8823 {
8824 unsigned int stack_alignment_needed;
8825 HOST_WIDE_INT offset;
8826 unsigned int preferred_alignment;
8827 HOST_WIDE_INT size = get_frame_size ();
8828 HOST_WIDE_INT to_allocate;
8829
8830 frame->nregs = ix86_nsaved_regs ();
8831 frame->nsseregs = ix86_nsaved_sseregs ();
8832
8833 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8834 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8835
8836 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8837 except in function prologues and in leaf functions. */
8838 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8839 && (!current_function_is_leaf || cfun->calls_alloca != 0
8840 || ix86_current_function_calls_tls_descriptor))
8841 {
8842 preferred_alignment = 16;
8843 stack_alignment_needed = 16;
8844 crtl->preferred_stack_boundary = 128;
8845 crtl->stack_alignment_needed = 128;
8846 }
8847
8848 gcc_assert (!size || stack_alignment_needed);
8849 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8850 gcc_assert (preferred_alignment <= stack_alignment_needed);
8851
8852 /* For SEH we have to limit the amount of code movement into the prologue.
8853 At present we do this via a BLOCKAGE, at which point there's very little
8854 scheduling that can be done, which means that there's very little point
8855 in doing anything except PUSHs. */
8856 if (TARGET_SEH)
8857 cfun->machine->use_fast_prologue_epilogue = false;
8858
8859 /* During reload the number of saved registers can change. Recompute the
8860 value as needed. Do not recompute when the number of registers did not
8861 change, as reload calls this function multiple times and does not expect
8862 the decision to change within a single iteration. */
8863 else if (!optimize_function_for_size_p (cfun)
8864 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8865 {
8866 int count = frame->nregs;
8867 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8868
8869 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8870
8871 /* The fast prologue uses move instead of push to save registers. This
8872 is significantly longer, but also executes faster as modern hardware
8873 can execute the moves in parallel, but can't do that for push/pop.
8874
8875 Be careful about choosing which prologue to emit: when the function takes
8876 many instructions to execute, we may as well use the slow version, and
8877 likewise when the function is known to be outside a hot spot (which is
8878 known only with profile feedback). Weight the size of the function by the
8879 number of registers to save, since it is cheap to use one or two push
8880 instructions but very slow to use many of them. */
8881 if (count)
8882 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8883 if (node->frequency < NODE_FREQUENCY_NORMAL
8884 || (flag_branch_probabilities
8885 && node->frequency < NODE_FREQUENCY_HOT))
8886 cfun->machine->use_fast_prologue_epilogue = false;
8887 else
8888 cfun->machine->use_fast_prologue_epilogue
8889 = !expensive_function_p (count);
8890 }
8891
8892 frame->save_regs_using_mov
8893 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8894 /* If static stack checking is enabled and done with probes,
8895 the registers need to be saved before allocating the frame. */
8896 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8897
8898 /* Skip return address. */
8899 offset = UNITS_PER_WORD;
8900
8901 /* Skip pushed static chain. */
8902 if (ix86_static_chain_on_stack)
8903 offset += UNITS_PER_WORD;
8904
8905 /* Skip saved base pointer. */
8906 if (frame_pointer_needed)
8907 offset += UNITS_PER_WORD;
8908 frame->hfp_save_offset = offset;
8909
8910 /* The traditional frame pointer location is at the top of the frame. */
8911 frame->hard_frame_pointer_offset = offset;
8912
8913 /* Register save area */
8914 offset += frame->nregs * UNITS_PER_WORD;
8915 frame->reg_save_offset = offset;
8916
8917 /* Align and set SSE register save area. */
8918 if (frame->nsseregs)
8919 {
8920 /* The only ABI that has saved SSE registers (Win64) also has a
8921 16-byte aligned default stack, and thus we don't need to be
8922 within the re-aligned local stack frame to save them. */
8923 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8924 offset = (offset + 16 - 1) & -16;
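/* The expression above rounds OFFSET up to the next multiple of 16;
   e.g. an offset of 20 becomes (20 + 15) & -16 == 32.  */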
8925 offset += frame->nsseregs * 16;
8926 }
8927 frame->sse_reg_save_offset = offset;
8928
8929 /* The re-aligned stack starts here. Values before this point are not
8930 directly comparable with values below this point. In order to make
8931 sure that no value happens to be the same before and after, force
8932 the alignment computation below to add a non-zero value. */
8933 if (stack_realign_fp)
8934 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8935
8936 /* Va-arg area */
8937 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8938 offset += frame->va_arg_size;
8939
8940 /* Align start of frame for local function. */
8941 if (stack_realign_fp
8942 || offset != frame->sse_reg_save_offset
8943 || size != 0
8944 || !current_function_is_leaf
8945 || cfun->calls_alloca
8946 || ix86_current_function_calls_tls_descriptor)
8947 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8948
8949 /* Frame pointer points here. */
8950 frame->frame_pointer_offset = offset;
8951
8952 offset += size;
8953
8954 /* Add outgoing arguments area. Can be skipped if we eliminated
8955 all the function calls as dead code.
8956 Skipping is however impossible when function calls alloca. Alloca
8957 expander assumes that last crtl->outgoing_args_size
8958 of stack frame are unused. */
8959 if (ACCUMULATE_OUTGOING_ARGS
8960 && (!current_function_is_leaf || cfun->calls_alloca
8961 || ix86_current_function_calls_tls_descriptor))
8962 {
8963 offset += crtl->outgoing_args_size;
8964 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8965 }
8966 else
8967 frame->outgoing_arguments_size = 0;
8968
8969 /* Align stack boundary. Only needed if we're calling another function
8970 or using alloca. */
8971 if (!current_function_is_leaf || cfun->calls_alloca
8972 || ix86_current_function_calls_tls_descriptor)
8973 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8974
8975 /* We've reached end of stack frame. */
8976 frame->stack_pointer_offset = offset;
8977
8978 /* Size prologue needs to allocate. */
8979 to_allocate = offset - frame->sse_reg_save_offset;
8980
8981 if ((!to_allocate && frame->nregs <= 1)
8982 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8983 frame->save_regs_using_mov = false;
8984
8985 if (ix86_using_red_zone ()
8986 && current_function_sp_is_unchanging
8987 && current_function_is_leaf
8988 && !ix86_current_function_calls_tls_descriptor)
8989 {
8990 frame->red_zone_size = to_allocate;
8991 if (frame->save_regs_using_mov)
8992 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8993 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8994 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8995 }
8996 else
8997 frame->red_zone_size = 0;
8998 frame->stack_pointer_offset -= frame->red_zone_size;
8999
9000 /* The SEH frame pointer location is near the bottom of the frame.
9001 This is enforced by the fact that the difference between the
9002 stack pointer and the frame pointer is limited to 240 bytes in
9003 the unwind data structure. */
9004 if (TARGET_SEH)
9005 {
9006 HOST_WIDE_INT diff;
9007
9008 /* If we can leave the frame pointer where it is, do so. */
9009 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9010 if (diff > 240 || (diff & 15) != 0)
9011 {
9012 /* Ideally we'd determine what portion of the local stack frame
9013 (within the constraint of the lowest 240) is most heavily used.
9014 But without that complication, simply bias the frame pointer
9015 by 128 bytes so as to maximize the amount of the local stack
9016 frame that is addressable with 8-bit offsets. */
9017 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9018 }
9019 }
9020 }
9021
9022 /* This is semi-inlined memory_address_length, but simplified
9023 since we know that we're always dealing with reg+offset, and
9024 to avoid having to create and discard all that rtl. */
9025
9026 static inline int
9027 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9028 {
9029 int len = 4;
9030
9031 if (offset == 0)
9032 {
9033 /* EBP and R13 cannot be encoded without an offset. */
9034 len = (regno == BP_REG || regno == R13_REG);
9035 }
9036 else if (IN_RANGE (offset, -128, 127))
9037 len = 1;
9038
9039 /* ESP and R12 must be encoded with a SIB byte. */
9040 if (regno == SP_REG || regno == R12_REG)
9041 len++;
9042
9043 return len;
9044 }
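/* A few illustrative values for the length computed above (a sketch,
   not exhaustive): 0(%eax) needs no displacement byte (0), 0(%ebp) and
   0(%r13) need a zero disp8 (1), 8(%esp) needs a disp8 plus a SIB byte
   (2), and 512(%ebx) needs a full disp32 (4).  */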
9045
9046 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9047 The valid base registers are taken from CFUN->MACHINE->FS. */
9048
9049 static rtx
9050 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9051 {
9052 const struct machine_function *m = cfun->machine;
9053 rtx base_reg = NULL;
9054 HOST_WIDE_INT base_offset = 0;
9055
9056 if (m->use_fast_prologue_epilogue)
9057 {
9058 /* Choose the base register most likely to allow the most scheduling
9059 opportunities. Generally FP is valid throughout the function,
9060 while DRAP must be reloaded within the epilogue. But choose either
9061 over the SP due to increased encoding size. */
9062
9063 if (m->fs.fp_valid)
9064 {
9065 base_reg = hard_frame_pointer_rtx;
9066 base_offset = m->fs.fp_offset - cfa_offset;
9067 }
9068 else if (m->fs.drap_valid)
9069 {
9070 base_reg = crtl->drap_reg;
9071 base_offset = 0 - cfa_offset;
9072 }
9073 else if (m->fs.sp_valid)
9074 {
9075 base_reg = stack_pointer_rtx;
9076 base_offset = m->fs.sp_offset - cfa_offset;
9077 }
9078 }
9079 else
9080 {
9081 HOST_WIDE_INT toffset;
9082 int len = 16, tlen;
9083
9084 /* Choose the base register with the smallest address encoding.
9085 With a tie, choose FP > DRAP > SP. */
9086 if (m->fs.sp_valid)
9087 {
9088 base_reg = stack_pointer_rtx;
9089 base_offset = m->fs.sp_offset - cfa_offset;
9090 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9091 }
9092 if (m->fs.drap_valid)
9093 {
9094 toffset = 0 - cfa_offset;
9095 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9096 if (tlen <= len)
9097 {
9098 base_reg = crtl->drap_reg;
9099 base_offset = toffset;
9100 len = tlen;
9101 }
9102 }
9103 if (m->fs.fp_valid)
9104 {
9105 toffset = m->fs.fp_offset - cfa_offset;
9106 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9107 if (tlen <= len)
9108 {
9109 base_reg = hard_frame_pointer_rtx;
9110 base_offset = toffset;
9111 len = tlen;
9112 }
9113 }
9114 }
9115 gcc_assert (base_reg != NULL);
9116
9117 return plus_constant (base_reg, base_offset);
9118 }
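/* choose_baseaddr is used by the register save code and the prologue below
   to form frame addresses such as -8(%rbp) or 16(%rsp) (illustrative
   examples), picking whichever of SP, FP or DRAP is currently valid and,
   outside the fast prologue case, has the shortest encoding.  */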
9119
9120 /* Emit code to save registers in the prologue. */
9121
9122 static void
9123 ix86_emit_save_regs (void)
9124 {
9125 unsigned int regno;
9126 rtx insn;
9127
9128 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9129 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9130 {
9131 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9132 RTX_FRAME_RELATED_P (insn) = 1;
9133 }
9134 }
9135
9136 /* Emit a single register save at CFA - CFA_OFFSET. */
9137
9138 static void
9139 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9140 HOST_WIDE_INT cfa_offset)
9141 {
9142 struct machine_function *m = cfun->machine;
9143 rtx reg = gen_rtx_REG (mode, regno);
9144 rtx mem, addr, base, insn;
9145
9146 addr = choose_baseaddr (cfa_offset);
9147 mem = gen_frame_mem (mode, addr);
9148
9149 /* For SSE saves, we need to indicate the 128-bit alignment. */
9150 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9151
9152 insn = emit_move_insn (mem, reg);
9153 RTX_FRAME_RELATED_P (insn) = 1;
9154
9155 base = addr;
9156 if (GET_CODE (base) == PLUS)
9157 base = XEXP (base, 0);
9158 gcc_checking_assert (REG_P (base));
9159
9160 /* When saving registers into a re-aligned local stack frame, avoid
9161 any tricky guessing by dwarf2out. */
9162 if (m->fs.realigned)
9163 {
9164 gcc_checking_assert (stack_realign_drap);
9165
9166 if (regno == REGNO (crtl->drap_reg))
9167 {
9168 /* A bit of a hack. We force the DRAP register to be saved in
9169 the re-aligned stack frame, which provides us with a copy
9170 of the CFA that will last past the prologue. Install it. */
9171 gcc_checking_assert (cfun->machine->fs.fp_valid);
9172 addr = plus_constant (hard_frame_pointer_rtx,
9173 cfun->machine->fs.fp_offset - cfa_offset);
9174 mem = gen_rtx_MEM (mode, addr);
9175 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9176 }
9177 else
9178 {
9179 /* The frame pointer is a stable reference within the
9180 aligned frame. Use it. */
9181 gcc_checking_assert (cfun->machine->fs.fp_valid);
9182 addr = plus_constant (hard_frame_pointer_rtx,
9183 cfun->machine->fs.fp_offset - cfa_offset);
9184 mem = gen_rtx_MEM (mode, addr);
9185 add_reg_note (insn, REG_CFA_EXPRESSION,
9186 gen_rtx_SET (VOIDmode, mem, reg));
9187 }
9188 }
9189
9190 /* The memory may not be relative to the current CFA register,
9191 which means that we may need to generate a new pattern for
9192 use by the unwind info. */
9193 else if (base != m->fs.cfa_reg)
9194 {
9195 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9196 mem = gen_rtx_MEM (mode, addr);
9197 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9198 }
9199 }
9200
9201 /* Emit code to save registers using MOV insns.
9202 First register is stored at CFA - CFA_OFFSET. */
9203 static void
9204 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9205 {
9206 unsigned int regno;
9207
9208 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9209 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9210 {
9211 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9212 cfa_offset -= UNITS_PER_WORD;
9213 }
9214 }
9215
9216 /* Emit code to save SSE registers using MOV insns.
9217 First register is stored at CFA - CFA_OFFSET. */
9218 static void
9219 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9220 {
9221 unsigned int regno;
9222
9223 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9224 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9225 {
9226 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9227 cfa_offset -= 16;
9228 }
9229 }
9230
9231 static GTY(()) rtx queued_cfa_restores;
9232
9233 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9234 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9235 Don't add the note if the previously saved value will be left untouched
9236 within the stack red zone until return, as unwinders can find the same value
9237 in the register and on the stack. */
9238
9239 static void
9240 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9241 {
9242 if (!crtl->shrink_wrapped
9243 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9244 return;
9245
9246 if (insn)
9247 {
9248 add_reg_note (insn, REG_CFA_RESTORE, reg);
9249 RTX_FRAME_RELATED_P (insn) = 1;
9250 }
9251 else
9252 queued_cfa_restores
9253 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9254 }
9255
9256 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9257
9258 static void
9259 ix86_add_queued_cfa_restore_notes (rtx insn)
9260 {
9261 rtx last;
9262 if (!queued_cfa_restores)
9263 return;
9264 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9265 ;
9266 XEXP (last, 1) = REG_NOTES (insn);
9267 REG_NOTES (insn) = queued_cfa_restores;
9268 queued_cfa_restores = NULL_RTX;
9269 RTX_FRAME_RELATED_P (insn) = 1;
9270 }
9271
9272 /* Expand prologue or epilogue stack adjustment.
9273 The pattern exists to put a dependency on all ebp-based memory accesses.
9274 STYLE should be negative if instructions should be marked as frame related,
9275 zero if %r11 register is live and cannot be freely used and positive
9276 otherwise. */
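/* A typical use, as seen in ix86_expand_prologue below:
     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-allocate), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);
   allocates the local frame with a single frame-related stack adjustment
   while keeping the m->fs frame-state tracking up to date.  */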
9277
9278 static void
9279 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9280 int style, bool set_cfa)
9281 {
9282 struct machine_function *m = cfun->machine;
9283 rtx insn;
9284 bool add_frame_related_expr = false;
9285
9286 if (! TARGET_64BIT)
9287 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9288 else if (x86_64_immediate_operand (offset, DImode))
9289 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9290 else
9291 {
9292 rtx tmp;
9293 /* r11 is used by indirect sibcall return as well; it is set before the
9294 epilogue and used after the epilogue. */
9295 if (style)
9296 tmp = gen_rtx_REG (DImode, R11_REG);
9297 else
9298 {
9299 gcc_assert (src != hard_frame_pointer_rtx
9300 && dest != hard_frame_pointer_rtx);
9301 tmp = hard_frame_pointer_rtx;
9302 }
9303 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9304 if (style < 0)
9305 add_frame_related_expr = true;
9306
9307 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9308 }
9309
9310 insn = emit_insn (insn);
9311 if (style >= 0)
9312 ix86_add_queued_cfa_restore_notes (insn);
9313
9314 if (set_cfa)
9315 {
9316 rtx r;
9317
9318 gcc_assert (m->fs.cfa_reg == src);
9319 m->fs.cfa_offset += INTVAL (offset);
9320 m->fs.cfa_reg = dest;
9321
9322 r = gen_rtx_PLUS (Pmode, src, offset);
9323 r = gen_rtx_SET (VOIDmode, dest, r);
9324 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9325 RTX_FRAME_RELATED_P (insn) = 1;
9326 }
9327 else if (style < 0)
9328 {
9329 RTX_FRAME_RELATED_P (insn) = 1;
9330 if (add_frame_related_expr)
9331 {
9332 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9333 r = gen_rtx_SET (VOIDmode, dest, r);
9334 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9335 }
9336 }
9337
9338 if (dest == stack_pointer_rtx)
9339 {
9340 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9341 bool valid = m->fs.sp_valid;
9342
9343 if (src == hard_frame_pointer_rtx)
9344 {
9345 valid = m->fs.fp_valid;
9346 ooffset = m->fs.fp_offset;
9347 }
9348 else if (src == crtl->drap_reg)
9349 {
9350 valid = m->fs.drap_valid;
9351 ooffset = 0;
9352 }
9353 else
9354 {
9355 /* Otherwise there are two possibilities: SP itself, which we set
9356 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9357 taken care of by hand along the eh_return path. */
9358 gcc_checking_assert (src == stack_pointer_rtx
9359 || offset == const0_rtx);
9360 }
9361
9362 m->fs.sp_offset = ooffset - INTVAL (offset);
9363 m->fs.sp_valid = valid;
9364 }
9365 }
9366
9367 /* Find an available register to be used as the dynamic realign argument
9368 pointer register. Such a register will be written in the prologue and
9369 used at the beginning of the body, so it must not be
9370 1. a parameter passing register.
9371 2. the GOT pointer.
9372 We reuse the static-chain register if it is available. Otherwise, we
9373 use DI for i386 and R13 for x86-64. We chose R13 since it has
9374 shorter encoding.
9375
9376 Return: the regno of chosen register. */
9377
9378 static unsigned int
9379 find_drap_reg (void)
9380 {
9381 tree decl = cfun->decl;
9382
9383 if (TARGET_64BIT)
9384 {
9385 /* Use R13 for a nested function or a function that needs a static
9386 chain. Since a function with a tail call may use any caller-saved
9387 register in the epilogue, DRAP must not use a caller-saved
9388 register in that case. */
9389 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9390 return R13_REG;
9391
9392 return R10_REG;
9393 }
9394 else
9395 {
9396 /* Use DI for a nested function or a function that needs a static
9397 chain. Since a function with a tail call may use any caller-saved
9398 register in the epilogue, DRAP must not use a caller-saved
9399 register in that case. */
9400 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9401 return DI_REG;
9402
9403 /* Reuse static chain register if it isn't used for parameter
9404 passing. */
9405 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9406 {
9407 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9408 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9409 return CX_REG;
9410 }
9411 return DI_REG;
9412 }
9413 }
9414
9415 /* Return minimum incoming stack alignment. */
9416
9417 static unsigned int
9418 ix86_minimum_incoming_stack_boundary (bool sibcall)
9419 {
9420 unsigned int incoming_stack_boundary;
9421
9422 /* Prefer the one specified at command line. */
9423 if (ix86_user_incoming_stack_boundary)
9424 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9425 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9426 when -mstackrealign is used, this is not a sibcall check, and the
9427 estimated stack alignment is 128 bits. */
9428 else if (!sibcall
9429 && !TARGET_64BIT
9430 && ix86_force_align_arg_pointer
9431 && crtl->stack_alignment_estimated == 128)
9432 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9433 else
9434 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9435
9436 /* Incoming stack alignment can be changed on individual functions
9437 via force_align_arg_pointer attribute. We use the smallest
9438 incoming stack boundary. */
9439 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9440 && lookup_attribute (ix86_force_align_arg_pointer_string,
9441 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9442 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9443
9444 /* The incoming stack frame has to be aligned at least at
9445 parm_stack_boundary. */
9446 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9447 incoming_stack_boundary = crtl->parm_stack_boundary;
9448
9449 /* The stack at the entrance of main is aligned by the runtime. We use the
9450 smallest incoming stack boundary. */
9451 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9452 && DECL_NAME (current_function_decl)
9453 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9454 && DECL_FILE_SCOPE_P (current_function_decl))
9455 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9456
9457 return incoming_stack_boundary;
9458 }
9459
9460 /* Update incoming stack boundary and estimated stack alignment. */
9461
9462 static void
9463 ix86_update_stack_boundary (void)
9464 {
9465 ix86_incoming_stack_boundary
9466 = ix86_minimum_incoming_stack_boundary (false);
9467
9468 /* x86_64 varargs functions need 16-byte stack alignment for the register
9469 save area. */
9470 if (TARGET_64BIT
9471 && cfun->stdarg
9472 && crtl->stack_alignment_estimated < 128)
9473 crtl->stack_alignment_estimated = 128;
9474 }
9475
9476 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9477 needed or an rtx for DRAP otherwise. */
9478
9479 static rtx
9480 ix86_get_drap_rtx (void)
9481 {
9482 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9483 crtl->need_drap = true;
9484
9485 if (stack_realign_drap)
9486 {
9487 /* Assign DRAP to vDRAP and return vDRAP. */
9488 unsigned int regno = find_drap_reg ();
9489 rtx drap_vreg;
9490 rtx arg_ptr;
9491 rtx seq, insn;
9492
9493 arg_ptr = gen_rtx_REG (Pmode, regno);
9494 crtl->drap_reg = arg_ptr;
9495
9496 start_sequence ();
9497 drap_vreg = copy_to_reg (arg_ptr);
9498 seq = get_insns ();
9499 end_sequence ();
9500
9501 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9502 if (!optimize)
9503 {
9504 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9505 RTX_FRAME_RELATED_P (insn) = 1;
9506 }
9507 return drap_vreg;
9508 }
9509 else
9510 return NULL;
9511 }
9512
9513 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9514
9515 static rtx
9516 ix86_internal_arg_pointer (void)
9517 {
9518 return virtual_incoming_args_rtx;
9519 }
9520
9521 struct scratch_reg {
9522 rtx reg;
9523 bool saved;
9524 };
9525
9526 /* Return a short-lived scratch register for use on function entry.
9527 In 32-bit mode, it is valid only after the registers are saved
9528 in the prologue. This register must be released by means of
9529 release_scratch_register_on_entry once it is dead. */
9530
9531 static void
9532 get_scratch_register_on_entry (struct scratch_reg *sr)
9533 {
9534 int regno;
9535
9536 sr->saved = false;
9537
9538 if (TARGET_64BIT)
9539 {
9540 /* We always use R11 in 64-bit mode. */
9541 regno = R11_REG;
9542 }
9543 else
9544 {
9545 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9546 bool fastcall_p
9547 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9548 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9549 int regparm = ix86_function_regparm (fntype, decl);
9550 int drap_regno
9551 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9552
9553 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9554 for the static chain register. */
9555 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9556 && drap_regno != AX_REG)
9557 regno = AX_REG;
9558 else if (regparm < 2 && drap_regno != DX_REG)
9559 regno = DX_REG;
9560 /* ecx is the static chain register. */
9561 else if (regparm < 3 && !fastcall_p && !static_chain_p
9562 && drap_regno != CX_REG)
9563 regno = CX_REG;
9564 else if (ix86_save_reg (BX_REG, true))
9565 regno = BX_REG;
9566 /* esi is the static chain register. */
9567 else if (!(regparm == 3 && static_chain_p)
9568 && ix86_save_reg (SI_REG, true))
9569 regno = SI_REG;
9570 else if (ix86_save_reg (DI_REG, true))
9571 regno = DI_REG;
9572 else
9573 {
9574 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9575 sr->saved = true;
9576 }
9577 }
9578
9579 sr->reg = gen_rtx_REG (Pmode, regno);
9580 if (sr->saved)
9581 {
9582 rtx insn = emit_insn (gen_push (sr->reg));
9583 RTX_FRAME_RELATED_P (insn) = 1;
9584 }
9585 }
9586
9587 /* Release a scratch register obtained from the preceding function. */
9588
9589 static void
9590 release_scratch_register_on_entry (struct scratch_reg *sr)
9591 {
9592 if (sr->saved)
9593 {
9594 rtx x, insn = emit_insn (gen_pop (sr->reg));
9595
9596 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9597 RTX_FRAME_RELATED_P (insn) = 1;
9598 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9599 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9600 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9601 }
9602 }
9603
9604 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
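/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096 bytes,
   i.e. one page; the worked examples in the comments below assume that
   value.  */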
9605
9606 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9607
9608 static void
9609 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9610 {
9611 /* We skip the probe for the first interval + a small dope of 4 words and
9612 probe that many bytes past the specified size to maintain a protection
9613 area at the bottom of the stack. */
9614 const int dope = 4 * UNITS_PER_WORD;
9615 rtx size_rtx = GEN_INT (size), last;
9616
9617 /* See if we have a constant small number of probes to generate. If so,
9618 that's the easy case. The run-time loop is made up of 11 insns in the
9619 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9620 for n # of intervals. */
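/* A worked example, assuming a 4 KB probe interval and 8-byte words (so
   the dope is 32 bytes): for SIZE == 10000 the constant case below
   adjusts SP down by 8224, 4096 and finally 1808 bytes, probing after
   each adjustment, and then adjusts back up by 4128 bytes, for a net
   decrease of exactly 10000.  */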
9621 if (size <= 5 * PROBE_INTERVAL)
9622 {
9623 HOST_WIDE_INT i, adjust;
9624 bool first_probe = true;
9625
9626 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9627 values of N from 1 until it exceeds SIZE. If only one probe is
9628 needed, this will not generate any code. Then adjust and probe
9629 to PROBE_INTERVAL + SIZE. */
9630 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9631 {
9632 if (first_probe)
9633 {
9634 adjust = 2 * PROBE_INTERVAL + dope;
9635 first_probe = false;
9636 }
9637 else
9638 adjust = PROBE_INTERVAL;
9639
9640 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9641 plus_constant (stack_pointer_rtx, -adjust)));
9642 emit_stack_probe (stack_pointer_rtx);
9643 }
9644
9645 if (first_probe)
9646 adjust = size + PROBE_INTERVAL + dope;
9647 else
9648 adjust = size + PROBE_INTERVAL - i;
9649
9650 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9651 plus_constant (stack_pointer_rtx, -adjust)));
9652 emit_stack_probe (stack_pointer_rtx);
9653
9654 /* Adjust back to account for the additional first interval. */
9655 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9656 plus_constant (stack_pointer_rtx,
9657 PROBE_INTERVAL + dope)));
9658 }
9659
9660 /* Otherwise, do the same as above, but in a loop. Note that we must be
9661 extra careful with variables wrapping around because we might be at
9662 the very top (or the very bottom) of the address space and we have
9663 to be able to handle this case properly; in particular, we use an
9664 equality test for the loop condition. */
9665 else
9666 {
9667 HOST_WIDE_INT rounded_size;
9668 struct scratch_reg sr;
9669
9670 get_scratch_register_on_entry (&sr);
9671
9672
9673 /* Step 1: round SIZE to the previous multiple of the interval. */
9674
9675 rounded_size = size & -PROBE_INTERVAL;
9676
9677
9678 /* Step 2: compute initial and final value of the loop counter. */
9679
9680 /* SP = SP_0 + PROBE_INTERVAL. */
9681 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9682 plus_constant (stack_pointer_rtx,
9683 - (PROBE_INTERVAL + dope))));
9684
9685 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9686 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9687 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9688 gen_rtx_PLUS (Pmode, sr.reg,
9689 stack_pointer_rtx)));
9690
9691
9692 /* Step 3: the loop
9693
9694 while (SP != LAST_ADDR)
9695 {
9696 SP = SP + PROBE_INTERVAL
9697 probe at SP
9698 }
9699
9700 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9701 values of N from 1 until it is equal to ROUNDED_SIZE. */
9702
9703 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9704
9705
9706 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9707 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9708
9709 if (size != rounded_size)
9710 {
9711 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9712 plus_constant (stack_pointer_rtx,
9713 rounded_size - size)));
9714 emit_stack_probe (stack_pointer_rtx);
9715 }
9716
9717 /* Adjust back to account for the additional first interval. */
9718 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9719 plus_constant (stack_pointer_rtx,
9720 PROBE_INTERVAL + dope)));
9721
9722 release_scratch_register_on_entry (&sr);
9723 }
9724
9725 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9726
9727 /* Even if the stack pointer isn't the CFA register, we need to correctly
9728 describe the adjustments made to it, in particular differentiate the
9729 frame-related ones from the frame-unrelated ones. */
9730 if (size > 0)
9731 {
9732 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9733 XVECEXP (expr, 0, 0)
9734 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9735 plus_constant (stack_pointer_rtx, -size));
9736 XVECEXP (expr, 0, 1)
9737 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9738 plus_constant (stack_pointer_rtx,
9739 PROBE_INTERVAL + dope + size));
9740 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9741 RTX_FRAME_RELATED_P (last) = 1;
9742
9743 cfun->machine->fs.sp_offset += size;
9744 }
9745
9746 /* Make sure nothing is scheduled before we are done. */
9747 emit_insn (gen_blockage ());
9748 }
9749
9750 /* Adjust the stack pointer up to REG while probing it. */
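/* A sketch of the AT&T-syntax loop emitted below, on a 32-bit target with
   an illustrative scratch register and the usual 4096-byte probe interval:
       .LPSRL0: cmpl %ecx, %esp
                je   .LPSRE0
                subl $4096, %esp
                orl  $0, (%esp)
                jmp  .LPSRL0
       .LPSRE0:
   The "orl" is the actual probe; it touches the newly allocated page
   without changing its contents.  */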
9751
9752 const char *
9753 output_adjust_stack_and_probe (rtx reg)
9754 {
9755 static int labelno = 0;
9756 char loop_lab[32], end_lab[32];
9757 rtx xops[2];
9758
9759 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9760 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9761
9762 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9763
9764 /* Jump to END_LAB if SP == LAST_ADDR. */
9765 xops[0] = stack_pointer_rtx;
9766 xops[1] = reg;
9767 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9768 fputs ("\tje\t", asm_out_file);
9769 assemble_name_raw (asm_out_file, end_lab);
9770 fputc ('\n', asm_out_file);
9771
9772 /* SP = SP + PROBE_INTERVAL. */
9773 xops[1] = GEN_INT (PROBE_INTERVAL);
9774 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9775
9776 /* Probe at SP. */
9777 xops[1] = const0_rtx;
9778 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9779
9780 fprintf (asm_out_file, "\tjmp\t");
9781 assemble_name_raw (asm_out_file, loop_lab);
9782 fputc ('\n', asm_out_file);
9783
9784 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9785
9786 return "";
9787 }
9788
9789 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9790 inclusive. These are offsets from the current stack pointer. */
9791
9792 static void
9793 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9794 {
9795 /* See if we have a constant small number of probes to generate. If so,
9796 that's the easy case. The run-time loop is made up of 7 insns in the
9797 generic case while the compile-time loop is made up of n insns for n #
9798 of intervals. */
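/* A worked example with illustrative values, assuming a 4 KB probe
   interval: for FIRST == 4096 and SIZE == 6000 the constant case below
   emits probes at sp - 8192 and sp - 10096.  */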
9799 if (size <= 7 * PROBE_INTERVAL)
9800 {
9801 HOST_WIDE_INT i;
9802
9803 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9804 it exceeds SIZE. If only one probe is needed, this will not
9805 generate any code. Then probe at FIRST + SIZE. */
9806 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9807 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9808
9809 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9810 }
9811
9812 /* Otherwise, do the same as above, but in a loop. Note that we must be
9813 extra careful with variables wrapping around because we might be at
9814 the very top (or the very bottom) of the address space and we have
9815 to be able to handle this case properly; in particular, we use an
9816 equality test for the loop condition. */
9817 else
9818 {
9819 HOST_WIDE_INT rounded_size, last;
9820 struct scratch_reg sr;
9821
9822 get_scratch_register_on_entry (&sr);
9823
9824
9825 /* Step 1: round SIZE to the previous multiple of the interval. */
9826
9827 rounded_size = size & -PROBE_INTERVAL;
9828
9829
9830 /* Step 2: compute initial and final value of the loop counter. */
9831
9832 /* TEST_OFFSET = FIRST. */
9833 emit_move_insn (sr.reg, GEN_INT (-first));
9834
9835 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9836 last = first + rounded_size;
9837
9838
9839 /* Step 3: the loop
9840
9841 while (TEST_ADDR != LAST_ADDR)
9842 {
9843 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9844 probe at TEST_ADDR
9845 }
9846
9847 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9848 until it is equal to ROUNDED_SIZE. */
9849
9850 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9851
9852
9853 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9854 that SIZE is equal to ROUNDED_SIZE. */
9855
9856 if (size != rounded_size)
9857 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9858 stack_pointer_rtx,
9859 sr.reg),
9860 rounded_size - size));
9861
9862 release_scratch_register_on_entry (&sr);
9863 }
9864
9865 /* Make sure nothing is scheduled before we are done. */
9866 emit_insn (gen_blockage ());
9867 }
9868
9869 /* Probe a range of stack addresses from REG to END, inclusive. These are
9870 offsets from the current stack pointer. */
9871
9872 const char *
9873 output_probe_stack_range (rtx reg, rtx end)
9874 {
9875 static int labelno = 0;
9876 char loop_lab[32], end_lab[32];
9877 rtx xops[3];
9878
9879 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9880 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9881
9882 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9883
9884 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9885 xops[0] = reg;
9886 xops[1] = end;
9887 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9888 fputs ("\tje\t", asm_out_file);
9889 assemble_name_raw (asm_out_file, end_lab);
9890 fputc ('\n', asm_out_file);
9891
9892 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9893 xops[1] = GEN_INT (PROBE_INTERVAL);
9894 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9895
9896 /* Probe at TEST_ADDR. */
9897 xops[0] = stack_pointer_rtx;
9898 xops[1] = reg;
9899 xops[2] = const0_rtx;
9900 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9901
9902 fprintf (asm_out_file, "\tjmp\t");
9903 assemble_name_raw (asm_out_file, loop_lab);
9904 fputc ('\n', asm_out_file);
9905
9906 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9907
9908 return "";
9909 }
9910
9911 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9912 to be generated in correct form. */
9913 static void
9914 ix86_finalize_stack_realign_flags (void)
9915 {
9916 /* Check whether stack realignment is really needed after reload, and
9917 store the result in cfun. */
9918 unsigned int incoming_stack_boundary
9919 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9920 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9921 unsigned int stack_realign = (incoming_stack_boundary
9922 < (current_function_is_leaf
9923 ? crtl->max_used_stack_slot_alignment
9924 : crtl->stack_alignment_needed));
9925
9926 if (crtl->stack_realign_finalized)
9927 {
9928 /* After stack_realign_needed is finalized, we can no longer
9929 change it. */
9930 gcc_assert (crtl->stack_realign_needed == stack_realign);
9931 }
9932 else
9933 {
9934 crtl->stack_realign_needed = stack_realign;
9935 crtl->stack_realign_finalized = true;
9936 }
9937 }
9938
9939 /* Expand the prologue into a bunch of separate insns. */
9940
9941 void
9942 ix86_expand_prologue (void)
9943 {
9944 struct machine_function *m = cfun->machine;
9945 rtx insn, t;
9946 bool pic_reg_used;
9947 struct ix86_frame frame;
9948 HOST_WIDE_INT allocate;
9949 bool int_registers_saved;
9950
9951 ix86_finalize_stack_realign_flags ();
9952
9953 /* DRAP should not coexist with stack_realign_fp */
9954 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
9955
9956 memset (&m->fs, 0, sizeof (m->fs));
9957
9958 /* Initialize CFA state for before the prologue. */
9959 m->fs.cfa_reg = stack_pointer_rtx;
9960 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
9961
9962 /* Track SP offset to the CFA. We continue tracking this after we've
9963 swapped the CFA register away from SP. In the case of re-alignment
9964 this is fudged; we're interested in offsets within the local frame. */
9965 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
9966 m->fs.sp_valid = true;
9967
9968 ix86_compute_frame_layout (&frame);
9969
9970 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
9971 {
9972 /* We should have already generated an error for any use of
9973 ms_hook on a nested function. */
9974 gcc_checking_assert (!ix86_static_chain_on_stack);
9975
9976 /* Check whether profiling is active and we are to use the profiling-before-
9977 prologue variant. If so, issue a sorry. */
9978 if (crtl->profile && flag_fentry != 0)
9979 sorry ("ms_hook_prologue attribute isn%'t compatible "
9980 "with -mfentry for 32-bit");
9981
9982 /* In ix86_asm_output_function_label we emitted:
9983 8b ff movl.s %edi,%edi
9984 55 push %ebp
9985 8b ec movl.s %esp,%ebp
9986
9987 This matches the hookable function prologue in Win32 API
9988 functions in Microsoft Windows XP Service Pack 2 and newer.
9989 Wine uses this to enable Windows apps to hook the Win32 API
9990 functions provided by Wine.
9991
9992 What that means is that we've already set up the frame pointer. */
9993
9994 if (frame_pointer_needed
9995 && !(crtl->drap_reg && crtl->stack_realign_needed))
9996 {
9997 rtx push, mov;
9998
9999 /* We've decided to use the frame pointer already set up.
10000 Describe this to the unwinder by pretending that both
10001 push and mov insns happen right here.
10002
10003 Putting the unwind info here at the end of the ms_hook
10004 is done so that we can make absolutely certain we get
10005 the required byte sequence at the start of the function,
10006 rather than relying on an assembler that can produce
10007 the exact encoding required.
10008
10009 However it does mean (in the unpatched case) that we have
10010 a 1 insn window where the asynchronous unwind info is
10011 incorrect. However, if we placed the unwind info at
10012 its correct location we would have incorrect unwind info
10013 in the patched case. Which is probably all moot since
10014 I don't expect Wine generates dwarf2 unwind info for the
10015 system libraries that use this feature. */
10016
10017 insn = emit_insn (gen_blockage ());
10018
10019 push = gen_push (hard_frame_pointer_rtx);
10020 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10021 stack_pointer_rtx);
10022 RTX_FRAME_RELATED_P (push) = 1;
10023 RTX_FRAME_RELATED_P (mov) = 1;
10024
10025 RTX_FRAME_RELATED_P (insn) = 1;
10026 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10027 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10028
10029 /* Note that gen_push incremented m->fs.cfa_offset, even
10030 though we didn't emit the push insn here. */
10031 m->fs.cfa_reg = hard_frame_pointer_rtx;
10032 m->fs.fp_offset = m->fs.cfa_offset;
10033 m->fs.fp_valid = true;
10034 }
10035 else
10036 {
10037 /* The frame pointer is not needed so pop %ebp again.
10038 This leaves us with a pristine state. */
10039 emit_insn (gen_pop (hard_frame_pointer_rtx));
10040 }
10041 }
10042
10043 /* The first insn of a function that accepts its static chain on the
10044 stack is to push the register that would be filled in by a direct
10045 call. This insn will be skipped by the trampoline. */
10046 else if (ix86_static_chain_on_stack)
10047 {
10048 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10049 emit_insn (gen_blockage ());
10050
10051 /* We don't want to interpret this push insn as a register save,
10052 only as a stack adjustment. The real copy of the register as
10053 a save will be done later, if needed. */
10054 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10055 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10056 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10057 RTX_FRAME_RELATED_P (insn) = 1;
10058 }
10059
10060 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10061 DRAP is needed and stack realignment is really needed after reload. */
10062 if (stack_realign_drap)
10063 {
10064 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10065
10066 /* Only need to push parameter pointer reg if it is caller saved. */
10067 if (!call_used_regs[REGNO (crtl->drap_reg)])
10068 {
10069 /* Push arg pointer reg */
10070 insn = emit_insn (gen_push (crtl->drap_reg));
10071 RTX_FRAME_RELATED_P (insn) = 1;
10072 }
10073
10074 /* Grab the argument pointer. */
10075 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10076 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10077 RTX_FRAME_RELATED_P (insn) = 1;
10078 m->fs.cfa_reg = crtl->drap_reg;
10079 m->fs.cfa_offset = 0;
10080
10081 /* Align the stack. */
10082 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10083 stack_pointer_rtx,
10084 GEN_INT (-align_bytes)));
10085 RTX_FRAME_RELATED_P (insn) = 1;
10086
10087 /* Replicate the return address on the stack so that the return
10088 address can be reached via the (argp - 1) slot. This is needed
10089 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10090 expand_builtin_return_addr, etc. */
10091 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10092 t = gen_frame_mem (Pmode, t);
10093 insn = emit_insn (gen_push (t));
10094 RTX_FRAME_RELATED_P (insn) = 1;
10095
10096 /* For the purposes of frame and register save area addressing,
10097 we've started over with a new frame. */
10098 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10099 m->fs.realigned = true;
10100 }
10101
10102 if (frame_pointer_needed && !m->fs.fp_valid)
10103 {
10104 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10105 slower on all targets. Also sdb doesn't like it. */
10106 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10107 RTX_FRAME_RELATED_P (insn) = 1;
10108
10109 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10110 {
10111 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10112 RTX_FRAME_RELATED_P (insn) = 1;
10113
10114 if (m->fs.cfa_reg == stack_pointer_rtx)
10115 m->fs.cfa_reg = hard_frame_pointer_rtx;
10116 m->fs.fp_offset = m->fs.sp_offset;
10117 m->fs.fp_valid = true;
10118 }
10119 }
10120
10121 int_registers_saved = (frame.nregs == 0);
10122
10123 if (!int_registers_saved)
10124 {
10125 /* If saving registers via PUSH, do so now. */
10126 if (!frame.save_regs_using_mov)
10127 {
10128 ix86_emit_save_regs ();
10129 int_registers_saved = true;
10130 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10131 }
10132
10133 /* When using the red zone we may start saving registers before allocating
10134 the stack frame, saving one cycle of the prologue. However, avoid
10135 doing this if we have to probe the stack; at least on x86_64 the
10136 stack probe can turn into a call that clobbers a red zone location. */
10137 else if (ix86_using_red_zone ()
10138 && (! TARGET_STACK_PROBE
10139 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10140 {
10141 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10142 int_registers_saved = true;
10143 }
10144 }
10145
10146 if (stack_realign_fp)
10147 {
10148 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10149 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10150
10151 /* The computation of the size of the re-aligned stack frame means
10152 that we must allocate the size of the register save area before
10153 performing the actual alignment. Otherwise we cannot guarantee
10154 that there's enough storage above the realignment point. */
10155 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10156 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10157 GEN_INT (m->fs.sp_offset
10158 - frame.sse_reg_save_offset),
10159 -1, false);
10160
10161 /* Align the stack. */
10162 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10163 stack_pointer_rtx,
10164 GEN_INT (-align_bytes)));
10165
10166 /* For the purposes of register save area addressing, the stack
10167 pointer is no longer valid. As for the value of sp_offset,
10168 see ix86_compute_frame_layout, which we need to match in order
10169 to pass verification of stack_pointer_offset at the end. */
10170 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10171 m->fs.sp_valid = false;
10172 }
10173
10174 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
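/* ALLOCATE is now the number of bytes still to be subtracted from the
   stack pointer: register pushes and any realignment performed above are
   already reflected in m->fs.sp_offset.  */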
10175
10176 if (flag_stack_usage_info)
10177 {
10178 /* We start to count from ARG_POINTER. */
10179 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10180
10181 /* If it was realigned, take into account the fake frame. */
10182 if (stack_realign_drap)
10183 {
10184 if (ix86_static_chain_on_stack)
10185 stack_size += UNITS_PER_WORD;
10186
10187 if (!call_used_regs[REGNO (crtl->drap_reg)])
10188 stack_size += UNITS_PER_WORD;
10189
10190 /* This over-estimates by 1 minimal-stack-alignment-unit but
10191 mitigates that by counting in the new return address slot. */
10192 current_function_dynamic_stack_size
10193 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10194 }
10195
10196 current_function_static_stack_size = stack_size;
10197 }
10198
10199 /* The stack has already been decremented by the instruction calling us
10200 so probe if the size is non-negative to preserve the protection area. */
10201 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10202 {
10203 /* We expect the registers to be saved when probes are used. */
10204 gcc_assert (int_registers_saved);
10205
10206 if (STACK_CHECK_MOVING_SP)
10207 {
10208 ix86_adjust_stack_and_probe (allocate);
10209 allocate = 0;
10210 }
10211 else
10212 {
10213 HOST_WIDE_INT size = allocate;
10214
10215 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10216 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10217
10218 if (TARGET_STACK_PROBE)
10219 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10220 else
10221 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10222 }
10223 }
10224
10225 if (allocate == 0)
10226 ;
10227 else if (!ix86_target_stack_probe ()
10228 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10229 {
10230 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10231 GEN_INT (-allocate), -1,
10232 m->fs.cfa_reg == stack_pointer_rtx);
10233 }
10234 else
10235 {
10236 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10237 rtx r10 = NULL;
10238 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10239
10240 bool eax_live = false;
10241 bool r10_live = false;
10242
10243 if (TARGET_64BIT)
10244 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10245 if (!TARGET_64BIT_MS_ABI)
10246 eax_live = ix86_eax_live_at_start_p ();
10247
10248 if (eax_live)
10249 {
10250 emit_insn (gen_push (eax));
10251 allocate -= UNITS_PER_WORD;
10252 }
10253 if (r10_live)
10254 {
10255 r10 = gen_rtx_REG (Pmode, R10_REG);
10256 emit_insn (gen_push (r10));
10257 allocate -= UNITS_PER_WORD;
10258 }
10259
10260 emit_move_insn (eax, GEN_INT (allocate));
10261 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10262
10263 /* Use the fact that AX still contains ALLOCATE. */
10264 adjust_stack_insn = (TARGET_64BIT
10265 ? gen_pro_epilogue_adjust_stack_di_sub
10266 : gen_pro_epilogue_adjust_stack_si_sub);
10267
10268 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10269 stack_pointer_rtx, eax));
10270
10271 /* Note that SEH directives need to continue tracking the stack
10272 pointer even after the frame pointer has been set up. */
10273 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10274 {
10275 if (m->fs.cfa_reg == stack_pointer_rtx)
10276 m->fs.cfa_offset += allocate;
10277
10278 RTX_FRAME_RELATED_P (insn) = 1;
10279 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10280 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10281 plus_constant (stack_pointer_rtx,
10282 -allocate)));
10283 }
10284 m->fs.sp_offset += allocate;
10285
10286 if (r10_live && eax_live)
10287 {
10288 t = choose_baseaddr (m->fs.sp_offset - allocate);
10289 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10290 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10291 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10292 }
10293 else if (eax_live || r10_live)
10294 {
10295 t = choose_baseaddr (m->fs.sp_offset - allocate);
10296 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10297 }
10298 }
10299 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10300
10301 /* If we haven't already set up the frame pointer, do so now. */
10302 if (frame_pointer_needed && !m->fs.fp_valid)
10303 {
10304 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10305 GEN_INT (frame.stack_pointer_offset
10306 - frame.hard_frame_pointer_offset));
10307 insn = emit_insn (insn);
10308 RTX_FRAME_RELATED_P (insn) = 1;
10309 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10310
10311 if (m->fs.cfa_reg == stack_pointer_rtx)
10312 m->fs.cfa_reg = hard_frame_pointer_rtx;
10313 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10314 m->fs.fp_valid = true;
10315 }
10316
10317 if (!int_registers_saved)
10318 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10319 if (frame.nsseregs)
10320 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10321
10322 pic_reg_used = false;
10323 if (pic_offset_table_rtx
10324 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10325 || crtl->profile))
10326 {
10327 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10328
10329 if (alt_pic_reg_used != INVALID_REGNUM)
10330 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10331
10332 pic_reg_used = true;
10333 }
10334
10335 if (pic_reg_used)
10336 {
10337 if (TARGET_64BIT)
10338 {
10339 if (ix86_cmodel == CM_LARGE_PIC)
10340 {
10341 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10342 rtx label = gen_label_rtx ();
10343 emit_label (label);
10344 LABEL_PRESERVE_P (label) = 1;
10345 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10346 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10347 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10348 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10349 pic_offset_table_rtx, tmp_reg));
10350 }
10351 else
10352 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10353 }
10354 else
10355 {
10356 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10357 RTX_FRAME_RELATED_P (insn) = 1;
10358 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10359 }
10360 }
10361
10362 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10363 when mcount needs it. The blockage to avoid call movement across the
10364 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10365 note. */
10366 if (crtl->profile && !flag_fentry && pic_reg_used)
10367 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10368
10369 if (crtl->drap_reg && !crtl->stack_realign_needed)
10370 {
10371 /* vDRAP is set up, but after reload it turns out stack realignment
10372 isn't necessary; here we emit prologue code to set up DRAP
10373 without the stack realignment adjustment. */
10374 t = choose_baseaddr (0);
10375 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10376 }
10377
10378 /* Prevent instructions from being scheduled into register save push
10379 sequence when access to the redzone area is done through frame pointer.
10380 The offset between the frame pointer and the stack pointer is calculated
10381 relative to the value of the stack pointer at the end of the function
10382 prologue, and moving instructions that access redzone area via frame
10383 pointer inside push sequence violates this assumption. */
10384 if (frame_pointer_needed && frame.red_zone_size)
10385 emit_insn (gen_memory_blockage ());
10386
10387 /* Emit cld instruction if stringops are used in the function. */
10388 if (TARGET_CLD && ix86_current_function_needs_cld)
10389 emit_insn (gen_cld ());
10390
10391 /* SEH requires that the prologue end within 256 bytes of the start of
10392 the function. Prevent instruction schedules that would extend that.
10393 Further, prevent alloca modifications to the stack pointer from being
10394 combined with prologue modifications. */
10395 if (TARGET_SEH)
10396 emit_insn (gen_prologue_use (stack_pointer_rtx));
10397 }
10398
10399 /* Emit code to restore REG using a POP insn. */
10400
10401 static void
10402 ix86_emit_restore_reg_using_pop (rtx reg)
10403 {
10404 struct machine_function *m = cfun->machine;
10405 rtx insn = emit_insn (gen_pop (reg));
10406
10407 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10408 m->fs.sp_offset -= UNITS_PER_WORD;
10409
10410 if (m->fs.cfa_reg == crtl->drap_reg
10411 && REGNO (reg) == REGNO (crtl->drap_reg))
10412 {
10413 /* Previously we'd represented the CFA as an expression
10414 like *(%ebp - 8). We've just popped that value from
10415 the stack, which means we need to reset the CFA to
10416 the drap register. This will remain until we restore
10417 the stack pointer. */
10418 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10419 RTX_FRAME_RELATED_P (insn) = 1;
10420
10421 /* This means that the DRAP register is valid for addressing too. */
10422 m->fs.drap_valid = true;
10423 return;
10424 }
10425
10426 if (m->fs.cfa_reg == stack_pointer_rtx)
10427 {
10428 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10429 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10430 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10431 RTX_FRAME_RELATED_P (insn) = 1;
10432
10433 m->fs.cfa_offset -= UNITS_PER_WORD;
10434 }
10435
10436 /* When the frame pointer is the CFA, and we pop it, we are
10437 swapping back to the stack pointer as the CFA. This happens
10438 for stack frames that don't allocate other data, so we assume
10439 the stack pointer is now pointing at the return address, i.e.
10440 the function entry state, which makes the offset be 1 word. */
10441 if (reg == hard_frame_pointer_rtx)
10442 {
10443 m->fs.fp_valid = false;
10444 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10445 {
10446 m->fs.cfa_reg = stack_pointer_rtx;
10447 m->fs.cfa_offset -= UNITS_PER_WORD;
10448
10449 add_reg_note (insn, REG_CFA_DEF_CFA,
10450 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10451 GEN_INT (m->fs.cfa_offset)));
10452 RTX_FRAME_RELATED_P (insn) = 1;
10453 }
10454 }
10455 }
10456
10457 /* Emit code to restore saved registers using POP insns. */
10458
10459 static void
10460 ix86_emit_restore_regs_using_pop (void)
10461 {
10462 unsigned int regno;
10463
10464 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10465 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10466 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10467 }
10468
10469 /* Emit code and notes for the LEAVE instruction. */
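/* (Background note: "leave" is equivalent to "mov %ebp, %esp; pop %ebp",
or the 64-bit equivalent, which is why sp_offset below ends up as
fp_offset minus one word once the saved frame pointer has been popped.) */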
10470
10471 static void
10472 ix86_emit_leave (void)
10473 {
10474 struct machine_function *m = cfun->machine;
10475 rtx insn = emit_insn (ix86_gen_leave ());
10476
10477 ix86_add_queued_cfa_restore_notes (insn);
10478
10479 gcc_assert (m->fs.fp_valid);
10480 m->fs.sp_valid = true;
10481 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10482 m->fs.fp_valid = false;
10483
10484 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10485 {
10486 m->fs.cfa_reg = stack_pointer_rtx;
10487 m->fs.cfa_offset = m->fs.sp_offset;
10488
10489 add_reg_note (insn, REG_CFA_DEF_CFA,
10490 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10491 RTX_FRAME_RELATED_P (insn) = 1;
10492 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10493 m->fs.fp_offset);
10494 }
10495 }
10496
10497 /* Emit code to restore saved registers using MOV insns.
10498 First register is restored from CFA - CFA_OFFSET. */
10499 static void
10500 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10501 bool maybe_eh_return)
10502 {
10503 struct machine_function *m = cfun->machine;
10504 unsigned int regno;
10505
10506 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10507 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10508 {
10509 rtx reg = gen_rtx_REG (Pmode, regno);
10510 rtx insn, mem;
10511
10512 mem = choose_baseaddr (cfa_offset);
10513 mem = gen_frame_mem (Pmode, mem);
10514 insn = emit_move_insn (reg, mem);
10515
10516 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10517 {
10518 /* Previously we'd represented the CFA as an expression
10519 like *(%ebp - 8). We've just popped that value from
10520 the stack, which means we need to reset the CFA to
10521 the drap register. This will remain until we restore
10522 the stack pointer. */
10523 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10524 RTX_FRAME_RELATED_P (insn) = 1;
10525
10526 /* This means that the DRAP register is valid for addressing. */
10527 m->fs.drap_valid = true;
10528 }
10529 else
10530 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10531
10532 cfa_offset -= UNITS_PER_WORD;
10533 }
10534 }
10535
10536 /* Emit code to restore saved SSE registers using MOV insns.
10537 First register is restored from CFA - CFA_OFFSET. */
10538 static void
10539 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10540 bool maybe_eh_return)
10541 {
10542 unsigned int regno;
10543
10544 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10545 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10546 {
10547 rtx reg = gen_rtx_REG (V4SFmode, regno);
10548 rtx mem;
10549
10550 mem = choose_baseaddr (cfa_offset);
10551 mem = gen_rtx_MEM (V4SFmode, mem);
10552 set_mem_align (mem, 128);
10553 emit_move_insn (reg, mem);
10554
10555 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10556
10557 cfa_offset -= 16;
10558 }
10559 }
10560
10561 /* Restore function stack, frame, and registers. */
10562
10563 void
10564 ix86_expand_epilogue (int style)
10565 {
10566 struct machine_function *m = cfun->machine;
10567 struct machine_frame_state frame_state_save = m->fs;
10568 struct ix86_frame frame;
10569 bool restore_regs_via_mov;
10570 bool using_drap;
10571
10572 ix86_finalize_stack_realign_flags ();
10573 ix86_compute_frame_layout (&frame);
10574
10575 m->fs.sp_valid = (!frame_pointer_needed
10576 || (current_function_sp_is_unchanging
10577 && !stack_realign_fp));
10578 gcc_assert (!m->fs.sp_valid
10579 || m->fs.sp_offset == frame.stack_pointer_offset);
10580
10581 /* The FP must be valid if the frame pointer is present. */
10582 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10583 gcc_assert (!m->fs.fp_valid
10584 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10585
10586 /* We must have *some* valid pointer to the stack frame. */
10587 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10588
10589 /* The DRAP is never valid at this point. */
10590 gcc_assert (!m->fs.drap_valid);
10591
10592 /* See the comment about red zone and frame
10593 pointer usage in ix86_expand_prologue. */
10594 if (frame_pointer_needed && frame.red_zone_size)
10595 emit_insn (gen_memory_blockage ());
10596
10597 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10598 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10599
10600 /* Determine the CFA offset of the end of the red-zone. */
10601 m->fs.red_zone_offset = 0;
10602 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10603 {
10604 /* The red-zone begins below the return address. */
10605 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10606
10607 /* When the register save area is in the aligned portion of
10608 the stack, determine the maximum runtime displacement that
10609 matches up with the aligned frame. */
10610 if (stack_realign_drap)
10611 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10612 + UNITS_PER_WORD);
10613 }
10614
10615 /* Special care must be taken for the normal return case of a function
10616 using eh_return: the eax and edx registers are marked as saved, but
10617 not restored along this path. Adjust the save location to match. */
10618 if (crtl->calls_eh_return && style != 2)
10619 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10620
10621 /* EH_RETURN requires the use of moves to function properly. */
10622 if (crtl->calls_eh_return)
10623 restore_regs_via_mov = true;
10624 /* SEH requires the use of pops to identify the epilogue. */
10625 else if (TARGET_SEH)
10626 restore_regs_via_mov = false;
10627 /* If we're only restoring one register and sp is not valid, then
10628 use a move instruction to restore the register, since it's
10629 less work than reloading sp and popping the register. */
10630 else if (!m->fs.sp_valid && frame.nregs <= 1)
10631 restore_regs_via_mov = true;
10632 else if (TARGET_EPILOGUE_USING_MOVE
10633 && cfun->machine->use_fast_prologue_epilogue
10634 && (frame.nregs > 1
10635 || m->fs.sp_offset != frame.reg_save_offset))
10636 restore_regs_via_mov = true;
10637 else if (frame_pointer_needed
10638 && !frame.nregs
10639 && m->fs.sp_offset != frame.reg_save_offset)
10640 restore_regs_via_mov = true;
10641 else if (frame_pointer_needed
10642 && TARGET_USE_LEAVE
10643 && cfun->machine->use_fast_prologue_epilogue
10644 && frame.nregs == 1)
10645 restore_regs_via_mov = true;
10646 else
10647 restore_regs_via_mov = false;
10648
10649 if (restore_regs_via_mov || frame.nsseregs)
10650 {
10651 /* Ensure that the entire register save area is addressable via
10652 the stack pointer, if we will restore via sp. */
10653 if (TARGET_64BIT
10654 && m->fs.sp_offset > 0x7fffffff
10655 && !(m->fs.fp_valid || m->fs.drap_valid)
10656 && (frame.nsseregs + frame.nregs) != 0)
10657 {
10658 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10659 GEN_INT (m->fs.sp_offset
10660 - frame.sse_reg_save_offset),
10661 style,
10662 m->fs.cfa_reg == stack_pointer_rtx);
10663 }
10664 }
10665
10666 /* If there are any SSE registers to restore, then we have to do it
10667 via moves, since there's obviously no pop for SSE regs. */
10668 if (frame.nsseregs)
10669 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10670 style == 2);
10671
10672 if (restore_regs_via_mov)
10673 {
10674 rtx t;
10675
10676 if (frame.nregs)
10677 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10678
10679 /* eh_return epilogues need %ecx added to the stack pointer. */
10680 if (style == 2)
10681 {
10682 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10683
10684 /* Stack align doesn't work with eh_return. */
10685 gcc_assert (!stack_realign_drap);
10686 /* Neither do regparm nested functions. */
10687 gcc_assert (!ix86_static_chain_on_stack);
10688
10689 if (frame_pointer_needed)
10690 {
10691 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10692 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10693 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10694
10695 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10696 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10697
10698 /* Note that we use SA as a temporary CFA, as the return
10699 address is at the proper place relative to it. We
10700 pretend this happens at the FP restore insn because
10701 prior to this insn the FP would be stored at the wrong
10702 offset relative to SA, and after this insn we have no
10703 other reasonable register to use for the CFA. We don't
10704 bother resetting the CFA to the SP for the duration of
10705 the return insn. */
10706 add_reg_note (insn, REG_CFA_DEF_CFA,
10707 plus_constant (sa, UNITS_PER_WORD));
10708 ix86_add_queued_cfa_restore_notes (insn);
10709 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10710 RTX_FRAME_RELATED_P (insn) = 1;
10711
10712 m->fs.cfa_reg = sa;
10713 m->fs.cfa_offset = UNITS_PER_WORD;
10714 m->fs.fp_valid = false;
10715
10716 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10717 const0_rtx, style, false);
10718 }
10719 else
10720 {
10721 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10722 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10723 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10724 ix86_add_queued_cfa_restore_notes (insn);
10725
10726 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10727 if (m->fs.cfa_offset != UNITS_PER_WORD)
10728 {
10729 m->fs.cfa_offset = UNITS_PER_WORD;
10730 add_reg_note (insn, REG_CFA_DEF_CFA,
10731 plus_constant (stack_pointer_rtx,
10732 UNITS_PER_WORD));
10733 RTX_FRAME_RELATED_P (insn) = 1;
10734 }
10735 }
10736 m->fs.sp_offset = UNITS_PER_WORD;
10737 m->fs.sp_valid = true;
10738 }
10739 }
10740 else
10741 {
10742 /* SEH requires that the function end with (1) a stack adjustment
10743 if necessary, (2) a sequence of pops, and (3) a return or
10744 jump instruction. Prevent insns from the function body from
10745 being scheduled into this sequence. */
10746 if (TARGET_SEH)
10747 {
10748 /* Prevent a catch region from being adjacent to the standard
10749 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10750 several other flags that would be interesting to test are
10751 yet set up. */
10752 if (flag_non_call_exceptions)
10753 emit_insn (gen_nops (const1_rtx));
10754 else
10755 emit_insn (gen_blockage ());
10756 }
10757
10758 /* First step is to deallocate the stack frame so that we can
10759 pop the registers. */
10760 if (!m->fs.sp_valid)
10761 {
10762 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10763 GEN_INT (m->fs.fp_offset
10764 - frame.reg_save_offset),
10765 style, false);
10766 }
10767 else if (m->fs.sp_offset != frame.reg_save_offset)
10768 {
10769 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10770 GEN_INT (m->fs.sp_offset
10771 - frame.reg_save_offset),
10772 style,
10773 m->fs.cfa_reg == stack_pointer_rtx);
10774 }
10775
10776 ix86_emit_restore_regs_using_pop ();
10777 }
10778
10779 /* If we used a frame pointer and haven't already got rid of it,
10780 then do so now. */
10781 if (m->fs.fp_valid)
10782 {
10783 /* If the stack pointer is valid and pointing at the frame
10784 pointer store address, then we only need a pop. */
10785 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10786 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10787 /* Leave results in shorter dependency chains on CPUs that are
10788 able to grok it fast. */
10789 else if (TARGET_USE_LEAVE
10790 || optimize_function_for_size_p (cfun)
10791 || !cfun->machine->use_fast_prologue_epilogue)
10792 ix86_emit_leave ();
10793 else
10794 {
10795 pro_epilogue_adjust_stack (stack_pointer_rtx,
10796 hard_frame_pointer_rtx,
10797 const0_rtx, style, !using_drap);
10798 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10799 }
10800 }
10801
10802 if (using_drap)
10803 {
10804 int param_ptr_offset = UNITS_PER_WORD;
10805 rtx insn;
10806
10807 gcc_assert (stack_realign_drap);
10808
10809 if (ix86_static_chain_on_stack)
10810 param_ptr_offset += UNITS_PER_WORD;
10811 if (!call_used_regs[REGNO (crtl->drap_reg)])
10812 param_ptr_offset += UNITS_PER_WORD;
10813
10814 insn = emit_insn (gen_rtx_SET
10815 (VOIDmode, stack_pointer_rtx,
10816 gen_rtx_PLUS (Pmode,
10817 crtl->drap_reg,
10818 GEN_INT (-param_ptr_offset))));
10819 m->fs.cfa_reg = stack_pointer_rtx;
10820 m->fs.cfa_offset = param_ptr_offset;
10821 m->fs.sp_offset = param_ptr_offset;
10822 m->fs.realigned = false;
10823
10824 add_reg_note (insn, REG_CFA_DEF_CFA,
10825 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10826 GEN_INT (param_ptr_offset)));
10827 RTX_FRAME_RELATED_P (insn) = 1;
10828
10829 if (!call_used_regs[REGNO (crtl->drap_reg)])
10830 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10831 }
10832
10833 /* At this point the stack pointer must be valid, and we must have
10834 restored all of the registers. We may not have deallocated the
10835 entire stack frame. We've delayed this until now because it may
10836 be possible to merge the local stack deallocation with the
10837 deallocation forced by ix86_static_chain_on_stack. */
10838 gcc_assert (m->fs.sp_valid);
10839 gcc_assert (!m->fs.fp_valid);
10840 gcc_assert (!m->fs.realigned);
10841 if (m->fs.sp_offset != UNITS_PER_WORD)
10842 {
10843 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10844 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10845 style, true);
10846 }
10847 else
10848 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10849
10850 /* Sibcall epilogues don't want a return instruction. */
10851 if (style == 0)
10852 {
10853 m->fs = frame_state_save;
10854 return;
10855 }
10856
10857 /* Emit vzeroupper if needed. */
10858 if (TARGET_VZEROUPPER
10859 && !TREE_THIS_VOLATILE (cfun->decl)
10860 && !cfun->machine->caller_return_avx256_p)
10861 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10862
10863 if (crtl->args.pops_args && crtl->args.size)
10864 {
10865 rtx popc = GEN_INT (crtl->args.pops_args);
10866
10867 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
10868 address, do an explicit add, and jump indirectly to the caller. */
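/* Illustrative sketch of the >64K case (pops_args here stands for
crtl->args.pops_args; this is not verbatim compiler output):
popl %ecx              # return address -> %ecx
addl $pops_args, %esp  # release the argument area
jmp *%ecx              # return to the caller
The CFA notes below keep the unwind information consistent across the pop. */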
10869
10870 if (crtl->args.pops_args >= 65536)
10871 {
10872 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10873 rtx insn;
10874
10875 /* There is no "pascal" calling convention in any 64bit ABI. */
10876 gcc_assert (!TARGET_64BIT);
10877
10878 insn = emit_insn (gen_pop (ecx));
10879 m->fs.cfa_offset -= UNITS_PER_WORD;
10880 m->fs.sp_offset -= UNITS_PER_WORD;
10881
10882 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10883 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10884 add_reg_note (insn, REG_CFA_REGISTER,
10885 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10886 RTX_FRAME_RELATED_P (insn) = 1;
10887
10888 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10889 popc, -1, true);
10890 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10891 }
10892 else
10893 emit_jump_insn (gen_simple_return_pop_internal (popc));
10894 }
10895 else
10896 emit_jump_insn (gen_simple_return_internal ());
10897
10898 /* Restore the state back to the state from the prologue,
10899 so that it's correct for the next epilogue. */
10900 m->fs = frame_state_save;
10901 }
10902
10903 /* Reset the PIC register from the function's potential modifications. */
10904
10905 static void
10906 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10907 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10908 {
10909 if (pic_offset_table_rtx)
10910 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10911 #if TARGET_MACHO
10912 /* Mach-O doesn't support labels at the end of objects, so if
10913 it looks like we might want one, insert a NOP. */
10914 {
10915 rtx insn = get_last_insn ();
10916 rtx deleted_debug_label = NULL_RTX;
10917 while (insn
10918 && NOTE_P (insn)
10919 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10920 {
10921 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
10922 notes; instead set their CODE_LABEL_NUMBER to -1,
10923 otherwise there would be code generation differences
10924 between -g and -g0. */
10925 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10926 deleted_debug_label = insn;
10927 insn = PREV_INSN (insn);
10928 }
10929 if (insn
10930 && (LABEL_P (insn)
10931 || (NOTE_P (insn)
10932 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10933 fputs ("\tnop\n", file);
10934 else if (deleted_debug_label)
10935 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
10936 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10937 CODE_LABEL_NUMBER (insn) = -1;
10938 }
10939 #endif
10940
10941 }
10942
10943 /* Return a scratch register to use in the split stack prologue. The
10944 split stack prologue is used for -fsplit-stack. These are the first
10945 instructions in the function, even before the regular prologue.
10946 The scratch register can be any caller-saved register which is not
10947 used for parameters or for the static chain. */
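/* (Summary, stated informally: on 64-bit targets this is simply %r11; on
32-bit targets the choice below depends on the regparm/fastcall argument
conventions and on whether a static chain is live.) */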
10948
10949 static unsigned int
10950 split_stack_prologue_scratch_regno (void)
10951 {
10952 if (TARGET_64BIT)
10953 return R11_REG;
10954 else
10955 {
10956 bool is_fastcall;
10957 int regparm;
10958
10959 is_fastcall = (lookup_attribute ("fastcall",
10960 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
10961 != NULL);
10962 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
10963
10964 if (is_fastcall)
10965 {
10966 if (DECL_STATIC_CHAIN (cfun->decl))
10967 {
10968 sorry ("-fsplit-stack does not support fastcall with "
10969 "nested function");
10970 return INVALID_REGNUM;
10971 }
10972 return AX_REG;
10973 }
10974 else if (regparm < 3)
10975 {
10976 if (!DECL_STATIC_CHAIN (cfun->decl))
10977 return CX_REG;
10978 else
10979 {
10980 if (regparm >= 2)
10981 {
10982 sorry ("-fsplit-stack does not support 2 register "
10983 " parameters for a nested function");
10984 return INVALID_REGNUM;
10985 }
10986 return DX_REG;
10987 }
10988 }
10989 else
10990 {
10991 /* FIXME: We could make this work by pushing a register
10992 around the addition and comparison. */
10993 sorry ("-fsplit-stack does not support 3 register parameters");
10994 return INVALID_REGNUM;
10995 }
10996 }
10997 }
10998
10999 /* A SYMBOL_REF for the function which allocates new stack space for
11000 -fsplit-stack. */
11001
11002 static GTY(()) rtx split_stack_fn;
11003
11004 /* A SYMBOL_REF for the more stack function when using the large
11005 model. */
11006
11007 static GTY(()) rtx split_stack_fn_large;
11008
11009 /* Handle -fsplit-stack. These are the first instructions in the
11010 function, even before the regular prologue. */
11011
11012 void
11013 ix86_expand_split_stack_prologue (void)
11014 {
11015 struct ix86_frame frame;
11016 HOST_WIDE_INT allocate;
11017 unsigned HOST_WIDE_INT args_size;
11018 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11019 rtx scratch_reg = NULL_RTX;
11020 rtx varargs_label = NULL_RTX;
11021 rtx fn;
11022
11023 gcc_assert (flag_split_stack && reload_completed);
11024
11025 ix86_finalize_stack_realign_flags ();
11026 ix86_compute_frame_layout (&frame);
11027 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11028
11029 /* This is the label we will branch to if we have enough stack
11030 space. We expect the basic block reordering pass to reverse this
11031 branch if optimizing, so that we branch in the unlikely case. */
11032 label = gen_label_rtx ();
11033
11034 /* We need to compare the stack pointer minus the frame size with
11035 the stack boundary in the TCB. The stack boundary always gives
11036 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11037 can compare directly. Otherwise we need to do an addition. */
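/* Roughly, and purely as an illustration (the exact TCB guard offset and
the register choice depend on the target): when the frame fits within
SPLIT_STACK_AVAILABLE the check is a direct "cmp <guard>, %sp"; otherwise
a scratch register first receives %sp minus the frame size, e.g. via
"lea -allocate(%rsp), %r11", and that register is compared instead. */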
11038
11039 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11040 UNSPEC_STACK_CHECK);
11041 limit = gen_rtx_CONST (Pmode, limit);
11042 limit = gen_rtx_MEM (Pmode, limit);
11043 if (allocate < SPLIT_STACK_AVAILABLE)
11044 current = stack_pointer_rtx;
11045 else
11046 {
11047 unsigned int scratch_regno;
11048 rtx offset;
11049
11050 /* We need a scratch register to hold the stack pointer minus
11051 the required frame size. Since this is the very start of the
11052 function, the scratch register can be any caller-saved
11053 register which is not used for parameters. */
11054 offset = GEN_INT (- allocate);
11055 scratch_regno = split_stack_prologue_scratch_regno ();
11056 if (scratch_regno == INVALID_REGNUM)
11057 return;
11058 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11059 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11060 {
11061 /* We don't use ix86_gen_add3 in this case because it will
11062 want to split to lea, but when not optimizing the insn
11063 will not be split after this point. */
11064 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11065 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11066 offset)));
11067 }
11068 else
11069 {
11070 emit_move_insn (scratch_reg, offset);
11071 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11072 stack_pointer_rtx));
11073 }
11074 current = scratch_reg;
11075 }
11076
11077 ix86_expand_branch (GEU, current, limit, label);
11078 jump_insn = get_last_insn ();
11079 JUMP_LABEL (jump_insn) = label;
11080
11081 /* Mark the jump as very likely to be taken. */
11082 add_reg_note (jump_insn, REG_BR_PROB,
11083 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11084
11085 if (split_stack_fn == NULL_RTX)
11086 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11087 fn = split_stack_fn;
11088
11089 /* Get more stack space. We pass in the desired stack space and the
11090 size of the arguments to copy to the new stack. In 32-bit mode
11091 we push the parameters; __morestack will return on a new stack
11092 anyhow. In 64-bit mode we pass the parameters in r10 and
11093 r11. */
11094 allocate_rtx = GEN_INT (allocate);
11095 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11096 call_fusage = NULL_RTX;
11097 if (TARGET_64BIT)
11098 {
11099 rtx reg10, reg11;
11100
11101 reg10 = gen_rtx_REG (Pmode, R10_REG);
11102 reg11 = gen_rtx_REG (Pmode, R11_REG);
11103
11104 /* If this function uses a static chain, it will be in %r10.
11105 Preserve it across the call to __morestack. */
11106 if (DECL_STATIC_CHAIN (cfun->decl))
11107 {
11108 rtx rax;
11109
11110 rax = gen_rtx_REG (Pmode, AX_REG);
11111 emit_move_insn (rax, reg10);
11112 use_reg (&call_fusage, rax);
11113 }
11114
11115 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11116 {
11117 HOST_WIDE_INT argval;
11118
11119 /* When using the large model we need to load the address
11120 into a register, and we've run out of registers. So we
11121 switch to a different calling convention, and we call a
11122 different function: __morestack_large. We pass the
11123 argument size in the upper 32 bits of r10 and pass the
11124 frame size in the lower 32 bits. */
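/* For example (purely illustrative values): args_size == 0x20 and
allocate == 0x1000 give argval == 0x0000002000001000 in r10. */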
11125 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11126 gcc_assert ((args_size & 0xffffffff) == args_size);
11127
11128 if (split_stack_fn_large == NULL_RTX)
11129 split_stack_fn_large =
11130 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11131
11132 if (ix86_cmodel == CM_LARGE_PIC)
11133 {
11134 rtx label, x;
11135
11136 label = gen_label_rtx ();
11137 emit_label (label);
11138 LABEL_PRESERVE_P (label) = 1;
11139 emit_insn (gen_set_rip_rex64 (reg10, label));
11140 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11141 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11142 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11143 UNSPEC_GOT);
11144 x = gen_rtx_CONST (Pmode, x);
11145 emit_move_insn (reg11, x);
11146 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11147 x = gen_const_mem (Pmode, x);
11148 emit_move_insn (reg11, x);
11149 }
11150 else
11151 emit_move_insn (reg11, split_stack_fn_large);
11152
11153 fn = reg11;
11154
11155 argval = ((args_size << 16) << 16) + allocate;
11156 emit_move_insn (reg10, GEN_INT (argval));
11157 }
11158 else
11159 {
11160 emit_move_insn (reg10, allocate_rtx);
11161 emit_move_insn (reg11, GEN_INT (args_size));
11162 use_reg (&call_fusage, reg11);
11163 }
11164
11165 use_reg (&call_fusage, reg10);
11166 }
11167 else
11168 {
11169 emit_insn (gen_push (GEN_INT (args_size)));
11170 emit_insn (gen_push (allocate_rtx));
11171 }
11172 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11173 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11174 NULL_RTX, false);
11175 add_function_usage_to (call_insn, call_fusage);
11176
11177 /* In order to make call/return prediction work right, we now need
11178 to execute a return instruction. See
11179 libgcc/config/i386/morestack.S for the details on how this works.
11180
11181 For flow purposes gcc must not see this as a return
11182 instruction--we need control flow to continue at the subsequent
11183 label. Therefore, we use an unspec. */
11184 gcc_assert (crtl->args.pops_args < 65536);
11185 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11186
11187 /* If we are in 64-bit mode and this function uses a static chain,
11188 we saved %r10 in %rax before calling __morestack. */
11189 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11190 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11191 gen_rtx_REG (Pmode, AX_REG));
11192
11193 /* If this function calls va_start, we need to store a pointer to
11194 the arguments on the old stack, because they may not have been
11195 all copied to the new stack. At this point the old stack can be
11196 found at the frame pointer value used by __morestack, because
11197 __morestack has set that up before calling back to us. Here we
11198 store that pointer in a scratch register, and in
11199 ix86_expand_prologue we store the scratch register in a stack
11200 slot. */
11201 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11202 {
11203 unsigned int scratch_regno;
11204 rtx frame_reg;
11205 int words;
11206
11207 scratch_regno = split_stack_prologue_scratch_regno ();
11208 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11209 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11210
11211 /* 64-bit:
11212 fp -> old fp value
11213 return address within this function
11214 return address of caller of this function
11215 stack arguments
11216 So we add three words to get to the stack arguments.
11217
11218 32-bit:
11219 fp -> old fp value
11220 return address within this function
11221 first argument to __morestack
11222 second argument to __morestack
11223 return address of caller of this function
11224 stack arguments
11225 So we add five words to get to the stack arguments.
11226 */
11227 words = TARGET_64BIT ? 3 : 5;
11228 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11229 gen_rtx_PLUS (Pmode, frame_reg,
11230 GEN_INT (words * UNITS_PER_WORD))));
11231
11232 varargs_label = gen_label_rtx ();
11233 emit_jump_insn (gen_jump (varargs_label));
11234 JUMP_LABEL (get_last_insn ()) = varargs_label;
11235
11236 emit_barrier ();
11237 }
11238
11239 emit_label (label);
11240 LABEL_NUSES (label) = 1;
11241
11242 /* If this function calls va_start, we now have to set the scratch
11243 register for the case where we do not call __morestack. In this
11244 case we need to set it based on the stack pointer. */
11245 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11246 {
11247 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11248 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11249 GEN_INT (UNITS_PER_WORD))));
11250
11251 emit_label (varargs_label);
11252 LABEL_NUSES (varargs_label) = 1;
11253 }
11254 }
11255
11256 /* We may have to tell the dataflow pass that the split stack prologue
11257 is initializing a scratch register. */
11258
11259 static void
11260 ix86_live_on_entry (bitmap regs)
11261 {
11262 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11263 {
11264 gcc_assert (flag_split_stack);
11265 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11266 }
11267 }
11268 \f
11269 /* Determine if OP is a suitable SUBREG RTX for an address. */
11270
11271 static bool
11272 ix86_address_subreg_operand (rtx op)
11273 {
11274 enum machine_mode mode;
11275
11276 if (!REG_P (op))
11277 return false;
11278
11279 mode = GET_MODE (op);
11280
11281 if (GET_MODE_CLASS (mode) != MODE_INT)
11282 return false;
11283
11284 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11285 failures when the register is one word out of a two word structure. */
11286 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11287 return false;
11288
11289 /* Allow only SUBREGs of non-eliminable hard registers. */
11290 return register_no_elim_operand (op, mode);
11291 }
11292
11293 /* Extract the parts of an RTL expression that is a valid memory address
11294 for an instruction. Return 0 if the structure of the address is
11295 grossly off. Return -1 if the address contains ASHIFT, so it is not
11296 strictly valid, but still used for computing the length of the lea instruction. */
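/* Illustrative example (not taken from actual compiler output): the address
(plus (plus (mult (reg %eax) (const_int 4)) (reg %ebx)) (const_int 8)),
i.e. 8(%ebx,%eax,4), decomposes into base = %ebx, index = %eax, scale = 4,
disp = (const_int 8), seg = SEG_DEFAULT, and the function returns 1. */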
11297
11298 int
11299 ix86_decompose_address (rtx addr, struct ix86_address *out)
11300 {
11301 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11302 rtx base_reg, index_reg;
11303 HOST_WIDE_INT scale = 1;
11304 rtx scale_rtx = NULL_RTX;
11305 rtx tmp;
11306 int retval = 1;
11307 enum ix86_address_seg seg = SEG_DEFAULT;
11308
11309 /* Allow zero-extended SImode addresses;
11310 they will be emitted with the addr32 prefix. */
11311 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11312 {
11313 if (GET_CODE (addr) == ZERO_EXTEND
11314 && GET_MODE (XEXP (addr, 0)) == SImode)
11315 addr = XEXP (addr, 0);
11316 else if (GET_CODE (addr) == AND
11317 && const_32bit_mask (XEXP (addr, 1), DImode))
11318 {
11319 addr = XEXP (addr, 0);
11320
11321 /* Strip subreg. */
11322 if (GET_CODE (addr) == SUBREG
11323 && GET_MODE (SUBREG_REG (addr)) == SImode)
11324 addr = SUBREG_REG (addr);
11325 }
11326 }
11327
11328 if (REG_P (addr))
11329 base = addr;
11330 else if (GET_CODE (addr) == SUBREG)
11331 {
11332 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11333 base = addr;
11334 else
11335 return 0;
11336 }
11337 else if (GET_CODE (addr) == PLUS)
11338 {
11339 rtx addends[4], op;
11340 int n = 0, i;
11341
11342 op = addr;
11343 do
11344 {
11345 if (n >= 4)
11346 return 0;
11347 addends[n++] = XEXP (op, 1);
11348 op = XEXP (op, 0);
11349 }
11350 while (GET_CODE (op) == PLUS);
11351 if (n >= 4)
11352 return 0;
11353 addends[n] = op;
11354
11355 for (i = n; i >= 0; --i)
11356 {
11357 op = addends[i];
11358 switch (GET_CODE (op))
11359 {
11360 case MULT:
11361 if (index)
11362 return 0;
11363 index = XEXP (op, 0);
11364 scale_rtx = XEXP (op, 1);
11365 break;
11366
11367 case ASHIFT:
11368 if (index)
11369 return 0;
11370 index = XEXP (op, 0);
11371 tmp = XEXP (op, 1);
11372 if (!CONST_INT_P (tmp))
11373 return 0;
11374 scale = INTVAL (tmp);
11375 if ((unsigned HOST_WIDE_INT) scale > 3)
11376 return 0;
11377 scale = 1 << scale;
11378 break;
11379
11380 case UNSPEC:
11381 if (XINT (op, 1) == UNSPEC_TP
11382 && TARGET_TLS_DIRECT_SEG_REFS
11383 && seg == SEG_DEFAULT)
11384 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11385 else
11386 return 0;
11387 break;
11388
11389 case SUBREG:
11390 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11391 return 0;
11392 /* FALLTHRU */
11393
11394 case REG:
11395 if (!base)
11396 base = op;
11397 else if (!index)
11398 index = op;
11399 else
11400 return 0;
11401 break;
11402
11403 case CONST:
11404 case CONST_INT:
11405 case SYMBOL_REF:
11406 case LABEL_REF:
11407 if (disp)
11408 return 0;
11409 disp = op;
11410 break;
11411
11412 default:
11413 return 0;
11414 }
11415 }
11416 }
11417 else if (GET_CODE (addr) == MULT)
11418 {
11419 index = XEXP (addr, 0); /* index*scale */
11420 scale_rtx = XEXP (addr, 1);
11421 }
11422 else if (GET_CODE (addr) == ASHIFT)
11423 {
11424 /* We're called for lea too, which implements ashift on occasion. */
11425 index = XEXP (addr, 0);
11426 tmp = XEXP (addr, 1);
11427 if (!CONST_INT_P (tmp))
11428 return 0;
11429 scale = INTVAL (tmp);
11430 if ((unsigned HOST_WIDE_INT) scale > 3)
11431 return 0;
11432 scale = 1 << scale;
11433 retval = -1;
11434 }
11435 else
11436 disp = addr; /* displacement */
11437
11438 if (index)
11439 {
11440 if (REG_P (index))
11441 ;
11442 else if (GET_CODE (index) == SUBREG
11443 && ix86_address_subreg_operand (SUBREG_REG (index)))
11444 ;
11445 else
11446 return 0;
11447 }
11448
11449 /* Extract the integral value of scale. */
11450 if (scale_rtx)
11451 {
11452 if (!CONST_INT_P (scale_rtx))
11453 return 0;
11454 scale = INTVAL (scale_rtx);
11455 }
11456
11457 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11458 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11459
11460 /* Avoid useless 0 displacement. */
11461 if (disp == const0_rtx && (base || index))
11462 disp = NULL_RTX;
11463
11464 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11465 if (base_reg && index_reg && scale == 1
11466 && (index_reg == arg_pointer_rtx
11467 || index_reg == frame_pointer_rtx
11468 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11469 {
11470 rtx tmp;
11471 tmp = base, base = index, index = tmp;
11472 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11473 }
11474
11475 /* Special case: %ebp cannot be encoded as a base without a displacement.
11476 Similarly %r13. */
11477 if (!disp
11478 && base_reg
11479 && (base_reg == hard_frame_pointer_rtx
11480 || base_reg == frame_pointer_rtx
11481 || base_reg == arg_pointer_rtx
11482 || (REG_P (base_reg)
11483 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11484 || REGNO (base_reg) == R13_REG))))
11485 disp = const0_rtx;
11486
11487 /* Special case: on the K6, [%esi] forces the instruction to be vector decoded.
11488 Avoid this by transforming to [%esi+0].
11489 Reload calls address legitimization without cfun defined, so we need
11490 to test cfun for being non-NULL. */
11491 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11492 && base_reg && !index_reg && !disp
11493 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11494 disp = const0_rtx;
11495
11496 /* Special case: encode reg+reg instead of reg*2. */
11497 if (!base && index && scale == 2)
11498 base = index, base_reg = index_reg, scale = 1;
11499
11500 /* Special case: scaling cannot be encoded without base or displacement. */
11501 if (!base && !disp && index && scale != 1)
11502 disp = const0_rtx;
11503
11504 out->base = base;
11505 out->index = index;
11506 out->disp = disp;
11507 out->scale = scale;
11508 out->seg = seg;
11509
11510 return retval;
11511 }
11512 \f
11513 /* Return cost of the memory address x.
11514 For i386, it is better to use a complex address than to let gcc copy
11515 the address into a reg and make a new pseudo. But not if the address
11516 requires two regs - that would mean more pseudos with longer
11517 lifetimes. */
11518 static int
11519 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11520 {
11521 struct ix86_address parts;
11522 int cost = 1;
11523 int ok = ix86_decompose_address (x, &parts);
11524
11525 gcc_assert (ok);
11526
11527 if (parts.base && GET_CODE (parts.base) == SUBREG)
11528 parts.base = SUBREG_REG (parts.base);
11529 if (parts.index && GET_CODE (parts.index) == SUBREG)
11530 parts.index = SUBREG_REG (parts.index);
11531
11532 /* Attempt to minimize number of registers in the address. */
11533 if ((parts.base
11534 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11535 || (parts.index
11536 && (!REG_P (parts.index)
11537 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11538 cost++;
11539
11540 if (parts.base
11541 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11542 && parts.index
11543 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11544 && parts.base != parts.index)
11545 cost++;
11546
11547 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11548 since its predecode logic can't detect the length of such instructions
11549 and decoding degenerates to the vector decoder. Increase the cost of such
11550 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11551 to split such addresses or even to refuse them altogether.
11552
11553 The following addressing modes are affected:
11554 [base+scale*index]
11555 [scale*index+disp]
11556 [base+index]
11557
11558 The first and last case may be avoidable by explicitly coding a zero
11559 displacement into the memory address, but I don't have an AMD-K6 machine
11560 handy to check this theory. */
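/* For instance (illustrative only): "movl (%ebx,%ecx,2), %eax" uses the
penalized [base+scale*index] form, while "movl 4(%ebx,%ecx,2), %eax"
carries a displacement and so is not penalized by the test below. */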
11561
11562 if (TARGET_K6
11563 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11564 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11565 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11566 cost += 10;
11567
11568 return cost;
11569 }
11570 \f
11571 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11572 this is used to form addresses to local data when -fPIC is in
11573 use. */
11574
11575 static bool
11576 darwin_local_data_pic (rtx disp)
11577 {
11578 return (GET_CODE (disp) == UNSPEC
11579 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11580 }
11581
11582 /* Determine if a given RTX is a valid constant. We already know this
11583 satisfies CONSTANT_P. */
11584
11585 static bool
11586 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11587 {
11588 switch (GET_CODE (x))
11589 {
11590 case CONST:
11591 x = XEXP (x, 0);
11592
11593 if (GET_CODE (x) == PLUS)
11594 {
11595 if (!CONST_INT_P (XEXP (x, 1)))
11596 return false;
11597 x = XEXP (x, 0);
11598 }
11599
11600 if (TARGET_MACHO && darwin_local_data_pic (x))
11601 return true;
11602
11603 /* Only some unspecs are valid as "constants". */
11604 if (GET_CODE (x) == UNSPEC)
11605 switch (XINT (x, 1))
11606 {
11607 case UNSPEC_GOT:
11608 case UNSPEC_GOTOFF:
11609 case UNSPEC_PLTOFF:
11610 return TARGET_64BIT;
11611 case UNSPEC_TPOFF:
11612 case UNSPEC_NTPOFF:
11613 x = XVECEXP (x, 0, 0);
11614 return (GET_CODE (x) == SYMBOL_REF
11615 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11616 case UNSPEC_DTPOFF:
11617 x = XVECEXP (x, 0, 0);
11618 return (GET_CODE (x) == SYMBOL_REF
11619 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11620 default:
11621 return false;
11622 }
11623
11624 /* We must have drilled down to a symbol. */
11625 if (GET_CODE (x) == LABEL_REF)
11626 return true;
11627 if (GET_CODE (x) != SYMBOL_REF)
11628 return false;
11629 /* FALLTHRU */
11630
11631 case SYMBOL_REF:
11632 /* TLS symbols are never valid. */
11633 if (SYMBOL_REF_TLS_MODEL (x))
11634 return false;
11635
11636 /* DLLIMPORT symbols are never valid. */
11637 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11638 && SYMBOL_REF_DLLIMPORT_P (x))
11639 return false;
11640
11641 #if TARGET_MACHO
11642 /* mdynamic-no-pic */
11643 if (MACHO_DYNAMIC_NO_PIC_P)
11644 return machopic_symbol_defined_p (x);
11645 #endif
11646 break;
11647
11648 case CONST_DOUBLE:
11649 if (GET_MODE (x) == TImode
11650 && x != CONST0_RTX (TImode)
11651 && !TARGET_64BIT)
11652 return false;
11653 break;
11654
11655 case CONST_VECTOR:
11656 if (!standard_sse_constant_p (x))
11657 return false;
11658
11659 default:
11660 break;
11661 }
11662
11663 /* Otherwise we handle everything else in the move patterns. */
11664 return true;
11665 }
11666
11667 /* Determine if it's legal to put X into the constant pool. This
11668 is not possible for the address of thread-local symbols, which
11669 is checked above. */
11670
11671 static bool
11672 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11673 {
11674 /* We can always put integral constants and vectors in memory. */
11675 switch (GET_CODE (x))
11676 {
11677 case CONST_INT:
11678 case CONST_DOUBLE:
11679 case CONST_VECTOR:
11680 return false;
11681
11682 default:
11683 break;
11684 }
11685 return !ix86_legitimate_constant_p (mode, x);
11686 }
11687
11688
11689 /* Nonzero if the constant value X is a legitimate general operand
11690 when generating PIC code. It is given that flag_pic is on and
11691 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11692
11693 bool
11694 legitimate_pic_operand_p (rtx x)
11695 {
11696 rtx inner;
11697
11698 switch (GET_CODE (x))
11699 {
11700 case CONST:
11701 inner = XEXP (x, 0);
11702 if (GET_CODE (inner) == PLUS
11703 && CONST_INT_P (XEXP (inner, 1)))
11704 inner = XEXP (inner, 0);
11705
11706 /* Only some unspecs are valid as "constants". */
11707 if (GET_CODE (inner) == UNSPEC)
11708 switch (XINT (inner, 1))
11709 {
11710 case UNSPEC_GOT:
11711 case UNSPEC_GOTOFF:
11712 case UNSPEC_PLTOFF:
11713 return TARGET_64BIT;
11714 case UNSPEC_TPOFF:
11715 x = XVECEXP (inner, 0, 0);
11716 return (GET_CODE (x) == SYMBOL_REF
11717 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11718 case UNSPEC_MACHOPIC_OFFSET:
11719 return legitimate_pic_address_disp_p (x);
11720 default:
11721 return false;
11722 }
11723 /* FALLTHRU */
11724
11725 case SYMBOL_REF:
11726 case LABEL_REF:
11727 return legitimate_pic_address_disp_p (x);
11728
11729 default:
11730 return true;
11731 }
11732 }
11733
11734 /* Determine if a given CONST RTX is a valid memory displacement
11735 in PIC mode. */
11736
11737 bool
11738 legitimate_pic_address_disp_p (rtx disp)
11739 {
11740 bool saw_plus;
11741
11742 /* In 64bit mode we can allow direct addresses of symbols and labels
11743 when they are not dynamic symbols. */
11744 if (TARGET_64BIT)
11745 {
11746 rtx op0 = disp, op1;
11747
11748 switch (GET_CODE (disp))
11749 {
11750 case LABEL_REF:
11751 return true;
11752
11753 case CONST:
11754 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11755 break;
11756 op0 = XEXP (XEXP (disp, 0), 0);
11757 op1 = XEXP (XEXP (disp, 0), 1);
11758 if (!CONST_INT_P (op1)
11759 || INTVAL (op1) >= 16*1024*1024
11760 || INTVAL (op1) < -16*1024*1024)
11761 break;
11762 if (GET_CODE (op0) == LABEL_REF)
11763 return true;
11764 if (GET_CODE (op0) != SYMBOL_REF)
11765 break;
11766 /* FALLTHRU */
11767
11768 case SYMBOL_REF:
11769 /* TLS references should always be enclosed in UNSPEC. */
11770 if (SYMBOL_REF_TLS_MODEL (op0))
11771 return false;
11772 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11773 && ix86_cmodel != CM_LARGE_PIC)
11774 return true;
11775 break;
11776
11777 default:
11778 break;
11779 }
11780 }
11781 if (GET_CODE (disp) != CONST)
11782 return false;
11783 disp = XEXP (disp, 0);
11784
11785 if (TARGET_64BIT)
11786 {
11787 /* It is unsafe to allow PLUS expressions; this would limit the allowed
11788 distance of GOT tables. We should not need these anyway. */
11789 if (GET_CODE (disp) != UNSPEC
11790 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11791 && XINT (disp, 1) != UNSPEC_GOTOFF
11792 && XINT (disp, 1) != UNSPEC_PCREL
11793 && XINT (disp, 1) != UNSPEC_PLTOFF))
11794 return false;
11795
11796 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11797 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11798 return false;
11799 return true;
11800 }
11801
11802 saw_plus = false;
11803 if (GET_CODE (disp) == PLUS)
11804 {
11805 if (!CONST_INT_P (XEXP (disp, 1)))
11806 return false;
11807 disp = XEXP (disp, 0);
11808 saw_plus = true;
11809 }
11810
11811 if (TARGET_MACHO && darwin_local_data_pic (disp))
11812 return true;
11813
11814 if (GET_CODE (disp) != UNSPEC)
11815 return false;
11816
11817 switch (XINT (disp, 1))
11818 {
11819 case UNSPEC_GOT:
11820 if (saw_plus)
11821 return false;
11822 /* We need to check for both symbols and labels because VxWorks loads
11823 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11824 details. */
11825 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11826 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11827 case UNSPEC_GOTOFF:
11828 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11829 While the ABI also specifies a 32bit relocation, we don't produce it in
11830 the small PIC model at all. */
11831 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11832 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11833 && !TARGET_64BIT)
11834 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11835 return false;
11836 case UNSPEC_GOTTPOFF:
11837 case UNSPEC_GOTNTPOFF:
11838 case UNSPEC_INDNTPOFF:
11839 if (saw_plus)
11840 return false;
11841 disp = XVECEXP (disp, 0, 0);
11842 return (GET_CODE (disp) == SYMBOL_REF
11843 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11844 case UNSPEC_NTPOFF:
11845 disp = XVECEXP (disp, 0, 0);
11846 return (GET_CODE (disp) == SYMBOL_REF
11847 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11848 case UNSPEC_DTPOFF:
11849 disp = XVECEXP (disp, 0, 0);
11850 return (GET_CODE (disp) == SYMBOL_REF
11851 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11852 }
11853
11854 return false;
11855 }
11856
11857 /* Recognizes RTL expressions that are valid memory addresses for an
11858 instruction. The MODE argument is the machine mode for the MEM
11859 expression that wants to use this address.
11860
11861 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11862 convert common non-canonical forms to canonical form so that they will
11863 be recognized. */
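/* (Background, stated informally: a canonical x86 address has the shape
base + index * {1, 2, 4, 8} + displacement, optionally with a segment
override; the checks below enforce register classes, agreement between
the base and index modes, and the PIC displacement rules on top of that.) */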
11864
11865 static bool
11866 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11867 rtx addr, bool strict)
11868 {
11869 struct ix86_address parts;
11870 rtx base, index, disp;
11871 HOST_WIDE_INT scale;
11872
11873 if (ix86_decompose_address (addr, &parts) <= 0)
11874 /* Decomposition failed. */
11875 return false;
11876
11877 base = parts.base;
11878 index = parts.index;
11879 disp = parts.disp;
11880 scale = parts.scale;
11881
11882 /* Validate base register. */
11883 if (base)
11884 {
11885 rtx reg;
11886
11887 if (REG_P (base))
11888 reg = base;
11889 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11890 reg = SUBREG_REG (base);
11891 else
11892 /* Base is not a register. */
11893 return false;
11894
11895 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11896 return false;
11897
11898 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11899 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11900 /* Base is not valid. */
11901 return false;
11902 }
11903
11904 /* Validate index register. */
11905 if (index)
11906 {
11907 rtx reg;
11908
11909 if (REG_P (index))
11910 reg = index;
11911 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
11912 reg = SUBREG_REG (index);
11913 else
11914 /* Index is not a register. */
11915 return false;
11916
11917 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
11918 return false;
11919
11920 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11921 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11922 /* Index is not valid. */
11923 return false;
11924 }
11925
11926 /* Index and base should have the same mode. */
11927 if (base && index
11928 && GET_MODE (base) != GET_MODE (index))
11929 return false;
11930
11931 /* Validate scale factor. */
11932 if (scale != 1)
11933 {
11934 if (!index)
11935 /* Scale without index. */
11936 return false;
11937
11938 if (scale != 2 && scale != 4 && scale != 8)
11939 /* Scale is not a valid multiplier. */
11940 return false;
11941 }
11942
11943 /* Validate displacement. */
11944 if (disp)
11945 {
11946 if (GET_CODE (disp) == CONST
11947 && GET_CODE (XEXP (disp, 0)) == UNSPEC
11948 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
11949 switch (XINT (XEXP (disp, 0), 1))
11950 {
11951 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
11952 used. While the ABI also specifies 32bit relocations, we don't produce
11953 them at all and use IP-relative addressing instead. */
11954 case UNSPEC_GOT:
11955 case UNSPEC_GOTOFF:
11956 gcc_assert (flag_pic);
11957 if (!TARGET_64BIT)
11958 goto is_legitimate_pic;
11959
11960 /* 64bit address unspec. */
11961 return false;
11962
11963 case UNSPEC_GOTPCREL:
11964 case UNSPEC_PCREL:
11965 gcc_assert (flag_pic);
11966 goto is_legitimate_pic;
11967
11968 case UNSPEC_GOTTPOFF:
11969 case UNSPEC_GOTNTPOFF:
11970 case UNSPEC_INDNTPOFF:
11971 case UNSPEC_NTPOFF:
11972 case UNSPEC_DTPOFF:
11973 break;
11974
11975 case UNSPEC_STACK_CHECK:
11976 gcc_assert (flag_split_stack);
11977 break;
11978
11979 default:
11980 /* Invalid address unspec. */
11981 return false;
11982 }
11983
11984 else if (SYMBOLIC_CONST (disp)
11985 && (flag_pic
11986 || (TARGET_MACHO
11987 #if TARGET_MACHO
11988 && MACHOPIC_INDIRECT
11989 && !machopic_operand_p (disp)
11990 #endif
11991 )))
11992 {
11993
11994 is_legitimate_pic:
11995 if (TARGET_64BIT && (index || base))
11996 {
11997 /* foo@dtpoff(%rX) is ok. */
11998 if (GET_CODE (disp) != CONST
11999 || GET_CODE (XEXP (disp, 0)) != PLUS
12000 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12001 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12002 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12003 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12004 /* Non-constant pic memory reference. */
12005 return false;
12006 }
12007 else if ((!TARGET_MACHO || flag_pic)
12008 && ! legitimate_pic_address_disp_p (disp))
12009 /* Displacement is an invalid pic construct. */
12010 return false;
12011 #if TARGET_MACHO
12012 else if (MACHO_DYNAMIC_NO_PIC_P
12013 && !ix86_legitimate_constant_p (Pmode, disp))
12014 /* displacement must be referenced via non_lazy_pointer */
12015 return false;
12016 #endif
12017
12018 /* This code used to verify that a symbolic pic displacement
12019 includes the pic_offset_table_rtx register.
12020
12021 While this is a good idea, unfortunately these constructs may
12022 be created by the "adds using lea" optimization for incorrect
12023 code like:
12024
12025 int a;
12026 int foo(int i)
12027 {
12028 return *(&a+i);
12029 }
12030
12031 This code is nonsensical, but results in addressing the
12032 GOT table with a pic_offset_table_rtx base. We can't
12033 just refuse it easily, since it gets matched by the
12034 "addsi3" pattern, which later gets split to lea when the
12035 output register differs from the input. While this
12036 could be handled by a separate addsi pattern for this case
12037 that never results in lea, disabling this test seems to be
12038 the easier and correct fix for the crash. */
12039 }
12040 else if (GET_CODE (disp) != LABEL_REF
12041 && !CONST_INT_P (disp)
12042 && (GET_CODE (disp) != CONST
12043 || !ix86_legitimate_constant_p (Pmode, disp))
12044 && (GET_CODE (disp) != SYMBOL_REF
12045 || !ix86_legitimate_constant_p (Pmode, disp)))
12046 /* Displacement is not constant. */
12047 return false;
12048 else if (TARGET_64BIT
12049 && !x86_64_immediate_operand (disp, VOIDmode))
12050 /* Displacement is out of range. */
12051 return false;
12052 }
12053
12054 /* Everything looks valid. */
12055 return true;
12056 }
12057
12058 /* Determine if a given RTX is a valid constant address. */
12059
12060 bool
12061 constant_address_p (rtx x)
12062 {
12063 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12064 }
12065 \f
12066 /* Return a unique alias set for the GOT. */
12067
12068 static alias_set_type
12069 ix86_GOT_alias_set (void)
12070 {
12071 static alias_set_type set = -1;
12072 if (set == -1)
12073 set = new_alias_set ();
12074 return set;
12075 }
12076
12077 /* Return a legitimate reference for ORIG (an address) using the
12078 register REG. If REG is 0, a new pseudo is generated.
12079
12080 There are two types of references that must be handled:
12081
12082 1. Global data references must load the address from the GOT, via
12083 the PIC reg. An insn is emitted to do this load, and the reg is
12084 returned.
12085
12086 2. Static data references, constant pool addresses, and code labels
12087 compute the address as an offset from the GOT, whose base is in
12088 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12089 differentiate them from global data objects. The returned
12090 address is the PIC reg + an unspec constant.
12091
12092 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12093 reg also appears in the address. */
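/* For instance (illustrative, ia32 with -fPIC): a local symbol "foo" is
addressed as %ebx + foo@GOTOFF (case 2 above), while a global symbol is
loaded from the GOT as a memory reference of the form foo@GOT(%ebx)
(case 1 above). */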
12094
12095 static rtx
12096 legitimize_pic_address (rtx orig, rtx reg)
12097 {
12098 rtx addr = orig;
12099 rtx new_rtx = orig;
12100 rtx base;
12101
12102 #if TARGET_MACHO
12103 if (TARGET_MACHO && !TARGET_64BIT)
12104 {
12105 if (reg == 0)
12106 reg = gen_reg_rtx (Pmode);
12107 /* Use the generic Mach-O PIC machinery. */
12108 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12109 }
12110 #endif
12111
12112 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12113 new_rtx = addr;
12114 else if (TARGET_64BIT
12115 && ix86_cmodel != CM_SMALL_PIC
12116 && gotoff_operand (addr, Pmode))
12117 {
12118 rtx tmpreg;
12119 /* This symbol may be referenced via a displacement from the PIC
12120 base address (@GOTOFF). */
12121
12122 if (reload_in_progress)
12123 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12124 if (GET_CODE (addr) == CONST)
12125 addr = XEXP (addr, 0);
12126 if (GET_CODE (addr) == PLUS)
12127 {
12128 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12129 UNSPEC_GOTOFF);
12130 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12131 }
12132 else
12133 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12134 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12135 if (!reg)
12136 tmpreg = gen_reg_rtx (Pmode);
12137 else
12138 tmpreg = reg;
12139 emit_move_insn (tmpreg, new_rtx);
12140
12141 if (reg != 0)
12142 {
12143 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12144 tmpreg, 1, OPTAB_DIRECT);
12145 new_rtx = reg;
12146 }
12147 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12148 }
12149 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12150 {
12151 /* This symbol may be referenced via a displacement from the PIC
12152 base address (@GOTOFF). */
12153
12154 if (reload_in_progress)
12155 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12156 if (GET_CODE (addr) == CONST)
12157 addr = XEXP (addr, 0);
12158 if (GET_CODE (addr) == PLUS)
12159 {
12160 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12161 UNSPEC_GOTOFF);
12162 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12163 }
12164 else
12165 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12166 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12167 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12168
12169 if (reg != 0)
12170 {
12171 emit_move_insn (reg, new_rtx);
12172 new_rtx = reg;
12173 }
12174 }
12175 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12176 /* We can't use @GOTOFF for text labels on VxWorks;
12177 see gotoff_operand. */
12178 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12179 {
12180 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12181 {
12182 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12183 return legitimize_dllimport_symbol (addr, true);
12184 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12185 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12186 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12187 {
12188 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12189 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12190 }
12191 }
12192
12193 /* For x64 PE-COFF there is no GOT table, so we use the address
12194 directly. */
12195 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12196 {
12197 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12198 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12199
12200 if (reg == 0)
12201 reg = gen_reg_rtx (Pmode);
12202 emit_move_insn (reg, new_rtx);
12203 new_rtx = reg;
12204 }
12205 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12206 {
12207 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12208 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12209 new_rtx = gen_const_mem (Pmode, new_rtx);
12210 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12211
12212 if (reg == 0)
12213 reg = gen_reg_rtx (Pmode);
12214 /* Use gen_movsi directly, otherwise the address is loaded
12215 into a register for CSE. We don't want to CSE these addresses;
12216 instead we CSE addresses from the GOT table, so skip this. */
12217 emit_insn (gen_movsi (reg, new_rtx));
12218 new_rtx = reg;
12219 }
12220 else
12221 {
12222 /* This symbol must be referenced via a load from the
12223 Global Offset Table (@GOT). */
12224
12225 if (reload_in_progress)
12226 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12227 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12228 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12229 if (TARGET_64BIT)
12230 new_rtx = force_reg (Pmode, new_rtx);
12231 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12232 new_rtx = gen_const_mem (Pmode, new_rtx);
12233 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12234
12235 if (reg == 0)
12236 reg = gen_reg_rtx (Pmode);
12237 emit_move_insn (reg, new_rtx);
12238 new_rtx = reg;
12239 }
12240 }
12241 else
12242 {
12243 if (CONST_INT_P (addr)
12244 && !x86_64_immediate_operand (addr, VOIDmode))
12245 {
12246 if (reg)
12247 {
12248 emit_move_insn (reg, addr);
12249 new_rtx = reg;
12250 }
12251 else
12252 new_rtx = force_reg (Pmode, addr);
12253 }
12254 else if (GET_CODE (addr) == CONST)
12255 {
12256 addr = XEXP (addr, 0);
12257
12258 /* We must match addresses we generated earlier. Assume the only
12259 unspecs that can get here are ours; not that we could do
12260 anything with them anyway. */
12261 if (GET_CODE (addr) == UNSPEC
12262 || (GET_CODE (addr) == PLUS
12263 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12264 return orig;
12265 gcc_assert (GET_CODE (addr) == PLUS);
12266 }
12267 if (GET_CODE (addr) == PLUS)
12268 {
12269 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12270
12271 /* Check first to see if this is a constant offset from a @GOTOFF
12272 symbol reference. */
12273 if (gotoff_operand (op0, Pmode)
12274 && CONST_INT_P (op1))
12275 {
12276 if (!TARGET_64BIT)
12277 {
12278 if (reload_in_progress)
12279 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12280 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12281 UNSPEC_GOTOFF);
12282 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12283 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12284 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12285
12286 if (reg != 0)
12287 {
12288 emit_move_insn (reg, new_rtx);
12289 new_rtx = reg;
12290 }
12291 }
12292 else
12293 {
12294 if (INTVAL (op1) < -16*1024*1024
12295 || INTVAL (op1) >= 16*1024*1024)
12296 {
12297 if (!x86_64_immediate_operand (op1, Pmode))
12298 op1 = force_reg (Pmode, op1);
12299 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12300 }
12301 }
12302 }
12303 else
12304 {
12305 base = legitimize_pic_address (XEXP (addr, 0), reg);
12306 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12307 base == reg ? NULL_RTX : reg);
12308
12309 if (CONST_INT_P (new_rtx))
12310 new_rtx = plus_constant (base, INTVAL (new_rtx));
12311 else
12312 {
12313 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12314 {
12315 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12316 new_rtx = XEXP (new_rtx, 1);
12317 }
12318 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12319 }
12320 }
12321 }
12322 }
12323 return new_rtx;
12324 }
12325 \f
12326 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12327
12328 static rtx
12329 get_thread_pointer (bool to_reg)
12330 {
12331 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12332
12333 if (GET_MODE (tp) != Pmode)
12334 tp = convert_to_mode (Pmode, tp, 1);
12335
12336 if (to_reg)
12337 tp = copy_addr_to_reg (tp);
12338
12339 return tp;
12340 }
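/* Illustrative note (an assumption about the usual GNU/Linux
   configurations, not something this function guarantees): the
   UNSPEC_TP built above normally expands to a read of the
   thread-pointer segment, i.e. a %fs-relative access in 64-bit mode
   and a %gs-relative access in 32-bit mode, matching the segment
   chosen for the '@' code in ix86_print_operand below.  */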
12341
12342 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12343
12344 static GTY(()) rtx ix86_tls_symbol;
12345
12346 static rtx
12347 ix86_tls_get_addr (void)
12348 {
12349 if (!ix86_tls_symbol)
12350 {
12351 const char *sym
12352 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12353 ? "___tls_get_addr" : "__tls_get_addr");
12354
12355 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12356 }
12357
12358 return ix86_tls_symbol;
12359 }
12360
12361 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12362
12363 static GTY(()) rtx ix86_tls_module_base_symbol;
12364
12365 rtx
12366 ix86_tls_module_base (void)
12367 {
12368 if (!ix86_tls_module_base_symbol)
12369 {
12370 ix86_tls_module_base_symbol
12371 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12372
12373 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12374 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12375 }
12376
12377 return ix86_tls_module_base_symbol;
12378 }
12379
12380 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12381 false if we expect this to be used for a memory address and true if
12382 we expect to load the address into a register. */
12383
12384 static rtx
12385 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12386 {
12387 rtx dest, base, off;
12388 rtx pic = NULL_RTX, tp = NULL_RTX;
12389 int type;
12390
12391 switch (model)
12392 {
12393 case TLS_MODEL_GLOBAL_DYNAMIC:
12394 dest = gen_reg_rtx (Pmode);
12395
12396 if (!TARGET_64BIT)
12397 {
12398 if (flag_pic)
12399 pic = pic_offset_table_rtx;
12400 else
12401 {
12402 pic = gen_reg_rtx (Pmode);
12403 emit_insn (gen_set_got (pic));
12404 }
12405 }
12406
12407 if (TARGET_GNU2_TLS)
12408 {
12409 if (TARGET_64BIT)
12410 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12411 else
12412 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12413
12414 tp = get_thread_pointer (true);
12415 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12416
12417 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12418 }
12419 else
12420 {
12421 rtx caddr = ix86_tls_get_addr ();
12422
12423 if (TARGET_64BIT)
12424 {
12425 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12426
12427 start_sequence ();
12428 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12429 insns = get_insns ();
12430 end_sequence ();
12431
12432 RTL_CONST_CALL_P (insns) = 1;
12433 emit_libcall_block (insns, dest, rax, x);
12434 }
12435 else
12436 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12437 }
12438 break;
12439
12440 case TLS_MODEL_LOCAL_DYNAMIC:
12441 base = gen_reg_rtx (Pmode);
12442
12443 if (!TARGET_64BIT)
12444 {
12445 if (flag_pic)
12446 pic = pic_offset_table_rtx;
12447 else
12448 {
12449 pic = gen_reg_rtx (Pmode);
12450 emit_insn (gen_set_got (pic));
12451 }
12452 }
12453
12454 if (TARGET_GNU2_TLS)
12455 {
12456 rtx tmp = ix86_tls_module_base ();
12457
12458 if (TARGET_64BIT)
12459 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12460 else
12461 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12462
12463 tp = get_thread_pointer (true);
12464 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12465 gen_rtx_MINUS (Pmode, tmp, tp));
12466 }
12467 else
12468 {
12469 rtx caddr = ix86_tls_get_addr ();
12470
12471 if (TARGET_64BIT)
12472 {
12473 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12474
12475 start_sequence ();
12476 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12477 insns = get_insns ();
12478 end_sequence ();
12479
12480 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12481 share the LD_BASE result with other LD model accesses. */
12482 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12483 UNSPEC_TLS_LD_BASE);
12484
12485 RTL_CONST_CALL_P (insns) = 1;
12486 emit_libcall_block (insns, base, rax, eqv);
12487 }
12488 else
12489 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12490 }
12491
12492 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12493 off = gen_rtx_CONST (Pmode, off);
12494
12495 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12496
12497 if (TARGET_GNU2_TLS)
12498 {
12499 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12500
12501 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12502 }
12503 break;
12504
12505 case TLS_MODEL_INITIAL_EXEC:
12506 if (TARGET_64BIT)
12507 {
12508 if (TARGET_SUN_TLS)
12509 {
12510 /* The Sun linker took the AMD64 TLS spec literally
12511 and can only handle %rax as the destination of the
12512 initial-exec code sequence. */
12513
12514 dest = gen_reg_rtx (Pmode);
12515 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12516 return dest;
12517 }
12518
12519 pic = NULL;
12520 type = UNSPEC_GOTNTPOFF;
12521 }
12522 else if (flag_pic)
12523 {
12524 if (reload_in_progress)
12525 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12526 pic = pic_offset_table_rtx;
12527 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12528 }
12529 else if (!TARGET_ANY_GNU_TLS)
12530 {
12531 pic = gen_reg_rtx (Pmode);
12532 emit_insn (gen_set_got (pic));
12533 type = UNSPEC_GOTTPOFF;
12534 }
12535 else
12536 {
12537 pic = NULL;
12538 type = UNSPEC_INDNTPOFF;
12539 }
12540
12541 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12542 off = gen_rtx_CONST (Pmode, off);
12543 if (pic)
12544 off = gen_rtx_PLUS (Pmode, pic, off);
12545 off = gen_const_mem (Pmode, off);
12546 set_mem_alias_set (off, ix86_GOT_alias_set ());
12547
12548 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12549 {
12550 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12551 off = force_reg (Pmode, off);
12552 return gen_rtx_PLUS (Pmode, base, off);
12553 }
12554 else
12555 {
12556 base = get_thread_pointer (true);
12557 dest = gen_reg_rtx (Pmode);
12558 emit_insn (gen_subsi3 (dest, base, off));
12559 }
12560 break;
12561
12562 case TLS_MODEL_LOCAL_EXEC:
12563 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12564 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12565 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12566 off = gen_rtx_CONST (Pmode, off);
12567
12568 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12569 {
12570 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12571 return gen_rtx_PLUS (Pmode, base, off);
12572 }
12573 else
12574 {
12575 base = get_thread_pointer (true);
12576 dest = gen_reg_rtx (Pmode);
12577 emit_insn (gen_subsi3 (dest, base, off));
12578 }
12579 break;
12580
12581 default:
12582 gcc_unreachable ();
12583 }
12584
12585 return dest;
12586 }
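/* Illustrative sketch of the access sequences the models above aim at
   (assumed typical 64-bit GNU/Linux output; the exact sequences come
   from the i386.md patterns and the psABI, not from this function):
     global dynamic:  leaq x@tlsgd(%rip), %rdi ... call __tls_get_addr@PLT
     local dynamic:   leaq x@tlsld(%rip), %rdi ... call __tls_get_addr@PLT,
                      then x@dtpoff(%rax) for each variable
     initial exec:    movq x@gottpoff(%rip), %rax plus a %fs-relative access
     local exec:      movq %fs:x@tpoff, %rax
   The 32-bit variants use %gs and go through the PIC register.  */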
12587
12588 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12589 to symbol DECL. */
12590
12591 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12592 htab_t dllimport_map;
12593
12594 static tree
12595 get_dllimport_decl (tree decl)
12596 {
12597 struct tree_map *h, in;
12598 void **loc;
12599 const char *name;
12600 const char *prefix;
12601 size_t namelen, prefixlen;
12602 char *imp_name;
12603 tree to;
12604 rtx rtl;
12605
12606 if (!dllimport_map)
12607 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12608
12609 in.hash = htab_hash_pointer (decl);
12610 in.base.from = decl;
12611 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12612 h = (struct tree_map *) *loc;
12613 if (h)
12614 return h->to;
12615
12616 *loc = h = ggc_alloc_tree_map ();
12617 h->hash = in.hash;
12618 h->base.from = decl;
12619 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12620 VAR_DECL, NULL, ptr_type_node);
12621 DECL_ARTIFICIAL (to) = 1;
12622 DECL_IGNORED_P (to) = 1;
12623 DECL_EXTERNAL (to) = 1;
12624 TREE_READONLY (to) = 1;
12625
12626 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12627 name = targetm.strip_name_encoding (name);
12628 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12629 ? "*__imp_" : "*__imp__";
12630 namelen = strlen (name);
12631 prefixlen = strlen (prefix);
12632 imp_name = (char *) alloca (namelen + prefixlen + 1);
12633 memcpy (imp_name, prefix, prefixlen);
12634 memcpy (imp_name + prefixlen, name, namelen + 1);
12635
12636 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12637 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12638 SET_SYMBOL_REF_DECL (rtl, to);
12639 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12640
12641 rtl = gen_const_mem (Pmode, rtl);
12642 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12643
12644 SET_DECL_RTL (to, rtl);
12645 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12646
12647 return to;
12648 }
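/* Example of the mapping above (assuming a 32-bit mingw target where
   user_label_prefix is "_"; that target choice is an assumption of
   this note, not of the code): a dllimport reference to "foo" gets a
   companion VAR_DECL whose RTL is a read-only memory reference to the
   symbol "*__imp__foo"; the leading '*' keeps the assembler name from
   being prefixed again.  */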
12649
12650 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12651 true if we require the result be a register. */
12652
12653 static rtx
12654 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12655 {
12656 tree imp_decl;
12657 rtx x;
12658
12659 gcc_assert (SYMBOL_REF_DECL (symbol));
12660 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12661
12662 x = DECL_RTL (imp_decl);
12663 if (want_reg)
12664 x = force_reg (Pmode, x);
12665 return x;
12666 }
12667
12668 /* Try machine-dependent ways of modifying an illegitimate address
12669 to be legitimate. If we find one, return the new, valid address.
12670 This function is used in only one place: `memory_address' in explow.c.
12671
12672 OLDX is the address as it was before break_out_memory_refs was called.
12673 In some cases it is useful to look at this to decide what needs to be done.
12674
12675 It is always safe for this function to do nothing. It exists to recognize
12676 opportunities to optimize the output.
12677
12678 For the 80386, we handle X+REG by loading X into a register R and
12679 using R+REG. R will go in a general reg and indexing will be used.
12680 However, if REG is a broken-out memory address or multiplication,
12681 nothing needs to be done because REG can certainly go in a general reg.
12682
12683 When -fpic is used, special handling is needed for symbolic references.
12684 See comments by legitimize_pic_address in i386.c for details. */
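/* Example of the canonicalization performed below (illustrative RTL
   only): (plus (ashift (reg R) (const_int 2)) (reg B)) is rewritten
   as (plus (mult (reg R') (const_int 4)) (reg B)), where R' is R
   forced into a register, which matches the base + index*scale form
   that ix86_legitimate_address_p accepts.  */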
12685
12686 static rtx
12687 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12688 enum machine_mode mode)
12689 {
12690 int changed = 0;
12691 unsigned log;
12692
12693 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12694 if (log)
12695 return legitimize_tls_address (x, (enum tls_model) log, false);
12696 if (GET_CODE (x) == CONST
12697 && GET_CODE (XEXP (x, 0)) == PLUS
12698 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12699 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12700 {
12701 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12702 (enum tls_model) log, false);
12703 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12704 }
12705
12706 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12707 {
12708 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12709 return legitimize_dllimport_symbol (x, true);
12710 if (GET_CODE (x) == CONST
12711 && GET_CODE (XEXP (x, 0)) == PLUS
12712 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12713 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12714 {
12715 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12716 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12717 }
12718 }
12719
12720 if (flag_pic && SYMBOLIC_CONST (x))
12721 return legitimize_pic_address (x, 0);
12722
12723 #if TARGET_MACHO
12724 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12725 return machopic_indirect_data_reference (x, 0);
12726 #endif
12727
12728 /* Canonicalize shifts by 0, 1, 2, 3 into a multiply. */
12729 if (GET_CODE (x) == ASHIFT
12730 && CONST_INT_P (XEXP (x, 1))
12731 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12732 {
12733 changed = 1;
12734 log = INTVAL (XEXP (x, 1));
12735 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12736 GEN_INT (1 << log));
12737 }
12738
12739 if (GET_CODE (x) == PLUS)
12740 {
12741 /* Canonicalize shifts by 0, 1, 2, 3 into a multiply. */
12742
12743 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12744 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12745 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12746 {
12747 changed = 1;
12748 log = INTVAL (XEXP (XEXP (x, 0), 1));
12749 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12750 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12751 GEN_INT (1 << log));
12752 }
12753
12754 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12755 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12756 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12757 {
12758 changed = 1;
12759 log = INTVAL (XEXP (XEXP (x, 1), 1));
12760 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12761 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12762 GEN_INT (1 << log));
12763 }
12764
12765 /* Put multiply first if it isn't already. */
12766 if (GET_CODE (XEXP (x, 1)) == MULT)
12767 {
12768 rtx tmp = XEXP (x, 0);
12769 XEXP (x, 0) = XEXP (x, 1);
12770 XEXP (x, 1) = tmp;
12771 changed = 1;
12772 }
12773
12774 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12775 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12776 created by virtual register instantiation, register elimination, and
12777 similar optimizations. */
12778 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12779 {
12780 changed = 1;
12781 x = gen_rtx_PLUS (Pmode,
12782 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12783 XEXP (XEXP (x, 1), 0)),
12784 XEXP (XEXP (x, 1), 1));
12785 }
12786
12787 /* Canonicalize
12788 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12789 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12790 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12791 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12792 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12793 && CONSTANT_P (XEXP (x, 1)))
12794 {
12795 rtx constant;
12796 rtx other = NULL_RTX;
12797
12798 if (CONST_INT_P (XEXP (x, 1)))
12799 {
12800 constant = XEXP (x, 1);
12801 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12802 }
12803 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12804 {
12805 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12806 other = XEXP (x, 1);
12807 }
12808 else
12809 constant = 0;
12810
12811 if (constant)
12812 {
12813 changed = 1;
12814 x = gen_rtx_PLUS (Pmode,
12815 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12816 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12817 plus_constant (other, INTVAL (constant)));
12818 }
12819 }
12820
12821 if (changed && ix86_legitimate_address_p (mode, x, false))
12822 return x;
12823
12824 if (GET_CODE (XEXP (x, 0)) == MULT)
12825 {
12826 changed = 1;
12827 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12828 }
12829
12830 if (GET_CODE (XEXP (x, 1)) == MULT)
12831 {
12832 changed = 1;
12833 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12834 }
12835
12836 if (changed
12837 && REG_P (XEXP (x, 1))
12838 && REG_P (XEXP (x, 0)))
12839 return x;
12840
12841 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12842 {
12843 changed = 1;
12844 x = legitimize_pic_address (x, 0);
12845 }
12846
12847 if (changed && ix86_legitimate_address_p (mode, x, false))
12848 return x;
12849
12850 if (REG_P (XEXP (x, 0)))
12851 {
12852 rtx temp = gen_reg_rtx (Pmode);
12853 rtx val = force_operand (XEXP (x, 1), temp);
12854 if (val != temp)
12855 {
12856 if (GET_MODE (val) != Pmode)
12857 val = convert_to_mode (Pmode, val, 1);
12858 emit_move_insn (temp, val);
12859 }
12860
12861 XEXP (x, 1) = temp;
12862 return x;
12863 }
12864
12865 else if (REG_P (XEXP (x, 1)))
12866 {
12867 rtx temp = gen_reg_rtx (Pmode);
12868 rtx val = force_operand (XEXP (x, 0), temp);
12869 if (val != temp)
12870 {
12871 if (GET_MODE (val) != Pmode)
12872 val = convert_to_mode (Pmode, val, 1);
12873 emit_move_insn (temp, val);
12874 }
12875
12876 XEXP (x, 0) = temp;
12877 return x;
12878 }
12879 }
12880
12881 return x;
12882 }
12883 \f
12884 /* Print an integer constant expression in assembler syntax. Addition
12885 and subtraction are the only arithmetic that may appear in these
12886 expressions. FILE is the stdio stream to write to, X is the rtx, and
12887 CODE is the operand print code from the output string. */
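/* For instance (derived from the cases below): a SYMBOL_REF wrapped
   in an UNSPEC_GOTOFF prints as "foo@GOTOFF", an UNSPEC_GOTPCREL
   prints as "foo@GOTPCREL(%rip)" in AT&T syntax, and a non-local
   symbol printed with operand code 'P' gets an "@PLT" suffix.  */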
12888
12889 static void
12890 output_pic_addr_const (FILE *file, rtx x, int code)
12891 {
12892 char buf[256];
12893
12894 switch (GET_CODE (x))
12895 {
12896 case PC:
12897 gcc_assert (flag_pic);
12898 putc ('.', file);
12899 break;
12900
12901 case SYMBOL_REF:
12902 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12903 output_addr_const (file, x);
12904 else
12905 {
12906 const char *name = XSTR (x, 0);
12907
12908 /* Mark the decl as referenced so that cgraph will
12909 output the function. */
12910 if (SYMBOL_REF_DECL (x))
12911 mark_decl_referenced (SYMBOL_REF_DECL (x));
12912
12913 #if TARGET_MACHO
12914 if (MACHOPIC_INDIRECT
12915 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12916 name = machopic_indirection_name (x, /*stub_p=*/true);
12917 #endif
12918 assemble_name (file, name);
12919 }
12920 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12921 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12922 fputs ("@PLT", file);
12923 break;
12924
12925 case LABEL_REF:
12926 x = XEXP (x, 0);
12927 /* FALLTHRU */
12928 case CODE_LABEL:
12929 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
12930 assemble_name (asm_out_file, buf);
12931 break;
12932
12933 case CONST_INT:
12934 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12935 break;
12936
12937 case CONST:
12938 /* This used to output parentheses around the expression,
12939 but that does not work on the 386 (either ATT or BSD assembler). */
12940 output_pic_addr_const (file, XEXP (x, 0), code);
12941 break;
12942
12943 case CONST_DOUBLE:
12944 if (GET_MODE (x) == VOIDmode)
12945 {
12946 /* We can use %d if the number is <32 bits and positive. */
12947 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
12948 fprintf (file, "0x%lx%08lx",
12949 (unsigned long) CONST_DOUBLE_HIGH (x),
12950 (unsigned long) CONST_DOUBLE_LOW (x));
12951 else
12952 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
12953 }
12954 else
12955 /* We can't handle floating point constants;
12956 TARGET_PRINT_OPERAND must handle them. */
12957 output_operand_lossage ("floating constant misused");
12958 break;
12959
12960 case PLUS:
12961 /* Some assemblers need integer constants to appear first. */
12962 if (CONST_INT_P (XEXP (x, 0)))
12963 {
12964 output_pic_addr_const (file, XEXP (x, 0), code);
12965 putc ('+', file);
12966 output_pic_addr_const (file, XEXP (x, 1), code);
12967 }
12968 else
12969 {
12970 gcc_assert (CONST_INT_P (XEXP (x, 1)));
12971 output_pic_addr_const (file, XEXP (x, 1), code);
12972 putc ('+', file);
12973 output_pic_addr_const (file, XEXP (x, 0), code);
12974 }
12975 break;
12976
12977 case MINUS:
12978 if (!TARGET_MACHO)
12979 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
12980 output_pic_addr_const (file, XEXP (x, 0), code);
12981 putc ('-', file);
12982 output_pic_addr_const (file, XEXP (x, 1), code);
12983 if (!TARGET_MACHO)
12984 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
12985 break;
12986
12987 case UNSPEC:
12988 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
12989 {
12990 bool f = i386_asm_output_addr_const_extra (file, x);
12991 gcc_assert (f);
12992 break;
12993 }
12994
12995 gcc_assert (XVECLEN (x, 0) == 1);
12996 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
12997 switch (XINT (x, 1))
12998 {
12999 case UNSPEC_GOT:
13000 fputs ("@GOT", file);
13001 break;
13002 case UNSPEC_GOTOFF:
13003 fputs ("@GOTOFF", file);
13004 break;
13005 case UNSPEC_PLTOFF:
13006 fputs ("@PLTOFF", file);
13007 break;
13008 case UNSPEC_PCREL:
13009 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13010 "(%rip)" : "[rip]", file);
13011 break;
13012 case UNSPEC_GOTPCREL:
13013 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13014 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13015 break;
13016 case UNSPEC_GOTTPOFF:
13017 /* FIXME: This might be @TPOFF in Sun ld too. */
13018 fputs ("@gottpoff", file);
13019 break;
13020 case UNSPEC_TPOFF:
13021 fputs ("@tpoff", file);
13022 break;
13023 case UNSPEC_NTPOFF:
13024 if (TARGET_64BIT)
13025 fputs ("@tpoff", file);
13026 else
13027 fputs ("@ntpoff", file);
13028 break;
13029 case UNSPEC_DTPOFF:
13030 fputs ("@dtpoff", file);
13031 break;
13032 case UNSPEC_GOTNTPOFF:
13033 if (TARGET_64BIT)
13034 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13035 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13036 else
13037 fputs ("@gotntpoff", file);
13038 break;
13039 case UNSPEC_INDNTPOFF:
13040 fputs ("@indntpoff", file);
13041 break;
13042 #if TARGET_MACHO
13043 case UNSPEC_MACHOPIC_OFFSET:
13044 putc ('-', file);
13045 machopic_output_function_base_name (file);
13046 break;
13047 #endif
13048 default:
13049 output_operand_lossage ("invalid UNSPEC as operand");
13050 break;
13051 }
13052 break;
13053
13054 default:
13055 output_operand_lossage ("invalid expression as operand");
13056 }
13057 }
13058
13059 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13060 We need to emit DTP-relative relocations. */
13061
13062 static void ATTRIBUTE_UNUSED
13063 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13064 {
13065 fputs (ASM_LONG, file);
13066 output_addr_const (file, x);
13067 fputs ("@dtpoff", file);
13068 switch (size)
13069 {
13070 case 4:
13071 break;
13072 case 8:
13073 fputs (", 0", file);
13074 break;
13075 default:
13076 gcc_unreachable ();
13077 }
13078 }
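/* For example (assuming ASM_LONG is ".long" on the target at hand), a
   4-byte request emits ".long foo@dtpoff" while an 8-byte request
   emits ".long foo@dtpoff, 0", i.e. the DTP-relative offset padded
   with a zero upper word.  */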
13079
13080 /* Return true if X is a representation of the PIC register. This copes
13081 with calls from ix86_find_base_term, where the register might have
13082 been replaced by a cselib value. */
13083
13084 static bool
13085 ix86_pic_register_p (rtx x)
13086 {
13087 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13088 return (pic_offset_table_rtx
13089 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13090 else
13091 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13092 }
13093
13094 /* Helper function for ix86_delegitimize_address.
13095 Attempt to delegitimize TLS local-exec accesses. */
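/* Illustrative example: an address of the form %fs:x@NTPOFF in 64-bit
   mode (or %gs:... in 32-bit mode), possibly with a base, an index
   with scale and a constant displacement folded in, is rebuilt below
   as the plain SYMBOL_REF x plus those components, so that debug
   output sees the symbol again.  */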
13096
13097 static rtx
13098 ix86_delegitimize_tls_address (rtx orig_x)
13099 {
13100 rtx x = orig_x, unspec;
13101 struct ix86_address addr;
13102
13103 if (!TARGET_TLS_DIRECT_SEG_REFS)
13104 return orig_x;
13105 if (MEM_P (x))
13106 x = XEXP (x, 0);
13107 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13108 return orig_x;
13109 if (ix86_decompose_address (x, &addr) == 0
13110 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13111 || addr.disp == NULL_RTX
13112 || GET_CODE (addr.disp) != CONST)
13113 return orig_x;
13114 unspec = XEXP (addr.disp, 0);
13115 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13116 unspec = XEXP (unspec, 0);
13117 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13118 return orig_x;
13119 x = XVECEXP (unspec, 0, 0);
13120 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13121 if (unspec != XEXP (addr.disp, 0))
13122 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13123 if (addr.index)
13124 {
13125 rtx idx = addr.index;
13126 if (addr.scale != 1)
13127 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13128 x = gen_rtx_PLUS (Pmode, idx, x);
13129 }
13130 if (addr.base)
13131 x = gen_rtx_PLUS (Pmode, addr.base, x);
13132 if (MEM_P (orig_x))
13133 x = replace_equiv_address_nv (orig_x, x);
13134 return x;
13135 }
13136
13137 /* In the name of slightly smaller debug output, and to cater to
13138 general assembler lossage, recognize PIC+GOTOFF and turn it back
13139 into a direct symbol reference.
13140
13141 On Darwin, this is necessary to avoid a crash, because Darwin
13142 has a different PIC label for each routine but the DWARF debugging
13143 information is not associated with any particular routine, so it's
13144 necessary to remove references to the PIC label from RTL stored by
13145 the DWARF output code. */
13146
13147 static rtx
13148 ix86_delegitimize_address (rtx x)
13149 {
13150 rtx orig_x = delegitimize_mem_from_attrs (x);
13151 /* addend is NULL or some rtx if x is something+GOTOFF where
13152 something doesn't include the PIC register. */
13153 rtx addend = NULL_RTX;
13154 /* reg_addend is NULL or a multiple of some register. */
13155 rtx reg_addend = NULL_RTX;
13156 /* const_addend is NULL or a const_int. */
13157 rtx const_addend = NULL_RTX;
13158 /* This is the result, or NULL. */
13159 rtx result = NULL_RTX;
13160
13161 x = orig_x;
13162
13163 if (MEM_P (x))
13164 x = XEXP (x, 0);
13165
13166 if (TARGET_64BIT)
13167 {
13168 if (GET_CODE (x) != CONST
13169 || GET_CODE (XEXP (x, 0)) != UNSPEC
13170 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13171 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13172 || !MEM_P (orig_x))
13173 return ix86_delegitimize_tls_address (orig_x);
13174 x = XVECEXP (XEXP (x, 0), 0, 0);
13175 if (GET_MODE (orig_x) != GET_MODE (x))
13176 {
13177 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13178 GET_MODE (x), 0);
13179 if (x == NULL_RTX)
13180 return orig_x;
13181 }
13182 return x;
13183 }
13184
13185 if (GET_CODE (x) != PLUS
13186 || GET_CODE (XEXP (x, 1)) != CONST)
13187 return ix86_delegitimize_tls_address (orig_x);
13188
13189 if (ix86_pic_register_p (XEXP (x, 0)))
13190 /* %ebx + GOT/GOTOFF */
13191 ;
13192 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13193 {
13194 /* %ebx + %reg * scale + GOT/GOTOFF */
13195 reg_addend = XEXP (x, 0);
13196 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13197 reg_addend = XEXP (reg_addend, 1);
13198 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13199 reg_addend = XEXP (reg_addend, 0);
13200 else
13201 {
13202 reg_addend = NULL_RTX;
13203 addend = XEXP (x, 0);
13204 }
13205 }
13206 else
13207 addend = XEXP (x, 0);
13208
13209 x = XEXP (XEXP (x, 1), 0);
13210 if (GET_CODE (x) == PLUS
13211 && CONST_INT_P (XEXP (x, 1)))
13212 {
13213 const_addend = XEXP (x, 1);
13214 x = XEXP (x, 0);
13215 }
13216
13217 if (GET_CODE (x) == UNSPEC
13218 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13219 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13220 result = XVECEXP (x, 0, 0);
13221
13222 if (TARGET_MACHO && darwin_local_data_pic (x)
13223 && !MEM_P (orig_x))
13224 result = XVECEXP (x, 0, 0);
13225
13226 if (! result)
13227 return ix86_delegitimize_tls_address (orig_x);
13228
13229 if (const_addend)
13230 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13231 if (reg_addend)
13232 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13233 if (addend)
13234 {
13235 /* If the rest of original X doesn't involve the PIC register, add
13236 addend and subtract pic_offset_table_rtx. This can happen e.g.
13237 for code like:
13238 leal (%ebx, %ecx, 4), %ecx
13239 ...
13240 movl foo@GOTOFF(%ecx), %edx
13241 in which case we return (%ecx - %ebx) + foo. */
13242 if (pic_offset_table_rtx)
13243 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13244 pic_offset_table_rtx),
13245 result);
13246 else
13247 return orig_x;
13248 }
13249 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13250 {
13251 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13252 if (result == NULL_RTX)
13253 return orig_x;
13254 }
13255 return result;
13256 }
13257
13258 /* If X is a machine specific address (i.e. a symbol or label being
13259 referenced as a displacement from the GOT implemented using an
13260 UNSPEC), then return the base term. Otherwise return X. */
13261
13262 rtx
13263 ix86_find_base_term (rtx x)
13264 {
13265 rtx term;
13266
13267 if (TARGET_64BIT)
13268 {
13269 if (GET_CODE (x) != CONST)
13270 return x;
13271 term = XEXP (x, 0);
13272 if (GET_CODE (term) == PLUS
13273 && (CONST_INT_P (XEXP (term, 1))
13274 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13275 term = XEXP (term, 0);
13276 if (GET_CODE (term) != UNSPEC
13277 || (XINT (term, 1) != UNSPEC_GOTPCREL
13278 && XINT (term, 1) != UNSPEC_PCREL))
13279 return x;
13280
13281 return XVECEXP (term, 0, 0);
13282 }
13283
13284 return ix86_delegitimize_address (x);
13285 }
13286 \f
13287 static void
13288 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13289 int fp, FILE *file)
13290 {
13291 const char *suffix;
13292
13293 if (mode == CCFPmode || mode == CCFPUmode)
13294 {
13295 code = ix86_fp_compare_code_to_integer (code);
13296 mode = CCmode;
13297 }
13298 if (reverse)
13299 code = reverse_condition (code);
13300
13301 switch (code)
13302 {
13303 case EQ:
13304 switch (mode)
13305 {
13306 case CCAmode:
13307 suffix = "a";
13308 break;
13309
13310 case CCCmode:
13311 suffix = "c";
13312 break;
13313
13314 case CCOmode:
13315 suffix = "o";
13316 break;
13317
13318 case CCSmode:
13319 suffix = "s";
13320 break;
13321
13322 default:
13323 suffix = "e";
13324 }
13325 break;
13326 case NE:
13327 switch (mode)
13328 {
13329 case CCAmode:
13330 suffix = "na";
13331 break;
13332
13333 case CCCmode:
13334 suffix = "nc";
13335 break;
13336
13337 case CCOmode:
13338 suffix = "no";
13339 break;
13340
13341 case CCSmode:
13342 suffix = "ns";
13343 break;
13344
13345 default:
13346 suffix = "ne";
13347 }
13348 break;
13349 case GT:
13350 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13351 suffix = "g";
13352 break;
13353 case GTU:
13354 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13355 Those same assemblers have the same but opposite lossage on cmov. */
13356 if (mode == CCmode)
13357 suffix = fp ? "nbe" : "a";
13358 else if (mode == CCCmode)
13359 suffix = "b";
13360 else
13361 gcc_unreachable ();
13362 break;
13363 case LT:
13364 switch (mode)
13365 {
13366 case CCNOmode:
13367 case CCGOCmode:
13368 suffix = "s";
13369 break;
13370
13371 case CCmode:
13372 case CCGCmode:
13373 suffix = "l";
13374 break;
13375
13376 default:
13377 gcc_unreachable ();
13378 }
13379 break;
13380 case LTU:
13381 gcc_assert (mode == CCmode || mode == CCCmode);
13382 suffix = "b";
13383 break;
13384 case GE:
13385 switch (mode)
13386 {
13387 case CCNOmode:
13388 case CCGOCmode:
13389 suffix = "ns";
13390 break;
13391
13392 case CCmode:
13393 case CCGCmode:
13394 suffix = "ge";
13395 break;
13396
13397 default:
13398 gcc_unreachable ();
13399 }
13400 break;
13401 case GEU:
13402 /* ??? As above. */
13403 gcc_assert (mode == CCmode || mode == CCCmode);
13404 suffix = fp ? "nb" : "ae";
13405 break;
13406 case LE:
13407 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13408 suffix = "le";
13409 break;
13410 case LEU:
13411 /* ??? As above. */
13412 if (mode == CCmode)
13413 suffix = "be";
13414 else if (mode == CCCmode)
13415 suffix = fp ? "nb" : "ae";
13416 else
13417 gcc_unreachable ();
13418 break;
13419 case UNORDERED:
13420 suffix = fp ? "u" : "p";
13421 break;
13422 case ORDERED:
13423 suffix = fp ? "nu" : "np";
13424 break;
13425 default:
13426 gcc_unreachable ();
13427 }
13428 fputs (suffix, file);
13429 }
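/* Example (derived from the table above): for (GT, CCGCmode) the
   suffix "g" is written, so the caller's template produces e.g.
   "setg" or "jg"; with REVERSE set, the same comparison yields
   "le".  */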
13430
13431 /* Print the name of register X to FILE based on its machine mode and number.
13432 If CODE is 'w', pretend the mode is HImode.
13433 If CODE is 'b', pretend the mode is QImode.
13434 If CODE is 'k', pretend the mode is SImode.
13435 If CODE is 'q', pretend the mode is DImode.
13436 If CODE is 'x', pretend the mode is V4SFmode.
13437 If CODE is 't', pretend the mode is V8SFmode.
13438 If CODE is 'h', pretend the reg is the 'high' byte register.
13439 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13440 If CODE is 'd', duplicate the operand for an AVX instruction.
13441 */
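/* Example of the size overrides above (illustrative): for (reg:SI ax),
   code 'b' prints "al", 'w' prints "ax", 'k' prints "eax" and 'q'
   prints "rax" (the latter only meaningful in 64-bit mode); with no
   code, the register's own mode decides, and AT&T syntax additionally
   prepends a '%'.  */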
13442
13443 void
13444 print_reg (rtx x, int code, FILE *file)
13445 {
13446 const char *reg;
13447 bool duplicated = code == 'd' && TARGET_AVX;
13448
13449 gcc_assert (x == pc_rtx
13450 || (REGNO (x) != ARG_POINTER_REGNUM
13451 && REGNO (x) != FRAME_POINTER_REGNUM
13452 && REGNO (x) != FLAGS_REG
13453 && REGNO (x) != FPSR_REG
13454 && REGNO (x) != FPCR_REG));
13455
13456 if (ASSEMBLER_DIALECT == ASM_ATT)
13457 putc ('%', file);
13458
13459 if (x == pc_rtx)
13460 {
13461 gcc_assert (TARGET_64BIT);
13462 fputs ("rip", file);
13463 return;
13464 }
13465
13466 if (code == 'w' || MMX_REG_P (x))
13467 code = 2;
13468 else if (code == 'b')
13469 code = 1;
13470 else if (code == 'k')
13471 code = 4;
13472 else if (code == 'q')
13473 code = 8;
13474 else if (code == 'y')
13475 code = 3;
13476 else if (code == 'h')
13477 code = 0;
13478 else if (code == 'x')
13479 code = 16;
13480 else if (code == 't')
13481 code = 32;
13482 else
13483 code = GET_MODE_SIZE (GET_MODE (x));
13484
13485 /* Irritatingly, AMD extended registers use a different naming convention
13486 from the normal registers. */
13487 if (REX_INT_REG_P (x))
13488 {
13489 gcc_assert (TARGET_64BIT);
13490 switch (code)
13491 {
13492 case 0:
13493 error ("extended registers have no high halves");
13494 break;
13495 case 1:
13496 fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
13497 break;
13498 case 2:
13499 fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
13500 break;
13501 case 4:
13502 fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
13503 break;
13504 case 8:
13505 fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
13506 break;
13507 default:
13508 error ("unsupported operand size for extended register");
13509 break;
13510 }
13511 return;
13512 }
13513
13514 reg = NULL;
13515 switch (code)
13516 {
13517 case 3:
13518 if (STACK_TOP_P (x))
13519 {
13520 reg = "st(0)";
13521 break;
13522 }
13523 /* FALLTHRU */
13524 case 8:
13525 case 4:
13526 case 12:
13527 if (! ANY_FP_REG_P (x))
13528 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13529 /* FALLTHRU */
13530 case 16:
13531 case 2:
13532 normal:
13533 reg = hi_reg_name[REGNO (x)];
13534 break;
13535 case 1:
13536 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13537 goto normal;
13538 reg = qi_reg_name[REGNO (x)];
13539 break;
13540 case 0:
13541 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13542 goto normal;
13543 reg = qi_high_reg_name[REGNO (x)];
13544 break;
13545 case 32:
13546 if (SSE_REG_P (x))
13547 {
13548 gcc_assert (!duplicated);
13549 putc ('y', file);
13550 fputs (hi_reg_name[REGNO (x)] + 1, file);
13551 return;
13552 }
13553 break;
13554 default:
13555 gcc_unreachable ();
13556 }
13557
13558 fputs (reg, file);
13559 if (duplicated)
13560 {
13561 if (ASSEMBLER_DIALECT == ASM_ATT)
13562 fprintf (file, ", %%%s", reg);
13563 else
13564 fprintf (file, ", %s", reg);
13565 }
13566 }
13567
13568 /* Locate some local-dynamic symbol still in use by this function
13569 so that we can print its name in some tls_local_dynamic_base
13570 pattern. */
13571
13572 static int
13573 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13574 {
13575 rtx x = *px;
13576
13577 if (GET_CODE (x) == SYMBOL_REF
13578 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13579 {
13580 cfun->machine->some_ld_name = XSTR (x, 0);
13581 return 1;
13582 }
13583
13584 return 0;
13585 }
13586
13587 static const char *
13588 get_some_local_dynamic_name (void)
13589 {
13590 rtx insn;
13591
13592 if (cfun->machine->some_ld_name)
13593 return cfun->machine->some_ld_name;
13594
13595 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13596 if (NONDEBUG_INSN_P (insn)
13597 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13598 return cfun->machine->some_ld_name;
13599
13600 return NULL;
13601 }
13602
13603 /* Meaning of CODE:
13604 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13605 C -- print opcode suffix for set/cmov insn.
13606 c -- like C, but print reversed condition
13607 F,f -- likewise, but for floating-point.
13608 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13609 otherwise nothing
13610 R -- print the prefix for register names.
13611 z -- print the opcode suffix for the size of the current operand.
13612 Z -- likewise, with special suffixes for x87 instructions.
13613 * -- print a star (in certain assembler syntax)
13614 A -- print an absolute memory reference.
13615 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13616 s -- print a shift double count, followed by the assembler's argument
13617 delimiter.
13618 b -- print the QImode name of the register for the indicated operand.
13619 %b0 would print %al if operands[0] is reg 0.
13620 w -- likewise, print the HImode name of the register.
13621 k -- likewise, print the SImode name of the register.
13622 q -- likewise, print the DImode name of the register.
13623 x -- likewise, print the V4SFmode name of the register.
13624 t -- likewise, print the V8SFmode name of the register.
13625 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13626 y -- print "st(0)" instead of "st" as a register.
13627 d -- print duplicated register operand for AVX instruction.
13628 D -- print condition for SSE cmp instruction.
13629 P -- if PIC, print an @PLT suffix.
13630 p -- print raw symbol name.
13631 X -- don't print any sort of PIC '@' suffix for a symbol.
13632 & -- print some in-use local-dynamic symbol name.
13633 H -- print a memory address offset by 8; used for sse high-parts
13634 Y -- print condition for XOP pcom* instruction.
13635 + -- print a branch hint as 'cs' or 'ds' prefix
13636 ; -- print a semicolon (after prefixes, due to a bug in older gas).
13637 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13638 @ -- print a segment register of thread base pointer load
13639 */
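/* Example usage in an insn template (illustrative, not taken from this
   file): "mov%z0\t{%1, %0|%0, %1}" lets %z0 pick the b/w/l/q suffix
   from operand 0's mode, while "%b0" would print the QImode register
   name (e.g. %al) for the same operand.  */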
13640
13641 void
13642 ix86_print_operand (FILE *file, rtx x, int code)
13643 {
13644 if (code)
13645 {
13646 switch (code)
13647 {
13648 case '*':
13649 if (ASSEMBLER_DIALECT == ASM_ATT)
13650 putc ('*', file);
13651 return;
13652
13653 case '&':
13654 {
13655 const char *name = get_some_local_dynamic_name ();
13656 if (name == NULL)
13657 output_operand_lossage ("'%%&' used without any "
13658 "local dynamic TLS references");
13659 else
13660 assemble_name (file, name);
13661 return;
13662 }
13663
13664 case 'A':
13665 switch (ASSEMBLER_DIALECT)
13666 {
13667 case ASM_ATT:
13668 putc ('*', file);
13669 break;
13670
13671 case ASM_INTEL:
13672 /* Intel syntax. For absolute addresses, registers should not
13673 be surrounded by brackets. */
13674 if (!REG_P (x))
13675 {
13676 putc ('[', file);
13677 ix86_print_operand (file, x, 0);
13678 putc (']', file);
13679 return;
13680 }
13681 break;
13682
13683 default:
13684 gcc_unreachable ();
13685 }
13686
13687 ix86_print_operand (file, x, 0);
13688 return;
13689
13690
13691 case 'L':
13692 if (ASSEMBLER_DIALECT == ASM_ATT)
13693 putc ('l', file);
13694 return;
13695
13696 case 'W':
13697 if (ASSEMBLER_DIALECT == ASM_ATT)
13698 putc ('w', file);
13699 return;
13700
13701 case 'B':
13702 if (ASSEMBLER_DIALECT == ASM_ATT)
13703 putc ('b', file);
13704 return;
13705
13706 case 'Q':
13707 if (ASSEMBLER_DIALECT == ASM_ATT)
13708 putc ('l', file);
13709 return;
13710
13711 case 'S':
13712 if (ASSEMBLER_DIALECT == ASM_ATT)
13713 putc ('s', file);
13714 return;
13715
13716 case 'T':
13717 if (ASSEMBLER_DIALECT == ASM_ATT)
13718 putc ('t', file);
13719 return;
13720
13721 case 'z':
13722 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13723 {
13724 /* Opcodes don't get size suffixes if using Intel syntax. */
13725 if (ASSEMBLER_DIALECT == ASM_INTEL)
13726 return;
13727
13728 switch (GET_MODE_SIZE (GET_MODE (x)))
13729 {
13730 case 1:
13731 putc ('b', file);
13732 return;
13733
13734 case 2:
13735 putc ('w', file);
13736 return;
13737
13738 case 4:
13739 putc ('l', file);
13740 return;
13741
13742 case 8:
13743 putc ('q', file);
13744 return;
13745
13746 default:
13747 output_operand_lossage
13748 ("invalid operand size for operand code '%c'", code);
13749 return;
13750 }
13751 }
13752
13753 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13754 warning
13755 (0, "non-integer operand used with operand code '%c'", code);
13756 /* FALLTHRU */
13757
13758 case 'Z':
13759 /* 387 opcodes don't get size suffixes if using Intel syntax. */
13760 if (ASSEMBLER_DIALECT == ASM_INTEL)
13761 return;
13762
13763 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13764 {
13765 switch (GET_MODE_SIZE (GET_MODE (x)))
13766 {
13767 case 2:
13768 #ifdef HAVE_AS_IX86_FILDS
13769 putc ('s', file);
13770 #endif
13771 return;
13772
13773 case 4:
13774 putc ('l', file);
13775 return;
13776
13777 case 8:
13778 #ifdef HAVE_AS_IX86_FILDQ
13779 putc ('q', file);
13780 #else
13781 fputs ("ll", file);
13782 #endif
13783 return;
13784
13785 default:
13786 break;
13787 }
13788 }
13789 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13790 {
13791 /* 387 opcodes don't get size suffixes
13792 if the operands are registers. */
13793 if (STACK_REG_P (x))
13794 return;
13795
13796 switch (GET_MODE_SIZE (GET_MODE (x)))
13797 {
13798 case 4:
13799 putc ('s', file);
13800 return;
13801
13802 case 8:
13803 putc ('l', file);
13804 return;
13805
13806 case 12:
13807 case 16:
13808 putc ('t', file);
13809 return;
13810
13811 default:
13812 break;
13813 }
13814 }
13815 else
13816 {
13817 output_operand_lossage
13818 ("invalid operand type used with operand code '%c'", code);
13819 return;
13820 }
13821
13822 output_operand_lossage
13823 ("invalid operand size for operand code '%c'", code);
13824 return;
13825
13826 case 'd':
13827 case 'b':
13828 case 'w':
13829 case 'k':
13830 case 'q':
13831 case 'h':
13832 case 't':
13833 case 'y':
13834 case 'x':
13835 case 'X':
13836 case 'P':
13837 case 'p':
13838 break;
13839
13840 case 's':
13841 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13842 {
13843 ix86_print_operand (file, x, 0);
13844 fputs (", ", file);
13845 }
13846 return;
13847
13848 case 'D':
13849 /* Little bit of braindamage here. The SSE compare instructions
13850 use completely different names for the comparisons than the
13851 fp conditional moves do. */
13852 if (TARGET_AVX)
13853 {
13854 switch (GET_CODE (x))
13855 {
13856 case EQ:
13857 fputs ("eq", file);
13858 break;
13859 case UNEQ:
13860 fputs ("eq_us", file);
13861 break;
13862 case LT:
13863 fputs ("lt", file);
13864 break;
13865 case UNLT:
13866 fputs ("nge", file);
13867 break;
13868 case LE:
13869 fputs ("le", file);
13870 break;
13871 case UNLE:
13872 fputs ("ngt", file);
13873 break;
13874 case UNORDERED:
13875 fputs ("unord", file);
13876 break;
13877 case NE:
13878 fputs ("neq", file);
13879 break;
13880 case LTGT:
13881 fputs ("neq_oq", file);
13882 break;
13883 case GE:
13884 fputs ("ge", file);
13885 break;
13886 case UNGE:
13887 fputs ("nlt", file);
13888 break;
13889 case GT:
13890 fputs ("gt", file);
13891 break;
13892 case UNGT:
13893 fputs ("nle", file);
13894 break;
13895 case ORDERED:
13896 fputs ("ord", file);
13897 break;
13898 default:
13899 output_operand_lossage ("operand is not a condition code, "
13900 "invalid operand code 'D'");
13901 return;
13902 }
13903 }
13904 else
13905 {
13906 switch (GET_CODE (x))
13907 {
13908 case EQ:
13909 case UNEQ:
13910 fputs ("eq", file);
13911 break;
13912 case LT:
13913 case UNLT:
13914 fputs ("lt", file);
13915 break;
13916 case LE:
13917 case UNLE:
13918 fputs ("le", file);
13919 break;
13920 case UNORDERED:
13921 fputs ("unord", file);
13922 break;
13923 case NE:
13924 case LTGT:
13925 fputs ("neq", file);
13926 break;
13927 case UNGE:
13928 case GE:
13929 fputs ("nlt", file);
13930 break;
13931 case UNGT:
13932 case GT:
13933 fputs ("nle", file);
13934 break;
13935 case ORDERED:
13936 fputs ("ord", file);
13937 break;
13938 default:
13939 output_operand_lossage ("operand is not a condition code, "
13940 "invalid operand code 'D'");
13941 return;
13942 }
13943 }
13944 return;
13945 case 'O':
13946 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13947 if (ASSEMBLER_DIALECT == ASM_ATT)
13948 {
13949 switch (GET_MODE (x))
13950 {
13951 case HImode: putc ('w', file); break;
13952 case SImode:
13953 case SFmode: putc ('l', file); break;
13954 case DImode:
13955 case DFmode: putc ('q', file); break;
13956 default: gcc_unreachable ();
13957 }
13958 putc ('.', file);
13959 }
13960 #endif
13961 return;
13962 case 'C':
13963 if (!COMPARISON_P (x))
13964 {
13965 output_operand_lossage ("operand is neither a constant nor a "
13966 "condition code, invalid operand code "
13967 "'C'");
13968 return;
13969 }
13970 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
13971 return;
13972 case 'F':
13973 if (!COMPARISON_P (x))
13974 {
13975 output_operand_lossage ("operand is neither a constant nor a "
13976 "condition code, invalid operand code "
13977 "'F'");
13978 return;
13979 }
13980 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13981 if (ASSEMBLER_DIALECT == ASM_ATT)
13982 putc ('.', file);
13983 #endif
13984 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
13985 return;
13986
13987 /* Like above, but reverse condition */
13988 case 'c':
13989 /* Check to see if argument to %c is really a constant
13990 and not a condition code which needs to be reversed. */
13991 if (!COMPARISON_P (x))
13992 {
13993 output_operand_lossage ("operand is neither a constant nor a "
13994 "condition code, invalid operand "
13995 "code 'c'");
13996 return;
13997 }
13998 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
13999 return;
14000 case 'f':
14001 if (!COMPARISON_P (x))
14002 {
14003 output_operand_lossage ("operand is neither a constant nor a "
14004 "condition code, invalid operand "
14005 "code 'f'");
14006 return;
14007 }
14008 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14009 if (ASSEMBLER_DIALECT == ASM_ATT)
14010 putc ('.', file);
14011 #endif
14012 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14013 return;
14014
14015 case 'H':
14016 /* It doesn't actually matter what mode we use here, as we're
14017 only going to use this for printing. */
14018 x = adjust_address_nv (x, DImode, 8);
14019 break;
14020
14021 case '+':
14022 {
14023 rtx x;
14024
14025 if (!optimize
14026 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14027 return;
14028
14029 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14030 if (x)
14031 {
14032 int pred_val = INTVAL (XEXP (x, 0));
14033
14034 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14035 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14036 {
14037 int taken = pred_val > REG_BR_PROB_BASE / 2;
14038 int cputaken = final_forward_branch_p (current_output_insn) == 0;
14039
14040 /* Emit hints only in the case where the default branch
14041 prediction heuristics would fail. */
14042 if (taken != cputaken)
14043 {
14044 /* We use the 3e (DS) prefix for taken branches and
14045 the 2e (CS) prefix for not-taken branches. */
14046 if (taken)
14047 fputs ("ds ; ", file);
14048 else
14049 fputs ("cs ; ", file);
14050 }
14051 }
14052 }
14053 return;
14054 }
14055
14056 case 'Y':
14057 switch (GET_CODE (x))
14058 {
14059 case NE:
14060 fputs ("neq", file);
14061 break;
14062 case EQ:
14063 fputs ("eq", file);
14064 break;
14065 case GE:
14066 case GEU:
14067 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14068 break;
14069 case GT:
14070 case GTU:
14071 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14072 break;
14073 case LE:
14074 case LEU:
14075 fputs ("le", file);
14076 break;
14077 case LT:
14078 case LTU:
14079 fputs ("lt", file);
14080 break;
14081 case UNORDERED:
14082 fputs ("unord", file);
14083 break;
14084 case ORDERED:
14085 fputs ("ord", file);
14086 break;
14087 case UNEQ:
14088 fputs ("ueq", file);
14089 break;
14090 case UNGE:
14091 fputs ("nlt", file);
14092 break;
14093 case UNGT:
14094 fputs ("nle", file);
14095 break;
14096 case UNLE:
14097 fputs ("ule", file);
14098 break;
14099 case UNLT:
14100 fputs ("ult", file);
14101 break;
14102 case LTGT:
14103 fputs ("une", file);
14104 break;
14105 default:
14106 output_operand_lossage ("operand is not a condition code, "
14107 "invalid operand code 'Y'");
14108 return;
14109 }
14110 return;
14111
14112 case ';':
14113 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14114 putc (';', file);
14115 #endif
14116 return;
14117
14118 case '@':
14119 if (ASSEMBLER_DIALECT == ASM_ATT)
14120 putc ('%', file);
14121
14122 /* The kernel uses a different segment register for performance
14123 reasons; a system call would not have to trash the userspace
14124 segment register, which would be expensive. */
14125 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14126 fputs ("fs", file);
14127 else
14128 fputs ("gs", file);
14129 return;
14130
14131 case '~':
14132 putc (TARGET_AVX2 ? 'i' : 'f', file);
14133 return;
14134
14135 default:
14136 output_operand_lossage ("invalid operand code '%c'", code);
14137 }
14138 }
14139
14140 if (REG_P (x))
14141 print_reg (x, code, file);
14142
14143 else if (MEM_P (x))
14144 {
14145 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14146 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14147 && GET_MODE (x) != BLKmode)
14148 {
14149 const char * size;
14150 switch (GET_MODE_SIZE (GET_MODE (x)))
14151 {
14152 case 1: size = "BYTE"; break;
14153 case 2: size = "WORD"; break;
14154 case 4: size = "DWORD"; break;
14155 case 8: size = "QWORD"; break;
14156 case 12: size = "TBYTE"; break;
14157 case 16:
14158 if (GET_MODE (x) == XFmode)
14159 size = "TBYTE";
14160 else
14161 size = "XMMWORD";
14162 break;
14163 case 32: size = "YMMWORD"; break;
14164 default:
14165 gcc_unreachable ();
14166 }
14167
14168 /* Check for explicit size override (codes 'b', 'w', 'k',
14169 'q' and 'x') */
14170 if (code == 'b')
14171 size = "BYTE";
14172 else if (code == 'w')
14173 size = "WORD";
14174 else if (code == 'k')
14175 size = "DWORD";
14176 else if (code == 'q')
14177 size = "QWORD";
14178 else if (code == 'x')
14179 size = "XMMWORD";
14180
14181 fputs (size, file);
14182 fputs (" PTR ", file);
14183 }
14184
14185 x = XEXP (x, 0);
14186 /* Avoid (%rip) for call operands. */
14187 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14188 && !CONST_INT_P (x))
14189 output_addr_const (file, x);
14190 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14191 output_operand_lossage ("invalid constraints for operand");
14192 else
14193 output_address (x);
14194 }
14195
14196 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14197 {
14198 REAL_VALUE_TYPE r;
14199 long l;
14200
14201 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14202 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14203
14204 if (ASSEMBLER_DIALECT == ASM_ATT)
14205 putc ('$', file);
14206 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14207 if (code == 'q')
14208 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14209 else
14210 fprintf (file, "0x%08x", (unsigned int) l);
14211 }
14212
14213 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14214 {
14215 REAL_VALUE_TYPE r;
14216 long l[2];
14217
14218 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14219 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14220
14221 if (ASSEMBLER_DIALECT == ASM_ATT)
14222 putc ('$', file);
14223 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14224 }
14225
14226 /* These float cases don't actually occur as immediate operands. */
14227 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14228 {
14229 char dstr[30];
14230
14231 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14232 fputs (dstr, file);
14233 }
14234
14235 else
14236 {
14237 /* We have patterns that allow zero sets of memory, for instance.
14238 In 64-bit mode, we should probably support all 8-byte vectors,
14239 since we can in fact encode that into an immediate. */
14240 if (GET_CODE (x) == CONST_VECTOR)
14241 {
14242 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14243 x = const0_rtx;
14244 }
14245
14246 if (code != 'P' && code != 'p')
14247 {
14248 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14249 {
14250 if (ASSEMBLER_DIALECT == ASM_ATT)
14251 putc ('$', file);
14252 }
14253 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14254 || GET_CODE (x) == LABEL_REF)
14255 {
14256 if (ASSEMBLER_DIALECT == ASM_ATT)
14257 putc ('$', file);
14258 else
14259 fputs ("OFFSET FLAT:", file);
14260 }
14261 }
14262 if (CONST_INT_P (x))
14263 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14264 else if (flag_pic || MACHOPIC_INDIRECT)
14265 output_pic_addr_const (file, x, code);
14266 else
14267 output_addr_const (file, x);
14268 }
14269 }
14270
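/* A small illustration of the Intel-dialect size handling above (a
   sketch; the exact text depends on the operand's mode and any size
   override code): a 4-byte memory operand normally prints as
   "DWORD PTR [esp+4]", an 8-byte one as "QWORD PTR ...", while the
   'b', 'w', 'k', 'q' and 'x' codes force BYTE/WORD/DWORD/QWORD/XMMWORD
   respectively and the 'X'/'P' codes suppress the prefix entirely
   (e.g. for call operands).  */
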
14271 static bool
14272 ix86_print_operand_punct_valid_p (unsigned char code)
14273 {
14274 return (code == '@' || code == '*' || code == '+'
14275 || code == '&' || code == ';' || code == '~');
14276 }
14277 \f
14278 /* Print a memory operand whose address is ADDR. */
14279
14280 static void
14281 ix86_print_operand_address (FILE *file, rtx addr)
14282 {
14283 struct ix86_address parts;
14284 rtx base, index, disp;
14285 int scale;
14286 int ok;
14287 bool vsib = false;
14288
14289 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14290 {
14291 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14292 gcc_assert (parts.index == NULL_RTX);
14293 parts.index = XVECEXP (addr, 0, 1);
14294 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14295 addr = XVECEXP (addr, 0, 0);
14296 vsib = true;
14297 }
14298 else
14299 ok = ix86_decompose_address (addr, &parts);
14300
14301 gcc_assert (ok);
14302
14303 if (parts.base && GET_CODE (parts.base) == SUBREG)
14304 {
14305 rtx tmp = SUBREG_REG (parts.base);
14306 parts.base = simplify_subreg (GET_MODE (parts.base),
14307 tmp, GET_MODE (tmp), 0);
14308 }
14309
14310 if (parts.index && GET_CODE (parts.index) == SUBREG)
14311 {
14312 rtx tmp = SUBREG_REG (parts.index);
14313 parts.index = simplify_subreg (GET_MODE (parts.index),
14314 tmp, GET_MODE (tmp), 0);
14315 }
14316
14317 base = parts.base;
14318 index = parts.index;
14319 disp = parts.disp;
14320 scale = parts.scale;
14321
14322 switch (parts.seg)
14323 {
14324 case SEG_DEFAULT:
14325 break;
14326 case SEG_FS:
14327 case SEG_GS:
14328 if (ASSEMBLER_DIALECT == ASM_ATT)
14329 putc ('%', file);
14330 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14331 break;
14332 default:
14333 gcc_unreachable ();
14334 }
14335
14336 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14337 if (TARGET_64BIT && !base && !index)
14338 {
14339 rtx symbol = disp;
14340
14341 if (GET_CODE (disp) == CONST
14342 && GET_CODE (XEXP (disp, 0)) == PLUS
14343 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14344 symbol = XEXP (XEXP (disp, 0), 0);
14345
14346 if (GET_CODE (symbol) == LABEL_REF
14347 || (GET_CODE (symbol) == SYMBOL_REF
14348 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14349 base = pc_rtx;
14350 }
14351 if (!base && !index)
14352 {
14353 /* Displacement only requires special attention. */
14354
14355 if (CONST_INT_P (disp))
14356 {
14357 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14358 fputs ("ds:", file);
14359 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14360 }
14361 else if (flag_pic)
14362 output_pic_addr_const (file, disp, 0);
14363 else
14364 output_addr_const (file, disp);
14365 }
14366 else
14367 {
14368 int code = 0;
14369
14370 /* Print SImode registers for zero-extended addresses to force
14371 addr32 prefix. Otherwise print DImode registers to avoid it. */
14372 if (TARGET_64BIT)
14373 code = ((GET_CODE (addr) == ZERO_EXTEND
14374 || GET_CODE (addr) == AND)
14375 ? 'l'
14376 : 'q');
14377
14378 if (ASSEMBLER_DIALECT == ASM_ATT)
14379 {
14380 if (disp)
14381 {
14382 if (flag_pic)
14383 output_pic_addr_const (file, disp, 0);
14384 else if (GET_CODE (disp) == LABEL_REF)
14385 output_asm_label (disp);
14386 else
14387 output_addr_const (file, disp);
14388 }
14389
14390 putc ('(', file);
14391 if (base)
14392 print_reg (base, code, file);
14393 if (index)
14394 {
14395 putc (',', file);
14396 print_reg (index, vsib ? 0 : code, file);
14397 if (scale != 1 || vsib)
14398 fprintf (file, ",%d", scale);
14399 }
14400 putc (')', file);
14401 }
14402 else
14403 {
14404 rtx offset = NULL_RTX;
14405
14406 if (disp)
14407 {
14408 /* Pull out the offset of a symbol; print any symbol itself. */
14409 if (GET_CODE (disp) == CONST
14410 && GET_CODE (XEXP (disp, 0)) == PLUS
14411 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14412 {
14413 offset = XEXP (XEXP (disp, 0), 1);
14414 disp = gen_rtx_CONST (VOIDmode,
14415 XEXP (XEXP (disp, 0), 0));
14416 }
14417
14418 if (flag_pic)
14419 output_pic_addr_const (file, disp, 0);
14420 else if (GET_CODE (disp) == LABEL_REF)
14421 output_asm_label (disp);
14422 else if (CONST_INT_P (disp))
14423 offset = disp;
14424 else
14425 output_addr_const (file, disp);
14426 }
14427
14428 putc ('[', file);
14429 if (base)
14430 {
14431 print_reg (base, code, file);
14432 if (offset)
14433 {
14434 if (INTVAL (offset) >= 0)
14435 putc ('+', file);
14436 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14437 }
14438 }
14439 else if (offset)
14440 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14441 else
14442 putc ('0', file);
14443
14444 if (index)
14445 {
14446 putc ('+', file);
14447 print_reg (index, vsib ? 0 : code, file);
14448 if (scale != 1 || vsib)
14449 fprintf (file, "*%d", scale);
14450 }
14451 putc (']', file);
14452 }
14453 }
14454 }
14455
14456 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14457
14458 static bool
14459 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14460 {
14461 rtx op;
14462
14463 if (GET_CODE (x) != UNSPEC)
14464 return false;
14465
14466 op = XVECEXP (x, 0, 0);
14467 switch (XINT (x, 1))
14468 {
14469 case UNSPEC_GOTTPOFF:
14470 output_addr_const (file, op);
14471 /* FIXME: This might be @TPOFF in Sun ld. */
14472 fputs ("@gottpoff", file);
14473 break;
14474 case UNSPEC_TPOFF:
14475 output_addr_const (file, op);
14476 fputs ("@tpoff", file);
14477 break;
14478 case UNSPEC_NTPOFF:
14479 output_addr_const (file, op);
14480 if (TARGET_64BIT)
14481 fputs ("@tpoff", file);
14482 else
14483 fputs ("@ntpoff", file);
14484 break;
14485 case UNSPEC_DTPOFF:
14486 output_addr_const (file, op);
14487 fputs ("@dtpoff", file);
14488 break;
14489 case UNSPEC_GOTNTPOFF:
14490 output_addr_const (file, op);
14491 if (TARGET_64BIT)
14492 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14493 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14494 else
14495 fputs ("@gotntpoff", file);
14496 break;
14497 case UNSPEC_INDNTPOFF:
14498 output_addr_const (file, op);
14499 fputs ("@indntpoff", file);
14500 break;
14501 #if TARGET_MACHO
14502 case UNSPEC_MACHOPIC_OFFSET:
14503 output_addr_const (file, op);
14504 putc ('-', file);
14505 machopic_output_function_base_name (file);
14506 break;
14507 #endif
14508
14509 case UNSPEC_STACK_CHECK:
14510 {
14511 int offset;
14512
14513 gcc_assert (flag_split_stack);
14514
14515 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14516 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14517 #else
14518 gcc_unreachable ();
14519 #endif
14520
14521 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14522 }
14523 break;
14524
14525 default:
14526 return false;
14527 }
14528
14529 return true;
14530 }
14531 \f
14532 /* Split one or more double-mode RTL references into pairs of half-mode
14533 references. The RTL can be REG, offsettable MEM, integer constant, or
14534 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14535 split and "num" is its length. lo_half and hi_half are output arrays
14536 that parallel "operands". */
14537
14538 void
14539 split_double_mode (enum machine_mode mode, rtx operands[],
14540 int num, rtx lo_half[], rtx hi_half[])
14541 {
14542 enum machine_mode half_mode;
14543 unsigned int byte;
14544
14545 switch (mode)
14546 {
14547 case TImode:
14548 half_mode = DImode;
14549 break;
14550 case DImode:
14551 half_mode = SImode;
14552 break;
14553 default:
14554 gcc_unreachable ();
14555 }
14556
14557 byte = GET_MODE_SIZE (half_mode);
14558
14559 while (num--)
14560 {
14561 rtx op = operands[num];
14562
14563 /* simplify_subreg refuses to split volatile memory addresses,
14564 but we still have to handle them. */
14565 if (MEM_P (op))
14566 {
14567 lo_half[num] = adjust_address (op, half_mode, 0);
14568 hi_half[num] = adjust_address (op, half_mode, byte);
14569 }
14570 else
14571 {
14572 lo_half[num] = simplify_gen_subreg (half_mode, op,
14573 GET_MODE (op) == VOIDmode
14574 ? mode : GET_MODE (op), 0);
14575 hi_half[num] = simplify_gen_subreg (half_mode, op,
14576 GET_MODE (op) == VOIDmode
14577 ? mode : GET_MODE (op), byte);
14578 }
14579 }
14580 }
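
/* For illustration (a sketch of the effect, assuming the little-endian
   split implemented above): on a 32-bit target a DImode register
   operand is split into two SImode subregs at byte offsets 0 (low
   half) and 4 (high half), and a DImode MEM into two SImode MEMs via
   adjust_address at offsets 0 and 4; with TImode the halves are
   DImode at offsets 0 and 8.  */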
14581 \f
14582 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14583 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14584 is the expression of the binary operation. The output may either be
14585 emitted here, or returned to the caller, like all output_* functions.
14586
14587 There is no guarantee that the operands are the same mode, as they
14588 might be within FLOAT or FLOAT_EXTEND expressions. */
14589
14590 #ifndef SYSV386_COMPAT
14591 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14592 wants to fix the assemblers because that causes incompatibility
14593 with gcc. No-one wants to fix gcc because that causes
14594 incompatibility with assemblers... You can use the option of
14595 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14596 #define SYSV386_COMPAT 1
14597 #endif
14598
14599 const char *
14600 output_387_binary_op (rtx insn, rtx *operands)
14601 {
14602 static char buf[40];
14603 const char *p;
14604 const char *ssep;
14605 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14606
14607 #ifdef ENABLE_CHECKING
14608 /* Even if we do not want to check the inputs, this documents the input
14609 constraints, which helps in understanding the following code. */
14610 if (STACK_REG_P (operands[0])
14611 && ((REG_P (operands[1])
14612 && REGNO (operands[0]) == REGNO (operands[1])
14613 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14614 || (REG_P (operands[2])
14615 && REGNO (operands[0]) == REGNO (operands[2])
14616 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14617 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14618 ; /* ok */
14619 else
14620 gcc_assert (is_sse);
14621 #endif
14622
14623 switch (GET_CODE (operands[3]))
14624 {
14625 case PLUS:
14626 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14627 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14628 p = "fiadd";
14629 else
14630 p = "fadd";
14631 ssep = "vadd";
14632 break;
14633
14634 case MINUS:
14635 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14636 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14637 p = "fisub";
14638 else
14639 p = "fsub";
14640 ssep = "vsub";
14641 break;
14642
14643 case MULT:
14644 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14645 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14646 p = "fimul";
14647 else
14648 p = "fmul";
14649 ssep = "vmul";
14650 break;
14651
14652 case DIV:
14653 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14654 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14655 p = "fidiv";
14656 else
14657 p = "fdiv";
14658 ssep = "vdiv";
14659 break;
14660
14661 default:
14662 gcc_unreachable ();
14663 }
14664
14665 if (is_sse)
14666 {
14667 if (TARGET_AVX)
14668 {
14669 strcpy (buf, ssep);
14670 if (GET_MODE (operands[0]) == SFmode)
14671 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14672 else
14673 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14674 }
14675 else
14676 {
14677 strcpy (buf, ssep + 1);
14678 if (GET_MODE (operands[0]) == SFmode)
14679 strcat (buf, "ss\t{%2, %0|%0, %2}");
14680 else
14681 strcat (buf, "sd\t{%2, %0|%0, %2}");
14682 }
14683 return buf;
14684 }
14685 strcpy (buf, p);
14686
14687 switch (GET_CODE (operands[3]))
14688 {
14689 case MULT:
14690 case PLUS:
14691 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14692 {
14693 rtx temp = operands[2];
14694 operands[2] = operands[1];
14695 operands[1] = temp;
14696 }
14697
14698 /* We know operands[0] == operands[1]. */
14699
14700 if (MEM_P (operands[2]))
14701 {
14702 p = "%Z2\t%2";
14703 break;
14704 }
14705
14706 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14707 {
14708 if (STACK_TOP_P (operands[0]))
14709 /* How is it that we are storing to a dead operand[2]?
14710 Well, presumably operands[1] is dead too. We can't
14711 store the result to st(0) as st(0) gets popped on this
14712 instruction. Instead store to operands[2] (which I
14713 think has to be st(1)). st(1) will be popped later.
14714 gcc <= 2.8.1 didn't have this check and generated
14715 assembly code that the Unixware assembler rejected. */
14716 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14717 else
14718 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14719 break;
14720 }
14721
14722 if (STACK_TOP_P (operands[0]))
14723 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14724 else
14725 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14726 break;
14727
14728 case MINUS:
14729 case DIV:
14730 if (MEM_P (operands[1]))
14731 {
14732 p = "r%Z1\t%1";
14733 break;
14734 }
14735
14736 if (MEM_P (operands[2]))
14737 {
14738 p = "%Z2\t%2";
14739 break;
14740 }
14741
14742 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14743 {
14744 #if SYSV386_COMPAT
14745 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14746 derived assemblers, confusingly reverse the direction of
14747 the operation for fsub{r} and fdiv{r} when the
14748 destination register is not st(0). The Intel assembler
14749 doesn't have this brain damage. Read !SYSV386_COMPAT to
14750 figure out what the hardware really does. */
14751 if (STACK_TOP_P (operands[0]))
14752 p = "{p\t%0, %2|rp\t%2, %0}";
14753 else
14754 p = "{rp\t%2, %0|p\t%0, %2}";
14755 #else
14756 if (STACK_TOP_P (operands[0]))
14757 /* As above for fmul/fadd, we can't store to st(0). */
14758 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14759 else
14760 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14761 #endif
14762 break;
14763 }
14764
14765 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14766 {
14767 #if SYSV386_COMPAT
14768 if (STACK_TOP_P (operands[0]))
14769 p = "{rp\t%0, %1|p\t%1, %0}";
14770 else
14771 p = "{p\t%1, %0|rp\t%0, %1}";
14772 #else
14773 if (STACK_TOP_P (operands[0]))
14774 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14775 else
14776 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14777 #endif
14778 break;
14779 }
14780
14781 if (STACK_TOP_P (operands[0]))
14782 {
14783 if (STACK_TOP_P (operands[1]))
14784 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14785 else
14786 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14787 break;
14788 }
14789 else if (STACK_TOP_P (operands[1]))
14790 {
14791 #if SYSV386_COMPAT
14792 p = "{\t%1, %0|r\t%0, %1}";
14793 #else
14794 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14795 #endif
14796 }
14797 else
14798 {
14799 #if SYSV386_COMPAT
14800 p = "{r\t%2, %0|\t%0, %2}";
14801 #else
14802 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14803 #endif
14804 }
14805 break;
14806
14807 default:
14808 gcc_unreachable ();
14809 }
14810
14811 strcat (buf, p);
14812 return buf;
14813 }
14814
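/* Illustrative expansions of the templates built above (a sketch, not
   an exhaustive list): for an SFmode SSE add with AVX the result is
   "vaddss\t{%2, %1, %0|%0, %1, %2}", without AVX "addss\t{%2, %0|%0, %2}";
   for an x87 add whose second operand is in memory it is "fadd%Z2\t%2".
   The "{att|intel}" braces select the text for the current
   ASSEMBLER_DIALECT when the template is output.  */
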
14815 /* Return needed mode for entity in optimize_mode_switching pass. */
14816
14817 int
14818 ix86_mode_needed (int entity, rtx insn)
14819 {
14820 enum attr_i387_cw mode;
14821
14822 /* The mode UNINITIALIZED is used to store the control word after a
14823 function call or ASM pattern. The mode ANY specifies that the
14824 function has no requirements on the control word and makes no
14825 changes to the bits we are interested in. */
14826
14827 if (CALL_P (insn)
14828 || (NONJUMP_INSN_P (insn)
14829 && (asm_noperands (PATTERN (insn)) >= 0
14830 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14831 return I387_CW_UNINITIALIZED;
14832
14833 if (recog_memoized (insn) < 0)
14834 return I387_CW_ANY;
14835
14836 mode = get_attr_i387_cw (insn);
14837
14838 switch (entity)
14839 {
14840 case I387_TRUNC:
14841 if (mode == I387_CW_TRUNC)
14842 return mode;
14843 break;
14844
14845 case I387_FLOOR:
14846 if (mode == I387_CW_FLOOR)
14847 return mode;
14848 break;
14849
14850 case I387_CEIL:
14851 if (mode == I387_CW_CEIL)
14852 return mode;
14853 break;
14854
14855 case I387_MASK_PM:
14856 if (mode == I387_CW_MASK_PM)
14857 return mode;
14858 break;
14859
14860 default:
14861 gcc_unreachable ();
14862 }
14863
14864 return I387_CW_ANY;
14865 }
14866
14867 /* Output code to initialize control word copies used by trunc?f?i and
14868 rounding patterns. CURRENT_MODE is set to the current control word,
14869 while NEW_MODE is set to the new control word. */
14870
14871 void
14872 emit_i387_cw_initialization (int mode)
14873 {
14874 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14875 rtx new_mode;
14876
14877 enum ix86_stack_slot slot;
14878
14879 rtx reg = gen_reg_rtx (HImode);
14880
14881 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14882 emit_move_insn (reg, copy_rtx (stored_mode));
14883
14884 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14885 || optimize_function_for_size_p (cfun))
14886 {
14887 switch (mode)
14888 {
14889 case I387_CW_TRUNC:
14890 /* round toward zero (truncate) */
14891 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14892 slot = SLOT_CW_TRUNC;
14893 break;
14894
14895 case I387_CW_FLOOR:
14896 /* round down toward -oo */
14897 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14898 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14899 slot = SLOT_CW_FLOOR;
14900 break;
14901
14902 case I387_CW_CEIL:
14903 /* round up toward +oo */
14904 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14905 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
14906 slot = SLOT_CW_CEIL;
14907 break;
14908
14909 case I387_CW_MASK_PM:
14910 /* mask precision exception for nearbyint() */
14911 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14912 slot = SLOT_CW_MASK_PM;
14913 break;
14914
14915 default:
14916 gcc_unreachable ();
14917 }
14918 }
14919 else
14920 {
14921 switch (mode)
14922 {
14923 case I387_CW_TRUNC:
14924 /* round toward zero (truncate) */
14925 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
14926 slot = SLOT_CW_TRUNC;
14927 break;
14928
14929 case I387_CW_FLOOR:
14930 /* round down toward -oo */
14931 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
14932 slot = SLOT_CW_FLOOR;
14933 break;
14934
14935 case I387_CW_CEIL:
14936 /* round up toward +oo */
14937 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
14938 slot = SLOT_CW_CEIL;
14939 break;
14940
14941 case I387_CW_MASK_PM:
14942 /* mask precision exception for nearbyint() */
14943 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14944 slot = SLOT_CW_MASK_PM;
14945 break;
14946
14947 default:
14948 gcc_unreachable ();
14949 }
14950 }
14951
14952 gcc_assert (slot < MAX_386_STACK_LOCALS);
14953
14954 new_mode = assign_386_stack_local (HImode, slot);
14955 emit_move_insn (new_mode, reg);
14956 }
14957
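/* For reference (x87 architectural facts, stated here only to make the
   constants above easier to read): bits 10-11 of the FPU control word
   are the rounding-control field -- 0x0000 round-to-nearest, 0x0400
   round down, 0x0800 round up, 0x0c00 truncate -- and bit 5 (0x0020)
   masks the precision exception, which is what nearbyint() needs.  */
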
14958 /* Output code for INSN to convert a float to a signed int. OPERANDS
14959 are the insn operands. The output may be [HSD]Imode and the input
14960 operand may be [SDX]Fmode. */
14961
14962 const char *
14963 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
14964 {
14965 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
14966 int dimode_p = GET_MODE (operands[0]) == DImode;
14967 int round_mode = get_attr_i387_cw (insn);
14968
14969 /* Jump through a hoop or two for DImode, since the hardware has no
14970 non-popping instruction. We used to do this a different way, but
14971 that was somewhat fragile and broke with post-reload splitters. */
14972 if ((dimode_p || fisttp) && !stack_top_dies)
14973 output_asm_insn ("fld\t%y1", operands);
14974
14975 gcc_assert (STACK_TOP_P (operands[1]));
14976 gcc_assert (MEM_P (operands[0]));
14977 gcc_assert (GET_MODE (operands[1]) != TFmode);
14978
14979 if (fisttp)
14980 output_asm_insn ("fisttp%Z0\t%0", operands);
14981 else
14982 {
14983 if (round_mode != I387_CW_ANY)
14984 output_asm_insn ("fldcw\t%3", operands);
14985 if (stack_top_dies || dimode_p)
14986 output_asm_insn ("fistp%Z0\t%0", operands);
14987 else
14988 output_asm_insn ("fist%Z0\t%0", operands);
14989 if (round_mode != I387_CW_ANY)
14990 output_asm_insn ("fldcw\t%2", operands);
14991 }
14992
14993 return "";
14994 }
14995
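/* A sketch of the typical sequence emitted above when fisttp is not
   available and the rounding mode must be forced to truncation:

	fldcw	%3		; load control word with RC = truncate
	fistp%Z0	%0	; convert and pop
	fldcw	%2		; restore the original control word

   With SSE3's fisttp a single "fisttp%Z0\t%0" suffices, since fisttp
   always truncates regardless of the control word.  */
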
14996 /* Output code for x87 ffreep insn. The OPNO argument, which may only
14997 have the values zero or one, indicates the ffreep insn's operand
14998 from the OPERANDS array. */
14999
15000 static const char *
15001 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15002 {
15003 if (TARGET_USE_FFREEP)
15004 #ifdef HAVE_AS_IX86_FFREEP
15005 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15006 #else
15007 {
15008 static char retval[32];
15009 int regno = REGNO (operands[opno]);
15010
15011 gcc_assert (FP_REGNO_P (regno));
15012
15013 regno -= FIRST_STACK_REG;
15014
15015 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15016 return retval;
15017 }
15018 #endif
15019
15020 return opno ? "fstp\t%y1" : "fstp\t%y0";
15021 }
15022
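/* Illustration of the fallback encoding above (assuming ASM_SHORT
   expands to a .value/.word directive, as on typical ELF targets):
   for %st(3) the string becomes ASM_SHORT "0xc3df", and the
   little-endian 16-bit value 0xc3df lays down the bytes 0xdf 0xc3,
   which is exactly the machine encoding of "ffreep %st(3)".  */
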
15023
15024 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15025 should be used. UNORDERED_P is true when fucom should be used. */
15026
15027 const char *
15028 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15029 {
15030 int stack_top_dies;
15031 rtx cmp_op0, cmp_op1;
15032 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15033
15034 if (eflags_p)
15035 {
15036 cmp_op0 = operands[0];
15037 cmp_op1 = operands[1];
15038 }
15039 else
15040 {
15041 cmp_op0 = operands[1];
15042 cmp_op1 = operands[2];
15043 }
15044
15045 if (is_sse)
15046 {
15047 if (GET_MODE (operands[0]) == SFmode)
15048 if (unordered_p)
15049 return "%vucomiss\t{%1, %0|%0, %1}";
15050 else
15051 return "%vcomiss\t{%1, %0|%0, %1}";
15052 else
15053 if (unordered_p)
15054 return "%vucomisd\t{%1, %0|%0, %1}";
15055 else
15056 return "%vcomisd\t{%1, %0|%0, %1}";
15057 }
15058
15059 gcc_assert (STACK_TOP_P (cmp_op0));
15060
15061 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15062
15063 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15064 {
15065 if (stack_top_dies)
15066 {
15067 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15068 return output_387_ffreep (operands, 1);
15069 }
15070 else
15071 return "ftst\n\tfnstsw\t%0";
15072 }
15073
15074 if (STACK_REG_P (cmp_op1)
15075 && stack_top_dies
15076 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15077 && REGNO (cmp_op1) != FIRST_STACK_REG)
15078 {
15079 /* If both the top of the 387 stack and the other operand, which
15080 is also a stack register, die, then this must be a
15081 `fcompp' float compare. */
15082
15083 if (eflags_p)
15084 {
15085 /* There is no double popping fcomi variant. Fortunately,
15086 eflags is immune from the fstp's cc clobbering. */
15087 if (unordered_p)
15088 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15089 else
15090 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15091 return output_387_ffreep (operands, 0);
15092 }
15093 else
15094 {
15095 if (unordered_p)
15096 return "fucompp\n\tfnstsw\t%0";
15097 else
15098 return "fcompp\n\tfnstsw\t%0";
15099 }
15100 }
15101 else
15102 {
15103 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15104
15105 static const char * const alt[16] =
15106 {
15107 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15108 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15109 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15110 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15111
15112 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15113 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15114 NULL,
15115 NULL,
15116
15117 "fcomi\t{%y1, %0|%0, %y1}",
15118 "fcomip\t{%y1, %0|%0, %y1}",
15119 "fucomi\t{%y1, %0|%0, %y1}",
15120 "fucomip\t{%y1, %0|%0, %y1}",
15121
15122 NULL,
15123 NULL,
15124 NULL,
15125 NULL
15126 };
15127
15128 int mask;
15129 const char *ret;
15130
15131 mask = eflags_p << 3;
15132 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15133 mask |= unordered_p << 1;
15134 mask |= stack_top_dies;
15135
15136 gcc_assert (mask < 16);
15137 ret = alt[mask];
15138 gcc_assert (ret);
15139
15140 return ret;
15141 }
15142 }
15143
15144 void
15145 ix86_output_addr_vec_elt (FILE *file, int value)
15146 {
15147 const char *directive = ASM_LONG;
15148
15149 #ifdef ASM_QUAD
15150 if (TARGET_LP64)
15151 directive = ASM_QUAD;
15152 #else
15153 gcc_assert (!TARGET_64BIT);
15154 #endif
15155
15156 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15157 }
15158
15159 void
15160 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15161 {
15162 const char *directive = ASM_LONG;
15163
15164 #ifdef ASM_QUAD
15165 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15166 directive = ASM_QUAD;
15167 #else
15168 gcc_assert (!TARGET_64BIT);
15169 #endif
15170 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15171 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15172 fprintf (file, "%s%s%d-%s%d\n",
15173 directive, LPREFIX, value, LPREFIX, rel);
15174 else if (HAVE_AS_GOTOFF_IN_DATA)
15175 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15176 #if TARGET_MACHO
15177 else if (TARGET_MACHO)
15178 {
15179 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15180 machopic_output_function_base_name (file);
15181 putc ('\n', file);
15182 }
15183 #endif
15184 else
15185 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15186 GOT_SYMBOL_NAME, LPREFIX, value);
15187 }
15188 \f
15189 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15190 for the target. */
15191
15192 void
15193 ix86_expand_clear (rtx dest)
15194 {
15195 rtx tmp;
15196
15197 /* We play register width games, which are only valid after reload. */
15198 gcc_assert (reload_completed);
15199
15200 /* Avoid HImode and its attendant prefix byte. */
15201 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15202 dest = gen_rtx_REG (SImode, REGNO (dest));
15203 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15204
15205 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15206 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15207 {
15208 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15209 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15210 }
15211
15212 emit_insn (tmp);
15213 }
15214
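/* For illustration (a sketch; the exact choice depends on tuning):
   clearing %eax when optimizing for speed emits "xorl %eax, %eax",
   represented as a PARALLEL with a (clobber (reg:CC flags)) so that it
   matches movsi_xor; only on TARGET_USE_MOV0 targets, and only when
   optimizing for size, is a plain "movl $0, %eax" kept instead.  */
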
15215 /* X is an unchanging MEM. If it is a constant pool reference, return
15216 the constant pool rtx, else NULL. */
15217
15218 rtx
15219 maybe_get_pool_constant (rtx x)
15220 {
15221 x = ix86_delegitimize_address (XEXP (x, 0));
15222
15223 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15224 return get_pool_constant (x);
15225
15226 return NULL_RTX;
15227 }
15228
15229 void
15230 ix86_expand_move (enum machine_mode mode, rtx operands[])
15231 {
15232 rtx op0, op1;
15233 enum tls_model model;
15234
15235 op0 = operands[0];
15236 op1 = operands[1];
15237
15238 if (GET_CODE (op1) == SYMBOL_REF)
15239 {
15240 model = SYMBOL_REF_TLS_MODEL (op1);
15241 if (model)
15242 {
15243 op1 = legitimize_tls_address (op1, model, true);
15244 op1 = force_operand (op1, op0);
15245 if (op1 == op0)
15246 return;
15247 if (GET_MODE (op1) != mode)
15248 op1 = convert_to_mode (mode, op1, 1);
15249 }
15250 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15251 && SYMBOL_REF_DLLIMPORT_P (op1))
15252 op1 = legitimize_dllimport_symbol (op1, false);
15253 }
15254 else if (GET_CODE (op1) == CONST
15255 && GET_CODE (XEXP (op1, 0)) == PLUS
15256 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15257 {
15258 rtx addend = XEXP (XEXP (op1, 0), 1);
15259 rtx symbol = XEXP (XEXP (op1, 0), 0);
15260 rtx tmp = NULL;
15261
15262 model = SYMBOL_REF_TLS_MODEL (symbol);
15263 if (model)
15264 tmp = legitimize_tls_address (symbol, model, true);
15265 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15266 && SYMBOL_REF_DLLIMPORT_P (symbol))
15267 tmp = legitimize_dllimport_symbol (symbol, true);
15268
15269 if (tmp)
15270 {
15271 tmp = force_operand (tmp, NULL);
15272 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15273 op0, 1, OPTAB_DIRECT);
15274 if (tmp == op0)
15275 return;
15276 if (GET_MODE (tmp) != mode)
15277 op1 = convert_to_mode (mode, tmp, 1);
15278 }
15279 }
15280
15281 if ((flag_pic || MACHOPIC_INDIRECT)
15282 && symbolic_operand (op1, mode))
15283 {
15284 if (TARGET_MACHO && !TARGET_64BIT)
15285 {
15286 #if TARGET_MACHO
15287 /* dynamic-no-pic */
15288 if (MACHOPIC_INDIRECT)
15289 {
15290 rtx temp = ((reload_in_progress
15291 || ((op0 && REG_P (op0))
15292 && mode == Pmode))
15293 ? op0 : gen_reg_rtx (Pmode));
15294 op1 = machopic_indirect_data_reference (op1, temp);
15295 if (MACHOPIC_PURE)
15296 op1 = machopic_legitimize_pic_address (op1, mode,
15297 temp == op1 ? 0 : temp);
15298 }
15299 if (op0 != op1 && GET_CODE (op0) != MEM)
15300 {
15301 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15302 emit_insn (insn);
15303 return;
15304 }
15305 if (GET_CODE (op0) == MEM)
15306 op1 = force_reg (Pmode, op1);
15307 else
15308 {
15309 rtx temp = op0;
15310 if (GET_CODE (temp) != REG)
15311 temp = gen_reg_rtx (Pmode);
15312 temp = legitimize_pic_address (op1, temp);
15313 if (temp == op0)
15314 return;
15315 op1 = temp;
15316 }
15317 /* dynamic-no-pic */
15318 #endif
15319 }
15320 else
15321 {
15322 if (MEM_P (op0))
15323 op1 = force_reg (mode, op1);
15324 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15325 {
15326 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15327 op1 = legitimize_pic_address (op1, reg);
15328 if (op0 == op1)
15329 return;
15330 if (GET_MODE (op1) != mode)
15331 op1 = convert_to_mode (mode, op1, 1);
15332 }
15333 }
15334 }
15335 else
15336 {
15337 if (MEM_P (op0)
15338 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15339 || !push_operand (op0, mode))
15340 && MEM_P (op1))
15341 op1 = force_reg (mode, op1);
15342
15343 if (push_operand (op0, mode)
15344 && ! general_no_elim_operand (op1, mode))
15345 op1 = copy_to_mode_reg (mode, op1);
15346
15347 /* Force large constants in 64bit compilation into a register
15348 to get them CSEd. */
15349 if (can_create_pseudo_p ()
15350 && (mode == DImode) && TARGET_64BIT
15351 && immediate_operand (op1, mode)
15352 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15353 && !register_operand (op0, mode)
15354 && optimize)
15355 op1 = copy_to_mode_reg (mode, op1);
15356
15357 if (can_create_pseudo_p ()
15358 && FLOAT_MODE_P (mode)
15359 && GET_CODE (op1) == CONST_DOUBLE)
15360 {
15361 /* If we are loading a floating point constant to a register,
15362 force the value to memory now, since we'll get better code
15363 out the back end. */
15364
15365 op1 = validize_mem (force_const_mem (mode, op1));
15366 if (!register_operand (op0, mode))
15367 {
15368 rtx temp = gen_reg_rtx (mode);
15369 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15370 emit_move_insn (op0, temp);
15371 return;
15372 }
15373 }
15374 }
15375
15376 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15377 }
15378
15379 void
15380 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15381 {
15382 rtx op0 = operands[0], op1 = operands[1];
15383 unsigned int align = GET_MODE_ALIGNMENT (mode);
15384
15385 /* Force constants other than zero into memory. We do not know how
15386 the instructions used to build constants modify the upper 64 bits
15387 of the register; once we have that information we may be able
15388 to handle some of them more efficiently. */
15389 if (can_create_pseudo_p ()
15390 && register_operand (op0, mode)
15391 && (CONSTANT_P (op1)
15392 || (GET_CODE (op1) == SUBREG
15393 && CONSTANT_P (SUBREG_REG (op1))))
15394 && !standard_sse_constant_p (op1))
15395 op1 = validize_mem (force_const_mem (mode, op1));
15396
15397 /* We need to check memory alignment for SSE mode since attributes
15398 can make operands unaligned. */
15399 if (can_create_pseudo_p ()
15400 && SSE_REG_MODE_P (mode)
15401 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15402 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15403 {
15404 rtx tmp[2];
15405
15406 /* ix86_expand_vector_move_misalign() does not like constants ... */
15407 if (CONSTANT_P (op1)
15408 || (GET_CODE (op1) == SUBREG
15409 && CONSTANT_P (SUBREG_REG (op1))))
15410 op1 = validize_mem (force_const_mem (mode, op1));
15411
15412 /* ... nor both arguments in memory. */
15413 if (!register_operand (op0, mode)
15414 && !register_operand (op1, mode))
15415 op1 = force_reg (mode, op1);
15416
15417 tmp[0] = op0; tmp[1] = op1;
15418 ix86_expand_vector_move_misalign (mode, tmp);
15419 return;
15420 }
15421
15422 /* Make operand1 a register if it isn't already. */
15423 if (can_create_pseudo_p ()
15424 && !register_operand (op0, mode)
15425 && !register_operand (op1, mode))
15426 {
15427 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15428 return;
15429 }
15430
15431 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15432 }
15433
15434 /* Split 32-byte AVX unaligned load and store if needed. */
15435
15436 static void
15437 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15438 {
15439 rtx m;
15440 rtx (*extract) (rtx, rtx, rtx);
15441 rtx (*move_unaligned) (rtx, rtx);
15442 enum machine_mode mode;
15443
15444 switch (GET_MODE (op0))
15445 {
15446 default:
15447 gcc_unreachable ();
15448 case V32QImode:
15449 extract = gen_avx_vextractf128v32qi;
15450 move_unaligned = gen_avx_movdqu256;
15451 mode = V16QImode;
15452 break;
15453 case V8SFmode:
15454 extract = gen_avx_vextractf128v8sf;
15455 move_unaligned = gen_avx_movups256;
15456 mode = V4SFmode;
15457 break;
15458 case V4DFmode:
15459 extract = gen_avx_vextractf128v4df;
15460 move_unaligned = gen_avx_movupd256;
15461 mode = V2DFmode;
15462 break;
15463 }
15464
15465 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15466 {
15467 rtx r = gen_reg_rtx (mode);
15468 m = adjust_address (op1, mode, 0);
15469 emit_move_insn (r, m);
15470 m = adjust_address (op1, mode, 16);
15471 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15472 emit_move_insn (op0, r);
15473 }
15474 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15475 {
15476 m = adjust_address (op0, mode, 0);
15477 emit_insn (extract (m, op1, const0_rtx));
15478 m = adjust_address (op0, mode, 16);
15479 emit_insn (extract (m, op1, const1_rtx));
15480 }
15481 else
15482 emit_insn (move_unaligned (op0, op1));
15483 }
15484
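/* A sketch of the splitting above (illustrative; actual instruction
   selection is left to the move/extract patterns): with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD a misaligned 32-byte load becomes
   roughly

	vmovups		(%rax), %xmm0
	vinsertf128	$1, 16(%rax), %ymm0, %ymm0

   and with TARGET_AVX256_SPLIT_UNALIGNED_STORE a store becomes a
   16-byte store of the low half plus "vextractf128 $1" to the upper
   16 bytes; otherwise a single 32-byte vmovups/vmovupd/vmovdqu is
   emitted.  */
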
15485 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15486 straight to ix86_expand_vector_move. */
15487 /* Code generation for scalar reg-reg moves of single and double precision data:
15488 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15489 movaps reg, reg
15490 else
15491 movss reg, reg
15492 if (x86_sse_partial_reg_dependency == true)
15493 movapd reg, reg
15494 else
15495 movsd reg, reg
15496
15497 Code generation for scalar loads of double precision data:
15498 if (x86_sse_split_regs == true)
15499 movlpd mem, reg (gas syntax)
15500 else
15501 movsd mem, reg
15502
15503 Code generation for unaligned packed loads of single precision data
15504 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15505 if (x86_sse_unaligned_move_optimal)
15506 movups mem, reg
15507
15508 if (x86_sse_partial_reg_dependency == true)
15509 {
15510 xorps reg, reg
15511 movlps mem, reg
15512 movhps mem+8, reg
15513 }
15514 else
15515 {
15516 movlps mem, reg
15517 movhps mem+8, reg
15518 }
15519
15520 Code generation for unaligned packed loads of double precision data
15521 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15522 if (x86_sse_unaligned_move_optimal)
15523 movupd mem, reg
15524
15525 if (x86_sse_split_regs == true)
15526 {
15527 movlpd mem, reg
15528 movhpd mem+8, reg
15529 }
15530 else
15531 {
15532 movsd mem, reg
15533 movhpd mem+8, reg
15534 }
15535 */
15536
15537 void
15538 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15539 {
15540 rtx op0, op1, m;
15541
15542 op0 = operands[0];
15543 op1 = operands[1];
15544
15545 if (TARGET_AVX)
15546 {
15547 switch (GET_MODE_CLASS (mode))
15548 {
15549 case MODE_VECTOR_INT:
15550 case MODE_INT:
15551 switch (GET_MODE_SIZE (mode))
15552 {
15553 case 16:
15554 /* If we're optimizing for size, movups is the smallest. */
15555 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15556 {
15557 op0 = gen_lowpart (V4SFmode, op0);
15558 op1 = gen_lowpart (V4SFmode, op1);
15559 emit_insn (gen_sse_movups (op0, op1));
15560 return;
15561 }
15562 op0 = gen_lowpart (V16QImode, op0);
15563 op1 = gen_lowpart (V16QImode, op1);
15564 emit_insn (gen_sse2_movdqu (op0, op1));
15565 break;
15566 case 32:
15567 op0 = gen_lowpart (V32QImode, op0);
15568 op1 = gen_lowpart (V32QImode, op1);
15569 ix86_avx256_split_vector_move_misalign (op0, op1);
15570 break;
15571 default:
15572 gcc_unreachable ();
15573 }
15574 break;
15575 case MODE_VECTOR_FLOAT:
15576 op0 = gen_lowpart (mode, op0);
15577 op1 = gen_lowpart (mode, op1);
15578
15579 switch (mode)
15580 {
15581 case V4SFmode:
15582 emit_insn (gen_sse_movups (op0, op1));
15583 break;
15584 case V8SFmode:
15585 ix86_avx256_split_vector_move_misalign (op0, op1);
15586 break;
15587 case V2DFmode:
15588 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15589 {
15590 op0 = gen_lowpart (V4SFmode, op0);
15591 op1 = gen_lowpart (V4SFmode, op1);
15592 emit_insn (gen_sse_movups (op0, op1));
15593 return;
15594 }
15595 emit_insn (gen_sse2_movupd (op0, op1));
15596 break;
15597 case V4DFmode:
15598 ix86_avx256_split_vector_move_misalign (op0, op1);
15599 break;
15600 default:
15601 gcc_unreachable ();
15602 }
15603 break;
15604
15605 default:
15606 gcc_unreachable ();
15607 }
15608
15609 return;
15610 }
15611
15612 if (MEM_P (op1))
15613 {
15614 /* If we're optimizing for size, movups is the smallest. */
15615 if (optimize_insn_for_size_p ()
15616 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15617 {
15618 op0 = gen_lowpart (V4SFmode, op0);
15619 op1 = gen_lowpart (V4SFmode, op1);
15620 emit_insn (gen_sse_movups (op0, op1));
15621 return;
15622 }
15623
15624 /* ??? If we have typed data, then it would appear that using
15625 movdqu is the only way to get unaligned data loaded with
15626 integer type. */
15627 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15628 {
15629 op0 = gen_lowpart (V16QImode, op0);
15630 op1 = gen_lowpart (V16QImode, op1);
15631 emit_insn (gen_sse2_movdqu (op0, op1));
15632 return;
15633 }
15634
15635 if (TARGET_SSE2 && mode == V2DFmode)
15636 {
15637 rtx zero;
15638
15639 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15640 {
15641 op0 = gen_lowpart (V2DFmode, op0);
15642 op1 = gen_lowpart (V2DFmode, op1);
15643 emit_insn (gen_sse2_movupd (op0, op1));
15644 return;
15645 }
15646
15647 /* When SSE registers are split into halves, we can avoid
15648 writing to the top half twice. */
15649 if (TARGET_SSE_SPLIT_REGS)
15650 {
15651 emit_clobber (op0);
15652 zero = op0;
15653 }
15654 else
15655 {
15656 /* ??? Not sure about the best option for the Intel chips.
15657 The following would seem to satisfy; the register is
15658 entirely cleared, breaking the dependency chain. We
15659 then store to the upper half, with a dependency depth
15660 of one. A rumor has it that Intel recommends two movsd
15661 followed by an unpacklpd, but this is unconfirmed. And
15662 given that the dependency depth of the unpacklpd would
15663 still be one, I'm not sure why this would be better. */
15664 zero = CONST0_RTX (V2DFmode);
15665 }
15666
15667 m = adjust_address (op1, DFmode, 0);
15668 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15669 m = adjust_address (op1, DFmode, 8);
15670 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15671 }
15672 else
15673 {
15674 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15675 {
15676 op0 = gen_lowpart (V4SFmode, op0);
15677 op1 = gen_lowpart (V4SFmode, op1);
15678 emit_insn (gen_sse_movups (op0, op1));
15679 return;
15680 }
15681
15682 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15683 emit_move_insn (op0, CONST0_RTX (mode));
15684 else
15685 emit_clobber (op0);
15686
15687 if (mode != V4SFmode)
15688 op0 = gen_lowpart (V4SFmode, op0);
15689 m = adjust_address (op1, V2SFmode, 0);
15690 emit_insn (gen_sse_loadlps (op0, op0, m));
15691 m = adjust_address (op1, V2SFmode, 8);
15692 emit_insn (gen_sse_loadhps (op0, op0, m));
15693 }
15694 }
15695 else if (MEM_P (op0))
15696 {
15697 /* If we're optimizing for size, movups is the smallest. */
15698 if (optimize_insn_for_size_p ()
15699 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15700 {
15701 op0 = gen_lowpart (V4SFmode, op0);
15702 op1 = gen_lowpart (V4SFmode, op1);
15703 emit_insn (gen_sse_movups (op0, op1));
15704 return;
15705 }
15706
15707 /* ??? Similar to above, only less clear because of quote
15708 typeless stores unquote. */
15709 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15710 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15711 {
15712 op0 = gen_lowpart (V16QImode, op0);
15713 op1 = gen_lowpart (V16QImode, op1);
15714 emit_insn (gen_sse2_movdqu (op0, op1));
15715 return;
15716 }
15717
15718 if (TARGET_SSE2 && mode == V2DFmode)
15719 {
15720 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15721 {
15722 op0 = gen_lowpart (V2DFmode, op0);
15723 op1 = gen_lowpart (V2DFmode, op1);
15724 emit_insn (gen_sse2_movupd (op0, op1));
15725 }
15726 else
15727 {
15728 m = adjust_address (op0, DFmode, 0);
15729 emit_insn (gen_sse2_storelpd (m, op1));
15730 m = adjust_address (op0, DFmode, 8);
15731 emit_insn (gen_sse2_storehpd (m, op1));
15732 }
15733 }
15734 else
15735 {
15736 if (mode != V4SFmode)
15737 op1 = gen_lowpart (V4SFmode, op1);
15738
15739 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15740 {
15741 op0 = gen_lowpart (V4SFmode, op0);
15742 emit_insn (gen_sse_movups (op0, op1));
15743 }
15744 else
15745 {
15746 m = adjust_address (op0, V2SFmode, 0);
15747 emit_insn (gen_sse_storelps (m, op1));
15748 m = adjust_address (op0, V2SFmode, 8);
15749 emit_insn (gen_sse_storehps (m, op1));
15750 }
15751 }
15752 }
15753 else
15754 gcc_unreachable ();
15755 }
15756
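/* A sketch of the pre-AVX fallbacks above (illustrative; the chosen
   sequence depends on the TARGET_SSE_* tuning flags): a misaligned
   V2DF load on a target without fast unaligned loads is done as a
   low-half load (movsd/movlpd) followed by "movhpd" of the upper
   8 bytes, after first breaking the dependency on the old register
   contents either with a clobber or by zeroing the register; the
   V4SF case uses movlps/movhps in the same way, and stores mirror
   this with movlpd/movhpd or movlps/movhps pairs.  */
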
15757 /* Expand a push in MODE. This is some mode for which we do not support
15758 proper push instructions, at least from the registers that we expect
15759 the value to live in. */
15760
15761 void
15762 ix86_expand_push (enum machine_mode mode, rtx x)
15763 {
15764 rtx tmp;
15765
15766 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15767 GEN_INT (-GET_MODE_SIZE (mode)),
15768 stack_pointer_rtx, 1, OPTAB_DIRECT);
15769 if (tmp != stack_pointer_rtx)
15770 emit_move_insn (stack_pointer_rtx, tmp);
15771
15772 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15773
15774 /* When we push an operand onto the stack, it has to be aligned at least
15775 at the function argument boundary. However, since we don't have
15776 the argument type, we can't determine the actual argument
15777 boundary. */
15778 emit_move_insn (tmp, x);
15779 }
15780
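/* For illustration (a sketch; the actual store instruction depends on
   the mode and register class): pushing an N-byte value this way
   emits an explicit stack adjustment followed by an ordinary move,
   roughly "sub $N, %sp" and then a store of the value to (%sp),
   instead of a real push instruction.  */
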
15781 /* Helper function of ix86_fixup_binary_operands to canonicalize
15782 operand order. Returns true if the operands should be swapped. */
15783
15784 static bool
15785 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15786 rtx operands[])
15787 {
15788 rtx dst = operands[0];
15789 rtx src1 = operands[1];
15790 rtx src2 = operands[2];
15791
15792 /* If the operation is not commutative, we can't do anything. */
15793 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15794 return false;
15795
15796 /* Highest priority is that src1 should match dst. */
15797 if (rtx_equal_p (dst, src1))
15798 return false;
15799 if (rtx_equal_p (dst, src2))
15800 return true;
15801
15802 /* Next highest priority is that immediate constants come second. */
15803 if (immediate_operand (src2, mode))
15804 return false;
15805 if (immediate_operand (src1, mode))
15806 return true;
15807
15808 /* Lowest priority is that memory references should come second. */
15809 if (MEM_P (src2))
15810 return false;
15811 if (MEM_P (src1))
15812 return true;
15813
15814 return false;
15815 }
15816
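/* Illustrative effect of the priorities above (a sketch): for a
   commutative operation such as "r1 = 5 + r1" or "r1 = mem + r1" the
   operands are swapped so that the first source matches the
   destination (as the two-address integer insns require) and the
   constant or memory reference ends up second; non-commutative codes
   are left untouched.  */
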
15817
15818 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15819 destination to use for the operation. If different from the true
15820 destination in operands[0], a copy operation will be required. */
15821
15822 rtx
15823 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15824 rtx operands[])
15825 {
15826 rtx dst = operands[0];
15827 rtx src1 = operands[1];
15828 rtx src2 = operands[2];
15829
15830 /* Canonicalize operand order. */
15831 if (ix86_swap_binary_operands_p (code, mode, operands))
15832 {
15833 rtx temp;
15834
15835 /* It is invalid to swap operands of different modes. */
15836 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15837
15838 temp = src1;
15839 src1 = src2;
15840 src2 = temp;
15841 }
15842
15843 /* Both source operands cannot be in memory. */
15844 if (MEM_P (src1) && MEM_P (src2))
15845 {
15846 /* Optimization: Only read from memory once. */
15847 if (rtx_equal_p (src1, src2))
15848 {
15849 src2 = force_reg (mode, src2);
15850 src1 = src2;
15851 }
15852 else
15853 src2 = force_reg (mode, src2);
15854 }
15855
15856 /* If the destination is memory, and we do not have matching source
15857 operands, do things in registers. */
15858 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15859 dst = gen_reg_rtx (mode);
15860
15861 /* Source 1 cannot be a constant. */
15862 if (CONSTANT_P (src1))
15863 src1 = force_reg (mode, src1);
15864
15865 /* Source 1 cannot be a non-matching memory. */
15866 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15867 src1 = force_reg (mode, src1);
15868
15869 /* Improve address combine. */
15870 if (code == PLUS
15871 && GET_MODE_CLASS (mode) == MODE_INT
15872 && MEM_P (src2))
15873 src2 = force_reg (mode, src2);
15874
15875 operands[1] = src1;
15876 operands[2] = src2;
15877 return dst;
15878 }
15879
15880 /* Similarly, but assume that the destination has already been
15881 set up properly. */
15882
15883 void
15884 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15885 enum machine_mode mode, rtx operands[])
15886 {
15887 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15888 gcc_assert (dst == operands[0]);
15889 }
15890
15891 /* Attempt to expand a binary operator. Make the expansion closer to the
15892 actual machine than just general_operand, which would allow 3 separate
15893 memory references (one output, two inputs) in a single insn. */
15894
15895 void
15896 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15897 rtx operands[])
15898 {
15899 rtx src1, src2, dst, op, clob;
15900
15901 dst = ix86_fixup_binary_operands (code, mode, operands);
15902 src1 = operands[1];
15903 src2 = operands[2];
15904
15905 /* Emit the instruction. */
15906
15907 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
15908 if (reload_in_progress)
15909 {
15910 /* Reload doesn't know about the flags register, and doesn't know that
15911 it doesn't want to clobber it. We can only do this with PLUS. */
15912 gcc_assert (code == PLUS);
15913 emit_insn (op);
15914 }
15915 else if (reload_completed
15916 && code == PLUS
15917 && !rtx_equal_p (dst, src1))
15918 {
15919 /* This is going to be an LEA; avoid splitting it later. */
15920 emit_insn (op);
15921 }
15922 else
15923 {
15924 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15925 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15926 }
15927
15928 /* Fix up the destination if needed. */
15929 if (dst != operands[0])
15930 emit_move_insn (operands[0], dst);
15931 }
15932
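/* A sketch of the RTL produced above for the common case (illustrative
   register names only): expanding "ax = ax + dx" in SImode yields

	(parallel [(set (reg:SI ax) (plus:SI (reg:SI ax) (reg:SI dx)))
		   (clobber (reg:CC flags))])

   matching the add patterns that clobber EFLAGS, while the
   reload_completed PLUS case with a non-matching destination emits a
   bare SET so it can become an lea.  */
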
15933 /* Return TRUE or FALSE depending on whether the binary operator meets the
15934 appropriate constraints. */
15935
15936 bool
15937 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
15938 rtx operands[3])
15939 {
15940 rtx dst = operands[0];
15941 rtx src1 = operands[1];
15942 rtx src2 = operands[2];
15943
15944 /* Both source operands cannot be in memory. */
15945 if (MEM_P (src1) && MEM_P (src2))
15946 return false;
15947
15948 /* Canonicalize operand order for commutative operators. */
15949 if (ix86_swap_binary_operands_p (code, mode, operands))
15950 {
15951 rtx temp = src1;
15952 src1 = src2;
15953 src2 = temp;
15954 }
15955
15956 /* If the destination is memory, we must have a matching source operand. */
15957 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15958 return false;
15959
15960 /* Source 1 cannot be a constant. */
15961 if (CONSTANT_P (src1))
15962 return false;
15963
15964 /* Source 1 cannot be a non-matching memory. */
15965 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15966 /* Support "andhi/andsi/anddi" as a zero-extending move. */
15967 return (code == AND
15968 && (mode == HImode
15969 || mode == SImode
15970 || (TARGET_64BIT && mode == DImode))
15971 && satisfies_constraint_L (src2));
15972
15973 return true;
15974 }
15975
15976 /* Attempt to expand a unary operator. Make the expansion closer to the
15977 actual machine than just general_operand, which would allow 2 separate
15978 memory references (one output, one input) in a single insn. */
15979
15980 void
15981 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
15982 rtx operands[])
15983 {
15984 int matching_memory;
15985 rtx src, dst, op, clob;
15986
15987 dst = operands[0];
15988 src = operands[1];
15989
15990 /* If the destination is memory, and we do not have matching source
15991 operands, do things in registers. */
15992 matching_memory = 0;
15993 if (MEM_P (dst))
15994 {
15995 if (rtx_equal_p (dst, src))
15996 matching_memory = 1;
15997 else
15998 dst = gen_reg_rtx (mode);
15999 }
16000
16001 /* When source operand is memory, destination must match. */
16002 if (MEM_P (src) && !matching_memory)
16003 src = force_reg (mode, src);
16004
16005 /* Emit the instruction. */
16006
16007 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16008 if (reload_in_progress || code == NOT)
16009 {
16010 /* Reload doesn't know about the flags register, and doesn't know that
16011 it doesn't want to clobber it. */
16012 gcc_assert (code == NOT);
16013 emit_insn (op);
16014 }
16015 else
16016 {
16017 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16018 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16019 }
16020
16021 /* Fix up the destination if needed. */
16022 if (dst != operands[0])
16023 emit_move_insn (operands[0], dst);
16024 }
16025
16026 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16027 divisor are within the range [0-255]. */
16028
16029 void
16030 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16031 bool signed_p)
16032 {
16033 rtx end_label, qimode_label;
16034 rtx insn, div, mod;
16035 rtx scratch, tmp0, tmp1, tmp2;
16036 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16037 rtx (*gen_zero_extend) (rtx, rtx);
16038 rtx (*gen_test_ccno_1) (rtx, rtx);
16039
16040 switch (mode)
16041 {
16042 case SImode:
16043 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16044 gen_test_ccno_1 = gen_testsi_ccno_1;
16045 gen_zero_extend = gen_zero_extendqisi2;
16046 break;
16047 case DImode:
16048 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16049 gen_test_ccno_1 = gen_testdi_ccno_1;
16050 gen_zero_extend = gen_zero_extendqidi2;
16051 break;
16052 default:
16053 gcc_unreachable ();
16054 }
16055
16056 end_label = gen_label_rtx ();
16057 qimode_label = gen_label_rtx ();
16058
16059 scratch = gen_reg_rtx (mode);
16060
16061 /* Use 8bit unsigned divmod if dividend and divisor are within
16062 the range [0-255]. */
16063 emit_move_insn (scratch, operands[2]);
16064 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16065 scratch, 1, OPTAB_DIRECT);
16066 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16067 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16068 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16069 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16070 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16071 pc_rtx);
16072 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16073 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16074 JUMP_LABEL (insn) = qimode_label;
16075
16076 /* Generate original signed/unsigned divmod. */
16077 div = gen_divmod4_1 (operands[0], operands[1],
16078 operands[2], operands[3]);
16079 emit_insn (div);
16080
16081 /* Branch to the end. */
16082 emit_jump_insn (gen_jump (end_label));
16083 emit_barrier ();
16084
16085 /* Generate 8bit unsigned divide. */
16086 emit_label (qimode_label);
16087 /* Don't use operands[0] for result of 8bit divide since not all
16088 registers support QImode ZERO_EXTRACT. */
16089 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16090 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16091 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16092 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16093
16094 if (signed_p)
16095 {
16096 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16097 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16098 }
16099 else
16100 {
16101 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16102 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16103 }
16104
16105 /* Extract remainder from AH. */
16106 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16107 if (REG_P (operands[1]))
16108 insn = emit_move_insn (operands[1], tmp1);
16109 else
16110 {
16111 /* Need a new scratch register since the old one has the result
16112 of the 8bit divide. */
16113 scratch = gen_reg_rtx (mode);
16114 emit_move_insn (scratch, tmp1);
16115 insn = emit_move_insn (operands[1], scratch);
16116 }
16117 set_unique_reg_note (insn, REG_EQUAL, mod);
16118
16119 /* Zero extend quotient from AL. */
16120 tmp1 = gen_lowpart (QImode, tmp0);
16121 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16122 set_unique_reg_note (insn, REG_EQUAL, div);
16123
16124 emit_label (end_label);
16125 }
16126
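/* For illustration only (an assumed shape of the output at -O2 for an
   unsigned SImode division, not taken verbatim from any target):

	movl	%edi, %ecx
	orl	%esi, %ecx		; dividend | divisor
	testl	$-256, %ecx		; both fit in 8 bits?
	je	.Lqimode
	...full 32-bit divl, quotient/remainder in %eax/%edx...
	jmp	.Ldone
   .Lqimode:
	...8-bit divb, quotient in %al, remainder in %ah...
   .Ldone:
   */
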
16127 #define LEA_MAX_STALL (3)
16128 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16129
16130 /* Increase given DISTANCE in half-cycles according to
16131 dependencies between PREV and NEXT instructions.
16132 Add 1 half-cycle if there is no dependency and
16133 go to the next cycle if there is some dependency. */
16134
16135 static unsigned int
16136 increase_distance (rtx prev, rtx next, unsigned int distance)
16137 {
16138 df_ref *use_rec;
16139 df_ref *def_rec;
16140
16141 if (!prev || !next)
16142 return distance + (distance & 1) + 2;
16143
16144 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16145 return distance + 1;
16146
16147 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16148 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16149 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16150 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16151 return distance + (distance & 1) + 2;
16152
16153 return distance + 1;
16154 }
16155
16156 /* Function checks if instruction INSN defines register number
16157 REGNO1 or REGNO2. */
16158
16159 static bool
16160 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16161 rtx insn)
16162 {
16163 df_ref *def_rec;
16164
16165 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16166 if (DF_REF_REG_DEF_P (*def_rec)
16167 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16168 && (regno1 == DF_REF_REGNO (*def_rec)
16169 || regno2 == DF_REF_REGNO (*def_rec)))
16170 {
16171 return true;
16172 }
16173
16174 return false;
16175 }
16176
16177 /* Function checks if instruction INSN uses register number
16178 REGNO as a part of address expression. */
16179
16180 static bool
16181 insn_uses_reg_mem (unsigned int regno, rtx insn)
16182 {
16183 df_ref *use_rec;
16184
16185 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16186 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16187 return true;
16188
16189 return false;
16190 }
16191
16192 /* Search backward for non-agu definition of register number REGNO1
16193 or register number REGNO2 in basic block starting from instruction
16194 START up to head of basic block or instruction INSN.
16195
16196 The function sets *FOUND to true if a definition was found
16197 and to false otherwise.
16198
16199 Distance in half-cycles between START and found instruction or head
16200 of BB is added to DISTANCE and returned. */
16201
16202 static int
16203 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16204 rtx insn, int distance,
16205 rtx start, bool *found)
16206 {
16207 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16208 rtx prev = start;
16209 rtx next = NULL;
16210 enum attr_type insn_type;
16211
16212 *found = false;
16213
16214 while (prev
16215 && prev != insn
16216 && distance < LEA_SEARCH_THRESHOLD)
16217 {
16218 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16219 {
16220 distance = increase_distance (prev, next, distance);
16221 if (insn_defines_reg (regno1, regno2, prev))
16222 {
16223 insn_type = get_attr_type (prev);
16224 if (insn_type != TYPE_LEA)
16225 {
16226 *found = true;
16227 return distance;
16228 }
16229 }
16230
16231 next = prev;
16232 }
16233 if (prev == BB_HEAD (bb))
16234 break;
16235
16236 prev = PREV_INSN (prev);
16237 }
16238
16239 return distance;
16240 }
16241
16242 /* Search backward for a non-AGU definition of register number REGNO1
16243 or register number REGNO2 in INSN's basic block until we
16244 1. pass LEA_SEARCH_THRESHOLD instructions, or
16245 2. reach a neighbouring BB's boundary, or
16246 3. reach an AGU definition.
16247 Return the distance between the non-AGU definition point and INSN.
16248 If there is no definition point, return -1. */
16249
16250 static int
16251 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16252 rtx insn)
16253 {
16254 basic_block bb = BLOCK_FOR_INSN (insn);
16255 int distance = 0;
16256 bool found = false;
16257
16258 if (insn != BB_HEAD (bb))
16259 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16260 distance, PREV_INSN (insn),
16261 &found);
16262
16263 if (!found && distance < LEA_SEARCH_THRESHOLD)
16264 {
16265 edge e;
16266 edge_iterator ei;
16267 bool simple_loop = false;
16268
16269 FOR_EACH_EDGE (e, ei, bb->preds)
16270 if (e->src == bb)
16271 {
16272 simple_loop = true;
16273 break;
16274 }
16275
16276 if (simple_loop)
16277 distance = distance_non_agu_define_in_bb (regno1, regno2,
16278 insn, distance,
16279 BB_END (bb), &found);
16280 else
16281 {
16282 int shortest_dist = -1;
16283 bool found_in_bb = false;
16284
16285 FOR_EACH_EDGE (e, ei, bb->preds)
16286 {
16287 int bb_dist
16288 = distance_non_agu_define_in_bb (regno1, regno2,
16289 insn, distance,
16290 BB_END (e->src),
16291 &found_in_bb);
16292 if (found_in_bb)
16293 {
16294 if (shortest_dist < 0)
16295 shortest_dist = bb_dist;
16296 else if (bb_dist > 0)
16297 shortest_dist = MIN (bb_dist, shortest_dist);
16298
16299 found = true;
16300 }
16301 }
16302
16303 distance = shortest_dist;
16304 }
16305 }
16306
16307 /* get_attr_type may modify recog data. We want to make sure
16308 that recog data is valid for instruction INSN, on which
16309 distance_non_agu_define is called. INSN is unchanged here. */
16310 extract_insn_cached (insn);
16311
16312 if (!found)
16313 return -1;
16314
16315 return distance >> 1;
16316 }
16317
16318 /* Return the distance in half-cycles between INSN and the next
16319 insn that uses register number REGNO in a memory address, added
16320 to DISTANCE. Return -1 if REGNO is set.
16321
16322 Put true value into *FOUND if register usage was found and
16323 false otherwise.
16324 Put true value into *REDEFINED if register redefinition was
16325 found and false otherwise. */
16326
16327 static int
16328 distance_agu_use_in_bb (unsigned int regno,
16329 rtx insn, int distance, rtx start,
16330 bool *found, bool *redefined)
16331 {
16332 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16333 rtx next = start;
16334 rtx prev = NULL;
16335
16336 *found = false;
16337 *redefined = false;
16338
16339 while (next
16340 && next != insn
16341 && distance < LEA_SEARCH_THRESHOLD)
16342 {
16343 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16344 {
16345 distance = increase_distance (prev, next, distance);
16346 if (insn_uses_reg_mem (regno, next))
16347 {
16348 /* Return DISTANCE if OP0 is used in memory
16349 address in NEXT. */
16350 *found = true;
16351 return distance;
16352 }
16353
16354 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16355 {
16356 /* Return -1 if OP0 is set in NEXT. */
16357 *redefined = true;
16358 return -1;
16359 }
16360
16361 prev = next;
16362 }
16363
16364 if (next == BB_END (bb))
16365 break;
16366
16367 next = NEXT_INSN (next);
16368 }
16369
16370 return distance;
16371 }
16372
16373 /* Return the distance between INSN and the next insn that uses
16374 register number REGNO0 in a memory address. Return -1 if no such
16375 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16376
16377 static int
16378 distance_agu_use (unsigned int regno0, rtx insn)
16379 {
16380 basic_block bb = BLOCK_FOR_INSN (insn);
16381 int distance = 0;
16382 bool found = false;
16383 bool redefined = false;
16384
16385 if (insn != BB_END (bb))
16386 distance = distance_agu_use_in_bb (regno0, insn, distance,
16387 NEXT_INSN (insn),
16388 &found, &redefined);
16389
16390 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16391 {
16392 edge e;
16393 edge_iterator ei;
16394 bool simple_loop = false;
16395
16396 FOR_EACH_EDGE (e, ei, bb->succs)
16397 if (e->dest == bb)
16398 {
16399 simple_loop = true;
16400 break;
16401 }
16402
16403 if (simple_loop)
16404 distance = distance_agu_use_in_bb (regno0, insn,
16405 distance, BB_HEAD (bb),
16406 &found, &redefined);
16407 else
16408 {
16409 int shortest_dist = -1;
16410 bool found_in_bb = false;
16411 bool redefined_in_bb = false;
16412
16413 FOR_EACH_EDGE (e, ei, bb->succs)
16414 {
16415 int bb_dist
16416 = distance_agu_use_in_bb (regno0, insn,
16417 distance, BB_HEAD (e->dest),
16418 &found_in_bb, &redefined_in_bb);
16419 if (found_in_bb)
16420 {
16421 if (shortest_dist < 0)
16422 shortest_dist = bb_dist;
16423 else if (bb_dist > 0)
16424 shortest_dist = MIN (bb_dist, shortest_dist);
16425
16426 found = true;
16427 }
16428 }
16429
16430 distance = shortest_dist;
16431 }
16432 }
16433
16434 if (!found || redefined)
16435 return -1;
16436
16437 return distance >> 1;
16438 }
16439
16440 /* Define this macro to tune LEA priority vs. ADD; it takes effect when
16441 there is a dilemma of choosing LEA or ADD.
16442 Negative value: ADD is preferred over LEA
16443 Zero: Neutral
16444 Positive value: LEA is preferred over ADD. */
16445 #define IX86_LEA_PRIORITY 0
16446
16447 /* Return true if using lea INSN has a performance advantage
16448 over a sequence of instructions. The instruction sequence has
16449 SPLIT_COST cycles higher latency than the lea latency. */
16450
16451 bool
16452 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16453 unsigned int regno2, unsigned int split_cost)
16454 {
16455 int dist_define, dist_use;
16456
16457 dist_define = distance_non_agu_define (regno1, regno2, insn);
16458 dist_use = distance_agu_use (regno0, insn);
16459
16460 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16461 {
16462 /* If there is no non-AGU operand definition, no AGU
16463 operand usage and the split cost is 0, then both the lea
16464 and non-lea variants have the same priority. Currently
16465 we prefer lea for 64-bit code and non-lea for 32-bit
16466 code. */
16467 if (dist_use < 0 && split_cost == 0)
16468 return TARGET_64BIT || IX86_LEA_PRIORITY;
16469 else
16470 return true;
16471 }
16472
16473 /* The longer the definition distance, the more preferable lea is.
16474 Here we adjust it to take the splitting cost and
16475 lea priority into account. */
16476 dist_define += split_cost + IX86_LEA_PRIORITY;
16477
16478 /* If there is no use in a memory address then we just check
16479 that the split cost does not exceed the AGU stall. */
16480 if (dist_use < 0)
16481 return dist_define >= LEA_MAX_STALL;
16482
16483 /* If this insn has both a backward non-AGU dependence and a forward
16484 AGU dependence, the one with the shorter distance takes effect. */
16485 return dist_define >= dist_use;
16486 }
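/* A worked example (illustrative numbers only): if the lea's source
registers were defined 1 half-cycle ago by non-AGU instructions
(dist_define == 1), the result feeds an address 4 half-cycles later
(dist_use == 4) and splitting costs 1 extra cycle, then
dist_define + split_cost + IX86_LEA_PRIORITY == 2 < 4, so we return
false and the add/shift sequence is preferred over the lea.  */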
16487
16488 /* Return true if it is legal to clobber flags by INSN and
16489 false otherwise. */
16490
16491 static bool
16492 ix86_ok_to_clobber_flags (rtx insn)
16493 {
16494 basic_block bb = BLOCK_FOR_INSN (insn);
16495 df_ref *use;
16496 bitmap live;
16497
16498 while (insn)
16499 {
16500 if (NONDEBUG_INSN_P (insn))
16501 {
16502 for (use = DF_INSN_USES (insn); *use; use++)
16503 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16504 return false;
16505
16506 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16507 return true;
16508 }
16509
16510 if (insn == BB_END (bb))
16511 break;
16512
16513 insn = NEXT_INSN (insn);
16514 }
16515
16516 live = df_get_live_out (bb);
16517 return !REGNO_REG_SET_P (live, FLAGS_REG);
16518 }
16519
16520 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16521 move and add to avoid AGU stalls. */
16522
16523 bool
16524 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16525 {
16526 unsigned int regno0 = true_regnum (operands[0]);
16527 unsigned int regno1 = true_regnum (operands[1]);
16528 unsigned int regno2 = true_regnum (operands[2]);
16529
16530 /* Check if we need to optimize. */
16531 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16532 return false;
16533
16534 /* Check it is correct to split here. */
16535 if (!ix86_ok_to_clobber_flags (insn))
16536 return false;
16537
16538 /* We only need to split adds with a non-destructive
16539 destination operand. */
16540 if (regno0 == regno1 || regno0 == regno2)
16541 return false;
16542 else
16543 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16544 }
16545
16546 /* Return true if we should emit lea instruction instead of mov
16547 instruction. */
16548
16549 bool
16550 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16551 {
16552 unsigned int regno0;
16553 unsigned int regno1;
16554
16555 /* Check if we need to optimize. */
16556 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16557 return false;
16558
16559 /* Use lea for reg to reg moves only. */
16560 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16561 return false;
16562
16563 regno0 = true_regnum (operands[0]);
16564 regno1 = true_regnum (operands[1]);
16565
16566 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16567 }
16568
16569 /* Return true if we need to split lea into a sequence of
16570 instructions to avoid AGU stalls. */
16571
16572 bool
16573 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16574 {
16575 unsigned int regno0 = true_regnum (operands[0]);
16576 unsigned int regno1 = -1;
16577 unsigned int regno2 = -1;
16578 unsigned int split_cost = 0;
16579 struct ix86_address parts;
16580 int ok;
16581
16582 /* Check if we need to optimize. */
16583 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16584 return false;
16585
16586 /* Check it is correct to split here. */
16587 if (!ix86_ok_to_clobber_flags (insn))
16588 return false;
16589
16590 ok = ix86_decompose_address (operands[1], &parts);
16591 gcc_assert (ok);
16592
16593 /* We should not split into an add if a non-legitimate PIC
16594 operand is used as the displacement. */
16595 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16596 return false;
16597
16598 if (parts.base)
16599 regno1 = true_regnum (parts.base);
16600 if (parts.index)
16601 regno2 = true_regnum (parts.index);
16602
16603 /* Compute how many cycles we will add to the execution time
16604 if we split the lea into a sequence of instructions. */
16605 if (parts.base || parts.index)
16606 {
16607 /* Have to use a mov instruction if the non-destructive
16608 destination form is used. */
16609 if (regno1 != regno0 && regno2 != regno0)
16610 split_cost += 1;
16611
16612 /* Have to add index to base if both exist. */
16613 if (parts.base && parts.index)
16614 split_cost += 1;
16615
16616 /* Have to use shift and adds if scale is 2 or greater. */
16617 if (parts.scale > 1)
16618 {
16619 if (regno0 != regno1)
16620 split_cost += 1;
16621 else if (regno2 == regno0)
16622 split_cost += 4;
16623 else
16624 split_cost += parts.scale;
16625 }
16626
16627 /* Have to use an add instruction with an immediate if
16628 disp is non-zero. */
16629 if (parts.disp && parts.disp != const0_rtx)
16630 split_cost += 1;
16631
16632 /* Subtract the price of lea. */
16633 split_cost -= 1;
16634 }
16635
16636 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16637 }
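/* Example of the cost computation above (illustrative operands): for
lea 0x4(%ebx,%ecx,2), %eax we count 1 for the mov (the destination
differs from both base and index), 1 for adding the index to the base,
1 for the shift implied by scale 2, 1 for adding the displacement,
minus 1 for the lea itself, giving split_cost == 3.  */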
16638
16639 /* Emit the x86 binary operation CODE in mode MODE, where the first operand
16640 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16641
16642 static void
16643 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16644 rtx dst, rtx src)
16645 {
16646 rtx op, clob;
16647
16648 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16649 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16650
16651 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16652 }
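/* E.g. ix86_emit_binop (PLUS, SImode, dst, src) emits
(parallel [(set dst (plus:SI dst src))
(clobber (reg:CC FLAGS_REG))]).  */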
16653
16654 /* Split a lea instruction into a sequence of instructions
16655 which are executed on the ALU to avoid AGU stalls.
16656 It is assumed that it is allowed to clobber the flags register
16657 at the lea position. */
16658
16659 void
16660 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16661 {
16662 unsigned int regno0 = true_regnum (operands[0]);
16663 unsigned int regno1 = INVALID_REGNUM;
16664 unsigned int regno2 = INVALID_REGNUM;
16665 struct ix86_address parts;
16666 rtx tmp;
16667 int ok, adds;
16668
16669 ok = ix86_decompose_address (operands[1], &parts);
16670 gcc_assert (ok);
16671
16672 if (parts.base)
16673 {
16674 if (GET_MODE (parts.base) != mode)
16675 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16676 regno1 = true_regnum (parts.base);
16677 }
16678
16679 if (parts.index)
16680 {
16681 if (GET_MODE (parts.index) != mode)
16682 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16683 regno2 = true_regnum (parts.index);
16684 }
16685
16686 if (parts.scale > 1)
16687 {
16688 /* Case r1 = r1 + ... */
16689 if (regno1 == regno0)
16690 {
16691 /* If we have the case r1 = r1 + C * r1 then we
16692 would have to use multiplication, which is very
16693 expensive. Assume the cost model is wrong if we
16694 hit such a case here. */
16695 gcc_assert (regno2 != regno0);
16696
16697 for (adds = parts.scale; adds > 0; adds--)
16698 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16699 }
16700 else
16701 {
16702 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16703 if (regno0 != regno2)
16704 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16705
16706 /* Use shift for scaling. */
16707 ix86_emit_binop (ASHIFT, mode, operands[0],
16708 GEN_INT (exact_log2 (parts.scale)));
16709
16710 if (parts.base)
16711 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16712
16713 if (parts.disp && parts.disp != const0_rtx)
16714 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16715 }
16716 }
16717 else if (!parts.base && !parts.index)
16718 {
16719 gcc_assert (parts.disp);
16720 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16721 }
16722 else
16723 {
16724 if (!parts.base)
16725 {
16726 if (regno0 != regno2)
16727 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16728 }
16729 else if (!parts.index)
16730 {
16731 if (regno0 != regno1)
16732 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16733 }
16734 else
16735 {
16736 if (regno0 == regno1)
16737 tmp = parts.index;
16738 else if (regno0 == regno2)
16739 tmp = parts.base;
16740 else
16741 {
16742 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16743 tmp = parts.index;
16744 }
16745
16746 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16747 }
16748
16749 if (parts.disp && parts.disp != const0_rtx)
16750 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16751 }
16752 }
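/* For example (illustrative only), the splitter above turns
lea 0x4(%ebx,%ecx,4), %eax into roughly:
mov %ecx, %eax
shl $2, %eax
add %ebx, %eax
add $4, %eax
assuming %eax is distinct from both %ebx and %ecx.  */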
16753
16754 /* Return true if it is OK to optimize an ADD operation into a LEA
16755 operation to avoid flag register consumption. For most processors,
16756 ADD is faster than LEA. For processors like Atom, if the
16757 destination register of the LEA holds an actual address which will be
16758 used soon, LEA is better, and otherwise ADD is better. */
16759
16760 bool
16761 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16762 {
16763 unsigned int regno0 = true_regnum (operands[0]);
16764 unsigned int regno1 = true_regnum (operands[1]);
16765 unsigned int regno2 = true_regnum (operands[2]);
16766
16767 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16768 if (regno0 != regno1 && regno0 != regno2)
16769 return true;
16770
16771 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16772 return false;
16773
16774 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16775 }
16776
16777 /* Return true if destination reg of SET_BODY is shift count of
16778 USE_BODY. */
16779
16780 static bool
16781 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16782 {
16783 rtx set_dest;
16784 rtx shift_rtx;
16785 int i;
16786
16787 /* Retrieve destination of SET_BODY. */
16788 switch (GET_CODE (set_body))
16789 {
16790 case SET:
16791 set_dest = SET_DEST (set_body);
16792 if (!set_dest || !REG_P (set_dest))
16793 return false;
16794 break;
16795 case PARALLEL:
16796 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16797 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16798 use_body))
16799 return true;
16800 default:
16801 return false;
16802 break;
16803 }
16804
16805 /* Retrieve shift count of USE_BODY. */
16806 switch (GET_CODE (use_body))
16807 {
16808 case SET:
16809 shift_rtx = XEXP (use_body, 1);
16810 break;
16811 case PARALLEL:
16812 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16813 if (ix86_dep_by_shift_count_body (set_body,
16814 XVECEXP (use_body, 0, i)))
16815 return true;
16816 default:
16817 return false;
16818 break;
16819 }
16820
16821 if (shift_rtx
16822 && (GET_CODE (shift_rtx) == ASHIFT
16823 || GET_CODE (shift_rtx) == LSHIFTRT
16824 || GET_CODE (shift_rtx) == ASHIFTRT
16825 || GET_CODE (shift_rtx) == ROTATE
16826 || GET_CODE (shift_rtx) == ROTATERT))
16827 {
16828 rtx shift_count = XEXP (shift_rtx, 1);
16829
16830 /* Return true if shift count is dest of SET_BODY. */
16831 if (REG_P (shift_count)
16832 && true_regnum (set_dest) == true_regnum (shift_count))
16833 return true;
16834 }
16835
16836 return false;
16837 }
16838
16839 /* Return true if destination reg of SET_INSN is shift count of
16840 USE_INSN. */
16841
16842 bool
16843 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16844 {
16845 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16846 PATTERN (use_insn));
16847 }
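/* E.g. this returns true when SET_INSN sets %ecx and USE_INSN is a
shift whose count operand is (reg:QI cl), since both map to the same
true_regnum.  */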
16848
16849 /* Return TRUE or FALSE depending on whether the unary operator meets the
16850 appropriate constraints. */
16851
16852 bool
16853 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16854 enum machine_mode mode ATTRIBUTE_UNUSED,
16855 rtx operands[2] ATTRIBUTE_UNUSED)
16856 {
16857 /* If one of operands is memory, source and destination must match. */
16858 if ((MEM_P (operands[0])
16859 || MEM_P (operands[1]))
16860 && ! rtx_equal_p (operands[0], operands[1]))
16861 return false;
16862 return true;
16863 }
16864
16865 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16866 are ok, keeping in mind the possible movddup alternative. */
16867
16868 bool
16869 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16870 {
16871 if (MEM_P (operands[0]))
16872 return rtx_equal_p (operands[0], operands[1 + high]);
16873 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16874 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16875 return true;
16876 }
16877
16878 /* Post-reload splitter for converting an SF or DFmode value in an
16879 SSE register into an unsigned SImode value. */
16880
16881 void
16882 ix86_split_convert_uns_si_sse (rtx operands[])
16883 {
16884 enum machine_mode vecmode;
16885 rtx value, large, zero_or_two31, input, two31, x;
16886
16887 large = operands[1];
16888 zero_or_two31 = operands[2];
16889 input = operands[3];
16890 two31 = operands[4];
16891 vecmode = GET_MODE (large);
16892 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16893
16894 /* Load up the value into the low element. We must ensure that the other
16895 elements are valid floats -- zero is the easiest such value. */
16896 if (MEM_P (input))
16897 {
16898 if (vecmode == V4SFmode)
16899 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16900 else
16901 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16902 }
16903 else
16904 {
16905 input = gen_rtx_REG (vecmode, REGNO (input));
16906 emit_move_insn (value, CONST0_RTX (vecmode));
16907 if (vecmode == V4SFmode)
16908 emit_insn (gen_sse_movss (value, value, input));
16909 else
16910 emit_insn (gen_sse2_movsd (value, value, input));
16911 }
16912
16913 emit_move_insn (large, two31);
16914 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
16915
16916 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
16917 emit_insn (gen_rtx_SET (VOIDmode, large, x));
16918
16919 x = gen_rtx_AND (vecmode, zero_or_two31, large);
16920 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
16921
16922 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
16923 emit_insn (gen_rtx_SET (VOIDmode, value, x));
16924
16925 large = gen_rtx_REG (V4SImode, REGNO (large));
16926 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
16927
16928 x = gen_rtx_REG (V4SImode, REGNO (value));
16929 if (vecmode == V4SFmode)
16930 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
16931 else
16932 emit_insn (gen_sse2_cvttpd2dq (x, value));
16933 value = x;
16934
16935 emit_insn (gen_xorv4si3 (value, value, large));
16936 }
16937
16938 /* Convert an unsigned DImode value into a DFmode, using only SSE.
16939 Expects the 64-bit DImode to be supplied in a pair of integral
16940 registers. Requires SSE2; will use SSE3 if available. For x86_32,
16941 -mfpmath=sse, !optimize_size only. */
16942
16943 void
16944 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
16945 {
16946 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
16947 rtx int_xmm, fp_xmm;
16948 rtx biases, exponents;
16949 rtx x;
16950
16951 int_xmm = gen_reg_rtx (V4SImode);
16952 if (TARGET_INTER_UNIT_MOVES)
16953 emit_insn (gen_movdi_to_sse (int_xmm, input));
16954 else if (TARGET_SSE_SPLIT_REGS)
16955 {
16956 emit_clobber (int_xmm);
16957 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
16958 }
16959 else
16960 {
16961 x = gen_reg_rtx (V2DImode);
16962 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
16963 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
16964 }
16965
16966 x = gen_rtx_CONST_VECTOR (V4SImode,
16967 gen_rtvec (4, GEN_INT (0x43300000UL),
16968 GEN_INT (0x45300000UL),
16969 const0_rtx, const0_rtx));
16970 exponents = validize_mem (force_const_mem (V4SImode, x));
16971
16972 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
16973 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
16974
16975 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
16976 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
16977 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
16978 (0x1.0p84 + double(fp_value_hi_xmm)).
16979 Note these exponents differ by 32. */
16980
16981 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
16982
16983 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
16984 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
16985 real_ldexp (&bias_lo_rvt, &dconst1, 52);
16986 real_ldexp (&bias_hi_rvt, &dconst1, 84);
16987 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
16988 x = const_double_from_real_value (bias_hi_rvt, DFmode);
16989 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
16990 biases = validize_mem (force_const_mem (V2DFmode, biases));
16991 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
16992
16993 /* Add the upper and lower DFmode values together. */
16994 if (TARGET_SSE3)
16995 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
16996 else
16997 {
16998 x = copy_to_mode_reg (V2DFmode, fp_xmm);
16999 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17000 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17001 }
17002
17003 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17004 }
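/* A worked example of the bias trick above: for input 2^32 + 3 the low
and high words are 3 and 1, so after the interleave the two doubles are
0x1.0p52 + 3 and 0x1.0p84 + 2^32; subtracting the biases leaves 3.0 and
2^32, and the final addition produces 4294967299.0 exactly.  */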
17005
17006 /* Not used, but eases macroization of patterns. */
17007 void
17008 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17009 rtx input ATTRIBUTE_UNUSED)
17010 {
17011 gcc_unreachable ();
17012 }
17013
17014 /* Convert an unsigned SImode value into a DFmode. Only currently used
17015 for SSE, but applicable anywhere. */
17016
17017 void
17018 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17019 {
17020 REAL_VALUE_TYPE TWO31r;
17021 rtx x, fp;
17022
17023 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17024 NULL, 1, OPTAB_DIRECT);
17025
17026 fp = gen_reg_rtx (DFmode);
17027 emit_insn (gen_floatsidf2 (fp, x));
17028
17029 real_ldexp (&TWO31r, &dconst1, 31);
17030 x = const_double_from_real_value (TWO31r, DFmode);
17031
17032 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17033 if (x != target)
17034 emit_move_insn (target, x);
17035 }
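/* Illustration of the conversion above: for input 0xffffffff the PLUS
with -2^31 wraps to 0x7fffffff, the signed conversion gives
2147483647.0, and adding 2^31 back yields 4294967295.0.  */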
17036
17037 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17038 32-bit mode; otherwise we have a direct convert instruction. */
17039
17040 void
17041 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17042 {
17043 REAL_VALUE_TYPE TWO32r;
17044 rtx fp_lo, fp_hi, x;
17045
17046 fp_lo = gen_reg_rtx (DFmode);
17047 fp_hi = gen_reg_rtx (DFmode);
17048
17049 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17050
17051 real_ldexp (&TWO32r, &dconst1, 32);
17052 x = const_double_from_real_value (TWO32r, DFmode);
17053 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17054
17055 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17056
17057 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17058 0, OPTAB_DIRECT);
17059 if (x != target)
17060 emit_move_insn (target, x);
17061 }
17062
17063 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17064 For x86_32, -mfpmath=sse, !optimize_size only. */
17065 void
17066 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17067 {
17068 REAL_VALUE_TYPE ONE16r;
17069 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17070
17071 real_ldexp (&ONE16r, &dconst1, 16);
17072 x = const_double_from_real_value (ONE16r, SFmode);
17073 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17074 NULL, 0, OPTAB_DIRECT);
17075 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17076 NULL, 0, OPTAB_DIRECT);
17077 fp_hi = gen_reg_rtx (SFmode);
17078 fp_lo = gen_reg_rtx (SFmode);
17079 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17080 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17081 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17082 0, OPTAB_DIRECT);
17083 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17084 0, OPTAB_DIRECT);
17085 if (!rtx_equal_p (target, fp_hi))
17086 emit_move_insn (target, fp_hi);
17087 }
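/* Illustration of the 16-bit split above: the result is computed as
(input >> 16) * 0x1.0p16 + (input & 0xffff), e.g. input 0x10000 gives
1.0 * 65536.0 + 0.0 == 65536.0; the final addition is rounded to
SFmode.  */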
17088
17089 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17090 a vector of unsigned ints VAL to vector of floats TARGET. */
17091
17092 void
17093 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17094 {
17095 rtx tmp[8];
17096 REAL_VALUE_TYPE TWO16r;
17097 enum machine_mode intmode = GET_MODE (val);
17098 enum machine_mode fltmode = GET_MODE (target);
17099 rtx (*cvt) (rtx, rtx);
17100
17101 if (intmode == V4SImode)
17102 cvt = gen_floatv4siv4sf2;
17103 else
17104 cvt = gen_floatv8siv8sf2;
17105 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17106 tmp[0] = force_reg (intmode, tmp[0]);
17107 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17108 OPTAB_DIRECT);
17109 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17110 NULL_RTX, 1, OPTAB_DIRECT);
17111 tmp[3] = gen_reg_rtx (fltmode);
17112 emit_insn (cvt (tmp[3], tmp[1]));
17113 tmp[4] = gen_reg_rtx (fltmode);
17114 emit_insn (cvt (tmp[4], tmp[2]));
17115 real_ldexp (&TWO16r, &dconst1, 16);
17116 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17117 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17118 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17119 OPTAB_DIRECT);
17120 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17121 OPTAB_DIRECT);
17122 if (tmp[7] != target)
17123 emit_move_insn (target, tmp[7]);
17124 }
17125
17126 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17127 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17128 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17129 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17130
17131 rtx
17132 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17133 {
17134 REAL_VALUE_TYPE TWO31r;
17135 rtx two31r, tmp[4];
17136 enum machine_mode mode = GET_MODE (val);
17137 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17138 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17139 rtx (*cmp) (rtx, rtx, rtx, rtx);
17140 int i;
17141
17142 for (i = 0; i < 3; i++)
17143 tmp[i] = gen_reg_rtx (mode);
17144 real_ldexp (&TWO31r, &dconst1, 31);
17145 two31r = const_double_from_real_value (TWO31r, scalarmode);
17146 two31r = ix86_build_const_vector (mode, 1, two31r);
17147 two31r = force_reg (mode, two31r);
17148 switch (mode)
17149 {
17150 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17151 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17152 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17153 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17154 default: gcc_unreachable ();
17155 }
17156 tmp[3] = gen_rtx_LE (mode, two31r, val);
17157 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17158 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17159 0, OPTAB_DIRECT);
17160 if (intmode == V4SImode || TARGET_AVX2)
17161 *xorp = expand_simple_binop (intmode, ASHIFT,
17162 gen_lowpart (intmode, tmp[0]),
17163 GEN_INT (31), NULL_RTX, 0,
17164 OPTAB_DIRECT);
17165 else
17166 {
17167 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17168 two31 = ix86_build_const_vector (intmode, 1, two31);
17169 *xorp = expand_simple_binop (intmode, AND,
17170 gen_lowpart (intmode, tmp[0]),
17171 two31, NULL_RTX, 0,
17172 OPTAB_DIRECT);
17173 }
17174 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17175 0, OPTAB_DIRECT);
17176 }
17177
17178 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17179 then replicate the value for all elements of the vector
17180 register. */
17181
17182 rtx
17183 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17184 {
17185 int i, n_elt;
17186 rtvec v;
17187 enum machine_mode scalar_mode;
17188
17189 switch (mode)
17190 {
17191 case V32QImode:
17192 case V16QImode:
17193 case V16HImode:
17194 case V8HImode:
17195 case V8SImode:
17196 case V4SImode:
17197 case V4DImode:
17198 case V2DImode:
17199 gcc_assert (vect);
17200 case V8SFmode:
17201 case V4SFmode:
17202 case V4DFmode:
17203 case V2DFmode:
17204 n_elt = GET_MODE_NUNITS (mode);
17205 v = rtvec_alloc (n_elt);
17206 scalar_mode = GET_MODE_INNER (mode);
17207
17208 RTVEC_ELT (v, 0) = value;
17209
17210 for (i = 1; i < n_elt; ++i)
17211 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17212
17213 return gen_rtx_CONST_VECTOR (mode, v);
17214
17215 default:
17216 gcc_unreachable ();
17217 }
17218 }
17219
17220 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17221 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17222 for an SSE register. If VECT is true, then replicate the mask for
17223 all elements of the vector register. If INVERT is true, then create
17224 a mask excluding the sign bit. */
17225
17226 rtx
17227 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17228 {
17229 enum machine_mode vec_mode, imode;
17230 HOST_WIDE_INT hi, lo;
17231 int shift = 63;
17232 rtx v;
17233 rtx mask;
17234
17235 /* Find the sign bit, sign extended to 2*HWI. */
17236 switch (mode)
17237 {
17238 case V8SImode:
17239 case V4SImode:
17240 case V8SFmode:
17241 case V4SFmode:
17242 vec_mode = mode;
17243 mode = GET_MODE_INNER (mode);
17244 imode = SImode;
17245 lo = 0x80000000, hi = lo < 0;
17246 break;
17247
17248 case V4DImode:
17249 case V2DImode:
17250 case V4DFmode:
17251 case V2DFmode:
17252 vec_mode = mode;
17253 mode = GET_MODE_INNER (mode);
17254 imode = DImode;
17255 if (HOST_BITS_PER_WIDE_INT >= 64)
17256 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17257 else
17258 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17259 break;
17260
17261 case TImode:
17262 case TFmode:
17263 vec_mode = VOIDmode;
17264 if (HOST_BITS_PER_WIDE_INT >= 64)
17265 {
17266 imode = TImode;
17267 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17268 }
17269 else
17270 {
17271 rtvec vec;
17272
17273 imode = DImode;
17274 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17275
17276 if (invert)
17277 {
17278 lo = ~lo, hi = ~hi;
17279 v = constm1_rtx;
17280 }
17281 else
17282 v = const0_rtx;
17283
17284 mask = immed_double_const (lo, hi, imode);
17285
17286 vec = gen_rtvec (2, v, mask);
17287 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17288 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17289
17290 return v;
17291 }
17292 break;
17293
17294 default:
17295 gcc_unreachable ();
17296 }
17297
17298 if (invert)
17299 lo = ~lo, hi = ~hi;
17300
17301 /* Force this value into the low part of a fp vector constant. */
17302 mask = immed_double_const (lo, hi, imode);
17303 mask = gen_lowpart (mode, mask);
17304
17305 if (vec_mode == VOIDmode)
17306 return force_reg (mode, mask);
17307
17308 v = ix86_build_const_vector (vec_mode, vect, mask);
17309 return force_reg (vec_mode, v);
17310 }
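/* E.g. ix86_build_signbit_mask (V4SFmode, true, false) yields a V4SF
constant whose every element has only bit 31 set (the bit pattern of
-0.0f), while INVERT == true yields elements with the 0x7fffffff
pattern, i.e. everything except the sign bit.  */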
17311
17312 /* Generate code for floating point ABS or NEG. */
17313
17314 void
17315 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17316 rtx operands[])
17317 {
17318 rtx mask, set, dst, src;
17319 bool use_sse = false;
17320 bool vector_mode = VECTOR_MODE_P (mode);
17321 enum machine_mode vmode = mode;
17322
17323 if (vector_mode)
17324 use_sse = true;
17325 else if (mode == TFmode)
17326 use_sse = true;
17327 else if (TARGET_SSE_MATH)
17328 {
17329 use_sse = SSE_FLOAT_MODE_P (mode);
17330 if (mode == SFmode)
17331 vmode = V4SFmode;
17332 else if (mode == DFmode)
17333 vmode = V2DFmode;
17334 }
17335
17336 /* NEG and ABS performed with SSE use bitwise mask operations.
17337 Create the appropriate mask now. */
17338 if (use_sse)
17339 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17340 else
17341 mask = NULL_RTX;
17342
17343 dst = operands[0];
17344 src = operands[1];
17345
17346 set = gen_rtx_fmt_e (code, mode, src);
17347 set = gen_rtx_SET (VOIDmode, dst, set);
17348
17349 if (mask)
17350 {
17351 rtx use, clob;
17352 rtvec par;
17353
17354 use = gen_rtx_USE (VOIDmode, mask);
17355 if (vector_mode)
17356 par = gen_rtvec (2, set, use);
17357 else
17358 {
17359 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17360 par = gen_rtvec (3, set, use, clob);
17361 }
17362 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17363 }
17364 else
17365 emit_insn (set);
17366 }
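/* Conceptually the mask built above is applied as x ^ mask for NEG
(flipping the sign bit) and as x & mask for ABS (clearing it, since
the ABS mask excludes the sign bit).  */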
17367
17368 /* Expand a copysign operation. Special case operand 0 being a constant. */
17369
17370 void
17371 ix86_expand_copysign (rtx operands[])
17372 {
17373 enum machine_mode mode, vmode;
17374 rtx dest, op0, op1, mask, nmask;
17375
17376 dest = operands[0];
17377 op0 = operands[1];
17378 op1 = operands[2];
17379
17380 mode = GET_MODE (dest);
17381
17382 if (mode == SFmode)
17383 vmode = V4SFmode;
17384 else if (mode == DFmode)
17385 vmode = V2DFmode;
17386 else
17387 vmode = mode;
17388
17389 if (GET_CODE (op0) == CONST_DOUBLE)
17390 {
17391 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17392
17393 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17394 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17395
17396 if (mode == SFmode || mode == DFmode)
17397 {
17398 if (op0 == CONST0_RTX (mode))
17399 op0 = CONST0_RTX (vmode);
17400 else
17401 {
17402 rtx v = ix86_build_const_vector (vmode, false, op0);
17403
17404 op0 = force_reg (vmode, v);
17405 }
17406 }
17407 else if (op0 != CONST0_RTX (mode))
17408 op0 = force_reg (mode, op0);
17409
17410 mask = ix86_build_signbit_mask (vmode, 0, 0);
17411
17412 if (mode == SFmode)
17413 copysign_insn = gen_copysignsf3_const;
17414 else if (mode == DFmode)
17415 copysign_insn = gen_copysigndf3_const;
17416 else
17417 copysign_insn = gen_copysigntf3_const;
17418
17419 emit_insn (copysign_insn (dest, op0, op1, mask));
17420 }
17421 else
17422 {
17423 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17424
17425 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17426 mask = ix86_build_signbit_mask (vmode, 0, 0);
17427
17428 if (mode == SFmode)
17429 copysign_insn = gen_copysignsf3_var;
17430 else if (mode == DFmode)
17431 copysign_insn = gen_copysigndf3_var;
17432 else
17433 copysign_insn = gen_copysigntf3_var;
17434
17435 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17436 }
17437 }
17438
17439 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17440 be a constant, and so has already been expanded into a vector constant. */
17441
17442 void
17443 ix86_split_copysign_const (rtx operands[])
17444 {
17445 enum machine_mode mode, vmode;
17446 rtx dest, op0, mask, x;
17447
17448 dest = operands[0];
17449 op0 = operands[1];
17450 mask = operands[3];
17451
17452 mode = GET_MODE (dest);
17453 vmode = GET_MODE (mask);
17454
17455 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17456 x = gen_rtx_AND (vmode, dest, mask);
17457 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17458
17459 if (op0 != CONST0_RTX (vmode))
17460 {
17461 x = gen_rtx_IOR (vmode, dest, op0);
17462 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17463 }
17464 }
17465
17466 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17467 so we have to do two masks. */
17468
17469 void
17470 ix86_split_copysign_var (rtx operands[])
17471 {
17472 enum machine_mode mode, vmode;
17473 rtx dest, scratch, op0, op1, mask, nmask, x;
17474
17475 dest = operands[0];
17476 scratch = operands[1];
17477 op0 = operands[2];
17478 op1 = operands[3];
17479 nmask = operands[4];
17480 mask = operands[5];
17481
17482 mode = GET_MODE (dest);
17483 vmode = GET_MODE (mask);
17484
17485 if (rtx_equal_p (op0, op1))
17486 {
17487 /* Shouldn't happen often (it's useless, obviously), but when it does
17488 we'd generate incorrect code if we continue below. */
17489 emit_move_insn (dest, op0);
17490 return;
17491 }
17492
17493 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17494 {
17495 gcc_assert (REGNO (op1) == REGNO (scratch));
17496
17497 x = gen_rtx_AND (vmode, scratch, mask);
17498 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17499
17500 dest = mask;
17501 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17502 x = gen_rtx_NOT (vmode, dest);
17503 x = gen_rtx_AND (vmode, x, op0);
17504 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17505 }
17506 else
17507 {
17508 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17509 {
17510 x = gen_rtx_AND (vmode, scratch, mask);
17511 }
17512 else /* alternative 2,4 */
17513 {
17514 gcc_assert (REGNO (mask) == REGNO (scratch));
17515 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17516 x = gen_rtx_AND (vmode, scratch, op1);
17517 }
17518 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17519
17520 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17521 {
17522 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17523 x = gen_rtx_AND (vmode, dest, nmask);
17524 }
17525 else /* alternative 3,4 */
17526 {
17527 gcc_assert (REGNO (nmask) == REGNO (dest));
17528 dest = nmask;
17529 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17530 x = gen_rtx_AND (vmode, dest, op0);
17531 }
17532 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17533 }
17534
17535 x = gen_rtx_IOR (vmode, dest, scratch);
17536 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17537 }
17538
17539 /* Return TRUE or FALSE depending on whether the first SET in INSN
17540 has source and destination with matching CC modes, and whether the
17541 CC mode is at least as constrained as REQ_MODE. */
17542
17543 bool
17544 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17545 {
17546 rtx set;
17547 enum machine_mode set_mode;
17548
17549 set = PATTERN (insn);
17550 if (GET_CODE (set) == PARALLEL)
17551 set = XVECEXP (set, 0, 0);
17552 gcc_assert (GET_CODE (set) == SET);
17553 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17554
17555 set_mode = GET_MODE (SET_DEST (set));
17556 switch (set_mode)
17557 {
17558 case CCNOmode:
17559 if (req_mode != CCNOmode
17560 && (req_mode != CCmode
17561 || XEXP (SET_SRC (set), 1) != const0_rtx))
17562 return false;
17563 break;
17564 case CCmode:
17565 if (req_mode == CCGCmode)
17566 return false;
17567 /* FALLTHRU */
17568 case CCGCmode:
17569 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17570 return false;
17571 /* FALLTHRU */
17572 case CCGOCmode:
17573 if (req_mode == CCZmode)
17574 return false;
17575 /* FALLTHRU */
17576 case CCZmode:
17577 break;
17578
17579 case CCAmode:
17580 case CCCmode:
17581 case CCOmode:
17582 case CCSmode:
17583 if (set_mode != req_mode)
17584 return false;
17585 break;
17586
17587 default:
17588 gcc_unreachable ();
17589 }
17590
17591 return GET_MODE (SET_SRC (set)) == set_mode;
17592 }
17593
17594 /* Generate insn patterns to do an integer compare of OPERANDS. */
17595
17596 static rtx
17597 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17598 {
17599 enum machine_mode cmpmode;
17600 rtx tmp, flags;
17601
17602 cmpmode = SELECT_CC_MODE (code, op0, op1);
17603 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17604
17605 /* This is very simple, but making the interface the same as in the
17606 FP case makes the rest of the code easier. */
17607 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17608 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17609
17610 /* Return the test that should be put into the flags user, i.e.
17611 the bcc, scc, or cmov instruction. */
17612 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17613 }
17614
17615 /* Figure out whether to use ordered or unordered fp comparisons.
17616 Return the appropriate mode to use. */
17617
17618 enum machine_mode
17619 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17620 {
17621 /* ??? In order to make all comparisons reversible, we do all comparisons
17622 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17623 between all forms of trapping and nontrapping comparisons, we can make
17624 inequality comparisons trapping again, since that results in better code
17625 when using FCOM based compares. */
17626 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17627 }
17628
17629 enum machine_mode
17630 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17631 {
17632 enum machine_mode mode = GET_MODE (op0);
17633
17634 if (SCALAR_FLOAT_MODE_P (mode))
17635 {
17636 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17637 return ix86_fp_compare_mode (code);
17638 }
17639
17640 switch (code)
17641 {
17642 /* Only zero flag is needed. */
17643 case EQ: /* ZF=0 */
17644 case NE: /* ZF!=0 */
17645 return CCZmode;
17646 /* Codes needing carry flag. */
17647 case GEU: /* CF=0 */
17648 case LTU: /* CF=1 */
17649 /* Detect overflow checks. They need just the carry flag. */
17650 if (GET_CODE (op0) == PLUS
17651 && rtx_equal_p (op1, XEXP (op0, 0)))
17652 return CCCmode;
17653 else
17654 return CCmode;
17655 case GTU: /* CF=0 & ZF=0 */
17656 case LEU: /* CF=1 | ZF=1 */
17657 /* Detect overflow checks. They need just the carry flag. */
17658 if (GET_CODE (op0) == MINUS
17659 && rtx_equal_p (op1, XEXP (op0, 0)))
17660 return CCCmode;
17661 else
17662 return CCmode;
17663 /* Codes possibly doable only with sign flag when
17664 comparing against zero. */
17665 case GE: /* SF=OF or SF=0 */
17666 case LT: /* SF<>OF or SF=1 */
17667 if (op1 == const0_rtx)
17668 return CCGOCmode;
17669 else
17670 /* For other cases Carry flag is not required. */
17671 return CCGCmode;
17672 /* Codes doable only with the sign flag when comparing
17673 against zero, but we miss the jump instruction for it,
17674 so we need to use relational tests against overflow,
17675 which thus needs to be zero.
17676 case GT: /* ZF=0 & SF=OF */
17677 case LE: /* ZF=1 | SF<>OF */
17678 if (op1 == const0_rtx)
17679 return CCNOmode;
17680 else
17681 return CCGCmode;
17682 /* The strcmp pattern does (use flags), and combine may ask us for the
17683 proper mode. */
17684 case USE:
17685 return CCmode;
17686 default:
17687 gcc_unreachable ();
17688 }
17689 }
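/* For example, a compare of (plus a b) against a with code GEU or LTU
selects CCCmode above, because such an overflow check only needs the
carry flag.  */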
17690
17691 /* Return the fixed registers used for condition codes. */
17692
17693 static bool
17694 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17695 {
17696 *p1 = FLAGS_REG;
17697 *p2 = FPSR_REG;
17698 return true;
17699 }
17700
17701 /* If two condition code modes are compatible, return a condition code
17702 mode which is compatible with both. Otherwise, return
17703 VOIDmode. */
17704
17705 static enum machine_mode
17706 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17707 {
17708 if (m1 == m2)
17709 return m1;
17710
17711 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17712 return VOIDmode;
17713
17714 if ((m1 == CCGCmode && m2 == CCGOCmode)
17715 || (m1 == CCGOCmode && m2 == CCGCmode))
17716 return CCGCmode;
17717
17718 switch (m1)
17719 {
17720 default:
17721 gcc_unreachable ();
17722
17723 case CCmode:
17724 case CCGCmode:
17725 case CCGOCmode:
17726 case CCNOmode:
17727 case CCAmode:
17728 case CCCmode:
17729 case CCOmode:
17730 case CCSmode:
17731 case CCZmode:
17732 switch (m2)
17733 {
17734 default:
17735 return VOIDmode;
17736
17737 case CCmode:
17738 case CCGCmode:
17739 case CCGOCmode:
17740 case CCNOmode:
17741 case CCAmode:
17742 case CCCmode:
17743 case CCOmode:
17744 case CCSmode:
17745 case CCZmode:
17746 return CCmode;
17747 }
17748
17749 case CCFPmode:
17750 case CCFPUmode:
17751 /* These are only compatible with themselves, which we already
17752 checked above. */
17753 return VOIDmode;
17754 }
17755 }
17756
17757
17758 /* Return a comparison we can do that is equivalent to
17759 swap_condition (code), except possibly for orderedness.
17760 But never change orderedness if TARGET_IEEE_FP, returning
17761 UNKNOWN in that case if necessary. */
17762
17763 static enum rtx_code
17764 ix86_fp_swap_condition (enum rtx_code code)
17765 {
17766 switch (code)
17767 {
17768 case GT: /* GTU - CF=0 & ZF=0 */
17769 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17770 case GE: /* GEU - CF=0 */
17771 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17772 case UNLT: /* LTU - CF=1 */
17773 return TARGET_IEEE_FP ? UNKNOWN : GT;
17774 case UNLE: /* LEU - CF=1 | ZF=1 */
17775 return TARGET_IEEE_FP ? UNKNOWN : GE;
17776 default:
17777 return swap_condition (code);
17778 }
17779 }
17780
17781 /* Return the cost of comparison CODE using the best strategy for performance.
17782 All the following functions use the number of instructions as a cost metric.
17783 In the future this should be tweaked to compute bytes for optimize_size and
17784 take into account the performance of various instructions on various CPUs. */
17785
17786 static int
17787 ix86_fp_comparison_cost (enum rtx_code code)
17788 {
17789 int arith_cost;
17790
17791 /* The cost of code using bit-twiddling on %ah. */
17792 switch (code)
17793 {
17794 case UNLE:
17795 case UNLT:
17796 case LTGT:
17797 case GT:
17798 case GE:
17799 case UNORDERED:
17800 case ORDERED:
17801 case UNEQ:
17802 arith_cost = 4;
17803 break;
17804 case LT:
17805 case NE:
17806 case EQ:
17807 case UNGE:
17808 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17809 break;
17810 case LE:
17811 case UNGT:
17812 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17813 break;
17814 default:
17815 gcc_unreachable ();
17816 }
17817
17818 switch (ix86_fp_comparison_strategy (code))
17819 {
17820 case IX86_FPCMP_COMI:
17821 return arith_cost > 4 ? 3 : 2;
17822 case IX86_FPCMP_SAHF:
17823 return arith_cost > 4 ? 4 : 3;
17824 default:
17825 return arith_cost;
17826 }
17827 }
17828
17829 /* Return the strategy to use for floating-point comparisons. We assume fcomi
17830 is always preferable where available, since that is also true when looking
17831 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17832
17833 enum ix86_fpcmp_strategy
17834 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17835 {
17836 /* Do fcomi/sahf based test when profitable. */
17837
17838 if (TARGET_CMOVE)
17839 return IX86_FPCMP_COMI;
17840
17841 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17842 return IX86_FPCMP_SAHF;
17843
17844 return IX86_FPCMP_ARITH;
17845 }
17846
17847 /* Swap, force into registers, or otherwise massage the two operands
17848 to a fp comparison. The operands are updated in place; the new
17849 comparison code is returned. */
17850
17851 static enum rtx_code
17852 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17853 {
17854 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17855 rtx op0 = *pop0, op1 = *pop1;
17856 enum machine_mode op_mode = GET_MODE (op0);
17857 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17858
17859 /* All of the unordered compare instructions only work on registers.
17860 The same is true of the fcomi compare instructions. The XFmode
17861 compare instructions require registers except when comparing
17862 against zero or when converting operand 1 from fixed point to
17863 floating point. */
17864
17865 if (!is_sse
17866 && (fpcmp_mode == CCFPUmode
17867 || (op_mode == XFmode
17868 && ! (standard_80387_constant_p (op0) == 1
17869 || standard_80387_constant_p (op1) == 1)
17870 && GET_CODE (op1) != FLOAT)
17871 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17872 {
17873 op0 = force_reg (op_mode, op0);
17874 op1 = force_reg (op_mode, op1);
17875 }
17876 else
17877 {
17878 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17879 things around if they appear profitable, otherwise force op0
17880 into a register. */
17881
17882 if (standard_80387_constant_p (op0) == 0
17883 || (MEM_P (op0)
17884 && ! (standard_80387_constant_p (op1) == 0
17885 || MEM_P (op1))))
17886 {
17887 enum rtx_code new_code = ix86_fp_swap_condition (code);
17888 if (new_code != UNKNOWN)
17889 {
17890 rtx tmp;
17891 tmp = op0, op0 = op1, op1 = tmp;
17892 code = new_code;
17893 }
17894 }
17895
17896 if (!REG_P (op0))
17897 op0 = force_reg (op_mode, op0);
17898
17899 if (CONSTANT_P (op1))
17900 {
17901 int tmp = standard_80387_constant_p (op1);
17902 if (tmp == 0)
17903 op1 = validize_mem (force_const_mem (op_mode, op1));
17904 else if (tmp == 1)
17905 {
17906 if (TARGET_CMOVE)
17907 op1 = force_reg (op_mode, op1);
17908 }
17909 else
17910 op1 = force_reg (op_mode, op1);
17911 }
17912 }
17913
17914 /* Try to rearrange the comparison to make it cheaper. */
17915 if (ix86_fp_comparison_cost (code)
17916 > ix86_fp_comparison_cost (swap_condition (code))
17917 && (REG_P (op1) || can_create_pseudo_p ()))
17918 {
17919 rtx tmp;
17920 tmp = op0, op0 = op1, op1 = tmp;
17921 code = swap_condition (code);
17922 if (!REG_P (op0))
17923 op0 = force_reg (op_mode, op0);
17924 }
17925
17926 *pop0 = op0;
17927 *pop1 = op1;
17928 return code;
17929 }
17930
17931 /* Convert a comparison code that we use to represent an FP comparison to
17932 the integer code that will result in a proper branch. Return UNKNOWN if
17933 no such code is available. */
17934
17935 enum rtx_code
17936 ix86_fp_compare_code_to_integer (enum rtx_code code)
17937 {
17938 switch (code)
17939 {
17940 case GT:
17941 return GTU;
17942 case GE:
17943 return GEU;
17944 case ORDERED:
17945 case UNORDERED:
17946 return code;
17947 break;
17948 case UNEQ:
17949 return EQ;
17950 break;
17951 case UNLT:
17952 return LTU;
17953 break;
17954 case UNLE:
17955 return LEU;
17956 break;
17957 case LTGT:
17958 return NE;
17959 break;
17960 default:
17961 return UNKNOWN;
17962 }
17963 }
17964
17965 /* Generate insn patterns to do a floating point compare of OPERANDS. */
17966
17967 static rtx
17968 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
17969 {
17970 enum machine_mode fpcmp_mode, intcmp_mode;
17971 rtx tmp, tmp2;
17972
17973 fpcmp_mode = ix86_fp_compare_mode (code);
17974 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
17975
17976 /* Do fcomi/sahf based test when profitable. */
17977 switch (ix86_fp_comparison_strategy (code))
17978 {
17979 case IX86_FPCMP_COMI:
17980 intcmp_mode = fpcmp_mode;
17981 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17982 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17983 tmp);
17984 emit_insn (tmp);
17985 break;
17986
17987 case IX86_FPCMP_SAHF:
17988 intcmp_mode = fpcmp_mode;
17989 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17990 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17991 tmp);
17992
17993 if (!scratch)
17994 scratch = gen_reg_rtx (HImode);
17995 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
17996 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
17997 break;
17998
17999 case IX86_FPCMP_ARITH:
18000 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18001 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18002 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18003 if (!scratch)
18004 scratch = gen_reg_rtx (HImode);
18005 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18006
18007 /* In the unordered case, we have to check C2 for NaN's, which
18008 doesn't happen to work out to anything nice combination-wise.
18009 So do some bit twiddling on the value we've got in AH to come
18010 up with an appropriate set of condition codes. */
18011
18012 intcmp_mode = CCNOmode;
18013 switch (code)
18014 {
18015 case GT:
18016 case UNGT:
18017 if (code == GT || !TARGET_IEEE_FP)
18018 {
18019 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18020 code = EQ;
18021 }
18022 else
18023 {
18024 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18025 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18026 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18027 intcmp_mode = CCmode;
18028 code = GEU;
18029 }
18030 break;
18031 case LT:
18032 case UNLT:
18033 if (code == LT && TARGET_IEEE_FP)
18034 {
18035 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18036 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18037 intcmp_mode = CCmode;
18038 code = EQ;
18039 }
18040 else
18041 {
18042 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18043 code = NE;
18044 }
18045 break;
18046 case GE:
18047 case UNGE:
18048 if (code == GE || !TARGET_IEEE_FP)
18049 {
18050 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18051 code = EQ;
18052 }
18053 else
18054 {
18055 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18056 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18057 code = NE;
18058 }
18059 break;
18060 case LE:
18061 case UNLE:
18062 if (code == LE && TARGET_IEEE_FP)
18063 {
18064 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18065 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18066 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18067 intcmp_mode = CCmode;
18068 code = LTU;
18069 }
18070 else
18071 {
18072 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18073 code = NE;
18074 }
18075 break;
18076 case EQ:
18077 case UNEQ:
18078 if (code == EQ && TARGET_IEEE_FP)
18079 {
18080 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18081 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18082 intcmp_mode = CCmode;
18083 code = EQ;
18084 }
18085 else
18086 {
18087 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18088 code = NE;
18089 }
18090 break;
18091 case NE:
18092 case LTGT:
18093 if (code == NE && TARGET_IEEE_FP)
18094 {
18095 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18096 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18097 GEN_INT (0x40)));
18098 code = NE;
18099 }
18100 else
18101 {
18102 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18103 code = EQ;
18104 }
18105 break;
18106
18107 case UNORDERED:
18108 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18109 code = NE;
18110 break;
18111 case ORDERED:
18112 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18113 code = EQ;
18114 break;
18115
18116 default:
18117 gcc_unreachable ();
18118 }
18119 break;
18120
18121 default:
18122 gcc_unreachable ();
18123 }
18124
18125 /* Return the test that should be put into the flags user, i.e.
18126 the bcc, scc, or cmov instruction. */
18127 return gen_rtx_fmt_ee (code, VOIDmode,
18128 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18129 const0_rtx);
18130 }
18131
18132 static rtx
18133 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18134 {
18135 rtx ret;
18136
18137 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18138 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18139
18140 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18141 {
18142 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18143 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18144 }
18145 else
18146 ret = ix86_expand_int_compare (code, op0, op1);
18147
18148 return ret;
18149 }
18150
18151 void
18152 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18153 {
18154 enum machine_mode mode = GET_MODE (op0);
18155 rtx tmp;
18156
18157 switch (mode)
18158 {
18159 case SFmode:
18160 case DFmode:
18161 case XFmode:
18162 case QImode:
18163 case HImode:
18164 case SImode:
18165 simple:
18166 tmp = ix86_expand_compare (code, op0, op1);
18167 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18168 gen_rtx_LABEL_REF (VOIDmode, label),
18169 pc_rtx);
18170 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18171 return;
18172
18173 case DImode:
18174 if (TARGET_64BIT)
18175 goto simple;
18176 case TImode:
18177 /* Expand DImode/TImode branch into multiple compare+branch. */
18178 {
18179 rtx lo[2], hi[2], label2;
18180 enum rtx_code code1, code2, code3;
18181 enum machine_mode submode;
18182
18183 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18184 {
18185 tmp = op0, op0 = op1, op1 = tmp;
18186 code = swap_condition (code);
18187 }
18188
18189 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18190 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18191
18192 submode = mode == DImode ? SImode : DImode;
18193
18194 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18195 avoid two branches. This costs one extra insn, so disable when
18196 optimizing for size. */
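/* For instance, with 32-bit words:

     a == b  <=>  ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0

   since each XOR is zero exactly when the corresponding halves match,
   and the IOR of the two is zero exactly when both do.  */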
18197
18198 if ((code == EQ || code == NE)
18199 && (!optimize_insn_for_size_p ()
18200 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18201 {
18202 rtx xor0, xor1;
18203
18204 xor1 = hi[0];
18205 if (hi[1] != const0_rtx)
18206 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18207 NULL_RTX, 0, OPTAB_WIDEN);
18208
18209 xor0 = lo[0];
18210 if (lo[1] != const0_rtx)
18211 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18212 NULL_RTX, 0, OPTAB_WIDEN);
18213
18214 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18215 NULL_RTX, 0, OPTAB_WIDEN);
18216
18217 ix86_expand_branch (code, tmp, const0_rtx, label);
18218 return;
18219 }
18220
18221 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
18222 op1 is a constant and the low word is zero, then we can just
18223 examine the high word. Similarly for a low word of -1 and
18224 less-or-equal or greater-than. */
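/* For example (32-bit words), if the low word of op1 is zero then

     a < op1  <=>  hi(a) < hi(op1)

   because the non-negative low word of a can never make up for a
   high-word deficit; dually, with a low word of all ones, a <= op1
   reduces to hi(a) <= hi(op1).  */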
18225
18226 if (CONST_INT_P (hi[1]))
18227 switch (code)
18228 {
18229 case LT: case LTU: case GE: case GEU:
18230 if (lo[1] == const0_rtx)
18231 {
18232 ix86_expand_branch (code, hi[0], hi[1], label);
18233 return;
18234 }
18235 break;
18236 case LE: case LEU: case GT: case GTU:
18237 if (lo[1] == constm1_rtx)
18238 {
18239 ix86_expand_branch (code, hi[0], hi[1], label);
18240 return;
18241 }
18242 break;
18243 default:
18244 break;
18245 }
18246
18247 /* Otherwise, we need two or three jumps. */
18248
18249 label2 = gen_label_rtx ();
18250
18251 code1 = code;
18252 code2 = swap_condition (code);
18253 code3 = unsigned_condition (code);
18254
18255 switch (code)
18256 {
18257 case LT: case GT: case LTU: case GTU:
18258 break;
18259
18260 case LE: code1 = LT; code2 = GT; break;
18261 case GE: code1 = GT; code2 = LT; break;
18262 case LEU: code1 = LTU; code2 = GTU; break;
18263 case GEU: code1 = GTU; code2 = LTU; break;
18264
18265 case EQ: code1 = UNKNOWN; code2 = NE; break;
18266 case NE: code2 = UNKNOWN; break;
18267
18268 default:
18269 gcc_unreachable ();
18270 }
18271
18272 /*
18273 * a < b =>
18274 * if (hi(a) < hi(b)) goto true;
18275 * if (hi(a) > hi(b)) goto false;
18276 * if (lo(a) < lo(b)) goto true;
18277 * false:
18278 */
18279
18280 if (code1 != UNKNOWN)
18281 ix86_expand_branch (code1, hi[0], hi[1], label);
18282 if (code2 != UNKNOWN)
18283 ix86_expand_branch (code2, hi[0], hi[1], label2);
18284
18285 ix86_expand_branch (code3, lo[0], lo[1], label);
18286
18287 if (code2 != UNKNOWN)
18288 emit_label (label2);
18289 return;
18290 }
18291
18292 default:
18293 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18294 goto simple;
18295 }
18296 }
18297
18298 /* Split branch based on floating point condition. */
18299 void
18300 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18301 rtx target1, rtx target2, rtx tmp, rtx pushed)
18302 {
18303 rtx condition;
18304 rtx i;
18305
18306 if (target2 != pc_rtx)
18307 {
18308 rtx tmp = target2;
18309 code = reverse_condition_maybe_unordered (code);
18310 target2 = target1;
18311 target1 = tmp;
18312 }
18313
18314 condition = ix86_expand_fp_compare (code, op1, op2,
18315 tmp);
18316
18317 /* Remove pushed operand from stack. */
18318 if (pushed)
18319 ix86_free_from_memory (GET_MODE (pushed));
18320
18321 i = emit_jump_insn (gen_rtx_SET
18322 (VOIDmode, pc_rtx,
18323 gen_rtx_IF_THEN_ELSE (VOIDmode,
18324 condition, target1, target2)));
18325 if (split_branch_probability >= 0)
18326 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18327 }
18328
18329 void
18330 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18331 {
18332 rtx ret;
18333
18334 gcc_assert (GET_MODE (dest) == QImode);
18335
18336 ret = ix86_expand_compare (code, op0, op1);
18337 PUT_MODE (ret, QImode);
18338 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18339 }
18340
18341 /* Expand a comparison setting or clearing the carry flag. Return true when
18342 successful and set *POP for the operation. */
18343 static bool
18344 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18345 {
18346 enum machine_mode mode =
18347 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18348
18349 /* Do not handle double-mode compares that go through the special path. */
18350 if (mode == (TARGET_64BIT ? TImode : DImode))
18351 return false;
18352
18353 if (SCALAR_FLOAT_MODE_P (mode))
18354 {
18355 rtx compare_op, compare_seq;
18356
18357 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18358
18359 /* Shortcut: the following common codes never translate
18360 into carry flag compares. */
18361 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18362 || code == ORDERED || code == UNORDERED)
18363 return false;
18364
18365 /* These comparisons require the zero flag; swap the operands so they won't. */
18366 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18367 && !TARGET_IEEE_FP)
18368 {
18369 rtx tmp = op0;
18370 op0 = op1;
18371 op1 = tmp;
18372 code = swap_condition (code);
18373 }
18374
18375 /* Try to expand the comparison and verify that we end up with a
18376 carry-flag-based comparison. This fails only when we decide to
18377 expand the comparison using arithmetic, which is not a common
18378 scenario. */
18379 start_sequence ();
18380 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18381 compare_seq = get_insns ();
18382 end_sequence ();
18383
18384 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18385 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18386 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18387 else
18388 code = GET_CODE (compare_op);
18389
18390 if (code != LTU && code != GEU)
18391 return false;
18392
18393 emit_insn (compare_seq);
18394 *pop = compare_op;
18395 return true;
18396 }
18397
18398 if (!INTEGRAL_MODE_P (mode))
18399 return false;
18400
18401 switch (code)
18402 {
18403 case LTU:
18404 case GEU:
18405 break;
18406
18407 /* Convert a==0 into (unsigned)a<1. */
18408 case EQ:
18409 case NE:
18410 if (op1 != const0_rtx)
18411 return false;
18412 op1 = const1_rtx;
18413 code = (code == EQ ? LTU : GEU);
18414 break;
18415
18416 /* Convert a>b into b<a or a>=b+1. */
18417 case GTU:
18418 case LEU:
18419 if (CONST_INT_P (op1))
18420 {
18421 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18422 /* Bail out on overflow. We could still swap the operands, but that
18423 would force loading of the constant into a register. */
18424 if (op1 == const0_rtx
18425 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18426 return false;
18427 code = (code == GTU ? GEU : LTU);
18428 }
18429 else
18430 {
18431 rtx tmp = op1;
18432 op1 = op0;
18433 op0 = tmp;
18434 code = (code == GTU ? LTU : GEU);
18435 }
18436 break;
18437
18438 /* Convert a>=0 into (unsigned)a<0x80000000. */
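/* This is exact: e.g. for 32-bit operands the non-negative values
   are precisely 0 .. 0x7fffffff, so "a >= 0" and
   "(unsigned) a < 0x80000000" select the same set, and the latter is
   a carry-flag (LTU) test.  */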
18439 case LT:
18440 case GE:
18441 if (mode == DImode || op1 != const0_rtx)
18442 return false;
18443 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18444 code = (code == LT ? GEU : LTU);
18445 break;
18446 case LE:
18447 case GT:
18448 if (mode == DImode || op1 != constm1_rtx)
18449 return false;
18450 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18451 code = (code == LE ? GEU : LTU);
18452 break;
18453
18454 default:
18455 return false;
18456 }
18457 /* Swapping operands may cause a constant to appear as the first operand. */
18458 if (!nonimmediate_operand (op0, VOIDmode))
18459 {
18460 if (!can_create_pseudo_p ())
18461 return false;
18462 op0 = force_reg (mode, op0);
18463 }
18464 *pop = ix86_expand_compare (code, op0, op1);
18465 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18466 return true;
18467 }
18468
18469 bool
18470 ix86_expand_int_movcc (rtx operands[])
18471 {
18472 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18473 rtx compare_seq, compare_op;
18474 enum machine_mode mode = GET_MODE (operands[0]);
18475 bool sign_bit_compare_p = false;
18476 rtx op0 = XEXP (operands[1], 0);
18477 rtx op1 = XEXP (operands[1], 1);
18478
18479 start_sequence ();
18480 compare_op = ix86_expand_compare (code, op0, op1);
18481 compare_seq = get_insns ();
18482 end_sequence ();
18483
18484 compare_code = GET_CODE (compare_op);
18485
18486 if ((op1 == const0_rtx && (code == GE || code == LT))
18487 || (op1 == constm1_rtx && (code == GT || code == LE)))
18488 sign_bit_compare_p = true;
18489
18490 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18491 HImode insns, we'd be swallowed in word prefix ops. */
18492
18493 if ((mode != HImode || TARGET_FAST_PREFIX)
18494 && (mode != (TARGET_64BIT ? TImode : DImode))
18495 && CONST_INT_P (operands[2])
18496 && CONST_INT_P (operands[3]))
18497 {
18498 rtx out = operands[0];
18499 HOST_WIDE_INT ct = INTVAL (operands[2]);
18500 HOST_WIDE_INT cf = INTVAL (operands[3]);
18501 HOST_WIDE_INT diff;
18502
18503 diff = ct - cf;
18504 /* Sign bit compares are better done using shifts than by using
18505 sbb. */
18506 if (sign_bit_compare_p
18507 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18508 {
18509 /* Detect overlap between destination and compare sources. */
18510 rtx tmp = out;
18511
18512 if (!sign_bit_compare_p)
18513 {
18514 rtx flags;
18515 bool fpcmp = false;
18516
18517 compare_code = GET_CODE (compare_op);
18518
18519 flags = XEXP (compare_op, 0);
18520
18521 if (GET_MODE (flags) == CCFPmode
18522 || GET_MODE (flags) == CCFPUmode)
18523 {
18524 fpcmp = true;
18525 compare_code
18526 = ix86_fp_compare_code_to_integer (compare_code);
18527 }
18528
18529 /* To simplify the rest of the code, restrict to the GEU case. */
18530 if (compare_code == LTU)
18531 {
18532 HOST_WIDE_INT tmp = ct;
18533 ct = cf;
18534 cf = tmp;
18535 compare_code = reverse_condition (compare_code);
18536 code = reverse_condition (code);
18537 }
18538 else
18539 {
18540 if (fpcmp)
18541 PUT_CODE (compare_op,
18542 reverse_condition_maybe_unordered
18543 (GET_CODE (compare_op)));
18544 else
18545 PUT_CODE (compare_op,
18546 reverse_condition (GET_CODE (compare_op)));
18547 }
18548 diff = ct - cf;
18549
18550 if (reg_overlap_mentioned_p (out, op0)
18551 || reg_overlap_mentioned_p (out, op1))
18552 tmp = gen_reg_rtx (mode);
18553
18554 if (mode == DImode)
18555 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18556 else
18557 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18558 flags, compare_op));
18559 }
18560 else
18561 {
18562 if (code == GT || code == GE)
18563 code = reverse_condition (code);
18564 else
18565 {
18566 HOST_WIDE_INT tmp = ct;
18567 ct = cf;
18568 cf = tmp;
18569 diff = ct - cf;
18570 }
18571 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18572 }
18573
18574 if (diff == 1)
18575 {
18576 /*
18577 * cmpl op0,op1
18578 * sbbl dest,dest
18579 * [addl dest, ct]
18580 *
18581 * Size 5 - 8.
18582 */
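/* Worked check of the diff == 1 case: the sbb leaves dest = -1 when
   the carry was set and 0 otherwise; since ct == cf + 1, adding ct
   then yields cf in the borrow case and ct in the other, i.e. the
   desired select without a branch.  */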
18583 if (ct)
18584 tmp = expand_simple_binop (mode, PLUS,
18585 tmp, GEN_INT (ct),
18586 copy_rtx (tmp), 1, OPTAB_DIRECT);
18587 }
18588 else if (cf == -1)
18589 {
18590 /*
18591 * cmpl op0,op1
18592 * sbbl dest,dest
18593 * orl $ct, dest
18594 *
18595 * Size 8.
18596 */
18597 tmp = expand_simple_binop (mode, IOR,
18598 tmp, GEN_INT (ct),
18599 copy_rtx (tmp), 1, OPTAB_DIRECT);
18600 }
18601 else if (diff == -1 && ct)
18602 {
18603 /*
18604 * cmpl op0,op1
18605 * sbbl dest,dest
18606 * notl dest
18607 * [addl dest, cf]
18608 *
18609 * Size 8 - 11.
18610 */
18611 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18612 if (cf)
18613 tmp = expand_simple_binop (mode, PLUS,
18614 copy_rtx (tmp), GEN_INT (cf),
18615 copy_rtx (tmp), 1, OPTAB_DIRECT);
18616 }
18617 else
18618 {
18619 /*
18620 * cmpl op0,op1
18621 * sbbl dest,dest
18622 * [notl dest]
18623 * andl cf - ct, dest
18624 * [addl dest, ct]
18625 *
18626 * Size 8 - 11.
18627 */
18628
18629 if (cf == 0)
18630 {
18631 cf = ct;
18632 ct = 0;
18633 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18634 }
18635
18636 tmp = expand_simple_binop (mode, AND,
18637 copy_rtx (tmp),
18638 gen_int_mode (cf - ct, mode),
18639 copy_rtx (tmp), 1, OPTAB_DIRECT);
18640 if (ct)
18641 tmp = expand_simple_binop (mode, PLUS,
18642 copy_rtx (tmp), GEN_INT (ct),
18643 copy_rtx (tmp), 1, OPTAB_DIRECT);
18644 }
18645
18646 if (!rtx_equal_p (tmp, out))
18647 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18648
18649 return true;
18650 }
18651
18652 if (diff < 0)
18653 {
18654 enum machine_mode cmp_mode = GET_MODE (op0);
18655
18656 HOST_WIDE_INT tmp;
18657 tmp = ct, ct = cf, cf = tmp;
18658 diff = -diff;
18659
18660 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18661 {
18662 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18663
18664 /* We may be reversing an unordered compare to a normal compare, which
18665 is not valid in general (we may convert a non-trapping condition
18666 to a trapping one); however, on i386 we currently emit all
18667 comparisons unordered. */
18668 compare_code = reverse_condition_maybe_unordered (compare_code);
18669 code = reverse_condition_maybe_unordered (code);
18670 }
18671 else
18672 {
18673 compare_code = reverse_condition (compare_code);
18674 code = reverse_condition (code);
18675 }
18676 }
18677
18678 compare_code = UNKNOWN;
18679 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18680 && CONST_INT_P (op1))
18681 {
18682 if (op1 == const0_rtx
18683 && (code == LT || code == GE))
18684 compare_code = code;
18685 else if (op1 == constm1_rtx)
18686 {
18687 if (code == LE)
18688 compare_code = LT;
18689 else if (code == GT)
18690 compare_code = GE;
18691 }
18692 }
18693
18694 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18695 if (compare_code != UNKNOWN
18696 && GET_MODE (op0) == GET_MODE (out)
18697 && (cf == -1 || ct == -1))
18698 {
18699 /* If the lea code below could be used, only optimize
18700 if it results in a 2-insn sequence. */
18701
18702 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18703 || diff == 3 || diff == 5 || diff == 9)
18704 || (compare_code == LT && ct == -1)
18705 || (compare_code == GE && cf == -1))
18706 {
18707 /*
18708 * notl op1 (if necessary)
18709 * sarl $31, op1
18710 * orl cf, op1
18711 */
18712 if (ct != -1)
18713 {
18714 cf = ct;
18715 ct = -1;
18716 code = reverse_condition (code);
18717 }
18718
18719 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18720
18721 out = expand_simple_binop (mode, IOR,
18722 out, GEN_INT (cf),
18723 out, 1, OPTAB_DIRECT);
18724 if (out != operands[0])
18725 emit_move_insn (operands[0], out);
18726
18727 return true;
18728 }
18729 }
18730
18731
18732 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18733 || diff == 3 || diff == 5 || diff == 9)
18734 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18735 && (mode != DImode
18736 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18737 {
18738 /*
18739 * xorl dest,dest
18740 * cmpl op1,op2
18741 * setcc dest
18742 * lea cf(dest*(ct-cf)),dest
18743 *
18744 * Size 14.
18745 *
18746 * This also catches the degenerate setcc-only case.
18747 */
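/* Worked example: for ct = 5, cf = 2 we have diff = 3, so after the
   setcc produces 0 or 1 the lea computes out * 3 + 2, i.e. 2 when
   the condition is false and 5 when it is true, all in one address
   calculation.  */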
18748
18749 rtx tmp;
18750 int nops;
18751
18752 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18753
18754 nops = 0;
18755 /* On x86_64 the lea instruction operates on Pmode, so we need
18756 to get the arithmetic done in the proper mode to match. */
18757 if (diff == 1)
18758 tmp = copy_rtx (out);
18759 else
18760 {
18761 rtx out1;
18762 out1 = copy_rtx (out);
18763 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18764 nops++;
18765 if (diff & 1)
18766 {
18767 tmp = gen_rtx_PLUS (mode, tmp, out1);
18768 nops++;
18769 }
18770 }
18771 if (cf != 0)
18772 {
18773 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18774 nops++;
18775 }
18776 if (!rtx_equal_p (tmp, out))
18777 {
18778 if (nops == 1)
18779 out = force_operand (tmp, copy_rtx (out));
18780 else
18781 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18782 }
18783 if (!rtx_equal_p (out, operands[0]))
18784 emit_move_insn (operands[0], copy_rtx (out));
18785
18786 return true;
18787 }
18788
18789 /*
18790 * General case:                Jumpful:
18791 *   xorl dest,dest               cmpl op1, op2
18792 *   cmpl op1, op2                movl ct, dest
18793 *   setcc dest                   jcc 1f
18794 *   decl dest                    movl cf, dest
18795 *   andl (cf-ct),dest          1:
18796 *   addl ct,dest
18797 *
18798 * Size 20.                     Size 14.
18799 *
18800 * This is reasonably steep, but branch mispredict costs are
18801 * high on modern CPUs, so consider failing only if optimizing
18802 * for space.
18803 */
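/* Tracing the jumpless sequence: setcc leaves 1 (condition true) or 0;
   decl turns that into 0 or -1; anding with (cf - ct) gives 0 or
   cf - ct; and the final addl ct produces ct or cf respectively, so
   dest = (((cond) - 1) & (cf - ct)) + ct.  */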
18804
18805 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18806 && BRANCH_COST (optimize_insn_for_speed_p (),
18807 false) >= 2)
18808 {
18809 if (cf == 0)
18810 {
18811 enum machine_mode cmp_mode = GET_MODE (op0);
18812
18813 cf = ct;
18814 ct = 0;
18815
18816 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18817 {
18818 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18819
18820 /* We may be reversing an unordered compare to a normal compare,
18821 which is not valid in general (we may convert a non-trapping
18822 condition to a trapping one); however, on i386 we currently
18823 emit all comparisons unordered. */
18824 code = reverse_condition_maybe_unordered (code);
18825 }
18826 else
18827 {
18828 code = reverse_condition (code);
18829 if (compare_code != UNKNOWN)
18830 compare_code = reverse_condition (compare_code);
18831 }
18832 }
18833
18834 if (compare_code != UNKNOWN)
18835 {
18836 /* notl op1 (if needed)
18837 sarl $31, op1
18838 andl (cf-ct), op1
18839 addl ct, op1
18840
18841 For x < 0 (resp. x <= -1) there will be no notl,
18842 so if possible swap the constants to get rid of the
18843 complement.
18844 True/false will be -1/0 while code below (store flag
18845 followed by decrement) is 0/-1, so the constants need
18846 to be exchanged once more. */
18847
18848 if (compare_code == GE || !cf)
18849 {
18850 code = reverse_condition (code);
18851 compare_code = LT;
18852 }
18853 else
18854 {
18855 HOST_WIDE_INT tmp = cf;
18856 cf = ct;
18857 ct = tmp;
18858 }
18859
18860 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18861 }
18862 else
18863 {
18864 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18865
18866 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18867 constm1_rtx,
18868 copy_rtx (out), 1, OPTAB_DIRECT);
18869 }
18870
18871 out = expand_simple_binop (mode, AND, copy_rtx (out),
18872 gen_int_mode (cf - ct, mode),
18873 copy_rtx (out), 1, OPTAB_DIRECT);
18874 if (ct)
18875 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18876 copy_rtx (out), 1, OPTAB_DIRECT);
18877 if (!rtx_equal_p (out, operands[0]))
18878 emit_move_insn (operands[0], copy_rtx (out));
18879
18880 return true;
18881 }
18882 }
18883
18884 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18885 {
18886 /* Try a few things more with specific constants and a variable. */
18887
18888 optab op;
18889 rtx var, orig_out, out, tmp;
18890
18891 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18892 return false;
18893
18894 /* If one of the two operands is an interesting constant, load a
18895 constant with the above and mask it in with a logical operation. */
18896
18897 if (CONST_INT_P (operands[2]))
18898 {
18899 var = operands[3];
18900 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18901 operands[3] = constm1_rtx, op = and_optab;
18902 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
18903 operands[3] = const0_rtx, op = ior_optab;
18904 else
18905 return false;
18906 }
18907 else if (CONST_INT_P (operands[3]))
18908 {
18909 var = operands[2];
18910 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
18911 operands[2] = constm1_rtx, op = and_optab;
18912 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
18913 operands[2] = const0_rtx, op = ior_optab;
18914 else
18915 return false;
18916 }
18917 else
18918 return false;
18919
18920 orig_out = operands[0];
18921 tmp = gen_reg_rtx (mode);
18922 operands[0] = tmp;
18923
18924 /* Recurse to get the constant loaded. */
18925 if (ix86_expand_int_movcc (operands) == 0)
18926 return false;
18927
18928 /* Mask in the interesting variable. */
18929 out = expand_binop (mode, op, var, tmp, orig_out, 0,
18930 OPTAB_WIDEN);
18931 if (!rtx_equal_p (out, orig_out))
18932 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
18933
18934 return true;
18935 }
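/* Worked example of the masking trick above: for "cond ? 0 : var" the
   recursive call materializes "cond ? 0 : -1" into a temporary, and
   anding VAR with that mask yields 0 when the condition holds and VAR
   otherwise; the "cond ? -1 : var" case uses ior with a
   "cond ? -1 : 0" mask analogously.  */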
18936
18937 /*
18938 * For comparison with above,
18939 *
18940 * movl cf,dest
18941 * movl ct,tmp
18942 * cmpl op1,op2
18943 * cmovcc tmp,dest
18944 *
18945 * Size 15.
18946 */
18947
18948 if (! nonimmediate_operand (operands[2], mode))
18949 operands[2] = force_reg (mode, operands[2]);
18950 if (! nonimmediate_operand (operands[3], mode))
18951 operands[3] = force_reg (mode, operands[3]);
18952
18953 if (! register_operand (operands[2], VOIDmode)
18954 && (mode == QImode
18955 || ! register_operand (operands[3], VOIDmode)))
18956 operands[2] = force_reg (mode, operands[2]);
18957
18958 if (mode == QImode
18959 && ! register_operand (operands[3], VOIDmode))
18960 operands[3] = force_reg (mode, operands[3]);
18961
18962 emit_insn (compare_seq);
18963 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
18964 gen_rtx_IF_THEN_ELSE (mode,
18965 compare_op, operands[2],
18966 operands[3])));
18967 return true;
18968 }
18969
18970 /* Swap, force into registers, or otherwise massage the two operands
18971 to an sse comparison with a mask result. Thus we differ a bit from
18972 ix86_prepare_fp_compare_args which expects to produce a flags result.
18973
18974 The DEST operand exists to help determine whether to commute commutative
18975 operators. The POP0/POP1 operands are updated in place. The new
18976 comparison code is returned, or UNKNOWN if not implementable. */
18977
18978 static enum rtx_code
18979 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
18980 rtx *pop0, rtx *pop1)
18981 {
18982 rtx tmp;
18983
18984 switch (code)
18985 {
18986 case LTGT:
18987 case UNEQ:
18988 /* AVX supports all the needed comparisons. */
18989 if (TARGET_AVX)
18990 break;
18991 /* We have no LTGT as an operator. We could implement it with
18992 NE & ORDERED, but this requires an extra temporary. It's
18993 not clear that it's worth it. */
18994 return UNKNOWN;
18995
18996 case LT:
18997 case LE:
18998 case UNGT:
18999 case UNGE:
19000 /* These are supported directly. */
19001 break;
19002
19003 case EQ:
19004 case NE:
19005 case UNORDERED:
19006 case ORDERED:
19007 /* AVX has 3 operand comparisons, no need to swap anything. */
19008 if (TARGET_AVX)
19009 break;
19010 /* For commutative operators, try to canonicalize the destination
19011 operand to be first in the comparison - this helps reload to
19012 avoid extra moves. */
19013 if (!dest || !rtx_equal_p (dest, *pop1))
19014 break;
19015 /* FALLTHRU */
19016
19017 case GE:
19018 case GT:
19019 case UNLE:
19020 case UNLT:
19021 /* These are not supported directly before AVX, and furthermore
19022 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19023 comparison operands to transform into something that is
19024 supported. */
19025 tmp = *pop0;
19026 *pop0 = *pop1;
19027 *pop1 = tmp;
19028 code = swap_condition (code);
19029 break;
19030
19031 default:
19032 gcc_unreachable ();
19033 }
19034
19035 return code;
19036 }
19037
19038 /* Detect conditional moves that exactly match min/max operational
19039 semantics. Note that this is IEEE safe, as long as we don't
19040 interchange the operands.
19041
19042 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19043 and TRUE if the operation is successful and instructions are emitted. */
19044
19045 static bool
19046 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19047 rtx cmp_op1, rtx if_true, rtx if_false)
19048 {
19049 enum machine_mode mode;
19050 bool is_min;
19051 rtx tmp;
19052
19053 if (code == LT)
19054 ;
19055 else if (code == UNGE)
19056 {
19057 tmp = if_true;
19058 if_true = if_false;
19059 if_false = tmp;
19060 }
19061 else
19062 return false;
19063
19064 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19065 is_min = true;
19066 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19067 is_min = false;
19068 else
19069 return false;
19070
19071 mode = GET_MODE (dest);
19072
19073 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19074 but MODE may be a vector mode and thus not appropriate. */
19075 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19076 {
19077 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19078 rtvec v;
19079
19080 if_true = force_reg (mode, if_true);
19081 v = gen_rtvec (2, if_true, if_false);
19082 tmp = gen_rtx_UNSPEC (mode, v, u);
19083 }
19084 else
19085 {
19086 code = is_min ? SMIN : SMAX;
19087 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19088 }
19089
19090 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19091 return true;
19092 }
19093
19094 /* Expand an sse vector comparison. Return the register with the result. */
19095
19096 static rtx
19097 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19098 rtx op_true, rtx op_false)
19099 {
19100 enum machine_mode mode = GET_MODE (dest);
19101 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19102 rtx x;
19103
19104 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19105 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19106 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19107
19108 if (optimize
19109 || reg_overlap_mentioned_p (dest, op_true)
19110 || reg_overlap_mentioned_p (dest, op_false))
19111 dest = gen_reg_rtx (mode);
19112
19113 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19114 if (cmp_mode != mode)
19115 {
19116 x = force_reg (cmp_mode, x);
19117 convert_move (dest, x, false);
19118 }
19119 else
19120 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19121
19122 return dest;
19123 }
19124
19125 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19126 operations. This is used for both scalar and vector conditional moves. */
19127
19128 static void
19129 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19130 {
19131 enum machine_mode mode = GET_MODE (dest);
19132 rtx t2, t3, x;
19133
19134 if (vector_all_ones_operand (op_true, mode)
19135 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19136 {
19137 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19138 }
19139 else if (op_false == CONST0_RTX (mode))
19140 {
19141 op_true = force_reg (mode, op_true);
19142 x = gen_rtx_AND (mode, cmp, op_true);
19143 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19144 }
19145 else if (op_true == CONST0_RTX (mode))
19146 {
19147 op_false = force_reg (mode, op_false);
19148 x = gen_rtx_NOT (mode, cmp);
19149 x = gen_rtx_AND (mode, x, op_false);
19150 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19151 }
19152 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19153 {
19154 op_false = force_reg (mode, op_false);
19155 x = gen_rtx_IOR (mode, cmp, op_false);
19156 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19157 }
19158 else if (TARGET_XOP)
19159 {
19160 op_true = force_reg (mode, op_true);
19161
19162 if (!nonimmediate_operand (op_false, mode))
19163 op_false = force_reg (mode, op_false);
19164
19165 emit_insn (gen_rtx_SET (mode, dest,
19166 gen_rtx_IF_THEN_ELSE (mode, cmp,
19167 op_true,
19168 op_false)));
19169 }
19170 else
19171 {
19172 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19173
19174 if (!nonimmediate_operand (op_true, mode))
19175 op_true = force_reg (mode, op_true);
19176
19177 op_false = force_reg (mode, op_false);
19178
19179 switch (mode)
19180 {
19181 case V4SFmode:
19182 if (TARGET_SSE4_1)
19183 gen = gen_sse4_1_blendvps;
19184 break;
19185 case V2DFmode:
19186 if (TARGET_SSE4_1)
19187 gen = gen_sse4_1_blendvpd;
19188 break;
19189 case V16QImode:
19190 case V8HImode:
19191 case V4SImode:
19192 case V2DImode:
19193 if (TARGET_SSE4_1)
19194 {
19195 gen = gen_sse4_1_pblendvb;
19196 dest = gen_lowpart (V16QImode, dest);
19197 op_false = gen_lowpart (V16QImode, op_false);
19198 op_true = gen_lowpart (V16QImode, op_true);
19199 cmp = gen_lowpart (V16QImode, cmp);
19200 }
19201 break;
19202 case V8SFmode:
19203 if (TARGET_AVX)
19204 gen = gen_avx_blendvps256;
19205 break;
19206 case V4DFmode:
19207 if (TARGET_AVX)
19208 gen = gen_avx_blendvpd256;
19209 break;
19210 case V32QImode:
19211 case V16HImode:
19212 case V8SImode:
19213 case V4DImode:
19214 if (TARGET_AVX2)
19215 {
19216 gen = gen_avx2_pblendvb;
19217 dest = gen_lowpart (V32QImode, dest);
19218 op_false = gen_lowpart (V32QImode, op_false);
19219 op_true = gen_lowpart (V32QImode, op_true);
19220 cmp = gen_lowpart (V32QImode, cmp);
19221 }
19222 break;
19223 default:
19224 break;
19225 }
19226
19227 if (gen != NULL)
19228 emit_insn (gen (dest, op_false, op_true, cmp));
19229 else
19230 {
19231 op_true = force_reg (mode, op_true);
19232
19233 t2 = gen_reg_rtx (mode);
19234 if (optimize)
19235 t3 = gen_reg_rtx (mode);
19236 else
19237 t3 = dest;
19238
19239 x = gen_rtx_AND (mode, op_true, cmp);
19240 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19241
19242 x = gen_rtx_NOT (mode, cmp);
19243 x = gen_rtx_AND (mode, x, op_false);
19244 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19245
19246 x = gen_rtx_IOR (mode, t3, t2);
19247 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19248 }
19249 }
19250 }
19251
19252 /* Expand a floating-point conditional move. Return true if successful. */
19253
19254 bool
19255 ix86_expand_fp_movcc (rtx operands[])
19256 {
19257 enum machine_mode mode = GET_MODE (operands[0]);
19258 enum rtx_code code = GET_CODE (operands[1]);
19259 rtx tmp, compare_op;
19260 rtx op0 = XEXP (operands[1], 0);
19261 rtx op1 = XEXP (operands[1], 1);
19262
19263 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19264 {
19265 enum machine_mode cmode;
19266
19267 /* Since we've no cmove for sse registers, don't force bad register
19268 allocation just to gain access to it. Deny movcc when the
19269 comparison mode doesn't match the move mode. */
19270 cmode = GET_MODE (op0);
19271 if (cmode == VOIDmode)
19272 cmode = GET_MODE (op1);
19273 if (cmode != mode)
19274 return false;
19275
19276 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19277 if (code == UNKNOWN)
19278 return false;
19279
19280 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19281 operands[2], operands[3]))
19282 return true;
19283
19284 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19285 operands[2], operands[3]);
19286 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19287 return true;
19288 }
19289
19290 /* The floating point conditional move instructions don't directly
19291 support conditions resulting from a signed integer comparison. */
19292
19293 compare_op = ix86_expand_compare (code, op0, op1);
19294 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19295 {
19296 tmp = gen_reg_rtx (QImode);
19297 ix86_expand_setcc (tmp, code, op0, op1);
19298
19299 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19300 }
19301
19302 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19303 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19304 operands[2], operands[3])));
19305
19306 return true;
19307 }
19308
19309 /* Expand a floating-point vector conditional move; a vcond operation
19310 rather than a movcc operation. */
19311
19312 bool
19313 ix86_expand_fp_vcond (rtx operands[])
19314 {
19315 enum rtx_code code = GET_CODE (operands[3]);
19316 rtx cmp;
19317
19318 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19319 &operands[4], &operands[5]);
19320 if (code == UNKNOWN)
19321 {
19322 rtx temp;
19323 switch (GET_CODE (operands[3]))
19324 {
19325 case LTGT:
19326 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19327 operands[5], operands[0], operands[0]);
19328 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19329 operands[5], operands[1], operands[2]);
19330 code = AND;
19331 break;
19332 case UNEQ:
19333 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19334 operands[5], operands[0], operands[0]);
19335 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19336 operands[5], operands[1], operands[2]);
19337 code = IOR;
19338 break;
19339 default:
19340 gcc_unreachable ();
19341 }
19342 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19343 OPTAB_DIRECT);
19344 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19345 return true;
19346 }
19347
19348 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19349 operands[5], operands[1], operands[2]))
19350 return true;
19351
19352 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19353 operands[1], operands[2]);
19354 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19355 return true;
19356 }
19357
19358 /* Expand a signed/unsigned integral vector conditional move. */
19359
19360 bool
19361 ix86_expand_int_vcond (rtx operands[])
19362 {
19363 enum machine_mode data_mode = GET_MODE (operands[0]);
19364 enum machine_mode mode = GET_MODE (operands[4]);
19365 enum rtx_code code = GET_CODE (operands[3]);
19366 bool negate = false;
19367 rtx x, cop0, cop1;
19368
19369 cop0 = operands[4];
19370 cop1 = operands[5];
19371
19372 /* XOP supports all of the comparisons on all vector int types. */
19373 if (!TARGET_XOP)
19374 {
19375 /* Canonicalize the comparison to EQ, GT, GTU. */
19376 switch (code)
19377 {
19378 case EQ:
19379 case GT:
19380 case GTU:
19381 break;
19382
19383 case NE:
19384 case LE:
19385 case LEU:
19386 code = reverse_condition (code);
19387 negate = true;
19388 break;
19389
19390 case GE:
19391 case GEU:
19392 code = reverse_condition (code);
19393 negate = true;
19394 /* FALLTHRU */
19395
19396 case LT:
19397 case LTU:
19398 code = swap_condition (code);
19399 x = cop0, cop0 = cop1, cop1 = x;
19400 break;
19401
19402 default:
19403 gcc_unreachable ();
19404 }
19405
19406 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19407 if (mode == V2DImode)
19408 {
19409 switch (code)
19410 {
19411 case EQ:
19412 /* SSE4.1 supports EQ. */
19413 if (!TARGET_SSE4_1)
19414 return false;
19415 break;
19416
19417 case GT:
19418 case GTU:
19419 /* SSE4.2 supports GT/GTU. */
19420 if (!TARGET_SSE4_2)
19421 return false;
19422 break;
19423
19424 default:
19425 gcc_unreachable ();
19426 }
19427 }
19428
19429 /* Unsigned parallel compare is not supported by the hardware.
19430 Play some tricks to turn this into a signed comparison
19431 against 0. */
19432 if (code == GTU)
19433 {
19434 cop0 = force_reg (mode, cop0);
19435
19436 switch (mode)
19437 {
19438 case V8SImode:
19439 case V4DImode:
19440 case V4SImode:
19441 case V2DImode:
19442 {
19443 rtx t1, t2, mask;
19444 rtx (*gen_sub3) (rtx, rtx, rtx);
19445
19446 switch (mode)
19447 {
19448 case V8SImode: gen_sub3 = gen_subv8si3; break;
19449 case V4DImode: gen_sub3 = gen_subv4di3; break;
19450 case V4SImode: gen_sub3 = gen_subv4si3; break;
19451 case V2DImode: gen_sub3 = gen_subv2di3; break;
19452 default:
19453 gcc_unreachable ();
19454 }
19455 /* Subtract (-(INT MAX) - 1) from both operands to make
19456 them signed. */
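/* Subtracting 0x80..0 is, modulo 2^n, the same as flipping the sign
   bit, and after that bias the unsigned order of the original values
   coincides with the signed order of the biased ones, so a signed GT
   compare gives the unsigned answer.  */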
19457 mask = ix86_build_signbit_mask (mode, true, false);
19458 t1 = gen_reg_rtx (mode);
19459 emit_insn (gen_sub3 (t1, cop0, mask));
19460
19461 t2 = gen_reg_rtx (mode);
19462 emit_insn (gen_sub3 (t2, cop1, mask));
19463
19464 cop0 = t1;
19465 cop1 = t2;
19466 code = GT;
19467 }
19468 break;
19469
19470 case V32QImode:
19471 case V16HImode:
19472 case V16QImode:
19473 case V8HImode:
19474 /* Perform a parallel unsigned saturating subtraction. */
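/* This relies on the unsigned saturating difference a -us b being
   zero exactly when a <= b, so "a >u b" is equivalent to
   "(a -us b) != 0"; the code therefore builds the EQ-against-zero
   mask and flips the selection arms via NEGATE.  */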
19475 x = gen_reg_rtx (mode);
19476 emit_insn (gen_rtx_SET (VOIDmode, x,
19477 gen_rtx_US_MINUS (mode, cop0, cop1)));
19478
19479 cop0 = x;
19480 cop1 = CONST0_RTX (mode);
19481 code = EQ;
19482 negate = !negate;
19483 break;
19484
19485 default:
19486 gcc_unreachable ();
19487 }
19488 }
19489 }
19490
19491 /* Allow the comparison to be done in one mode, but the movcc to
19492 happen in another mode. */
19493 if (data_mode == mode)
19494 {
19495 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19496 operands[1+negate], operands[2-negate]);
19497 }
19498 else
19499 {
19500 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19501 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19502 code, cop0, cop1,
19503 operands[1+negate], operands[2-negate]);
19504 x = gen_lowpart (data_mode, x);
19505 }
19506
19507 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19508 operands[2-negate]);
19509 return true;
19510 }
19511
19512 /* Expand a variable vector permutation. */
19513
19514 void
19515 ix86_expand_vec_perm (rtx operands[])
19516 {
19517 rtx target = operands[0];
19518 rtx op0 = operands[1];
19519 rtx op1 = operands[2];
19520 rtx mask = operands[3];
19521 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19522 enum machine_mode mode = GET_MODE (op0);
19523 enum machine_mode maskmode = GET_MODE (mask);
19524 int w, e, i;
19525 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19526
19527 /* Number of elements in the vector. */
19528 w = GET_MODE_NUNITS (mode);
19529 e = GET_MODE_UNIT_SIZE (mode);
19530 gcc_assert (w <= 32);
19531
19532 if (TARGET_AVX2)
19533 {
19534 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19535 {
19536 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19537 a constant shuffle operand. With a tiny bit of effort we can
19538 use VPERMD instead. A re-interpretation stall for V4DFmode is
19539 unfortunate but there's no avoiding it.
19540 Similarly, for V16HImode we don't have instructions for variable
19541 shuffling, while for V32QImode we can use vpshufb; vpshufb;
19542 vpermq; vpor after preparing suitable masks. */
19543
19544 if (mode == V16HImode)
19545 {
19546 maskmode = mode = V32QImode;
19547 w = 32;
19548 e = 1;
19549 }
19550 else
19551 {
19552 maskmode = mode = V8SImode;
19553 w = 8;
19554 e = 4;
19555 }
19556 t1 = gen_reg_rtx (maskmode);
19557
19558 /* Replicate the low bits of the V4DImode mask into V8SImode:
19559 mask = { A B C D }
19560 t1 = { A A B B C C D D }. */
19561 for (i = 0; i < w / 2; ++i)
19562 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19563 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19564 vt = force_reg (maskmode, vt);
19565 mask = gen_lowpart (maskmode, mask);
19566 if (maskmode == V8SImode)
19567 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19568 else
19569 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19570
19571 /* Multiply the shuffle indices by two. */
19572 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19573 OPTAB_DIRECT);
19574
19575 /* Add one to the odd shuffle indices:
19576 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19577 for (i = 0; i < w / 2; ++i)
19578 {
19579 vec[i * 2] = const0_rtx;
19580 vec[i * 2 + 1] = const1_rtx;
19581 }
19582 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19583 vt = force_const_mem (maskmode, vt);
19584 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19585 OPTAB_DIRECT);
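/* Net effect for the V4DImode/V4DFmode path: a mask { A B C D } has
   been rewritten as { 2A 2A+1 2B 2B+1 2C 2C+1 2D 2D+1 }, i.e. each
   64-bit element index becomes the pair of 32-bit indices covering
   it, which the variable permute below can handle.  */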
19586
19587 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19588 operands[3] = mask = t1;
19589 target = gen_lowpart (mode, target);
19590 op0 = gen_lowpart (mode, op0);
19591 op1 = gen_lowpart (mode, op1);
19592 }
19593
19594 switch (mode)
19595 {
19596 case V8SImode:
19597 /* The VPERMD and VPERMPS instructions already properly ignore
19598 the high bits of the shuffle elements. No need for us to
19599 perform an AND ourselves. */
19600 if (one_operand_shuffle)
19601 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19602 else
19603 {
19604 t1 = gen_reg_rtx (V8SImode);
19605 t2 = gen_reg_rtx (V8SImode);
19606 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19607 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19608 goto merge_two;
19609 }
19610 return;
19611
19612 case V8SFmode:
19613 mask = gen_lowpart (V8SFmode, mask);
19614 if (one_operand_shuffle)
19615 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19616 else
19617 {
19618 t1 = gen_reg_rtx (V8SFmode);
19619 t2 = gen_reg_rtx (V8SFmode);
19620 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19621 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19622 goto merge_two;
19623 }
19624 return;
19625
19626 case V4SImode:
19627 /* By combining the two 128-bit input vectors into one 256-bit
19628 input vector, we can use VPERMD and VPERMPS for the full
19629 two-operand shuffle. */
19630 t1 = gen_reg_rtx (V8SImode);
19631 t2 = gen_reg_rtx (V8SImode);
19632 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19633 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19634 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19635 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19636 return;
19637
19638 case V4SFmode:
19639 t1 = gen_reg_rtx (V8SFmode);
19640 t2 = gen_reg_rtx (V8SFmode);
19641 mask = gen_lowpart (V4SFmode, mask);
19642 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19643 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19644 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19645 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19646 return;
19647
19648 case V32QImode:
19649 t1 = gen_reg_rtx (V32QImode);
19650 t2 = gen_reg_rtx (V32QImode);
19651 t3 = gen_reg_rtx (V32QImode);
19652 vt2 = GEN_INT (128);
19653 for (i = 0; i < 32; i++)
19654 vec[i] = vt2;
19655 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19656 vt = force_reg (V32QImode, vt);
19657 for (i = 0; i < 32; i++)
19658 vec[i] = i < 16 ? vt2 : const0_rtx;
19659 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19660 vt2 = force_reg (V32QImode, vt2);
19661 /* From mask create two adjusted masks, which contain the same
19662 bits as mask in the low 7 bits of each vector element.
19663 The first mask will have the most significant bit clear
19664 if it requests an element from the same 128-bit lane
19665 and the MSB set if it requests an element from the other 128-bit lane.
19666 The second mask will have the opposite values of the MSB,
19667 and additionally will have its 128-bit lanes swapped.
19668 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19669 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19670 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19671 stands for the other 12 bytes. */
19672 /* The bit that says whether an element is from the same lane or the
19673 other lane is bit 4, so shift it up by 3 to the MSB position. */
19674 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19675 gen_lowpart (V4DImode, mask),
19676 GEN_INT (3)));
19677 /* Clear MSB bits from the mask just in case it had them set. */
19678 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19679 /* After this t1 will have MSB set for elements from other lane. */
19680 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19681 /* Clear bits other than MSB. */
19682 emit_insn (gen_andv32qi3 (t1, t1, vt));
19683 /* Or in the lower bits from mask into t3. */
19684 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19685 /* And invert MSB bits in t1, so MSB is set for elements from the same
19686 lane. */
19687 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19688 /* Swap 128-bit lanes in t3. */
19689 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19690 gen_lowpart (V4DImode, t3),
19691 const2_rtx, GEN_INT (3),
19692 const0_rtx, const1_rtx));
19693 /* And or in the lower bits from mask into t1. */
19694 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19695 if (one_operand_shuffle)
19696 {
19697 /* Each of these shuffles will put 0s in places where an
19698 element from the other 128-bit lane is needed, and otherwise
19699 will shuffle in the requested value. */
19700 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19701 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19702 /* For t3 the 128-bit lanes are swapped again. */
19703 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19704 gen_lowpart (V4DImode, t3),
19705 const2_rtx, GEN_INT (3),
19706 const0_rtx, const1_rtx));
19707 /* And oring both together leads to the result. */
19708 emit_insn (gen_iorv32qi3 (target, t1, t3));
19709 return;
19710 }
19711
19712 t4 = gen_reg_rtx (V32QImode);
19713 /* Similarly to the above one_operand_shuffle code,
19714 just repeated twice, once for each operand. The merge_two:
19715 code will merge the two results together. */
19716 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19717 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19718 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19719 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19720 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19721 gen_lowpart (V4DImode, t4),
19722 const2_rtx, GEN_INT (3),
19723 const0_rtx, const1_rtx));
19724 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19725 gen_lowpart (V4DImode, t3),
19726 const2_rtx, GEN_INT (3),
19727 const0_rtx, const1_rtx));
19728 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19729 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19730 t1 = t4;
19731 t2 = t3;
19732 goto merge_two;
19733
19734 default:
19735 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19736 break;
19737 }
19738 }
19739
19740 if (TARGET_XOP)
19741 {
19742 /* The XOP VPPERM insn supports three inputs. By ignoring the
19743 one_operand_shuffle special case, we avoid creating another
19744 set of constant vectors in memory. */
19745 one_operand_shuffle = false;
19746
19747 /* mask = mask & {2*w-1, ...} */
19748 vt = GEN_INT (2*w - 1);
19749 }
19750 else
19751 {
19752 /* mask = mask & {w-1, ...} */
19753 vt = GEN_INT (w - 1);
19754 }
19755
19756 for (i = 0; i < w; i++)
19757 vec[i] = vt;
19758 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19759 mask = expand_simple_binop (maskmode, AND, mask, vt,
19760 NULL_RTX, 0, OPTAB_DIRECT);
19761
19762 /* For non-QImode operations, convert the word permutation control
19763 into a byte permutation control. */
19764 if (mode != V16QImode)
19765 {
19766 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19767 GEN_INT (exact_log2 (e)),
19768 NULL_RTX, 0, OPTAB_DIRECT);
19769
19770 /* Convert mask to vector of chars. */
19771 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19772
19773 /* Replicate each of the input bytes into byte positions:
19774 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19775 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19776 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19777 for (i = 0; i < 16; ++i)
19778 vec[i] = GEN_INT (i/e * e);
19779 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19780 vt = force_const_mem (V16QImode, vt);
19781 if (TARGET_XOP)
19782 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19783 else
19784 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19785
19786 /* Convert it into the byte positions by doing
19787 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19788 for (i = 0; i < 16; ++i)
19789 vec[i] = GEN_INT (i % e);
19790 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19791 vt = force_const_mem (V16QImode, vt);
19792 emit_insn (gen_addv16qi3 (mask, mask, vt));
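/* For example, for V4SImode elements (e == 4) a word index k has now
   been expanded to the byte indices { 4k, 4k+1, 4k+2, 4k+3 } in that
   word's four byte slots, which is the form pshufb (or vpperm)
   expects.  */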
19793 }
19794
19795 /* The actual shuffle operations all operate on V16QImode. */
19796 op0 = gen_lowpart (V16QImode, op0);
19797 op1 = gen_lowpart (V16QImode, op1);
19798 target = gen_lowpart (V16QImode, target);
19799
19800 if (TARGET_XOP)
19801 {
19802 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19803 }
19804 else if (one_operand_shuffle)
19805 {
19806 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19807 }
19808 else
19809 {
19810 rtx xops[6];
19811 bool ok;
19812
19813 /* Shuffle the two input vectors independently. */
19814 t1 = gen_reg_rtx (V16QImode);
19815 t2 = gen_reg_rtx (V16QImode);
19816 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19817 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19818
19819 merge_two:
19820 /* Then merge them together. The key is whether any given control
19821 element contained a bit set that indicates the second word. */
19822 mask = operands[3];
19823 vt = GEN_INT (w);
19824 if (maskmode == V2DImode && !TARGET_SSE4_1)
19825 {
19826 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
19827 more shuffle to convert the V2DI input mask into a V4SI
19828 input mask, at which point the masking done by expand_int_vcond
19829 will work as desired. */
19830 rtx t3 = gen_reg_rtx (V4SImode);
19831 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19832 const0_rtx, const0_rtx,
19833 const2_rtx, const2_rtx));
19834 mask = t3;
19835 maskmode = V4SImode;
19836 e = w = 4;
19837 }
19838
19839 for (i = 0; i < w; i++)
19840 vec[i] = vt;
19841 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19842 vt = force_reg (maskmode, vt);
19843 mask = expand_simple_binop (maskmode, AND, mask, vt,
19844 NULL_RTX, 0, OPTAB_DIRECT);
19845
19846 xops[0] = gen_lowpart (mode, operands[0]);
19847 xops[1] = gen_lowpart (mode, t2);
19848 xops[2] = gen_lowpart (mode, t1);
19849 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
19850 xops[4] = mask;
19851 xops[5] = vt;
19852 ok = ix86_expand_int_vcond (xops);
19853 gcc_assert (ok);
19854 }
19855 }
19856
19857 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
19858 true if we should do zero extension, else sign extension. HIGH_P is
19859 true if we want the N/2 high elements, else the low elements. */
19860
19861 void
19862 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
19863 {
19864 enum machine_mode imode = GET_MODE (operands[1]);
19865 rtx tmp, dest;
19866
19867 if (TARGET_SSE4_1)
19868 {
19869 rtx (*unpack)(rtx, rtx);
19870 rtx (*extract)(rtx, rtx) = NULL;
19871 enum machine_mode halfmode = BLKmode;
19872
19873 switch (imode)
19874 {
19875 case V32QImode:
19876 if (unsigned_p)
19877 unpack = gen_avx2_zero_extendv16qiv16hi2;
19878 else
19879 unpack = gen_avx2_sign_extendv16qiv16hi2;
19880 halfmode = V16QImode;
19881 extract
19882 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
19883 break;
19884 case V16HImode:
19885 if (unsigned_p)
19886 unpack = gen_avx2_zero_extendv8hiv8si2;
19887 else
19888 unpack = gen_avx2_sign_extendv8hiv8si2;
19889 halfmode = V8HImode;
19890 extract
19891 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
19892 break;
19893 case V8SImode:
19894 if (unsigned_p)
19895 unpack = gen_avx2_zero_extendv4siv4di2;
19896 else
19897 unpack = gen_avx2_sign_extendv4siv4di2;
19898 halfmode = V4SImode;
19899 extract
19900 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
19901 break;
19902 case V16QImode:
19903 if (unsigned_p)
19904 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
19905 else
19906 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
19907 break;
19908 case V8HImode:
19909 if (unsigned_p)
19910 unpack = gen_sse4_1_zero_extendv4hiv4si2;
19911 else
19912 unpack = gen_sse4_1_sign_extendv4hiv4si2;
19913 break;
19914 case V4SImode:
19915 if (unsigned_p)
19916 unpack = gen_sse4_1_zero_extendv2siv2di2;
19917 else
19918 unpack = gen_sse4_1_sign_extendv2siv2di2;
19919 break;
19920 default:
19921 gcc_unreachable ();
19922 }
19923
19924 if (GET_MODE_SIZE (imode) == 32)
19925 {
19926 tmp = gen_reg_rtx (halfmode);
19927 emit_insn (extract (tmp, operands[1]));
19928 }
19929 else if (high_p)
19930 {
19931 /* Shift higher 8 bytes to lower 8 bytes. */
19932 tmp = gen_reg_rtx (imode);
19933 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
19934 gen_lowpart (V1TImode, operands[1]),
19935 GEN_INT (64)));
19936 }
19937 else
19938 tmp = operands[1];
19939
19940 emit_insn (unpack (operands[0], tmp));
19941 }
19942 else
19943 {
19944 rtx (*unpack)(rtx, rtx, rtx);
19945
19946 switch (imode)
19947 {
19948 case V16QImode:
19949 if (high_p)
19950 unpack = gen_vec_interleave_highv16qi;
19951 else
19952 unpack = gen_vec_interleave_lowv16qi;
19953 break;
19954 case V8HImode:
19955 if (high_p)
19956 unpack = gen_vec_interleave_highv8hi;
19957 else
19958 unpack = gen_vec_interleave_lowv8hi;
19959 break;
19960 case V4SImode:
19961 if (high_p)
19962 unpack = gen_vec_interleave_highv4si;
19963 else
19964 unpack = gen_vec_interleave_lowv4si;
19965 break;
19966 default:
19967 gcc_unreachable ();
19968 }
19969
19970 dest = gen_lowpart (imode, operands[0]);
19971
19972 if (unsigned_p)
19973 tmp = force_reg (imode, CONST0_RTX (imode));
19974 else
19975 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
19976 operands[1], pc_rtx, pc_rtx);
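/* The compare just built is all-ones in every lane whose element is
   negative and zero elsewhere (exactly the high half that sign
   extension would produce), so interleaving the source with it, or
   with zero in the unsigned case, performs the widening.  */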
19977
19978 emit_insn (unpack (dest, operands[1], tmp));
19979 }
19980 }
19981
19982 /* Expand conditional increment or decrement using adc/sbb instructions.
19983 The default case using setcc followed by the conditional move can be
19984 done by generic code. */
19985 bool
19986 ix86_expand_int_addcc (rtx operands[])
19987 {
19988 enum rtx_code code = GET_CODE (operands[1]);
19989 rtx flags;
19990 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
19991 rtx compare_op;
19992 rtx val = const0_rtx;
19993 bool fpcmp = false;
19994 enum machine_mode mode;
19995 rtx op0 = XEXP (operands[1], 0);
19996 rtx op1 = XEXP (operands[1], 1);
19997
19998 if (operands[3] != const1_rtx
19999 && operands[3] != constm1_rtx)
20000 return false;
20001 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20002 return false;
20003 code = GET_CODE (compare_op);
20004
20005 flags = XEXP (compare_op, 0);
20006
20007 if (GET_MODE (flags) == CCFPmode
20008 || GET_MODE (flags) == CCFPUmode)
20009 {
20010 fpcmp = true;
20011 code = ix86_fp_compare_code_to_integer (code);
20012 }
20013
20014 if (code != LTU)
20015 {
20016 val = constm1_rtx;
20017 if (fpcmp)
20018 PUT_CODE (compare_op,
20019 reverse_condition_maybe_unordered
20020 (GET_CODE (compare_op)));
20021 else
20022 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20023 }
20024
20025 mode = GET_MODE (operands[0]);
20026
20027 /* Construct either adc or sbb insn. */
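/* For example, "x += (a <u b)" becomes "cmp ...; adc $0, x": the
   carry produced by the comparison is folded straight into the
   addition.  The decrement and reversed-condition cases pick sbb
   and/or a -1 addend in the same way.  */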
20028 if ((code == LTU) == (operands[3] == constm1_rtx))
20029 {
20030 switch (mode)
20031 {
20032 case QImode:
20033 insn = gen_subqi3_carry;
20034 break;
20035 case HImode:
20036 insn = gen_subhi3_carry;
20037 break;
20038 case SImode:
20039 insn = gen_subsi3_carry;
20040 break;
20041 case DImode:
20042 insn = gen_subdi3_carry;
20043 break;
20044 default:
20045 gcc_unreachable ();
20046 }
20047 }
20048 else
20049 {
20050 switch (mode)
20051 {
20052 case QImode:
20053 insn = gen_addqi3_carry;
20054 break;
20055 case HImode:
20056 insn = gen_addhi3_carry;
20057 break;
20058 case SImode:
20059 insn = gen_addsi3_carry;
20060 break;
20061 case DImode:
20062 insn = gen_adddi3_carry;
20063 break;
20064 default:
20065 gcc_unreachable ();
20066 }
20067 }
20068 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20069
20070 return true;
20071 }
20072
20073
20074 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20075 but works for floating-point parameters and non-offsettable memories.
20076 For pushes, it returns just stack offsets; the values will be saved
20077 in the right order. At most four parts are generated. */
20078
20079 static int
20080 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20081 {
20082 int size;
20083
20084 if (!TARGET_64BIT)
20085 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20086 else
20087 size = (GET_MODE_SIZE (mode) + 4) / 8;
20088
20089 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20090 gcc_assert (size >= 2 && size <= 4);
20091
20092 /* Optimize constant pool references to immediates. This is used by fp
20093 moves, which force all constants to memory to allow combining. */
20094 if (MEM_P (operand) && MEM_READONLY_P (operand))
20095 {
20096 rtx tmp = maybe_get_pool_constant (operand);
20097 if (tmp)
20098 operand = tmp;
20099 }
20100
20101 if (MEM_P (operand) && !offsettable_memref_p (operand))
20102 {
20103       /* The only non-offsettable memories we handle are pushes.  */
20104 int ok = push_operand (operand, VOIDmode);
20105
20106 gcc_assert (ok);
20107
20108 operand = copy_rtx (operand);
20109 PUT_MODE (operand, Pmode);
20110 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20111 return size;
20112 }
20113
20114 if (GET_CODE (operand) == CONST_VECTOR)
20115 {
20116 enum machine_mode imode = int_mode_for_mode (mode);
20117 /* Caution: if we looked through a constant pool memory above,
20118 the operand may actually have a different mode now. That's
20119 ok, since we want to pun this all the way back to an integer. */
20120 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20121 gcc_assert (operand != NULL);
20122 mode = imode;
20123 }
20124
20125 if (!TARGET_64BIT)
20126 {
20127 if (mode == DImode)
20128 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20129 else
20130 {
20131 int i;
20132
20133 if (REG_P (operand))
20134 {
20135 gcc_assert (reload_completed);
20136 for (i = 0; i < size; i++)
20137 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20138 }
20139 else if (offsettable_memref_p (operand))
20140 {
20141 operand = adjust_address (operand, SImode, 0);
20142 parts[0] = operand;
20143 for (i = 1; i < size; i++)
20144 parts[i] = adjust_address (operand, SImode, 4 * i);
20145 }
20146 else if (GET_CODE (operand) == CONST_DOUBLE)
20147 {
20148 REAL_VALUE_TYPE r;
20149 long l[4];
20150
20151 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20152 switch (mode)
20153 {
20154 case TFmode:
20155 real_to_target (l, &r, mode);
20156 parts[3] = gen_int_mode (l[3], SImode);
20157 parts[2] = gen_int_mode (l[2], SImode);
20158 break;
20159 case XFmode:
20160 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20161 parts[2] = gen_int_mode (l[2], SImode);
20162 break;
20163 case DFmode:
20164 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20165 break;
20166 default:
20167 gcc_unreachable ();
20168 }
20169 parts[1] = gen_int_mode (l[1], SImode);
20170 parts[0] = gen_int_mode (l[0], SImode);
20171 }
20172 else
20173 gcc_unreachable ();
20174 }
20175 }
20176 else
20177 {
20178 if (mode == TImode)
20179 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20180 if (mode == XFmode || mode == TFmode)
20181 {
20182 	  enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
20183 if (REG_P (operand))
20184 {
20185 gcc_assert (reload_completed);
20186 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20187 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20188 }
20189 else if (offsettable_memref_p (operand))
20190 {
20191 operand = adjust_address (operand, DImode, 0);
20192 parts[0] = operand;
20193 parts[1] = adjust_address (operand, upper_mode, 8);
20194 }
20195 else if (GET_CODE (operand) == CONST_DOUBLE)
20196 {
20197 REAL_VALUE_TYPE r;
20198 long l[4];
20199
20200 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20201 real_to_target (l, &r, mode);
20202
20203 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20204 if (HOST_BITS_PER_WIDE_INT >= 64)
20205 parts[0]
20206 = gen_int_mode
20207 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20208 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20209 DImode);
20210 else
20211 parts[0] = immed_double_const (l[0], l[1], DImode);
20212
20213 if (upper_mode == SImode)
20214 parts[1] = gen_int_mode (l[2], SImode);
20215 else if (HOST_BITS_PER_WIDE_INT >= 64)
20216 parts[1]
20217 = gen_int_mode
20218 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20219 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20220 DImode);
20221 else
20222 parts[1] = immed_double_const (l[2], l[3], DImode);
20223 }
20224 else
20225 gcc_unreachable ();
20226 }
20227 }
20228
20229 return size;
20230 }
20231
20232 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20233    The value is split into parts; operands 2-5 receive the destination
20234    parts and operands 6-9 the source parts, in the correct emission
20235    order.  */
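/* As an illustrative example (assumed, not taken from the md file): pushing
   a DImode value on a 32-bit target is split into two SImode pushes, the
   high half first and the low half second, so the value ends up in memory
   in the usual little-endian layout.  */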
20236
20237 void
20238 ix86_split_long_move (rtx operands[])
20239 {
20240 rtx part[2][4];
20241 int nparts, i, j;
20242 int push = 0;
20243 int collisions = 0;
20244 enum machine_mode mode = GET_MODE (operands[0]);
20245 bool collisionparts[4];
20246
20247   /* The DFmode expanders may ask us to move double.
20248      For 64-bit targets this is a single move.  By hiding that fact
20249      here we simplify the i386.md splitters.  */
20250 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20251 {
20252       /* Optimize constant pool reference to immediates.  This is used by
20253 	 fp moves that force all constants to memory to allow combining.  */
20254
20255 if (MEM_P (operands[1])
20256 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20257 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20258 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20259 if (push_operand (operands[0], VOIDmode))
20260 {
20261 operands[0] = copy_rtx (operands[0]);
20262 PUT_MODE (operands[0], Pmode);
20263 }
20264 else
20265 operands[0] = gen_lowpart (DImode, operands[0]);
20266 operands[1] = gen_lowpart (DImode, operands[1]);
20267 emit_move_insn (operands[0], operands[1]);
20268 return;
20269 }
20270
20271 /* The only non-offsettable memory we handle is push. */
20272 if (push_operand (operands[0], VOIDmode))
20273 push = 1;
20274 else
20275 gcc_assert (!MEM_P (operands[0])
20276 || offsettable_memref_p (operands[0]));
20277
20278 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20279 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20280
20281   /* When emitting a push, take care of source operands on the stack.  */
20282 if (push && MEM_P (operands[1])
20283 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20284 {
20285 rtx src_base = XEXP (part[1][nparts - 1], 0);
20286
20287 /* Compensate for the stack decrement by 4. */
20288 if (!TARGET_64BIT && nparts == 3
20289 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20290 src_base = plus_constant (src_base, 4);
20291
20292 /* src_base refers to the stack pointer and is
20293 automatically decreased by emitted push. */
20294 for (i = 0; i < nparts; i++)
20295 part[1][i] = change_address (part[1][i],
20296 GET_MODE (part[1][i]), src_base);
20297 }
20298
20299 /* We need to do copy in the right order in case an address register
20300 of the source overlaps the destination. */
20301 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20302 {
20303 rtx tmp;
20304
20305 for (i = 0; i < nparts; i++)
20306 {
20307 collisionparts[i]
20308 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20309 if (collisionparts[i])
20310 collisions++;
20311 }
20312
20313 /* Collision in the middle part can be handled by reordering. */
20314 if (collisions == 1 && nparts == 3 && collisionparts [1])
20315 {
20316 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20317 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20318 }
20319 else if (collisions == 1
20320 && nparts == 4
20321 && (collisionparts [1] || collisionparts [2]))
20322 {
20323 if (collisionparts [1])
20324 {
20325 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20326 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20327 }
20328 else
20329 {
20330 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20331 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20332 }
20333 }
20334
20335 /* If there are more collisions, we can't handle it by reordering.
20336 Do an lea to the last part and use only one colliding move. */
20337 else if (collisions > 1)
20338 {
20339 rtx base;
20340
20341 collisions = 1;
20342
20343 base = part[0][nparts - 1];
20344
20345 /* Handle the case when the last part isn't valid for lea.
20346 Happens in 64-bit mode storing the 12-byte XFmode. */
20347 if (GET_MODE (base) != Pmode)
20348 base = gen_rtx_REG (Pmode, REGNO (base));
20349
20350 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20351 part[1][0] = replace_equiv_address (part[1][0], base);
20352 for (i = 1; i < nparts; i++)
20353 {
20354 tmp = plus_constant (base, UNITS_PER_WORD * i);
20355 part[1][i] = replace_equiv_address (part[1][i], tmp);
20356 }
20357 }
20358 }
20359
20360 if (push)
20361 {
20362 if (!TARGET_64BIT)
20363 {
20364 if (nparts == 3)
20365 {
20366 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20367 emit_insn (gen_addsi3 (stack_pointer_rtx,
20368 stack_pointer_rtx, GEN_INT (-4)));
20369 emit_move_insn (part[0][2], part[1][2]);
20370 }
20371 else if (nparts == 4)
20372 {
20373 emit_move_insn (part[0][3], part[1][3]);
20374 emit_move_insn (part[0][2], part[1][2]);
20375 }
20376 }
20377 else
20378 {
20379 	  /* In 64-bit mode we don't have a 32-bit push available.  If the
20380 	     operand is a register, that is OK - we just use the larger
20381 	     counterpart.  We also retype memories - these come from the attempt
20382 	     to avoid a REX prefix when moving the second half of a TFmode value.  */
20383 if (GET_MODE (part[1][1]) == SImode)
20384 {
20385 switch (GET_CODE (part[1][1]))
20386 {
20387 case MEM:
20388 part[1][1] = adjust_address (part[1][1], DImode, 0);
20389 break;
20390
20391 case REG:
20392 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20393 break;
20394
20395 default:
20396 gcc_unreachable ();
20397 }
20398
20399 if (GET_MODE (part[1][0]) == SImode)
20400 part[1][0] = part[1][1];
20401 }
20402 }
20403 emit_move_insn (part[0][1], part[1][1]);
20404 emit_move_insn (part[0][0], part[1][0]);
20405 return;
20406 }
20407
20408 /* Choose correct order to not overwrite the source before it is copied. */
20409 if ((REG_P (part[0][0])
20410 && REG_P (part[1][1])
20411 && (REGNO (part[0][0]) == REGNO (part[1][1])
20412 || (nparts == 3
20413 && REGNO (part[0][0]) == REGNO (part[1][2]))
20414 || (nparts == 4
20415 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20416 || (collisions > 0
20417 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20418 {
20419 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20420 {
20421 operands[2 + i] = part[0][j];
20422 operands[6 + i] = part[1][j];
20423 }
20424 }
20425 else
20426 {
20427 for (i = 0; i < nparts; i++)
20428 {
20429 operands[2 + i] = part[0][i];
20430 operands[6 + i] = part[1][i];
20431 }
20432 }
20433
20434 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20435 if (optimize_insn_for_size_p ())
20436 {
20437 for (j = 0; j < nparts - 1; j++)
20438 if (CONST_INT_P (operands[6 + j])
20439 && operands[6 + j] != const0_rtx
20440 && REG_P (operands[2 + j]))
20441 for (i = j; i < nparts - 1; i++)
20442 if (CONST_INT_P (operands[7 + i])
20443 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20444 operands[7 + i] = operands[2 + j];
20445 }
20446
20447 for (i = 0; i < nparts; i++)
20448 emit_move_insn (operands[2 + i], operands[6 + i]);
20449
20450 return;
20451 }
20452
20453 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20454 left shift by a constant, either using a single shift or
20455 a sequence of add instructions. */
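/* For example (assumed costs): a left shift by 2 is emitted as two
   "add reg, reg" instructions when twice the add cost is no more than the
   cost of a shift by a constant; otherwise a single shift insn is used.  */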
20456
20457 static void
20458 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20459 {
20460 rtx (*insn)(rtx, rtx, rtx);
20461
20462 if (count == 1
20463 || (count * ix86_cost->add <= ix86_cost->shift_const
20464 && !optimize_insn_for_size_p ()))
20465 {
20466 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20467 while (count-- > 0)
20468 emit_insn (insn (operand, operand, operand));
20469 }
20470 else
20471 {
20472 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20473 emit_insn (insn (operand, operand, GEN_INT (count)));
20474 }
20475 }
20476
20477 void
20478 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20479 {
20480 rtx (*gen_ashl3)(rtx, rtx, rtx);
20481 rtx (*gen_shld)(rtx, rtx, rtx);
20482 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20483
20484 rtx low[2], high[2];
20485 int count;
20486
20487 if (CONST_INT_P (operands[2]))
20488 {
20489 split_double_mode (mode, operands, 2, low, high);
20490 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20491
20492 if (count >= half_width)
20493 {
20494 emit_move_insn (high[0], low[1]);
20495 emit_move_insn (low[0], const0_rtx);
20496
20497 if (count > half_width)
20498 ix86_expand_ashl_const (high[0], count - half_width, mode);
20499 }
20500 else
20501 {
20502 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20503
20504 if (!rtx_equal_p (operands[0], operands[1]))
20505 emit_move_insn (operands[0], operands[1]);
20506
20507 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20508 ix86_expand_ashl_const (low[0], count, mode);
20509 }
20510 return;
20511 }
20512
20513 split_double_mode (mode, operands, 1, low, high);
20514
20515 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20516
20517 if (operands[1] == const1_rtx)
20518 {
20519       /* Assuming we've chosen QImode-capable registers, 1 << N
20520 	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
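/* A sketch of the sequence this produces for DImode on a 32-bit target
   (assumed example): both halves are cleared, "test $32, %cl" checks
   whether the shift count selects the high half, sete/setne then put the
   single 1 bit into the low or high half, and the two final shifts by the
   (hardware-masked) count move it into position.  */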
20521 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20522 {
20523 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20524
20525 ix86_expand_clear (low[0]);
20526 ix86_expand_clear (high[0]);
20527 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20528
20529 d = gen_lowpart (QImode, low[0]);
20530 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20531 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20532 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20533
20534 d = gen_lowpart (QImode, high[0]);
20535 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20536 s = gen_rtx_NE (QImode, flags, const0_rtx);
20537 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20538 }
20539
20540 /* Otherwise, we can get the same results by manually performing
20541 a bit extract operation on bit 5/6, and then performing the two
20542 shifts. The two methods of getting 0/1 into low/high are exactly
20543 the same size. Avoiding the shift in the bit extract case helps
20544 pentium4 a bit; no one else seems to care much either way. */
20545 else
20546 {
20547 enum machine_mode half_mode;
20548 rtx (*gen_lshr3)(rtx, rtx, rtx);
20549 rtx (*gen_and3)(rtx, rtx, rtx);
20550 rtx (*gen_xor3)(rtx, rtx, rtx);
20551 HOST_WIDE_INT bits;
20552 rtx x;
20553
20554 if (mode == DImode)
20555 {
20556 half_mode = SImode;
20557 gen_lshr3 = gen_lshrsi3;
20558 gen_and3 = gen_andsi3;
20559 gen_xor3 = gen_xorsi3;
20560 bits = 5;
20561 }
20562 else
20563 {
20564 half_mode = DImode;
20565 gen_lshr3 = gen_lshrdi3;
20566 gen_and3 = gen_anddi3;
20567 gen_xor3 = gen_xordi3;
20568 bits = 6;
20569 }
20570
20571 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20572 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20573 else
20574 x = gen_lowpart (half_mode, operands[2]);
20575 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20576
20577 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20578 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20579 emit_move_insn (low[0], high[0]);
20580 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20581 }
20582
20583 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20584 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20585 return;
20586 }
20587
20588 if (operands[1] == constm1_rtx)
20589 {
20590 /* For -1 << N, we can avoid the shld instruction, because we
20591 know that we're shifting 0...31/63 ones into a -1. */
20592 emit_move_insn (low[0], constm1_rtx);
20593 if (optimize_insn_for_size_p ())
20594 emit_move_insn (high[0], low[0]);
20595 else
20596 emit_move_insn (high[0], constm1_rtx);
20597 }
20598 else
20599 {
20600 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20601
20602 if (!rtx_equal_p (operands[0], operands[1]))
20603 emit_move_insn (operands[0], operands[1]);
20604
20605 split_double_mode (mode, operands, 1, low, high);
20606 emit_insn (gen_shld (high[0], low[0], operands[2]));
20607 }
20608
20609 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20610
20611 if (TARGET_CMOVE && scratch)
20612 {
20613 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20614 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20615
20616 ix86_expand_clear (scratch);
20617 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20618 }
20619 else
20620 {
20621 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20622 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20623
20624 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20625 }
20626 }
20627
20628 void
20629 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20630 {
20631 rtx (*gen_ashr3)(rtx, rtx, rtx)
20632 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20633 rtx (*gen_shrd)(rtx, rtx, rtx);
20634 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20635
20636 rtx low[2], high[2];
20637 int count;
20638
20639 if (CONST_INT_P (operands[2]))
20640 {
20641 split_double_mode (mode, operands, 2, low, high);
20642 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20643
20644 if (count == GET_MODE_BITSIZE (mode) - 1)
20645 {
20646 emit_move_insn (high[0], high[1]);
20647 emit_insn (gen_ashr3 (high[0], high[0],
20648 GEN_INT (half_width - 1)));
20649 emit_move_insn (low[0], high[0]);
20650
20651 }
20652 else if (count >= half_width)
20653 {
20654 emit_move_insn (low[0], high[1]);
20655 emit_move_insn (high[0], low[0]);
20656 emit_insn (gen_ashr3 (high[0], high[0],
20657 GEN_INT (half_width - 1)));
20658
20659 if (count > half_width)
20660 emit_insn (gen_ashr3 (low[0], low[0],
20661 GEN_INT (count - half_width)));
20662 }
20663 else
20664 {
20665 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20666
20667 if (!rtx_equal_p (operands[0], operands[1]))
20668 emit_move_insn (operands[0], operands[1]);
20669
20670 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20671 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20672 }
20673 }
20674 else
20675 {
20676 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20677
20678 if (!rtx_equal_p (operands[0], operands[1]))
20679 emit_move_insn (operands[0], operands[1]);
20680
20681 split_double_mode (mode, operands, 1, low, high);
20682
20683 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20684 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20685
20686 if (TARGET_CMOVE && scratch)
20687 {
20688 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20689 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20690
20691 emit_move_insn (scratch, high[0]);
20692 emit_insn (gen_ashr3 (scratch, scratch,
20693 GEN_INT (half_width - 1)));
20694 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20695 scratch));
20696 }
20697 else
20698 {
20699 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20700 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20701
20702 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20703 }
20704 }
20705 }
20706
20707 void
20708 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20709 {
20710 rtx (*gen_lshr3)(rtx, rtx, rtx)
20711 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20712 rtx (*gen_shrd)(rtx, rtx, rtx);
20713 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20714
20715 rtx low[2], high[2];
20716 int count;
20717
20718 if (CONST_INT_P (operands[2]))
20719 {
20720 split_double_mode (mode, operands, 2, low, high);
20721 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20722
20723 if (count >= half_width)
20724 {
20725 emit_move_insn (low[0], high[1]);
20726 ix86_expand_clear (high[0]);
20727
20728 if (count > half_width)
20729 emit_insn (gen_lshr3 (low[0], low[0],
20730 GEN_INT (count - half_width)));
20731 }
20732 else
20733 {
20734 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20735
20736 if (!rtx_equal_p (operands[0], operands[1]))
20737 emit_move_insn (operands[0], operands[1]);
20738
20739 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20740 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20741 }
20742 }
20743 else
20744 {
20745 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20746
20747 if (!rtx_equal_p (operands[0], operands[1]))
20748 emit_move_insn (operands[0], operands[1]);
20749
20750 split_double_mode (mode, operands, 1, low, high);
20751
20752 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20753 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20754
20755 if (TARGET_CMOVE && scratch)
20756 {
20757 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20758 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20759
20760 ix86_expand_clear (scratch);
20761 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20762 scratch));
20763 }
20764 else
20765 {
20766 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20767 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20768
20769 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20770 }
20771 }
20772 }
20773
20774 /* Predict just emitted jump instruction to be taken with probability PROB. */
20775 static void
20776 predict_jump (int prob)
20777 {
20778 rtx insn = get_last_insn ();
20779 gcc_assert (JUMP_P (insn));
20780 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20781 }
20782
20783 /* Helper function for the string operations below.  Test whether VARIABLE
20784    is aligned to VALUE bytes.  If so, jump to the label.  */
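/* For instance (a sketch of how the callers below use it):

       label = ix86_expand_aligntest (count, 4, true);
       ... emit a 4-byte move for the case where (count & 4) != 0 ...
       emit_label (label);

   i.e. the emitted compare-and-jump skips the following block whenever
   the tested bit of COUNT is zero.  */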
20785 static rtx
20786 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20787 {
20788 rtx label = gen_label_rtx ();
20789 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20790 if (GET_MODE (variable) == DImode)
20791 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20792 else
20793 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20794 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20795 1, label);
20796 if (epilogue)
20797 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20798 else
20799 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20800 return label;
20801 }
20802
20803 /* Decrease COUNTREG by VALUE.  */
20804 static void
20805 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20806 {
20807 rtx (*gen_add)(rtx, rtx, rtx)
20808 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20809
20810 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20811 }
20812
20813 /* Zero extend possibly SImode EXP to Pmode register. */
20814 rtx
20815 ix86_zero_extend_to_Pmode (rtx exp)
20816 {
20817 rtx r;
20818 if (GET_MODE (exp) == VOIDmode)
20819 return force_reg (Pmode, exp);
20820 if (GET_MODE (exp) == Pmode)
20821 return copy_to_mode_reg (Pmode, exp);
20822 r = gen_reg_rtx (Pmode);
20823 emit_insn (gen_zero_extendsidi2 (r, exp));
20824 return r;
20825 }
20826
20827 /* Divide COUNTREG by SCALE. */
20828 static rtx
20829 scale_counter (rtx countreg, int scale)
20830 {
20831 rtx sc;
20832
20833 if (scale == 1)
20834 return countreg;
20835 if (CONST_INT_P (countreg))
20836 return GEN_INT (INTVAL (countreg) / scale);
20837 gcc_assert (REG_P (countreg));
20838
20839 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
20840 GEN_INT (exact_log2 (scale)),
20841 NULL, 1, OPTAB_DIRECT);
20842 return sc;
20843 }
20844
20845 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
20846 DImode for constant loop counts. */
20847
20848 static enum machine_mode
20849 counter_mode (rtx count_exp)
20850 {
20851 if (GET_MODE (count_exp) != VOIDmode)
20852 return GET_MODE (count_exp);
20853 if (!CONST_INT_P (count_exp))
20854 return Pmode;
20855 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
20856 return DImode;
20857 return SImode;
20858 }
20859
20860 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed to
20861    by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
20862    overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output
20863    the equivalent loop to set memory to VALUE (assumed to be in MODE).
20864
20865    The size is rounded down to a whole number of chunks moved at once.
20866    SRCMEM and DESTMEM provide MEM rtxes to supply proper aliasing info.  */
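/* Roughly, and ignoring the branch-prediction notes (a sketch, not the
   exact RTL that is generated):

       size = count & ~(GET_MODE_SIZE (mode) * unroll - 1);
       iter = 0;
     top:
       copy (or store) UNROLL chunks at dest + iter [and src + iter];
       iter += GET_MODE_SIZE (mode) * unroll;
       if (iter < size) goto top;
       destptr += iter;  [srcptr += iter;]  */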
20867
20868
20869 static void
20870 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
20871 rtx destptr, rtx srcptr, rtx value,
20872 rtx count, enum machine_mode mode, int unroll,
20873 int expected_size)
20874 {
20875 rtx out_label, top_label, iter, tmp;
20876 enum machine_mode iter_mode = counter_mode (count);
20877 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
20878 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
20879 rtx size;
20880 rtx x_addr;
20881 rtx y_addr;
20882 int i;
20883
20884 top_label = gen_label_rtx ();
20885 out_label = gen_label_rtx ();
20886 iter = gen_reg_rtx (iter_mode);
20887
20888 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
20889 NULL, 1, OPTAB_DIRECT);
20890 /* Those two should combine. */
20891 if (piece_size == const1_rtx)
20892 {
20893 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
20894 true, out_label);
20895 predict_jump (REG_BR_PROB_BASE * 10 / 100);
20896 }
20897 emit_move_insn (iter, const0_rtx);
20898
20899 emit_label (top_label);
20900
20901 tmp = convert_modes (Pmode, iter_mode, iter, true);
20902 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
20903 destmem = change_address (destmem, mode, x_addr);
20904
20905 if (srcmem)
20906 {
20907 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
20908 srcmem = change_address (srcmem, mode, y_addr);
20909
20910       /* When unrolling for chips that reorder memory reads and writes,
20911 	 we can save registers by using a single temporary.
20912 	 Also, using four temporaries is overkill in 32-bit mode.  */
20913 if (!TARGET_64BIT && 0)
20914 {
20915 for (i = 0; i < unroll; i++)
20916 {
20917 if (i)
20918 {
20919 destmem =
20920 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20921 srcmem =
20922 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20923 }
20924 emit_move_insn (destmem, srcmem);
20925 }
20926 }
20927 else
20928 {
20929 rtx tmpreg[4];
20930 gcc_assert (unroll <= 4);
20931 for (i = 0; i < unroll; i++)
20932 {
20933 tmpreg[i] = gen_reg_rtx (mode);
20934 if (i)
20935 {
20936 srcmem =
20937 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20938 }
20939 emit_move_insn (tmpreg[i], srcmem);
20940 }
20941 for (i = 0; i < unroll; i++)
20942 {
20943 if (i)
20944 {
20945 destmem =
20946 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20947 }
20948 emit_move_insn (destmem, tmpreg[i]);
20949 }
20950 }
20951 }
20952 else
20953 for (i = 0; i < unroll; i++)
20954 {
20955 if (i)
20956 destmem =
20957 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20958 emit_move_insn (destmem, value);
20959 }
20960
20961 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
20962 true, OPTAB_LIB_WIDEN);
20963 if (tmp != iter)
20964 emit_move_insn (iter, tmp);
20965
20966 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
20967 true, top_label);
20968 if (expected_size != -1)
20969 {
20970 expected_size /= GET_MODE_SIZE (mode) * unroll;
20971 if (expected_size == 0)
20972 predict_jump (0);
20973 else if (expected_size > REG_BR_PROB_BASE)
20974 predict_jump (REG_BR_PROB_BASE - 1);
20975 else
20976 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
20977 }
20978 else
20979 predict_jump (REG_BR_PROB_BASE * 80 / 100);
20980 iter = ix86_zero_extend_to_Pmode (iter);
20981 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
20982 true, OPTAB_LIB_WIDEN);
20983 if (tmp != destptr)
20984 emit_move_insn (destptr, tmp);
20985 if (srcptr)
20986 {
20987 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
20988 true, OPTAB_LIB_WIDEN);
20989 if (tmp != srcptr)
20990 emit_move_insn (srcptr, tmp);
20991 }
20992 emit_label (out_label);
20993 }
20994
20995 /* Output a "rep; mov" instruction.
20996    Arguments have the same meaning as for the previous function.  */
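/* For example (a sketch under the usual operand conventions): for a copy of
   N bytes with MODE == SImode, COUNTREG is loaded with N / 4 and the emitted
   pattern corresponds to "rep movsl"; DESTEXP and SRCEXP describe the final
   pointer values, destptr + (countreg << 2) and srcptr + (countreg << 2),
   for the benefit of the RTL semantics.  */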
20997 static void
20998 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
20999 rtx destptr, rtx srcptr,
21000 rtx count,
21001 enum machine_mode mode)
21002 {
21003 rtx destexp;
21004 rtx srcexp;
21005 rtx countreg;
21006 HOST_WIDE_INT rounded_count;
21007
21008 /* If the size is known, it is shorter to use rep movs. */
21009 if (mode == QImode && CONST_INT_P (count)
21010 && !(INTVAL (count) & 3))
21011 mode = SImode;
21012
21013 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21014 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21015 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21016 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21017 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21018 if (mode != QImode)
21019 {
21020 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21021 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21022 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21023 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21024 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21025 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21026 }
21027 else
21028 {
21029 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21030 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21031 }
21032 if (CONST_INT_P (count))
21033 {
21034 rounded_count = (INTVAL (count)
21035 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21036 destmem = shallow_copy_rtx (destmem);
21037 srcmem = shallow_copy_rtx (srcmem);
21038 set_mem_size (destmem, rounded_count);
21039 set_mem_size (srcmem, rounded_count);
21040 }
21041 else
21042 {
21043 if (MEM_SIZE_KNOWN_P (destmem))
21044 clear_mem_size (destmem);
21045 if (MEM_SIZE_KNOWN_P (srcmem))
21046 clear_mem_size (srcmem);
21047 }
21048 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21049 destexp, srcexp));
21050 }
21051
21052 /* Output a "rep; stos" instruction.
21053    Arguments have the same meaning as for the previous function.  */
21054 static void
21055 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21056 rtx count, enum machine_mode mode,
21057 rtx orig_value)
21058 {
21059 rtx destexp;
21060 rtx countreg;
21061 HOST_WIDE_INT rounded_count;
21062
21063 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21064 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21065 value = force_reg (mode, gen_lowpart (mode, value));
21066 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21067 if (mode != QImode)
21068 {
21069 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21070 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21071 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21072 }
21073 else
21074 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21075 if (orig_value == const0_rtx && CONST_INT_P (count))
21076 {
21077 rounded_count = (INTVAL (count)
21078 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21079 destmem = shallow_copy_rtx (destmem);
21080 set_mem_size (destmem, rounded_count);
21081 }
21082 else if (MEM_SIZE_KNOWN_P (destmem))
21083 clear_mem_size (destmem);
21084 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21085 }
21086
21087 static void
21088 emit_strmov (rtx destmem, rtx srcmem,
21089 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21090 {
21091 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21092 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21093 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21094 }
21095
21096 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
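/* A worked example of the constant-count case below (assumed values): for
   countval == 23 (binary 10111) and max_size == 32 on a 64-bit target, the
   set bits select one 16-byte (two DImode), one 4-byte, one 2-byte and one
   1-byte move, 23 bytes in total.  */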
21097 static void
21098 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21099 rtx destptr, rtx srcptr, rtx count, int max_size)
21100 {
21101 rtx src, dest;
21102 if (CONST_INT_P (count))
21103 {
21104 HOST_WIDE_INT countval = INTVAL (count);
21105 int offset = 0;
21106
21107 if ((countval & 0x10) && max_size > 16)
21108 {
21109 if (TARGET_64BIT)
21110 {
21111 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21112 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21113 }
21114 else
21115 gcc_unreachable ();
21116 offset += 16;
21117 }
21118 if ((countval & 0x08) && max_size > 8)
21119 {
21120 if (TARGET_64BIT)
21121 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21122 else
21123 {
21124 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21125 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21126 }
21127 offset += 8;
21128 }
21129 if ((countval & 0x04) && max_size > 4)
21130 {
21131 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21132 offset += 4;
21133 }
21134 if ((countval & 0x02) && max_size > 2)
21135 {
21136 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21137 offset += 2;
21138 }
21139 if ((countval & 0x01) && max_size > 1)
21140 {
21141 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21142 offset += 1;
21143 }
21144 return;
21145 }
21146 if (max_size > 8)
21147 {
21148 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21149 count, 1, OPTAB_DIRECT);
21150 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21151 count, QImode, 1, 4);
21152 return;
21153 }
21154
21155   /* When single stringop insns are available, we can cheaply advance the
21156      dest and src pointers.  Otherwise we save code size by maintaining an
21157      offset register (zero is readily available from the preceding rep
21158      operation) and using x86 addressing modes.  */
21159 if (TARGET_SINGLE_STRINGOP)
21160 {
21161 if (max_size > 4)
21162 {
21163 rtx label = ix86_expand_aligntest (count, 4, true);
21164 src = change_address (srcmem, SImode, srcptr);
21165 dest = change_address (destmem, SImode, destptr);
21166 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21167 emit_label (label);
21168 LABEL_NUSES (label) = 1;
21169 }
21170 if (max_size > 2)
21171 {
21172 rtx label = ix86_expand_aligntest (count, 2, true);
21173 src = change_address (srcmem, HImode, srcptr);
21174 dest = change_address (destmem, HImode, destptr);
21175 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21176 emit_label (label);
21177 LABEL_NUSES (label) = 1;
21178 }
21179 if (max_size > 1)
21180 {
21181 rtx label = ix86_expand_aligntest (count, 1, true);
21182 src = change_address (srcmem, QImode, srcptr);
21183 dest = change_address (destmem, QImode, destptr);
21184 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21185 emit_label (label);
21186 LABEL_NUSES (label) = 1;
21187 }
21188 }
21189 else
21190 {
21191 rtx offset = force_reg (Pmode, const0_rtx);
21192 rtx tmp;
21193
21194 if (max_size > 4)
21195 {
21196 rtx label = ix86_expand_aligntest (count, 4, true);
21197 src = change_address (srcmem, SImode, srcptr);
21198 dest = change_address (destmem, SImode, destptr);
21199 emit_move_insn (dest, src);
21200 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21201 true, OPTAB_LIB_WIDEN);
21202 if (tmp != offset)
21203 emit_move_insn (offset, tmp);
21204 emit_label (label);
21205 LABEL_NUSES (label) = 1;
21206 }
21207 if (max_size > 2)
21208 {
21209 rtx label = ix86_expand_aligntest (count, 2, true);
21210 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21211 src = change_address (srcmem, HImode, tmp);
21212 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21213 dest = change_address (destmem, HImode, tmp);
21214 emit_move_insn (dest, src);
21215 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21216 true, OPTAB_LIB_WIDEN);
21217 if (tmp != offset)
21218 emit_move_insn (offset, tmp);
21219 emit_label (label);
21220 LABEL_NUSES (label) = 1;
21221 }
21222 if (max_size > 1)
21223 {
21224 rtx label = ix86_expand_aligntest (count, 1, true);
21225 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21226 src = change_address (srcmem, QImode, tmp);
21227 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21228 dest = change_address (destmem, QImode, tmp);
21229 emit_move_insn (dest, src);
21230 emit_label (label);
21231 LABEL_NUSES (label) = 1;
21232 }
21233 }
21234 }
21235
21236 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21237 static void
21238 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21239 rtx count, int max_size)
21240 {
21241 count =
21242 expand_simple_binop (counter_mode (count), AND, count,
21243 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21244 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21245 gen_lowpart (QImode, value), count, QImode,
21246 1, max_size / 2);
21247 }
21248
21249 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
21250 static void
21251 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21252 {
21253 rtx dest;
21254
21255 if (CONST_INT_P (count))
21256 {
21257 HOST_WIDE_INT countval = INTVAL (count);
21258 int offset = 0;
21259
21260 if ((countval & 0x10) && max_size > 16)
21261 {
21262 if (TARGET_64BIT)
21263 {
21264 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21265 emit_insn (gen_strset (destptr, dest, value));
21266 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21267 emit_insn (gen_strset (destptr, dest, value));
21268 }
21269 else
21270 gcc_unreachable ();
21271 offset += 16;
21272 }
21273 if ((countval & 0x08) && max_size > 8)
21274 {
21275 if (TARGET_64BIT)
21276 {
21277 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21278 emit_insn (gen_strset (destptr, dest, value));
21279 }
21280 else
21281 {
21282 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21283 emit_insn (gen_strset (destptr, dest, value));
21284 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21285 emit_insn (gen_strset (destptr, dest, value));
21286 }
21287 offset += 8;
21288 }
21289 if ((countval & 0x04) && max_size > 4)
21290 {
21291 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21292 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21293 offset += 4;
21294 }
21295 if ((countval & 0x02) && max_size > 2)
21296 {
21297 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21298 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21299 offset += 2;
21300 }
21301 if ((countval & 0x01) && max_size > 1)
21302 {
21303 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21304 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21305 offset += 1;
21306 }
21307 return;
21308 }
21309 if (max_size > 32)
21310 {
21311 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21312 return;
21313 }
21314 if (max_size > 16)
21315 {
21316 rtx label = ix86_expand_aligntest (count, 16, true);
21317 if (TARGET_64BIT)
21318 {
21319 dest = change_address (destmem, DImode, destptr);
21320 emit_insn (gen_strset (destptr, dest, value));
21321 emit_insn (gen_strset (destptr, dest, value));
21322 }
21323 else
21324 {
21325 dest = change_address (destmem, SImode, destptr);
21326 emit_insn (gen_strset (destptr, dest, value));
21327 emit_insn (gen_strset (destptr, dest, value));
21328 emit_insn (gen_strset (destptr, dest, value));
21329 emit_insn (gen_strset (destptr, dest, value));
21330 }
21331 emit_label (label);
21332 LABEL_NUSES (label) = 1;
21333 }
21334 if (max_size > 8)
21335 {
21336 rtx label = ix86_expand_aligntest (count, 8, true);
21337 if (TARGET_64BIT)
21338 {
21339 dest = change_address (destmem, DImode, destptr);
21340 emit_insn (gen_strset (destptr, dest, value));
21341 }
21342 else
21343 {
21344 dest = change_address (destmem, SImode, destptr);
21345 emit_insn (gen_strset (destptr, dest, value));
21346 emit_insn (gen_strset (destptr, dest, value));
21347 }
21348 emit_label (label);
21349 LABEL_NUSES (label) = 1;
21350 }
21351 if (max_size > 4)
21352 {
21353 rtx label = ix86_expand_aligntest (count, 4, true);
21354 dest = change_address (destmem, SImode, destptr);
21355 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21356 emit_label (label);
21357 LABEL_NUSES (label) = 1;
21358 }
21359 if (max_size > 2)
21360 {
21361 rtx label = ix86_expand_aligntest (count, 2, true);
21362 dest = change_address (destmem, HImode, destptr);
21363 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21364 emit_label (label);
21365 LABEL_NUSES (label) = 1;
21366 }
21367 if (max_size > 1)
21368 {
21369 rtx label = ix86_expand_aligntest (count, 1, true);
21370 dest = change_address (destmem, QImode, destptr);
21371 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21372 emit_label (label);
21373 LABEL_NUSES (label) = 1;
21374 }
21375 }
21376
21377 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21378    to DESIRED_ALIGNMENT.  */
21379 static void
21380 expand_movmem_prologue (rtx destmem, rtx srcmem,
21381 rtx destptr, rtx srcptr, rtx count,
21382 int align, int desired_alignment)
21383 {
21384 if (align <= 1 && desired_alignment > 1)
21385 {
21386 rtx label = ix86_expand_aligntest (destptr, 1, false);
21387 srcmem = change_address (srcmem, QImode, srcptr);
21388 destmem = change_address (destmem, QImode, destptr);
21389 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21390 ix86_adjust_counter (count, 1);
21391 emit_label (label);
21392 LABEL_NUSES (label) = 1;
21393 }
21394 if (align <= 2 && desired_alignment > 2)
21395 {
21396 rtx label = ix86_expand_aligntest (destptr, 2, false);
21397 srcmem = change_address (srcmem, HImode, srcptr);
21398 destmem = change_address (destmem, HImode, destptr);
21399 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21400 ix86_adjust_counter (count, 2);
21401 emit_label (label);
21402 LABEL_NUSES (label) = 1;
21403 }
21404 if (align <= 4 && desired_alignment > 4)
21405 {
21406 rtx label = ix86_expand_aligntest (destptr, 4, false);
21407 srcmem = change_address (srcmem, SImode, srcptr);
21408 destmem = change_address (destmem, SImode, destptr);
21409 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21410 ix86_adjust_counter (count, 4);
21411 emit_label (label);
21412 LABEL_NUSES (label) = 1;
21413 }
21414 gcc_assert (desired_alignment <= 8);
21415 }
21416
21417 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21418    ALIGN_BYTES is how many bytes need to be copied.  */
21419 static rtx
21420 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21421 int desired_align, int align_bytes)
21422 {
21423 rtx src = *srcp;
21424 rtx orig_dst = dst;
21425 rtx orig_src = src;
21426 int off = 0;
21427 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21428 if (src_align_bytes >= 0)
21429 src_align_bytes = desired_align - src_align_bytes;
21430 if (align_bytes & 1)
21431 {
21432 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21433 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21434 off = 1;
21435 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21436 }
21437 if (align_bytes & 2)
21438 {
21439 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21440 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21441 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21442 set_mem_align (dst, 2 * BITS_PER_UNIT);
21443 if (src_align_bytes >= 0
21444 && (src_align_bytes & 1) == (align_bytes & 1)
21445 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21446 set_mem_align (src, 2 * BITS_PER_UNIT);
21447 off = 2;
21448 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21449 }
21450 if (align_bytes & 4)
21451 {
21452 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21453 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21454 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21455 set_mem_align (dst, 4 * BITS_PER_UNIT);
21456 if (src_align_bytes >= 0)
21457 {
21458 unsigned int src_align = 0;
21459 if ((src_align_bytes & 3) == (align_bytes & 3))
21460 src_align = 4;
21461 else if ((src_align_bytes & 1) == (align_bytes & 1))
21462 src_align = 2;
21463 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21464 set_mem_align (src, src_align * BITS_PER_UNIT);
21465 }
21466 off = 4;
21467 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21468 }
21469 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21470 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21471 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21472 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21473 if (src_align_bytes >= 0)
21474 {
21475 unsigned int src_align = 0;
21476 if ((src_align_bytes & 7) == (align_bytes & 7))
21477 src_align = 8;
21478 else if ((src_align_bytes & 3) == (align_bytes & 3))
21479 src_align = 4;
21480 else if ((src_align_bytes & 1) == (align_bytes & 1))
21481 src_align = 2;
21482 if (src_align > (unsigned int) desired_align)
21483 src_align = desired_align;
21484 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21485 set_mem_align (src, src_align * BITS_PER_UNIT);
21486 }
21487 if (MEM_SIZE_KNOWN_P (orig_dst))
21488 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21489 if (MEM_SIZE_KNOWN_P (orig_src))
21490 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21491 *srcp = src;
21492 return dst;
21493 }
21494
21495 /* Set enough bytes of DEST to align DEST, known to be aligned by ALIGN,
21496    to DESIRED_ALIGNMENT.  */
21497 static void
21498 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21499 int align, int desired_alignment)
21500 {
21501 if (align <= 1 && desired_alignment > 1)
21502 {
21503 rtx label = ix86_expand_aligntest (destptr, 1, false);
21504 destmem = change_address (destmem, QImode, destptr);
21505 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21506 ix86_adjust_counter (count, 1);
21507 emit_label (label);
21508 LABEL_NUSES (label) = 1;
21509 }
21510 if (align <= 2 && desired_alignment > 2)
21511 {
21512 rtx label = ix86_expand_aligntest (destptr, 2, false);
21513 destmem = change_address (destmem, HImode, destptr);
21514 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21515 ix86_adjust_counter (count, 2);
21516 emit_label (label);
21517 LABEL_NUSES (label) = 1;
21518 }
21519 if (align <= 4 && desired_alignment > 4)
21520 {
21521 rtx label = ix86_expand_aligntest (destptr, 4, false);
21522 destmem = change_address (destmem, SImode, destptr);
21523 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21524 ix86_adjust_counter (count, 4);
21525 emit_label (label);
21526 LABEL_NUSES (label) = 1;
21527 }
21528 gcc_assert (desired_alignment <= 8);
21529 }
21530
21531 /* Set enough bytes of DST to align DST, known to be aligned by ALIGN, to
21532    DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
21533 static rtx
21534 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21535 int desired_align, int align_bytes)
21536 {
21537 int off = 0;
21538 rtx orig_dst = dst;
21539 if (align_bytes & 1)
21540 {
21541 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21542 off = 1;
21543 emit_insn (gen_strset (destreg, dst,
21544 gen_lowpart (QImode, value)));
21545 }
21546 if (align_bytes & 2)
21547 {
21548 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21549 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21550 set_mem_align (dst, 2 * BITS_PER_UNIT);
21551 off = 2;
21552 emit_insn (gen_strset (destreg, dst,
21553 gen_lowpart (HImode, value)));
21554 }
21555 if (align_bytes & 4)
21556 {
21557 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21558 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21559 set_mem_align (dst, 4 * BITS_PER_UNIT);
21560 off = 4;
21561 emit_insn (gen_strset (destreg, dst,
21562 gen_lowpart (SImode, value)));
21563 }
21564 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21565 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21566 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21567 if (MEM_SIZE_KNOWN_P (orig_dst))
21568 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21569 return dst;
21570 }
21571
21572 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21573 static enum stringop_alg
21574 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21575 int *dynamic_check)
21576 {
21577 const struct stringop_algs * algs;
21578 bool optimize_for_speed;
21579 /* Algorithms using the rep prefix want at least edi and ecx;
21580 additionally, memset wants eax and memcpy wants esi. Don't
21581 consider such algorithms if the user has appropriated those
21582 registers for their own purposes. */
21583 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21584 || (memset
21585 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21586
21587 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21588 || (alg != rep_prefix_1_byte \
21589 && alg != rep_prefix_4_byte \
21590 && alg != rep_prefix_8_byte))
21591 const struct processor_costs *cost;
21592
21593 /* Even if the string operation call is cold, we still might spend a lot
21594 of time processing large blocks. */
21595 if (optimize_function_for_size_p (cfun)
21596 || (optimize_insn_for_size_p ()
21597 && expected_size != -1 && expected_size < 256))
21598 optimize_for_speed = false;
21599 else
21600 optimize_for_speed = true;
21601
21602 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21603
21604 *dynamic_check = -1;
21605 if (memset)
21606 algs = &cost->memset[TARGET_64BIT != 0];
21607 else
21608 algs = &cost->memcpy[TARGET_64BIT != 0];
21609 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21610 return ix86_stringop_alg;
21611 /* rep; movq or rep; movl is the smallest variant. */
21612 else if (!optimize_for_speed)
21613 {
21614 if (!count || (count & 3))
21615 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21616 else
21617 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21618 }
21619   /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
21620 */
21621 else if (expected_size != -1 && expected_size < 4)
21622 return loop_1_byte;
21623 else if (expected_size != -1)
21624 {
21625 unsigned int i;
21626 enum stringop_alg alg = libcall;
21627 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21628 {
21629 /* We get here if the algorithms that were not libcall-based
21630 were rep-prefix based and we are unable to use rep prefixes
21631 based on global register usage. Break out of the loop and
21632 use the heuristic below. */
21633 if (algs->size[i].max == 0)
21634 break;
21635 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21636 {
21637 enum stringop_alg candidate = algs->size[i].alg;
21638
21639 if (candidate != libcall && ALG_USABLE_P (candidate))
21640 alg = candidate;
21641 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21642 last non-libcall inline algorithm. */
21643 if (TARGET_INLINE_ALL_STRINGOPS)
21644 {
21645 	      /* When the current size is best copied by a libcall, but we are
21646 		 still forced to inline, run the heuristic below that will pick
21647 		 code for medium-sized blocks.  */
21648 if (alg != libcall)
21649 return alg;
21650 break;
21651 }
21652 else if (ALG_USABLE_P (candidate))
21653 return candidate;
21654 }
21655 }
21656 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21657 }
21658   /* When asked to inline the call anyway, try to pick a meaningful choice.
21659      We look for the maximal size of block that is faster to copy by hand,
21660      and take blocks of at most that size, guessing that the average size
21661      will be roughly half of the block.
21662
21663      If this turns out to be bad, we might simply specify the preferred
21664      choice in ix86_costs.  */
21665 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21666 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21667 {
21668 int max = -1;
21669 enum stringop_alg alg;
21670 int i;
21671 bool any_alg_usable_p = true;
21672
21673 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21674 {
21675 enum stringop_alg candidate = algs->size[i].alg;
21676 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21677
21678 if (candidate != libcall && candidate
21679 && ALG_USABLE_P (candidate))
21680 max = algs->size[i].max;
21681 }
21682 /* If there aren't any usable algorithms, then recursing on
21683 smaller sizes isn't going to find anything. Just return the
21684 simple byte-at-a-time copy loop. */
21685 if (!any_alg_usable_p)
21686 {
21687 /* Pick something reasonable. */
21688 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21689 *dynamic_check = 128;
21690 return loop_1_byte;
21691 }
21692 if (max == -1)
21693 max = 4096;
21694 alg = decide_alg (count, max / 2, memset, dynamic_check);
21695 gcc_assert (*dynamic_check == -1);
21696 gcc_assert (alg != libcall);
21697 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21698 *dynamic_check = max;
21699 return alg;
21700 }
21701 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21702 #undef ALG_USABLE_P
21703 }
21704
21705 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21706 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21707 static int
21708 decide_alignment (int align,
21709 enum stringop_alg alg,
21710 int expected_size)
21711 {
21712 int desired_align = 0;
21713 switch (alg)
21714 {
21715 case no_stringop:
21716 gcc_unreachable ();
21717 case loop:
21718 case unrolled_loop:
21719 desired_align = GET_MODE_SIZE (Pmode);
21720 break;
21721 case rep_prefix_8_byte:
21722 desired_align = 8;
21723 break;
21724 case rep_prefix_4_byte:
21725       /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
21726 	 copying a whole cache line at once.  */
21727 if (TARGET_PENTIUMPRO)
21728 desired_align = 8;
21729 else
21730 desired_align = 4;
21731 break;
21732 case rep_prefix_1_byte:
21733       /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
21734 	 copying a whole cache line at once.  */
21735 if (TARGET_PENTIUMPRO)
21736 desired_align = 8;
21737 else
21738 desired_align = 1;
21739 break;
21740 case loop_1_byte:
21741 desired_align = 1;
21742 break;
21743 case libcall:
21744 return 0;
21745 }
21746
21747 if (optimize_size)
21748 desired_align = 1;
21749 if (desired_align < align)
21750 desired_align = align;
21751 if (expected_size != -1 && expected_size < 4)
21752 desired_align = align;
21753 return desired_align;
21754 }
21755
21756 /* Return the smallest power of 2 greater than VAL. */
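/* For example, smallest_pow2_greater_than (4) == 8 and
   smallest_pow2_greater_than (0) == 1.  */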
21757 static int
21758 smallest_pow2_greater_than (int val)
21759 {
21760 int ret = 1;
21761 while (ret <= val)
21762 ret <<= 1;
21763 return ret;
21764 }
21765
21766 /* Expand string move (memcpy) operation. Use i386 string operations
21767 when profitable. expand_setmem contains similar code. The code
21768 depends upon architecture, block size and alignment, but always has
21769 the same overall structure:
21770
21771    1) Prologue guard: Conditional that jumps up to the epilogue for small
21772       blocks that can be handled by the epilogue alone.  This is faster
21773       but also needed for correctness, since the prologue assumes the block
21774       is larger than the desired alignment.
21775
21776 Optional dynamic check for size and libcall for large
21777 blocks is emitted here too, with -minline-stringops-dynamically.
21778
21779 2) Prologue: copy first few bytes in order to get destination
21780 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21781 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21782 copied. We emit either a jump tree on power of two sized
21783 blocks, or a byte loop.
21784
21785 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21786 with specified algorithm.
21787
21788 4) Epilogue: code copying tail of the block that is too small to be
21789 handled by main body (or up to size guarded by prologue guard). */
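/* A rough sketch of the emitted structure for a copy of unknown size
   (illustrative only, not the exact insn stream):

       if (count < epilogue_size_needed) goto epilogue;   -- step 1
       copy a few bytes until dest reaches DESIRED_ALIGN; -- step 2
       main loop or rep prefix in SIZE_NEEDED-byte chunks -- step 3
     epilogue:
       copy the remaining count % SIZE_NEEDED bytes;      -- step 4  */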
21790
21791 bool
21792 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21793 rtx expected_align_exp, rtx expected_size_exp)
21794 {
21795 rtx destreg;
21796 rtx srcreg;
21797 rtx label = NULL;
21798 rtx tmp;
21799 rtx jump_around_label = NULL;
21800 HOST_WIDE_INT align = 1;
21801 unsigned HOST_WIDE_INT count = 0;
21802 HOST_WIDE_INT expected_size = -1;
21803 int size_needed = 0, epilogue_size_needed;
21804 int desired_align = 0, align_bytes = 0;
21805 enum stringop_alg alg;
21806 int dynamic_check;
21807 bool need_zero_guard = false;
21808
21809 if (CONST_INT_P (align_exp))
21810 align = INTVAL (align_exp);
21811   /* i386 can do misaligned accesses at a reasonably increased cost.  */
21812 if (CONST_INT_P (expected_align_exp)
21813 && INTVAL (expected_align_exp) > align)
21814 align = INTVAL (expected_align_exp);
21815 /* ALIGN is the minimum of destination and source alignment, but we care here
21816 just about destination alignment. */
21817 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21818 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21819
21820 if (CONST_INT_P (count_exp))
21821 count = expected_size = INTVAL (count_exp);
21822 if (CONST_INT_P (expected_size_exp) && count == 0)
21823 expected_size = INTVAL (expected_size_exp);
21824
21825 /* Make sure we don't need to care about overflow later on. */
21826 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21827 return false;
21828
21829 /* Step 0: Decide on preferred algorithm, desired alignment and
21830 size of chunks to be copied by main loop. */
21831
21832 alg = decide_alg (count, expected_size, false, &dynamic_check);
21833 desired_align = decide_alignment (align, alg, expected_size);
21834
21835 if (!TARGET_ALIGN_STRINGOPS)
21836 align = desired_align;
21837
21838 if (alg == libcall)
21839 return false;
21840 gcc_assert (alg != no_stringop);
21841 if (!count)
21842 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
21843 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21844 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
21845 switch (alg)
21846 {
21847 case libcall:
21848 case no_stringop:
21849 gcc_unreachable ();
21850 case loop:
21851 need_zero_guard = true;
21852 size_needed = GET_MODE_SIZE (Pmode);
21853 break;
21854 case unrolled_loop:
21855 need_zero_guard = true;
21856 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
21857 break;
21858 case rep_prefix_8_byte:
21859 size_needed = 8;
21860 break;
21861 case rep_prefix_4_byte:
21862 size_needed = 4;
21863 break;
21864 case rep_prefix_1_byte:
21865 size_needed = 1;
21866 break;
21867 case loop_1_byte:
21868 need_zero_guard = true;
21869 size_needed = 1;
21870 break;
21871 }
21872
21873 epilogue_size_needed = size_needed;
21874
21875 /* Step 1: Prologue guard. */
21876
21877 /* Alignment code needs count to be in register. */
21878 if (CONST_INT_P (count_exp) && desired_align > align)
21879 {
21880 if (INTVAL (count_exp) > desired_align
21881 && INTVAL (count_exp) > size_needed)
21882 {
21883 align_bytes
21884 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
21885 if (align_bytes <= 0)
21886 align_bytes = 0;
21887 else
21888 align_bytes = desired_align - align_bytes;
21889 }
21890 if (align_bytes == 0)
21891 count_exp = force_reg (counter_mode (count_exp), count_exp);
21892 }
21893 gcc_assert (desired_align >= 1 && align >= 1);
21894
21895 /* Ensure that alignment prologue won't copy past end of block. */
21896 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
21897 {
21898 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
21899 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
21900 Make sure it is a power of 2. */
21901 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
21902
21903 if (count)
21904 {
21905 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
21906 {
21907 /* If main algorithm works on QImode, no epilogue is needed.
21908 For small sizes just don't align anything. */
21909 if (size_needed == 1)
21910 desired_align = align;
21911 else
21912 goto epilogue;
21913 }
21914 }
21915 else
21916 {
21917 label = gen_label_rtx ();
21918 emit_cmp_and_jump_insns (count_exp,
21919 GEN_INT (epilogue_size_needed),
21920 LTU, 0, counter_mode (count_exp), 1, label);
21921 if (expected_size == -1 || expected_size < epilogue_size_needed)
21922 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21923 else
21924 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21925 }
21926 }
21927
21928 /* Emit code to decide at runtime whether a library call or inline code
21929 should be used. */
21930 if (dynamic_check != -1)
21931 {
21932 if (CONST_INT_P (count_exp))
21933 {
21934 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
21935 {
21936 emit_block_move_via_libcall (dst, src, count_exp, false);
21937 count_exp = const0_rtx;
21938 goto epilogue;
21939 }
21940 }
21941 else
21942 {
21943 rtx hot_label = gen_label_rtx ();
21944 jump_around_label = gen_label_rtx ();
21945 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
21946 LEU, 0, GET_MODE (count_exp), 1, hot_label);
21947 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21948 emit_block_move_via_libcall (dst, src, count_exp, false);
21949 emit_jump (jump_around_label);
21950 emit_label (hot_label);
21951 }
21952 }
21953
21954 /* Step 2: Alignment prologue. */
21955
21956 if (desired_align > align)
21957 {
21958 if (align_bytes == 0)
21959 {
21960 /* Except for the first move in the epilogue, we no longer know
21961 the constant offset in aliasing info. It doesn't seem worth
21962 the pain to maintain it for the first move, so throw away
21963 the info early. */
21964 src = change_address (src, BLKmode, srcreg);
21965 dst = change_address (dst, BLKmode, destreg);
21966 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
21967 desired_align);
21968 }
21969 else
21970 {
21971 /* If we know how many bytes need to be stored before dst is
21972 sufficiently aligned, maintain aliasing info accurately. */
21973 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
21974 desired_align, align_bytes);
21975 count_exp = plus_constant (count_exp, -align_bytes);
21976 count -= align_bytes;
21977 }
21978 if (need_zero_guard
21979 && (count < (unsigned HOST_WIDE_INT) size_needed
21980 || (align_bytes == 0
21981 && count < ((unsigned HOST_WIDE_INT) size_needed
21982 + desired_align - align))))
21983 {
21984 /* It is possible that we copied enough so the main loop will not
21985 execute. */
21986 gcc_assert (size_needed > 1);
21987 if (label == NULL_RTX)
21988 label = gen_label_rtx ();
21989 emit_cmp_and_jump_insns (count_exp,
21990 GEN_INT (size_needed),
21991 LTU, 0, counter_mode (count_exp), 1, label);
21992 if (expected_size == -1
21993 || expected_size < (desired_align - align) / 2 + size_needed)
21994 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21995 else
21996 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21997 }
21998 }
21999 if (label && size_needed == 1)
22000 {
22001 emit_label (label);
22002 LABEL_NUSES (label) = 1;
22003 label = NULL;
22004 epilogue_size_needed = 1;
22005 }
22006 else if (label == NULL_RTX)
22007 epilogue_size_needed = size_needed;
22008
22009 /* Step 3: Main loop. */
22010
22011 switch (alg)
22012 {
22013 case libcall:
22014 case no_stringop:
22015 gcc_unreachable ();
22016 case loop_1_byte:
22017 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22018 count_exp, QImode, 1, expected_size);
22019 break;
22020 case loop:
22021 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22022 count_exp, Pmode, 1, expected_size);
22023 break;
22024 case unrolled_loop:
22025 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22026 registers for 4 temporaries anyway. */
22027 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22028 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22029 expected_size);
22030 break;
22031 case rep_prefix_8_byte:
22032 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22033 DImode);
22034 break;
22035 case rep_prefix_4_byte:
22036 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22037 SImode);
22038 break;
22039 case rep_prefix_1_byte:
22040 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22041 QImode);
22042 break;
22043 }
22044 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22045 if (CONST_INT_P (count_exp))
22046 {
22047 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22048 (count / size_needed) * size_needed);
22049 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22050 (count / size_needed) * size_needed);
22051 }
22052 else
22053 {
22054 src = change_address (src, BLKmode, srcreg);
22055 dst = change_address (dst, BLKmode, destreg);
22056 }
22057
22058 /* Step 4: Epilogue to copy the remaining bytes. */
22059 epilogue:
22060 if (label)
22061 {
22062 /* When the main loop is done, COUNT_EXP might hold the original count,
22063 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22064 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22065 bytes. Compensate if needed. */
22066
22067 if (size_needed < epilogue_size_needed)
22068 {
22069 tmp =
22070 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22071 GEN_INT (size_needed - 1), count_exp, 1,
22072 OPTAB_DIRECT);
22073 if (tmp != count_exp)
22074 emit_move_insn (count_exp, tmp);
22075 }
22076 emit_label (label);
22077 LABEL_NUSES (label) = 1;
22078 }
22079
22080 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22081 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22082 epilogue_size_needed);
22083 if (jump_around_label)
22084 emit_label (jump_around_label);
22085 return true;
22086 }
22087
22088 /* Helper function for memset. For the QImode value 0xXY produce
22089 0xXYXYXYXY of the width specified by MODE. This is essentially
22090 VAL * 0x01010101, but we can do slightly better than
22091 synth_mult by unwinding the sequence by hand on CPUs with a
22092 slow multiply. */
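/* E.g. promoting the QImode value 0x5A to SImode yields 0x5A5A5A5A.  On the
   shift-and-or path below this is computed roughly as x |= x << 8;
   x |= x << 16; (plus x |= x << 32 for DImode).  */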
22093 static rtx
22094 promote_duplicated_reg (enum machine_mode mode, rtx val)
22095 {
22096 enum machine_mode valmode = GET_MODE (val);
22097 rtx tmp;
22098 int nops = mode == DImode ? 3 : 2;
22099
22100 gcc_assert (mode == SImode || mode == DImode);
22101 if (val == const0_rtx)
22102 return copy_to_mode_reg (mode, const0_rtx);
22103 if (CONST_INT_P (val))
22104 {
22105 HOST_WIDE_INT v = INTVAL (val) & 255;
22106
22107 v |= v << 8;
22108 v |= v << 16;
22109 if (mode == DImode)
22110 v |= (v << 16) << 16;
22111 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22112 }
22113
22114 if (valmode == VOIDmode)
22115 valmode = QImode;
22116 if (valmode != QImode)
22117 val = gen_lowpart (QImode, val);
22118 if (mode == QImode)
22119 return val;
22120 if (!TARGET_PARTIAL_REG_STALL)
22121 nops--;
22122 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22123 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22124 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22125 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22126 {
22127 rtx reg = convert_modes (mode, QImode, val, true);
22128 tmp = promote_duplicated_reg (mode, const1_rtx);
22129 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22130 OPTAB_DIRECT);
22131 }
22132 else
22133 {
22134 rtx reg = convert_modes (mode, QImode, val, true);
22135
22136 if (!TARGET_PARTIAL_REG_STALL)
22137 if (mode == SImode)
22138 emit_insn (gen_movsi_insv_1 (reg, reg));
22139 else
22140 emit_insn (gen_movdi_insv_1 (reg, reg));
22141 else
22142 {
22143 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22144 NULL, 1, OPTAB_DIRECT);
22145 reg =
22146 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22147 }
22148 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22149 NULL, 1, OPTAB_DIRECT);
22150 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22151 if (mode == SImode)
22152 return reg;
22153 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22154 NULL, 1, OPTAB_DIRECT);
22155 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22156 return reg;
22157 }
22158 }
22159
22160 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22161 will be needed by the main loop storing SIZE_NEEDED chunks and by the prologue
22162 getting alignment from ALIGN to DESIRED_ALIGN. */
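/* E.g. for SIZE_NEEDED == 4 and DESIRED_ALIGN == ALIGN, VAL is promoted to
   SImode on both 32-bit and 64-bit targets.  */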
22163 static rtx
22164 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22165 {
22166 rtx promoted_val;
22167
22168 if (TARGET_64BIT
22169 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22170 promoted_val = promote_duplicated_reg (DImode, val);
22171 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22172 promoted_val = promote_duplicated_reg (SImode, val);
22173 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22174 promoted_val = promote_duplicated_reg (HImode, val);
22175 else
22176 promoted_val = val;
22177
22178 return promoted_val;
22179 }
22180
22181 /* Expand string set operation (memset). Use i386 string operations when
22182 profitable. See the expand_movmem comment for an explanation of the
22183 individual steps performed. */
22184 bool
22185 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22186 rtx expected_align_exp, rtx expected_size_exp)
22187 {
22188 rtx destreg;
22189 rtx label = NULL;
22190 rtx tmp;
22191 rtx jump_around_label = NULL;
22192 HOST_WIDE_INT align = 1;
22193 unsigned HOST_WIDE_INT count = 0;
22194 HOST_WIDE_INT expected_size = -1;
22195 int size_needed = 0, epilogue_size_needed;
22196 int desired_align = 0, align_bytes = 0;
22197 enum stringop_alg alg;
22198 rtx promoted_val = NULL;
22199 bool force_loopy_epilogue = false;
22200 int dynamic_check;
22201 bool need_zero_guard = false;
22202
22203 if (CONST_INT_P (align_exp))
22204 align = INTVAL (align_exp);
22205 /* i386 can do misaligned access at reasonably increased cost. */
22206 if (CONST_INT_P (expected_align_exp)
22207 && INTVAL (expected_align_exp) > align)
22208 align = INTVAL (expected_align_exp);
22209 if (CONST_INT_P (count_exp))
22210 count = expected_size = INTVAL (count_exp);
22211 if (CONST_INT_P (expected_size_exp) && count == 0)
22212 expected_size = INTVAL (expected_size_exp);
22213
22214 /* Make sure we don't need to care about overflow later on. */
22215 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22216 return false;
22217
22218 /* Step 0: Decide on preferred algorithm, desired alignment and
22219 size of chunks to be copied by main loop. */
22220
22221 alg = decide_alg (count, expected_size, true, &dynamic_check);
22222 desired_align = decide_alignment (align, alg, expected_size);
22223
22224 if (!TARGET_ALIGN_STRINGOPS)
22225 align = desired_align;
22226
22227 if (alg == libcall)
22228 return false;
22229 gcc_assert (alg != no_stringop);
22230 if (!count)
22231 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22232 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22233 switch (alg)
22234 {
22235 case libcall:
22236 case no_stringop:
22237 gcc_unreachable ();
22238 case loop:
22239 need_zero_guard = true;
22240 size_needed = GET_MODE_SIZE (Pmode);
22241 break;
22242 case unrolled_loop:
22243 need_zero_guard = true;
22244 size_needed = GET_MODE_SIZE (Pmode) * 4;
22245 break;
22246 case rep_prefix_8_byte:
22247 size_needed = 8;
22248 break;
22249 case rep_prefix_4_byte:
22250 size_needed = 4;
22251 break;
22252 case rep_prefix_1_byte:
22253 size_needed = 1;
22254 break;
22255 case loop_1_byte:
22256 need_zero_guard = true;
22257 size_needed = 1;
22258 break;
22259 }
22260 epilogue_size_needed = size_needed;
22261
22262 /* Step 1: Prologue guard. */
22263
22264 /* Alignment code needs count to be in register. */
22265 if (CONST_INT_P (count_exp) && desired_align > align)
22266 {
22267 if (INTVAL (count_exp) > desired_align
22268 && INTVAL (count_exp) > size_needed)
22269 {
22270 align_bytes
22271 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22272 if (align_bytes <= 0)
22273 align_bytes = 0;
22274 else
22275 align_bytes = desired_align - align_bytes;
22276 }
22277 if (align_bytes == 0)
22278 {
22279 enum machine_mode mode = SImode;
22280 if (TARGET_64BIT && (count & ~0xffffffff))
22281 mode = DImode;
22282 count_exp = force_reg (mode, count_exp);
22283 }
22284 }
22285 /* Do the cheap promotion to allow better CSE across the
22286 main loop and epilogue (i.e. one load of the big constant in
22287 front of all the code). */
22288 if (CONST_INT_P (val_exp))
22289 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22290 desired_align, align);
22291 /* Ensure that alignment prologue won't copy past end of block. */
22292 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22293 {
22294 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22295 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22296 Make sure it is power of 2. */
22297 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22298
22299 /* To improve performance of small blocks, we jump around the VAL
22300 promoting code. This means that if the promoted VAL is not a constant,
22301 we might not use it in the epilogue and have to use the byte
22302 loop variant. */
22303 if (epilogue_size_needed > 2 && !promoted_val)
22304 force_loopy_epilogue = true;
22305 if (count)
22306 {
22307 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22308 {
22309 /* If main algorithm works on QImode, no epilogue is needed.
22310 For small sizes just don't align anything. */
22311 if (size_needed == 1)
22312 desired_align = align;
22313 else
22314 goto epilogue;
22315 }
22316 }
22317 else
22318 {
22319 label = gen_label_rtx ();
22320 emit_cmp_and_jump_insns (count_exp,
22321 GEN_INT (epilogue_size_needed),
22322 LTU, 0, counter_mode (count_exp), 1, label);
22323 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22324 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22325 else
22326 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22327 }
22328 }
22329 if (dynamic_check != -1)
22330 {
22331 rtx hot_label = gen_label_rtx ();
22332 jump_around_label = gen_label_rtx ();
22333 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22334 LEU, 0, counter_mode (count_exp), 1, hot_label);
22335 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22336 set_storage_via_libcall (dst, count_exp, val_exp, false);
22337 emit_jump (jump_around_label);
22338 emit_label (hot_label);
22339 }
22340
22341 /* Step 2: Alignment prologue. */
22342
22343 /* Do the expensive promotion once we have branched off the small blocks. */
22344 if (!promoted_val)
22345 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22346 desired_align, align);
22347 gcc_assert (desired_align >= 1 && align >= 1);
22348
22349 if (desired_align > align)
22350 {
22351 if (align_bytes == 0)
22352 {
22353 /* Except for the first move in the epilogue, we no longer know
22354 the constant offset in aliasing info. It doesn't seem worth
22355 the pain to maintain it for the first move, so throw away
22356 the info early. */
22357 dst = change_address (dst, BLKmode, destreg);
22358 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22359 desired_align);
22360 }
22361 else
22362 {
22363 /* If we know how many bytes need to be stored before dst is
22364 sufficiently aligned, maintain aliasing info accurately. */
22365 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22366 desired_align, align_bytes);
22367 count_exp = plus_constant (count_exp, -align_bytes);
22368 count -= align_bytes;
22369 }
22370 if (need_zero_guard
22371 && (count < (unsigned HOST_WIDE_INT) size_needed
22372 || (align_bytes == 0
22373 && count < ((unsigned HOST_WIDE_INT) size_needed
22374 + desired_align - align))))
22375 {
22376 /* It is possible that we copied enough so the main loop will not
22377 execute. */
22378 gcc_assert (size_needed > 1);
22379 if (label == NULL_RTX)
22380 label = gen_label_rtx ();
22381 emit_cmp_and_jump_insns (count_exp,
22382 GEN_INT (size_needed),
22383 LTU, 0, counter_mode (count_exp), 1, label);
22384 if (expected_size == -1
22385 || expected_size < (desired_align - align) / 2 + size_needed)
22386 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22387 else
22388 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22389 }
22390 }
22391 if (label && size_needed == 1)
22392 {
22393 emit_label (label);
22394 LABEL_NUSES (label) = 1;
22395 label = NULL;
22396 promoted_val = val_exp;
22397 epilogue_size_needed = 1;
22398 }
22399 else if (label == NULL_RTX)
22400 epilogue_size_needed = size_needed;
22401
22402 /* Step 3: Main loop. */
22403
22404 switch (alg)
22405 {
22406 case libcall:
22407 case no_stringop:
22408 gcc_unreachable ();
22409 case loop_1_byte:
22410 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22411 count_exp, QImode, 1, expected_size);
22412 break;
22413 case loop:
22414 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22415 count_exp, Pmode, 1, expected_size);
22416 break;
22417 case unrolled_loop:
22418 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22419 count_exp, Pmode, 4, expected_size);
22420 break;
22421 case rep_prefix_8_byte:
22422 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22423 DImode, val_exp);
22424 break;
22425 case rep_prefix_4_byte:
22426 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22427 SImode, val_exp);
22428 break;
22429 case rep_prefix_1_byte:
22430 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22431 QImode, val_exp);
22432 break;
22433 }
22434 /* Properly adjust the offset of the dest memory for aliasing. */
22435 if (CONST_INT_P (count_exp))
22436 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22437 (count / size_needed) * size_needed);
22438 else
22439 dst = change_address (dst, BLKmode, destreg);
22440
22441 /* Step 4: Epilogue to copy the remaining bytes. */
22442
22443 if (label)
22444 {
22445 /* When the main loop is done, COUNT_EXP might hold the original count,
22446 while we want to set only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22447 Epilogue code will actually set COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22448 bytes. Compensate if needed. */
22449
22450 if (size_needed < epilogue_size_needed)
22451 {
22452 tmp =
22453 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22454 GEN_INT (size_needed - 1), count_exp, 1,
22455 OPTAB_DIRECT);
22456 if (tmp != count_exp)
22457 emit_move_insn (count_exp, tmp);
22458 }
22459 emit_label (label);
22460 LABEL_NUSES (label) = 1;
22461 }
22462 epilogue:
22463 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22464 {
22465 if (force_loopy_epilogue)
22466 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22467 epilogue_size_needed);
22468 else
22469 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22470 epilogue_size_needed);
22471 }
22472 if (jump_around_label)
22473 emit_label (jump_around_label);
22474 return true;
22475 }
22476
22477 /* Expand the appropriate insns for doing strlen if not just doing
22478 repnz; scasb
22479
22480 out = result, initialized with the start address
22481 align_rtx = alignment of the address.
22482 scratch = scratch register, initialized with the start address when
22483 not aligned, otherwise undefined
22484
22485 This is just the body. It needs the initializations mentioned above and
22486 some address computing at the end. These things are done in i386.md. */
22487
22488 static void
22489 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22490 {
22491 int align;
22492 rtx tmp;
22493 rtx align_2_label = NULL_RTX;
22494 rtx align_3_label = NULL_RTX;
22495 rtx align_4_label = gen_label_rtx ();
22496 rtx end_0_label = gen_label_rtx ();
22497 rtx mem;
22498 rtx tmpreg = gen_reg_rtx (SImode);
22499 rtx scratch = gen_reg_rtx (SImode);
22500 rtx cmp;
22501
22502 align = 0;
22503 if (CONST_INT_P (align_rtx))
22504 align = INTVAL (align_rtx);
22505
22506 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22507
22508 /* Is there a known alignment and is it less than 4? */
22509 if (align < 4)
22510 {
22511 rtx scratch1 = gen_reg_rtx (Pmode);
22512 emit_move_insn (scratch1, out);
22513 /* Is there a known alignment and is it not 2? */
22514 if (align != 2)
22515 {
22516 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22517 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22518
22519 /* Leave just the 3 lower bits. */
22520 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22521 NULL_RTX, 0, OPTAB_WIDEN);
22522
22523 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22524 Pmode, 1, align_4_label);
22525 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22526 Pmode, 1, align_2_label);
22527 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22528 Pmode, 1, align_3_label);
22529 }
22530 else
22531 {
22532 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22533 check whether it is aligned to 4 bytes. */
22534
22535 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22536 NULL_RTX, 0, OPTAB_WIDEN);
22537
22538 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22539 Pmode, 1, align_4_label);
22540 }
22541
22542 mem = change_address (src, QImode, out);
22543
22544 /* Now compare the bytes. */
22545
22546 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22547 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22548 QImode, 1, end_0_label);
22549
22550 /* Increment the address. */
22551 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22552
22553 /* Not needed with an alignment of 2 */
22554 if (align != 2)
22555 {
22556 emit_label (align_2_label);
22557
22558 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22559 end_0_label);
22560
22561 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22562
22563 emit_label (align_3_label);
22564 }
22565
22566 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22567 end_0_label);
22568
22569 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22570 }
22571
22572 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22573 align this loop: it only makes the program larger and does not help
22574 to speed it up. */
22575 emit_label (align_4_label);
22576
22577 mem = change_address (src, SImode, out);
22578 emit_move_insn (scratch, mem);
22579 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22580
22581 /* This formula yields a nonzero result iff one of the bytes is zero.
22582 This saves three branches inside the loop and many cycles. */
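/* The sequence below computes (x - 0x01010101) & ~x & 0x80808080.  Roughly,
   subtracting 1 from a byte sets its top bit only if the byte was zero or at
   least 0x81; ANDing with ~x discards the latter case, so the final mask is
   nonzero exactly when some byte of x is zero (the well-known "haszero" bit
   trick, which has no false positives).  */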
22583
22584 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22585 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22586 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22587 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22588 gen_int_mode (0x80808080, SImode)));
22589 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22590 align_4_label);
22591
22592 if (TARGET_CMOVE)
22593 {
22594 rtx reg = gen_reg_rtx (SImode);
22595 rtx reg2 = gen_reg_rtx (Pmode);
22596 emit_move_insn (reg, tmpreg);
22597 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22598
22599 /* If zero is not in the first two bytes, move two bytes forward. */
22600 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22601 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22602 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22603 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22604 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22605 reg,
22606 tmpreg)));
22607 /* Emit lea manually to avoid clobbering of flags. */
22608 emit_insn (gen_rtx_SET (SImode, reg2,
22609 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22610
22611 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22612 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22613 emit_insn (gen_rtx_SET (VOIDmode, out,
22614 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22615 reg2,
22616 out)));
22617 }
22618 else
22619 {
22620 rtx end_2_label = gen_label_rtx ();
22621 /* Is zero in the first two bytes? */
22622
22623 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22624 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22625 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22626 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22627 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22628 pc_rtx);
22629 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22630 JUMP_LABEL (tmp) = end_2_label;
22631
22632 /* Not in the first two. Move two bytes forward. */
22633 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22634 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22635
22636 emit_label (end_2_label);
22637
22638 }
22639
22640 /* Avoid branch in fixing the byte. */
22641 tmpreg = gen_lowpart (QImode, tmpreg);
22642 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22643 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22644 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22645 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22646
22647 emit_label (end_0_label);
22648 }
22649
22650 /* Expand strlen. */
22651
22652 bool
22653 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22654 {
22655 rtx addr, scratch1, scratch2, scratch3, scratch4;
22656
22657 /* The generic case of the strlen expander is long. Avoid expanding
22658 it unless TARGET_INLINE_ALL_STRINGOPS. */
22659
22660 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22661 && !TARGET_INLINE_ALL_STRINGOPS
22662 && !optimize_insn_for_size_p ()
22663 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22664 return false;
22665
22666 addr = force_reg (Pmode, XEXP (src, 0));
22667 scratch1 = gen_reg_rtx (Pmode);
22668
22669 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22670 && !optimize_insn_for_size_p ())
22671 {
22672 /* Well, it seems that some optimizer does not combine a call like
22673 foo (strlen (bar), strlen (bar));
22674 when the move and the subtraction are done here. It does calculate
22675 the length just once when these instructions are done inside of
22676 output_strlen_unroll(). But I think that, since &bar[strlen (bar)] is
22677 often used and this uses one fewer register for the lifetime of
22678 output_strlen_unroll(), this is better. */
22679
22680 emit_move_insn (out, addr);
22681
22682 ix86_expand_strlensi_unroll_1 (out, src, align);
22683
22684 /* strlensi_unroll_1 returns the address of the zero at the end of
22685 the string, like memchr(), so compute the length by subtracting
22686 the start address. */
22687 emit_insn (ix86_gen_sub3 (out, out, addr));
22688 }
22689 else
22690 {
22691 rtx unspec;
22692
22693 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22694 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22695 return false;
22696
22697 scratch2 = gen_reg_rtx (Pmode);
22698 scratch3 = gen_reg_rtx (Pmode);
22699 scratch4 = force_reg (Pmode, constm1_rtx);
22700
22701 emit_move_insn (scratch3, addr);
22702 eoschar = force_reg (QImode, eoschar);
22703
22704 src = replace_equiv_address_nv (src, scratch3);
22705
22706 /* If .md starts supporting :P, this can be done in .md. */
22707 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22708 scratch4), UNSPEC_SCAS);
22709 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22710 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22711 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22712 }
22713 return true;
22714 }
22715
22716 /* For a given symbol (function), construct code to compute the address of its
22717 PLT entry in the large x86-64 PIC model. */
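/* That is, the @PLTOFF displacement of SYMBOL is materialized as a constant
   and added to the PIC base register, yielding the absolute address of the
   PLT entry; unlike the small models, the large PIC model cannot assume the
   PLT is within the +-2GB reach of a direct call.  */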
22718 rtx
22719 construct_plt_address (rtx symbol)
22720 {
22721 rtx tmp = gen_reg_rtx (Pmode);
22722 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22723
22724 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22725 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22726
22727 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22728 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22729 return tmp;
22730 }
22731
22732 rtx
22733 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22734 rtx callarg2,
22735 rtx pop, bool sibcall)
22736 {
22737 /* We need to represent that SI and DI registers are clobbered
22738 by SYSV calls. */
22739 static int clobbered_registers[] = {
22740 XMM6_REG, XMM7_REG, XMM8_REG,
22741 XMM9_REG, XMM10_REG, XMM11_REG,
22742 XMM12_REG, XMM13_REG, XMM14_REG,
22743 XMM15_REG, SI_REG, DI_REG
22744 };
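/* XMM6-XMM15, %rsi and %rdi are callee-saved under the Microsoft x64 ABI but
   call-clobbered under the SysV ABI, so a 64-bit MS-ABI function calling a
   SysV-ABI function has to assume they are destroyed.  */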
22745 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22746 rtx use = NULL, call;
22747 unsigned int vec_len;
22748
22749 if (pop == const0_rtx)
22750 pop = NULL;
22751 gcc_assert (!TARGET_64BIT || !pop);
22752
22753 if (TARGET_MACHO && !TARGET_64BIT)
22754 {
22755 #if TARGET_MACHO
22756 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22757 fnaddr = machopic_indirect_call_target (fnaddr);
22758 #endif
22759 }
22760 else
22761 {
22762 /* Static functions and indirect calls don't need the pic register. */
22763 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22764 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22765 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22766 use_reg (&use, pic_offset_table_rtx);
22767 }
22768
22769 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22770 {
22771 rtx al = gen_rtx_REG (QImode, AX_REG);
22772 emit_move_insn (al, callarg2);
22773 use_reg (&use, al);
22774 }
22775
22776 if (ix86_cmodel == CM_LARGE_PIC
22777 && MEM_P (fnaddr)
22778 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22779 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22780 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22781 else if (sibcall
22782 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22783 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22784 {
22785 fnaddr = XEXP (fnaddr, 0);
22786 if (GET_MODE (fnaddr) != Pmode)
22787 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22788 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22789 }
22790
22791 vec_len = 0;
22792 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22793 if (retval)
22794 call = gen_rtx_SET (VOIDmode, retval, call);
22795 vec[vec_len++] = call;
22796
22797 if (pop)
22798 {
22799 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22800 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22801 vec[vec_len++] = pop;
22802 }
22803
22804 if (TARGET_64BIT_MS_ABI
22805 && (!callarg2 || INTVAL (callarg2) != -2))
22806 {
22807 unsigned i;
22808
22809 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22810 UNSPEC_MS_TO_SYSV_CALL);
22811
22812 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22813 vec[vec_len++]
22814 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22815 ? TImode : DImode,
22816 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22817 ? TImode : DImode,
22818 clobbered_registers[i]));
22819 }
22820
22821 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22822 if (TARGET_VZEROUPPER)
22823 {
22824 int avx256;
22825 if (cfun->machine->callee_pass_avx256_p)
22826 {
22827 if (cfun->machine->callee_return_avx256_p)
22828 avx256 = callee_return_pass_avx256;
22829 else
22830 avx256 = callee_pass_avx256;
22831 }
22832 else if (cfun->machine->callee_return_avx256_p)
22833 avx256 = callee_return_avx256;
22834 else
22835 avx256 = call_no_avx256;
22836
22837 if (reload_completed)
22838 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
22839 else
22840 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
22841 gen_rtvec (1, GEN_INT (avx256)),
22842 UNSPEC_CALL_NEEDS_VZEROUPPER);
22843 }
22844
22845 if (vec_len > 1)
22846 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
22847 call = emit_call_insn (call);
22848 if (use)
22849 CALL_INSN_FUNCTION_USAGE (call) = use;
22850
22851 return call;
22852 }
22853
22854 void
22855 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
22856 {
22857 rtx pat = PATTERN (insn);
22858 rtvec vec = XVEC (pat, 0);
22859 int len = GET_NUM_ELEM (vec) - 1;
22860
22861 /* Strip off the last entry of the parallel. */
22862 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
22863 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
22864 if (len == 1)
22865 pat = RTVEC_ELT (vec, 0);
22866 else
22867 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
22868
22869 emit_insn (gen_avx_vzeroupper (vzeroupper));
22870 emit_call_insn (pat);
22871 }
22872
22873 /* Output the assembly for a call instruction. */
22874
22875 const char *
22876 ix86_output_call_insn (rtx insn, rtx call_op)
22877 {
22878 bool direct_p = constant_call_address_operand (call_op, Pmode);
22879 bool seh_nop_p = false;
22880 const char *xasm;
22881
22882 if (SIBLING_CALL_P (insn))
22883 {
22884 if (direct_p)
22885 xasm = "jmp\t%P0";
22886 /* SEH epilogue detection requires the indirect branch case
22887 to include REX.W. */
22888 else if (TARGET_SEH)
22889 xasm = "rex.W jmp %A0";
22890 else
22891 xasm = "jmp\t%A0";
22892
22893 output_asm_insn (xasm, &call_op);
22894 return "";
22895 }
22896
22897 /* SEH unwinding can require an extra nop to be emitted in several
22898 circumstances. Determine if we have one of those. */
22899 if (TARGET_SEH)
22900 {
22901 rtx i;
22902
22903 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
22904 {
22905 /* If we get to another real insn, we don't need the nop. */
22906 if (INSN_P (i))
22907 break;
22908
22909 /* If we get to the epilogue note, prevent a catch region from
22910 being adjacent to the standard epilogue sequence. If non-
22911 call-exceptions, we'll have done this during epilogue emission. */
22912 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
22913 && !flag_non_call_exceptions
22914 && !can_throw_internal (insn))
22915 {
22916 seh_nop_p = true;
22917 break;
22918 }
22919 }
22920
22921 /* If we didn't find a real insn following the call, prevent the
22922 unwinder from looking into the next function. */
22923 if (i == NULL)
22924 seh_nop_p = true;
22925 }
22926
22927 if (direct_p)
22928 xasm = "call\t%P0";
22929 else
22930 xasm = "call\t%A0";
22931
22932 output_asm_insn (xasm, &call_op);
22933
22934 if (seh_nop_p)
22935 return "nop";
22936
22937 return "";
22938 }
22939 \f
22940 /* Clear stack slot assignments remembered from previous functions.
22941 This is called from INIT_EXPANDERS once before RTL is emitted for each
22942 function. */
22943
22944 static struct machine_function *
22945 ix86_init_machine_status (void)
22946 {
22947 struct machine_function *f;
22948
22949 f = ggc_alloc_cleared_machine_function ();
22950 f->use_fast_prologue_epilogue_nregs = -1;
22951 f->tls_descriptor_call_expanded_p = 0;
22952 f->call_abi = ix86_abi;
22953
22954 return f;
22955 }
22956
22957 /* Return a MEM corresponding to a stack slot with mode MODE.
22958 Allocate a new slot if necessary.
22959
22960 The RTL for a function can have several slots available: N is
22961 which slot to use. */
22962
22963 rtx
22964 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
22965 {
22966 struct stack_local_entry *s;
22967
22968 gcc_assert (n < MAX_386_STACK_LOCALS);
22969
22970 /* Virtual slot is valid only before vregs are instantiated. */
22971 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
22972
22973 for (s = ix86_stack_locals; s; s = s->next)
22974 if (s->mode == mode && s->n == n)
22975 return validize_mem (copy_rtx (s->rtl));
22976
22977 s = ggc_alloc_stack_local_entry ();
22978 s->n = n;
22979 s->mode = mode;
22980 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
22981
22982 s->next = ix86_stack_locals;
22983 ix86_stack_locals = s;
22984 return validize_mem (s->rtl);
22985 }
22986 \f
22987 /* Calculate the length of the memory address in the instruction encoding.
22988 Includes addr32 prefix, does not include the one-byte modrm, opcode,
22989 or other prefixes. */
22990
22991 int
22992 memory_address_length (rtx addr)
22993 {
22994 struct ix86_address parts;
22995 rtx base, index, disp;
22996 int len;
22997 int ok;
22998
22999 if (GET_CODE (addr) == PRE_DEC
23000 || GET_CODE (addr) == POST_INC
23001 || GET_CODE (addr) == PRE_MODIFY
23002 || GET_CODE (addr) == POST_MODIFY)
23003 return 0;
23004
23005 ok = ix86_decompose_address (addr, &parts);
23006 gcc_assert (ok);
23007
23008 if (parts.base && GET_CODE (parts.base) == SUBREG)
23009 parts.base = SUBREG_REG (parts.base);
23010 if (parts.index && GET_CODE (parts.index) == SUBREG)
23011 parts.index = SUBREG_REG (parts.index);
23012
23013 base = parts.base;
23014 index = parts.index;
23015 disp = parts.disp;
23016
23017 /* Add length of addr32 prefix. */
23018 len = (GET_CODE (addr) == ZERO_EXTEND
23019 || GET_CODE (addr) == AND);
23020
23021 /* Rule of thumb:
23022 - esp as the base always wants an index,
23023 - ebp as the base always wants a displacement,
23024 - r12 as the base always wants an index,
23025 - r13 as the base always wants a displacement. */
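/* In the ModRM encoding, r/m == 4 (esp/r12) means a SIB byte follows, and
   mod == 00 with r/m == 5 (ebp/r13) means disp32 with no base, so using one
   of these registers as a plain base costs an extra byte either way.  */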
23026
23027 /* Register Indirect. */
23028 if (base && !index && !disp)
23029 {
23030 /* esp (for its index) and ebp (for its displacement) need
23031 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23032 code. */
23033 if (REG_P (addr)
23034 && (addr == arg_pointer_rtx
23035 || addr == frame_pointer_rtx
23036 || REGNO (addr) == SP_REG
23037 || REGNO (addr) == BP_REG
23038 || REGNO (addr) == R12_REG
23039 || REGNO (addr) == R13_REG))
23040 len = 1;
23041 }
23042
23043 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23044 is not disp32, but disp32(%rip), so for plain disp32 a
23045 SIB byte is needed, unless print_operand_address
23046 optimizes it into disp32(%rip) or (%rip) is implied
23047 by an UNSPEC. */
23048 else if (disp && !base && !index)
23049 {
23050 len = 4;
23051 if (TARGET_64BIT)
23052 {
23053 rtx symbol = disp;
23054
23055 if (GET_CODE (disp) == CONST)
23056 symbol = XEXP (disp, 0);
23057 if (GET_CODE (symbol) == PLUS
23058 && CONST_INT_P (XEXP (symbol, 1)))
23059 symbol = XEXP (symbol, 0);
23060
23061 if (GET_CODE (symbol) != LABEL_REF
23062 && (GET_CODE (symbol) != SYMBOL_REF
23063 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23064 && (GET_CODE (symbol) != UNSPEC
23065 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23066 && XINT (symbol, 1) != UNSPEC_PCREL
23067 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23068 len += 1;
23069 }
23070 }
23071
23072 else
23073 {
23074 /* Find the length of the displacement constant. */
23075 if (disp)
23076 {
23077 if (base && satisfies_constraint_K (disp))
23078 len = 1;
23079 else
23080 len = 4;
23081 }
23082 /* ebp always wants a displacement. Similarly r13. */
23083 else if (base && REG_P (base)
23084 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23085 len = 1;
23086
23087 /* An index requires the two-byte modrm form.... */
23088 if (index
23089 /* ...like esp (or r12), which always wants an index. */
23090 || base == arg_pointer_rtx
23091 || base == frame_pointer_rtx
23092 || (base && REG_P (base)
23093 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23094 len += 1;
23095 }
23096
23097 switch (parts.seg)
23098 {
23099 case SEG_FS:
23100 case SEG_GS:
23101 len += 1;
23102 break;
23103 default:
23104 break;
23105 }
23106
23107 return len;
23108 }
23109
23110 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23111 is set, expect that the insn has an 8-bit immediate alternative. */
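/* For example, an add of the constant 3 in MODE_SI counts as a 1-byte
   immediate when SHORTFORM is set (it fits in -128..127), while an add of
   the constant 300 counts as a 4-byte immediate.  */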
23112 int
23113 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23114 {
23115 int len = 0;
23116 int i;
23117 extract_insn_cached (insn);
23118 for (i = recog_data.n_operands - 1; i >= 0; --i)
23119 if (CONSTANT_P (recog_data.operand[i]))
23120 {
23121 enum attr_mode mode = get_attr_mode (insn);
23122
23123 gcc_assert (!len);
23124 if (shortform && CONST_INT_P (recog_data.operand[i]))
23125 {
23126 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23127 switch (mode)
23128 {
23129 case MODE_QI:
23130 len = 1;
23131 continue;
23132 case MODE_HI:
23133 ival = trunc_int_for_mode (ival, HImode);
23134 break;
23135 case MODE_SI:
23136 ival = trunc_int_for_mode (ival, SImode);
23137 break;
23138 default:
23139 break;
23140 }
23141 if (IN_RANGE (ival, -128, 127))
23142 {
23143 len = 1;
23144 continue;
23145 }
23146 }
23147 switch (mode)
23148 {
23149 case MODE_QI:
23150 len = 1;
23151 break;
23152 case MODE_HI:
23153 len = 2;
23154 break;
23155 case MODE_SI:
23156 len = 4;
23157 break;
23158 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23159 case MODE_DI:
23160 len = 4;
23161 break;
23162 default:
23163 fatal_insn ("unknown insn mode", insn);
23164 }
23165 }
23166 return len;
23167 }
23168 /* Compute default value for "length_address" attribute. */
23169 int
23170 ix86_attr_length_address_default (rtx insn)
23171 {
23172 int i;
23173
23174 if (get_attr_type (insn) == TYPE_LEA)
23175 {
23176 rtx set = PATTERN (insn), addr;
23177
23178 if (GET_CODE (set) == PARALLEL)
23179 set = XVECEXP (set, 0, 0);
23180
23181 gcc_assert (GET_CODE (set) == SET);
23182
23183 addr = SET_SRC (set);
23184 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23185 {
23186 if (GET_CODE (addr) == ZERO_EXTEND)
23187 addr = XEXP (addr, 0);
23188 if (GET_CODE (addr) == SUBREG)
23189 addr = SUBREG_REG (addr);
23190 }
23191
23192 return memory_address_length (addr);
23193 }
23194
23195 extract_insn_cached (insn);
23196 for (i = recog_data.n_operands - 1; i >= 0; --i)
23197 if (MEM_P (recog_data.operand[i]))
23198 {
23199 constrain_operands_cached (reload_completed);
23200 if (which_alternative != -1)
23201 {
23202 const char *constraints = recog_data.constraints[i];
23203 int alt = which_alternative;
23204
23205 while (*constraints == '=' || *constraints == '+')
23206 constraints++;
23207 while (alt-- > 0)
23208 while (*constraints++ != ',')
23209 ;
23210 /* Skip ignored operands. */
23211 if (*constraints == 'X')
23212 continue;
23213 }
23214 return memory_address_length (XEXP (recog_data.operand[i], 0));
23215 }
23216 return 0;
23217 }
23218
23219 /* Compute default value for "length_vex" attribute. It includes
23220 2 or 3 byte VEX prefix and 1 opcode byte. */
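/* For example, an AVX insn touching only %xmm0-%xmm7 and no 64-bit general
   registers can use the 2-byte (C5) prefix, giving 2 + 1; an insn needing
   VEX.W, or REX.X/REX.B for extended registers in a memory operand, needs
   the 3-byte (C4) prefix, giving 3 + 1.  */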
23221
23222 int
23223 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23224 {
23225 int i;
23226
23227 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit needs
23228 the 3-byte VEX prefix. */
23229 if (!has_0f_opcode || has_vex_w)
23230 return 3 + 1;
23231
23232 /* We can always use 2 byte VEX prefix in 32bit. */
23233 if (!TARGET_64BIT)
23234 return 2 + 1;
23235
23236 extract_insn_cached (insn);
23237
23238 for (i = recog_data.n_operands - 1; i >= 0; --i)
23239 if (REG_P (recog_data.operand[i]))
23240 {
23241 /* REX.W bit uses 3 byte VEX prefix. */
23242 if (GET_MODE (recog_data.operand[i]) == DImode
23243 && GENERAL_REG_P (recog_data.operand[i]))
23244 return 3 + 1;
23245 }
23246 else
23247 {
23248 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23249 if (MEM_P (recog_data.operand[i])
23250 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23251 return 3 + 1;
23252 }
23253
23254 return 2 + 1;
23255 }
23256 \f
23257 /* Return the maximum number of instructions a cpu can issue. */
23258
23259 static int
23260 ix86_issue_rate (void)
23261 {
23262 switch (ix86_tune)
23263 {
23264 case PROCESSOR_PENTIUM:
23265 case PROCESSOR_ATOM:
23266 case PROCESSOR_K6:
23267 return 2;
23268
23269 case PROCESSOR_PENTIUMPRO:
23270 case PROCESSOR_PENTIUM4:
23271 case PROCESSOR_CORE2_32:
23272 case PROCESSOR_CORE2_64:
23273 case PROCESSOR_COREI7_32:
23274 case PROCESSOR_COREI7_64:
23275 case PROCESSOR_ATHLON:
23276 case PROCESSOR_K8:
23277 case PROCESSOR_AMDFAM10:
23278 case PROCESSOR_NOCONA:
23279 case PROCESSOR_GENERIC32:
23280 case PROCESSOR_GENERIC64:
23281 case PROCESSOR_BDVER1:
23282 case PROCESSOR_BDVER2:
23283 case PROCESSOR_BTVER1:
23284 return 3;
23285
23286 default:
23287 return 1;
23288 }
23289 }
23290
23291 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
23292 set by DEP_INSN and nothing else set by DEP_INSN. */
23293
23294 static bool
23295 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23296 {
23297 rtx set, set2;
23298
23299 /* Simplify the test for uninteresting insns. */
23300 if (insn_type != TYPE_SETCC
23301 && insn_type != TYPE_ICMOV
23302 && insn_type != TYPE_FCMOV
23303 && insn_type != TYPE_IBR)
23304 return false;
23305
23306 if ((set = single_set (dep_insn)) != 0)
23307 {
23308 set = SET_DEST (set);
23309 set2 = NULL_RTX;
23310 }
23311 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23312 && XVECLEN (PATTERN (dep_insn), 0) == 2
23313 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23314 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23315 {
23316 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23317 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23318 }
23319 else
23320 return false;
23321
23322 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23323 return false;
23324
23325 /* This test is true if the dependent insn reads the flags but
23326 not any other potentially set register. */
23327 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23328 return false;
23329
23330 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23331 return false;
23332
23333 return true;
23334 }
23335
23336 /* Return true iff USE_INSN has a memory address with operands set by
23337 SET_INSN. */
23338
23339 bool
23340 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23341 {
23342 int i;
23343 extract_insn_cached (use_insn);
23344 for (i = recog_data.n_operands - 1; i >= 0; --i)
23345 if (MEM_P (recog_data.operand[i]))
23346 {
23347 rtx addr = XEXP (recog_data.operand[i], 0);
23348 return modified_in_p (addr, set_insn) != 0;
23349 }
23350 return false;
23351 }
23352
23353 static int
23354 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23355 {
23356 enum attr_type insn_type, dep_insn_type;
23357 enum attr_memory memory;
23358 rtx set, set2;
23359 int dep_insn_code_number;
23360
23361 /* Anti and output dependencies have zero cost on all CPUs. */
23362 if (REG_NOTE_KIND (link) != 0)
23363 return 0;
23364
23365 dep_insn_code_number = recog_memoized (dep_insn);
23366
23367 /* If we can't recognize the insns, we can't really do anything. */
23368 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23369 return cost;
23370
23371 insn_type = get_attr_type (insn);
23372 dep_insn_type = get_attr_type (dep_insn);
23373
23374 switch (ix86_tune)
23375 {
23376 case PROCESSOR_PENTIUM:
23377 /* Address Generation Interlock adds a cycle of latency. */
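/* For example, a load such as mov (%eax), %ebx issued right after an insn
   that computes %eax pays this extra cycle.  */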
23378 if (insn_type == TYPE_LEA)
23379 {
23380 rtx addr = PATTERN (insn);
23381
23382 if (GET_CODE (addr) == PARALLEL)
23383 addr = XVECEXP (addr, 0, 0);
23384
23385 gcc_assert (GET_CODE (addr) == SET);
23386
23387 addr = SET_SRC (addr);
23388 if (modified_in_p (addr, dep_insn))
23389 cost += 1;
23390 }
23391 else if (ix86_agi_dependent (dep_insn, insn))
23392 cost += 1;
23393
23394 /* ??? Compares pair with jump/setcc. */
23395 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23396 cost = 0;
23397
23398 /* Floating point stores require value to be ready one cycle earlier. */
23399 if (insn_type == TYPE_FMOV
23400 && get_attr_memory (insn) == MEMORY_STORE
23401 && !ix86_agi_dependent (dep_insn, insn))
23402 cost += 1;
23403 break;
23404
23405 case PROCESSOR_PENTIUMPRO:
23406 memory = get_attr_memory (insn);
23407
23408 /* INT->FP conversion is expensive. */
23409 if (get_attr_fp_int_src (dep_insn))
23410 cost += 5;
23411
23412 /* There is one cycle extra latency between an FP op and a store. */
23413 if (insn_type == TYPE_FMOV
23414 && (set = single_set (dep_insn)) != NULL_RTX
23415 && (set2 = single_set (insn)) != NULL_RTX
23416 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23417 && MEM_P (SET_DEST (set2)))
23418 cost += 1;
23419
23420 /* Show the ability of the reorder buffer to hide the latency of a load by
23421 executing it in parallel with the previous instruction in case the
23422 previous instruction is not needed to compute the address. */
23423 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23424 && !ix86_agi_dependent (dep_insn, insn))
23425 {
23426 /* Claim moves to take one cycle, as the core can issue one load
23427 at a time and the next load can start a cycle later. */
23428 if (dep_insn_type == TYPE_IMOV
23429 || dep_insn_type == TYPE_FMOV)
23430 cost = 1;
23431 else if (cost > 1)
23432 cost--;
23433 }
23434 break;
23435
23436 case PROCESSOR_K6:
23437 memory = get_attr_memory (insn);
23438
23439 /* The esp dependency is resolved before the instruction is really
23440 finished. */
23441 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23442 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23443 return 1;
23444
23445 /* INT->FP conversion is expensive. */
23446 if (get_attr_fp_int_src (dep_insn))
23447 cost += 5;
23448
23449 /* Show the ability of the reorder buffer to hide the latency of a load by
23450 executing it in parallel with the previous instruction in case the
23451 previous instruction is not needed to compute the address. */
23452 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23453 && !ix86_agi_dependent (dep_insn, insn))
23454 {
23455 /* Claim moves to take one cycle, as the core can issue one load
23456 at a time and the next load can start a cycle later. */
23457 if (dep_insn_type == TYPE_IMOV
23458 || dep_insn_type == TYPE_FMOV)
23459 cost = 1;
23460 else if (cost > 2)
23461 cost -= 2;
23462 else
23463 cost = 1;
23464 }
23465 break;
23466
23467 case PROCESSOR_ATHLON:
23468 case PROCESSOR_K8:
23469 case PROCESSOR_AMDFAM10:
23470 case PROCESSOR_BDVER1:
23471 case PROCESSOR_BDVER2:
23472 case PROCESSOR_BTVER1:
23473 case PROCESSOR_ATOM:
23474 case PROCESSOR_GENERIC32:
23475 case PROCESSOR_GENERIC64:
23476 memory = get_attr_memory (insn);
23477
23478 /* Show the ability of the reorder buffer to hide the latency of a load by
23479 executing it in parallel with the previous instruction in case the
23480 previous instruction is not needed to compute the address. */
23481 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23482 && !ix86_agi_dependent (dep_insn, insn))
23483 {
23484 enum attr_unit unit = get_attr_unit (insn);
23485 int loadcost = 3;
23486
23487 /* Because of the difference between the length of integer and
23488 floating unit pipeline preparation stages, the memory operands
23489 for floating point are cheaper.
23490 
23491 ??? For Athlon the difference is most probably 2. */
23492 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23493 loadcost = 3;
23494 else
23495 loadcost = TARGET_ATHLON ? 2 : 0;
23496
23497 if (cost >= loadcost)
23498 cost -= loadcost;
23499 else
23500 cost = 0;
23501 }
23502
23503 default:
23504 break;
23505 }
23506
23507 return cost;
23508 }
23509
23510 /* How many alternative schedules to try. This should be as wide as the
23511 scheduling freedom in the DFA, but no wider. Making this value too
23512 large results in extra work for the scheduler. */
23513
23514 static int
23515 ia32_multipass_dfa_lookahead (void)
23516 {
23517 switch (ix86_tune)
23518 {
23519 case PROCESSOR_PENTIUM:
23520 return 2;
23521
23522 case PROCESSOR_PENTIUMPRO:
23523 case PROCESSOR_K6:
23524 return 1;
23525
23526 case PROCESSOR_CORE2_32:
23527 case PROCESSOR_CORE2_64:
23528 case PROCESSOR_COREI7_32:
23529 case PROCESSOR_COREI7_64:
23530 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
23531 the number of instructions that can be executed in a cycle, i.e.,
23532 issue_rate. I wonder why tunings for other CPUs do not do this. */
23533 return ix86_issue_rate ();
23534
23535 default:
23536 return 0;
23537 }
23538 }
23539
23540 \f
23541
23542 /* Model the decoder of Core 2/i7.
23543 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23544 track the instruction fetch block boundaries and make sure that long
23545 (9+ byte) instructions are assigned to D0. */
23546
23547 /* Maximum length of an insn that can be handled by
23548 a secondary decoder unit. '8' for Core 2/i7. */
23549 static int core2i7_secondary_decoder_max_insn_size;
23550
23551 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23552 '16' for Core 2/i7. */
23553 static int core2i7_ifetch_block_size;
23554
23555 /* Maximum number of instructions decoder can handle per cycle.
23556 '6' for Core 2/i7. */
23557 static int core2i7_ifetch_block_max_insns;
23558
23559 typedef struct ix86_first_cycle_multipass_data_ *
23560 ix86_first_cycle_multipass_data_t;
23561 typedef const struct ix86_first_cycle_multipass_data_ *
23562 const_ix86_first_cycle_multipass_data_t;
23563
23564 /* A variable to store target state across calls to max_issue within
23565 one cycle. */
23566 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23567 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23568
23569 /* Initialize DATA. */
23570 static void
23571 core2i7_first_cycle_multipass_init (void *_data)
23572 {
23573 ix86_first_cycle_multipass_data_t data
23574 = (ix86_first_cycle_multipass_data_t) _data;
23575
23576 data->ifetch_block_len = 0;
23577 data->ifetch_block_n_insns = 0;
23578 data->ready_try_change = NULL;
23579 data->ready_try_change_size = 0;
23580 }
23581
23582 /* Advancing the cycle; reset ifetch block counts. */
23583 static void
23584 core2i7_dfa_post_advance_cycle (void)
23585 {
23586 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23587
23588 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23589
23590 data->ifetch_block_len = 0;
23591 data->ifetch_block_n_insns = 0;
23592 }
23593
23594 static int min_insn_size (rtx);
23595
23596 /* Filter out insns from ready_try that the core will not be able to issue
23597 on the current cycle due to decoder restrictions. */
23598 static void
23599 core2i7_first_cycle_multipass_filter_ready_try
23600 (const_ix86_first_cycle_multipass_data_t data,
23601 char *ready_try, int n_ready, bool first_cycle_insn_p)
23602 {
23603 while (n_ready--)
23604 {
23605 rtx insn;
23606 int insn_size;
23607
23608 if (ready_try[n_ready])
23609 continue;
23610
23611 insn = get_ready_element (n_ready);
23612 insn_size = min_insn_size (insn);
23613
23614 if (/* If this insn is too long for a secondary decoder ... */
23615 (!first_cycle_insn_p
23616 && insn_size > core2i7_secondary_decoder_max_insn_size)
23617 /* ... or it would not fit into the ifetch block ... */
23618 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23619 /* ... or the decoder is full already ... */
23620 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23621 /* ... mask the insn out. */
23622 {
23623 ready_try[n_ready] = 1;
23624
23625 if (data->ready_try_change)
23626 SET_BIT (data->ready_try_change, n_ready);
23627 }
23628 }
23629 }
23630
23631 /* Prepare for a new round of multipass lookahead scheduling. */
23632 static void
23633 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23634 bool first_cycle_insn_p)
23635 {
23636 ix86_first_cycle_multipass_data_t data
23637 = (ix86_first_cycle_multipass_data_t) _data;
23638 const_ix86_first_cycle_multipass_data_t prev_data
23639 = ix86_first_cycle_multipass_data;
23640
23641 /* Restore the state from the end of the previous round. */
23642 data->ifetch_block_len = prev_data->ifetch_block_len;
23643 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23644
23645 /* Filter instructions that cannot be issued on current cycle due to
23646 decoder restrictions. */
23647 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23648 first_cycle_insn_p);
23649 }
23650
23651 /* INSN is being issued in the current solution. Account for its impact on
23652 the decoder model. */
23653 static void
23654 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23655 rtx insn, const void *_prev_data)
23656 {
23657 ix86_first_cycle_multipass_data_t data
23658 = (ix86_first_cycle_multipass_data_t) _data;
23659 const_ix86_first_cycle_multipass_data_t prev_data
23660 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23661
23662 int insn_size = min_insn_size (insn);
23663
23664 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23665 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23666 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23667 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23668
23669 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23670 if (!data->ready_try_change)
23671 {
23672 data->ready_try_change = sbitmap_alloc (n_ready);
23673 data->ready_try_change_size = n_ready;
23674 }
23675 else if (data->ready_try_change_size < n_ready)
23676 {
23677 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23678 n_ready, 0);
23679 data->ready_try_change_size = n_ready;
23680 }
23681 sbitmap_zero (data->ready_try_change);
23682
23683 /* Filter out insns from ready_try that the core will not be able to issue
23684 on the current cycle due to decoder restrictions. */
23685 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23686 false);
23687 }
23688
23689 /* Revert the effect on ready_try. */
23690 static void
23691 core2i7_first_cycle_multipass_backtrack (const void *_data,
23692 char *ready_try,
23693 int n_ready ATTRIBUTE_UNUSED)
23694 {
23695 const_ix86_first_cycle_multipass_data_t data
23696 = (const_ix86_first_cycle_multipass_data_t) _data;
23697 unsigned int i = 0;
23698 sbitmap_iterator sbi;
23699
23700 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23701 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23702 {
23703 ready_try[i] = 0;
23704 }
23705 }
23706
23707 /* Save the result of multipass lookahead scheduling for the next round. */
23708 static void
23709 core2i7_first_cycle_multipass_end (const void *_data)
23710 {
23711 const_ix86_first_cycle_multipass_data_t data
23712 = (const_ix86_first_cycle_multipass_data_t) _data;
23713 ix86_first_cycle_multipass_data_t next_data
23714 = ix86_first_cycle_multipass_data;
23715
23716 if (data != NULL)
23717 {
23718 next_data->ifetch_block_len = data->ifetch_block_len;
23719 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23720 }
23721 }
23722
23723 /* Deallocate target data. */
23724 static void
23725 core2i7_first_cycle_multipass_fini (void *_data)
23726 {
23727 ix86_first_cycle_multipass_data_t data
23728 = (ix86_first_cycle_multipass_data_t) _data;
23729
23730 if (data->ready_try_change)
23731 {
23732 sbitmap_free (data->ready_try_change);
23733 data->ready_try_change = NULL;
23734 data->ready_try_change_size = 0;
23735 }
23736 }
23737
23738 /* Prepare for scheduling pass. */
23739 static void
23740 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23741 int verbose ATTRIBUTE_UNUSED,
23742 int max_uid ATTRIBUTE_UNUSED)
23743 {
23744 /* Install scheduling hooks for the current CPU. Some of these hooks are used
23745 in time-critical parts of the scheduler, so we only set them up when
23746 they are actually used. */
23747 switch (ix86_tune)
23748 {
23749 case PROCESSOR_CORE2_32:
23750 case PROCESSOR_CORE2_64:
23751 case PROCESSOR_COREI7_32:
23752 case PROCESSOR_COREI7_64:
23753 targetm.sched.dfa_post_advance_cycle
23754 = core2i7_dfa_post_advance_cycle;
23755 targetm.sched.first_cycle_multipass_init
23756 = core2i7_first_cycle_multipass_init;
23757 targetm.sched.first_cycle_multipass_begin
23758 = core2i7_first_cycle_multipass_begin;
23759 targetm.sched.first_cycle_multipass_issue
23760 = core2i7_first_cycle_multipass_issue;
23761 targetm.sched.first_cycle_multipass_backtrack
23762 = core2i7_first_cycle_multipass_backtrack;
23763 targetm.sched.first_cycle_multipass_end
23764 = core2i7_first_cycle_multipass_end;
23765 targetm.sched.first_cycle_multipass_fini
23766 = core2i7_first_cycle_multipass_fini;
23767
23768 /* Set decoder parameters. */
23769 core2i7_secondary_decoder_max_insn_size = 8;
23770 core2i7_ifetch_block_size = 16;
23771 core2i7_ifetch_block_max_insns = 6;
23772 break;
23773
23774 default:
23775 targetm.sched.dfa_post_advance_cycle = NULL;
23776 targetm.sched.first_cycle_multipass_init = NULL;
23777 targetm.sched.first_cycle_multipass_begin = NULL;
23778 targetm.sched.first_cycle_multipass_issue = NULL;
23779 targetm.sched.first_cycle_multipass_backtrack = NULL;
23780 targetm.sched.first_cycle_multipass_end = NULL;
23781 targetm.sched.first_cycle_multipass_fini = NULL;
23782 break;
23783 }
23784 }
23785
23786 \f
23787 /* Compute the alignment given to a constant that is being placed in memory.
23788 EXP is the constant and ALIGN is the alignment that the object would
23789 ordinarily have.
23790 The value of this function is used instead of that alignment to align
23791 the object. */
23792
23793 int
23794 ix86_constant_alignment (tree exp, int align)
23795 {
23796 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23797 || TREE_CODE (exp) == INTEGER_CST)
23798 {
23799 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23800 return 64;
23801 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23802 return 128;
23803 }
23804 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23805 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23806 return BITS_PER_WORD;
23807
23808 return align;
23809 }
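/* For instance, under the rules above the constant-pool entry for a
   'double' literal is aligned to 64 bits even where the type's default
   alignment is only 32 bits (as with -m32), and a long string constant
   (TREE_STRING_LENGTH of at least 31) is aligned to BITS_PER_WORD unless
   optimizing for size.  Illustration only; the exact result follows the
   conditions above.  */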
23810
23811 /* Compute the alignment for a static variable.
23812 TYPE is the data type, and ALIGN is the alignment that
23813 the object would ordinarily have. The value of this function is used
23814 instead of that alignment to align the object. */
23815
23816 int
23817 ix86_data_alignment (tree type, int align)
23818 {
23819 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23820
23821 if (AGGREGATE_TYPE_P (type)
23822 && TYPE_SIZE (type)
23823 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23824 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23825 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23826 && align < max_align)
23827 align = max_align;
23828
23829 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
23830 to a 16-byte boundary. */
23831 if (TARGET_64BIT)
23832 {
23833 if (AGGREGATE_TYPE_P (type)
23834 && TYPE_SIZE (type)
23835 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23836 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23837 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23838 return 128;
23839 }
23840
23841 if (TREE_CODE (type) == ARRAY_TYPE)
23842 {
23843 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23844 return 64;
23845 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23846 return 128;
23847 }
23848 else if (TREE_CODE (type) == COMPLEX_TYPE)
23849 {
23850
23851 if (TYPE_MODE (type) == DCmode && align < 64)
23852 return 64;
23853 if ((TYPE_MODE (type) == XCmode
23854 || TYPE_MODE (type) == TCmode) && align < 128)
23855 return 128;
23856 }
23857 else if ((TREE_CODE (type) == RECORD_TYPE
23858 || TREE_CODE (type) == UNION_TYPE
23859 || TREE_CODE (type) == QUAL_UNION_TYPE)
23860 && TYPE_FIELDS (type))
23861 {
23862 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23863 return 64;
23864 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23865 return 128;
23866 }
23867 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23868 || TREE_CODE (type) == INTEGER_TYPE)
23869 {
23870 if (TYPE_MODE (type) == DFmode && align < 64)
23871 return 64;
23872 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23873 return 128;
23874 }
23875
23876 return align;
23877 }
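/* For example, assuming the rules above, a file-scope declaration such as

     static char buf[16];

   receives 128-bit alignment when compiling for x86-64 (the psABI rule
   for arrays of at least 16 bytes), and a file-scope 'double' receives
   64-bit alignment from the DFmode clause.  Hypothetical declarations,
   shown only to illustrate the hook.  */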
23878
23879 /* Compute the alignment for a local variable or a stack slot. EXP is
23880 the data type or decl itself, MODE is the widest mode available and
23881 ALIGN is the alignment that the object would ordinarily have. The
23882 value of this macro is used instead of that alignment to align the
23883 object. */
23884
23885 unsigned int
23886 ix86_local_alignment (tree exp, enum machine_mode mode,
23887 unsigned int align)
23888 {
23889 tree type, decl;
23890
23891 if (exp && DECL_P (exp))
23892 {
23893 type = TREE_TYPE (exp);
23894 decl = exp;
23895 }
23896 else
23897 {
23898 type = exp;
23899 decl = NULL;
23900 }
23901
23902 /* Don't do dynamic stack realignment for long long objects with
23903 -mpreferred-stack-boundary=2. */
23904 if (!TARGET_64BIT
23905 && align == 64
23906 && ix86_preferred_stack_boundary < 64
23907 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
23908 && (!type || !TYPE_USER_ALIGN (type))
23909 && (!decl || !DECL_USER_ALIGN (decl)))
23910 align = 32;
23911
23912 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
23913 register in MODE. Return the larger of the alignments of XFmode
23914 and DFmode. */
23915 if (!type)
23916 {
23917 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
23918 align = GET_MODE_ALIGNMENT (DFmode);
23919 return align;
23920 }
23921
23922 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
23923 to a 16-byte boundary. The exact wording is:
23924
23925 An array uses the same alignment as its elements, except that a local or
23926 global array variable of length at least 16 bytes or
23927 a C99 variable-length array variable always has alignment of at least 16 bytes.
23928
23929 This was added to allow use of aligned SSE instructions on arrays. The
23930 rule is meant for static storage, where the compiler cannot do the
23931 analysis by itself. We follow it for automatic variables only when
23932 convenient: we fully control everything in the function being compiled,
23933 and functions from other units cannot rely on the alignment.
23934
23935 Exclude the va_list type. It is the common case of a local array where
23936 we cannot benefit from the alignment. */
23937 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
23938 && TARGET_SSE)
23939 {
23940 if (AGGREGATE_TYPE_P (type)
23941 && (va_list_type_node == NULL_TREE
23942 || (TYPE_MAIN_VARIANT (type)
23943 != TYPE_MAIN_VARIANT (va_list_type_node)))
23944 && TYPE_SIZE (type)
23945 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23946 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
23947 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23948 return 128;
23949 }
23950 if (TREE_CODE (type) == ARRAY_TYPE)
23951 {
23952 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23953 return 64;
23954 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23955 return 128;
23956 }
23957 else if (TREE_CODE (type) == COMPLEX_TYPE)
23958 {
23959 if (TYPE_MODE (type) == DCmode && align < 64)
23960 return 64;
23961 if ((TYPE_MODE (type) == XCmode
23962 || TYPE_MODE (type) == TCmode) && align < 128)
23963 return 128;
23964 }
23965 else if ((TREE_CODE (type) == RECORD_TYPE
23966 || TREE_CODE (type) == UNION_TYPE
23967 || TREE_CODE (type) == QUAL_UNION_TYPE)
23968 && TYPE_FIELDS (type))
23969 {
23970 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23971 return 64;
23972 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23973 return 128;
23974 }
23975 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23976 || TREE_CODE (type) == INTEGER_TYPE)
23977 {
23978
23979 if (TYPE_MODE (type) == DFmode && align < 64)
23980 return 64;
23981 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23982 return 128;
23983 }
23984 return align;
23985 }
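/* A sketch of the effect: when compiling for x86-64 with SSE enabled and
   optimizing for speed, a local array such as

     void f (void) { char buf[16]; /* ... */ }

   is given 128-bit alignment so the vectorizer can use aligned SSE
   accesses, while a va_list object is excluded and an equivalent array
   keeps its smaller alignment when optimizing for size.  Hypothetical
   example of the rules above.  */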
23986
23987 /* Compute the minimum required alignment for dynamic stack realignment
23988 purposes for a local variable, parameter or a stack slot. EXP is
23989 the data type or decl itself, MODE is its mode and ALIGN is the
23990 alignment that the object would ordinarily have. */
23991
23992 unsigned int
23993 ix86_minimum_alignment (tree exp, enum machine_mode mode,
23994 unsigned int align)
23995 {
23996 tree type, decl;
23997
23998 if (exp && DECL_P (exp))
23999 {
24000 type = TREE_TYPE (exp);
24001 decl = exp;
24002 }
24003 else
24004 {
24005 type = exp;
24006 decl = NULL;
24007 }
24008
24009 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24010 return align;
24011
24012 /* Don't do dynamic stack realignment for long long objects with
24013 -mpreferred-stack-boundary=2. */
24014 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24015 && (!type || !TYPE_USER_ALIGN (type))
24016 && (!decl || !DECL_USER_ALIGN (decl)))
24017 return 32;
24018
24019 return align;
24020 }
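/* For example, with -m32 -mpreferred-stack-boundary=2 a local

     long long x;

   would ordinarily ask for 64-bit alignment; the hook reports a 32-bit
   minimum instead, so the variable alone does not force dynamic stack
   realignment.  An explicit __attribute__ ((aligned (8))) on the type or
   decl is still honored.  Illustrative case only.  */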
24021 \f
24022 /* Find a location for the static chain incoming to a nested function.
24023 This is a register, unless all free registers are used by arguments. */
24024
24025 static rtx
24026 ix86_static_chain (const_tree fndecl, bool incoming_p)
24027 {
24028 unsigned regno;
24029
24030 if (!DECL_STATIC_CHAIN (fndecl))
24031 return NULL;
24032
24033 if (TARGET_64BIT)
24034 {
24035 /* We always use R10 in 64-bit mode. */
24036 regno = R10_REG;
24037 }
24038 else
24039 {
24040 tree fntype;
24041 unsigned int ccvt;
24042
24043 /* By default in 32-bit mode we use ECX to pass the static chain. */
24044 regno = CX_REG;
24045
24046 fntype = TREE_TYPE (fndecl);
24047 ccvt = ix86_get_callcvt (fntype);
24048 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24049 {
24050 /* Fastcall functions use ecx/edx for arguments, which leaves
24051 us with EAX for the static chain.
24052 Thiscall functions use ecx for arguments, which also
24053 leaves us with EAX for the static chain. */
24054 regno = AX_REG;
24055 }
24056 else if (ix86_function_regparm (fntype, fndecl) == 3)
24057 {
24058 /* For regparm 3, we have no free call-clobbered registers in
24059 which to store the static chain. In order to implement this,
24060 we have the trampoline push the static chain to the stack.
24061 However, we can't push a value below the return address when
24062 we call the nested function directly, so we have to use an
24063 alternate entry point. For this we use ESI, and have the
24064 alternate entry point push ESI, so that things appear the
24065 same once we're executing the nested function. */
24066 if (incoming_p)
24067 {
24068 if (fndecl == current_function_decl)
24069 ix86_static_chain_on_stack = true;
24070 return gen_frame_mem (SImode,
24071 plus_constant (arg_pointer_rtx, -8));
24072 }
24073 regno = SI_REG;
24074 }
24075 }
24076
24077 return gen_rtx_REG (Pmode, regno);
24078 }
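/* For example (hypothetical 32-bit nested function with the default ABI):

     void outer (void) { int i; void inner (void) { i++; } inner (); }

   the static chain pointer reaches 'inner' in %ecx; were 'inner' declared
   fastcall or thiscall it would arrive in %eax, and with regparm (3) the
   trampoline pushes it so that it is found 8 bytes below the incoming
   argument pointer, as handled above.  */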
24079
24080 /* Emit RTL insns to initialize the variable parts of a trampoline.
24081 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24082 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24083 to be passed to the target function. */
24084
24085 static void
24086 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24087 {
24088 rtx mem, fnaddr;
24089 int opcode;
24090 int offset = 0;
24091
24092 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24093
24094 if (TARGET_64BIT)
24095 {
24096 int size;
24097
24098 /* Load the function address into r11. Try to load the address using
24099 the shorter movl instead of movabs. We may want to support
24100 movq for kernel mode, but the kernel does not use trampolines at
24101 the moment. */
24102 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24103 {
24104 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24105
24106 mem = adjust_address (m_tramp, HImode, offset);
24107 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24108
24109 mem = adjust_address (m_tramp, SImode, offset + 2);
24110 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24111 offset += 6;
24112 }
24113 else
24114 {
24115 mem = adjust_address (m_tramp, HImode, offset);
24116 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24117
24118 mem = adjust_address (m_tramp, DImode, offset + 2);
24119 emit_move_insn (mem, fnaddr);
24120 offset += 10;
24121 }
24122
24123 /* Load the static chain into r10 using movabs. Use the
24124 shorter movl instead of movabs for x32. */
24125 if (TARGET_X32)
24126 {
24127 opcode = 0xba41;
24128 size = 6;
24129 }
24130 else
24131 {
24132 opcode = 0xba49;
24133 size = 10;
24134 }
24135
24136 mem = adjust_address (m_tramp, HImode, offset);
24137 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24138
24139 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24140 emit_move_insn (mem, chain_value);
24141 offset += size;
24142
24143 /* Jump to r11; the last (unused) byte is a nop, only there to
24144 pad the write out to a single 32-bit store. */
24145 mem = adjust_address (m_tramp, SImode, offset);
24146 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24147 offset += 4;
24148 }
24149 else
24150 {
24151 rtx disp, chain;
24152
24153 /* Depending on the static chain location, either load a register
24154 with a constant, or push the constant to the stack. All of the
24155 instructions are the same size. */
24156 chain = ix86_static_chain (fndecl, true);
24157 if (REG_P (chain))
24158 {
24159 switch (REGNO (chain))
24160 {
24161 case AX_REG:
24162 opcode = 0xb8; break;
24163 case CX_REG:
24164 opcode = 0xb9; break;
24165 default:
24166 gcc_unreachable ();
24167 }
24168 }
24169 else
24170 opcode = 0x68;
24171
24172 mem = adjust_address (m_tramp, QImode, offset);
24173 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24174
24175 mem = adjust_address (m_tramp, SImode, offset + 1);
24176 emit_move_insn (mem, chain_value);
24177 offset += 5;
24178
24179 mem = adjust_address (m_tramp, QImode, offset);
24180 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24181
24182 mem = adjust_address (m_tramp, SImode, offset + 1);
24183
24184 /* Compute the offset from the end of the jmp to the target function.
24185 When the trampoline stores the static chain on the stack, the jmp
24186 must skip the target's first insn, a one-byte push of the
24187 (call-saved) static chain register. */
24188 offset += 5;
24189 disp = expand_binop (SImode, sub_optab, fnaddr,
24190 plus_constant (XEXP (m_tramp, 0),
24191 offset - (MEM_P (chain) ? 1 : 0)),
24192 NULL_RTX, 1, OPTAB_DIRECT);
24193 emit_move_insn (mem, disp);
24194 }
24195
24196 gcc_assert (offset <= TRAMPOLINE_SIZE);
24197
24198 #ifdef HAVE_ENABLE_EXECUTE_STACK
24199 #ifdef CHECK_EXECUTE_STACK_ENABLED
24200 if (CHECK_EXECUTE_STACK_ENABLED)
24201 #endif
24202 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24203 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24204 #endif
24205 }
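/* Decoded for illustration, the 64-bit trampoline written above is
   roughly (when the full movabs forms are needed):

     49 bb <imm64>   movabs $<fnaddr>, %r11
     49 ba <imm64>   movabs $<chain>, %r10
     49 ff e3 90     rex.WB jmp *%r11; nop

   and the 32-bit form is either "mov $<chain>, %ecx/%eax; jmp <fnaddr>"
   or, when the chain lives on the stack, "push $<chain>; jmp <fnaddr>+1"
   so that the one-byte push at the target's entry is skipped.  */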
24206 \f
24207 /* The following file contains several enumerations and data structures
24208 built from the definitions in i386-builtin-types.def. */
24209
24210 #include "i386-builtin-types.inc"
24211
24212 /* Table for the ix86 builtin non-function types. */
24213 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24214
24215 /* Retrieve an element from the above table, building some of
24216 the types lazily. */
24217
24218 static tree
24219 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24220 {
24221 unsigned int index;
24222 tree type, itype;
24223
24224 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24225
24226 type = ix86_builtin_type_tab[(int) tcode];
24227 if (type != NULL)
24228 return type;
24229
24230 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24231 if (tcode <= IX86_BT_LAST_VECT)
24232 {
24233 enum machine_mode mode;
24234
24235 index = tcode - IX86_BT_LAST_PRIM - 1;
24236 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24237 mode = ix86_builtin_type_vect_mode[index];
24238
24239 type = build_vector_type_for_mode (itype, mode);
24240 }
24241 else
24242 {
24243 int quals;
24244
24245 index = tcode - IX86_BT_LAST_VECT - 1;
24246 if (tcode <= IX86_BT_LAST_PTR)
24247 quals = TYPE_UNQUALIFIED;
24248 else
24249 quals = TYPE_QUAL_CONST;
24250
24251 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24252 if (quals != TYPE_UNQUALIFIED)
24253 itype = build_qualified_type (itype, quals);
24254
24255 type = build_pointer_type (itype);
24256 }
24257
24258 ix86_builtin_type_tab[(int) tcode] = type;
24259 return type;
24260 }
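/* For example, a (hypothetical) call

     tree t = ix86_get_builtin_type (IX86_BT_V4SF);

   would, on first use, look up the element type and machine mode recorded
   for that code in the tables generated from i386-builtin-types.def,
   build the vector type with build_vector_type_for_mode, cache it in
   ix86_builtin_type_tab and return it; later calls return the cached
   tree directly.  */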
24261
24262 /* Table for the ix86 builtin function types. */
24263 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24264
24265 /* Retrieve an element from the above table, building some of
24266 the types lazily. */
24267
24268 static tree
24269 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24270 {
24271 tree type;
24272
24273 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24274
24275 type = ix86_builtin_func_type_tab[(int) tcode];
24276 if (type != NULL)
24277 return type;
24278
24279 if (tcode <= IX86_BT_LAST_FUNC)
24280 {
24281 unsigned start = ix86_builtin_func_start[(int) tcode];
24282 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24283 tree rtype, atype, args = void_list_node;
24284 unsigned i;
24285
24286 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24287 for (i = after - 1; i > start; --i)
24288 {
24289 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24290 args = tree_cons (NULL, atype, args);
24291 }
24292
24293 type = build_function_type (rtype, args);
24294 }
24295 else
24296 {
24297 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24298 enum ix86_builtin_func_type icode;
24299
24300 icode = ix86_builtin_func_alias_base[index];
24301 type = ix86_get_builtin_func_type (icode);
24302 }
24303
24304 ix86_builtin_func_type_tab[(int) tcode] = type;
24305 return type;
24306 }
24307
24308
24309 /* Codes for all the SSE/MMX builtins. */
24310 enum ix86_builtins
24311 {
24312 IX86_BUILTIN_ADDPS,
24313 IX86_BUILTIN_ADDSS,
24314 IX86_BUILTIN_DIVPS,
24315 IX86_BUILTIN_DIVSS,
24316 IX86_BUILTIN_MULPS,
24317 IX86_BUILTIN_MULSS,
24318 IX86_BUILTIN_SUBPS,
24319 IX86_BUILTIN_SUBSS,
24320
24321 IX86_BUILTIN_CMPEQPS,
24322 IX86_BUILTIN_CMPLTPS,
24323 IX86_BUILTIN_CMPLEPS,
24324 IX86_BUILTIN_CMPGTPS,
24325 IX86_BUILTIN_CMPGEPS,
24326 IX86_BUILTIN_CMPNEQPS,
24327 IX86_BUILTIN_CMPNLTPS,
24328 IX86_BUILTIN_CMPNLEPS,
24329 IX86_BUILTIN_CMPNGTPS,
24330 IX86_BUILTIN_CMPNGEPS,
24331 IX86_BUILTIN_CMPORDPS,
24332 IX86_BUILTIN_CMPUNORDPS,
24333 IX86_BUILTIN_CMPEQSS,
24334 IX86_BUILTIN_CMPLTSS,
24335 IX86_BUILTIN_CMPLESS,
24336 IX86_BUILTIN_CMPNEQSS,
24337 IX86_BUILTIN_CMPNLTSS,
24338 IX86_BUILTIN_CMPNLESS,
24339 IX86_BUILTIN_CMPNGTSS,
24340 IX86_BUILTIN_CMPNGESS,
24341 IX86_BUILTIN_CMPORDSS,
24342 IX86_BUILTIN_CMPUNORDSS,
24343
24344 IX86_BUILTIN_COMIEQSS,
24345 IX86_BUILTIN_COMILTSS,
24346 IX86_BUILTIN_COMILESS,
24347 IX86_BUILTIN_COMIGTSS,
24348 IX86_BUILTIN_COMIGESS,
24349 IX86_BUILTIN_COMINEQSS,
24350 IX86_BUILTIN_UCOMIEQSS,
24351 IX86_BUILTIN_UCOMILTSS,
24352 IX86_BUILTIN_UCOMILESS,
24353 IX86_BUILTIN_UCOMIGTSS,
24354 IX86_BUILTIN_UCOMIGESS,
24355 IX86_BUILTIN_UCOMINEQSS,
24356
24357 IX86_BUILTIN_CVTPI2PS,
24358 IX86_BUILTIN_CVTPS2PI,
24359 IX86_BUILTIN_CVTSI2SS,
24360 IX86_BUILTIN_CVTSI642SS,
24361 IX86_BUILTIN_CVTSS2SI,
24362 IX86_BUILTIN_CVTSS2SI64,
24363 IX86_BUILTIN_CVTTPS2PI,
24364 IX86_BUILTIN_CVTTSS2SI,
24365 IX86_BUILTIN_CVTTSS2SI64,
24366
24367 IX86_BUILTIN_MAXPS,
24368 IX86_BUILTIN_MAXSS,
24369 IX86_BUILTIN_MINPS,
24370 IX86_BUILTIN_MINSS,
24371
24372 IX86_BUILTIN_LOADUPS,
24373 IX86_BUILTIN_STOREUPS,
24374 IX86_BUILTIN_MOVSS,
24375
24376 IX86_BUILTIN_MOVHLPS,
24377 IX86_BUILTIN_MOVLHPS,
24378 IX86_BUILTIN_LOADHPS,
24379 IX86_BUILTIN_LOADLPS,
24380 IX86_BUILTIN_STOREHPS,
24381 IX86_BUILTIN_STORELPS,
24382
24383 IX86_BUILTIN_MASKMOVQ,
24384 IX86_BUILTIN_MOVMSKPS,
24385 IX86_BUILTIN_PMOVMSKB,
24386
24387 IX86_BUILTIN_MOVNTPS,
24388 IX86_BUILTIN_MOVNTQ,
24389
24390 IX86_BUILTIN_LOADDQU,
24391 IX86_BUILTIN_STOREDQU,
24392
24393 IX86_BUILTIN_PACKSSWB,
24394 IX86_BUILTIN_PACKSSDW,
24395 IX86_BUILTIN_PACKUSWB,
24396
24397 IX86_BUILTIN_PADDB,
24398 IX86_BUILTIN_PADDW,
24399 IX86_BUILTIN_PADDD,
24400 IX86_BUILTIN_PADDQ,
24401 IX86_BUILTIN_PADDSB,
24402 IX86_BUILTIN_PADDSW,
24403 IX86_BUILTIN_PADDUSB,
24404 IX86_BUILTIN_PADDUSW,
24405 IX86_BUILTIN_PSUBB,
24406 IX86_BUILTIN_PSUBW,
24407 IX86_BUILTIN_PSUBD,
24408 IX86_BUILTIN_PSUBQ,
24409 IX86_BUILTIN_PSUBSB,
24410 IX86_BUILTIN_PSUBSW,
24411 IX86_BUILTIN_PSUBUSB,
24412 IX86_BUILTIN_PSUBUSW,
24413
24414 IX86_BUILTIN_PAND,
24415 IX86_BUILTIN_PANDN,
24416 IX86_BUILTIN_POR,
24417 IX86_BUILTIN_PXOR,
24418
24419 IX86_BUILTIN_PAVGB,
24420 IX86_BUILTIN_PAVGW,
24421
24422 IX86_BUILTIN_PCMPEQB,
24423 IX86_BUILTIN_PCMPEQW,
24424 IX86_BUILTIN_PCMPEQD,
24425 IX86_BUILTIN_PCMPGTB,
24426 IX86_BUILTIN_PCMPGTW,
24427 IX86_BUILTIN_PCMPGTD,
24428
24429 IX86_BUILTIN_PMADDWD,
24430
24431 IX86_BUILTIN_PMAXSW,
24432 IX86_BUILTIN_PMAXUB,
24433 IX86_BUILTIN_PMINSW,
24434 IX86_BUILTIN_PMINUB,
24435
24436 IX86_BUILTIN_PMULHUW,
24437 IX86_BUILTIN_PMULHW,
24438 IX86_BUILTIN_PMULLW,
24439
24440 IX86_BUILTIN_PSADBW,
24441 IX86_BUILTIN_PSHUFW,
24442
24443 IX86_BUILTIN_PSLLW,
24444 IX86_BUILTIN_PSLLD,
24445 IX86_BUILTIN_PSLLQ,
24446 IX86_BUILTIN_PSRAW,
24447 IX86_BUILTIN_PSRAD,
24448 IX86_BUILTIN_PSRLW,
24449 IX86_BUILTIN_PSRLD,
24450 IX86_BUILTIN_PSRLQ,
24451 IX86_BUILTIN_PSLLWI,
24452 IX86_BUILTIN_PSLLDI,
24453 IX86_BUILTIN_PSLLQI,
24454 IX86_BUILTIN_PSRAWI,
24455 IX86_BUILTIN_PSRADI,
24456 IX86_BUILTIN_PSRLWI,
24457 IX86_BUILTIN_PSRLDI,
24458 IX86_BUILTIN_PSRLQI,
24459
24460 IX86_BUILTIN_PUNPCKHBW,
24461 IX86_BUILTIN_PUNPCKHWD,
24462 IX86_BUILTIN_PUNPCKHDQ,
24463 IX86_BUILTIN_PUNPCKLBW,
24464 IX86_BUILTIN_PUNPCKLWD,
24465 IX86_BUILTIN_PUNPCKLDQ,
24466
24467 IX86_BUILTIN_SHUFPS,
24468
24469 IX86_BUILTIN_RCPPS,
24470 IX86_BUILTIN_RCPSS,
24471 IX86_BUILTIN_RSQRTPS,
24472 IX86_BUILTIN_RSQRTPS_NR,
24473 IX86_BUILTIN_RSQRTSS,
24474 IX86_BUILTIN_RSQRTF,
24475 IX86_BUILTIN_SQRTPS,
24476 IX86_BUILTIN_SQRTPS_NR,
24477 IX86_BUILTIN_SQRTSS,
24478
24479 IX86_BUILTIN_UNPCKHPS,
24480 IX86_BUILTIN_UNPCKLPS,
24481
24482 IX86_BUILTIN_ANDPS,
24483 IX86_BUILTIN_ANDNPS,
24484 IX86_BUILTIN_ORPS,
24485 IX86_BUILTIN_XORPS,
24486
24487 IX86_BUILTIN_EMMS,
24488 IX86_BUILTIN_LDMXCSR,
24489 IX86_BUILTIN_STMXCSR,
24490 IX86_BUILTIN_SFENCE,
24491
24492 /* 3DNow! Original */
24493 IX86_BUILTIN_FEMMS,
24494 IX86_BUILTIN_PAVGUSB,
24495 IX86_BUILTIN_PF2ID,
24496 IX86_BUILTIN_PFACC,
24497 IX86_BUILTIN_PFADD,
24498 IX86_BUILTIN_PFCMPEQ,
24499 IX86_BUILTIN_PFCMPGE,
24500 IX86_BUILTIN_PFCMPGT,
24501 IX86_BUILTIN_PFMAX,
24502 IX86_BUILTIN_PFMIN,
24503 IX86_BUILTIN_PFMUL,
24504 IX86_BUILTIN_PFRCP,
24505 IX86_BUILTIN_PFRCPIT1,
24506 IX86_BUILTIN_PFRCPIT2,
24507 IX86_BUILTIN_PFRSQIT1,
24508 IX86_BUILTIN_PFRSQRT,
24509 IX86_BUILTIN_PFSUB,
24510 IX86_BUILTIN_PFSUBR,
24511 IX86_BUILTIN_PI2FD,
24512 IX86_BUILTIN_PMULHRW,
24513
24514 /* 3DNow! Athlon Extensions */
24515 IX86_BUILTIN_PF2IW,
24516 IX86_BUILTIN_PFNACC,
24517 IX86_BUILTIN_PFPNACC,
24518 IX86_BUILTIN_PI2FW,
24519 IX86_BUILTIN_PSWAPDSI,
24520 IX86_BUILTIN_PSWAPDSF,
24521
24522 /* SSE2 */
24523 IX86_BUILTIN_ADDPD,
24524 IX86_BUILTIN_ADDSD,
24525 IX86_BUILTIN_DIVPD,
24526 IX86_BUILTIN_DIVSD,
24527 IX86_BUILTIN_MULPD,
24528 IX86_BUILTIN_MULSD,
24529 IX86_BUILTIN_SUBPD,
24530 IX86_BUILTIN_SUBSD,
24531
24532 IX86_BUILTIN_CMPEQPD,
24533 IX86_BUILTIN_CMPLTPD,
24534 IX86_BUILTIN_CMPLEPD,
24535 IX86_BUILTIN_CMPGTPD,
24536 IX86_BUILTIN_CMPGEPD,
24537 IX86_BUILTIN_CMPNEQPD,
24538 IX86_BUILTIN_CMPNLTPD,
24539 IX86_BUILTIN_CMPNLEPD,
24540 IX86_BUILTIN_CMPNGTPD,
24541 IX86_BUILTIN_CMPNGEPD,
24542 IX86_BUILTIN_CMPORDPD,
24543 IX86_BUILTIN_CMPUNORDPD,
24544 IX86_BUILTIN_CMPEQSD,
24545 IX86_BUILTIN_CMPLTSD,
24546 IX86_BUILTIN_CMPLESD,
24547 IX86_BUILTIN_CMPNEQSD,
24548 IX86_BUILTIN_CMPNLTSD,
24549 IX86_BUILTIN_CMPNLESD,
24550 IX86_BUILTIN_CMPORDSD,
24551 IX86_BUILTIN_CMPUNORDSD,
24552
24553 IX86_BUILTIN_COMIEQSD,
24554 IX86_BUILTIN_COMILTSD,
24555 IX86_BUILTIN_COMILESD,
24556 IX86_BUILTIN_COMIGTSD,
24557 IX86_BUILTIN_COMIGESD,
24558 IX86_BUILTIN_COMINEQSD,
24559 IX86_BUILTIN_UCOMIEQSD,
24560 IX86_BUILTIN_UCOMILTSD,
24561 IX86_BUILTIN_UCOMILESD,
24562 IX86_BUILTIN_UCOMIGTSD,
24563 IX86_BUILTIN_UCOMIGESD,
24564 IX86_BUILTIN_UCOMINEQSD,
24565
24566 IX86_BUILTIN_MAXPD,
24567 IX86_BUILTIN_MAXSD,
24568 IX86_BUILTIN_MINPD,
24569 IX86_BUILTIN_MINSD,
24570
24571 IX86_BUILTIN_ANDPD,
24572 IX86_BUILTIN_ANDNPD,
24573 IX86_BUILTIN_ORPD,
24574 IX86_BUILTIN_XORPD,
24575
24576 IX86_BUILTIN_SQRTPD,
24577 IX86_BUILTIN_SQRTSD,
24578
24579 IX86_BUILTIN_UNPCKHPD,
24580 IX86_BUILTIN_UNPCKLPD,
24581
24582 IX86_BUILTIN_SHUFPD,
24583
24584 IX86_BUILTIN_LOADUPD,
24585 IX86_BUILTIN_STOREUPD,
24586 IX86_BUILTIN_MOVSD,
24587
24588 IX86_BUILTIN_LOADHPD,
24589 IX86_BUILTIN_LOADLPD,
24590
24591 IX86_BUILTIN_CVTDQ2PD,
24592 IX86_BUILTIN_CVTDQ2PS,
24593
24594 IX86_BUILTIN_CVTPD2DQ,
24595 IX86_BUILTIN_CVTPD2PI,
24596 IX86_BUILTIN_CVTPD2PS,
24597 IX86_BUILTIN_CVTTPD2DQ,
24598 IX86_BUILTIN_CVTTPD2PI,
24599
24600 IX86_BUILTIN_CVTPI2PD,
24601 IX86_BUILTIN_CVTSI2SD,
24602 IX86_BUILTIN_CVTSI642SD,
24603
24604 IX86_BUILTIN_CVTSD2SI,
24605 IX86_BUILTIN_CVTSD2SI64,
24606 IX86_BUILTIN_CVTSD2SS,
24607 IX86_BUILTIN_CVTSS2SD,
24608 IX86_BUILTIN_CVTTSD2SI,
24609 IX86_BUILTIN_CVTTSD2SI64,
24610
24611 IX86_BUILTIN_CVTPS2DQ,
24612 IX86_BUILTIN_CVTPS2PD,
24613 IX86_BUILTIN_CVTTPS2DQ,
24614
24615 IX86_BUILTIN_MOVNTI,
24616 IX86_BUILTIN_MOVNTPD,
24617 IX86_BUILTIN_MOVNTDQ,
24618
24619 IX86_BUILTIN_MOVQ128,
24620
24621 /* SSE2 MMX */
24622 IX86_BUILTIN_MASKMOVDQU,
24623 IX86_BUILTIN_MOVMSKPD,
24624 IX86_BUILTIN_PMOVMSKB128,
24625
24626 IX86_BUILTIN_PACKSSWB128,
24627 IX86_BUILTIN_PACKSSDW128,
24628 IX86_BUILTIN_PACKUSWB128,
24629
24630 IX86_BUILTIN_PADDB128,
24631 IX86_BUILTIN_PADDW128,
24632 IX86_BUILTIN_PADDD128,
24633 IX86_BUILTIN_PADDQ128,
24634 IX86_BUILTIN_PADDSB128,
24635 IX86_BUILTIN_PADDSW128,
24636 IX86_BUILTIN_PADDUSB128,
24637 IX86_BUILTIN_PADDUSW128,
24638 IX86_BUILTIN_PSUBB128,
24639 IX86_BUILTIN_PSUBW128,
24640 IX86_BUILTIN_PSUBD128,
24641 IX86_BUILTIN_PSUBQ128,
24642 IX86_BUILTIN_PSUBSB128,
24643 IX86_BUILTIN_PSUBSW128,
24644 IX86_BUILTIN_PSUBUSB128,
24645 IX86_BUILTIN_PSUBUSW128,
24646
24647 IX86_BUILTIN_PAND128,
24648 IX86_BUILTIN_PANDN128,
24649 IX86_BUILTIN_POR128,
24650 IX86_BUILTIN_PXOR128,
24651
24652 IX86_BUILTIN_PAVGB128,
24653 IX86_BUILTIN_PAVGW128,
24654
24655 IX86_BUILTIN_PCMPEQB128,
24656 IX86_BUILTIN_PCMPEQW128,
24657 IX86_BUILTIN_PCMPEQD128,
24658 IX86_BUILTIN_PCMPGTB128,
24659 IX86_BUILTIN_PCMPGTW128,
24660 IX86_BUILTIN_PCMPGTD128,
24661
24662 IX86_BUILTIN_PMADDWD128,
24663
24664 IX86_BUILTIN_PMAXSW128,
24665 IX86_BUILTIN_PMAXUB128,
24666 IX86_BUILTIN_PMINSW128,
24667 IX86_BUILTIN_PMINUB128,
24668
24669 IX86_BUILTIN_PMULUDQ,
24670 IX86_BUILTIN_PMULUDQ128,
24671 IX86_BUILTIN_PMULHUW128,
24672 IX86_BUILTIN_PMULHW128,
24673 IX86_BUILTIN_PMULLW128,
24674
24675 IX86_BUILTIN_PSADBW128,
24676 IX86_BUILTIN_PSHUFHW,
24677 IX86_BUILTIN_PSHUFLW,
24678 IX86_BUILTIN_PSHUFD,
24679
24680 IX86_BUILTIN_PSLLDQI128,
24681 IX86_BUILTIN_PSLLWI128,
24682 IX86_BUILTIN_PSLLDI128,
24683 IX86_BUILTIN_PSLLQI128,
24684 IX86_BUILTIN_PSRAWI128,
24685 IX86_BUILTIN_PSRADI128,
24686 IX86_BUILTIN_PSRLDQI128,
24687 IX86_BUILTIN_PSRLWI128,
24688 IX86_BUILTIN_PSRLDI128,
24689 IX86_BUILTIN_PSRLQI128,
24690
24691 IX86_BUILTIN_PSLLDQ128,
24692 IX86_BUILTIN_PSLLW128,
24693 IX86_BUILTIN_PSLLD128,
24694 IX86_BUILTIN_PSLLQ128,
24695 IX86_BUILTIN_PSRAW128,
24696 IX86_BUILTIN_PSRAD128,
24697 IX86_BUILTIN_PSRLW128,
24698 IX86_BUILTIN_PSRLD128,
24699 IX86_BUILTIN_PSRLQ128,
24700
24701 IX86_BUILTIN_PUNPCKHBW128,
24702 IX86_BUILTIN_PUNPCKHWD128,
24703 IX86_BUILTIN_PUNPCKHDQ128,
24704 IX86_BUILTIN_PUNPCKHQDQ128,
24705 IX86_BUILTIN_PUNPCKLBW128,
24706 IX86_BUILTIN_PUNPCKLWD128,
24707 IX86_BUILTIN_PUNPCKLDQ128,
24708 IX86_BUILTIN_PUNPCKLQDQ128,
24709
24710 IX86_BUILTIN_CLFLUSH,
24711 IX86_BUILTIN_MFENCE,
24712 IX86_BUILTIN_LFENCE,
24713 IX86_BUILTIN_PAUSE,
24714
24715 IX86_BUILTIN_BSRSI,
24716 IX86_BUILTIN_BSRDI,
24717 IX86_BUILTIN_RDPMC,
24718 IX86_BUILTIN_RDTSC,
24719 IX86_BUILTIN_RDTSCP,
24720 IX86_BUILTIN_ROLQI,
24721 IX86_BUILTIN_ROLHI,
24722 IX86_BUILTIN_RORQI,
24723 IX86_BUILTIN_RORHI,
24724
24725 /* SSE3. */
24726 IX86_BUILTIN_ADDSUBPS,
24727 IX86_BUILTIN_HADDPS,
24728 IX86_BUILTIN_HSUBPS,
24729 IX86_BUILTIN_MOVSHDUP,
24730 IX86_BUILTIN_MOVSLDUP,
24731 IX86_BUILTIN_ADDSUBPD,
24732 IX86_BUILTIN_HADDPD,
24733 IX86_BUILTIN_HSUBPD,
24734 IX86_BUILTIN_LDDQU,
24735
24736 IX86_BUILTIN_MONITOR,
24737 IX86_BUILTIN_MWAIT,
24738
24739 /* SSSE3. */
24740 IX86_BUILTIN_PHADDW,
24741 IX86_BUILTIN_PHADDD,
24742 IX86_BUILTIN_PHADDSW,
24743 IX86_BUILTIN_PHSUBW,
24744 IX86_BUILTIN_PHSUBD,
24745 IX86_BUILTIN_PHSUBSW,
24746 IX86_BUILTIN_PMADDUBSW,
24747 IX86_BUILTIN_PMULHRSW,
24748 IX86_BUILTIN_PSHUFB,
24749 IX86_BUILTIN_PSIGNB,
24750 IX86_BUILTIN_PSIGNW,
24751 IX86_BUILTIN_PSIGND,
24752 IX86_BUILTIN_PALIGNR,
24753 IX86_BUILTIN_PABSB,
24754 IX86_BUILTIN_PABSW,
24755 IX86_BUILTIN_PABSD,
24756
24757 IX86_BUILTIN_PHADDW128,
24758 IX86_BUILTIN_PHADDD128,
24759 IX86_BUILTIN_PHADDSW128,
24760 IX86_BUILTIN_PHSUBW128,
24761 IX86_BUILTIN_PHSUBD128,
24762 IX86_BUILTIN_PHSUBSW128,
24763 IX86_BUILTIN_PMADDUBSW128,
24764 IX86_BUILTIN_PMULHRSW128,
24765 IX86_BUILTIN_PSHUFB128,
24766 IX86_BUILTIN_PSIGNB128,
24767 IX86_BUILTIN_PSIGNW128,
24768 IX86_BUILTIN_PSIGND128,
24769 IX86_BUILTIN_PALIGNR128,
24770 IX86_BUILTIN_PABSB128,
24771 IX86_BUILTIN_PABSW128,
24772 IX86_BUILTIN_PABSD128,
24773
24774 /* AMDFAM10 - SSE4A New Instructions. */
24775 IX86_BUILTIN_MOVNTSD,
24776 IX86_BUILTIN_MOVNTSS,
24777 IX86_BUILTIN_EXTRQI,
24778 IX86_BUILTIN_EXTRQ,
24779 IX86_BUILTIN_INSERTQI,
24780 IX86_BUILTIN_INSERTQ,
24781
24782 /* SSE4.1. */
24783 IX86_BUILTIN_BLENDPD,
24784 IX86_BUILTIN_BLENDPS,
24785 IX86_BUILTIN_BLENDVPD,
24786 IX86_BUILTIN_BLENDVPS,
24787 IX86_BUILTIN_PBLENDVB128,
24788 IX86_BUILTIN_PBLENDW128,
24789
24790 IX86_BUILTIN_DPPD,
24791 IX86_BUILTIN_DPPS,
24792
24793 IX86_BUILTIN_INSERTPS128,
24794
24795 IX86_BUILTIN_MOVNTDQA,
24796 IX86_BUILTIN_MPSADBW128,
24797 IX86_BUILTIN_PACKUSDW128,
24798 IX86_BUILTIN_PCMPEQQ,
24799 IX86_BUILTIN_PHMINPOSUW128,
24800
24801 IX86_BUILTIN_PMAXSB128,
24802 IX86_BUILTIN_PMAXSD128,
24803 IX86_BUILTIN_PMAXUD128,
24804 IX86_BUILTIN_PMAXUW128,
24805
24806 IX86_BUILTIN_PMINSB128,
24807 IX86_BUILTIN_PMINSD128,
24808 IX86_BUILTIN_PMINUD128,
24809 IX86_BUILTIN_PMINUW128,
24810
24811 IX86_BUILTIN_PMOVSXBW128,
24812 IX86_BUILTIN_PMOVSXBD128,
24813 IX86_BUILTIN_PMOVSXBQ128,
24814 IX86_BUILTIN_PMOVSXWD128,
24815 IX86_BUILTIN_PMOVSXWQ128,
24816 IX86_BUILTIN_PMOVSXDQ128,
24817
24818 IX86_BUILTIN_PMOVZXBW128,
24819 IX86_BUILTIN_PMOVZXBD128,
24820 IX86_BUILTIN_PMOVZXBQ128,
24821 IX86_BUILTIN_PMOVZXWD128,
24822 IX86_BUILTIN_PMOVZXWQ128,
24823 IX86_BUILTIN_PMOVZXDQ128,
24824
24825 IX86_BUILTIN_PMULDQ128,
24826 IX86_BUILTIN_PMULLD128,
24827
24828 IX86_BUILTIN_ROUNDPD,
24829 IX86_BUILTIN_ROUNDPS,
24830 IX86_BUILTIN_ROUNDSD,
24831 IX86_BUILTIN_ROUNDSS,
24832
24833 IX86_BUILTIN_FLOORPD,
24834 IX86_BUILTIN_CEILPD,
24835 IX86_BUILTIN_TRUNCPD,
24836 IX86_BUILTIN_RINTPD,
24837 IX86_BUILTIN_ROUNDPD_AZ,
24838 IX86_BUILTIN_FLOORPS,
24839 IX86_BUILTIN_CEILPS,
24840 IX86_BUILTIN_TRUNCPS,
24841 IX86_BUILTIN_RINTPS,
24842 IX86_BUILTIN_ROUNDPS_AZ,
24843
24844 IX86_BUILTIN_PTESTZ,
24845 IX86_BUILTIN_PTESTC,
24846 IX86_BUILTIN_PTESTNZC,
24847
24848 IX86_BUILTIN_VEC_INIT_V2SI,
24849 IX86_BUILTIN_VEC_INIT_V4HI,
24850 IX86_BUILTIN_VEC_INIT_V8QI,
24851 IX86_BUILTIN_VEC_EXT_V2DF,
24852 IX86_BUILTIN_VEC_EXT_V2DI,
24853 IX86_BUILTIN_VEC_EXT_V4SF,
24854 IX86_BUILTIN_VEC_EXT_V4SI,
24855 IX86_BUILTIN_VEC_EXT_V8HI,
24856 IX86_BUILTIN_VEC_EXT_V2SI,
24857 IX86_BUILTIN_VEC_EXT_V4HI,
24858 IX86_BUILTIN_VEC_EXT_V16QI,
24859 IX86_BUILTIN_VEC_SET_V2DI,
24860 IX86_BUILTIN_VEC_SET_V4SF,
24861 IX86_BUILTIN_VEC_SET_V4SI,
24862 IX86_BUILTIN_VEC_SET_V8HI,
24863 IX86_BUILTIN_VEC_SET_V4HI,
24864 IX86_BUILTIN_VEC_SET_V16QI,
24865
24866 IX86_BUILTIN_VEC_PACK_SFIX,
24867 IX86_BUILTIN_VEC_PACK_SFIX256,
24868
24869 /* SSE4.2. */
24870 IX86_BUILTIN_CRC32QI,
24871 IX86_BUILTIN_CRC32HI,
24872 IX86_BUILTIN_CRC32SI,
24873 IX86_BUILTIN_CRC32DI,
24874
24875 IX86_BUILTIN_PCMPESTRI128,
24876 IX86_BUILTIN_PCMPESTRM128,
24877 IX86_BUILTIN_PCMPESTRA128,
24878 IX86_BUILTIN_PCMPESTRC128,
24879 IX86_BUILTIN_PCMPESTRO128,
24880 IX86_BUILTIN_PCMPESTRS128,
24881 IX86_BUILTIN_PCMPESTRZ128,
24882 IX86_BUILTIN_PCMPISTRI128,
24883 IX86_BUILTIN_PCMPISTRM128,
24884 IX86_BUILTIN_PCMPISTRA128,
24885 IX86_BUILTIN_PCMPISTRC128,
24886 IX86_BUILTIN_PCMPISTRO128,
24887 IX86_BUILTIN_PCMPISTRS128,
24888 IX86_BUILTIN_PCMPISTRZ128,
24889
24890 IX86_BUILTIN_PCMPGTQ,
24891
24892 /* AES instructions */
24893 IX86_BUILTIN_AESENC128,
24894 IX86_BUILTIN_AESENCLAST128,
24895 IX86_BUILTIN_AESDEC128,
24896 IX86_BUILTIN_AESDECLAST128,
24897 IX86_BUILTIN_AESIMC128,
24898 IX86_BUILTIN_AESKEYGENASSIST128,
24899
24900 /* PCLMUL instruction */
24901 IX86_BUILTIN_PCLMULQDQ128,
24902
24903 /* AVX */
24904 IX86_BUILTIN_ADDPD256,
24905 IX86_BUILTIN_ADDPS256,
24906 IX86_BUILTIN_ADDSUBPD256,
24907 IX86_BUILTIN_ADDSUBPS256,
24908 IX86_BUILTIN_ANDPD256,
24909 IX86_BUILTIN_ANDPS256,
24910 IX86_BUILTIN_ANDNPD256,
24911 IX86_BUILTIN_ANDNPS256,
24912 IX86_BUILTIN_BLENDPD256,
24913 IX86_BUILTIN_BLENDPS256,
24914 IX86_BUILTIN_BLENDVPD256,
24915 IX86_BUILTIN_BLENDVPS256,
24916 IX86_BUILTIN_DIVPD256,
24917 IX86_BUILTIN_DIVPS256,
24918 IX86_BUILTIN_DPPS256,
24919 IX86_BUILTIN_HADDPD256,
24920 IX86_BUILTIN_HADDPS256,
24921 IX86_BUILTIN_HSUBPD256,
24922 IX86_BUILTIN_HSUBPS256,
24923 IX86_BUILTIN_MAXPD256,
24924 IX86_BUILTIN_MAXPS256,
24925 IX86_BUILTIN_MINPD256,
24926 IX86_BUILTIN_MINPS256,
24927 IX86_BUILTIN_MULPD256,
24928 IX86_BUILTIN_MULPS256,
24929 IX86_BUILTIN_ORPD256,
24930 IX86_BUILTIN_ORPS256,
24931 IX86_BUILTIN_SHUFPD256,
24932 IX86_BUILTIN_SHUFPS256,
24933 IX86_BUILTIN_SUBPD256,
24934 IX86_BUILTIN_SUBPS256,
24935 IX86_BUILTIN_XORPD256,
24936 IX86_BUILTIN_XORPS256,
24937 IX86_BUILTIN_CMPSD,
24938 IX86_BUILTIN_CMPSS,
24939 IX86_BUILTIN_CMPPD,
24940 IX86_BUILTIN_CMPPS,
24941 IX86_BUILTIN_CMPPD256,
24942 IX86_BUILTIN_CMPPS256,
24943 IX86_BUILTIN_CVTDQ2PD256,
24944 IX86_BUILTIN_CVTDQ2PS256,
24945 IX86_BUILTIN_CVTPD2PS256,
24946 IX86_BUILTIN_CVTPS2DQ256,
24947 IX86_BUILTIN_CVTPS2PD256,
24948 IX86_BUILTIN_CVTTPD2DQ256,
24949 IX86_BUILTIN_CVTPD2DQ256,
24950 IX86_BUILTIN_CVTTPS2DQ256,
24951 IX86_BUILTIN_EXTRACTF128PD256,
24952 IX86_BUILTIN_EXTRACTF128PS256,
24953 IX86_BUILTIN_EXTRACTF128SI256,
24954 IX86_BUILTIN_VZEROALL,
24955 IX86_BUILTIN_VZEROUPPER,
24956 IX86_BUILTIN_VPERMILVARPD,
24957 IX86_BUILTIN_VPERMILVARPS,
24958 IX86_BUILTIN_VPERMILVARPD256,
24959 IX86_BUILTIN_VPERMILVARPS256,
24960 IX86_BUILTIN_VPERMILPD,
24961 IX86_BUILTIN_VPERMILPS,
24962 IX86_BUILTIN_VPERMILPD256,
24963 IX86_BUILTIN_VPERMILPS256,
24964 IX86_BUILTIN_VPERMIL2PD,
24965 IX86_BUILTIN_VPERMIL2PS,
24966 IX86_BUILTIN_VPERMIL2PD256,
24967 IX86_BUILTIN_VPERMIL2PS256,
24968 IX86_BUILTIN_VPERM2F128PD256,
24969 IX86_BUILTIN_VPERM2F128PS256,
24970 IX86_BUILTIN_VPERM2F128SI256,
24971 IX86_BUILTIN_VBROADCASTSS,
24972 IX86_BUILTIN_VBROADCASTSD256,
24973 IX86_BUILTIN_VBROADCASTSS256,
24974 IX86_BUILTIN_VBROADCASTPD256,
24975 IX86_BUILTIN_VBROADCASTPS256,
24976 IX86_BUILTIN_VINSERTF128PD256,
24977 IX86_BUILTIN_VINSERTF128PS256,
24978 IX86_BUILTIN_VINSERTF128SI256,
24979 IX86_BUILTIN_LOADUPD256,
24980 IX86_BUILTIN_LOADUPS256,
24981 IX86_BUILTIN_STOREUPD256,
24982 IX86_BUILTIN_STOREUPS256,
24983 IX86_BUILTIN_LDDQU256,
24984 IX86_BUILTIN_MOVNTDQ256,
24985 IX86_BUILTIN_MOVNTPD256,
24986 IX86_BUILTIN_MOVNTPS256,
24987 IX86_BUILTIN_LOADDQU256,
24988 IX86_BUILTIN_STOREDQU256,
24989 IX86_BUILTIN_MASKLOADPD,
24990 IX86_BUILTIN_MASKLOADPS,
24991 IX86_BUILTIN_MASKSTOREPD,
24992 IX86_BUILTIN_MASKSTOREPS,
24993 IX86_BUILTIN_MASKLOADPD256,
24994 IX86_BUILTIN_MASKLOADPS256,
24995 IX86_BUILTIN_MASKSTOREPD256,
24996 IX86_BUILTIN_MASKSTOREPS256,
24997 IX86_BUILTIN_MOVSHDUP256,
24998 IX86_BUILTIN_MOVSLDUP256,
24999 IX86_BUILTIN_MOVDDUP256,
25000
25001 IX86_BUILTIN_SQRTPD256,
25002 IX86_BUILTIN_SQRTPS256,
25003 IX86_BUILTIN_SQRTPS_NR256,
25004 IX86_BUILTIN_RSQRTPS256,
25005 IX86_BUILTIN_RSQRTPS_NR256,
25006
25007 IX86_BUILTIN_RCPPS256,
25008
25009 IX86_BUILTIN_ROUNDPD256,
25010 IX86_BUILTIN_ROUNDPS256,
25011
25012 IX86_BUILTIN_FLOORPD256,
25013 IX86_BUILTIN_CEILPD256,
25014 IX86_BUILTIN_TRUNCPD256,
25015 IX86_BUILTIN_RINTPD256,
25016 IX86_BUILTIN_ROUNDPD_AZ256,
25017 IX86_BUILTIN_FLOORPS256,
25018 IX86_BUILTIN_CEILPS256,
25019 IX86_BUILTIN_TRUNCPS256,
25020 IX86_BUILTIN_RINTPS256,
25021 IX86_BUILTIN_ROUNDPS_AZ256,
25022
25023 IX86_BUILTIN_UNPCKHPD256,
25024 IX86_BUILTIN_UNPCKLPD256,
25025 IX86_BUILTIN_UNPCKHPS256,
25026 IX86_BUILTIN_UNPCKLPS256,
25027
25028 IX86_BUILTIN_SI256_SI,
25029 IX86_BUILTIN_PS256_PS,
25030 IX86_BUILTIN_PD256_PD,
25031 IX86_BUILTIN_SI_SI256,
25032 IX86_BUILTIN_PS_PS256,
25033 IX86_BUILTIN_PD_PD256,
25034
25035 IX86_BUILTIN_VTESTZPD,
25036 IX86_BUILTIN_VTESTCPD,
25037 IX86_BUILTIN_VTESTNZCPD,
25038 IX86_BUILTIN_VTESTZPS,
25039 IX86_BUILTIN_VTESTCPS,
25040 IX86_BUILTIN_VTESTNZCPS,
25041 IX86_BUILTIN_VTESTZPD256,
25042 IX86_BUILTIN_VTESTCPD256,
25043 IX86_BUILTIN_VTESTNZCPD256,
25044 IX86_BUILTIN_VTESTZPS256,
25045 IX86_BUILTIN_VTESTCPS256,
25046 IX86_BUILTIN_VTESTNZCPS256,
25047 IX86_BUILTIN_PTESTZ256,
25048 IX86_BUILTIN_PTESTC256,
25049 IX86_BUILTIN_PTESTNZC256,
25050
25051 IX86_BUILTIN_MOVMSKPD256,
25052 IX86_BUILTIN_MOVMSKPS256,
25053
25054 /* AVX2 */
25055 IX86_BUILTIN_MPSADBW256,
25056 IX86_BUILTIN_PABSB256,
25057 IX86_BUILTIN_PABSW256,
25058 IX86_BUILTIN_PABSD256,
25059 IX86_BUILTIN_PACKSSDW256,
25060 IX86_BUILTIN_PACKSSWB256,
25061 IX86_BUILTIN_PACKUSDW256,
25062 IX86_BUILTIN_PACKUSWB256,
25063 IX86_BUILTIN_PADDB256,
25064 IX86_BUILTIN_PADDW256,
25065 IX86_BUILTIN_PADDD256,
25066 IX86_BUILTIN_PADDQ256,
25067 IX86_BUILTIN_PADDSB256,
25068 IX86_BUILTIN_PADDSW256,
25069 IX86_BUILTIN_PADDUSB256,
25070 IX86_BUILTIN_PADDUSW256,
25071 IX86_BUILTIN_PALIGNR256,
25072 IX86_BUILTIN_AND256I,
25073 IX86_BUILTIN_ANDNOT256I,
25074 IX86_BUILTIN_PAVGB256,
25075 IX86_BUILTIN_PAVGW256,
25076 IX86_BUILTIN_PBLENDVB256,
25077 IX86_BUILTIN_PBLENDVW256,
25078 IX86_BUILTIN_PCMPEQB256,
25079 IX86_BUILTIN_PCMPEQW256,
25080 IX86_BUILTIN_PCMPEQD256,
25081 IX86_BUILTIN_PCMPEQQ256,
25082 IX86_BUILTIN_PCMPGTB256,
25083 IX86_BUILTIN_PCMPGTW256,
25084 IX86_BUILTIN_PCMPGTD256,
25085 IX86_BUILTIN_PCMPGTQ256,
25086 IX86_BUILTIN_PHADDW256,
25087 IX86_BUILTIN_PHADDD256,
25088 IX86_BUILTIN_PHADDSW256,
25089 IX86_BUILTIN_PHSUBW256,
25090 IX86_BUILTIN_PHSUBD256,
25091 IX86_BUILTIN_PHSUBSW256,
25092 IX86_BUILTIN_PMADDUBSW256,
25093 IX86_BUILTIN_PMADDWD256,
25094 IX86_BUILTIN_PMAXSB256,
25095 IX86_BUILTIN_PMAXSW256,
25096 IX86_BUILTIN_PMAXSD256,
25097 IX86_BUILTIN_PMAXUB256,
25098 IX86_BUILTIN_PMAXUW256,
25099 IX86_BUILTIN_PMAXUD256,
25100 IX86_BUILTIN_PMINSB256,
25101 IX86_BUILTIN_PMINSW256,
25102 IX86_BUILTIN_PMINSD256,
25103 IX86_BUILTIN_PMINUB256,
25104 IX86_BUILTIN_PMINUW256,
25105 IX86_BUILTIN_PMINUD256,
25106 IX86_BUILTIN_PMOVMSKB256,
25107 IX86_BUILTIN_PMOVSXBW256,
25108 IX86_BUILTIN_PMOVSXBD256,
25109 IX86_BUILTIN_PMOVSXBQ256,
25110 IX86_BUILTIN_PMOVSXWD256,
25111 IX86_BUILTIN_PMOVSXWQ256,
25112 IX86_BUILTIN_PMOVSXDQ256,
25113 IX86_BUILTIN_PMOVZXBW256,
25114 IX86_BUILTIN_PMOVZXBD256,
25115 IX86_BUILTIN_PMOVZXBQ256,
25116 IX86_BUILTIN_PMOVZXWD256,
25117 IX86_BUILTIN_PMOVZXWQ256,
25118 IX86_BUILTIN_PMOVZXDQ256,
25119 IX86_BUILTIN_PMULDQ256,
25120 IX86_BUILTIN_PMULHRSW256,
25121 IX86_BUILTIN_PMULHUW256,
25122 IX86_BUILTIN_PMULHW256,
25123 IX86_BUILTIN_PMULLW256,
25124 IX86_BUILTIN_PMULLD256,
25125 IX86_BUILTIN_PMULUDQ256,
25126 IX86_BUILTIN_POR256,
25127 IX86_BUILTIN_PSADBW256,
25128 IX86_BUILTIN_PSHUFB256,
25129 IX86_BUILTIN_PSHUFD256,
25130 IX86_BUILTIN_PSHUFHW256,
25131 IX86_BUILTIN_PSHUFLW256,
25132 IX86_BUILTIN_PSIGNB256,
25133 IX86_BUILTIN_PSIGNW256,
25134 IX86_BUILTIN_PSIGND256,
25135 IX86_BUILTIN_PSLLDQI256,
25136 IX86_BUILTIN_PSLLWI256,
25137 IX86_BUILTIN_PSLLW256,
25138 IX86_BUILTIN_PSLLDI256,
25139 IX86_BUILTIN_PSLLD256,
25140 IX86_BUILTIN_PSLLQI256,
25141 IX86_BUILTIN_PSLLQ256,
25142 IX86_BUILTIN_PSRAWI256,
25143 IX86_BUILTIN_PSRAW256,
25144 IX86_BUILTIN_PSRADI256,
25145 IX86_BUILTIN_PSRAD256,
25146 IX86_BUILTIN_PSRLDQI256,
25147 IX86_BUILTIN_PSRLWI256,
25148 IX86_BUILTIN_PSRLW256,
25149 IX86_BUILTIN_PSRLDI256,
25150 IX86_BUILTIN_PSRLD256,
25151 IX86_BUILTIN_PSRLQI256,
25152 IX86_BUILTIN_PSRLQ256,
25153 IX86_BUILTIN_PSUBB256,
25154 IX86_BUILTIN_PSUBW256,
25155 IX86_BUILTIN_PSUBD256,
25156 IX86_BUILTIN_PSUBQ256,
25157 IX86_BUILTIN_PSUBSB256,
25158 IX86_BUILTIN_PSUBSW256,
25159 IX86_BUILTIN_PSUBUSB256,
25160 IX86_BUILTIN_PSUBUSW256,
25161 IX86_BUILTIN_PUNPCKHBW256,
25162 IX86_BUILTIN_PUNPCKHWD256,
25163 IX86_BUILTIN_PUNPCKHDQ256,
25164 IX86_BUILTIN_PUNPCKHQDQ256,
25165 IX86_BUILTIN_PUNPCKLBW256,
25166 IX86_BUILTIN_PUNPCKLWD256,
25167 IX86_BUILTIN_PUNPCKLDQ256,
25168 IX86_BUILTIN_PUNPCKLQDQ256,
25169 IX86_BUILTIN_PXOR256,
25170 IX86_BUILTIN_MOVNTDQA256,
25171 IX86_BUILTIN_VBROADCASTSS_PS,
25172 IX86_BUILTIN_VBROADCASTSS_PS256,
25173 IX86_BUILTIN_VBROADCASTSD_PD256,
25174 IX86_BUILTIN_VBROADCASTSI256,
25175 IX86_BUILTIN_PBLENDD256,
25176 IX86_BUILTIN_PBLENDD128,
25177 IX86_BUILTIN_PBROADCASTB256,
25178 IX86_BUILTIN_PBROADCASTW256,
25179 IX86_BUILTIN_PBROADCASTD256,
25180 IX86_BUILTIN_PBROADCASTQ256,
25181 IX86_BUILTIN_PBROADCASTB128,
25182 IX86_BUILTIN_PBROADCASTW128,
25183 IX86_BUILTIN_PBROADCASTD128,
25184 IX86_BUILTIN_PBROADCASTQ128,
25185 IX86_BUILTIN_VPERMVARSI256,
25186 IX86_BUILTIN_VPERMDF256,
25187 IX86_BUILTIN_VPERMVARSF256,
25188 IX86_BUILTIN_VPERMDI256,
25189 IX86_BUILTIN_VPERMTI256,
25190 IX86_BUILTIN_VEXTRACT128I256,
25191 IX86_BUILTIN_VINSERT128I256,
25192 IX86_BUILTIN_MASKLOADD,
25193 IX86_BUILTIN_MASKLOADQ,
25194 IX86_BUILTIN_MASKLOADD256,
25195 IX86_BUILTIN_MASKLOADQ256,
25196 IX86_BUILTIN_MASKSTORED,
25197 IX86_BUILTIN_MASKSTOREQ,
25198 IX86_BUILTIN_MASKSTORED256,
25199 IX86_BUILTIN_MASKSTOREQ256,
25200 IX86_BUILTIN_PSLLVV4DI,
25201 IX86_BUILTIN_PSLLVV2DI,
25202 IX86_BUILTIN_PSLLVV8SI,
25203 IX86_BUILTIN_PSLLVV4SI,
25204 IX86_BUILTIN_PSRAVV8SI,
25205 IX86_BUILTIN_PSRAVV4SI,
25206 IX86_BUILTIN_PSRLVV4DI,
25207 IX86_BUILTIN_PSRLVV2DI,
25208 IX86_BUILTIN_PSRLVV8SI,
25209 IX86_BUILTIN_PSRLVV4SI,
25210
25211 IX86_BUILTIN_GATHERSIV2DF,
25212 IX86_BUILTIN_GATHERSIV4DF,
25213 IX86_BUILTIN_GATHERDIV2DF,
25214 IX86_BUILTIN_GATHERDIV4DF,
25215 IX86_BUILTIN_GATHERSIV4SF,
25216 IX86_BUILTIN_GATHERSIV8SF,
25217 IX86_BUILTIN_GATHERDIV4SF,
25218 IX86_BUILTIN_GATHERDIV8SF,
25219 IX86_BUILTIN_GATHERSIV2DI,
25220 IX86_BUILTIN_GATHERSIV4DI,
25221 IX86_BUILTIN_GATHERDIV2DI,
25222 IX86_BUILTIN_GATHERDIV4DI,
25223 IX86_BUILTIN_GATHERSIV4SI,
25224 IX86_BUILTIN_GATHERSIV8SI,
25225 IX86_BUILTIN_GATHERDIV4SI,
25226 IX86_BUILTIN_GATHERDIV8SI,
25227
25228 /* Alternate 4 element gather for the vectorizer where
25229 all operands are 32-byte wide. */
25230 IX86_BUILTIN_GATHERALTSIV4DF,
25231 IX86_BUILTIN_GATHERALTDIV8SF,
25232 IX86_BUILTIN_GATHERALTSIV4DI,
25233 IX86_BUILTIN_GATHERALTDIV8SI,
25234
25235 /* TFmode support builtins. */
25236 IX86_BUILTIN_INFQ,
25237 IX86_BUILTIN_HUGE_VALQ,
25238 IX86_BUILTIN_FABSQ,
25239 IX86_BUILTIN_COPYSIGNQ,
25240
25241 /* Vectorizer support builtins. */
25242 IX86_BUILTIN_CPYSGNPS,
25243 IX86_BUILTIN_CPYSGNPD,
25244 IX86_BUILTIN_CPYSGNPS256,
25245 IX86_BUILTIN_CPYSGNPD256,
25246
25247 /* FMA4 instructions. */
25248 IX86_BUILTIN_VFMADDSS,
25249 IX86_BUILTIN_VFMADDSD,
25250 IX86_BUILTIN_VFMADDPS,
25251 IX86_BUILTIN_VFMADDPD,
25252 IX86_BUILTIN_VFMADDPS256,
25253 IX86_BUILTIN_VFMADDPD256,
25254 IX86_BUILTIN_VFMADDSUBPS,
25255 IX86_BUILTIN_VFMADDSUBPD,
25256 IX86_BUILTIN_VFMADDSUBPS256,
25257 IX86_BUILTIN_VFMADDSUBPD256,
25258
25259 /* FMA3 instructions. */
25260 IX86_BUILTIN_VFMADDSS3,
25261 IX86_BUILTIN_VFMADDSD3,
25262
25263 /* XOP instructions. */
25264 IX86_BUILTIN_VPCMOV,
25265 IX86_BUILTIN_VPCMOV_V2DI,
25266 IX86_BUILTIN_VPCMOV_V4SI,
25267 IX86_BUILTIN_VPCMOV_V8HI,
25268 IX86_BUILTIN_VPCMOV_V16QI,
25269 IX86_BUILTIN_VPCMOV_V4SF,
25270 IX86_BUILTIN_VPCMOV_V2DF,
25271 IX86_BUILTIN_VPCMOV256,
25272 IX86_BUILTIN_VPCMOV_V4DI256,
25273 IX86_BUILTIN_VPCMOV_V8SI256,
25274 IX86_BUILTIN_VPCMOV_V16HI256,
25275 IX86_BUILTIN_VPCMOV_V32QI256,
25276 IX86_BUILTIN_VPCMOV_V8SF256,
25277 IX86_BUILTIN_VPCMOV_V4DF256,
25278
25279 IX86_BUILTIN_VPPERM,
25280
25281 IX86_BUILTIN_VPMACSSWW,
25282 IX86_BUILTIN_VPMACSWW,
25283 IX86_BUILTIN_VPMACSSWD,
25284 IX86_BUILTIN_VPMACSWD,
25285 IX86_BUILTIN_VPMACSSDD,
25286 IX86_BUILTIN_VPMACSDD,
25287 IX86_BUILTIN_VPMACSSDQL,
25288 IX86_BUILTIN_VPMACSSDQH,
25289 IX86_BUILTIN_VPMACSDQL,
25290 IX86_BUILTIN_VPMACSDQH,
25291 IX86_BUILTIN_VPMADCSSWD,
25292 IX86_BUILTIN_VPMADCSWD,
25293
25294 IX86_BUILTIN_VPHADDBW,
25295 IX86_BUILTIN_VPHADDBD,
25296 IX86_BUILTIN_VPHADDBQ,
25297 IX86_BUILTIN_VPHADDWD,
25298 IX86_BUILTIN_VPHADDWQ,
25299 IX86_BUILTIN_VPHADDDQ,
25300 IX86_BUILTIN_VPHADDUBW,
25301 IX86_BUILTIN_VPHADDUBD,
25302 IX86_BUILTIN_VPHADDUBQ,
25303 IX86_BUILTIN_VPHADDUWD,
25304 IX86_BUILTIN_VPHADDUWQ,
25305 IX86_BUILTIN_VPHADDUDQ,
25306 IX86_BUILTIN_VPHSUBBW,
25307 IX86_BUILTIN_VPHSUBWD,
25308 IX86_BUILTIN_VPHSUBDQ,
25309
25310 IX86_BUILTIN_VPROTB,
25311 IX86_BUILTIN_VPROTW,
25312 IX86_BUILTIN_VPROTD,
25313 IX86_BUILTIN_VPROTQ,
25314 IX86_BUILTIN_VPROTB_IMM,
25315 IX86_BUILTIN_VPROTW_IMM,
25316 IX86_BUILTIN_VPROTD_IMM,
25317 IX86_BUILTIN_VPROTQ_IMM,
25318
25319 IX86_BUILTIN_VPSHLB,
25320 IX86_BUILTIN_VPSHLW,
25321 IX86_BUILTIN_VPSHLD,
25322 IX86_BUILTIN_VPSHLQ,
25323 IX86_BUILTIN_VPSHAB,
25324 IX86_BUILTIN_VPSHAW,
25325 IX86_BUILTIN_VPSHAD,
25326 IX86_BUILTIN_VPSHAQ,
25327
25328 IX86_BUILTIN_VFRCZSS,
25329 IX86_BUILTIN_VFRCZSD,
25330 IX86_BUILTIN_VFRCZPS,
25331 IX86_BUILTIN_VFRCZPD,
25332 IX86_BUILTIN_VFRCZPS256,
25333 IX86_BUILTIN_VFRCZPD256,
25334
25335 IX86_BUILTIN_VPCOMEQUB,
25336 IX86_BUILTIN_VPCOMNEUB,
25337 IX86_BUILTIN_VPCOMLTUB,
25338 IX86_BUILTIN_VPCOMLEUB,
25339 IX86_BUILTIN_VPCOMGTUB,
25340 IX86_BUILTIN_VPCOMGEUB,
25341 IX86_BUILTIN_VPCOMFALSEUB,
25342 IX86_BUILTIN_VPCOMTRUEUB,
25343
25344 IX86_BUILTIN_VPCOMEQUW,
25345 IX86_BUILTIN_VPCOMNEUW,
25346 IX86_BUILTIN_VPCOMLTUW,
25347 IX86_BUILTIN_VPCOMLEUW,
25348 IX86_BUILTIN_VPCOMGTUW,
25349 IX86_BUILTIN_VPCOMGEUW,
25350 IX86_BUILTIN_VPCOMFALSEUW,
25351 IX86_BUILTIN_VPCOMTRUEUW,
25352
25353 IX86_BUILTIN_VPCOMEQUD,
25354 IX86_BUILTIN_VPCOMNEUD,
25355 IX86_BUILTIN_VPCOMLTUD,
25356 IX86_BUILTIN_VPCOMLEUD,
25357 IX86_BUILTIN_VPCOMGTUD,
25358 IX86_BUILTIN_VPCOMGEUD,
25359 IX86_BUILTIN_VPCOMFALSEUD,
25360 IX86_BUILTIN_VPCOMTRUEUD,
25361
25362 IX86_BUILTIN_VPCOMEQUQ,
25363 IX86_BUILTIN_VPCOMNEUQ,
25364 IX86_BUILTIN_VPCOMLTUQ,
25365 IX86_BUILTIN_VPCOMLEUQ,
25366 IX86_BUILTIN_VPCOMGTUQ,
25367 IX86_BUILTIN_VPCOMGEUQ,
25368 IX86_BUILTIN_VPCOMFALSEUQ,
25369 IX86_BUILTIN_VPCOMTRUEUQ,
25370
25371 IX86_BUILTIN_VPCOMEQB,
25372 IX86_BUILTIN_VPCOMNEB,
25373 IX86_BUILTIN_VPCOMLTB,
25374 IX86_BUILTIN_VPCOMLEB,
25375 IX86_BUILTIN_VPCOMGTB,
25376 IX86_BUILTIN_VPCOMGEB,
25377 IX86_BUILTIN_VPCOMFALSEB,
25378 IX86_BUILTIN_VPCOMTRUEB,
25379
25380 IX86_BUILTIN_VPCOMEQW,
25381 IX86_BUILTIN_VPCOMNEW,
25382 IX86_BUILTIN_VPCOMLTW,
25383 IX86_BUILTIN_VPCOMLEW,
25384 IX86_BUILTIN_VPCOMGTW,
25385 IX86_BUILTIN_VPCOMGEW,
25386 IX86_BUILTIN_VPCOMFALSEW,
25387 IX86_BUILTIN_VPCOMTRUEW,
25388
25389 IX86_BUILTIN_VPCOMEQD,
25390 IX86_BUILTIN_VPCOMNED,
25391 IX86_BUILTIN_VPCOMLTD,
25392 IX86_BUILTIN_VPCOMLED,
25393 IX86_BUILTIN_VPCOMGTD,
25394 IX86_BUILTIN_VPCOMGED,
25395 IX86_BUILTIN_VPCOMFALSED,
25396 IX86_BUILTIN_VPCOMTRUED,
25397
25398 IX86_BUILTIN_VPCOMEQQ,
25399 IX86_BUILTIN_VPCOMNEQ,
25400 IX86_BUILTIN_VPCOMLTQ,
25401 IX86_BUILTIN_VPCOMLEQ,
25402 IX86_BUILTIN_VPCOMGTQ,
25403 IX86_BUILTIN_VPCOMGEQ,
25404 IX86_BUILTIN_VPCOMFALSEQ,
25405 IX86_BUILTIN_VPCOMTRUEQ,
25406
25407 /* LWP instructions. */
25408 IX86_BUILTIN_LLWPCB,
25409 IX86_BUILTIN_SLWPCB,
25410 IX86_BUILTIN_LWPVAL32,
25411 IX86_BUILTIN_LWPVAL64,
25412 IX86_BUILTIN_LWPINS32,
25413 IX86_BUILTIN_LWPINS64,
25414
25415 IX86_BUILTIN_CLZS,
25416
25417 /* BMI instructions. */
25418 IX86_BUILTIN_BEXTR32,
25419 IX86_BUILTIN_BEXTR64,
25420 IX86_BUILTIN_CTZS,
25421
25422 /* TBM instructions. */
25423 IX86_BUILTIN_BEXTRI32,
25424 IX86_BUILTIN_BEXTRI64,
25425
25426 /* BMI2 instructions. */
25427 IX86_BUILTIN_BZHI32,
25428 IX86_BUILTIN_BZHI64,
25429 IX86_BUILTIN_PDEP32,
25430 IX86_BUILTIN_PDEP64,
25431 IX86_BUILTIN_PEXT32,
25432 IX86_BUILTIN_PEXT64,
25433
25434 /* FSGSBASE instructions. */
25435 IX86_BUILTIN_RDFSBASE32,
25436 IX86_BUILTIN_RDFSBASE64,
25437 IX86_BUILTIN_RDGSBASE32,
25438 IX86_BUILTIN_RDGSBASE64,
25439 IX86_BUILTIN_WRFSBASE32,
25440 IX86_BUILTIN_WRFSBASE64,
25441 IX86_BUILTIN_WRGSBASE32,
25442 IX86_BUILTIN_WRGSBASE64,
25443
25444 /* RDRND instructions. */
25445 IX86_BUILTIN_RDRAND16_STEP,
25446 IX86_BUILTIN_RDRAND32_STEP,
25447 IX86_BUILTIN_RDRAND64_STEP,
25448
25449 /* F16C instructions. */
25450 IX86_BUILTIN_CVTPH2PS,
25451 IX86_BUILTIN_CVTPH2PS256,
25452 IX86_BUILTIN_CVTPS2PH,
25453 IX86_BUILTIN_CVTPS2PH256,
25454
25455 /* CFString built-in for darwin */
25456 IX86_BUILTIN_CFSTRING,
25457
25458 IX86_BUILTIN_MAX
25459 };
25460
25461 /* Table for the ix86 builtin decls. */
25462 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25463
25464 /* Table of all of the builtin functions that are possible with different ISAs
25465 but are waiting to be built until a function is declared to use that
25466 ISA. */
25467 struct builtin_isa {
25468 const char *name; /* function name */
25469 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25470 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25471 bool const_p; /* true if the declaration is constant */
25472 bool set_and_not_built_p;
25473 };
25474
25475 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25476
25477
25478 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
25479 of isa_flags to use in the ix86_builtins_isa array. Store the
25480 function decl in the ix86_builtins array. Return the function decl, or
25481 NULL_TREE if the builtin was not added.
25482
25483 If the front end has a special hook for builtin functions, delay adding
25484 builtin functions that aren't in the current ISA until the ISA is changed
25485 with function-specific optimization. Doing so can save about 300K for the
25486 default compiler. When the builtin is expanded, check at that time whether
25487 it is valid.
25488
25489 If the front end doesn't have a special hook, record all builtins, even if
25490 they aren't in the current ISA, in case the user uses function-specific
25491 options for a different ISA, so that we don't get scope
25492 errors if a builtin is added in the middle of a function scope. */
25493
25494 static inline tree
25495 def_builtin (HOST_WIDE_INT mask, const char *name,
25496 enum ix86_builtin_func_type tcode,
25497 enum ix86_builtins code)
25498 {
25499 tree decl = NULL_TREE;
25500
25501 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25502 {
25503 ix86_builtins_isa[(int) code].isa = mask;
25504
25505 mask &= ~OPTION_MASK_ISA_64BIT;
25506 if (mask == 0
25507 || (mask & ix86_isa_flags) != 0
25508 || (lang_hooks.builtin_function
25509 == lang_hooks.builtin_function_ext_scope))
25511 {
25512 tree type = ix86_get_builtin_func_type (tcode);
25513 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25514 NULL, NULL_TREE);
25515 ix86_builtins[(int) code] = decl;
25516 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25517 }
25518 else
25519 {
25520 ix86_builtins[(int) code] = NULL_TREE;
25521 ix86_builtins_isa[(int) code].tcode = tcode;
25522 ix86_builtins_isa[(int) code].name = name;
25523 ix86_builtins_isa[(int) code].const_p = false;
25524 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25525 }
25526 }
25527
25528 return decl;
25529 }
25530
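/* Editor's sketch (not in the original source): a typical registration via
   def_builtin looks roughly like

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
		  VOID_FTYPE_VOID, IX86_BUILTIN_EXAMPLE);

   where "__builtin_ia32_example" and IX86_BUILTIN_EXAMPLE are placeholder
   names used only for illustration.  If OPTION_MASK_ISA_SSE2 is not in
   ix86_isa_flags and the front end has no extern-scope hook, no decl is
   built; only the name, type code and ISA mask are recorded in
   ix86_builtins_isa so that ix86_add_new_builtins can create the decl once
   the ISA becomes available.  */
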
25531 /* Like def_builtin, but also marks the function decl "const". */
25532
25533 static inline tree
25534 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25535 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25536 {
25537 tree decl = def_builtin (mask, name, tcode, code);
25538 if (decl)
25539 TREE_READONLY (decl) = 1;
25540 else
25541 ix86_builtins_isa[(int) code].const_p = true;
25542
25543 return decl;
25544 }
25545
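/* Editor's note (sketch): def_builtin_const is meant for builtins without
   side effects, such as the arithmetic entries in bdesc_args below, so the
   middle end may treat repeated calls as common subexpressions.  If the
   decl is still deferred at this point, only const_p is latched here and
   TREE_READONLY is applied later by ix86_add_new_builtins.  */
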
25546 /* Add any new builtin functions for a given ISA that have not yet been
25547    declared.  This saves a bit of space compared to adding all of the
25548    declarations to the tree up front, whether or not they are used.  */
25549
25550 static void
25551 ix86_add_new_builtins (HOST_WIDE_INT isa)
25552 {
25553 int i;
25554
25555 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25556 {
25557 if ((ix86_builtins_isa[i].isa & isa) != 0
25558 && ix86_builtins_isa[i].set_and_not_built_p)
25559 {
25560 tree decl, type;
25561
25562 /* Don't define the builtin again. */
25563 ix86_builtins_isa[i].set_and_not_built_p = false;
25564
25565 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25566 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25567 type, i, BUILT_IN_MD, NULL,
25568 NULL_TREE);
25569
25570 ix86_builtins[i] = decl;
25571 if (ix86_builtins_isa[i].const_p)
25572 TREE_READONLY (decl) = 1;
25573 }
25574 }
25575 }
25576
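/* Usage sketch (editorial, assumed call site): when the set of enabled ISAs
   grows, for instance while switching to function-specific target options,
   the new flags are passed down so that deferred decls become visible:

     ix86_add_new_builtins (ix86_isa_flags);

   Any entry whose recorded mask intersects the new flags and is still
   marked set_and_not_built_p has its decl created at extern scope.  */
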
25577 /* Bits for builtin_description.flag. */
25578
25579 /* Set when the comparison is not supported natively and the operands
25580    should be swapped in order to support it.  */
25581 #define BUILTIN_DESC_SWAP_OPERANDS 1
25582
25583 struct builtin_description
25584 {
25585 const HOST_WIDE_INT mask;
25586 const enum insn_code icode;
25587 const char *const name;
25588 const enum ix86_builtins code;
25589 const enum rtx_code comparison;
25590 const int flag;
25591 };
25592
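/* How to read an entry (editorial sketch): in the first row of bdesc_comi
   below,

     { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq",
       IX86_BUILTIN_COMIEQSS, UNEQ, 0 },

   `mask' is the ISA the builtin requires, `icode' the insn pattern used to
   expand it, `name' the user-visible builtin, `code' its enum value,
   `comparison' the rtx comparison code the expander applies, and `flag'
   holds either BUILTIN_DESC_SWAP_OPERANDS or, in other tables, an extra
   mode or function-type hint.  */
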
25593 static const struct builtin_description bdesc_comi[] =
25594 {
25595 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25596 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25597 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25598 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25599 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25600 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25601 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25602 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25603 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25604 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25605 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25606 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25607 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25608 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25609 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25610 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25611 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25612 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25613 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25614 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25615 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25616 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25617 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25618 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25619 };
25620
25621 static const struct builtin_description bdesc_pcmpestr[] =
25622 {
25623 /* SSE4.2 */
25624 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25625 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25626 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25627 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25628 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25629 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25630 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25631 };
25632
25633 static const struct builtin_description bdesc_pcmpistr[] =
25634 {
25635 /* SSE4.2 */
25636 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25637 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25638 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25639 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25640 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25641 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25642 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25643 };
25644
25645 /* Special builtins with a variable number of arguments.  */
25646 static const struct builtin_description bdesc_special_args[] =
25647 {
25648 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25649 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25650 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25651
25652 /* MMX */
25653 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25654
25655 /* 3DNow! */
25656 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25657
25658 /* SSE */
25659 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25660 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25661 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25662
25663 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25664 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25665 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25666 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25667
25668 /* SSE or 3DNow!A */
25669 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25670 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25671
25672 /* SSE2 */
25673 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25674 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25675 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25676 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25677 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25678 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25679 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25680 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25681 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25682
25683 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25684 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25685
25686 /* SSE3 */
25687 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25688
25689 /* SSE4.1 */
25690 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25691
25692 /* SSE4A */
25693 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25694 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25695
25696 /* AVX */
25697 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25698 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25699
25700 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25701 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25702 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25703 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25705
25706 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25709 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25710 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25711 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25712 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25713
25714 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25715 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25716 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25717
25718 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25719 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25720 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25721 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25722 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25723 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25724 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25725 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25726
25727 /* AVX2 */
25728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25737
25738 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25739 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25740 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25741 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25742 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25743 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25744
25745 /* FSGSBASE */
25746 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25747 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25748 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25749 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25750 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25751 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25752 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25753 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25754 };
25755
25756 /* Builtins with a variable number of arguments.  */
25757 static const struct builtin_description bdesc_args[] =
25758 {
25759 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25760 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25761 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25762 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25763 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25764 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25765 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25766
25767 /* MMX */
25768 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25769 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25770 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25771 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25772 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25773 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25774
25775 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25776 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25777 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25778 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25779 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25780 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25781 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25782 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25783
25784 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25785 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25786
25787 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25788 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25789 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25790 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25791
25792 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25793 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25794 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25795 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25796 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25797 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25798
25799 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25800 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25801 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25802 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25803 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25804 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25805
25806 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25807 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25808 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25809
25810 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25811
25812 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25813 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25814 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25815 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25816 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25817 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25818
25819 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25820 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25821 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25822 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25823 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25824 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25825
25826 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25827 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25828 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25829 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25830
25831 /* 3DNow! */
25832 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25833 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25834 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25835 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25836
25837 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25838 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25839 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25840 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25841 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25842 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25843 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25844 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25845 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25846 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25847 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25848 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25849 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25850 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25851 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25852
25853 /* 3DNow!A */
25854 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25855 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25856 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
25857 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25858 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25859 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25860
25861 /* SSE */
25862 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
25863 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25864 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25865 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25866 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25867 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25868 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25869 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25870 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25871 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25872 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25873 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25874
25875 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
25876
25877 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25878 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25879 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25880 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25881 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25882 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25883 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25884 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25885
25886 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25887 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25888 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25889 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25893 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25894 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25895 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25896 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
25897 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25898 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25899 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25900 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25901 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25902 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25903 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25904 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25905 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25906 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25907 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25908
25909 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25910 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25911 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25912 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25913
25914 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25915 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25916 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25917 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25918
25919 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25920
25921 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25922 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25923 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25924 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25925 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25926
25927 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
25928 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
25929 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
25930
25931 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
25932
25933 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25934 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25935 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25936
25937 /* SSE MMX or 3DNow!A */
25938 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25939 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25940 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25941
25942 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25943 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25944 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25945 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25946
25947 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
25948 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
25949
25950 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
25951
25952 /* SSE2 */
25953 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
25954
25955 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
25956 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
25957 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
25958 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
25959 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
25960
25961 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
25962 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
25963 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
25964 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
25965 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
25966
25967 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
25968
25969 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
25970 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
25971 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
25972 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
25973
25974 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
25975 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
25976 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
25977
25978 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25979 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25980 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25981 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25982 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25983 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25984 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25985 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
25986
25987 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
25988 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
25989 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
25990 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25991 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
25992 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25993 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
25994 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
25995 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
25996 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25997 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
25998 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
25999 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26000 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26001 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26002 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26003 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26004 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26005 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26006 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26007
26008 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26009 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26010 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26011 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26012
26013 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26014 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26015 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26016 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26017
26018 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26019
26020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26021 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26022 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26023
26024 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26025
26026 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26027 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26028 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26029 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26030 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26031 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26032 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26033 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26034
26035 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26036 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26037 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26038 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26039 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26040 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26041 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26042 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26043
26044 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26045 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26046
26047 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26048 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26049 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26050 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26051
26052 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26053 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26054
26055 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26056 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26057 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26058 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26059 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26060 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26061
26062 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26063 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26064 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26065 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26066
26067 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26068 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26069 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26070 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26071 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26072 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26073 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26074 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26075
26076 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26077 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26078 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26079
26080 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26081 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26082
26083 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26084 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26085
26086 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26087
26088 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26089 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26090 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26091 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26092
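/* Vector shift builtins.  The ..._SI_COUNT signatures are used for the
   immediate-count builtins (__builtin_ia32_psllwi128 and friends), the
   ..._V*_COUNT signatures for the builtins that take the count in an XMM
   register, and the ..._INT_CONVERT signatures for the whole-register
   byte shifts (pslldqi/psrldqi), whose V2DI operands are converted to the
   V1TI mode used by the insn patterns.  */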
26093 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26094 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26095 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26096 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26097 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26098 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26099 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26100
26101 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26102 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26103 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26104 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26105 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26106 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26107 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26108
26109 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26110 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26111 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26112 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26113
26114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26115 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26117
26118 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26119
26120 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26121 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26122
26123 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26124
26125 /* SSE2 MMX */
26126 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26127 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26128
26129 /* SSE3 */
26130 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
26131 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26132
26133 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26134 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26135 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26136 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26137 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26138 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26139
26140 /* SSSE3 */
26141 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26142 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26143 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26144 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26145 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26146 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26147
26148 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26149 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26150 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26151 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26152 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26153 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26154 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26155 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26156 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26157 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26158 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26159 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26160 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26161 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26162 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26163 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26164 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26165 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26166 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26167 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26168 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26169 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26170 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26171 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26172
26173 /* SSSE3. */
26174 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26175 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26176
26177 /* SSE4.1 */
26178 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26179 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26180 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26181 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26182 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26183 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26184 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26185 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26186 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26187 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26188
26189 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26190 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26191 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26192 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26193 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26194 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26195 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26196 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26197 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26198 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26199 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26200 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26201 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26202
26203 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26204 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26205 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26206 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26207 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26208 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26209 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26210 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26211 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26212 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26213 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26214 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26215
26216 /* SSE4.1 */
26217 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26218 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26219 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26220 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26221
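/* For the floor/ceil/trunc/rint builtins below, the comparison field of
   the descriptor is reused to carry the rounding-mode immediate
   (ROUND_FLOOR, ROUND_CEIL, ROUND_TRUNC or ROUND_MXCSR) that is passed to
   the sse4_1_roundpd/roundps patterns on expansion.  */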
26222 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26223 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26224 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26225 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26226
26227 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26228
26229 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26230 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26231 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26232 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26233
26234 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26235
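/* For the ptest (and, further down, vtest) builtins the comparison field
   selects which flag the expanded test checks: EQ for the ...z variants
   (ZF), LTU for the ...c variants (CF) and GTU for the ...nzc variants.  */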
26236 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26237 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26238 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26239
26240 /* SSE4.2 */
26241 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26242 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26243 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26244 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26245 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26246
26247 /* SSE4A */
26248 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26249 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26250 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26251 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26252
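/* The AES and PCLMUL descriptors below (like the FABSQ/COPYSIGNQ entries
   above) have a null name: the builtin-registration loop skips them, and
   the corresponding __builtin_ia32_* functions are declared by explicit
   def_builtin_const calls elsewhere in this file; these entries only
   supply the icode and signature used when expanding them.  */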
26253 /* AES */
26254 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26255 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26256
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26258 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26260 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26261
26262 /* PCLMUL */
26263 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26264
26265 /* AVX */
26266 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26267 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26268 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26269 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26270 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26271 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26272 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26273 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26274 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26275 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26276 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26277 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26278 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26279 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26280 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26281 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26282 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26283 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26284 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26285 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26286 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26287 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26288 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26289 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26290 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26291 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26292
26293 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26294 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26295 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26296 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26297
26298 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26299 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26300 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26301 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26302 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26303 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26304 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26305 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26306 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26307 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26308 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26309 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26310 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26311 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26312 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26313 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26314 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26315 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26316 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26317 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26318 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26319 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26320 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26321 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26322 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26323 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26324 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26325 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26326 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26327 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26328 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26329 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26330 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26331 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26332
26333 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26334 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26335 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26336
26337 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26338 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26339 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26340 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26341 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26342
26343 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26344
26345 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26346 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26347
26348 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26349 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26350 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26351 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26352
26353 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26354
26355 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26356 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26357 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26358 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26359
26360 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26361
26362 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26363 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26364 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26365 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26366
26367 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26368 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26369 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26370 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26371 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26372 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26373
26374 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26375 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26376 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26377 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26378 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26379 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26380 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26381 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26382 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26383 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26384 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26385 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26386 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26387 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26388 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26389
26390 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26391 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26392
26393 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26394 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26395
26396 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26397
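/* The AVX2 builtins below are not normally called directly; the
   user-visible intrinsics in avx2intrin.h are thin inline wrappers around
   them.  As a rough illustration (modeled on the avx2intrin.h wrappers;
   parameter names are ours):

     extern __inline __m256i
     __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     _mm256_add_epi8 (__m256i __A, __m256i __B)
     {
       return (__m256i) __builtin_ia32_paddb256 ((__v32qi) __A, (__v32qi) __B);
     }
  */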
26398 /* AVX2 */
26399 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26400 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26401 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26402 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26403 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26404 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26405 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26406 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26407 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26408 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26409 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26410 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26411 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26412 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26413 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26414 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26415 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26416 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26417 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26418 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26419 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26420 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26421 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26422 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26423 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26424 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26425 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26426 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26427 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26428 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26429 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26430 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26431 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26432 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26433 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26434 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26435 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26436 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26437 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26438 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26439 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26440 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3, "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26441 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26442 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26443 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3, "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26444 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26445 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26446 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3, "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26447 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26448 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26449 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3, "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26450 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26451 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26452 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2, "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26453 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2, "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26454 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2, "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26455 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2, "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26456 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2, "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26457 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26458 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2, "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26459 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2, "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26460 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2, "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26461 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2, "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26462 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2, "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26463 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26464 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3, "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26465 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256", IX86_BUILTIN_PMULHUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26466 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256", IX86_BUILTIN_PMULHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26467 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256", IX86_BUILTIN_PMULLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26468 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256", IX86_BUILTIN_PMULLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26469 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26470 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26471 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26472 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26473 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26474 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26475 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26476 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26477 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26478 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3, "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26479 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26480 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26481 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26482 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26483 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26484 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26485 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26486 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26487 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26488 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26489 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26490 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26491 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26492 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26493 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26494 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26495 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26496 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26497 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26498 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26499 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26500 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26501 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26502 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26503 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26504 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26505 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26506 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26507 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26508 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26509 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26510 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26511 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26512 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26513 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26514 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26515 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26516 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26517 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26518 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26519 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26520 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26521 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26522 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26523 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26524 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26525 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26526 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26527 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26528 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26529 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26530 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26531 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26532 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26533 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26534 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26535 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26536 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26537 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26538 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26539 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26540 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26541 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26542 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26543 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26544 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26545
26546 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26547
26548 /* BMI */
26549 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26550 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26551 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26552
26553 /* TBM */
26554 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26555 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26556
26557 /* F16C */
26558 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26559 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26560 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26561 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26562
26563 /* BMI2 */
26564 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26565 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26566 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26567 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26568 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26569 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26570 };
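/* An illustrative, user-level sketch of the scalar BMI2 builtins registered
   just above (kept in a comment, so nothing here is compiled).  The
   <x86intrin.h> wrappers such as _pdep_u32 are, as far as we know, thin
   layers over __builtin_ia32_pdep_si and friends; the function name below
   is invented for the example:

       #include <x86intrin.h>

       unsigned int
       spread_low_bits (unsigned int src, unsigned int mask)
       {
         return _pdep_u32 (src, mask);
       }

   This deposits the low bits of SRC into the set-bit positions of MASK,
   e.g. src = 0b101 with mask = 0b10101 gives 0b10001.  Build with -mbmi2
   so OPTION_MASK_ISA_BMI2 is enabled and the call expands to a pdep insn.  */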
26571
26572 /* FMA4 and XOP. */
26573 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26574 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26575 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26576 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26577 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26578 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26579 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26580 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26581 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26582 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26583 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26584 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26585 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26586 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26587 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26588 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26589 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26590 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26591 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26592 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26593 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26594 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26595 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26596 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26597 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26598 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26599 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26600 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26601 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26602 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26603 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26604 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26605 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26606 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26607 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26608 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26609 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26610 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26611 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26612 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26613 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26614 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26615 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26616 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26617 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26618 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26619 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26620 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26621 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26622 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26623 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26624 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26625
26626 static const struct builtin_description bdesc_multi_arg[] =
26627 {
26628 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26629 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26630 UNKNOWN, (int)MULTI_ARG_3_SF },
26631 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26632 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26633 UNKNOWN, (int)MULTI_ARG_3_DF },
26634
26635 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26636 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26637 UNKNOWN, (int)MULTI_ARG_3_SF },
26638 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26639 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26640 UNKNOWN, (int)MULTI_ARG_3_DF },
26641
26642 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26643 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26644 UNKNOWN, (int)MULTI_ARG_3_SF },
26645 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26646 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26647 UNKNOWN, (int)MULTI_ARG_3_DF },
26648 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26649 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26650 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26651 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26652 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26653 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26654
26655 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26656 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26657 UNKNOWN, (int)MULTI_ARG_3_SF },
26658 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26659 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26660 UNKNOWN, (int)MULTI_ARG_3_DF },
26661 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26662 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26663 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26664 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26665 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26666 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26667
26668 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26669 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26670 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26671 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26672 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
26673 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26674 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26675
26676 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26677 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26678 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26679 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26680 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26681 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26682 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26683
26684 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26685
26686 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26687 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26688 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26689 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26690 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26691 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26692 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26693 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26694 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26695 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26696 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26697 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26698
26699 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26700 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26701 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26702 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26703 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26704 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26705 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26706 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26707 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26708 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26709 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26710 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26711 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26712 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26713 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26714 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26715
26716 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26717 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26718 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26719 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26720 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26721 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26722
26723 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26724 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26725 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26726 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26727 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26728 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26729 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26730 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26731 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26732 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26733 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26734 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26735 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26736 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26737 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26738
26739 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26740 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26741 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26742 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26743 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26744 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26745 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26746
26747 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26748 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26749 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26750 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26751 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26752 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26753 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26754
26755 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26756 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26757 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26758 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26759 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26760 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26761 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26762
26763 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26764 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26765 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26766 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26767 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26768 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26769 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26770
26771 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26772 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26773 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26774 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26775 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26776 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26777 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26778
26779 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26780 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26781 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26782 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26783 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26784 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26785 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26786
26787 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26788 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26789 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26790 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26791 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26792 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26793 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26794
26795 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26796 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26797 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26798 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26799 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26800 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
26801 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
26802
26803 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26804 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26805 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26806 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26807 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26808 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26809 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26810 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26811
26812 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26813 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26814 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26815 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26816 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26817 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26818 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26819 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26820
26821 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
26822 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
26823 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
26824 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
26825
26826 };
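/* An illustrative, user-level sketch of how the FMA4 entries in
   bdesc_multi_arg above surface to users (comment only, not compiled).
   The <x86intrin.h> FMA4 wrappers such as _mm_macc_ps are typically
   defined in terms of __builtin_ia32_vfmaddps; the function name below is
   invented for the example:

       #include <x86intrin.h>

       __m128
       fma4_example (__m128 a, __m128 b, __m128 c)
       {
         return _mm_macc_ps (a, b, c);
       }

   Built with -mfma4, this computes a * b + c with a single fused
   multiply-add, expanded through ix86_expand_multi_arg_builtin via the
   CODE_FOR_fma4i_fmadd_v4sf entry registered above.  */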
26827 \f
26828 /* TM vector builtins. */
26829
26830 /* Reuse the existing x86-specific `struct builtin_description' because
26831    we're lazy.  Add casts to make them fit.  */
26832 static const struct builtin_description bdesc_tm[] =
26833 {
26834 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26835 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26836 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26837 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26838 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26839 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26840 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26841
26842 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26843 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26844 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26845 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26846 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26847 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26848 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26849
26850 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26851 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26852 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26853 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26854 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26855 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26856 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26857
26858 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
26859 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
26860 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
26861 };
26862
26863 /* TM callbacks. */
26864
26865 /* Return the builtin decl needed to load a vector of TYPE. */
26866
26867 static tree
26868 ix86_builtin_tm_load (tree type)
26869 {
26870 if (TREE_CODE (type) == VECTOR_TYPE)
26871 {
26872 switch (tree_low_cst (TYPE_SIZE (type), 1))
26873 {
26874 case 64:
26875 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
26876 case 128:
26877 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
26878 case 256:
26879 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
26880 }
26881 }
26882 return NULL_TREE;
26883 }
26884
26885 /* Return the builtin decl needed to store a vector of TYPE. */
26886
26887 static tree
26888 ix86_builtin_tm_store (tree type)
26889 {
26890 if (TREE_CODE (type) == VECTOR_TYPE)
26891 {
26892 switch (tree_low_cst (TYPE_SIZE (type), 1))
26893 {
26894 case 64:
26895 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
26896 case 128:
26897 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
26898 case 256:
26899 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
26900 }
26901 }
26902 return NULL_TREE;
26903 }
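/* An illustrative sketch of the code these TM hooks are meant to handle
   (comment only, not compiled).  Assuming -fgnu-tm and SSE, a transactional
   store of a 128-bit vector could look roughly like:

       #include <xmmintrin.h>

       __m128 shared;

       void
       publish (__m128 v)
       {
         __transaction_atomic
         {
           shared = v;
         }
       }

   The TM pass is expected to instrument the vector store through the decl
   returned by ix86_builtin_tm_store for a 128-bit vector type, i.e. the
   _ITM_WM128 builtin registered in bdesc_tm above.  */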
26904 \f
26905 /* Initialize the transactional memory vector load/store builtins. */
26906
26907 static void
26908 ix86_init_tm_builtins (void)
26909 {
26910 enum ix86_builtin_func_type ftype;
26911 const struct builtin_description *d;
26912 size_t i;
26913 tree decl;
26914 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
26915 tree attrs_log, attrs_type_log;
26916
26917 if (!flag_tm)
26918 return;
26919
26920 /* Use whatever attributes a normal TM load has. */
26921 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
26922 attrs_load = DECL_ATTRIBUTES (decl);
26923 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
26924 /* Use whatever attributes a normal TM store has. */
26925 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
26926 attrs_store = DECL_ATTRIBUTES (decl);
26927 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
26928 /* Use whatever attributes a normal TM log has. */
26929 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
26930 attrs_log = DECL_ATTRIBUTES (decl);
26931 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
26932
26933 for (i = 0, d = bdesc_tm;
26934 i < ARRAY_SIZE (bdesc_tm);
26935 i++, d++)
26936 {
26937 if ((d->mask & ix86_isa_flags) != 0
26938 || (lang_hooks.builtin_function
26939 == lang_hooks.builtin_function_ext_scope))
26940 {
26941 tree type, attrs, attrs_type;
26942 enum built_in_function code = (enum built_in_function) d->code;
26943
26944 ftype = (enum ix86_builtin_func_type) d->flag;
26945 type = ix86_get_builtin_func_type (ftype);
26946
26947 if (BUILTIN_TM_LOAD_P (code))
26948 {
26949 attrs = attrs_load;
26950 attrs_type = attrs_type_load;
26951 }
26952 else if (BUILTIN_TM_STORE_P (code))
26953 {
26954 attrs = attrs_store;
26955 attrs_type = attrs_type_store;
26956 }
26957 else
26958 {
26959 attrs = attrs_log;
26960 attrs_type = attrs_type_log;
26961 }
26962 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
26963 /* The builtin without the prefix for
26964 calling it directly. */
26965 d->name + strlen ("__builtin_"),
26966 attrs);
26967 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
26968 set the TYPE_ATTRIBUTES. */
26969 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
26970
26971 set_builtin_decl (code, decl, false);
26972 }
26973 }
26974 }
26975
26976 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
26977    not in the current target ISA, so that the user can compile particular
26978    modules with target-specific options that differ from the command-line
26979    options.  */
26980 static void
26981 ix86_init_mmx_sse_builtins (void)
26982 {
26983 const struct builtin_description * d;
26984 enum ix86_builtin_func_type ftype;
26985 size_t i;
26986
26987 /* Add all special builtins with variable number of operands. */
26988 for (i = 0, d = bdesc_special_args;
26989 i < ARRAY_SIZE (bdesc_special_args);
26990 i++, d++)
26991 {
26992 if (d->name == 0)
26993 continue;
26994
26995 ftype = (enum ix86_builtin_func_type) d->flag;
26996 def_builtin (d->mask, d->name, ftype, d->code);
26997 }
26998
26999 /* Add all builtins with variable number of operands. */
27000 for (i = 0, d = bdesc_args;
27001 i < ARRAY_SIZE (bdesc_args);
27002 i++, d++)
27003 {
27004 if (d->name == 0)
27005 continue;
27006
27007 ftype = (enum ix86_builtin_func_type) d->flag;
27008 def_builtin_const (d->mask, d->name, ftype, d->code);
27009 }
27010
27011 /* pcmpestr[im] insns. */
27012 for (i = 0, d = bdesc_pcmpestr;
27013 i < ARRAY_SIZE (bdesc_pcmpestr);
27014 i++, d++)
27015 {
27016 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27017 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27018 else
27019 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27020 def_builtin_const (d->mask, d->name, ftype, d->code);
27021 }
27022
27023 /* pcmpistr[im] insns. */
27024 for (i = 0, d = bdesc_pcmpistr;
27025 i < ARRAY_SIZE (bdesc_pcmpistr);
27026 i++, d++)
27027 {
27028 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27029 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27030 else
27031 ftype = INT_FTYPE_V16QI_V16QI_INT;
27032 def_builtin_const (d->mask, d->name, ftype, d->code);
27033 }
27034
27035 /* comi/ucomi insns. */
27036 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27037 {
27038 if (d->mask == OPTION_MASK_ISA_SSE2)
27039 ftype = INT_FTYPE_V2DF_V2DF;
27040 else
27041 ftype = INT_FTYPE_V4SF_V4SF;
27042 def_builtin_const (d->mask, d->name, ftype, d->code);
27043 }
27044
27045 /* SSE */
27046 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27047 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27048 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27049 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
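/* An illustrative, user-level sketch for the two MXCSR builtins just
   defined (comment only, not compiled).  The <xmmintrin.h> wrappers
   _mm_getcsr and _mm_setcsr are typically implemented with
   __builtin_ia32_stmxcsr and __builtin_ia32_ldmxcsr; the helper name and
   the choice of control bit are only for the example:

       #include <xmmintrin.h>

       void
       enable_flush_to_zero (void)
       {
         unsigned int csr = _mm_getcsr ();
         _mm_setcsr (csr | 0x8000);
       }

   Bit 15 (0x8000) of MXCSR is the flush-to-zero control.  Build with -msse.  */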
27050
27051 /* SSE or 3DNow!A */
27052 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27053 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27054 IX86_BUILTIN_MASKMOVQ);
27055
27056 /* SSE2 */
27057 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27058 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27059
27060 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27061 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27062 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27063 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27064
27065 /* SSE3. */
27066 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27067 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27068 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27069 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27070
27071 /* AES */
27072 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27073 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27074 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27075 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27076 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27077 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27078 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27079 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27080 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27081 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27082 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27083 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27084
27085 /* PCLMUL */
27086 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27087 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27088
27089 /* RDRND */
27090 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27091 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27092 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27093 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27094 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27095 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27096 IX86_BUILTIN_RDRAND64_STEP);
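/* An illustrative, user-level sketch for the RDRND step builtins just
   defined (comment only, not compiled).  The <immintrin.h> wrapper
   _rdrand32_step is typically a thin layer over
   __builtin_ia32_rdrand32_step, which returns nonzero on success and
   stores the random value through its pointer argument; the helper name is
   invented for the example:

       #include <immintrin.h>

       unsigned int
       get_hw_random (void)
       {
         unsigned int r;
         while (!_rdrand32_step (&r))
           continue;
         return r;
       }

   Build with -mrdrnd; the retry loop covers the rare case where the
   hardware RNG temporarily has no value available.  */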
27097
27098 /* AVX2 */
27099 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27100 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27101 IX86_BUILTIN_GATHERSIV2DF);
27102
27103 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27104 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27105 IX86_BUILTIN_GATHERSIV4DF);
27106
27107 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27108 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27109 IX86_BUILTIN_GATHERDIV2DF);
27110
27111 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27112 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27113 IX86_BUILTIN_GATHERDIV4DF);
27114
27115 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27116 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27117 IX86_BUILTIN_GATHERSIV4SF);
27118
27119 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27120 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27121 IX86_BUILTIN_GATHERSIV8SF);
27122
27123 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27124 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27125 IX86_BUILTIN_GATHERDIV4SF);
27126
27127 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27128 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27129 IX86_BUILTIN_GATHERDIV8SF);
27130
27131 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27132 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27133 IX86_BUILTIN_GATHERSIV2DI);
27134
27135 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27136 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27137 IX86_BUILTIN_GATHERSIV4DI);
27138
27139 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27140 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27141 IX86_BUILTIN_GATHERDIV2DI);
27142
27143 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27144 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27145 IX86_BUILTIN_GATHERDIV4DI);
27146
27147 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27148 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27149 IX86_BUILTIN_GATHERSIV4SI);
27150
27151 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27152 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27153 IX86_BUILTIN_GATHERSIV8SI);
27154
27155 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27156 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27157 IX86_BUILTIN_GATHERDIV4SI);
27158
27159 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27160 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27161 IX86_BUILTIN_GATHERDIV8SI);
27162
27163 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27164 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27165 IX86_BUILTIN_GATHERALTSIV4DF);
27166
27167 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27168 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27169 IX86_BUILTIN_GATHERALTDIV8SF);
27170
27171 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27172 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27173 IX86_BUILTIN_GATHERALTSIV4DI);
27174
27175 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27176 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27177 IX86_BUILTIN_GATHERALTDIV8SI);
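/* An illustrative, user-level sketch for the AVX2 gather builtins defined
   above (comment only, not compiled).  The <immintrin.h> gather intrinsics
   are typically implemented on top of these __builtin_ia32_gather*
   builtins, with an all-ones mask supplied for the unmasked forms; the
   helper name is invented for the example:

       #include <immintrin.h>

       __m128
       gather4 (const float *base, __m128i idx)
       {
         return _mm_i32gather_ps (base, idx, 4);
       }

   Build with -mavx2.  The masked variant _mm_mask_i32gather_ps matches the
   V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT signature of
   __builtin_ia32_gathersiv4sf most directly: source, base pointer, index
   vector, mask and scale.  */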
27178
27179 /* MMX access to the vec_init patterns. */
27180 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27181 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27182
27183 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27184 V4HI_FTYPE_HI_HI_HI_HI,
27185 IX86_BUILTIN_VEC_INIT_V4HI);
27186
27187 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27188 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27189 IX86_BUILTIN_VEC_INIT_V8QI);
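/* An illustrative, user-level sketch for the MMX vec_init builtins just
   defined (comment only, not compiled).  The <mmintrin.h> constructors
   such as _mm_setr_pi32 are typically implemented with
   __builtin_ia32_vec_init_v2si; the helper name is invented for the
   example:

       #include <mmintrin.h>

       __m64
       make_pair (int lo, int hi)
       {
         return _mm_setr_pi32 (lo, hi);
       }

   Build with -mmmx; as usual for MMX, real code also needs _mm_empty ()
   before returning to x87 floating-point use.  */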
27190
27191 /* Access to the vec_extract patterns. */
27192 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27193 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27194 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27195 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27196 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27197 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27198 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27199 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27200 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27201 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27202
27203 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27204 "__builtin_ia32_vec_ext_v4hi",
27205 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27206
27207 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27208 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27209
27210 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27211 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
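/* An illustrative, user-level sketch for the vec_extract builtins above
   (comment only, not compiled).  Scalar-extract intrinsics such as
   _mm_cvtss_f32 are commonly implemented on top of
   __builtin_ia32_vec_ext_v4sf with element index 0; the helper name is
   invented for the example:

       #include <xmmintrin.h>

       float
       first_lane (__m128 v)
       {
         return _mm_cvtss_f32 (v);
       }

   Build with -msse.  */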
27212
27213 /* Access to the vec_set patterns. */
27214 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27215 "__builtin_ia32_vec_set_v2di",
27216 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27217
27218 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27219 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27220
27221 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27222 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27223
27224 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27225 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27226
27227 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27228 "__builtin_ia32_vec_set_v4hi",
27229 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27230
27231 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27232 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27233
27234 /* Add the FMA4 and XOP multi-arg builtin instructions.  */
27235 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27236 {
27237 if (d->name == 0)
27238 continue;
27239
27240 ftype = (enum ix86_builtin_func_type) d->flag;
27241 def_builtin_const (d->mask, d->name, ftype, d->code);
27242 }
27243 }
27244
27245 /* Internal helper for ix86_init_builtins.  */
27246
27247 static void
27248 ix86_init_builtins_va_builtins_abi (void)
27249 {
27250 tree ms_va_ref, sysv_va_ref;
27251 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27252 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27253 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27254 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27255
27256 if (!TARGET_64BIT)
27257 return;
27258 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27259 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27260 ms_va_ref = build_reference_type (ms_va_list_type_node);
27261 sysv_va_ref =
27262 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27263
27264 fnvoid_va_end_ms =
27265 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27266 fnvoid_va_start_ms =
27267 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27268 fnvoid_va_end_sysv =
27269 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27270 fnvoid_va_start_sysv =
27271 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27272 NULL_TREE);
27273 fnvoid_va_copy_ms =
27274 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27275 NULL_TREE);
27276 fnvoid_va_copy_sysv =
27277 build_function_type_list (void_type_node, sysv_va_ref,
27278 sysv_va_ref, NULL_TREE);
27279
27280 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27281 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27282 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27283 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27284 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27285 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27286 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27287 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27288 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27289 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27290 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27291 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27292 }
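/* An illustrative, user-level sketch for the ms_abi varargs builtins
   registered above (comment only, not compiled).  A 64-bit SysV
   translation unit can define an ms_abi varargs function using
   __builtin_ms_va_list and the __builtin_ms_va_* builtins; fetching the
   arguments with the generic __builtin_va_arg is assumed to work here as
   it does for other char*-style va_lists, and the function name is
   invented for the example:

       int __attribute__ ((ms_abi))
       sum_ms (int count, ...)
       {
         __builtin_ms_va_list ap;
         int i, total = 0;

         __builtin_ms_va_start (ap, count);
         for (i = 0; i < count; i++)
           total += __builtin_va_arg (ap, int);
         __builtin_ms_va_end (ap);
         return total;
       }

   Only TARGET_64BIT compilations register these builtins, matching the
   early return at the top of the function.  */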
27293
27294 static void
27295 ix86_init_builtin_types (void)
27296 {
27297 tree float128_type_node, float80_type_node;
27298
27299 /* The __float80 type. */
27300 float80_type_node = long_double_type_node;
27301 if (TYPE_MODE (float80_type_node) != XFmode)
27302 {
27303 /* The __float80 type. */
27304 float80_type_node = make_node (REAL_TYPE);
27305
27306 TYPE_PRECISION (float80_type_node) = 80;
27307 layout_type (float80_type_node);
27308 }
27309 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27310
27311 /* The __float128 type. */
27312 float128_type_node = make_node (REAL_TYPE);
27313 TYPE_PRECISION (float128_type_node) = 128;
27314 layout_type (float128_type_node);
27315 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27316
27317 /* This macro is built by i386-builtin-types.awk. */
27318 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27319 }
27320
27321 static void
27322 ix86_init_builtins (void)
27323 {
27324 tree t;
27325
27326 ix86_init_builtin_types ();
27327
27328 /* TFmode support builtins. */
27329 def_builtin_const (0, "__builtin_infq",
27330 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27331 def_builtin_const (0, "__builtin_huge_valq",
27332 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27333
27334 /* We will expand them to a normal call if SSE2 isn't available, since
27335 they are used by libgcc.  */
27336 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27337 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27338 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27339 TREE_READONLY (t) = 1;
27340 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27341
27342 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27343 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27344 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27345 TREE_READONLY (t) = 1;
27346 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
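/* An illustrative, user-level sketch for the TFmode builtins registered
   above (comment only, not compiled).  The clamping policy below is
   invented purely for the example; the point is that __builtin_infq,
   __builtin_fabsq and __builtin_copysignq operate on __float128 values:

       __float128
       clamp_infinities (__float128 x)
       {
         if (__builtin_fabsq (x) == __builtin_infq ())
           return __builtin_copysignq (1.0Q, x);
         return x;
       }

   Without SSE2 the fabs/copysign builtins fall back to the __fabstf2 and
   __copysigntf3 libgcc routines named above.  */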
27347
27348 ix86_init_tm_builtins ();
27349 ix86_init_mmx_sse_builtins ();
27350
27351 if (TARGET_LP64)
27352 ix86_init_builtins_va_builtins_abi ();
27353
27354 #ifdef SUBTARGET_INIT_BUILTINS
27355 SUBTARGET_INIT_BUILTINS;
27356 #endif
27357 }
27358
27359 /* Return the ix86 builtin for CODE. */
27360
27361 static tree
27362 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27363 {
27364 if (code >= IX86_BUILTIN_MAX)
27365 return error_mark_node;
27366
27367 return ix86_builtins[code];
27368 }
27369
27370 /* Errors in the source file can cause expand_expr to return const0_rtx
27371 where we expect a vector. To avoid crashing, use one of the vector
27372 clear instructions. */
27373 static rtx
27374 safe_vector_operand (rtx x, enum machine_mode mode)
27375 {
27376 if (x == const0_rtx)
27377 x = CONST0_RTX (mode);
27378 return x;
27379 }
27380
27381 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27382
27383 static rtx
27384 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27385 {
27386 rtx pat;
27387 tree arg0 = CALL_EXPR_ARG (exp, 0);
27388 tree arg1 = CALL_EXPR_ARG (exp, 1);
27389 rtx op0 = expand_normal (arg0);
27390 rtx op1 = expand_normal (arg1);
27391 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27392 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27393 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27394
27395 if (VECTOR_MODE_P (mode0))
27396 op0 = safe_vector_operand (op0, mode0);
27397 if (VECTOR_MODE_P (mode1))
27398 op1 = safe_vector_operand (op1, mode1);
27399
27400 if (optimize || !target
27401 || GET_MODE (target) != tmode
27402 || !insn_data[icode].operand[0].predicate (target, tmode))
27403 target = gen_reg_rtx (tmode);
27404
27405 if (GET_MODE (op1) == SImode && mode1 == TImode)
27406 {
27407 rtx x = gen_reg_rtx (V4SImode);
27408 emit_insn (gen_sse2_loadd (x, op1));
27409 op1 = gen_lowpart (TImode, x);
27410 }
27411
27412 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27413 op0 = copy_to_mode_reg (mode0, op0);
27414 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27415 op1 = copy_to_mode_reg (mode1, op1);
27416
27417 pat = GEN_FCN (icode) (target, op0, op1);
27418 if (! pat)
27419 return 0;
27420
27421 emit_insn (pat);
27422
27423 return target;
27424 }
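/* An illustrative sketch of how a caller might dispatch into
   ix86_expand_binop_builtin (comment only, not compiled).  The enumerator
   below is hypothetical; most two-operand builtins are in practice driven
   from the bdesc_args table, but a direct case in ix86_expand_builtin's
   switch would look roughly like:

       case IX86_BUILTIN_EXAMPLE_BINOP:
         return ix86_expand_binop_builtin (CODE_FOR_addv8hi3, exp, target);

   The routine expands the two call arguments, forces them into registers
   when the insn predicates require it, and emits the named pattern with
   the (possibly freshly created) TARGET as operand 0.  */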
27425
27426 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27427
27428 static rtx
27429 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27430 enum ix86_builtin_func_type m_type,
27431 enum rtx_code sub_code)
27432 {
27433 rtx pat;
27434 int i;
27435 int nargs;
27436 bool comparison_p = false;
27437 bool tf_p = false;
27438 bool last_arg_constant = false;
27439 int num_memory = 0;
27440 struct {
27441 rtx op;
27442 enum machine_mode mode;
27443 } args[4];
27444
27445 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27446
27447 switch (m_type)
27448 {
27449 case MULTI_ARG_4_DF2_DI_I:
27450 case MULTI_ARG_4_DF2_DI_I1:
27451 case MULTI_ARG_4_SF2_SI_I:
27452 case MULTI_ARG_4_SF2_SI_I1:
27453 nargs = 4;
27454 last_arg_constant = true;
27455 break;
27456
27457 case MULTI_ARG_3_SF:
27458 case MULTI_ARG_3_DF:
27459 case MULTI_ARG_3_SF2:
27460 case MULTI_ARG_3_DF2:
27461 case MULTI_ARG_3_DI:
27462 case MULTI_ARG_3_SI:
27463 case MULTI_ARG_3_SI_DI:
27464 case MULTI_ARG_3_HI:
27465 case MULTI_ARG_3_HI_SI:
27466 case MULTI_ARG_3_QI:
27467 case MULTI_ARG_3_DI2:
27468 case MULTI_ARG_3_SI2:
27469 case MULTI_ARG_3_HI2:
27470 case MULTI_ARG_3_QI2:
27471 nargs = 3;
27472 break;
27473
27474 case MULTI_ARG_2_SF:
27475 case MULTI_ARG_2_DF:
27476 case MULTI_ARG_2_DI:
27477 case MULTI_ARG_2_SI:
27478 case MULTI_ARG_2_HI:
27479 case MULTI_ARG_2_QI:
27480 nargs = 2;
27481 break;
27482
27483 case MULTI_ARG_2_DI_IMM:
27484 case MULTI_ARG_2_SI_IMM:
27485 case MULTI_ARG_2_HI_IMM:
27486 case MULTI_ARG_2_QI_IMM:
27487 nargs = 2;
27488 last_arg_constant = true;
27489 break;
27490
27491 case MULTI_ARG_1_SF:
27492 case MULTI_ARG_1_DF:
27493 case MULTI_ARG_1_SF2:
27494 case MULTI_ARG_1_DF2:
27495 case MULTI_ARG_1_DI:
27496 case MULTI_ARG_1_SI:
27497 case MULTI_ARG_1_HI:
27498 case MULTI_ARG_1_QI:
27499 case MULTI_ARG_1_SI_DI:
27500 case MULTI_ARG_1_HI_DI:
27501 case MULTI_ARG_1_HI_SI:
27502 case MULTI_ARG_1_QI_DI:
27503 case MULTI_ARG_1_QI_SI:
27504 case MULTI_ARG_1_QI_HI:
27505 nargs = 1;
27506 break;
27507
27508 case MULTI_ARG_2_DI_CMP:
27509 case MULTI_ARG_2_SI_CMP:
27510 case MULTI_ARG_2_HI_CMP:
27511 case MULTI_ARG_2_QI_CMP:
27512 nargs = 2;
27513 comparison_p = true;
27514 break;
27515
27516 case MULTI_ARG_2_SF_TF:
27517 case MULTI_ARG_2_DF_TF:
27518 case MULTI_ARG_2_DI_TF:
27519 case MULTI_ARG_2_SI_TF:
27520 case MULTI_ARG_2_HI_TF:
27521 case MULTI_ARG_2_QI_TF:
27522 nargs = 2;
27523 tf_p = true;
27524 break;
27525
27526 default:
27527 gcc_unreachable ();
27528 }
27529
27530 if (optimize || !target
27531 || GET_MODE (target) != tmode
27532 || !insn_data[icode].operand[0].predicate (target, tmode))
27533 target = gen_reg_rtx (tmode);
27534
27535 gcc_assert (nargs <= 4);
27536
27537 for (i = 0; i < nargs; i++)
27538 {
27539 tree arg = CALL_EXPR_ARG (exp, i);
27540 rtx op = expand_normal (arg);
27541 int adjust = (comparison_p) ? 1 : 0;
27542 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27543
27544 if (last_arg_constant && i == nargs - 1)
27545 {
27546 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27547 {
27548 enum insn_code new_icode = icode;
27549 switch (icode)
27550 {
27551 case CODE_FOR_xop_vpermil2v2df3:
27552 case CODE_FOR_xop_vpermil2v4sf3:
27553 case CODE_FOR_xop_vpermil2v4df3:
27554 case CODE_FOR_xop_vpermil2v8sf3:
27555 error ("the last argument must be a 2-bit immediate");
27556 return gen_reg_rtx (tmode);
27557 case CODE_FOR_xop_rotlv2di3:
27558 new_icode = CODE_FOR_rotlv2di3;
27559 goto xop_rotl;
27560 case CODE_FOR_xop_rotlv4si3:
27561 new_icode = CODE_FOR_rotlv4si3;
27562 goto xop_rotl;
27563 case CODE_FOR_xop_rotlv8hi3:
27564 new_icode = CODE_FOR_rotlv8hi3;
27565 goto xop_rotl;
27566 case CODE_FOR_xop_rotlv16qi3:
27567 new_icode = CODE_FOR_rotlv16qi3;
27568 xop_rotl:
27569 if (CONST_INT_P (op))
27570 {
27571 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27572 op = GEN_INT (INTVAL (op) & mask);
27573 gcc_checking_assert
27574 (insn_data[icode].operand[i + 1].predicate (op, mode));
27575 }
27576 else
27577 {
27578 gcc_checking_assert
27579 (nargs == 2
27580 && insn_data[new_icode].operand[0].mode == tmode
27581 && insn_data[new_icode].operand[1].mode == tmode
27582 && insn_data[new_icode].operand[2].mode == mode
27583 && insn_data[new_icode].operand[0].predicate
27584 == insn_data[icode].operand[0].predicate
27585 && insn_data[new_icode].operand[1].predicate
27586 == insn_data[icode].operand[1].predicate);
27587 icode = new_icode;
27588 goto non_constant;
27589 }
27590 break;
27591 default:
27592 gcc_unreachable ();
27593 }
27594 }
27595 }
27596 else
27597 {
27598 non_constant:
27599 if (VECTOR_MODE_P (mode))
27600 op = safe_vector_operand (op, mode);
27601
27602 /* If we aren't optimizing, only allow one memory operand to be
27603 generated. */
27604 if (memory_operand (op, mode))
27605 num_memory++;
27606
27607 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27608
27609 if (optimize
27610 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27611 || num_memory > 1)
27612 op = force_reg (mode, op);
27613 }
27614
27615 args[i].op = op;
27616 args[i].mode = mode;
27617 }
27618
27619 switch (nargs)
27620 {
27621 case 1:
27622 pat = GEN_FCN (icode) (target, args[0].op);
27623 break;
27624
27625 case 2:
27626 if (tf_p)
27627 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27628 GEN_INT ((int)sub_code));
27629 else if (! comparison_p)
27630 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27631 else
27632 {
27633 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27634 args[0].op,
27635 args[1].op);
27636
27637 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27638 }
27639 break;
27640
27641 case 3:
27642 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27643 break;
27644
27645 case 4:
27646 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27647 break;
27648
27649 default:
27650 gcc_unreachable ();
27651 }
27652
27653 if (! pat)
27654 return 0;
27655
27656 emit_insn (pat);
27657 return target;
27658 }
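/* Illustrative sketch, not part of the original file: an XOP builtin that
   goes through ix86_expand_multi_arg_builtin.  The intrinsic is an
   assumption chosen for illustration; _mm_roti_epi32 is the XOP
   rotate-by-immediate, so its count is a last_arg_constant operand and a
   constant count is masked by the xop_rotl handling above.  Compile with
   -mxop.  */

#include <x86intrin.h>

__m128i
example_rotate_left_8 (__m128i a)
{
  return _mm_roti_epi32 (a, 8);
}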
27659
27660 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27661 insns with vec_merge. */
27662
27663 static rtx
27664 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27665 rtx target)
27666 {
27667 rtx pat;
27668 tree arg0 = CALL_EXPR_ARG (exp, 0);
27669 rtx op1, op0 = expand_normal (arg0);
27670 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27671 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27672
27673 if (optimize || !target
27674 || GET_MODE (target) != tmode
27675 || !insn_data[icode].operand[0].predicate (target, tmode))
27676 target = gen_reg_rtx (tmode);
27677
27678 if (VECTOR_MODE_P (mode0))
27679 op0 = safe_vector_operand (op0, mode0);
27680
27681 if ((optimize && !register_operand (op0, mode0))
27682 || !insn_data[icode].operand[1].predicate (op0, mode0))
27683 op0 = copy_to_mode_reg (mode0, op0);
27684
27685 op1 = op0;
27686 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27687 op1 = copy_to_mode_reg (mode0, op1);
27688
27689 pat = GEN_FCN (icode) (target, op0, op1);
27690 if (! pat)
27691 return 0;
27692 emit_insn (pat);
27693 return target;
27694 }
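/* Illustrative sketch, not part of the original file: a scalar unop with
   vec_merge.  The intrinsic is an assumption chosen for illustration;
   _mm_sqrt_ss expands to __builtin_ia32_sqrtss, so the single source is
   duplicated into both pattern operands above and the upper three elements
   pass through unchanged.  Compile with -msse.  */

#include <xmmintrin.h>

__m128
example_sqrt_low (__m128 a)
{
  return _mm_sqrt_ss (a);
}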
27695
27696 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27697
27698 static rtx
27699 ix86_expand_sse_compare (const struct builtin_description *d,
27700 tree exp, rtx target, bool swap)
27701 {
27702 rtx pat;
27703 tree arg0 = CALL_EXPR_ARG (exp, 0);
27704 tree arg1 = CALL_EXPR_ARG (exp, 1);
27705 rtx op0 = expand_normal (arg0);
27706 rtx op1 = expand_normal (arg1);
27707 rtx op2;
27708 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27709 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27710 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27711 enum rtx_code comparison = d->comparison;
27712
27713 if (VECTOR_MODE_P (mode0))
27714 op0 = safe_vector_operand (op0, mode0);
27715 if (VECTOR_MODE_P (mode1))
27716 op1 = safe_vector_operand (op1, mode1);
27717
27718 /* Swap operands if we have a comparison that isn't available in
27719 hardware. */
27720 if (swap)
27721 {
27722 rtx tmp = gen_reg_rtx (mode1);
27723 emit_move_insn (tmp, op1);
27724 op1 = op0;
27725 op0 = tmp;
27726 }
27727
27728 if (optimize || !target
27729 || GET_MODE (target) != tmode
27730 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27731 target = gen_reg_rtx (tmode);
27732
27733 if ((optimize && !register_operand (op0, mode0))
27734 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27735 op0 = copy_to_mode_reg (mode0, op0);
27736 if ((optimize && !register_operand (op1, mode1))
27737 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27738 op1 = copy_to_mode_reg (mode1, op1);
27739
27740 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27741 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27742 if (! pat)
27743 return 0;
27744 emit_insn (pat);
27745 return target;
27746 }
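/* Illustrative sketch, not part of the original file: the swapped-operand
   comparison path.  The intrinsic is an assumption chosen for
   illustration; _mm_cmpgt_ps expands to __builtin_ia32_cmpgtps, and since
   SSE cmpps only encodes the LT/LE forms directly, its descriptor requests
   swap == true above.  Compile with -msse.  */

#include <xmmintrin.h>

__m128
example_greater_mask (__m128 a, __m128 b)
{
  return _mm_cmpgt_ps (a, b);
}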
27747
27748 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
27749
27750 static rtx
27751 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27752 rtx target)
27753 {
27754 rtx pat;
27755 tree arg0 = CALL_EXPR_ARG (exp, 0);
27756 tree arg1 = CALL_EXPR_ARG (exp, 1);
27757 rtx op0 = expand_normal (arg0);
27758 rtx op1 = expand_normal (arg1);
27759 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27760 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27761 enum rtx_code comparison = d->comparison;
27762
27763 if (VECTOR_MODE_P (mode0))
27764 op0 = safe_vector_operand (op0, mode0);
27765 if (VECTOR_MODE_P (mode1))
27766 op1 = safe_vector_operand (op1, mode1);
27767
27768 /* Swap operands if we have a comparison that isn't available in
27769 hardware. */
27770 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27771 {
27772 rtx tmp = op1;
27773 op1 = op0;
27774 op0 = tmp;
27775 }
27776
27777 target = gen_reg_rtx (SImode);
27778 emit_move_insn (target, const0_rtx);
27779 target = gen_rtx_SUBREG (QImode, target, 0);
27780
27781 if ((optimize && !register_operand (op0, mode0))
27782 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27783 op0 = copy_to_mode_reg (mode0, op0);
27784 if ((optimize && !register_operand (op1, mode1))
27785 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27786 op1 = copy_to_mode_reg (mode1, op1);
27787
27788 pat = GEN_FCN (d->icode) (op0, op1);
27789 if (! pat)
27790 return 0;
27791 emit_insn (pat);
27792 emit_insn (gen_rtx_SET (VOIDmode,
27793 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27794 gen_rtx_fmt_ee (comparison, QImode,
27795 SET_DEST (pat),
27796 const0_rtx)));
27797
27798 return SUBREG_REG (target);
27799 }
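/* Illustrative sketch, not part of the original file: a comi builtin whose
   flags result is materialized as above (zeroed SImode register, QImode
   STRICT_LOW_PART set from the comparison).  The intrinsic is an
   assumption chosen for illustration; _mm_comilt_ss expands to
   __builtin_ia32_comilt.  Compile with -msse.  */

#include <xmmintrin.h>

int
example_scalar_less (__m128 a, __m128 b)
{
  return _mm_comilt_ss (a, b);
}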
27800
27801 /* Subroutine of ix86_expand_args_builtin to take care of round insns. */
27802
27803 static rtx
27804 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
27805 rtx target)
27806 {
27807 rtx pat;
27808 tree arg0 = CALL_EXPR_ARG (exp, 0);
27809 rtx op1, op0 = expand_normal (arg0);
27810 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27811 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27812
27813 if (optimize || target == 0
27814 || GET_MODE (target) != tmode
27815 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27816 target = gen_reg_rtx (tmode);
27817
27818 if (VECTOR_MODE_P (mode0))
27819 op0 = safe_vector_operand (op0, mode0);
27820
27821 if ((optimize && !register_operand (op0, mode0))
27822 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27823 op0 = copy_to_mode_reg (mode0, op0);
27824
27825 op1 = GEN_INT (d->comparison);
27826
27827 pat = GEN_FCN (d->icode) (target, op0, op1);
27828 if (! pat)
27829 return 0;
27830 emit_insn (pat);
27831 return target;
27832 }
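/* Illustrative sketch, not part of the original file: a *_ROUND builtin.
   Calling the builtin directly is an assumption for illustration;
   __builtin_ia32_floorpd carries its rounding mode in d->comparison, which
   is emitted above as the trailing immediate operand.  Compile with
   -msse4.1.  */

typedef double example_v2df __attribute__ ((__vector_size__ (16)));

example_v2df
example_floor (example_v2df x)
{
  return __builtin_ia32_floorpd (x);
}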
27833
27834 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
27835
27836 static rtx
27837 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
27838 rtx target)
27839 {
27840 rtx pat;
27841 tree arg0 = CALL_EXPR_ARG (exp, 0);
27842 tree arg1 = CALL_EXPR_ARG (exp, 1);
27843 rtx op0 = expand_normal (arg0);
27844 rtx op1 = expand_normal (arg1);
27845 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27846 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27847 enum rtx_code comparison = d->comparison;
27848
27849 if (VECTOR_MODE_P (mode0))
27850 op0 = safe_vector_operand (op0, mode0);
27851 if (VECTOR_MODE_P (mode1))
27852 op1 = safe_vector_operand (op1, mode1);
27853
27854 target = gen_reg_rtx (SImode);
27855 emit_move_insn (target, const0_rtx);
27856 target = gen_rtx_SUBREG (QImode, target, 0);
27857
27858 if ((optimize && !register_operand (op0, mode0))
27859 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27860 op0 = copy_to_mode_reg (mode0, op0);
27861 if ((optimize && !register_operand (op1, mode1))
27862 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27863 op1 = copy_to_mode_reg (mode1, op1);
27864
27865 pat = GEN_FCN (d->icode) (op0, op1);
27866 if (! pat)
27867 return 0;
27868 emit_insn (pat);
27869 emit_insn (gen_rtx_SET (VOIDmode,
27870 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27871 gen_rtx_fmt_ee (comparison, QImode,
27872 SET_DEST (pat),
27873 const0_rtx)));
27874
27875 return SUBREG_REG (target);
27876 }
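/* Illustrative sketch, not part of the original file: a ptest builtin.
   The intrinsic is an assumption chosen for illustration; _mm_testz_si128
   expands to __builtin_ia32_ptestz128, so the condition flag named by
   d->comparison is converted to 0 or 1 as above.  Compile with
   -msse4.1.  */

#include <smmintrin.h>

int
example_masked_bits_all_zero (__m128i value, __m128i mask)
{
  return _mm_testz_si128 (value, mask);
}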
27877
27878 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
27879
27880 static rtx
27881 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
27882 tree exp, rtx target)
27883 {
27884 rtx pat;
27885 tree arg0 = CALL_EXPR_ARG (exp, 0);
27886 tree arg1 = CALL_EXPR_ARG (exp, 1);
27887 tree arg2 = CALL_EXPR_ARG (exp, 2);
27888 tree arg3 = CALL_EXPR_ARG (exp, 3);
27889 tree arg4 = CALL_EXPR_ARG (exp, 4);
27890 rtx scratch0, scratch1;
27891 rtx op0 = expand_normal (arg0);
27892 rtx op1 = expand_normal (arg1);
27893 rtx op2 = expand_normal (arg2);
27894 rtx op3 = expand_normal (arg3);
27895 rtx op4 = expand_normal (arg4);
27896 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
27897
27898 tmode0 = insn_data[d->icode].operand[0].mode;
27899 tmode1 = insn_data[d->icode].operand[1].mode;
27900 modev2 = insn_data[d->icode].operand[2].mode;
27901 modei3 = insn_data[d->icode].operand[3].mode;
27902 modev4 = insn_data[d->icode].operand[4].mode;
27903 modei5 = insn_data[d->icode].operand[5].mode;
27904 modeimm = insn_data[d->icode].operand[6].mode;
27905
27906 if (VECTOR_MODE_P (modev2))
27907 op0 = safe_vector_operand (op0, modev2);
27908 if (VECTOR_MODE_P (modev4))
27909 op2 = safe_vector_operand (op2, modev4);
27910
27911 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
27912 op0 = copy_to_mode_reg (modev2, op0);
27913 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
27914 op1 = copy_to_mode_reg (modei3, op1);
27915 if ((optimize && !register_operand (op2, modev4))
27916 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
27917 op2 = copy_to_mode_reg (modev4, op2);
27918 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
27919 op3 = copy_to_mode_reg (modei5, op3);
27920
27921 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
27922 {
27923 error ("the fifth argument must be an 8-bit immediate");
27924 return const0_rtx;
27925 }
27926
27927 if (d->code == IX86_BUILTIN_PCMPESTRI128)
27928 {
27929 if (optimize || !target
27930 || GET_MODE (target) != tmode0
27931 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
27932 target = gen_reg_rtx (tmode0);
27933
27934 scratch1 = gen_reg_rtx (tmode1);
27935
27936 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
27937 }
27938 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
27939 {
27940 if (optimize || !target
27941 || GET_MODE (target) != tmode1
27942 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
27943 target = gen_reg_rtx (tmode1);
27944
27945 scratch0 = gen_reg_rtx (tmode0);
27946
27947 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
27948 }
27949 else
27950 {
27951 gcc_assert (d->flag);
27952
27953 scratch0 = gen_reg_rtx (tmode0);
27954 scratch1 = gen_reg_rtx (tmode1);
27955
27956 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
27957 }
27958
27959 if (! pat)
27960 return 0;
27961
27962 emit_insn (pat);
27963
27964 if (d->flag)
27965 {
27966 target = gen_reg_rtx (SImode);
27967 emit_move_insn (target, const0_rtx);
27968 target = gen_rtx_SUBREG (QImode, target, 0);
27969
27970 emit_insn
27971 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27972 gen_rtx_fmt_ee (EQ, QImode,
27973 gen_rtx_REG ((enum machine_mode) d->flag,
27974 FLAGS_REG),
27975 const0_rtx)));
27976 return SUBREG_REG (target);
27977 }
27978 else
27979 return target;
27980 }
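/* Illustrative sketch, not part of the original file: a pcmpestr builtin
   with explicit lengths.  The intrinsic is an assumption chosen for
   illustration; _mm_cmpestri expands to __builtin_ia32_pcmpestri128, and
   its fifth argument must be a literal to satisfy the 8-bit-immediate
   check above.  Compile with -msse4.2.  */

#include <nmmintrin.h>

int
example_find_any (__m128i set, int set_len, __m128i text, int text_len)
{
  return _mm_cmpestri (set, set_len, text, text_len,
                       _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
}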
27981
27982
27983 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
27984
27985 static rtx
27986 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
27987 tree exp, rtx target)
27988 {
27989 rtx pat;
27990 tree arg0 = CALL_EXPR_ARG (exp, 0);
27991 tree arg1 = CALL_EXPR_ARG (exp, 1);
27992 tree arg2 = CALL_EXPR_ARG (exp, 2);
27993 rtx scratch0, scratch1;
27994 rtx op0 = expand_normal (arg0);
27995 rtx op1 = expand_normal (arg1);
27996 rtx op2 = expand_normal (arg2);
27997 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
27998
27999 tmode0 = insn_data[d->icode].operand[0].mode;
28000 tmode1 = insn_data[d->icode].operand[1].mode;
28001 modev2 = insn_data[d->icode].operand[2].mode;
28002 modev3 = insn_data[d->icode].operand[3].mode;
28003 modeimm = insn_data[d->icode].operand[4].mode;
28004
28005 if (VECTOR_MODE_P (modev2))
28006 op0 = safe_vector_operand (op0, modev2);
28007 if (VECTOR_MODE_P (modev3))
28008 op1 = safe_vector_operand (op1, modev3);
28009
28010 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28011 op0 = copy_to_mode_reg (modev2, op0);
28012 if ((optimize && !register_operand (op1, modev3))
28013 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28014 op1 = copy_to_mode_reg (modev3, op1);
28015
28016 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28017 {
28018 error ("the third argument must be an 8-bit immediate");
28019 return const0_rtx;
28020 }
28021
28022 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28023 {
28024 if (optimize || !target
28025 || GET_MODE (target) != tmode0
28026 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28027 target = gen_reg_rtx (tmode0);
28028
28029 scratch1 = gen_reg_rtx (tmode1);
28030
28031 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28032 }
28033 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28034 {
28035 if (optimize || !target
28036 || GET_MODE (target) != tmode1
28037 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28038 target = gen_reg_rtx (tmode1);
28039
28040 scratch0 = gen_reg_rtx (tmode0);
28041
28042 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28043 }
28044 else
28045 {
28046 gcc_assert (d->flag);
28047
28048 scratch0 = gen_reg_rtx (tmode0);
28049 scratch1 = gen_reg_rtx (tmode1);
28050
28051 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28052 }
28053
28054 if (! pat)
28055 return 0;
28056
28057 emit_insn (pat);
28058
28059 if (d->flag)
28060 {
28061 target = gen_reg_rtx (SImode);
28062 emit_move_insn (target, const0_rtx);
28063 target = gen_rtx_SUBREG (QImode, target, 0);
28064
28065 emit_insn
28066 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28067 gen_rtx_fmt_ee (EQ, QImode,
28068 gen_rtx_REG ((enum machine_mode) d->flag,
28069 FLAGS_REG),
28070 const0_rtx)));
28071 return SUBREG_REG (target);
28072 }
28073 else
28074 return target;
28075 }
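/* Illustrative sketch, not part of the original file: the implicit-length
   variant.  The intrinsic is an assumption chosen for illustration;
   _mm_cmpistri expands to __builtin_ia32_pcmpistri128 and takes the
   IX86_BUILTIN_PCMPISTRI128 path above, while the _mm_cmpistrz family uses
   the d->flag path that reads a single condition flag.  Compile with
   -msse4.2.  */

#include <nmmintrin.h>

int
example_first_digit (__m128i chunk)
{
  /* Index within CHUNK of the first byte that appears in DIGITS.  */
  const __m128i digits = _mm_setr_epi8 ('0', '1', '2', '3', '4', '5', '6',
                                         '7', '8', '9', 0, 0, 0, 0, 0, 0);
  return _mm_cmpistri (digits, chunk,
                       _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
}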
28076
28077 /* Subroutine of ix86_expand_builtin to take care of insns with
28078 variable number of operands. */
28079
28080 static rtx
28081 ix86_expand_args_builtin (const struct builtin_description *d,
28082 tree exp, rtx target)
28083 {
28084 rtx pat, real_target;
28085 unsigned int i, nargs;
28086 unsigned int nargs_constant = 0;
28087 int num_memory = 0;
28088 struct
28089 {
28090 rtx op;
28091 enum machine_mode mode;
28092 } args[4];
28093 bool last_arg_count = false;
28094 enum insn_code icode = d->icode;
28095 const struct insn_data_d *insn_p = &insn_data[icode];
28096 enum machine_mode tmode = insn_p->operand[0].mode;
28097 enum machine_mode rmode = VOIDmode;
28098 bool swap = false;
28099 enum rtx_code comparison = d->comparison;
28100
28101 switch ((enum ix86_builtin_func_type) d->flag)
28102 {
28103 case V2DF_FTYPE_V2DF_ROUND:
28104 case V4DF_FTYPE_V4DF_ROUND:
28105 case V4SF_FTYPE_V4SF_ROUND:
28106 case V8SF_FTYPE_V8SF_ROUND:
28107 return ix86_expand_sse_round (d, exp, target);
28108 case INT_FTYPE_V8SF_V8SF_PTEST:
28109 case INT_FTYPE_V4DI_V4DI_PTEST:
28110 case INT_FTYPE_V4DF_V4DF_PTEST:
28111 case INT_FTYPE_V4SF_V4SF_PTEST:
28112 case INT_FTYPE_V2DI_V2DI_PTEST:
28113 case INT_FTYPE_V2DF_V2DF_PTEST:
28114 return ix86_expand_sse_ptest (d, exp, target);
28115 case FLOAT128_FTYPE_FLOAT128:
28116 case FLOAT_FTYPE_FLOAT:
28117 case INT_FTYPE_INT:
28118 case UINT64_FTYPE_INT:
28119 case UINT16_FTYPE_UINT16:
28120 case INT64_FTYPE_INT64:
28121 case INT64_FTYPE_V4SF:
28122 case INT64_FTYPE_V2DF:
28123 case INT_FTYPE_V16QI:
28124 case INT_FTYPE_V8QI:
28125 case INT_FTYPE_V8SF:
28126 case INT_FTYPE_V4DF:
28127 case INT_FTYPE_V4SF:
28128 case INT_FTYPE_V2DF:
28129 case INT_FTYPE_V32QI:
28130 case V16QI_FTYPE_V16QI:
28131 case V8SI_FTYPE_V8SF:
28132 case V8SI_FTYPE_V4SI:
28133 case V8HI_FTYPE_V8HI:
28134 case V8HI_FTYPE_V16QI:
28135 case V8QI_FTYPE_V8QI:
28136 case V8SF_FTYPE_V8SF:
28137 case V8SF_FTYPE_V8SI:
28138 case V8SF_FTYPE_V4SF:
28139 case V8SF_FTYPE_V8HI:
28140 case V4SI_FTYPE_V4SI:
28141 case V4SI_FTYPE_V16QI:
28142 case V4SI_FTYPE_V4SF:
28143 case V4SI_FTYPE_V8SI:
28144 case V4SI_FTYPE_V8HI:
28145 case V4SI_FTYPE_V4DF:
28146 case V4SI_FTYPE_V2DF:
28147 case V4HI_FTYPE_V4HI:
28148 case V4DF_FTYPE_V4DF:
28149 case V4DF_FTYPE_V4SI:
28150 case V4DF_FTYPE_V4SF:
28151 case V4DF_FTYPE_V2DF:
28152 case V4SF_FTYPE_V4SF:
28153 case V4SF_FTYPE_V4SI:
28154 case V4SF_FTYPE_V8SF:
28155 case V4SF_FTYPE_V4DF:
28156 case V4SF_FTYPE_V8HI:
28157 case V4SF_FTYPE_V2DF:
28158 case V2DI_FTYPE_V2DI:
28159 case V2DI_FTYPE_V16QI:
28160 case V2DI_FTYPE_V8HI:
28161 case V2DI_FTYPE_V4SI:
28162 case V2DF_FTYPE_V2DF:
28163 case V2DF_FTYPE_V4SI:
28164 case V2DF_FTYPE_V4DF:
28165 case V2DF_FTYPE_V4SF:
28166 case V2DF_FTYPE_V2SI:
28167 case V2SI_FTYPE_V2SI:
28168 case V2SI_FTYPE_V4SF:
28169 case V2SI_FTYPE_V2SF:
28170 case V2SI_FTYPE_V2DF:
28171 case V2SF_FTYPE_V2SF:
28172 case V2SF_FTYPE_V2SI:
28173 case V32QI_FTYPE_V32QI:
28174 case V32QI_FTYPE_V16QI:
28175 case V16HI_FTYPE_V16HI:
28176 case V16HI_FTYPE_V8HI:
28177 case V8SI_FTYPE_V8SI:
28178 case V16HI_FTYPE_V16QI:
28179 case V8SI_FTYPE_V16QI:
28180 case V4DI_FTYPE_V16QI:
28181 case V8SI_FTYPE_V8HI:
28182 case V4DI_FTYPE_V8HI:
28183 case V4DI_FTYPE_V4SI:
28184 case V4DI_FTYPE_V2DI:
28185 nargs = 1;
28186 break;
28187 case V4SF_FTYPE_V4SF_VEC_MERGE:
28188 case V2DF_FTYPE_V2DF_VEC_MERGE:
28189 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28190 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28191 case V16QI_FTYPE_V16QI_V16QI:
28192 case V16QI_FTYPE_V8HI_V8HI:
28193 case V8QI_FTYPE_V8QI_V8QI:
28194 case V8QI_FTYPE_V4HI_V4HI:
28195 case V8HI_FTYPE_V8HI_V8HI:
28196 case V8HI_FTYPE_V16QI_V16QI:
28197 case V8HI_FTYPE_V4SI_V4SI:
28198 case V8SF_FTYPE_V8SF_V8SF:
28199 case V8SF_FTYPE_V8SF_V8SI:
28200 case V4SI_FTYPE_V4SI_V4SI:
28201 case V4SI_FTYPE_V8HI_V8HI:
28202 case V4SI_FTYPE_V4SF_V4SF:
28203 case V4SI_FTYPE_V2DF_V2DF:
28204 case V4HI_FTYPE_V4HI_V4HI:
28205 case V4HI_FTYPE_V8QI_V8QI:
28206 case V4HI_FTYPE_V2SI_V2SI:
28207 case V4DF_FTYPE_V4DF_V4DF:
28208 case V4DF_FTYPE_V4DF_V4DI:
28209 case V4SF_FTYPE_V4SF_V4SF:
28210 case V4SF_FTYPE_V4SF_V4SI:
28211 case V4SF_FTYPE_V4SF_V2SI:
28212 case V4SF_FTYPE_V4SF_V2DF:
28213 case V4SF_FTYPE_V4SF_DI:
28214 case V4SF_FTYPE_V4SF_SI:
28215 case V2DI_FTYPE_V2DI_V2DI:
28216 case V2DI_FTYPE_V16QI_V16QI:
28217 case V2DI_FTYPE_V4SI_V4SI:
28218 case V2DI_FTYPE_V2DI_V16QI:
28219 case V2DI_FTYPE_V2DF_V2DF:
28220 case V2SI_FTYPE_V2SI_V2SI:
28221 case V2SI_FTYPE_V4HI_V4HI:
28222 case V2SI_FTYPE_V2SF_V2SF:
28223 case V2DF_FTYPE_V2DF_V2DF:
28224 case V2DF_FTYPE_V2DF_V4SF:
28225 case V2DF_FTYPE_V2DF_V2DI:
28226 case V2DF_FTYPE_V2DF_DI:
28227 case V2DF_FTYPE_V2DF_SI:
28228 case V2SF_FTYPE_V2SF_V2SF:
28229 case V1DI_FTYPE_V1DI_V1DI:
28230 case V1DI_FTYPE_V8QI_V8QI:
28231 case V1DI_FTYPE_V2SI_V2SI:
28232 case V32QI_FTYPE_V16HI_V16HI:
28233 case V16HI_FTYPE_V8SI_V8SI:
28234 case V32QI_FTYPE_V32QI_V32QI:
28235 case V16HI_FTYPE_V32QI_V32QI:
28236 case V16HI_FTYPE_V16HI_V16HI:
28237 case V8SI_FTYPE_V4DF_V4DF:
28238 case V8SI_FTYPE_V8SI_V8SI:
28239 case V8SI_FTYPE_V16HI_V16HI:
28240 case V4DI_FTYPE_V4DI_V4DI:
28241 case V4DI_FTYPE_V8SI_V8SI:
28242 if (comparison == UNKNOWN)
28243 return ix86_expand_binop_builtin (icode, exp, target);
28244 nargs = 2;
28245 break;
28246 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28247 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28248 gcc_assert (comparison != UNKNOWN);
28249 nargs = 2;
28250 swap = true;
28251 break;
28252 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28253 case V16HI_FTYPE_V16HI_SI_COUNT:
28254 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28255 case V8SI_FTYPE_V8SI_SI_COUNT:
28256 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28257 case V4DI_FTYPE_V4DI_INT_COUNT:
28258 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28259 case V8HI_FTYPE_V8HI_SI_COUNT:
28260 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28261 case V4SI_FTYPE_V4SI_SI_COUNT:
28262 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28263 case V4HI_FTYPE_V4HI_SI_COUNT:
28264 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28265 case V2DI_FTYPE_V2DI_SI_COUNT:
28266 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28267 case V2SI_FTYPE_V2SI_SI_COUNT:
28268 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28269 case V1DI_FTYPE_V1DI_SI_COUNT:
28270 nargs = 2;
28271 last_arg_count = true;
28272 break;
28273 case UINT64_FTYPE_UINT64_UINT64:
28274 case UINT_FTYPE_UINT_UINT:
28275 case UINT_FTYPE_UINT_USHORT:
28276 case UINT_FTYPE_UINT_UCHAR:
28277 case UINT16_FTYPE_UINT16_INT:
28278 case UINT8_FTYPE_UINT8_INT:
28279 nargs = 2;
28280 break;
28281 case V2DI_FTYPE_V2DI_INT_CONVERT:
28282 nargs = 2;
28283 rmode = V1TImode;
28284 nargs_constant = 1;
28285 break;
28286 case V4DI_FTYPE_V4DI_INT_CONVERT:
28287 nargs = 2;
28288 rmode = V2TImode;
28289 nargs_constant = 1;
28290 break;
28291 case V8HI_FTYPE_V8HI_INT:
28292 case V8HI_FTYPE_V8SF_INT:
28293 case V8HI_FTYPE_V4SF_INT:
28294 case V8SF_FTYPE_V8SF_INT:
28295 case V4SI_FTYPE_V4SI_INT:
28296 case V4SI_FTYPE_V8SI_INT:
28297 case V4HI_FTYPE_V4HI_INT:
28298 case V4DF_FTYPE_V4DF_INT:
28299 case V4SF_FTYPE_V4SF_INT:
28300 case V4SF_FTYPE_V8SF_INT:
28301 case V2DI_FTYPE_V2DI_INT:
28302 case V2DF_FTYPE_V2DF_INT:
28303 case V2DF_FTYPE_V4DF_INT:
28304 case V16HI_FTYPE_V16HI_INT:
28305 case V8SI_FTYPE_V8SI_INT:
28306 case V4DI_FTYPE_V4DI_INT:
28307 case V2DI_FTYPE_V4DI_INT:
28308 nargs = 2;
28309 nargs_constant = 1;
28310 break;
28311 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28312 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28313 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28314 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28315 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28316 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28317 nargs = 3;
28318 break;
28319 case V32QI_FTYPE_V32QI_V32QI_INT:
28320 case V16HI_FTYPE_V16HI_V16HI_INT:
28321 case V16QI_FTYPE_V16QI_V16QI_INT:
28322 case V4DI_FTYPE_V4DI_V4DI_INT:
28323 case V8HI_FTYPE_V8HI_V8HI_INT:
28324 case V8SI_FTYPE_V8SI_V8SI_INT:
28325 case V8SI_FTYPE_V8SI_V4SI_INT:
28326 case V8SF_FTYPE_V8SF_V8SF_INT:
28327 case V8SF_FTYPE_V8SF_V4SF_INT:
28328 case V4SI_FTYPE_V4SI_V4SI_INT:
28329 case V4DF_FTYPE_V4DF_V4DF_INT:
28330 case V4DF_FTYPE_V4DF_V2DF_INT:
28331 case V4SF_FTYPE_V4SF_V4SF_INT:
28332 case V2DI_FTYPE_V2DI_V2DI_INT:
28333 case V4DI_FTYPE_V4DI_V2DI_INT:
28334 case V2DF_FTYPE_V2DF_V2DF_INT:
28335 nargs = 3;
28336 nargs_constant = 1;
28337 break;
28338 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28339 nargs = 3;
28340 rmode = V4DImode;
28341 nargs_constant = 1;
28342 break;
28343 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28344 nargs = 3;
28345 rmode = V2DImode;
28346 nargs_constant = 1;
28347 break;
28348 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28349 nargs = 3;
28350 rmode = DImode;
28351 nargs_constant = 1;
28352 break;
28353 case V2DI_FTYPE_V2DI_UINT_UINT:
28354 nargs = 3;
28355 nargs_constant = 2;
28356 break;
28357 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28358 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28359 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28360 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28361 nargs = 4;
28362 nargs_constant = 1;
28363 break;
28364 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28365 nargs = 4;
28366 nargs_constant = 2;
28367 break;
28368 default:
28369 gcc_unreachable ();
28370 }
28371
28372 gcc_assert (nargs <= ARRAY_SIZE (args));
28373
28374 if (comparison != UNKNOWN)
28375 {
28376 gcc_assert (nargs == 2);
28377 return ix86_expand_sse_compare (d, exp, target, swap);
28378 }
28379
28380 if (rmode == VOIDmode || rmode == tmode)
28381 {
28382 if (optimize
28383 || target == 0
28384 || GET_MODE (target) != tmode
28385 || !insn_p->operand[0].predicate (target, tmode))
28386 target = gen_reg_rtx (tmode);
28387 real_target = target;
28388 }
28389 else
28390 {
28391 target = gen_reg_rtx (rmode);
28392 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28393 }
28394
28395 for (i = 0; i < nargs; i++)
28396 {
28397 tree arg = CALL_EXPR_ARG (exp, i);
28398 rtx op = expand_normal (arg);
28399 enum machine_mode mode = insn_p->operand[i + 1].mode;
28400 bool match = insn_p->operand[i + 1].predicate (op, mode);
28401
28402 if (last_arg_count && (i + 1) == nargs)
28403 {
28404 /* SIMD shift insns take either an 8-bit immediate or a
28405 register as the count, but the builtin functions take an int.
28406 If the count operand doesn't match, put it in a register. */
28407 if (!match)
28408 {
28409 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28410 if (!insn_p->operand[i + 1].predicate (op, mode))
28411 op = copy_to_reg (op);
28412 }
28413 }
28414 else if ((nargs - i) <= nargs_constant)
28415 {
28416 if (!match)
28417 switch (icode)
28418 {
28419 case CODE_FOR_avx2_inserti128:
28420 case CODE_FOR_avx2_extracti128:
28421 error ("the last argument must be an 1-bit immediate");
28422 return const0_rtx;
28423
28424 case CODE_FOR_sse4_1_roundpd:
28425 case CODE_FOR_sse4_1_roundps:
28426 case CODE_FOR_sse4_1_roundsd:
28427 case CODE_FOR_sse4_1_roundss:
28428 case CODE_FOR_sse4_1_blendps:
28429 case CODE_FOR_avx_blendpd256:
28430 case CODE_FOR_avx_vpermilv4df:
28431 case CODE_FOR_avx_roundpd256:
28432 case CODE_FOR_avx_roundps256:
28433 error ("the last argument must be a 4-bit immediate");
28434 return const0_rtx;
28435
28436 case CODE_FOR_sse4_1_blendpd:
28437 case CODE_FOR_avx_vpermilv2df:
28438 case CODE_FOR_xop_vpermil2v2df3:
28439 case CODE_FOR_xop_vpermil2v4sf3:
28440 case CODE_FOR_xop_vpermil2v4df3:
28441 case CODE_FOR_xop_vpermil2v8sf3:
28442 error ("the last argument must be a 2-bit immediate");
28443 return const0_rtx;
28444
28445 case CODE_FOR_avx_vextractf128v4df:
28446 case CODE_FOR_avx_vextractf128v8sf:
28447 case CODE_FOR_avx_vextractf128v8si:
28448 case CODE_FOR_avx_vinsertf128v4df:
28449 case CODE_FOR_avx_vinsertf128v8sf:
28450 case CODE_FOR_avx_vinsertf128v8si:
28451 error ("the last argument must be a 1-bit immediate");
28452 return const0_rtx;
28453
28454 case CODE_FOR_avx_vmcmpv2df3:
28455 case CODE_FOR_avx_vmcmpv4sf3:
28456 case CODE_FOR_avx_cmpv2df3:
28457 case CODE_FOR_avx_cmpv4sf3:
28458 case CODE_FOR_avx_cmpv4df3:
28459 case CODE_FOR_avx_cmpv8sf3:
28460 error ("the last argument must be a 5-bit immediate");
28461 return const0_rtx;
28462
28463 default:
28464 switch (nargs_constant)
28465 {
28466 case 2:
28467 if ((nargs - i) == nargs_constant)
28468 {
28469 error ("the next to last argument must be an 8-bit immediate");
28470 break;
28471 }
28472 case 1:
28473 error ("the last argument must be an 8-bit immediate");
28474 break;
28475 default:
28476 gcc_unreachable ();
28477 }
28478 return const0_rtx;
28479 }
28480 }
28481 else
28482 {
28483 if (VECTOR_MODE_P (mode))
28484 op = safe_vector_operand (op, mode);
28485
28486 /* If we aren't optimizing, only allow one memory operand to
28487 be generated. */
28488 if (memory_operand (op, mode))
28489 num_memory++;
28490
28491 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28492 {
28493 if (optimize || !match || num_memory > 1)
28494 op = copy_to_mode_reg (mode, op);
28495 }
28496 else
28497 {
28498 op = copy_to_reg (op);
28499 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28500 }
28501 }
28502
28503 args[i].op = op;
28504 args[i].mode = mode;
28505 }
28506
28507 switch (nargs)
28508 {
28509 case 1:
28510 pat = GEN_FCN (icode) (real_target, args[0].op);
28511 break;
28512 case 2:
28513 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28514 break;
28515 case 3:
28516 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28517 args[2].op);
28518 break;
28519 case 4:
28520 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28521 args[2].op, args[3].op);
28522 break;
28523 default:
28524 gcc_unreachable ();
28525 }
28526
28527 if (! pat)
28528 return 0;
28529
28530 emit_insn (pat);
28531 return target;
28532 }
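/* Illustrative sketch, not part of the original file: a builtin with a
   checked immediate.  The intrinsic is an assumption chosen for
   illustration; _mm_shuffle_epi32 expands to __builtin_ia32_pshufd with
   nargs_constant == 1, so a selector that is not a literal triggers the
   "the last argument must be an 8-bit immediate" error above.  Compile
   with -msse2.  */

#include <emmintrin.h>

__m128i
example_reverse_lanes (__m128i a)
{
  /* Selector 0x1B = 00 01 10 11 picks elements 3, 2, 1, 0.  */
  return _mm_shuffle_epi32 (a, 0x1B);
}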
28533
28534 /* Subroutine of ix86_expand_builtin to take care of special insns
28535 with variable number of operands. */
28536
28537 static rtx
28538 ix86_expand_special_args_builtin (const struct builtin_description *d,
28539 tree exp, rtx target)
28540 {
28541 tree arg;
28542 rtx pat, op;
28543 unsigned int i, nargs, arg_adjust, memory;
28544 struct
28545 {
28546 rtx op;
28547 enum machine_mode mode;
28548 } args[3];
28549 enum insn_code icode = d->icode;
28550 bool last_arg_constant = false;
28551 const struct insn_data_d *insn_p = &insn_data[icode];
28552 enum machine_mode tmode = insn_p->operand[0].mode;
28553 enum { load, store } klass;
28554
28555 switch ((enum ix86_builtin_func_type) d->flag)
28556 {
28557 case VOID_FTYPE_VOID:
28558 if (icode == CODE_FOR_avx_vzeroupper)
28559 target = GEN_INT (vzeroupper_intrinsic);
28560 emit_insn (GEN_FCN (icode) (target));
28561 return 0;
28562 case VOID_FTYPE_UINT64:
28563 case VOID_FTYPE_UNSIGNED:
28564 nargs = 0;
28565 klass = store;
28566 memory = 0;
28567 break;
28568 case UINT64_FTYPE_VOID:
28569 case UNSIGNED_FTYPE_VOID:
28570 nargs = 0;
28571 klass = load;
28572 memory = 0;
28573 break;
28574 case UINT64_FTYPE_PUNSIGNED:
28575 case V2DI_FTYPE_PV2DI:
28576 case V4DI_FTYPE_PV4DI:
28577 case V32QI_FTYPE_PCCHAR:
28578 case V16QI_FTYPE_PCCHAR:
28579 case V8SF_FTYPE_PCV4SF:
28580 case V8SF_FTYPE_PCFLOAT:
28581 case V4SF_FTYPE_PCFLOAT:
28582 case V4DF_FTYPE_PCV2DF:
28583 case V4DF_FTYPE_PCDOUBLE:
28584 case V2DF_FTYPE_PCDOUBLE:
28585 case VOID_FTYPE_PVOID:
28586 nargs = 1;
28587 klass = load;
28588 memory = 0;
28589 break;
28590 case VOID_FTYPE_PV2SF_V4SF:
28591 case VOID_FTYPE_PV4DI_V4DI:
28592 case VOID_FTYPE_PV2DI_V2DI:
28593 case VOID_FTYPE_PCHAR_V32QI:
28594 case VOID_FTYPE_PCHAR_V16QI:
28595 case VOID_FTYPE_PFLOAT_V8SF:
28596 case VOID_FTYPE_PFLOAT_V4SF:
28597 case VOID_FTYPE_PDOUBLE_V4DF:
28598 case VOID_FTYPE_PDOUBLE_V2DF:
28599 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28600 case VOID_FTYPE_PINT_INT:
28601 nargs = 1;
28602 klass = store;
28603 /* Reserve memory operand for target. */
28604 memory = ARRAY_SIZE (args);
28605 break;
28606 case V4SF_FTYPE_V4SF_PCV2SF:
28607 case V2DF_FTYPE_V2DF_PCDOUBLE:
28608 nargs = 2;
28609 klass = load;
28610 memory = 1;
28611 break;
28612 case V8SF_FTYPE_PCV8SF_V8SI:
28613 case V4DF_FTYPE_PCV4DF_V4DI:
28614 case V4SF_FTYPE_PCV4SF_V4SI:
28615 case V2DF_FTYPE_PCV2DF_V2DI:
28616 case V8SI_FTYPE_PCV8SI_V8SI:
28617 case V4DI_FTYPE_PCV4DI_V4DI:
28618 case V4SI_FTYPE_PCV4SI_V4SI:
28619 case V2DI_FTYPE_PCV2DI_V2DI:
28620 nargs = 2;
28621 klass = load;
28622 memory = 0;
28623 break;
28624 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28625 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28626 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28627 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28628 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28629 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28630 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28631 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28632 nargs = 2;
28633 klass = store;
28634 /* Reserve memory operand for target. */
28635 memory = ARRAY_SIZE (args);
28636 break;
28637 case VOID_FTYPE_UINT_UINT_UINT:
28638 case VOID_FTYPE_UINT64_UINT_UINT:
28639 case UCHAR_FTYPE_UINT_UINT_UINT:
28640 case UCHAR_FTYPE_UINT64_UINT_UINT:
28641 nargs = 3;
28642 klass = load;
28643 memory = ARRAY_SIZE (args);
28644 last_arg_constant = true;
28645 break;
28646 default:
28647 gcc_unreachable ();
28648 }
28649
28650 gcc_assert (nargs <= ARRAY_SIZE (args));
28651
28652 if (klass == store)
28653 {
28654 arg = CALL_EXPR_ARG (exp, 0);
28655 op = expand_normal (arg);
28656 gcc_assert (target == 0);
28657 if (memory)
28658 {
28659 if (GET_MODE (op) != Pmode)
28660 op = convert_to_mode (Pmode, op, 1);
28661 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28662 }
28663 else
28664 target = force_reg (tmode, op);
28665 arg_adjust = 1;
28666 }
28667 else
28668 {
28669 arg_adjust = 0;
28670 if (optimize
28671 || target == 0
28672 || GET_MODE (target) != tmode
28673 || !insn_p->operand[0].predicate (target, tmode))
28674 target = gen_reg_rtx (tmode);
28675 }
28676
28677 for (i = 0; i < nargs; i++)
28678 {
28679 enum machine_mode mode = insn_p->operand[i + 1].mode;
28680 bool match;
28681
28682 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28683 op = expand_normal (arg);
28684 match = insn_p->operand[i + 1].predicate (op, mode);
28685
28686 if (last_arg_constant && (i + 1) == nargs)
28687 {
28688 if (!match)
28689 {
28690 if (icode == CODE_FOR_lwp_lwpvalsi3
28691 || icode == CODE_FOR_lwp_lwpinssi3
28692 || icode == CODE_FOR_lwp_lwpvaldi3
28693 || icode == CODE_FOR_lwp_lwpinsdi3)
28694 error ("the last argument must be a 32-bit immediate");
28695 else
28696 error ("the last argument must be an 8-bit immediate");
28697 return const0_rtx;
28698 }
28699 }
28700 else
28701 {
28702 if (i == memory)
28703 {
28704 /* This must be the memory operand. */
28705 if (GET_MODE (op) != Pmode)
28706 op = convert_to_mode (Pmode, op, 1);
28707 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28708 gcc_assert (GET_MODE (op) == mode
28709 || GET_MODE (op) == VOIDmode);
28710 }
28711 else
28712 {
28713 /* This must be a register. */
28714 if (VECTOR_MODE_P (mode))
28715 op = safe_vector_operand (op, mode);
28716
28717 gcc_assert (GET_MODE (op) == mode
28718 || GET_MODE (op) == VOIDmode);
28719 op = copy_to_mode_reg (mode, op);
28720 }
28721 }
28722
28723 args[i].op = op;
28724 args[i].mode = mode;
28725 }
28726
28727 switch (nargs)
28728 {
28729 case 0:
28730 pat = GEN_FCN (icode) (target);
28731 break;
28732 case 1:
28733 pat = GEN_FCN (icode) (target, args[0].op);
28734 break;
28735 case 2:
28736 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28737 break;
28738 case 3:
28739 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28740 break;
28741 default:
28742 gcc_unreachable ();
28743 }
28744
28745 if (! pat)
28746 return 0;
28747 emit_insn (pat);
28748 return klass == store ? 0 : target;
28749 }
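/* Illustrative sketch, not part of the original file: a "store" class
   special builtin.  The intrinsic is an assumption chosen for
   illustration; _mm_stream_pd expands to __builtin_ia32_movntpd
   (VOID_FTYPE_PDOUBLE_V2DF), so the pointer argument becomes the MEM
   target reserved above and the vector argument is forced into a
   register.  Compile with -msse2.  */

#include <emmintrin.h>

void
example_nontemporal_store (double *p, __m128d v)
{
  _mm_stream_pd (p, v);
}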
28750
28751 /* Return the integer constant in ARG. Constrain it to be in the range
28752 of the subparts of VEC_TYPE; issue an error if not. */
28753
28754 static int
28755 get_element_number (tree vec_type, tree arg)
28756 {
28757 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
28758
28759 if (!host_integerp (arg, 1)
28760 || (elt = tree_low_cst (arg, 1), elt > max))
28761 {
28762 error ("selector must be an integer constant in the range 0..%wi", max);
28763 return 0;
28764 }
28765
28766 return elt;
28767 }
28768
28769 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28770 ix86_expand_vector_init. We DO have language-level syntax for this, in
28771 the form of (type){ init-list }. Except that since we can't place emms
28772 instructions from inside the compiler, we can't allow the use of MMX
28773 registers unless the user explicitly asks for it. So we do *not* define
28774 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
28775 we have builtins invoked by mmintrin.h that give us license to emit
28776 these sorts of instructions. */
28777
28778 static rtx
28779 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
28780 {
28781 enum machine_mode tmode = TYPE_MODE (type);
28782 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
28783 int i, n_elt = GET_MODE_NUNITS (tmode);
28784 rtvec v = rtvec_alloc (n_elt);
28785
28786 gcc_assert (VECTOR_MODE_P (tmode));
28787 gcc_assert (call_expr_nargs (exp) == n_elt);
28788
28789 for (i = 0; i < n_elt; ++i)
28790 {
28791 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
28792 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
28793 }
28794
28795 if (!target || !register_operand (target, tmode))
28796 target = gen_reg_rtx (tmode);
28797
28798 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
28799 return target;
28800 }
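/* Illustrative sketch, not part of the original file: an MMX vec_init
   builtin of the kind described above.  The intrinsic is an assumption
   chosen for illustration; _mm_set_pi16 expands to
   __builtin_ia32_vec_init_v4hi, and the IX86_BUILTIN_VEC_INIT_V4HI case in
   ix86_expand_builtin below dispatches it to the function above.  Compile
   with -mmmx.  */

#include <mmintrin.h>

__m64
example_make_v4hi (short w3, short w2, short w1, short w0)
{
  return _mm_set_pi16 (w3, w2, w1, w0);
}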
28801
28802 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28803 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
28804 had a language-level syntax for referencing vector elements. */
28805
28806 static rtx
28807 ix86_expand_vec_ext_builtin (tree exp, rtx target)
28808 {
28809 enum machine_mode tmode, mode0;
28810 tree arg0, arg1;
28811 int elt;
28812 rtx op0;
28813
28814 arg0 = CALL_EXPR_ARG (exp, 0);
28815 arg1 = CALL_EXPR_ARG (exp, 1);
28816
28817 op0 = expand_normal (arg0);
28818 elt = get_element_number (TREE_TYPE (arg0), arg1);
28819
28820 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28821 mode0 = TYPE_MODE (TREE_TYPE (arg0));
28822 gcc_assert (VECTOR_MODE_P (mode0));
28823
28824 op0 = force_reg (mode0, op0);
28825
28826 if (optimize || !target || !register_operand (target, tmode))
28827 target = gen_reg_rtx (tmode);
28828
28829 ix86_expand_vector_extract (true, target, op0, elt);
28830
28831 return target;
28832 }
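/* Illustrative sketch, not part of the original file: a vec_ext builtin.
   The intrinsic is an assumption chosen for illustration;
   _mm_extract_epi16 expands to __builtin_ia32_vec_ext_v8hi, and
   get_element_number rejects any selector that is not a constant in the
   range 0..7.  Compile with -msse2.  */

#include <emmintrin.h>

int
example_third_word (__m128i a)
{
  return _mm_extract_epi16 (a, 2);
}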
28833
28834 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28835 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
28836 a language-level syntax for referencing vector elements. */
28837
28838 static rtx
28839 ix86_expand_vec_set_builtin (tree exp)
28840 {
28841 enum machine_mode tmode, mode1;
28842 tree arg0, arg1, arg2;
28843 int elt;
28844 rtx op0, op1, target;
28845
28846 arg0 = CALL_EXPR_ARG (exp, 0);
28847 arg1 = CALL_EXPR_ARG (exp, 1);
28848 arg2 = CALL_EXPR_ARG (exp, 2);
28849
28850 tmode = TYPE_MODE (TREE_TYPE (arg0));
28851 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28852 gcc_assert (VECTOR_MODE_P (tmode));
28853
28854 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
28855 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
28856 elt = get_element_number (TREE_TYPE (arg0), arg2);
28857
28858 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
28859 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
28860
28861 op0 = force_reg (tmode, op0);
28862 op1 = force_reg (mode1, op1);
28863
28864 /* OP0 is the source of these builtin functions and shouldn't be
28865 modified. Create a copy, use it and return it as target. */
28866 target = gen_reg_rtx (tmode);
28867 emit_move_insn (target, op0);
28868 ix86_expand_vector_set (true, target, op1, elt);
28869
28870 return target;
28871 }
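/* Illustrative sketch, not part of the original file: a vec_set builtin.
   The intrinsic is an assumption chosen for illustration;
   _mm_insert_epi16 expands to __builtin_ia32_vec_set_v8hi, so the source
   vector is copied and the copy, not the input, is modified and returned
   as described above.  Compile with -msse2.  */

#include <emmintrin.h>

__m128i
example_replace_third_word (__m128i a, int w)
{
  return _mm_insert_epi16 (a, w, 2);
}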
28872
28873 /* Expand an expression EXP that calls a built-in function,
28874 with result going to TARGET if that's convenient
28875 (and in mode MODE if that's convenient).
28876 SUBTARGET may be used as the target for computing one of EXP's operands.
28877 IGNORE is nonzero if the value is to be ignored. */
28878
28879 static rtx
28880 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
28881 enum machine_mode mode ATTRIBUTE_UNUSED,
28882 int ignore ATTRIBUTE_UNUSED)
28883 {
28884 const struct builtin_description *d;
28885 size_t i;
28886 enum insn_code icode;
28887 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
28888 tree arg0, arg1, arg2, arg3, arg4;
28889 rtx op0, op1, op2, op3, op4, pat;
28890 enum machine_mode mode0, mode1, mode2, mode3, mode4;
28891 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
28892
28893 /* Determine whether the builtin function is available under the current ISA.
28894 Originally the builtin was not created if it wasn't applicable to the
28895 current ISA based on the command line switches. With function specific
28896 options, we need to check in the context of the function making the call
28897 whether it is supported. */
28898 if (ix86_builtins_isa[fcode].isa
28899 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
28900 {
28901 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
28902 NULL, (enum fpmath_unit) 0, false);
28903
28904 if (!opts)
28905 error ("%qE needs unknown isa option", fndecl);
28906 else
28907 {
28908 gcc_assert (opts != NULL);
28909 error ("%qE needs isa option %s", fndecl, opts);
28910 free (opts);
28911 }
28912 return const0_rtx;
28913 }
28914
28915 switch (fcode)
28916 {
28917 case IX86_BUILTIN_MASKMOVQ:
28918 case IX86_BUILTIN_MASKMOVDQU:
28919 icode = (fcode == IX86_BUILTIN_MASKMOVQ
28920 ? CODE_FOR_mmx_maskmovq
28921 : CODE_FOR_sse2_maskmovdqu);
28922 /* Note the arg order is different from the operand order. */
28923 arg1 = CALL_EXPR_ARG (exp, 0);
28924 arg2 = CALL_EXPR_ARG (exp, 1);
28925 arg0 = CALL_EXPR_ARG (exp, 2);
28926 op0 = expand_normal (arg0);
28927 op1 = expand_normal (arg1);
28928 op2 = expand_normal (arg2);
28929 mode0 = insn_data[icode].operand[0].mode;
28930 mode1 = insn_data[icode].operand[1].mode;
28931 mode2 = insn_data[icode].operand[2].mode;
28932
28933 if (GET_MODE (op0) != Pmode)
28934 op0 = convert_to_mode (Pmode, op0, 1);
28935 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
28936
28937 if (!insn_data[icode].operand[0].predicate (op0, mode0))
28938 op0 = copy_to_mode_reg (mode0, op0);
28939 if (!insn_data[icode].operand[1].predicate (op1, mode1))
28940 op1 = copy_to_mode_reg (mode1, op1);
28941 if (!insn_data[icode].operand[2].predicate (op2, mode2))
28942 op2 = copy_to_mode_reg (mode2, op2);
28943 pat = GEN_FCN (icode) (op0, op1, op2);
28944 if (! pat)
28945 return 0;
28946 emit_insn (pat);
28947 return 0;
28948
28949 case IX86_BUILTIN_LDMXCSR:
28950 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
28951 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
28952 emit_move_insn (target, op0);
28953 emit_insn (gen_sse_ldmxcsr (target));
28954 return 0;
28955
28956 case IX86_BUILTIN_STMXCSR:
28957 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
28958 emit_insn (gen_sse_stmxcsr (target));
28959 return copy_to_mode_reg (SImode, target);
28960
28961 case IX86_BUILTIN_CLFLUSH:
28962 arg0 = CALL_EXPR_ARG (exp, 0);
28963 op0 = expand_normal (arg0);
28964 icode = CODE_FOR_sse2_clflush;
28965 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
28966 {
28967 if (GET_MODE (op0) != Pmode)
28968 op0 = convert_to_mode (Pmode, op0, 1);
28969 op0 = force_reg (Pmode, op0);
28970 }
28971
28972 emit_insn (gen_sse2_clflush (op0));
28973 return 0;
28974
28975 case IX86_BUILTIN_MONITOR:
28976 arg0 = CALL_EXPR_ARG (exp, 0);
28977 arg1 = CALL_EXPR_ARG (exp, 1);
28978 arg2 = CALL_EXPR_ARG (exp, 2);
28979 op0 = expand_normal (arg0);
28980 op1 = expand_normal (arg1);
28981 op2 = expand_normal (arg2);
28982 if (!REG_P (op0))
28983 {
28984 if (GET_MODE (op0) != Pmode)
28985 op0 = convert_to_mode (Pmode, op0, 1);
28986 op0 = force_reg (Pmode, op0);
28987 }
28988 if (!REG_P (op1))
28989 op1 = copy_to_mode_reg (SImode, op1);
28990 if (!REG_P (op2))
28991 op2 = copy_to_mode_reg (SImode, op2);
28992 emit_insn (ix86_gen_monitor (op0, op1, op2));
28993 return 0;
28994
28995 case IX86_BUILTIN_MWAIT:
28996 arg0 = CALL_EXPR_ARG (exp, 0);
28997 arg1 = CALL_EXPR_ARG (exp, 1);
28998 op0 = expand_normal (arg0);
28999 op1 = expand_normal (arg1);
29000 if (!REG_P (op0))
29001 op0 = copy_to_mode_reg (SImode, op0);
29002 if (!REG_P (op1))
29003 op1 = copy_to_mode_reg (SImode, op1);
29004 emit_insn (gen_sse3_mwait (op0, op1));
29005 return 0;
29006
29007 case IX86_BUILTIN_VEC_INIT_V2SI:
29008 case IX86_BUILTIN_VEC_INIT_V4HI:
29009 case IX86_BUILTIN_VEC_INIT_V8QI:
29010 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29011
29012 case IX86_BUILTIN_VEC_EXT_V2DF:
29013 case IX86_BUILTIN_VEC_EXT_V2DI:
29014 case IX86_BUILTIN_VEC_EXT_V4SF:
29015 case IX86_BUILTIN_VEC_EXT_V4SI:
29016 case IX86_BUILTIN_VEC_EXT_V8HI:
29017 case IX86_BUILTIN_VEC_EXT_V2SI:
29018 case IX86_BUILTIN_VEC_EXT_V4HI:
29019 case IX86_BUILTIN_VEC_EXT_V16QI:
29020 return ix86_expand_vec_ext_builtin (exp, target);
29021
29022 case IX86_BUILTIN_VEC_SET_V2DI:
29023 case IX86_BUILTIN_VEC_SET_V4SF:
29024 case IX86_BUILTIN_VEC_SET_V4SI:
29025 case IX86_BUILTIN_VEC_SET_V8HI:
29026 case IX86_BUILTIN_VEC_SET_V4HI:
29027 case IX86_BUILTIN_VEC_SET_V16QI:
29028 return ix86_expand_vec_set_builtin (exp);
29029
29030 case IX86_BUILTIN_INFQ:
29031 case IX86_BUILTIN_HUGE_VALQ:
29032 {
29033 REAL_VALUE_TYPE inf;
29034 rtx tmp;
29035
29036 real_inf (&inf);
29037 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29038
29039 tmp = validize_mem (force_const_mem (mode, tmp));
29040
29041 if (target == 0)
29042 target = gen_reg_rtx (mode);
29043
29044 emit_move_insn (target, tmp);
29045 return target;
29046 }
29047
29048 case IX86_BUILTIN_LLWPCB:
29049 arg0 = CALL_EXPR_ARG (exp, 0);
29050 op0 = expand_normal (arg0);
29051 icode = CODE_FOR_lwp_llwpcb;
29052 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29053 {
29054 if (GET_MODE (op0) != Pmode)
29055 op0 = convert_to_mode (Pmode, op0, 1);
29056 op0 = force_reg (Pmode, op0);
29057 }
29058 emit_insn (gen_lwp_llwpcb (op0));
29059 return 0;
29060
29061 case IX86_BUILTIN_SLWPCB:
29062 icode = CODE_FOR_lwp_slwpcb;
29063 if (!target
29064 || !insn_data[icode].operand[0].predicate (target, Pmode))
29065 target = gen_reg_rtx (Pmode);
29066 emit_insn (gen_lwp_slwpcb (target));
29067 return target;
29068
29069 case IX86_BUILTIN_BEXTRI32:
29070 case IX86_BUILTIN_BEXTRI64:
29071 arg0 = CALL_EXPR_ARG (exp, 0);
29072 arg1 = CALL_EXPR_ARG (exp, 1);
29073 op0 = expand_normal (arg0);
29074 op1 = expand_normal (arg1);
29075 icode = (fcode == IX86_BUILTIN_BEXTRI32
29076 ? CODE_FOR_tbm_bextri_si
29077 : CODE_FOR_tbm_bextri_di);
29078 if (!CONST_INT_P (op1))
29079 {
29080 error ("last argument must be an immediate");
29081 return const0_rtx;
29082 }
29083 else
29084 {
29085 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29086 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29087 op1 = GEN_INT (length);
29088 op2 = GEN_INT (lsb_index);
29089 pat = GEN_FCN (icode) (target, op0, op1, op2);
29090 if (pat)
29091 emit_insn (pat);
29092 return target;
29093 }
29094
29095 case IX86_BUILTIN_RDRAND16_STEP:
29096 icode = CODE_FOR_rdrandhi_1;
29097 mode0 = HImode;
29098 goto rdrand_step;
29099
29100 case IX86_BUILTIN_RDRAND32_STEP:
29101 icode = CODE_FOR_rdrandsi_1;
29102 mode0 = SImode;
29103 goto rdrand_step;
29104
29105 case IX86_BUILTIN_RDRAND64_STEP:
29106 icode = CODE_FOR_rdranddi_1;
29107 mode0 = DImode;
29108
29109 rdrand_step:
29110 op0 = gen_reg_rtx (mode0);
29111 emit_insn (GEN_FCN (icode) (op0));
29112
29113 arg0 = CALL_EXPR_ARG (exp, 0);
29114 op1 = expand_normal (arg0);
29115 if (!address_operand (op1, VOIDmode))
29116 {
29117 op1 = convert_memory_address (Pmode, op1);
29118 op1 = copy_addr_to_reg (op1);
29119 }
29120 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29121
29122 op1 = gen_reg_rtx (SImode);
29123 emit_move_insn (op1, CONST1_RTX (SImode));
29124
29125 /* Emit SImode conditional move. */
29126 if (mode0 == HImode)
29127 {
29128 op2 = gen_reg_rtx (SImode);
29129 emit_insn (gen_zero_extendhisi2 (op2, op0));
29130 }
29131 else if (mode0 == SImode)
29132 op2 = op0;
29133 else
29134 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29135
29136 if (target == 0)
29137 target = gen_reg_rtx (SImode);
29138
29139 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29140 const0_rtx);
29141 emit_insn (gen_rtx_SET (VOIDmode, target,
29142 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29143 return target;
29144
29145 case IX86_BUILTIN_GATHERSIV2DF:
29146 icode = CODE_FOR_avx2_gathersiv2df;
29147 goto gather_gen;
29148 case IX86_BUILTIN_GATHERSIV4DF:
29149 icode = CODE_FOR_avx2_gathersiv4df;
29150 goto gather_gen;
29151 case IX86_BUILTIN_GATHERDIV2DF:
29152 icode = CODE_FOR_avx2_gatherdiv2df;
29153 goto gather_gen;
29154 case IX86_BUILTIN_GATHERDIV4DF:
29155 icode = CODE_FOR_avx2_gatherdiv4df;
29156 goto gather_gen;
29157 case IX86_BUILTIN_GATHERSIV4SF:
29158 icode = CODE_FOR_avx2_gathersiv4sf;
29159 goto gather_gen;
29160 case IX86_BUILTIN_GATHERSIV8SF:
29161 icode = CODE_FOR_avx2_gathersiv8sf;
29162 goto gather_gen;
29163 case IX86_BUILTIN_GATHERDIV4SF:
29164 icode = CODE_FOR_avx2_gatherdiv4sf;
29165 goto gather_gen;
29166 case IX86_BUILTIN_GATHERDIV8SF:
29167 icode = CODE_FOR_avx2_gatherdiv8sf;
29168 goto gather_gen;
29169 case IX86_BUILTIN_GATHERSIV2DI:
29170 icode = CODE_FOR_avx2_gathersiv2di;
29171 goto gather_gen;
29172 case IX86_BUILTIN_GATHERSIV4DI:
29173 icode = CODE_FOR_avx2_gathersiv4di;
29174 goto gather_gen;
29175 case IX86_BUILTIN_GATHERDIV2DI:
29176 icode = CODE_FOR_avx2_gatherdiv2di;
29177 goto gather_gen;
29178 case IX86_BUILTIN_GATHERDIV4DI:
29179 icode = CODE_FOR_avx2_gatherdiv4di;
29180 goto gather_gen;
29181 case IX86_BUILTIN_GATHERSIV4SI:
29182 icode = CODE_FOR_avx2_gathersiv4si;
29183 goto gather_gen;
29184 case IX86_BUILTIN_GATHERSIV8SI:
29185 icode = CODE_FOR_avx2_gathersiv8si;
29186 goto gather_gen;
29187 case IX86_BUILTIN_GATHERDIV4SI:
29188 icode = CODE_FOR_avx2_gatherdiv4si;
29189 goto gather_gen;
29190 case IX86_BUILTIN_GATHERDIV8SI:
29191 icode = CODE_FOR_avx2_gatherdiv8si;
29192 goto gather_gen;
29193 case IX86_BUILTIN_GATHERALTSIV4DF:
29194 icode = CODE_FOR_avx2_gathersiv4df;
29195 goto gather_gen;
29196 case IX86_BUILTIN_GATHERALTDIV8SF:
29197 icode = CODE_FOR_avx2_gatherdiv8sf;
29198 goto gather_gen;
29199 case IX86_BUILTIN_GATHERALTSIV4DI:
29200 icode = CODE_FOR_avx2_gathersiv4di;
29201 goto gather_gen;
29202 case IX86_BUILTIN_GATHERALTDIV8SI:
29203 icode = CODE_FOR_avx2_gatherdiv8si;
29204 goto gather_gen;
29205
29206 gather_gen:
29207 arg0 = CALL_EXPR_ARG (exp, 0);
29208 arg1 = CALL_EXPR_ARG (exp, 1);
29209 arg2 = CALL_EXPR_ARG (exp, 2);
29210 arg3 = CALL_EXPR_ARG (exp, 3);
29211 arg4 = CALL_EXPR_ARG (exp, 4);
29212 op0 = expand_normal (arg0);
29213 op1 = expand_normal (arg1);
29214 op2 = expand_normal (arg2);
29215 op3 = expand_normal (arg3);
29216 op4 = expand_normal (arg4);
29217 /* Note the arg order is different from the operand order. */
29218 mode0 = insn_data[icode].operand[1].mode;
29219 mode2 = insn_data[icode].operand[3].mode;
29220 mode3 = insn_data[icode].operand[4].mode;
29221 mode4 = insn_data[icode].operand[5].mode;
29222
29223 if (target == NULL_RTX
29224 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29225 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29226 else
29227 subtarget = target;
29228
29229 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29230 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29231 {
29232 rtx half = gen_reg_rtx (V4SImode);
29233 if (!nonimmediate_operand (op2, V8SImode))
29234 op2 = copy_to_mode_reg (V8SImode, op2);
29235 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29236 op2 = half;
29237 }
29238 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29239 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29240 {
29241 rtx (*gen) (rtx, rtx);
29242 rtx half = gen_reg_rtx (mode0);
29243 if (mode0 == V4SFmode)
29244 gen = gen_vec_extract_lo_v8sf;
29245 else
29246 gen = gen_vec_extract_lo_v8si;
29247 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29248 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29249 emit_insn (gen (half, op0));
29250 op0 = half;
29251 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29252 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29253 emit_insn (gen (half, op3));
29254 op3 = half;
29255 }
29256
29257 /* Force the memory operand to use only a base register here.
29258 We don't want to do this for the memory operands of other
29259 builtin functions. */
29260 if (GET_MODE (op1) != Pmode)
29261 op1 = convert_to_mode (Pmode, op1, 1);
29262 op1 = force_reg (Pmode, op1);
29263
29264 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29265 op0 = copy_to_mode_reg (mode0, op0);
29266 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29267 op1 = copy_to_mode_reg (Pmode, op1);
29268 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29269 op2 = copy_to_mode_reg (mode2, op2);
29270 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29271 op3 = copy_to_mode_reg (mode3, op3);
29272 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29273 {
29274 error ("last argument must be scale 1, 2, 4, 8");
29275 return const0_rtx;
29276 }
29277
29278 /* Optimize. If mask is known to have all high bits set,
29279 replace op0 with pc_rtx to signal that the instruction
29280 overwrites the whole destination and doesn't use its
29281 previous contents. */
29282 if (optimize)
29283 {
29284 if (TREE_CODE (arg3) == VECTOR_CST)
29285 {
29286 tree elt;
29287 unsigned int negative = 0;
29288 for (elt = TREE_VECTOR_CST_ELTS (arg3);
29289 elt; elt = TREE_CHAIN (elt))
29290 {
29291 tree cst = TREE_VALUE (elt);
29292 if (TREE_CODE (cst) == INTEGER_CST
29293 && tree_int_cst_sign_bit (cst))
29294 negative++;
29295 else if (TREE_CODE (cst) == REAL_CST
29296 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29297 negative++;
29298 }
29299 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29300 op0 = pc_rtx;
29301 }
29302 else if (TREE_CODE (arg3) == SSA_NAME)
29303 {
29304 /* Recognize also when mask is like:
29305 __v2df src = _mm_setzero_pd ();
29306 __v2df mask = _mm_cmpeq_pd (src, src);
29307 or
29308 __v8sf src = _mm256_setzero_ps ();
29309 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29310 as that is a cheaper way to load all ones into
29311 a register than having to load a constant from
29312 memory. */
29313 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29314 if (is_gimple_call (def_stmt))
29315 {
29316 tree fndecl = gimple_call_fndecl (def_stmt);
29317 if (fndecl
29318 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29319 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29320 {
29321 case IX86_BUILTIN_CMPPD:
29322 case IX86_BUILTIN_CMPPS:
29323 case IX86_BUILTIN_CMPPD256:
29324 case IX86_BUILTIN_CMPPS256:
29325 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29326 break;
29327 /* FALLTHRU */
29328 case IX86_BUILTIN_CMPEQPD:
29329 case IX86_BUILTIN_CMPEQPS:
29330 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29331 && initializer_zerop (gimple_call_arg (def_stmt,
29332 1)))
29333 op0 = pc_rtx;
29334 break;
29335 default:
29336 break;
29337 }
29338 }
29339 }
29340 }
29341
29342 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29343 if (! pat)
29344 return const0_rtx;
29345 emit_insn (pat);
29346
29347 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29348 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29349 {
29350 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29351 ? V4SFmode : V4SImode;
29352 if (target == NULL_RTX)
29353 target = gen_reg_rtx (tmode);
29354 if (tmode == V4SFmode)
29355 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29356 else
29357 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29358 }
29359 else
29360 target = subtarget;
29361
29362 return target;
29363
29364 default:
29365 break;
29366 }
29367
29368 for (i = 0, d = bdesc_special_args;
29369 i < ARRAY_SIZE (bdesc_special_args);
29370 i++, d++)
29371 if (d->code == fcode)
29372 return ix86_expand_special_args_builtin (d, exp, target);
29373
29374 for (i = 0, d = bdesc_args;
29375 i < ARRAY_SIZE (bdesc_args);
29376 i++, d++)
29377 if (d->code == fcode)
29378 switch (fcode)
29379 {
29380 case IX86_BUILTIN_FABSQ:
29381 case IX86_BUILTIN_COPYSIGNQ:
29382 if (!TARGET_SSE2)
29383 /* Emit a normal call if SSE2 isn't available. */
29384 return expand_call (exp, target, ignore);
29385 default:
29386 return ix86_expand_args_builtin (d, exp, target);
29387 }
29388
29389 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29390 if (d->code == fcode)
29391 return ix86_expand_sse_comi (d, exp, target);
29392
29393 for (i = 0, d = bdesc_pcmpestr;
29394 i < ARRAY_SIZE (bdesc_pcmpestr);
29395 i++, d++)
29396 if (d->code == fcode)
29397 return ix86_expand_sse_pcmpestr (d, exp, target);
29398
29399 for (i = 0, d = bdesc_pcmpistr;
29400 i < ARRAY_SIZE (bdesc_pcmpistr);
29401 i++, d++)
29402 if (d->code == fcode)
29403 return ix86_expand_sse_pcmpistr (d, exp, target);
29404
29405 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29406 if (d->code == fcode)
29407 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29408 (enum ix86_builtin_func_type)
29409 d->flag, d->comparison);
29410
29411 gcc_unreachable ();
29412 }
29413
29414 /* Returns a function decl for a vectorized version of the builtin function
29415 FNDECL with the result vector type TYPE_OUT and the argument vector type
29416 TYPE_IN, or NULL_TREE if it is not available. */
29417
29418 static tree
29419 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29420 tree type_in)
29421 {
29422 enum machine_mode in_mode, out_mode;
29423 int in_n, out_n;
29424 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29425
29426 if (TREE_CODE (type_out) != VECTOR_TYPE
29427 || TREE_CODE (type_in) != VECTOR_TYPE
29428 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29429 return NULL_TREE;
29430
29431 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29432 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29433 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29434 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29435
29436 switch (fn)
29437 {
29438 case BUILT_IN_SQRT:
29439 if (out_mode == DFmode && in_mode == DFmode)
29440 {
29441 if (out_n == 2 && in_n == 2)
29442 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29443 else if (out_n == 4 && in_n == 4)
29444 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29445 }
29446 break;
29447
29448 case BUILT_IN_SQRTF:
29449 if (out_mode == SFmode && in_mode == SFmode)
29450 {
29451 if (out_n == 4 && in_n == 4)
29452 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29453 else if (out_n == 8 && in_n == 8)
29454 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29455 }
29456 break;
29457
29458 case BUILT_IN_IRINT:
29459 case BUILT_IN_LRINT:
29460 case BUILT_IN_LLRINT:
29461 if (out_mode == SImode && in_mode == DFmode)
29462 {
29463 if (out_n == 4 && in_n == 2)
29464 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29465 else if (out_n == 8 && in_n == 4)
29466 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29467 }
29468 break;
29469
29470 case BUILT_IN_IRINTF:
29471 case BUILT_IN_LRINTF:
29472 case BUILT_IN_LLRINTF:
29473 if (out_mode == SImode && in_mode == SFmode)
29474 {
29475 if (out_n == 4 && in_n == 4)
29476 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29477 else if (out_n == 8 && in_n == 8)
29478 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29479 }
29480 break;
29481
29482 case BUILT_IN_COPYSIGN:
29483 if (out_mode == DFmode && in_mode == DFmode)
29484 {
29485 if (out_n == 2 && in_n == 2)
29486 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29487 else if (out_n == 4 && in_n == 4)
29488 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29489 }
29490 break;
29491
29492 case BUILT_IN_COPYSIGNF:
29493 if (out_mode == SFmode && in_mode == SFmode)
29494 {
29495 if (out_n == 4 && in_n == 4)
29496 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29497 else if (out_n == 8 && in_n == 8)
29498 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29499 }
29500 break;
29501
29502 case BUILT_IN_FLOOR:
29503 /* The round insn does not trap on denormals. */
29504 if (flag_trapping_math || !TARGET_ROUND)
29505 break;
29506
29507 if (out_mode == DFmode && in_mode == DFmode)
29508 {
29509 if (out_n == 2 && in_n == 2)
29510 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29511 else if (out_n == 4 && in_n == 4)
29512 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29513 }
29514 break;
29515
29516 case BUILT_IN_FLOORF:
29517 /* The round insn does not trap on denormals. */
29518 if (flag_trapping_math || !TARGET_ROUND)
29519 break;
29520
29521 if (out_mode == SFmode && in_mode == SFmode)
29522 {
29523 if (out_n == 4 && in_n == 4)
29524 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29525 else if (out_n == 8 && in_n == 8)
29526 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29527 }
29528 break;
29529
29530 case BUILT_IN_CEIL:
29531 /* The round insn does not trap on denormals. */
29532 if (flag_trapping_math || !TARGET_ROUND)
29533 break;
29534
29535 if (out_mode == DFmode && in_mode == DFmode)
29536 {
29537 if (out_n == 2 && in_n == 2)
29538 return ix86_builtins[IX86_BUILTIN_CEILPD];
29539 else if (out_n == 4 && in_n == 4)
29540 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29541 }
29542 break;
29543
29544 case BUILT_IN_CEILF:
29545 /* The round insn does not trap on denormals. */
29546 if (flag_trapping_math || !TARGET_ROUND)
29547 break;
29548
29549 if (out_mode == SFmode && in_mode == SFmode)
29550 {
29551 if (out_n == 4 && in_n == 4)
29552 return ix86_builtins[IX86_BUILTIN_CEILPS];
29553 else if (out_n == 8 && in_n == 8)
29554 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29555 }
29556 break;
29557
29558 case BUILT_IN_TRUNC:
29559 /* The round insn does not trap on denormals. */
29560 if (flag_trapping_math || !TARGET_ROUND)
29561 break;
29562
29563 if (out_mode == DFmode && in_mode == DFmode)
29564 {
29565 if (out_n == 2 && in_n == 2)
29566 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29567 else if (out_n == 4 && in_n == 4)
29568 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29569 }
29570 break;
29571
29572 case BUILT_IN_TRUNCF:
29573 /* The round insn does not trap on denormals. */
29574 if (flag_trapping_math || !TARGET_ROUND)
29575 break;
29576
29577 if (out_mode == SFmode && in_mode == SFmode)
29578 {
29579 if (out_n == 4 && in_n == 4)
29580 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29581 else if (out_n == 8 && in_n == 8)
29582 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29583 }
29584 break;
29585
29586 case BUILT_IN_RINT:
29587 /* The round insn does not trap on denormals. */
29588 if (flag_trapping_math || !TARGET_ROUND)
29589 break;
29590
29591 if (out_mode == DFmode && in_mode == DFmode)
29592 {
29593 if (out_n == 2 && in_n == 2)
29594 return ix86_builtins[IX86_BUILTIN_RINTPD];
29595 else if (out_n == 4 && in_n == 4)
29596 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29597 }
29598 break;
29599
29600 case BUILT_IN_RINTF:
29601 /* The round insn does not trap on denormals. */
29602 if (flag_trapping_math || !TARGET_ROUND)
29603 break;
29604
29605 if (out_mode == SFmode && in_mode == SFmode)
29606 {
29607 if (out_n == 4 && in_n == 4)
29608 return ix86_builtins[IX86_BUILTIN_RINTPS];
29609 else if (out_n == 8 && in_n == 8)
29610 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29611 }
29612 break;
29613
29614 case BUILT_IN_ROUND:
29615 /* The round insn does not trap on denormals. */
29616 if (flag_trapping_math || !TARGET_ROUND)
29617 break;
29618
29619 if (out_mode == DFmode && in_mode == DFmode)
29620 {
29621 if (out_n == 2 && in_n == 2)
29622 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29623 else if (out_n == 4 && in_n == 4)
29624 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29625 }
29626 break;
29627
29628 case BUILT_IN_ROUNDF:
29629 /* The round insn does not trap on denormals. */
29630 if (flag_trapping_math || !TARGET_ROUND)
29631 break;
29632
29633 if (out_mode == SFmode && in_mode == SFmode)
29634 {
29635 if (out_n == 4 && in_n == 4)
29636 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29637 else if (out_n == 8 && in_n == 8)
29638 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29639 }
29640 break;
29641
29642 case BUILT_IN_FMA:
29643 if (out_mode == DFmode && in_mode == DFmode)
29644 {
29645 if (out_n == 2 && in_n == 2)
29646 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29647 if (out_n == 4 && in_n == 4)
29648 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
29649 }
29650 break;
29651
29652 case BUILT_IN_FMAF:
29653 if (out_mode == SFmode && in_mode == SFmode)
29654 {
29655 if (out_n == 4 && in_n == 4)
29656 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
29657 if (out_n == 8 && in_n == 8)
29658 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
29659 }
29660 break;
29661
29662 default:
29663 break;
29664 }
29665
29666 /* Dispatch to a handler for a vectorization library. */
29667 if (ix86_veclib_handler)
29668 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
29669 type_in);
29670
29671 return NULL_TREE;
29672 }
29673
29674 /* Handler for an SVML-style interface to
29675 a library with vectorized intrinsics. */
29676
29677 static tree
29678 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
29679 {
29680 char name[20];
29681 tree fntype, new_fndecl, args;
29682 unsigned arity;
29683 const char *bname;
29684 enum machine_mode el_mode, in_mode;
29685 int n, in_n;
29686
29687 /* The SVML is suitable for unsafe math only. */
29688 if (!flag_unsafe_math_optimizations)
29689 return NULL_TREE;
29690
29691 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29692 n = TYPE_VECTOR_SUBPARTS (type_out);
29693 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29694 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29695 if (el_mode != in_mode
29696 || n != in_n)
29697 return NULL_TREE;
29698
29699 switch (fn)
29700 {
29701 case BUILT_IN_EXP:
29702 case BUILT_IN_LOG:
29703 case BUILT_IN_LOG10:
29704 case BUILT_IN_POW:
29705 case BUILT_IN_TANH:
29706 case BUILT_IN_TAN:
29707 case BUILT_IN_ATAN:
29708 case BUILT_IN_ATAN2:
29709 case BUILT_IN_ATANH:
29710 case BUILT_IN_CBRT:
29711 case BUILT_IN_SINH:
29712 case BUILT_IN_SIN:
29713 case BUILT_IN_ASINH:
29714 case BUILT_IN_ASIN:
29715 case BUILT_IN_COSH:
29716 case BUILT_IN_COS:
29717 case BUILT_IN_ACOSH:
29718 case BUILT_IN_ACOS:
29719 if (el_mode != DFmode || n != 2)
29720 return NULL_TREE;
29721 break;
29722
29723 case BUILT_IN_EXPF:
29724 case BUILT_IN_LOGF:
29725 case BUILT_IN_LOG10F:
29726 case BUILT_IN_POWF:
29727 case BUILT_IN_TANHF:
29728 case BUILT_IN_TANF:
29729 case BUILT_IN_ATANF:
29730 case BUILT_IN_ATAN2F:
29731 case BUILT_IN_ATANHF:
29732 case BUILT_IN_CBRTF:
29733 case BUILT_IN_SINHF:
29734 case BUILT_IN_SINF:
29735 case BUILT_IN_ASINHF:
29736 case BUILT_IN_ASINF:
29737 case BUILT_IN_COSHF:
29738 case BUILT_IN_COSF:
29739 case BUILT_IN_ACOSHF:
29740 case BUILT_IN_ACOSF:
29741 if (el_mode != SFmode || n != 4)
29742 return NULL_TREE;
29743 break;
29744
29745 default:
29746 return NULL_TREE;
29747 }
29748
29749 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29750
29751 if (fn == BUILT_IN_LOGF)
29752 strcpy (name, "vmlsLn4");
29753 else if (fn == BUILT_IN_LOG)
29754 strcpy (name, "vmldLn2");
29755 else if (n == 4)
29756 {
29757 sprintf (name, "vmls%s", bname+10);
29758 name[strlen (name)-1] = '4';
29759 }
29760 else
29761 sprintf (name, "vmld%s2", bname+10);
29762
29763 /* Convert to uppercase. */
29764 name[4] &= ~0x20;
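  /* For example, BUILT_IN_SINF yields "vmlsSin4" and BUILT_IN_SIN
     yields "vmldSin2".  */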
29765
29766 arity = 0;
29767 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29768 args;
29769 args = TREE_CHAIN (args))
29770 arity++;
29771
29772 if (arity == 1)
29773 fntype = build_function_type_list (type_out, type_in, NULL);
29774 else
29775 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29776
29777 /* Build a function declaration for the vectorized function. */
29778 new_fndecl = build_decl (BUILTINS_LOCATION,
29779 FUNCTION_DECL, get_identifier (name), fntype);
29780 TREE_PUBLIC (new_fndecl) = 1;
29781 DECL_EXTERNAL (new_fndecl) = 1;
29782 DECL_IS_NOVOPS (new_fndecl) = 1;
29783 TREE_READONLY (new_fndecl) = 1;
29784
29785 return new_fndecl;
29786 }
29787
29788 /* Handler for an ACML-style interface to
29789 a library with vectorized intrinsics. */
29790
29791 static tree
29792 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
29793 {
29794 char name[20] = "__vr.._";
29795 tree fntype, new_fndecl, args;
29796 unsigned arity;
29797 const char *bname;
29798 enum machine_mode el_mode, in_mode;
29799 int n, in_n;
29800
29801 /* The ACML is 64-bit only and suitable for unsafe math only, as
29802 it does not correctly support parts of IEEE arithmetic with the
29803 required precision, such as denormals. */
29804 if (!TARGET_64BIT
29805 || !flag_unsafe_math_optimizations)
29806 return NULL_TREE;
29807
29808 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29809 n = TYPE_VECTOR_SUBPARTS (type_out);
29810 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29811 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29812 if (el_mode != in_mode
29813 || n != in_n)
29814 return NULL_TREE;
29815
29816 switch (fn)
29817 {
29818 case BUILT_IN_SIN:
29819 case BUILT_IN_COS:
29820 case BUILT_IN_EXP:
29821 case BUILT_IN_LOG:
29822 case BUILT_IN_LOG2:
29823 case BUILT_IN_LOG10:
29824 name[4] = 'd';
29825 name[5] = '2';
29826 if (el_mode != DFmode
29827 || n != 2)
29828 return NULL_TREE;
29829 break;
29830
29831 case BUILT_IN_SINF:
29832 case BUILT_IN_COSF:
29833 case BUILT_IN_EXPF:
29834 case BUILT_IN_POWF:
29835 case BUILT_IN_LOGF:
29836 case BUILT_IN_LOG2F:
29837 case BUILT_IN_LOG10F:
29838 name[4] = 's';
29839 name[5] = '4';
29840 if (el_mode != SFmode
29841 || n != 4)
29842 return NULL_TREE;
29843 break;
29844
29845 default:
29846 return NULL_TREE;
29847 }
29848
29849 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29850 sprintf (name + 7, "%s", bname+10);
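  /* For example, BUILT_IN_SIN yields "__vrd2_sin" and BUILT_IN_SINF
     yields "__vrs4_sinf".  */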
29851
29852 arity = 0;
29853 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29854 args;
29855 args = TREE_CHAIN (args))
29856 arity++;
29857
29858 if (arity == 1)
29859 fntype = build_function_type_list (type_out, type_in, NULL);
29860 else
29861 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29862
29863 /* Build a function declaration for the vectorized function. */
29864 new_fndecl = build_decl (BUILTINS_LOCATION,
29865 FUNCTION_DECL, get_identifier (name), fntype);
29866 TREE_PUBLIC (new_fndecl) = 1;
29867 DECL_EXTERNAL (new_fndecl) = 1;
29868 DECL_IS_NOVOPS (new_fndecl) = 1;
29869 TREE_READONLY (new_fndecl) = 1;
29870
29871 return new_fndecl;
29872 }
29873
29874 /* Returns a decl of a function that implements gather load with
29875 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
29876 Return NULL_TREE if it is not available. */
29877
29878 static tree
29879 ix86_vectorize_builtin_gather (const_tree mem_vectype,
29880 const_tree index_type, int scale)
29881 {
29882 bool si;
29883 enum ix86_builtins code;
29884
29885 if (! TARGET_AVX2)
29886 return NULL_TREE;
29887
29888 if ((TREE_CODE (index_type) != INTEGER_TYPE
29889 && !POINTER_TYPE_P (index_type))
29890 || (TYPE_MODE (index_type) != SImode
29891 && TYPE_MODE (index_type) != DImode))
29892 return NULL_TREE;
29893
29894 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
29895 return NULL_TREE;
29896
29897 /* v*gather* insn sign extends index to pointer mode. */
29898 if (TYPE_PRECISION (index_type) < POINTER_SIZE
29899 && TYPE_UNSIGNED (index_type))
29900 return NULL_TREE;
29901
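  /* The gather instructions only support scale factors of 1, 2, 4 and 8.  */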
29902 if (scale <= 0
29903 || scale > 8
29904 || (scale & (scale - 1)) != 0)
29905 return NULL_TREE;
29906
29907 si = TYPE_MODE (index_type) == SImode;
29908 switch (TYPE_MODE (mem_vectype))
29909 {
29910 case V2DFmode:
29911 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
29912 break;
29913 case V4DFmode:
29914 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
29915 break;
29916 case V2DImode:
29917 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
29918 break;
29919 case V4DImode:
29920 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
29921 break;
29922 case V4SFmode:
29923 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
29924 break;
29925 case V8SFmode:
29926 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
29927 break;
29928 case V4SImode:
29929 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
29930 break;
29931 case V8SImode:
29932 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
29933 break;
29934 default:
29935 return NULL_TREE;
29936 }
29937
29938 return ix86_builtins[code];
29939 }
29940
29941 /* Returns a decl of a target-specific builtin that implements the
29942 reciprocal of the function FN, or NULL_TREE if not available. */
29943
29944 static tree
29945 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
29946 bool sqrt ATTRIBUTE_UNUSED)
29947 {
29948 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
29949 && flag_finite_math_only && !flag_trapping_math
29950 && flag_unsafe_math_optimizations))
29951 return NULL_TREE;
29952
29953 if (md_fn)
29954 /* Machine dependent builtins. */
29955 switch (fn)
29956 {
29957 /* Vectorized version of sqrt to rsqrt conversion. */
29958 case IX86_BUILTIN_SQRTPS_NR:
29959 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
29960
29961 case IX86_BUILTIN_SQRTPS_NR256:
29962 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
29963
29964 default:
29965 return NULL_TREE;
29966 }
29967 else
29968 /* Normal builtins. */
29969 switch (fn)
29970 {
29971 /* Sqrt to rsqrt conversion. */
29972 case BUILT_IN_SQRTF:
29973 return ix86_builtins[IX86_BUILTIN_RSQRTF];
29974
29975 default:
29976 return NULL_TREE;
29977 }
29978 }
29979 \f
29980 /* Helper for avx_vpermilps256_operand et al. This is also used by
29981 the expansion functions to turn the parallel back into a mask.
29982 The return value is 0 for no match and the imm8+1 for a match. */
29983
29984 int
29985 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
29986 {
29987 unsigned i, nelt = GET_MODE_NUNITS (mode);
29988 unsigned mask = 0;
29989 unsigned char ipar[8];
29990
29991 if (XVECLEN (par, 0) != (int) nelt)
29992 return 0;
29993
29994 /* Validate that all of the elements are constants, and not totally
29995 out of range. Copy the data into an integral array to make the
29996 subsequent checks easier. */
29997 for (i = 0; i < nelt; ++i)
29998 {
29999 rtx er = XVECEXP (par, 0, i);
30000 unsigned HOST_WIDE_INT ei;
30001
30002 if (!CONST_INT_P (er))
30003 return 0;
30004 ei = INTVAL (er);
30005 if (ei >= nelt)
30006 return 0;
30007 ipar[i] = ei;
30008 }
30009
30010 switch (mode)
30011 {
30012 case V4DFmode:
30013 /* In the 256-bit DFmode case, we can only move elements within
30014 a 128-bit lane. */
30015 for (i = 0; i < 2; ++i)
30016 {
30017 if (ipar[i] >= 2)
30018 return 0;
30019 mask |= ipar[i] << i;
30020 }
30021 for (i = 2; i < 4; ++i)
30022 {
30023 if (ipar[i] < 2)
30024 return 0;
30025 mask |= (ipar[i] - 2) << i;
30026 }
30027 break;
30028
30029 case V8SFmode:
30030 /* In the 256-bit SFmode case, we have full freedom of movement
30031 within the low 128-bit lane, but the high 128-bit lane must
30032 mirror the exact same pattern. */
30033 for (i = 0; i < 4; ++i)
30034 if (ipar[i] + 4 != ipar[i + 4])
30035 return 0;
30036 nelt = 4;
30037 /* FALLTHRU */
30038
30039 case V2DFmode:
30040 case V4SFmode:
30041 /* In the 128-bit case, we have full freedom in the placement of
30042 the elements from the source operand. */
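      /* Each element selector occupies nelt/2 bits of the immediate:
	 one bit per element for V2DF, two bits per element for V4SF
	 (and for V8SF, whose upper lane was validated above).  */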
30043 for (i = 0; i < nelt; ++i)
30044 mask |= ipar[i] << (i * (nelt / 2));
30045 break;
30046
30047 default:
30048 gcc_unreachable ();
30049 }
30050
30051 /* Make sure success has a non-zero value by adding one. */
30052 return mask + 1;
30053 }
30054
30055 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30056 the expansion functions to turn the parallel back into a mask.
30057 The return value is 0 for no match and the imm8+1 for a match. */
30058
30059 int
30060 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30061 {
30062 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30063 unsigned mask = 0;
30064 unsigned char ipar[8];
30065
30066 if (XVECLEN (par, 0) != (int) nelt)
30067 return 0;
30068
30069 /* Validate that all of the elements are constants, and not totally
30070 out of range. Copy the data into an integral array to make the
30071 subsequent checks easier. */
30072 for (i = 0; i < nelt; ++i)
30073 {
30074 rtx er = XVECEXP (par, 0, i);
30075 unsigned HOST_WIDE_INT ei;
30076
30077 if (!CONST_INT_P (er))
30078 return 0;
30079 ei = INTVAL (er);
30080 if (ei >= 2 * nelt)
30081 return 0;
30082 ipar[i] = ei;
30083 }
30084
30085 /* Validate that each half of the permute selects consecutive elements. */
30086 for (i = 0; i < nelt2 - 1; ++i)
30087 if (ipar[i] + 1 != ipar[i + 1])
30088 return 0;
30089 for (i = nelt2; i < nelt - 1; ++i)
30090 if (ipar[i] + 1 != ipar[i + 1])
30091 return 0;
30092
30093 /* Reconstruct the mask. */
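     /* Each half of the result corresponds to one 4-bit nibble of the
	immediate; its value selects which 128-bit source lane supplies
	that half.  */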
30094 for (i = 0; i < 2; ++i)
30095 {
30096 unsigned e = ipar[i * nelt2];
30097 if (e % nelt2)
30098 return 0;
30099 e /= nelt2;
30100 mask |= e << (i * 4);
30101 }
30102
30103 /* Make sure success has a non-zero value by adding one. */
30104 return mask + 1;
30105 }
30106 \f
30107 /* Store OPERAND to the memory after reload is completed. This means
30108 that we can't easily use assign_stack_local. */
30109 rtx
30110 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30111 {
30112 rtx result;
30113
30114 gcc_assert (reload_completed);
30115 if (ix86_using_red_zone ())
30116 {
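      /* Inside the red zone the memory just below the stack pointer can be
	 used as scratch space without adjusting the stack pointer.  */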
30117 result = gen_rtx_MEM (mode,
30118 gen_rtx_PLUS (Pmode,
30119 stack_pointer_rtx,
30120 GEN_INT (-RED_ZONE_SIZE)));
30121 emit_move_insn (result, operand);
30122 }
30123 else if (TARGET_64BIT)
30124 {
30125 switch (mode)
30126 {
30127 case HImode:
30128 case SImode:
30129 operand = gen_lowpart (DImode, operand);
30130 /* FALLTHRU */
30131 case DImode:
30132 emit_insn (
30133 gen_rtx_SET (VOIDmode,
30134 gen_rtx_MEM (DImode,
30135 gen_rtx_PRE_DEC (DImode,
30136 stack_pointer_rtx)),
30137 operand));
30138 break;
30139 default:
30140 gcc_unreachable ();
30141 }
30142 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30143 }
30144 else
30145 {
30146 switch (mode)
30147 {
30148 case DImode:
30149 {
30150 rtx operands[2];
30151 split_double_mode (mode, &operand, 1, operands, operands + 1);
30152 emit_insn (
30153 gen_rtx_SET (VOIDmode,
30154 gen_rtx_MEM (SImode,
30155 gen_rtx_PRE_DEC (Pmode,
30156 stack_pointer_rtx)),
30157 operands[1]));
30158 emit_insn (
30159 gen_rtx_SET (VOIDmode,
30160 gen_rtx_MEM (SImode,
30161 gen_rtx_PRE_DEC (Pmode,
30162 stack_pointer_rtx)),
30163 operands[0]));
30164 }
30165 break;
30166 case HImode:
30167 /* Store HImodes as SImodes. */
30168 operand = gen_lowpart (SImode, operand);
30169 /* FALLTHRU */
30170 case SImode:
30171 emit_insn (
30172 gen_rtx_SET (VOIDmode,
30173 gen_rtx_MEM (GET_MODE (operand),
30174 gen_rtx_PRE_DEC (SImode,
30175 stack_pointer_rtx)),
30176 operand));
30177 break;
30178 default:
30179 gcc_unreachable ();
30180 }
30181 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30182 }
30183 return result;
30184 }
30185
30186 /* Free operand from the memory. */
30187 void
30188 ix86_free_from_memory (enum machine_mode mode)
30189 {
30190 if (!ix86_using_red_zone ())
30191 {
30192 int size;
30193
30194 if (mode == DImode || TARGET_64BIT)
30195 size = 8;
30196 else
30197 size = 4;
30198 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30199 to a pop or add instruction if registers are available. */
30200 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30201 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30202 GEN_INT (size))));
30203 }
30204 }
30205
30206 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30207
30208 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30209 QImode must go into class Q_REGS.
30210 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
30211 movdf to do mem-to-mem moves through integer regs. */
30212
30213 static reg_class_t
30214 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30215 {
30216 enum machine_mode mode = GET_MODE (x);
30217
30218 /* We're only allowed to return a subclass of CLASS. Many of the
30219 following checks fail for NO_REGS, so eliminate that early. */
30220 if (regclass == NO_REGS)
30221 return NO_REGS;
30222
30223 /* All classes can load zeros. */
30224 if (x == CONST0_RTX (mode))
30225 return regclass;
30226
30227 /* Force constants into memory if we are loading a (nonzero) constant into
30228 an MMX or SSE register. This is because there are no MMX/SSE instructions
30229 to load from a constant. */
30230 if (CONSTANT_P (x)
30231 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30232 return NO_REGS;
30233
30234 /* Prefer SSE regs only, if we can use them for math. */
30235 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30236 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30237
30238 /* Floating-point constants need more complex checks. */
30239 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30240 {
30241 /* General regs can load everything. */
30242 if (reg_class_subset_p (regclass, GENERAL_REGS))
30243 return regclass;
30244
30245 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30246 zero above. We only want to wind up preferring 80387 registers if
30247 we plan on doing computation with them. */
30248 if (TARGET_80387
30249 && standard_80387_constant_p (x) > 0)
30250 {
30251 /* Limit class to non-sse. */
30252 if (regclass == FLOAT_SSE_REGS)
30253 return FLOAT_REGS;
30254 if (regclass == FP_TOP_SSE_REGS)
30255 return FP_TOP_REG;
30256 if (regclass == FP_SECOND_SSE_REGS)
30257 return FP_SECOND_REG;
30258 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30259 return regclass;
30260 }
30261
30262 return NO_REGS;
30263 }
30264
30265 /* Generally when we see PLUS here, it's the function invariant
30266 (plus soft-fp const_int), which can only be computed into general
30267 regs. */
30268 if (GET_CODE (x) == PLUS)
30269 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30270
30271 /* QImode constants are easy to load, but non-constant QImode data
30272 must go into Q_REGS. */
30273 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30274 {
30275 if (reg_class_subset_p (regclass, Q_REGS))
30276 return regclass;
30277 if (reg_class_subset_p (Q_REGS, regclass))
30278 return Q_REGS;
30279 return NO_REGS;
30280 }
30281
30282 return regclass;
30283 }
30284
30285 /* Discourage putting floating-point values in SSE registers unless
30286 SSE math is being used, and likewise for the 387 registers. */
30287 static reg_class_t
30288 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30289 {
30290 enum machine_mode mode = GET_MODE (x);
30291
30292 /* Restrict the output reload class to the register bank that we are doing
30293 math on. If we would like not to return a subset of CLASS, reject this
30294 alternative: if reload cannot do this, it will still use its choice. */
30295 mode = GET_MODE (x);
30296 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30297 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30298
30299 if (X87_FLOAT_MODE_P (mode))
30300 {
30301 if (regclass == FP_TOP_SSE_REGS)
30302 return FP_TOP_REG;
30303 else if (regclass == FP_SECOND_SSE_REGS)
30304 return FP_SECOND_REG;
30305 else
30306 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30307 }
30308
30309 return regclass;
30310 }
30311
30312 static reg_class_t
30313 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30314 enum machine_mode mode, secondary_reload_info *sri)
30315 {
30316 /* Double-word spills from general registers to non-offsettable memory
30317 references (zero-extended addresses) require special handling. */
30318 if (TARGET_64BIT
30319 && MEM_P (x)
30320 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30321 && rclass == GENERAL_REGS
30322 && !offsettable_memref_p (x))
30323 {
30324 sri->icode = (in_p
30325 ? CODE_FOR_reload_noff_load
30326 : CODE_FOR_reload_noff_store);
30327 /* Add the cost of moving address to a temporary. */
30328 sri->extra_cost = 1;
30329
30330 return NO_REGS;
30331 }
30332
30333 /* QImode spills from non-QI registers require an
30334 intermediate register on 32-bit targets. */
30335 if (!TARGET_64BIT
30336 && !in_p && mode == QImode
30337 && (rclass == GENERAL_REGS
30338 || rclass == LEGACY_REGS
30339 || rclass == INDEX_REGS))
30340 {
30341 int regno;
30342
30343 if (REG_P (x))
30344 regno = REGNO (x);
30345 else
30346 regno = -1;
30347
30348 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30349 regno = true_regnum (x);
30350
30351 /* Return Q_REGS if the operand is in memory. */
30352 if (regno == -1)
30353 return Q_REGS;
30354 }
30355
30356 /* This condition handles corner case where an expression involving
30357 pointers gets vectorized. We're trying to use the address of a
30358 stack slot as a vector initializer.
30359
30360 (set (reg:V2DI 74 [ vect_cst_.2 ])
30361 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30362
30363 Eventually frame gets turned into sp+offset like this:
30364
30365 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30366 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30367 (const_int 392 [0x188]))))
30368
30369 That later gets turned into:
30370
30371 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30372 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30373 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30374
30375 We'll have the following reload recorded:
30376
30377 Reload 0: reload_in (DI) =
30378 (plus:DI (reg/f:DI 7 sp)
30379 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30380 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30381 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30382 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30383 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30384 reload_reg_rtx: (reg:V2DI 22 xmm1)
30385
30386 Which isn't going to work since SSE instructions can't handle scalar
30387 additions. Returning GENERAL_REGS forces the addition into integer
30388 register and reload can handle subsequent reloads without problems. */
30389
30390 if (in_p && GET_CODE (x) == PLUS
30391 && SSE_CLASS_P (rclass)
30392 && SCALAR_INT_MODE_P (mode))
30393 return GENERAL_REGS;
30394
30395 return NO_REGS;
30396 }
30397
30398 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30399
30400 static bool
30401 ix86_class_likely_spilled_p (reg_class_t rclass)
30402 {
30403 switch (rclass)
30404 {
30405 case AREG:
30406 case DREG:
30407 case CREG:
30408 case BREG:
30409 case AD_REGS:
30410 case SIREG:
30411 case DIREG:
30412 case SSE_FIRST_REG:
30413 case FP_TOP_REG:
30414 case FP_SECOND_REG:
30415 return true;
30416
30417 default:
30418 break;
30419 }
30420
30421 return false;
30422 }
30423
30424 /* If we are copying between general and FP registers, we need a memory
30425 location. The same is true for SSE and MMX registers.
30426
30427 To optimize register_move_cost performance, allow inline variant.
30428
30429 The macro can't work reliably when one of the CLASSES is a class containing
30430 registers from multiple units (SSE, MMX, integer). We avoid this by never
30431 combining those units in a single alternative in the machine description.
30432 Ensure that this constraint holds to avoid unexpected surprises.
30433
30434 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30435 enforce these sanity checks. */
30436
30437 static inline bool
30438 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30439 enum machine_mode mode, int strict)
30440 {
30441 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30442 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30443 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30444 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30445 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30446 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30447 {
30448 gcc_assert (!strict);
30449 return true;
30450 }
30451
30452 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30453 return true;
30454
30455 /* ??? This is a lie. We do have moves between mmx/general, and between
30456 mmx/sse2. But by saying we need secondary memory we discourage the
30457 register allocator from using the mmx registers unless needed. */
30458 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30459 return true;
30460
30461 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30462 {
30463 /* SSE1 doesn't have any direct moves from other classes. */
30464 if (!TARGET_SSE2)
30465 return true;
30466
30467 /* If the target says that inter-unit moves are more expensive
30468 than moving through memory, then don't generate them. */
30469 if (!TARGET_INTER_UNIT_MOVES)
30470 return true;
30471
30472 /* Between SSE and general, we have moves no larger than word size. */
30473 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30474 return true;
30475 }
30476
30477 return false;
30478 }
30479
30480 bool
30481 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30482 enum machine_mode mode, int strict)
30483 {
30484 return inline_secondary_memory_needed (class1, class2, mode, strict);
30485 }
30486
30487 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30488
30489 On the 80386, this is the size of MODE in words,
30490 except in the FP regs, where a single reg is always enough. */
30491
30492 static unsigned char
30493 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30494 {
30495 if (MAYBE_INTEGER_CLASS_P (rclass))
30496 {
30497 if (mode == XFmode)
30498 return (TARGET_64BIT ? 2 : 3);
30499 else if (mode == XCmode)
30500 return (TARGET_64BIT ? 4 : 6);
30501 else
30502 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30503 }
30504 else
30505 {
30506 if (COMPLEX_MODE_P (mode))
30507 return 2;
30508 else
30509 return 1;
30510 }
30511 }
30512
30513 /* Return true if the registers in CLASS cannot represent the change from
30514 modes FROM to TO. */
30515
30516 bool
30517 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30518 enum reg_class regclass)
30519 {
30520 if (from == to)
30521 return false;
30522
30523 /* x87 registers can't do subreg at all, as all values are reformatted
30524 to extended precision. */
30525 if (MAYBE_FLOAT_CLASS_P (regclass))
30526 return true;
30527
30528 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30529 {
30530 /* Vector registers do not support QI or HImode loads. If we don't
30531 disallow a change to these modes, reload will assume it's ok to
30532 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30533 the vec_dupv4hi pattern. */
30534 if (GET_MODE_SIZE (from) < 4)
30535 return true;
30536
30537 /* Vector registers do not support subreg with nonzero offsets, which
30538 are otherwise valid for integer registers. Since we can't see
30539 whether we have a nonzero offset from here, prohibit all
30540 nonparadoxical subregs changing size. */
30541 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30542 return true;
30543 }
30544
30545 return false;
30546 }
30547
30548 /* Return the cost of moving data of mode M between a
30549 register and memory. A value of 2 is the default; this cost is
30550 relative to those in `REGISTER_MOVE_COST'.
30551
30552 This function is used extensively by register_move_cost that is used to
30553 build tables at startup. Make it inline in this case.
30554 When IN is 2, return maximum of in and out move cost.
30555
30556 If moving between registers and memory is more expensive than
30557 between two registers, you should define this macro to express the
30558 relative cost.
30559
30560 Also model the increased cost of moving QImode registers in
30561 non-Q_REGS classes.
30562 */
30563 static inline int
30564 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30565 int in)
30566 {
30567 int cost;
30568 if (FLOAT_CLASS_P (regclass))
30569 {
30570 int index;
30571 switch (mode)
30572 {
30573 case SFmode:
30574 index = 0;
30575 break;
30576 case DFmode:
30577 index = 1;
30578 break;
30579 case XFmode:
30580 index = 2;
30581 break;
30582 default:
30583 return 100;
30584 }
30585 if (in == 2)
30586 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30587 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30588 }
30589 if (SSE_CLASS_P (regclass))
30590 {
30591 int index;
30592 switch (GET_MODE_SIZE (mode))
30593 {
30594 case 4:
30595 index = 0;
30596 break;
30597 case 8:
30598 index = 1;
30599 break;
30600 case 16:
30601 index = 2;
30602 break;
30603 default:
30604 return 100;
30605 }
30606 if (in == 2)
30607 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30608 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30609 }
30610 if (MMX_CLASS_P (regclass))
30611 {
30612 int index;
30613 switch (GET_MODE_SIZE (mode))
30614 {
30615 case 4:
30616 index = 0;
30617 break;
30618 case 8:
30619 index = 1;
30620 break;
30621 default:
30622 return 100;
30623 }
30624 if (in == 2)
30625 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30626 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30627 }
30628 switch (GET_MODE_SIZE (mode))
30629 {
30630 case 1:
30631 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30632 {
30633 if (!in)
30634 return ix86_cost->int_store[0];
30635 if (TARGET_PARTIAL_REG_DEPENDENCY
30636 && optimize_function_for_speed_p (cfun))
30637 cost = ix86_cost->movzbl_load;
30638 else
30639 cost = ix86_cost->int_load[0];
30640 if (in == 2)
30641 return MAX (cost, ix86_cost->int_store[0]);
30642 return cost;
30643 }
30644 else
30645 {
30646 if (in == 2)
30647 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30648 if (in)
30649 return ix86_cost->movzbl_load;
30650 else
30651 return ix86_cost->int_store[0] + 4;
30652 }
30653 break;
30654 case 2:
30655 if (in == 2)
30656 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
30657 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
30658 default:
30659 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
30660 if (mode == TFmode)
30661 mode = XFmode;
30662 if (in == 2)
30663 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
30664 else if (in)
30665 cost = ix86_cost->int_load[2];
30666 else
30667 cost = ix86_cost->int_store[2];
30668 return (cost * (((int) GET_MODE_SIZE (mode)
30669 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
30670 }
30671 }
30672
30673 static int
30674 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
30675 bool in)
30676 {
30677 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
30678 }
30679
30680
30681 /* Return the cost of moving data from a register in class CLASS1 to
30682 one in class CLASS2.
30683
30684 It is not required that the cost always equal 2 when FROM is the same as TO;
30685 on some machines it is expensive to move between registers if they are not
30686 general registers. */
30687
30688 static int
30689 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
30690 reg_class_t class2_i)
30691 {
30692 enum reg_class class1 = (enum reg_class) class1_i;
30693 enum reg_class class2 = (enum reg_class) class2_i;
30694
30695 /* In case we require secondary memory, compute cost of the store followed
30696 by load. In order to avoid bad register allocation choices, we need
30697 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
30698
30699 if (inline_secondary_memory_needed (class1, class2, mode, 0))
30700 {
30701 int cost = 1;
30702
30703 cost += inline_memory_move_cost (mode, class1, 2);
30704 cost += inline_memory_move_cost (mode, class2, 2);
30705
30706 /* In case of copying from a general purpose register we may emit multiple
30707 stores followed by a single load, causing a memory size mismatch stall.
30708 Count this as an arbitrarily high cost of 20. */
30709 if (targetm.class_max_nregs (class1, mode)
30710 > targetm.class_max_nregs (class2, mode))
30711 cost += 20;
30712
30713 /* In the case of FP/MMX moves, the registers actually overlap, and we
30714 have to switch modes in order to treat them differently. */
30715 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
30716 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
30717 cost += 20;
30718
30719 return cost;
30720 }
30721
30722 /* Moves between SSE/MMX and integer unit are expensive. */
30723 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
30724 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30725
30726 /* ??? By keeping the returned value relatively high, we limit the number
30727 of moves between integer and MMX/SSE registers for all targets.
30728 Additionally, a high value prevents problems with x86_modes_tieable_p(),
30729 where integer modes in MMX/SSE registers are not tieable
30730 because of missing QImode and HImode moves to, from or between
30731 MMX/SSE registers. */
30732 return MAX (8, ix86_cost->mmxsse_to_integer);
30733
30734 if (MAYBE_FLOAT_CLASS_P (class1))
30735 return ix86_cost->fp_move;
30736 if (MAYBE_SSE_CLASS_P (class1))
30737 return ix86_cost->sse_move;
30738 if (MAYBE_MMX_CLASS_P (class1))
30739 return ix86_cost->mmx_move;
30740 return 2;
30741 }
30742
30743 /* Return TRUE if hard register REGNO can hold a value of machine-mode
30744 MODE. */
30745
30746 bool
30747 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
30748 {
30749 /* Flags and only flags can only hold CCmode values. */
30750 if (CC_REGNO_P (regno))
30751 return GET_MODE_CLASS (mode) == MODE_CC;
30752 if (GET_MODE_CLASS (mode) == MODE_CC
30753 || GET_MODE_CLASS (mode) == MODE_RANDOM
30754 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
30755 return false;
30756 if (FP_REGNO_P (regno))
30757 return VALID_FP_MODE_P (mode);
30758 if (SSE_REGNO_P (regno))
30759 {
30760 /* We implement the move patterns for all vector modes into and
30761 out of SSE registers, even when no operation instructions
30762 are available. OImode move is available only when AVX is
30763 enabled. */
30764 return ((TARGET_AVX && mode == OImode)
30765 || VALID_AVX256_REG_MODE (mode)
30766 || VALID_SSE_REG_MODE (mode)
30767 || VALID_SSE2_REG_MODE (mode)
30768 || VALID_MMX_REG_MODE (mode)
30769 || VALID_MMX_REG_MODE_3DNOW (mode));
30770 }
30771 if (MMX_REGNO_P (regno))
30772 {
30773 /* We implement the move patterns for 3DNOW modes even in MMX mode,
30774 so if the register is available at all, then we can move data of
30775 the given mode into or out of it. */
30776 return (VALID_MMX_REG_MODE (mode)
30777 || VALID_MMX_REG_MODE_3DNOW (mode));
30778 }
30779
30780 if (mode == QImode)
30781 {
30782 /* Take care with QImode values - they can be in non-QI regs,
30783 but then they do cause partial register stalls. */
30784 if (regno <= BX_REG || TARGET_64BIT)
30785 return true;
30786 if (!TARGET_PARTIAL_REG_STALL)
30787 return true;
30788 return !can_create_pseudo_p ();
30789 }
30790 /* We handle both integer and floats in the general purpose registers. */
30791 else if (VALID_INT_MODE_P (mode))
30792 return true;
30793 else if (VALID_FP_MODE_P (mode))
30794 return true;
30795 else if (VALID_DFP_MODE_P (mode))
30796 return true;
30797 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
30798 on to use that value in smaller contexts, this can easily force a
30799 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
30800 supporting DImode, allow it. */
30801 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
30802 return true;
30803
30804 return false;
30805 }
30806
30807 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
30808 tieable integer mode. */
30809
30810 static bool
30811 ix86_tieable_integer_mode_p (enum machine_mode mode)
30812 {
30813 switch (mode)
30814 {
30815 case HImode:
30816 case SImode:
30817 return true;
30818
30819 case QImode:
30820 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
30821
30822 case DImode:
30823 return TARGET_64BIT;
30824
30825 default:
30826 return false;
30827 }
30828 }
30829
30830 /* Return true if MODE1 is accessible in a register that can hold MODE2
30831 without copying. That is, all register classes that can hold MODE2
30832 can also hold MODE1. */
30833
30834 bool
30835 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
30836 {
30837 if (mode1 == mode2)
30838 return true;
30839
30840 if (ix86_tieable_integer_mode_p (mode1)
30841 && ix86_tieable_integer_mode_p (mode2))
30842 return true;
30843
30844 /* MODE2 being XFmode implies fp stack or general regs, which means we
30845 can tie any smaller floating point modes to it. Note that we do not
30846 tie this with TFmode. */
30847 if (mode2 == XFmode)
30848 return mode1 == SFmode || mode1 == DFmode;
30849
30850 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
30851 that we can tie it with SFmode. */
30852 if (mode2 == DFmode)
30853 return mode1 == SFmode;
30854
30855 /* If MODE2 is only appropriate for an SSE register, then tie with
30856 any other mode acceptable to SSE registers. */
30857 if (GET_MODE_SIZE (mode2) == 16
30858 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
30859 return (GET_MODE_SIZE (mode1) == 16
30860 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
30861
30862 /* If MODE2 is appropriate for an MMX register, then tie
30863 with any other mode acceptable to MMX registers. */
30864 if (GET_MODE_SIZE (mode2) == 8
30865 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
30866 return (GET_MODE_SIZE (mode1) == 8
30867 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
30868
30869 return false;
30870 }
30871
30872 /* Compute a (partial) cost for rtx X. Return true if the complete
30873 cost has been computed, and false if subexpressions should be
30874 scanned. In either case, *TOTAL contains the cost result. */
30875
30876 static bool
30877 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
30878 bool speed)
30879 {
30880 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
30881 enum machine_mode mode = GET_MODE (x);
30882 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
30883
30884 switch (code)
30885 {
30886 case CONST_INT:
30887 case CONST:
30888 case LABEL_REF:
30889 case SYMBOL_REF:
30890 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
30891 *total = 3;
30892 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
30893 *total = 2;
30894 else if (flag_pic && SYMBOLIC_CONST (x)
30895 && (!TARGET_64BIT
30896 || (GET_CODE (x) != LABEL_REF
30897 && (GET_CODE (x) != SYMBOL_REF
30898 || !SYMBOL_REF_LOCAL_P (x)))))
30899 *total = 1;
30900 else
30901 *total = 0;
30902 return true;
30903
30904 case CONST_DOUBLE:
30905 if (mode == VOIDmode)
30906 *total = 0;
30907 else
30908 switch (standard_80387_constant_p (x))
30909 {
30910 case 1: /* 0.0 */
30911 *total = 1;
30912 break;
30913 default: /* Other constants */
30914 *total = 2;
30915 break;
30916 case 0:
30917 case -1:
30918 /* Start with (MEM (SYMBOL_REF)), since that's where
30919 it'll probably end up. Add a penalty for size. */
30920 *total = (COSTS_N_INSNS (1)
30921 + (flag_pic != 0 && !TARGET_64BIT)
30922 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
30923 break;
30924 }
30925 return true;
30926
30927 case ZERO_EXTEND:
30928 /* The zero extension is often completely free on x86_64, so make
30929 it as cheap as possible. */
30930 if (TARGET_64BIT && mode == DImode
30931 && GET_MODE (XEXP (x, 0)) == SImode)
30932 *total = 1;
30933 else if (TARGET_ZERO_EXTEND_WITH_AND)
30934 *total = cost->add;
30935 else
30936 *total = cost->movzx;
30937 return false;
30938
30939 case SIGN_EXTEND:
30940 *total = cost->movsx;
30941 return false;
30942
30943 case ASHIFT:
30944 if (CONST_INT_P (XEXP (x, 1))
30945 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
30946 {
30947 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
30948 if (value == 1)
30949 {
30950 *total = cost->add;
30951 return false;
30952 }
30953 if ((value == 2 || value == 3)
30954 && cost->lea <= cost->shift_const)
30955 {
30956 *total = cost->lea;
30957 return false;
30958 }
30959 }
30960 /* FALLTHRU */
30961
30962 case ROTATE:
30963 case ASHIFTRT:
30964 case LSHIFTRT:
30965 case ROTATERT:
30966 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
30967 {
30968 if (CONST_INT_P (XEXP (x, 1)))
30969 {
30970 if (INTVAL (XEXP (x, 1)) > 32)
30971 *total = cost->shift_const + COSTS_N_INSNS (2);
30972 else
30973 *total = cost->shift_const * 2;
30974 }
30975 else
30976 {
30977 if (GET_CODE (XEXP (x, 1)) == AND)
30978 *total = cost->shift_var * 2;
30979 else
30980 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
30981 }
30982 }
30983 else
30984 {
30985 if (CONST_INT_P (XEXP (x, 1)))
30986 *total = cost->shift_const;
30987 else
30988 *total = cost->shift_var;
30989 }
30990 return false;
30991
30992 case FMA:
30993 {
30994 rtx sub;
30995
30996 gcc_assert (FLOAT_MODE_P (mode));
30997 gcc_assert (TARGET_FMA || TARGET_FMA4);
30998
30999 /* ??? SSE scalar/vector cost should be used here. */
31000 /* ??? Bald assumption that fma has the same cost as fmul. */
31001 *total = cost->fmul;
31002 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31003
31004 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31005 sub = XEXP (x, 0);
31006 if (GET_CODE (sub) == NEG)
31007 sub = XEXP (sub, 0);
31008 *total += rtx_cost (sub, FMA, 0, speed);
31009
31010 sub = XEXP (x, 2);
31011 if (GET_CODE (sub) == NEG)
31012 sub = XEXP (sub, 0);
31013 *total += rtx_cost (sub, FMA, 2, speed);
31014 return true;
31015 }
31016
31017 case MULT:
31018 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31019 {
31020 /* ??? SSE scalar cost should be used here. */
31021 *total = cost->fmul;
31022 return false;
31023 }
31024 else if (X87_FLOAT_MODE_P (mode))
31025 {
31026 *total = cost->fmul;
31027 return false;
31028 }
31029 else if (FLOAT_MODE_P (mode))
31030 {
31031 /* ??? SSE vector cost should be used here. */
31032 *total = cost->fmul;
31033 return false;
31034 }
31035 else
31036 {
31037 rtx op0 = XEXP (x, 0);
31038 rtx op1 = XEXP (x, 1);
31039 int nbits;
31040 if (CONST_INT_P (XEXP (x, 1)))
31041 {
31042 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31043 for (nbits = 0; value != 0; value &= value - 1)
31044 nbits++;
31045 }
31046 else
31047 /* This is arbitrary. */
31048 nbits = 7;
31049
31050 /* Compute costs correctly for widening multiplication. */
31051 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31052 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31053 == GET_MODE_SIZE (mode))
31054 {
31055 int is_mulwiden = 0;
31056 enum machine_mode inner_mode = GET_MODE (op0);
31057
31058 if (GET_CODE (op0) == GET_CODE (op1))
31059 is_mulwiden = 1, op1 = XEXP (op1, 0);
31060 else if (CONST_INT_P (op1))
31061 {
31062 if (GET_CODE (op0) == SIGN_EXTEND)
31063 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31064 == INTVAL (op1);
31065 else
31066 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31067 }
31068
31069 if (is_mulwiden)
31070 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31071 }
31072
31073 *total = (cost->mult_init[MODE_INDEX (mode)]
31074 + nbits * cost->mult_bit
31075 + rtx_cost (op0, outer_code, opno, speed)
31076 + rtx_cost (op1, outer_code, opno, speed));
31077
31078 return true;
31079 }
31080
31081 case DIV:
31082 case UDIV:
31083 case MOD:
31084 case UMOD:
31085 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31086 /* ??? SSE cost should be used here. */
31087 *total = cost->fdiv;
31088 else if (X87_FLOAT_MODE_P (mode))
31089 *total = cost->fdiv;
31090 else if (FLOAT_MODE_P (mode))
31091 /* ??? SSE vector cost should be used here. */
31092 *total = cost->fdiv;
31093 else
31094 *total = cost->divide[MODE_INDEX (mode)];
31095 return false;
31096
31097 case PLUS:
31098 if (GET_MODE_CLASS (mode) == MODE_INT
31099 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31100 {
31101 if (GET_CODE (XEXP (x, 0)) == PLUS
31102 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31103 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31104 && CONSTANT_P (XEXP (x, 1)))
31105 {
31106 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31107 if (val == 2 || val == 4 || val == 8)
31108 {
31109 *total = cost->lea;
31110 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31111 outer_code, opno, speed);
31112 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31113 outer_code, opno, speed);
31114 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31115 return true;
31116 }
31117 }
31118 else if (GET_CODE (XEXP (x, 0)) == MULT
31119 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31120 {
31121 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31122 if (val == 2 || val == 4 || val == 8)
31123 {
31124 *total = cost->lea;
31125 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31126 outer_code, opno, speed);
31127 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31128 return true;
31129 }
31130 }
31131 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31132 {
31133 *total = cost->lea;
31134 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31135 outer_code, opno, speed);
31136 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31137 outer_code, opno, speed);
31138 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31139 return true;
31140 }
31141 }
31142 /* FALLTHRU */
31143
31144 case MINUS:
31145 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31146 {
31147 /* ??? SSE cost should be used here. */
31148 *total = cost->fadd;
31149 return false;
31150 }
31151 else if (X87_FLOAT_MODE_P (mode))
31152 {
31153 *total = cost->fadd;
31154 return false;
31155 }
31156 else if (FLOAT_MODE_P (mode))
31157 {
31158 /* ??? SSE vector cost should be used here. */
31159 *total = cost->fadd;
31160 return false;
31161 }
31162 /* FALLTHRU */
31163
31164 case AND:
31165 case IOR:
31166 case XOR:
31167 if (!TARGET_64BIT && mode == DImode)
31168 {
31169 *total = (cost->add * 2
31170 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31171 << (GET_MODE (XEXP (x, 0)) != DImode))
31172 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31173 << (GET_MODE (XEXP (x, 1)) != DImode)));
31174 return true;
31175 }
31176 /* FALLTHRU */
31177
31178 case NEG:
31179 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31180 {
31181 /* ??? SSE cost should be used here. */
31182 *total = cost->fchs;
31183 return false;
31184 }
31185 else if (X87_FLOAT_MODE_P (mode))
31186 {
31187 *total = cost->fchs;
31188 return false;
31189 }
31190 else if (FLOAT_MODE_P (mode))
31191 {
31192 /* ??? SSE vector cost should be used here. */
31193 *total = cost->fchs;
31194 return false;
31195 }
31196 /* FALLTHRU */
31197
31198 case NOT:
31199 if (!TARGET_64BIT && mode == DImode)
31200 *total = cost->add * 2;
31201 else
31202 *total = cost->add;
31203 return false;
31204
31205 case COMPARE:
31206 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31207 && XEXP (XEXP (x, 0), 1) == const1_rtx
31208 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31209 && XEXP (x, 1) == const0_rtx)
31210 {
31211 /* This kind of construct is implemented using test[bwl].
31212 Treat it as if we had an AND. */
31213 *total = (cost->add
31214 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31215 + rtx_cost (const1_rtx, outer_code, opno, speed));
31216 return true;
31217 }
31218 return false;
31219
31220 case FLOAT_EXTEND:
31221 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31222 *total = 0;
31223 return false;
31224
31225 case ABS:
31226 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31227 /* ??? SSE cost should be used here. */
31228 *total = cost->fabs;
31229 else if (X87_FLOAT_MODE_P (mode))
31230 *total = cost->fabs;
31231 else if (FLOAT_MODE_P (mode))
31232 /* ??? SSE vector cost should be used here. */
31233 *total = cost->fabs;
31234 return false;
31235
31236 case SQRT:
31237 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31238 /* ??? SSE cost should be used here. */
31239 *total = cost->fsqrt;
31240 else if (X87_FLOAT_MODE_P (mode))
31241 *total = cost->fsqrt;
31242 else if (FLOAT_MODE_P (mode))
31243 /* ??? SSE vector cost should be used here. */
31244 *total = cost->fsqrt;
31245 return false;
31246
31247 case UNSPEC:
31248 if (XINT (x, 1) == UNSPEC_TP)
31249 *total = 0;
31250 return false;
31251
31252 case VEC_SELECT:
31253 case VEC_CONCAT:
31254 case VEC_MERGE:
31255 case VEC_DUPLICATE:
31256 /* ??? Assume all of these vector manipulation patterns are
31257 recognizable, in which case they all pretty much have the
31258 same cost. */
31259 *total = COSTS_N_INSNS (1);
31260 return true;
31261
31262 default:
31263 return false;
31264 }
31265 }
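
/* Illustrative note on the PLUS handling above: address-shaped
   expressions such as (plus (plus (mult reg 4) reg) const) can be
   computed by a single "lea disp(%base,%index,4), %dst", so they are
   costed as one LEA plus the costs of the operands instead of as a
   separate multiply and two adds.  */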
31266
31267 #if TARGET_MACHO
31268
31269 static int current_machopic_label_num;
31270
31271 /* Given a symbol name and its associated stub, write out the
31272 definition of the stub. */
31273
31274 void
31275 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31276 {
31277 unsigned int length;
31278 char *binder_name, *symbol_name, lazy_ptr_name[32];
31279 int label = ++current_machopic_label_num;
31280
31281 /* For 64-bit we shouldn't get here. */
31282 gcc_assert (!TARGET_64BIT);
31283
31284 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31285 symb = targetm.strip_name_encoding (symb);
31286
31287 length = strlen (stub);
31288 binder_name = XALLOCAVEC (char, length + 32);
31289 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31290
31291 length = strlen (symb);
31292 symbol_name = XALLOCAVEC (char, length + 32);
31293 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31294
31295 sprintf (lazy_ptr_name, "L%d$lz", label);
31296
31297 if (MACHOPIC_ATT_STUB)
31298 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31299 else if (MACHOPIC_PURE)
31300 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31301 else
31302 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31303
31304 fprintf (file, "%s:\n", stub);
31305 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31306
31307 if (MACHOPIC_ATT_STUB)
31308 {
31309 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31310 }
31311 else if (MACHOPIC_PURE)
31312 {
31313 /* PIC stub. */
31314 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31315 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31316 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31317 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31318 label, lazy_ptr_name, label);
31319 fprintf (file, "\tjmp\t*%%ecx\n");
31320 }
31321 else
31322 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31323
31324 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31325 it needs no stub-binding-helper. */
31326 if (MACHOPIC_ATT_STUB)
31327 return;
31328
31329 fprintf (file, "%s:\n", binder_name);
31330
31331 if (MACHOPIC_PURE)
31332 {
31333 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31334 fprintf (file, "\tpushl\t%%ecx\n");
31335 }
31336 else
31337 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31338
31339 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31340
31341 /* N.B. Keep the correspondence of these
31342 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31343 old-pic/new-pic/non-pic stubs; altering this will break
31344 compatibility with existing dylibs. */
31345 if (MACHOPIC_PURE)
31346 {
31347 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31348 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31349 }
31350 else
31351 /* 16-byte -mdynamic-no-pic stub. */
31352 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
31353
31354 fprintf (file, "%s:\n", lazy_ptr_name);
31355 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31356 fprintf (file, ASM_LONG "%s\n", binder_name);
31357 }
31358 #endif /* TARGET_MACHO */
31359
31360 /* Order the registers for register allocator. */
31361
31362 void
31363 x86_order_regs_for_local_alloc (void)
31364 {
31365 int pos = 0;
31366 int i;
31367
31368 /* First allocate the local general purpose registers. */
31369 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31370 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31371 reg_alloc_order [pos++] = i;
31372
31373 /* Global general purpose registers. */
31374 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31375 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31376 reg_alloc_order [pos++] = i;
31377
31378 /* x87 registers come first in case we are doing FP math
31379 using them. */
31380 if (!TARGET_SSE_MATH)
31381 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31382 reg_alloc_order [pos++] = i;
31383
31384 /* SSE registers. */
31385 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31386 reg_alloc_order [pos++] = i;
31387 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31388 reg_alloc_order [pos++] = i;
31389
31390 /* x87 registers. */
31391 if (TARGET_SSE_MATH)
31392 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31393 reg_alloc_order [pos++] = i;
31394
31395 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31396 reg_alloc_order [pos++] = i;
31397
31398 /* Initialize the rest of the array, as we do not allocate some
31399 registers at all. */
31400 while (pos < FIRST_PSEUDO_REGISTER)
31401 reg_alloc_order [pos++] = 0;
31402 }
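
/* Illustrative effect of the ordering above: call-clobbered integer
   registers (e.g. %eax, %ecx, %edx) are preferred first, then the
   call-saved ones; the x87 stack registers are placed ahead of the SSE
   registers only when they are actually used for FP math
   (!TARGET_SSE_MATH), and the MMX registers come last.  */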
31403
31404 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31405 in struct attribute_spec.handler. */
31406 static tree
31407 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31408 tree args,
31409 int flags ATTRIBUTE_UNUSED,
31410 bool *no_add_attrs)
31411 {
31412 if (TREE_CODE (*node) != FUNCTION_TYPE
31413 && TREE_CODE (*node) != METHOD_TYPE
31414 && TREE_CODE (*node) != FIELD_DECL
31415 && TREE_CODE (*node) != TYPE_DECL)
31416 {
31417 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31418 name);
31419 *no_add_attrs = true;
31420 return NULL_TREE;
31421 }
31422 if (TARGET_64BIT)
31423 {
31424 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31425 name);
31426 *no_add_attrs = true;
31427 return NULL_TREE;
31428 }
31429 if (is_attribute_p ("callee_pop_aggregate_return", name))
31430 {
31431 tree cst;
31432
31433 cst = TREE_VALUE (args);
31434 if (TREE_CODE (cst) != INTEGER_CST)
31435 {
31436 warning (OPT_Wattributes,
31437 "%qE attribute requires an integer constant argument",
31438 name);
31439 *no_add_attrs = true;
31440 }
31441 else if (compare_tree_int (cst, 0) != 0
31442 && compare_tree_int (cst, 1) != 0)
31443 {
31444 warning (OPT_Wattributes,
31445 "argument to %qE attribute is neither zero, nor one",
31446 name);
31447 *no_add_attrs = true;
31448 }
31449
31450 return NULL_TREE;
31451 }
31452
31453 return NULL_TREE;
31454 }
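
/* Illustrative use of the attribute validated above:

     struct big f (void) __attribute__ ((callee_pop_aggregate_return (1)));

   An argument of 1 asks the callee to pop the hidden pointer through
   which the aggregate is returned, 0 asks the caller to pop it, and
   anything else (or use on a 64-bit target) is diagnosed.  */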
31455
31456 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
31457 struct attribute_spec.handler. */
31458 static tree
31459 ix86_handle_abi_attribute (tree *node, tree name,
31460 tree args ATTRIBUTE_UNUSED,
31461 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31462 {
31463 if (TREE_CODE (*node) != FUNCTION_TYPE
31464 && TREE_CODE (*node) != METHOD_TYPE
31465 && TREE_CODE (*node) != FIELD_DECL
31466 && TREE_CODE (*node) != TYPE_DECL)
31467 {
31468 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31469 name);
31470 *no_add_attrs = true;
31471 return NULL_TREE;
31472 }
31473
31474 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
31475 if (is_attribute_p ("ms_abi", name))
31476 {
31477 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31478 {
31479 error ("ms_abi and sysv_abi attributes are not compatible");
31480 }
31481
31482 return NULL_TREE;
31483 }
31484 else if (is_attribute_p ("sysv_abi", name))
31485 {
31486 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31487 {
31488 error ("ms_abi and sysv_abi attributes are not compatible");
31489 }
31490
31491 return NULL_TREE;
31492 }
31493
31494 return NULL_TREE;
31495 }
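
/* Illustrative use of the attributes validated above:

     void __attribute__ ((ms_abi)) f (void);

   forces F to use the Microsoft x86-64 calling convention (integer
   arguments in RCX, RDX, R8, R9), while sysv_abi forces the System V
   convention (RDI, RSI, RDX, RCX, R8, R9); combining the two on one
   type is rejected with the error above.  */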
31496
31497 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31498 struct attribute_spec.handler. */
31499 static tree
31500 ix86_handle_struct_attribute (tree *node, tree name,
31501 tree args ATTRIBUTE_UNUSED,
31502 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31503 {
31504 tree *type = NULL;
31505 if (DECL_P (*node))
31506 {
31507 if (TREE_CODE (*node) == TYPE_DECL)
31508 type = &TREE_TYPE (*node);
31509 }
31510 else
31511 type = node;
31512
31513 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31514 || TREE_CODE (*type) == UNION_TYPE)))
31515 {
31516 warning (OPT_Wattributes, "%qE attribute ignored",
31517 name);
31518 *no_add_attrs = true;
31519 }
31520
31521 else if ((is_attribute_p ("ms_struct", name)
31522 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31523 || ((is_attribute_p ("gcc_struct", name)
31524 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31525 {
31526 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31527 name);
31528 *no_add_attrs = true;
31529 }
31530
31531 return NULL_TREE;
31532 }
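
/* Illustrative use of the attributes validated above:

     struct __attribute__ ((ms_struct)) s { char c; int f : 4; };

   requests the Microsoft record and bit-field layout for S, while
   gcc_struct requests the native GCC layout; anything that is not a
   struct or union, or a combination of the two attributes, draws the
   warnings above.  */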
31533
31534 static tree
31535 ix86_handle_fndecl_attribute (tree *node, tree name,
31536 tree args ATTRIBUTE_UNUSED,
31537 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31538 {
31539 if (TREE_CODE (*node) != FUNCTION_DECL)
31540 {
31541 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31542 name);
31543 *no_add_attrs = true;
31544 }
31545 return NULL_TREE;
31546 }
31547
31548 static bool
31549 ix86_ms_bitfield_layout_p (const_tree record_type)
31550 {
31551 return ((TARGET_MS_BITFIELD_LAYOUT
31552 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31553 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31554 }
31555
31556 /* Returns an expression indicating where the this parameter is
31557 located on entry to the FUNCTION. */
31558
31559 static rtx
31560 x86_this_parameter (tree function)
31561 {
31562 tree type = TREE_TYPE (function);
31563 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31564 int nregs;
31565
31566 if (TARGET_64BIT)
31567 {
31568 const int *parm_regs;
31569
31570 if (ix86_function_type_abi (type) == MS_ABI)
31571 parm_regs = x86_64_ms_abi_int_parameter_registers;
31572 else
31573 parm_regs = x86_64_int_parameter_registers;
31574 return gen_rtx_REG (DImode, parm_regs[aggr]);
31575 }
31576
31577 nregs = ix86_function_regparm (type, function);
31578
31579 if (nregs > 0 && !stdarg_p (type))
31580 {
31581 int regno;
31582 unsigned int ccvt = ix86_get_callcvt (type);
31583
31584 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31585 regno = aggr ? DX_REG : CX_REG;
31586 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31587 {
31588 regno = CX_REG;
31589 if (aggr)
31590 return gen_rtx_MEM (SImode,
31591 plus_constant (stack_pointer_rtx, 4));
31592 }
31593 else
31594 {
31595 regno = AX_REG;
31596 if (aggr)
31597 {
31598 regno = DX_REG;
31599 if (nregs == 1)
31600 return gen_rtx_MEM (SImode,
31601 plus_constant (stack_pointer_rtx, 4));
31602 }
31603 }
31604 return gen_rtx_REG (SImode, regno);
31605 }
31606
31607 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31608 }
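
/* Illustrative results of the logic above for 32-bit code: with
   fastcall, THIS arrives in %ecx (or in %edx when a hidden
   aggregate-return pointer already occupies %ecx); with the default
   stack convention it is found at 4(%esp), or at 8(%esp) when the
   aggregate-return pointer is pushed first.  */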
31609
31610 /* Determine whether x86_output_mi_thunk can succeed. */
31611
31612 static bool
31613 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31614 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31615 HOST_WIDE_INT vcall_offset, const_tree function)
31616 {
31617 /* 64-bit can handle anything. */
31618 if (TARGET_64BIT)
31619 return true;
31620
31621 /* For 32-bit, everything's fine if we have one free register. */
31622 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31623 return true;
31624
31625 /* Need a free register for vcall_offset. */
31626 if (vcall_offset)
31627 return false;
31628
31629 /* Need a free register for GOT references. */
31630 if (flag_pic && !targetm.binds_local_p (function))
31631 return false;
31632
31633 /* Otherwise ok. */
31634 return true;
31635 }
31636
31637 /* Output the assembler code for a thunk function. THUNK_DECL is the
31638 declaration for the thunk function itself, FUNCTION is the decl for
31639 the target function. DELTA is an immediate constant offset to be
31640 added to THIS. If VCALL_OFFSET is nonzero, the word at
31641 *(*this + vcall_offset) should be added to THIS. */
31642
31643 static void
31644 x86_output_mi_thunk (FILE *file,
31645 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31646 HOST_WIDE_INT vcall_offset, tree function)
31647 {
31648 rtx this_param = x86_this_parameter (function);
31649 rtx this_reg, tmp, fnaddr;
31650
31651 emit_note (NOTE_INSN_PROLOGUE_END);
31652
31653 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
31654 pull it in now and let DELTA benefit. */
31655 if (REG_P (this_param))
31656 this_reg = this_param;
31657 else if (vcall_offset)
31658 {
31659 /* Put the this parameter into %eax. */
31660 this_reg = gen_rtx_REG (Pmode, AX_REG);
31661 emit_move_insn (this_reg, this_param);
31662 }
31663 else
31664 this_reg = NULL_RTX;
31665
31666 /* Adjust the this parameter by a fixed constant. */
31667 if (delta)
31668 {
31669 rtx delta_rtx = GEN_INT (delta);
31670 rtx delta_dst = this_reg ? this_reg : this_param;
31671
31672 if (TARGET_64BIT)
31673 {
31674 if (!x86_64_general_operand (delta_rtx, Pmode))
31675 {
31676 tmp = gen_rtx_REG (Pmode, R10_REG);
31677 emit_move_insn (tmp, delta_rtx);
31678 delta_rtx = tmp;
31679 }
31680 }
31681
31682 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
31683 }
31684
31685 /* Adjust the this parameter by a value stored in the vtable. */
31686 if (vcall_offset)
31687 {
31688 rtx vcall_addr, vcall_mem, this_mem;
31689 unsigned int tmp_regno;
31690
31691 if (TARGET_64BIT)
31692 tmp_regno = R10_REG;
31693 else
31694 {
31695 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
31696 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
31697 tmp_regno = AX_REG;
31698 else
31699 tmp_regno = CX_REG;
31700 }
31701 tmp = gen_rtx_REG (Pmode, tmp_regno);
31702
31703 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
31704 if (Pmode != ptr_mode)
31705 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
31706 emit_move_insn (tmp, this_mem);
31707
31708 /* Adjust the this parameter. */
31709 vcall_addr = plus_constant (tmp, vcall_offset);
31710 if (TARGET_64BIT
31711 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
31712 {
31713 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
31714 emit_move_insn (tmp2, GEN_INT (vcall_offset));
31715 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
31716 }
31717
31718 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
31719 if (Pmode != ptr_mode)
31720 emit_insn (gen_addsi_1_zext (this_reg,
31721 gen_rtx_REG (ptr_mode,
31722 REGNO (this_reg)),
31723 vcall_mem));
31724 else
31725 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
31726 }
31727
31728 /* If necessary, drop THIS back to its stack slot. */
31729 if (this_reg && this_reg != this_param)
31730 emit_move_insn (this_param, this_reg);
31731
31732 fnaddr = XEXP (DECL_RTL (function), 0);
31733 if (TARGET_64BIT)
31734 {
31735 if (!flag_pic || targetm.binds_local_p (function)
31736 || cfun->machine->call_abi == MS_ABI)
31737 ;
31738 else
31739 {
31740 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
31741 tmp = gen_rtx_CONST (Pmode, tmp);
31742 fnaddr = gen_rtx_MEM (Pmode, tmp);
31743 }
31744 }
31745 else
31746 {
31747 if (!flag_pic || targetm.binds_local_p (function))
31748 ;
31749 #if TARGET_MACHO
31750 else if (TARGET_MACHO)
31751 {
31752 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
31753 fnaddr = XEXP (fnaddr, 0);
31754 }
31755 #endif /* TARGET_MACHO */
31756 else
31757 {
31758 tmp = gen_rtx_REG (Pmode, CX_REG);
31759 output_set_got (tmp, NULL_RTX);
31760
31761 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
31762 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
31763 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
31764 }
31765 }
31766
31767 /* Our sibling call patterns do not allow memories, because we have no
31768 predicate that can distinguish between frame and non-frame memory.
31769 For our purposes here, we can get away with (ab)using a jump pattern,
31770 because we're going to do no optimization. */
31771 if (MEM_P (fnaddr))
31772 emit_jump_insn (gen_indirect_jump (fnaddr));
31773 else
31774 {
31775 tmp = gen_rtx_MEM (QImode, fnaddr);
31776 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
31777 tmp = emit_call_insn (tmp);
31778 SIBLING_CALL_P (tmp) = 1;
31779 }
31780 emit_barrier ();
31781
31782 /* Emit just enough of rest_of_compilation to get the insns emitted.
31783 Note that use_thunk calls assemble_start_function et al. */
31784 tmp = get_insns ();
31785 insn_locators_alloc ();
31786 shorten_branches (tmp);
31787 final_start_function (tmp, file, 1);
31788 final (tmp, file, 1);
31789 final_end_function ();
31790 }
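
/* In C-like pseudo code (illustrative only), the thunk emitted above
   does roughly:

     this += DELTA;
     if (VCALL_OFFSET)
       this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
     return FUNCTION (this, ...);      -- emitted as a sibling call/jump

   The real work is, of course, done on RTL as shown.  */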
31791
31792 static void
31793 x86_file_start (void)
31794 {
31795 default_file_start ();
31796 #if TARGET_MACHO
31797 darwin_file_start ();
31798 #endif
31799 if (X86_FILE_START_VERSION_DIRECTIVE)
31800 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
31801 if (X86_FILE_START_FLTUSED)
31802 fputs ("\t.global\t__fltused\n", asm_out_file);
31803 if (ix86_asm_dialect == ASM_INTEL)
31804 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
31805 }
31806
31807 int
31808 x86_field_alignment (tree field, int computed)
31809 {
31810 enum machine_mode mode;
31811 tree type = TREE_TYPE (field);
31812
31813 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
31814 return computed;
31815 mode = TYPE_MODE (strip_array_types (type));
31816 if (mode == DFmode || mode == DCmode
31817 || GET_MODE_CLASS (mode) == MODE_INT
31818 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
31819 return MIN (32, computed);
31820 return computed;
31821 }
31822
31823 /* Output assembler code to FILE to increment profiler label # LABELNO
31824 for profiling a function entry. */
31825 void
31826 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
31827 {
31828 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
31829 : MCOUNT_NAME);
31830
31831 if (TARGET_64BIT)
31832 {
31833 #ifndef NO_PROFILE_COUNTERS
31834 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
31835 #endif
31836
31837 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
31838 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
31839 else
31840 fprintf (file, "\tcall\t%s\n", mcount_name);
31841 }
31842 else if (flag_pic)
31843 {
31844 #ifndef NO_PROFILE_COUNTERS
31845 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
31846 LPREFIX, labelno);
31847 #endif
31848 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
31849 }
31850 else
31851 {
31852 #ifndef NO_PROFILE_COUNTERS
31853 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
31854 LPREFIX, labelno);
31855 #endif
31856 fprintf (file, "\tcall\t%s\n", mcount_name);
31857 }
31858 }
31859
31860 /* We don't have exact information about insn sizes, but we may safely
31861 assume that we know about all 1-byte insns and about memory address
31862 sizes. This is enough to eliminate unnecessary padding in
31863 99% of cases. */
31864
31865 static int
31866 min_insn_size (rtx insn)
31867 {
31868 int l = 0, len;
31869
31870 if (!INSN_P (insn) || !active_insn_p (insn))
31871 return 0;
31872
31873 /* Discard alignments we've emitted and jump table data. */
31874 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
31875 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
31876 return 0;
31877 if (JUMP_TABLE_DATA_P (insn))
31878 return 0;
31879
31880 /* Important case: calls are always 5 bytes.
31881 It is common to have many calls in a row. */
31882 if (CALL_P (insn)
31883 && symbolic_reference_mentioned_p (PATTERN (insn))
31884 && !SIBLING_CALL_P (insn))
31885 return 5;
31886 len = get_attr_length (insn);
31887 if (len <= 1)
31888 return 1;
31889
31890 /* For normal instructions we rely on get_attr_length being exact,
31891 with a few exceptions. */
31892 if (!JUMP_P (insn))
31893 {
31894 enum attr_type type = get_attr_type (insn);
31895
31896 switch (type)
31897 {
31898 case TYPE_MULTI:
31899 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
31900 || asm_noperands (PATTERN (insn)) >= 0)
31901 return 0;
31902 break;
31903 case TYPE_OTHER:
31904 case TYPE_FCMP:
31905 break;
31906 default:
31907 /* Otherwise trust get_attr_length. */
31908 return len;
31909 }
31910
31911 l = get_attr_length_address (insn);
31912 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
31913 l = 4;
31914 }
31915 if (l)
31916 return 1 + l;
31917 else
31918 return 2;
31919 }
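
/* Illustrative detail for the CALL special case above: a direct
   "call foo" is encoded as opcode E8 followed by a 32-bit displacement,
   i.e. exactly 5 bytes, so symbolic non-sibling calls can be sized
   precisely without consulting get_attr_length.  */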
31920
31921 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
31922
31923 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
31924 window. */
31925
31926 static void
31927 ix86_avoid_jump_mispredicts (void)
31928 {
31929 rtx insn, start = get_insns ();
31930 int nbytes = 0, njumps = 0;
31931 int isjump = 0;
31932
31933 /* Look for all minimal intervals of instructions containing 4 jumps.
31934 The intervals are bounded by START and INSN.  NBYTES is the total
31935 size of instructions in the interval including INSN and not including
31936 START.  When NBYTES is smaller than 16 bytes, it is possible
31937 that START and INSN end up in the same 16-byte window.
31938
31939 The smallest offset in the window at which INSN can start is the case
31940 where START ends at offset 0.  The offset of INSN is then
31941 NBYTES - sizeof (INSN).  We add a p2align to the 16-byte window with
31942 maxskip 15 - NBYTES + sizeof (INSN).
31942 */
31943 for (insn = start; insn; insn = NEXT_INSN (insn))
31944 {
31945 int min_size;
31946
31947 if (LABEL_P (insn))
31948 {
31949 int align = label_to_alignment (insn);
31950 int max_skip = label_to_max_skip (insn);
31951
31952 if (max_skip > 15)
31953 max_skip = 15;
31954 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
31955 already in the current 16 byte page, because otherwise
31956 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
31957 bytes to reach 16 byte boundary. */
31958 if (align <= 0
31959 || (align <= 3 && max_skip != (1 << align) - 1))
31960 max_skip = 0;
31961 if (dump_file)
31962 fprintf (dump_file, "Label %i with max_skip %i\n",
31963 INSN_UID (insn), max_skip);
31964 if (max_skip)
31965 {
31966 while (nbytes + max_skip >= 16)
31967 {
31968 start = NEXT_INSN (start);
31969 if ((JUMP_P (start)
31970 && GET_CODE (PATTERN (start)) != ADDR_VEC
31971 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
31972 || CALL_P (start))
31973 njumps--, isjump = 1;
31974 else
31975 isjump = 0;
31976 nbytes -= min_insn_size (start);
31977 }
31978 }
31979 continue;
31980 }
31981
31982 min_size = min_insn_size (insn);
31983 nbytes += min_size;
31984 if (dump_file)
31985 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
31986 INSN_UID (insn), min_size);
31987 if ((JUMP_P (insn)
31988 && GET_CODE (PATTERN (insn)) != ADDR_VEC
31989 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
31990 || CALL_P (insn))
31991 njumps++;
31992 else
31993 continue;
31994
31995 while (njumps > 3)
31996 {
31997 start = NEXT_INSN (start);
31998 if ((JUMP_P (start)
31999 && GET_CODE (PATTERN (start)) != ADDR_VEC
32000 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32001 || CALL_P (start))
32002 njumps--, isjump = 1;
32003 else
32004 isjump = 0;
32005 nbytes -= min_insn_size (start);
32006 }
32007 gcc_assert (njumps >= 0);
32008 if (dump_file)
32009 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32010 INSN_UID (start), INSN_UID (insn), nbytes);
32011
32012 if (njumps == 3 && isjump && nbytes < 16)
32013 {
32014 int padsize = 15 - nbytes + min_insn_size (insn);
32015
32016 if (dump_file)
32017 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32018 INSN_UID (insn), padsize);
32019 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32020 }
32021 }
32022 }
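
/* Worked example (illustrative): if the fourth jump INSN would land in
   the same 16-byte window as three earlier jumps, with NBYTES == 12 for
   the interval and min_insn_size (INSN) == 2, the code above emits a
   pad of 15 - 12 + 2 = 5 bytes before INSN, pushing it out of that
   window so that four jumps can never share one 16-byte window.  */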
32023 #endif
32024
32025 /* AMD Athlon works faster
32026 when RET is not the destination of a conditional jump or directly preceded
32027 by another jump instruction. We avoid the penalty by inserting NOP just
32028 before the RET instruction in such cases. */
32029 static void
32030 ix86_pad_returns (void)
32031 {
32032 edge e;
32033 edge_iterator ei;
32034
32035 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32036 {
32037 basic_block bb = e->src;
32038 rtx ret = BB_END (bb);
32039 rtx prev;
32040 bool replace = false;
32041
32042 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32043 || optimize_bb_for_size_p (bb))
32044 continue;
32045 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32046 if (active_insn_p (prev) || LABEL_P (prev))
32047 break;
32048 if (prev && LABEL_P (prev))
32049 {
32050 edge e;
32051 edge_iterator ei;
32052
32053 FOR_EACH_EDGE (e, ei, bb->preds)
32054 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32055 && !(e->flags & EDGE_FALLTHRU))
32056 replace = true;
32057 }
32058 if (!replace)
32059 {
32060 prev = prev_active_insn (ret);
32061 if (prev
32062 && ((JUMP_P (prev) && any_condjump_p (prev))
32063 || CALL_P (prev)))
32064 replace = true;
32065 /* Empty functions get branch mispredict even when
32066 the jump destination is not visible to us. */
32067 if (!prev && !optimize_function_for_size_p (cfun))
32068 replace = true;
32069 }
32070 if (replace)
32071 {
32072 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32073 delete_insn (ret);
32074 }
32075 }
32076 }
32077
32078 /* Count the minimum number of instructions in BB. Return 4 if the
32079 number of instructions >= 4. */
32080
32081 static int
32082 ix86_count_insn_bb (basic_block bb)
32083 {
32084 rtx insn;
32085 int insn_count = 0;
32086
32087 /* Count number of instructions in this block. Return 4 if the number
32088 of instructions >= 4. */
32089 FOR_BB_INSNS (bb, insn)
32090 {
32091 /* This only happens in exit blocks. */
32092 if (JUMP_P (insn)
32093 && ANY_RETURN_P (PATTERN (insn)))
32094 break;
32095
32096 if (NONDEBUG_INSN_P (insn)
32097 && GET_CODE (PATTERN (insn)) != USE
32098 && GET_CODE (PATTERN (insn)) != CLOBBER)
32099 {
32100 insn_count++;
32101 if (insn_count >= 4)
32102 return insn_count;
32103 }
32104 }
32105
32106 return insn_count;
32107 }
32108
32109
32110 /* Count the minimum number of instructions in a code path ending in BB.
32111 Return 4 if the number of instructions >= 4. */
32112
32113 static int
32114 ix86_count_insn (basic_block bb)
32115 {
32116 edge e;
32117 edge_iterator ei;
32118 int min_prev_count;
32119
32120 /* Only bother counting instructions along paths with no
32121 more than 2 basic blocks between entry and exit. Given
32122 that BB has an edge to exit, determine if a predecessor
32123 of BB has an edge from entry. If so, compute the number
32124 of instructions in the predecessor block. If there
32125 happen to be multiple such blocks, compute the minimum. */
32126 min_prev_count = 4;
32127 FOR_EACH_EDGE (e, ei, bb->preds)
32128 {
32129 edge prev_e;
32130 edge_iterator prev_ei;
32131
32132 if (e->src == ENTRY_BLOCK_PTR)
32133 {
32134 min_prev_count = 0;
32135 break;
32136 }
32137 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32138 {
32139 if (prev_e->src == ENTRY_BLOCK_PTR)
32140 {
32141 int count = ix86_count_insn_bb (e->src);
32142 if (count < min_prev_count)
32143 min_prev_count = count;
32144 break;
32145 }
32146 }
32147 }
32148
32149 if (min_prev_count < 4)
32150 min_prev_count += ix86_count_insn_bb (bb);
32151
32152 return min_prev_count;
32153 }
32154
32155 /* Pad short functions to 4 instructions. */
32156
32157 static void
32158 ix86_pad_short_function (void)
32159 {
32160 edge e;
32161 edge_iterator ei;
32162
32163 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32164 {
32165 rtx ret = BB_END (e->src);
32166 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32167 {
32168 int insn_count = ix86_count_insn (e->src);
32169
32170 /* Pad short function. */
32171 if (insn_count < 4)
32172 {
32173 rtx insn = ret;
32174
32175 /* Find epilogue. */
32176 while (insn
32177 && (!NOTE_P (insn)
32178 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32179 insn = PREV_INSN (insn);
32180
32181 if (!insn)
32182 insn = ret;
32183
32184 /* Two NOPs count as one instruction. */
32185 insn_count = 2 * (4 - insn_count);
32186 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32187 }
32188 }
32189 }
32190 }
32191
32192 /* Implement machine specific optimizations. We implement padding of returns
32193 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
32194 static void
32195 ix86_reorg (void)
32196 {
32197 /* We are freeing block_for_insn in the toplev to keep compatibility
32198 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32199 compute_bb_for_insn ();
32200
32201 /* Run the vzeroupper optimization if needed. */
32202 if (TARGET_VZEROUPPER)
32203 move_or_delete_vzeroupper ();
32204
32205 if (optimize && optimize_function_for_speed_p (cfun))
32206 {
32207 if (TARGET_PAD_SHORT_FUNCTION)
32208 ix86_pad_short_function ();
32209 else if (TARGET_PAD_RETURNS)
32210 ix86_pad_returns ();
32211 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32212 if (TARGET_FOUR_JUMP_LIMIT)
32213 ix86_avoid_jump_mispredicts ();
32214 #endif
32215 }
32216 }
32217
32218 /* Return nonzero when a QImode register that must be represented via a REX
32219 prefix is used. */
32220 bool
32221 x86_extended_QIreg_mentioned_p (rtx insn)
32222 {
32223 int i;
32224 extract_insn_cached (insn);
32225 for (i = 0; i < recog_data.n_operands; i++)
32226 if (REG_P (recog_data.operand[i])
32227 && REGNO (recog_data.operand[i]) > BX_REG)
32228 return true;
32229 return false;
32230 }
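
/* Illustrative background for the check above: in QImode only the low
   bytes of %rax, %rcx, %rdx and %rbx are addressable without a REX
   prefix; the low bytes of %rsp/%rbp/%rsi/%rdi and of %r8-%r15 -- i.e.
   any register whose number is above BX_REG -- require REX.  */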
32231
32232 /* Return nonzero when P points to a register encoded via a REX prefix.
32233 Called via for_each_rtx. */
32234 static int
32235 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32236 {
32237 unsigned int regno;
32238 if (!REG_P (*p))
32239 return 0;
32240 regno = REGNO (*p);
32241 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32242 }
32243
32244 /* Return true when INSN mentions register that must be encoded using REX
32245 prefix. */
32246 bool
32247 x86_extended_reg_mentioned_p (rtx insn)
32248 {
32249 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32250 extended_reg_mentioned_1, NULL);
32251 }
32252
32253 /* If profitable, negate (without causing overflow) integer constant
32254 of mode MODE at location LOC. Return true in this case. */
32255 bool
32256 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32257 {
32258 HOST_WIDE_INT val;
32259
32260 if (!CONST_INT_P (*loc))
32261 return false;
32262
32263 switch (mode)
32264 {
32265 case DImode:
32266 /* DImode x86_64 constants must fit in 32 bits. */
32267 gcc_assert (x86_64_immediate_operand (*loc, mode));
32268
32269 mode = SImode;
32270 break;
32271
32272 case SImode:
32273 case HImode:
32274 case QImode:
32275 break;
32276
32277 default:
32278 gcc_unreachable ();
32279 }
32280
32281 /* Avoid overflows. */
32282 if (mode_signbit_p (mode, *loc))
32283 return false;
32284
32285 val = INTVAL (*loc);
32286
32287 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32288 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32289 if ((val < 0 && val != -128)
32290 || val == 128)
32291 {
32292 *loc = GEN_INT (-val);
32293 return true;
32294 }
32295
32296 return false;
32297 }
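
/* Illustrative effect of the transformation above: an "addl $-4,%eax"
   pattern has its constant rewritten to $4 (and the caller prints
   "subl" instead).  -128 is deliberately left alone and +128 is
   negated, because -128 still fits in a sign-extended 8-bit immediate
   whereas +128 needs a full 32-bit one.  */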
32298
32299 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32300 optabs would emit if we didn't have TFmode patterns. */
32301
32302 void
32303 x86_emit_floatuns (rtx operands[2])
32304 {
32305 rtx neglab, donelab, i0, i1, f0, in, out;
32306 enum machine_mode mode, inmode;
32307
32308 inmode = GET_MODE (operands[1]);
32309 gcc_assert (inmode == SImode || inmode == DImode);
32310
32311 out = operands[0];
32312 in = force_reg (inmode, operands[1]);
32313 mode = GET_MODE (out);
32314 neglab = gen_label_rtx ();
32315 donelab = gen_label_rtx ();
32316 f0 = gen_reg_rtx (mode);
32317
32318 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32319
32320 expand_float (out, in, 0);
32321
32322 emit_jump_insn (gen_jump (donelab));
32323 emit_barrier ();
32324
32325 emit_label (neglab);
32326
32327 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32328 1, OPTAB_DIRECT);
32329 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32330 1, OPTAB_DIRECT);
32331 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32332
32333 expand_float (f0, i0, 0);
32334
32335 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32336
32337 emit_label (donelab);
32338 }
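
/* A C sketch of the idiom generated above (illustrative only): values
   with the sign bit clear are converted directly; otherwise the value
   is halved while keeping its low bit (so rounding is unaffected),
   converted, and the result doubled with f0 + f0:

     double u64_to_double (unsigned long long x)
     {
       if ((long long) x >= 0)
         return (double) (long long) x;
       return 2.0 * (double) (long long) ((x >> 1) | (x & 1));
     }
*/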
32339 \f
32340 /* AVX2 does support 32-byte integer vector operations,
32341 thus the longest vector we are faced with is V32QImode. */
32342 #define MAX_VECT_LEN 32
32343
32344 struct expand_vec_perm_d
32345 {
32346 rtx target, op0, op1;
32347 unsigned char perm[MAX_VECT_LEN];
32348 enum machine_mode vmode;
32349 unsigned char nelt;
32350 bool testing_p;
32351 };
32352
32353 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32354 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32355
32356 /* Get a vector mode of the same size as the original but with elements
32357 twice as wide. This is only guaranteed to apply to integral vectors. */
32358
32359 static inline enum machine_mode
32360 get_mode_wider_vector (enum machine_mode o)
32361 {
32362 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32363 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32364 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32365 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32366 return n;
32367 }
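
/* For example, for 16-byte integer vectors this maps V16QImode to
   V8HImode and V8HImode to V4SImode: same total size, half as many
   elements, each twice as wide (illustrative; the asserts above catch
   any change in the genmodes.c ordering).  */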
32368
32369 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32370 with all elements equal to VAR. Return true if successful. */
32371
32372 static bool
32373 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32374 rtx target, rtx val)
32375 {
32376 bool ok;
32377
32378 switch (mode)
32379 {
32380 case V2SImode:
32381 case V2SFmode:
32382 if (!mmx_ok)
32383 return false;
32384 /* FALLTHRU */
32385
32386 case V4DFmode:
32387 case V4DImode:
32388 case V8SFmode:
32389 case V8SImode:
32390 case V2DFmode:
32391 case V2DImode:
32392 case V4SFmode:
32393 case V4SImode:
32394 {
32395 rtx insn, dup;
32396
32397 /* First attempt to recognize VAL as-is. */
32398 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32399 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32400 if (recog_memoized (insn) < 0)
32401 {
32402 rtx seq;
32403 /* If that fails, force VAL into a register. */
32404
32405 start_sequence ();
32406 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32407 seq = get_insns ();
32408 end_sequence ();
32409 if (seq)
32410 emit_insn_before (seq, insn);
32411
32412 ok = recog_memoized (insn) >= 0;
32413 gcc_assert (ok);
32414 }
32415 }
32416 return true;
32417
32418 case V4HImode:
32419 if (!mmx_ok)
32420 return false;
32421 if (TARGET_SSE || TARGET_3DNOW_A)
32422 {
32423 rtx x;
32424
32425 val = gen_lowpart (SImode, val);
32426 x = gen_rtx_TRUNCATE (HImode, val);
32427 x = gen_rtx_VEC_DUPLICATE (mode, x);
32428 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32429 return true;
32430 }
32431 goto widen;
32432
32433 case V8QImode:
32434 if (!mmx_ok)
32435 return false;
32436 goto widen;
32437
32438 case V8HImode:
32439 if (TARGET_SSE2)
32440 {
32441 struct expand_vec_perm_d dperm;
32442 rtx tmp1, tmp2;
32443
32444 permute:
32445 memset (&dperm, 0, sizeof (dperm));
32446 dperm.target = target;
32447 dperm.vmode = mode;
32448 dperm.nelt = GET_MODE_NUNITS (mode);
32449 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32450
32451 /* Extend to SImode using a paradoxical SUBREG. */
32452 tmp1 = gen_reg_rtx (SImode);
32453 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32454
32455 /* Insert the SImode value as low element of a V4SImode vector. */
32456 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32457 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32458
32459 ok = (expand_vec_perm_1 (&dperm)
32460 || expand_vec_perm_broadcast_1 (&dperm));
32461 gcc_assert (ok);
32462 return ok;
32463 }
32464 goto widen;
32465
32466 case V16QImode:
32467 if (TARGET_SSE2)
32468 goto permute;
32469 goto widen;
32470
32471 widen:
32472 /* Replicate the value once into the next wider mode and recurse. */
32473 {
32474 enum machine_mode smode, wsmode, wvmode;
32475 rtx x;
32476
32477 smode = GET_MODE_INNER (mode);
32478 wvmode = get_mode_wider_vector (mode);
32479 wsmode = GET_MODE_INNER (wvmode);
32480
32481 val = convert_modes (wsmode, smode, val, true);
32482 x = expand_simple_binop (wsmode, ASHIFT, val,
32483 GEN_INT (GET_MODE_BITSIZE (smode)),
32484 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32485 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32486
32487 x = gen_lowpart (wvmode, target);
32488 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32489 gcc_assert (ok);
32490 return ok;
32491 }
32492
32493 case V16HImode:
32494 case V32QImode:
32495 {
32496 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32497 rtx x = gen_reg_rtx (hvmode);
32498
32499 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32500 gcc_assert (ok);
32501
32502 x = gen_rtx_VEC_CONCAT (mode, x, x);
32503 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32504 }
32505 return true;
32506
32507 default:
32508 return false;
32509 }
32510 }
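
/* Illustrative note on the "widen" path above: to splat a QImode value
   B with no direct broadcast available, B is first replicated within a
   twice-as-wide scalar as (B << 8) | B, and the duplicate is then
   retried in the vector mode with twice-as-wide elements, recursing
   until a mode with a recognizable vec_duplicate (or the permute path)
   is reached.  */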
32511
32512 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32513 whose ONE_VAR element is VAR, and other elements are zero. Return true
32514 if successful. */
32515
32516 static bool
32517 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32518 rtx target, rtx var, int one_var)
32519 {
32520 enum machine_mode vsimode;
32521 rtx new_target;
32522 rtx x, tmp;
32523 bool use_vector_set = false;
32524
32525 switch (mode)
32526 {
32527 case V2DImode:
32528 /* For SSE4.1, we normally use vector set. But if the second
32529 element is zero and inter-unit moves are OK, we use movq
32530 instead. */
32531 use_vector_set = (TARGET_64BIT
32532 && TARGET_SSE4_1
32533 && !(TARGET_INTER_UNIT_MOVES
32534 && one_var == 0));
32535 break;
32536 case V16QImode:
32537 case V4SImode:
32538 case V4SFmode:
32539 use_vector_set = TARGET_SSE4_1;
32540 break;
32541 case V8HImode:
32542 use_vector_set = TARGET_SSE2;
32543 break;
32544 case V4HImode:
32545 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32546 break;
32547 case V32QImode:
32548 case V16HImode:
32549 case V8SImode:
32550 case V8SFmode:
32551 case V4DFmode:
32552 use_vector_set = TARGET_AVX;
32553 break;
32554 case V4DImode:
32555 /* Use ix86_expand_vector_set in 64bit mode only. */
32556 use_vector_set = TARGET_AVX && TARGET_64BIT;
32557 break;
32558 default:
32559 break;
32560 }
32561
32562 if (use_vector_set)
32563 {
32564 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32565 var = force_reg (GET_MODE_INNER (mode), var);
32566 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32567 return true;
32568 }
32569
32570 switch (mode)
32571 {
32572 case V2SFmode:
32573 case V2SImode:
32574 if (!mmx_ok)
32575 return false;
32576 /* FALLTHRU */
32577
32578 case V2DFmode:
32579 case V2DImode:
32580 if (one_var != 0)
32581 return false;
32582 var = force_reg (GET_MODE_INNER (mode), var);
32583 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32584 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32585 return true;
32586
32587 case V4SFmode:
32588 case V4SImode:
32589 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32590 new_target = gen_reg_rtx (mode);
32591 else
32592 new_target = target;
32593 var = force_reg (GET_MODE_INNER (mode), var);
32594 x = gen_rtx_VEC_DUPLICATE (mode, var);
32595 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32596 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32597 if (one_var != 0)
32598 {
32599 /* We need to shuffle the value to the correct position, so
32600 create a new pseudo to store the intermediate result. */
32601
32602 /* With SSE2, we can use the integer shuffle insns. */
32603 if (mode != V4SFmode && TARGET_SSE2)
32604 {
32605 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32606 const1_rtx,
32607 GEN_INT (one_var == 1 ? 0 : 1),
32608 GEN_INT (one_var == 2 ? 0 : 1),
32609 GEN_INT (one_var == 3 ? 0 : 1)));
32610 if (target != new_target)
32611 emit_move_insn (target, new_target);
32612 return true;
32613 }
32614
32615 /* Otherwise convert the intermediate result to V4SFmode and
32616 use the SSE1 shuffle instructions. */
32617 if (mode != V4SFmode)
32618 {
32619 tmp = gen_reg_rtx (V4SFmode);
32620 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32621 }
32622 else
32623 tmp = new_target;
32624
32625 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32626 const1_rtx,
32627 GEN_INT (one_var == 1 ? 0 : 1),
32628 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32629 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32630
32631 if (mode != V4SFmode)
32632 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32633 else if (tmp != target)
32634 emit_move_insn (target, tmp);
32635 }
32636 else if (target != new_target)
32637 emit_move_insn (target, new_target);
32638 return true;
32639
32640 case V8HImode:
32641 case V16QImode:
32642 vsimode = V4SImode;
32643 goto widen;
32644 case V4HImode:
32645 case V8QImode:
32646 if (!mmx_ok)
32647 return false;
32648 vsimode = V2SImode;
32649 goto widen;
32650 widen:
32651 if (one_var != 0)
32652 return false;
32653
32654 /* Zero extend the variable element to SImode and recurse. */
32655 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
32656
32657 x = gen_reg_rtx (vsimode);
32658 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
32659 var, one_var))
32660 gcc_unreachable ();
32661
32662 emit_move_insn (target, gen_lowpart (mode, x));
32663 return true;
32664
32665 default:
32666 return false;
32667 }
32668 }
32669
32670 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32671 consisting of the values in VALS. It is known that all elements
32672 except ONE_VAR are constants. Return true if successful. */
32673
32674 static bool
32675 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
32676 rtx target, rtx vals, int one_var)
32677 {
32678 rtx var = XVECEXP (vals, 0, one_var);
32679 enum machine_mode wmode;
32680 rtx const_vec, x;
32681
32682 const_vec = copy_rtx (vals);
32683 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
32684 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
32685
32686 switch (mode)
32687 {
32688 case V2DFmode:
32689 case V2DImode:
32690 case V2SFmode:
32691 case V2SImode:
32692 /* For the two element vectors, it's just as easy to use
32693 the general case. */
32694 return false;
32695
32696 case V4DImode:
32697 /* Use ix86_expand_vector_set in 64bit mode only. */
32698 if (!TARGET_64BIT)
32699 return false;
32700 case V4DFmode:
32701 case V8SFmode:
32702 case V8SImode:
32703 case V16HImode:
32704 case V32QImode:
32705 case V4SFmode:
32706 case V4SImode:
32707 case V8HImode:
32708 case V4HImode:
32709 break;
32710
32711 case V16QImode:
32712 if (TARGET_SSE4_1)
32713 break;
32714 wmode = V8HImode;
32715 goto widen;
32716 case V8QImode:
32717 wmode = V4HImode;
32718 goto widen;
32719 widen:
32720 /* There's no way to set one QImode entry easily. Combine
32721 the variable value with its adjacent constant value, and
32722 promote to an HImode set. */
32723 x = XVECEXP (vals, 0, one_var ^ 1);
32724 if (one_var & 1)
32725 {
32726 var = convert_modes (HImode, QImode, var, true);
32727 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
32728 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32729 x = GEN_INT (INTVAL (x) & 0xff);
32730 }
32731 else
32732 {
32733 var = convert_modes (HImode, QImode, var, true);
32734 x = gen_int_mode (INTVAL (x) << 8, HImode);
32735 }
32736 if (x != const0_rtx)
32737 var = expand_simple_binop (HImode, IOR, var, x, var,
32738 1, OPTAB_LIB_WIDEN);
32739
32740 x = gen_reg_rtx (wmode);
32741 emit_move_insn (x, gen_lowpart (wmode, const_vec));
32742 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
32743
32744 emit_move_insn (target, gen_lowpart (mode, x));
32745 return true;
32746
32747 default:
32748 return false;
32749 }
32750
32751 emit_move_insn (target, const_vec);
32752 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32753 return true;
32754 }
32755
32756 /* A subroutine of ix86_expand_vector_init_general. Use vector
32757 concatenate to handle the most general case: all values variable,
32758 and none identical. */
32759
32760 static void
32761 ix86_expand_vector_init_concat (enum machine_mode mode,
32762 rtx target, rtx *ops, int n)
32763 {
32764 enum machine_mode cmode, hmode = VOIDmode;
32765 rtx first[8], second[4];
32766 rtvec v;
32767 int i, j;
32768
32769 switch (n)
32770 {
32771 case 2:
32772 switch (mode)
32773 {
32774 case V8SImode:
32775 cmode = V4SImode;
32776 break;
32777 case V8SFmode:
32778 cmode = V4SFmode;
32779 break;
32780 case V4DImode:
32781 cmode = V2DImode;
32782 break;
32783 case V4DFmode:
32784 cmode = V2DFmode;
32785 break;
32786 case V4SImode:
32787 cmode = V2SImode;
32788 break;
32789 case V4SFmode:
32790 cmode = V2SFmode;
32791 break;
32792 case V2DImode:
32793 cmode = DImode;
32794 break;
32795 case V2SImode:
32796 cmode = SImode;
32797 break;
32798 case V2DFmode:
32799 cmode = DFmode;
32800 break;
32801 case V2SFmode:
32802 cmode = SFmode;
32803 break;
32804 default:
32805 gcc_unreachable ();
32806 }
32807
32808 if (!register_operand (ops[1], cmode))
32809 ops[1] = force_reg (cmode, ops[1]);
32810 if (!register_operand (ops[0], cmode))
32811 ops[0] = force_reg (cmode, ops[0]);
32812 emit_insn (gen_rtx_SET (VOIDmode, target,
32813 gen_rtx_VEC_CONCAT (mode, ops[0],
32814 ops[1])));
32815 break;
32816
32817 case 4:
32818 switch (mode)
32819 {
32820 case V4DImode:
32821 cmode = V2DImode;
32822 break;
32823 case V4DFmode:
32824 cmode = V2DFmode;
32825 break;
32826 case V4SImode:
32827 cmode = V2SImode;
32828 break;
32829 case V4SFmode:
32830 cmode = V2SFmode;
32831 break;
32832 default:
32833 gcc_unreachable ();
32834 }
32835 goto half;
32836
32837 case 8:
32838 switch (mode)
32839 {
32840 case V8SImode:
32841 cmode = V2SImode;
32842 hmode = V4SImode;
32843 break;
32844 case V8SFmode:
32845 cmode = V2SFmode;
32846 hmode = V4SFmode;
32847 break;
32848 default:
32849 gcc_unreachable ();
32850 }
32851 goto half;
32852
32853 half:
32854 /* FIXME: We process inputs backward to help RA. PR 36222. */
32855 i = n - 1;
32856 j = (n >> 1) - 1;
32857 for (; i > 0; i -= 2, j--)
32858 {
32859 first[j] = gen_reg_rtx (cmode);
32860 v = gen_rtvec (2, ops[i - 1], ops[i]);
32861 ix86_expand_vector_init (false, first[j],
32862 gen_rtx_PARALLEL (cmode, v));
32863 }
32864
32865 n >>= 1;
32866 if (n > 2)
32867 {
32868 gcc_assert (hmode != VOIDmode);
32869 for (i = j = 0; i < n; i += 2, j++)
32870 {
32871 second[j] = gen_reg_rtx (hmode);
32872 ix86_expand_vector_init_concat (hmode, second [j],
32873 &first [i], 2);
32874 }
32875 n >>= 1;
32876 ix86_expand_vector_init_concat (mode, target, second, n);
32877 }
32878 else
32879 ix86_expand_vector_init_concat (mode, target, first, n);
32880 break;
32881
32882 default:
32883 gcc_unreachable ();
32884 }
32885 }
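
/* Illustrative walk-through of the N == 8 case above for V8SFmode: the
   eight inputs are paired (backward, per the FIXME) into four V2SFmode
   registers, those are concatenated into two V4SFmode halves, and the
   halves are finally concatenated into the 256-bit target.  */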
32886
32887 /* A subroutine of ix86_expand_vector_init_general. Use vector
32888 interleave to handle the most general case: all values variable,
32889 and none identical. */
32890
32891 static void
32892 ix86_expand_vector_init_interleave (enum machine_mode mode,
32893 rtx target, rtx *ops, int n)
32894 {
32895 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
32896 int i, j;
32897 rtx op0, op1;
32898 rtx (*gen_load_even) (rtx, rtx, rtx);
32899 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
32900 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
32901
32902 switch (mode)
32903 {
32904 case V8HImode:
32905 gen_load_even = gen_vec_setv8hi;
32906 gen_interleave_first_low = gen_vec_interleave_lowv4si;
32907 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32908 inner_mode = HImode;
32909 first_imode = V4SImode;
32910 second_imode = V2DImode;
32911 third_imode = VOIDmode;
32912 break;
32913 case V16QImode:
32914 gen_load_even = gen_vec_setv16qi;
32915 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
32916 gen_interleave_second_low = gen_vec_interleave_lowv4si;
32917 inner_mode = QImode;
32918 first_imode = V8HImode;
32919 second_imode = V4SImode;
32920 third_imode = V2DImode;
32921 break;
32922 default:
32923 gcc_unreachable ();
32924 }
32925
32926 for (i = 0; i < n; i++)
32927 {
32928 /* Extend the odd element to SImode using a paradoxical SUBREG. */
32929 op0 = gen_reg_rtx (SImode);
32930 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
32931
32932 /* Insert the SImode value as low element of V4SImode vector. */
32933 op1 = gen_reg_rtx (V4SImode);
32934 op0 = gen_rtx_VEC_MERGE (V4SImode,
32935 gen_rtx_VEC_DUPLICATE (V4SImode,
32936 op0),
32937 CONST0_RTX (V4SImode),
32938 const1_rtx);
32939 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
32940
32941 /* Cast the V4SImode vector back to a vector in the original mode. */
32942 op0 = gen_reg_rtx (mode);
32943 emit_move_insn (op0, gen_lowpart (mode, op1));
32944
32945 /* Load even elements into the second position. */
32946 emit_insn (gen_load_even (op0,
32947 force_reg (inner_mode,
32948 ops [i + i + 1]),
32949 const1_rtx));
32950
32951 /* Cast vector to FIRST_IMODE vector. */
32952 ops[i] = gen_reg_rtx (first_imode);
32953 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
32954 }
32955
32956 /* Interleave low FIRST_IMODE vectors. */
32957 for (i = j = 0; i < n; i += 2, j++)
32958 {
32959 op0 = gen_reg_rtx (first_imode);
32960 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
32961
32962 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
32963 ops[j] = gen_reg_rtx (second_imode);
32964 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
32965 }
32966
32967 /* Interleave low SECOND_IMODE vectors. */
32968 switch (second_imode)
32969 {
32970 case V4SImode:
32971 for (i = j = 0; i < n / 2; i += 2, j++)
32972 {
32973 op0 = gen_reg_rtx (second_imode);
32974 emit_insn (gen_interleave_second_low (op0, ops[i],
32975 ops[i + 1]));
32976
32977 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
32978 vector. */
32979 ops[j] = gen_reg_rtx (third_imode);
32980 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
32981 }
32982 second_imode = V2DImode;
32983 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32984 /* FALLTHRU */
32985
32986 case V2DImode:
32987 op0 = gen_reg_rtx (second_imode);
32988 emit_insn (gen_interleave_second_low (op0, ops[0],
32989 ops[1]));
32990
32991 /* Cast the SECOND_IMODE vector back to a vector in the original
32992 mode. */
32993 emit_insn (gen_rtx_SET (VOIDmode, target,
32994 gen_lowpart (mode, op0)));
32995 break;
32996
32997 default:
32998 gcc_unreachable ();
32999 }
33000 }
33001
33002 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33003 all values variable, and none identical. */
33004
33005 static void
33006 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33007 rtx target, rtx vals)
33008 {
33009 rtx ops[32], op0, op1;
33010 enum machine_mode half_mode = VOIDmode;
33011 int n, i;
33012
33013 switch (mode)
33014 {
33015 case V2SFmode:
33016 case V2SImode:
33017 if (!mmx_ok && !TARGET_SSE)
33018 break;
33019 /* FALLTHRU */
33020
33021 case V8SFmode:
33022 case V8SImode:
33023 case V4DFmode:
33024 case V4DImode:
33025 case V4SFmode:
33026 case V4SImode:
33027 case V2DFmode:
33028 case V2DImode:
33029 n = GET_MODE_NUNITS (mode);
33030 for (i = 0; i < n; i++)
33031 ops[i] = XVECEXP (vals, 0, i);
33032 ix86_expand_vector_init_concat (mode, target, ops, n);
33033 return;
33034
33035 case V32QImode:
33036 half_mode = V16QImode;
33037 goto half;
33038
33039 case V16HImode:
33040 half_mode = V8HImode;
33041 goto half;
33042
33043 half:
33044 n = GET_MODE_NUNITS (mode);
33045 for (i = 0; i < n; i++)
33046 ops[i] = XVECEXP (vals, 0, i);
33047 op0 = gen_reg_rtx (half_mode);
33048 op1 = gen_reg_rtx (half_mode);
33049 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33050 n >> 2);
33051 ix86_expand_vector_init_interleave (half_mode, op1,
33052 &ops [n >> 1], n >> 2);
33053 emit_insn (gen_rtx_SET (VOIDmode, target,
33054 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33055 return;
33056
33057 case V16QImode:
33058 if (!TARGET_SSE4_1)
33059 break;
33060 /* FALLTHRU */
33061
33062 case V8HImode:
33063 if (!TARGET_SSE2)
33064 break;
33065
33066 /* Don't use ix86_expand_vector_init_interleave if we can't
33067 move from GPR to SSE register directly. */
33068 if (!TARGET_INTER_UNIT_MOVES)
33069 break;
33070
33071 n = GET_MODE_NUNITS (mode);
33072 for (i = 0; i < n; i++)
33073 ops[i] = XVECEXP (vals, 0, i);
33074 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33075 return;
33076
33077 case V4HImode:
33078 case V8QImode:
33079 break;
33080
33081 default:
33082 gcc_unreachable ();
33083 }
33084
33085 {
33086 int i, j, n_elts, n_words, n_elt_per_word;
33087 enum machine_mode inner_mode;
33088 rtx words[4], shift;
33089
33090 inner_mode = GET_MODE_INNER (mode);
33091 n_elts = GET_MODE_NUNITS (mode);
33092 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33093 n_elt_per_word = n_elts / n_words;
33094 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33095
33096 for (i = 0; i < n_words; ++i)
33097 {
33098 rtx word = NULL_RTX;
33099
33100 for (j = 0; j < n_elt_per_word; ++j)
33101 {
33102 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33103 elt = convert_modes (word_mode, inner_mode, elt, true);
33104
33105 if (j == 0)
33106 word = elt;
33107 else
33108 {
33109 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33110 word, 1, OPTAB_LIB_WIDEN);
33111 word = expand_simple_binop (word_mode, IOR, word, elt,
33112 word, 1, OPTAB_LIB_WIDEN);
33113 }
33114 }
33115
33116 words[i] = word;
33117 }
33118
33119 if (n_words == 1)
33120 emit_move_insn (target, gen_lowpart (mode, words[0]));
33121 else if (n_words == 2)
33122 {
33123 rtx tmp = gen_reg_rtx (mode);
33124 emit_clobber (tmp);
33125 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33126 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33127 emit_move_insn (target, tmp);
33128 }
33129 else if (n_words == 4)
33130 {
33131 rtx tmp = gen_reg_rtx (V4SImode);
33132 gcc_assert (word_mode == SImode);
33133 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33134 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33135 emit_move_insn (target, gen_lowpart (mode, tmp));
33136 }
33137 else
33138 gcc_unreachable ();
33139 }
33140 }
33141
33142 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33143 instructions unless MMX_OK is true. */
33144
33145 void
33146 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33147 {
33148 enum machine_mode mode = GET_MODE (target);
33149 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33150 int n_elts = GET_MODE_NUNITS (mode);
33151 int n_var = 0, one_var = -1;
33152 bool all_same = true, all_const_zero = true;
33153 int i;
33154 rtx x;
33155
33156 for (i = 0; i < n_elts; ++i)
33157 {
33158 x = XVECEXP (vals, 0, i);
33159 if (!(CONST_INT_P (x)
33160 || GET_CODE (x) == CONST_DOUBLE
33161 || GET_CODE (x) == CONST_FIXED))
33162 n_var++, one_var = i;
33163 else if (x != CONST0_RTX (inner_mode))
33164 all_const_zero = false;
33165 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33166 all_same = false;
33167 }
33168
33169 /* Constants are best loaded from the constant pool. */
33170 if (n_var == 0)
33171 {
33172 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33173 return;
33174 }
33175
33176 /* If all values are identical, broadcast the value. */
33177 if (all_same
33178 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33179 XVECEXP (vals, 0, 0)))
33180 return;
33181
33182 /* Values where only one field is non-constant are best loaded from
33183 the pool and overwritten via move later. */
33184 if (n_var == 1)
33185 {
33186 if (all_const_zero
33187 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33188 XVECEXP (vals, 0, one_var),
33189 one_var))
33190 return;
33191
33192 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33193 return;
33194 }
33195
33196 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33197 }
33198
33199 void
33200 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33201 {
33202 enum machine_mode mode = GET_MODE (target);
33203 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33204 enum machine_mode half_mode;
33205 bool use_vec_merge = false;
33206 rtx tmp;
33207 static rtx (*gen_extract[6][2]) (rtx, rtx)
33208 = {
33209 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33210 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33211 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33212 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33213 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33214 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33215 };
33216 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33217 = {
33218 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33219 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33220 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33221 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33222 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33223 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33224 };
33225 int i, j, n;
33226
33227 switch (mode)
33228 {
33229 case V2SFmode:
33230 case V2SImode:
33231 if (mmx_ok)
33232 {
33233 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33234 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33235 if (elt == 0)
33236 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33237 else
33238 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33239 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33240 return;
33241 }
33242 break;
33243
33244 case V2DImode:
33245 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33246 if (use_vec_merge)
33247 break;
33248
33249 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33250 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33251 if (elt == 0)
33252 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33253 else
33254 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33255 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33256 return;
33257
33258 case V2DFmode:
33259 {
33260 rtx op0, op1;
33261
33262 /* For the two element vectors, we implement a VEC_CONCAT with
33263 the extraction of the other element. */
33264
33265 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33266 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33267
33268 if (elt == 0)
33269 op0 = val, op1 = tmp;
33270 else
33271 op0 = tmp, op1 = val;
33272
33273 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33274 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33275 }
33276 return;
33277
33278 case V4SFmode:
33279 use_vec_merge = TARGET_SSE4_1;
33280 if (use_vec_merge)
33281 break;
33282
33283 switch (elt)
33284 {
33285 case 0:
33286 use_vec_merge = true;
33287 break;
33288
33289 case 1:
33290 /* tmp = target = A B C D */
33291 tmp = copy_to_reg (target);
33292 /* target = A A B B */
33293 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33294 /* target = X A B B */
33295 ix86_expand_vector_set (false, target, val, 0);
33296 /* target = A X C D */
33297 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33298 const1_rtx, const0_rtx,
33299 GEN_INT (2+4), GEN_INT (3+4)));
33300 return;
33301
33302 case 2:
33303 /* tmp = target = A B C D */
33304 tmp = copy_to_reg (target);
33305 /* tmp = X B C D */
33306 ix86_expand_vector_set (false, tmp, val, 0);
33307 /* target = A B X D */
33308 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33309 const0_rtx, const1_rtx,
33310 GEN_INT (0+4), GEN_INT (3+4)));
33311 return;
33312
33313 case 3:
33314 /* tmp = target = A B C D */
33315 tmp = copy_to_reg (target);
33316 /* tmp = X B C D */
33317 ix86_expand_vector_set (false, tmp, val, 0);
33318 /* target = A B C X */
33319 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33320 const0_rtx, const1_rtx,
33321 GEN_INT (2+4), GEN_INT (0+4)));
33322 return;
33323
33324 default:
33325 gcc_unreachable ();
33326 }
33327 break;
33328
33329 case V4SImode:
33330 use_vec_merge = TARGET_SSE4_1;
33331 if (use_vec_merge)
33332 break;
33333
33334 /* Element 0 handled by vec_merge below. */
33335 if (elt == 0)
33336 {
33337 use_vec_merge = true;
33338 break;
33339 }
33340
33341 if (TARGET_SSE2)
33342 {
33343 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33344 store into element 0, then shuffle them back. */
33345
33346 rtx order[4];
33347
33348 order[0] = GEN_INT (elt);
33349 order[1] = const1_rtx;
33350 order[2] = const2_rtx;
33351 order[3] = GEN_INT (3);
33352 order[elt] = const0_rtx;
33353
33354 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33355 order[1], order[2], order[3]));
33356
33357 ix86_expand_vector_set (false, target, val, 0);
33358
33359 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33360 order[1], order[2], order[3]));
33361 }
33362 else
33363 {
33364 /* For SSE1, we have to reuse the V4SF code. */
33365 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33366 gen_lowpart (SFmode, val), elt);
33367 }
33368 return;
33369
33370 case V8HImode:
33371 use_vec_merge = TARGET_SSE2;
33372 break;
33373 case V4HImode:
33374 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33375 break;
33376
33377 case V16QImode:
33378 use_vec_merge = TARGET_SSE4_1;
33379 break;
33380
33381 case V8QImode:
33382 break;
33383
33384 case V32QImode:
33385 half_mode = V16QImode;
33386 j = 0;
33387 n = 16;
33388 goto half;
33389
33390 case V16HImode:
33391 half_mode = V8HImode;
33392 j = 1;
33393 n = 8;
33394 goto half;
33395
33396 case V8SImode:
33397 half_mode = V4SImode;
33398 j = 2;
33399 n = 4;
33400 goto half;
33401
33402 case V4DImode:
33403 half_mode = V2DImode;
33404 j = 3;
33405 n = 2;
33406 goto half;
33407
33408 case V8SFmode:
33409 half_mode = V4SFmode;
33410 j = 4;
33411 n = 4;
33412 goto half;
33413
33414 case V4DFmode:
33415 half_mode = V2DFmode;
33416 j = 5;
33417 n = 2;
33418 goto half;
33419
33420 half:
33421 /* Compute offset. */
33422 i = elt / n;
33423 elt %= n;
33424
33425 gcc_assert (i <= 1);
33426
33427 /* Extract the half. */
33428 tmp = gen_reg_rtx (half_mode);
33429 emit_insn (gen_extract[j][i] (tmp, target));
33430
33431 /* Put val in tmp at elt. */
33432 ix86_expand_vector_set (false, tmp, val, elt);
33433
33434 /* Put it back. */
33435 emit_insn (gen_insert[j][i] (target, target, tmp));
33436 return;
33437
33438 default:
33439 break;
33440 }
33441
33442 if (use_vec_merge)
33443 {
33444 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33445 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33446 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33447 }
33448 else
33449 {
33450 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33451
33452 emit_move_insn (mem, target);
33453
33454 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33455 emit_move_insn (tmp, val);
33456
33457 emit_move_insn (target, mem);
33458 }
33459 }
33460
33461 void
33462 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33463 {
33464 enum machine_mode mode = GET_MODE (vec);
33465 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33466 bool use_vec_extr = false;
33467 rtx tmp;
33468
33469 switch (mode)
33470 {
33471 case V2SImode:
33472 case V2SFmode:
33473 if (!mmx_ok)
33474 break;
33475 /* FALLTHRU */
33476
33477 case V2DFmode:
33478 case V2DImode:
33479 use_vec_extr = true;
33480 break;
33481
33482 case V4SFmode:
33483 use_vec_extr = TARGET_SSE4_1;
33484 if (use_vec_extr)
33485 break;
33486
33487 switch (elt)
33488 {
33489 case 0:
33490 tmp = vec;
33491 break;
33492
33493 case 1:
33494 case 3:
33495 tmp = gen_reg_rtx (mode);
33496 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33497 GEN_INT (elt), GEN_INT (elt),
33498 GEN_INT (elt+4), GEN_INT (elt+4)));
33499 break;
33500
33501 case 2:
33502 tmp = gen_reg_rtx (mode);
33503 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33504 break;
33505
33506 default:
33507 gcc_unreachable ();
33508 }
33509 vec = tmp;
33510 use_vec_extr = true;
33511 elt = 0;
33512 break;
33513
33514 case V4SImode:
33515 use_vec_extr = TARGET_SSE4_1;
33516 if (use_vec_extr)
33517 break;
33518
33519 if (TARGET_SSE2)
33520 {
33521 switch (elt)
33522 {
33523 case 0:
33524 tmp = vec;
33525 break;
33526
33527 case 1:
33528 case 3:
33529 tmp = gen_reg_rtx (mode);
33530 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33531 GEN_INT (elt), GEN_INT (elt),
33532 GEN_INT (elt), GEN_INT (elt)));
33533 break;
33534
33535 case 2:
33536 tmp = gen_reg_rtx (mode);
33537 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33538 break;
33539
33540 default:
33541 gcc_unreachable ();
33542 }
33543 vec = tmp;
33544 use_vec_extr = true;
33545 elt = 0;
33546 }
33547 else
33548 {
33549 /* For SSE1, we have to reuse the V4SF code. */
33550 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33551 gen_lowpart (V4SFmode, vec), elt);
33552 return;
33553 }
33554 break;
33555
33556 case V8HImode:
33557 use_vec_extr = TARGET_SSE2;
33558 break;
33559 case V4HImode:
33560 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33561 break;
33562
33563 case V16QImode:
33564 use_vec_extr = TARGET_SSE4_1;
33565 break;
33566
33567 case V8SFmode:
33568 if (TARGET_AVX)
33569 {
33570 tmp = gen_reg_rtx (V4SFmode);
33571 if (elt < 4)
33572 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33573 else
33574 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33575 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33576 return;
33577 }
33578 break;
33579
33580 case V4DFmode:
33581 if (TARGET_AVX)
33582 {
33583 tmp = gen_reg_rtx (V2DFmode);
33584 if (elt < 2)
33585 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33586 else
33587 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33588 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33589 return;
33590 }
33591 break;
33592
33593 case V32QImode:
33594 if (TARGET_AVX)
33595 {
33596 tmp = gen_reg_rtx (V16QImode);
33597 if (elt < 16)
33598 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33599 else
33600 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33601 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33602 return;
33603 }
33604 break;
33605
33606 case V16HImode:
33607 if (TARGET_AVX)
33608 {
33609 tmp = gen_reg_rtx (V8HImode);
33610 if (elt < 8)
33611 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33612 else
33613 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33614 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33615 return;
33616 }
33617 break;
33618
33619 case V8SImode:
33620 if (TARGET_AVX)
33621 {
33622 tmp = gen_reg_rtx (V4SImode);
33623 if (elt < 4)
33624 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33625 else
33626 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33627 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33628 return;
33629 }
33630 break;
33631
33632 case V4DImode:
33633 if (TARGET_AVX)
33634 {
33635 tmp = gen_reg_rtx (V2DImode);
33636 if (elt < 2)
33637 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33638 else
33639 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33640 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33641 return;
33642 }
33643 break;
33644
33645 case V8QImode:
33646 /* ??? Could extract the appropriate HImode element and shift. */
33647 default:
33648 break;
33649 }
33650
33651 if (use_vec_extr)
33652 {
33653 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
33654 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
33655
33656 /* Let the rtl optimizers know about the zero extension performed. */
33657 if (inner_mode == QImode || inner_mode == HImode)
33658 {
33659 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
33660 target = gen_lowpart (SImode, target);
33661 }
33662
33663 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33664 }
33665 else
33666 {
33667 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33668
33669 emit_move_insn (mem, vec);
33670
33671 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33672 emit_move_insn (target, tmp);
33673 }
33674 }
33675
33676 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
33677 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
33678 The upper bits of DEST are undefined, though they shouldn't cause
33679 exceptions (some bits from src or all zeros are ok). */
33680
33681 static void
33682 emit_reduc_half (rtx dest, rtx src, int i)
33683 {
33684 rtx tem;
33685 switch (GET_MODE (src))
33686 {
33687 case V4SFmode:
33688 if (i == 128)
33689 tem = gen_sse_movhlps (dest, src, src);
33690 else
33691 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
33692 GEN_INT (1 + 4), GEN_INT (1 + 4));
33693 break;
33694 case V2DFmode:
33695 tem = gen_vec_interleave_highv2df (dest, src, src);
33696 break;
33697 case V16QImode:
33698 case V8HImode:
33699 case V4SImode:
33700 case V2DImode:
33701 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
33702 gen_lowpart (V1TImode, src),
33703 GEN_INT (i / 2));
33704 break;
33705 case V8SFmode:
33706 if (i == 256)
33707 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
33708 else
33709 tem = gen_avx_shufps256 (dest, src, src,
33710 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
33711 break;
33712 case V4DFmode:
33713 if (i == 256)
33714 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
33715 else
33716 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
33717 break;
33718 case V32QImode:
33719 case V16HImode:
33720 case V8SImode:
33721 case V4DImode:
33722 if (i == 256)
33723 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
33724 gen_lowpart (V4DImode, src),
33725 gen_lowpart (V4DImode, src),
33726 const1_rtx);
33727 else
33728 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
33729 gen_lowpart (V2TImode, src),
33730 GEN_INT (i / 2));
33731 break;
33732 default:
33733 gcc_unreachable ();
33734 }
33735 emit_insn (tem);
33736 }
33737
33738 /* Expand a vector reduction. FN is the binary pattern to reduce;
33739 DEST is the destination; IN is the input vector. */
33740
33741 void
33742 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
33743 {
33744 rtx half, dst, vec = in;
33745 enum machine_mode mode = GET_MODE (in);
33746 int i;
33747
33748 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
33749 if (TARGET_SSE4_1
33750 && mode == V8HImode
33751 && fn == gen_uminv8hi3)
33752 {
33753 emit_insn (gen_sse4_1_phminposuw (dest, in));
33754 return;
33755 }
33756
33757 for (i = GET_MODE_BITSIZE (mode);
33758 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
33759 i >>= 1)
33760 {
33761 half = gen_reg_rtx (mode);
33762 emit_reduc_half (half, vec, i);
33763 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
33764 dst = dest;
33765 else
33766 dst = gen_reg_rtx (mode);
33767 emit_insn (fn (dst, half, vec));
33768 vec = dst;
33769 }
33770 }
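/* As an illustration, a V4SImode signed-maximum reduction (FN being an
   smax pattern) proceeds roughly as follows, with shifted-in lanes
   being zero:

	vec  = { a, b, c, d }
	i == 128:  half = { c, d, 0, 0 },  vec = smax (half, vec)
	i ==  64:  half = { vec[1], vec[2], vec[3], 0 },
		   dest = smax (half, vec)

   Only element 0 of DEST holds the reduction result; the remaining
   elements are don't-care values.  */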
33771 \f
33772 /* Target hook for scalar_mode_supported_p. */
33773 static bool
33774 ix86_scalar_mode_supported_p (enum machine_mode mode)
33775 {
33776 if (DECIMAL_FLOAT_MODE_P (mode))
33777 return default_decimal_float_supported_p ();
33778 else if (mode == TFmode)
33779 return true;
33780 else
33781 return default_scalar_mode_supported_p (mode);
33782 }
33783
33784 /* Implements target hook vector_mode_supported_p. */
33785 static bool
33786 ix86_vector_mode_supported_p (enum machine_mode mode)
33787 {
33788 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33789 return true;
33790 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33791 return true;
33792 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33793 return true;
33794 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
33795 return true;
33796 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
33797 return true;
33798 return false;
33799 }
33800
33801 /* Target hook for c_mode_for_suffix. */
33802 static enum machine_mode
33803 ix86_c_mode_for_suffix (char suffix)
33804 {
33805 if (suffix == 'q')
33806 return TFmode;
33807 if (suffix == 'w')
33808 return XFmode;
33809
33810 return VOIDmode;
33811 }
33812
33813 /* Worker function for TARGET_MD_ASM_CLOBBERS.
33814
33815 We do this in the new i386 backend to maintain source compatibility
33816 with the old cc0-based compiler. */
33817
33818 static tree
33819 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
33820 tree inputs ATTRIBUTE_UNUSED,
33821 tree clobbers)
33822 {
33823 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
33824 clobbers);
33825 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
33826 clobbers);
33827 return clobbers;
33828 }
33829
33830 /* Implements target vector targetm.asm.encode_section_info. */
33831
33832 static void ATTRIBUTE_UNUSED
33833 ix86_encode_section_info (tree decl, rtx rtl, int first)
33834 {
33835 default_encode_section_info (decl, rtl, first);
33836
33837 if (TREE_CODE (decl) == VAR_DECL
33838 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
33839 && ix86_in_large_data_p (decl))
33840 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
33841 }
33842
33843 /* Worker function for REVERSE_CONDITION. */
33844
33845 enum rtx_code
33846 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
33847 {
33848 return (mode != CCFPmode && mode != CCFPUmode
33849 ? reverse_condition (code)
33850 : reverse_condition_maybe_unordered (code));
33851 }
33852
33853 /* Output code to perform an x87 FP register move, from OPERANDS[1]
33854 to OPERANDS[0]. */
33855
33856 const char *
33857 output_387_reg_move (rtx insn, rtx *operands)
33858 {
33859 if (REG_P (operands[0]))
33860 {
33861 if (REG_P (operands[1])
33862 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33863 {
33864 if (REGNO (operands[0]) == FIRST_STACK_REG)
33865 return output_387_ffreep (operands, 0);
33866 return "fstp\t%y0";
33867 }
33868 if (STACK_TOP_P (operands[0]))
33869 return "fld%Z1\t%y1";
33870 return "fst\t%y0";
33871 }
33872 else if (MEM_P (operands[0]))
33873 {
33874 gcc_assert (REG_P (operands[1]));
33875 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33876 return "fstp%Z0\t%y0";
33877 else
33878 {
33879 /* There is no non-popping store to memory for XFmode.
33880 So if we need one, follow the store with a load. */
33881 if (GET_MODE (operands[0]) == XFmode)
33882 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
33883 else
33884 return "fst%Z0\t%y0";
33885 }
33886 }
33887 else
33888 gcc_unreachable();
33889 }
33890
33891 /* Output code to perform a conditional jump to LABEL, if C2 flag in
33892 FP status register is set. */
33893
33894 void
33895 ix86_emit_fp_unordered_jump (rtx label)
33896 {
33897 rtx reg = gen_reg_rtx (HImode);
33898 rtx temp;
33899
33900 emit_insn (gen_x86_fnstsw_1 (reg));
33901
33902 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
33903 {
33904 emit_insn (gen_x86_sahf_1 (reg));
33905
33906 temp = gen_rtx_REG (CCmode, FLAGS_REG);
33907 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
33908 }
33909 else
33910 {
33911 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
33912
33913 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
33914 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
33915 }
33916
33917 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
33918 gen_rtx_LABEL_REF (VOIDmode, label),
33919 pc_rtx);
33920 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
33921
33922 emit_jump_insn (temp);
33923 predict_jump (REG_BR_PROB_BASE * 10 / 100);
33924 }
33925
33926 /* Output code to perform a log1p XFmode calculation. */
33927
33928 void ix86_emit_i387_log1p (rtx op0, rtx op1)
33929 {
33930 rtx label1 = gen_label_rtx ();
33931 rtx label2 = gen_label_rtx ();
33932
33933 rtx tmp = gen_reg_rtx (XFmode);
33934 rtx tmp2 = gen_reg_rtx (XFmode);
33935 rtx test;
33936
33937 emit_insn (gen_absxf2 (tmp, op1));
33938 test = gen_rtx_GE (VOIDmode, tmp,
33939 CONST_DOUBLE_FROM_REAL_VALUE (
33940 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
33941 XFmode));
33942 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
33943
33944 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
33945 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
33946 emit_jump (label2);
33947
33948 emit_label (label1);
33949 emit_move_insn (tmp, CONST1_RTX (XFmode));
33950 emit_insn (gen_addxf3 (tmp, op1, tmp));
33951 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
33952 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
33953
33954 emit_label (label2);
33955 }
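/* The threshold constant above is 1 - sqrt (2) / 2 ~= 0.2928932, the
   largest magnitude for which the x87 fyl2xp1 instruction is specified
   to be accurate.  In both branches the value computed is

	op0 = ln (2) * log2 (1 + op1) = log1p (op1)

   using fyl2xp1 directly for small |op1| and falling back to an
   explicit 1 + op1 followed by fyl2x otherwise; the fldln2 constant
   supplies the ln (2) factor.  */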
33956
33957 /* Emit code for round calculation. */
33958 void ix86_emit_i387_round (rtx op0, rtx op1)
33959 {
33960 enum machine_mode inmode = GET_MODE (op1);
33961 enum machine_mode outmode = GET_MODE (op0);
33962 rtx e1, e2, res, tmp, tmp1, half;
33963 rtx scratch = gen_reg_rtx (HImode);
33964 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
33965 rtx jump_label = gen_label_rtx ();
33966 rtx insn;
33967 rtx (*gen_abs) (rtx, rtx);
33968 rtx (*gen_neg) (rtx, rtx);
33969
33970 switch (inmode)
33971 {
33972 case SFmode:
33973 gen_abs = gen_abssf2;
33974 break;
33975 case DFmode:
33976 gen_abs = gen_absdf2;
33977 break;
33978 case XFmode:
33979 gen_abs = gen_absxf2;
33980 break;
33981 default:
33982 gcc_unreachable ();
33983 }
33984
33985 switch (outmode)
33986 {
33987 case SFmode:
33988 gen_neg = gen_negsf2;
33989 break;
33990 case DFmode:
33991 gen_neg = gen_negdf2;
33992 break;
33993 case XFmode:
33994 gen_neg = gen_negxf2;
33995 break;
33996 case HImode:
33997 gen_neg = gen_neghi2;
33998 break;
33999 case SImode:
34000 gen_neg = gen_negsi2;
34001 break;
34002 case DImode:
34003 gen_neg = gen_negdi2;
34004 break;
34005 default:
34006 gcc_unreachable ();
34007 }
34008
34009 e1 = gen_reg_rtx (inmode);
34010 e2 = gen_reg_rtx (inmode);
34011 res = gen_reg_rtx (outmode);
34012
34013 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34014
34015 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34016
34017 /* scratch = fxam(op1) */
34018 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34019 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34020 UNSPEC_FXAM)));
34021 /* e1 = fabs(op1) */
34022 emit_insn (gen_abs (e1, op1));
34023
34024 /* e2 = e1 + 0.5 */
34025 half = force_reg (inmode, half);
34026 emit_insn (gen_rtx_SET (VOIDmode, e2,
34027 gen_rtx_PLUS (inmode, e1, half)));
34028
34029 /* res = floor(e2) */
34030 if (inmode != XFmode)
34031 {
34032 tmp1 = gen_reg_rtx (XFmode);
34033
34034 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34035 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34036 }
34037 else
34038 tmp1 = e2;
34039
34040 switch (outmode)
34041 {
34042 case SFmode:
34043 case DFmode:
34044 {
34045 rtx tmp0 = gen_reg_rtx (XFmode);
34046
34047 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34048
34049 emit_insn (gen_rtx_SET (VOIDmode, res,
34050 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34051 UNSPEC_TRUNC_NOOP)));
34052 }
34053 break;
34054 case XFmode:
34055 emit_insn (gen_frndintxf2_floor (res, tmp1));
34056 break;
34057 case HImode:
34058 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34059 break;
34060 case SImode:
34061 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34062 break;
34063 case DImode:
34064 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34065 break;
34066 default:
34067 gcc_unreachable ();
34068 }
34069
34070 /* flags = signbit(a) */
34071 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34072
34073 /* if (flags) then res = -res */
34074 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34075 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34076 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34077 pc_rtx);
34078 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34079 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34080 JUMP_LABEL (insn) = jump_label;
34081
34082 emit_insn (gen_neg (res, res));
34083
34084 emit_label (jump_label);
34085 LABEL_NUSES (jump_label) = 1;
34086
34087 emit_move_insn (op0, res);
34088 }
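/* Worked example for op1 == -2.5: fabs gives 2.5, adding 0.5 gives 3.0,
   floor gives 3, and the fxam sign bit (C1, mask 0x02 of the high status
   byte) triggers the negation, so the result is -3.  Halfway cases are
   therefore rounded away from zero, as round () requires, in contrast
   to rint ().  */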
34089
34090 /* Output code to perform a Newton-Raphson approximation of a single precision
34091 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34092
34093 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34094 {
34095 rtx x0, x1, e0, e1;
34096
34097 x0 = gen_reg_rtx (mode);
34098 e0 = gen_reg_rtx (mode);
34099 e1 = gen_reg_rtx (mode);
34100 x1 = gen_reg_rtx (mode);
34101
34102 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
34103
34104 b = force_reg (mode, b);
34105
34106 /* x0 = rcp(b) estimate */
34107 emit_insn (gen_rtx_SET (VOIDmode, x0,
34108 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34109 UNSPEC_RCP)));
34110 /* e0 = x0 * b */
34111 emit_insn (gen_rtx_SET (VOIDmode, e0,
34112 gen_rtx_MULT (mode, x0, b)));
34113
34114 /* e0 = x0 * e0 */
34115 emit_insn (gen_rtx_SET (VOIDmode, e0,
34116 gen_rtx_MULT (mode, x0, e0)));
34117
34118 /* e1 = x0 + x0 */
34119 emit_insn (gen_rtx_SET (VOIDmode, e1,
34120 gen_rtx_PLUS (mode, x0, x0)));
34121
34122 /* x1 = e1 - e0 */
34123 emit_insn (gen_rtx_SET (VOIDmode, x1,
34124 gen_rtx_MINUS (mode, e1, e0)));
34125
34126 /* res = a * x1 */
34127 emit_insn (gen_rtx_SET (VOIDmode, res,
34128 gen_rtx_MULT (mode, a, x1)));
34129 }
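/* This is the standard Newton-Raphson refinement of the hardware
   reciprocal estimate: with x0 ~= 1/b,

	x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   which is exactly the e1 - e0 computed above.  Each step roughly
   squares the relative error, so one step after the ~12-bit RCP
   estimate brings the result close to single-precision accuracy.  */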
34130
34131 /* Output code to perform a Newton-Raphson approximation of a
34132 single precision floating point [reciprocal] square root. */
34133
34134 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34135 bool recip)
34136 {
34137 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34138 REAL_VALUE_TYPE r;
34139
34140 x0 = gen_reg_rtx (mode);
34141 e0 = gen_reg_rtx (mode);
34142 e1 = gen_reg_rtx (mode);
34143 e2 = gen_reg_rtx (mode);
34144 e3 = gen_reg_rtx (mode);
34145
34146 real_from_integer (&r, VOIDmode, -3, -1, 0);
34147 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34148
34149 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34150 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34151
34152 if (VECTOR_MODE_P (mode))
34153 {
34154 mthree = ix86_build_const_vector (mode, true, mthree);
34155 mhalf = ix86_build_const_vector (mode, true, mhalf);
34156 }
34157
34158 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34159 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
34160
34161 a = force_reg (mode, a);
34162
34163 /* x0 = rsqrt(a) estimate */
34164 emit_insn (gen_rtx_SET (VOIDmode, x0,
34165 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34166 UNSPEC_RSQRT)));
34167
34168 /* If a == 0.0, filter out the infinite rsqrt estimate so that sqrt (0.0) does not produce a NaN. */
34169 if (!recip)
34170 {
34171 rtx zero, mask;
34172
34173 zero = gen_reg_rtx (mode);
34174 mask = gen_reg_rtx (mode);
34175
34176 zero = force_reg (mode, CONST0_RTX(mode));
34177 emit_insn (gen_rtx_SET (VOIDmode, mask,
34178 gen_rtx_NE (mode, zero, a)));
34179
34180 emit_insn (gen_rtx_SET (VOIDmode, x0,
34181 gen_rtx_AND (mode, x0, mask)));
34182 }
34183
34184 /* e0 = x0 * a */
34185 emit_insn (gen_rtx_SET (VOIDmode, e0,
34186 gen_rtx_MULT (mode, x0, a)));
34187 /* e1 = e0 * x0 */
34188 emit_insn (gen_rtx_SET (VOIDmode, e1,
34189 gen_rtx_MULT (mode, e0, x0)));
34190
34191 /* e2 = e1 - 3. */
34192 mthree = force_reg (mode, mthree);
34193 emit_insn (gen_rtx_SET (VOIDmode, e2,
34194 gen_rtx_PLUS (mode, e1, mthree)));
34195
34196 mhalf = force_reg (mode, mhalf);
34197 if (recip)
34198 /* e3 = -.5 * x0 */
34199 emit_insn (gen_rtx_SET (VOIDmode, e3,
34200 gen_rtx_MULT (mode, x0, mhalf)));
34201 else
34202 /* e3 = -.5 * e0 */
34203 emit_insn (gen_rtx_SET (VOIDmode, e3,
34204 gen_rtx_MULT (mode, e0, mhalf)));
34205 /* ret = e2 * e3 */
34206 emit_insn (gen_rtx_SET (VOIDmode, res,
34207 gen_rtx_MULT (mode, e2, e3)));
34208 }
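/* Likewise the standard Newton-Raphson step for the reciprocal square
   root: with x0 ~= 1/sqrt (a),

	x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
	   = -0.5 * x0 * (a * x0 * x0 - 3.0)

   matching e2 * e3 above.  For the non-reciprocal case e0 = a * x0 is
   used in the final multiply instead of x0, giving a * x1 ~= sqrt (a).
   The a == 0.0 masking is needed because the estimate is infinite
   there and 0 * inf would otherwise produce a NaN.  */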
34209
34210 #ifdef TARGET_SOLARIS
34211 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34212
34213 static void
34214 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34215 tree decl)
34216 {
34217 /* With Binutils 2.15, the "@unwind" marker must be specified on
34218 every occurrence of the ".eh_frame" section, not just the first
34219 one. */
34220 if (TARGET_64BIT
34221 && strcmp (name, ".eh_frame") == 0)
34222 {
34223 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34224 flags & SECTION_WRITE ? "aw" : "a");
34225 return;
34226 }
34227
34228 #ifndef USE_GAS
34229 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34230 {
34231 solaris_elf_asm_comdat_section (name, flags, decl);
34232 return;
34233 }
34234 #endif
34235
34236 default_elf_asm_named_section (name, flags, decl);
34237 }
34238 #endif /* TARGET_SOLARIS */
34239
34240 /* Return the mangling of TYPE if it is an extended fundamental type. */
34241
34242 static const char *
34243 ix86_mangle_type (const_tree type)
34244 {
34245 type = TYPE_MAIN_VARIANT (type);
34246
34247 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34248 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34249 return NULL;
34250
34251 switch (TYPE_MODE (type))
34252 {
34253 case TFmode:
34254 /* __float128 is "g". */
34255 return "g";
34256 case XFmode:
34257 /* "long double" or __float80 is "e". */
34258 return "e";
34259 default:
34260 return NULL;
34261 }
34262 }
34263
34264 /* For 32-bit code we can save PIC register setup by using
34265 __stack_chk_fail_local hidden function instead of calling
34266 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34267 register, so it is better to call __stack_chk_fail directly. */
34268
34269 static tree ATTRIBUTE_UNUSED
34270 ix86_stack_protect_fail (void)
34271 {
34272 return TARGET_64BIT
34273 ? default_external_stack_protect_fail ()
34274 : default_hidden_stack_protect_fail ();
34275 }
34276
34277 /* Select a format to encode pointers in exception handling data. CODE
34278 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34279 true if the symbol may be affected by dynamic relocations.
34280
34281 ??? All x86 object file formats are capable of representing this.
34282 After all, the relocation needed is the same as for the call insn.
34283 Whether or not a particular assembler allows us to enter such, I
34284 guess we'll have to see. */
34285 int
34286 asm_preferred_eh_data_format (int code, int global)
34287 {
34288 if (flag_pic)
34289 {
34290 int type = DW_EH_PE_sdata8;
34291 if (!TARGET_64BIT
34292 || ix86_cmodel == CM_SMALL_PIC
34293 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34294 type = DW_EH_PE_sdata4;
34295 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34296 }
34297 if (ix86_cmodel == CM_SMALL
34298 || (ix86_cmodel == CM_MEDIUM && code))
34299 return DW_EH_PE_udata4;
34300 return DW_EH_PE_absptr;
34301 }
34302 \f
34303 /* Expand copysign from SIGN to the positive value ABS_VALUE
34304 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34305 the sign-bit. */
34306 static void
34307 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34308 {
34309 enum machine_mode mode = GET_MODE (sign);
34310 rtx sgn = gen_reg_rtx (mode);
34311 if (mask == NULL_RTX)
34312 {
34313 enum machine_mode vmode;
34314
34315 if (mode == SFmode)
34316 vmode = V4SFmode;
34317 else if (mode == DFmode)
34318 vmode = V2DFmode;
34319 else
34320 vmode = mode;
34321
34322 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34323 if (!VECTOR_MODE_P (mode))
34324 {
34325 /* We need to generate a scalar mode mask in this case. */
34326 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34327 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34328 mask = gen_reg_rtx (mode);
34329 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34330 }
34331 }
34332 else
34333 mask = gen_rtx_NOT (mode, mask);
34334 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34335 gen_rtx_AND (mode, mask, sign)));
34336 emit_insn (gen_rtx_SET (VOIDmode, result,
34337 gen_rtx_IOR (mode, abs_value, sgn)));
34338 }
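/* In effect this computes RESULT = ABS_VALUE | (SIGN & signbit_mask),
   assuming ABS_VALUE already has its sign bit clear; for example
   copysign (3.0, -1.0) ORs the extracted sign bit of -1.0 into 3.0,
   yielding -3.0.  A MASK passed in by the caller is the fabs mask
   (all bits except the sign), so it is complemented first to recover
   the sign-bit-only mask.  */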
34339
34340 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34341 mask for masking out the sign-bit is stored in *SMASK, if that is
34342 non-null. */
34343 static rtx
34344 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34345 {
34346 enum machine_mode vmode, mode = GET_MODE (op0);
34347 rtx xa, mask;
34348
34349 xa = gen_reg_rtx (mode);
34350 if (mode == SFmode)
34351 vmode = V4SFmode;
34352 else if (mode == DFmode)
34353 vmode = V2DFmode;
34354 else
34355 vmode = mode;
34356 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34357 if (!VECTOR_MODE_P (mode))
34358 {
34359 /* We need to generate a scalar mode mask in this case. */
34360 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34361 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34362 mask = gen_reg_rtx (mode);
34363 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34364 }
34365 emit_insn (gen_rtx_SET (VOIDmode, xa,
34366 gen_rtx_AND (mode, op0, mask)));
34367
34368 if (smask)
34369 *smask = mask;
34370
34371 return xa;
34372 }
34373
34374 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34375 swapping the operands if SWAP_OPERANDS is true. The expanded
34376 code is a forward jump to a newly created label in case the
34377 comparison is true. The generated label rtx is returned. */
34378 static rtx
34379 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34380 bool swap_operands)
34381 {
34382 rtx label, tmp;
34383
34384 if (swap_operands)
34385 {
34386 tmp = op0;
34387 op0 = op1;
34388 op1 = tmp;
34389 }
34390
34391 label = gen_label_rtx ();
34392 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34393 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34394 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34395 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34396 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34397 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34398 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34399 JUMP_LABEL (tmp) = label;
34400
34401 return label;
34402 }
34403
34404 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34405 using comparison code CODE. Operands are swapped for the comparison if
34406 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34407 static rtx
34408 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34409 bool swap_operands)
34410 {
34411 rtx (*insn)(rtx, rtx, rtx, rtx);
34412 enum machine_mode mode = GET_MODE (op0);
34413 rtx mask = gen_reg_rtx (mode);
34414
34415 if (swap_operands)
34416 {
34417 rtx tmp = op0;
34418 op0 = op1;
34419 op1 = tmp;
34420 }
34421
34422 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34423
34424 emit_insn (insn (mask, op0, op1,
34425 gen_rtx_fmt_ee (code, mode, op0, op1)));
34426 return mask;
34427 }
34428
34429 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34430 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34431 static rtx
34432 ix86_gen_TWO52 (enum machine_mode mode)
34433 {
34434 REAL_VALUE_TYPE TWO52r;
34435 rtx TWO52;
34436
34437 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34438 TWO52 = const_double_from_real_value (TWO52r, mode);
34439 TWO52 = force_reg (mode, TWO52);
34440
34441 return TWO52;
34442 }
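/* Rationale for this constant: for 0 <= x < 2**52 in DFmode (2**23 in
   SFmode), x + TWO52 leaves no mantissa bits for the fraction, so the
   addition itself rounds x to an integer in the current rounding mode
   and (x + TWO52) - TWO52 recovers that integer exactly.  E.g.
   3.7 + 2**52 rounds to 4.0 + 2**52, and subtracting 2**52 leaves 4.0.
   Values with |x| >= 2**52 are already integral.  */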
34443
34444 /* Expand SSE sequence for computing lround from OP1 storing
34445 into OP0. */
34446 void
34447 ix86_expand_lround (rtx op0, rtx op1)
34448 {
34449 /* C code for the stuff we're doing below:
34450 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34451 return (long)tmp;
34452 */
34453 enum machine_mode mode = GET_MODE (op1);
34454 const struct real_format *fmt;
34455 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34456 rtx adj;
34457
34458 /* load nextafter (0.5, 0.0) */
34459 fmt = REAL_MODE_FORMAT (mode);
34460 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34461 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34462
34463 /* adj = copysign (0.5, op1) */
34464 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34465 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34466
34467 /* adj = op1 + adj */
34468 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34469
34470 /* op0 = (imode)adj */
34471 expand_fix (op0, adj, 0);
34472 }
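/* nextafter (0.5, 0.0) is used instead of 0.5 to avoid a wrong result
   for values just below one half: for x equal to the largest double
   below 0.5, x + 0.5 rounds up to exactly 1.0 (round to nearest, ties
   to even) and lround would return 1, whereas x plus the slightly
   smaller constant stays below 1.0 and converts to the correct 0.  */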
34473
34474 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
34475 into OPERAND0. */
34476 void
34477 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34478 {
34479 /* C code for the stuff we're doing below (for do_floor):
34480 xi = (long)op1;
34481 xi -= (double)xi > op1 ? 1 : 0;
34482 return xi;
34483 */
34484 enum machine_mode fmode = GET_MODE (op1);
34485 enum machine_mode imode = GET_MODE (op0);
34486 rtx ireg, freg, label, tmp;
34487
34488 /* reg = (long)op1 */
34489 ireg = gen_reg_rtx (imode);
34490 expand_fix (ireg, op1, 0);
34491
34492 /* freg = (double)reg */
34493 freg = gen_reg_rtx (fmode);
34494 expand_float (freg, ireg, 0);
34495
34496 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34497 label = ix86_expand_sse_compare_and_jump (UNLE,
34498 freg, op1, !do_floor);
34499 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34500 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34501 emit_move_insn (ireg, tmp);
34502
34503 emit_label (label);
34504 LABEL_NUSES (label) = 1;
34505
34506 emit_move_insn (op0, ireg);
34507 }
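/* Worked example for do_floor with op1 == -1.5: the fix_trunc gives
   ireg == -1 (truncation toward zero), (double) -1 == -1.0 is greater
   than -1.5, so 1 is subtracted and the result is -2 == floor (-1.5).
   For the ceiling variant the comparison operands are swapped and 1 is
   added instead.  */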
34508
34509 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34510 result in OPERAND0. */
34511 void
34512 ix86_expand_rint (rtx operand0, rtx operand1)
34513 {
34514 /* C code for the stuff we're doing below:
34515 xa = fabs (operand1);
34516 if (!isless (xa, 2**52))
34517 return operand1;
34518 xa = xa + 2**52 - 2**52;
34519 return copysign (xa, operand1);
34520 */
34521 enum machine_mode mode = GET_MODE (operand0);
34522 rtx res, xa, label, TWO52, mask;
34523
34524 res = gen_reg_rtx (mode);
34525 emit_move_insn (res, operand1);
34526
34527 /* xa = abs (operand1) */
34528 xa = ix86_expand_sse_fabs (res, &mask);
34529
34530 /* if (!isless (xa, TWO52)) goto label; */
34531 TWO52 = ix86_gen_TWO52 (mode);
34532 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34533
34534 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34535 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34536
34537 ix86_sse_copysign_to_positive (res, xa, res, mask);
34538
34539 emit_label (label);
34540 LABEL_NUSES (label) = 1;
34541
34542 emit_move_insn (operand0, res);
34543 }
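/* E.g. for operand1 == -2.5: xa == 2.5, and xa + 2**52 rounds to
   2.0 + 2**52 under the default round-to-nearest-even mode, so after
   the subtraction and the copysign the result is -2.0.  rint () thus
   follows the current rounding mode and rounds halfway cases to even,
   unlike round ().  */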
34544
34545 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34546 into OPERAND0. */
34547 void
34548 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34549 {
34550 /* C code for the stuff we expand below.
34551 double xa = fabs (x), x2;
34552 if (!isless (xa, TWO52))
34553 return x;
34554 xa = xa + TWO52 - TWO52;
34555 x2 = copysign (xa, x);
34556 Compensate. Floor:
34557 if (x2 > x)
34558 x2 -= 1;
34559 Compensate. Ceil:
34560 if (x2 < x)
34561 x2 -= -1;
34562 return x2;
34563 */
34564 enum machine_mode mode = GET_MODE (operand0);
34565 rtx xa, TWO52, tmp, label, one, res, mask;
34566
34567 TWO52 = ix86_gen_TWO52 (mode);
34568
34569 /* Temporary for holding the result, initialized to the input
34570 operand to ease control flow. */
34571 res = gen_reg_rtx (mode);
34572 emit_move_insn (res, operand1);
34573
34574 /* xa = abs (operand1) */
34575 xa = ix86_expand_sse_fabs (res, &mask);
34576
34577 /* if (!isless (xa, TWO52)) goto label; */
34578 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34579
34580 /* xa = xa + TWO52 - TWO52; */
34581 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34582 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34583
34584 /* xa = copysign (xa, operand1) */
34585 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34586
34587 /* generate 1.0 or -1.0 */
34588 one = force_reg (mode,
34589 const_double_from_real_value (do_floor
34590 ? dconst1 : dconstm1, mode));
34591
34592 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34593 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34594 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34595 gen_rtx_AND (mode, one, tmp)));
34596 /* We always need to subtract here to preserve signed zero. */
34597 tmp = expand_simple_binop (mode, MINUS,
34598 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34599 emit_move_insn (res, tmp);
34600
34601 emit_label (label);
34602 LABEL_NUSES (label) = 1;
34603
34604 emit_move_insn (operand0, res);
34605 }
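/* Worked example for do_floor with operand1 == 2.7: the TWO52 add/sub
   rounds xa to 3.0, the UNGT mask (3.0 > 2.7) selects 1.0, and the
   subtraction gives 2.0 == floor (2.7).  For the ceiling case the
   constant is -1.0 and the comparison operands are swapped; always
   subtracting (rather than adding for ceil) is what keeps a -0.0
   input intact.  */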
34606
34607 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34608 into OPERAND0. */
34609 void
34610 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34611 {
34612 /* C code for the stuff we expand below.
34613 double xa = fabs (x), x2;
34614 if (!isless (xa, TWO52))
34615 return x;
34616 x2 = (double)(long)x;
34617 Compensate. Floor:
34618 if (x2 > x)
34619 x2 -= 1;
34620 Compensate. Ceil:
34621 if (x2 < x)
34622 x2 += 1;
34623 if (HONOR_SIGNED_ZEROS (mode))
34624 return copysign (x2, x);
34625 return x2;
34626 */
34627 enum machine_mode mode = GET_MODE (operand0);
34628 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34629
34630 TWO52 = ix86_gen_TWO52 (mode);
34631
34632 /* Temporary for holding the result, initialized to the input
34633 operand to ease control flow. */
34634 res = gen_reg_rtx (mode);
34635 emit_move_insn (res, operand1);
34636
34637 /* xa = abs (operand1) */
34638 xa = ix86_expand_sse_fabs (res, &mask);
34639
34640 /* if (!isless (xa, TWO52)) goto label; */
34641 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34642
34643 /* xa = (double)(long)x */
34644 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34645 expand_fix (xi, res, 0);
34646 expand_float (xa, xi, 0);
34647
34648 /* generate 1.0 */
34649 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34650
34651 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34652 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34653 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34654 gen_rtx_AND (mode, one, tmp)));
34655 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
34656 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34657 emit_move_insn (res, tmp);
34658
34659 if (HONOR_SIGNED_ZEROS (mode))
34660 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34661
34662 emit_label (label);
34663 LABEL_NUSES (label) = 1;
34664
34665 emit_move_insn (operand0, res);
34666 }
34667
34668 /* Expand SSE sequence for computing round from OPERAND1 storing
34669 into OPERAND0. Sequence that works without relying on DImode truncation
34670 via cvttsd2siq that is only available on 64bit targets. */
34671 void
34672 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
34673 {
34674 /* C code for the stuff we expand below.
34675 double xa = fabs (x), xa2, x2;
34676 if (!isless (xa, TWO52))
34677 return x;
34678 Using the absolute value and copying back sign makes
34679 -0.0 -> -0.0 correct.
34680 xa2 = xa + TWO52 - TWO52;
34681 Compensate.
34682 dxa = xa2 - xa;
34683 if (dxa <= -0.5)
34684 xa2 += 1;
34685 else if (dxa > 0.5)
34686 xa2 -= 1;
34687 x2 = copysign (xa2, x);
34688 return x2;
34689 */
34690 enum machine_mode mode = GET_MODE (operand0);
34691 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
34692
34693 TWO52 = ix86_gen_TWO52 (mode);
34694
34695 /* Temporary for holding the result, initialized to the input
34696 operand to ease control flow. */
34697 res = gen_reg_rtx (mode);
34698 emit_move_insn (res, operand1);
34699
34700 /* xa = abs (operand1) */
34701 xa = ix86_expand_sse_fabs (res, &mask);
34702
34703 /* if (!isless (xa, TWO52)) goto label; */
34704 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34705
34706 /* xa2 = xa + TWO52 - TWO52; */
34707 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34708 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
34709
34710 /* dxa = xa2 - xa; */
34711 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
34712
34713 /* generate 0.5, 1.0 and -0.5 */
34714 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
34715 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
34716 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
34717 0, OPTAB_DIRECT);
34718
34719 /* Compensate. */
34720 tmp = gen_reg_rtx (mode);
34721 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
34722 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
34723 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34724 gen_rtx_AND (mode, one, tmp)));
34725 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34726 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
34727 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
34728 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34729 gen_rtx_AND (mode, one, tmp)));
34730 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34731
34732 /* res = copysign (xa2, operand1) */
34733 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
34734
34735 emit_label (label);
34736 LABEL_NUSES (label) = 1;
34737
34738 emit_move_insn (operand0, res);
34739 }
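/* Worked example: for xa == 2.5 the TWO52 add/sub gives xa2 == 2.0
   (ties to even), so dxa == -0.5; the dxa <= -0.5 test fires, 1.0 is
   added back and the value becomes 3.0 before the sign is restored,
   matching round's halfway-away-from-zero behavior.  For xa == 3.5,
   xa2 is already 4.0 and dxa == 0.5 does not exceed 0.5, so no
   adjustment is needed.  */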
34740
34741 /* Expand SSE sequence for computing trunc from OPERAND1 storing
34742 into OPERAND0. */
34743 void
34744 ix86_expand_trunc (rtx operand0, rtx operand1)
34745 {
34746 /* C code for SSE variant we expand below.
34747 double xa = fabs (x), x2;
34748 if (!isless (xa, TWO52))
34749 return x;
34750 x2 = (double)(long)x;
34751 if (HONOR_SIGNED_ZEROS (mode))
34752 return copysign (x2, x);
34753 return x2;
34754 */
34755 enum machine_mode mode = GET_MODE (operand0);
34756 rtx xa, xi, TWO52, label, res, mask;
34757
34758 TWO52 = ix86_gen_TWO52 (mode);
34759
34760 /* Temporary for holding the result, initialized to the input
34761 operand to ease control flow. */
34762 res = gen_reg_rtx (mode);
34763 emit_move_insn (res, operand1);
34764
34765 /* xa = abs (operand1) */
34766 xa = ix86_expand_sse_fabs (res, &mask);
34767
34768 /* if (!isless (xa, TWO52)) goto label; */
34769 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34770
34771 /* x = (double)(long)x */
34772 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34773 expand_fix (xi, res, 0);
34774 expand_float (res, xi, 0);
34775
34776 if (HONOR_SIGNED_ZEROS (mode))
34777 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34778
34779 emit_label (label);
34780 LABEL_NUSES (label) = 1;
34781
34782 emit_move_insn (operand0, res);
34783 }
34784
34785 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
34786 OPERAND0, without relying on DImode truncation via cvttsd2siq. */
34787 void
34788 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
34789 {
34790 enum machine_mode mode = GET_MODE (operand0);
34791 rtx xa, mask, TWO52, label, one, res, smask, tmp;
34792
34793 /* C code for SSE variant we expand below.
34794 double xa = fabs (x), x2;
34795 if (!isless (xa, TWO52))
34796 return x;
34797 xa2 = xa + TWO52 - TWO52;
34798 Compensate:
34799 if (xa2 > xa)
34800 xa2 -= 1.0;
34801 x2 = copysign (xa2, x);
34802 return x2;
34803 */
34804
34805 TWO52 = ix86_gen_TWO52 (mode);
34806
34807 /* Temporary for holding the result, initialized to the input
34808 operand to ease control flow. */
34809 res = gen_reg_rtx (mode);
34810 emit_move_insn (res, operand1);
34811
34812 /* xa = abs (operand1) */
34813 xa = ix86_expand_sse_fabs (res, &smask);
34814
34815 /* if (!isless (xa, TWO52)) goto label; */
34816 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34817
34818 /* res = xa + TWO52 - TWO52; */
34819 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34820 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
34821 emit_move_insn (res, tmp);
34822
34823 /* generate 1.0 */
34824 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34825
34826 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
34827 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
34828 emit_insn (gen_rtx_SET (VOIDmode, mask,
34829 gen_rtx_AND (mode, mask, one)));
34830 tmp = expand_simple_binop (mode, MINUS,
34831 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
34832 emit_move_insn (res, tmp);
34833
34834 /* res = copysign (res, operand1) */
34835 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
34836
34837 emit_label (label);
34838 LABEL_NUSES (label) = 1;
34839
34840 emit_move_insn (operand0, res);
34841 }
34842
34843 /* Expand SSE sequence for computing round from OPERAND1 storing
34844 into OPERAND0. */
34845 void
34846 ix86_expand_round (rtx operand0, rtx operand1)
34847 {
34848 /* C code for the stuff we're doing below:
34849 double xa = fabs (x);
34850 if (!isless (xa, TWO52))
34851 return x;
34852 xa = (double)(long)(xa + nextafter (0.5, 0.0));
34853 return copysign (xa, x);
34854 */
34855 enum machine_mode mode = GET_MODE (operand0);
34856 rtx res, TWO52, xa, label, xi, half, mask;
34857 const struct real_format *fmt;
34858 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34859
34860 /* Temporary for holding the result, initialized to the input
34861 operand to ease control flow. */
34862 res = gen_reg_rtx (mode);
34863 emit_move_insn (res, operand1);
34864
34865 TWO52 = ix86_gen_TWO52 (mode);
34866 xa = ix86_expand_sse_fabs (res, &mask);
34867 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34868
34869 /* load nextafter (0.5, 0.0) */
34870 fmt = REAL_MODE_FORMAT (mode);
34871 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34872 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34873
34874 /* xa = xa + 0.5 */
34875 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
34876 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
34877
34878 /* xa = (double)(int64_t)xa */
34879 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34880 expand_fix (xi, xa, 0);
34881 expand_float (xa, xi, 0);
34882
34883 /* res = copysign (xa, operand1) */
34884 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
34885
34886 emit_label (label);
34887 LABEL_NUSES (label) = 1;
34888
34889 emit_move_insn (operand0, res);
34890 }
34891
34892 /* Expand SSE sequence for computing round
34893 from OP1 storing into OP0 using sse4 round insn. */
34894 void
34895 ix86_expand_round_sse4 (rtx op0, rtx op1)
34896 {
34897 enum machine_mode mode = GET_MODE (op0);
34898 rtx e1, e2, res, half;
34899 const struct real_format *fmt;
34900 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34901 rtx (*gen_copysign) (rtx, rtx, rtx);
34902 rtx (*gen_round) (rtx, rtx, rtx);
34903
34904 switch (mode)
34905 {
34906 case SFmode:
34907 gen_copysign = gen_copysignsf3;
34908 gen_round = gen_sse4_1_roundsf2;
34909 break;
34910 case DFmode:
34911 gen_copysign = gen_copysigndf3;
34912 gen_round = gen_sse4_1_rounddf2;
34913 break;
34914 default:
34915 gcc_unreachable ();
34916 }
34917
34918 /* round (a) = trunc (a + copysign (0.5, a)) */
34919
34920 /* load nextafter (0.5, 0.0) */
34921 fmt = REAL_MODE_FORMAT (mode);
34922 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34923 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34924 half = const_double_from_real_value (pred_half, mode);
34925
34926 /* e1 = copysign (0.5, op1) */
34927 e1 = gen_reg_rtx (mode);
34928 emit_insn (gen_copysign (e1, half, op1));
34929
34930 /* e2 = op1 + e1 */
34931 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
34932
34933 /* res = trunc (e2) */
34934 res = gen_reg_rtx (mode);
34935 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
34936
34937 emit_move_insn (op0, res);
34938 }
34939 \f
34940
34941 /* Table of valid machine attributes. */
34942 static const struct attribute_spec ix86_attribute_table[] =
34943 {
34944 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
34945 affects_type_identity } */
34946 /* Stdcall attribute says callee is responsible for popping arguments
34947 if they are not variable. */
34948 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34949 true },
34950 /* Fastcall attribute says callee is responsible for popping arguments
34951 if they are not variable. */
34952 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34953 true },
34954 /* Thiscall attribute says callee is responsible for popping arguments
34955 if they are not variable. */
34956 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34957 true },
34958 /* Cdecl attribute says the callee is a normal C declaration */
34959 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34960 true },
34961 /* Regparm attribute specifies how many integer arguments are to be
34962 passed in registers. */
34963 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
34964 true },
34965 /* Sseregparm attribute says we are using x86_64 calling conventions
34966 for FP arguments. */
34967 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
34968 true },
34969 /* The transactional memory builtins are implicitly regparm or fastcall
34970 depending on the ABI. Override the generic do-nothing attribute that
34971 these builtins were declared with. */
34972 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
34973 true },
34974 /* force_align_arg_pointer says this function realigns the stack at entry. */
34975 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
34976 false, true, true, ix86_handle_cconv_attribute, false },
34977 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
34978 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
34979 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
34980 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
34981 false },
34982 #endif
34983 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
34984 false },
34985 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
34986 false },
34987 #ifdef SUBTARGET_ATTRIBUTE_TABLE
34988 SUBTARGET_ATTRIBUTE_TABLE,
34989 #endif
34990 /* ms_abi and sysv_abi calling convention function attributes. */
34991 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
34992 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
34993 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
34994 false },
34995 { "callee_pop_aggregate_return", 1, 1, false, true, true,
34996 ix86_handle_callee_pop_aggregate_return, true },
34997 /* End element. */
34998 { NULL, 0, 0, false, false, false, NULL, false }
34999 };
35000
35001 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35002 static int
35003 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35004 tree vectype ATTRIBUTE_UNUSED,
35005 int misalign ATTRIBUTE_UNUSED)
35006 {
35007 switch (type_of_cost)
35008 {
35009 case scalar_stmt:
35010 return ix86_cost->scalar_stmt_cost;
35011
35012 case scalar_load:
35013 return ix86_cost->scalar_load_cost;
35014
35015 case scalar_store:
35016 return ix86_cost->scalar_store_cost;
35017
35018 case vector_stmt:
35019 return ix86_cost->vec_stmt_cost;
35020
35021 case vector_load:
35022 return ix86_cost->vec_align_load_cost;
35023
35024 case vector_store:
35025 return ix86_cost->vec_store_cost;
35026
35027 case vec_to_scalar:
35028 return ix86_cost->vec_to_scalar_cost;
35029
35030 case scalar_to_vec:
35031 return ix86_cost->scalar_to_vec_cost;
35032
35033 case unaligned_load:
35034 case unaligned_store:
35035 return ix86_cost->vec_unalign_load_cost;
35036
35037 case cond_branch_taken:
35038 return ix86_cost->cond_taken_branch_cost;
35039
35040 case cond_branch_not_taken:
35041 return ix86_cost->cond_not_taken_branch_cost;
35042
35043 case vec_perm:
35044 return 1;
35045
35046 default:
35047 gcc_unreachable ();
35048 }
35049 }
35050
35051
35052 /* Return a vector mode with twice as many elements as VMODE. */
35053 /* ??? Consider moving this to a table generated by genmodes.c. */
35054
35055 static enum machine_mode
35056 doublesize_vector_mode (enum machine_mode vmode)
35057 {
35058 switch (vmode)
35059 {
35060 case V2SFmode: return V4SFmode;
35061 case V1DImode: return V2DImode;
35062 case V2SImode: return V4SImode;
35063 case V4HImode: return V8HImode;
35064 case V8QImode: return V16QImode;
35065
35066 case V2DFmode: return V4DFmode;
35067 case V4SFmode: return V8SFmode;
35068 case V2DImode: return V4DImode;
35069 case V4SImode: return V8SImode;
35070 case V8HImode: return V16HImode;
35071 case V16QImode: return V32QImode;
35072
35073 case V4DFmode: return V8DFmode;
35074 case V8SFmode: return V16SFmode;
35075 case V4DImode: return V8DImode;
35076 case V8SImode: return V16SImode;
35077 case V16HImode: return V32HImode;
35078 case V32QImode: return V64QImode;
35079
35080 default:
35081 gcc_unreachable ();
35082 }
35083 }
35084
35085 /* Construct (set target (vec_select op0 (parallel perm))) and
35086 return true if that's a valid instruction in the active ISA. */
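/* For example, with a V4SImode TARGET and PERM { 1, 0, 3, 2 } this emits
   (set target (vec_select op0 (parallel [1 0 3 2]))), which recog can
   match as a pshufd-style pattern; if no pattern matches, the insn is
   removed again and false is returned.  */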
35087
35088 static bool
35089 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35090 {
35091 rtx rperm[MAX_VECT_LEN], x;
35092 unsigned i;
35093
35094 for (i = 0; i < nelt; ++i)
35095 rperm[i] = GEN_INT (perm[i]);
35096
35097 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35098 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35099 x = gen_rtx_SET (VOIDmode, target, x);
35100
35101 x = emit_insn (x);
35102 if (recog_memoized (x) < 0)
35103 {
35104 remove_insn (x);
35105 return false;
35106 }
35107 return true;
35108 }
35109
35110 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35111
35112 static bool
35113 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35114 const unsigned char *perm, unsigned nelt)
35115 {
35116 enum machine_mode v2mode;
35117 rtx x;
35118
35119 v2mode = doublesize_vector_mode (GET_MODE (op0));
35120 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35121 return expand_vselect (target, x, perm, nelt);
35122 }
35123
35124 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35125 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35126
35127 static bool
35128 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35129 {
35130 enum machine_mode vmode = d->vmode;
35131 unsigned i, mask, nelt = d->nelt;
35132 rtx target, op0, op1, x;
35133 rtx rperm[32], vperm;
35134
35135 if (d->op0 == d->op1)
35136 return false;
35137 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35138 ;
35139 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35140 ;
35141 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35142 ;
35143 else
35144 return false;
35145
35146 /* This is a blend, not a permute. Elements must stay in their
35147 respective lanes. */
35148 for (i = 0; i < nelt; ++i)
35149 {
35150 unsigned e = d->perm[i];
35151 if (!(e == i || e == i + nelt))
35152 return false;
35153 }
35154
35155 if (d->testing_p)
35156 return true;
35157
35158 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35159 decision should be extracted elsewhere, so that we only try that
35160 sequence once all budget==3 options have been tried. */
35161 target = d->target;
35162 op0 = d->op0;
35163 op1 = d->op1;
35164 mask = 0;
35165
35166 switch (vmode)
35167 {
35168 case V4DFmode:
35169 case V8SFmode:
35170 case V2DFmode:
35171 case V4SFmode:
35172 case V8HImode:
35173 case V8SImode:
35174 for (i = 0; i < nelt; ++i)
35175 mask |= (d->perm[i] >= nelt) << i;
35176 break;
35177
35178 case V2DImode:
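	/* Each DI element maps to four HI elements once the operands are
	   recast to V8HImode below, so replicate the per-element selector
	   across a 4-bit group of the pblendw immediate; e.g. perm { 0, 3 }
	   (element 1 taken from op1) gives mask 0xf0.  */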
35179 for (i = 0; i < 2; ++i)
35180 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35181 vmode = V8HImode;
35182 goto do_subreg;
35183
35184 case V4SImode:
35185 for (i = 0; i < 4; ++i)
35186 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35187 vmode = V8HImode;
35188 goto do_subreg;
35189
35190 case V16QImode:
35191 /* See if bytes move in pairs so we can use pblendw with
35192 an immediate argument, rather than pblendvb with a vector
35193 argument. */
35194 for (i = 0; i < 16; i += 2)
35195 if (d->perm[i] + 1 != d->perm[i + 1])
35196 {
35197 use_pblendvb:
35198 for (i = 0; i < nelt; ++i)
35199 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35200
35201 finish_pblendvb:
35202 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35203 vperm = force_reg (vmode, vperm);
35204
35205 if (GET_MODE_SIZE (vmode) == 16)
35206 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35207 else
35208 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35209 return true;
35210 }
35211
35212 for (i = 0; i < 8; ++i)
35213 mask |= (d->perm[i * 2] >= 16) << i;
35214 vmode = V8HImode;
35215 /* FALLTHRU */
35216
35217 do_subreg:
35218 target = gen_lowpart (vmode, target);
35219 op0 = gen_lowpart (vmode, op0);
35220 op1 = gen_lowpart (vmode, op1);
35221 break;
35222
35223 case V32QImode:
35224 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35225 for (i = 0; i < 32; i += 2)
35226 if (d->perm[i] + 1 != d->perm[i + 1])
35227 goto use_pblendvb;
35228 /* See if bytes move in quadruplets. If yes, vpblendd
35229 with immediate can be used. */
35230 for (i = 0; i < 32; i += 4)
35231 if (d->perm[i] + 2 != d->perm[i + 2])
35232 break;
35233 if (i < 32)
35234 {
35235 /* See if bytes move the same in both lanes. If yes,
35236 vpblendw with immediate can be used. */
35237 for (i = 0; i < 16; i += 2)
35238 if (d->perm[i] + 16 != d->perm[i + 16])
35239 goto use_pblendvb;
35240
35241 /* Use vpblendw. */
35242 for (i = 0; i < 16; ++i)
35243 mask |= (d->perm[i * 2] >= 32) << i;
35244 vmode = V16HImode;
35245 goto do_subreg;
35246 }
35247
35248 /* Use vpblendd. */
35249 for (i = 0; i < 8; ++i)
35250 mask |= (d->perm[i * 4] >= 32) << i;
35251 vmode = V8SImode;
35252 goto do_subreg;
35253
35254 case V16HImode:
35255 /* See if words move in pairs. If yes, vpblendd can be used. */
35256 for (i = 0; i < 16; i += 2)
35257 if (d->perm[i] + 1 != d->perm[i + 1])
35258 break;
35259 if (i < 16)
35260 {
35261 /* See if words move the same in both lanes. If not,
35262 vpblendvb must be used. */
35263 for (i = 0; i < 8; i++)
35264 if (d->perm[i] + 8 != d->perm[i + 8])
35265 {
35266 /* Use vpblendvb. */
35267 for (i = 0; i < 32; ++i)
35268 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35269
35270 vmode = V32QImode;
35271 nelt = 32;
35272 target = gen_lowpart (vmode, target);
35273 op0 = gen_lowpart (vmode, op0);
35274 op1 = gen_lowpart (vmode, op1);
35275 goto finish_pblendvb;
35276 }
35277
35278 /* Use vpblendw. */
35279 for (i = 0; i < 16; ++i)
35280 mask |= (d->perm[i] >= 16) << i;
35281 break;
35282 }
35283
35284 /* Use vpblendd. */
35285 for (i = 0; i < 8; ++i)
35286 mask |= (d->perm[i * 2] >= 16) << i;
35287 vmode = V8SImode;
35288 goto do_subreg;
35289
35290 case V4DImode:
35291 /* Use vpblendd. */
35292 for (i = 0; i < 4; ++i)
35293 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35294 vmode = V8SImode;
35295 goto do_subreg;
35296
35297 default:
35298 gcc_unreachable ();
35299 }
35300
35301 /* This matches five different patterns with the different modes. */
35302 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35303 x = gen_rtx_SET (VOIDmode, target, x);
35304 emit_insn (x);
35305
35306 return true;
35307 }
35308
35309 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35310 in terms of the variable form of vpermilps.
35311
35312 Note that we will have already failed the immediate input vpermilps,
35313 which requires that the high and low part shuffle be identical; the
35314 variable form doesn't require that. */
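/* E.g. the single-operand V8SFmode permutation { 1, 0, 3, 2, 5, 4, 7, 6 }
   passes the lane check and is reduced below to the per-lane control
   vector { 1, 0, 3, 2, 1, 0, 3, 2 } fed to vpermilps.  */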
35315
35316 static bool
35317 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35318 {
35319 rtx rperm[8], vperm;
35320 unsigned i;
35321
35322 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35323 return false;
35324
35325 /* We can only permute within the 128-bit lane. */
35326 for (i = 0; i < 8; ++i)
35327 {
35328 unsigned e = d->perm[i];
35329 if (i < 4 ? e >= 4 : e < 4)
35330 return false;
35331 }
35332
35333 if (d->testing_p)
35334 return true;
35335
35336 for (i = 0; i < 8; ++i)
35337 {
35338 unsigned e = d->perm[i];
35339
35340 /* Within each 128-bit lane, the elements of op0 are numbered
35341 from 0 and the elements of op1 are numbered from 4. */
35342 if (e >= 8 + 4)
35343 e -= 8;
35344 else if (e >= 4)
35345 e -= 4;
35346
35347 rperm[i] = GEN_INT (e);
35348 }
35349
35350 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35351 vperm = force_reg (V8SImode, vperm);
35352 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35353
35354 return true;
35355 }
35356
35357 /* Return true if permutation D can be performed as a VMODE permutation
35358 instead. */
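/* E.g. the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3,
   12, 13, 14, 15, 8, 9, 10, 11 } moves whole 4-byte chunks, so it is
   also performable as the V4SImode permutation { 1, 0, 3, 2 }.  */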
35359
35360 static bool
35361 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35362 {
35363 unsigned int i, j, chunk;
35364
35365 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35366 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35367 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35368 return false;
35369
35370 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35371 return true;
35372
35373 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35374 for (i = 0; i < d->nelt; i += chunk)
35375 if (d->perm[i] & (chunk - 1))
35376 return false;
35377 else
35378 for (j = 1; j < chunk; ++j)
35379 if (d->perm[i] + j != d->perm[i + j])
35380 return false;
35381
35382 return true;
35383 }
35384
35385 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35386 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35387
35388 static bool
35389 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35390 {
35391 unsigned i, nelt, eltsz, mask;
35392 unsigned char perm[32];
35393 enum machine_mode vmode = V16QImode;
35394 rtx rperm[32], vperm, target, op0, op1;
35395
35396 nelt = d->nelt;
35397
35398 if (d->op0 != d->op1)
35399 {
35400 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35401 {
35402 if (TARGET_AVX2
35403 && valid_perm_using_mode_p (V2TImode, d))
35404 {
35405 if (d->testing_p)
35406 return true;
35407
35408 /* Use vperm2i128 insn. The pattern uses
35409 V4DImode instead of V2TImode. */
35410 target = gen_lowpart (V4DImode, d->target);
35411 op0 = gen_lowpart (V4DImode, d->op0);
35412 op1 = gen_lowpart (V4DImode, d->op1);
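	      /* imm8 bits 0-1 select the 128-bit half placed in the low
		 half of the result (0/1 = low/high half of op0,
		 2/3 = low/high half of op1); bits 4-5 select the half
		 placed in the high half of the result.  */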
35413 rperm[0]
35414 = GEN_INT ((d->perm[0] / (nelt / 2))
35415 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
35416 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35417 return true;
35418 }
35419 return false;
35420 }
35421 }
35422 else
35423 {
35424 if (GET_MODE_SIZE (d->vmode) == 16)
35425 {
35426 if (!TARGET_SSSE3)
35427 return false;
35428 }
35429 else if (GET_MODE_SIZE (d->vmode) == 32)
35430 {
35431 if (!TARGET_AVX2)
35432 return false;
35433
35434 /* V4DImode should be already handled through
35435 expand_vselect by vpermq instruction. */
35436 gcc_assert (d->vmode != V4DImode);
35437
35438 vmode = V32QImode;
35439 if (d->vmode == V8SImode
35440 || d->vmode == V16HImode
35441 || d->vmode == V32QImode)
35442 {
35443 /* First see if vpermq can be used for
35444 V8SImode/V16HImode/V32QImode. */
35445 if (valid_perm_using_mode_p (V4DImode, d))
35446 {
35447 for (i = 0; i < 4; i++)
35448 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35449 if (d->testing_p)
35450 return true;
35451 return expand_vselect (gen_lowpart (V4DImode, d->target),
35452 gen_lowpart (V4DImode, d->op0),
35453 perm, 4);
35454 }
35455
35456 /* Next see if vpermd can be used. */
35457 if (valid_perm_using_mode_p (V8SImode, d))
35458 vmode = V8SImode;
35459 }
35460
35461 if (vmode == V32QImode)
35462 {
35463 /* vpshufb only works intra-lane; it is not
35464 possible to shuffle bytes between the lanes. */
35465 for (i = 0; i < nelt; ++i)
35466 if ((d->perm[i] ^ i) & (nelt / 2))
35467 return false;
35468 }
35469 }
35470 else
35471 return false;
35472 }
35473
35474 if (d->testing_p)
35475 return true;
35476
35477 if (vmode == V8SImode)
35478 for (i = 0; i < 8; ++i)
35479 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35480 else
35481 {
35482 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35483 if (d->op0 != d->op1)
35484 mask = 2 * nelt - 1;
35485 else if (vmode == V16QImode)
35486 mask = nelt - 1;
35487 else
35488 mask = nelt / 2 - 1;
35489
35490 for (i = 0; i < nelt; ++i)
35491 {
35492 unsigned j, e = d->perm[i] & mask;
35493 for (j = 0; j < eltsz; ++j)
35494 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35495 }
35496 }
35497
35498 vperm = gen_rtx_CONST_VECTOR (vmode,
35499 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35500 vperm = force_reg (vmode, vperm);
35501
35502 target = gen_lowpart (vmode, d->target);
35503 op0 = gen_lowpart (vmode, d->op0);
35504 if (d->op0 == d->op1)
35505 {
35506 if (vmode == V16QImode)
35507 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35508 else if (vmode == V32QImode)
35509 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35510 else
35511 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35512 }
35513 else
35514 {
35515 op1 = gen_lowpart (vmode, d->op1);
35516 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35517 }
35518
35519 return true;
35520 }
35521
35522 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35523 in a single instruction. */
35524
35525 static bool
35526 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35527 {
35528 unsigned i, nelt = d->nelt;
35529 unsigned char perm2[MAX_VECT_LEN];
35530
35531 /* Check plain VEC_SELECT first, because AVX has instructions that could
35532 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35533 input where SEL+CONCAT may not. */
35534 if (d->op0 == d->op1)
35535 {
35536 int mask = nelt - 1;
35537 bool identity_perm = true;
35538 bool broadcast_perm = true;
35539
35540 for (i = 0; i < nelt; i++)
35541 {
35542 perm2[i] = d->perm[i] & mask;
35543 if (perm2[i] != i)
35544 identity_perm = false;
35545 if (perm2[i])
35546 broadcast_perm = false;
35547 }
35548
35549 if (identity_perm)
35550 {
35551 if (!d->testing_p)
35552 emit_move_insn (d->target, d->op0);
35553 return true;
35554 }
35555 else if (broadcast_perm && TARGET_AVX2)
35556 {
35557 /* Use vpbroadcast{b,w,d}. */
35558 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35559 switch (d->vmode)
35560 {
35561 case V32QImode:
35562 op = gen_lowpart (V16QImode, op);
35563 gen = gen_avx2_pbroadcastv32qi;
35564 break;
35565 case V16HImode:
35566 op = gen_lowpart (V8HImode, op);
35567 gen = gen_avx2_pbroadcastv16hi;
35568 break;
35569 case V8SImode:
35570 op = gen_lowpart (V4SImode, op);
35571 gen = gen_avx2_pbroadcastv8si;
35572 break;
35573 case V16QImode:
35574 gen = gen_avx2_pbroadcastv16qi;
35575 break;
35576 case V8HImode:
35577 gen = gen_avx2_pbroadcastv8hi;
35578 break;
35579 /* For other modes prefer other shuffles this function creates. */
35580 default: break;
35581 }
35582 if (gen != NULL)
35583 {
35584 if (!d->testing_p)
35585 emit_insn (gen (d->target, op));
35586 return true;
35587 }
35588 }
35589
35590 if (expand_vselect (d->target, d->op0, perm2, nelt))
35591 return true;
35592
35593 /* There are plenty of patterns in sse.md that are written for
35594 SEL+CONCAT and are not replicated for a single op. Perhaps
35595 that should be changed, to avoid the nastiness here. */
35596
35597 /* Recognize interleave style patterns, which means incrementing
35598 every other permutation operand. */
35599 for (i = 0; i < nelt; i += 2)
35600 {
35601 perm2[i] = d->perm[i] & mask;
35602 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35603 }
35604 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35605 return true;
35606
35607 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35608 if (nelt >= 4)
35609 {
35610 for (i = 0; i < nelt; i += 4)
35611 {
35612 perm2[i + 0] = d->perm[i + 0] & mask;
35613 perm2[i + 1] = d->perm[i + 1] & mask;
35614 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35615 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35616 }
35617
35618 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35619 return true;
35620 }
35621 }
35622
35623 /* Finally, try the fully general two operand permute. */
35624 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35625 return true;
35626
35627 /* Recognize interleave style patterns with reversed operands. */
35628 if (d->op0 != d->op1)
35629 {
35630 for (i = 0; i < nelt; ++i)
35631 {
35632 unsigned e = d->perm[i];
35633 if (e >= nelt)
35634 e -= nelt;
35635 else
35636 e += nelt;
35637 perm2[i] = e;
35638 }
35639
35640 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35641 return true;
35642 }
35643
35644 /* Try the SSE4.1 blend variable merge instructions. */
35645 if (expand_vec_perm_blend (d))
35646 return true;
35647
35648 /* Try one of the AVX vpermil variable permutations. */
35649 if (expand_vec_perm_vpermil (d))
35650 return true;
35651
35652 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35653 vpshufb, vpermd or vpermq variable permutation. */
35654 if (expand_vec_perm_pshufb (d))
35655 return true;
35656
35657 return false;
35658 }
35659
35660 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35661 in terms of a pair of pshuflw + pshufhw instructions. */
35662
35663 static bool
35664 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35665 {
35666 unsigned char perm2[MAX_VECT_LEN];
35667 unsigned i;
35668 bool ok;
35669
35670 if (d->vmode != V8HImode || d->op0 != d->op1)
35671 return false;
35672
35673 /* The two permutations only operate in 64-bit lanes. */
35674 for (i = 0; i < 4; ++i)
35675 if (d->perm[i] >= 4)
35676 return false;
35677 for (i = 4; i < 8; ++i)
35678 if (d->perm[i] < 4)
35679 return false;
35680
35681 if (d->testing_p)
35682 return true;
35683
35684 /* Emit the pshuflw. */
35685 memcpy (perm2, d->perm, 4);
35686 for (i = 4; i < 8; ++i)
35687 perm2[i] = i;
35688 ok = expand_vselect (d->target, d->op0, perm2, 8);
35689 gcc_assert (ok);
35690
35691 /* Emit the pshufhw. */
35692 memcpy (perm2 + 4, d->perm + 4, 4);
35693 for (i = 0; i < 4; ++i)
35694 perm2[i] = i;
35695 ok = expand_vselect (d->target, d->target, perm2, 8);
35696 gcc_assert (ok);
35697
35698 return true;
35699 }
35700
35701 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35702 the permutation using the SSSE3 palignr instruction. This succeeds
35703 when all of the elements in PERM fit within one vector and we merely
35704 need to shift them down so that a single vector permutation has a
35705 chance to succeed. */
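/* E.g. for V8HImode and perm { 3, 4, 5, 6, 7, 8, 9, 10 }, min is 3, so a
   single palignr by 3 elements leaves the identity permutation and no
   further shuffle is required.  */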
35706
35707 static bool
35708 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
35709 {
35710 unsigned i, nelt = d->nelt;
35711 unsigned min, max;
35712 bool in_order, ok;
35713 rtx shift;
35714
35715 /* Even with AVX, palignr only operates on 128-bit vectors. */
35716 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35717 return false;
35718
35719 min = nelt, max = 0;
35720 for (i = 0; i < nelt; ++i)
35721 {
35722 unsigned e = d->perm[i];
35723 if (e < min)
35724 min = e;
35725 if (e > max)
35726 max = e;
35727 }
35728 if (min == 0 || max - min >= nelt)
35729 return false;
35730
35731 /* Given that we have SSSE3, we know we'll be able to implement the
35732 single operand permutation after the palignr with pshufb. */
35733 if (d->testing_p)
35734 return true;
35735
35736 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
35737 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
35738 gen_lowpart (TImode, d->op1),
35739 gen_lowpart (TImode, d->op0), shift));
35740
35741 d->op0 = d->op1 = d->target;
35742
35743 in_order = true;
35744 for (i = 0; i < nelt; ++i)
35745 {
35746 unsigned e = d->perm[i] - min;
35747 if (e != i)
35748 in_order = false;
35749 d->perm[i] = e;
35750 }
35751
35752 /* Test for the degenerate case where the alignment by itself
35753 produces the desired permutation. */
35754 if (in_order)
35755 return true;
35756
35757 ok = expand_vec_perm_1 (d);
35758 gcc_assert (ok);
35759
35760 return ok;
35761 }
35762
35763 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35764 a two vector permutation into a single vector permutation by using
35765 an interleave operation to merge the vectors. */
35766
35767 static bool
35768 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
35769 {
35770 struct expand_vec_perm_d dremap, dfinal;
35771 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
35772 unsigned HOST_WIDE_INT contents;
35773 unsigned char remap[2 * MAX_VECT_LEN];
35774 rtx seq;
35775 bool ok, same_halves = false;
35776
35777 if (GET_MODE_SIZE (d->vmode) == 16)
35778 {
35779 if (d->op0 == d->op1)
35780 return false;
35781 }
35782 else if (GET_MODE_SIZE (d->vmode) == 32)
35783 {
35784 if (!TARGET_AVX)
35785 return false;
35786 /* For 32-byte modes allow even d->op0 == d->op1.
35787 The lack of cross-lane shuffling in some instructions
35788 might prevent a single insn shuffle. */
35789 }
35790 else
35791 return false;
35792
35793 /* Examine from whence the elements come. */
35794 contents = 0;
35795 for (i = 0; i < nelt; ++i)
35796 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
35797
35798 memset (remap, 0xff, sizeof (remap));
35799 dremap = *d;
35800
35801 if (GET_MODE_SIZE (d->vmode) == 16)
35802 {
35803 unsigned HOST_WIDE_INT h1, h2, h3, h4;
35804
35805 /* Split the two input vectors into 4 halves. */
35806 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
35807 h2 = h1 << nelt2;
35808 h3 = h2 << nelt2;
35809 h4 = h3 << nelt2;
35810
35811 /* If the elements are all from the low halves, use interleave low;
35812 similarly for interleave high. If the elements are from mis-matched
35813 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
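	  /* E.g. for V4SImode and perm { 0, 1, 4, 5 }, contents is h1 | h3,
	     so the punpckl* case below is chosen and dremap.perm becomes
	     { 0, 4, 1, 5 }.  */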
35814 if ((contents & (h1 | h3)) == contents)
35815 {
35816 /* punpckl* */
35817 for (i = 0; i < nelt2; ++i)
35818 {
35819 remap[i] = i * 2;
35820 remap[i + nelt] = i * 2 + 1;
35821 dremap.perm[i * 2] = i;
35822 dremap.perm[i * 2 + 1] = i + nelt;
35823 }
35824 }
35825 else if ((contents & (h2 | h4)) == contents)
35826 {
35827 /* punpckh* */
35828 for (i = 0; i < nelt2; ++i)
35829 {
35830 remap[i + nelt2] = i * 2;
35831 remap[i + nelt + nelt2] = i * 2 + 1;
35832 dremap.perm[i * 2] = i + nelt2;
35833 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
35834 }
35835 }
35836 else if ((contents & (h1 | h4)) == contents)
35837 {
35838 /* shufps */
35839 for (i = 0; i < nelt2; ++i)
35840 {
35841 remap[i] = i;
35842 remap[i + nelt + nelt2] = i + nelt2;
35843 dremap.perm[i] = i;
35844 dremap.perm[i + nelt2] = i + nelt + nelt2;
35845 }
35846 if (nelt != 4)
35847 {
35848 /* shufpd */
35849 dremap.vmode = V2DImode;
35850 dremap.nelt = 2;
35851 dremap.perm[0] = 0;
35852 dremap.perm[1] = 3;
35853 }
35854 }
35855 else if ((contents & (h2 | h3)) == contents)
35856 {
35857 /* shufps */
35858 for (i = 0; i < nelt2; ++i)
35859 {
35860 remap[i + nelt2] = i;
35861 remap[i + nelt] = i + nelt2;
35862 dremap.perm[i] = i + nelt2;
35863 dremap.perm[i + nelt2] = i + nelt;
35864 }
35865 if (nelt != 4)
35866 {
35867 /* shufpd */
35868 dremap.vmode = V2DImode;
35869 dremap.nelt = 2;
35870 dremap.perm[0] = 1;
35871 dremap.perm[1] = 2;
35872 }
35873 }
35874 else
35875 return false;
35876 }
35877 else
35878 {
35879 unsigned int nelt4 = nelt / 4, nzcnt = 0;
35880 unsigned HOST_WIDE_INT q[8];
35881 unsigned int nonzero_halves[4];
35882
35883 /* Split the two input vectors into 8 quarters. */
35884 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
35885 for (i = 1; i < 8; ++i)
35886 q[i] = q[0] << (nelt4 * i);
35887 for (i = 0; i < 4; ++i)
35888 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
35889 {
35890 nonzero_halves[nzcnt] = i;
35891 ++nzcnt;
35892 }
35893
35894 if (nzcnt == 1)
35895 {
35896 gcc_assert (d->op0 == d->op1);
35897 nonzero_halves[1] = nonzero_halves[0];
35898 same_halves = true;
35899 }
35900 else if (d->op0 == d->op1)
35901 {
35902 gcc_assert (nonzero_halves[0] == 0);
35903 gcc_assert (nonzero_halves[1] == 1);
35904 }
35905
35906 if (nzcnt <= 2)
35907 {
35908 if (d->perm[0] / nelt2 == nonzero_halves[1])
35909 {
35910 /* Attempt to increase the likelihood that the dfinal
35911 shuffle will be intra-lane. */
35912 char tmph = nonzero_halves[0];
35913 nonzero_halves[0] = nonzero_halves[1];
35914 nonzero_halves[1] = tmph;
35915 }
35916
35917 /* vperm2f128 or vperm2i128. */
35918 for (i = 0; i < nelt2; ++i)
35919 {
35920 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
35921 remap[i + nonzero_halves[0] * nelt2] = i;
35922 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
35923 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
35924 }
35925
35926 if (d->vmode != V8SFmode
35927 && d->vmode != V4DFmode
35928 && d->vmode != V8SImode)
35929 {
35930 dremap.vmode = V8SImode;
35931 dremap.nelt = 8;
35932 for (i = 0; i < 4; ++i)
35933 {
35934 dremap.perm[i] = i + nonzero_halves[0] * 4;
35935 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
35936 }
35937 }
35938 }
35939 else if (d->op0 == d->op1)
35940 return false;
35941 else if (TARGET_AVX2
35942 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
35943 {
35944 /* vpunpckl* */
35945 for (i = 0; i < nelt4; ++i)
35946 {
35947 remap[i] = i * 2;
35948 remap[i + nelt] = i * 2 + 1;
35949 remap[i + nelt2] = i * 2 + nelt2;
35950 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
35951 dremap.perm[i * 2] = i;
35952 dremap.perm[i * 2 + 1] = i + nelt;
35953 dremap.perm[i * 2 + nelt2] = i + nelt2;
35954 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
35955 }
35956 }
35957 else if (TARGET_AVX2
35958 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
35959 {
35960 /* vpunpckh* */
35961 for (i = 0; i < nelt4; ++i)
35962 {
35963 remap[i + nelt4] = i * 2;
35964 remap[i + nelt + nelt4] = i * 2 + 1;
35965 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
35966 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
35967 dremap.perm[i * 2] = i + nelt4;
35968 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
35969 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
35970 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
35971 }
35972 }
35973 else
35974 return false;
35975 }
35976
35977 /* Use the remapping array set up above to move the elements from their
35978 swizzled locations into their final destinations. */
35979 dfinal = *d;
35980 for (i = 0; i < nelt; ++i)
35981 {
35982 unsigned e = remap[d->perm[i]];
35983 gcc_assert (e < nelt);
35984 /* If same_halves is true, both halves of the remapped vector are the
35985 same. Avoid cross-lane accesses if possible. */
35986 if (same_halves && i >= nelt2)
35987 {
35988 gcc_assert (e < nelt2);
35989 dfinal.perm[i] = e + nelt2;
35990 }
35991 else
35992 dfinal.perm[i] = e;
35993 }
35994 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
35995 dfinal.op1 = dfinal.op0;
35996 dremap.target = dfinal.op0;
35997
35998 /* Test if the final remap can be done with a single insn. For V4SFmode or
35999 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36000 start_sequence ();
36001 ok = expand_vec_perm_1 (&dfinal);
36002 seq = get_insns ();
36003 end_sequence ();
36004
36005 if (!ok)
36006 return false;
36007
36008 if (d->testing_p)
36009 return true;
36010
36011 if (dremap.vmode != dfinal.vmode)
36012 {
36013 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36014 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36015 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36016 }
36017
36018 ok = expand_vec_perm_1 (&dremap);
36019 gcc_assert (ok);
36020
36021 emit_insn (seq);
36022 return true;
36023 }
36024
36025 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36026 a single vector cross-lane permutation into vpermq followed
36027 by any of the single insn permutations. */
36028
36029 static bool
36030 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36031 {
36032 struct expand_vec_perm_d dremap, dfinal;
36033 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36034 unsigned contents[2];
36035 bool ok;
36036
36037 if (!(TARGET_AVX2
36038 && (d->vmode == V32QImode || d->vmode == V16HImode)
36039 && d->op0 == d->op1))
36040 return false;
36041
36042 contents[0] = 0;
36043 contents[1] = 0;
36044 for (i = 0; i < nelt2; ++i)
36045 {
36046 contents[0] |= 1u << (d->perm[i] / nelt4);
36047 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36048 }
36049
36050 for (i = 0; i < 2; ++i)
36051 {
36052 unsigned int cnt = 0;
36053 for (j = 0; j < 4; ++j)
36054 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36055 return false;
36056 }
36057
36058 if (d->testing_p)
36059 return true;
36060
36061 dremap = *d;
36062 dremap.vmode = V4DImode;
36063 dremap.nelt = 4;
36064 dremap.target = gen_reg_rtx (V4DImode);
36065 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36066 dremap.op1 = dremap.op0;
36067 for (i = 0; i < 2; ++i)
36068 {
36069 unsigned int cnt = 0;
36070 for (j = 0; j < 4; ++j)
36071 if ((contents[i] & (1u << j)) != 0)
36072 dremap.perm[2 * i + cnt++] = j;
36073 for (; cnt < 2; ++cnt)
36074 dremap.perm[2 * i + cnt] = 0;
36075 }
36076
36077 dfinal = *d;
36078 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36079 dfinal.op1 = dfinal.op0;
36080 for (i = 0, j = 0; i < nelt; ++i)
36081 {
36082 if (i == nelt2)
36083 j = 2;
36084 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36085 if ((d->perm[i] / nelt4) == dremap.perm[j])
36086 ;
36087 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36088 dfinal.perm[i] |= nelt4;
36089 else
36090 gcc_unreachable ();
36091 }
36092
36093 ok = expand_vec_perm_1 (&dremap);
36094 gcc_assert (ok);
36095
36096 ok = expand_vec_perm_1 (&dfinal);
36097 gcc_assert (ok);
36098
36099 return true;
36100 }
36101
36102 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36103 a two vector permutation using 2 intra-lane interleave insns
36104 and cross-lane shuffle for 32-byte vectors. */
36105
36106 static bool
36107 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36108 {
36109 unsigned i, nelt;
36110 rtx (*gen) (rtx, rtx, rtx);
36111
36112 if (d->op0 == d->op1)
36113 return false;
36114 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36115 ;
36116 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36117 ;
36118 else
36119 return false;
36120
36121 nelt = d->nelt;
36122 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36123 return false;
36124 for (i = 0; i < nelt; i += 2)
36125 if (d->perm[i] != d->perm[0] + i / 2
36126 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36127 return false;
36128
36129 if (d->testing_p)
36130 return true;
36131
36132 switch (d->vmode)
36133 {
36134 case V32QImode:
36135 if (d->perm[0])
36136 gen = gen_vec_interleave_highv32qi;
36137 else
36138 gen = gen_vec_interleave_lowv32qi;
36139 break;
36140 case V16HImode:
36141 if (d->perm[0])
36142 gen = gen_vec_interleave_highv16hi;
36143 else
36144 gen = gen_vec_interleave_lowv16hi;
36145 break;
36146 case V8SImode:
36147 if (d->perm[0])
36148 gen = gen_vec_interleave_highv8si;
36149 else
36150 gen = gen_vec_interleave_lowv8si;
36151 break;
36152 case V4DImode:
36153 if (d->perm[0])
36154 gen = gen_vec_interleave_highv4di;
36155 else
36156 gen = gen_vec_interleave_lowv4di;
36157 break;
36158 case V8SFmode:
36159 if (d->perm[0])
36160 gen = gen_vec_interleave_highv8sf;
36161 else
36162 gen = gen_vec_interleave_lowv8sf;
36163 break;
36164 case V4DFmode:
36165 if (d->perm[0])
36166 gen = gen_vec_interleave_highv4df;
36167 else
36168 gen = gen_vec_interleave_lowv4df;
36169 break;
36170 default:
36171 gcc_unreachable ();
36172 }
36173
36174 emit_insn (gen (d->target, d->op0, d->op1));
36175 return true;
36176 }
36177
36178 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36179 permutation with two pshufb insns and an ior. We should have already
36180 failed all two instruction sequences. */
36181
36182 static bool
36183 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36184 {
36185 rtx rperm[2][16], vperm, l, h, op, m128;
36186 unsigned int i, nelt, eltsz;
36187
36188 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36189 return false;
36190 gcc_assert (d->op0 != d->op1);
36191
36192 nelt = d->nelt;
36193 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36194
36195 /* Generate two permutation masks. If the required element is within
36196 the given vector it is shuffled into the proper lane. If the required
36197 element is in the other vector, force a zero into the lane by setting
36198 bit 7 in the permutation mask. */
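  /* E.g. if d->perm[5] selects byte 3 of op1 (perm value nelt + 3 for
     V16QImode), rperm[1][5] becomes 3 and rperm[0][5] becomes -128, so
     the pshufb of op1 supplies that byte and the pshufb of op0 supplies
     zero before the final ior.  */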
36199 m128 = GEN_INT (-128);
36200 for (i = 0; i < nelt; ++i)
36201 {
36202 unsigned j, e = d->perm[i];
36203 unsigned which = (e >= nelt);
36204 if (e >= nelt)
36205 e -= nelt;
36206
36207 for (j = 0; j < eltsz; ++j)
36208 {
36209 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36210 rperm[1-which][i*eltsz + j] = m128;
36211 }
36212 }
36213
36214 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36215 vperm = force_reg (V16QImode, vperm);
36216
36217 l = gen_reg_rtx (V16QImode);
36218 op = gen_lowpart (V16QImode, d->op0);
36219 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36220
36221 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36222 vperm = force_reg (V16QImode, vperm);
36223
36224 h = gen_reg_rtx (V16QImode);
36225 op = gen_lowpart (V16QImode, d->op1);
36226 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36227
36228 op = gen_lowpart (V16QImode, d->target);
36229 emit_insn (gen_iorv16qi3 (op, l, h));
36230
36231 return true;
36232 }
36233
36234 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36235 with two vpshufb insns, vpermq and vpor. We should have already failed
36236 all two or three instruction sequences. */
36237
36238 static bool
36239 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36240 {
36241 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36242 unsigned int i, nelt, eltsz;
36243
36244 if (!TARGET_AVX2
36245 || d->op0 != d->op1
36246 || (d->vmode != V32QImode && d->vmode != V16HImode))
36247 return false;
36248
36249 if (d->testing_p)
36250 return true;
36251
36252 nelt = d->nelt;
36253 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36254
36255 /* Generate two permutation masks. If the required element is within
36256 the same lane, it is shuffled in. If the required element is from
36257 the other lane, force a zero by setting bit 7 in the permutation mask.
36258 In the second mask an element is non-negative only if it is requested
36259 from the other lane; it is also placed into the other lane, so that
36260 after the two V2TImode halves of the vpshufb result are swapped it
36261 ends up in the correct position. */
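  /* E.g. for V32QImode with d->perm[0] == 20 (and op0 == op1), the
     cross-lane mask gets rperm[1][16] = 4, so vpshufb places byte 20 of
     the operand into byte 16 of H, the lane swap moves it to byte 0 of
     HP, and rperm[0][0] is -128 so the final vpor keeps that value.  */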
36262 m128 = GEN_INT (-128);
36263 for (i = 0; i < nelt; ++i)
36264 {
36265 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36266 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36267
36268 for (j = 0; j < eltsz; ++j)
36269 {
36270 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36271 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36272 }
36273 }
36274
36275 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36276 vperm = force_reg (V32QImode, vperm);
36277
36278 h = gen_reg_rtx (V32QImode);
36279 op = gen_lowpart (V32QImode, d->op0);
36280 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36281
36282 /* Swap the 128-bit lanes of h into hp. */
36283 hp = gen_reg_rtx (V4DImode);
36284 op = gen_lowpart (V4DImode, h);
36285 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36286 const1_rtx));
36287
36288 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36289 vperm = force_reg (V32QImode, vperm);
36290
36291 l = gen_reg_rtx (V32QImode);
36292 op = gen_lowpart (V32QImode, d->op0);
36293 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36294
36295 op = gen_lowpart (V32QImode, d->target);
36296 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36297
36298 return true;
36299 }
36300
36301 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36302 and extract-odd permutations of two V32QImode or V16HImode operands
36303 with two vpshufb insns, vpor and vpermq. We should have already
36304 failed all two or three instruction sequences. */
36305
36306 static bool
36307 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36308 {
36309 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36310 unsigned int i, nelt, eltsz;
36311
36312 if (!TARGET_AVX2
36313 || d->op0 == d->op1
36314 || (d->vmode != V32QImode && d->vmode != V16HImode))
36315 return false;
36316
36317 for (i = 0; i < d->nelt; ++i)
36318 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36319 return false;
36320
36321 if (d->testing_p)
36322 return true;
36323
36324 nelt = d->nelt;
36325 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36326
36327 /* Generate two permutation masks. In the first permutation mask
36328 the first quarter will contain indexes for the first half
36329 of the op0, the second quarter will contain bit 7 set, third quarter
36330 will contain indexes for the second half of the op0 and the
36331 last quarter bit 7 set. In the second permutation mask
36332 the first quarter will contain bit 7 set, the second quarter
36333 indexes for the first half of the op1, the third quarter bit 7 set
36334 and last quarter indexes for the second half of the op1.
36335 I.e. the first mask e.g. for V32QImode extract even will be:
36336 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36337 (all values masked with 0xf except for -128) and second mask
36338 for extract even will be
36339 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36340 m128 = GEN_INT (-128);
36341 for (i = 0; i < nelt; ++i)
36342 {
36343 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36344 unsigned which = d->perm[i] >= nelt;
36345 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36346
36347 for (j = 0; j < eltsz; ++j)
36348 {
36349 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36350 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36351 }
36352 }
36353
36354 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36355 vperm = force_reg (V32QImode, vperm);
36356
36357 l = gen_reg_rtx (V32QImode);
36358 op = gen_lowpart (V32QImode, d->op0);
36359 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36360
36361 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36362 vperm = force_reg (V32QImode, vperm);
36363
36364 h = gen_reg_rtx (V32QImode);
36365 op = gen_lowpart (V32QImode, d->op1);
36366 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36367
36368 ior = gen_reg_rtx (V32QImode);
36369 emit_insn (gen_iorv32qi3 (ior, l, h));
36370
36371 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36372 op = gen_lowpart (V4DImode, d->target);
36373 ior = gen_lowpart (V4DImode, ior);
36374 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36375 const1_rtx, GEN_INT (3)));
36376
36377 return true;
36378 }
36379
36380 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36381 and extract-odd permutations. */
36382
36383 static bool
36384 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36385 {
36386 rtx t1, t2, t3;
36387
36388 switch (d->vmode)
36389 {
36390 case V4DFmode:
36391 t1 = gen_reg_rtx (V4DFmode);
36392 t2 = gen_reg_rtx (V4DFmode);
36393
36394 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36395 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36396 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36397
36398 /* Now an unpck[lh]pd will produce the result required. */
36399 if (odd)
36400 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36401 else
36402 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36403 emit_insn (t3);
36404 break;
36405
36406 case V8SFmode:
36407 {
36408 int mask = odd ? 0xdd : 0x88;
36409
36410 t1 = gen_reg_rtx (V8SFmode);
36411 t2 = gen_reg_rtx (V8SFmode);
36412 t3 = gen_reg_rtx (V8SFmode);
36413
36414 /* Shuffle within the 128-bit lanes to produce:
36415 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36416 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36417 GEN_INT (mask)));
36418
36419 /* Shuffle the lanes around to produce:
36420 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36421 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36422 GEN_INT (0x3)));
36423
36424 /* Shuffle within the 128-bit lanes to produce:
36425 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36426 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36427
36428 /* Shuffle within the 128-bit lanes to produce:
36429 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36430 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36431
36432 /* Shuffle the lanes around to produce:
36433 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36434 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36435 GEN_INT (0x20)));
36436 }
36437 break;
36438
36439 case V2DFmode:
36440 case V4SFmode:
36441 case V2DImode:
36442 case V4SImode:
36443 /* These are always directly implementable by expand_vec_perm_1. */
36444 gcc_unreachable ();
36445
36446 case V8HImode:
36447 if (TARGET_SSSE3)
36448 return expand_vec_perm_pshufb2 (d);
36449 else
36450 {
36451 /* We need 2*log2(N)-1 operations to achieve odd/even
36452 with interleave. */
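	  /* For V8HImode that is 2*3-1 = 5 interleave insns, matching the
	     five emitted below.  */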
36453 t1 = gen_reg_rtx (V8HImode);
36454 t2 = gen_reg_rtx (V8HImode);
36455 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36456 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36457 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36458 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36459 if (odd)
36460 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36461 else
36462 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36463 emit_insn (t3);
36464 }
36465 break;
36466
36467 case V16QImode:
36468 if (TARGET_SSSE3)
36469 return expand_vec_perm_pshufb2 (d);
36470 else
36471 {
36472 t1 = gen_reg_rtx (V16QImode);
36473 t2 = gen_reg_rtx (V16QImode);
36474 t3 = gen_reg_rtx (V16QImode);
36475 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36476 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36477 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36478 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36479 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36480 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36481 if (odd)
36482 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36483 else
36484 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36485 emit_insn (t3);
36486 }
36487 break;
36488
36489 case V16HImode:
36490 case V32QImode:
36491 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36492
36493 case V4DImode:
36494 if (!TARGET_AVX2)
36495 {
36496 struct expand_vec_perm_d d_copy = *d;
36497 d_copy.vmode = V4DFmode;
36498 d_copy.target = gen_lowpart (V4DFmode, d->target);
36499 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36500 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36501 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36502 }
36503
36504 t1 = gen_reg_rtx (V4DImode);
36505 t2 = gen_reg_rtx (V4DImode);
36506
36507 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36508 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36509 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36510
36511 /* Now a vpunpck[lh]qdq will produce the result required. */
36512 if (odd)
36513 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36514 else
36515 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36516 emit_insn (t3);
36517 break;
36518
36519 case V8SImode:
36520 if (!TARGET_AVX2)
36521 {
36522 struct expand_vec_perm_d d_copy = *d;
36523 d_copy.vmode = V8SFmode;
36524 d_copy.target = gen_lowpart (V8SFmode, d->target);
36525 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36526 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36527 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36528 }
36529
36530 t1 = gen_reg_rtx (V8SImode);
36531 t2 = gen_reg_rtx (V8SImode);
36532
36533 /* Shuffle the lanes around into
36534 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36535 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36536 gen_lowpart (V4DImode, d->op0),
36537 gen_lowpart (V4DImode, d->op1),
36538 GEN_INT (0x20)));
36539 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36540 gen_lowpart (V4DImode, d->op0),
36541 gen_lowpart (V4DImode, d->op1),
36542 GEN_INT (0x31)));
36543
36544 /* Swap the 2nd and 3rd position in each lane into
36545 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
36546 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36547 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36548 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36549 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36550
36551 /* Now a vpunpck[lh]qdq will produce
36552 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36553 if (odd)
36554 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36555 gen_lowpart (V4DImode, t1),
36556 gen_lowpart (V4DImode, t2));
36557 else
36558 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36559 gen_lowpart (V4DImode, t1),
36560 gen_lowpart (V4DImode, t2));
36561 emit_insn (t3);
36562 break;
36563
36564 default:
36565 gcc_unreachable ();
36566 }
36567
36568 return true;
36569 }
36570
36571 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36572 extract-even and extract-odd permutations. */
36573
36574 static bool
36575 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36576 {
36577 unsigned i, odd, nelt = d->nelt;
36578
36579 odd = d->perm[0];
36580 if (odd != 0 && odd != 1)
36581 return false;
36582
36583 for (i = 1; i < nelt; ++i)
36584 if (d->perm[i] != 2 * i + odd)
36585 return false;
36586
36587 return expand_vec_perm_even_odd_1 (d, odd);
36588 }
36589
36590 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36591 permutations. We assume that expand_vec_perm_1 has already failed. */
36592
36593 static bool
36594 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36595 {
36596 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36597 enum machine_mode vmode = d->vmode;
36598 unsigned char perm2[4];
36599 rtx op0 = d->op0;
36600 bool ok;
36601
36602 switch (vmode)
36603 {
36604 case V4DFmode:
36605 case V8SFmode:
36606 /* These are special-cased in sse.md so that we can optionally
36607 use the vbroadcast instruction. They expand to two insns
36608 if the input happens to be in a register. */
36609 gcc_unreachable ();
36610
36611 case V2DFmode:
36612 case V2DImode:
36613 case V4SFmode:
36614 case V4SImode:
36615 /* These are always implementable using standard shuffle patterns. */
36616 gcc_unreachable ();
36617
36618 case V8HImode:
36619 case V16QImode:
36620 /* These can be implemented via interleave. We save one insn by
36621 stopping once we have promoted to V4SImode and then use pshufd. */
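      /* E.g. broadcasting element 5 of a V8HImode vector: one punpckhwd
	 of the operand with itself yields { 4, 4, 5, 5, 6, 6, 7, 7 };
	 viewed as V4SImode the wanted pair is element 1, which the final
	 pshufd { 1, 1, 1, 1 } then broadcasts.  */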
36622 do
36623 {
36624 optab otab = vec_interleave_low_optab;
36625
36626 if (elt >= nelt2)
36627 {
36628 otab = vec_interleave_high_optab;
36629 elt -= nelt2;
36630 }
36631 nelt2 /= 2;
36632
36633 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
36634 vmode = get_mode_wider_vector (vmode);
36635 op0 = gen_lowpart (vmode, op0);
36636 }
36637 while (vmode != V4SImode);
36638
36639 memset (perm2, elt, 4);
36640 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36641 gcc_assert (ok);
36642 return true;
36643
36644 case V32QImode:
36645 case V16HImode:
36646 case V8SImode:
36647 case V4DImode:
36648 /* For AVX2 broadcasts of the first element vpbroadcast* or
36649 vpermq should be used by expand_vec_perm_1. */
36650 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36651 return false;
36652
36653 default:
36654 gcc_unreachable ();
36655 }
36656 }
36657
36658 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36659 broadcast permutations. */
36660
36661 static bool
36662 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
36663 {
36664 unsigned i, elt, nelt = d->nelt;
36665
36666 if (d->op0 != d->op1)
36667 return false;
36668
36669 elt = d->perm[0];
36670 for (i = 1; i < nelt; ++i)
36671 if (d->perm[i] != elt)
36672 return false;
36673
36674 return expand_vec_perm_broadcast_1 (d);
36675 }
36676
36677 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
36678 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
36679 all the shorter instruction sequences. */
36680
36681 static bool
36682 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
36683 {
36684 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
36685 unsigned int i, nelt, eltsz;
36686 bool used[4];
36687
36688 if (!TARGET_AVX2
36689 || d->op0 == d->op1
36690 || (d->vmode != V32QImode && d->vmode != V16HImode))
36691 return false;
36692
36693 if (d->testing_p)
36694 return true;
36695
36696 nelt = d->nelt;
36697 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36698
36699 /* Generate 4 permutation masks. If the required element is within
36700 the same lane, it is shuffled in. If the required element is from
36701 the other lane, force a zero by setting bit 7 in the permutation mask.
36702 In the cross-lane masks an element is non-negative only if it is
36703 requested from the other lane; it is also placed into the other lane,
36704 so that after the two V2TImode halves of the vpshufb result are
36705 swapped it ends up in the correct position. */
36706 m128 = GEN_INT (-128);
36707 for (i = 0; i < 32; ++i)
36708 {
36709 rperm[0][i] = m128;
36710 rperm[1][i] = m128;
36711 rperm[2][i] = m128;
36712 rperm[3][i] = m128;
36713 }
36714 used[0] = false;
36715 used[1] = false;
36716 used[2] = false;
36717 used[3] = false;
36718 for (i = 0; i < nelt; ++i)
36719 {
36720 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36721 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36722 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
36723
36724 for (j = 0; j < eltsz; ++j)
36725 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
36726 used[which] = true;
36727 }
36728
36729 for (i = 0; i < 2; ++i)
36730 {
36731 if (!used[2 * i + 1])
36732 {
36733 h[i] = NULL_RTX;
36734 continue;
36735 }
36736 vperm = gen_rtx_CONST_VECTOR (V32QImode,
36737 gen_rtvec_v (32, rperm[2 * i + 1]));
36738 vperm = force_reg (V32QImode, vperm);
36739 h[i] = gen_reg_rtx (V32QImode);
36740 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36741 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
36742 }
36743
36744 /* Swap the 128-bit lanes of h[X]. */
36745 for (i = 0; i < 2; ++i)
36746 {
36747 if (h[i] == NULL_RTX)
36748 continue;
36749 op = gen_reg_rtx (V4DImode);
36750 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
36751 const2_rtx, GEN_INT (3), const0_rtx,
36752 const1_rtx));
36753 h[i] = gen_lowpart (V32QImode, op);
36754 }
36755
36756 for (i = 0; i < 2; ++i)
36757 {
36758 if (!used[2 * i])
36759 {
36760 l[i] = NULL_RTX;
36761 continue;
36762 }
36763 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
36764 vperm = force_reg (V32QImode, vperm);
36765 l[i] = gen_reg_rtx (V32QImode);
36766 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36767 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
36768 }
36769
36770 for (i = 0; i < 2; ++i)
36771 {
36772 if (h[i] && l[i])
36773 {
36774 op = gen_reg_rtx (V32QImode);
36775 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
36776 l[i] = op;
36777 }
36778 else if (h[i])
36779 l[i] = h[i];
36780 }
36781
36782 gcc_assert (l[0] && l[1]);
36783 op = gen_lowpart (V32QImode, d->target);
36784 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
36785 return true;
36786 }
36787
36788 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
36789 With all of the interface bits taken care of, perform the expansion
36790 in D and return true on success. */
36791
36792 static bool
36793 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
36794 {
36795 /* Try a single instruction expansion. */
36796 if (expand_vec_perm_1 (d))
36797 return true;
36798
36799 /* Try sequences of two instructions. */
36800
36801 if (expand_vec_perm_pshuflw_pshufhw (d))
36802 return true;
36803
36804 if (expand_vec_perm_palignr (d))
36805 return true;
36806
36807 if (expand_vec_perm_interleave2 (d))
36808 return true;
36809
36810 if (expand_vec_perm_broadcast (d))
36811 return true;
36812
36813 if (expand_vec_perm_vpermq_perm_1 (d))
36814 return true;
36815
36816 /* Try sequences of three instructions. */
36817
36818 if (expand_vec_perm_pshufb2 (d))
36819 return true;
36820
36821 if (expand_vec_perm_interleave3 (d))
36822 return true;
36823
36824 /* Try sequences of four instructions. */
36825
36826 if (expand_vec_perm_vpshufb2_vpermq (d))
36827 return true;
36828
36829 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
36830 return true;
36831
36832 /* ??? Look for narrow permutations whose element orderings would
36833 allow the promotion to a wider mode. */
36834
36835 /* ??? Look for sequences of interleave or a wider permute that place
36836 the data into the correct lanes for a half-vector shuffle like
36837 pshuf[lh]w or vpermilps. */
36838
36839 /* ??? Look for sequences of interleave that produce the desired results.
36840 The combinatorics of punpck[lh] get pretty ugly... */
36841
36842 if (expand_vec_perm_even_odd (d))
36843 return true;
36844
36845 /* Even longer sequences. */
36846 if (expand_vec_perm_vpshufb4_vpermq2 (d))
36847 return true;
36848
36849 return false;
36850 }
36851
36852 bool
36853 ix86_expand_vec_perm_const (rtx operands[4])
36854 {
36855 struct expand_vec_perm_d d;
36856 unsigned char perm[MAX_VECT_LEN];
36857 int i, nelt, which;
36858 rtx sel;
36859
36860 d.target = operands[0];
36861 d.op0 = operands[1];
36862 d.op1 = operands[2];
36863 sel = operands[3];
36864
36865 d.vmode = GET_MODE (d.target);
36866 gcc_assert (VECTOR_MODE_P (d.vmode));
36867 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36868 d.testing_p = false;
36869
36870 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
36871 gcc_assert (XVECLEN (sel, 0) == nelt);
36872 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
36873
36874 for (i = which = 0; i < nelt; ++i)
36875 {
36876 rtx e = XVECEXP (sel, 0, i);
36877 int ei = INTVAL (e) & (2 * nelt - 1);
36878
36879 which |= (ei < nelt ? 1 : 2);
36880 d.perm[i] = ei;
36881 perm[i] = ei;
36882 }
36883
36884 switch (which)
36885 {
36886 default:
36887 gcc_unreachable();
36888
36889 case 3:
36890 if (!rtx_equal_p (d.op0, d.op1))
36891 break;
36892
36893 /* The elements of PERM reference both operands, but the two operands
36894 are identical. Allow easier matching of the permutation by folding
36895 the permutation into references to the single
36896 input vector. */
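      /* E.g. with nelt == 4, a selector of { 1, 4, 3, 6 } on identical
	 operands folds to { 1, 0, 3, 2 }.  */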
36897 for (i = 0; i < nelt; ++i)
36898 if (d.perm[i] >= nelt)
36899 d.perm[i] -= nelt;
36900 /* FALLTHRU */
36901
36902 case 1:
36903 d.op1 = d.op0;
36904 break;
36905
36906 case 2:
36907 for (i = 0; i < nelt; ++i)
36908 d.perm[i] -= nelt;
36909 d.op0 = d.op1;
36910 break;
36911 }
36912
36913 if (ix86_expand_vec_perm_const_1 (&d))
36914 return true;
36915
36916 /* If the mask says both arguments are needed, but they are the same,
36917 the above tried to expand with d.op0 == d.op1. If that didn't work,
36918 retry with d.op0 != d.op1 as that is what testing has been done with. */
36919 if (which == 3 && d.op0 == d.op1)
36920 {
36921 rtx seq;
36922 bool ok;
36923
36924 memcpy (d.perm, perm, sizeof (perm));
36925 d.op1 = gen_reg_rtx (d.vmode);
36926 start_sequence ();
36927 ok = ix86_expand_vec_perm_const_1 (&d);
36928 seq = get_insns ();
36929 end_sequence ();
36930 if (ok)
36931 {
36932 emit_move_insn (d.op1, d.op0);
36933 emit_insn (seq);
36934 return true;
36935 }
36936 }
36937
36938 return false;
36939 }
36940
36941 /* Implement targetm.vectorize.vec_perm_const_ok. */
36942
36943 static bool
36944 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
36945 const unsigned char *sel)
36946 {
36947 struct expand_vec_perm_d d;
36948 unsigned int i, nelt, which;
36949 bool ret, one_vec;
36950
36951 d.vmode = vmode;
36952 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36953 d.testing_p = true;
36954
36955 /* Given sufficient ISA support we can just return true here
36956 for selected vector modes. */
36957 if (GET_MODE_SIZE (d.vmode) == 16)
36958 {
36959 /* All implementable with a single vpperm insn. */
36960 if (TARGET_XOP)
36961 return true;
36962 /* All implementable with 2 pshufb + 1 ior. */
36963 if (TARGET_SSSE3)
36964 return true;
36965 /* All implementable with shufpd or unpck[lh]pd. */
36966 if (d.nelt == 2)
36967 return true;
36968 }
36969
36970 /* Extract the values from the vector CST into the permutation
36971 array in D. */
36972 memcpy (d.perm, sel, nelt);
36973 for (i = which = 0; i < nelt; ++i)
36974 {
36975 unsigned char e = d.perm[i];
36976 gcc_assert (e < 2 * nelt);
36977 which |= (e < nelt ? 1 : 2);
36978 }
36979
36980 /* For all elements from second vector, fold the elements to first. */
36981 if (which == 2)
36982 for (i = 0; i < nelt; ++i)
36983 d.perm[i] -= nelt;
36984
36985 /* Check whether the mask can be applied to the vector type. */
36986 one_vec = (which != 3);
36987
36988 /* Implementable with shufps or pshufd. */
36989 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
36990 return true;
36991
36992 /* Otherwise we have to go through the motions and see if we can
36993 figure out how to generate the requested permutation. */
36994 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
36995 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
36996 if (!one_vec)
36997 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
36998
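/* Any insns created during this trial expansion are not used; wrapping
the call in a sequence keeps them out of the insn stream. */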
36999 start_sequence ();
37000 ret = ix86_expand_vec_perm_const_1 (&d);
37001 end_sequence ();
37002
37003 return ret;
37004 }
37005
37006 void
37007 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37008 {
37009 struct expand_vec_perm_d d;
37010 unsigned i, nelt;
37011
37012 d.target = targ;
37013 d.op0 = op0;
37014 d.op1 = op1;
37015 d.vmode = GET_MODE (targ);
37016 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37017 d.testing_p = false;
37018
37019 for (i = 0; i < nelt; ++i)
37020 d.perm[i] = i * 2 + odd;
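/* E.g. for V4SFmode with ODD == 1 this selects elements { 1, 3, 5, 7 }
of the concatenated op0:op1. */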
37021
37022 /* We'll either be able to implement the permutation directly... */
37023 if (expand_vec_perm_1 (&d))
37024 return;
37025
37026 /* ... or we use the special-case patterns. */
37027 expand_vec_perm_even_odd_1 (&d, odd);
37028 }
37029
37030 /* Expand an insert into a vector register through pinsr insn.
37031 Return true if successful. */
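/* OPERANDS[0] is the destination vector, OPERANDS[1] the width of the
inserted field in bits, OPERANDS[2] its bit position and OPERANDS[3]
the value to insert. */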
37032
37033 bool
37034 ix86_expand_pinsr (rtx *operands)
37035 {
37036 rtx dst = operands[0];
37037 rtx src = operands[3];
37038
37039 unsigned int size = INTVAL (operands[1]);
37040 unsigned int pos = INTVAL (operands[2]);
37041
37042 if (GET_CODE (dst) == SUBREG)
37043 {
37044 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37045 dst = SUBREG_REG (dst);
37046 }
37047
37048 if (GET_CODE (src) == SUBREG)
37049 src = SUBREG_REG (src);
37050
37051 switch (GET_MODE (dst))
37052 {
37053 case V16QImode:
37054 case V8HImode:
37055 case V4SImode:
37056 case V2DImode:
37057 {
37058 enum machine_mode srcmode, dstmode;
37059 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37060
37061 srcmode = mode_for_size (size, MODE_INT, 0);
37062
37063 switch (srcmode)
37064 {
37065 case QImode:
37066 if (!TARGET_SSE4_1)
37067 return false;
37068 dstmode = V16QImode;
37069 pinsr = gen_sse4_1_pinsrb;
37070 break;
37071
37072 case HImode:
37073 if (!TARGET_SSE2)
37074 return false;
37075 dstmode = V8HImode;
37076 pinsr = gen_sse2_pinsrw;
37077 break;
37078
37079 case SImode:
37080 if (!TARGET_SSE4_1)
37081 return false;
37082 dstmode = V4SImode;
37083 pinsr = gen_sse4_1_pinsrd;
37084 break;
37085
37086 case DImode:
37087 gcc_assert (TARGET_64BIT);
37088 if (!TARGET_SSE4_1)
37089 return false;
37090 dstmode = V2DImode;
37091 pinsr = gen_sse4_1_pinsrq;
37092 break;
37093
37094 default:
37095 return false;
37096 }
37097
37098 dst = gen_lowpart (dstmode, dst);
37099 src = gen_lowpart (srcmode, src);
37100
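/* Turn the bit position into an element index; e.g. a 16-bit insert at
bit position 32 selects element 2, passed below as the mask 1 << 2. */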
37101 pos /= size;
37102
37103 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37104 return true;
37105 }
37106
37107 default:
37108 return false;
37109 }
37110 }
37111 \f
37112 /* This function returns the calling-ABI specific va_list type node:
37113 the va_list type that corresponds to FNDECL's ABI. */
37114
37115 static tree
37116 ix86_fn_abi_va_list (tree fndecl)
37117 {
37118 if (!TARGET_64BIT)
37119 return va_list_type_node;
37120 gcc_assert (fndecl != NULL_TREE);
37121
37122 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37123 return ms_va_list_type_node;
37124 else
37125 return sysv_va_list_type_node;
37126 }
37127
37128 /* Returns the canonical va_list type specified by TYPE. If there
37129 is no valid TYPE provided, it returns NULL_TREE. */
37130
37131 static tree
37132 ix86_canonical_va_list_type (tree type)
37133 {
37134 tree wtype, htype;
37135
37136 /* Resolve references and pointers to va_list type. */
37137 if (TREE_CODE (type) == MEM_REF)
37138 type = TREE_TYPE (type);
37139 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37140 type = TREE_TYPE (type);
37141 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37142 type = TREE_TYPE (type);
37143
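/* On 64-bit targets the type may correspond to any of the three builtin
va_list flavours; check __builtin_va_list first, then the SYSV and MS
variants. */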
37144 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37145 {
37146 wtype = va_list_type_node;
37147 gcc_assert (wtype != NULL_TREE);
37148 htype = type;
37149 if (TREE_CODE (wtype) == ARRAY_TYPE)
37150 {
37151 /* If va_list is an array type, the argument may have decayed
37152 to a pointer type, e.g. by being passed to another function.
37153 In that case, unwrap both types so that we can compare the
37154 underlying records. */
37155 if (TREE_CODE (htype) == ARRAY_TYPE
37156 || POINTER_TYPE_P (htype))
37157 {
37158 wtype = TREE_TYPE (wtype);
37159 htype = TREE_TYPE (htype);
37160 }
37161 }
37162 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37163 return va_list_type_node;
37164 wtype = sysv_va_list_type_node;
37165 gcc_assert (wtype != NULL_TREE);
37166 htype = type;
37167 if (TREE_CODE (wtype) == ARRAY_TYPE)
37168 {
37169 /* If va_list is an array type, the argument may have decayed
37170 to a pointer type, e.g. by being passed to another function.
37171 In that case, unwrap both types so that we can compare the
37172 underlying records. */
37173 if (TREE_CODE (htype) == ARRAY_TYPE
37174 || POINTER_TYPE_P (htype))
37175 {
37176 wtype = TREE_TYPE (wtype);
37177 htype = TREE_TYPE (htype);
37178 }
37179 }
37180 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37181 return sysv_va_list_type_node;
37182 wtype = ms_va_list_type_node;
37183 gcc_assert (wtype != NULL_TREE);
37184 htype = type;
37185 if (TREE_CODE (wtype) == ARRAY_TYPE)
37186 {
37187 /* If va_list is an array type, the argument may have decayed
37188 to a pointer type, e.g. by being passed to another function.
37189 In that case, unwrap both types so that we can compare the
37190 underlying records. */
37191 if (TREE_CODE (htype) == ARRAY_TYPE
37192 || POINTER_TYPE_P (htype))
37193 {
37194 wtype = TREE_TYPE (wtype);
37195 htype = TREE_TYPE (htype);
37196 }
37197 }
37198 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37199 return ms_va_list_type_node;
37200 return NULL_TREE;
37201 }
37202 return std_canonical_va_list_type (type);
37203 }
37204
37205 /* Iterate through the target-specific builtin types for va_list.
37206 IDX denotes the iterator, *PTREE is set to the result type of
37207 the va_list builtin, and *PNAME to its internal type.
37208 Returns zero if there is no element for this index, otherwise
37209 IDX should be increased upon the next call.
37210 Note, do not iterate a base builtin's name like __builtin_va_list.
37211 Used from c_common_nodes_and_builtins. */
37212
37213 static int
37214 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37215 {
37216 if (TARGET_64BIT)
37217 {
37218 switch (idx)
37219 {
37220 default:
37221 break;
37222
37223 case 0:
37224 *ptree = ms_va_list_type_node;
37225 *pname = "__builtin_ms_va_list";
37226 return 1;
37227
37228 case 1:
37229 *ptree = sysv_va_list_type_node;
37230 *pname = "__builtin_sysv_va_list";
37231 return 1;
37232 }
37233 }
37234
37235 return 0;
37236 }
37237
37238 #undef TARGET_SCHED_DISPATCH
37239 #define TARGET_SCHED_DISPATCH has_dispatch
37240 #undef TARGET_SCHED_DISPATCH_DO
37241 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37242 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37243 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37244
37245 /* The size of the dispatch window is the total number of bytes of
37246 object code allowed in a window. */
37247 #define DISPATCH_WINDOW_SIZE 16
37248
37249 /* Number of dispatch windows considered for scheduling. */
37250 #define MAX_DISPATCH_WINDOWS 3
37251
37252 /* Maximum number of instructions in a window. */
37253 #define MAX_INSN 4
37254
37255 /* Maximum number of immediate operands in a window. */
37256 #define MAX_IMM 4
37257
37258 /* Maximum number of immediate bits allowed in a window. */
37259 #define MAX_IMM_SIZE 128
37260
37261 /* Maximum number of 32 bit immediates allowed in a window. */
37262 #define MAX_IMM_32 4
37263
37264 /* Maximum number of 64 bit immediates allowed in a window. */
37265 #define MAX_IMM_64 2
37266
37267 /* Maximum total of loads or prefetches allowed in a window. */
37268 #define MAX_LOAD 2
37269
37270 /* Maximum total of stores allowed in a window. */
37271 #define MAX_STORE 1
37272
37273 #undef BIG
37274 #define BIG 100
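/* BIG serves as an effectively unlimited count: count_num_restricted
returns it when an insn cannot be placed in the current window, and
num_allowable_groups uses it for groups with no per-window limit. */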
37275
37276
37277 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
37278 enum dispatch_group {
37279 disp_no_group = 0,
37280 disp_load,
37281 disp_store,
37282 disp_load_store,
37283 disp_prefetch,
37284 disp_imm,
37285 disp_imm_32,
37286 disp_imm_64,
37287 disp_branch,
37288 disp_cmp,
37289 disp_jcc,
37290 disp_last
37291 };
37292
37293 /* Number of allowable groups in a dispatch window. It is an array
37294 indexed by the dispatch_group enum. BIG (100) is used as an
37295 effectively unlimited count for groups whose operation count does
37296 not by itself constrain a dispatch window, but which still need
37297 an entry in the table. */
37298 static unsigned int num_allowable_groups[disp_last] = {
37299 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37300 };
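/* In enum order: no_group 0, load 2, store 1, load_store 1, prefetch 2,
imm 4, imm_32 4, imm_64 2, branch 1, cmp BIG, jcc BIG. */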
37301
37302 char group_name[disp_last + 1][16] = {
37303 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37304 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37305 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37306 };
37307
37308 /* Instruction path. */
37309 enum insn_path {
37310 no_path = 0,
37311 path_single, /* Single micro op. */
37312 path_double, /* Double micro op. */
37313 path_multi, /* Instructions with more than 2 micro ops. */
37314 last_path
37315 };
37316
37317 /* sched_insn_info records, for one instruction scheduled into a
37318 dispatch window, the insn itself together with its dispatch group,
37319 decode path, byte length and immediate size.
37320
37321 Windows are allocated for each basic block and are linked
37322 together. */
37323 typedef struct sched_insn_info_s {
37324 rtx insn;
37325 enum dispatch_group group;
37326 enum insn_path path;
37327 int byte_len;
37328 int imm_bytes;
37329 } sched_insn_info;
37330
37331 /* Linked list of dispatch windows. This is a doubly-linked list of
37332 the dispatch windows of a basic block. It records the number of
37333 uops in the window and the total number of instructions and of
37334 bytes of object code for this dispatch
37335 window. */
37336 typedef struct dispatch_windows_s {
37337 int num_insn; /* Number of insns in the window. */
37338 int num_uops; /* Number of uops in the window. */
37339 int window_size; /* Number of bytes in the window. */
37340 int window_num; /* Window number, either 0 or 1. */
37341 int num_imm; /* Number of immediate operands in the window. */
37342 int num_imm_32; /* Number of 32 bit immediates in the window. */
37343 int num_imm_64; /* Number of 64 bit immediates in the window. */
37344 int imm_size; /* Total bytes of immediates in the window. */
37345 int num_loads; /* Total memory loads in the window. */
37346 int num_stores; /* Total memory stores in the window. */
37347 int violation; /* Violation exists in window. */
37348 sched_insn_info *window; /* Pointer to the window. */
37349 struct dispatch_windows_s *next;
37350 struct dispatch_windows_s *prev;
37351 } dispatch_windows;
37352
37353 /* Immediate values used in an insn. */
37354 typedef struct imm_info_s
37355 {
37356 int imm;
37357 int imm32;
37358 int imm64;
37359 } imm_info;
37360
37361 static dispatch_windows *dispatch_window_list;
37362 static dispatch_windows *dispatch_window_list1;
37363
37364 /* Get dispatch group of insn. */
37365
37366 static enum dispatch_group
37367 get_mem_group (rtx insn)
37368 {
37369 enum attr_memory memory;
37370
37371 if (INSN_CODE (insn) < 0)
37372 return disp_no_group;
37373 memory = get_attr_memory (insn);
37374 if (memory == MEMORY_STORE)
37375 return disp_store;
37376
37377 if (memory == MEMORY_LOAD)
37378 return disp_load;
37379
37380 if (memory == MEMORY_BOTH)
37381 return disp_load_store;
37382
37383 return disp_no_group;
37384 }
37385
37386 /* Return true if insn is a compare instruction. */
37387
37388 static bool
37389 is_cmp (rtx insn)
37390 {
37391 enum attr_type type;
37392
37393 type = get_attr_type (insn);
37394 return (type == TYPE_TEST
37395 || type == TYPE_ICMP
37396 || type == TYPE_FCMP
37397 || GET_CODE (PATTERN (insn)) == COMPARE);
37398 }
37399
37400 /* Return true if a dispatch violation was encountered. */
37401
37402 static bool
37403 dispatch_violation (void)
37404 {
37405 if (dispatch_window_list->next)
37406 return dispatch_window_list->next->violation;
37407 return dispatch_window_list->violation;
37408 }
37409
37410 /* Return true if insn is a branch instruction. */
37411
37412 static bool
37413 is_branch (rtx insn)
37414 {
37415 return (CALL_P (insn) || JUMP_P (insn));
37416 }
37417
37418 /* Return true if insn is a prefetch instruction. */
37419
37420 static bool
37421 is_prefetch (rtx insn)
37422 {
37423 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37424 }
37425
37426 /* This function initializes a dispatch window and the list container holding a
37427 pointer to the window. */
37428
37429 static void
37430 init_window (int window_num)
37431 {
37432 int i;
37433 dispatch_windows *new_list;
37434
37435 if (window_num == 0)
37436 new_list = dispatch_window_list;
37437 else
37438 new_list = dispatch_window_list1;
37439
37440 new_list->num_insn = 0;
37441 new_list->num_uops = 0;
37442 new_list->window_size = 0;
37443 new_list->next = NULL;
37444 new_list->prev = NULL;
37445 new_list->window_num = window_num;
37446 new_list->num_imm = 0;
37447 new_list->num_imm_32 = 0;
37448 new_list->num_imm_64 = 0;
37449 new_list->imm_size = 0;
37450 new_list->num_loads = 0;
37451 new_list->num_stores = 0;
37452 new_list->violation = false;
37453
37454 for (i = 0; i < MAX_INSN; i++)
37455 {
37456 new_list->window[i].insn = NULL;
37457 new_list->window[i].group = disp_no_group;
37458 new_list->window[i].path = no_path;
37459 new_list->window[i].byte_len = 0;
37460 new_list->window[i].imm_bytes = 0;
37461 }
37462 return;
37463 }
37464
37465 /* This function allocates and initializes a dispatch window and the
37466 list container holding a pointer to the window. */
37467
37468 static dispatch_windows *
37469 allocate_window (void)
37470 {
37471 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37472 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37473
37474 return new_list;
37475 }
37476
37477 /* This routine initializes the dispatch scheduling information. It
37478 initiates building dispatch scheduler tables and constructs the
37479 first dispatch window. */
37480
37481 static void
37482 init_dispatch_sched (void)
37483 {
37484 /* Allocate a dispatch list and a window. */
37485 dispatch_window_list = allocate_window ();
37486 dispatch_window_list1 = allocate_window ();
37487 init_window (0);
37488 init_window (1);
37489 }
37490
37491 /* This function returns true if GROUP marks the end of a basic block.
37492 The end of a basic block does not have to be a branch, but here we
37493 assume only branches end a window. */
37494
37495 static bool
37496 is_end_basic_block (enum dispatch_group group)
37497 {
37498 return group == disp_branch;
37499 }
37500
37501 /* This function is called when the end of window processing is reached. */
37502
37503 static void
37504 process_end_window (void)
37505 {
37506 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37507 if (dispatch_window_list->next)
37508 {
37509 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37510 gcc_assert (dispatch_window_list->window_size
37511 + dispatch_window_list1->window_size <= 48);
37512 init_window (1);
37513 }
37514 init_window (0);
37515 }
37516
37517 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37518 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37519 for 48 bytes of instructions. Note that these windows are not the
37520 DISPATCH_WINDOW_SIZE-byte dispatch windows. */
37521
37522 static dispatch_windows *
37523 allocate_next_window (int window_num)
37524 {
37525 if (window_num == 0)
37526 {
37527 if (dispatch_window_list->next)
37528 init_window (1);
37529 init_window (0);
37530 return dispatch_window_list;
37531 }
37532
37533 dispatch_window_list->next = dispatch_window_list1;
37534 dispatch_window_list1->prev = dispatch_window_list;
37535
37536 return dispatch_window_list1;
37537 }
37538
37539 /* for_each_rtx callback: classify *IN_RTX and increment the matching immediate counters in IMM_VALUES. */
37540
37541 static int
37542 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37543 {
37544 if (*in_rtx == 0)
37545 return 0;
37546
37547 switch (GET_CODE (*in_rtx))
37548 {
37549 case CONST:
37550 case SYMBOL_REF:
37551 case CONST_INT:
37552 (imm_values->imm)++;
37553 if (x86_64_immediate_operand (*in_rtx, SImode))
37554 (imm_values->imm32)++;
37555 else
37556 (imm_values->imm64)++;
37557 break;
37558
37559 case CONST_DOUBLE:
37560 (imm_values->imm)++;
37561 (imm_values->imm64)++;
37562 break;
37563
37564 case CODE_LABEL:
37565 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37566 {
37567 (imm_values->imm)++;
37568 (imm_values->imm32)++;
37569 }
37570 break;
37571
37572 default:
37573 break;
37574 }
37575
37576 return 0;
37577 }
37578
37579 /* Compute number of immediate operands of an instruction. */
37580
37581 static void
37582 find_constant (rtx in_rtx, imm_info *imm_values)
37583 {
37584 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37585 (rtx_function) find_constant_1, (void *) imm_values);
37586 }
37587
37588 /* Return total size of immediate operands of an instruction along with number
37589 of corresponding immediate operands. It initializes its counters to zero
37590 before calling FIND_CONSTANT.
37591 INSN is the input instruction. IMM is the total of immediates.
37592 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37593 bit immediates. */
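/* For example, an insn carrying one 32-bit and one 64-bit immediate
yields *IMM == 2, *IMM32 == 1, *IMM64 == 1 and a return value of
12 bytes. */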
37594
37595 static int
37596 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37597 {
37598 imm_info imm_values = {0, 0, 0};
37599
37600 find_constant (insn, &imm_values);
37601 *imm = imm_values.imm;
37602 *imm32 = imm_values.imm32;
37603 *imm64 = imm_values.imm64;
37604 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37605 }
37606
37607 /* This function indicates whether INSN has any immediate
37608 operands. */
37609
37610 static bool
37611 has_immediate (rtx insn)
37612 {
37613 int num_imm_operand;
37614 int num_imm32_operand;
37615 int num_imm64_operand;
37616
37617 if (insn)
37618 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37619 &num_imm64_operand);
37620 return false;
37621 }
37622
37623 /* Return the decode path (single, double or multi uop) of INSN. */
37624
37625 static enum insn_path
37626 get_insn_path (rtx insn)
37627 {
37628 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37629
37630 if ((int)path == 0)
37631 return path_single;
37632
37633 if ((int)path == 1)
37634 return path_double;
37635
37636 return path_multi;
37637 }
37638
37639 /* Return insn dispatch group. */
37640
37641 static enum dispatch_group
37642 get_insn_group (rtx insn)
37643 {
37644 enum dispatch_group group = get_mem_group (insn);
37645 if (group)
37646 return group;
37647
37648 if (is_branch (insn))
37649 return disp_branch;
37650
37651 if (is_cmp (insn))
37652 return disp_cmp;
37653
37654 if (has_immediate (insn))
37655 return disp_imm;
37656
37657 if (is_prefetch (insn))
37658 return disp_prefetch;
37659
37660 return disp_no_group;
37661 }
37662
37663 /* Count number of GROUP restricted instructions in a dispatch
37664 window WINDOW_LIST. */
37665
37666 static int
37667 count_num_restricted (rtx insn, dispatch_windows *window_list)
37668 {
37669 enum dispatch_group group = get_insn_group (insn);
37670 int imm_size;
37671 int num_imm_operand;
37672 int num_imm32_operand;
37673 int num_imm64_operand;
37674
37675 if (group == disp_no_group)
37676 return 0;
37677
37678 if (group == disp_imm)
37679 {
37680 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37681 &num_imm64_operand);
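/* Adding this insn's immediates must not exceed any per-window limit:
total immediate bytes, total immediate count, or the 32-bit/64-bit
immediate slots (a 64-bit immediate consumes two 32-bit slots). */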
37682 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
37683 || num_imm_operand + window_list->num_imm > MAX_IMM
37684 || (num_imm32_operand > 0
37685 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
37686 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
37687 || (num_imm64_operand > 0
37688 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
37689 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
37690 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
37691 && num_imm64_operand > 0
37692 && ((window_list->num_imm_64 > 0
37693 && window_list->num_insn >= 2)
37694 || window_list->num_insn >= 3)))
37695 return BIG;
37696
37697 return 1;
37698 }
37699
37700 if ((group == disp_load_store
37701 && (window_list->num_loads >= MAX_LOAD
37702 || window_list->num_stores >= MAX_STORE))
37703 || ((group == disp_load
37704 || group == disp_prefetch)
37705 && window_list->num_loads >= MAX_LOAD)
37706 || (group == disp_store
37707 && window_list->num_stores >= MAX_STORE))
37708 return BIG;
37709
37710 return 1;
37711 }
37712
37713 /* This function returns true if insn satisfies dispatch rules on the
37714 last window scheduled. */
37715
37716 static bool
37717 fits_dispatch_window (rtx insn)
37718 {
37719 dispatch_windows *window_list = dispatch_window_list;
37720 dispatch_windows *window_list_next = dispatch_window_list->next;
37721 unsigned int num_restrict;
37722 enum dispatch_group group = get_insn_group (insn);
37723 enum insn_path path = get_insn_path (insn);
37724 int sum;
37725
37726 /* Make disp_cmp and disp_jcc get scheduled last. These
37727 instructions should be given the lowest priority in the
37728 scheduling process in the Haifa scheduler to make sure they will be
37729 scheduled in the same dispatch window as the reference to them. */
37730 if (group == disp_jcc || group == disp_cmp)
37731 return false;
37732
37733 /* Check nonrestricted. */
37734 if (group == disp_no_group || group == disp_branch)
37735 return true;
37736
37737 /* Get last dispatch window. */
37738 if (window_list_next)
37739 window_list = window_list_next;
37740
37741 if (window_list->window_num == 1)
37742 {
37743 sum = window_list->prev->window_size + window_list->window_size;
37744
37745 if (sum == 32
37746 || (min_insn_size (insn) + sum) >= 48)
37747 /* Window 1 is full. Go for next window. */
37748 return true;
37749 }
37750
37751 num_restrict = count_num_restricted (insn, window_list);
37752
37753 if (num_restrict > num_allowable_groups[group])
37754 return false;
37755
37756 /* See if it fits in the first window. */
37757 if (window_list->window_num == 0)
37758 {
37759 /* The first window should have only single- and double-path
37760 uops. */
37761 if (path == path_double
37762 && (window_list->num_uops + 2) > MAX_INSN)
37763 return false;
37764 else if (path != path_single)
37765 return false;
37766 }
37767 return true;
37768 }
37769
37770 /* Add an instruction INSN with NUM_UOPS micro-operations to the
37771 dispatch window WINDOW_LIST. */
37772
37773 static void
37774 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
37775 {
37776 int byte_len = min_insn_size (insn);
37777 int num_insn = window_list->num_insn;
37778 int imm_size;
37779 sched_insn_info *window = window_list->window;
37780 enum dispatch_group group = get_insn_group (insn);
37781 enum insn_path path = get_insn_path (insn);
37782 int num_imm_operand;
37783 int num_imm32_operand;
37784 int num_imm64_operand;
37785
37786 if (!window_list->violation && group != disp_cmp
37787 && !fits_dispatch_window (insn))
37788 window_list->violation = true;
37789
37790 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37791 &num_imm64_operand);
37792
37793 /* Initialize window with new instruction. */
37794 window[num_insn].insn = insn;
37795 window[num_insn].byte_len = byte_len;
37796 window[num_insn].group = group;
37797 window[num_insn].path = path;
37798 window[num_insn].imm_bytes = imm_size;
37799
37800 window_list->window_size += byte_len;
37801 window_list->num_insn = num_insn + 1;
37802 window_list->num_uops = window_list->num_uops + num_uops;
37803 window_list->imm_size += imm_size;
37804 window_list->num_imm += num_imm_operand;
37805 window_list->num_imm_32 += num_imm32_operand;
37806 window_list->num_imm_64 += num_imm64_operand;
37807
37808 if (group == disp_store)
37809 window_list->num_stores += 1;
37810 else if (group == disp_load
37811 || group == disp_prefetch)
37812 window_list->num_loads += 1;
37813 else if (group == disp_load_store)
37814 {
37815 window_list->num_stores += 1;
37816 window_list->num_loads += 1;
37817 }
37818 }
37819
37820 /* Adds a scheduled instruction, INSN, to the current dispatch window.
37821 If the total bytes of instructions or the number of instructions in
37822 the window exceed the allowed limits, it allocates a new window. */
37823
37824 static void
37825 add_to_dispatch_window (rtx insn)
37826 {
37827 int byte_len;
37828 dispatch_windows *window_list;
37829 dispatch_windows *next_list;
37830 dispatch_windows *window0_list;
37831 enum insn_path path;
37832 enum dispatch_group insn_group;
37833 bool insn_fits;
37834 int num_insn;
37835 int num_uops;
37836 int window_num;
37837 int insn_num_uops;
37838 int sum;
37839
37840 if (INSN_CODE (insn) < 0)
37841 return;
37842
37843 byte_len = min_insn_size (insn);
37844 window_list = dispatch_window_list;
37845 next_list = window_list->next;
37846 path = get_insn_path (insn);
37847 insn_group = get_insn_group (insn);
37848
37849 /* Get the last dispatch window. */
37850 if (next_list)
37851 window_list = dispatch_window_list->next;
37852
37853 if (path == path_single)
37854 insn_num_uops = 1;
37855 else if (path == path_double)
37856 insn_num_uops = 2;
37857 else
37858 insn_num_uops = (int) path;
37859
37860 /* If the current window is full, get a new window.
37861 Window zero is full if MAX_INSN uops have been scheduled in it.
37862 Window one is full if window zero's bytes plus window one's
37863 bytes total 32, if adding the new instruction's bytes would push
37864 the total past 48, or if it already holds MAX_INSN
37865 instructions. */
37866 num_insn = window_list->num_insn;
37867 num_uops = window_list->num_uops;
37868 window_num = window_list->window_num;
37869 insn_fits = fits_dispatch_window (insn);
37870
37871 if (num_insn >= MAX_INSN
37872 || num_uops + insn_num_uops > MAX_INSN
37873 || !(insn_fits))
37874 {
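/* The current window is full or the insn does not fit; switch to the
other window (toggle between 0 and 1). */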
37875 window_num = ~window_num & 1;
37876 window_list = allocate_next_window (window_num);
37877 }
37878
37879 if (window_num == 0)
37880 {
37881 add_insn_window (insn, window_list, insn_num_uops);
37882 if (window_list->num_insn >= MAX_INSN
37883 && insn_group == disp_branch)
37884 {
37885 process_end_window ();
37886 return;
37887 }
37888 }
37889 else if (window_num == 1)
37890 {
37891 window0_list = window_list->prev;
37892 sum = window0_list->window_size + window_list->window_size;
37893 if (sum == 32
37894 || (byte_len + sum) >= 48)
37895 {
37896 process_end_window ();
37897 window_list = dispatch_window_list;
37898 }
37899
37900 add_insn_window (insn, window_list, insn_num_uops);
37901 }
37902 else
37903 gcc_unreachable ();
37904
37905 if (is_end_basic_block (insn_group))
37906 {
37907 /* End of basic block reached; do end-of-basic-block processing. */
37908 process_end_window ();
37909 return;
37910 }
37911 }
37912
37913 /* Print the dispatch window, WINDOW_NUM, to FILE. */
37914
37915 DEBUG_FUNCTION static void
37916 debug_dispatch_window_file (FILE *file, int window_num)
37917 {
37918 dispatch_windows *list;
37919 int i;
37920
37921 if (window_num == 0)
37922 list = dispatch_window_list;
37923 else
37924 list = dispatch_window_list1;
37925
37926 fprintf (file, "Window #%d:\n", list->window_num);
37927 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
37928 list->num_insn, list->num_uops, list->window_size);
37929 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37930 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
37931
37932 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
37933 list->num_stores);
37934 fprintf (file, " insn info:\n");
37935
37936 for (i = 0; i < MAX_INSN; i++)
37937 {
37938 if (!list->window[i].insn)
37939 break;
37940 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
37941 i, group_name[list->window[i].group],
37942 i, (void *)list->window[i].insn,
37943 i, list->window[i].path,
37944 i, list->window[i].byte_len,
37945 i, list->window[i].imm_bytes);
37946 }
37947 }
37948
37949 /* Print to stdout a dispatch window. */
37950
37951 DEBUG_FUNCTION void
37952 debug_dispatch_window (int window_num)
37953 {
37954 debug_dispatch_window_file (stdout, window_num);
37955 }
37956
37957 /* Print INSN dispatch information to FILE. */
37958
37959 DEBUG_FUNCTION static void
37960 debug_insn_dispatch_info_file (FILE *file, rtx insn)
37961 {
37962 int byte_len;
37963 enum insn_path path;
37964 enum dispatch_group group;
37965 int imm_size;
37966 int num_imm_operand;
37967 int num_imm32_operand;
37968 int num_imm64_operand;
37969
37970 if (INSN_CODE (insn) < 0)
37971 return;
37972
37973 byte_len = min_insn_size (insn);
37974 path = get_insn_path (insn);
37975 group = get_insn_group (insn);
37976 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37977 &num_imm64_operand);
37978
37979 fprintf (file, " insn info:\n");
37980 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
37981 group_name[group], path, byte_len);
37982 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37983 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
37984 }
37985
37986 /* Print to STDOUT the status of the ready list with respect to
37987 dispatch windows. */
37988
37989 DEBUG_FUNCTION void
37990 debug_ready_dispatch (void)
37991 {
37992 int i;
37993 int no_ready = number_in_ready ();
37994
37995 fprintf (stdout, "Number of ready: %d\n", no_ready);
37996
37997 for (i = 0; i < no_ready; i++)
37998 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
37999 }
38000
38001 /* This routine is the driver of the dispatch scheduler. */
38002
38003 static void
38004 do_dispatch (rtx insn, int mode)
38005 {
38006 if (mode == DISPATCH_INIT)
38007 init_dispatch_sched ();
38008 else if (mode == ADD_TO_DISPATCH_WINDOW)
38009 add_to_dispatch_window (insn);
38010 }
38011
38012 /* Return TRUE if Dispatch Scheduling is supported. */
38013
38014 static bool
38015 has_dispatch (rtx insn, int action)
38016 {
38017 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38018 && flag_dispatch_scheduler)
38019 switch (action)
38020 {
38021 default:
38022 return false;
38023
38024 case IS_DISPATCH_ON:
38025 return true;
38026 break;
38027
38028 case IS_CMP:
38029 return is_cmp (insn);
38030
38031 case DISPATCH_VIOLATION:
38032 return dispatch_violation ();
38033
38034 case FITS_DISPATCH_WINDOW:
38035 return fits_dispatch_window (insn);
38036 }
38037
38038 return false;
38039 }
38040
38041 /* Implementation of reassociation_width target hook used by
38042 reassoc phase to identify the parallelism level in a reassociated
38043 tree. The statement's tree_code is passed in OPC. The argument
38044 type is passed in MODE.
38045
38046 Currently parallel reassociation is enabled for Atom
38047 processors only and we set reassociation width to be 2
38048 because Atom may issue up to 2 instructions per cycle.
38049
38050 Return value should be fixed if parallel reassociation is
38051 enabled for other processors. */
38052
38053 static int
38054 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38055 enum machine_mode mode)
38056 {
38057 int res = 1;
38058
38059 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38060 res = 2;
38061 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38062 res = 2;
38063
38064 return res;
38065 }
38066
38067 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38068 place emms and femms instructions. */
38069
38070 static enum machine_mode
38071 ix86_preferred_simd_mode (enum machine_mode mode)
38072 {
38073 if (!TARGET_SSE)
38074 return word_mode;
38075
38076 switch (mode)
38077 {
38078 case QImode:
38079 return TARGET_AVX2 ? V32QImode : V16QImode;
38080 case HImode:
38081 return TARGET_AVX2 ? V16HImode : V8HImode;
38082 case SImode:
38083 return TARGET_AVX2 ? V8SImode : V4SImode;
38084 case DImode:
38085 return TARGET_AVX2 ? V4DImode : V2DImode;
38086
38087 case SFmode:
38088 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38089 return V8SFmode;
38090 else
38091 return V4SFmode;
38092
38093 case DFmode:
38094 if (!TARGET_VECTORIZE_DOUBLE)
38095 return word_mode;
38096 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38097 return V4DFmode;
38098 else if (TARGET_SSE2)
38099 return V2DFmode;
38100 /* FALLTHRU */
38101
38102 default:
38103 return word_mode;
38104 }
38105 }
38106
38107 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38108 vectors. */
38109
38110 static unsigned int
38111 ix86_autovectorize_vector_sizes (void)
38112 {
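/* The return value is a bitmask of vector sizes in bytes to additionally
try; zero means only the preferred SIMD mode is used. */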
38113 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38114 }
38115
38116 /* Initialize the GCC target structure. */
38117 #undef TARGET_RETURN_IN_MEMORY
38118 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38119
38120 #undef TARGET_LEGITIMIZE_ADDRESS
38121 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38122
38123 #undef TARGET_ATTRIBUTE_TABLE
38124 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38125 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38126 # undef TARGET_MERGE_DECL_ATTRIBUTES
38127 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38128 #endif
38129
38130 #undef TARGET_COMP_TYPE_ATTRIBUTES
38131 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38132
38133 #undef TARGET_INIT_BUILTINS
38134 #define TARGET_INIT_BUILTINS ix86_init_builtins
38135 #undef TARGET_BUILTIN_DECL
38136 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38137 #undef TARGET_EXPAND_BUILTIN
38138 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38139
38140 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38141 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38142 ix86_builtin_vectorized_function
38143
38144 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38145 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38146
38147 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38148 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38149
38150 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38151 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38152
38153 #undef TARGET_BUILTIN_RECIPROCAL
38154 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38155
38156 #undef TARGET_ASM_FUNCTION_EPILOGUE
38157 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38158
38159 #undef TARGET_ENCODE_SECTION_INFO
38160 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38161 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38162 #else
38163 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38164 #endif
38165
38166 #undef TARGET_ASM_OPEN_PAREN
38167 #define TARGET_ASM_OPEN_PAREN ""
38168 #undef TARGET_ASM_CLOSE_PAREN
38169 #define TARGET_ASM_CLOSE_PAREN ""
38170
38171 #undef TARGET_ASM_BYTE_OP
38172 #define TARGET_ASM_BYTE_OP ASM_BYTE
38173
38174 #undef TARGET_ASM_ALIGNED_HI_OP
38175 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38176 #undef TARGET_ASM_ALIGNED_SI_OP
38177 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38178 #ifdef ASM_QUAD
38179 #undef TARGET_ASM_ALIGNED_DI_OP
38180 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38181 #endif
38182
38183 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38184 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38185
38186 #undef TARGET_ASM_UNALIGNED_HI_OP
38187 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38188 #undef TARGET_ASM_UNALIGNED_SI_OP
38189 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38190 #undef TARGET_ASM_UNALIGNED_DI_OP
38191 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38192
38193 #undef TARGET_PRINT_OPERAND
38194 #define TARGET_PRINT_OPERAND ix86_print_operand
38195 #undef TARGET_PRINT_OPERAND_ADDRESS
38196 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38197 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38198 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38199 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38200 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38201
38202 #undef TARGET_SCHED_INIT_GLOBAL
38203 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38204 #undef TARGET_SCHED_ADJUST_COST
38205 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38206 #undef TARGET_SCHED_ISSUE_RATE
38207 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38208 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38209 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38210 ia32_multipass_dfa_lookahead
38211
38212 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38213 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38214
38215 #ifdef HAVE_AS_TLS
38216 #undef TARGET_HAVE_TLS
38217 #define TARGET_HAVE_TLS true
38218 #endif
38219 #undef TARGET_CANNOT_FORCE_CONST_MEM
38220 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38221 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38222 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38223
38224 #undef TARGET_DELEGITIMIZE_ADDRESS
38225 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38226
38227 #undef TARGET_MS_BITFIELD_LAYOUT_P
38228 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38229
38230 #if TARGET_MACHO
38231 #undef TARGET_BINDS_LOCAL_P
38232 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38233 #endif
38234 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38235 #undef TARGET_BINDS_LOCAL_P
38236 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38237 #endif
38238
38239 #undef TARGET_ASM_OUTPUT_MI_THUNK
38240 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38241 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38242 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38243
38244 #undef TARGET_ASM_FILE_START
38245 #define TARGET_ASM_FILE_START x86_file_start
38246
38247 #undef TARGET_OPTION_OVERRIDE
38248 #define TARGET_OPTION_OVERRIDE ix86_option_override
38249
38250 #undef TARGET_REGISTER_MOVE_COST
38251 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38252 #undef TARGET_MEMORY_MOVE_COST
38253 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38254 #undef TARGET_RTX_COSTS
38255 #define TARGET_RTX_COSTS ix86_rtx_costs
38256 #undef TARGET_ADDRESS_COST
38257 #define TARGET_ADDRESS_COST ix86_address_cost
38258
38259 #undef TARGET_FIXED_CONDITION_CODE_REGS
38260 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38261 #undef TARGET_CC_MODES_COMPATIBLE
38262 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38263
38264 #undef TARGET_MACHINE_DEPENDENT_REORG
38265 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38266
38267 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38268 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38269
38270 #undef TARGET_BUILD_BUILTIN_VA_LIST
38271 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38272
38273 #undef TARGET_ENUM_VA_LIST_P
38274 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38275
38276 #undef TARGET_FN_ABI_VA_LIST
38277 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38278
38279 #undef TARGET_CANONICAL_VA_LIST_TYPE
38280 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38281
38282 #undef TARGET_EXPAND_BUILTIN_VA_START
38283 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38284
38285 #undef TARGET_MD_ASM_CLOBBERS
38286 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38287
38288 #undef TARGET_PROMOTE_PROTOTYPES
38289 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38290 #undef TARGET_STRUCT_VALUE_RTX
38291 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38292 #undef TARGET_SETUP_INCOMING_VARARGS
38293 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38294 #undef TARGET_MUST_PASS_IN_STACK
38295 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38296 #undef TARGET_FUNCTION_ARG_ADVANCE
38297 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38298 #undef TARGET_FUNCTION_ARG
38299 #define TARGET_FUNCTION_ARG ix86_function_arg
38300 #undef TARGET_FUNCTION_ARG_BOUNDARY
38301 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38302 #undef TARGET_PASS_BY_REFERENCE
38303 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38304 #undef TARGET_INTERNAL_ARG_POINTER
38305 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38306 #undef TARGET_UPDATE_STACK_BOUNDARY
38307 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38308 #undef TARGET_GET_DRAP_RTX
38309 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38310 #undef TARGET_STRICT_ARGUMENT_NAMING
38311 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38312 #undef TARGET_STATIC_CHAIN
38313 #define TARGET_STATIC_CHAIN ix86_static_chain
38314 #undef TARGET_TRAMPOLINE_INIT
38315 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38316 #undef TARGET_RETURN_POPS_ARGS
38317 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38318
38319 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38320 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38321
38322 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38323 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38324
38325 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38326 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38327
38328 #undef TARGET_C_MODE_FOR_SUFFIX
38329 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38330
38331 #ifdef HAVE_AS_TLS
38332 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38333 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38334 #endif
38335
38336 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38337 #undef TARGET_INSERT_ATTRIBUTES
38338 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38339 #endif
38340
38341 #undef TARGET_MANGLE_TYPE
38342 #define TARGET_MANGLE_TYPE ix86_mangle_type
38343
38344 #ifndef TARGET_MACHO
38345 #undef TARGET_STACK_PROTECT_FAIL
38346 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38347 #endif
38348
38349 #undef TARGET_FUNCTION_VALUE
38350 #define TARGET_FUNCTION_VALUE ix86_function_value
38351
38352 #undef TARGET_FUNCTION_VALUE_REGNO_P
38353 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38354
38355 #undef TARGET_PROMOTE_FUNCTION_MODE
38356 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38357
38358 #undef TARGET_SECONDARY_RELOAD
38359 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38360
38361 #undef TARGET_CLASS_MAX_NREGS
38362 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38363
38364 #undef TARGET_PREFERRED_RELOAD_CLASS
38365 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38366 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38367 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38368 #undef TARGET_CLASS_LIKELY_SPILLED_P
38369 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38370
38371 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38372 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38373 ix86_builtin_vectorization_cost
38374 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38375 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38376 ix86_vectorize_vec_perm_const_ok
38377 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38378 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38379 ix86_preferred_simd_mode
38380 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38381 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38382 ix86_autovectorize_vector_sizes
38383
38384 #undef TARGET_SET_CURRENT_FUNCTION
38385 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38386
38387 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38388 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38389
38390 #undef TARGET_OPTION_SAVE
38391 #define TARGET_OPTION_SAVE ix86_function_specific_save
38392
38393 #undef TARGET_OPTION_RESTORE
38394 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38395
38396 #undef TARGET_OPTION_PRINT
38397 #define TARGET_OPTION_PRINT ix86_function_specific_print
38398
38399 #undef TARGET_CAN_INLINE_P
38400 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38401
38402 #undef TARGET_EXPAND_TO_RTL_HOOK
38403 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38404
38405 #undef TARGET_LEGITIMATE_ADDRESS_P
38406 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38407
38408 #undef TARGET_LEGITIMATE_CONSTANT_P
38409 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38410
38411 #undef TARGET_FRAME_POINTER_REQUIRED
38412 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38413
38414 #undef TARGET_CAN_ELIMINATE
38415 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38416
38417 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38418 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38419
38420 #undef TARGET_ASM_CODE_END
38421 #define TARGET_ASM_CODE_END ix86_code_end
38422
38423 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38424 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38425
38426 #if TARGET_MACHO
38427 #undef TARGET_INIT_LIBFUNCS
38428 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38429 #endif
38430
38431 struct gcc_target targetm = TARGET_INITIALIZER;
38432 \f
38433 #include "gt-i386.h"