1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65
66 enum upper_128bits_state
67 {
68 unknown = 0,
69 unused,
70 used
71 };
72
73 typedef struct block_info_def
74 {
75 /* State of the upper 128bits of AVX registers at exit. */
76 enum upper_128bits_state state;
77 /* TRUE if state of the upper 128bits of AVX registers is unchanged
78 in this block. */
79 bool unchanged;
80 /* TRUE if block has been processed. */
81 bool processed;
82 /* TRUE if block has been scanned. */
83 bool scanned;
84 /* Previous state of the upper 128bits of AVX registers at entry. */
85 enum upper_128bits_state prev;
86 } *block_info;
87
88 #define BLOCK_INFO(B) ((block_info) (B)->aux)
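/* BLOCK_INFO gives access to the per-basic-block data stored in the
   bb->aux field; move_or_delete_vzeroupper allocates that storage with
   alloc_aux_for_blocks and releases it with free_aux_for_blocks.  */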
89
90 enum call_avx256_state
91 {
92 /* Callee returns 256bit AVX register. */
93 callee_return_avx256 = -1,
94 /* Callee returns and passes 256bit AVX register. */
95 callee_return_pass_avx256,
96 /* Callee passes 256bit AVX register. */
97 callee_pass_avx256,
98   /* Callee doesn't return nor pass a 256bit AVX register, or no
99      256bit AVX register in function return.  */
100 call_no_avx256,
101 /* vzeroupper intrinsic. */
102 vzeroupper_intrinsic
103 };
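/* The vzeroupper UNSPEC_VOLATILE pattern carries one of these values as
   its operand; move_or_delete_vzeroupper_2 below reads it back with
   INTVAL (XVECEXP (pat, 0, 0)) to decide whether the insn is redundant
   or must be removed.  */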
104
105 /* Check if a 256bit AVX register is referenced in stores.  Used as a
       note_stores callback; DATA points to the upper_128bits_state that is
       set to "used" when such a reference is found.  */
106
107 static void
108 check_avx256_stores (rtx dest, const_rtx set, void *data)
109 {
110 if ((REG_P (dest)
111 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
112 || (GET_CODE (set) == SET
113 && REG_P (SET_SRC (set))
114 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
115 {
116 enum upper_128bits_state *state
117 = (enum upper_128bits_state *) data;
118 *state = used;
119 }
120 }
121
122 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
123 in basic block BB. Delete it if upper 128bit AVX registers are
124    unused.  If it isn't deleted, move it to just before a jump or call insn.
125
126 STATE is state of the upper 128bits of AVX registers at entry. */
127
128 static void
129 move_or_delete_vzeroupper_2 (basic_block bb,
130 enum upper_128bits_state state)
131 {
132 rtx insn, bb_end;
133 rtx vzeroupper_insn = NULL_RTX;
134 rtx pat;
135 int avx256;
136 bool unchanged;
137
138 if (BLOCK_INFO (bb)->unchanged)
139 {
140 if (dump_file)
141 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
142 bb->index, state);
143
144 BLOCK_INFO (bb)->state = state;
145 return;
146 }
147
148 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
149 {
150 if (dump_file)
151 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
152 bb->index, BLOCK_INFO (bb)->state);
153 return;
154 }
155
156 BLOCK_INFO (bb)->prev = state;
157
158 if (dump_file)
159 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
160 bb->index, state);
161
162 unchanged = true;
163
164 /* BB_END changes when it is deleted. */
165 bb_end = BB_END (bb);
166 insn = BB_HEAD (bb);
167 while (insn != bb_end)
168 {
169 insn = NEXT_INSN (insn);
170
171 if (!NONDEBUG_INSN_P (insn))
172 continue;
173
174 /* Move vzeroupper before jump/call. */
175 if (JUMP_P (insn) || CALL_P (insn))
176 {
177 if (!vzeroupper_insn)
178 continue;
179
180 if (PREV_INSN (insn) != vzeroupper_insn)
181 {
182 if (dump_file)
183 {
184 fprintf (dump_file, "Move vzeroupper after:\n");
185 print_rtl_single (dump_file, PREV_INSN (insn));
186 fprintf (dump_file, "before:\n");
187 print_rtl_single (dump_file, insn);
188 }
189 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
190 PREV_INSN (insn));
191 }
192 vzeroupper_insn = NULL_RTX;
193 continue;
194 }
195
196 pat = PATTERN (insn);
197
198 /* Check insn for vzeroupper intrinsic. */
199 if (GET_CODE (pat) == UNSPEC_VOLATILE
200 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
201 {
202 if (dump_file)
203 {
204 /* Found vzeroupper intrinsic. */
205 fprintf (dump_file, "Found vzeroupper:\n");
206 print_rtl_single (dump_file, insn);
207 }
208 }
209 else
210 {
211 /* Check insn for vzeroall intrinsic. */
212 if (GET_CODE (pat) == PARALLEL
213 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
214 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
215 {
216 state = unused;
217 unchanged = false;
218
219 /* Delete pending vzeroupper insertion. */
220 if (vzeroupper_insn)
221 {
222 delete_insn (vzeroupper_insn);
223 vzeroupper_insn = NULL_RTX;
224 }
225 }
226 else if (state != used)
227 {
228 note_stores (pat, check_avx256_stores, &state);
229 if (state == used)
230 unchanged = false;
231 }
232 continue;
233 }
234
235 /* Process vzeroupper intrinsic. */
236 avx256 = INTVAL (XVECEXP (pat, 0, 0));
237
238 if (state == unused)
239 {
240 /* Since the upper 128bits are cleared, callee must not pass
241 256bit AVX register. We only need to check if callee
242 returns 256bit AVX register. */
243 if (avx256 == callee_return_avx256)
244 {
245 state = used;
246 unchanged = false;
247 }
248
249 /* Remove unnecessary vzeroupper since upper 128bits are
250 cleared. */
251 if (dump_file)
252 {
253 fprintf (dump_file, "Delete redundant vzeroupper:\n");
254 print_rtl_single (dump_file, insn);
255 }
256 delete_insn (insn);
257 }
258 else
259 {
260 /* Set state to UNUSED if callee doesn't return 256bit AVX
261 register. */
262 if (avx256 != callee_return_pass_avx256)
263 state = unused;
264
265 if (avx256 == callee_return_pass_avx256
266 || avx256 == callee_pass_avx256)
267 {
268 /* Must remove vzeroupper since callee passes in 256bit
269 AVX register. */
270 if (dump_file)
271 {
272 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
273 print_rtl_single (dump_file, insn);
274 }
275 delete_insn (insn);
276 }
277 else
278 {
279 vzeroupper_insn = insn;
280 unchanged = false;
281 }
282 }
283 }
284
285 BLOCK_INFO (bb)->state = state;
286 BLOCK_INFO (bb)->unchanged = unchanged;
287 BLOCK_INFO (bb)->scanned = true;
288
289 if (dump_file)
290 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
291 bb->index, unchanged ? "unchanged" : "changed",
292 state);
293 }
294
295 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
296 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
297    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
298 state is changed. */
299
300 static bool
301 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
302 {
303 edge e;
304 edge_iterator ei;
305 enum upper_128bits_state state, old_state, new_state;
306 bool seen_unknown;
307
308 if (dump_file)
309 fprintf (dump_file, " Process [bb %i]: status: %d\n",
310 block->index, BLOCK_INFO (block)->processed);
311
312 if (BLOCK_INFO (block)->processed)
313 return false;
314
315 state = unused;
316
317 /* Check all predecessor edges of this block. */
318 seen_unknown = false;
319 FOR_EACH_EDGE (e, ei, block->preds)
320 {
321 if (e->src == block)
322 continue;
323 switch (BLOCK_INFO (e->src)->state)
324 {
325 case unknown:
326 if (!unknown_is_unused)
327 seen_unknown = true;
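          /* FALLTHRU */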
328 case unused:
329 break;
330 case used:
331 state = used;
332 goto done;
333 }
334 }
335
336 if (seen_unknown)
337 state = unknown;
338
339 done:
340 old_state = BLOCK_INFO (block)->state;
341 move_or_delete_vzeroupper_2 (block, state);
342 new_state = BLOCK_INFO (block)->state;
343
344 if (state != unknown || new_state == used)
345 BLOCK_INFO (block)->processed = true;
346
347 /* Need to rescan if the upper 128bits of AVX registers are changed
348 to USED at exit. */
349 if (new_state != old_state)
350 {
351 if (new_state == used)
352 cfun->machine->rescan_vzeroupper_p = 1;
353 return true;
354 }
355 else
356 return false;
357 }
358
359 /* Go through the instruction stream looking for vzeroupper. Delete
360 it if upper 128bit AVX registers are unused. If it isn't deleted,
361    move it to just before a jump or call insn.  */
362
363 static void
364 move_or_delete_vzeroupper (void)
365 {
366 edge e;
367 edge_iterator ei;
368 basic_block bb;
369 fibheap_t worklist, pending, fibheap_swap;
370 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
371 int *bb_order;
372 int *rc_order;
373 int i;
374
375 /* Set up block info for each basic block. */
376 alloc_aux_for_blocks (sizeof (struct block_info_def));
377
378 /* Process outgoing edges of entry point. */
379 if (dump_file)
380 fprintf (dump_file, "Process outgoing edges of entry point\n");
381
382 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
383 {
384 move_or_delete_vzeroupper_2 (e->dest,
385 cfun->machine->caller_pass_avx256_p
386 ? used : unused);
387 BLOCK_INFO (e->dest)->processed = true;
388 }
389
390 /* Compute reverse completion order of depth first search of the CFG
391 so that the data-flow runs faster. */
392 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
393 bb_order = XNEWVEC (int, last_basic_block);
394 pre_and_rev_post_order_compute (NULL, rc_order, false);
395 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
396 bb_order[rc_order[i]] = i;
397 free (rc_order);
398
399 worklist = fibheap_new ();
400 pending = fibheap_new ();
401 visited = sbitmap_alloc (last_basic_block);
402 in_worklist = sbitmap_alloc (last_basic_block);
403 in_pending = sbitmap_alloc (last_basic_block);
404 sbitmap_zero (in_worklist);
405
406 /* Don't check outgoing edges of entry point. */
407 sbitmap_ones (in_pending);
408 FOR_EACH_BB (bb)
409 if (BLOCK_INFO (bb)->processed)
410 RESET_BIT (in_pending, bb->index);
411 else
412 {
413 move_or_delete_vzeroupper_1 (bb, false);
414 fibheap_insert (pending, bb_order[bb->index], bb);
415 }
416
417 if (dump_file)
418 fprintf (dump_file, "Check remaining basic blocks\n");
419
420 while (!fibheap_empty (pending))
421 {
422 fibheap_swap = pending;
423 pending = worklist;
424 worklist = fibheap_swap;
425 sbitmap_swap = in_pending;
426 in_pending = in_worklist;
427 in_worklist = sbitmap_swap;
428
429 sbitmap_zero (visited);
430
431 cfun->machine->rescan_vzeroupper_p = 0;
432
433 while (!fibheap_empty (worklist))
434 {
435 bb = (basic_block) fibheap_extract_min (worklist);
436 RESET_BIT (in_worklist, bb->index);
437 gcc_assert (!TEST_BIT (visited, bb->index));
438 if (!TEST_BIT (visited, bb->index))
439 {
440 edge_iterator ei;
441
442 SET_BIT (visited, bb->index);
443
444 if (move_or_delete_vzeroupper_1 (bb, false))
445 FOR_EACH_EDGE (e, ei, bb->succs)
446 {
447 if (e->dest == EXIT_BLOCK_PTR
448 || BLOCK_INFO (e->dest)->processed)
449 continue;
450
451 if (TEST_BIT (visited, e->dest->index))
452 {
453 if (!TEST_BIT (in_pending, e->dest->index))
454 {
455 /* Send E->DEST to next round. */
456 SET_BIT (in_pending, e->dest->index);
457 fibheap_insert (pending,
458 bb_order[e->dest->index],
459 e->dest);
460 }
461 }
462 else if (!TEST_BIT (in_worklist, e->dest->index))
463 {
464 /* Add E->DEST to current round. */
465 SET_BIT (in_worklist, e->dest->index);
466 fibheap_insert (worklist, bb_order[e->dest->index],
467 e->dest);
468 }
469 }
470 }
471 }
472
473 if (!cfun->machine->rescan_vzeroupper_p)
474 break;
475 }
476
477 free (bb_order);
478 fibheap_delete (worklist);
479 fibheap_delete (pending);
480 sbitmap_free (visited);
481 sbitmap_free (in_worklist);
482 sbitmap_free (in_pending);
483
484 if (dump_file)
485 fprintf (dump_file, "Process remaining basic blocks\n");
486
487 FOR_EACH_BB (bb)
488 move_or_delete_vzeroupper_1 (bb, true);
489
490 free_aux_for_blocks ();
491 }
492
493 static rtx legitimize_dllimport_symbol (rtx, bool);
494
495 #ifndef CHECK_STACK_LIMIT
496 #define CHECK_STACK_LIMIT (-1)
497 #endif
498
499 /* Return index of given mode in mult and division cost tables. */
500 #define MODE_INDEX(mode) \
501 ((mode) == QImode ? 0 \
502 : (mode) == HImode ? 1 \
503 : (mode) == SImode ? 2 \
504 : (mode) == DImode ? 3 \
505 : 4)
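/* For example, MODE_INDEX (SImode) is 2 and selects the SImode entry of
   the multiply and divide cost arrays below; any mode wider than DImode
   falls into the "other" slot at index 4.  */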
506
507 /* Processor costs (relative to an add) */
508 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
509 #define COSTS_N_BYTES(N) ((N) * 2)
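/* With the assumption above, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   so a two-byte addition costs the same on the size scale as a single add
   does on the insn scale.  */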
510
511 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
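/* Placeholder stringop strategy.  Judging from the cost tables below, it
   fills the strategy-table slots (such as the 64-bit variants for CPUs
   that never run 64-bit code) that are not expected to be consulted.  */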
512
513 const
514 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
515 COSTS_N_BYTES (2), /* cost of an add instruction */
516 COSTS_N_BYTES (3), /* cost of a lea instruction */
517 COSTS_N_BYTES (2), /* variable shift costs */
518 COSTS_N_BYTES (3), /* constant shift costs */
519 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
520 COSTS_N_BYTES (3), /* HI */
521 COSTS_N_BYTES (3), /* SI */
522 COSTS_N_BYTES (3), /* DI */
523 COSTS_N_BYTES (5)}, /* other */
524 0, /* cost of multiply per each bit set */
525 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
526 COSTS_N_BYTES (3), /* HI */
527 COSTS_N_BYTES (3), /* SI */
528 COSTS_N_BYTES (3), /* DI */
529 COSTS_N_BYTES (5)}, /* other */
530 COSTS_N_BYTES (3), /* cost of movsx */
531 COSTS_N_BYTES (3), /* cost of movzx */
532 0, /* "large" insn */
533 2, /* MOVE_RATIO */
534 2, /* cost for loading QImode using movzbl */
535 {2, 2, 2}, /* cost of loading integer registers
536 in QImode, HImode and SImode.
537 Relative to reg-reg move (2). */
538 {2, 2, 2}, /* cost of storing integer registers */
539 2, /* cost of reg,reg fld/fst */
540 {2, 2, 2}, /* cost of loading fp registers
541 in SFmode, DFmode and XFmode */
542 {2, 2, 2}, /* cost of storing fp registers
543 in SFmode, DFmode and XFmode */
544 3, /* cost of moving MMX register */
545 {3, 3}, /* cost of loading MMX registers
546 in SImode and DImode */
547 {3, 3}, /* cost of storing MMX registers
548 in SImode and DImode */
549 3, /* cost of moving SSE register */
550 {3, 3, 3}, /* cost of loading SSE registers
551 in SImode, DImode and TImode */
552 {3, 3, 3}, /* cost of storing SSE registers
553 in SImode, DImode and TImode */
554 3, /* MMX or SSE register to integer */
555 0, /* size of l1 cache */
556 0, /* size of l2 cache */
557 0, /* size of prefetch block */
558 0, /* number of parallel prefetches */
559 2, /* Branch cost */
560 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
561 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
562 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
563 COSTS_N_BYTES (2), /* cost of FABS instruction. */
564 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
565 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
569 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
570 1, /* scalar_stmt_cost. */
571 1, /* scalar load_cost. */
572 1, /* scalar_store_cost. */
573 1, /* vec_stmt_cost. */
574 1, /* vec_to_scalar_cost. */
575 1, /* scalar_to_vec_cost. */
576 1, /* vec_align_load_cost. */
577 1, /* vec_unalign_load_cost. */
578 1, /* vec_store_cost. */
579 1, /* cond_taken_branch_cost. */
580 1, /* cond_not_taken_branch_cost. */
581 };
582
583 /* Processor costs (relative to an add) */
584 static const
585 struct processor_costs i386_cost = { /* 386 specific costs */
586 COSTS_N_INSNS (1), /* cost of an add instruction */
587 COSTS_N_INSNS (1), /* cost of a lea instruction */
588 COSTS_N_INSNS (3), /* variable shift costs */
589 COSTS_N_INSNS (2), /* constant shift costs */
590 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
591 COSTS_N_INSNS (6), /* HI */
592 COSTS_N_INSNS (6), /* SI */
593 COSTS_N_INSNS (6), /* DI */
594 COSTS_N_INSNS (6)}, /* other */
595 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
596 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
597 COSTS_N_INSNS (23), /* HI */
598 COSTS_N_INSNS (23), /* SI */
599 COSTS_N_INSNS (23), /* DI */
600 COSTS_N_INSNS (23)}, /* other */
601 COSTS_N_INSNS (3), /* cost of movsx */
602 COSTS_N_INSNS (2), /* cost of movzx */
603 15, /* "large" insn */
604 3, /* MOVE_RATIO */
605 4, /* cost for loading QImode using movzbl */
606 {2, 4, 2}, /* cost of loading integer registers
607 in QImode, HImode and SImode.
608 Relative to reg-reg move (2). */
609 {2, 4, 2}, /* cost of storing integer registers */
610 2, /* cost of reg,reg fld/fst */
611 {8, 8, 8}, /* cost of loading fp registers
612 in SFmode, DFmode and XFmode */
613 {8, 8, 8}, /* cost of storing fp registers
614 in SFmode, DFmode and XFmode */
615 2, /* cost of moving MMX register */
616 {4, 8}, /* cost of loading MMX registers
617 in SImode and DImode */
618 {4, 8}, /* cost of storing MMX registers
619 in SImode and DImode */
620 2, /* cost of moving SSE register */
621 {4, 8, 16}, /* cost of loading SSE registers
622 in SImode, DImode and TImode */
623 {4, 8, 16}, /* cost of storing SSE registers
624 in SImode, DImode and TImode */
625 3, /* MMX or SSE register to integer */
626 0, /* size of l1 cache */
627 0, /* size of l2 cache */
628 0, /* size of prefetch block */
629 0, /* number of parallel prefetches */
630 1, /* Branch cost */
631 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
632 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
633 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
634 COSTS_N_INSNS (22), /* cost of FABS instruction. */
635 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
636 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
640 DUMMY_STRINGOP_ALGS},
641 1, /* scalar_stmt_cost. */
642 1, /* scalar load_cost. */
643 1, /* scalar_store_cost. */
644 1, /* vec_stmt_cost. */
645 1, /* vec_to_scalar_cost. */
646 1, /* scalar_to_vec_cost. */
647 1, /* vec_align_load_cost. */
648 2, /* vec_unalign_load_cost. */
649 1, /* vec_store_cost. */
650 3, /* cond_taken_branch_cost. */
651 1, /* cond_not_taken_branch_cost. */
652 };
653
654 static const
655 struct processor_costs i486_cost = { /* 486 specific costs */
656 COSTS_N_INSNS (1), /* cost of an add instruction */
657 COSTS_N_INSNS (1), /* cost of a lea instruction */
658 COSTS_N_INSNS (3), /* variable shift costs */
659 COSTS_N_INSNS (2), /* constant shift costs */
660 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
661 COSTS_N_INSNS (12), /* HI */
662 COSTS_N_INSNS (12), /* SI */
663 COSTS_N_INSNS (12), /* DI */
664 COSTS_N_INSNS (12)}, /* other */
665 1, /* cost of multiply per each bit set */
666 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
667 COSTS_N_INSNS (40), /* HI */
668 COSTS_N_INSNS (40), /* SI */
669 COSTS_N_INSNS (40), /* DI */
670 COSTS_N_INSNS (40)}, /* other */
671 COSTS_N_INSNS (3), /* cost of movsx */
672 COSTS_N_INSNS (2), /* cost of movzx */
673 15, /* "large" insn */
674 3, /* MOVE_RATIO */
675 4, /* cost for loading QImode using movzbl */
676 {2, 4, 2}, /* cost of loading integer registers
677 in QImode, HImode and SImode.
678 Relative to reg-reg move (2). */
679 {2, 4, 2}, /* cost of storing integer registers */
680 2, /* cost of reg,reg fld/fst */
681 {8, 8, 8}, /* cost of loading fp registers
682 in SFmode, DFmode and XFmode */
683 {8, 8, 8}, /* cost of storing fp registers
684 in SFmode, DFmode and XFmode */
685 2, /* cost of moving MMX register */
686 {4, 8}, /* cost of loading MMX registers
687 in SImode and DImode */
688 {4, 8}, /* cost of storing MMX registers
689 in SImode and DImode */
690 2, /* cost of moving SSE register */
691 {4, 8, 16}, /* cost of loading SSE registers
692 in SImode, DImode and TImode */
693 {4, 8, 16}, /* cost of storing SSE registers
694 in SImode, DImode and TImode */
695 3, /* MMX or SSE register to integer */
696 4, /* size of l1 cache. 486 has 8kB cache
697 shared for code and data, so 4kB is
698 not really precise. */
699 4, /* size of l2 cache */
700 0, /* size of prefetch block */
701 0, /* number of parallel prefetches */
702 1, /* Branch cost */
703 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
704 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
705 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
706 COSTS_N_INSNS (3), /* cost of FABS instruction. */
707 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
708 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
712 DUMMY_STRINGOP_ALGS},
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
724 };
725
726 static const
727 struct processor_costs pentium_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (1), /* cost of a lea instruction */
730 COSTS_N_INSNS (4), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (11), /* HI */
734 COSTS_N_INSNS (11), /* SI */
735 COSTS_N_INSNS (11), /* DI */
736 COSTS_N_INSNS (11)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (25), /* HI */
740 COSTS_N_INSNS (25), /* SI */
741 COSTS_N_INSNS (25), /* DI */
742 COSTS_N_INSNS (25)}, /* other */
743 COSTS_N_INSNS (3), /* cost of movsx */
744 COSTS_N_INSNS (2), /* cost of movzx */
745 8, /* "large" insn */
746 6, /* MOVE_RATIO */
747 6, /* cost for loading QImode using movzbl */
748 {2, 4, 2}, /* cost of loading integer registers
749 in QImode, HImode and SImode.
750 Relative to reg-reg move (2). */
751 {2, 4, 2}, /* cost of storing integer registers */
752 2, /* cost of reg,reg fld/fst */
753 {2, 2, 6}, /* cost of loading fp registers
754 in SFmode, DFmode and XFmode */
755 {4, 4, 6}, /* cost of storing fp registers
756 in SFmode, DFmode and XFmode */
757 8, /* cost of moving MMX register */
758 {8, 8}, /* cost of loading MMX registers
759 in SImode and DImode */
760 {8, 8}, /* cost of storing MMX registers
761 in SImode and DImode */
762 2, /* cost of moving SSE register */
763 {4, 8, 16}, /* cost of loading SSE registers
764 in SImode, DImode and TImode */
765 {4, 8, 16}, /* cost of storing SSE registers
766 in SImode, DImode and TImode */
767 3, /* MMX or SSE register to integer */
768 8, /* size of l1 cache. */
769 8, /* size of l2 cache */
770 0, /* size of prefetch block */
771 0, /* number of parallel prefetches */
772 2, /* Branch cost */
773 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
774 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
775 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
776 COSTS_N_INSNS (1), /* cost of FABS instruction. */
777 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
778 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
779 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
780 DUMMY_STRINGOP_ALGS},
781 {{libcall, {{-1, rep_prefix_4_byte}}},
782 DUMMY_STRINGOP_ALGS},
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
793 1, /* cond_not_taken_branch_cost. */
794 };
795
796 static const
797 struct processor_costs pentiumpro_cost = {
798 COSTS_N_INSNS (1), /* cost of an add instruction */
799 COSTS_N_INSNS (1), /* cost of a lea instruction */
800 COSTS_N_INSNS (1), /* variable shift costs */
801 COSTS_N_INSNS (1), /* constant shift costs */
802 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
803 COSTS_N_INSNS (4), /* HI */
804 COSTS_N_INSNS (4), /* SI */
805 COSTS_N_INSNS (4), /* DI */
806 COSTS_N_INSNS (4)}, /* other */
807 0, /* cost of multiply per each bit set */
808 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
809 COSTS_N_INSNS (17), /* HI */
810 COSTS_N_INSNS (17), /* SI */
811 COSTS_N_INSNS (17), /* DI */
812 COSTS_N_INSNS (17)}, /* other */
813 COSTS_N_INSNS (1), /* cost of movsx */
814 COSTS_N_INSNS (1), /* cost of movzx */
815 8, /* "large" insn */
816 6, /* MOVE_RATIO */
817 2, /* cost for loading QImode using movzbl */
818 {4, 4, 4}, /* cost of loading integer registers
819 in QImode, HImode and SImode.
820 Relative to reg-reg move (2). */
821 {2, 2, 2}, /* cost of storing integer registers */
822 2, /* cost of reg,reg fld/fst */
823 {2, 2, 6}, /* cost of loading fp registers
824 in SFmode, DFmode and XFmode */
825 {4, 4, 6}, /* cost of storing fp registers
826 in SFmode, DFmode and XFmode */
827 2, /* cost of moving MMX register */
828 {2, 2}, /* cost of loading MMX registers
829 in SImode and DImode */
830 {2, 2}, /* cost of storing MMX registers
831 in SImode and DImode */
832 2, /* cost of moving SSE register */
833 {2, 2, 8}, /* cost of loading SSE registers
834 in SImode, DImode and TImode */
835 {2, 2, 8}, /* cost of storing SSE registers
836 in SImode, DImode and TImode */
837 3, /* MMX or SSE register to integer */
838 8, /* size of l1 cache. */
839 256, /* size of l2 cache */
840 32, /* size of prefetch block */
841 6, /* number of parallel prefetches */
842 2, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (2), /* cost of FABS instruction. */
847 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
849   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
850      (we ensure the alignment).  For small blocks an inline loop is still a
851      noticeable win; for bigger blocks either rep movsl or rep movsb is the
852      way to go.  Rep movsb apparently has a more expensive startup time in the
853      CPU, but after 4K the difference is down in the noise.  */
854 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
855 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
856 DUMMY_STRINGOP_ALGS},
857 {{rep_prefix_4_byte, {{1024, unrolled_loop},
858 {8192, rep_prefix_4_byte}, {-1, libcall}}},
859 DUMMY_STRINGOP_ALGS},
860 1, /* scalar_stmt_cost. */
861 1, /* scalar load_cost. */
862 1, /* scalar_store_cost. */
863 1, /* vec_stmt_cost. */
864 1, /* vec_to_scalar_cost. */
865 1, /* scalar_to_vec_cost. */
866 1, /* vec_align_load_cost. */
867 2, /* vec_unalign_load_cost. */
868 1, /* vec_store_cost. */
869 3, /* cond_taken_branch_cost. */
870 1, /* cond_not_taken_branch_cost. */
871 };
872
873 static const
874 struct processor_costs geode_cost = {
875 COSTS_N_INSNS (1), /* cost of an add instruction */
876 COSTS_N_INSNS (1), /* cost of a lea instruction */
877 COSTS_N_INSNS (2), /* variable shift costs */
878 COSTS_N_INSNS (1), /* constant shift costs */
879 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
880 COSTS_N_INSNS (4), /* HI */
881 COSTS_N_INSNS (7), /* SI */
882 COSTS_N_INSNS (7), /* DI */
883 COSTS_N_INSNS (7)}, /* other */
884 0, /* cost of multiply per each bit set */
885 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
886 COSTS_N_INSNS (23), /* HI */
887 COSTS_N_INSNS (39), /* SI */
888 COSTS_N_INSNS (39), /* DI */
889 COSTS_N_INSNS (39)}, /* other */
890 COSTS_N_INSNS (1), /* cost of movsx */
891 COSTS_N_INSNS (1), /* cost of movzx */
892 8, /* "large" insn */
893 4, /* MOVE_RATIO */
894 1, /* cost for loading QImode using movzbl */
895 {1, 1, 1}, /* cost of loading integer registers
896 in QImode, HImode and SImode.
897 Relative to reg-reg move (2). */
898 {1, 1, 1}, /* cost of storing integer registers */
899 1, /* cost of reg,reg fld/fst */
900 {1, 1, 1}, /* cost of loading fp registers
901 in SFmode, DFmode and XFmode */
902 {4, 6, 6}, /* cost of storing fp registers
903 in SFmode, DFmode and XFmode */
904
905 1, /* cost of moving MMX register */
906 {1, 1}, /* cost of loading MMX registers
907 in SImode and DImode */
908 {1, 1}, /* cost of storing MMX registers
909 in SImode and DImode */
910 1, /* cost of moving SSE register */
911 {1, 1, 1}, /* cost of loading SSE registers
912 in SImode, DImode and TImode */
913 {1, 1, 1}, /* cost of storing SSE registers
914 in SImode, DImode and TImode */
915 1, /* MMX or SSE register to integer */
916 64, /* size of l1 cache. */
917 128, /* size of l2 cache. */
918 32, /* size of prefetch block */
919 1, /* number of parallel prefetches */
920 1, /* Branch cost */
921 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
922 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
923 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
924 COSTS_N_INSNS (1), /* cost of FABS instruction. */
925 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
926 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
930 DUMMY_STRINGOP_ALGS},
931 1, /* scalar_stmt_cost. */
932 1, /* scalar load_cost. */
933 1, /* scalar_store_cost. */
934 1, /* vec_stmt_cost. */
935 1, /* vec_to_scalar_cost. */
936 1, /* scalar_to_vec_cost. */
937 1, /* vec_align_load_cost. */
938 2, /* vec_unalign_load_cost. */
939 1, /* vec_store_cost. */
940 3, /* cond_taken_branch_cost. */
941 1, /* cond_not_taken_branch_cost. */
942 };
943
944 static const
945 struct processor_costs k6_cost = {
946 COSTS_N_INSNS (1), /* cost of an add instruction */
947 COSTS_N_INSNS (2), /* cost of a lea instruction */
948 COSTS_N_INSNS (1), /* variable shift costs */
949 COSTS_N_INSNS (1), /* constant shift costs */
950 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
951 COSTS_N_INSNS (3), /* HI */
952 COSTS_N_INSNS (3), /* SI */
953 COSTS_N_INSNS (3), /* DI */
954 COSTS_N_INSNS (3)}, /* other */
955 0, /* cost of multiply per each bit set */
956 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
957 COSTS_N_INSNS (18), /* HI */
958 COSTS_N_INSNS (18), /* SI */
959 COSTS_N_INSNS (18), /* DI */
960 COSTS_N_INSNS (18)}, /* other */
961 COSTS_N_INSNS (2), /* cost of movsx */
962 COSTS_N_INSNS (2), /* cost of movzx */
963 8, /* "large" insn */
964 4, /* MOVE_RATIO */
965 3, /* cost for loading QImode using movzbl */
966 {4, 5, 4}, /* cost of loading integer registers
967 in QImode, HImode and SImode.
968 Relative to reg-reg move (2). */
969 {2, 3, 2}, /* cost of storing integer registers */
970 4, /* cost of reg,reg fld/fst */
971 {6, 6, 6}, /* cost of loading fp registers
972 in SFmode, DFmode and XFmode */
973 {4, 4, 4}, /* cost of storing fp registers
974 in SFmode, DFmode and XFmode */
975 2, /* cost of moving MMX register */
976 {2, 2}, /* cost of loading MMX registers
977 in SImode and DImode */
978 {2, 2}, /* cost of storing MMX registers
979 in SImode and DImode */
980 2, /* cost of moving SSE register */
981 {2, 2, 8}, /* cost of loading SSE registers
982 in SImode, DImode and TImode */
983 {2, 2, 8}, /* cost of storing SSE registers
984 in SImode, DImode and TImode */
985 6, /* MMX or SSE register to integer */
986 32, /* size of l1 cache. */
987 32, /* size of l2 cache. Some models
988 have integrated l2 cache, but
989 optimizing for k6 is not important
990 enough to worry about that. */
991 32, /* size of prefetch block */
992 1, /* number of parallel prefetches */
993 1, /* Branch cost */
994 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
995 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
996 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
997 COSTS_N_INSNS (2), /* cost of FABS instruction. */
998 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
999 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1003 DUMMY_STRINGOP_ALGS},
1004 1, /* scalar_stmt_cost. */
1005 1, /* scalar load_cost. */
1006 1, /* scalar_store_cost. */
1007 1, /* vec_stmt_cost. */
1008 1, /* vec_to_scalar_cost. */
1009 1, /* scalar_to_vec_cost. */
1010 1, /* vec_align_load_cost. */
1011 2, /* vec_unalign_load_cost. */
1012 1, /* vec_store_cost. */
1013 3, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1016
1017 static const
1018 struct processor_costs athlon_cost = {
1019 COSTS_N_INSNS (1), /* cost of an add instruction */
1020 COSTS_N_INSNS (2), /* cost of a lea instruction */
1021 COSTS_N_INSNS (1), /* variable shift costs */
1022 COSTS_N_INSNS (1), /* constant shift costs */
1023 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1024 COSTS_N_INSNS (5), /* HI */
1025 COSTS_N_INSNS (5), /* SI */
1026 COSTS_N_INSNS (5), /* DI */
1027 COSTS_N_INSNS (5)}, /* other */
1028 0, /* cost of multiply per each bit set */
1029 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1030 COSTS_N_INSNS (26), /* HI */
1031 COSTS_N_INSNS (42), /* SI */
1032 COSTS_N_INSNS (74), /* DI */
1033 COSTS_N_INSNS (74)}, /* other */
1034 COSTS_N_INSNS (1), /* cost of movsx */
1035 COSTS_N_INSNS (1), /* cost of movzx */
1036 8, /* "large" insn */
1037 9, /* MOVE_RATIO */
1038 4, /* cost for loading QImode using movzbl */
1039 {3, 4, 3}, /* cost of loading integer registers
1040 in QImode, HImode and SImode.
1041 Relative to reg-reg move (2). */
1042 {3, 4, 3}, /* cost of storing integer registers */
1043 4, /* cost of reg,reg fld/fst */
1044 {4, 4, 12}, /* cost of loading fp registers
1045 in SFmode, DFmode and XFmode */
1046 {6, 6, 8}, /* cost of storing fp registers
1047 in SFmode, DFmode and XFmode */
1048 2, /* cost of moving MMX register */
1049 {4, 4}, /* cost of loading MMX registers
1050 in SImode and DImode */
1051 {4, 4}, /* cost of storing MMX registers
1052 in SImode and DImode */
1053 2, /* cost of moving SSE register */
1054 {4, 4, 6}, /* cost of loading SSE registers
1055 in SImode, DImode and TImode */
1056 {4, 4, 5}, /* cost of storing SSE registers
1057 in SImode, DImode and TImode */
1058 5, /* MMX or SSE register to integer */
1059 64, /* size of l1 cache. */
1060 256, /* size of l2 cache. */
1061 64, /* size of prefetch block */
1062 6, /* number of parallel prefetches */
1063 5, /* Branch cost */
1064 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1065 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1066 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1067 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1068 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1069 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1070   /* For some reason, Athlon deals better with the REP prefix (relative to
1071      loops) than K8 does.  Alignment becomes important after 8 bytes for
1072      memcpy and 128 bytes for memset.  */
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1076 DUMMY_STRINGOP_ALGS},
1077 1, /* scalar_stmt_cost. */
1078 1, /* scalar load_cost. */
1079 1, /* scalar_store_cost. */
1080 1, /* vec_stmt_cost. */
1081 1, /* vec_to_scalar_cost. */
1082 1, /* scalar_to_vec_cost. */
1083 1, /* vec_align_load_cost. */
1084 2, /* vec_unalign_load_cost. */
1085 1, /* vec_store_cost. */
1086 3, /* cond_taken_branch_cost. */
1087 1, /* cond_not_taken_branch_cost. */
1088 };
1089
1090 static const
1091 struct processor_costs k8_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (2), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (3), /* SI */
1099 COSTS_N_INSNS (4), /* DI */
1100 COSTS_N_INSNS (5)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (26), /* HI */
1104 COSTS_N_INSNS (42), /* SI */
1105 COSTS_N_INSNS (74), /* DI */
1106 COSTS_N_INSNS (74)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {3, 4, 3}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {3, 4, 3}, /* cost of storing integer registers */
1116 4, /* cost of reg,reg fld/fst */
1117 {4, 4, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {6, 6, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {3, 3}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 3, 6}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 5}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 5, /* MMX or SSE register to integer */
1132 64, /* size of l1 cache. */
1133 512, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135   /* New AMD processors never drop prefetches; if they cannot be performed
1136      immediately, they are queued.  We set the number of simultaneous prefetches
1137      to a large constant to reflect this (it is probably not a good idea to leave
1138      the number of prefetches completely unlimited, as their execution also
1139      takes some time).  */
1140 100, /* number of parallel prefetches */
1141 3, /* Branch cost */
1142 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1148   /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1149      small blocks it is better to use a loop.  For large blocks, a libcall can
1150      do nontemporal accesses and beat inline code considerably.  */
1151 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1152 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 {{libcall, {{8, loop}, {24, unrolled_loop},
1154 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1155 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1156 4, /* scalar_stmt_cost. */
1157 2, /* scalar load_cost. */
1158 2, /* scalar_store_cost. */
1159 5, /* vec_stmt_cost. */
1160 0, /* vec_to_scalar_cost. */
1161 2, /* scalar_to_vec_cost. */
1162 2, /* vec_align_load_cost. */
1163 3, /* vec_unalign_load_cost. */
1164 3, /* vec_store_cost. */
1165 3, /* cond_taken_branch_cost. */
1166 2, /* cond_not_taken_branch_cost. */
1167 };
1168
1169 struct processor_costs amdfam10_cost = {
1170 COSTS_N_INSNS (1), /* cost of an add instruction */
1171 COSTS_N_INSNS (2), /* cost of a lea instruction */
1172 COSTS_N_INSNS (1), /* variable shift costs */
1173 COSTS_N_INSNS (1), /* constant shift costs */
1174 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1175 COSTS_N_INSNS (4), /* HI */
1176 COSTS_N_INSNS (3), /* SI */
1177 COSTS_N_INSNS (4), /* DI */
1178 COSTS_N_INSNS (5)}, /* other */
1179 0, /* cost of multiply per each bit set */
1180 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1181 COSTS_N_INSNS (35), /* HI */
1182 COSTS_N_INSNS (51), /* SI */
1183 COSTS_N_INSNS (83), /* DI */
1184 COSTS_N_INSNS (83)}, /* other */
1185 COSTS_N_INSNS (1), /* cost of movsx */
1186 COSTS_N_INSNS (1), /* cost of movzx */
1187 8, /* "large" insn */
1188 9, /* MOVE_RATIO */
1189 4, /* cost for loading QImode using movzbl */
1190 {3, 4, 3}, /* cost of loading integer registers
1191 in QImode, HImode and SImode.
1192 Relative to reg-reg move (2). */
1193 {3, 4, 3}, /* cost of storing integer registers */
1194 4, /* cost of reg,reg fld/fst */
1195 {4, 4, 12}, /* cost of loading fp registers
1196 in SFmode, DFmode and XFmode */
1197 {6, 6, 8}, /* cost of storing fp registers
1198 in SFmode, DFmode and XFmode */
1199 2, /* cost of moving MMX register */
1200 {3, 3}, /* cost of loading MMX registers
1201 in SImode and DImode */
1202 {4, 4}, /* cost of storing MMX registers
1203 in SImode and DImode */
1204 2, /* cost of moving SSE register */
1205 {4, 4, 3}, /* cost of loading SSE registers
1206 in SImode, DImode and TImode */
1207 {4, 4, 5}, /* cost of storing SSE registers
1208 in SImode, DImode and TImode */
1209 3, /* MMX or SSE register to integer */
1210 /* On K8:
1211 MOVD reg64, xmmreg Double FSTORE 4
1212 MOVD reg32, xmmreg Double FSTORE 4
1213 On AMDFAM10:
1214 MOVD reg64, xmmreg Double FADD 3
1215 1/1 1/1
1216 MOVD reg32, xmmreg Double FADD 3
1217 1/1 1/1 */
1218 64, /* size of l1 cache. */
1219 512, /* size of l2 cache. */
1220 64, /* size of prefetch block */
1221   /* New AMD processors never drop prefetches; if they cannot be performed
1222      immediately, they are queued.  We set the number of simultaneous prefetches
1223      to a large constant to reflect this (it is probably not a good idea to leave
1224      the number of prefetches completely unlimited, as their execution also
1225      takes some time).  */
1226 100, /* number of parallel prefetches */
1227 2, /* Branch cost */
1228 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1229 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1230 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1231 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1232 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1233 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1234
1235   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
1236      for very small blocks it is better to use a loop.  For large blocks, a
1237      libcall can do nontemporal accesses and beat inline code considerably.  */
1238 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1239 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1240 {{libcall, {{8, loop}, {24, unrolled_loop},
1241 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1242 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1243 4, /* scalar_stmt_cost. */
1244 2, /* scalar load_cost. */
1245 2, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 2, /* vec_align_load_cost. */
1250 2, /* vec_unalign_load_cost. */
1251 2, /* vec_store_cost. */
1252 2, /* cond_taken_branch_cost. */
1253 1, /* cond_not_taken_branch_cost. */
1254 };
1255
1256 struct processor_costs bdver1_cost = {
1257 COSTS_N_INSNS (1), /* cost of an add instruction */
1258 COSTS_N_INSNS (1), /* cost of a lea instruction */
1259 COSTS_N_INSNS (1), /* variable shift costs */
1260 COSTS_N_INSNS (1), /* constant shift costs */
1261 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1262 COSTS_N_INSNS (4), /* HI */
1263 COSTS_N_INSNS (4), /* SI */
1264 COSTS_N_INSNS (6), /* DI */
1265 COSTS_N_INSNS (6)}, /* other */
1266 0, /* cost of multiply per each bit set */
1267 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1268 COSTS_N_INSNS (35), /* HI */
1269 COSTS_N_INSNS (51), /* SI */
1270 COSTS_N_INSNS (83), /* DI */
1271 COSTS_N_INSNS (83)}, /* other */
1272 COSTS_N_INSNS (1), /* cost of movsx */
1273 COSTS_N_INSNS (1), /* cost of movzx */
1274 8, /* "large" insn */
1275 9, /* MOVE_RATIO */
1276 4, /* cost for loading QImode using movzbl */
1277 {5, 5, 4}, /* cost of loading integer registers
1278 in QImode, HImode and SImode.
1279 Relative to reg-reg move (2). */
1280 {4, 4, 4}, /* cost of storing integer registers */
1281 2, /* cost of reg,reg fld/fst */
1282 {5, 5, 12}, /* cost of loading fp registers
1283 in SFmode, DFmode and XFmode */
1284 {4, 4, 8}, /* cost of storing fp registers
1285 in SFmode, DFmode and XFmode */
1286 2, /* cost of moving MMX register */
1287 {4, 4}, /* cost of loading MMX registers
1288 in SImode and DImode */
1289 {4, 4}, /* cost of storing MMX registers
1290 in SImode and DImode */
1291 2, /* cost of moving SSE register */
1292 {4, 4, 4}, /* cost of loading SSE registers
1293 in SImode, DImode and TImode */
1294 {4, 4, 4}, /* cost of storing SSE registers
1295 in SImode, DImode and TImode */
1296 2, /* MMX or SSE register to integer */
1297 /* On K8:
1298 MOVD reg64, xmmreg Double FSTORE 4
1299 MOVD reg32, xmmreg Double FSTORE 4
1300 On AMDFAM10:
1301 MOVD reg64, xmmreg Double FADD 3
1302 1/1 1/1
1303 MOVD reg32, xmmreg Double FADD 3
1304 1/1 1/1 */
1305 16, /* size of l1 cache. */
1306 2048, /* size of l2 cache. */
1307 64, /* size of prefetch block */
1308   /* New AMD processors never drop prefetches; if they cannot be performed
1309      immediately, they are queued.  We set the number of simultaneous prefetches
1310      to a large constant to reflect this (it is probably not a good idea to leave
1311      the number of prefetches completely unlimited, as their execution also
1312      takes some time).  */
1313 100, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1321
1322   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1323      very small blocks it is better to use a loop.  For large blocks, a libcall
1324      can do nontemporal accesses and beat inline code considerably.  */
1325 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1326 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1327 {{libcall, {{8, loop}, {24, unrolled_loop},
1328 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1329 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1330 6, /* scalar_stmt_cost. */
1331 4, /* scalar load_cost. */
1332 4, /* scalar_store_cost. */
1333 6, /* vec_stmt_cost. */
1334 0, /* vec_to_scalar_cost. */
1335 2, /* scalar_to_vec_cost. */
1336 4, /* vec_align_load_cost. */
1337 4, /* vec_unalign_load_cost. */
1338 4, /* vec_store_cost. */
1339 2, /* cond_taken_branch_cost. */
1340 1, /* cond_not_taken_branch_cost. */
1341 };
1342
1343 struct processor_costs bdver2_cost = {
1344 COSTS_N_INSNS (1), /* cost of an add instruction */
1345 COSTS_N_INSNS (1), /* cost of a lea instruction */
1346 COSTS_N_INSNS (1), /* variable shift costs */
1347 COSTS_N_INSNS (1), /* constant shift costs */
1348 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1349 COSTS_N_INSNS (4), /* HI */
1350 COSTS_N_INSNS (4), /* SI */
1351 COSTS_N_INSNS (6), /* DI */
1352 COSTS_N_INSNS (6)}, /* other */
1353 0, /* cost of multiply per each bit set */
1354 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1355 COSTS_N_INSNS (35), /* HI */
1356 COSTS_N_INSNS (51), /* SI */
1357 COSTS_N_INSNS (83), /* DI */
1358 COSTS_N_INSNS (83)}, /* other */
1359 COSTS_N_INSNS (1), /* cost of movsx */
1360 COSTS_N_INSNS (1), /* cost of movzx */
1361 8, /* "large" insn */
1362 9, /* MOVE_RATIO */
1363 4, /* cost for loading QImode using movzbl */
1364 {5, 5, 4}, /* cost of loading integer registers
1365 in QImode, HImode and SImode.
1366 Relative to reg-reg move (2). */
1367 {4, 4, 4}, /* cost of storing integer registers */
1368 2, /* cost of reg,reg fld/fst */
1369 {5, 5, 12}, /* cost of loading fp registers
1370 in SFmode, DFmode and XFmode */
1371 {4, 4, 8}, /* cost of storing fp registers
1372 in SFmode, DFmode and XFmode */
1373 2, /* cost of moving MMX register */
1374 {4, 4}, /* cost of loading MMX registers
1375 in SImode and DImode */
1376 {4, 4}, /* cost of storing MMX registers
1377 in SImode and DImode */
1378 2, /* cost of moving SSE register */
1379 {4, 4, 4}, /* cost of loading SSE registers
1380 in SImode, DImode and TImode */
1381 {4, 4, 4}, /* cost of storing SSE registers
1382 in SImode, DImode and TImode */
1383 2, /* MMX or SSE register to integer */
1384 /* On K8:
1385 MOVD reg64, xmmreg Double FSTORE 4
1386 MOVD reg32, xmmreg Double FSTORE 4
1387 On AMDFAM10:
1388 MOVD reg64, xmmreg Double FADD 3
1389 1/1 1/1
1390 MOVD reg32, xmmreg Double FADD 3
1391 1/1 1/1 */
1392 16, /* size of l1 cache. */
1393 2048, /* size of l2 cache. */
1394 64, /* size of prefetch block */
1395   /* New AMD processors never drop prefetches; if they cannot be performed
1396      immediately, they are queued.  We set the number of simultaneous prefetches
1397      to a large constant to reflect this (it is probably not a good idea to leave
1398      the number of prefetches completely unlimited, as their execution also
1399      takes some time).  */
1400 100, /* number of parallel prefetches */
1401 2, /* Branch cost */
1402 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1403 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1404 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1405 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1406 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1407 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1408
1409   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1410      very small blocks it is better to use a loop.  For large blocks, a libcall
1411      can do nontemporal accesses and beat inline code considerably.  */
1412 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1413 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1414 {{libcall, {{8, loop}, {24, unrolled_loop},
1415 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1416 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1417 6, /* scalar_stmt_cost. */
1418 4, /* scalar load_cost. */
1419 4, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 4, /* vec_align_load_cost. */
1424 4, /* vec_unalign_load_cost. */
1425 4, /* vec_store_cost. */
1426 2, /* cond_taken_branch_cost. */
1427 1, /* cond_not_taken_branch_cost. */
1428 };
1429
1430 struct processor_costs btver1_cost = {
1431 COSTS_N_INSNS (1), /* cost of an add instruction */
1432 COSTS_N_INSNS (2), /* cost of a lea instruction */
1433 COSTS_N_INSNS (1), /* variable shift costs */
1434 COSTS_N_INSNS (1), /* constant shift costs */
1435 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1436 COSTS_N_INSNS (4), /* HI */
1437 COSTS_N_INSNS (3), /* SI */
1438 COSTS_N_INSNS (4), /* DI */
1439 COSTS_N_INSNS (5)}, /* other */
1440 0, /* cost of multiply per each bit set */
1441 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1442 COSTS_N_INSNS (35), /* HI */
1443 COSTS_N_INSNS (51), /* SI */
1444 COSTS_N_INSNS (83), /* DI */
1445 COSTS_N_INSNS (83)}, /* other */
1446 COSTS_N_INSNS (1), /* cost of movsx */
1447 COSTS_N_INSNS (1), /* cost of movzx */
1448 8, /* "large" insn */
1449 9, /* MOVE_RATIO */
1450 4, /* cost for loading QImode using movzbl */
1451 {3, 4, 3}, /* cost of loading integer registers
1452 in QImode, HImode and SImode.
1453 Relative to reg-reg move (2). */
1454 {3, 4, 3}, /* cost of storing integer registers */
1455 4, /* cost of reg,reg fld/fst */
1456 {4, 4, 12}, /* cost of loading fp registers
1457 in SFmode, DFmode and XFmode */
1458 {6, 6, 8}, /* cost of storing fp registers
1459 in SFmode, DFmode and XFmode */
1460 2, /* cost of moving MMX register */
1461 {3, 3}, /* cost of loading MMX registers
1462 in SImode and DImode */
1463 {4, 4}, /* cost of storing MMX registers
1464 in SImode and DImode */
1465 2, /* cost of moving SSE register */
1466 {4, 4, 3}, /* cost of loading SSE registers
1467 in SImode, DImode and TImode */
1468 {4, 4, 5}, /* cost of storing SSE registers
1469 in SImode, DImode and TImode */
1470 3, /* MMX or SSE register to integer */
1471 /* On K8:
1472 MOVD reg64, xmmreg Double FSTORE 4
1473 MOVD reg32, xmmreg Double FSTORE 4
1474 On AMDFAM10:
1475 MOVD reg64, xmmreg Double FADD 3
1476 1/1 1/1
1477 MOVD reg32, xmmreg Double FADD 3
1478 1/1 1/1 */
1479 32, /* size of l1 cache. */
1480 512, /* size of l2 cache. */
1481 64, /* size of prefetch block */
1482 100, /* number of parallel prefetches */
1483 2, /* Branch cost */
1484 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1485 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1486 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1489 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1490
1491   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1492      very small blocks it is better to use a loop.  For large blocks, a libcall
1493      can do nontemporal accesses and beat inline code considerably.  */
1494 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1495 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1496 {{libcall, {{8, loop}, {24, unrolled_loop},
1497 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1498 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1499 4, /* scalar_stmt_cost. */
1500 2, /* scalar load_cost. */
1501 2, /* scalar_store_cost. */
1502 6, /* vec_stmt_cost. */
1503 0, /* vec_to_scalar_cost. */
1504 2, /* scalar_to_vec_cost. */
1505 2, /* vec_align_load_cost. */
1506 2, /* vec_unalign_load_cost. */
1507 2, /* vec_store_cost. */
1508 2, /* cond_taken_branch_cost. */
1509 1, /* cond_not_taken_branch_cost. */
1510 };
1511
1512 struct processor_costs btver2_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (2), /* cost of a lea instruction */
1515 COSTS_N_INSNS (1), /* variable shift costs */
1516 COSTS_N_INSNS (1), /* constant shift costs */
1517 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (4), /* HI */
1519 COSTS_N_INSNS (3), /* SI */
1520 COSTS_N_INSNS (4), /* DI */
1521 COSTS_N_INSNS (5)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (35), /* HI */
1525 COSTS_N_INSNS (51), /* SI */
1526 COSTS_N_INSNS (83), /* DI */
1527 COSTS_N_INSNS (83)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 8, /* "large" insn */
1531 9, /* MOVE_RATIO */
1532 4, /* cost for loading QImode using movzbl */
1533 {3, 4, 3}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {3, 4, 3}, /* cost of storing integer registers */
1537 4, /* cost of reg,reg fld/fst */
1538 {4, 4, 12}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {6, 6, 8}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {3, 3}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {4, 4}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 2, /* cost of moving SSE register */
1548 {4, 4, 3}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {4, 4, 5}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 3, /* MMX or SSE register to integer */
1553 /* On K8:
1554 MOVD reg64, xmmreg Double FSTORE 4
1555 MOVD reg32, xmmreg Double FSTORE 4
1556 On AMDFAM10:
1557 MOVD reg64, xmmreg Double FADD 3
1558 1/1 1/1
1559 MOVD reg32, xmmreg Double FADD 3
1560 1/1 1/1 */
1561 32, /* size of l1 cache. */
1562 2048, /* size of l2 cache. */
1563 64, /* size of prefetch block */
1564 100, /* number of parallel prefetches */
1565 2, /* Branch cost */
1566 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1567 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1568 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1569 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1570 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1571 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1572
1573 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1574 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1575 {{libcall, {{8, loop}, {24, unrolled_loop},
1576 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1577 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1578 4, /* scalar_stmt_cost. */
1579 2, /* scalar load_cost. */
1580 2, /* scalar_store_cost. */
1581 6, /* vec_stmt_cost. */
1582 0, /* vec_to_scalar_cost. */
1583 2, /* scalar_to_vec_cost. */
1584 2, /* vec_align_load_cost. */
1585 2, /* vec_unalign_load_cost. */
1586 2, /* vec_store_cost. */
1587 2, /* cond_taken_branch_cost. */
1588 1, /* cond_not_taken_branch_cost. */
1589 };
1590
1591 static const
1592 struct processor_costs pentium4_cost = {
1593 COSTS_N_INSNS (1), /* cost of an add instruction */
1594 COSTS_N_INSNS (3), /* cost of a lea instruction */
1595 COSTS_N_INSNS (4), /* variable shift costs */
1596 COSTS_N_INSNS (4), /* constant shift costs */
1597 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1598 COSTS_N_INSNS (15), /* HI */
1599 COSTS_N_INSNS (15), /* SI */
1600 COSTS_N_INSNS (15), /* DI */
1601 COSTS_N_INSNS (15)}, /* other */
1602 0, /* cost of multiply per each bit set */
1603 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1604 COSTS_N_INSNS (56), /* HI */
1605 COSTS_N_INSNS (56), /* SI */
1606 COSTS_N_INSNS (56), /* DI */
1607 COSTS_N_INSNS (56)}, /* other */
1608 COSTS_N_INSNS (1), /* cost of movsx */
1609 COSTS_N_INSNS (1), /* cost of movzx */
1610 16, /* "large" insn */
1611 6, /* MOVE_RATIO */
1612 2, /* cost for loading QImode using movzbl */
1613 {4, 5, 4}, /* cost of loading integer registers
1614 in QImode, HImode and SImode.
1615 Relative to reg-reg move (2). */
1616 {2, 3, 2}, /* cost of storing integer registers */
1617 2, /* cost of reg,reg fld/fst */
1618 {2, 2, 6}, /* cost of loading fp registers
1619 in SFmode, DFmode and XFmode */
1620 {4, 4, 6}, /* cost of storing fp registers
1621 in SFmode, DFmode and XFmode */
1622 2, /* cost of moving MMX register */
1623 {2, 2}, /* cost of loading MMX registers
1624 in SImode and DImode */
1625 {2, 2}, /* cost of storing MMX registers
1626 in SImode and DImode */
1627 12, /* cost of moving SSE register */
1628 {12, 12, 12}, /* cost of loading SSE registers
1629 in SImode, DImode and TImode */
1630 {2, 2, 8}, /* cost of storing SSE registers
1631 in SImode, DImode and TImode */
1632 10, /* MMX or SSE register to integer */
1633 8, /* size of l1 cache. */
1634 256, /* size of l2 cache. */
1635 64, /* size of prefetch block */
1636 6, /* number of parallel prefetches */
1637 2, /* Branch cost */
1638 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1639 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1640 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1641 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1642 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1643 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1644 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1645 DUMMY_STRINGOP_ALGS},
1646 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1647 {-1, libcall}}},
1648 DUMMY_STRINGOP_ALGS},
1649 1, /* scalar_stmt_cost. */
1650 1, /* scalar load_cost. */
1651 1, /* scalar_store_cost. */
1652 1, /* vec_stmt_cost. */
1653 1, /* vec_to_scalar_cost. */
1654 1, /* scalar_to_vec_cost. */
1655 1, /* vec_align_load_cost. */
1656 2, /* vec_unalign_load_cost. */
1657 1, /* vec_store_cost. */
1658 3, /* cond_taken_branch_cost. */
1659 1, /* cond_not_taken_branch_cost. */
1660 };
1661
1662 static const
1663 struct processor_costs nocona_cost = {
1664 COSTS_N_INSNS (1), /* cost of an add instruction */
1665 COSTS_N_INSNS (1), /* cost of a lea instruction */
1666 COSTS_N_INSNS (1), /* variable shift costs */
1667 COSTS_N_INSNS (1), /* constant shift costs */
1668 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1669 COSTS_N_INSNS (10), /* HI */
1670 COSTS_N_INSNS (10), /* SI */
1671 COSTS_N_INSNS (10), /* DI */
1672 COSTS_N_INSNS (10)}, /* other */
1673 0, /* cost of multiply per each bit set */
1674 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1675 COSTS_N_INSNS (66), /* HI */
1676 COSTS_N_INSNS (66), /* SI */
1677 COSTS_N_INSNS (66), /* DI */
1678 COSTS_N_INSNS (66)}, /* other */
1679 COSTS_N_INSNS (1), /* cost of movsx */
1680 COSTS_N_INSNS (1), /* cost of movzx */
1681 16, /* "large" insn */
1682 17, /* MOVE_RATIO */
1683 4, /* cost for loading QImode using movzbl */
1684 {4, 4, 4}, /* cost of loading integer registers
1685 in QImode, HImode and SImode.
1686 Relative to reg-reg move (2). */
1687 {4, 4, 4}, /* cost of storing integer registers */
1688 3, /* cost of reg,reg fld/fst */
1689 {12, 12, 12}, /* cost of loading fp registers
1690 in SFmode, DFmode and XFmode */
1691 {4, 4, 4}, /* cost of storing fp registers
1692 in SFmode, DFmode and XFmode */
1693 6, /* cost of moving MMX register */
1694 {12, 12}, /* cost of loading MMX registers
1695 in SImode and DImode */
1696 {12, 12}, /* cost of storing MMX registers
1697 in SImode and DImode */
1698 6, /* cost of moving SSE register */
1699 {12, 12, 12}, /* cost of loading SSE registers
1700 in SImode, DImode and TImode */
1701 {12, 12, 12}, /* cost of storing SSE registers
1702 in SImode, DImode and TImode */
1703 8, /* MMX or SSE register to integer */
1704 8, /* size of l1 cache. */
1705 1024, /* size of l2 cache. */
1706 128, /* size of prefetch block */
1707 8, /* number of parallel prefetches */
1708 1, /* Branch cost */
1709 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1710 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1711 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1712 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1713 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1714 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1715 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1716 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1717 {100000, unrolled_loop}, {-1, libcall}}}},
1718 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1719 {-1, libcall}}},
1720 {libcall, {{24, loop}, {64, unrolled_loop},
1721 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1722 1, /* scalar_stmt_cost. */
1723 1, /* scalar load_cost. */
1724 1, /* scalar_store_cost. */
1725 1, /* vec_stmt_cost. */
1726 1, /* vec_to_scalar_cost. */
1727 1, /* scalar_to_vec_cost. */
1728 1, /* vec_align_load_cost. */
1729 2, /* vec_unalign_load_cost. */
1730 1, /* vec_store_cost. */
1731 3, /* cond_taken_branch_cost. */
1732 1, /* cond_not_taken_branch_cost. */
1733 };
1734
1735 static const
1736 struct processor_costs atom_cost = {
1737 COSTS_N_INSNS (1), /* cost of an add instruction */
1738 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1739 COSTS_N_INSNS (1), /* variable shift costs */
1740 COSTS_N_INSNS (1), /* constant shift costs */
1741 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1742 COSTS_N_INSNS (4), /* HI */
1743 COSTS_N_INSNS (3), /* SI */
1744 COSTS_N_INSNS (4), /* DI */
1745 COSTS_N_INSNS (2)}, /* other */
1746 0, /* cost of multiply per each bit set */
1747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1748 COSTS_N_INSNS (26), /* HI */
1749 COSTS_N_INSNS (42), /* SI */
1750 COSTS_N_INSNS (74), /* DI */
1751 COSTS_N_INSNS (74)}, /* other */
1752 COSTS_N_INSNS (1), /* cost of movsx */
1753 COSTS_N_INSNS (1), /* cost of movzx */
1754 8, /* "large" insn */
1755 17, /* MOVE_RATIO */
1756 4, /* cost for loading QImode using movzbl */
1757 {4, 4, 4}, /* cost of loading integer registers
1758 in QImode, HImode and SImode.
1759 Relative to reg-reg move (2). */
1760 {4, 4, 4}, /* cost of storing integer registers */
1761 4, /* cost of reg,reg fld/fst */
1762 {12, 12, 12}, /* cost of loading fp registers
1763 in SFmode, DFmode and XFmode */
1764 {6, 6, 8}, /* cost of storing fp registers
1765 in SFmode, DFmode and XFmode */
1766 2, /* cost of moving MMX register */
1767 {8, 8}, /* cost of loading MMX registers
1768 in SImode and DImode */
1769 {8, 8}, /* cost of storing MMX registers
1770 in SImode and DImode */
1771 2, /* cost of moving SSE register */
1772 {8, 8, 8}, /* cost of loading SSE registers
1773 in SImode, DImode and TImode */
1774 {8, 8, 8}, /* cost of storing SSE registers
1775 in SImode, DImode and TImode */
1776 5, /* MMX or SSE register to integer */
1777 32, /* size of l1 cache. */
1778 256, /* size of l2 cache. */
1779 64, /* size of prefetch block */
1780 6, /* number of parallel prefetches */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1789 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1790 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 {{libcall, {{8, loop}, {15, unrolled_loop},
1792 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1793 {libcall, {{24, loop}, {32, unrolled_loop},
1794 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1795 1, /* scalar_stmt_cost. */
1796 1, /* scalar load_cost. */
1797 1, /* scalar_store_cost. */
1798 1, /* vec_stmt_cost. */
1799 1, /* vec_to_scalar_cost. */
1800 1, /* scalar_to_vec_cost. */
1801 1, /* vec_align_load_cost. */
1802 2, /* vec_unalign_load_cost. */
1803 1, /* vec_store_cost. */
1804 3, /* cond_taken_branch_cost. */
1805 1, /* cond_not_taken_branch_cost. */
1806 };
1807
1808 /* Generic64 should produce code tuned for Nocona and K8. */
1809 static const
1810 struct processor_costs generic64_cost = {
1811 COSTS_N_INSNS (1), /* cost of an add instruction */
1812 /* On all chips taken into consideration lea is 2 cycles or more. With
1813 this cost, however, our current implementation of synth_mult results in
1814 the use of unnecessary temporary registers, causing regressions on several
1815 SPECfp benchmarks. */
1816 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1817 COSTS_N_INSNS (1), /* variable shift costs */
1818 COSTS_N_INSNS (1), /* constant shift costs */
1819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1820 COSTS_N_INSNS (4), /* HI */
1821 COSTS_N_INSNS (3), /* SI */
1822 COSTS_N_INSNS (4), /* DI */
1823 COSTS_N_INSNS (2)}, /* other */
1824 0, /* cost of multiply per each bit set */
1825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1826 COSTS_N_INSNS (26), /* HI */
1827 COSTS_N_INSNS (42), /* SI */
1828 COSTS_N_INSNS (74), /* DI */
1829 COSTS_N_INSNS (74)}, /* other */
1830 COSTS_N_INSNS (1), /* cost of movsx */
1831 COSTS_N_INSNS (1), /* cost of movzx */
1832 8, /* "large" insn */
1833 17, /* MOVE_RATIO */
1834 4, /* cost for loading QImode using movzbl */
1835 {4, 4, 4}, /* cost of loading integer registers
1836 in QImode, HImode and SImode.
1837 Relative to reg-reg move (2). */
1838 {4, 4, 4}, /* cost of storing integer registers */
1839 4, /* cost of reg,reg fld/fst */
1840 {12, 12, 12}, /* cost of loading fp registers
1841 in SFmode, DFmode and XFmode */
1842 {6, 6, 8}, /* cost of storing fp registers
1843 in SFmode, DFmode and XFmode */
1844 2, /* cost of moving MMX register */
1845 {8, 8}, /* cost of loading MMX registers
1846 in SImode and DImode */
1847 {8, 8}, /* cost of storing MMX registers
1848 in SImode and DImode */
1849 2, /* cost of moving SSE register */
1850 {8, 8, 8}, /* cost of loading SSE registers
1851 in SImode, DImode and TImode */
1852 {8, 8, 8}, /* cost of storing SSE registers
1853 in SImode, DImode and TImode */
1854 5, /* MMX or SSE register to integer */
1855 32, /* size of l1 cache. */
1856 512, /* size of l2 cache. */
1857 64, /* size of prefetch block */
1858 6, /* number of parallel prefetches */
1859 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1860 value is increased to the perhaps more appropriate value of 5. */
1861 3, /* Branch cost */
1862 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1863 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1864 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1865 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1866 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1867 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1868 {DUMMY_STRINGOP_ALGS,
1869 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1870 {DUMMY_STRINGOP_ALGS,
1871 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1872 1, /* scalar_stmt_cost. */
1873 1, /* scalar load_cost. */
1874 1, /* scalar_store_cost. */
1875 1, /* vec_stmt_cost. */
1876 1, /* vec_to_scalar_cost. */
1877 1, /* scalar_to_vec_cost. */
1878 1, /* vec_align_load_cost. */
1879 2, /* vec_unalign_load_cost. */
1880 1, /* vec_store_cost. */
1881 3, /* cond_taken_branch_cost. */
1882 1, /* cond_not_taken_branch_cost. */
1883 };
1884
1885 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1886 Athlon and K8. */
1887 static const
1888 struct processor_costs generic32_cost = {
1889 COSTS_N_INSNS (1), /* cost of an add instruction */
1890 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1891 COSTS_N_INSNS (1), /* variable shift costs */
1892 COSTS_N_INSNS (1), /* constant shift costs */
1893 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1894 COSTS_N_INSNS (4), /* HI */
1895 COSTS_N_INSNS (3), /* SI */
1896 COSTS_N_INSNS (4), /* DI */
1897 COSTS_N_INSNS (2)}, /* other */
1898 0, /* cost of multiply per each bit set */
1899 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1900 COSTS_N_INSNS (26), /* HI */
1901 COSTS_N_INSNS (42), /* SI */
1902 COSTS_N_INSNS (74), /* DI */
1903 COSTS_N_INSNS (74)}, /* other */
1904 COSTS_N_INSNS (1), /* cost of movsx */
1905 COSTS_N_INSNS (1), /* cost of movzx */
1906 8, /* "large" insn */
1907 17, /* MOVE_RATIO */
1908 4, /* cost for loading QImode using movzbl */
1909 {4, 4, 4}, /* cost of loading integer registers
1910 in QImode, HImode and SImode.
1911 Relative to reg-reg move (2). */
1912 {4, 4, 4}, /* cost of storing integer registers */
1913 4, /* cost of reg,reg fld/fst */
1914 {12, 12, 12}, /* cost of loading fp registers
1915 in SFmode, DFmode and XFmode */
1916 {6, 6, 8}, /* cost of storing fp registers
1917 in SFmode, DFmode and XFmode */
1918 2, /* cost of moving MMX register */
1919 {8, 8}, /* cost of loading MMX registers
1920 in SImode and DImode */
1921 {8, 8}, /* cost of storing MMX registers
1922 in SImode and DImode */
1923 2, /* cost of moving SSE register */
1924 {8, 8, 8}, /* cost of loading SSE registers
1925 in SImode, DImode and TImode */
1926 {8, 8, 8}, /* cost of storing SSE registers
1927 in SImode, DImode and TImode */
1928 5, /* MMX or SSE register to integer */
1929 32, /* size of l1 cache. */
1930 256, /* size of l2 cache. */
1931 64, /* size of prefetch block */
1932 6, /* number of parallel prefetches */
1933 3, /* Branch cost */
1934 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1935 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1936 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1937 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1938 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1939 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1940 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1941 DUMMY_STRINGOP_ALGS},
1942 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1943 DUMMY_STRINGOP_ALGS},
1944 1, /* scalar_stmt_cost. */
1945 1, /* scalar load_cost. */
1946 1, /* scalar_store_cost. */
1947 1, /* vec_stmt_cost. */
1948 1, /* vec_to_scalar_cost. */
1949 1, /* scalar_to_vec_cost. */
1950 1, /* vec_align_load_cost. */
1951 2, /* vec_unalign_load_cost. */
1952 1, /* vec_store_cost. */
1953 3, /* cond_taken_branch_cost. */
1954 1, /* cond_not_taken_branch_cost. */
1955 };
1956
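/* Illustrative note (not part of the original source): the tables above are
   consulted through ix86_cost (and ix86_tune_cost) when instruction costs
   are computed; COSTS_N_INSNS expresses latencies in units of a single fast
   instruction (rtl.h defines it as (N) * 4).  A rough sketch of costing a
   SImode multiply with three set bits in the constant operand, assuming the
   processor_costs field names from i386.h:

     int nbits = 3;
     int cost = ix86_cost->mult_init[2]
                + nbits * ix86_cost->mult_bit;

   The real computation lives in ix86_rtx_costs; treat this only as a sketch
   of how the entries are meant to be read.  */
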
1957 /* Set by -mtune. */
1958 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1959
1960 /* Set by -mtune or -Os. */
1961 const struct processor_costs *ix86_cost = &pentium_cost;
1962
1963 /* Processor feature/optimization bitmasks. */
1964 #define m_386 (1<<PROCESSOR_I386)
1965 #define m_486 (1<<PROCESSOR_I486)
1966 #define m_PENT (1<<PROCESSOR_PENTIUM)
1967 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1968 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1969 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1970 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1971 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1972 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1973 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1974 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1975 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1976 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1977 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1978 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1979 #define m_ATOM (1<<PROCESSOR_ATOM)
1980
1981 #define m_GEODE (1<<PROCESSOR_GEODE)
1982 #define m_K6 (1<<PROCESSOR_K6)
1983 #define m_K6_GEODE (m_K6 | m_GEODE)
1984 #define m_K8 (1<<PROCESSOR_K8)
1985 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1986 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1987 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1988 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1989 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1990 #define m_BDVER (m_BDVER1 | m_BDVER2)
1991 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1992 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1993 #define m_BTVER (m_BTVER1 | m_BTVER2)
1994 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1995
1996 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1997 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1998
1999 /* Generic instruction choice should be common subset of supported CPUs
2000 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
2001 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
2002
2003 /* Feature tests against the various tunings. */
2004 unsigned char ix86_tune_features[X86_TUNE_LAST];
2005
2006 /* Feature tests against the various tunings used to create ix86_tune_features
2007 based on the processor mask. */
2008 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2009 /* X86_TUNE_USE_LEAVE: LEAVE does not affect Nocona SPEC2000 results
2010 negatively, so enabling it for Generic64 seems like a good code-size
2011 tradeoff. We can't enable it for 32-bit generic because it does not
2012 work well with PPro-based chips. */
2013 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
2014
2015 /* X86_TUNE_PUSH_MEMORY */
2016 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2017
2018 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
2019 m_486 | m_PENT,
2020
2021 /* X86_TUNE_UNROLL_STRLEN */
2022 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
2023
2024 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
2025 on simulation results. But after P4 was made, no performance benefit
2026 was observed with branch hints. They also increase the code size.
2027 As a result, icc never generates branch hints. */
2028 0,
2029
2030 /* X86_TUNE_DOUBLE_WITH_ADD */
2031 ~m_386,
2032
2033 /* X86_TUNE_USE_SAHF */
2034 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
2035
2036 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
2037 partial dependencies. */
2038 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2039
2040 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
2041 register stalls on the Generic32 compilation setting as well. However,
2042 in the current implementation the partial register stalls are not eliminated
2043 very well - they can be introduced via subregs synthesized by combine
2044 and can happen in caller/callee saving sequences. Because this option
2045 pays back little on PPro-based chips and conflicts with the partial-register
2046 dependencies used by Athlon/P4-based chips, it is better to leave it off
2047 for generic32 for now. */
2048 m_PPRO,
2049
2050 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
2051 m_CORE2I7 | m_GENERIC,
2052
2053 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
2054 on 16-bit immediate moves into memory on Core2 and Corei7. */
2055 m_CORE2I7 | m_GENERIC,
2056
2057 /* X86_TUNE_USE_HIMODE_FIOP */
2058 m_386 | m_486 | m_K6_GEODE,
2059
2060 /* X86_TUNE_USE_SIMODE_FIOP */
2061 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
2062
2063 /* X86_TUNE_USE_MOV0 */
2064 m_K6,
2065
2066 /* X86_TUNE_USE_CLTD */
2067 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
2068
2069 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2070 m_PENT4,
2071
2072 /* X86_TUNE_SPLIT_LONG_MOVES */
2073 m_PPRO,
2074
2075 /* X86_TUNE_READ_MODIFY_WRITE */
2076 ~m_PENT,
2077
2078 /* X86_TUNE_READ_MODIFY */
2079 ~(m_PENT | m_PPRO),
2080
2081 /* X86_TUNE_PROMOTE_QIMODE */
2082 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2083
2084 /* X86_TUNE_FAST_PREFIX */
2085 ~(m_386 | m_486 | m_PENT),
2086
2087 /* X86_TUNE_SINGLE_STRINGOP */
2088 m_386 | m_P4_NOCONA,
2089
2090 /* X86_TUNE_QIMODE_MATH */
2091 ~0,
2092
2093 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2094 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2095 might be considered for Generic32 if our scheme for avoiding partial
2096 stalls were more effective. */
2097 ~m_PPRO,
2098
2099 /* X86_TUNE_PROMOTE_QI_REGS */
2100 0,
2101
2102 /* X86_TUNE_PROMOTE_HI_REGS */
2103 m_PPRO,
2104
2105 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2106 over esp addition. */
2107 m_386 | m_486 | m_PENT | m_PPRO,
2108
2109 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2110 over esp addition. */
2111 m_PENT,
2112
2113 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2114 over esp subtraction. */
2115 m_386 | m_486 | m_PENT | m_K6_GEODE,
2116
2117 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2118 over esp subtraction. */
2119 m_PENT | m_K6_GEODE,
2120
2121 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2122 for DFmode copies. */
2123 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2124
2125 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2126 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2127
2128 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2129 conflict between PPro/Pentium4-based chips that treat 128-bit
2130 SSE registers as single units and K8-based chips that divide SSE
2131 registers into two 64-bit halves. This knob promotes all store destinations
2132 to be 128-bit to allow register renaming on 128-bit SSE units, but usually
2133 results in one extra micro-op on 64-bit SSE units. Experimental results
2134 show that disabling this option on P4 causes an over 20% SPECfp regression,
2135 while enabling it on K8 causes a roughly 2.4% regression that can be partly
2136 masked by careful scheduling of moves. */
2137 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2138
2139 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2140 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
2141
2142 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2143 m_COREI7 | m_BDVER,
2144
2145 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2146 m_BDVER,
2147
2148 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2149 are resolved on SSE register parts instead of whole registers, so we may
2150 maintain just the lower part of scalar values in the proper format, leaving the
2151 upper part undefined. */
2152 m_ATHLON_K8,
2153
2154 /* X86_TUNE_SSE_TYPELESS_STORES */
2155 m_AMD_MULTIPLE,
2156
2157 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2158 m_PPRO | m_P4_NOCONA,
2159
2160 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2161 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2162
2163 /* X86_TUNE_PROLOGUE_USING_MOVE */
2164 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2165
2166 /* X86_TUNE_EPILOGUE_USING_MOVE */
2167 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2168
2169 /* X86_TUNE_SHIFT1 */
2170 ~m_486,
2171
2172 /* X86_TUNE_USE_FFREEP */
2173 m_AMD_MULTIPLE,
2174
2175 /* X86_TUNE_INTER_UNIT_MOVES */
2176 ~(m_AMD_MULTIPLE | m_GENERIC),
2177
2178 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2179 ~(m_AMDFAM10 | m_BDVER),
2180
2181 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2182 than 4 branch instructions in a 16-byte window. */
2183 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2184
2185 /* X86_TUNE_SCHEDULE */
2186 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2187
2188 /* X86_TUNE_USE_BT */
2189 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2190
2191 /* X86_TUNE_USE_INCDEC */
2192 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2193
2194 /* X86_TUNE_PAD_RETURNS */
2195 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2196
2197 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2198 m_ATOM,
2199
2200 /* X86_TUNE_EXT_80387_CONSTANTS */
2201 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2202
2203 /* X86_TUNE_SHORTEN_X87_SSE */
2204 ~m_K8,
2205
2206 /* X86_TUNE_AVOID_VECTOR_DECODE */
2207 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2208
2209 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2210 and SImode multiply, but the 386 and 486 do HImode multiplies faster. */
2211 ~(m_386 | m_486),
2212
2213 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: IMUL of a 32-bit constant and memory takes
2214 the vector path on AMD machines. */
2215 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2216
2217 /* X86_TUNE_SLOW_IMUL_IMM8: IMUL of an 8-bit constant takes the vector path
2218 on AMD machines. */
2219 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2220
2221 /* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
2222 than via MOV. */
2223 m_PENT,
2224
2225 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2226 but one byte longer. */
2227 m_PENT,
2228
2229 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2230 operand that cannot be represented using a modRM byte. The XOR
2231 replacement is long decoded, so this split helps here as well. */
2232 m_K6,
2233
2234 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2235 from FP to FP. */
2236 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2237
2238 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2239 from integer to FP. */
2240 m_AMDFAM10,
2241
2242 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2243 with a subsequent conditional jump instruction into a single
2244 compare-and-branch uop. */
2245 m_BDVER,
2246
2247 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2248 will impact LEA instruction selection. */
2249 m_ATOM,
2250
2251 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2252 instructions. */
2253 ~m_ATOM,
2254
2255 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2256 at -O3. For the moment, the prefetching seems badly tuned for Intel
2257 chips. */
2258 m_K6_GEODE | m_AMD_MULTIPLE,
2259
2260 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2261 the auto-vectorizer. */
2262 m_BDVER,
2263
2264 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2265 during reassociation of integer computation. */
2266 m_ATOM,
2267
2268 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2269 during reassociation of fp computation. */
2270 m_ATOM
2271 };
2272
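/* Illustrative note (not part of the original source): each entry above is a
   bitmask over processors.  ix86_option_override_internal later reduces the
   table to per-feature booleans for the selected -mtune processor, roughly:

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   The TARGET_* tuning macros in i386.h then test ix86_tune_features.  */
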
2273 /* Feature tests against the various architecture variations. */
2274 unsigned char ix86_arch_features[X86_ARCH_LAST];
2275
2276 /* Feature tests against the various architecture variations, used to create
2277 ix86_arch_features based on the processor mask. */
2278 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2279 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2280 ~(m_386 | m_486 | m_PENT | m_K6),
2281
2282 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2283 ~m_386,
2284
2285 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2286 ~(m_386 | m_486),
2287
2288 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2289 ~m_386,
2290
2291 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2292 ~m_386,
2293 };
2294
2295 static const unsigned int x86_accumulate_outgoing_args
2296 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2297
2298 static const unsigned int x86_arch_always_fancy_math_387
2299 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2300
2301 static const unsigned int x86_avx256_split_unaligned_load
2302 = m_COREI7 | m_GENERIC;
2303
2304 static const unsigned int x86_avx256_split_unaligned_store
2305 = m_COREI7 | m_BDVER | m_GENERIC;
2306
2307 /* In case the average insn count for a single function invocation is
2308 lower than this constant, emit fast (but longer) prologue and
2309 epilogue code. */
2310 #define FAST_PROLOGUE_INSN_COUNT 20
2311
2312 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2313 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2314 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2315 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2316
2317 /* Array of the smallest class containing reg number REGNO, indexed by
2318 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2319
2320 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2321 {
2322 /* ax, dx, cx, bx */
2323 AREG, DREG, CREG, BREG,
2324 /* si, di, bp, sp */
2325 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2326 /* FP registers */
2327 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2328 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2329 /* arg pointer */
2330 NON_Q_REGS,
2331 /* flags, fpsr, fpcr, frame */
2332 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2333 /* SSE registers */
2334 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2335 SSE_REGS, SSE_REGS,
2336 /* MMX registers */
2337 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2338 MMX_REGS, MMX_REGS,
2339 /* REX registers */
2340 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2341 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2342 /* SSE REX registers */
2343 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2344 SSE_REGS, SSE_REGS,
2345 };
2346
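/* Illustrative note (not part of the original source): this table backs the
   REGNO_REG_CLASS macro in i386.h, so code can ask for the smallest class of
   a hard register directly, e.g.:

     enum reg_class cl = REGNO_REG_CLASS (AX_REG);

   which yields AREG for %eax according to the table above.  */
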
2347 /* The "default" register map used in 32bit mode. */
2348
2349 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2350 {
2351 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2352 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2353 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2354 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2355 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2356 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2357 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2358 };
2359
2360 /* The "default" register map used in 64bit mode. */
2361
2362 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2363 {
2364 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2365 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2366 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2367 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2368 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2369 8,9,10,11,12,13,14,15, /* extended integer registers */
2370 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2371 };
2372
2373 /* Define the register numbers to be used in Dwarf debugging information.
2374 The SVR4 reference port C compiler uses the following register numbers
2375 in its Dwarf output code:
2376 0 for %eax (gcc regno = 0)
2377 1 for %ecx (gcc regno = 2)
2378 2 for %edx (gcc regno = 1)
2379 3 for %ebx (gcc regno = 3)
2380 4 for %esp (gcc regno = 7)
2381 5 for %ebp (gcc regno = 6)
2382 6 for %esi (gcc regno = 4)
2383 7 for %edi (gcc regno = 5)
2384 The following three DWARF register numbers are never generated by
2385 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2386 believes these numbers have these meanings.
2387 8 for %eip (no gcc equivalent)
2388 9 for %eflags (gcc regno = 17)
2389 10 for %trapno (no gcc equivalent)
2390 It is not at all clear how we should number the FP stack registers
2391 for the x86 architecture. If the version of SDB on x86/svr4 were
2392 a bit less brain dead with respect to floating-point then we would
2393 have a precedent to follow with respect to DWARF register numbers
2394 for x86 FP registers, but the SDB on x86/svr4 is so completely
2395 broken with respect to FP registers that it is hardly worth thinking
2396 of it as something to strive for compatibility with.
2397 The version of x86/svr4 SDB I have at the moment does (partially)
2398 seem to believe that DWARF register number 11 is associated with
2399 the x86 register %st(0), but that's about all. Higher DWARF
2400 register numbers don't seem to be associated with anything in
2401 particular, and even for DWARF regno 11, SDB only seems to under-
2402 stand that it should say that a variable lives in %st(0) (when
2403 asked via an `=' command) if we said it was in DWARF regno 11,
2404 but SDB still prints garbage when asked for the value of the
2405 variable in question (via a `/' command).
2406 (Also note that the labels SDB prints for various FP stack regs
2407 when doing an `x' command are all wrong.)
2408 Note that these problems generally don't affect the native SVR4
2409 C compiler because it doesn't allow the use of -O with -g and
2410 because when it is *not* optimizing, it allocates a memory
2411 location for each floating-point variable, and the memory
2412 location is what gets described in the DWARF AT_location
2413 attribute for the variable in question.
2414 Regardless of the severe mental illness of the x86/svr4 SDB, we
2415 do something sensible here and we use the following DWARF
2416 register numbers. Note that these are all stack-top-relative
2417 numbers.
2418 11 for %st(0) (gcc regno = 8)
2419 12 for %st(1) (gcc regno = 9)
2420 13 for %st(2) (gcc regno = 10)
2421 14 for %st(3) (gcc regno = 11)
2422 15 for %st(4) (gcc regno = 12)
2423 16 for %st(5) (gcc regno = 13)
2424 17 for %st(6) (gcc regno = 14)
2425 18 for %st(7) (gcc regno = 15)
2426 */
2427 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2428 {
2429 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2430 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2431 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2432 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2433 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2434 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2435 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2436 };
2437
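/* Illustrative note (not part of the original source): these maps translate
   gcc's internal hard register numbers into the debug-info numbering, so for
   example under the SVR4 DWARF numbering documented above:

     int dwarf_ebp = svr4_dbx_register_map[6];

   yields 5, matching "5 for %ebp (gcc regno = 6)" in the comment above.  */
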
2438 /* Define parameter passing and return registers. */
2439
2440 static int const x86_64_int_parameter_registers[6] =
2441 {
2442 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2443 };
2444
2445 static int const x86_64_ms_abi_int_parameter_registers[4] =
2446 {
2447 CX_REG, DX_REG, R8_REG, R9_REG
2448 };
2449
2450 static int const x86_64_int_return_registers[4] =
2451 {
2452 AX_REG, DX_REG, DI_REG, SI_REG
2453 };
2454
2455 /* Define the structure for the machine field in struct function. */
2456
2457 struct GTY(()) stack_local_entry {
2458 unsigned short mode;
2459 unsigned short n;
2460 rtx rtl;
2461 struct stack_local_entry *next;
2462 };
2463
2464 /* Structure describing stack frame layout.
2465 Stack grows downward:
2466
2467 [arguments]
2468 <- ARG_POINTER
2469 saved pc
2470
2471 saved static chain if ix86_static_chain_on_stack
2472
2473 saved frame pointer if frame_pointer_needed
2474 <- HARD_FRAME_POINTER
2475 [saved regs]
2476 <- regs_save_offset
2477 [padding0]
2478
2479 [saved SSE regs]
2480 <- sse_regs_save_offset
2481 [padding1] |
2482 | <- FRAME_POINTER
2483 [va_arg registers] |
2484 |
2485 [frame] |
2486 |
2487 [padding2] | = to_allocate
2488 <- STACK_POINTER
2489 */
2490 struct ix86_frame
2491 {
2492 int nsseregs;
2493 int nregs;
2494 int va_arg_size;
2495 int red_zone_size;
2496 int outgoing_arguments_size;
2497
2498 /* The offsets relative to ARG_POINTER. */
2499 HOST_WIDE_INT frame_pointer_offset;
2500 HOST_WIDE_INT hard_frame_pointer_offset;
2501 HOST_WIDE_INT stack_pointer_offset;
2502 HOST_WIDE_INT hfp_save_offset;
2503 HOST_WIDE_INT reg_save_offset;
2504 HOST_WIDE_INT sse_reg_save_offset;
2505
2506 /* When save_regs_using_mov is set, emit prologue using
2507 move instead of push instructions. */
2508 bool save_regs_using_mov;
2509 };
2510
2511 /* Which cpu are we scheduling for. */
2512 enum attr_cpu ix86_schedule;
2513
2514 /* Which cpu are we optimizing for. */
2515 enum processor_type ix86_tune;
2516
2517 /* Which instruction set architecture to use. */
2518 enum processor_type ix86_arch;
2519
2520 /* True if the SSE prefetch instruction is not a NOP. */
2521 int x86_prefetch_sse;
2522
2523 /* -mstackrealign option */
2524 static const char ix86_force_align_arg_pointer_string[]
2525 = "force_align_arg_pointer";
2526
2527 static rtx (*ix86_gen_leave) (void);
2528 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2529 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2530 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2531 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2532 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2533 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2534 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2535 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2536 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2537 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2538 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2539
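/* Illustrative note (not part of the original source): these generator
   pointers are filled in at option-override time with the SImode or DImode
   pattern generators depending on the word size, roughly (assuming the
   standard i386.md expanders):

     if (TARGET_64BIT)
       ix86_gen_add3 = gen_adddi3;
     else
       ix86_gen_add3 = gen_addsi3;

   so later code can emit an add of the right mode without checking
   TARGET_64BIT at every call site.  */
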
2540 /* Preferred alignment for stack boundary in bits. */
2541 unsigned int ix86_preferred_stack_boundary;
2542
2543 /* Alignment for incoming stack boundary in bits specified at
2544 command line. */
2545 static unsigned int ix86_user_incoming_stack_boundary;
2546
2547 /* Default alignment for incoming stack boundary in bits. */
2548 static unsigned int ix86_default_incoming_stack_boundary;
2549
2550 /* Alignment for incoming stack boundary in bits. */
2551 unsigned int ix86_incoming_stack_boundary;
2552
2553 /* Calling-ABI-specific va_list type nodes. */
2554 static GTY(()) tree sysv_va_list_type_node;
2555 static GTY(()) tree ms_va_list_type_node;
2556
2557 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2558 char internal_label_prefix[16];
2559 int internal_label_prefix_len;
2560
2561 /* Fence to use after loop using movnt. */
2562 tree x86_mfence;
2563
2564 /* Register class used for passing a given 64bit part of the argument.
2565 These represent classes as documented by the psABI, with the exception
2566 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2567 uses an SFmode or DFmode move instead of a DImode one to avoid reformatting penalties.
2568
2569 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2570 whenever possible (the upper half does contain padding). */
2571 enum x86_64_reg_class
2572 {
2573 X86_64_NO_CLASS,
2574 X86_64_INTEGER_CLASS,
2575 X86_64_INTEGERSI_CLASS,
2576 X86_64_SSE_CLASS,
2577 X86_64_SSESF_CLASS,
2578 X86_64_SSEDF_CLASS,
2579 X86_64_SSEUP_CLASS,
2580 X86_64_X87_CLASS,
2581 X86_64_X87UP_CLASS,
2582 X86_64_COMPLEX_X87_CLASS,
2583 X86_64_MEMORY_CLASS
2584 };
2585
2586 #define MAX_CLASSES 4
2587
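/* Illustrative note (not part of the original source): under the x86-64
   psABI an argument is split into eightbytes and each eightbyte gets one of
   the classes above; e.g. a 16-byte

     struct s { double d; long l; };

   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS }, so d travels
   in an SSE register and l in a general-purpose register.  MAX_CLASSES
   bounds how many eightbytes (classes) a single argument may occupy.  */
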
2588 /* Table of constants used by fldpi, fldln2, etc.... */
2589 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2590 static bool ext_80387_constants_init = 0;
2591
2592 \f
2593 static struct machine_function * ix86_init_machine_status (void);
2594 static rtx ix86_function_value (const_tree, const_tree, bool);
2595 static bool ix86_function_value_regno_p (const unsigned int);
2596 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2597 const_tree);
2598 static rtx ix86_static_chain (const_tree, bool);
2599 static int ix86_function_regparm (const_tree, const_tree);
2600 static void ix86_compute_frame_layout (struct ix86_frame *);
2601 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2602 rtx, rtx, int);
2603 static void ix86_add_new_builtins (HOST_WIDE_INT);
2604 static tree ix86_canonical_va_list_type (tree);
2605 static void predict_jump (int);
2606 static unsigned int split_stack_prologue_scratch_regno (void);
2607 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2608
2609 enum ix86_function_specific_strings
2610 {
2611 IX86_FUNCTION_SPECIFIC_ARCH,
2612 IX86_FUNCTION_SPECIFIC_TUNE,
2613 IX86_FUNCTION_SPECIFIC_MAX
2614 };
2615
2616 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2617 const char *, enum fpmath_unit, bool);
2618 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2619 static void ix86_function_specific_save (struct cl_target_option *);
2620 static void ix86_function_specific_restore (struct cl_target_option *);
2621 static void ix86_function_specific_print (FILE *, int,
2622 struct cl_target_option *);
2623 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2624 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2625 struct gcc_options *);
2626 static bool ix86_can_inline_p (tree, tree);
2627 static void ix86_set_current_function (tree);
2628 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2629
2630 static enum calling_abi ix86_function_abi (const_tree);
2631
2632 \f
2633 #ifndef SUBTARGET32_DEFAULT_CPU
2634 #define SUBTARGET32_DEFAULT_CPU "i386"
2635 #endif
2636
2637 /* The svr4 ABI for the i386 says that records and unions are returned
2638 in memory. */
2639 #ifndef DEFAULT_PCC_STRUCT_RETURN
2640 #define DEFAULT_PCC_STRUCT_RETURN 1
2641 #endif
2642
2643 /* Whether -mtune= or -march= were specified */
2644 static int ix86_tune_defaulted;
2645 static int ix86_arch_specified;
2646
2647 /* Vectorization library interface and handlers. */
2648 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2649
2650 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2651 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2652
2653 /* Processor target table, indexed by processor number */
2654 struct ptt
2655 {
2656 const struct processor_costs *cost; /* Processor costs */
2657 const int align_loop; /* Default alignments. */
2658 const int align_loop_max_skip;
2659 const int align_jump;
2660 const int align_jump_max_skip;
2661 const int align_func;
2662 };
2663
2664 static const struct ptt processor_target_table[PROCESSOR_max] =
2665 {
2666 {&i386_cost, 4, 3, 4, 3, 4},
2667 {&i486_cost, 16, 15, 16, 15, 16},
2668 {&pentium_cost, 16, 7, 16, 7, 16},
2669 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2670 {&geode_cost, 0, 0, 0, 0, 0},
2671 {&k6_cost, 32, 7, 32, 7, 32},
2672 {&athlon_cost, 16, 7, 16, 7, 16},
2673 {&pentium4_cost, 0, 0, 0, 0, 0},
2674 {&k8_cost, 16, 7, 16, 7, 16},
2675 {&nocona_cost, 0, 0, 0, 0, 0},
2676 /* Core 2 32-bit. */
2677 {&generic32_cost, 16, 10, 16, 10, 16},
2678 /* Core 2 64-bit. */
2679 {&generic64_cost, 16, 10, 16, 10, 16},
2680 /* Core i7 32-bit. */
2681 {&generic32_cost, 16, 10, 16, 10, 16},
2682 /* Core i7 64-bit. */
2683 {&generic64_cost, 16, 10, 16, 10, 16},
2684 {&generic32_cost, 16, 7, 16, 7, 16},
2685 {&generic64_cost, 16, 10, 16, 10, 16},
2686 {&amdfam10_cost, 32, 24, 32, 7, 32},
2687 {&bdver1_cost, 32, 24, 32, 7, 32},
2688 {&bdver2_cost, 32, 24, 32, 7, 32},
2689 {&btver1_cost, 32, 24, 32, 7, 32},
2690 {&btver2_cost, 32, 24, 32, 7, 32},
2691 {&atom_cost, 16, 15, 16, 7, 16}
2692 };
2693
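/* Illustrative note (not part of the original source): these per-processor
   alignment defaults only apply when the user did not specify alignments;
   ix86_option_override_internal does approximately:

     if (align_loops == 0)
       {
         align_loops = processor_target_table[ix86_tune].align_loop;
         align_loops_max_skip
           = processor_target_table[ix86_tune].align_loop_max_skip;
       }

   and the same for align_jumps and align_functions.  */
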
2694 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2695 {
2696 "generic",
2697 "i386",
2698 "i486",
2699 "pentium",
2700 "pentium-mmx",
2701 "pentiumpro",
2702 "pentium2",
2703 "pentium3",
2704 "pentium4",
2705 "pentium-m",
2706 "prescott",
2707 "nocona",
2708 "core2",
2709 "corei7",
2710 "atom",
2711 "geode",
2712 "k6",
2713 "k6-2",
2714 "k6-3",
2715 "athlon",
2716 "athlon-4",
2717 "k8",
2718 "amdfam10",
2719 "bdver1",
2720 "bdver2",
2721 "btver1",
2722 "btver2"
2723 };
2724 \f
2725 /* Return true if a red-zone is in use. */
2726
2727 static inline bool
2728 ix86_using_red_zone (void)
2729 {
2730 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2731 }
2732 \f
2733 /* Return a string that documents the current -m options. The caller is
2734 responsible for freeing the string. */
2735
2736 static char *
2737 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2738 const char *tune, enum fpmath_unit fpmath,
2739 bool add_nl_p)
2740 {
2741 struct ix86_target_opts
2742 {
2743 const char *option; /* option string */
2744 HOST_WIDE_INT mask; /* isa mask options */
2745 };
2746
2747 /* This table is ordered so that options like -msse4.2 that imply
2748 preceding options are matched first. */
2749 static struct ix86_target_opts isa_opts[] =
2750 {
2751 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2752 { "-mfma", OPTION_MASK_ISA_FMA },
2753 { "-mxop", OPTION_MASK_ISA_XOP },
2754 { "-mlwp", OPTION_MASK_ISA_LWP },
2755 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2756 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2757 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2758 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2759 { "-msse3", OPTION_MASK_ISA_SSE3 },
2760 { "-msse2", OPTION_MASK_ISA_SSE2 },
2761 { "-msse", OPTION_MASK_ISA_SSE },
2762 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2763 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2764 { "-mmmx", OPTION_MASK_ISA_MMX },
2765 { "-mabm", OPTION_MASK_ISA_ABM },
2766 { "-mbmi", OPTION_MASK_ISA_BMI },
2767 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2768 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2769 { "-mhle", OPTION_MASK_ISA_HLE },
2770 { "-mtbm", OPTION_MASK_ISA_TBM },
2771 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2772 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2773 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2774 { "-maes", OPTION_MASK_ISA_AES },
2775 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2776 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2777 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2778 { "-mf16c", OPTION_MASK_ISA_F16C },
2779 { "-mrtm", OPTION_MASK_ISA_RTM },
2780 };
2781
2782 /* Flag options. */
2783 static struct ix86_target_opts flag_opts[] =
2784 {
2785 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2786 { "-m80387", MASK_80387 },
2787 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2788 { "-malign-double", MASK_ALIGN_DOUBLE },
2789 { "-mcld", MASK_CLD },
2790 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2791 { "-mieee-fp", MASK_IEEE_FP },
2792 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2793 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2794 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2795 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2796 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2797 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2798 { "-mno-red-zone", MASK_NO_RED_ZONE },
2799 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2800 { "-mrecip", MASK_RECIP },
2801 { "-mrtd", MASK_RTD },
2802 { "-msseregparm", MASK_SSEREGPARM },
2803 { "-mstack-arg-probe", MASK_STACK_PROBE },
2804 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2805 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2806 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2807 { "-mvzeroupper", MASK_VZEROUPPER },
2808 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2809 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2810 { "-mprefer-avx128", MASK_PREFER_AVX128},
2811 };
2812
2813 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2814
2815 char isa_other[40];
2816 char target_other[40];
2817 unsigned num = 0;
2818 unsigned i, j;
2819 char *ret;
2820 char *ptr;
2821 size_t len;
2822 size_t line_len;
2823 size_t sep_len;
2824 const char *abi;
2825
2826 memset (opts, '\0', sizeof (opts));
2827
2828 /* Add -march= option. */
2829 if (arch)
2830 {
2831 opts[num][0] = "-march=";
2832 opts[num++][1] = arch;
2833 }
2834
2835 /* Add -mtune= option. */
2836 if (tune)
2837 {
2838 opts[num][0] = "-mtune=";
2839 opts[num++][1] = tune;
2840 }
2841
2842 /* Add -m32/-m64/-mx32. */
2843 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2844 {
2845 if ((isa & OPTION_MASK_ABI_64) != 0)
2846 abi = "-m64";
2847 else
2848 abi = "-mx32";
2849 isa &= ~ (OPTION_MASK_ISA_64BIT
2850 | OPTION_MASK_ABI_64
2851 | OPTION_MASK_ABI_X32);
2852 }
2853 else
2854 abi = "-m32";
2855 opts[num++][0] = abi;
2856
2857 /* Pick out the options in isa options. */
2858 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2859 {
2860 if ((isa & isa_opts[i].mask) != 0)
2861 {
2862 opts[num++][0] = isa_opts[i].option;
2863 isa &= ~ isa_opts[i].mask;
2864 }
2865 }
2866
2867 if (isa && add_nl_p)
2868 {
2869 opts[num++][0] = isa_other;
2870 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2871 isa);
2872 }
2873
2874 /* Add flag options. */
2875 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2876 {
2877 if ((flags & flag_opts[i].mask) != 0)
2878 {
2879 opts[num++][0] = flag_opts[i].option;
2880 flags &= ~ flag_opts[i].mask;
2881 }
2882 }
2883
2884 if (flags && add_nl_p)
2885 {
2886 opts[num++][0] = target_other;
2887 sprintf (target_other, "(other flags: %#x)", flags);
2888 }
2889
2890 /* Add -fpmath= option. */
2891 if (fpmath)
2892 {
2893 opts[num][0] = "-mfpmath=";
2894 switch ((int) fpmath)
2895 {
2896 case FPMATH_387:
2897 opts[num++][1] = "387";
2898 break;
2899
2900 case FPMATH_SSE:
2901 opts[num++][1] = "sse";
2902 break;
2903
2904 case FPMATH_387 | FPMATH_SSE:
2905 opts[num++][1] = "sse+387";
2906 break;
2907
2908 default:
2909 gcc_unreachable ();
2910 }
2911 }
2912
2913 /* Any options? */
2914 if (num == 0)
2915 return NULL;
2916
2917 gcc_assert (num < ARRAY_SIZE (opts));
2918
2919 /* Size the string. */
2920 len = 0;
2921 sep_len = (add_nl_p) ? 3 : 1;
2922 for (i = 0; i < num; i++)
2923 {
2924 len += sep_len;
2925 for (j = 0; j < 2; j++)
2926 if (opts[i][j])
2927 len += strlen (opts[i][j]);
2928 }
2929
2930 /* Build the string. */
2931 ret = ptr = (char *) xmalloc (len);
2932 line_len = 0;
2933
2934 for (i = 0; i < num; i++)
2935 {
2936 size_t len2[2];
2937
2938 for (j = 0; j < 2; j++)
2939 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2940
2941 if (i != 0)
2942 {
2943 *ptr++ = ' ';
2944 line_len++;
2945
2946 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2947 {
2948 *ptr++ = '\\';
2949 *ptr++ = '\n';
2950 line_len = 0;
2951 }
2952 }
2953
2954 for (j = 0; j < 2; j++)
2955 if (opts[i][j])
2956 {
2957 memcpy (ptr, opts[i][j], len2[j]);
2958 ptr += len2[j];
2959 line_len += len2[j];
2960 }
2961 }
2962
2963 *ptr = '\0';
2964 gcc_assert (ret + len >= ptr);
2965
2966 return ret;
2967 }
2968
2969 /* Return true if profiling code should be emitted before the
2970 prologue, otherwise false. On x86 this is the case when -mfentry
2971 ("hotfix" support) is in use. */
2972 static bool
2973 ix86_profile_before_prologue (void)
2974 {
2975 return flag_fentry != 0;
2976 }
2977
2978 /* Function that is callable from the debugger to print the current
2979 options. */
2980 void
2981 ix86_debug_options (void)
2982 {
2983 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2984 ix86_arch_string, ix86_tune_string,
2985 ix86_fpmath, true);
2986
2987 if (opts)
2988 {
2989 fprintf (stderr, "%s\n\n", opts);
2990 free (opts);
2991 }
2992 else
2993 fputs ("<no options>\n\n", stderr);
2994
2995 return;
2996 }
2997 \f
2998 /* Override various settings based on options. If MAIN_ARGS_P, the
2999 options are from the command line, otherwise they are from
3000 attributes. */
3001
3002 static void
3003 ix86_option_override_internal (bool main_args_p)
3004 {
3005 int i;
3006 unsigned int ix86_arch_mask, ix86_tune_mask;
3007 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3008 const char *prefix;
3009 const char *suffix;
3010 const char *sw;
3011
3012 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3013 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3014 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3015 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3016 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3017 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3018 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3019 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3020 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3021 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3022 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3023 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3024 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3025 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3026 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3027 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3028 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3029 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3030 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3031 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3032 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3033 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3034 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3035 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3036 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3037 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3038 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3039 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3040 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3041 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3042 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3043 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3044 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3045 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3046 /* If this reaches 64, we need to widen the flags field of struct pta below. */
3047
3048 static struct pta
3049 {
3050 const char *const name; /* processor name or nickname. */
3051 const enum processor_type processor;
3052 const enum attr_cpu schedule;
3053 const unsigned HOST_WIDE_INT flags;
3054 }
3055 const processor_alias_table[] =
3056 {
3057 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3058 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3059 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3060 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3061 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3062 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3063 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3064 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3065 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
3066 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3067 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3068 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
3069 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3070 PTA_MMX | PTA_SSE},
3071 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3072 PTA_MMX | PTA_SSE},
3073 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3074 PTA_MMX | PTA_SSE | PTA_SSE2},
3075 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3076 PTA_MMX | PTA_SSE | PTA_SSE2},
3077 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3078 PTA_MMX | PTA_SSE | PTA_SSE2},
3079 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3080 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
3081 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3082 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3083 | PTA_CX16 | PTA_NO_SAHF},
3084 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3085 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3086 | PTA_SSSE3 | PTA_CX16},
3087 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3088 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3089 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3090 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3091 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3092 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3093 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3094 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3095 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3096 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3097 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3098 | PTA_RDRND | PTA_F16C},
3099 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3100 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3101 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3102 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3103 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3104 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
3105 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3106 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3107 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3108 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3109 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3110 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3111 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3112 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3113 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3114 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3115 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3116 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3117 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3118 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3119 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3120 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3121 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3122 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3123 {"x86-64", PROCESSOR_K8, CPU_K8,
3124 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3125 {"k8", PROCESSOR_K8, CPU_K8,
3126 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3127 | PTA_SSE2 | PTA_NO_SAHF},
3128 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3129 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3130 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3131 {"opteron", PROCESSOR_K8, CPU_K8,
3132 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3133 | PTA_SSE2 | PTA_NO_SAHF},
3134 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3135 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3136 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3137 {"athlon64", PROCESSOR_K8, CPU_K8,
3138 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3139 | PTA_SSE2 | PTA_NO_SAHF},
3140 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3141 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3142 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3143 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3144 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3145 | PTA_SSE2 | PTA_NO_SAHF},
3146 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3147 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3148 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3149 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3150 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3151 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3152 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3153 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3154 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3155 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3156 | PTA_XOP | PTA_LWP},
3157 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3158 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3159 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3160 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3161 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3162 | PTA_FMA},
3163 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3164 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3165 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3166 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3167 PTA_HLE /* flags are only used for -march switch. */ },
3168 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
3169 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3170 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3171 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3172 | PTA_BMI | PTA_F16C | PTA_MOVBE},
3173 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3174 PTA_64BIT
3175 | PTA_HLE /* flags are only used for -march switch. */ },
3176 };
3177
3178 /* -mrecip options. */
3179 static struct
3180 {
3181 const char *string; /* option name */
3182 unsigned int mask; /* mask bits to set */
3183 }
3184 const recip_options[] =
3185 {
3186 { "all", RECIP_MASK_ALL },
3187 { "none", RECIP_MASK_NONE },
3188 { "div", RECIP_MASK_DIV },
3189 { "sqrt", RECIP_MASK_SQRT },
3190 { "vec-div", RECIP_MASK_VEC_DIV },
3191 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3192 };
3193
3194 int const pta_size = ARRAY_SIZE (processor_alias_table);
3195
3196 /* Set up prefix/suffix so the error messages refer to either the command
3197 line argument, or the attribute(target). */
3198 if (main_args_p)
3199 {
3200 prefix = "-m";
3201 suffix = "";
3202 sw = "switch";
3203 }
3204 else
3205 {
3206 prefix = "option(\"";
3207 suffix = "\")";
3208 sw = "attribute";
3209 }
3210
3211 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3212 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3213 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3214 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3215 #ifdef TARGET_BI_ARCH
3216 else
3217 {
3218 #if TARGET_BI_ARCH == 1
3219 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3220 is on and OPTION_MASK_ABI_X32 is off. We turn off
3221 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3222 -mx32. */
3223 if (TARGET_X32)
3224 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3225 #else
3226 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3227 on and OPTION_MASK_ABI_64 is off. We turn off
3228 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3229 -m64. */
3230 if (TARGET_LP64)
3231 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3232 #endif
3233 }
3234 #endif
3235
3236 if (TARGET_X32)
3237 {
3238 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3239 OPTION_MASK_ABI_64 for TARGET_X32. */
3240 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3241 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3242 }
3243 else if (TARGET_LP64)
3244 {
3245 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3246 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3247 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3248 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3249 }
3250
3251 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3252 SUBTARGET_OVERRIDE_OPTIONS;
3253 #endif
3254
3255 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3256 SUBSUBTARGET_OVERRIDE_OPTIONS;
3257 #endif
3258
3259 /* On Mach-O (Darwin), -fPIC is the default for 64-bit code. */
3260 if (TARGET_MACHO && TARGET_64BIT)
3261 flag_pic = 2;
3262
3263 /* Need to check -mtune=generic first. */
3264 if (ix86_tune_string)
3265 {
3266 if (!strcmp (ix86_tune_string, "generic")
3267 || !strcmp (ix86_tune_string, "i686")
3268 /* As special support for cross compilers we read -mtune=native
3269 as -mtune=generic. With native compilers we won't see the
3270 -mtune=native, as it was changed by the driver. */
3271 || !strcmp (ix86_tune_string, "native"))
3272 {
3273 if (TARGET_64BIT)
3274 ix86_tune_string = "generic64";
3275 else
3276 ix86_tune_string = "generic32";
3277 }
3278 /* If this call is for setting the option attribute, allow the
3279 generic32/generic64 that was previously set. */
3280 else if (!main_args_p
3281 && (!strcmp (ix86_tune_string, "generic32")
3282 || !strcmp (ix86_tune_string, "generic64")))
3283 ;
3284 else if (!strncmp (ix86_tune_string, "generic", 7))
3285 error ("bad value (%s) for %stune=%s %s",
3286 ix86_tune_string, prefix, suffix, sw);
3287 else if (!strcmp (ix86_tune_string, "x86-64"))
3288 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3289 "%stune=k8%s or %stune=generic%s instead as appropriate",
3290 prefix, suffix, prefix, suffix, prefix, suffix);
3291 }
3292 else
3293 {
3294 if (ix86_arch_string)
3295 ix86_tune_string = ix86_arch_string;
3296 if (!ix86_tune_string)
3297 {
3298 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3299 ix86_tune_defaulted = 1;
3300 }
3301
3302 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3303 need to use a sensible tune option. */
3304 if (!strcmp (ix86_tune_string, "generic")
3305 || !strcmp (ix86_tune_string, "x86-64")
3306 || !strcmp (ix86_tune_string, "i686"))
3307 {
3308 if (TARGET_64BIT)
3309 ix86_tune_string = "generic64";
3310 else
3311 ix86_tune_string = "generic32";
3312 }
3313 }
3314
3315 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3316 {
3317 /* rep; movq isn't available in 32-bit code. */
3318 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3319 ix86_stringop_alg = no_stringop;
3320 }
3321
3322 if (!ix86_arch_string)
3323 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3324 else
3325 ix86_arch_specified = 1;
3326
3327 if (global_options_set.x_ix86_pmode)
3328 {
3329 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3330 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3331 error ("address mode %qs not supported in the %s bit mode",
3332 TARGET_64BIT ? "short" : "long",
3333 TARGET_64BIT ? "64" : "32");
3334 }
3335 else
3336 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3337
3338 if (!global_options_set.x_ix86_abi)
3339 ix86_abi = DEFAULT_ABI;
3340
3341 if (global_options_set.x_ix86_cmodel)
3342 {
3343 switch (ix86_cmodel)
3344 {
3345 case CM_SMALL:
3346 case CM_SMALL_PIC:
3347 if (flag_pic)
3348 ix86_cmodel = CM_SMALL_PIC;
3349 if (!TARGET_64BIT)
3350 error ("code model %qs not supported in the %s bit mode",
3351 "small", "32");
3352 break;
3353
3354 case CM_MEDIUM:
3355 case CM_MEDIUM_PIC:
3356 if (flag_pic)
3357 ix86_cmodel = CM_MEDIUM_PIC;
3358 if (!TARGET_64BIT)
3359 error ("code model %qs not supported in the %s bit mode",
3360 "medium", "32");
3361 else if (TARGET_X32)
3362 error ("code model %qs not supported in x32 mode",
3363 "medium");
3364 break;
3365
3366 case CM_LARGE:
3367 case CM_LARGE_PIC:
3368 if (flag_pic)
3369 ix86_cmodel = CM_LARGE_PIC;
3370 if (!TARGET_64BIT)
3371 error ("code model %qs not supported in the %s bit mode",
3372 "large", "32");
3373 else if (TARGET_X32)
3374 error ("code model %qs not supported in x32 mode",
3375 "large");
3376 break;
3377
3378 case CM_32:
3379 if (flag_pic)
3380 error ("code model %s does not support PIC mode", "32");
3381 if (TARGET_64BIT)
3382 error ("code model %qs not supported in the %s bit mode",
3383 "32", "64");
3384 break;
3385
3386 case CM_KERNEL:
3387 if (flag_pic)
3388 {
3389 error ("code model %s does not support PIC mode", "kernel");
3390 ix86_cmodel = CM_32;
3391 }
3392 if (!TARGET_64BIT)
3393 error ("code model %qs not supported in the %s bit mode",
3394 "kernel", "32");
3395 break;
3396
3397 default:
3398 gcc_unreachable ();
3399 }
3400 }
3401 else
3402 {
3403 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3404 use of rip-relative addressing. This eliminates fixups that
3405 would otherwise be needed if this object is to be placed in a
3406 DLL, and is essentially just as efficient as direct addressing. */
3407 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3408 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3409 else if (TARGET_64BIT)
3410 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3411 else
3412 ix86_cmodel = CM_32;
3413 }
3414 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3415 {
3416 error ("-masm=intel not supported in this configuration");
3417 ix86_asm_dialect = ASM_ATT;
3418 }
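/* Sorry if the requested 64-bit or 32-bit mode was not compiled into
   this compiler.  */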
3419 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3420 sorry ("%i-bit mode not compiled in",
3421 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3422
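/* Find the -march= entry in the processor alias table and turn on every
   ISA flag it implies, unless the user already set that flag explicitly.  */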
3423 for (i = 0; i < pta_size; i++)
3424 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3425 {
3426 ix86_schedule = processor_alias_table[i].schedule;
3427 ix86_arch = processor_alias_table[i].processor;
3428 /* Default cpu tuning to the architecture. */
3429 ix86_tune = ix86_arch;
3430
3431 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3432 error ("CPU you selected does not support x86-64 "
3433 "instruction set");
3434
3435 if (processor_alias_table[i].flags & PTA_MMX
3436 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3437 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3438 if (processor_alias_table[i].flags & PTA_3DNOW
3439 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3440 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3441 if (processor_alias_table[i].flags & PTA_3DNOW_A
3442 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3443 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3444 if (processor_alias_table[i].flags & PTA_SSE
3445 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3446 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3447 if (processor_alias_table[i].flags & PTA_SSE2
3448 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3449 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3450 if (processor_alias_table[i].flags & PTA_SSE3
3451 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3452 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3453 if (processor_alias_table[i].flags & PTA_SSSE3
3454 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3455 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3456 if (processor_alias_table[i].flags & PTA_SSE4_1
3457 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3458 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3459 if (processor_alias_table[i].flags & PTA_SSE4_2
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3461 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3462 if (processor_alias_table[i].flags & PTA_AVX
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3464 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3465 if (processor_alias_table[i].flags & PTA_AVX2
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3467 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3468 if (processor_alias_table[i].flags & PTA_FMA
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3470 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3471 if (processor_alias_table[i].flags & PTA_SSE4A
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3473 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3474 if (processor_alias_table[i].flags & PTA_FMA4
3475 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3476 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3477 if (processor_alias_table[i].flags & PTA_XOP
3478 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3479 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3480 if (processor_alias_table[i].flags & PTA_LWP
3481 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3482 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3483 if (processor_alias_table[i].flags & PTA_ABM
3484 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3485 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3486 if (processor_alias_table[i].flags & PTA_BMI
3487 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3488 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3489 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3490 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3491 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3492 if (processor_alias_table[i].flags & PTA_TBM
3493 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3494 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3495 if (processor_alias_table[i].flags & PTA_BMI2
3496 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3497 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3498 if (processor_alias_table[i].flags & PTA_CX16
3499 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3500 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3501 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3502 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3503 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3504 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3505 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3506 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3507 if (processor_alias_table[i].flags & PTA_MOVBE
3508 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3509 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3510 if (processor_alias_table[i].flags & PTA_AES
3511 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3512 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3513 if (processor_alias_table[i].flags & PTA_PCLMUL
3514 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3515 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3516 if (processor_alias_table[i].flags & PTA_FSGSBASE
3517 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3518 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3519 if (processor_alias_table[i].flags & PTA_RDRND
3520 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3521 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3522 if (processor_alias_table[i].flags & PTA_F16C
3523 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3524 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3525 if (processor_alias_table[i].flags & PTA_RTM
3526 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3527 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3528 if (processor_alias_table[i].flags & PTA_HLE
3529 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3530 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3531 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3532 x86_prefetch_sse = true;
3533
3534 break;
3535 }
3536
3537 if (!strcmp (ix86_arch_string, "generic"))
3538 error ("generic CPU can be used only for %stune=%s %s",
3539 prefix, suffix, sw);
3540 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3541 error ("bad value (%s) for %sarch=%s %s",
3542 ix86_arch_string, prefix, suffix, sw);
3543
3544 ix86_arch_mask = 1u << ix86_arch;
3545 for (i = 0; i < X86_ARCH_LAST; ++i)
3546 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3547
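/* Likewise, find the -mtune= entry to select the scheduling model and
   the processor to tune for.  */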
3548 for (i = 0; i < pta_size; i++)
3549 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3550 {
3551 ix86_schedule = processor_alias_table[i].schedule;
3552 ix86_tune = processor_alias_table[i].processor;
3553 if (TARGET_64BIT)
3554 {
3555 if (!(processor_alias_table[i].flags & PTA_64BIT))
3556 {
3557 if (ix86_tune_defaulted)
3558 {
3559 ix86_tune_string = "x86-64";
3560 for (i = 0; i < pta_size; i++)
3561 if (! strcmp (ix86_tune_string,
3562 processor_alias_table[i].name))
3563 break;
3564 ix86_schedule = processor_alias_table[i].schedule;
3565 ix86_tune = processor_alias_table[i].processor;
3566 }
3567 else
3568 error ("CPU you selected does not support x86-64 "
3569 "instruction set");
3570 }
3571 }
3572 else
3573 {
3574 /* Adjust tuning when compiling for 32-bit ABI. */
3575 switch (ix86_tune)
3576 {
3577 case PROCESSOR_GENERIC64:
3578 ix86_tune = PROCESSOR_GENERIC32;
3579 ix86_schedule = CPU_PENTIUMPRO;
3580 break;
3581
3582 case PROCESSOR_CORE2_64:
3583 ix86_tune = PROCESSOR_CORE2_32;
3584 break;
3585
3586 case PROCESSOR_COREI7_64:
3587 ix86_tune = PROCESSOR_COREI7_32;
3588 break;
3589
3590 default:
3591 break;
3592 }
3593 }
3594 /* Intel CPUs have always interpreted SSE prefetch instructions as
3595 NOPs; so, we can enable SSE prefetch instructions even when
3596 -mtune (rather than -march) points us to a processor that has them.
3597 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3598 higher processors. */
3599 if (TARGET_CMOV
3600 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3601 x86_prefetch_sse = true;
3602 break;
3603 }
3604
3605 if (ix86_tune_specified && i == pta_size)
3606 error ("bad value (%s) for %stune=%s %s",
3607 ix86_tune_string, prefix, suffix, sw);
3608
3609 ix86_tune_mask = 1u << ix86_tune;
3610 for (i = 0; i < X86_TUNE_LAST; ++i)
3611 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3612
3613 #ifndef USE_IX86_FRAME_POINTER
3614 #define USE_IX86_FRAME_POINTER 0
3615 #endif
3616
3617 #ifndef USE_X86_64_FRAME_POINTER
3618 #define USE_X86_64_FRAME_POINTER 0
3619 #endif
3620
3621 /* Set the default values for switches whose default depends on TARGET_64BIT
3622 in case they weren't overridden by command-line options. */
3623 if (TARGET_64BIT)
3624 {
3625 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3626 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3627 if (flag_asynchronous_unwind_tables == 2)
3628 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3629 if (flag_pcc_struct_return == 2)
3630 flag_pcc_struct_return = 0;
3631 }
3632 else
3633 {
3634 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3635 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3636 if (flag_asynchronous_unwind_tables == 2)
3637 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3638 if (flag_pcc_struct_return == 2)
3639 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3640 }
3641
3642 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3643 if (optimize_size)
3644 ix86_cost = &ix86_size_cost;
3645 else
3646 ix86_cost = ix86_tune_cost;
3647
3648 /* Arrange to set up i386_stack_locals for all functions. */
3649 init_machine_status = ix86_init_machine_status;
3650
3651 /* Validate -mregparm= value. */
3652 if (global_options_set.x_ix86_regparm)
3653 {
3654 if (TARGET_64BIT)
3655 warning (0, "-mregparm is ignored in 64-bit mode");
3656 if (ix86_regparm > REGPARM_MAX)
3657 {
3658 error ("-mregparm=%d is not between 0 and %d",
3659 ix86_regparm, REGPARM_MAX);
3660 ix86_regparm = 0;
3661 }
3662 }
3663 if (TARGET_64BIT)
3664 ix86_regparm = REGPARM_MAX;
3665
3666 /* Default align_* from the processor table. */
3667 if (align_loops == 0)
3668 {
3669 align_loops = processor_target_table[ix86_tune].align_loop;
3670 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3671 }
3672 if (align_jumps == 0)
3673 {
3674 align_jumps = processor_target_table[ix86_tune].align_jump;
3675 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3676 }
3677 if (align_functions == 0)
3678 {
3679 align_functions = processor_target_table[ix86_tune].align_func;
3680 }
3681
3682 /* Provide default for -mbranch-cost= value. */
3683 if (!global_options_set.x_ix86_branch_cost)
3684 ix86_branch_cost = ix86_cost->branch_cost;
3685
3686 if (TARGET_64BIT)
3687 {
3688 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3689
3690 /* Enable the SSE and MMX builtins by default. Do allow the user to
3691 explicitly disable any of these. In particular, disabling SSE and
3692 MMX for kernel code is extremely useful. */
3693 if (!ix86_arch_specified)
3694 ix86_isa_flags
3695 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3696 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3697
3698 if (TARGET_RTD)
3699 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3700 }
3701 else
3702 {
3703 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3704
3705 if (!ix86_arch_specified)
3706 ix86_isa_flags
3707 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3708
3709 /* The i386 ABI does not specify a red zone. It still makes sense to use
3710 one when the programmer takes care to keep the stack from being destroyed. */
3711 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3712 target_flags |= MASK_NO_RED_ZONE;
3713 }
3714
3715 /* Keep nonleaf frame pointers. */
3716 if (flag_omit_frame_pointer)
3717 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3718 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3719 flag_omit_frame_pointer = 1;
3720
3721 /* If we're doing fast math, we don't care about comparison order
3722 wrt NaNs. This lets us use a shorter comparison sequence. */
3723 if (flag_finite_math_only)
3724 target_flags &= ~MASK_IEEE_FP;
3725
3726 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3727 since the insns won't need emulation. */
3728 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3729 target_flags &= ~MASK_NO_FANCY_MATH_387;
3730
3731 /* Likewise, if the target doesn't have a 387, or we've specified
3732 software floating point, don't use 387 inline intrinsics. */
3733 if (!TARGET_80387)
3734 target_flags |= MASK_NO_FANCY_MATH_387;
3735
3736 /* Turn on MMX builtins for -msse. */
3737 if (TARGET_SSE)
3738 {
3739 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3740 x86_prefetch_sse = true;
3741 }
3742
3743 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3744 if (TARGET_SSE4_2 || TARGET_ABM)
3745 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3746
3747 /* Turn on lzcnt instruction for -mabm. */
3748 if (TARGET_ABM)
3749 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3750
3751 /* Validate -mpreferred-stack-boundary= value or default it to
3752 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3753 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3754 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3755 {
3756 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3757 int max = (TARGET_SEH ? 4 : 12);
3758
3759 if (ix86_preferred_stack_boundary_arg < min
3760 || ix86_preferred_stack_boundary_arg > max)
3761 {
3762 if (min == max)
3763 error ("-mpreferred-stack-boundary is not supported "
3764 "for this target");
3765 else
3766 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3767 ix86_preferred_stack_boundary_arg, min, max);
3768 }
3769 else
3770 ix86_preferred_stack_boundary
3771 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3772 }
3773
3774 /* Set the default value for -mstackrealign. */
3775 if (ix86_force_align_arg_pointer == -1)
3776 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3777
3778 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3779
3780 /* Validate -mincoming-stack-boundary= value or default it to
3781 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3782 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3783 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3784 {
3785 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3786 || ix86_incoming_stack_boundary_arg > 12)
3787 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3788 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3789 else
3790 {
3791 ix86_user_incoming_stack_boundary
3792 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3793 ix86_incoming_stack_boundary
3794 = ix86_user_incoming_stack_boundary;
3795 }
3796 }
3797
3798 /* Accept -msseregparm only if at least SSE support is enabled. */
3799 if (TARGET_SSEREGPARM
3800 && ! TARGET_SSE)
3801 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3802
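/* Check an explicit -mfpmath= request against the enabled instruction
   sets, falling back to the unit that is actually available.  */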
3803 if (global_options_set.x_ix86_fpmath)
3804 {
3805 if (ix86_fpmath & FPMATH_SSE)
3806 {
3807 if (!TARGET_SSE)
3808 {
3809 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3810 ix86_fpmath = FPMATH_387;
3811 }
3812 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3813 {
3814 warning (0, "387 instruction set disabled, using SSE arithmetics");
3815 ix86_fpmath = FPMATH_SSE;
3816 }
3817 }
3818 }
3819 else
3820 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3821
3822 /* If the i387 is disabled, then do not return values in it. */
3823 if (!TARGET_80387)
3824 target_flags &= ~MASK_FLOAT_RETURNS;
3825
3826 /* Use an external vectorized math library when vectorizing calls to intrinsics. */
3827 if (global_options_set.x_ix86_veclibabi_type)
3828 switch (ix86_veclibabi_type)
3829 {
3830 case ix86_veclibabi_type_svml:
3831 ix86_veclib_handler = ix86_veclibabi_svml;
3832 break;
3833
3834 case ix86_veclibabi_type_acml:
3835 ix86_veclib_handler = ix86_veclibabi_acml;
3836 break;
3837
3838 default:
3839 gcc_unreachable ();
3840 }
3841
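/* Enable -maccumulate-outgoing-args by default where it is likely
   beneficial, unless the user decided explicitly or we are optimizing
   for size.  */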
3842 if ((!USE_IX86_FRAME_POINTER
3843 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3844 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3845 && !optimize_size)
3846 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3847
3848 /* ??? Unwind info is not correct around the CFG unless either a frame
3849 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3850 unwind info generation to be aware of the CFG and propagating states
3851 around edges. */
3852 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3853 || flag_exceptions || flag_non_call_exceptions)
3854 && flag_omit_frame_pointer
3855 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3856 {
3857 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3858 warning (0, "unwind tables currently require either a frame pointer "
3859 "or %saccumulate-outgoing-args%s for correctness",
3860 prefix, suffix);
3861 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3862 }
3863
3864 /* If stack probes are required, the space used for large function
3865 arguments on the stack must also be probed, so enable
3866 -maccumulate-outgoing-args so this happens in the prologue. */
3867 if (TARGET_STACK_PROBE
3868 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3869 {
3870 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3871 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3872 "for correctness", prefix, suffix);
3873 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3874 }
3875
3876 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3877 {
3878 char *p;
3879 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3880 p = strchr (internal_label_prefix, 'X');
3881 internal_label_prefix_len = p - internal_label_prefix;
3882 *p = '\0';
3883 }
3884
3885 /* When no scheduling description is available, disable the scheduler
3886 passes so they won't slow down compilation and make x87 code slower. */
3887 if (!TARGET_SCHEDULE)
3888 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3889
3890 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3891 ix86_tune_cost->simultaneous_prefetches,
3892 global_options.x_param_values,
3893 global_options_set.x_param_values);
3894 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3895 ix86_tune_cost->prefetch_block,
3896 global_options.x_param_values,
3897 global_options_set.x_param_values);
3898 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3899 ix86_tune_cost->l1_cache_size,
3900 global_options.x_param_values,
3901 global_options_set.x_param_values);
3902 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3903 ix86_tune_cost->l2_cache_size,
3904 global_options.x_param_values,
3905 global_options_set.x_param_values);
3906
3907 /* Enable software prefetching at -O3 for CPUs where prefetching is beneficial. */
3908 if (flag_prefetch_loop_arrays < 0
3909 && HAVE_prefetch
3910 && optimize >= 3
3911 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3912 flag_prefetch_loop_arrays = 1;
3913
3914 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3915 can be optimized to ap = __builtin_next_arg (0). */
3916 if (!TARGET_64BIT && !flag_split_stack)
3917 targetm.expand_builtin_va_start = NULL;
3918
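/* Choose the leave, monitor, and TLS pattern generator functions according
   to whether we generate 64-bit code and which pointer mode is in use.  */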
3919 if (TARGET_64BIT)
3920 {
3921 ix86_gen_leave = gen_leave_rex64;
3922 if (Pmode == DImode)
3923 {
3924 ix86_gen_monitor = gen_sse3_monitor64_di;
3925 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3926 ix86_gen_tls_local_dynamic_base_64
3927 = gen_tls_local_dynamic_base_64_di;
3928 }
3929 else
3930 {
3931 ix86_gen_monitor = gen_sse3_monitor64_si;
3932 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3933 ix86_gen_tls_local_dynamic_base_64
3934 = gen_tls_local_dynamic_base_64_si;
3935 }
3936 }
3937 else
3938 {
3939 ix86_gen_leave = gen_leave;
3940 ix86_gen_monitor = gen_sse3_monitor;
3941 }
3942
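/* Choose the arithmetic and stack-probe pattern generators according to
   the pointer mode.  */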
3943 if (Pmode == DImode)
3944 {
3945 ix86_gen_add3 = gen_adddi3;
3946 ix86_gen_sub3 = gen_subdi3;
3947 ix86_gen_sub3_carry = gen_subdi3_carry;
3948 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3949 ix86_gen_andsp = gen_anddi3;
3950 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3951 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3952 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3953 }
3954 else
3955 {
3956 ix86_gen_add3 = gen_addsi3;
3957 ix86_gen_sub3 = gen_subsi3;
3958 ix86_gen_sub3_carry = gen_subsi3_carry;
3959 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3960 ix86_gen_andsp = gen_andsi3;
3961 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3962 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3963 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3964 }
3965
3966 #ifdef USE_IX86_CLD
3967 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3968 if (!TARGET_64BIT)
3969 target_flags |= MASK_CLD & ~target_flags_explicit;
3970 #endif
3971
3972 if (!TARGET_64BIT && flag_pic)
3973 {
3974 if (flag_fentry > 0)
3975 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3976 "with -fpic");
3977 flag_fentry = 0;
3978 }
3979 else if (TARGET_SEH)
3980 {
3981 if (flag_fentry == 0)
3982 sorry ("-mno-fentry isn%'t compatible with SEH");
3983 flag_fentry = 1;
3984 }
3985 else if (flag_fentry < 0)
3986 {
3987 #if defined(PROFILE_BEFORE_PROLOGUE)
3988 flag_fentry = 1;
3989 #else
3990 flag_fentry = 0;
3991 #endif
3992 }
3993
3994 if (TARGET_AVX)
3995 {
3996 /* When not optimizing for size, enable the vzeroupper optimization for
3997 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3998 AVX unaligned loads/stores. */
3999 if (!optimize_size)
4000 {
4001 if (flag_expensive_optimizations
4002 && !(target_flags_explicit & MASK_VZEROUPPER))
4003 target_flags |= MASK_VZEROUPPER;
4004 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4005 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4006 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4007 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4008 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4009 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4010 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
4011 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
4012 target_flags |= MASK_PREFER_AVX128;
4013 }
4014 }
4015 else
4016 {
4017 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
4018 target_flags &= ~MASK_VZEROUPPER;
4019 }
4020
4021 if (ix86_recip_name)
4022 {
4023 char *p = ASTRDUP (ix86_recip_name);
4024 char *q;
4025 unsigned int mask, i;
4026 bool invert;
4027
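/* Walk the comma-separated -mrecip= argument; a leading '!' on an option
   name disables the corresponding RECIP_MASK_* bits.  */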
4028 while ((q = strtok (p, ",")) != NULL)
4029 {
4030 p = NULL;
4031 if (*q == '!')
4032 {
4033 invert = true;
4034 q++;
4035 }
4036 else
4037 invert = false;
4038
4039 if (!strcmp (q, "default"))
4040 mask = RECIP_MASK_ALL;
4041 else
4042 {
4043 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4044 if (!strcmp (q, recip_options[i].string))
4045 {
4046 mask = recip_options[i].mask;
4047 break;
4048 }
4049
4050 if (i == ARRAY_SIZE (recip_options))
4051 {
4052 error ("unknown option for -mrecip=%s", q);
4053 invert = false;
4054 mask = RECIP_MASK_NONE;
4055 }
4056 }
4057
4058 recip_mask_explicit |= mask;
4059 if (invert)
4060 recip_mask &= ~mask;
4061 else
4062 recip_mask |= mask;
4063 }
4064 }
4065
4066 if (TARGET_RECIP)
4067 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4068 else if (target_flags_explicit & MASK_RECIP)
4069 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4070
4071 /* Save the initial options in case the user later uses function-specific
4072 options. */
4073 if (main_args_p)
4074 target_option_default_node = target_option_current_node
4075 = build_target_option_node ();
4076 }
4077
4078 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
4079
4080 static bool
4081 function_pass_avx256_p (const_rtx val)
4082 {
4083 if (!val)
4084 return false;
4085
4086 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
4087 return true;
4088
4089 if (GET_CODE (val) == PARALLEL)
4090 {
4091 int i;
4092 rtx r;
4093
4094 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4095 {
4096 r = XVECEXP (val, 0, i);
4097 if (GET_CODE (r) == EXPR_LIST
4098 && XEXP (r, 0)
4099 && REG_P (XEXP (r, 0))
4100 && (GET_MODE (XEXP (r, 0)) == OImode
4101 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4102 return true;
4103 }
4104 }
4105
4106 return false;
4107 }
4108
4109 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4110
4111 static void
4112 ix86_option_override (void)
4113 {
4114 ix86_option_override_internal (true);
4115 }
4116
4117 /* Update register usage after having seen the compiler flags. */
4118
4119 static void
4120 ix86_conditional_register_usage (void)
4121 {
4122 int i;
4123 unsigned int j;
4124
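/* Entries greater than 1 in the fixed/call-used register tables mean the
   register is conditionally fixed or call-used; resolve them now that
   TARGET_64BIT is known (2 applies to 32-bit mode, 3 to 64-bit mode).  */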
4125 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4126 {
4127 if (fixed_regs[i] > 1)
4128 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
4129 if (call_used_regs[i] > 1)
4130 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
4131 }
4132
4133 /* The PIC register, if it exists, is fixed. */
4134 j = PIC_OFFSET_TABLE_REGNUM;
4135 if (j != INVALID_REGNUM)
4136 fixed_regs[j] = call_used_regs[j] = 1;
4137
4138 /* The 64-bit MS_ABI changes the set of call-used registers. */
4139 if (TARGET_64BIT_MS_ABI)
4140 {
4141 call_used_regs[SI_REG] = 0;
4142 call_used_regs[DI_REG] = 0;
4143 call_used_regs[XMM6_REG] = 0;
4144 call_used_regs[XMM7_REG] = 0;
4145 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4146 call_used_regs[i] = 0;
4147 }
4148
4149 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
4150 other call-clobbered regs for 64-bit. */
4151 if (TARGET_64BIT)
4152 {
4153 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4154
4155 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4156 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4157 && call_used_regs[i])
4158 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4159 }
4160
4161 /* If MMX is disabled, squash the registers. */
4162 if (! TARGET_MMX)
4163 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4164 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4165 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4166
4167 /* If SSE is disabled, squash the registers. */
4168 if (! TARGET_SSE)
4169 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4170 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4171 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4172
4173 /* If the FPU is disabled, squash the registers. */
4174 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4175 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4176 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4177 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4178
4179 /* If 32-bit, squash the 64-bit registers. */
4180 if (! TARGET_64BIT)
4181 {
4182 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4183 reg_names[i] = "";
4184 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4185 reg_names[i] = "";
4186 }
4187 }
4188
4189 \f
4190 /* Save the current options */
4191
4192 static void
4193 ix86_function_specific_save (struct cl_target_option *ptr)
4194 {
4195 ptr->arch = ix86_arch;
4196 ptr->schedule = ix86_schedule;
4197 ptr->tune = ix86_tune;
4198 ptr->branch_cost = ix86_branch_cost;
4199 ptr->tune_defaulted = ix86_tune_defaulted;
4200 ptr->arch_specified = ix86_arch_specified;
4201 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4202 ptr->ix86_target_flags_explicit = target_flags_explicit;
4203 ptr->x_recip_mask_explicit = recip_mask_explicit;
4204
4205 /* The fields are char but the variables are not; make sure the
4206 values fit in the fields. */
4207 gcc_assert (ptr->arch == ix86_arch);
4208 gcc_assert (ptr->schedule == ix86_schedule);
4209 gcc_assert (ptr->tune == ix86_tune);
4210 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4211 }
4212
4213 /* Restore the current options */
4214
4215 static void
4216 ix86_function_specific_restore (struct cl_target_option *ptr)
4217 {
4218 enum processor_type old_tune = ix86_tune;
4219 enum processor_type old_arch = ix86_arch;
4220 unsigned int ix86_arch_mask, ix86_tune_mask;
4221 int i;
4222
4223 ix86_arch = (enum processor_type) ptr->arch;
4224 ix86_schedule = (enum attr_cpu) ptr->schedule;
4225 ix86_tune = (enum processor_type) ptr->tune;
4226 ix86_branch_cost = ptr->branch_cost;
4227 ix86_tune_defaulted = ptr->tune_defaulted;
4228 ix86_arch_specified = ptr->arch_specified;
4229 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4230 target_flags_explicit = ptr->ix86_target_flags_explicit;
4231 recip_mask_explicit = ptr->x_recip_mask_explicit;
4232
4233 /* Recreate the arch feature tests if the arch changed */
4234 if (old_arch != ix86_arch)
4235 {
4236 ix86_arch_mask = 1u << ix86_arch;
4237 for (i = 0; i < X86_ARCH_LAST; ++i)
4238 ix86_arch_features[i]
4239 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4240 }
4241
4242 /* Recreate the tune optimization tests */
4243 if (old_tune != ix86_tune)
4244 {
4245 ix86_tune_mask = 1u << ix86_tune;
4246 for (i = 0; i < X86_TUNE_LAST; ++i)
4247 ix86_tune_features[i]
4248 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4249 }
4250 }
4251
4252 /* Print the current options */
4253
4254 static void
4255 ix86_function_specific_print (FILE *file, int indent,
4256 struct cl_target_option *ptr)
4257 {
4258 char *target_string
4259 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4260 NULL, NULL, ptr->x_ix86_fpmath, false);
4261
4262 fprintf (file, "%*sarch = %d (%s)\n",
4263 indent, "",
4264 ptr->arch,
4265 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4266 ? cpu_names[ptr->arch]
4267 : "<unknown>"));
4268
4269 fprintf (file, "%*stune = %d (%s)\n",
4270 indent, "",
4271 ptr->tune,
4272 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4273 ? cpu_names[ptr->tune]
4274 : "<unknown>"));
4275
4276 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4277
4278 if (target_string)
4279 {
4280 fprintf (file, "%*s%s\n", indent, "", target_string);
4281 free (target_string);
4282 }
4283 }
4284
4285 \f
4286 /* Inner function to process the attribute((target(...))): take an argument and
4287 set the current options from it. If we have a list, recursively go
4288 over the list. */
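/* For example, __attribute__((target ("sse4.2,no-avx,arch=core2"))) turns on
   SSE4.2, turns off AVX, and switches -march to core2 for a single function.  */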
4289
4290 static bool
4291 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4292 struct gcc_options *enum_opts_set)
4293 {
4294 char *next_optstr;
4295 bool ret = true;
4296
4297 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4298 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4299 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4300 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4301 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4302
4303 enum ix86_opt_type
4304 {
4305 ix86_opt_unknown,
4306 ix86_opt_yes,
4307 ix86_opt_no,
4308 ix86_opt_str,
4309 ix86_opt_enum,
4310 ix86_opt_isa
4311 };
4312
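/* Table describing each recognized attribute((target())) string: its
   length, how it is handled, and the option or mask it maps to.  */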
4313 static const struct
4314 {
4315 const char *string;
4316 size_t len;
4317 enum ix86_opt_type type;
4318 int opt;
4319 int mask;
4320 } attrs[] = {
4321 /* isa options */
4322 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4323 IX86_ATTR_ISA ("abm", OPT_mabm),
4324 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4325 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4326 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4327 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4328 IX86_ATTR_ISA ("aes", OPT_maes),
4329 IX86_ATTR_ISA ("avx", OPT_mavx),
4330 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4331 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4332 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4333 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4334 IX86_ATTR_ISA ("sse", OPT_msse),
4335 IX86_ATTR_ISA ("sse2", OPT_msse2),
4336 IX86_ATTR_ISA ("sse3", OPT_msse3),
4337 IX86_ATTR_ISA ("sse4", OPT_msse4),
4338 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4339 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4340 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4341 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4342 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4343 IX86_ATTR_ISA ("fma", OPT_mfma),
4344 IX86_ATTR_ISA ("xop", OPT_mxop),
4345 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4346 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4347 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4348 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4349 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4350 IX86_ATTR_ISA ("hle", OPT_mhle),
4351
4352 /* enum options */
4353 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4354
4355 /* string options */
4356 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4357 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4358
4359 /* flag options */
4360 IX86_ATTR_YES ("cld",
4361 OPT_mcld,
4362 MASK_CLD),
4363
4364 IX86_ATTR_NO ("fancy-math-387",
4365 OPT_mfancy_math_387,
4366 MASK_NO_FANCY_MATH_387),
4367
4368 IX86_ATTR_YES ("ieee-fp",
4369 OPT_mieee_fp,
4370 MASK_IEEE_FP),
4371
4372 IX86_ATTR_YES ("inline-all-stringops",
4373 OPT_minline_all_stringops,
4374 MASK_INLINE_ALL_STRINGOPS),
4375
4376 IX86_ATTR_YES ("inline-stringops-dynamically",
4377 OPT_minline_stringops_dynamically,
4378 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4379
4380 IX86_ATTR_NO ("align-stringops",
4381 OPT_mno_align_stringops,
4382 MASK_NO_ALIGN_STRINGOPS),
4383
4384 IX86_ATTR_YES ("recip",
4385 OPT_mrecip,
4386 MASK_RECIP),
4387
4388 };
4389
4390 /* If this is a list, recurse to get the options. */
4391 if (TREE_CODE (args) == TREE_LIST)
4392 {
4393 bool ret = true;
4394
4395 for (; args; args = TREE_CHAIN (args))
4396 if (TREE_VALUE (args)
4397 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4398 p_strings, enum_opts_set))
4399 ret = false;
4400
4401 return ret;
4402 }
4403
4404 else if (TREE_CODE (args) != STRING_CST)
4405 gcc_unreachable ();
4406
4407 /* Handle multiple arguments separated by commas. */
4408 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4409
4410 while (next_optstr && *next_optstr != '\0')
4411 {
4412 char *p = next_optstr;
4413 char *orig_p = p;
4414 char *comma = strchr (next_optstr, ',');
4415 const char *opt_string;
4416 size_t len, opt_len;
4417 int opt;
4418 bool opt_set_p;
4419 char ch;
4420 unsigned i;
4421 enum ix86_opt_type type = ix86_opt_unknown;
4422 int mask = 0;
4423
4424 if (comma)
4425 {
4426 *comma = '\0';
4427 len = comma - next_optstr;
4428 next_optstr = comma + 1;
4429 }
4430 else
4431 {
4432 len = strlen (p);
4433 next_optstr = NULL;
4434 }
4435
4436 /* Recognize no-xxx. */
4437 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4438 {
4439 opt_set_p = false;
4440 p += 3;
4441 len -= 3;
4442 }
4443 else
4444 opt_set_p = true;
4445
4446 /* Find the option. */
4447 ch = *p;
4448 opt = N_OPTS;
4449 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4450 {
4451 type = attrs[i].type;
4452 opt_len = attrs[i].len;
4453 if (ch == attrs[i].string[0]
4454 && ((type != ix86_opt_str && type != ix86_opt_enum)
4455 ? len == opt_len
4456 : len > opt_len)
4457 && memcmp (p, attrs[i].string, opt_len) == 0)
4458 {
4459 opt = attrs[i].opt;
4460 mask = attrs[i].mask;
4461 opt_string = attrs[i].string;
4462 break;
4463 }
4464 }
4465
4466 /* Process the option. */
4467 if (opt == N_OPTS)
4468 {
4469 error ("attribute(target(\"%s\")) is unknown", orig_p);
4470 ret = false;
4471 }
4472
4473 else if (type == ix86_opt_isa)
4474 {
4475 struct cl_decoded_option decoded;
4476
4477 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4478 ix86_handle_option (&global_options, &global_options_set,
4479 &decoded, input_location);
4480 }
4481
4482 else if (type == ix86_opt_yes || type == ix86_opt_no)
4483 {
4484 if (type == ix86_opt_no)
4485 opt_set_p = !opt_set_p;
4486
4487 if (opt_set_p)
4488 target_flags |= mask;
4489 else
4490 target_flags &= ~mask;
4491 }
4492
4493 else if (type == ix86_opt_str)
4494 {
4495 if (p_strings[opt])
4496 {
4497 error ("option(\"%s\") was already specified", opt_string);
4498 ret = false;
4499 }
4500 else
4501 p_strings[opt] = xstrdup (p + opt_len);
4502 }
4503
4504 else if (type == ix86_opt_enum)
4505 {
4506 bool arg_ok;
4507 int value;
4508
4509 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4510 if (arg_ok)
4511 set_option (&global_options, enum_opts_set, opt, value,
4512 p + opt_len, DK_UNSPECIFIED, input_location,
4513 global_dc);
4514 else
4515 {
4516 error ("attribute(target(\"%s\")) is unknown", orig_p);
4517 ret = false;
4518 }
4519 }
4520
4521 else
4522 gcc_unreachable ();
4523 }
4524
4525 return ret;
4526 }
4527
4528 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4529
4530 tree
4531 ix86_valid_target_attribute_tree (tree args)
4532 {
4533 const char *orig_arch_string = ix86_arch_string;
4534 const char *orig_tune_string = ix86_tune_string;
4535 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4536 int orig_tune_defaulted = ix86_tune_defaulted;
4537 int orig_arch_specified = ix86_arch_specified;
4538 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4539 tree t = NULL_TREE;
4540 int i;
4541 struct cl_target_option *def
4542 = TREE_TARGET_OPTION (target_option_default_node);
4543 struct gcc_options enum_opts_set;
4544
4545 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4546
4547 /* Process each of the options on the chain. */
4548 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4549 &enum_opts_set))
4550 return NULL_TREE;
4551
4552 /* If the changed options are different from the default, rerun
4553 ix86_option_override_internal, and then save the options away.
4554 The string options are attribute options, and will be undone
4555 when we copy the save structure. */
4556 if (ix86_isa_flags != def->x_ix86_isa_flags
4557 || target_flags != def->x_target_flags
4558 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4559 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4560 || enum_opts_set.x_ix86_fpmath)
4561 {
4562 /* If we are using the default tune= or arch=, undo the string assigned,
4563 and use the default. */
4564 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4565 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4566 else if (!orig_arch_specified)
4567 ix86_arch_string = NULL;
4568
4569 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4570 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4571 else if (orig_tune_defaulted)
4572 ix86_tune_string = NULL;
4573
4574 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4575 if (enum_opts_set.x_ix86_fpmath)
4576 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4577 else if (!TARGET_64BIT && TARGET_SSE)
4578 {
4579 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4580 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4581 }
4582
4583 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4584 ix86_option_override_internal (false);
4585
4586 /* Add any builtin functions with the new isa if any. */
4587 ix86_add_new_builtins (ix86_isa_flags);
4588
4589 /* Save the current options unless we are validating options for
4590 #pragma. */
4591 t = build_target_option_node ();
4592
4593 ix86_arch_string = orig_arch_string;
4594 ix86_tune_string = orig_tune_string;
4595 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4596
4597 /* Free up memory allocated to hold the strings */
4598 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4599 free (option_strings[i]);
4600 }
4601
4602 return t;
4603 }
4604
4605 /* Hook to validate attribute((target("string"))). */
4606
4607 static bool
4608 ix86_valid_target_attribute_p (tree fndecl,
4609 tree ARG_UNUSED (name),
4610 tree args,
4611 int ARG_UNUSED (flags))
4612 {
4613 struct cl_target_option cur_target;
4614 bool ret = true;
4615 tree old_optimize = build_optimization_node ();
4616 tree new_target, new_optimize;
4617 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4618
4619 /* If the function changed the optimization levels as well as setting target
4620 options, start with the optimizations specified. */
4621 if (func_optimize && func_optimize != old_optimize)
4622 cl_optimization_restore (&global_options,
4623 TREE_OPTIMIZATION (func_optimize));
4624
4625 /* The target attributes may also change some optimization flags, so update
4626 the optimization options if necessary. */
4627 cl_target_option_save (&cur_target, &global_options);
4628 new_target = ix86_valid_target_attribute_tree (args);
4629 new_optimize = build_optimization_node ();
4630
4631 if (!new_target)
4632 ret = false;
4633
4634 else if (fndecl)
4635 {
4636 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4637
4638 if (old_optimize != new_optimize)
4639 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4640 }
4641
4642 cl_target_option_restore (&global_options, &cur_target);
4643
4644 if (old_optimize != new_optimize)
4645 cl_optimization_restore (&global_options,
4646 TREE_OPTIMIZATION (old_optimize));
4647
4648 return ret;
4649 }
4650
4651 \f
4652 /* Hook to determine if one function can safely inline another. */
4653
4654 static bool
4655 ix86_can_inline_p (tree caller, tree callee)
4656 {
4657 bool ret = false;
4658 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4659 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4660
4661 /* If callee has no option attributes, then it is ok to inline. */
4662 if (!callee_tree)
4663 ret = true;
4664
4665 /* If caller has no option attributes, but callee does then it is not ok to
4666 inline. */
4667 else if (!caller_tree)
4668 ret = false;
4669
4670 else
4671 {
4672 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4673 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4674
4675 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4676 function can inline an SSE2 function, but an SSE2 function can't inline
4677 an SSE4 function. */
4678 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4679 != callee_opts->x_ix86_isa_flags)
4680 ret = false;
4681
4682 /* See if we have the same non-isa options. */
4683 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4684 ret = false;
4685
4686 /* See if arch, tune, etc. are the same. */
4687 else if (caller_opts->arch != callee_opts->arch)
4688 ret = false;
4689
4690 else if (caller_opts->tune != callee_opts->tune)
4691 ret = false;
4692
4693 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4694 ret = false;
4695
4696 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4697 ret = false;
4698
4699 else
4700 ret = true;
4701 }
4702
4703 return ret;
4704 }
4705
4706 \f
4707 /* Remember the last target of ix86_set_current_function. */
4708 static GTY(()) tree ix86_previous_fndecl;
4709
4710 /* Establish appropriate back-end context for processing the function
4711 FNDECL. The argument might be NULL to indicate processing at top
4712 level, outside of any function scope. */
4713 static void
4714 ix86_set_current_function (tree fndecl)
4715 {
4716 /* Only change the context if the function changes. This hook is called
4717 several times in the course of compiling a function, and we don't want to
4718 slow things down too much or call target_reinit when it isn't safe. */
4719 if (fndecl && fndecl != ix86_previous_fndecl)
4720 {
4721 tree old_tree = (ix86_previous_fndecl
4722 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4723 : NULL_TREE);
4724
4725 tree new_tree = (fndecl
4726 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4727 : NULL_TREE);
4728
4729 ix86_previous_fndecl = fndecl;
4730 if (old_tree == new_tree)
4731 ;
4732
4733 else if (new_tree)
4734 {
4735 cl_target_option_restore (&global_options,
4736 TREE_TARGET_OPTION (new_tree));
4737 target_reinit ();
4738 }
4739
4740 else if (old_tree)
4741 {
4742 struct cl_target_option *def
4743 = TREE_TARGET_OPTION (target_option_current_node);
4744
4745 cl_target_option_restore (&global_options, def);
4746 target_reinit ();
4747 }
4748 }
4749 }
4750
4751 \f
4752 /* Return true if this goes in large data/bss. */
4753
4754 static bool
4755 ix86_in_large_data_p (tree exp)
4756 {
4757 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4758 return false;
4759
4760 /* Functions are never large data. */
4761 if (TREE_CODE (exp) == FUNCTION_DECL)
4762 return false;
4763
4764 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4765 {
4766 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4767 if (strcmp (section, ".ldata") == 0
4768 || strcmp (section, ".lbss") == 0)
4769 return true;
4770 return false;
4771 }
4772 else
4773 {
4774 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4775
4776 /* If this is an incomplete type with size 0, then we can't put it
4777 in data because it might be too big when completed. */
4778 if (!size || size > ix86_section_threshold)
4779 return true;
4780 }
4781
4782 return false;
4783 }
4784
4785 /* Switch to the appropriate section for output of DECL.
4786 DECL is either a `VAR_DECL' node or a constant of some sort.
4787 RELOC indicates whether forming the initial value of DECL requires
4788 link-time relocations. */
4789
4790 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4791 ATTRIBUTE_UNUSED;
4792
4793 static section *
4794 x86_64_elf_select_section (tree decl, int reloc,
4795 unsigned HOST_WIDE_INT align)
4796 {
4797 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4798 && ix86_in_large_data_p (decl))
4799 {
4800 const char *sname = NULL;
4801 unsigned int flags = SECTION_WRITE;
4802 switch (categorize_decl_for_section (decl, reloc))
4803 {
4804 case SECCAT_DATA:
4805 sname = ".ldata";
4806 break;
4807 case SECCAT_DATA_REL:
4808 sname = ".ldata.rel";
4809 break;
4810 case SECCAT_DATA_REL_LOCAL:
4811 sname = ".ldata.rel.local";
4812 break;
4813 case SECCAT_DATA_REL_RO:
4814 sname = ".ldata.rel.ro";
4815 break;
4816 case SECCAT_DATA_REL_RO_LOCAL:
4817 sname = ".ldata.rel.ro.local";
4818 break;
4819 case SECCAT_BSS:
4820 sname = ".lbss";
4821 flags |= SECTION_BSS;
4822 break;
4823 case SECCAT_RODATA:
4824 case SECCAT_RODATA_MERGE_STR:
4825 case SECCAT_RODATA_MERGE_STR_INIT:
4826 case SECCAT_RODATA_MERGE_CONST:
4827 sname = ".lrodata";
4828 flags = 0;
4829 break;
4830 case SECCAT_SRODATA:
4831 case SECCAT_SDATA:
4832 case SECCAT_SBSS:
4833 gcc_unreachable ();
4834 case SECCAT_TEXT:
4835 case SECCAT_TDATA:
4836 case SECCAT_TBSS:
4837 	  /* We don't split these for the medium model.  Place them into
4838 	     default sections and hope for the best.  */
4839 break;
4840 }
4841 if (sname)
4842 {
4843 /* We might get called with string constants, but get_named_section
4844 doesn't like them as they are not DECLs. Also, we need to set
4845 flags in that case. */
4846 if (!DECL_P (decl))
4847 return get_section (sname, flags, NULL);
4848 return get_named_section (decl, sname, reloc);
4849 }
4850 }
4851 return default_elf_select_section (decl, reloc, align);
4852 }
4853
4854 /* Build up a unique section name, expressed as a
4855 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4856 RELOC indicates whether the initial value of EXP requires
4857 link-time relocations. */
4858
4859 static void ATTRIBUTE_UNUSED
4860 x86_64_elf_unique_section (tree decl, int reloc)
4861 {
4862 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4863 && ix86_in_large_data_p (decl))
4864 {
4865 const char *prefix = NULL;
4866 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4867 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4868
4869 switch (categorize_decl_for_section (decl, reloc))
4870 {
4871 case SECCAT_DATA:
4872 case SECCAT_DATA_REL:
4873 case SECCAT_DATA_REL_LOCAL:
4874 case SECCAT_DATA_REL_RO:
4875 case SECCAT_DATA_REL_RO_LOCAL:
4876 prefix = one_only ? ".ld" : ".ldata";
4877 break;
4878 case SECCAT_BSS:
4879 prefix = one_only ? ".lb" : ".lbss";
4880 break;
4881 case SECCAT_RODATA:
4882 case SECCAT_RODATA_MERGE_STR:
4883 case SECCAT_RODATA_MERGE_STR_INIT:
4884 case SECCAT_RODATA_MERGE_CONST:
4885 prefix = one_only ? ".lr" : ".lrodata";
4886 break;
4887 case SECCAT_SRODATA:
4888 case SECCAT_SDATA:
4889 case SECCAT_SBSS:
4890 gcc_unreachable ();
4891 case SECCAT_TEXT:
4892 case SECCAT_TDATA:
4893 case SECCAT_TBSS:
4894 	  /* We don't split these for the medium model.  Place them into
4895 	     default sections and hope for the best.  */
4896 break;
4897 }
4898 if (prefix)
4899 {
4900 const char *name, *linkonce;
4901 char *string;
4902
4903 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4904 name = targetm.strip_name_encoding (name);
4905
4906 /* If we're using one_only, then there needs to be a .gnu.linkonce
4907 prefix to the section name. */
4908 linkonce = one_only ? ".gnu.linkonce" : "";
4909
4910 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4911
4912 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4913 return;
4914 }
4915 }
4916 default_unique_section (decl, reloc);
4917 }
4918
4919 #ifdef COMMON_ASM_OP
4920 /* This says how to output assembler code to declare an
4921 uninitialized external linkage data object.
4922
4923    For medium model x86-64 we need to use the .largecomm directive for
4924    large objects.  */
4925 void
4926 x86_elf_aligned_common (FILE *file,
4927 const char *name, unsigned HOST_WIDE_INT size,
4928 int align)
4929 {
4930 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4931 && size > (unsigned int)ix86_section_threshold)
4932 fputs (".largecomm\t", file);
4933 else
4934 fputs (COMMON_ASM_OP, file);
4935 assemble_name (file, name);
4936 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4937 size, align / BITS_PER_UNIT);
4938 }
4939 #endif
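
/* For instance (a sketch, assuming -mcmodel=medium and an object well above
   the large-data threshold), the code above would emit something like

     .largecomm	big_buf,1048576,32

   instead of the usual ".comm" directive emitted for small objects.  */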
4940
4941 /* Utility function for targets to use in implementing
4942 ASM_OUTPUT_ALIGNED_BSS. */
4943
4944 void
4945 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4946 const char *name, unsigned HOST_WIDE_INT size,
4947 int align)
4948 {
4949 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4950 && size > (unsigned int)ix86_section_threshold)
4951 switch_to_section (get_named_section (decl, ".lbss", 0));
4952 else
4953 switch_to_section (bss_section);
4954 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4955 #ifdef ASM_DECLARE_OBJECT_NAME
4956 last_assemble_variable_decl = decl;
4957 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4958 #else
4959   /* The standard thing is just to output a label for the object.  */
4960 ASM_OUTPUT_LABEL (file, name);
4961 #endif /* ASM_DECLARE_OBJECT_NAME */
4962 ASM_OUTPUT_SKIP (file, size ? size : 1);
4963 }
4964 \f
4965 /* Decide whether we must probe the stack before any space allocation
4966 on this target. It's essentially TARGET_STACK_PROBE except when
4967 -fstack-check causes the stack to be already probed differently. */
4968
4969 bool
4970 ix86_target_stack_probe (void)
4971 {
4972 /* Do not probe the stack twice if static stack checking is enabled. */
4973 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4974 return false;
4975
4976 return TARGET_STACK_PROBE;
4977 }
4978 \f
4979 /* Decide whether we can make a sibling call to a function. DECL is the
4980 declaration of the function being targeted by the call and EXP is the
4981 CALL_EXPR representing the call. */
4982
4983 static bool
4984 ix86_function_ok_for_sibcall (tree decl, tree exp)
4985 {
4986 tree type, decl_or_type;
4987 rtx a, b;
4988
4989 /* If we are generating position-independent code, we cannot sibcall
4990 optimize any indirect call, or a direct call to a global function,
4991 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4992 if (!TARGET_MACHO
4993 && !TARGET_64BIT
4994 && flag_pic
4995 && (!decl || !targetm.binds_local_p (decl)))
4996 return false;
4997
4998 /* If we need to align the outgoing stack, then sibcalling would
4999 unalign the stack, which may break the called function. */
5000 if (ix86_minimum_incoming_stack_boundary (true)
5001 < PREFERRED_STACK_BOUNDARY)
5002 return false;
5003
5004 if (decl)
5005 {
5006 decl_or_type = decl;
5007 type = TREE_TYPE (decl);
5008 }
5009 else
5010 {
5011       /* We're looking at the CALL_EXPR; we need the type of the function.  */
5012 type = CALL_EXPR_FN (exp); /* pointer expression */
5013 type = TREE_TYPE (type); /* pointer type */
5014 type = TREE_TYPE (type); /* function type */
5015 decl_or_type = type;
5016 }
5017
5018   /* Check that the return value locations are the same.  For example,
5019      if we are returning floats on the 80387 register stack, we cannot
5020 make a sibcall from a function that doesn't return a float to a
5021 function that does or, conversely, from a function that does return
5022 a float to a function that doesn't; the necessary stack adjustment
5023 would not be executed. This is also the place we notice
5024 differences in the return value ABI. Note that it is ok for one
5025 of the functions to have void return type as long as the return
5026 value of the other is passed in a register. */
5027 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5028 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5029 cfun->decl, false);
5030 if (STACK_REG_P (a) || STACK_REG_P (b))
5031 {
5032 if (!rtx_equal_p (a, b))
5033 return false;
5034 }
5035 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5036 {
5037 /* Disable sibcall if we need to generate vzeroupper after
5038 callee returns. */
5039 if (TARGET_VZEROUPPER
5040 && cfun->machine->callee_return_avx256_p
5041 && !cfun->machine->caller_return_avx256_p)
5042 return false;
5043 }
5044 else if (!rtx_equal_p (a, b))
5045 return false;
5046
5047 if (TARGET_64BIT)
5048 {
5049 /* The SYSV ABI has more call-clobbered registers;
5050 disallow sibcalls from MS to SYSV. */
5051 if (cfun->machine->call_abi == MS_ABI
5052 && ix86_function_type_abi (type) == SYSV_ABI)
5053 return false;
5054 }
5055 else
5056 {
5057 /* If this call is indirect, we'll need to be able to use a
5058 call-clobbered register for the address of the target function.
5059 Make sure that all such registers are not used for passing
5060 parameters. Note that DLLIMPORT functions are indirect. */
5061 if (!decl
5062 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5063 {
5064 if (ix86_function_regparm (type, NULL) >= 3)
5065 {
5066 /* ??? Need to count the actual number of registers to be used,
5067 not the possible number of registers. Fix later. */
5068 return false;
5069 }
5070 }
5071 }
5072
5073 /* Otherwise okay. That also includes certain types of indirect calls. */
5074 return true;
5075 }
5076
5077 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5078 and "sseregparm" calling convention attributes;
5079 arguments as in struct attribute_spec.handler. */
5080
5081 static tree
5082 ix86_handle_cconv_attribute (tree *node, tree name,
5083 tree args,
5084 int flags ATTRIBUTE_UNUSED,
5085 bool *no_add_attrs)
5086 {
5087 if (TREE_CODE (*node) != FUNCTION_TYPE
5088 && TREE_CODE (*node) != METHOD_TYPE
5089 && TREE_CODE (*node) != FIELD_DECL
5090 && TREE_CODE (*node) != TYPE_DECL)
5091 {
5092 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5093 name);
5094 *no_add_attrs = true;
5095 return NULL_TREE;
5096 }
5097
5098   /* Can combine regparm with all attributes but fastcall and thiscall.  */
5099 if (is_attribute_p ("regparm", name))
5100 {
5101 tree cst;
5102
5103 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5104 {
5105 error ("fastcall and regparm attributes are not compatible");
5106 }
5107
5108 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5109 {
5110 error ("regparam and thiscall attributes are not compatible");
5111 }
5112
5113 cst = TREE_VALUE (args);
5114 if (TREE_CODE (cst) != INTEGER_CST)
5115 {
5116 warning (OPT_Wattributes,
5117 "%qE attribute requires an integer constant argument",
5118 name);
5119 *no_add_attrs = true;
5120 }
5121 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5122 {
5123 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5124 name, REGPARM_MAX);
5125 *no_add_attrs = true;
5126 }
5127
5128 return NULL_TREE;
5129 }
5130
5131 if (TARGET_64BIT)
5132 {
5133 /* Do not warn when emulating the MS ABI. */
5134 if ((TREE_CODE (*node) != FUNCTION_TYPE
5135 && TREE_CODE (*node) != METHOD_TYPE)
5136 || ix86_function_type_abi (*node) != MS_ABI)
5137 warning (OPT_Wattributes, "%qE attribute ignored",
5138 name);
5139 *no_add_attrs = true;
5140 return NULL_TREE;
5141 }
5142
5143 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5144 if (is_attribute_p ("fastcall", name))
5145 {
5146 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5147 {
5148 error ("fastcall and cdecl attributes are not compatible");
5149 }
5150 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5151 {
5152 error ("fastcall and stdcall attributes are not compatible");
5153 }
5154 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5155 {
5156 error ("fastcall and regparm attributes are not compatible");
5157 }
5158 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5159 {
5160 error ("fastcall and thiscall attributes are not compatible");
5161 }
5162 }
5163
5164 /* Can combine stdcall with fastcall (redundant), regparm and
5165 sseregparm. */
5166 else if (is_attribute_p ("stdcall", name))
5167 {
5168 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5169 {
5170 error ("stdcall and cdecl attributes are not compatible");
5171 }
5172 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5173 {
5174 error ("stdcall and fastcall attributes are not compatible");
5175 }
5176 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5177 {
5178 error ("stdcall and thiscall attributes are not compatible");
5179 }
5180 }
5181
5182 /* Can combine cdecl with regparm and sseregparm. */
5183 else if (is_attribute_p ("cdecl", name))
5184 {
5185 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5186 {
5187 error ("stdcall and cdecl attributes are not compatible");
5188 }
5189 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5190 {
5191 error ("fastcall and cdecl attributes are not compatible");
5192 }
5193 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5194 {
5195 error ("cdecl and thiscall attributes are not compatible");
5196 }
5197 }
5198 else if (is_attribute_p ("thiscall", name))
5199 {
5200 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5201 warning (OPT_Wattributes, "%qE attribute is used for none class-method",
5202 name);
5203 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5204 {
5205 error ("stdcall and thiscall attributes are not compatible");
5206 }
5207 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5208 {
5209 error ("fastcall and thiscall attributes are not compatible");
5210 }
5211 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5212 {
5213 error ("cdecl and thiscall attributes are not compatible");
5214 }
5215 }
5216
5217 /* Can combine sseregparm with all attributes. */
5218
5219 return NULL_TREE;
5220 }
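
/* User-level sketch of the conventions handled above (hypothetical
   declarations, not part of this file), for -m32:

     int __attribute__ ((regparm (3))) f1 (int a, int b, int c);
	first three integer arguments in %eax, %edx and %ecx
     int __attribute__ ((fastcall)) f2 (int a, int b);
	a in %ecx, b in %edx; the callee pops any stack arguments
     int __attribute__ ((stdcall)) f3 (int a, int b);
	both arguments on the stack; the callee pops them

   Combining, say, fastcall with regparm on the same type is rejected with
   the errors issued by ix86_handle_cconv_attribute above.  */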
5221
5222 /* The transactional memory builtins are implicitly regparm or fastcall
5223 depending on the ABI. Override the generic do-nothing attribute that
5224 these builtins were declared with, and replace it with one of the two
5225 attributes that we expect elsewhere. */
5226
5227 static tree
5228 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5229 tree args ATTRIBUTE_UNUSED,
5230 int flags ATTRIBUTE_UNUSED,
5231 bool *no_add_attrs)
5232 {
5233 tree alt;
5234
5235 /* In no case do we want to add the placeholder attribute. */
5236 *no_add_attrs = true;
5237
5238 /* The 64-bit ABI is unchanged for transactional memory. */
5239 if (TARGET_64BIT)
5240 return NULL_TREE;
5241
5242 /* ??? Is there a better way to validate 32-bit windows? We have
5243 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5244 if (CHECK_STACK_LIMIT > 0)
5245 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5246 else
5247 {
5248 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5249 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5250 }
5251 decl_attributes (node, alt, flags);
5252
5253 return NULL_TREE;
5254 }
5255
5256 /* This function determines from TYPE the calling-convention. */
5257
5258 unsigned int
5259 ix86_get_callcvt (const_tree type)
5260 {
5261 unsigned int ret = 0;
5262 bool is_stdarg;
5263 tree attrs;
5264
5265 if (TARGET_64BIT)
5266 return IX86_CALLCVT_CDECL;
5267
5268 attrs = TYPE_ATTRIBUTES (type);
5269 if (attrs != NULL_TREE)
5270 {
5271 if (lookup_attribute ("cdecl", attrs))
5272 ret |= IX86_CALLCVT_CDECL;
5273 else if (lookup_attribute ("stdcall", attrs))
5274 ret |= IX86_CALLCVT_STDCALL;
5275 else if (lookup_attribute ("fastcall", attrs))
5276 ret |= IX86_CALLCVT_FASTCALL;
5277 else if (lookup_attribute ("thiscall", attrs))
5278 ret |= IX86_CALLCVT_THISCALL;
5279
5280       /* Regparm isn't allowed for thiscall and fastcall.  */
5281 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5282 {
5283 if (lookup_attribute ("regparm", attrs))
5284 ret |= IX86_CALLCVT_REGPARM;
5285 if (lookup_attribute ("sseregparm", attrs))
5286 ret |= IX86_CALLCVT_SSEREGPARM;
5287 }
5288
5289 if (IX86_BASE_CALLCVT(ret) != 0)
5290 return ret;
5291 }
5292
5293 is_stdarg = stdarg_p (type);
5294 if (TARGET_RTD && !is_stdarg)
5295 return IX86_CALLCVT_STDCALL | ret;
5296
5297 if (ret != 0
5298 || is_stdarg
5299 || TREE_CODE (type) != METHOD_TYPE
5300 || ix86_function_type_abi (type) != MS_ABI)
5301 return IX86_CALLCVT_CDECL | ret;
5302
5303 return IX86_CALLCVT_THISCALL;
5304 }
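
/* For instance, compiling with -mrtd makes a plain "int f (int);" behave
   as if it were declared stdcall (IX86_CALLCVT_STDCALL above), while a
   stdarg function such as printf keeps the default cdecl convention.  */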
5305
5306 /* Return 0 if the attributes for two types are incompatible, 1 if they
5307 are compatible, and 2 if they are nearly compatible (which causes a
5308 warning to be generated). */
5309
5310 static int
5311 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5312 {
5313 unsigned int ccvt1, ccvt2;
5314
5315 if (TREE_CODE (type1) != FUNCTION_TYPE
5316 && TREE_CODE (type1) != METHOD_TYPE)
5317 return 1;
5318
5319 ccvt1 = ix86_get_callcvt (type1);
5320 ccvt2 = ix86_get_callcvt (type2);
5321 if (ccvt1 != ccvt2)
5322 return 0;
5323 if (ix86_function_regparm (type1, NULL)
5324 != ix86_function_regparm (type2, NULL))
5325 return 0;
5326
5327 return 1;
5328 }
5329 \f
5330 /* Return the regparm value for a function with the indicated TYPE and DECL.
5331 DECL may be NULL when calling function indirectly
5332 or considering a libcall. */
5333
5334 static int
5335 ix86_function_regparm (const_tree type, const_tree decl)
5336 {
5337 tree attr;
5338 int regparm;
5339 unsigned int ccvt;
5340
5341 if (TARGET_64BIT)
5342 return (ix86_function_type_abi (type) == SYSV_ABI
5343 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5344 ccvt = ix86_get_callcvt (type);
5345 regparm = ix86_regparm;
5346
5347 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5348 {
5349 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5350 if (attr)
5351 {
5352 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5353 return regparm;
5354 }
5355 }
5356 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5357 return 2;
5358 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5359 return 1;
5360
5361 /* Use register calling convention for local functions when possible. */
5362 if (decl
5363 && TREE_CODE (decl) == FUNCTION_DECL
5364 && optimize
5365 && !(profile_flag && !flag_fentry))
5366 {
5367 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5368 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5369 if (i && i->local && i->can_change_signature)
5370 {
5371 int local_regparm, globals = 0, regno;
5372
5373 /* Make sure no regparm register is taken by a
5374 fixed register variable. */
5375 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5376 if (fixed_regs[local_regparm])
5377 break;
5378
5379 /* We don't want to use regparm(3) for nested functions as
5380 these use a static chain pointer in the third argument. */
5381 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5382 local_regparm = 2;
5383
5384 /* In 32-bit mode save a register for the split stack. */
5385 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5386 local_regparm = 2;
5387
5388 	  /* Each fixed register usage increases register pressure,
5389 	     so fewer registers should be used for argument passing.
5390 	     This functionality can be overridden by an explicit
5391 	     regparm value.  */
5392 for (regno = 0; regno <= DI_REG; regno++)
5393 if (fixed_regs[regno])
5394 globals++;
5395
5396 local_regparm
5397 = globals < local_regparm ? local_regparm - globals : 0;
5398
5399 if (local_regparm > regparm)
5400 regparm = local_regparm;
5401 }
5402 }
5403
5404 return regparm;
5405 }
5406
5407 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5408 DFmode (2) arguments in SSE registers for a function with the
5409 indicated TYPE and DECL. DECL may be NULL when calling function
5410 indirectly or considering a libcall. Otherwise return 0. */
5411
5412 static int
5413 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5414 {
5415 gcc_assert (!TARGET_64BIT);
5416
5417 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5418 by the sseregparm attribute. */
5419 if (TARGET_SSEREGPARM
5420 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5421 {
5422 if (!TARGET_SSE)
5423 {
5424 if (warn)
5425 {
5426 if (decl)
5427 error ("calling %qD with attribute sseregparm without "
5428 "SSE/SSE2 enabled", decl);
5429 else
5430 error ("calling %qT with attribute sseregparm without "
5431 "SSE/SSE2 enabled", type);
5432 }
5433 return 0;
5434 }
5435
5436 return 2;
5437 }
5438
5439 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5440 (and DFmode for SSE2) arguments in SSE registers. */
5441 if (decl && TARGET_SSE_MATH && optimize
5442 && !(profile_flag && !flag_fentry))
5443 {
5444 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5445 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5446 if (i && i->local && i->can_change_signature)
5447 return TARGET_SSE2 ? 2 : 1;
5448 }
5449
5450 return 0;
5451 }
5452
5453 /* Return true if EAX is live at the start of the function. Used by
5454 ix86_expand_prologue to determine if we need special help before
5455 calling allocate_stack_worker. */
5456
5457 static bool
5458 ix86_eax_live_at_start_p (void)
5459 {
5460 /* Cheat. Don't bother working forward from ix86_function_regparm
5461 to the function type to whether an actual argument is located in
5462 eax. Instead just look at cfg info, which is still close enough
5463 to correct at this point. This gives false positives for broken
5464 functions that might use uninitialized data that happens to be
5465 allocated in eax, but who cares? */
5466 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5467 }
5468
5469 static bool
5470 ix86_keep_aggregate_return_pointer (tree fntype)
5471 {
5472 tree attr;
5473
5474 if (!TARGET_64BIT)
5475 {
5476 attr = lookup_attribute ("callee_pop_aggregate_return",
5477 TYPE_ATTRIBUTES (fntype));
5478 if (attr)
5479 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5480
5481       /* For the 32-bit MS ABI the default is to keep the aggregate
5482          return pointer.  */
5483 if (ix86_function_type_abi (fntype) == MS_ABI)
5484 return true;
5485 }
5486 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5487 }
5488
5489 /* Value is the number of bytes of arguments automatically
5490 popped when returning from a subroutine call.
5491 FUNDECL is the declaration node of the function (as a tree),
5492 FUNTYPE is the data type of the function (as a tree),
5493 or for a library call it is an identifier node for the subroutine name.
5494 SIZE is the number of bytes of arguments passed on the stack.
5495
5496 On the 80386, the RTD insn may be used to pop them if the number
5497 of args is fixed, but if the number is variable then the caller
5498 must pop them all. RTD can't be used for library calls now
5499 because the library is compiled with the Unix compiler.
5500 Use of RTD is a selectable option, since it is incompatible with
5501 standard Unix calling sequences. If the option is not selected,
5502 the caller must always pop the args.
5503
5504 The attribute stdcall is equivalent to RTD on a per module basis. */
5505
5506 static int
5507 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5508 {
5509 unsigned int ccvt;
5510
5511 /* None of the 64-bit ABIs pop arguments. */
5512 if (TARGET_64BIT)
5513 return 0;
5514
5515 ccvt = ix86_get_callcvt (funtype);
5516
5517 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5518 | IX86_CALLCVT_THISCALL)) != 0
5519 && ! stdarg_p (funtype))
5520 return size;
5521
5522 /* Lose any fake structure return argument if it is passed on the stack. */
5523 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5524 && !ix86_keep_aggregate_return_pointer (funtype))
5525 {
5526 int nregs = ix86_function_regparm (funtype, fundecl);
5527 if (nregs == 0)
5528 return GET_MODE_SIZE (Pmode);
5529 }
5530
5531 return 0;
5532 }
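
/* Sketch of the effect (32-bit, hypothetical declaration):

     void __attribute__ ((stdcall)) f (int a, int b);

   ix86_return_pops_args returns 8 here, so the callee's epilogue uses
   "ret $8" and the caller does not adjust %esp after the call.  */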
5533 \f
5534 /* Argument support functions. */
5535
5536 /* Return true when register may be used to pass function parameters. */
5537 bool
5538 ix86_function_arg_regno_p (int regno)
5539 {
5540 int i;
5541 const int *parm_regs;
5542
5543 if (!TARGET_64BIT)
5544 {
5545 if (TARGET_MACHO)
5546 return (regno < REGPARM_MAX
5547 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5548 else
5549 return (regno < REGPARM_MAX
5550 || (TARGET_MMX && MMX_REGNO_P (regno)
5551 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5552 || (TARGET_SSE && SSE_REGNO_P (regno)
5553 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5554 }
5555
5556 if (TARGET_MACHO)
5557 {
5558 if (SSE_REGNO_P (regno) && TARGET_SSE)
5559 return true;
5560 }
5561 else
5562 {
5563 if (TARGET_SSE && SSE_REGNO_P (regno)
5564 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5565 return true;
5566 }
5567
5568   /* TODO: The function should depend on the current function's ABI, but
5569      builtins.c would need updating then.  Therefore we use the
5570      default ABI.  */
5571
5572 /* RAX is used as hidden argument to va_arg functions. */
5573 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5574 return true;
5575
5576 if (ix86_abi == MS_ABI)
5577 parm_regs = x86_64_ms_abi_int_parameter_registers;
5578 else
5579 parm_regs = x86_64_int_parameter_registers;
5580 for (i = 0; i < (ix86_abi == MS_ABI
5581 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5582 if (regno == parm_regs[i])
5583 return true;
5584 return false;
5585 }
5586
5587 /* Return true if we do not know how to pass TYPE solely in registers.  */
5588
5589 static bool
5590 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5591 {
5592 if (must_pass_in_stack_var_size_or_pad (mode, type))
5593 return true;
5594
5595 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5596 The layout_type routine is crafty and tries to trick us into passing
5597 currently unsupported vector types on the stack by using TImode. */
5598 return (!TARGET_64BIT && mode == TImode
5599 && type && TREE_CODE (type) != VECTOR_TYPE);
5600 }
5601
5602 /* Return the size, in bytes, of the area reserved for arguments passed
5603    in registers for the function represented by FNDECL, depending on the
5604    ABI used.  */
5605 int
5606 ix86_reg_parm_stack_space (const_tree fndecl)
5607 {
5608 enum calling_abi call_abi = SYSV_ABI;
5609 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5610 call_abi = ix86_function_abi (fndecl);
5611 else
5612 call_abi = ix86_function_type_abi (fndecl);
5613 if (TARGET_64BIT && call_abi == MS_ABI)
5614 return 32;
5615 return 0;
5616 }
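
/* For a 64-bit MS-ABI call this means the caller always reserves the
   32-byte register parameter ("home") area, even when fewer than four
   arguments are passed.  A rough sketch (ignoring any extra alignment
   padding the call site may need):

     subq	$32, %rsp
     call	callee
     addq	$32, %rsp

   For the SYSV ABI the function returns 0 and no such area exists.  */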
5617
5618 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5619    call ABI used.  */
5620 enum calling_abi
5621 ix86_function_type_abi (const_tree fntype)
5622 {
5623 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5624 {
5625 enum calling_abi abi = ix86_abi;
5626 if (abi == SYSV_ABI)
5627 {
5628 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5629 abi = MS_ABI;
5630 }
5631 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5632 abi = SYSV_ABI;
5633 return abi;
5634 }
5635 return ix86_abi;
5636 }
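
/* Hypothetical example: on a SYSV x86-64 target,

     void __attribute__ ((ms_abi)) g (int a, int b);

   is called with A in %rcx and B in %rdx instead of %rdi and %rsi;
   conversely the "sysv_abi" attribute forces the SYSV convention when
   ix86_abi is MS_ABI (e.g. on mingw64 targets).  */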
5637
5638 static bool
5639 ix86_function_ms_hook_prologue (const_tree fn)
5640 {
5641 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5642 {
5643 if (decl_function_context (fn) != NULL_TREE)
5644 error_at (DECL_SOURCE_LOCATION (fn),
5645 "ms_hook_prologue is not compatible with nested function");
5646 else
5647 return true;
5648 }
5649 return false;
5650 }
5651
5652 static enum calling_abi
5653 ix86_function_abi (const_tree fndecl)
5654 {
5655 if (! fndecl)
5656 return ix86_abi;
5657 return ix86_function_type_abi (TREE_TYPE (fndecl));
5658 }
5659
5660 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5661    call ABI used.  */
5662 enum calling_abi
5663 ix86_cfun_abi (void)
5664 {
5665 if (! cfun)
5666 return ix86_abi;
5667 return cfun->machine->call_abi;
5668 }
5669
5670 /* Write the extra assembler code needed to declare a function properly. */
5671
5672 void
5673 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5674 tree decl)
5675 {
5676 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5677
5678 if (is_ms_hook)
5679 {
5680 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5681 unsigned int filler_cc = 0xcccccccc;
5682
5683 for (i = 0; i < filler_count; i += 4)
5684 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5685 }
5686
5687 #ifdef SUBTARGET_ASM_UNWIND_INIT
5688 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5689 #endif
5690
5691 ASM_OUTPUT_LABEL (asm_out_file, fname);
5692
5693 /* Output magic byte marker, if hot-patch attribute is set. */
5694 if (is_ms_hook)
5695 {
5696 if (TARGET_64BIT)
5697 {
5698 /* leaq [%rsp + 0], %rsp */
5699 asm_fprintf (asm_out_file, ASM_BYTE
5700 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5701 }
5702 else
5703 {
5704 /* movl.s %edi, %edi
5705 push %ebp
5706 movl.s %esp, %ebp */
5707 asm_fprintf (asm_out_file, ASM_BYTE
5708 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5709 }
5710 }
5711 }
5712
5713 /* regclass.c */
5714 extern void init_regs (void);
5715
5716 /* Implementation of the call ABI switching target hook.  The call
5717    register sets specific to FNDECL's ABI are set up here.  See also
5718    ix86_conditional_register_usage for more details.  */
5719 void
5720 ix86_call_abi_override (const_tree fndecl)
5721 {
5722 if (fndecl == NULL_TREE)
5723 cfun->machine->call_abi = ix86_abi;
5724 else
5725 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5726 }
5727
5728 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5729    Avoid the expensive re-initialization of init_regs each time we switch
5730    function context, since it is needed only during RTL expansion.  */
5731 static void
5732 ix86_maybe_switch_abi (void)
5733 {
5734 if (TARGET_64BIT &&
5735 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5736 reinit_regs ();
5737 }
5738
5739 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5740 for a call to a function whose data type is FNTYPE.
5741 For a library call, FNTYPE is 0. */
5742
5743 void
5744 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5745 tree fntype, /* tree ptr for function decl */
5746 rtx libname, /* SYMBOL_REF of library name or 0 */
5747 tree fndecl,
5748 int caller)
5749 {
5750 struct cgraph_local_info *i;
5751 tree fnret_type;
5752
5753 memset (cum, 0, sizeof (*cum));
5754
5755 /* Initialize for the current callee. */
5756 if (caller)
5757 {
5758 cfun->machine->callee_pass_avx256_p = false;
5759 cfun->machine->callee_return_avx256_p = false;
5760 }
5761
5762 if (fndecl)
5763 {
5764 i = cgraph_local_info (fndecl);
5765 cum->call_abi = ix86_function_abi (fndecl);
5766 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5767 }
5768 else
5769 {
5770 i = NULL;
5771 cum->call_abi = ix86_function_type_abi (fntype);
5772 if (fntype)
5773 fnret_type = TREE_TYPE (fntype);
5774 else
5775 fnret_type = NULL;
5776 }
5777
5778 if (TARGET_VZEROUPPER && fnret_type)
5779 {
5780 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5781 false);
5782 if (function_pass_avx256_p (fnret_value))
5783 {
5784 /* The return value of this function uses 256bit AVX modes. */
5785 if (caller)
5786 cfun->machine->callee_return_avx256_p = true;
5787 else
5788 cfun->machine->caller_return_avx256_p = true;
5789 }
5790 }
5791
5792 cum->caller = caller;
5793
5794 /* Set up the number of registers to use for passing arguments. */
5795
5796 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5797 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5798 "or subtarget optimization implying it");
5799 cum->nregs = ix86_regparm;
5800 if (TARGET_64BIT)
5801 {
5802 cum->nregs = (cum->call_abi == SYSV_ABI
5803 ? X86_64_REGPARM_MAX
5804 : X86_64_MS_REGPARM_MAX);
5805 }
5806 if (TARGET_SSE)
5807 {
5808 cum->sse_nregs = SSE_REGPARM_MAX;
5809 if (TARGET_64BIT)
5810 {
5811 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5812 ? X86_64_SSE_REGPARM_MAX
5813 : X86_64_MS_SSE_REGPARM_MAX);
5814 }
5815 }
5816 if (TARGET_MMX)
5817 cum->mmx_nregs = MMX_REGPARM_MAX;
5818 cum->warn_avx = true;
5819 cum->warn_sse = true;
5820 cum->warn_mmx = true;
5821
5822   /* Because the type might mismatch between caller and callee, we need to
5823      use the actual type of the function for local calls.
5824      FIXME: cgraph_analyze can be told to actually record if the function uses
5825      va_start, so for local functions maybe_vaarg can be made more aggressive,
5826      helping K&R code.
5827      FIXME: once the type system is fixed, we won't need this code anymore.  */
5828 if (i && i->local && i->can_change_signature)
5829 fntype = TREE_TYPE (fndecl);
5830 cum->maybe_vaarg = (fntype
5831 ? (!prototype_p (fntype) || stdarg_p (fntype))
5832 : !libname);
5833
5834 if (!TARGET_64BIT)
5835 {
5836 /* If there are variable arguments, then we won't pass anything
5837 in registers in 32-bit mode. */
5838 if (stdarg_p (fntype))
5839 {
5840 cum->nregs = 0;
5841 cum->sse_nregs = 0;
5842 cum->mmx_nregs = 0;
5843 cum->warn_avx = 0;
5844 cum->warn_sse = 0;
5845 cum->warn_mmx = 0;
5846 return;
5847 }
5848
5849 /* Use ecx and edx registers if function has fastcall attribute,
5850 else look for regparm information. */
5851 if (fntype)
5852 {
5853 unsigned int ccvt = ix86_get_callcvt (fntype);
5854 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5855 {
5856 cum->nregs = 1;
5857 cum->fastcall = 1; /* Same first register as in fastcall. */
5858 }
5859 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5860 {
5861 cum->nregs = 2;
5862 cum->fastcall = 1;
5863 }
5864 else
5865 cum->nregs = ix86_function_regparm (fntype, fndecl);
5866 }
5867
5868 /* Set up the number of SSE registers used for passing SFmode
5869 and DFmode arguments. Warn for mismatching ABI. */
5870 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5871 }
5872 }
5873
5874 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5875 But in the case of vector types, it is some vector mode.
5876
5877 When we have only some of our vector isa extensions enabled, then there
5878 are some modes for which vector_mode_supported_p is false. For these
5879 modes, the generic vector support in gcc will choose some non-vector mode
5880 in order to implement the type. By computing the natural mode, we'll
5881 select the proper ABI location for the operand and not depend on whatever
5882 the middle-end decides to do with these vector types.
5883
5884    The middle-end can't deal with vector types > 16 bytes.  In this
5885    case, we return the original mode and warn about the ABI change if CUM
5886    isn't NULL.  */
5887
5888 static enum machine_mode
5889 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5890 {
5891 enum machine_mode mode = TYPE_MODE (type);
5892
5893 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5894 {
5895 HOST_WIDE_INT size = int_size_in_bytes (type);
5896 if ((size == 8 || size == 16 || size == 32)
5897 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5898 && TYPE_VECTOR_SUBPARTS (type) > 1)
5899 {
5900 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5901
5902 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5903 mode = MIN_MODE_VECTOR_FLOAT;
5904 else
5905 mode = MIN_MODE_VECTOR_INT;
5906
5907 /* Get the mode which has this inner mode and number of units. */
5908 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5909 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5910 && GET_MODE_INNER (mode) == innermode)
5911 {
5912 if (size == 32 && !TARGET_AVX)
5913 {
5914 static bool warnedavx;
5915
5916 if (cum
5917 && !warnedavx
5918 && cum->warn_avx)
5919 {
5920 warnedavx = true;
5921 warning (0, "AVX vector argument without AVX "
5922 "enabled changes the ABI");
5923 }
5924 return TYPE_MODE (type);
5925 }
5926 else if ((size == 8 || size == 16) && !TARGET_SSE)
5927 {
5928 static bool warnedsse;
5929
5930 if (cum
5931 && !warnedsse
5932 && cum->warn_sse)
5933 {
5934 warnedsse = true;
5935 warning (0, "SSE vector argument without SSE "
5936 "enabled changes the ABI");
5937 }
5938 return mode;
5939 }
5940 else
5941 return mode;
5942 }
5943
5944 gcc_unreachable ();
5945 }
5946 }
5947
5948 return mode;
5949 }
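
/* Illustration (hypothetical type): given

     typedef float v8sf __attribute__ ((vector_size (32)));

   with -mavx the natural mode computed above is V8SFmode and the argument
   can travel in a YMM register; without -mavx the AVX warning above fires
   and the original mode chosen by the middle-end is used instead, which
   changes how the argument is passed (hence the ABI warning).  */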
5950
5951 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5952 this may not agree with the mode that the type system has chosen for the
5953 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5954 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5955
5956 static rtx
5957 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5958 unsigned int regno)
5959 {
5960 rtx tmp;
5961
5962 if (orig_mode != BLKmode)
5963 tmp = gen_rtx_REG (orig_mode, regno);
5964 else
5965 {
5966 tmp = gen_rtx_REG (mode, regno);
5967 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5968 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5969 }
5970
5971 return tmp;
5972 }
5973
5974 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5975    The goal of this code is to classify each eightbyte of the incoming argument
5976    by register class and assign registers accordingly.  */
5977
5978 /* Return the union class of CLASS1 and CLASS2.
5979 See the x86-64 PS ABI for details. */
5980
5981 static enum x86_64_reg_class
5982 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5983 {
5984 /* Rule #1: If both classes are equal, this is the resulting class. */
5985 if (class1 == class2)
5986 return class1;
5987
5988 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5989 the other class. */
5990 if (class1 == X86_64_NO_CLASS)
5991 return class2;
5992 if (class2 == X86_64_NO_CLASS)
5993 return class1;
5994
5995 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5996 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5997 return X86_64_MEMORY_CLASS;
5998
5999 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6000 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6001 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6002 return X86_64_INTEGERSI_CLASS;
6003 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6004 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6005 return X86_64_INTEGER_CLASS;
6006
6007 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6008 MEMORY is used. */
6009 if (class1 == X86_64_X87_CLASS
6010 || class1 == X86_64_X87UP_CLASS
6011 || class1 == X86_64_COMPLEX_X87_CLASS
6012 || class2 == X86_64_X87_CLASS
6013 || class2 == X86_64_X87UP_CLASS
6014 || class2 == X86_64_COMPLEX_X87_CLASS)
6015 return X86_64_MEMORY_CLASS;
6016
6017 /* Rule #6: Otherwise class SSE is used. */
6018 return X86_64_SSE_CLASS;
6019 }
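
/* Worked example of the rules above (hypothetical type):

     struct s { int i; float f; };

   Both fields fall into one eightbyte; the int classifies as INTEGERSI
   and the float, at a non-zero offset within the eightbyte, as SSE.
   Rule #4 merges them to INTEGER, so the whole struct is passed in a
   single general-purpose register.  */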
6020
6021 /* Classify the argument of type TYPE and mode MODE.
6022 CLASSES will be filled by the register class used to pass each word
6023 of the operand. The number of words is returned. In case the parameter
6024 should be passed in memory, 0 is returned. As a special case for zero
6025 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6026
6027    BIT_OFFSET is used internally for handling records and specifies the
6028    offset in bits, modulo 256, to avoid overflow cases.
6029
6030 See the x86-64 PS ABI for details.
6031 */
6032
6033 static int
6034 classify_argument (enum machine_mode mode, const_tree type,
6035 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6036 {
6037 HOST_WIDE_INT bytes =
6038 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6039 int words
6040 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6041
6042 /* Variable sized entities are always passed/returned in memory. */
6043 if (bytes < 0)
6044 return 0;
6045
6046 if (mode != VOIDmode
6047 && targetm.calls.must_pass_in_stack (mode, type))
6048 return 0;
6049
6050 if (type && AGGREGATE_TYPE_P (type))
6051 {
6052 int i;
6053 tree field;
6054 enum x86_64_reg_class subclasses[MAX_CLASSES];
6055
6056 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6057 if (bytes > 32)
6058 return 0;
6059
6060 for (i = 0; i < words; i++)
6061 classes[i] = X86_64_NO_CLASS;
6062
6063       /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
6064 	 signal the memory class, so handle it as a special case.  */
6065 if (!words)
6066 {
6067 classes[0] = X86_64_NO_CLASS;
6068 return 1;
6069 }
6070
6071 /* Classify each field of record and merge classes. */
6072 switch (TREE_CODE (type))
6073 {
6074 case RECORD_TYPE:
6075 /* And now merge the fields of structure. */
6076 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6077 {
6078 if (TREE_CODE (field) == FIELD_DECL)
6079 {
6080 int num;
6081
6082 if (TREE_TYPE (field) == error_mark_node)
6083 continue;
6084
6085 /* Bitfields are always classified as integer. Handle them
6086 early, since later code would consider them to be
6087 misaligned integers. */
6088 if (DECL_BIT_FIELD (field))
6089 {
6090 for (i = (int_bit_position (field)
6091 + (bit_offset % 64)) / 8 / 8;
6092 i < ((int_bit_position (field) + (bit_offset % 64))
6093 + tree_low_cst (DECL_SIZE (field), 0)
6094 + 63) / 8 / 8; i++)
6095 classes[i] =
6096 merge_classes (X86_64_INTEGER_CLASS,
6097 classes[i]);
6098 }
6099 else
6100 {
6101 int pos;
6102
6103 type = TREE_TYPE (field);
6104
6105 /* Flexible array member is ignored. */
6106 if (TYPE_MODE (type) == BLKmode
6107 && TREE_CODE (type) == ARRAY_TYPE
6108 && TYPE_SIZE (type) == NULL_TREE
6109 && TYPE_DOMAIN (type) != NULL_TREE
6110 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6111 == NULL_TREE))
6112 {
6113 static bool warned;
6114
6115 if (!warned && warn_psabi)
6116 {
6117 warned = true;
6118 inform (input_location,
6119 "the ABI of passing struct with"
6120 " a flexible array member has"
6121 " changed in GCC 4.4");
6122 }
6123 continue;
6124 }
6125 num = classify_argument (TYPE_MODE (type), type,
6126 subclasses,
6127 (int_bit_position (field)
6128 + bit_offset) % 256);
6129 if (!num)
6130 return 0;
6131 pos = (int_bit_position (field)
6132 + (bit_offset % 64)) / 8 / 8;
6133 for (i = 0; i < num && (i + pos) < words; i++)
6134 classes[i + pos] =
6135 merge_classes (subclasses[i], classes[i + pos]);
6136 }
6137 }
6138 }
6139 break;
6140
6141 case ARRAY_TYPE:
6142 /* Arrays are handled as small records. */
6143 {
6144 int num;
6145 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6146 TREE_TYPE (type), subclasses, bit_offset);
6147 if (!num)
6148 return 0;
6149
6150 /* The partial classes are now full classes. */
6151 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6152 subclasses[0] = X86_64_SSE_CLASS;
6153 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6154 && !((bit_offset % 64) == 0 && bytes == 4))
6155 subclasses[0] = X86_64_INTEGER_CLASS;
6156
6157 for (i = 0; i < words; i++)
6158 classes[i] = subclasses[i % num];
6159
6160 break;
6161 }
6162 case UNION_TYPE:
6163 case QUAL_UNION_TYPE:
6164 /* Unions are similar to RECORD_TYPE but offset is always 0.
6165 */
6166 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6167 {
6168 if (TREE_CODE (field) == FIELD_DECL)
6169 {
6170 int num;
6171
6172 if (TREE_TYPE (field) == error_mark_node)
6173 continue;
6174
6175 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6176 TREE_TYPE (field), subclasses,
6177 bit_offset);
6178 if (!num)
6179 return 0;
6180 for (i = 0; i < num; i++)
6181 classes[i] = merge_classes (subclasses[i], classes[i]);
6182 }
6183 }
6184 break;
6185
6186 default:
6187 gcc_unreachable ();
6188 }
6189
6190 if (words > 2)
6191 {
6192 	  /* When size > 16 bytes, if the first class isn't
6193 	     X86_64_SSE_CLASS or any of the others isn't
6194 	     X86_64_SSEUP_CLASS, everything should be passed in
6195 	     memory.  */
6196 if (classes[0] != X86_64_SSE_CLASS)
6197 return 0;
6198
6199 for (i = 1; i < words; i++)
6200 if (classes[i] != X86_64_SSEUP_CLASS)
6201 return 0;
6202 }
6203
6204 /* Final merger cleanup. */
6205 for (i = 0; i < words; i++)
6206 {
6207 /* If one class is MEMORY, everything should be passed in
6208 memory. */
6209 if (classes[i] == X86_64_MEMORY_CLASS)
6210 return 0;
6211
6212 	  /* The X86_64_SSEUP_CLASS should always be preceded by
6213 	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
6214 if (classes[i] == X86_64_SSEUP_CLASS
6215 && classes[i - 1] != X86_64_SSE_CLASS
6216 && classes[i - 1] != X86_64_SSEUP_CLASS)
6217 {
6218 /* The first one should never be X86_64_SSEUP_CLASS. */
6219 gcc_assert (i != 0);
6220 classes[i] = X86_64_SSE_CLASS;
6221 }
6222
6223 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6224 everything should be passed in memory. */
6225 if (classes[i] == X86_64_X87UP_CLASS
6226 && (classes[i - 1] != X86_64_X87_CLASS))
6227 {
6228 static bool warned;
6229
6230 /* The first one should never be X86_64_X87UP_CLASS. */
6231 gcc_assert (i != 0);
6232 if (!warned && warn_psabi)
6233 {
6234 warned = true;
6235 inform (input_location,
6236 "the ABI of passing union with long double"
6237 " has changed in GCC 4.4");
6238 }
6239 return 0;
6240 }
6241 }
6242 return words;
6243 }
6244
6245   /* Compute the alignment needed.  We align all types to natural boundaries
6246      with the exception of XFmode, which is aligned to 64 bits.  */
6247 if (mode != VOIDmode && mode != BLKmode)
6248 {
6249 int mode_alignment = GET_MODE_BITSIZE (mode);
6250
6251 if (mode == XFmode)
6252 mode_alignment = 128;
6253 else if (mode == XCmode)
6254 mode_alignment = 256;
6255 if (COMPLEX_MODE_P (mode))
6256 mode_alignment /= 2;
6257 /* Misaligned fields are always returned in memory. */
6258 if (bit_offset % mode_alignment)
6259 return 0;
6260 }
6261
6262   /* For V1xx modes, just use the base mode.  */
6263 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6264 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6265 mode = GET_MODE_INNER (mode);
6266
6267 /* Classification of atomic types. */
6268 switch (mode)
6269 {
6270 case SDmode:
6271 case DDmode:
6272 classes[0] = X86_64_SSE_CLASS;
6273 return 1;
6274 case TDmode:
6275 classes[0] = X86_64_SSE_CLASS;
6276 classes[1] = X86_64_SSEUP_CLASS;
6277 return 2;
6278 case DImode:
6279 case SImode:
6280 case HImode:
6281 case QImode:
6282 case CSImode:
6283 case CHImode:
6284 case CQImode:
6285 {
6286 	int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6287
6288 if (size <= 32)
6289 {
6290 classes[0] = X86_64_INTEGERSI_CLASS;
6291 return 1;
6292 }
6293 else if (size <= 64)
6294 {
6295 classes[0] = X86_64_INTEGER_CLASS;
6296 return 1;
6297 }
6298 else if (size <= 64+32)
6299 {
6300 classes[0] = X86_64_INTEGER_CLASS;
6301 classes[1] = X86_64_INTEGERSI_CLASS;
6302 return 2;
6303 }
6304 else if (size <= 64+64)
6305 {
6306 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6307 return 2;
6308 }
6309 else
6310 gcc_unreachable ();
6311 }
6312 case CDImode:
6313 case TImode:
6314 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6315 return 2;
6316 case COImode:
6317 case OImode:
6318 /* OImode shouldn't be used directly. */
6319 gcc_unreachable ();
6320 case CTImode:
6321 return 0;
6322 case SFmode:
6323 if (!(bit_offset % 64))
6324 classes[0] = X86_64_SSESF_CLASS;
6325 else
6326 classes[0] = X86_64_SSE_CLASS;
6327 return 1;
6328 case DFmode:
6329 classes[0] = X86_64_SSEDF_CLASS;
6330 return 1;
6331 case XFmode:
6332 classes[0] = X86_64_X87_CLASS;
6333 classes[1] = X86_64_X87UP_CLASS;
6334 return 2;
6335 case TFmode:
6336 classes[0] = X86_64_SSE_CLASS;
6337 classes[1] = X86_64_SSEUP_CLASS;
6338 return 2;
6339 case SCmode:
6340 classes[0] = X86_64_SSE_CLASS;
6341 if (!(bit_offset % 64))
6342 return 1;
6343 else
6344 {
6345 static bool warned;
6346
6347 if (!warned && warn_psabi)
6348 {
6349 warned = true;
6350 inform (input_location,
6351 "the ABI of passing structure with complex float"
6352 " member has changed in GCC 4.4");
6353 }
6354 classes[1] = X86_64_SSESF_CLASS;
6355 return 2;
6356 }
6357 case DCmode:
6358 classes[0] = X86_64_SSEDF_CLASS;
6359 classes[1] = X86_64_SSEDF_CLASS;
6360 return 2;
6361 case XCmode:
6362 classes[0] = X86_64_COMPLEX_X87_CLASS;
6363 return 1;
6364 case TCmode:
6365       /* This mode is larger than 16 bytes.  */
6366 return 0;
6367 case V8SFmode:
6368 case V8SImode:
6369 case V32QImode:
6370 case V16HImode:
6371 case V4DFmode:
6372 case V4DImode:
6373 classes[0] = X86_64_SSE_CLASS;
6374 classes[1] = X86_64_SSEUP_CLASS;
6375 classes[2] = X86_64_SSEUP_CLASS;
6376 classes[3] = X86_64_SSEUP_CLASS;
6377 return 4;
6378 case V4SFmode:
6379 case V4SImode:
6380 case V16QImode:
6381 case V8HImode:
6382 case V2DFmode:
6383 case V2DImode:
6384 classes[0] = X86_64_SSE_CLASS;
6385 classes[1] = X86_64_SSEUP_CLASS;
6386 return 2;
6387 case V1TImode:
6388 case V1DImode:
6389 case V2SFmode:
6390 case V2SImode:
6391 case V4HImode:
6392 case V8QImode:
6393 classes[0] = X86_64_SSE_CLASS;
6394 return 1;
6395 case BLKmode:
6396 case VOIDmode:
6397 return 0;
6398 default:
6399 gcc_assert (VECTOR_MODE_P (mode));
6400
6401 if (bytes > 16)
6402 return 0;
6403
6404 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6405
6406 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6407 classes[0] = X86_64_INTEGERSI_CLASS;
6408 else
6409 classes[0] = X86_64_INTEGER_CLASS;
6410 classes[1] = X86_64_INTEGER_CLASS;
6411 return 1 + (bytes > 8);
6412 }
6413 }
6414
6415 /* Examine the argument and set the number of registers required in each
6416    class.  Return 0 iff the parameter should be passed in memory.  */
6417 static int
6418 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6419 int *int_nregs, int *sse_nregs)
6420 {
6421 enum x86_64_reg_class regclass[MAX_CLASSES];
6422 int n = classify_argument (mode, type, regclass, 0);
6423
6424 *int_nregs = 0;
6425 *sse_nregs = 0;
6426 if (!n)
6427 return 0;
6428 for (n--; n >= 0; n--)
6429 switch (regclass[n])
6430 {
6431 case X86_64_INTEGER_CLASS:
6432 case X86_64_INTEGERSI_CLASS:
6433 (*int_nregs)++;
6434 break;
6435 case X86_64_SSE_CLASS:
6436 case X86_64_SSESF_CLASS:
6437 case X86_64_SSEDF_CLASS:
6438 (*sse_nregs)++;
6439 break;
6440 case X86_64_NO_CLASS:
6441 case X86_64_SSEUP_CLASS:
6442 break;
6443 case X86_64_X87_CLASS:
6444 case X86_64_X87UP_CLASS:
6445 if (!in_return)
6446 return 0;
6447 break;
6448 case X86_64_COMPLEX_X87_CLASS:
6449 return in_return ? 2 : 0;
6450 case X86_64_MEMORY_CLASS:
6451 gcc_unreachable ();
6452 }
6453 return 1;
6454 }
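
/* For example (hypothetical type):

     struct p { double d; long l; };

   classifies as SSEDF followed by INTEGER, so examine_argument reports
   one SSE and one integer register; construct_container below then builds
   a PARALLEL placing D in an SSE register and L in a general-purpose
   register (%xmm0 and %rdi for a first argument under the SYSV ABI).  */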
6455
6456 /* Construct a container for the argument, as used by the GCC interface.  See
6457    FUNCTION_ARG for the detailed description.  */
6458
6459 static rtx
6460 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6461 const_tree type, int in_return, int nintregs, int nsseregs,
6462 const int *intreg, int sse_regno)
6463 {
6464 /* The following variables hold the static issued_error state. */
6465 static bool issued_sse_arg_error;
6466 static bool issued_sse_ret_error;
6467 static bool issued_x87_ret_error;
6468
6469 enum machine_mode tmpmode;
6470 int bytes =
6471 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6472 enum x86_64_reg_class regclass[MAX_CLASSES];
6473 int n;
6474 int i;
6475 int nexps = 0;
6476 int needed_sseregs, needed_intregs;
6477 rtx exp[MAX_CLASSES];
6478 rtx ret;
6479
6480 n = classify_argument (mode, type, regclass, 0);
6481 if (!n)
6482 return NULL;
6483 if (!examine_argument (mode, type, in_return, &needed_intregs,
6484 &needed_sseregs))
6485 return NULL;
6486 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6487 return NULL;
6488
6489 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6490 some less clueful developer tries to use floating-point anyway. */
6491 if (needed_sseregs && !TARGET_SSE)
6492 {
6493 if (in_return)
6494 {
6495 if (!issued_sse_ret_error)
6496 {
6497 error ("SSE register return with SSE disabled");
6498 issued_sse_ret_error = true;
6499 }
6500 }
6501 else if (!issued_sse_arg_error)
6502 {
6503 error ("SSE register argument with SSE disabled");
6504 issued_sse_arg_error = true;
6505 }
6506 return NULL;
6507 }
6508
6509 /* Likewise, error if the ABI requires us to return values in the
6510 x87 registers and the user specified -mno-80387. */
6511 if (!TARGET_80387 && in_return)
6512 for (i = 0; i < n; i++)
6513 if (regclass[i] == X86_64_X87_CLASS
6514 || regclass[i] == X86_64_X87UP_CLASS
6515 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6516 {
6517 if (!issued_x87_ret_error)
6518 {
6519 error ("x87 register return with x87 disabled");
6520 issued_x87_ret_error = true;
6521 }
6522 return NULL;
6523 }
6524
6525   /* First construct the simple cases.  Avoid SCmode, since we want to use
6526      a single register to pass this type.  */
6527 if (n == 1 && mode != SCmode)
6528 switch (regclass[0])
6529 {
6530 case X86_64_INTEGER_CLASS:
6531 case X86_64_INTEGERSI_CLASS:
6532 return gen_rtx_REG (mode, intreg[0]);
6533 case X86_64_SSE_CLASS:
6534 case X86_64_SSESF_CLASS:
6535 case X86_64_SSEDF_CLASS:
6536 if (mode != BLKmode)
6537 return gen_reg_or_parallel (mode, orig_mode,
6538 SSE_REGNO (sse_regno));
6539 break;
6540 case X86_64_X87_CLASS:
6541 case X86_64_COMPLEX_X87_CLASS:
6542 return gen_rtx_REG (mode, FIRST_STACK_REG);
6543 case X86_64_NO_CLASS:
6544 /* Zero sized array, struct or class. */
6545 return NULL;
6546 default:
6547 gcc_unreachable ();
6548 }
6549 if (n == 2
6550 && regclass[0] == X86_64_SSE_CLASS
6551 && regclass[1] == X86_64_SSEUP_CLASS
6552 && mode != BLKmode)
6553 return gen_reg_or_parallel (mode, orig_mode,
6554 SSE_REGNO (sse_regno));
6555 if (n == 4
6556 && regclass[0] == X86_64_SSE_CLASS
6557 && regclass[1] == X86_64_SSEUP_CLASS
6558 && regclass[2] == X86_64_SSEUP_CLASS
6559 && regclass[3] == X86_64_SSEUP_CLASS
6560 && mode != BLKmode)
6561 return gen_reg_or_parallel (mode, orig_mode,
6562 SSE_REGNO (sse_regno));
6563 if (n == 2
6564 && regclass[0] == X86_64_X87_CLASS
6565 && regclass[1] == X86_64_X87UP_CLASS)
6566 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6567
6568 if (n == 2
6569 && regclass[0] == X86_64_INTEGER_CLASS
6570 && regclass[1] == X86_64_INTEGER_CLASS
6571 && (mode == CDImode || mode == TImode || mode == TFmode)
6572 && intreg[0] + 1 == intreg[1])
6573 return gen_rtx_REG (mode, intreg[0]);
6574
6575 /* Otherwise figure out the entries of the PARALLEL. */
6576 for (i = 0; i < n; i++)
6577 {
6578 int pos;
6579
6580 switch (regclass[i])
6581 {
6582 case X86_64_NO_CLASS:
6583 break;
6584 case X86_64_INTEGER_CLASS:
6585 case X86_64_INTEGERSI_CLASS:
6586 /* Merge TImodes on aligned occasions here too. */
6587 if (i * 8 + 8 > bytes)
6588 tmpmode
6589 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6590 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6591 tmpmode = SImode;
6592 else
6593 tmpmode = DImode;
6594 	  /* We've requested 24 bytes for which we
6595 	     don't have a mode.  Use DImode.  */
6596 if (tmpmode == BLKmode)
6597 tmpmode = DImode;
6598 exp [nexps++]
6599 = gen_rtx_EXPR_LIST (VOIDmode,
6600 gen_rtx_REG (tmpmode, *intreg),
6601 GEN_INT (i*8));
6602 intreg++;
6603 break;
6604 case X86_64_SSESF_CLASS:
6605 exp [nexps++]
6606 = gen_rtx_EXPR_LIST (VOIDmode,
6607 gen_rtx_REG (SFmode,
6608 SSE_REGNO (sse_regno)),
6609 GEN_INT (i*8));
6610 sse_regno++;
6611 break;
6612 case X86_64_SSEDF_CLASS:
6613 exp [nexps++]
6614 = gen_rtx_EXPR_LIST (VOIDmode,
6615 gen_rtx_REG (DFmode,
6616 SSE_REGNO (sse_regno)),
6617 GEN_INT (i*8));
6618 sse_regno++;
6619 break;
6620 case X86_64_SSE_CLASS:
6621 pos = i;
6622 switch (n)
6623 {
6624 case 1:
6625 tmpmode = DImode;
6626 break;
6627 case 2:
6628 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6629 {
6630 tmpmode = TImode;
6631 i++;
6632 }
6633 else
6634 tmpmode = DImode;
6635 break;
6636 case 4:
6637 gcc_assert (i == 0
6638 && regclass[1] == X86_64_SSEUP_CLASS
6639 && regclass[2] == X86_64_SSEUP_CLASS
6640 && regclass[3] == X86_64_SSEUP_CLASS);
6641 tmpmode = OImode;
6642 i += 3;
6643 break;
6644 default:
6645 gcc_unreachable ();
6646 }
6647 exp [nexps++]
6648 = gen_rtx_EXPR_LIST (VOIDmode,
6649 gen_rtx_REG (tmpmode,
6650 SSE_REGNO (sse_regno)),
6651 GEN_INT (pos*8));
6652 sse_regno++;
6653 break;
6654 default:
6655 gcc_unreachable ();
6656 }
6657 }
6658
6659 /* Empty aligned struct, union or class. */
6660 if (nexps == 0)
6661 return NULL;
6662
6663 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6664 for (i = 0; i < nexps; i++)
6665 XVECEXP (ret, 0, i) = exp [i];
6666 return ret;
6667 }
6668
6669 /* Update the data in CUM to advance over an argument of mode MODE
6670 and data type TYPE. (TYPE is null for libcalls where that information
6671 may not be available.) */
6672
6673 static void
6674 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6675 const_tree type, HOST_WIDE_INT bytes,
6676 HOST_WIDE_INT words)
6677 {
6678 switch (mode)
6679 {
6680 default:
6681 break;
6682
6683 case BLKmode:
6684 if (bytes < 0)
6685 break;
6686 /* FALLTHRU */
6687
6688 case DImode:
6689 case SImode:
6690 case HImode:
6691 case QImode:
6692 cum->words += words;
6693 cum->nregs -= words;
6694 cum->regno += words;
6695
6696 if (cum->nregs <= 0)
6697 {
6698 cum->nregs = 0;
6699 cum->regno = 0;
6700 }
6701 break;
6702
6703 case OImode:
6704 /* OImode shouldn't be used directly. */
6705 gcc_unreachable ();
6706
6707 case DFmode:
6708 if (cum->float_in_sse < 2)
6709 break;
6710 case SFmode:
6711 if (cum->float_in_sse < 1)
6712 break;
6713 /* FALLTHRU */
6714
6715 case V8SFmode:
6716 case V8SImode:
6717 case V32QImode:
6718 case V16HImode:
6719 case V4DFmode:
6720 case V4DImode:
6721 case TImode:
6722 case V16QImode:
6723 case V8HImode:
6724 case V4SImode:
6725 case V2DImode:
6726 case V4SFmode:
6727 case V2DFmode:
6728 if (!type || !AGGREGATE_TYPE_P (type))
6729 {
6730 cum->sse_words += words;
6731 cum->sse_nregs -= 1;
6732 cum->sse_regno += 1;
6733 if (cum->sse_nregs <= 0)
6734 {
6735 cum->sse_nregs = 0;
6736 cum->sse_regno = 0;
6737 }
6738 }
6739 break;
6740
6741 case V8QImode:
6742 case V4HImode:
6743 case V2SImode:
6744 case V2SFmode:
6745 case V1TImode:
6746 case V1DImode:
6747 if (!type || !AGGREGATE_TYPE_P (type))
6748 {
6749 cum->mmx_words += words;
6750 cum->mmx_nregs -= 1;
6751 cum->mmx_regno += 1;
6752 if (cum->mmx_nregs <= 0)
6753 {
6754 cum->mmx_nregs = 0;
6755 cum->mmx_regno = 0;
6756 }
6757 }
6758 break;
6759 }
6760 }
6761
6762 static void
6763 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6764 const_tree type, HOST_WIDE_INT words, bool named)
6765 {
6766 int int_nregs, sse_nregs;
6767
6768 /* Unnamed 256bit vector mode parameters are passed on stack. */
6769 if (!named && VALID_AVX256_REG_MODE (mode))
6770 return;
6771
6772 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6773 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6774 {
6775 cum->nregs -= int_nregs;
6776 cum->sse_nregs -= sse_nregs;
6777 cum->regno += int_nregs;
6778 cum->sse_regno += sse_nregs;
6779 }
6780 else
6781 {
6782 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6783 cum->words = (cum->words + align - 1) & ~(align - 1);
6784 cum->words += words;
6785 }
6786 }
6787
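/* A worked example of the rounding in function_arg_advance_64 above:
   for an argument that goes on the stack and needs 256-bit (32-byte)
   alignment, ALIGN is 256 / BITS_PER_WORD == 4 words, so a CUM->words
   of 3 is first rounded up to 4 before the argument's own word count
   is added.  The masking expression assumes ALIGN is a power of two,
   which argument boundaries always are.  */
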
6788 static void
6789 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6790 HOST_WIDE_INT words)
6791 {
6792 /* Otherwise, this should be passed indirectly.  */
6793 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6794
6795 cum->words += words;
6796 if (cum->nregs > 0)
6797 {
6798 cum->nregs -= 1;
6799 cum->regno += 1;
6800 }
6801 }
6802
6803 /* Update the data in CUM to advance over an argument of mode MODE and
6804 data type TYPE. (TYPE is null for libcalls where that information
6805 may not be available.) */
6806
6807 static void
6808 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6809 const_tree type, bool named)
6810 {
6811 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6812 HOST_WIDE_INT bytes, words;
6813
6814 if (mode == BLKmode)
6815 bytes = int_size_in_bytes (type);
6816 else
6817 bytes = GET_MODE_SIZE (mode);
6818 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6819
6820 if (type)
6821 mode = type_natural_mode (type, NULL);
6822
6823 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6824 function_arg_advance_ms_64 (cum, bytes, words);
6825 else if (TARGET_64BIT)
6826 function_arg_advance_64 (cum, mode, type, words, named);
6827 else
6828 function_arg_advance_32 (cum, mode, type, bytes, words);
6829 }
6830
6831 /* Define where to put the arguments to a function.
6832 Value is zero to push the argument on the stack,
6833 or a hard register in which to store the argument.
6834
6835 MODE is the argument's machine mode.
6836 TYPE is the data type of the argument (as a tree).
6837 This is null for libcalls where that information may
6838 not be available.
6839 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6840 the preceding args and about the function being called.
6841 NAMED is nonzero if this argument is a named parameter
6842 (otherwise it is an extra parameter matching an ellipsis). */
6843
6844 static rtx
6845 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6846 enum machine_mode orig_mode, const_tree type,
6847 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6848 {
6849 static bool warnedsse, warnedmmx;
6850
6851 /* Avoid the AL settings for the Unix64 ABI. */
6852 if (mode == VOIDmode)
6853 return constm1_rtx;
6854
6855 switch (mode)
6856 {
6857 default:
6858 break;
6859
6860 case BLKmode:
6861 if (bytes < 0)
6862 break;
6863 /* FALLTHRU */
6864 case DImode:
6865 case SImode:
6866 case HImode:
6867 case QImode:
6868 if (words <= cum->nregs)
6869 {
6870 int regno = cum->regno;
6871
6872 /* Fastcall allocates the first two DWORD (SImode) or
6873 smaller arguments to ECX and EDX if it isn't an
6874 aggregate type.  */
6875 if (cum->fastcall)
6876 {
6877 if (mode == BLKmode
6878 || mode == DImode
6879 || (type && AGGREGATE_TYPE_P (type)))
6880 break;
6881
6882 /* ECX, not EAX, is the first allocated register.  */
6883 if (regno == AX_REG)
6884 regno = CX_REG;
6885 }
6886 return gen_rtx_REG (mode, regno);
6887 }
6888 break;
6889
6890 case DFmode:
6891 if (cum->float_in_sse < 2)
6892 break;
6893 case SFmode:
6894 if (cum->float_in_sse < 1)
6895 break;
6896 /* FALLTHRU */
6897 case TImode:
6898 /* In 32bit, we pass TImode in xmm registers. */
6899 case V16QImode:
6900 case V8HImode:
6901 case V4SImode:
6902 case V2DImode:
6903 case V4SFmode:
6904 case V2DFmode:
6905 if (!type || !AGGREGATE_TYPE_P (type))
6906 {
6907 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6908 {
6909 warnedsse = true;
6910 warning (0, "SSE vector argument without SSE enabled "
6911 "changes the ABI");
6912 }
6913 if (cum->sse_nregs)
6914 return gen_reg_or_parallel (mode, orig_mode,
6915 cum->sse_regno + FIRST_SSE_REG);
6916 }
6917 break;
6918
6919 case OImode:
6920 /* OImode shouldn't be used directly. */
6921 gcc_unreachable ();
6922
6923 case V8SFmode:
6924 case V8SImode:
6925 case V32QImode:
6926 case V16HImode:
6927 case V4DFmode:
6928 case V4DImode:
6929 if (!type || !AGGREGATE_TYPE_P (type))
6930 {
6931 if (cum->sse_nregs)
6932 return gen_reg_or_parallel (mode, orig_mode,
6933 cum->sse_regno + FIRST_SSE_REG);
6934 }
6935 break;
6936
6937 case V8QImode:
6938 case V4HImode:
6939 case V2SImode:
6940 case V2SFmode:
6941 case V1TImode:
6942 case V1DImode:
6943 if (!type || !AGGREGATE_TYPE_P (type))
6944 {
6945 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6946 {
6947 warnedmmx = true;
6948 warning (0, "MMX vector argument without MMX enabled "
6949 "changes the ABI");
6950 }
6951 if (cum->mmx_nregs)
6952 return gen_reg_or_parallel (mode, orig_mode,
6953 cum->mmx_regno + FIRST_MMX_REG);
6954 }
6955 break;
6956 }
6957
6958 return NULL_RTX;
6959 }
6960
6961 static rtx
6962 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6963 enum machine_mode orig_mode, const_tree type, bool named)
6964 {
6965 /* Handle a hidden AL argument containing number of registers
6966 for varargs x86-64 functions. */
6967 if (mode == VOIDmode)
6968 return GEN_INT (cum->maybe_vaarg
6969 ? (cum->sse_nregs < 0
6970 ? X86_64_SSE_REGPARM_MAX
6971 : cum->sse_regno)
6972 : -1);
6973
6974 switch (mode)
6975 {
6976 default:
6977 break;
6978
6979 case V8SFmode:
6980 case V8SImode:
6981 case V32QImode:
6982 case V16HImode:
6983 case V4DFmode:
6984 case V4DImode:
6985 /* Unnamed 256bit vector mode parameters are passed on stack. */
6986 if (!named)
6987 return NULL;
6988 break;
6989 }
6990
6991 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6992 cum->sse_nregs,
6993 &x86_64_int_parameter_registers [cum->regno],
6994 cum->sse_regno);
6995 }
6996
6997 static rtx
6998 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6999 enum machine_mode orig_mode, bool named,
7000 HOST_WIDE_INT bytes)
7001 {
7002 unsigned int regno;
7003
7004 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7005 We use a value of -2 to specify that the current function call is MS ABI.  */
7006 if (mode == VOIDmode)
7007 return GEN_INT (-2);
7008
7009 /* If we've run out of registers, it goes on the stack. */
7010 if (cum->nregs == 0)
7011 return NULL_RTX;
7012
7013 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7014
7015 /* Only floating point modes are passed in anything but integer regs. */
7016 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7017 {
7018 if (named)
7019 regno = cum->regno + FIRST_SSE_REG;
7020 else
7021 {
7022 rtx t1, t2;
7023
7024 /* Unnamed floating parameters are passed in both the
7025 SSE and integer registers. */
7026 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7027 t2 = gen_rtx_REG (mode, regno);
7028 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7029 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7030 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7031 }
7032 }
7033 /* Handle aggregate types passed in a register.  */
7034 if (orig_mode == BLKmode)
7035 {
7036 if (bytes > 0 && bytes <= 8)
7037 mode = (bytes > 4 ? DImode : SImode);
7038 if (mode == BLKmode)
7039 mode = DImode;
7040 }
7041
7042 return gen_reg_or_parallel (mode, orig_mode, regno);
7043 }
7044
7045 /* Return where to put the arguments to a function.
7046 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7047
7048 MODE is the argument's machine mode. TYPE is the data type of the
7049 argument. It is null for libcalls where that information may not be
7050 available. CUM gives information about the preceding args and about
7051 the function being called. NAMED is nonzero if this argument is a
7052 named parameter (otherwise it is an extra parameter matching an
7053 ellipsis). */
7054
7055 static rtx
7056 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7057 const_tree type, bool named)
7058 {
7059 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7060 enum machine_mode mode = omode;
7061 HOST_WIDE_INT bytes, words;
7062 rtx arg;
7063
7064 if (mode == BLKmode)
7065 bytes = int_size_in_bytes (type);
7066 else
7067 bytes = GET_MODE_SIZE (mode);
7068 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7069
7070 /* To simplify the code below, represent vector types with a vector mode
7071 even if MMX/SSE are not active. */
7072 if (type && TREE_CODE (type) == VECTOR_TYPE)
7073 mode = type_natural_mode (type, cum);
7074
7075 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7076 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7077 else if (TARGET_64BIT)
7078 arg = function_arg_64 (cum, mode, omode, type, named);
7079 else
7080 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7081
7082 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
7083 {
7084 /* This argument uses 256bit AVX modes. */
7085 if (cum->caller)
7086 cfun->machine->callee_pass_avx256_p = true;
7087 else
7088 cfun->machine->caller_pass_avx256_p = true;
7089 }
7090
7091 return arg;
7092 }
7093
7094 /* A C expression that indicates when an argument must be passed by
7095 reference. If nonzero for an argument, a copy of that argument is
7096 made in memory and a pointer to the argument is passed instead of
7097 the argument itself. The pointer is passed in whatever way is
7098 appropriate for passing a pointer to that type. */
7099
7100 static bool
7101 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7102 enum machine_mode mode ATTRIBUTE_UNUSED,
7103 const_tree type, bool named ATTRIBUTE_UNUSED)
7104 {
7105 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7106
7107 /* See Windows x64 Software Convention. */
7108 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7109 {
7110 int msize = (int) GET_MODE_SIZE (mode);
7111 if (type)
7112 {
7113 /* Arrays are passed by reference. */
7114 if (TREE_CODE (type) == ARRAY_TYPE)
7115 return true;
7116
7117 if (AGGREGATE_TYPE_P (type))
7118 {
7119 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7120 are passed by reference. */
7121 msize = int_size_in_bytes (type);
7122 }
7123 }
7124
7125 /* __m128 is passed by reference. */
7126 switch (msize) {
7127 case 1: case 2: case 4: case 8:
7128 break;
7129 default:
7130 return true;
7131 }
7132 }
7133 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7134 return 1;
7135
7136 return 0;
7137 }
7138
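/* To illustrate the MS ABI rule in ix86_pass_by_reference above: an
   8-byte struct or a plain int is passed by value in a single integer
   register, while a 12-byte struct, a 16-byte __m128 or any array is
   passed by reference, i.e. the caller makes a copy and passes its
   address instead.  */
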
7139 /* Return true when TYPE should be 128bit aligned for 32bit argument
7140 passing ABI. XXX: This function is obsolete and is only used for
7141 checking psABI compatibility with previous versions of GCC. */
7142
7143 static bool
7144 ix86_compat_aligned_value_p (const_tree type)
7145 {
7146 enum machine_mode mode = TYPE_MODE (type);
7147 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7148 || mode == TDmode
7149 || mode == TFmode
7150 || mode == TCmode)
7151 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7152 return true;
7153 if (TYPE_ALIGN (type) < 128)
7154 return false;
7155
7156 if (AGGREGATE_TYPE_P (type))
7157 {
7158 /* Walk the aggregates recursively. */
7159 switch (TREE_CODE (type))
7160 {
7161 case RECORD_TYPE:
7162 case UNION_TYPE:
7163 case QUAL_UNION_TYPE:
7164 {
7165 tree field;
7166
7167 /* Walk all the structure fields. */
7168 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7169 {
7170 if (TREE_CODE (field) == FIELD_DECL
7171 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7172 return true;
7173 }
7174 break;
7175 }
7176
7177 case ARRAY_TYPE:
7178 /* Just in case some languages pass arrays by value.  */
7179 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7180 return true;
7181 break;
7182
7183 default:
7184 gcc_unreachable ();
7185 }
7186 }
7187 return false;
7188 }
7189
7190 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7191 XXX: This function is obsolete and is only used for checking psABI
7192 compatibility with previous versions of GCC. */
7193
7194 static unsigned int
7195 ix86_compat_function_arg_boundary (enum machine_mode mode,
7196 const_tree type, unsigned int align)
7197 {
7198 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7199 natural boundaries. */
7200 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7201 {
7202 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7203 make an exception for SSE modes since these require 128bit
7204 alignment.
7205
7206 The handling here differs from field_alignment. ICC aligns MMX
7207 arguments to 4 byte boundaries, while structure fields are aligned
7208 to 8 byte boundaries. */
7209 if (!type)
7210 {
7211 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7212 align = PARM_BOUNDARY;
7213 }
7214 else
7215 {
7216 if (!ix86_compat_aligned_value_p (type))
7217 align = PARM_BOUNDARY;
7218 }
7219 }
7220 if (align > BIGGEST_ALIGNMENT)
7221 align = BIGGEST_ALIGNMENT;
7222 return align;
7223 }
7224
7225 /* Return true when TYPE should be 128bit aligned for 32bit argument
7226 passing ABI. */
7227
7228 static bool
7229 ix86_contains_aligned_value_p (const_tree type)
7230 {
7231 enum machine_mode mode = TYPE_MODE (type);
7232
7233 if (mode == XFmode || mode == XCmode)
7234 return false;
7235
7236 if (TYPE_ALIGN (type) < 128)
7237 return false;
7238
7239 if (AGGREGATE_TYPE_P (type))
7240 {
7241 /* Walk the aggregates recursively. */
7242 switch (TREE_CODE (type))
7243 {
7244 case RECORD_TYPE:
7245 case UNION_TYPE:
7246 case QUAL_UNION_TYPE:
7247 {
7248 tree field;
7249
7250 /* Walk all the structure fields. */
7251 for (field = TYPE_FIELDS (type);
7252 field;
7253 field = DECL_CHAIN (field))
7254 {
7255 if (TREE_CODE (field) == FIELD_DECL
7256 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7257 return true;
7258 }
7259 break;
7260 }
7261
7262 case ARRAY_TYPE:
7263 /* Just in case some languages pass arrays by value.  */
7264 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7265 return true;
7266 break;
7267
7268 default:
7269 gcc_unreachable ();
7270 }
7271 }
7272 else
7273 return TYPE_ALIGN (type) >= 128;
7274
7275 return false;
7276 }
7277
7278 /* Gives the alignment boundary, in bits, of an argument with the
7279 specified mode and type. */
7280
7281 static unsigned int
7282 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7283 {
7284 unsigned int align;
7285 if (type)
7286 {
7287 /* Since the main variant type is used for the call, convert the
7288 type to its main variant.  */
7289 type = TYPE_MAIN_VARIANT (type);
7290 align = TYPE_ALIGN (type);
7291 }
7292 else
7293 align = GET_MODE_ALIGNMENT (mode);
7294 if (align < PARM_BOUNDARY)
7295 align = PARM_BOUNDARY;
7296 else
7297 {
7298 static bool warned;
7299 unsigned int saved_align = align;
7300
7301 if (!TARGET_64BIT)
7302 {
7303 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7304 if (!type)
7305 {
7306 if (mode == XFmode || mode == XCmode)
7307 align = PARM_BOUNDARY;
7308 }
7309 else if (!ix86_contains_aligned_value_p (type))
7310 align = PARM_BOUNDARY;
7311
7312 if (align < 128)
7313 align = PARM_BOUNDARY;
7314 }
7315
7316 if (warn_psabi
7317 && !warned
7318 && align != ix86_compat_function_arg_boundary (mode, type,
7319 saved_align))
7320 {
7321 warned = true;
7322 inform (input_location,
7323 "The ABI for passing parameters with %d-byte"
7324 " alignment has changed in GCC 4.6",
7325 align / BITS_PER_UNIT);
7326 }
7327 }
7328
7329 return align;
7330 }
7331
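/* For example, with the 32-bit rules in ix86_function_arg_boundary
   above, an SSE vector argument such as __m128 keeps its 128-bit
   boundary, whereas a long double (XFmode) or any type aligned below
   128 bits falls back to PARM_BOUNDARY.  */
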
7332 /* Return true if N is a possible register number of function value. */
7333
7334 static bool
7335 ix86_function_value_regno_p (const unsigned int regno)
7336 {
7337 switch (regno)
7338 {
7339 case AX_REG:
7340 return true;
7341
7342 case FIRST_FLOAT_REG:
7343 /* TODO: The function should depend on current function ABI but
7344 builtins.c would need updating then. Therefore we use the
7345 default ABI. */
7346 if (TARGET_64BIT && ix86_abi == MS_ABI)
7347 return false;
7348 return TARGET_FLOAT_RETURNS_IN_80387;
7349
7350 case FIRST_SSE_REG:
7351 return TARGET_SSE;
7352
7353 case FIRST_MMX_REG:
7354 if (TARGET_MACHO || TARGET_64BIT)
7355 return false;
7356 return TARGET_MMX;
7357 }
7358
7359 return false;
7360 }
7361
7362 /* Define how to find the value returned by a function.
7363 VALTYPE is the data type of the value (as a tree).
7364 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7365 otherwise, FUNC is 0. */
7366
7367 static rtx
7368 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7369 const_tree fntype, const_tree fn)
7370 {
7371 unsigned int regno;
7372
7373 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7374 we normally prevent this case when mmx is not available. However
7375 some ABIs may require the result to be returned like DImode. */
7376 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7377 regno = FIRST_MMX_REG;
7378
7379 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7380 we prevent this case when sse is not available. However some ABIs
7381 may require the result to be returned like integer TImode. */
7382 else if (mode == TImode
7383 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7384 regno = FIRST_SSE_REG;
7385
7386 /* 32-byte vector modes in %ymm0. */
7387 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7388 regno = FIRST_SSE_REG;
7389
7390 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7391 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7392 regno = FIRST_FLOAT_REG;
7393 else
7394 /* Most things go in %eax. */
7395 regno = AX_REG;
7396
7397 /* Override FP return register with %xmm0 for local functions when
7398 SSE math is enabled or for functions with sseregparm attribute. */
7399 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7400 {
7401 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7402 if ((sse_level >= 1 && mode == SFmode)
7403 || (sse_level == 2 && mode == DFmode))
7404 regno = FIRST_SSE_REG;
7405 }
7406
7407 /* OImode shouldn't be used directly. */
7408 gcc_assert (mode != OImode);
7409
7410 return gen_rtx_REG (orig_mode, regno);
7411 }
7412
7413 static rtx
7414 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7415 const_tree valtype)
7416 {
7417 rtx ret;
7418
7419 /* Handle libcalls, which don't provide a type node. */
7420 if (valtype == NULL)
7421 {
7422 unsigned int regno;
7423
7424 switch (mode)
7425 {
7426 case SFmode:
7427 case SCmode:
7428 case DFmode:
7429 case DCmode:
7430 case TFmode:
7431 case SDmode:
7432 case DDmode:
7433 case TDmode:
7434 regno = FIRST_SSE_REG;
7435 break;
7436 case XFmode:
7437 case XCmode:
7438 regno = FIRST_FLOAT_REG;
7439 break;
7440 case TCmode:
7441 return NULL;
7442 default:
7443 regno = AX_REG;
7444 }
7445
7446 return gen_rtx_REG (mode, regno);
7447 }
7448 else if (POINTER_TYPE_P (valtype))
7449 {
7450 /* Pointers are always returned in word_mode. */
7451 mode = word_mode;
7452 }
7453
7454 ret = construct_container (mode, orig_mode, valtype, 1,
7455 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7456 x86_64_int_return_registers, 0);
7457
7458 /* For zero-sized structures, construct_container returns NULL, but we
7459 need to keep the rest of the compiler happy by returning a meaningful value.  */
7460 if (!ret)
7461 ret = gen_rtx_REG (orig_mode, AX_REG);
7462
7463 return ret;
7464 }
7465
7466 static rtx
7467 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7468 {
7469 unsigned int regno = AX_REG;
7470
7471 if (TARGET_SSE)
7472 {
7473 switch (GET_MODE_SIZE (mode))
7474 {
7475 case 16:
7476 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7477 && !COMPLEX_MODE_P (mode))
7478 regno = FIRST_SSE_REG;
7479 break;
7480 case 8:
7481 case 4:
7482 if (mode == SFmode || mode == DFmode)
7483 regno = FIRST_SSE_REG;
7484 break;
7485 default:
7486 break;
7487 }
7488 }
7489 return gen_rtx_REG (orig_mode, regno);
7490 }
7491
7492 static rtx
7493 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7494 enum machine_mode orig_mode, enum machine_mode mode)
7495 {
7496 const_tree fn, fntype;
7497
7498 fn = NULL_TREE;
7499 if (fntype_or_decl && DECL_P (fntype_or_decl))
7500 fn = fntype_or_decl;
7501 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7502
7503 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7504 return function_value_ms_64 (orig_mode, mode);
7505 else if (TARGET_64BIT)
7506 return function_value_64 (orig_mode, mode, valtype);
7507 else
7508 return function_value_32 (orig_mode, mode, fntype, fn);
7509 }
7510
7511 static rtx
7512 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7513 bool outgoing ATTRIBUTE_UNUSED)
7514 {
7515 enum machine_mode mode, orig_mode;
7516
7517 orig_mode = TYPE_MODE (valtype);
7518 mode = type_natural_mode (valtype, NULL);
7519 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7520 }
7521
7522 /* Pointer function arguments and return values are promoted to
7523 word_mode. */
7524
7525 static enum machine_mode
7526 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7527 int *punsignedp, const_tree fntype,
7528 int for_return)
7529 {
7530 if (type != NULL_TREE && POINTER_TYPE_P (type))
7531 {
7532 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7533 return word_mode;
7534 }
7535 return default_promote_function_mode (type, mode, punsignedp, fntype,
7536 for_return);
7537 }
7538
7539 rtx
7540 ix86_libcall_value (enum machine_mode mode)
7541 {
7542 return ix86_function_value_1 (NULL, NULL, mode, mode);
7543 }
7544
7545 /* Return true iff type is returned in memory. */
7546
7547 static bool ATTRIBUTE_UNUSED
7548 return_in_memory_32 (const_tree type, enum machine_mode mode)
7549 {
7550 HOST_WIDE_INT size;
7551
7552 if (mode == BLKmode)
7553 return true;
7554
7555 size = int_size_in_bytes (type);
7556
7557 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7558 return false;
7559
7560 if (VECTOR_MODE_P (mode) || mode == TImode)
7561 {
7562 /* User-created vectors small enough to fit in EAX. */
7563 if (size < 8)
7564 return false;
7565
7566 /* MMX/3dNow values are returned in MM0,
7567 except when it doesn't exist or the ABI prescribes otherwise.  */
7568 if (size == 8)
7569 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7570
7571 /* SSE values are returned in XMM0, except when it doesn't exist. */
7572 if (size == 16)
7573 return !TARGET_SSE;
7574
7575 /* AVX values are returned in YMM0, except when it doesn't exist. */
7576 if (size == 32)
7577 return !TARGET_AVX;
7578 }
7579
7580 if (mode == XFmode)
7581 return false;
7582
7583 if (size > 12)
7584 return true;
7585
7586 /* OImode shouldn't be used directly. */
7587 gcc_assert (mode != OImode);
7588
7589 return false;
7590 }
7591
7592 static bool ATTRIBUTE_UNUSED
7593 return_in_memory_64 (const_tree type, enum machine_mode mode)
7594 {
7595 int needed_intregs, needed_sseregs;
7596 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7597 }
7598
7599 static bool ATTRIBUTE_UNUSED
7600 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7601 {
7602 HOST_WIDE_INT size = int_size_in_bytes (type);
7603
7604 /* __m128 is returned in xmm0. */
7605 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7606 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7607 return false;
7608
7609 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7610 return size != 1 && size != 2 && size != 4 && size != 8;
7611 }
7612
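/* Concretely, under the MS ABI rule in return_in_memory_ms_64 above,
   an __m128 value or an 8-byte struct is returned in a register (XMM0
   and RAX respectively, see function_value_ms_64), while a 12-byte
   struct is returned in memory through a hidden pointer.  */
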
7613 static bool
7614 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7615 {
7616 #ifdef SUBTARGET_RETURN_IN_MEMORY
7617 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7618 #else
7619 const enum machine_mode mode = type_natural_mode (type, NULL);
7620
7621 if (TARGET_64BIT)
7622 {
7623 if (ix86_function_type_abi (fntype) == MS_ABI)
7624 return return_in_memory_ms_64 (type, mode);
7625 else
7626 return return_in_memory_64 (type, mode);
7627 }
7628 else
7629 return return_in_memory_32 (type, mode);
7630 #endif
7631 }
7632
7633 /* When returning SSE vector types, we have a choice of either
7634 (1) being abi incompatible with a -march switch, or
7635 (2) generating an error.
7636 Given no good solution, I think the safest thing is one warning.
7637 The user won't be able to use -Werror, but....
7638
7639 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7640 called in response to actually generating a caller or callee that
7641 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7642 via aggregate_value_p for general type probing from tree-ssa. */
7643
7644 static rtx
7645 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7646 {
7647 static bool warnedsse, warnedmmx;
7648
7649 if (!TARGET_64BIT && type)
7650 {
7651 /* Look at the return type of the function, not the function type. */
7652 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7653
7654 if (!TARGET_SSE && !warnedsse)
7655 {
7656 if (mode == TImode
7657 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7658 {
7659 warnedsse = true;
7660 warning (0, "SSE vector return without SSE enabled "
7661 "changes the ABI");
7662 }
7663 }
7664
7665 if (!TARGET_MMX && !warnedmmx)
7666 {
7667 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7668 {
7669 warnedmmx = true;
7670 warning (0, "MMX vector return without MMX enabled "
7671 "changes the ABI");
7672 }
7673 }
7674 }
7675
7676 return NULL;
7677 }
7678
7679 \f
7680 /* Create the va_list data type. */
7681
7682 /* Returns the calling convention specific va_list data type.
7683 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7684
7685 static tree
7686 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7687 {
7688 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7689
7690 /* For i386 we use plain pointer to argument area. */
7691 if (!TARGET_64BIT || abi == MS_ABI)
7692 return build_pointer_type (char_type_node);
7693
7694 record = lang_hooks.types.make_type (RECORD_TYPE);
7695 type_decl = build_decl (BUILTINS_LOCATION,
7696 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7697
7698 f_gpr = build_decl (BUILTINS_LOCATION,
7699 FIELD_DECL, get_identifier ("gp_offset"),
7700 unsigned_type_node);
7701 f_fpr = build_decl (BUILTINS_LOCATION,
7702 FIELD_DECL, get_identifier ("fp_offset"),
7703 unsigned_type_node);
7704 f_ovf = build_decl (BUILTINS_LOCATION,
7705 FIELD_DECL, get_identifier ("overflow_arg_area"),
7706 ptr_type_node);
7707 f_sav = build_decl (BUILTINS_LOCATION,
7708 FIELD_DECL, get_identifier ("reg_save_area"),
7709 ptr_type_node);
7710
7711 va_list_gpr_counter_field = f_gpr;
7712 va_list_fpr_counter_field = f_fpr;
7713
7714 DECL_FIELD_CONTEXT (f_gpr) = record;
7715 DECL_FIELD_CONTEXT (f_fpr) = record;
7716 DECL_FIELD_CONTEXT (f_ovf) = record;
7717 DECL_FIELD_CONTEXT (f_sav) = record;
7718
7719 TYPE_STUB_DECL (record) = type_decl;
7720 TYPE_NAME (record) = type_decl;
7721 TYPE_FIELDS (record) = f_gpr;
7722 DECL_CHAIN (f_gpr) = f_fpr;
7723 DECL_CHAIN (f_fpr) = f_ovf;
7724 DECL_CHAIN (f_ovf) = f_sav;
7725
7726 layout_type (record);
7727
7728 /* The correct type is an array type of one element. */
7729 return build_array_type (record, build_index_type (size_zero_node));
7730 }
7731
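/* The record built by ix86_build_builtin_va_list_abi above corresponds
   to the familiar SysV x86-64 va_list layout, roughly:

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;

   exposed to the user as a one-element array of this record.  */
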
7732 /* Setup the builtin va_list data type and for 64-bit the additional
7733 calling convention specific va_list data types. */
7734
7735 static tree
7736 ix86_build_builtin_va_list (void)
7737 {
7738 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7739
7740 /* Initialize abi specific va_list builtin types. */
7741 if (TARGET_64BIT)
7742 {
7743 tree t;
7744 if (ix86_abi == MS_ABI)
7745 {
7746 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7747 if (TREE_CODE (t) != RECORD_TYPE)
7748 t = build_variant_type_copy (t);
7749 sysv_va_list_type_node = t;
7750 }
7751 else
7752 {
7753 t = ret;
7754 if (TREE_CODE (t) != RECORD_TYPE)
7755 t = build_variant_type_copy (t);
7756 sysv_va_list_type_node = t;
7757 }
7758 if (ix86_abi != MS_ABI)
7759 {
7760 t = ix86_build_builtin_va_list_abi (MS_ABI);
7761 if (TREE_CODE (t) != RECORD_TYPE)
7762 t = build_variant_type_copy (t);
7763 ms_va_list_type_node = t;
7764 }
7765 else
7766 {
7767 t = ret;
7768 if (TREE_CODE (t) != RECORD_TYPE)
7769 t = build_variant_type_copy (t);
7770 ms_va_list_type_node = t;
7771 }
7772 }
7773
7774 return ret;
7775 }
7776
7777 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7778
7779 static void
7780 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7781 {
7782 rtx save_area, mem;
7783 alias_set_type set;
7784 int i, max;
7785
7786 /* GPR size of varargs save area. */
7787 if (cfun->va_list_gpr_size)
7788 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7789 else
7790 ix86_varargs_gpr_size = 0;
7791
7792 /* FPR size of varargs save area. We don't need it if we don't pass
7793 anything in SSE registers. */
7794 if (TARGET_SSE && cfun->va_list_fpr_size)
7795 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7796 else
7797 ix86_varargs_fpr_size = 0;
7798
7799 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7800 return;
7801
7802 save_area = frame_pointer_rtx;
7803 set = get_varargs_alias_set ();
7804
7805 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7806 if (max > X86_64_REGPARM_MAX)
7807 max = X86_64_REGPARM_MAX;
7808
7809 for (i = cum->regno; i < max; i++)
7810 {
7811 mem = gen_rtx_MEM (word_mode,
7812 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7813 MEM_NOTRAP_P (mem) = 1;
7814 set_mem_alias_set (mem, set);
7815 emit_move_insn (mem,
7816 gen_rtx_REG (word_mode,
7817 x86_64_int_parameter_registers[i]));
7818 }
7819
7820 if (ix86_varargs_fpr_size)
7821 {
7822 enum machine_mode smode;
7823 rtx label, test;
7824
7825 /* Now emit code to save SSE registers.  The AX parameter contains the number
7826 of SSE parameter registers used to call this function, though all we
7827 actually check here is the zero/non-zero status. */
7828
7829 label = gen_label_rtx ();
7830 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7831 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7832 label));
7833
7834 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7835 we used movdqa (i.e. TImode) instead? Perhaps even better would
7836 be if we could determine the real mode of the data, via a hook
7837 into pass_stdarg. Ignore all that for now. */
7838 smode = V4SFmode;
7839 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7840 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7841
7842 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7843 if (max > X86_64_SSE_REGPARM_MAX)
7844 max = X86_64_SSE_REGPARM_MAX;
7845
7846 for (i = cum->sse_regno; i < max; ++i)
7847 {
7848 mem = plus_constant (Pmode, save_area,
7849 i * 16 + ix86_varargs_gpr_size);
7850 mem = gen_rtx_MEM (smode, mem);
7851 MEM_NOTRAP_P (mem) = 1;
7852 set_mem_alias_set (mem, set);
7853 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7854
7855 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7856 }
7857
7858 emit_label (label);
7859 }
7860 }
7861
7862 static void
7863 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7864 {
7865 alias_set_type set = get_varargs_alias_set ();
7866 int i;
7867
7868 /* Reset to zero, as a SysV va_arg might have been used
7869 before.  */
7870 ix86_varargs_gpr_size = 0;
7871 ix86_varargs_fpr_size = 0;
7872
7873 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7874 {
7875 rtx reg, mem;
7876
7877 mem = gen_rtx_MEM (Pmode,
7878 plus_constant (Pmode, virtual_incoming_args_rtx,
7879 i * UNITS_PER_WORD));
7880 MEM_NOTRAP_P (mem) = 1;
7881 set_mem_alias_set (mem, set);
7882
7883 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7884 emit_move_insn (mem, reg);
7885 }
7886 }
7887
7888 static void
7889 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7890 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7891 int no_rtl)
7892 {
7893 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7894 CUMULATIVE_ARGS next_cum;
7895 tree fntype;
7896
7897 /* This argument doesn't appear to be used anymore. Which is good,
7898 because the old code here didn't suppress rtl generation. */
7899 gcc_assert (!no_rtl);
7900
7901 if (!TARGET_64BIT)
7902 return;
7903
7904 fntype = TREE_TYPE (current_function_decl);
7905
7906 /* For varargs, we do not want to skip the dummy va_dcl argument.
7907 For stdargs, we do want to skip the last named argument. */
7908 next_cum = *cum;
7909 if (stdarg_p (fntype))
7910 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7911 true);
7912
7913 if (cum->call_abi == MS_ABI)
7914 setup_incoming_varargs_ms_64 (&next_cum);
7915 else
7916 setup_incoming_varargs_64 (&next_cum);
7917 }
7918
7919 /* Checks if TYPE is of kind va_list char *. */
7920
7921 static bool
7922 is_va_list_char_pointer (tree type)
7923 {
7924 tree canonic;
7925
7926 /* For 32-bit it is always true. */
7927 if (!TARGET_64BIT)
7928 return true;
7929 canonic = ix86_canonical_va_list_type (type);
7930 return (canonic == ms_va_list_type_node
7931 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7932 }
7933
7934 /* Implement va_start. */
7935
7936 static void
7937 ix86_va_start (tree valist, rtx nextarg)
7938 {
7939 HOST_WIDE_INT words, n_gpr, n_fpr;
7940 tree f_gpr, f_fpr, f_ovf, f_sav;
7941 tree gpr, fpr, ovf, sav, t;
7942 tree type;
7943 rtx ovf_rtx;
7944
7945 if (flag_split_stack
7946 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7947 {
7948 unsigned int scratch_regno;
7949
7950 /* When we are splitting the stack, we can't refer to the stack
7951 arguments using internal_arg_pointer, because they may be on
7952 the old stack. The split stack prologue will arrange to
7953 leave a pointer to the old stack arguments in a scratch
7954 register, which we here copy to a pseudo-register. The split
7955 stack prologue can't set the pseudo-register directly because
7956 it (the prologue) runs before any registers have been saved. */
7957
7958 scratch_regno = split_stack_prologue_scratch_regno ();
7959 if (scratch_regno != INVALID_REGNUM)
7960 {
7961 rtx reg, seq;
7962
7963 reg = gen_reg_rtx (Pmode);
7964 cfun->machine->split_stack_varargs_pointer = reg;
7965
7966 start_sequence ();
7967 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7968 seq = get_insns ();
7969 end_sequence ();
7970
7971 push_topmost_sequence ();
7972 emit_insn_after (seq, entry_of_function ());
7973 pop_topmost_sequence ();
7974 }
7975 }
7976
7977 /* Only 64bit target needs something special. */
7978 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7979 {
7980 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7981 std_expand_builtin_va_start (valist, nextarg);
7982 else
7983 {
7984 rtx va_r, next;
7985
7986 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7987 next = expand_binop (ptr_mode, add_optab,
7988 cfun->machine->split_stack_varargs_pointer,
7989 crtl->args.arg_offset_rtx,
7990 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7991 convert_move (va_r, next, 0);
7992 }
7993 return;
7994 }
7995
7996 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7997 f_fpr = DECL_CHAIN (f_gpr);
7998 f_ovf = DECL_CHAIN (f_fpr);
7999 f_sav = DECL_CHAIN (f_ovf);
8000
8001 valist = build_simple_mem_ref (valist);
8002 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8003 /* The following should be folded into the MEM_REF offset. */
8004 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8005 f_gpr, NULL_TREE);
8006 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8007 f_fpr, NULL_TREE);
8008 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8009 f_ovf, NULL_TREE);
8010 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8011 f_sav, NULL_TREE);
8012
8013 /* Count number of gp and fp argument registers used. */
8014 words = crtl->args.info.words;
8015 n_gpr = crtl->args.info.regno;
8016 n_fpr = crtl->args.info.sse_regno;
8017
8018 if (cfun->va_list_gpr_size)
8019 {
8020 type = TREE_TYPE (gpr);
8021 t = build2 (MODIFY_EXPR, type,
8022 gpr, build_int_cst (type, n_gpr * 8));
8023 TREE_SIDE_EFFECTS (t) = 1;
8024 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8025 }
8026
8027 if (TARGET_SSE && cfun->va_list_fpr_size)
8028 {
8029 type = TREE_TYPE (fpr);
8030 t = build2 (MODIFY_EXPR, type, fpr,
8031 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8032 TREE_SIDE_EFFECTS (t) = 1;
8033 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8034 }
8035
8036 /* Find the overflow area. */
8037 type = TREE_TYPE (ovf);
8038 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8039 ovf_rtx = crtl->args.internal_arg_pointer;
8040 else
8041 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8042 t = make_tree (type, ovf_rtx);
8043 if (words != 0)
8044 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8045 t = build2 (MODIFY_EXPR, type, ovf, t);
8046 TREE_SIDE_EFFECTS (t) = 1;
8047 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8048
8049 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8050 {
8051 /* Find the register save area.
8052 The function prologue saves it right above the stack frame.  */
8053 type = TREE_TYPE (sav);
8054 t = make_tree (type, frame_pointer_rtx);
8055 if (!ix86_varargs_gpr_size)
8056 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8057 t = build2 (MODIFY_EXPR, type, sav, t);
8058 TREE_SIDE_EFFECTS (t) = 1;
8059 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8060 }
8061 }
8062
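/* A sketch of the register save area that the va_start expansion above
   points into, assuming the default SysV limits of X86_64_REGPARM_MAX
   == 6 integer and X86_64_SSE_REGPARM_MAX == 8 SSE registers:

     reg_save_area + 0   .. + 47    the six integer argument registers,
                                    8 bytes each (indexed by gp_offset)
     reg_save_area + 48  .. + 175   xmm0-xmm7 in 16-byte slots
                                    (indexed by fp_offset)

   gp_offset starts at 8 * <named GP regs used> and fp_offset at
   48 + 16 * <named SSE regs used>, matching setup_incoming_varargs_64
   and the assignments above.  */
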
8063 /* Implement va_arg. */
8064
8065 static tree
8066 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8067 gimple_seq *post_p)
8068 {
8069 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8070 tree f_gpr, f_fpr, f_ovf, f_sav;
8071 tree gpr, fpr, ovf, sav, t;
8072 int size, rsize;
8073 tree lab_false, lab_over = NULL_TREE;
8074 tree addr, t2;
8075 rtx container;
8076 int indirect_p = 0;
8077 tree ptrtype;
8078 enum machine_mode nat_mode;
8079 unsigned int arg_boundary;
8080
8081 /* Only 64bit target needs something special. */
8082 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8083 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8084
8085 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8086 f_fpr = DECL_CHAIN (f_gpr);
8087 f_ovf = DECL_CHAIN (f_fpr);
8088 f_sav = DECL_CHAIN (f_ovf);
8089
8090 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8091 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8092 valist = build_va_arg_indirect_ref (valist);
8093 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8094 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8095 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8096
8097 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8098 if (indirect_p)
8099 type = build_pointer_type (type);
8100 size = int_size_in_bytes (type);
8101 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8102
8103 nat_mode = type_natural_mode (type, NULL);
8104 switch (nat_mode)
8105 {
8106 case V8SFmode:
8107 case V8SImode:
8108 case V32QImode:
8109 case V16HImode:
8110 case V4DFmode:
8111 case V4DImode:
8112 /* Unnamed 256bit vector mode parameters are passed on stack. */
8113 if (!TARGET_64BIT_MS_ABI)
8114 {
8115 container = NULL;
8116 break;
8117 }
8118
8119 default:
8120 container = construct_container (nat_mode, TYPE_MODE (type),
8121 type, 0, X86_64_REGPARM_MAX,
8122 X86_64_SSE_REGPARM_MAX, intreg,
8123 0);
8124 break;
8125 }
8126
8127 /* Pull the value out of the saved registers. */
8128
8129 addr = create_tmp_var (ptr_type_node, "addr");
8130
8131 if (container)
8132 {
8133 int needed_intregs, needed_sseregs;
8134 bool need_temp;
8135 tree int_addr, sse_addr;
8136
8137 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8138 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8139
8140 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8141
8142 need_temp = (!REG_P (container)
8143 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8144 || TYPE_ALIGN (type) > 128));
8145
8146 /* In case we are passing a structure, verify that it is a consecutive block
8147 in the register save area.  If not, we need to do moves.  */
8148 if (!need_temp && !REG_P (container))
8149 {
8150 /* Verify that all registers are strictly consecutive.  */
8151 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8152 {
8153 int i;
8154
8155 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8156 {
8157 rtx slot = XVECEXP (container, 0, i);
8158 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8159 || INTVAL (XEXP (slot, 1)) != i * 16)
8160 need_temp = 1;
8161 }
8162 }
8163 else
8164 {
8165 int i;
8166
8167 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8168 {
8169 rtx slot = XVECEXP (container, 0, i);
8170 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8171 || INTVAL (XEXP (slot, 1)) != i * 8)
8172 need_temp = 1;
8173 }
8174 }
8175 }
8176 if (!need_temp)
8177 {
8178 int_addr = addr;
8179 sse_addr = addr;
8180 }
8181 else
8182 {
8183 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8184 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8185 }
8186
8187 /* First ensure that we fit completely in registers. */
8188 if (needed_intregs)
8189 {
8190 t = build_int_cst (TREE_TYPE (gpr),
8191 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8192 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8193 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8194 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8195 gimplify_and_add (t, pre_p);
8196 }
8197 if (needed_sseregs)
8198 {
8199 t = build_int_cst (TREE_TYPE (fpr),
8200 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8201 + X86_64_REGPARM_MAX * 8);
8202 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8203 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8204 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8205 gimplify_and_add (t, pre_p);
8206 }
8207
8208 /* Compute index to start of area used for integer regs. */
8209 if (needed_intregs)
8210 {
8211 /* int_addr = gpr + sav; */
8212 t = fold_build_pointer_plus (sav, gpr);
8213 gimplify_assign (int_addr, t, pre_p);
8214 }
8215 if (needed_sseregs)
8216 {
8217 /* sse_addr = fpr + sav; */
8218 t = fold_build_pointer_plus (sav, fpr);
8219 gimplify_assign (sse_addr, t, pre_p);
8220 }
8221 if (need_temp)
8222 {
8223 int i, prev_size = 0;
8224 tree temp = create_tmp_var (type, "va_arg_tmp");
8225
8226 /* addr = &temp; */
8227 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8228 gimplify_assign (addr, t, pre_p);
8229
8230 for (i = 0; i < XVECLEN (container, 0); i++)
8231 {
8232 rtx slot = XVECEXP (container, 0, i);
8233 rtx reg = XEXP (slot, 0);
8234 enum machine_mode mode = GET_MODE (reg);
8235 tree piece_type;
8236 tree addr_type;
8237 tree daddr_type;
8238 tree src_addr, src;
8239 int src_offset;
8240 tree dest_addr, dest;
8241 int cur_size = GET_MODE_SIZE (mode);
8242
8243 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8244 prev_size = INTVAL (XEXP (slot, 1));
8245 if (prev_size + cur_size > size)
8246 {
8247 cur_size = size - prev_size;
8248 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8249 if (mode == BLKmode)
8250 mode = QImode;
8251 }
8252 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8253 if (mode == GET_MODE (reg))
8254 addr_type = build_pointer_type (piece_type);
8255 else
8256 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8257 true);
8258 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8259 true);
8260
8261 if (SSE_REGNO_P (REGNO (reg)))
8262 {
8263 src_addr = sse_addr;
8264 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8265 }
8266 else
8267 {
8268 src_addr = int_addr;
8269 src_offset = REGNO (reg) * 8;
8270 }
8271 src_addr = fold_convert (addr_type, src_addr);
8272 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8273
8274 dest_addr = fold_convert (daddr_type, addr);
8275 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8276 if (cur_size == GET_MODE_SIZE (mode))
8277 {
8278 src = build_va_arg_indirect_ref (src_addr);
8279 dest = build_va_arg_indirect_ref (dest_addr);
8280
8281 gimplify_assign (dest, src, pre_p);
8282 }
8283 else
8284 {
8285 tree copy
8286 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8287 3, dest_addr, src_addr,
8288 size_int (cur_size));
8289 gimplify_and_add (copy, pre_p);
8290 }
8291 prev_size += cur_size;
8292 }
8293 }
8294
8295 if (needed_intregs)
8296 {
8297 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8298 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8299 gimplify_assign (gpr, t, pre_p);
8300 }
8301
8302 if (needed_sseregs)
8303 {
8304 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8305 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8306 gimplify_assign (fpr, t, pre_p);
8307 }
8308
8309 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8310
8311 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8312 }
8313
8314 /* ... otherwise out of the overflow area. */
8315
8316 /* When we align a parameter on the stack for the caller, a parameter
8317 alignment beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8318 MAX_SUPPORTED_STACK_ALIGNMENT.  We match the callee here with the
8319 caller.  */
8320 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8321 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8322 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8323
8324 /* Care for on-stack alignment if needed. */
8325 if (arg_boundary <= 64 || size == 0)
8326 t = ovf;
8327 else
8328 {
8329 HOST_WIDE_INT align = arg_boundary / 8;
8330 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8331 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8332 build_int_cst (TREE_TYPE (t), -align));
8333 }
8334
8335 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8336 gimplify_assign (addr, t, pre_p);
8337
8338 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8339 gimplify_assign (unshare_expr (ovf), t, pre_p);
8340
8341 if (container)
8342 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8343
8344 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8345 addr = fold_convert (ptrtype, addr);
8346
8347 if (indirect_p)
8348 addr = build_va_arg_indirect_ref (addr);
8349 return build_va_arg_indirect_ref (addr);
8350 }
8351 \f
8352 /* Return true if OPNUM's MEM should be matched
8353 in movabs* patterns. */
8354
8355 bool
8356 ix86_check_movabs (rtx insn, int opnum)
8357 {
8358 rtx set, mem;
8359
8360 set = PATTERN (insn);
8361 if (GET_CODE (set) == PARALLEL)
8362 set = XVECEXP (set, 0, 0);
8363 gcc_assert (GET_CODE (set) == SET);
8364 mem = XEXP (set, opnum);
8365 while (GET_CODE (mem) == SUBREG)
8366 mem = SUBREG_REG (mem);
8367 gcc_assert (MEM_P (mem));
8368 return volatile_ok || !MEM_VOLATILE_P (mem);
8369 }
8370 \f
8371 /* Initialize the table of extra 80387 mathematical constants. */
8372
8373 static void
8374 init_ext_80387_constants (void)
8375 {
8376 static const char * cst[5] =
8377 {
8378 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8379 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8380 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8381 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8382 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8383 };
8384 int i;
8385
8386 for (i = 0; i < 5; i++)
8387 {
8388 real_from_string (&ext_80387_constants_table[i], cst[i]);
8389 /* Ensure each constant is rounded to XFmode precision. */
8390 real_convert (&ext_80387_constants_table[i],
8391 XFmode, &ext_80387_constants_table[i]);
8392 }
8393
8394 ext_80387_constants_init = 1;
8395 }
8396
8397 /* Return non-zero if the constant is something that
8398 can be loaded with a special instruction. */
8399
8400 int
8401 standard_80387_constant_p (rtx x)
8402 {
8403 enum machine_mode mode = GET_MODE (x);
8404
8405 REAL_VALUE_TYPE r;
8406
8407 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8408 return -1;
8409
8410 if (x == CONST0_RTX (mode))
8411 return 1;
8412 if (x == CONST1_RTX (mode))
8413 return 2;
8414
8415 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8416
8417 /* For XFmode constants, try to find a special 80387 instruction when
8418 optimizing for size or on those CPUs that benefit from them. */
8419 if (mode == XFmode
8420 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8421 {
8422 int i;
8423
8424 if (! ext_80387_constants_init)
8425 init_ext_80387_constants ();
8426
8427 for (i = 0; i < 5; i++)
8428 if (real_identical (&r, &ext_80387_constants_table[i]))
8429 return i + 3;
8430 }
8431
8432 /* A load of the constant -0.0 or -1.0 will be split into an
8433 fldz;fchs or fld1;fchs sequence.  */
8434 if (real_isnegzero (&r))
8435 return 8;
8436 if (real_identical (&r, &dconstm1))
8437 return 9;
8438
8439 return 0;
8440 }
8441
8442 /* Return the opcode of the special instruction to be used to load
8443 the constant X. */
8444
8445 const char *
8446 standard_80387_constant_opcode (rtx x)
8447 {
8448 switch (standard_80387_constant_p (x))
8449 {
8450 case 1:
8451 return "fldz";
8452 case 2:
8453 return "fld1";
8454 case 3:
8455 return "fldlg2";
8456 case 4:
8457 return "fldln2";
8458 case 5:
8459 return "fldl2e";
8460 case 6:
8461 return "fldl2t";
8462 case 7:
8463 return "fldpi";
8464 case 8:
8465 case 9:
8466 return "#";
8467 default:
8468 gcc_unreachable ();
8469 }
8470 }
8471
8472 /* Return the CONST_DOUBLE representing the 80387 constant that is
8473 loaded by the specified special instruction. The argument IDX
8474 matches the return value from standard_80387_constant_p. */
8475
8476 rtx
8477 standard_80387_constant_rtx (int idx)
8478 {
8479 int i;
8480
8481 if (! ext_80387_constants_init)
8482 init_ext_80387_constants ();
8483
8484 switch (idx)
8485 {
8486 case 3:
8487 case 4:
8488 case 5:
8489 case 6:
8490 case 7:
8491 i = idx - 3;
8492 break;
8493
8494 default:
8495 gcc_unreachable ();
8496 }
8497
8498 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8499 XFmode);
8500 }
8501
8502 /* Return 1 if X is all 0s and 2 if X is all 1s
8503 in a supported SSE/AVX vector mode.  */
8504
8505 int
8506 standard_sse_constant_p (rtx x)
8507 {
8508 enum machine_mode mode = GET_MODE (x);
8509
8510 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8511 return 1;
8512 if (vector_all_ones_operand (x, mode))
8513 switch (mode)
8514 {
8515 case V16QImode:
8516 case V8HImode:
8517 case V4SImode:
8518 case V2DImode:
8519 if (TARGET_SSE2)
8520 return 2;
8521 case V32QImode:
8522 case V16HImode:
8523 case V8SImode:
8524 case V4DImode:
8525 if (TARGET_AVX2)
8526 return 2;
8527 default:
8528 break;
8529 }
8530
8531 return 0;
8532 }
8533
8534 /* Return the opcode of the special instruction to be used to load
8535 the constant X. */
8536
8537 const char *
8538 standard_sse_constant_opcode (rtx insn, rtx x)
8539 {
8540 switch (standard_sse_constant_p (x))
8541 {
8542 case 1:
8543 switch (get_attr_mode (insn))
8544 {
8545 case MODE_TI:
8546 return "%vpxor\t%0, %d0";
8547 case MODE_V2DF:
8548 return "%vxorpd\t%0, %d0";
8549 case MODE_V4SF:
8550 return "%vxorps\t%0, %d0";
8551
8552 case MODE_OI:
8553 return "vpxor\t%x0, %x0, %x0";
8554 case MODE_V4DF:
8555 return "vxorpd\t%x0, %x0, %x0";
8556 case MODE_V8SF:
8557 return "vxorps\t%x0, %x0, %x0";
8558
8559 default:
8560 break;
8561 }
8562
8563 case 2:
8564 if (TARGET_AVX)
8565 return "vpcmpeqd\t%0, %0, %0";
8566 else
8567 return "pcmpeqd\t%0, %0";
8568
8569 default:
8570 break;
8571 }
8572 gcc_unreachable ();
8573 }
8574
8575 /* Returns true if OP contains a symbol reference.  */
8576
8577 bool
8578 symbolic_reference_mentioned_p (rtx op)
8579 {
8580 const char *fmt;
8581 int i;
8582
8583 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8584 return true;
8585
8586 fmt = GET_RTX_FORMAT (GET_CODE (op));
8587 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8588 {
8589 if (fmt[i] == 'E')
8590 {
8591 int j;
8592
8593 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8594 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8595 return true;
8596 }
8597
8598 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8599 return true;
8600 }
8601
8602 return false;
8603 }
8604
8605 /* Return true if it is appropriate to emit `ret' instructions in the
8606 body of a function. Do this only if the epilogue is simple, needing a
8607 couple of insns. Prior to reloading, we can't tell how many registers
8608 must be saved, so return false then. Return false if there is no frame
8609 marker to de-allocate. */
8610
8611 bool
8612 ix86_can_use_return_insn_p (void)
8613 {
8614 struct ix86_frame frame;
8615
8616 if (! reload_completed || frame_pointer_needed)
8617 return 0;
8618
8619 /* Don't allow more than 32k pop, since that's all we can do
8620 with one instruction. */
8621 if (crtl->args.pops_args && crtl->args.size >= 32768)
8622 return 0;
8623
8624 ix86_compute_frame_layout (&frame);
8625 return (frame.stack_pointer_offset == UNITS_PER_WORD
8626 && (frame.nregs + frame.nsseregs) == 0);
8627 }
8628 \f
8629 /* Value should be nonzero if functions must have frame pointers.
8630 Zero means the frame pointer need not be set up (and parms may
8631 be accessed via the stack pointer) in functions that seem suitable. */
8632
8633 static bool
8634 ix86_frame_pointer_required (void)
8635 {
8636 /* If we accessed previous frames, then the generated code expects
8637 to be able to access the saved ebp value in our frame. */
8638 if (cfun->machine->accesses_prev_frame)
8639 return true;
8640
8641 /* Several x86 OSes need a frame pointer for other reasons,
8642 usually pertaining to setjmp.  */
8643 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8644 return true;
8645
8646 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8647 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8648 return true;
8649
8650 /* For Win64 SEH, very large frames need a frame pointer, as the maximum
8651 stack allocation is 4GB.  */
8652 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8653 return true;
8654
8655 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8656 turns off the frame pointer by default. Turn it back on now if
8657 we've not got a leaf function. */
8658 if (TARGET_OMIT_LEAF_FRAME_POINTER
8659 && (!crtl->is_leaf
8660 || ix86_current_function_calls_tls_descriptor))
8661 return true;
8662
8663 if (crtl->profile && !flag_fentry)
8664 return true;
8665
8666 return false;
8667 }
8668
8669 /* Record that the current function accesses previous call frames. */
8670
8671 void
8672 ix86_setup_frame_addresses (void)
8673 {
8674 cfun->machine->accesses_prev_frame = 1;
8675 }
8676 \f
8677 #ifndef USE_HIDDEN_LINKONCE
8678 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8679 # define USE_HIDDEN_LINKONCE 1
8680 # else
8681 # define USE_HIDDEN_LINKONCE 0
8682 # endif
8683 #endif
8684
8685 static int pic_labels_used;
8686
8687 /* Fills in the label name that should be used for a pc thunk for
8688 the given register. */
8689
8690 static void
8691 get_pc_thunk_name (char name[32], unsigned int regno)
8692 {
8693 gcc_assert (!TARGET_64BIT);
8694
8695 if (USE_HIDDEN_LINKONCE)
8696 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8697 else
8698 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8699 }
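
/* For example, with USE_HIDDEN_LINKONCE and REGNO == BX_REG this is
   expected to yield the well-known name "__x86.get_pc_thunk.bx".  */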
8700
8701
8702 /* For each PIC register that was actually used, generate the -fpic helper
8703 thunk that loads that register with the caller's return address and returns. */
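
/* A sketch of what one of these thunks looks like in the assembly output
   (AT&T syntax, the EBX flavour; purely illustrative, padding NOPs and
   sectioning directives omitted):

	__x86.get_pc_thunk.bx:
		movl	(%esp), %ebx
		ret
*/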
8704
8705 static void
8706 ix86_code_end (void)
8707 {
8708 rtx xops[2];
8709 int regno;
8710
8711 for (regno = AX_REG; regno <= SP_REG; regno++)
8712 {
8713 char name[32];
8714 tree decl;
8715
8716 if (!(pic_labels_used & (1 << regno)))
8717 continue;
8718
8719 get_pc_thunk_name (name, regno);
8720
8721 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8722 get_identifier (name),
8723 build_function_type_list (void_type_node, NULL_TREE));
8724 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8725 NULL_TREE, void_type_node);
8726 TREE_PUBLIC (decl) = 1;
8727 TREE_STATIC (decl) = 1;
8728 DECL_IGNORED_P (decl) = 1;
8729
8730 #if TARGET_MACHO
8731 if (TARGET_MACHO)
8732 {
8733 switch_to_section (darwin_sections[text_coal_section]);
8734 fputs ("\t.weak_definition\t", asm_out_file);
8735 assemble_name (asm_out_file, name);
8736 fputs ("\n\t.private_extern\t", asm_out_file);
8737 assemble_name (asm_out_file, name);
8738 putc ('\n', asm_out_file);
8739 ASM_OUTPUT_LABEL (asm_out_file, name);
8740 DECL_WEAK (decl) = 1;
8741 }
8742 else
8743 #endif
8744 if (USE_HIDDEN_LINKONCE)
8745 {
8746 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8747
8748 targetm.asm_out.unique_section (decl, 0);
8749 switch_to_section (get_named_section (decl, NULL, 0));
8750
8751 targetm.asm_out.globalize_label (asm_out_file, name);
8752 fputs ("\t.hidden\t", asm_out_file);
8753 assemble_name (asm_out_file, name);
8754 putc ('\n', asm_out_file);
8755 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8756 }
8757 else
8758 {
8759 switch_to_section (text_section);
8760 ASM_OUTPUT_LABEL (asm_out_file, name);
8761 }
8762
8763 DECL_INITIAL (decl) = make_node (BLOCK);
8764 current_function_decl = decl;
8765 init_function_start (decl);
8766 first_function_block_is_cold = false;
8767 /* Make sure unwind info is emitted for the thunk if needed. */
8768 final_start_function (emit_barrier (), asm_out_file, 1);
8769
8770 /* Pad the stack-IP move out to 4 instructions' worth of NOPs (two NOPs
8771 count as one instruction, hence the 8 NOPs below). */
8772 if (TARGET_PAD_SHORT_FUNCTION)
8773 {
8774 int i = 8;
8775
8776 while (i--)
8777 fputs ("\tnop\n", asm_out_file);
8778 }
8779
8780 xops[0] = gen_rtx_REG (Pmode, regno);
8781 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8782 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8783 fputs ("\tret\n", asm_out_file);
8784 final_end_function ();
8785 init_insn_lengths ();
8786 free_after_compilation (cfun);
8787 set_cfun (NULL);
8788 current_function_decl = NULL;
8789 }
8790
8791 if (flag_split_stack)
8792 file_end_indicate_split_stack ();
8793 }
8794
8795 /* Emit code for the SET_GOT patterns. */
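
/* For the common ELF -fpic case (no Mach-O, no VxWorks RTP) this is expected
   to expand to something along the lines of (AT&T syntax, DEST == %ebx):

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   assuming GOT_SYMBOL_NAME has its usual definition.  This is only a sketch;
   the exact output depends on the target configuration.  */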
8796
8797 const char *
8798 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8799 {
8800 rtx xops[3];
8801
8802 xops[0] = dest;
8803
8804 if (TARGET_VXWORKS_RTP && flag_pic)
8805 {
8806 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8807 xops[2] = gen_rtx_MEM (Pmode,
8808 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8809 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8810
8811 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8812 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8813 an unadorned address. */
8814 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8815 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8816 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8817 return "";
8818 }
8819
8820 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8821
8822 if (!flag_pic)
8823 {
8824 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8825
8826 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8827
8828 #if TARGET_MACHO
8829 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8830 is what will be referenced by the Mach-O PIC subsystem. */
8831 if (!label)
8832 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8833 #endif
8834
8835 targetm.asm_out.internal_label (asm_out_file, "L",
8836 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8837 }
8838 else
8839 {
8840 char name[32];
8841 get_pc_thunk_name (name, REGNO (dest));
8842 pic_labels_used |= 1 << REGNO (dest);
8843
8844 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8845 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8846 output_asm_insn ("call\t%X2", xops);
8847 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8848 is what will be referenced by the Mach-O PIC subsystem. */
8849 #if TARGET_MACHO
8850 if (!label)
8851 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8852 else
8853 targetm.asm_out.internal_label (asm_out_file, "L",
8854 CODE_LABEL_NUMBER (label));
8855 #endif
8856 }
8857
8858 if (!TARGET_MACHO)
8859 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8860
8861 return "";
8862 }
8863
8864 /* Generate a "push" pattern for input ARG. */
8865
8866 static rtx
8867 gen_push (rtx arg)
8868 {
8869 struct machine_function *m = cfun->machine;
8870
8871 if (m->fs.cfa_reg == stack_pointer_rtx)
8872 m->fs.cfa_offset += UNITS_PER_WORD;
8873 m->fs.sp_offset += UNITS_PER_WORD;
8874
8875 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8876 arg = gen_rtx_REG (word_mode, REGNO (arg));
8877
8878 return gen_rtx_SET (VOIDmode,
8879 gen_rtx_MEM (word_mode,
8880 gen_rtx_PRE_DEC (Pmode,
8881 stack_pointer_rtx)),
8882 arg);
8883 }
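
/* As an illustration (a sketch, not an exact dump), on a 32-bit target
   gen_push applied to the EAX register builds RTL of roughly this shape:

	(set (mem:SI (pre_dec:SI (reg:SI sp)))
	     (reg:SI ax))
*/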
8884
8885 /* Generate a "pop" pattern for input ARG. */
8886
8887 static rtx
8888 gen_pop (rtx arg)
8889 {
8890 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8891 arg = gen_rtx_REG (word_mode, REGNO (arg));
8892
8893 return gen_rtx_SET (VOIDmode,
8894 arg,
8895 gen_rtx_MEM (word_mode,
8896 gen_rtx_POST_INC (Pmode,
8897 stack_pointer_rtx)));
8898 }
8899
8900 /* Return the number of an unused call-clobbered register if one is
8901 available for the entire function, or INVALID_REGNUM otherwise. */
8902
8903 static unsigned int
8904 ix86_select_alt_pic_regnum (void)
8905 {
8906 if (crtl->is_leaf
8907 && !crtl->profile
8908 && !ix86_current_function_calls_tls_descriptor)
8909 {
8910 int i, drap;
8911 /* Can't use the same register for both PIC and DRAP. */
8912 if (crtl->drap_reg)
8913 drap = REGNO (crtl->drap_reg);
8914 else
8915 drap = -1;
8916 for (i = 2; i >= 0; --i)
8917 if (i != drap && !df_regs_ever_live_p (i))
8918 return i;
8919 }
8920
8921 return INVALID_REGNUM;
8922 }
8923
8924 /* Return TRUE if we need to save REGNO. */
8925
8926 static bool
8927 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8928 {
8929 if (pic_offset_table_rtx
8930 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8931 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8932 || crtl->profile
8933 || crtl->calls_eh_return
8934 || crtl->uses_const_pool))
8935 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8936
8937 if (crtl->calls_eh_return && maybe_eh_return)
8938 {
8939 unsigned i;
8940 for (i = 0; ; i++)
8941 {
8942 unsigned test = EH_RETURN_DATA_REGNO (i);
8943 if (test == INVALID_REGNUM)
8944 break;
8945 if (test == regno)
8946 return true;
8947 }
8948 }
8949
8950 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8951 return true;
8952
8953 return (df_regs_ever_live_p (regno)
8954 && !call_used_regs[regno]
8955 && !fixed_regs[regno]
8956 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8957 }
8958
8959 /* Return the number of saved general-purpose registers. */
8960
8961 static int
8962 ix86_nsaved_regs (void)
8963 {
8964 int nregs = 0;
8965 int regno;
8966
8967 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8968 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8969 nregs ++;
8970 return nregs;
8971 }
8972
8973 /* Return the number of saved SSE registers. */
8974
8975 static int
8976 ix86_nsaved_sseregs (void)
8977 {
8978 int nregs = 0;
8979 int regno;
8980
8981 if (!TARGET_64BIT_MS_ABI)
8982 return 0;
8983 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8984 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8985 nregs ++;
8986 return nregs;
8987 }
8988
8989 /* Given FROM and TO register numbers, say whether this elimination is
8990 allowed. If stack alignment is needed, we can only replace argument
8991 pointer with hard frame pointer, or replace frame pointer with stack
8992 pointer. Otherwise, frame pointer elimination is automatically
8993 handled and all other eliminations are valid. */
8994
8995 static bool
8996 ix86_can_eliminate (const int from, const int to)
8997 {
8998 if (stack_realign_fp)
8999 return ((from == ARG_POINTER_REGNUM
9000 && to == HARD_FRAME_POINTER_REGNUM)
9001 || (from == FRAME_POINTER_REGNUM
9002 && to == STACK_POINTER_REGNUM));
9003 else
9004 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9005 }
9006
9007 /* Return the offset between two registers, one to be eliminated, and the other
9008 its replacement, at the start of a routine. */
9009
9010 HOST_WIDE_INT
9011 ix86_initial_elimination_offset (int from, int to)
9012 {
9013 struct ix86_frame frame;
9014 ix86_compute_frame_layout (&frame);
9015
9016 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9017 return frame.hard_frame_pointer_offset;
9018 else if (from == FRAME_POINTER_REGNUM
9019 && to == HARD_FRAME_POINTER_REGNUM)
9020 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9021 else
9022 {
9023 gcc_assert (to == STACK_POINTER_REGNUM);
9024
9025 if (from == ARG_POINTER_REGNUM)
9026 return frame.stack_pointer_offset;
9027
9028 gcc_assert (from == FRAME_POINTER_REGNUM);
9029 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9030 }
9031 }
9032
9033 /* In a dynamically-aligned function, we can't know the offset from
9034 stack pointer to frame pointer, so we must ensure that setjmp
9035 eliminates fp against the hard fp (%ebp) rather than trying to
9036 index from %esp up to the top of the frame across a gap that is
9037 of unknown (at compile-time) size. */
9038 static rtx
9039 ix86_builtin_setjmp_frame_value (void)
9040 {
9041 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9042 }
9043
9044 /* When using -fsplit-stack, the allocation routines set a field in
9045 the TCB to the bottom of the stack plus this much space, measured
9046 in bytes. */
9047
9048 #define SPLIT_STACK_AVAILABLE 256
9049
9050 /* Fill structure ix86_frame about frame of currently computed function. */
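
/* A rough sketch of the layout computed below, going from the CFA toward
   lower addresses (approximate only; the code that follows is authoritative,
   and several of these areas may be absent or empty):

	return address
	[pushed static chain]
	[saved frame pointer]		<- hard_frame_pointer_offset
	integer register save area	<- reg_save_offset
	SSE register save area		<- sse_reg_save_offset (16-byte aligned)
	va_arg register save area
	local variables			<- frame_pointer_offset
	outgoing arguments
					<- stack_pointer_offset  */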
9051
9052 static void
9053 ix86_compute_frame_layout (struct ix86_frame *frame)
9054 {
9055 unsigned HOST_WIDE_INT stack_alignment_needed;
9056 HOST_WIDE_INT offset;
9057 unsigned HOST_WIDE_INT preferred_alignment;
9058 HOST_WIDE_INT size = get_frame_size ();
9059 HOST_WIDE_INT to_allocate;
9060
9061 frame->nregs = ix86_nsaved_regs ();
9062 frame->nsseregs = ix86_nsaved_sseregs ();
9063
9064 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9065 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9066
9067 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
9068 except for function prologues and leaf functions. */
9069 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9070 && (!crtl->is_leaf || cfun->calls_alloca != 0
9071 || ix86_current_function_calls_tls_descriptor))
9072 {
9073 preferred_alignment = 16;
9074 stack_alignment_needed = 16;
9075 crtl->preferred_stack_boundary = 128;
9076 crtl->stack_alignment_needed = 128;
9077 }
9078
9079 gcc_assert (!size || stack_alignment_needed);
9080 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9081 gcc_assert (preferred_alignment <= stack_alignment_needed);
9082
9083 /* For SEH we have to limit the amount of code movement into the prologue.
9084 At present we do this via a BLOCKAGE, at which point there's very little
9085 scheduling that can be done, which means that there's very little point
9086 in doing anything except PUSHs. */
9087 if (TARGET_SEH)
9088 cfun->machine->use_fast_prologue_epilogue = false;
9089
9090 /* During reload iterations the number of registers saved can change.
9091 Recompute the value as needed. Do not recompute when the number of
9092 registers didn't change, as reload makes multiple calls to this function
9093 and does not expect the decision to change within a single iteration. */
9094 else if (!optimize_function_for_size_p (cfun)
9095 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9096 {
9097 int count = frame->nregs;
9098 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9099
9100 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9101
9102 /* The fast prologue uses move instead of push to save registers. This
9103 is significantly longer, but also executes faster as modern hardware
9104 can execute the moves in parallel, but can't do that for push/pop.
9105
9106 Be careful about choosing which prologue to emit: when the function
9107 takes many instructions to execute, we may as well use the slow version,
9108 and likewise when the function is known to be outside a hot spot (known
9109 only with profile feedback). Weight the size of the function by the
9110 number of registers to save, as it is cheap to use one or two push
9111 instructions but very slow to use many of them. */
9112 if (count)
9113 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9114 if (node->frequency < NODE_FREQUENCY_NORMAL
9115 || (flag_branch_probabilities
9116 && node->frequency < NODE_FREQUENCY_HOT))
9117 cfun->machine->use_fast_prologue_epilogue = false;
9118 else
9119 cfun->machine->use_fast_prologue_epilogue
9120 = !expensive_function_p (count);
9121 }
9122
9123 frame->save_regs_using_mov
9124 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9125 /* If static stack checking is enabled and done with probes,
9126 the registers need to be saved before allocating the frame. */
9127 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9128
9129 /* Skip return address. */
9130 offset = UNITS_PER_WORD;
9131
9132 /* Skip pushed static chain. */
9133 if (ix86_static_chain_on_stack)
9134 offset += UNITS_PER_WORD;
9135
9136 /* Skip saved base pointer. */
9137 if (frame_pointer_needed)
9138 offset += UNITS_PER_WORD;
9139 frame->hfp_save_offset = offset;
9140
9141 /* The traditional frame pointer location is at the top of the frame. */
9142 frame->hard_frame_pointer_offset = offset;
9143
9144 /* Register save area */
9145 offset += frame->nregs * UNITS_PER_WORD;
9146 frame->reg_save_offset = offset;
9147
9148 /* On SEH target, registers are pushed just before the frame pointer
9149 location. */
9150 if (TARGET_SEH)
9151 frame->hard_frame_pointer_offset = offset;
9152
9153 /* Align and set SSE register save area. */
9154 if (frame->nsseregs)
9155 {
9156 /* The only ABI that has saved SSE registers (Win64) also has a
9157 16-byte aligned default stack, and thus we don't need to be
9158 within the re-aligned local stack frame to save them. */
9159 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9160 offset = (offset + 16 - 1) & -16;
9161 offset += frame->nsseregs * 16;
9162 }
9163 frame->sse_reg_save_offset = offset;
9164
9165 /* The re-aligned stack starts here. Values before this point are not
9166 directly comparable with values below this point. In order to make
9167 sure that no value happens to be the same before and after, force
9168 the alignment computation below to add a non-zero value. */
9169 if (stack_realign_fp)
9170 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9171
9172 /* Va-arg area */
9173 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9174 offset += frame->va_arg_size;
9175
9176 /* Align start of frame for local function. */
9177 if (stack_realign_fp
9178 || offset != frame->sse_reg_save_offset
9179 || size != 0
9180 || !crtl->is_leaf
9181 || cfun->calls_alloca
9182 || ix86_current_function_calls_tls_descriptor)
9183 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9184
9185 /* Frame pointer points here. */
9186 frame->frame_pointer_offset = offset;
9187
9188 offset += size;
9189
9190 /* Add the outgoing arguments area. It can be skipped if we eliminated
9191 all the function calls as dead code.
9192 Skipping is, however, impossible when the function calls alloca: the
9193 alloca expander assumes that the last crtl->outgoing_args_size bytes
9194 of the stack frame are unused. */
9195 if (ACCUMULATE_OUTGOING_ARGS
9196 && (!crtl->is_leaf || cfun->calls_alloca
9197 || ix86_current_function_calls_tls_descriptor))
9198 {
9199 offset += crtl->outgoing_args_size;
9200 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9201 }
9202 else
9203 frame->outgoing_arguments_size = 0;
9204
9205 /* Align stack boundary. Only needed if we're calling another function
9206 or using alloca. */
9207 if (!crtl->is_leaf || cfun->calls_alloca
9208 || ix86_current_function_calls_tls_descriptor)
9209 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9210
9211 /* We've reached end of stack frame. */
9212 frame->stack_pointer_offset = offset;
9213
9214 /* Size prologue needs to allocate. */
9215 to_allocate = offset - frame->sse_reg_save_offset;
9216
9217 if ((!to_allocate && frame->nregs <= 1)
9218 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9219 frame->save_regs_using_mov = false;
9220
9221 if (ix86_using_red_zone ()
9222 && crtl->sp_is_unchanging
9223 && crtl->is_leaf
9224 && !ix86_current_function_calls_tls_descriptor)
9225 {
9226 frame->red_zone_size = to_allocate;
9227 if (frame->save_regs_using_mov)
9228 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9229 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9230 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9231 }
9232 else
9233 frame->red_zone_size = 0;
9234 frame->stack_pointer_offset -= frame->red_zone_size;
9235
9236 /* The SEH frame pointer location is near the bottom of the frame.
9237 This is enforced by the fact that the difference between the
9238 stack pointer and the frame pointer is limited to 240 bytes in
9239 the unwind data structure. */
9240 if (TARGET_SEH)
9241 {
9242 HOST_WIDE_INT diff;
9243
9244 /* If we can leave the frame pointer where it is, do so. This also keeps
9245 it usable as the establisher frame for __builtin_frame_address (0). */
9246 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9247 if (diff <= SEH_MAX_FRAME_SIZE
9248 && (diff > 240 || (diff & 15) != 0)
9249 && !crtl->accesses_prior_frames)
9250 {
9251 /* Ideally we'd determine what portion of the local stack frame
9252 (within the constraint of the lowest 240) is most heavily used.
9253 But without that complication, simply bias the frame pointer
9254 by 128 bytes so as to maximize the amount of the local stack
9255 frame that is addressable with 8-bit offsets. */
9256 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9257 }
9258 }
9259 }
9260
9261 /* This is semi-inlined memory_address_length, but simplified
9262 since we know that we're always dealing with reg+offset, and
9263 to avoid having to create and discard all that rtl. */
9264
9265 static inline int
9266 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9267 {
9268 int len = 4;
9269
9270 if (offset == 0)
9271 {
9272 /* EBP and R13 cannot be encoded without an offset. */
9273 len = (regno == BP_REG || regno == R13_REG);
9274 }
9275 else if (IN_RANGE (offset, -128, 127))
9276 len = 1;
9277
9278 /* ESP and R12 must be encoded with a SIB byte. */
9279 if (regno == SP_REG || regno == R12_REG)
9280 len++;
9281
9282 return len;
9283 }
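
/* A few worked examples of the lengths this computes (a sketch of the
   x86 ModRM/SIB encoding rules it models; lengths exclude the ModRM byte):

	0(%eax)   -> 0  (no displacement needed)
	0(%ebp)   -> 1  (EBP/R13 always need at least a disp8)
	8(%ecx)   -> 1  (disp8)
	8(%esp)   -> 2  (disp8 plus the mandatory SIB byte for ESP/R12)
	512(%edx) -> 4  (disp32)
*/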
9284
9285 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9286 The valid base registers are taken from CFUN->MACHINE->FS. */
9287
9288 static rtx
9289 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9290 {
9291 const struct machine_function *m = cfun->machine;
9292 rtx base_reg = NULL;
9293 HOST_WIDE_INT base_offset = 0;
9294
9295 if (m->use_fast_prologue_epilogue)
9296 {
9297 /* Choose the base register most likely to allow the most scheduling
9298 opportunities. Generally FP is valid throughout the function,
9299 while DRAP must be reloaded within the epilogue. But choose either
9300 over the SP due to increased encoding size. */
9301
9302 if (m->fs.fp_valid)
9303 {
9304 base_reg = hard_frame_pointer_rtx;
9305 base_offset = m->fs.fp_offset - cfa_offset;
9306 }
9307 else if (m->fs.drap_valid)
9308 {
9309 base_reg = crtl->drap_reg;
9310 base_offset = 0 - cfa_offset;
9311 }
9312 else if (m->fs.sp_valid)
9313 {
9314 base_reg = stack_pointer_rtx;
9315 base_offset = m->fs.sp_offset - cfa_offset;
9316 }
9317 }
9318 else
9319 {
9320 HOST_WIDE_INT toffset;
9321 int len = 16, tlen;
9322
9323 /* Choose the base register with the smallest address encoding.
9324 With a tie, choose FP > DRAP > SP. */
9325 if (m->fs.sp_valid)
9326 {
9327 base_reg = stack_pointer_rtx;
9328 base_offset = m->fs.sp_offset - cfa_offset;
9329 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9330 }
9331 if (m->fs.drap_valid)
9332 {
9333 toffset = 0 - cfa_offset;
9334 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9335 if (tlen <= len)
9336 {
9337 base_reg = crtl->drap_reg;
9338 base_offset = toffset;
9339 len = tlen;
9340 }
9341 }
9342 if (m->fs.fp_valid)
9343 {
9344 toffset = m->fs.fp_offset - cfa_offset;
9345 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9346 if (tlen <= len)
9347 {
9348 base_reg = hard_frame_pointer_rtx;
9349 base_offset = toffset;
9350 len = tlen;
9351 }
9352 }
9353 }
9354 gcc_assert (base_reg != NULL);
9355
9356 return plus_constant (Pmode, base_reg, base_offset);
9357 }
9358
9359 /* Emit code to save registers in the prologue. */
9360
9361 static void
9362 ix86_emit_save_regs (void)
9363 {
9364 unsigned int regno;
9365 rtx insn;
9366
9367 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9368 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9369 {
9370 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9371 RTX_FRAME_RELATED_P (insn) = 1;
9372 }
9373 }
9374
9375 /* Emit a single register save at CFA - CFA_OFFSET. */
9376
9377 static void
9378 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9379 HOST_WIDE_INT cfa_offset)
9380 {
9381 struct machine_function *m = cfun->machine;
9382 rtx reg = gen_rtx_REG (mode, regno);
9383 rtx mem, addr, base, insn;
9384
9385 addr = choose_baseaddr (cfa_offset);
9386 mem = gen_frame_mem (mode, addr);
9387
9388 /* For SSE saves, we need to indicate the 128-bit alignment. */
9389 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9390
9391 insn = emit_move_insn (mem, reg);
9392 RTX_FRAME_RELATED_P (insn) = 1;
9393
9394 base = addr;
9395 if (GET_CODE (base) == PLUS)
9396 base = XEXP (base, 0);
9397 gcc_checking_assert (REG_P (base));
9398
9399 /* When saving registers into a re-aligned local stack frame, avoid
9400 any tricky guessing by dwarf2out. */
9401 if (m->fs.realigned)
9402 {
9403 gcc_checking_assert (stack_realign_drap);
9404
9405 if (regno == REGNO (crtl->drap_reg))
9406 {
9407 /* A bit of a hack. We force the DRAP register to be saved in
9408 the re-aligned stack frame, which provides us with a copy
9409 of the CFA that will last past the prologue. Install it. */
9410 gcc_checking_assert (cfun->machine->fs.fp_valid);
9411 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9412 cfun->machine->fs.fp_offset - cfa_offset);
9413 mem = gen_rtx_MEM (mode, addr);
9414 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9415 }
9416 else
9417 {
9418 /* The frame pointer is a stable reference within the
9419 aligned frame. Use it. */
9420 gcc_checking_assert (cfun->machine->fs.fp_valid);
9421 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9422 cfun->machine->fs.fp_offset - cfa_offset);
9423 mem = gen_rtx_MEM (mode, addr);
9424 add_reg_note (insn, REG_CFA_EXPRESSION,
9425 gen_rtx_SET (VOIDmode, mem, reg));
9426 }
9427 }
9428
9429 /* The memory may not be relative to the current CFA register,
9430 which means that we may need to generate a new pattern for
9431 use by the unwind info. */
9432 else if (base != m->fs.cfa_reg)
9433 {
9434 addr = plus_constant (Pmode, m->fs.cfa_reg,
9435 m->fs.cfa_offset - cfa_offset);
9436 mem = gen_rtx_MEM (mode, addr);
9437 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9438 }
9439 }
9440
9441 /* Emit code to save registers using MOV insns.
9442 First register is stored at CFA - CFA_OFFSET. */
9443 static void
9444 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9445 {
9446 unsigned int regno;
9447
9448 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9449 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9450 {
9451 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9452 cfa_offset -= UNITS_PER_WORD;
9453 }
9454 }
9455
9456 /* Emit code to save SSE registers using MOV insns.
9457 First register is stored at CFA - CFA_OFFSET. */
9458 static void
9459 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9460 {
9461 unsigned int regno;
9462
9463 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9464 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9465 {
9466 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9467 cfa_offset -= 16;
9468 }
9469 }
9470
9471 static GTY(()) rtx queued_cfa_restores;
9472
9473 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next
9474 stack manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9475 Don't add the note if the previously saved value will be left untouched
9476 within stack red-zone till return, as unwinders can find the same value
9477 in the register and on the stack. */
9478
9479 static void
9480 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9481 {
9482 if (!crtl->shrink_wrapped
9483 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9484 return;
9485
9486 if (insn)
9487 {
9488 add_reg_note (insn, REG_CFA_RESTORE, reg);
9489 RTX_FRAME_RELATED_P (insn) = 1;
9490 }
9491 else
9492 queued_cfa_restores
9493 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9494 }
9495
9496 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9497
9498 static void
9499 ix86_add_queued_cfa_restore_notes (rtx insn)
9500 {
9501 rtx last;
9502 if (!queued_cfa_restores)
9503 return;
9504 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9505 ;
9506 XEXP (last, 1) = REG_NOTES (insn);
9507 REG_NOTES (insn) = queued_cfa_restores;
9508 queued_cfa_restores = NULL_RTX;
9509 RTX_FRAME_RELATED_P (insn) = 1;
9510 }
9511
9512 /* Expand a prologue or epilogue stack adjustment.
9513 The pattern exists to put a dependency on all ebp-based memory accesses.
9514 STYLE should be negative if instructions should be marked as frame related,
9515 zero if the %r11 register is live and cannot be freely used, and positive
9516 otherwise. */
9517
9518 static void
9519 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9520 int style, bool set_cfa)
9521 {
9522 struct machine_function *m = cfun->machine;
9523 rtx insn;
9524 bool add_frame_related_expr = false;
9525
9526 if (Pmode == SImode)
9527 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9528 else if (x86_64_immediate_operand (offset, DImode))
9529 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9530 else
9531 {
9532 rtx tmp;
9533 /* r11 is used by indirect sibcall return as well, set before the
9534 epilogue and used after the epilogue. */
9535 if (style)
9536 tmp = gen_rtx_REG (DImode, R11_REG);
9537 else
9538 {
9539 gcc_assert (src != hard_frame_pointer_rtx
9540 && dest != hard_frame_pointer_rtx);
9541 tmp = hard_frame_pointer_rtx;
9542 }
9543 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9544 if (style < 0)
9545 add_frame_related_expr = true;
9546
9547 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9548 }
9549
9550 insn = emit_insn (insn);
9551 if (style >= 0)
9552 ix86_add_queued_cfa_restore_notes (insn);
9553
9554 if (set_cfa)
9555 {
9556 rtx r;
9557
9558 gcc_assert (m->fs.cfa_reg == src);
9559 m->fs.cfa_offset += INTVAL (offset);
9560 m->fs.cfa_reg = dest;
9561
9562 r = gen_rtx_PLUS (Pmode, src, offset);
9563 r = gen_rtx_SET (VOIDmode, dest, r);
9564 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9565 RTX_FRAME_RELATED_P (insn) = 1;
9566 }
9567 else if (style < 0)
9568 {
9569 RTX_FRAME_RELATED_P (insn) = 1;
9570 if (add_frame_related_expr)
9571 {
9572 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9573 r = gen_rtx_SET (VOIDmode, dest, r);
9574 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9575 }
9576 }
9577
9578 if (dest == stack_pointer_rtx)
9579 {
9580 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9581 bool valid = m->fs.sp_valid;
9582
9583 if (src == hard_frame_pointer_rtx)
9584 {
9585 valid = m->fs.fp_valid;
9586 ooffset = m->fs.fp_offset;
9587 }
9588 else if (src == crtl->drap_reg)
9589 {
9590 valid = m->fs.drap_valid;
9591 ooffset = 0;
9592 }
9593 else
9594 {
9595 /* Else there are two possibilities: SP itself, which we set
9596 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9597 taken care of by hand along the eh_return path. */
9598 gcc_checking_assert (src == stack_pointer_rtx
9599 || offset == const0_rtx);
9600 }
9601
9602 m->fs.sp_offset = ooffset - INTVAL (offset);
9603 m->fs.sp_valid = valid;
9604 }
9605 }
9606
9607 /* Find an available register to be used as the dynamic realign argument
9608 pointer register. Such a register will be written in the prologue and
9609 used at the beginning of the body, so it must not be
9610 1. a parameter passing register.
9611 2. the GOT pointer.
9612 We reuse the static-chain register if it is available. Otherwise we
9613 use DI or CX for i386 and R10 for x86-64, falling back to R13 (or DI)
9614 when the static chain or a possible tail call rules those out.
9615
9616 Return: the regno of the chosen register. */
9617
9618 static unsigned int
9619 find_drap_reg (void)
9620 {
9621 tree decl = cfun->decl;
9622
9623 if (TARGET_64BIT)
9624 {
9625 /* Use R13 for a nested function or a function that needs a static chain.
9626 Since a function with a tail call may use any caller-saved register
9627 in the epilogue, DRAP must not use a caller-saved register in that
9628 case. */
9629 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9630 return R13_REG;
9631
9632 return R10_REG;
9633 }
9634 else
9635 {
9636 /* Use DI for a nested function or a function that needs a static chain.
9637 Since a function with a tail call may use any caller-saved register
9638 in the epilogue, DRAP must not use a caller-saved register in that
9639 case. */
9640 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9641 return DI_REG;
9642
9643 /* Reuse static chain register if it isn't used for parameter
9644 passing. */
9645 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9646 {
9647 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9648 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9649 return CX_REG;
9650 }
9651 return DI_REG;
9652 }
9653 }
9654
9655 /* Return minimum incoming stack alignment. */
9656
9657 static unsigned int
9658 ix86_minimum_incoming_stack_boundary (bool sibcall)
9659 {
9660 unsigned int incoming_stack_boundary;
9661
9662 /* Prefer the one specified at command line. */
9663 if (ix86_user_incoming_stack_boundary)
9664 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9665 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9666 when -mstackrealign is used, this isn't a sibcall check, and the
9667 estimated stack alignment is 128 bits. */
9668 else if (!sibcall
9669 && !TARGET_64BIT
9670 && ix86_force_align_arg_pointer
9671 && crtl->stack_alignment_estimated == 128)
9672 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9673 else
9674 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9675
9676 /* Incoming stack alignment can be changed on individual functions
9677 via force_align_arg_pointer attribute. We use the smallest
9678 incoming stack boundary. */
9679 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9680 && lookup_attribute (ix86_force_align_arg_pointer_string,
9681 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9682 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9683
9684 /* The incoming stack frame has to be aligned at least at
9685 parm_stack_boundary. */
9686 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9687 incoming_stack_boundary = crtl->parm_stack_boundary;
9688
9689 /* The stack at the entry of main is aligned by the runtime. We use the
9690 smallest incoming stack boundary. */
9691 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9692 && DECL_NAME (current_function_decl)
9693 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9694 && DECL_FILE_SCOPE_P (current_function_decl))
9695 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9696
9697 return incoming_stack_boundary;
9698 }
9699
9700 /* Update incoming stack boundary and estimated stack alignment. */
9701
9702 static void
9703 ix86_update_stack_boundary (void)
9704 {
9705 ix86_incoming_stack_boundary
9706 = ix86_minimum_incoming_stack_boundary (false);
9707
9708 /* x86_64 varargs functions need 16-byte stack alignment for the
9709 register save area. */
9710 if (TARGET_64BIT
9711 && cfun->stdarg
9712 && crtl->stack_alignment_estimated < 128)
9713 crtl->stack_alignment_estimated = 128;
9714 }
9715
9716 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9717 needed or an rtx for DRAP otherwise. */
9718
9719 static rtx
9720 ix86_get_drap_rtx (void)
9721 {
9722 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9723 crtl->need_drap = true;
9724
9725 if (stack_realign_drap)
9726 {
9727 /* Assign DRAP to vDRAP and return vDRAP. */
9728 unsigned int regno = find_drap_reg ();
9729 rtx drap_vreg;
9730 rtx arg_ptr;
9731 rtx seq, insn;
9732
9733 arg_ptr = gen_rtx_REG (Pmode, regno);
9734 crtl->drap_reg = arg_ptr;
9735
9736 start_sequence ();
9737 drap_vreg = copy_to_reg (arg_ptr);
9738 seq = get_insns ();
9739 end_sequence ();
9740
9741 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9742 if (!optimize)
9743 {
9744 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9745 RTX_FRAME_RELATED_P (insn) = 1;
9746 }
9747 return drap_vreg;
9748 }
9749 else
9750 return NULL;
9751 }
9752
9753 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9754
9755 static rtx
9756 ix86_internal_arg_pointer (void)
9757 {
9758 return virtual_incoming_args_rtx;
9759 }
9760
9761 struct scratch_reg {
9762 rtx reg;
9763 bool saved;
9764 };
9765
9766 /* Return a short-lived scratch register for use on function entry.
9767 In 32-bit mode, it is valid only after the registers are saved
9768 in the prologue. This register must be released by means of
9769 release_scratch_register_on_entry once it is dead. */
9770
9771 static void
9772 get_scratch_register_on_entry (struct scratch_reg *sr)
9773 {
9774 int regno;
9775
9776 sr->saved = false;
9777
9778 if (TARGET_64BIT)
9779 {
9780 /* We always use R11 in 64-bit mode. */
9781 regno = R11_REG;
9782 }
9783 else
9784 {
9785 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9786 bool fastcall_p
9787 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9788 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9789 int regparm = ix86_function_regparm (fntype, decl);
9790 int drap_regno
9791 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9792
9793 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9794 for the static chain register. */
9795 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9796 && drap_regno != AX_REG)
9797 regno = AX_REG;
9798 else if (regparm < 2 && drap_regno != DX_REG)
9799 regno = DX_REG;
9800 /* ecx is the static chain register. */
9801 else if (regparm < 3 && !fastcall_p && !static_chain_p
9802 && drap_regno != CX_REG)
9803 regno = CX_REG;
9804 else if (ix86_save_reg (BX_REG, true))
9805 regno = BX_REG;
9806 /* esi is the static chain register. */
9807 else if (!(regparm == 3 && static_chain_p)
9808 && ix86_save_reg (SI_REG, true))
9809 regno = SI_REG;
9810 else if (ix86_save_reg (DI_REG, true))
9811 regno = DI_REG;
9812 else
9813 {
9814 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9815 sr->saved = true;
9816 }
9817 }
9818
9819 sr->reg = gen_rtx_REG (Pmode, regno);
9820 if (sr->saved)
9821 {
9822 rtx insn = emit_insn (gen_push (sr->reg));
9823 RTX_FRAME_RELATED_P (insn) = 1;
9824 }
9825 }
9826
9827 /* Release a scratch register obtained from the preceding function. */
9828
9829 static void
9830 release_scratch_register_on_entry (struct scratch_reg *sr)
9831 {
9832 if (sr->saved)
9833 {
9834 rtx x, insn = emit_insn (gen_pop (sr->reg));
9835
9836 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9837 RTX_FRAME_RELATED_P (insn) = 1;
9838 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9839 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9840 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9841 }
9842 }
9843
9844 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
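
/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP == 12 this works
   out to 4096 bytes, i.e. one probe per typical page (an assumption about
   the default, not something this file enforces).  */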
9845
9846 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9847
9848 static void
9849 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9850 {
9851 /* We skip the probe for the first interval + a small dope of 4 words and
9852 probe that many bytes past the specified size to maintain a protection
9853 area at the bottom of the stack. */
9854 const int dope = 4 * UNITS_PER_WORD;
9855 rtx size_rtx = GEN_INT (size), last;
9856
9857 /* See if we have a constant small number of probes to generate. If so,
9858 that's the easy case. The run-time loop is made up of 11 insns in the
9859 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9860 for n # of intervals. */
9861 if (size <= 5 * PROBE_INTERVAL)
9862 {
9863 HOST_WIDE_INT i, adjust;
9864 bool first_probe = true;
9865
9866 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9867 values of N from 1 until it exceeds SIZE. If only one probe is
9868 needed, this will not generate any code. Then adjust and probe
9869 to PROBE_INTERVAL + SIZE. */
9870 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9871 {
9872 if (first_probe)
9873 {
9874 adjust = 2 * PROBE_INTERVAL + dope;
9875 first_probe = false;
9876 }
9877 else
9878 adjust = PROBE_INTERVAL;
9879
9880 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9881 plus_constant (Pmode, stack_pointer_rtx,
9882 -adjust)));
9883 emit_stack_probe (stack_pointer_rtx);
9884 }
9885
9886 if (first_probe)
9887 adjust = size + PROBE_INTERVAL + dope;
9888 else
9889 adjust = size + PROBE_INTERVAL - i;
9890
9891 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9892 plus_constant (Pmode, stack_pointer_rtx,
9893 -adjust)));
9894 emit_stack_probe (stack_pointer_rtx);
9895
9896 /* Adjust back to account for the additional first interval. */
9897 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9898 plus_constant (Pmode, stack_pointer_rtx,
9899 PROBE_INTERVAL + dope)));
9900 }
9901
9902 /* Otherwise, do the same as above, but in a loop. Note that we must be
9903 extra careful with variables wrapping around because we might be at
9904 the very top (or the very bottom) of the address space and we have
9905 to be able to handle this case properly; in particular, we use an
9906 equality test for the loop condition. */
9907 else
9908 {
9909 HOST_WIDE_INT rounded_size;
9910 struct scratch_reg sr;
9911
9912 get_scratch_register_on_entry (&sr);
9913
9914
9915 /* Step 1: round SIZE to the previous multiple of the interval. */
9916
9917 rounded_size = size & -PROBE_INTERVAL;
9918
9919
9920 /* Step 2: compute initial and final value of the loop counter. */
9921
9922 /* SP = SP_0 + PROBE_INTERVAL. */
9923 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9924 plus_constant (Pmode, stack_pointer_rtx,
9925 - (PROBE_INTERVAL + dope))));
9926
9927 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9928 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9929 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9930 gen_rtx_PLUS (Pmode, sr.reg,
9931 stack_pointer_rtx)));
9932
9933
9934 /* Step 3: the loop
9935
9936 while (SP != LAST_ADDR)
9937 {
9938 SP = SP + PROBE_INTERVAL
9939 probe at SP
9940 }
9941
9942 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9943 values of N from 1 until it is equal to ROUNDED_SIZE. */
9944
9945 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9946
9947
9948 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9949 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9950
9951 if (size != rounded_size)
9952 {
9953 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9954 plus_constant (Pmode, stack_pointer_rtx,
9955 rounded_size - size)));
9956 emit_stack_probe (stack_pointer_rtx);
9957 }
9958
9959 /* Adjust back to account for the additional first interval. */
9960 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9961 plus_constant (Pmode, stack_pointer_rtx,
9962 PROBE_INTERVAL + dope)));
9963
9964 release_scratch_register_on_entry (&sr);
9965 }
9966
9967 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9968
9969 /* Even if the stack pointer isn't the CFA register, we need to correctly
9970 describe the adjustments made to it, in particular differentiate the
9971 frame-related ones from the frame-unrelated ones. */
9972 if (size > 0)
9973 {
9974 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9975 XVECEXP (expr, 0, 0)
9976 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9977 plus_constant (Pmode, stack_pointer_rtx, -size));
9978 XVECEXP (expr, 0, 1)
9979 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9980 plus_constant (Pmode, stack_pointer_rtx,
9981 PROBE_INTERVAL + dope + size));
9982 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9983 RTX_FRAME_RELATED_P (last) = 1;
9984
9985 cfun->machine->fs.sp_offset += size;
9986 }
9987
9988 /* Make sure nothing is scheduled before we are done. */
9989 emit_insn (gen_blockage ());
9990 }
9991
9992 /* Adjust the stack pointer up to REG while probing it. */
9993
9994 const char *
9995 output_adjust_stack_and_probe (rtx reg)
9996 {
9997 static int labelno = 0;
9998 char loop_lab[32], end_lab[32];
9999 rtx xops[2];
10000
10001 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10002 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10003
10004 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10005
10006 /* Jump to END_LAB if SP == LAST_ADDR. */
10007 xops[0] = stack_pointer_rtx;
10008 xops[1] = reg;
10009 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10010 fputs ("\tje\t", asm_out_file);
10011 assemble_name_raw (asm_out_file, end_lab);
10012 fputc ('\n', asm_out_file);
10013
10014 /* SP = SP + PROBE_INTERVAL. */
10015 xops[1] = GEN_INT (PROBE_INTERVAL);
10016 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10017
10018 /* Probe at SP. */
10019 xops[1] = const0_rtx;
10020 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10021
10022 fprintf (asm_out_file, "\tjmp\t");
10023 assemble_name_raw (asm_out_file, loop_lab);
10024 fputc ('\n', asm_out_file);
10025
10026 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10027
10028 return "";
10029 }
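
/* A sketch of the loop emitted above (AT&T syntax, assuming a 4096-byte
   probe interval and %eax as the limit register; labels are illustrative):

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:
*/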
10030
10031 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10032 inclusive. These are offsets from the current stack pointer. */
10033
10034 static void
10035 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10036 {
10037 /* See if we have a constant small number of probes to generate. If so,
10038 that's the easy case. The run-time loop is made up of 7 insns in the
10039 generic case while the compile-time loop is made up of n insns for n #
10040 of intervals. */
10041 if (size <= 7 * PROBE_INTERVAL)
10042 {
10043 HOST_WIDE_INT i;
10044
10045 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10046 it exceeds SIZE. If only one probe is needed, this will not
10047 generate any code. Then probe at FIRST + SIZE. */
10048 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10049 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10050 -(first + i)));
10051
10052 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10053 -(first + size)));
10054 }
10055
10056 /* Otherwise, do the same as above, but in a loop. Note that we must be
10057 extra careful with variables wrapping around because we might be at
10058 the very top (or the very bottom) of the address space and we have
10059 to be able to handle this case properly; in particular, we use an
10060 equality test for the loop condition. */
10061 else
10062 {
10063 HOST_WIDE_INT rounded_size, last;
10064 struct scratch_reg sr;
10065
10066 get_scratch_register_on_entry (&sr);
10067
10068
10069 /* Step 1: round SIZE to the previous multiple of the interval. */
10070
10071 rounded_size = size & -PROBE_INTERVAL;
10072
10073
10074 /* Step 2: compute initial and final value of the loop counter. */
10075
10076 /* TEST_OFFSET = FIRST. */
10077 emit_move_insn (sr.reg, GEN_INT (-first));
10078
10079 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10080 last = first + rounded_size;
10081
10082
10083 /* Step 3: the loop
10084
10085 while (TEST_ADDR != LAST_ADDR)
10086 {
10087 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10088 probe at TEST_ADDR
10089 }
10090
10091 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10092 until it is equal to ROUNDED_SIZE. */
10093
10094 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10095
10096
10097 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10098 that SIZE is equal to ROUNDED_SIZE. */
10099
10100 if (size != rounded_size)
10101 emit_stack_probe (plus_constant (Pmode,
10102 gen_rtx_PLUS (Pmode,
10103 stack_pointer_rtx,
10104 sr.reg),
10105 rounded_size - size));
10106
10107 release_scratch_register_on_entry (&sr);
10108 }
10109
10110 /* Make sure nothing is scheduled before we are done. */
10111 emit_insn (gen_blockage ());
10112 }
10113
10114 /* Probe a range of stack addresses from REG to END, inclusive. These are
10115 offsets from the current stack pointer. */
10116
10117 const char *
10118 output_probe_stack_range (rtx reg, rtx end)
10119 {
10120 static int labelno = 0;
10121 char loop_lab[32], end_lab[32];
10122 rtx xops[3];
10123
10124 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10125 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10126
10127 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10128
10129 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10130 xops[0] = reg;
10131 xops[1] = end;
10132 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10133 fputs ("\tje\t", asm_out_file);
10134 assemble_name_raw (asm_out_file, end_lab);
10135 fputc ('\n', asm_out_file);
10136
10137 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10138 xops[1] = GEN_INT (PROBE_INTERVAL);
10139 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10140
10141 /* Probe at TEST_ADDR. */
10142 xops[0] = stack_pointer_rtx;
10143 xops[1] = reg;
10144 xops[2] = const0_rtx;
10145 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10146
10147 fprintf (asm_out_file, "\tjmp\t");
10148 assemble_name_raw (asm_out_file, loop_lab);
10149 fputc ('\n', asm_out_file);
10150
10151 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10152
10153 return "";
10154 }
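
/* Likewise, a sketch of this loop's output (AT&T syntax, assuming a
   4096-byte probe interval, %eax holding the running negative offset and
   LAST the final one; illustrative only):

	.LPSRL1:
		cmpl	LAST, %eax
		je	.LPSRE1
		subl	$4096, %eax
		orl	$0, (%esp,%eax)
		jmp	.LPSRL1
	.LPSRE1:
*/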
10155
10156 /* Finalize the stack_realign_needed flag, which will guide the prologue
10157 and epilogue to be generated in the correct form. */
10158 static void
10159 ix86_finalize_stack_realign_flags (void)
10160 {
10161 /* Check if stack realignment is really needed after reload, and
10162 store the result in cfun. */
10163 unsigned int incoming_stack_boundary
10164 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10165 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10166 unsigned int stack_realign = (incoming_stack_boundary
10167 < (crtl->is_leaf
10168 ? crtl->max_used_stack_slot_alignment
10169 : crtl->stack_alignment_needed));
10170
10171 if (crtl->stack_realign_finalized)
10172 {
10173 /* After stack_realign_needed is finalized, we can no longer
10174 change it. */
10175 gcc_assert (crtl->stack_realign_needed == stack_realign);
10176 return;
10177 }
10178
10179 /* If the only reason for frame_pointer_needed is that we conservatively
10180 assumed stack realignment might be needed, but in the end nothing that
10181 needed the stack alignment had been spilled, clear frame_pointer_needed
10182 and say we don't need stack realignment. */
10183 if (stack_realign
10184 && !crtl->need_drap
10185 && frame_pointer_needed
10186 && crtl->is_leaf
10187 && flag_omit_frame_pointer
10188 && crtl->sp_is_unchanging
10189 && !ix86_current_function_calls_tls_descriptor
10190 && !crtl->accesses_prior_frames
10191 && !cfun->calls_alloca
10192 && !crtl->calls_eh_return
10193 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10194 && !ix86_frame_pointer_required ()
10195 && get_frame_size () == 0
10196 && ix86_nsaved_sseregs () == 0
10197 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10198 {
10199 HARD_REG_SET set_up_by_prologue, prologue_used;
10200 basic_block bb;
10201
10202 CLEAR_HARD_REG_SET (prologue_used);
10203 CLEAR_HARD_REG_SET (set_up_by_prologue);
10204 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10205 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10206 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10207 HARD_FRAME_POINTER_REGNUM);
10208 FOR_EACH_BB (bb)
10209 {
10210 rtx insn;
10211 FOR_BB_INSNS (bb, insn)
10212 if (NONDEBUG_INSN_P (insn)
10213 && requires_stack_frame_p (insn, prologue_used,
10214 set_up_by_prologue))
10215 {
10216 crtl->stack_realign_needed = stack_realign;
10217 crtl->stack_realign_finalized = true;
10218 return;
10219 }
10220 }
10221
10222 frame_pointer_needed = false;
10223 stack_realign = false;
10224 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10225 crtl->stack_alignment_needed = incoming_stack_boundary;
10226 crtl->stack_alignment_estimated = incoming_stack_boundary;
10227 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10228 crtl->preferred_stack_boundary = incoming_stack_boundary;
10229 df_finish_pass (true);
10230 df_scan_alloc (NULL);
10231 df_scan_blocks ();
10232 df_compute_regs_ever_live (true);
10233 df_analyze ();
10234 }
10235
10236 crtl->stack_realign_needed = stack_realign;
10237 crtl->stack_realign_finalized = true;
10238 }
10239
10240 /* Expand the prologue into a bunch of separate insns. */
10241
10242 void
10243 ix86_expand_prologue (void)
10244 {
10245 struct machine_function *m = cfun->machine;
10246 rtx insn, t;
10247 bool pic_reg_used;
10248 struct ix86_frame frame;
10249 HOST_WIDE_INT allocate;
10250 bool int_registers_saved;
10251 bool sse_registers_saved;
10252
10253 ix86_finalize_stack_realign_flags ();
10254
10255 /* DRAP should not coexist with stack_realign_fp */
10256 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10257
10258 memset (&m->fs, 0, sizeof (m->fs));
10259
10260 /* Initialize CFA state for before the prologue. */
10261 m->fs.cfa_reg = stack_pointer_rtx;
10262 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10263
10264 /* Track SP offset to the CFA. We continue tracking this after we've
10265 swapped the CFA register away from SP. In the case of re-alignment
10266 this is fudged; we're interested in offsets within the local frame. */
10267 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10268 m->fs.sp_valid = true;
10269
10270 ix86_compute_frame_layout (&frame);
10271
10272 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10273 {
10274 /* We should have already generated an error for any use of
10275 ms_hook on a nested function. */
10276 gcc_checking_assert (!ix86_static_chain_on_stack);
10277
10278 /* Check if profiling is active and we shall use the profiling-before-
10279 prologue variant. If so, sorry. */
10280 if (crtl->profile && flag_fentry != 0)
10281 sorry ("ms_hook_prologue attribute isn%'t compatible "
10282 "with -mfentry for 32-bit");
10283
10284 /* In ix86_asm_output_function_label we emitted:
10285 8b ff movl.s %edi,%edi
10286 55 push %ebp
10287 8b ec movl.s %esp,%ebp
10288
10289 This matches the hookable function prologue in Win32 API
10290 functions in Microsoft Windows XP Service Pack 2 and newer.
10291 Wine uses this to enable Windows apps to hook the Win32 API
10292 functions provided by Wine.
10293
10294 What that means is that we've already set up the frame pointer. */
10295
10296 if (frame_pointer_needed
10297 && !(crtl->drap_reg && crtl->stack_realign_needed))
10298 {
10299 rtx push, mov;
10300
10301 /* We've decided to use the frame pointer already set up.
10302 Describe this to the unwinder by pretending that both
10303 push and mov insns happen right here.
10304
10305 Putting the unwind info here at the end of the ms_hook
10306 is done so that we can make absolutely certain we get
10307 the required byte sequence at the start of the function,
10308 rather than relying on an assembler that can produce
10309 the exact encoding required.
10310
10311 However it does mean (in the unpatched case) that we have
10312 a 1 insn window where the asynchronous unwind info is
10313 incorrect. However, if we placed the unwind info at
10314 its correct location we would have incorrect unwind info
10315 in the patched case. This is probably all moot, since
10316 I don't expect Wine to generate dwarf2 unwind info for the
10317 system libraries that use this feature. */
10318
10319 insn = emit_insn (gen_blockage ());
10320
10321 push = gen_push (hard_frame_pointer_rtx);
10322 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10323 stack_pointer_rtx);
10324 RTX_FRAME_RELATED_P (push) = 1;
10325 RTX_FRAME_RELATED_P (mov) = 1;
10326
10327 RTX_FRAME_RELATED_P (insn) = 1;
10328 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10329 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10330
10331 /* Note that gen_push incremented m->fs.cfa_offset, even
10332 though we didn't emit the push insn here. */
10333 m->fs.cfa_reg = hard_frame_pointer_rtx;
10334 m->fs.fp_offset = m->fs.cfa_offset;
10335 m->fs.fp_valid = true;
10336 }
10337 else
10338 {
10339 /* The frame pointer is not needed so pop %ebp again.
10340 This leaves us with a pristine state. */
10341 emit_insn (gen_pop (hard_frame_pointer_rtx));
10342 }
10343 }
10344
10345 /* The first insn of a function that accepts its static chain on the
10346 stack is to push the register that would be filled in by a direct
10347 call. This insn will be skipped by the trampoline. */
10348 else if (ix86_static_chain_on_stack)
10349 {
10350 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10351 emit_insn (gen_blockage ());
10352
10353 /* We don't want to interpret this push insn as a register save,
10354 only as a stack adjustment. The real copy of the register as
10355 a save will be done later, if needed. */
10356 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10357 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10358 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10359 RTX_FRAME_RELATED_P (insn) = 1;
10360 }
10361
10362 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10363 DRAP is needed and stack realignment is really needed after reload. */
10364 if (stack_realign_drap)
10365 {
10366 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10367
10368 /* Only need to push the parameter pointer reg if it is callee-saved. */
10369 if (!call_used_regs[REGNO (crtl->drap_reg)])
10370 {
10371 /* Push arg pointer reg */
10372 insn = emit_insn (gen_push (crtl->drap_reg));
10373 RTX_FRAME_RELATED_P (insn) = 1;
10374 }
10375
10376 /* Grab the argument pointer. */
10377 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10378 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10379 RTX_FRAME_RELATED_P (insn) = 1;
10380 m->fs.cfa_reg = crtl->drap_reg;
10381 m->fs.cfa_offset = 0;
10382
10383 /* Align the stack. */
10384 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10385 stack_pointer_rtx,
10386 GEN_INT (-align_bytes)));
10387 RTX_FRAME_RELATED_P (insn) = 1;
10388
10389 /* Replicate the return address on the stack so that the return
10390 address can be reached via the (argp - 1) slot. This is needed
10391 to implement the RETURN_ADDR_RTX macro, the intrinsic function
10392 expand_builtin_return_addr, etc. */
10393 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10394 t = gen_frame_mem (word_mode, t);
10395 insn = emit_insn (gen_push (t));
10396 RTX_FRAME_RELATED_P (insn) = 1;
10397
10398 /* For the purposes of frame and register save area addressing,
10399 we've started over with a new frame. */
10400 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10401 m->fs.realigned = true;
10402 }
10403
10404 int_registers_saved = (frame.nregs == 0);
10405 sse_registers_saved = (frame.nsseregs == 0);
10406
10407 if (frame_pointer_needed && !m->fs.fp_valid)
10408 {
10409 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10410 slower on all targets. Also sdb doesn't like it. */
10411 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10412 RTX_FRAME_RELATED_P (insn) = 1;
10413
10414 /* Push registers now, before setting the frame pointer
10415 on SEH target. */
10416 if (!int_registers_saved
10417 && TARGET_SEH
10418 && !frame.save_regs_using_mov)
10419 {
10420 ix86_emit_save_regs ();
10421 int_registers_saved = true;
10422 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10423 }
10424
10425 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10426 {
10427 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10428 RTX_FRAME_RELATED_P (insn) = 1;
10429
10430 if (m->fs.cfa_reg == stack_pointer_rtx)
10431 m->fs.cfa_reg = hard_frame_pointer_rtx;
10432 m->fs.fp_offset = m->fs.sp_offset;
10433 m->fs.fp_valid = true;
10434 }
10435 }
10436
10437 if (!int_registers_saved)
10438 {
10439 /* If saving registers via PUSH, do so now. */
10440 if (!frame.save_regs_using_mov)
10441 {
10442 ix86_emit_save_regs ();
10443 int_registers_saved = true;
10444 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10445 }
10446
10447 /* When using the red zone we may start register saving before allocating
10448 the stack frame, saving one cycle of the prologue. However, avoid
10449 doing this if we have to probe the stack; at least on x86_64 the
10450 stack probe can turn into a call that clobbers a red zone location. */
10451 else if (ix86_using_red_zone ()
10452 && (! TARGET_STACK_PROBE
10453 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10454 {
10455 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10456 int_registers_saved = true;
10457 }
10458 }
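/* For reference: the x86-64 psABI defines a 128-byte red zone below the
   stack pointer that may be written without adjusting %rsp first, which
   is what allows the mov-based register saves above to precede the frame
   allocation.  */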
10459
10460 if (stack_realign_fp)
10461 {
10462 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10463 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10464
10465 /* The computation of the size of the re-aligned stack frame means
10466 that we must allocate the size of the register save area before
10467 performing the actual alignment. Otherwise we cannot guarantee
10468 that there's enough storage above the realignment point. */
10469 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10470 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10471 GEN_INT (m->fs.sp_offset
10472 - frame.sse_reg_save_offset),
10473 -1, false);
10474
10475 /* Align the stack. */
10476 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10477 stack_pointer_rtx,
10478 GEN_INT (-align_bytes)));
10479
10480 /* For the purposes of register save area addressing, the stack
10481 pointer is no longer valid. As for the value of sp_offset,
10482 see ix86_compute_frame_layout, which we need to match in order
10483 to pass verification of stack_pointer_offset at the end. */
10484 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10485 m->fs.sp_valid = false;
10486 }
10487
10488 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10489
10490 if (flag_stack_usage_info)
10491 {
10492 /* We start to count from ARG_POINTER. */
10493 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10494
10495 /* If it was realigned, take into account the fake frame. */
10496 if (stack_realign_drap)
10497 {
10498 if (ix86_static_chain_on_stack)
10499 stack_size += UNITS_PER_WORD;
10500
10501 if (!call_used_regs[REGNO (crtl->drap_reg)])
10502 stack_size += UNITS_PER_WORD;
10503
10504 /* This over-estimates by 1 minimal-stack-alignment-unit but
10505 mitigates that by counting in the new return address slot. */
10506 current_function_dynamic_stack_size
10507 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10508 }
10509
10510 current_function_static_stack_size = stack_size;
10511 }
10512
10513 /* On SEH targets with a very large frame size, allocate an area to save
10514 SSE registers (as the very large allocation won't be described). */
10515 if (TARGET_SEH
10516 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10517 && !sse_registers_saved)
10518 {
10519 HOST_WIDE_INT sse_size =
10520 frame.sse_reg_save_offset - frame.reg_save_offset;
10521
10522 gcc_assert (int_registers_saved);
10523
10524 /* No need to do stack checking as the area will be immediately
10525 written. */
10526 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10527 GEN_INT (-sse_size), -1,
10528 m->fs.cfa_reg == stack_pointer_rtx);
10529 allocate -= sse_size;
10530 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10531 sse_registers_saved = true;
10532 }
10533
10534 /* The stack has already been decremented by the instruction calling us,
10535 so probe if the size is non-negative to preserve the protection area. */
10536 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10537 {
10538 /* We expect the registers to be saved when probes are used. */
10539 gcc_assert (int_registers_saved);
10540
10541 if (STACK_CHECK_MOVING_SP)
10542 {
10543 ix86_adjust_stack_and_probe (allocate);
10544 allocate = 0;
10545 }
10546 else
10547 {
10548 HOST_WIDE_INT size = allocate;
10549
10550 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10551 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10552
10553 if (TARGET_STACK_PROBE)
10554 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10555 else
10556 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10557 }
10558 }
10559
10560 if (allocate == 0)
10561 ;
10562 else if (!ix86_target_stack_probe ()
10563 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10564 {
10565 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10566 GEN_INT (-allocate), -1,
10567 m->fs.cfa_reg == stack_pointer_rtx);
10568 }
10569 else
10570 {
10571 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10572 rtx r10 = NULL;
10573 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10574
10575 bool eax_live = false;
10576 bool r10_live = false;
10577
10578 if (TARGET_64BIT)
10579 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10580 if (!TARGET_64BIT_MS_ABI)
10581 eax_live = ix86_eax_live_at_start_p ();
10582
10583 if (eax_live)
10584 {
10585 emit_insn (gen_push (eax));
10586 allocate -= UNITS_PER_WORD;
10587 }
10588 if (r10_live)
10589 {
10590 r10 = gen_rtx_REG (Pmode, R10_REG);
10591 emit_insn (gen_push (r10));
10592 allocate -= UNITS_PER_WORD;
10593 }
10594
10595 emit_move_insn (eax, GEN_INT (allocate));
10596 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
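/* A note on the call above: on targets where ix86_target_stack_probe ()
   is true (e.g. Windows), the allocate_stack_worker pattern is expected
   to expand to a __chkstk-style probe routine that takes the size in AX
   and touches each page of the new area without adjusting the stack
   pointer itself; the explicit subtraction below then reuses the value
   left in AX.  */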
10597
10598 /* Use the fact that AX still contains ALLOCATE. */
10599 adjust_stack_insn = (Pmode == DImode
10600 ? gen_pro_epilogue_adjust_stack_di_sub
10601 : gen_pro_epilogue_adjust_stack_si_sub);
10602
10603 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10604 stack_pointer_rtx, eax));
10605
10606 /* Note that SEH directives need to continue tracking the stack
10607 pointer even after the frame pointer has been set up. */
10608 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10609 {
10610 if (m->fs.cfa_reg == stack_pointer_rtx)
10611 m->fs.cfa_offset += allocate;
10612
10613 RTX_FRAME_RELATED_P (insn) = 1;
10614 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10615 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10616 plus_constant (Pmode, stack_pointer_rtx,
10617 -allocate)));
10618 }
10619 m->fs.sp_offset += allocate;
10620
10621 if (r10_live && eax_live)
10622 {
10623 t = choose_baseaddr (m->fs.sp_offset - allocate);
10624 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10625 gen_frame_mem (word_mode, t));
10626 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10627 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10628 gen_frame_mem (word_mode, t));
10629 }
10630 else if (eax_live || r10_live)
10631 {
10632 t = choose_baseaddr (m->fs.sp_offset - allocate);
10633 emit_move_insn (gen_rtx_REG (word_mode,
10634 (eax_live ? AX_REG : R10_REG)),
10635 gen_frame_mem (word_mode, t));
10636 }
10637 }
10638 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10639
10640 /* If we haven't already set up the frame pointer, do so now. */
10641 if (frame_pointer_needed && !m->fs.fp_valid)
10642 {
10643 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10644 GEN_INT (frame.stack_pointer_offset
10645 - frame.hard_frame_pointer_offset));
10646 insn = emit_insn (insn);
10647 RTX_FRAME_RELATED_P (insn) = 1;
10648 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10649
10650 if (m->fs.cfa_reg == stack_pointer_rtx)
10651 m->fs.cfa_reg = hard_frame_pointer_rtx;
10652 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10653 m->fs.fp_valid = true;
10654 }
10655
10656 if (!int_registers_saved)
10657 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10658 if (!sse_registers_saved)
10659 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10660
10661 pic_reg_used = false;
10662 if (pic_offset_table_rtx
10663 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10664 || crtl->profile))
10665 {
10666 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10667
10668 if (alt_pic_reg_used != INVALID_REGNUM)
10669 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10670
10671 pic_reg_used = true;
10672 }
10673
10674 if (pic_reg_used)
10675 {
10676 if (TARGET_64BIT)
10677 {
10678 if (ix86_cmodel == CM_LARGE_PIC)
10679 {
10680 rtx label, tmp_reg;
10681
10682 gcc_assert (Pmode == DImode);
10683 label = gen_label_rtx ();
10684 emit_label (label);
10685 LABEL_PRESERVE_P (label) = 1;
10686 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10687 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10688 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10689 label));
10690 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10691 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10692 pic_offset_table_rtx, tmp_reg));
10693 }
10694 else
10695 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10696 }
10697 else
10698 {
10699 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10700 RTX_FRAME_RELATED_P (insn) = 1;
10701 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10702 }
10703 }
10704
10705 /* In the pic_reg_used case, make sure that the got load isn't deleted
10706 when mcount needs it. The blockage to avoid call movement across the
10707 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10708 note. */
10709 if (crtl->profile && !flag_fentry && pic_reg_used)
10710 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10711
10712 if (crtl->drap_reg && !crtl->stack_realign_needed)
10713 {
10714 /* vDRAP is set up, but after reload it turns out stack realignment
10715 isn't necessary; here we emit the prologue to set up DRAP
10716 without the stack realignment adjustment. */
10717 t = choose_baseaddr (0);
10718 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10719 }
10720
10721 /* Prevent instructions from being scheduled into the register save push
10722 sequence when access to the red zone area is done through the frame pointer.
10723 The offset between the frame pointer and the stack pointer is calculated
10724 relative to the value of the stack pointer at the end of the function
10725 prologue, and moving instructions that access the red zone area via the frame
10726 pointer inside the push sequence violates this assumption. */
10727 if (frame_pointer_needed && frame.red_zone_size)
10728 emit_insn (gen_memory_blockage ());
10729
10730 /* Emit cld instruction if stringops are used in the function. */
10731 if (TARGET_CLD && ix86_current_function_needs_cld)
10732 emit_insn (gen_cld ());
10733
10734 /* SEH requires that the prologue end within 256 bytes of the start of
10735 the function. Prevent instruction schedules that would extend that.
10736 Further, prevent alloca modifications to the stack pointer from being
10737 combined with prologue modifications. */
10738 if (TARGET_SEH)
10739 emit_insn (gen_prologue_use (stack_pointer_rtx));
10740 }
10741
10742 /* Emit code to restore REG using a POP insn. */
10743
10744 static void
10745 ix86_emit_restore_reg_using_pop (rtx reg)
10746 {
10747 struct machine_function *m = cfun->machine;
10748 rtx insn = emit_insn (gen_pop (reg));
10749
10750 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10751 m->fs.sp_offset -= UNITS_PER_WORD;
10752
10753 if (m->fs.cfa_reg == crtl->drap_reg
10754 && REGNO (reg) == REGNO (crtl->drap_reg))
10755 {
10756 /* Previously we'd represented the CFA as an expression
10757 like *(%ebp - 8). We've just popped that value from
10758 the stack, which means we need to reset the CFA to
10759 the drap register. This will remain until we restore
10760 the stack pointer. */
10761 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10762 RTX_FRAME_RELATED_P (insn) = 1;
10763
10764 /* This means that the DRAP register is valid for addressing too. */
10765 m->fs.drap_valid = true;
10766 return;
10767 }
10768
10769 if (m->fs.cfa_reg == stack_pointer_rtx)
10770 {
10771 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10772 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10773 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10774 RTX_FRAME_RELATED_P (insn) = 1;
10775
10776 m->fs.cfa_offset -= UNITS_PER_WORD;
10777 }
10778
10779 /* When the frame pointer is the CFA, and we pop it, we are
10780 swapping back to the stack pointer as the CFA. This happens
10781 for stack frames that don't allocate other data, so we assume
10782 the stack pointer is now pointing at the return address, i.e.
10783 the function entry state, which makes the offset be 1 word. */
10784 if (reg == hard_frame_pointer_rtx)
10785 {
10786 m->fs.fp_valid = false;
10787 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10788 {
10789 m->fs.cfa_reg = stack_pointer_rtx;
10790 m->fs.cfa_offset -= UNITS_PER_WORD;
10791
10792 add_reg_note (insn, REG_CFA_DEF_CFA,
10793 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10794 GEN_INT (m->fs.cfa_offset)));
10795 RTX_FRAME_RELATED_P (insn) = 1;
10796 }
10797 }
10798 }
10799
10800 /* Emit code to restore saved registers using POP insns. */
10801
10802 static void
10803 ix86_emit_restore_regs_using_pop (void)
10804 {
10805 unsigned int regno;
10806
10807 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10808 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10809 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10810 }
10811
10812 /* Emit code and notes for the LEAVE instruction. */
10813
10814 static void
10815 ix86_emit_leave (void)
10816 {
10817 struct machine_function *m = cfun->machine;
10818 rtx insn = emit_insn (ix86_gen_leave ());
10819
10820 ix86_add_queued_cfa_restore_notes (insn);
10821
10822 gcc_assert (m->fs.fp_valid);
10823 m->fs.sp_valid = true;
10824 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10825 m->fs.fp_valid = false;
10826
10827 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10828 {
10829 m->fs.cfa_reg = stack_pointer_rtx;
10830 m->fs.cfa_offset = m->fs.sp_offset;
10831
10832 add_reg_note (insn, REG_CFA_DEF_CFA,
10833 plus_constant (Pmode, stack_pointer_rtx,
10834 m->fs.sp_offset));
10835 RTX_FRAME_RELATED_P (insn) = 1;
10836 }
10837 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10838 m->fs.fp_offset);
10839 }
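/* For reference: leave is equivalent to moving the frame pointer into the
   stack pointer and then popping the frame pointer (mov %rbp, %rsp;
   pop %rbp in 64-bit mode), which is why sp_offset becomes fp_offset
   minus UNITS_PER_WORD above and the frame pointer stops being valid.  */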
10840
10841 /* Emit code to restore saved registers using MOV insns.
10842 First register is restored from CFA - CFA_OFFSET. */
10843 static void
10844 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10845 bool maybe_eh_return)
10846 {
10847 struct machine_function *m = cfun->machine;
10848 unsigned int regno;
10849
10850 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10851 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10852 {
10853 rtx reg = gen_rtx_REG (word_mode, regno);
10854 rtx insn, mem;
10855
10856 mem = choose_baseaddr (cfa_offset);
10857 mem = gen_frame_mem (word_mode, mem);
10858 insn = emit_move_insn (reg, mem);
10859
10860 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10861 {
10862 /* Previously we'd represented the CFA as an expression
10863 like *(%ebp - 8). We've just loaded that value from
10864 the stack, which means we need to reset the CFA to
10865 the drap register. This will remain until we restore
10866 the stack pointer. */
10867 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10868 RTX_FRAME_RELATED_P (insn) = 1;
10869
10870 /* This means that the DRAP register is valid for addressing. */
10871 m->fs.drap_valid = true;
10872 }
10873 else
10874 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10875
10876 cfa_offset -= UNITS_PER_WORD;
10877 }
10878 }
10879
10880 /* Emit code to restore saved SSE registers using MOV insns.
10881 First register is restored from CFA - CFA_OFFSET. */
10882 static void
10883 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10884 bool maybe_eh_return)
10885 {
10886 unsigned int regno;
10887
10888 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10889 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10890 {
10891 rtx reg = gen_rtx_REG (V4SFmode, regno);
10892 rtx mem;
10893
10894 mem = choose_baseaddr (cfa_offset);
10895 mem = gen_rtx_MEM (V4SFmode, mem);
10896 set_mem_align (mem, 128);
10897 emit_move_insn (reg, mem);
10898
10899 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10900
10901 cfa_offset -= 16;
10902 }
10903 }
10904
10905 /* Emit vzeroupper if needed. */
10906
10907 void
10908 ix86_maybe_emit_epilogue_vzeroupper (void)
10909 {
10910 if (TARGET_VZEROUPPER
10911 && !TREE_THIS_VOLATILE (cfun->decl)
10912 && !cfun->machine->caller_return_avx256_p)
10913 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10914 }
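/* For reference: vzeroupper clears the upper 128 bits of all ymm
   registers, which avoids the AVX/SSE transition penalty when the caller
   executes legacy SSE code after the call.  It is skipped when the caller
   expects a 256-bit AVX value in the return register, since vzeroupper
   would destroy that value.  */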
10915
10916 /* Restore function stack, frame, and registers. */
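/* STYLE appears to be 1 for a normal epilogue, 0 for a sibcall epilogue
   (no return insn is emitted; see the style == 0 check below), and 2 for
   an eh_return epilogue (see the style == 2 checks below); this reading
   is inferred from the uses within this function.  */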
10917
10918 void
10919 ix86_expand_epilogue (int style)
10920 {
10921 struct machine_function *m = cfun->machine;
10922 struct machine_frame_state frame_state_save = m->fs;
10923 struct ix86_frame frame;
10924 bool restore_regs_via_mov;
10925 bool using_drap;
10926
10927 ix86_finalize_stack_realign_flags ();
10928 ix86_compute_frame_layout (&frame);
10929
10930 m->fs.sp_valid = (!frame_pointer_needed
10931 || (crtl->sp_is_unchanging
10932 && !stack_realign_fp));
10933 gcc_assert (!m->fs.sp_valid
10934 || m->fs.sp_offset == frame.stack_pointer_offset);
10935
10936 /* The FP must be valid if the frame pointer is present. */
10937 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10938 gcc_assert (!m->fs.fp_valid
10939 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10940
10941 /* We must have *some* valid pointer to the stack frame. */
10942 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10943
10944 /* The DRAP is never valid at this point. */
10945 gcc_assert (!m->fs.drap_valid);
10946
10947 /* See the comment about red zone and frame
10948 pointer usage in ix86_expand_prologue. */
10949 if (frame_pointer_needed && frame.red_zone_size)
10950 emit_insn (gen_memory_blockage ());
10951
10952 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10953 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10954
10955 /* Determine the CFA offset of the end of the red-zone. */
10956 m->fs.red_zone_offset = 0;
10957 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10958 {
10959 /* The red-zone begins below the return address. */
10960 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10961
10962 /* When the register save area is in the aligned portion of
10963 the stack, determine the maximum runtime displacement that
10964 matches up with the aligned frame. */
10965 if (stack_realign_drap)
10966 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10967 + UNITS_PER_WORD);
10968 }
10969
10970 /* Special care must be taken for the normal return case of a function
10971 using eh_return: the eax and edx registers are marked as saved, but
10972 not restored along this path. Adjust the save location to match. */
10973 if (crtl->calls_eh_return && style != 2)
10974 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10975
10976 /* EH_RETURN requires the use of moves to function properly. */
10977 if (crtl->calls_eh_return)
10978 restore_regs_via_mov = true;
10979 /* SEH requires the use of pops to identify the epilogue. */
10980 else if (TARGET_SEH)
10981 restore_regs_via_mov = false;
10982 /* If we're only restoring one register and sp is not valid, then
10983 use a move instruction to restore the register, since it's
10984 less work than reloading sp and popping the register. */
10985 else if (!m->fs.sp_valid && frame.nregs <= 1)
10986 restore_regs_via_mov = true;
10987 else if (TARGET_EPILOGUE_USING_MOVE
10988 && cfun->machine->use_fast_prologue_epilogue
10989 && (frame.nregs > 1
10990 || m->fs.sp_offset != frame.reg_save_offset))
10991 restore_regs_via_mov = true;
10992 else if (frame_pointer_needed
10993 && !frame.nregs
10994 && m->fs.sp_offset != frame.reg_save_offset)
10995 restore_regs_via_mov = true;
10996 else if (frame_pointer_needed
10997 && TARGET_USE_LEAVE
10998 && cfun->machine->use_fast_prologue_epilogue
10999 && frame.nregs == 1)
11000 restore_regs_via_mov = true;
11001 else
11002 restore_regs_via_mov = false;
11003
11004 if (restore_regs_via_mov || frame.nsseregs)
11005 {
11006 /* Ensure that the entire register save area is addressable via
11007 the stack pointer, if we will restore via sp. */
11008 if (TARGET_64BIT
11009 && m->fs.sp_offset > 0x7fffffff
11010 && !(m->fs.fp_valid || m->fs.drap_valid)
11011 && (frame.nsseregs + frame.nregs) != 0)
11012 {
11013 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11014 GEN_INT (m->fs.sp_offset
11015 - frame.sse_reg_save_offset),
11016 style,
11017 m->fs.cfa_reg == stack_pointer_rtx);
11018 }
11019 }
11020
11021 /* If there are any SSE registers to restore, then we have to do it
11022 via moves, since there's obviously no pop for SSE regs. */
11023 if (frame.nsseregs)
11024 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11025 style == 2);
11026
11027 if (restore_regs_via_mov)
11028 {
11029 rtx t;
11030
11031 if (frame.nregs)
11032 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11033
11034 /* eh_return epilogues need %ecx added to the stack pointer. */
11035 if (style == 2)
11036 {
11037 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11038
11039 /* Stack align doesn't work with eh_return. */
11040 gcc_assert (!stack_realign_drap);
11041 /* Neither do regparm nested functions. */
11042 gcc_assert (!ix86_static_chain_on_stack);
11043
11044 if (frame_pointer_needed)
11045 {
11046 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11047 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11048 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11049
11050 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11051 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11052
11053 /* Note that we use SA as a temporary CFA, as the return
11054 address is at the proper place relative to it. We
11055 pretend this happens at the FP restore insn because
11056 prior to this insn the FP would be stored at the wrong
11057 offset relative to SA, and after this insn we have no
11058 other reasonable register to use for the CFA. We don't
11059 bother resetting the CFA to the SP for the duration of
11060 the return insn. */
11061 add_reg_note (insn, REG_CFA_DEF_CFA,
11062 plus_constant (Pmode, sa, UNITS_PER_WORD));
11063 ix86_add_queued_cfa_restore_notes (insn);
11064 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11065 RTX_FRAME_RELATED_P (insn) = 1;
11066
11067 m->fs.cfa_reg = sa;
11068 m->fs.cfa_offset = UNITS_PER_WORD;
11069 m->fs.fp_valid = false;
11070
11071 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11072 const0_rtx, style, false);
11073 }
11074 else
11075 {
11076 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11077 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11078 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11079 ix86_add_queued_cfa_restore_notes (insn);
11080
11081 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11082 if (m->fs.cfa_offset != UNITS_PER_WORD)
11083 {
11084 m->fs.cfa_offset = UNITS_PER_WORD;
11085 add_reg_note (insn, REG_CFA_DEF_CFA,
11086 plus_constant (Pmode, stack_pointer_rtx,
11087 UNITS_PER_WORD));
11088 RTX_FRAME_RELATED_P (insn) = 1;
11089 }
11090 }
11091 m->fs.sp_offset = UNITS_PER_WORD;
11092 m->fs.sp_valid = true;
11093 }
11094 }
11095 else
11096 {
11097 /* SEH requires that the function end with (1) a stack adjustment
11098 if necessary, (2) a sequence of pops, and (3) a return or
11099 jump instruction. Prevent insns from the function body from
11100 being scheduled into this sequence. */
11101 if (TARGET_SEH)
11102 {
11103 /* Prevent a catch region from being adjacent to the standard
11104 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11105 several other flags that would be interesting to test are
11106 set up yet. */
11107 if (flag_non_call_exceptions)
11108 emit_insn (gen_nops (const1_rtx));
11109 else
11110 emit_insn (gen_blockage ());
11111 }
11112
11113 /* The first step is to deallocate the stack frame so that we can
11114 pop the registers. Also do it on SEH targets for a very large
11115 frame, as the emitted instructions aren't allowed by the ABI in
11116 epilogues. */
11117 if (!m->fs.sp_valid
11118 || (TARGET_SEH
11119 && (m->fs.sp_offset - frame.reg_save_offset
11120 >= SEH_MAX_FRAME_SIZE)))
11121 {
11122 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11123 GEN_INT (m->fs.fp_offset
11124 - frame.reg_save_offset),
11125 style, false);
11126 }
11127 else if (m->fs.sp_offset != frame.reg_save_offset)
11128 {
11129 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11130 GEN_INT (m->fs.sp_offset
11131 - frame.reg_save_offset),
11132 style,
11133 m->fs.cfa_reg == stack_pointer_rtx);
11134 }
11135
11136 ix86_emit_restore_regs_using_pop ();
11137 }
11138
11139 /* If we used a frame pointer and haven't already got rid of it,
11140 then do so now. */
11141 if (m->fs.fp_valid)
11142 {
11143 /* If the stack pointer is valid and pointing at the frame
11144 pointer store address, then we only need a pop. */
11145 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11146 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11147 /* The leave insn results in shorter dependency chains on CPUs that
11148 are able to grok it fast. */
11149 else if (TARGET_USE_LEAVE
11150 || optimize_function_for_size_p (cfun)
11151 || !cfun->machine->use_fast_prologue_epilogue)
11152 ix86_emit_leave ();
11153 else
11154 {
11155 pro_epilogue_adjust_stack (stack_pointer_rtx,
11156 hard_frame_pointer_rtx,
11157 const0_rtx, style, !using_drap);
11158 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11159 }
11160 }
11161
11162 if (using_drap)
11163 {
11164 int param_ptr_offset = UNITS_PER_WORD;
11165 rtx insn;
11166
11167 gcc_assert (stack_realign_drap);
11168
11169 if (ix86_static_chain_on_stack)
11170 param_ptr_offset += UNITS_PER_WORD;
11171 if (!call_used_regs[REGNO (crtl->drap_reg)])
11172 param_ptr_offset += UNITS_PER_WORD;
11173
11174 insn = emit_insn (gen_rtx_SET
11175 (VOIDmode, stack_pointer_rtx,
11176 gen_rtx_PLUS (Pmode,
11177 crtl->drap_reg,
11178 GEN_INT (-param_ptr_offset))));
11179 m->fs.cfa_reg = stack_pointer_rtx;
11180 m->fs.cfa_offset = param_ptr_offset;
11181 m->fs.sp_offset = param_ptr_offset;
11182 m->fs.realigned = false;
11183
11184 add_reg_note (insn, REG_CFA_DEF_CFA,
11185 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11186 GEN_INT (param_ptr_offset)));
11187 RTX_FRAME_RELATED_P (insn) = 1;
11188
11189 if (!call_used_regs[REGNO (crtl->drap_reg)])
11190 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11191 }
11192
11193 /* At this point the stack pointer must be valid, and we must have
11194 restored all of the registers. We may not have deallocated the
11195 entire stack frame. We've delayed this until now because it may
11196 be possible to merge the local stack deallocation with the
11197 deallocation forced by ix86_static_chain_on_stack. */
11198 gcc_assert (m->fs.sp_valid);
11199 gcc_assert (!m->fs.fp_valid);
11200 gcc_assert (!m->fs.realigned);
11201 if (m->fs.sp_offset != UNITS_PER_WORD)
11202 {
11203 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11204 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11205 style, true);
11206 }
11207 else
11208 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11209
11210 /* Sibcall epilogues don't want a return instruction. */
11211 if (style == 0)
11212 {
11213 m->fs = frame_state_save;
11214 return;
11215 }
11216
11217 /* Emit vzeroupper if needed. */
11218 ix86_maybe_emit_epilogue_vzeroupper ();
11219
11220 if (crtl->args.pops_args && crtl->args.size)
11221 {
11222 rtx popc = GEN_INT (crtl->args.pops_args);
11223
11224 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11225 address, do an explicit add, and jump indirectly to the caller. */
11226
11227 if (crtl->args.pops_args >= 65536)
11228 {
11229 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11230 rtx insn;
11231
11232 /* There is no "pascal" calling convention in any 64bit ABI. */
11233 gcc_assert (!TARGET_64BIT);
11234
11235 insn = emit_insn (gen_pop (ecx));
11236 m->fs.cfa_offset -= UNITS_PER_WORD;
11237 m->fs.sp_offset -= UNITS_PER_WORD;
11238
11239 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11240 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11241 add_reg_note (insn, REG_CFA_REGISTER,
11242 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11243 RTX_FRAME_RELATED_P (insn) = 1;
11244
11245 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11246 popc, -1, true);
11247 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11248 }
11249 else
11250 emit_jump_insn (gen_simple_return_pop_internal (popc));
11251 }
11252 else
11253 emit_jump_insn (gen_simple_return_internal ());
11254
11255 /* Restore the state back to the state from the prologue,
11256 so that it's correct for the next epilogue. */
11257 m->fs = frame_state_save;
11258 }
11259
11260 /* Reset from the function's potential modifications. */
11261
11262 static void
11263 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11264 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11265 {
11266 if (pic_offset_table_rtx)
11267 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11268 #if TARGET_MACHO
11269 /* Mach-O doesn't support labels at the end of objects, so if
11270 it looks like we might want one, insert a NOP. */
11271 {
11272 rtx insn = get_last_insn ();
11273 rtx deleted_debug_label = NULL_RTX;
11274 while (insn
11275 && NOTE_P (insn)
11276 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11277 {
11278 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11279 notes only; instead set their CODE_LABEL_NUMBER to -1, as
11280 otherwise there would be code generation differences
11281 between -g and -g0. */
11282 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11283 deleted_debug_label = insn;
11284 insn = PREV_INSN (insn);
11285 }
11286 if (insn
11287 && (LABEL_P (insn)
11288 || (NOTE_P (insn)
11289 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11290 fputs ("\tnop\n", file);
11291 else if (deleted_debug_label)
11292 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11293 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11294 CODE_LABEL_NUMBER (insn) = -1;
11295 }
11296 #endif
11297
11298 }
11299
11300 /* Return a scratch register to use in the split stack prologue. The
11301 split stack prologue is used for -fsplit-stack. It consists of the first
11302 instructions in the function, even before the regular prologue.
11303 The scratch register can be any caller-saved register which is not
11304 used for parameters or for the static chain. */
11305
11306 static unsigned int
11307 split_stack_prologue_scratch_regno (void)
11308 {
11309 if (TARGET_64BIT)
11310 return R11_REG;
11311 else
11312 {
11313 bool is_fastcall;
11314 int regparm;
11315
11316 is_fastcall = (lookup_attribute ("fastcall",
11317 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11318 != NULL);
11319 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11320
11321 if (is_fastcall)
11322 {
11323 if (DECL_STATIC_CHAIN (cfun->decl))
11324 {
11325 sorry ("-fsplit-stack does not support fastcall with "
11326 "nested function");
11327 return INVALID_REGNUM;
11328 }
11329 return AX_REG;
11330 }
11331 else if (regparm < 3)
11332 {
11333 if (!DECL_STATIC_CHAIN (cfun->decl))
11334 return CX_REG;
11335 else
11336 {
11337 if (regparm >= 2)
11338 {
11339 sorry ("-fsplit-stack does not support 2 register "
11340 " parameters for a nested function");
11341 return INVALID_REGNUM;
11342 }
11343 return DX_REG;
11344 }
11345 }
11346 else
11347 {
11348 /* FIXME: We could make this work by pushing a register
11349 around the addition and comparison. */
11350 sorry ("-fsplit-stack does not support 3 register parameters");
11351 return INVALID_REGNUM;
11352 }
11353 }
11354 }
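/* A sketch of the 32-bit constraints behind the choices above: fastcall
   passes arguments in %ecx and %edx and (presumably) keeps the static
   chain in %eax, so %eax is only free for non-nested functions; with
   regparm, arguments use %eax, %edx and %ecx in that order and the static
   chain normally lives in %ecx, which drives the fallbacks to %ecx and
   then %edx.  */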
11355
11356 /* A SYMBOL_REF for the function which allocates new stack space for
11357 -fsplit-stack. */
11358
11359 static GTY(()) rtx split_stack_fn;
11360
11361 /* A SYMBOL_REF for the more stack function when using the large
11362 model. */
11363
11364 static GTY(()) rtx split_stack_fn_large;
11365
11366 /* Handle -fsplit-stack. These are the first instructions in the
11367 function, even before the regular prologue. */
11368
11369 void
11370 ix86_expand_split_stack_prologue (void)
11371 {
11372 struct ix86_frame frame;
11373 HOST_WIDE_INT allocate;
11374 unsigned HOST_WIDE_INT args_size;
11375 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11376 rtx scratch_reg = NULL_RTX;
11377 rtx varargs_label = NULL_RTX;
11378 rtx fn;
11379
11380 gcc_assert (flag_split_stack && reload_completed);
11381
11382 ix86_finalize_stack_realign_flags ();
11383 ix86_compute_frame_layout (&frame);
11384 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11385
11386 /* This is the label we will branch to if we have enough stack
11387 space. We expect the basic block reordering pass to reverse this
11388 branch if optimizing, so that we branch in the unlikely case. */
11389 label = gen_label_rtx ();
11390
11391 /* We need to compare the stack pointer minus the frame size with
11392 the stack boundary in the TCB. The stack boundary always gives
11393 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11394 can compare directly. Otherwise we need to do an addition. */
11395
11396 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11397 UNSPEC_STACK_CHECK);
11398 limit = gen_rtx_CONST (Pmode, limit);
11399 limit = gen_rtx_MEM (Pmode, limit);
11400 if (allocate < SPLIT_STACK_AVAILABLE)
11401 current = stack_pointer_rtx;
11402 else
11403 {
11404 unsigned int scratch_regno;
11405 rtx offset;
11406
11407 /* We need a scratch register to hold the stack pointer minus
11408 the required frame size. Since this is the very start of the
11409 function, the scratch register can be any caller-saved
11410 register which is not used for parameters. */
11411 offset = GEN_INT (- allocate);
11412 scratch_regno = split_stack_prologue_scratch_regno ();
11413 if (scratch_regno == INVALID_REGNUM)
11414 return;
11415 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11416 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11417 {
11418 /* We don't use ix86_gen_add3 in this case because it will
11419 want to split to lea, but when not optimizing the insn
11420 will not be split after this point. */
11421 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11422 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11423 offset)));
11424 }
11425 else
11426 {
11427 emit_move_insn (scratch_reg, offset);
11428 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11429 stack_pointer_rtx));
11430 }
11431 current = scratch_reg;
11432 }
11433
11434 ix86_expand_branch (GEU, current, limit, label);
11435 jump_insn = get_last_insn ();
11436 JUMP_LABEL (jump_insn) = label;
11437
11438 /* Mark the jump as very likely to be taken. */
11439 add_reg_note (jump_insn, REG_BR_PROB,
11440 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
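/* REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 is 99% of REG_BR_PROB_BASE,
   i.e. a 99% probability that the branch is taken.  */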
11441
11442 if (split_stack_fn == NULL_RTX)
11443 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11444 fn = split_stack_fn;
11445
11446 /* Get more stack space. We pass in the desired stack space and the
11447 size of the arguments to copy to the new stack. In 32-bit mode
11448 we push the parameters; __morestack will return on a new stack
11449 anyhow. In 64-bit mode we pass the parameters in r10 and
11450 r11. */
11451 allocate_rtx = GEN_INT (allocate);
11452 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11453 call_fusage = NULL_RTX;
11454 if (TARGET_64BIT)
11455 {
11456 rtx reg10, reg11;
11457
11458 reg10 = gen_rtx_REG (Pmode, R10_REG);
11459 reg11 = gen_rtx_REG (Pmode, R11_REG);
11460
11461 /* If this function uses a static chain, it will be in %r10.
11462 Preserve it across the call to __morestack. */
11463 if (DECL_STATIC_CHAIN (cfun->decl))
11464 {
11465 rtx rax;
11466
11467 rax = gen_rtx_REG (word_mode, AX_REG);
11468 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11469 use_reg (&call_fusage, rax);
11470 }
11471
11472 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11473 {
11474 HOST_WIDE_INT argval;
11475
11476 gcc_assert (Pmode == DImode);
11477 /* When using the large model we need to load the address
11478 into a register, and we've run out of registers. So we
11479 switch to a different calling convention, and we call a
11480 different function: __morestack_large_model. We pass the
11481 argument size in the upper 32 bits of r10 and pass the
11482 frame size in the lower 32 bits. */
11483 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11484 gcc_assert ((args_size & 0xffffffff) == args_size);
11485
11486 if (split_stack_fn_large == NULL_RTX)
11487 split_stack_fn_large =
11488 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11489
11490 if (ix86_cmodel == CM_LARGE_PIC)
11491 {
11492 rtx label, x;
11493
11494 label = gen_label_rtx ();
11495 emit_label (label);
11496 LABEL_PRESERVE_P (label) = 1;
11497 emit_insn (gen_set_rip_rex64 (reg10, label));
11498 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11499 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11500 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11501 UNSPEC_GOT);
11502 x = gen_rtx_CONST (Pmode, x);
11503 emit_move_insn (reg11, x);
11504 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11505 x = gen_const_mem (Pmode, x);
11506 emit_move_insn (reg11, x);
11507 }
11508 else
11509 emit_move_insn (reg11, split_stack_fn_large);
11510
11511 fn = reg11;
11512
11513 argval = ((args_size << 16) << 16) + allocate;
11514 emit_move_insn (reg10, GEN_INT (argval));
11515 }
11516 else
11517 {
11518 emit_move_insn (reg10, allocate_rtx);
11519 emit_move_insn (reg11, GEN_INT (args_size));
11520 use_reg (&call_fusage, reg11);
11521 }
11522
11523 use_reg (&call_fusage, reg10);
11524 }
11525 else
11526 {
11527 emit_insn (gen_push (GEN_INT (args_size)));
11528 emit_insn (gen_push (allocate_rtx));
11529 }
11530 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11531 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11532 NULL_RTX, false);
11533 add_function_usage_to (call_insn, call_fusage);
11534
11535 /* In order to make call/return prediction work right, we now need
11536 to execute a return instruction. See
11537 libgcc/config/i386/morestack.S for the details on how this works.
11538
11539 For flow purposes gcc must not see this as a return
11540 instruction--we need control flow to continue at the subsequent
11541 label. Therefore, we use an unspec. */
11542 gcc_assert (crtl->args.pops_args < 65536);
11543 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11544
11545 /* If we are in 64-bit mode and this function uses a static chain,
11546 we saved %r10 in %rax before calling __morestack. */
11547 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11548 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11549 gen_rtx_REG (word_mode, AX_REG));
11550
11551 /* If this function calls va_start, we need to store a pointer to
11552 the arguments on the old stack, because they may not have been
11553 all copied to the new stack. At this point the old stack can be
11554 found at the frame pointer value used by __morestack, because
11555 __morestack has set that up before calling back to us. Here we
11556 store that pointer in a scratch register, and in
11557 ix86_expand_prologue we store the scratch register in a stack
11558 slot. */
11559 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11560 {
11561 unsigned int scratch_regno;
11562 rtx frame_reg;
11563 int words;
11564
11565 scratch_regno = split_stack_prologue_scratch_regno ();
11566 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11567 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11568
11569 /* 64-bit:
11570 fp -> old fp value
11571 return address within this function
11572 return address of caller of this function
11573 stack arguments
11574 So we add three words to get to the stack arguments.
11575
11576 32-bit:
11577 fp -> old fp value
11578 return address within this function
11579 first argument to __morestack
11580 second argument to __morestack
11581 return address of caller of this function
11582 stack arguments
11583 So we add five words to get to the stack arguments.
11584 */
11585 words = TARGET_64BIT ? 3 : 5;
11586 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11587 gen_rtx_PLUS (Pmode, frame_reg,
11588 GEN_INT (words * UNITS_PER_WORD))));
11589
11590 varargs_label = gen_label_rtx ();
11591 emit_jump_insn (gen_jump (varargs_label));
11592 JUMP_LABEL (get_last_insn ()) = varargs_label;
11593
11594 emit_barrier ();
11595 }
11596
11597 emit_label (label);
11598 LABEL_NUSES (label) = 1;
11599
11600 /* If this function calls va_start, we now have to set the scratch
11601 register for the case where we do not call __morestack. In this
11602 case we need to set it based on the stack pointer. */
11603 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11604 {
11605 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11606 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11607 GEN_INT (UNITS_PER_WORD))));
11608
11609 emit_label (varargs_label);
11610 LABEL_NUSES (varargs_label) = 1;
11611 }
11612 }
11613
11614 /* We may have to tell the dataflow pass that the split stack prologue
11615 is initializing a scratch register. */
11616
11617 static void
11618 ix86_live_on_entry (bitmap regs)
11619 {
11620 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11621 {
11622 gcc_assert (flag_split_stack);
11623 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11624 }
11625 }
11626 \f
11627 /* Determine if OP is a suitable SUBREG RTX for an address. */
11628
11629 static bool
11630 ix86_address_subreg_operand (rtx op)
11631 {
11632 enum machine_mode mode;
11633
11634 if (!REG_P (op))
11635 return false;
11636
11637 mode = GET_MODE (op);
11638
11639 if (GET_MODE_CLASS (mode) != MODE_INT)
11640 return false;
11641
11642 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11643 failures when the register is one word out of a two word structure. */
11644 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11645 return false;
11646
11647 /* Allow only SUBREGs of non-eliminable hard registers. */
11648 return register_no_elim_operand (op, mode);
11649 }
11650
11651 /* Extract the parts of an RTL expression that is a valid memory address
11652 for an instruction. Return 0 if the structure of the address is
11653 grossly off. Return -1 if the address contains ASHIFT, so it is not
11654 strictly valid, but still used for computing the length of the lea instruction. */
11655
11656 int
11657 ix86_decompose_address (rtx addr, struct ix86_address *out)
11658 {
11659 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11660 rtx base_reg, index_reg;
11661 HOST_WIDE_INT scale = 1;
11662 rtx scale_rtx = NULL_RTX;
11663 rtx tmp;
11664 int retval = 1;
11665 enum ix86_address_seg seg = SEG_DEFAULT;
11666
11667 /* Allow zero-extended SImode addresses;
11668 they will be emitted with the addr32 prefix. */
11669 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11670 {
11671 if (GET_CODE (addr) == ZERO_EXTEND
11672 && GET_MODE (XEXP (addr, 0)) == SImode)
11673 {
11674 addr = XEXP (addr, 0);
11675 if (CONST_INT_P (addr))
11676 return 0;
11677 }
11678 else if (GET_CODE (addr) == AND
11679 && const_32bit_mask (XEXP (addr, 1), DImode))
11680 {
11681 addr = XEXP (addr, 0);
11682
11683 /* Adjust SUBREGs. */
11684 if (GET_CODE (addr) == SUBREG
11685 && GET_MODE (SUBREG_REG (addr)) == SImode)
11686 {
11687 addr = SUBREG_REG (addr);
11688 if (CONST_INT_P (addr))
11689 return 0;
11690 }
11691 else if (GET_MODE (addr) == DImode)
11692 addr = gen_rtx_SUBREG (SImode, addr, 0);
11693 else if (GET_MODE (addr) != VOIDmode)
11694 return 0;
11695 }
11696 }
11697
11698 /* Allow SImode subregs of DImode addresses;
11699 they will be emitted with the addr32 prefix. */
11700 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11701 {
11702 if (GET_CODE (addr) == SUBREG
11703 && GET_MODE (SUBREG_REG (addr)) == DImode)
11704 {
11705 addr = SUBREG_REG (addr);
11706 if (CONST_INT_P (addr))
11707 return 0;
11708 }
11709 }
11710
11711 if (REG_P (addr))
11712 base = addr;
11713 else if (GET_CODE (addr) == SUBREG)
11714 {
11715 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11716 base = addr;
11717 else
11718 return 0;
11719 }
11720 else if (GET_CODE (addr) == PLUS)
11721 {
11722 rtx addends[4], op;
11723 int n = 0, i;
11724
11725 op = addr;
11726 do
11727 {
11728 if (n >= 4)
11729 return 0;
11730 addends[n++] = XEXP (op, 1);
11731 op = XEXP (op, 0);
11732 }
11733 while (GET_CODE (op) == PLUS);
11734 if (n >= 4)
11735 return 0;
11736 addends[n] = op;
11737
11738 for (i = n; i >= 0; --i)
11739 {
11740 op = addends[i];
11741 switch (GET_CODE (op))
11742 {
11743 case MULT:
11744 if (index)
11745 return 0;
11746 index = XEXP (op, 0);
11747 scale_rtx = XEXP (op, 1);
11748 break;
11749
11750 case ASHIFT:
11751 if (index)
11752 return 0;
11753 index = XEXP (op, 0);
11754 tmp = XEXP (op, 1);
11755 if (!CONST_INT_P (tmp))
11756 return 0;
11757 scale = INTVAL (tmp);
11758 if ((unsigned HOST_WIDE_INT) scale > 3)
11759 return 0;
11760 scale = 1 << scale;
11761 break;
11762
11763 case ZERO_EXTEND:
11764 op = XEXP (op, 0);
11765 if (GET_CODE (op) != UNSPEC)
11766 return 0;
11767 /* FALLTHRU */
11768
11769 case UNSPEC:
11770 if (XINT (op, 1) == UNSPEC_TP
11771 && TARGET_TLS_DIRECT_SEG_REFS
11772 && seg == SEG_DEFAULT)
11773 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11774 else
11775 return 0;
11776 break;
11777
11778 case SUBREG:
11779 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11780 return 0;
11781 /* FALLTHRU */
11782
11783 case REG:
11784 if (!base)
11785 base = op;
11786 else if (!index)
11787 index = op;
11788 else
11789 return 0;
11790 break;
11791
11792 case CONST:
11793 case CONST_INT:
11794 case SYMBOL_REF:
11795 case LABEL_REF:
11796 if (disp)
11797 return 0;
11798 disp = op;
11799 break;
11800
11801 default:
11802 return 0;
11803 }
11804 }
11805 }
11806 else if (GET_CODE (addr) == MULT)
11807 {
11808 index = XEXP (addr, 0); /* index*scale */
11809 scale_rtx = XEXP (addr, 1);
11810 }
11811 else if (GET_CODE (addr) == ASHIFT)
11812 {
11813 /* We're called for lea too, which implements ashift on occasion. */
11814 index = XEXP (addr, 0);
11815 tmp = XEXP (addr, 1);
11816 if (!CONST_INT_P (tmp))
11817 return 0;
11818 scale = INTVAL (tmp);
11819 if ((unsigned HOST_WIDE_INT) scale > 3)
11820 return 0;
11821 scale = 1 << scale;
11822 retval = -1;
11823 }
11824 else if (CONST_INT_P (addr))
11825 {
11826 if (!x86_64_immediate_operand (addr, VOIDmode))
11827 return 0;
11828
11829 /* Constant addresses are sign extended to 64bit; we have to
11830 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11831 if (TARGET_X32
11832 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11833 return 0;
11834
11835 disp = addr;
11836 }
11837 else
11838 disp = addr; /* displacement */
11839
11840 if (index)
11841 {
11842 if (REG_P (index))
11843 ;
11844 else if (GET_CODE (index) == SUBREG
11845 && ix86_address_subreg_operand (SUBREG_REG (index)))
11846 ;
11847 else
11848 return 0;
11849 }
11850
11851 /* Address override works only on the (%reg) part of %fs:(%reg). */
11852 if (seg != SEG_DEFAULT
11853 && ((base && GET_MODE (base) != word_mode)
11854 || (index && GET_MODE (index) != word_mode)))
11855 return 0;
11856
11857 /* Extract the integral value of scale. */
11858 if (scale_rtx)
11859 {
11860 if (!CONST_INT_P (scale_rtx))
11861 return 0;
11862 scale = INTVAL (scale_rtx);
11863 }
11864
11865 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11866 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11867
11868 /* Avoid useless 0 displacement. */
11869 if (disp == const0_rtx && (base || index))
11870 disp = NULL_RTX;
11871
11872 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11873 if (base_reg && index_reg && scale == 1
11874 && (index_reg == arg_pointer_rtx
11875 || index_reg == frame_pointer_rtx
11876 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11877 {
11878 rtx tmp;
11879 tmp = base, base = index, index = tmp;
11880 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11881 }
11882
11883 /* Special case: %ebp cannot be encoded as a base without a displacement.
11884 Similarly %r13. */
11885 if (!disp
11886 && base_reg
11887 && (base_reg == hard_frame_pointer_rtx
11888 || base_reg == frame_pointer_rtx
11889 || base_reg == arg_pointer_rtx
11890 || (REG_P (base_reg)
11891 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11892 || REGNO (base_reg) == R13_REG))))
11893 disp = const0_rtx;
11894
11895 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11896 Avoid this by transforming to [%esi+0].
11897 Reload calls address legitimization without cfun defined, so we need
11898 to test cfun for being non-NULL. */
11899 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11900 && base_reg && !index_reg && !disp
11901 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11902 disp = const0_rtx;
11903
11904 /* Special case: encode reg+reg instead of reg*2. */
11905 if (!base && index && scale == 2)
11906 base = index, base_reg = index_reg, scale = 1;
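/* E.g. an index-only address such as (,%eax,2) would otherwise need a
   32-bit zero displacement; rewriting it as (%eax,%eax) avoids that.  */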
11907
11908 /* Special case: scaling cannot be encoded without base or displacement. */
11909 if (!base && !disp && index && scale != 1)
11910 disp = const0_rtx;
11911
11912 out->base = base;
11913 out->index = index;
11914 out->disp = disp;
11915 out->scale = scale;
11916 out->seg = seg;
11917
11918 return retval;
11919 }
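/* For illustration: the address
   (plus (plus (mult (reg ecx) (const_int 4)) (reg ebx)) (const_int 12))
   decomposes into base = %ebx, index = %ecx, scale = 4, disp = 12,
   i.e. the operand 12(%ebx,%ecx,4) in AT&T syntax.  */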
11920 \f
11921 /* Return cost of the memory address x.
11922 For i386, it is better to use a complex address than let gcc copy
11923 the address into a reg and make a new pseudo. But not if the address
11924 requires two regs - that would mean more pseudos with longer
11925 lifetimes. */
11926 static int
11927 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11928 {
11929 struct ix86_address parts;
11930 int cost = 1;
11931 int ok = ix86_decompose_address (x, &parts);
11932
11933 gcc_assert (ok);
11934
11935 if (parts.base && GET_CODE (parts.base) == SUBREG)
11936 parts.base = SUBREG_REG (parts.base);
11937 if (parts.index && GET_CODE (parts.index) == SUBREG)
11938 parts.index = SUBREG_REG (parts.index);
11939
11940 /* Attempt to minimize number of registers in the address. */
11941 if ((parts.base
11942 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11943 || (parts.index
11944 && (!REG_P (parts.index)
11945 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11946 cost++;
11947
11948 if (parts.base
11949 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11950 && parts.index
11951 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11952 && parts.base != parts.index)
11953 cost++;
11954
11955 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11956 since its predecode logic can't detect the length of instructions
11957 and it degenerates to vector decoding. Increase the cost of such
11958 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11959 to split such addresses or even refuse such addresses at all.
11960
11961 Following addressing modes are affected:
11962 [base+scale*index]
11963 [scale*index+disp]
11964 [base+index]
11965
11966 The first and last case may be avoidable by explicitly coding the zero in
11967 the memory address, but I don't have an AMD-K6 machine handy to check this
11968 theory. */
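/* For illustration: [%ebx+%ecx*2] with no displacement encodes as ModR/M
   00_xxx_100b plus a SIB byte; writing it as 0(%ebx,%ecx,2) forces a
   one-byte displacement and ModR/M 01_xxx_100b, which presumably
   sidesteps the predecode issue described above.  */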
11969
11970 if (TARGET_K6
11971 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11972 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11973 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11974 cost += 10;
11975
11976 return cost;
11977 }
11978 \f
11979 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11980 this is used to form addresses to local data when -fPIC is in
11981 use. */
11982
11983 static bool
11984 darwin_local_data_pic (rtx disp)
11985 {
11986 return (GET_CODE (disp) == UNSPEC
11987 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11988 }
11989
11990 /* Determine if a given RTX is a valid constant. We already know this
11991 satisfies CONSTANT_P. */
11992
11993 static bool
11994 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11995 {
11996 switch (GET_CODE (x))
11997 {
11998 case CONST:
11999 x = XEXP (x, 0);
12000
12001 if (GET_CODE (x) == PLUS)
12002 {
12003 if (!CONST_INT_P (XEXP (x, 1)))
12004 return false;
12005 x = XEXP (x, 0);
12006 }
12007
12008 if (TARGET_MACHO && darwin_local_data_pic (x))
12009 return true;
12010
12011 /* Only some unspecs are valid as "constants". */
12012 if (GET_CODE (x) == UNSPEC)
12013 switch (XINT (x, 1))
12014 {
12015 case UNSPEC_GOT:
12016 case UNSPEC_GOTOFF:
12017 case UNSPEC_PLTOFF:
12018 return TARGET_64BIT;
12019 case UNSPEC_TPOFF:
12020 case UNSPEC_NTPOFF:
12021 x = XVECEXP (x, 0, 0);
12022 return (GET_CODE (x) == SYMBOL_REF
12023 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12024 case UNSPEC_DTPOFF:
12025 x = XVECEXP (x, 0, 0);
12026 return (GET_CODE (x) == SYMBOL_REF
12027 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12028 default:
12029 return false;
12030 }
12031
12032 /* We must have drilled down to a symbol. */
12033 if (GET_CODE (x) == LABEL_REF)
12034 return true;
12035 if (GET_CODE (x) != SYMBOL_REF)
12036 return false;
12037 /* FALLTHRU */
12038
12039 case SYMBOL_REF:
12040 /* TLS symbols are never valid. */
12041 if (SYMBOL_REF_TLS_MODEL (x))
12042 return false;
12043
12044 /* DLLIMPORT symbols are never valid. */
12045 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12046 && SYMBOL_REF_DLLIMPORT_P (x))
12047 return false;
12048
12049 #if TARGET_MACHO
12050 /* mdynamic-no-pic */
12051 if (MACHO_DYNAMIC_NO_PIC_P)
12052 return machopic_symbol_defined_p (x);
12053 #endif
12054 break;
12055
12056 case CONST_DOUBLE:
12057 if (GET_MODE (x) == TImode
12058 && x != CONST0_RTX (TImode)
12059 && !TARGET_64BIT)
12060 return false;
12061 break;
12062
12063 case CONST_VECTOR:
12064 if (!standard_sse_constant_p (x))
12065 return false;
12066
12067 default:
12068 break;
12069 }
12070
12071 /* Otherwise we handle everything else in the move patterns. */
12072 return true;
12073 }
12074
12075 /* Determine if it's legal to put X into the constant pool. This
12076 is not possible for the address of thread-local symbols, which
12077 is checked above. */
12078
12079 static bool
12080 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12081 {
12082 /* We can always put integral constants and vectors in memory. */
12083 switch (GET_CODE (x))
12084 {
12085 case CONST_INT:
12086 case CONST_DOUBLE:
12087 case CONST_VECTOR:
12088 return false;
12089
12090 default:
12091 break;
12092 }
12093 return !ix86_legitimate_constant_p (mode, x);
12094 }
12095
12096
12097 /* Nonzero if the constant value X is a legitimate general operand
12098 when generating PIC code. It is given that flag_pic is on and
12099 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12100
12101 bool
12102 legitimate_pic_operand_p (rtx x)
12103 {
12104 rtx inner;
12105
12106 switch (GET_CODE (x))
12107 {
12108 case CONST:
12109 inner = XEXP (x, 0);
12110 if (GET_CODE (inner) == PLUS
12111 && CONST_INT_P (XEXP (inner, 1)))
12112 inner = XEXP (inner, 0);
12113
12114 /* Only some unspecs are valid as "constants". */
12115 if (GET_CODE (inner) == UNSPEC)
12116 switch (XINT (inner, 1))
12117 {
12118 case UNSPEC_GOT:
12119 case UNSPEC_GOTOFF:
12120 case UNSPEC_PLTOFF:
12121 return TARGET_64BIT;
12122 case UNSPEC_TPOFF:
12123 x = XVECEXP (inner, 0, 0);
12124 return (GET_CODE (x) == SYMBOL_REF
12125 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12126 case UNSPEC_MACHOPIC_OFFSET:
12127 return legitimate_pic_address_disp_p (x);
12128 default:
12129 return false;
12130 }
12131 /* FALLTHRU */
12132
12133 case SYMBOL_REF:
12134 case LABEL_REF:
12135 return legitimate_pic_address_disp_p (x);
12136
12137 default:
12138 return true;
12139 }
12140 }
12141
12142 /* Determine if a given CONST RTX is a valid memory displacement
12143 in PIC mode. */
12144
12145 bool
12146 legitimate_pic_address_disp_p (rtx disp)
12147 {
12148 bool saw_plus;
12149
12150 /* In 64bit mode we can allow direct addresses of symbols and labels
12151 when they are not dynamic symbols. */
12152 if (TARGET_64BIT)
12153 {
12154 rtx op0 = disp, op1;
12155
12156 switch (GET_CODE (disp))
12157 {
12158 case LABEL_REF:
12159 return true;
12160
12161 case CONST:
12162 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12163 break;
12164 op0 = XEXP (XEXP (disp, 0), 0);
12165 op1 = XEXP (XEXP (disp, 0), 1);
12166 if (!CONST_INT_P (op1)
12167 || INTVAL (op1) >= 16*1024*1024
12168 || INTVAL (op1) < -16*1024*1024)
12169 break;
12170 if (GET_CODE (op0) == LABEL_REF)
12171 return true;
12172 if (GET_CODE (op0) == CONST
12173 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12174 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12175 return true;
12176 if (GET_CODE (op0) == UNSPEC
12177 && XINT (op0, 1) == UNSPEC_PCREL)
12178 return true;
12179 if (GET_CODE (op0) != SYMBOL_REF)
12180 break;
12181 /* FALLTHRU */
12182
12183 case SYMBOL_REF:
12184 /* TLS references should always be enclosed in UNSPEC. */
12185 if (SYMBOL_REF_TLS_MODEL (op0))
12186 return false;
12187 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12188 && ix86_cmodel != CM_LARGE_PIC)
12189 return true;
12190 break;
12191
12192 default:
12193 break;
12194 }
12195 }
12196 if (GET_CODE (disp) != CONST)
12197 return false;
12198 disp = XEXP (disp, 0);
12199
12200 if (TARGET_64BIT)
12201 {
12202 /* It is not safe to allow PLUS expressions here; doing so would limit
12203 the allowed distance of GOT tables. We should not need these anyway. */
12204 if (GET_CODE (disp) != UNSPEC
12205 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12206 && XINT (disp, 1) != UNSPEC_GOTOFF
12207 && XINT (disp, 1) != UNSPEC_PCREL
12208 && XINT (disp, 1) != UNSPEC_PLTOFF))
12209 return false;
12210
12211 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12212 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12213 return false;
12214 return true;
12215 }
12216
12217 saw_plus = false;
12218 if (GET_CODE (disp) == PLUS)
12219 {
12220 if (!CONST_INT_P (XEXP (disp, 1)))
12221 return false;
12222 disp = XEXP (disp, 0);
12223 saw_plus = true;
12224 }
12225
12226 if (TARGET_MACHO && darwin_local_data_pic (disp))
12227 return true;
12228
12229 if (GET_CODE (disp) != UNSPEC)
12230 return false;
12231
12232 switch (XINT (disp, 1))
12233 {
12234 case UNSPEC_GOT:
12235 if (saw_plus)
12236 return false;
12237 /* We need to check for both symbols and labels because VxWorks loads
12238 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12239 details. */
12240 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12241 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12242 case UNSPEC_GOTOFF:
12243 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12244 While the ABI also specifies a 32bit relocation, we don't produce it
12245 in the small PIC model at all. */
12246 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12247 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12248 && !TARGET_64BIT)
12249 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12250 return false;
12251 case UNSPEC_GOTTPOFF:
12252 case UNSPEC_GOTNTPOFF:
12253 case UNSPEC_INDNTPOFF:
12254 if (saw_plus)
12255 return false;
12256 disp = XVECEXP (disp, 0, 0);
12257 return (GET_CODE (disp) == SYMBOL_REF
12258 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12259 case UNSPEC_NTPOFF:
12260 disp = XVECEXP (disp, 0, 0);
12261 return (GET_CODE (disp) == SYMBOL_REF
12262 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12263 case UNSPEC_DTPOFF:
12264 disp = XVECEXP (disp, 0, 0);
12265 return (GET_CODE (disp) == SYMBOL_REF
12266 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12267 }
12268
12269 return false;
12270 }
12271
12272 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12273 replace the input X, or the original X if no replacement is called for.
12274 The output parameter *WIN is 1 if the calling macro should goto WIN,
12275 0 if it should not. */
12276
12277 bool
12278 ix86_legitimize_reload_address (rtx x,
12279 enum machine_mode mode ATTRIBUTE_UNUSED,
12280 int opnum, int type,
12281 int ind_levels ATTRIBUTE_UNUSED)
12282 {
12283 /* Reload can generate:
12284
12285 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12286 (reg:DI 97))
12287 (reg:DI 2 cx))
12288
12289 This RTX is rejected by ix86_legitimate_address_p because base
12290 register 97 fails the strict register check. Following this
12291 rejection, reload pushes all three components into separate
12292 registers, creating an invalid memory address RTX.
12293
12294 The code below reloads only the invalid part of the
12295 memory address RTX. */
12296
12297 if (GET_CODE (x) == PLUS
12298 && REG_P (XEXP (x, 1))
12299 && GET_CODE (XEXP (x, 0)) == PLUS
12300 && REG_P (XEXP (XEXP (x, 0), 1)))
12301 {
12302 rtx base, index;
12303 bool something_reloaded = false;
12304
12305 base = XEXP (XEXP (x, 0), 1);
12306 if (!REG_OK_FOR_BASE_STRICT_P (base))
12307 {
12308 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12309 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12310 opnum, (enum reload_type) type);
12311 something_reloaded = true;
12312 }
12313
12314 index = XEXP (x, 1);
12315 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12316 {
12317 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12318 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12319 opnum, (enum reload_type) type);
12320 something_reloaded = true;
12321 }
12322
12323 gcc_assert (something_reloaded);
12324 return true;
12325 }
12326
12327 return false;
12328 }
12329
12330 /* Recognizes RTL expressions that are valid memory addresses for an
12331 instruction. The MODE argument is the machine mode for the MEM
12332 expression that wants to use this address.
12333
12334 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS
12335 should convert common non-canonical forms to canonical form so that
12336 they will be recognized. */
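/* For example, the canonical form base + index*scale + disp corresponds
   to an RTX such as
     (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 8))
   which ix86_decompose_address splits into a base register, an index
   register, scale 4 and displacement 8 before the checks below run.  */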
12337
12338 static bool
12339 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12340 rtx addr, bool strict)
12341 {
12342 struct ix86_address parts;
12343 rtx base, index, disp;
12344 HOST_WIDE_INT scale;
12345
12346 if (ix86_decompose_address (addr, &parts) <= 0)
12347 /* Decomposition failed. */
12348 return false;
12349
12350 base = parts.base;
12351 index = parts.index;
12352 disp = parts.disp;
12353 scale = parts.scale;
12354
12355 /* Validate base register. */
12356 if (base)
12357 {
12358 rtx reg;
12359
12360 if (REG_P (base))
12361 reg = base;
12362 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12363 reg = SUBREG_REG (base);
12364 else
12365 /* Base is not a register. */
12366 return false;
12367
12368 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12369 return false;
12370
12371 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12372 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12373 /* Base is not valid. */
12374 return false;
12375 }
12376
12377 /* Validate index register. */
12378 if (index)
12379 {
12380 rtx reg;
12381
12382 if (REG_P (index))
12383 reg = index;
12384 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12385 reg = SUBREG_REG (index);
12386 else
12387 /* Index is not a register. */
12388 return false;
12389
12390 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12391 return false;
12392
12393 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12394 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12395 /* Index is not valid. */
12396 return false;
12397 }
12398
12399 /* Index and base should have the same mode. */
12400 if (base && index
12401 && GET_MODE (base) != GET_MODE (index))
12402 return false;
12403
12404 /* Validate scale factor. */
12405 if (scale != 1)
12406 {
12407 if (!index)
12408 /* Scale without index. */
12409 return false;
12410
12411 if (scale != 2 && scale != 4 && scale != 8)
12412 /* Scale is not a valid multiplier. */
12413 return false;
12414 }
12415
12416 /* Validate displacement. */
12417 if (disp)
12418 {
12419 if (GET_CODE (disp) == CONST
12420 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12421 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12422 switch (XINT (XEXP (disp, 0), 1))
12423 {
12424 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12425 used. While the ABI also specifies 32bit relocations, we don't produce
12426 them at all and use IP-relative addressing instead. */
12427 case UNSPEC_GOT:
12428 case UNSPEC_GOTOFF:
12429 gcc_assert (flag_pic);
12430 if (!TARGET_64BIT)
12431 goto is_legitimate_pic;
12432
12433 /* 64bit address unspec. */
12434 return false;
12435
12436 case UNSPEC_GOTPCREL:
12437 case UNSPEC_PCREL:
12438 gcc_assert (flag_pic);
12439 goto is_legitimate_pic;
12440
12441 case UNSPEC_GOTTPOFF:
12442 case UNSPEC_GOTNTPOFF:
12443 case UNSPEC_INDNTPOFF:
12444 case UNSPEC_NTPOFF:
12445 case UNSPEC_DTPOFF:
12446 break;
12447
12448 case UNSPEC_STACK_CHECK:
12449 gcc_assert (flag_split_stack);
12450 break;
12451
12452 default:
12453 /* Invalid address unspec. */
12454 return false;
12455 }
12456
12457 else if (SYMBOLIC_CONST (disp)
12458 && (flag_pic
12459 || (TARGET_MACHO
12460 #if TARGET_MACHO
12461 && MACHOPIC_INDIRECT
12462 && !machopic_operand_p (disp)
12463 #endif
12464 )))
12465 {
12466
12467 is_legitimate_pic:
12468 if (TARGET_64BIT && (index || base))
12469 {
12470 /* foo@dtpoff(%rX) is ok. */
12471 if (GET_CODE (disp) != CONST
12472 || GET_CODE (XEXP (disp, 0)) != PLUS
12473 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12474 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12475 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12476 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12477 /* Non-constant pic memory reference. */
12478 return false;
12479 }
12480 else if ((!TARGET_MACHO || flag_pic)
12481 && ! legitimate_pic_address_disp_p (disp))
12482 /* Displacement is an invalid pic construct. */
12483 return false;
12484 #if TARGET_MACHO
12485 else if (MACHO_DYNAMIC_NO_PIC_P
12486 && !ix86_legitimate_constant_p (Pmode, disp))
12487 /* The displacement must be referenced via a non_lazy_pointer. */
12488 return false;
12489 #endif
12490
12491 /* This code used to verify that a symbolic pic displacement
12492 includes the pic_offset_table_rtx register.
12493
12494 While this is a good idea, unfortunately these constructs may
12495 be created by the "adds using lea" optimization for incorrect
12496 code like:
12497
12498 int a;
12499 int foo(int i)
12500 {
12501 return *(&a+i);
12502 }
12503
12504 This code is nonsensical, but it results in addressing
12505 the GOT table with pic_offset_table_rtx as the base. We
12506 can't easily refuse it, since it gets matched by the
12507 "addsi3" pattern, which is later split into an lea when
12508 the output register differs from the input. While this
12509 could be handled by a separate addsi pattern for this case
12510 that never results in an lea, disabling this test seems to
12511 be the easier and correct fix for the crash. */
12512 }
12513 else if (GET_CODE (disp) != LABEL_REF
12514 && !CONST_INT_P (disp)
12515 && (GET_CODE (disp) != CONST
12516 || !ix86_legitimate_constant_p (Pmode, disp))
12517 && (GET_CODE (disp) != SYMBOL_REF
12518 || !ix86_legitimate_constant_p (Pmode, disp)))
12519 /* Displacement is not constant. */
12520 return false;
12521 else if (TARGET_64BIT
12522 && !x86_64_immediate_operand (disp, VOIDmode))
12523 /* Displacement is out of range. */
12524 return false;
12525 }
12526
12527 /* Everything looks valid. */
12528 return true;
12529 }
12530
12531 /* Determine if a given RTX is a valid constant address. */
12532
12533 bool
12534 constant_address_p (rtx x)
12535 {
12536 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12537 }
12538 \f
12539 /* Return a unique alias set for the GOT. */
12540
12541 static alias_set_type
12542 ix86_GOT_alias_set (void)
12543 {
12544 static alias_set_type set = -1;
12545 if (set == -1)
12546 set = new_alias_set ();
12547 return set;
12548 }
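/* A note on the dedicated alias set: GOT entries are not accessible as
   ordinary user objects, so giving GOT loads their own alias set
   presumably lets the alias machinery treat them as disjoint from
   normal memory references.  */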
12549
12550 /* Return a legitimate reference for ORIG (an address) using the
12551 register REG. If REG is 0, a new pseudo is generated.
12552
12553 There are two types of references that must be handled:
12554
12555 1. Global data references must load the address from the GOT, via
12556 the PIC reg. An insn is emitted to do this load, and the reg is
12557 returned.
12558
12559 2. Static data references, constant pool addresses, and code labels
12560 compute the address as an offset from the GOT, whose base is in
12561 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12562 differentiate them from global data objects. The returned
12563 address is the PIC reg + an unspec constant.
12564
12565 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12566 reg also appears in the address. */
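/* As a rough illustration of the two cases in the 32-bit, non-Darwin
   code below: a global symbol FOO becomes a load through the GOT,

     (mem (plus pic_offset_table_rtx
                (const (unspec [FOO] UNSPEC_GOT))))

   while a local symbol or label becomes a GOT-relative offset,

     (plus pic_offset_table_rtx
           (const (unspec [FOO] UNSPEC_GOTOFF)))

   either of which may then be copied into REG when one is supplied.  */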
12567
12568 static rtx
12569 legitimize_pic_address (rtx orig, rtx reg)
12570 {
12571 rtx addr = orig;
12572 rtx new_rtx = orig;
12573 rtx base;
12574
12575 #if TARGET_MACHO
12576 if (TARGET_MACHO && !TARGET_64BIT)
12577 {
12578 if (reg == 0)
12579 reg = gen_reg_rtx (Pmode);
12580 /* Use the generic Mach-O PIC machinery. */
12581 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12582 }
12583 #endif
12584
12585 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12586 new_rtx = addr;
12587 else if (TARGET_64BIT
12588 && ix86_cmodel != CM_SMALL_PIC
12589 && gotoff_operand (addr, Pmode))
12590 {
12591 rtx tmpreg;
12592 /* This symbol may be referenced via a displacement from the PIC
12593 base address (@GOTOFF). */
12594
12595 if (reload_in_progress)
12596 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12597 if (GET_CODE (addr) == CONST)
12598 addr = XEXP (addr, 0);
12599 if (GET_CODE (addr) == PLUS)
12600 {
12601 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12602 UNSPEC_GOTOFF);
12603 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12604 }
12605 else
12606 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12607 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12608 if (!reg)
12609 tmpreg = gen_reg_rtx (Pmode);
12610 else
12611 tmpreg = reg;
12612 emit_move_insn (tmpreg, new_rtx);
12613
12614 if (reg != 0)
12615 {
12616 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12617 tmpreg, 1, OPTAB_DIRECT);
12618 new_rtx = reg;
12619 }
12620 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12621 }
12622 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12623 {
12624 /* This symbol may be referenced via a displacement from the PIC
12625 base address (@GOTOFF). */
12626
12627 if (reload_in_progress)
12628 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12629 if (GET_CODE (addr) == CONST)
12630 addr = XEXP (addr, 0);
12631 if (GET_CODE (addr) == PLUS)
12632 {
12633 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12634 UNSPEC_GOTOFF);
12635 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12636 }
12637 else
12638 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12639 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12640 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12641
12642 if (reg != 0)
12643 {
12644 emit_move_insn (reg, new_rtx);
12645 new_rtx = reg;
12646 }
12647 }
12648 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12649 /* We can't use @GOTOFF for text labels on VxWorks;
12650 see gotoff_operand. */
12651 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12652 {
12653 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12654 {
12655 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12656 return legitimize_dllimport_symbol (addr, true);
12657 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12658 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12659 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12660 {
12661 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12662 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12663 }
12664 }
12665
12666 /* For x64 PE-COFF there is no GOT table, so we use the address
12667 directly. */
12668 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12669 {
12670 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12671 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12672
12673 if (reg == 0)
12674 reg = gen_reg_rtx (Pmode);
12675 emit_move_insn (reg, new_rtx);
12676 new_rtx = reg;
12677 }
12678 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12679 {
12680 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12681 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12682 new_rtx = gen_const_mem (Pmode, new_rtx);
12683 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12684
12685 if (reg == 0)
12686 reg = gen_reg_rtx (Pmode);
12687 /* Use gen_movsi directly; otherwise the address is loaded
12688 into a register for CSE. We don't want to CSE these addresses;
12689 instead we CSE addresses from the GOT table, so skip this. */
12690 emit_insn (gen_movsi (reg, new_rtx));
12691 new_rtx = reg;
12692 }
12693 else
12694 {
12695 /* This symbol must be referenced via a load from the
12696 Global Offset Table (@GOT). */
12697
12698 if (reload_in_progress)
12699 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12700 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12701 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12702 if (TARGET_64BIT)
12703 new_rtx = force_reg (Pmode, new_rtx);
12704 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12705 new_rtx = gen_const_mem (Pmode, new_rtx);
12706 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12707
12708 if (reg == 0)
12709 reg = gen_reg_rtx (Pmode);
12710 emit_move_insn (reg, new_rtx);
12711 new_rtx = reg;
12712 }
12713 }
12714 else
12715 {
12716 if (CONST_INT_P (addr)
12717 && !x86_64_immediate_operand (addr, VOIDmode))
12718 {
12719 if (reg)
12720 {
12721 emit_move_insn (reg, addr);
12722 new_rtx = reg;
12723 }
12724 else
12725 new_rtx = force_reg (Pmode, addr);
12726 }
12727 else if (GET_CODE (addr) == CONST)
12728 {
12729 addr = XEXP (addr, 0);
12730
12731 /* We must match the constructs we generated earlier. Assume the
12732 only unspecs that can get here are ours. Not that we could do
12733 anything with them anyway... */
12734 if (GET_CODE (addr) == UNSPEC
12735 || (GET_CODE (addr) == PLUS
12736 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12737 return orig;
12738 gcc_assert (GET_CODE (addr) == PLUS);
12739 }
12740 if (GET_CODE (addr) == PLUS)
12741 {
12742 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12743
12744 /* Check first to see if this is a constant offset from a @GOTOFF
12745 symbol reference. */
12746 if (gotoff_operand (op0, Pmode)
12747 && CONST_INT_P (op1))
12748 {
12749 if (!TARGET_64BIT)
12750 {
12751 if (reload_in_progress)
12752 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12753 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12754 UNSPEC_GOTOFF);
12755 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12756 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12757 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12758
12759 if (reg != 0)
12760 {
12761 emit_move_insn (reg, new_rtx);
12762 new_rtx = reg;
12763 }
12764 }
12765 else
12766 {
12767 if (INTVAL (op1) < -16*1024*1024
12768 || INTVAL (op1) >= 16*1024*1024)
12769 {
12770 if (!x86_64_immediate_operand (op1, Pmode))
12771 op1 = force_reg (Pmode, op1);
12772 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12773 }
12774 }
12775 }
12776 else
12777 {
12778 base = legitimize_pic_address (XEXP (addr, 0), reg);
12779 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12780 base == reg ? NULL_RTX : reg);
12781
12782 if (CONST_INT_P (new_rtx))
12783 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12784 else
12785 {
12786 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12787 {
12788 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12789 new_rtx = XEXP (new_rtx, 1);
12790 }
12791 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12792 }
12793 }
12794 }
12795 }
12796 return new_rtx;
12797 }
12798 \f
12799 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12800
12801 static rtx
12802 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12803 {
12804 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12805
12806 if (GET_MODE (tp) != tp_mode)
12807 {
12808 gcc_assert (GET_MODE (tp) == SImode);
12809 gcc_assert (tp_mode == DImode);
12810
12811 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12812 }
12813
12814 if (to_reg)
12815 tp = copy_to_mode_reg (tp_mode, tp);
12816
12817 return tp;
12818 }
12819
12820 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12821
12822 static GTY(()) rtx ix86_tls_symbol;
12823
12824 static rtx
12825 ix86_tls_get_addr (void)
12826 {
12827 if (!ix86_tls_symbol)
12828 {
12829 const char *sym
12830 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12831 ? "___tls_get_addr" : "__tls_get_addr");
12832
12833 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12834 }
12835
12836 return ix86_tls_symbol;
12837 }
12838
12839 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12840
12841 static GTY(()) rtx ix86_tls_module_base_symbol;
12842
12843 rtx
12844 ix86_tls_module_base (void)
12845 {
12846 if (!ix86_tls_module_base_symbol)
12847 {
12848 ix86_tls_module_base_symbol
12849 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12850
12851 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12852 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12853 }
12854
12855 return ix86_tls_module_base_symbol;
12856 }
12857
12858 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12859 false if we expect this to be used for a memory address and true if
12860 we expect to load the address into a register. */
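/* As a rough sketch of the simplest case handled below: for
   TLS_MODEL_LOCAL_EXEC with TARGET_64BIT or TARGET_ANY_GNU_TLS the
   result is

     (plus (unspec [(const_int 0)] UNSPEC_TP)
           (const (unspec [x] UNSPEC_NTPOFF)))

   i.e. the thread pointer plus the symbol's TP-relative offset, which
   later machinery can render as a %fs:/%gs: relative reference.  */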
12861
12862 static rtx
12863 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12864 {
12865 rtx dest, base, off;
12866 rtx pic = NULL_RTX, tp = NULL_RTX;
12867 enum machine_mode tp_mode = Pmode;
12868 int type;
12869
12870 switch (model)
12871 {
12872 case TLS_MODEL_GLOBAL_DYNAMIC:
12873 dest = gen_reg_rtx (Pmode);
12874
12875 if (!TARGET_64BIT)
12876 {
12877 if (flag_pic)
12878 pic = pic_offset_table_rtx;
12879 else
12880 {
12881 pic = gen_reg_rtx (Pmode);
12882 emit_insn (gen_set_got (pic));
12883 }
12884 }
12885
12886 if (TARGET_GNU2_TLS)
12887 {
12888 if (TARGET_64BIT)
12889 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12890 else
12891 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12892
12893 tp = get_thread_pointer (Pmode, true);
12894 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12895
12896 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12897 }
12898 else
12899 {
12900 rtx caddr = ix86_tls_get_addr ();
12901
12902 if (TARGET_64BIT)
12903 {
12904 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12905
12906 start_sequence ();
12907 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12908 caddr));
12909 insns = get_insns ();
12910 end_sequence ();
12911
12912 RTL_CONST_CALL_P (insns) = 1;
12913 emit_libcall_block (insns, dest, rax, x);
12914 }
12915 else
12916 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12917 }
12918 break;
12919
12920 case TLS_MODEL_LOCAL_DYNAMIC:
12921 base = gen_reg_rtx (Pmode);
12922
12923 if (!TARGET_64BIT)
12924 {
12925 if (flag_pic)
12926 pic = pic_offset_table_rtx;
12927 else
12928 {
12929 pic = gen_reg_rtx (Pmode);
12930 emit_insn (gen_set_got (pic));
12931 }
12932 }
12933
12934 if (TARGET_GNU2_TLS)
12935 {
12936 rtx tmp = ix86_tls_module_base ();
12937
12938 if (TARGET_64BIT)
12939 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12940 else
12941 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12942
12943 tp = get_thread_pointer (Pmode, true);
12944 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12945 gen_rtx_MINUS (Pmode, tmp, tp));
12946 }
12947 else
12948 {
12949 rtx caddr = ix86_tls_get_addr ();
12950
12951 if (TARGET_64BIT)
12952 {
12953 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12954
12955 start_sequence ();
12956 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12957 caddr));
12958 insns = get_insns ();
12959 end_sequence ();
12960
12961 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12962 share the LD_BASE result with other LD model accesses. */
12963 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12964 UNSPEC_TLS_LD_BASE);
12965
12966 RTL_CONST_CALL_P (insns) = 1;
12967 emit_libcall_block (insns, base, rax, eqv);
12968 }
12969 else
12970 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12971 }
12972
12973 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12974 off = gen_rtx_CONST (Pmode, off);
12975
12976 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12977
12978 if (TARGET_GNU2_TLS)
12979 {
12980 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12981
12982 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12983 }
12984 break;
12985
12986 case TLS_MODEL_INITIAL_EXEC:
12987 if (TARGET_64BIT)
12988 {
12989 if (TARGET_SUN_TLS && !TARGET_X32)
12990 {
12991 /* The Sun linker took the AMD64 TLS spec literally
12992 and can only handle %rax as the destination of the
12993 initial-exec code sequence. */
12994
12995 dest = gen_reg_rtx (DImode);
12996 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12997 return dest;
12998 }
12999
13000 /* Generate DImode references to avoid %fs:(%reg32)
13001 problems and a linker IE->LE relaxation bug. */
13002 tp_mode = DImode;
13003 pic = NULL;
13004 type = UNSPEC_GOTNTPOFF;
13005 }
13006 else if (flag_pic)
13007 {
13008 if (reload_in_progress)
13009 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13010 pic = pic_offset_table_rtx;
13011 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13012 }
13013 else if (!TARGET_ANY_GNU_TLS)
13014 {
13015 pic = gen_reg_rtx (Pmode);
13016 emit_insn (gen_set_got (pic));
13017 type = UNSPEC_GOTTPOFF;
13018 }
13019 else
13020 {
13021 pic = NULL;
13022 type = UNSPEC_INDNTPOFF;
13023 }
13024
13025 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13026 off = gen_rtx_CONST (tp_mode, off);
13027 if (pic)
13028 off = gen_rtx_PLUS (tp_mode, pic, off);
13029 off = gen_const_mem (tp_mode, off);
13030 set_mem_alias_set (off, ix86_GOT_alias_set ());
13031
13032 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13033 {
13034 base = get_thread_pointer (tp_mode,
13035 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13036 off = force_reg (tp_mode, off);
13037 return gen_rtx_PLUS (tp_mode, base, off);
13038 }
13039 else
13040 {
13041 base = get_thread_pointer (Pmode, true);
13042 dest = gen_reg_rtx (Pmode);
13043 emit_insn (ix86_gen_sub3 (dest, base, off));
13044 }
13045 break;
13046
13047 case TLS_MODEL_LOCAL_EXEC:
13048 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13049 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13050 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13051 off = gen_rtx_CONST (Pmode, off);
13052
13053 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13054 {
13055 base = get_thread_pointer (Pmode,
13056 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13057 return gen_rtx_PLUS (Pmode, base, off);
13058 }
13059 else
13060 {
13061 base = get_thread_pointer (Pmode, true);
13062 dest = gen_reg_rtx (Pmode);
13063 emit_insn (ix86_gen_sub3 (dest, base, off));
13064 }
13065 break;
13066
13067 default:
13068 gcc_unreachable ();
13069 }
13070
13071 return dest;
13072 }
13073
13074 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13075 to symbol DECL. */
13076
13077 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13078 htab_t dllimport_map;
13079
13080 static tree
13081 get_dllimport_decl (tree decl)
13082 {
13083 struct tree_map *h, in;
13084 void **loc;
13085 const char *name;
13086 const char *prefix;
13087 size_t namelen, prefixlen;
13088 char *imp_name;
13089 tree to;
13090 rtx rtl;
13091
13092 if (!dllimport_map)
13093 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13094
13095 in.hash = htab_hash_pointer (decl);
13096 in.base.from = decl;
13097 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13098 h = (struct tree_map *) *loc;
13099 if (h)
13100 return h->to;
13101
13102 *loc = h = ggc_alloc_tree_map ();
13103 h->hash = in.hash;
13104 h->base.from = decl;
13105 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13106 VAR_DECL, NULL, ptr_type_node);
13107 DECL_ARTIFICIAL (to) = 1;
13108 DECL_IGNORED_P (to) = 1;
13109 DECL_EXTERNAL (to) = 1;
13110 TREE_READONLY (to) = 1;
13111
13112 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13113 name = targetm.strip_name_encoding (name);
13114 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13115 ? "*__imp_" : "*__imp__";
13116 namelen = strlen (name);
13117 prefixlen = strlen (prefix);
13118 imp_name = (char *) alloca (namelen + prefixlen + 1);
13119 memcpy (imp_name, prefix, prefixlen);
13120 memcpy (imp_name + prefixlen, name, namelen + 1);
13121
13122 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13123 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13124 SET_SYMBOL_REF_DECL (rtl, to);
13125 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13126
13127 rtl = gen_const_mem (Pmode, rtl);
13128 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13129
13130 SET_DECL_RTL (to, rtl);
13131 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13132
13133 return to;
13134 }
13135
13136 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13137 true if we require the result be a register. */
13138
13139 static rtx
13140 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13141 {
13142 tree imp_decl;
13143 rtx x;
13144
13145 gcc_assert (SYMBOL_REF_DECL (symbol));
13146 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13147
13148 x = DECL_RTL (imp_decl);
13149 if (want_reg)
13150 x = force_reg (Pmode, x);
13151 return x;
13152 }
13153
13154 /* Try machine-dependent ways of modifying an illegitimate address
13155 to be legitimate. If we find one, return the new, valid address.
13156 This macro is used in only one place: `memory_address' in explow.c.
13157
13158 OLDX is the address as it was before break_out_memory_refs was called.
13159 In some cases it is useful to look at this to decide what needs to be done.
13160
13161 It is always safe for this macro to do nothing. It exists to recognize
13162 opportunities to optimize the output.
13163
13164 For the 80386, we handle X+REG by loading X into a register R and
13165 using R+REG. R will go in a general reg and indexing will be used.
13166 However, if REG is a broken-out memory address or multiplication,
13167 nothing needs to be done because REG can certainly go in a general reg.
13168
13169 When -fpic is used, special handling is needed for symbolic references.
13170 See comments by legitimize_pic_address in i386.c for details. */
13171
13172 static rtx
13173 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13174 enum machine_mode mode)
13175 {
13176 int changed = 0;
13177 unsigned log;
13178
13179 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13180 if (log)
13181 return legitimize_tls_address (x, (enum tls_model) log, false);
13182 if (GET_CODE (x) == CONST
13183 && GET_CODE (XEXP (x, 0)) == PLUS
13184 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13185 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13186 {
13187 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13188 (enum tls_model) log, false);
13189 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13190 }
13191
13192 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13193 {
13194 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13195 return legitimize_dllimport_symbol (x, true);
13196 if (GET_CODE (x) == CONST
13197 && GET_CODE (XEXP (x, 0)) == PLUS
13198 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13199 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13200 {
13201 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13202 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13203 }
13204 }
13205
13206 if (flag_pic && SYMBOLIC_CONST (x))
13207 return legitimize_pic_address (x, 0);
13208
13209 #if TARGET_MACHO
13210 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13211 return machopic_indirect_data_reference (x, 0);
13212 #endif
13213
13214 /* Canonicalize shifts by 0, 1, 2, 3 into a multiply. */
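  /* E.g. (ashift (reg) (const_int 2)) becomes (mult (reg) (const_int 4)),
     matching the index*scale form accepted by ix86_legitimate_address_p.  */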
13215 if (GET_CODE (x) == ASHIFT
13216 && CONST_INT_P (XEXP (x, 1))
13217 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13218 {
13219 changed = 1;
13220 log = INTVAL (XEXP (x, 1));
13221 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13222 GEN_INT (1 << log));
13223 }
13224
13225 if (GET_CODE (x) == PLUS)
13226 {
13227 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13228
13229 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13230 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13231 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13232 {
13233 changed = 1;
13234 log = INTVAL (XEXP (XEXP (x, 0), 1));
13235 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13236 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13237 GEN_INT (1 << log));
13238 }
13239
13240 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13241 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13242 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13243 {
13244 changed = 1;
13245 log = INTVAL (XEXP (XEXP (x, 1), 1));
13246 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13247 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13248 GEN_INT (1 << log));
13249 }
13250
13251 /* Put multiply first if it isn't already. */
13252 if (GET_CODE (XEXP (x, 1)) == MULT)
13253 {
13254 rtx tmp = XEXP (x, 0);
13255 XEXP (x, 0) = XEXP (x, 1);
13256 XEXP (x, 1) = tmp;
13257 changed = 1;
13258 }
13259
13260 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13261 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13262 created by virtual register instantiation, register elimination, and
13263 similar optimizations. */
13264 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13265 {
13266 changed = 1;
13267 x = gen_rtx_PLUS (Pmode,
13268 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13269 XEXP (XEXP (x, 1), 0)),
13270 XEXP (XEXP (x, 1), 1));
13271 }
13272
13273 /* Canonicalize
13274 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13275 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13276 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13277 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13278 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13279 && CONSTANT_P (XEXP (x, 1)))
13280 {
13281 rtx constant;
13282 rtx other = NULL_RTX;
13283
13284 if (CONST_INT_P (XEXP (x, 1)))
13285 {
13286 constant = XEXP (x, 1);
13287 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13288 }
13289 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13290 {
13291 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13292 other = XEXP (x, 1);
13293 }
13294 else
13295 constant = 0;
13296
13297 if (constant)
13298 {
13299 changed = 1;
13300 x = gen_rtx_PLUS (Pmode,
13301 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13302 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13303 plus_constant (Pmode, other,
13304 INTVAL (constant)));
13305 }
13306 }
13307
13308 if (changed && ix86_legitimate_address_p (mode, x, false))
13309 return x;
13310
13311 if (GET_CODE (XEXP (x, 0)) == MULT)
13312 {
13313 changed = 1;
13314 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13315 }
13316
13317 if (GET_CODE (XEXP (x, 1)) == MULT)
13318 {
13319 changed = 1;
13320 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13321 }
13322
13323 if (changed
13324 && REG_P (XEXP (x, 1))
13325 && REG_P (XEXP (x, 0)))
13326 return x;
13327
13328 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13329 {
13330 changed = 1;
13331 x = legitimize_pic_address (x, 0);
13332 }
13333
13334 if (changed && ix86_legitimate_address_p (mode, x, false))
13335 return x;
13336
13337 if (REG_P (XEXP (x, 0)))
13338 {
13339 rtx temp = gen_reg_rtx (Pmode);
13340 rtx val = force_operand (XEXP (x, 1), temp);
13341 if (val != temp)
13342 {
13343 if (GET_MODE (val) != Pmode)
13344 val = convert_to_mode (Pmode, val, 1);
13345 emit_move_insn (temp, val);
13346 }
13347
13348 XEXP (x, 1) = temp;
13349 return x;
13350 }
13351
13352 else if (REG_P (XEXP (x, 1)))
13353 {
13354 rtx temp = gen_reg_rtx (Pmode);
13355 rtx val = force_operand (XEXP (x, 0), temp);
13356 if (val != temp)
13357 {
13358 if (GET_MODE (val) != Pmode)
13359 val = convert_to_mode (Pmode, val, 1);
13360 emit_move_insn (temp, val);
13361 }
13362
13363 XEXP (x, 0) = temp;
13364 return x;
13365 }
13366 }
13367
13368 return x;
13369 }
13370 \f
13371 /* Print an integer constant expression in assembler syntax. Addition
13372 and subtraction are the only arithmetic that may appear in these
13373 expressions. FILE is the stdio stream to write to, X is the rtx, and
13374 CODE is the operand print code from the output string. */
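/* For instance, (const (unspec [foo] UNSPEC_GOTOFF)) is printed as
   "foo@GOTOFF", and (const (plus (symbol_ref "foo") (const_int 4))) as
   "4+foo", since the PLUS case below always emits the integer constant
   first.  */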
13375
13376 static void
13377 output_pic_addr_const (FILE *file, rtx x, int code)
13378 {
13379 char buf[256];
13380
13381 switch (GET_CODE (x))
13382 {
13383 case PC:
13384 gcc_assert (flag_pic);
13385 putc ('.', file);
13386 break;
13387
13388 case SYMBOL_REF:
13389 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13390 output_addr_const (file, x);
13391 else
13392 {
13393 const char *name = XSTR (x, 0);
13394
13395 /* Mark the decl as referenced so that cgraph will
13396 output the function. */
13397 if (SYMBOL_REF_DECL (x))
13398 mark_decl_referenced (SYMBOL_REF_DECL (x));
13399
13400 #if TARGET_MACHO
13401 if (MACHOPIC_INDIRECT
13402 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13403 name = machopic_indirection_name (x, /*stub_p=*/true);
13404 #endif
13405 assemble_name (file, name);
13406 }
13407 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13408 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13409 fputs ("@PLT", file);
13410 break;
13411
13412 case LABEL_REF:
13413 x = XEXP (x, 0);
13414 /* FALLTHRU */
13415 case CODE_LABEL:
13416 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13417 assemble_name (asm_out_file, buf);
13418 break;
13419
13420 case CONST_INT:
13421 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13422 break;
13423
13424 case CONST:
13425 /* This used to output parentheses around the expression,
13426 but that does not work on the 386 (either ATT or BSD assembler). */
13427 output_pic_addr_const (file, XEXP (x, 0), code);
13428 break;
13429
13430 case CONST_DOUBLE:
13431 if (GET_MODE (x) == VOIDmode)
13432 {
13433 /* We can use %d if the number is <32 bits and positive. */
13434 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13435 fprintf (file, "0x%lx%08lx",
13436 (unsigned long) CONST_DOUBLE_HIGH (x),
13437 (unsigned long) CONST_DOUBLE_LOW (x));
13438 else
13439 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13440 }
13441 else
13442 /* We can't handle floating point constants;
13443 TARGET_PRINT_OPERAND must handle them. */
13444 output_operand_lossage ("floating constant misused");
13445 break;
13446
13447 case PLUS:
13448 /* Some assemblers need integer constants to appear first. */
13449 if (CONST_INT_P (XEXP (x, 0)))
13450 {
13451 output_pic_addr_const (file, XEXP (x, 0), code);
13452 putc ('+', file);
13453 output_pic_addr_const (file, XEXP (x, 1), code);
13454 }
13455 else
13456 {
13457 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13458 output_pic_addr_const (file, XEXP (x, 1), code);
13459 putc ('+', file);
13460 output_pic_addr_const (file, XEXP (x, 0), code);
13461 }
13462 break;
13463
13464 case MINUS:
13465 if (!TARGET_MACHO)
13466 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13467 output_pic_addr_const (file, XEXP (x, 0), code);
13468 putc ('-', file);
13469 output_pic_addr_const (file, XEXP (x, 1), code);
13470 if (!TARGET_MACHO)
13471 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13472 break;
13473
13474 case UNSPEC:
13475 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13476 {
13477 bool f = i386_asm_output_addr_const_extra (file, x);
13478 gcc_assert (f);
13479 break;
13480 }
13481
13482 gcc_assert (XVECLEN (x, 0) == 1);
13483 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13484 switch (XINT (x, 1))
13485 {
13486 case UNSPEC_GOT:
13487 fputs ("@GOT", file);
13488 break;
13489 case UNSPEC_GOTOFF:
13490 fputs ("@GOTOFF", file);
13491 break;
13492 case UNSPEC_PLTOFF:
13493 fputs ("@PLTOFF", file);
13494 break;
13495 case UNSPEC_PCREL:
13496 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13497 "(%rip)" : "[rip]", file);
13498 break;
13499 case UNSPEC_GOTPCREL:
13500 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13501 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13502 break;
13503 case UNSPEC_GOTTPOFF:
13504 /* FIXME: This might be @TPOFF in Sun ld too. */
13505 fputs ("@gottpoff", file);
13506 break;
13507 case UNSPEC_TPOFF:
13508 fputs ("@tpoff", file);
13509 break;
13510 case UNSPEC_NTPOFF:
13511 if (TARGET_64BIT)
13512 fputs ("@tpoff", file);
13513 else
13514 fputs ("@ntpoff", file);
13515 break;
13516 case UNSPEC_DTPOFF:
13517 fputs ("@dtpoff", file);
13518 break;
13519 case UNSPEC_GOTNTPOFF:
13520 if (TARGET_64BIT)
13521 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13522 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13523 else
13524 fputs ("@gotntpoff", file);
13525 break;
13526 case UNSPEC_INDNTPOFF:
13527 fputs ("@indntpoff", file);
13528 break;
13529 #if TARGET_MACHO
13530 case UNSPEC_MACHOPIC_OFFSET:
13531 putc ('-', file);
13532 machopic_output_function_base_name (file);
13533 break;
13534 #endif
13535 default:
13536 output_operand_lossage ("invalid UNSPEC as operand");
13537 break;
13538 }
13539 break;
13540
13541 default:
13542 output_operand_lossage ("invalid expression as operand");
13543 }
13544 }
13545
13546 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13547 We need to emit DTP-relative relocations. */
13548
13549 static void ATTRIBUTE_UNUSED
13550 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13551 {
13552 fputs (ASM_LONG, file);
13553 output_addr_const (file, x);
13554 fputs ("@dtpoff", file);
13555 switch (size)
13556 {
13557 case 4:
13558 break;
13559 case 8:
13560 fputs (", 0", file);
13561 break;
13562 default:
13563 gcc_unreachable ();
13564 }
13565 }
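/* For a symbol "foo" this emits roughly ".long foo@dtpoff" when SIZE
   is 4 and ".long foo@dtpoff, 0" when SIZE is 8, i.e. two 32-bit words
   with a zero upper half in the 8-byte case.  */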
13566
13567 /* Return true if X is a representation of the PIC register. This copes
13568 with calls from ix86_find_base_term, where the register might have
13569 been replaced by a cselib value. */
13570
13571 static bool
13572 ix86_pic_register_p (rtx x)
13573 {
13574 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13575 return (pic_offset_table_rtx
13576 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13577 else
13578 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13579 }
13580
13581 /* Helper function for ix86_delegitimize_address.
13582 Attempt to delegitimize TLS local-exec accesses. */
13583
13584 static rtx
13585 ix86_delegitimize_tls_address (rtx orig_x)
13586 {
13587 rtx x = orig_x, unspec;
13588 struct ix86_address addr;
13589
13590 if (!TARGET_TLS_DIRECT_SEG_REFS)
13591 return orig_x;
13592 if (MEM_P (x))
13593 x = XEXP (x, 0);
13594 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13595 return orig_x;
13596 if (ix86_decompose_address (x, &addr) == 0
13597 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13598 || addr.disp == NULL_RTX
13599 || GET_CODE (addr.disp) != CONST)
13600 return orig_x;
13601 unspec = XEXP (addr.disp, 0);
13602 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13603 unspec = XEXP (unspec, 0);
13604 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13605 return orig_x;
13606 x = XVECEXP (unspec, 0, 0);
13607 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13608 if (unspec != XEXP (addr.disp, 0))
13609 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13610 if (addr.index)
13611 {
13612 rtx idx = addr.index;
13613 if (addr.scale != 1)
13614 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13615 x = gen_rtx_PLUS (Pmode, idx, x);
13616 }
13617 if (addr.base)
13618 x = gen_rtx_PLUS (Pmode, addr.base, x);
13619 if (MEM_P (orig_x))
13620 x = replace_equiv_address_nv (orig_x, x);
13621 return x;
13622 }
13623
13624 /* In the name of slightly smaller debug output, and to cater to
13625 general assembler lossage, recognize PIC+GOTOFF and turn it back
13626 into a direct symbol reference.
13627
13628 On Darwin, this is necessary to avoid a crash, because Darwin
13629 has a different PIC label for each routine but the DWARF debugging
13630 information is not associated with any particular routine, so it's
13631 necessary to remove references to the PIC label from RTL stored by
13632 the DWARF output code. */
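/* As a sketch of the 32-bit case handled below, an address of the form

     (plus pic_offset_table_rtx
           (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))

   is turned back into plain (symbol_ref "foo"), with any constant or
   register addend re-applied afterwards.  */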
13633
13634 static rtx
13635 ix86_delegitimize_address (rtx x)
13636 {
13637 rtx orig_x = delegitimize_mem_from_attrs (x);
13638 /* addend is NULL or some rtx if x is something+GOTOFF where
13639 something doesn't include the PIC register. */
13640 rtx addend = NULL_RTX;
13641 /* reg_addend is NULL or a multiple of some register. */
13642 rtx reg_addend = NULL_RTX;
13643 /* const_addend is NULL or a const_int. */
13644 rtx const_addend = NULL_RTX;
13645 /* This is the result, or NULL. */
13646 rtx result = NULL_RTX;
13647
13648 x = orig_x;
13649
13650 if (MEM_P (x))
13651 x = XEXP (x, 0);
13652
13653 if (TARGET_64BIT)
13654 {
13655 if (GET_CODE (x) == CONST
13656 && GET_CODE (XEXP (x, 0)) == PLUS
13657 && GET_MODE (XEXP (x, 0)) == Pmode
13658 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13659 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13660 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13661 {
13662 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13663 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13664 if (MEM_P (orig_x))
13665 x = replace_equiv_address_nv (orig_x, x);
13666 return x;
13667 }
13668 if (GET_CODE (x) != CONST
13669 || GET_CODE (XEXP (x, 0)) != UNSPEC
13670 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13671 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13672 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13673 return ix86_delegitimize_tls_address (orig_x);
13674 x = XVECEXP (XEXP (x, 0), 0, 0);
13675 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13676 {
13677 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13678 GET_MODE (x), 0);
13679 if (x == NULL_RTX)
13680 return orig_x;
13681 }
13682 return x;
13683 }
13684
13685 if (GET_CODE (x) != PLUS
13686 || GET_CODE (XEXP (x, 1)) != CONST)
13687 return ix86_delegitimize_tls_address (orig_x);
13688
13689 if (ix86_pic_register_p (XEXP (x, 0)))
13690 /* %ebx + GOT/GOTOFF */
13691 ;
13692 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13693 {
13694 /* %ebx + %reg * scale + GOT/GOTOFF */
13695 reg_addend = XEXP (x, 0);
13696 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13697 reg_addend = XEXP (reg_addend, 1);
13698 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13699 reg_addend = XEXP (reg_addend, 0);
13700 else
13701 {
13702 reg_addend = NULL_RTX;
13703 addend = XEXP (x, 0);
13704 }
13705 }
13706 else
13707 addend = XEXP (x, 0);
13708
13709 x = XEXP (XEXP (x, 1), 0);
13710 if (GET_CODE (x) == PLUS
13711 && CONST_INT_P (XEXP (x, 1)))
13712 {
13713 const_addend = XEXP (x, 1);
13714 x = XEXP (x, 0);
13715 }
13716
13717 if (GET_CODE (x) == UNSPEC
13718 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13719 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13720 result = XVECEXP (x, 0, 0);
13721
13722 if (TARGET_MACHO && darwin_local_data_pic (x)
13723 && !MEM_P (orig_x))
13724 result = XVECEXP (x, 0, 0);
13725
13726 if (! result)
13727 return ix86_delegitimize_tls_address (orig_x);
13728
13729 if (const_addend)
13730 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13731 if (reg_addend)
13732 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13733 if (addend)
13734 {
13735 /* If the rest of the original X doesn't involve the PIC register,
13736 add the addend and subtract pic_offset_table_rtx. This can happen
13737 e.g. for code like:
13738 leal (%ebx, %ecx, 4), %ecx
13739 ...
13740 movl foo@GOTOFF(%ecx), %edx
13741 in which case we return (%ecx - %ebx) + foo. */
13742 if (pic_offset_table_rtx)
13743 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13744 pic_offset_table_rtx),
13745 result);
13746 else
13747 return orig_x;
13748 }
13749 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13750 {
13751 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13752 if (result == NULL_RTX)
13753 return orig_x;
13754 }
13755 return result;
13756 }
13757
13758 /* If X is a machine specific address (i.e. a symbol or label being
13759 referenced as a displacement from the GOT implemented using an
13760 UNSPEC), then return the base term. Otherwise return X. */
13761
13762 rtx
13763 ix86_find_base_term (rtx x)
13764 {
13765 rtx term;
13766
13767 if (TARGET_64BIT)
13768 {
13769 if (GET_CODE (x) != CONST)
13770 return x;
13771 term = XEXP (x, 0);
13772 if (GET_CODE (term) == PLUS
13773 && (CONST_INT_P (XEXP (term, 1))
13774 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13775 term = XEXP (term, 0);
13776 if (GET_CODE (term) != UNSPEC
13777 || (XINT (term, 1) != UNSPEC_GOTPCREL
13778 && XINT (term, 1) != UNSPEC_PCREL))
13779 return x;
13780
13781 return XVECEXP (term, 0, 0);
13782 }
13783
13784 return ix86_delegitimize_address (x);
13785 }
13786 \f
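/* Write to FILE the condition-code suffix (e.g. "e", "ne", "g", "b")
   for comparison CODE in flags mode MODE.  If REVERSE is true the
   condition is inverted first.  FP is true for floating-point uses
   such as fcmov, where some assemblers want alternate spellings like
   "nbe" instead of "a" (see the GTU case below).  Floating-point
   compare modes are first mapped onto the corresponding integer
   condition.  */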
13787 static void
13788 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13789 bool fp, FILE *file)
13790 {
13791 const char *suffix;
13792
13793 if (mode == CCFPmode || mode == CCFPUmode)
13794 {
13795 code = ix86_fp_compare_code_to_integer (code);
13796 mode = CCmode;
13797 }
13798 if (reverse)
13799 code = reverse_condition (code);
13800
13801 switch (code)
13802 {
13803 case EQ:
13804 switch (mode)
13805 {
13806 case CCAmode:
13807 suffix = "a";
13808 break;
13809
13810 case CCCmode:
13811 suffix = "c";
13812 break;
13813
13814 case CCOmode:
13815 suffix = "o";
13816 break;
13817
13818 case CCSmode:
13819 suffix = "s";
13820 break;
13821
13822 default:
13823 suffix = "e";
13824 }
13825 break;
13826 case NE:
13827 switch (mode)
13828 {
13829 case CCAmode:
13830 suffix = "na";
13831 break;
13832
13833 case CCCmode:
13834 suffix = "nc";
13835 break;
13836
13837 case CCOmode:
13838 suffix = "no";
13839 break;
13840
13841 case CCSmode:
13842 suffix = "ns";
13843 break;
13844
13845 default:
13846 suffix = "ne";
13847 }
13848 break;
13849 case GT:
13850 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13851 suffix = "g";
13852 break;
13853 case GTU:
13854 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13855 Those same assemblers have the same but opposite lossage on cmov. */
13856 if (mode == CCmode)
13857 suffix = fp ? "nbe" : "a";
13858 else if (mode == CCCmode)
13859 suffix = "b";
13860 else
13861 gcc_unreachable ();
13862 break;
13863 case LT:
13864 switch (mode)
13865 {
13866 case CCNOmode:
13867 case CCGOCmode:
13868 suffix = "s";
13869 break;
13870
13871 case CCmode:
13872 case CCGCmode:
13873 suffix = "l";
13874 break;
13875
13876 default:
13877 gcc_unreachable ();
13878 }
13879 break;
13880 case LTU:
13881 gcc_assert (mode == CCmode || mode == CCCmode);
13882 suffix = "b";
13883 break;
13884 case GE:
13885 switch (mode)
13886 {
13887 case CCNOmode:
13888 case CCGOCmode:
13889 suffix = "ns";
13890 break;
13891
13892 case CCmode:
13893 case CCGCmode:
13894 suffix = "ge";
13895 break;
13896
13897 default:
13898 gcc_unreachable ();
13899 }
13900 break;
13901 case GEU:
13902 /* ??? As above. */
13903 gcc_assert (mode == CCmode || mode == CCCmode);
13904 suffix = fp ? "nb" : "ae";
13905 break;
13906 case LE:
13907 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13908 suffix = "le";
13909 break;
13910 case LEU:
13911 /* ??? As above. */
13912 if (mode == CCmode)
13913 suffix = "be";
13914 else if (mode == CCCmode)
13915 suffix = fp ? "nb" : "ae";
13916 else
13917 gcc_unreachable ();
13918 break;
13919 case UNORDERED:
13920 suffix = fp ? "u" : "p";
13921 break;
13922 case ORDERED:
13923 suffix = fp ? "nu" : "np";
13924 break;
13925 default:
13926 gcc_unreachable ();
13927 }
13928 fputs (suffix, file);
13929 }
13930
13931 /* Print the name of register X to FILE based on its machine mode and number.
13932 If CODE is 'w', pretend the mode is HImode.
13933 If CODE is 'b', pretend the mode is QImode.
13934 If CODE is 'k', pretend the mode is SImode.
13935 If CODE is 'q', pretend the mode is DImode.
13936 If CODE is 'x', pretend the mode is V4SFmode.
13937 If CODE is 't', pretend the mode is V8SFmode.
13938 If CODE is 'h', pretend the reg is the 'high' byte register.
13939 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13940 If CODE is 'd', duplicate the operand for an AVX instruction.
13941 */
13942
13943 void
13944 print_reg (rtx x, int code, FILE *file)
13945 {
13946 const char *reg;
13947 bool duplicated = code == 'd' && TARGET_AVX;
13948
13949 gcc_assert (x == pc_rtx
13950 || (REGNO (x) != ARG_POINTER_REGNUM
13951 && REGNO (x) != FRAME_POINTER_REGNUM
13952 && REGNO (x) != FLAGS_REG
13953 && REGNO (x) != FPSR_REG
13954 && REGNO (x) != FPCR_REG));
13955
13956 if (ASSEMBLER_DIALECT == ASM_ATT)
13957 putc ('%', file);
13958
13959 if (x == pc_rtx)
13960 {
13961 gcc_assert (TARGET_64BIT);
13962 fputs ("rip", file);
13963 return;
13964 }
13965
13966 if (code == 'w' || MMX_REG_P (x))
13967 code = 2;
13968 else if (code == 'b')
13969 code = 1;
13970 else if (code == 'k')
13971 code = 4;
13972 else if (code == 'q')
13973 code = 8;
13974 else if (code == 'y')
13975 code = 3;
13976 else if (code == 'h')
13977 code = 0;
13978 else if (code == 'x')
13979 code = 16;
13980 else if (code == 't')
13981 code = 32;
13982 else
13983 code = GET_MODE_SIZE (GET_MODE (x));
13984
13985 /* Irritatingly, the AMD extended registers use a different naming
13986 convention from the normal registers: "r%d[bwd]". */
13987 if (REX_INT_REG_P (x))
13988 {
13989 gcc_assert (TARGET_64BIT);
13990 putc ('r', file);
13991 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13992 switch (code)
13993 {
13994 case 0:
13995 error ("extended registers have no high halves");
13996 break;
13997 case 1:
13998 putc ('b', file);
13999 break;
14000 case 2:
14001 putc ('w', file);
14002 break;
14003 case 4:
14004 putc ('d', file);
14005 break;
14006 case 8:
14007 /* no suffix */
14008 break;
14009 default:
14010 error ("unsupported operand size for extended register");
14011 break;
14012 }
14013 return;
14014 }
14015
14016 reg = NULL;
14017 switch (code)
14018 {
14019 case 3:
14020 if (STACK_TOP_P (x))
14021 {
14022 reg = "st(0)";
14023 break;
14024 }
14025 /* FALLTHRU */
14026 case 8:
14027 case 4:
14028 case 12:
14029 if (! ANY_FP_REG_P (x))
14030 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14031 /* FALLTHRU */
14032 case 16:
14033 case 2:
14034 normal:
14035 reg = hi_reg_name[REGNO (x)];
14036 break;
14037 case 1:
14038 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
14039 goto normal;
14040 reg = qi_reg_name[REGNO (x)];
14041 break;
14042 case 0:
14043 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
14044 goto normal;
14045 reg = qi_high_reg_name[REGNO (x)];
14046 break;
14047 case 32:
14048 if (SSE_REG_P (x))
14049 {
14050 gcc_assert (!duplicated);
14051 putc ('y', file);
14052 fputs (hi_reg_name[REGNO (x)] + 1, file);
14053 return;
14054 }
14055 break;
14056 default:
14057 gcc_unreachable ();
14058 }
14059
14060 fputs (reg, file);
14061 if (duplicated)
14062 {
14063 if (ASSEMBLER_DIALECT == ASM_ATT)
14064 fprintf (file, ", %%%s", reg);
14065 else
14066 fprintf (file, ", %s", reg);
14067 }
14068 }
14069
14070 /* Locate some local-dynamic symbol still in use by this function
14071 so that we can print its name in some tls_local_dynamic_base
14072 pattern. */
14073
14074 static int
14075 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14076 {
14077 rtx x = *px;
14078
14079 if (GET_CODE (x) == SYMBOL_REF
14080 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14081 {
14082 cfun->machine->some_ld_name = XSTR (x, 0);
14083 return 1;
14084 }
14085
14086 return 0;
14087 }
14088
14089 static const char *
14090 get_some_local_dynamic_name (void)
14091 {
14092 rtx insn;
14093
14094 if (cfun->machine->some_ld_name)
14095 return cfun->machine->some_ld_name;
14096
14097 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14098 if (NONDEBUG_INSN_P (insn)
14099 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14100 return cfun->machine->some_ld_name;
14101
14102 return NULL;
14103 }
14104
14105 /* Meaning of CODE:
14106 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14107 C -- print opcode suffix for set/cmov insn.
14108 c -- like C, but print reversed condition
14109 F,f -- likewise, but for floating-point.
14110 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14111 otherwise nothing
14112 R -- print the prefix for register names.
14113 z -- print the opcode suffix for the size of the current operand.
14114 Z -- likewise, with special suffixes for x87 instructions.
14115 * -- print a star (in certain assembler syntax)
14116 A -- print an absolute memory reference.
14117 E -- print address with DImode register names if TARGET_64BIT.
14118 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14119 s -- print a shift double count, followed by the assembler's argument
14120 delimiter.
14121 b -- print the QImode name of the register for the indicated operand.
14122 %b0 would print %al if operands[0] is reg 0.
14123 w -- likewise, print the HImode name of the register.
14124 k -- likewise, print the SImode name of the register.
14125 q -- likewise, print the DImode name of the register.
14126 x -- likewise, print the V4SFmode name of the register.
14127 t -- likewise, print the V8SFmode name of the register.
14128 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14129 y -- print "st(0)" instead of "st" as a register.
14130 d -- print duplicated register operand for AVX instruction.
14131 D -- print condition for SSE cmp instruction.
14132 P -- if PIC, print an @PLT suffix.
14133 p -- print raw symbol name.
14134 X -- don't print any sort of PIC '@' suffix for a symbol.
14135 & -- print some in-use local-dynamic symbol name.
14136 H -- print a memory address offset by 8; used for sse high-parts
14137 Y -- print condition for XOP pcom* instruction.
14138 + -- print a branch hint as 'cs' or 'ds' prefix
14139 ; -- print a semicolon (after prefixes due to bug in older gas).
14140 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14141 @ -- print a segment register of thread base pointer load
14142 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14143 */
14144
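/* As an illustrative example (not an exhaustive list): if operands[0] is
   hard register 0 (the AX register), then "%b0" prints "al", "%w0" prints
   "ax", "%k0" prints "eax" and, on 64-bit targets, "%q0" prints "rax";
   in AT&T syntax each name is additionally prefixed with '%'.  */
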
14145 void
14146 ix86_print_operand (FILE *file, rtx x, int code)
14147 {
14148 if (code)
14149 {
14150 switch (code)
14151 {
14152 case 'A':
14153 switch (ASSEMBLER_DIALECT)
14154 {
14155 case ASM_ATT:
14156 putc ('*', file);
14157 break;
14158
14159 case ASM_INTEL:
14160 /* Intel syntax. For absolute addresses, registers should not
14161 be surrounded by brackets. */
14162 if (!REG_P (x))
14163 {
14164 putc ('[', file);
14165 ix86_print_operand (file, x, 0);
14166 putc (']', file);
14167 return;
14168 }
14169 break;
14170
14171 default:
14172 gcc_unreachable ();
14173 }
14174
14175 ix86_print_operand (file, x, 0);
14176 return;
14177
14178 case 'E':
14179 /* Wrap address in an UNSPEC to declare special handling. */
14180 if (TARGET_64BIT)
14181 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14182
14183 output_address (x);
14184 return;
14185
14186 case 'L':
14187 if (ASSEMBLER_DIALECT == ASM_ATT)
14188 putc ('l', file);
14189 return;
14190
14191 case 'W':
14192 if (ASSEMBLER_DIALECT == ASM_ATT)
14193 putc ('w', file);
14194 return;
14195
14196 case 'B':
14197 if (ASSEMBLER_DIALECT == ASM_ATT)
14198 putc ('b', file);
14199 return;
14200
14201 case 'Q':
14202 if (ASSEMBLER_DIALECT == ASM_ATT)
14203 putc ('l', file);
14204 return;
14205
14206 case 'S':
14207 if (ASSEMBLER_DIALECT == ASM_ATT)
14208 putc ('s', file);
14209 return;
14210
14211 case 'T':
14212 if (ASSEMBLER_DIALECT == ASM_ATT)
14213 putc ('t', file);
14214 return;
14215
14216 case 'O':
14217 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14218 if (ASSEMBLER_DIALECT != ASM_ATT)
14219 return;
14220
14221 switch (GET_MODE_SIZE (GET_MODE (x)))
14222 {
14223 case 2:
14224 putc ('w', file);
14225 break;
14226
14227 case 4:
14228 putc ('l', file);
14229 break;
14230
14231 case 8:
14232 putc ('q', file);
14233 break;
14234
14235 default:
14236 output_operand_lossage
14237 ("invalid operand size for operand code 'O'");
14238 return;
14239 }
14240
14241 putc ('.', file);
14242 #endif
14243 return;
14244
14245 case 'z':
14246 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14247 {
14248 /* Opcodes don't get size suffixes if using Intel syntax. */
14249 if (ASSEMBLER_DIALECT == ASM_INTEL)
14250 return;
14251
14252 switch (GET_MODE_SIZE (GET_MODE (x)))
14253 {
14254 case 1:
14255 putc ('b', file);
14256 return;
14257
14258 case 2:
14259 putc ('w', file);
14260 return;
14261
14262 case 4:
14263 putc ('l', file);
14264 return;
14265
14266 case 8:
14267 putc ('q', file);
14268 return;
14269
14270 default:
14271 output_operand_lossage
14272 ("invalid operand size for operand code 'z'");
14273 return;
14274 }
14275 }
14276
14277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14278 warning
14279 (0, "non-integer operand used with operand code 'z'");
14280 /* FALLTHRU */
14281
14282 case 'Z':
14283 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14284 if (ASSEMBLER_DIALECT == ASM_INTEL)
14285 return;
14286
14287 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14288 {
14289 switch (GET_MODE_SIZE (GET_MODE (x)))
14290 {
14291 case 2:
14292 #ifdef HAVE_AS_IX86_FILDS
14293 putc ('s', file);
14294 #endif
14295 return;
14296
14297 case 4:
14298 putc ('l', file);
14299 return;
14300
14301 case 8:
14302 #ifdef HAVE_AS_IX86_FILDQ
14303 putc ('q', file);
14304 #else
14305 fputs ("ll", file);
14306 #endif
14307 return;
14308
14309 default:
14310 break;
14311 }
14312 }
14313 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14314 {
14315 /* 387 opcodes don't get size suffixes
14316 if the operands are registers. */
14317 if (STACK_REG_P (x))
14318 return;
14319
14320 switch (GET_MODE_SIZE (GET_MODE (x)))
14321 {
14322 case 4:
14323 putc ('s', file);
14324 return;
14325
14326 case 8:
14327 putc ('l', file);
14328 return;
14329
14330 case 12:
14331 case 16:
14332 putc ('t', file);
14333 return;
14334
14335 default:
14336 break;
14337 }
14338 }
14339 else
14340 {
14341 output_operand_lossage
14342 ("invalid operand type used with operand code 'Z'");
14343 return;
14344 }
14345
14346 output_operand_lossage
14347 ("invalid operand size for operand code 'Z'");
14348 return;
14349
14350 case 'd':
14351 case 'b':
14352 case 'w':
14353 case 'k':
14354 case 'q':
14355 case 'h':
14356 case 't':
14357 case 'y':
14358 case 'x':
14359 case 'X':
14360 case 'P':
14361 case 'p':
14362 break;
14363
14364 case 's':
14365 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14366 {
14367 ix86_print_operand (file, x, 0);
14368 fputs (", ", file);
14369 }
14370 return;
14371
14372 case 'Y':
14373 switch (GET_CODE (x))
14374 {
14375 case NE:
14376 fputs ("neq", file);
14377 break;
14378 case EQ:
14379 fputs ("eq", file);
14380 break;
14381 case GE:
14382 case GEU:
14383 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14384 break;
14385 case GT:
14386 case GTU:
14387 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14388 break;
14389 case LE:
14390 case LEU:
14391 fputs ("le", file);
14392 break;
14393 case LT:
14394 case LTU:
14395 fputs ("lt", file);
14396 break;
14397 case UNORDERED:
14398 fputs ("unord", file);
14399 break;
14400 case ORDERED:
14401 fputs ("ord", file);
14402 break;
14403 case UNEQ:
14404 fputs ("ueq", file);
14405 break;
14406 case UNGE:
14407 fputs ("nlt", file);
14408 break;
14409 case UNGT:
14410 fputs ("nle", file);
14411 break;
14412 case UNLE:
14413 fputs ("ule", file);
14414 break;
14415 case UNLT:
14416 fputs ("ult", file);
14417 break;
14418 case LTGT:
14419 fputs ("une", file);
14420 break;
14421 default:
14422 output_operand_lossage ("operand is not a condition code, "
14423 "invalid operand code 'Y'");
14424 return;
14425 }
14426 return;
14427
14428 case 'D':
14429 /* A little bit of brain damage here.  The SSE compare instructions
14430 use completely different names for the comparisons than the
14431 fp conditional moves do. */
14432 switch (GET_CODE (x))
14433 {
14434 case UNEQ:
14435 if (TARGET_AVX)
14436 {
14437 fputs ("eq_us", file);
14438 break;
14439 }
14440 case EQ:
14441 fputs ("eq", file);
14442 break;
14443 case UNLT:
14444 if (TARGET_AVX)
14445 {
14446 fputs ("nge", file);
14447 break;
14448 }
14449 case LT:
14450 fputs ("lt", file);
14451 break;
14452 case UNLE:
14453 if (TARGET_AVX)
14454 {
14455 fputs ("ngt", file);
14456 break;
14457 }
14458 case LE:
14459 fputs ("le", file);
14460 break;
14461 case UNORDERED:
14462 fputs ("unord", file);
14463 break;
14464 case LTGT:
14465 if (TARGET_AVX)
14466 {
14467 fputs ("neq_oq", file);
14468 break;
14469 }
14470 case NE:
14471 fputs ("neq", file);
14472 break;
14473 case GE:
14474 if (TARGET_AVX)
14475 {
14476 fputs ("ge", file);
14477 break;
14478 }
14479 case UNGE:
14480 fputs ("nlt", file);
14481 break;
14482 case GT:
14483 if (TARGET_AVX)
14484 {
14485 fputs ("gt", file);
14486 break;
14487 }
14488 case UNGT:
14489 fputs ("nle", file);
14490 break;
14491 case ORDERED:
14492 fputs ("ord", file);
14493 break;
14494 default:
14495 output_operand_lossage ("operand is not a condition code, "
14496 "invalid operand code 'D'");
14497 return;
14498 }
14499 return;
14500
14501 case 'F':
14502 case 'f':
14503 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14504 if (ASSEMBLER_DIALECT == ASM_ATT)
14505 putc ('.', file);
14506 #endif
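	  /* FALLTHRU */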
14507
14508 case 'C':
14509 case 'c':
14510 if (!COMPARISON_P (x))
14511 {
14512 output_operand_lossage ("operand is not a condition code, "
14513 "invalid operand code '%c'", code);
14514 return;
14515 }
14516 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14517 code == 'c' || code == 'f',
14518 code == 'F' || code == 'f',
14519 file);
14520 return;
14521
14522 case 'H':
14523 if (!offsettable_memref_p (x))
14524 {
14525 output_operand_lossage ("operand is not an offsettable memory "
14526 "reference, invalid operand code 'H'");
14527 return;
14528 }
14529 /* It doesn't actually matter what mode we use here, as we're
14530 only going to use this for printing. */
14531 x = adjust_address_nv (x, DImode, 8);
14532 break;
14533
14534 case 'K':
14535 gcc_assert (CONST_INT_P (x));
14536
14537 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14538 #ifdef HAVE_AS_IX86_HLE
14539 fputs ("xacquire ", file);
14540 #else
14541 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14542 #endif
14543 else if (INTVAL (x) & IX86_HLE_RELEASE)
14544 #ifdef HAVE_AS_IX86_HLE
14545 fputs ("xrelease ", file);
14546 #else
14547 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14548 #endif
14549 /* We do not want to print the value of the operand. */
14550 return;
14551
14552 case '*':
14553 if (ASSEMBLER_DIALECT == ASM_ATT)
14554 putc ('*', file);
14555 return;
14556
14557 case '&':
14558 {
14559 const char *name = get_some_local_dynamic_name ();
14560 if (name == NULL)
14561 output_operand_lossage ("'%%&' used without any "
14562 "local dynamic TLS references");
14563 else
14564 assemble_name (file, name);
14565 return;
14566 }
14567
14568 case '+':
14569 {
14570 rtx x;
14571
14572 if (!optimize
14573 || optimize_function_for_size_p (cfun)
14574 || !TARGET_BRANCH_PREDICTION_HINTS)
14575 return;
14576
14577 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14578 if (x)
14579 {
14580 int pred_val = INTVAL (XEXP (x, 0));
14581
14582 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14583 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14584 {
14585 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14586 bool cputaken
14587 = final_forward_branch_p (current_output_insn) == 0;
14588
14589 /* Emit hints only in the case where the default branch prediction
14590 heuristics would fail. */
14591 if (taken != cputaken)
14592 {
14593 /* We use 3e (DS) prefix for taken branches and
14594 2e (CS) prefix for not taken branches. */
14595 if (taken)
14596 fputs ("ds ; ", file);
14597 else
14598 fputs ("cs ; ", file);
14599 }
14600 }
14601 }
14602 return;
14603 }
14604
14605 case ';':
14606 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14607 putc (';', file);
14608 #endif
14609 return;
14610
14611 case '@':
14612 if (ASSEMBLER_DIALECT == ASM_ATT)
14613 putc ('%', file);
14614
14615 /* The kernel uses a different segment register for performance
14616 reasons; this way a system call does not have to trash the userspace
14617 segment register, which would be expensive. */
14618 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14619 fputs ("fs", file);
14620 else
14621 fputs ("gs", file);
14622 return;
14623
14624 case '~':
14625 putc (TARGET_AVX2 ? 'i' : 'f', file);
14626 return;
14627
14628 case '^':
14629 if (TARGET_64BIT && Pmode != word_mode)
14630 fputs ("addr32 ", file);
14631 return;
14632
14633 default:
14634 output_operand_lossage ("invalid operand code '%c'", code);
14635 }
14636 }
14637
14638 if (REG_P (x))
14639 print_reg (x, code, file);
14640
14641 else if (MEM_P (x))
14642 {
14643 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14644 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14645 && GET_MODE (x) != BLKmode)
14646 {
14647 const char * size;
14648 switch (GET_MODE_SIZE (GET_MODE (x)))
14649 {
14650 case 1: size = "BYTE"; break;
14651 case 2: size = "WORD"; break;
14652 case 4: size = "DWORD"; break;
14653 case 8: size = "QWORD"; break;
14654 case 12: size = "TBYTE"; break;
14655 case 16:
14656 if (GET_MODE (x) == XFmode)
14657 size = "TBYTE";
14658 else
14659 size = "XMMWORD";
14660 break;
14661 case 32: size = "YMMWORD"; break;
14662 default:
14663 gcc_unreachable ();
14664 }
14665
14666 /* Check for explicit size override (codes 'b', 'w', 'k',
14667 'q' and 'x').  */
14668 if (code == 'b')
14669 size = "BYTE";
14670 else if (code == 'w')
14671 size = "WORD";
14672 else if (code == 'k')
14673 size = "DWORD";
14674 else if (code == 'q')
14675 size = "QWORD";
14676 else if (code == 'x')
14677 size = "XMMWORD";
14678
14679 fputs (size, file);
14680 fputs (" PTR ", file);
14681 }
14682
14683 x = XEXP (x, 0);
14684 /* Avoid (%rip) for call operands. */
14685 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14686 && !CONST_INT_P (x))
14687 output_addr_const (file, x);
14688 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14689 output_operand_lossage ("invalid constraints for operand");
14690 else
14691 output_address (x);
14692 }
14693
14694 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14695 {
14696 REAL_VALUE_TYPE r;
14697 long l;
14698
14699 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14700 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14701
14702 if (ASSEMBLER_DIALECT == ASM_ATT)
14703 putc ('$', file);
14704 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14705 if (code == 'q')
14706 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14707 else
14708 fprintf (file, "0x%08x", (unsigned int) l);
14709 }
14710
14711 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14712 {
14713 REAL_VALUE_TYPE r;
14714 long l[2];
14715
14716 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14717 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14718
14719 if (ASSEMBLER_DIALECT == ASM_ATT)
14720 putc ('$', file);
14721 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14722 }
14723
14724 /* These float cases don't actually occur as immediate operands. */
14725 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14726 {
14727 char dstr[30];
14728
14729 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14730 fputs (dstr, file);
14731 }
14732
14733 else
14734 {
14735 /* We have patterns that allow zero sets of memory, for instance.
14736 In 64-bit mode, we should probably support all 8-byte vectors,
14737 since we can in fact encode that into an immediate. */
14738 if (GET_CODE (x) == CONST_VECTOR)
14739 {
14740 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14741 x = const0_rtx;
14742 }
14743
14744 if (code != 'P' && code != 'p')
14745 {
14746 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14747 {
14748 if (ASSEMBLER_DIALECT == ASM_ATT)
14749 putc ('$', file);
14750 }
14751 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14752 || GET_CODE (x) == LABEL_REF)
14753 {
14754 if (ASSEMBLER_DIALECT == ASM_ATT)
14755 putc ('$', file);
14756 else
14757 fputs ("OFFSET FLAT:", file);
14758 }
14759 }
14760 if (CONST_INT_P (x))
14761 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14762 else if (flag_pic || MACHOPIC_INDIRECT)
14763 output_pic_addr_const (file, x, code);
14764 else
14765 output_addr_const (file, x);
14766 }
14767 }
14768
14769 static bool
14770 ix86_print_operand_punct_valid_p (unsigned char code)
14771 {
14772 return (code == '@' || code == '*' || code == '+' || code == '&'
14773 || code == ';' || code == '~' || code == '^');
14774 }
14775 \f
14776 /* Print a memory operand whose address is ADDR. */
14777
14778 static void
14779 ix86_print_operand_address (FILE *file, rtx addr)
14780 {
14781 struct ix86_address parts;
14782 rtx base, index, disp;
14783 int scale;
14784 int ok;
14785 bool vsib = false;
14786 int code = 0;
14787
14788 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14789 {
14790 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14791 gcc_assert (parts.index == NULL_RTX);
14792 parts.index = XVECEXP (addr, 0, 1);
14793 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14794 addr = XVECEXP (addr, 0, 0);
14795 vsib = true;
14796 }
14797 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14798 {
14799 gcc_assert (TARGET_64BIT);
14800 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14801 code = 'q';
14802 }
14803 else
14804 ok = ix86_decompose_address (addr, &parts);
14805
14806 gcc_assert (ok);
14807
14808 if (parts.base && GET_CODE (parts.base) == SUBREG)
14809 {
14810 rtx tmp = SUBREG_REG (parts.base);
14811 parts.base = simplify_subreg (GET_MODE (parts.base),
14812 tmp, GET_MODE (tmp), 0);
14813 }
14814
14815 if (parts.index && GET_CODE (parts.index) == SUBREG)
14816 {
14817 rtx tmp = SUBREG_REG (parts.index);
14818 parts.index = simplify_subreg (GET_MODE (parts.index),
14819 tmp, GET_MODE (tmp), 0);
14820 }
14821
14822 base = parts.base;
14823 index = parts.index;
14824 disp = parts.disp;
14825 scale = parts.scale;
14826
14827 switch (parts.seg)
14828 {
14829 case SEG_DEFAULT:
14830 break;
14831 case SEG_FS:
14832 case SEG_GS:
14833 if (ASSEMBLER_DIALECT == ASM_ATT)
14834 putc ('%', file);
14835 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14836 break;
14837 default:
14838 gcc_unreachable ();
14839 }
14840
14841 /* Use the one-byte-shorter RIP-relative addressing for 64bit mode. */
14842 if (TARGET_64BIT && !base && !index)
14843 {
14844 rtx symbol = disp;
14845
14846 if (GET_CODE (disp) == CONST
14847 && GET_CODE (XEXP (disp, 0)) == PLUS
14848 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14849 symbol = XEXP (XEXP (disp, 0), 0);
14850
14851 if (GET_CODE (symbol) == LABEL_REF
14852 || (GET_CODE (symbol) == SYMBOL_REF
14853 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14854 base = pc_rtx;
14855 }
14856 if (!base && !index)
14857 {
14858 /* A displacement-only address requires special attention. */
14859
14860 if (CONST_INT_P (disp))
14861 {
14862 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14863 fputs ("ds:", file);
14864 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14865 }
14866 else if (flag_pic)
14867 output_pic_addr_const (file, disp, 0);
14868 else
14869 output_addr_const (file, disp);
14870 }
14871 else
14872 {
14873 /* Print SImode register names to force addr32 prefix. */
14874 if (GET_CODE (addr) == SUBREG)
14875 {
14876 gcc_assert (TARGET_64BIT);
14877 gcc_assert (GET_MODE (addr) == SImode);
14878 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14879 gcc_assert (!code);
14880 code = 'l';
14881 }
14882 else if (GET_CODE (addr) == ZERO_EXTEND
14883 || GET_CODE (addr) == AND)
14884 {
14885 gcc_assert (TARGET_64BIT);
14886 gcc_assert (GET_MODE (addr) == DImode);
14887 gcc_assert (!code);
14888 code = 'l';
14889 }
14890
14891 if (ASSEMBLER_DIALECT == ASM_ATT)
14892 {
14893 if (disp)
14894 {
14895 if (flag_pic)
14896 output_pic_addr_const (file, disp, 0);
14897 else if (GET_CODE (disp) == LABEL_REF)
14898 output_asm_label (disp);
14899 else
14900 output_addr_const (file, disp);
14901 }
14902
14903 putc ('(', file);
14904 if (base)
14905 print_reg (base, code, file);
14906 if (index)
14907 {
14908 putc (',', file);
14909 print_reg (index, vsib ? 0 : code, file);
14910 if (scale != 1 || vsib)
14911 fprintf (file, ",%d", scale);
14912 }
14913 putc (')', file);
14914 }
14915 else
14916 {
14917 rtx offset = NULL_RTX;
14918
14919 if (disp)
14920 {
14921 /* Pull out the offset of a symbol; print any symbol itself. */
14922 if (GET_CODE (disp) == CONST
14923 && GET_CODE (XEXP (disp, 0)) == PLUS
14924 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14925 {
14926 offset = XEXP (XEXP (disp, 0), 1);
14927 disp = gen_rtx_CONST (VOIDmode,
14928 XEXP (XEXP (disp, 0), 0));
14929 }
14930
14931 if (flag_pic)
14932 output_pic_addr_const (file, disp, 0);
14933 else if (GET_CODE (disp) == LABEL_REF)
14934 output_asm_label (disp);
14935 else if (CONST_INT_P (disp))
14936 offset = disp;
14937 else
14938 output_addr_const (file, disp);
14939 }
14940
14941 putc ('[', file);
14942 if (base)
14943 {
14944 print_reg (base, code, file);
14945 if (offset)
14946 {
14947 if (INTVAL (offset) >= 0)
14948 putc ('+', file);
14949 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14950 }
14951 }
14952 else if (offset)
14953 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14954 else
14955 putc ('0', file);
14956
14957 if (index)
14958 {
14959 putc ('+', file);
14960 print_reg (index, vsib ? 0 : code, file);
14961 if (scale != 1 || vsib)
14962 fprintf (file, "*%d", scale);
14963 }
14964 putc (']', file);
14965 }
14966 }
14967 }
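
/* For example (illustrative): an address with base %ebp, index %eax,
   scale 4 and displacement -4 is printed by the routine above as
   "-4(%ebp,%eax,4)" in AT&T syntax and as "[ebp-4+eax*4]" in Intel
   syntax.  */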
14968
14969 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14970
14971 static bool
14972 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14973 {
14974 rtx op;
14975
14976 if (GET_CODE (x) != UNSPEC)
14977 return false;
14978
14979 op = XVECEXP (x, 0, 0);
14980 switch (XINT (x, 1))
14981 {
14982 case UNSPEC_GOTTPOFF:
14983 output_addr_const (file, op);
14984 /* FIXME: This might be @TPOFF in Sun ld. */
14985 fputs ("@gottpoff", file);
14986 break;
14987 case UNSPEC_TPOFF:
14988 output_addr_const (file, op);
14989 fputs ("@tpoff", file);
14990 break;
14991 case UNSPEC_NTPOFF:
14992 output_addr_const (file, op);
14993 if (TARGET_64BIT)
14994 fputs ("@tpoff", file);
14995 else
14996 fputs ("@ntpoff", file);
14997 break;
14998 case UNSPEC_DTPOFF:
14999 output_addr_const (file, op);
15000 fputs ("@dtpoff", file);
15001 break;
15002 case UNSPEC_GOTNTPOFF:
15003 output_addr_const (file, op);
15004 if (TARGET_64BIT)
15005 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15006 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15007 else
15008 fputs ("@gotntpoff", file);
15009 break;
15010 case UNSPEC_INDNTPOFF:
15011 output_addr_const (file, op);
15012 fputs ("@indntpoff", file);
15013 break;
15014 #if TARGET_MACHO
15015 case UNSPEC_MACHOPIC_OFFSET:
15016 output_addr_const (file, op);
15017 putc ('-', file);
15018 machopic_output_function_base_name (file);
15019 break;
15020 #endif
15021
15022 case UNSPEC_STACK_CHECK:
15023 {
15024 int offset;
15025
15026 gcc_assert (flag_split_stack);
15027
15028 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15029 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15030 #else
15031 gcc_unreachable ();
15032 #endif
15033
15034 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15035 }
15036 break;
15037
15038 default:
15039 return false;
15040 }
15041
15042 return true;
15043 }
15044 \f
15045 /* Split one or more double-mode RTL references into pairs of half-mode
15046 references. The RTL can be REG, offsettable MEM, integer constant, or
15047 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15048 split and "num" is its length. lo_half and hi_half are output arrays
15049 that parallel "operands". */
15050
15051 void
15052 split_double_mode (enum machine_mode mode, rtx operands[],
15053 int num, rtx lo_half[], rtx hi_half[])
15054 {
15055 enum machine_mode half_mode;
15056 unsigned int byte;
15057
15058 switch (mode)
15059 {
15060 case TImode:
15061 half_mode = DImode;
15062 break;
15063 case DImode:
15064 half_mode = SImode;
15065 break;
15066 default:
15067 gcc_unreachable ();
15068 }
15069
15070 byte = GET_MODE_SIZE (half_mode);
15071
15072 while (num--)
15073 {
15074 rtx op = operands[num];
15075
15076 /* simplify_subreg refuses to split volatile memory addresses,
15077 but we still have to handle them. */
15078 if (MEM_P (op))
15079 {
15080 lo_half[num] = adjust_address (op, half_mode, 0);
15081 hi_half[num] = adjust_address (op, half_mode, byte);
15082 }
15083 else
15084 {
15085 lo_half[num] = simplify_gen_subreg (half_mode, op,
15086 GET_MODE (op) == VOIDmode
15087 ? mode : GET_MODE (op), 0);
15088 hi_half[num] = simplify_gen_subreg (half_mode, op,
15089 GET_MODE (op) == VOIDmode
15090 ? mode : GET_MODE (op), byte);
15091 }
15092 }
15093 }
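
/* Illustrative example: splitting a DImode register operand on a 32-bit
   target produces two SImode subregs at byte offsets 0 and 4, while an
   offsettable DImode MEM is split into two SImode memory references at
   the same offsets via adjust_address.  */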
15094 \f
15095 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15096 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15097 is the expression of the binary operation. The output may either be
15098 emitted here, or returned to the caller, like all output_* functions.
15099
15100 There is no guarantee that the operands are the same mode, as they
15101 might be within FLOAT or FLOAT_EXTEND expressions. */
15102
15103 #ifndef SYSV386_COMPAT
15104 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15105 wants to fix the assemblers because that causes incompatibility
15106 with gcc. No-one wants to fix gcc because that causes
15107 incompatibility with assemblers... You can use the option
15108 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15109 #define SYSV386_COMPAT 1
15110 #endif
15111
15112 const char *
15113 output_387_binary_op (rtx insn, rtx *operands)
15114 {
15115 static char buf[40];
15116 const char *p;
15117 const char *ssep;
15118 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15119
15120 #ifdef ENABLE_CHECKING
15121 /* Even if we do not want to check the inputs, this documents the input
15122 constraints, which helps in understanding the following code. */
15123 if (STACK_REG_P (operands[0])
15124 && ((REG_P (operands[1])
15125 && REGNO (operands[0]) == REGNO (operands[1])
15126 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15127 || (REG_P (operands[2])
15128 && REGNO (operands[0]) == REGNO (operands[2])
15129 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15130 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15131 ; /* ok */
15132 else
15133 gcc_assert (is_sse);
15134 #endif
15135
15136 switch (GET_CODE (operands[3]))
15137 {
15138 case PLUS:
15139 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15140 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15141 p = "fiadd";
15142 else
15143 p = "fadd";
15144 ssep = "vadd";
15145 break;
15146
15147 case MINUS:
15148 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15149 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15150 p = "fisub";
15151 else
15152 p = "fsub";
15153 ssep = "vsub";
15154 break;
15155
15156 case MULT:
15157 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15158 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15159 p = "fimul";
15160 else
15161 p = "fmul";
15162 ssep = "vmul";
15163 break;
15164
15165 case DIV:
15166 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15167 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15168 p = "fidiv";
15169 else
15170 p = "fdiv";
15171 ssep = "vdiv";
15172 break;
15173
15174 default:
15175 gcc_unreachable ();
15176 }
15177
15178 if (is_sse)
15179 {
15180 if (TARGET_AVX)
15181 {
15182 strcpy (buf, ssep);
15183 if (GET_MODE (operands[0]) == SFmode)
15184 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15185 else
15186 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15187 }
15188 else
15189 {
15190 strcpy (buf, ssep + 1);
15191 if (GET_MODE (operands[0]) == SFmode)
15192 strcat (buf, "ss\t{%2, %0|%0, %2}");
15193 else
15194 strcat (buf, "sd\t{%2, %0|%0, %2}");
15195 }
15196 return buf;
15197 }
15198 strcpy (buf, p);
15199
15200 switch (GET_CODE (operands[3]))
15201 {
15202 case MULT:
15203 case PLUS:
15204 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15205 {
15206 rtx temp = operands[2];
15207 operands[2] = operands[1];
15208 operands[1] = temp;
15209 }
15210
15211 /* We know operands[0] == operands[1]. */
15212
15213 if (MEM_P (operands[2]))
15214 {
15215 p = "%Z2\t%2";
15216 break;
15217 }
15218
15219 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15220 {
15221 if (STACK_TOP_P (operands[0]))
15222 /* How is it that we are storing to a dead operand[2]?
15223 Well, presumably operands[1] is dead too. We can't
15224 store the result to st(0) as st(0) gets popped on this
15225 instruction. Instead store to operands[2] (which I
15226 think has to be st(1)). st(1) will be popped later.
15227 gcc <= 2.8.1 didn't have this check and generated
15228 assembly code that the Unixware assembler rejected. */
15229 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15230 else
15231 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15232 break;
15233 }
15234
15235 if (STACK_TOP_P (operands[0]))
15236 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15237 else
15238 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15239 break;
15240
15241 case MINUS:
15242 case DIV:
15243 if (MEM_P (operands[1]))
15244 {
15245 p = "r%Z1\t%1";
15246 break;
15247 }
15248
15249 if (MEM_P (operands[2]))
15250 {
15251 p = "%Z2\t%2";
15252 break;
15253 }
15254
15255 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15256 {
15257 #if SYSV386_COMPAT
15258 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15259 derived assemblers, confusingly reverse the direction of
15260 the operation for fsub{r} and fdiv{r} when the
15261 destination register is not st(0). The Intel assembler
15262 doesn't have this brain damage. Read !SYSV386_COMPAT to
15263 figure out what the hardware really does. */
15264 if (STACK_TOP_P (operands[0]))
15265 p = "{p\t%0, %2|rp\t%2, %0}";
15266 else
15267 p = "{rp\t%2, %0|p\t%0, %2}";
15268 #else
15269 if (STACK_TOP_P (operands[0]))
15270 /* As above for fmul/fadd, we can't store to st(0). */
15271 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15272 else
15273 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15274 #endif
15275 break;
15276 }
15277
15278 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15279 {
15280 #if SYSV386_COMPAT
15281 if (STACK_TOP_P (operands[0]))
15282 p = "{rp\t%0, %1|p\t%1, %0}";
15283 else
15284 p = "{p\t%1, %0|rp\t%0, %1}";
15285 #else
15286 if (STACK_TOP_P (operands[0]))
15287 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15288 else
15289 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15290 #endif
15291 break;
15292 }
15293
15294 if (STACK_TOP_P (operands[0]))
15295 {
15296 if (STACK_TOP_P (operands[1]))
15297 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15298 else
15299 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15300 break;
15301 }
15302 else if (STACK_TOP_P (operands[1]))
15303 {
15304 #if SYSV386_COMPAT
15305 p = "{\t%1, %0|r\t%0, %1}";
15306 #else
15307 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15308 #endif
15309 }
15310 else
15311 {
15312 #if SYSV386_COMPAT
15313 p = "{r\t%2, %0|\t%0, %2}";
15314 #else
15315 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15316 #endif
15317 }
15318 break;
15319
15320 default:
15321 gcc_unreachable ();
15322 }
15323
15324 strcat (buf, p);
15325 return buf;
15326 }
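
/* A few examples of the templates returned above (illustrative only):
   an AVX DFmode add yields "vaddsd\t{%2, %1, %0|%0, %1, %2}", a non-AVX
   SFmode add yields "addss\t{%2, %0|%0, %2}", and an x87 add with a
   memory second operand yields "fadd%Z2\t%2".  */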
15327
15328 /* Return needed mode for entity in optimize_mode_switching pass. */
15329
15330 int
15331 ix86_mode_needed (int entity, rtx insn)
15332 {
15333 enum attr_i387_cw mode;
15334
15335 /* The mode UNINITIALIZED is used to store the control word after a
15336 function call or ASM pattern.  The mode ANY specifies that the function
15337 has no requirements on the control word and makes no changes to the
15338 bits we are interested in. */
15339
15340 if (CALL_P (insn)
15341 || (NONJUMP_INSN_P (insn)
15342 && (asm_noperands (PATTERN (insn)) >= 0
15343 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15344 return I387_CW_UNINITIALIZED;
15345
15346 if (recog_memoized (insn) < 0)
15347 return I387_CW_ANY;
15348
15349 mode = get_attr_i387_cw (insn);
15350
15351 switch (entity)
15352 {
15353 case I387_TRUNC:
15354 if (mode == I387_CW_TRUNC)
15355 return mode;
15356 break;
15357
15358 case I387_FLOOR:
15359 if (mode == I387_CW_FLOOR)
15360 return mode;
15361 break;
15362
15363 case I387_CEIL:
15364 if (mode == I387_CW_CEIL)
15365 return mode;
15366 break;
15367
15368 case I387_MASK_PM:
15369 if (mode == I387_CW_MASK_PM)
15370 return mode;
15371 break;
15372
15373 default:
15374 gcc_unreachable ();
15375 }
15376
15377 return I387_CW_ANY;
15378 }
15379
15380 /* Output code to initialize the control word copies used by the trunc?f?i
15381 and rounding patterns.  The current control word is saved to a stack
15382 slot, and a copy modified according to MODE is stored in a second slot. */
15383
15384 void
15385 emit_i387_cw_initialization (int mode)
15386 {
15387 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15388 rtx new_mode;
15389
15390 enum ix86_stack_slot slot;
15391
15392 rtx reg = gen_reg_rtx (HImode);
15393
15394 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15395 emit_move_insn (reg, copy_rtx (stored_mode));
15396
15397 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15398 || optimize_function_for_size_p (cfun))
15399 {
15400 switch (mode)
15401 {
15402 case I387_CW_TRUNC:
15403 /* round toward zero (truncate) */
15404 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15405 slot = SLOT_CW_TRUNC;
15406 break;
15407
15408 case I387_CW_FLOOR:
15409 /* round down toward -oo */
15410 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15411 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15412 slot = SLOT_CW_FLOOR;
15413 break;
15414
15415 case I387_CW_CEIL:
15416 /* round up toward +oo */
15417 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15418 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15419 slot = SLOT_CW_CEIL;
15420 break;
15421
15422 case I387_CW_MASK_PM:
15423 /* mask precision exception for nearbyint() */
15424 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15425 slot = SLOT_CW_MASK_PM;
15426 break;
15427
15428 default:
15429 gcc_unreachable ();
15430 }
15431 }
15432 else
15433 {
15434 switch (mode)
15435 {
15436 case I387_CW_TRUNC:
15437 /* round toward zero (truncate) */
15438 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15439 slot = SLOT_CW_TRUNC;
15440 break;
15441
15442 case I387_CW_FLOOR:
15443 /* round down toward -oo */
15444 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15445 slot = SLOT_CW_FLOOR;
15446 break;
15447
15448 case I387_CW_CEIL:
15449 /* round up toward +oo */
15450 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15451 slot = SLOT_CW_CEIL;
15452 break;
15453
15454 case I387_CW_MASK_PM:
15455 /* mask precision exception for nearbyint() */
15456 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15457 slot = SLOT_CW_MASK_PM;
15458 break;
15459
15460 default:
15461 gcc_unreachable ();
15462 }
15463 }
15464
15465 gcc_assert (slot < MAX_386_STACK_LOCALS);
15466
15467 new_mode = assign_386_stack_local (HImode, slot);
15468 emit_move_insn (new_mode, reg);
15469 }
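
/* For reference, the i387 control word bits manipulated above: bits 10-11
   form the rounding-control field (0x0000 nearest, 0x0400 down, 0x0800 up,
   0x0c00 truncate toward zero), and bit 5 (0x0020) masks the precision
   exception, as used for nearbyint().  */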
15470
15471 /* Output code for INSN to convert a float to a signed int. OPERANDS
15472 are the insn operands. The output may be [HSD]Imode and the input
15473 operand may be [SDX]Fmode. */
15474
15475 const char *
15476 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15477 {
15478 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15479 int dimode_p = GET_MODE (operands[0]) == DImode;
15480 int round_mode = get_attr_i387_cw (insn);
15481
15482 /* Jump through a hoop or two for DImode, since the hardware has no
15483 non-popping instruction. We used to do this a different way, but
15484 that was somewhat fragile and broke with post-reload splitters. */
15485 if ((dimode_p || fisttp) && !stack_top_dies)
15486 output_asm_insn ("fld\t%y1", operands);
15487
15488 gcc_assert (STACK_TOP_P (operands[1]));
15489 gcc_assert (MEM_P (operands[0]));
15490 gcc_assert (GET_MODE (operands[1]) != TFmode);
15491
15492 if (fisttp)
15493 output_asm_insn ("fisttp%Z0\t%0", operands);
15494 else
15495 {
15496 if (round_mode != I387_CW_ANY)
15497 output_asm_insn ("fldcw\t%3", operands);
15498 if (stack_top_dies || dimode_p)
15499 output_asm_insn ("fistp%Z0\t%0", operands);
15500 else
15501 output_asm_insn ("fist%Z0\t%0", operands);
15502 if (round_mode != I387_CW_ANY)
15503 output_asm_insn ("fldcw\t%2", operands);
15504 }
15505
15506 return "";
15507 }
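
/* Roughly (an illustrative sketch of the cases above): for a SImode
   destination with a non-default rounding mode and a dying stack top this
   emits "fldcw %3", "fistpl %0", "fldcw %2"; when the fisttp instruction
   (available with SSE3) is used, the whole conversion is just "fisttpl %0".  */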
15508
15509 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15510 have the values zero or one, indicates the ffreep insn's operand
15511 from the OPERANDS array. */
15512
15513 static const char *
15514 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15515 {
15516 if (TARGET_USE_FFREEP)
15517 #ifdef HAVE_AS_IX86_FFREEP
15518 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15519 #else
15520 {
15521 static char retval[32];
15522 int regno = REGNO (operands[opno]);
15523
15524 gcc_assert (FP_REGNO_P (regno));
15525
15526 regno -= FIRST_STACK_REG;
15527
15528 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15529 return retval;
15530 }
15531 #endif
15532
15533 return opno ? "fstp\t%y1" : "fstp\t%y0";
15534 }
15535
15536
15537 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15538 should be used. UNORDERED_P is true when fucom should be used. */
15539
15540 const char *
15541 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15542 {
15543 int stack_top_dies;
15544 rtx cmp_op0, cmp_op1;
15545 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15546
15547 if (eflags_p)
15548 {
15549 cmp_op0 = operands[0];
15550 cmp_op1 = operands[1];
15551 }
15552 else
15553 {
15554 cmp_op0 = operands[1];
15555 cmp_op1 = operands[2];
15556 }
15557
15558 if (is_sse)
15559 {
15560 if (GET_MODE (operands[0]) == SFmode)
15561 if (unordered_p)
15562 return "%vucomiss\t{%1, %0|%0, %1}";
15563 else
15564 return "%vcomiss\t{%1, %0|%0, %1}";
15565 else
15566 if (unordered_p)
15567 return "%vucomisd\t{%1, %0|%0, %1}";
15568 else
15569 return "%vcomisd\t{%1, %0|%0, %1}";
15570 }
15571
15572 gcc_assert (STACK_TOP_P (cmp_op0));
15573
15574 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15575
15576 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15577 {
15578 if (stack_top_dies)
15579 {
15580 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15581 return output_387_ffreep (operands, 1);
15582 }
15583 else
15584 return "ftst\n\tfnstsw\t%0";
15585 }
15586
15587 if (STACK_REG_P (cmp_op1)
15588 && stack_top_dies
15589 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15590 && REGNO (cmp_op1) != FIRST_STACK_REG)
15591 {
15592 /* If the top of the 387 stack dies, and the other operand
15593 is also a stack register that dies, then this must be an
15594 `fcompp' float compare. */
15595
15596 if (eflags_p)
15597 {
15598 /* There is no double popping fcomi variant. Fortunately,
15599 eflags is immune from the fstp's cc clobbering. */
15600 if (unordered_p)
15601 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15602 else
15603 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15604 return output_387_ffreep (operands, 0);
15605 }
15606 else
15607 {
15608 if (unordered_p)
15609 return "fucompp\n\tfnstsw\t%0";
15610 else
15611 return "fcompp\n\tfnstsw\t%0";
15612 }
15613 }
15614 else
15615 {
15616 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15617
15618 static const char * const alt[16] =
15619 {
15620 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15621 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15622 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15623 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15624
15625 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15626 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15627 NULL,
15628 NULL,
15629
15630 "fcomi\t{%y1, %0|%0, %y1}",
15631 "fcomip\t{%y1, %0|%0, %y1}",
15632 "fucomi\t{%y1, %0|%0, %y1}",
15633 "fucomip\t{%y1, %0|%0, %y1}",
15634
15635 NULL,
15636 NULL,
15637 NULL,
15638 NULL
15639 };
15640
15641 int mask;
15642 const char *ret;
15643
15644 mask = eflags_p << 3;
15645 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15646 mask |= unordered_p << 1;
15647 mask |= stack_top_dies;
15648
15649 gcc_assert (mask < 16);
15650 ret = alt[mask];
15651 gcc_assert (ret);
15652
15653 return ret;
15654 }
15655 }
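
/* Example of the mask encoding used above: eflags_p = 1, an FP second
   operand, unordered_p = 1 and a surviving stack top give mask 10, which
   selects "fucomi\t{%y1, %0|%0, %y1}" from the alt[] table (illustrative).  */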
15656
15657 void
15658 ix86_output_addr_vec_elt (FILE *file, int value)
15659 {
15660 const char *directive = ASM_LONG;
15661
15662 #ifdef ASM_QUAD
15663 if (TARGET_LP64)
15664 directive = ASM_QUAD;
15665 #else
15666 gcc_assert (!TARGET_64BIT);
15667 #endif
15668
15669 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15670 }
15671
15672 void
15673 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15674 {
15675 const char *directive = ASM_LONG;
15676
15677 #ifdef ASM_QUAD
15678 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15679 directive = ASM_QUAD;
15680 #else
15681 gcc_assert (!TARGET_64BIT);
15682 #endif
15683 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15684 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15685 fprintf (file, "%s%s%d-%s%d\n",
15686 directive, LPREFIX, value, LPREFIX, rel);
15687 else if (HAVE_AS_GOTOFF_IN_DATA)
15688 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15689 #if TARGET_MACHO
15690 else if (TARGET_MACHO)
15691 {
15692 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15693 machopic_output_function_base_name (file);
15694 putc ('\n', file);
15695 }
15696 #endif
15697 else
15698 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15699 GOT_SYMBOL_NAME, LPREFIX, value);
15700 }
15701 \f
15702 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15703 for the target. */
15704
15705 void
15706 ix86_expand_clear (rtx dest)
15707 {
15708 rtx tmp;
15709
15710 /* We play register width games, which are only valid after reload. */
15711 gcc_assert (reload_completed);
15712
15713 /* Avoid HImode and its attendant prefix byte. */
15714 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15715 dest = gen_rtx_REG (SImode, REGNO (dest));
15716 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15717
15718 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15719 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15720 {
15721 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15722 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15723 }
15724
15725 emit_insn (tmp);
15726 }
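
/* Illustratively, clearing %eax normally produces the flags-clobbering
   "xor %eax, %eax" form; only when TARGET_USE_MOV0 is set and the insn is
   not optimized for speed does the plain "mov $0, %eax" form survive.  */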
15727
15728 /* X is an unchanging MEM. If it is a constant pool reference, return
15729 the constant pool rtx, else NULL. */
15730
15731 rtx
15732 maybe_get_pool_constant (rtx x)
15733 {
15734 x = ix86_delegitimize_address (XEXP (x, 0));
15735
15736 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15737 return get_pool_constant (x);
15738
15739 return NULL_RTX;
15740 }
15741
15742 void
15743 ix86_expand_move (enum machine_mode mode, rtx operands[])
15744 {
15745 rtx op0, op1;
15746 enum tls_model model;
15747
15748 op0 = operands[0];
15749 op1 = operands[1];
15750
15751 if (GET_CODE (op1) == SYMBOL_REF)
15752 {
15753 model = SYMBOL_REF_TLS_MODEL (op1);
15754 if (model)
15755 {
15756 op1 = legitimize_tls_address (op1, model, true);
15757 op1 = force_operand (op1, op0);
15758 if (op1 == op0)
15759 return;
15760 if (GET_MODE (op1) != mode)
15761 op1 = convert_to_mode (mode, op1, 1);
15762 }
15763 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15764 && SYMBOL_REF_DLLIMPORT_P (op1))
15765 op1 = legitimize_dllimport_symbol (op1, false);
15766 }
15767 else if (GET_CODE (op1) == CONST
15768 && GET_CODE (XEXP (op1, 0)) == PLUS
15769 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15770 {
15771 rtx addend = XEXP (XEXP (op1, 0), 1);
15772 rtx symbol = XEXP (XEXP (op1, 0), 0);
15773 rtx tmp = NULL;
15774
15775 model = SYMBOL_REF_TLS_MODEL (symbol);
15776 if (model)
15777 tmp = legitimize_tls_address (symbol, model, true);
15778 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15779 && SYMBOL_REF_DLLIMPORT_P (symbol))
15780 tmp = legitimize_dllimport_symbol (symbol, true);
15781
15782 if (tmp)
15783 {
15784 tmp = force_operand (tmp, NULL);
15785 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15786 op0, 1, OPTAB_DIRECT);
15787 if (tmp == op0)
15788 return;
15789 if (GET_MODE (tmp) != mode)
15790 op1 = convert_to_mode (mode, tmp, 1);
15791 }
15792 }
15793
15794 if ((flag_pic || MACHOPIC_INDIRECT)
15795 && symbolic_operand (op1, mode))
15796 {
15797 if (TARGET_MACHO && !TARGET_64BIT)
15798 {
15799 #if TARGET_MACHO
15800 /* dynamic-no-pic */
15801 if (MACHOPIC_INDIRECT)
15802 {
15803 rtx temp = ((reload_in_progress
15804 || ((op0 && REG_P (op0))
15805 && mode == Pmode))
15806 ? op0 : gen_reg_rtx (Pmode));
15807 op1 = machopic_indirect_data_reference (op1, temp);
15808 if (MACHOPIC_PURE)
15809 op1 = machopic_legitimize_pic_address (op1, mode,
15810 temp == op1 ? 0 : temp);
15811 }
15812 if (op0 != op1 && GET_CODE (op0) != MEM)
15813 {
15814 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15815 emit_insn (insn);
15816 return;
15817 }
15818 if (GET_CODE (op0) == MEM)
15819 op1 = force_reg (Pmode, op1);
15820 else
15821 {
15822 rtx temp = op0;
15823 if (GET_CODE (temp) != REG)
15824 temp = gen_reg_rtx (Pmode);
15825 temp = legitimize_pic_address (op1, temp);
15826 if (temp == op0)
15827 return;
15828 op1 = temp;
15829 }
15830 /* dynamic-no-pic */
15831 #endif
15832 }
15833 else
15834 {
15835 if (MEM_P (op0))
15836 op1 = force_reg (mode, op1);
15837 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15838 {
15839 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15840 op1 = legitimize_pic_address (op1, reg);
15841 if (op0 == op1)
15842 return;
15843 if (GET_MODE (op1) != mode)
15844 op1 = convert_to_mode (mode, op1, 1);
15845 }
15846 }
15847 }
15848 else
15849 {
15850 if (MEM_P (op0)
15851 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15852 || !push_operand (op0, mode))
15853 && MEM_P (op1))
15854 op1 = force_reg (mode, op1);
15855
15856 if (push_operand (op0, mode)
15857 && ! general_no_elim_operand (op1, mode))
15858 op1 = copy_to_mode_reg (mode, op1);
15859
15860 /* Force large constants in 64bit compilation into a register
15861 to get them CSEd. */
15862 if (can_create_pseudo_p ()
15863 && (mode == DImode) && TARGET_64BIT
15864 && immediate_operand (op1, mode)
15865 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15866 && !register_operand (op0, mode)
15867 && optimize)
15868 op1 = copy_to_mode_reg (mode, op1);
15869
15870 if (can_create_pseudo_p ()
15871 && FLOAT_MODE_P (mode)
15872 && GET_CODE (op1) == CONST_DOUBLE)
15873 {
15874 /* If we are loading a floating point constant to a register,
15875 force the value to memory now, since we'll get better code
15876 out of the back end. */
15877
15878 op1 = validize_mem (force_const_mem (mode, op1));
15879 if (!register_operand (op0, mode))
15880 {
15881 rtx temp = gen_reg_rtx (mode);
15882 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15883 emit_move_insn (op0, temp);
15884 return;
15885 }
15886 }
15887 }
15888
15889 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15890 }
15891
15892 void
15893 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15894 {
15895 rtx op0 = operands[0], op1 = operands[1];
15896 unsigned int align = GET_MODE_ALIGNMENT (mode);
15897
15898 /* Force constants other than zero into memory. We do not know how
15899 the instructions used to build constants modify the upper 64 bits
15900 of the register; once we have that information we may be able
15901 to handle some of them more efficiently. */
15902 if (can_create_pseudo_p ()
15903 && register_operand (op0, mode)
15904 && (CONSTANT_P (op1)
15905 || (GET_CODE (op1) == SUBREG
15906 && CONSTANT_P (SUBREG_REG (op1))))
15907 && !standard_sse_constant_p (op1))
15908 op1 = validize_mem (force_const_mem (mode, op1));
15909
15910 /* We need to check memory alignment for SSE mode since attributes
15911 can make operands unaligned. */
15912 if (can_create_pseudo_p ()
15913 && SSE_REG_MODE_P (mode)
15914 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15915 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15916 {
15917 rtx tmp[2];
15918
15919 /* ix86_expand_vector_move_misalign() does not like constants ... */
15920 if (CONSTANT_P (op1)
15921 || (GET_CODE (op1) == SUBREG
15922 && CONSTANT_P (SUBREG_REG (op1))))
15923 op1 = validize_mem (force_const_mem (mode, op1));
15924
15925 /* ... nor both arguments in memory. */
15926 if (!register_operand (op0, mode)
15927 && !register_operand (op1, mode))
15928 op1 = force_reg (mode, op1);
15929
15930 tmp[0] = op0; tmp[1] = op1;
15931 ix86_expand_vector_move_misalign (mode, tmp);
15932 return;
15933 }
15934
15935 /* Make operand1 a register if it isn't already. */
15936 if (can_create_pseudo_p ()
15937 && !register_operand (op0, mode)
15938 && !register_operand (op1, mode))
15939 {
15940 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15941 return;
15942 }
15943
15944 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15945 }
15946
15947 /* Split 32-byte AVX unaligned load and store if needed. */
15948
15949 static void
15950 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15951 {
15952 rtx m;
15953 rtx (*extract) (rtx, rtx, rtx);
15954 rtx (*move_unaligned) (rtx, rtx);
15955 enum machine_mode mode;
15956
15957 switch (GET_MODE (op0))
15958 {
15959 default:
15960 gcc_unreachable ();
15961 case V32QImode:
15962 extract = gen_avx_vextractf128v32qi;
15963 move_unaligned = gen_avx_movdqu256;
15964 mode = V16QImode;
15965 break;
15966 case V8SFmode:
15967 extract = gen_avx_vextractf128v8sf;
15968 move_unaligned = gen_avx_movups256;
15969 mode = V4SFmode;
15970 break;
15971 case V4DFmode:
15972 extract = gen_avx_vextractf128v4df;
15973 move_unaligned = gen_avx_movupd256;
15974 mode = V2DFmode;
15975 break;
15976 }
15977
15978 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15979 {
15980 rtx r = gen_reg_rtx (mode);
15981 m = adjust_address (op1, mode, 0);
15982 emit_move_insn (r, m);
15983 m = adjust_address (op1, mode, 16);
15984 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15985 emit_move_insn (op0, r);
15986 }
15987 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15988 {
15989 m = adjust_address (op0, mode, 0);
15990 emit_insn (extract (m, op1, const0_rtx));
15991 m = adjust_address (op0, mode, 16);
15992 emit_insn (extract (m, op1, const1_rtx));
15993 }
15994 else
15995 emit_insn (move_unaligned (op0, op1));
15996 }
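
/* Illustrative sketch: with TARGET_AVX256_SPLIT_UNALIGNED_LOAD an unaligned
   V8SF load is done as a 16-byte load of the low half concatenated with the
   high half via VEC_CONCAT; with TARGET_AVX256_SPLIT_UNALIGNED_STORE a store
   becomes two vextractf128 stores; otherwise a single 256-bit unaligned move
   (e.g. vmovups) is emitted.  */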
15997
15998 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15999 straight to ix86_expand_vector_move. */
16000 /* Code generation for scalar reg-reg moves of single and double precision data:
16001 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16002 movaps reg, reg
16003 else
16004 movss reg, reg
16005 if (x86_sse_partial_reg_dependency == true)
16006 movapd reg, reg
16007 else
16008 movsd reg, reg
16009
16010 Code generation for scalar loads of double precision data:
16011 if (x86_sse_split_regs == true)
16012 movlpd mem, reg (gas syntax)
16013 else
16014 movsd mem, reg
16015
16016 Code generation for unaligned packed loads of single precision data
16017 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16018 if (x86_sse_unaligned_move_optimal)
16019 movups mem, reg
16020
16021 if (x86_sse_partial_reg_dependency == true)
16022 {
16023 xorps reg, reg
16024 movlps mem, reg
16025 movhps mem+8, reg
16026 }
16027 else
16028 {
16029 movlps mem, reg
16030 movhps mem+8, reg
16031 }
16032
16033 Code generation for unaligned packed loads of double precision data
16034 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16035 if (x86_sse_unaligned_move_optimal)
16036 movupd mem, reg
16037
16038 if (x86_sse_split_regs == true)
16039 {
16040 movlpd mem, reg
16041 movhpd mem+8, reg
16042 }
16043 else
16044 {
16045 movsd mem, reg
16046 movhpd mem+8, reg
16047 }
16048 */
16049
16050 void
16051 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16052 {
16053 rtx op0, op1, m;
16054
16055 op0 = operands[0];
16056 op1 = operands[1];
16057
16058 if (TARGET_AVX
16059 && GET_MODE_SIZE (mode) == 32)
16060 {
16061 switch (GET_MODE_CLASS (mode))
16062 {
16063 case MODE_VECTOR_INT:
16064 case MODE_INT:
16065 op0 = gen_lowpart (V32QImode, op0);
16066 op1 = gen_lowpart (V32QImode, op1);
16067 /* FALLTHRU */
16068
16069 case MODE_VECTOR_FLOAT:
16070 ix86_avx256_split_vector_move_misalign (op0, op1);
16071 break;
16072
16073 default:
16074 gcc_unreachable ();
16075 }
16076
16077 return;
16078 }
16079
16080 if (MEM_P (op1))
16081 {
16082 /* ??? If we have typed data, then it would appear that using
16083 movdqu is the only way to get unaligned data loaded with
16084 integer type. */
16085 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16086 {
16087 op0 = gen_lowpart (V16QImode, op0);
16088 op1 = gen_lowpart (V16QImode, op1);
16089 /* We will eventually emit movups based on insn attributes. */
16090 emit_insn (gen_sse2_movdqu (op0, op1));
16091 }
16092 else if (TARGET_SSE2 && mode == V2DFmode)
16093 {
16094 rtx zero;
16095
16096 if (TARGET_AVX
16097 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16098 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16099 || optimize_function_for_size_p (cfun))
16100 {
16101 /* We will eventually emit movups based on insn attributes. */
16102 emit_insn (gen_sse2_movupd (op0, op1));
16103 return;
16104 }
16105
16106 /* When SSE registers are split into halves, we can avoid
16107 writing to the top half twice. */
16108 if (TARGET_SSE_SPLIT_REGS)
16109 {
16110 emit_clobber (op0);
16111 zero = op0;
16112 }
16113 else
16114 {
16115 /* ??? Not sure about the best option for the Intel chips.
16116 The following would seem to satisfy; the register is
16117 entirely cleared, breaking the dependency chain. We
16118 then store to the upper half, with a dependency depth
16119 of one. A rumor has it that Intel recommends two movsd
16120 followed by an unpacklpd, but this is unconfirmed. And
16121 given that the dependency depth of the unpacklpd would
16122 still be one, I'm not sure why this would be better. */
16123 zero = CONST0_RTX (V2DFmode);
16124 }
16125
16126 m = adjust_address (op1, DFmode, 0);
16127 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16128 m = adjust_address (op1, DFmode, 8);
16129 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16130 }
16131 else
16132 {
16133 if (TARGET_AVX
16134 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16135 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16136 || optimize_function_for_size_p (cfun))
16137 {
16138 op0 = gen_lowpart (V4SFmode, op0);
16139 op1 = gen_lowpart (V4SFmode, op1);
16140 emit_insn (gen_sse_movups (op0, op1));
16141 return;
16142 }
16143
16144 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16145 emit_move_insn (op0, CONST0_RTX (mode));
16146 else
16147 emit_clobber (op0);
16148
16149 if (mode != V4SFmode)
16150 op0 = gen_lowpart (V4SFmode, op0);
16151
16152 m = adjust_address (op1, V2SFmode, 0);
16153 emit_insn (gen_sse_loadlps (op0, op0, m));
16154 m = adjust_address (op1, V2SFmode, 8);
16155 emit_insn (gen_sse_loadhps (op0, op0, m));
16156 }
16157 }
16158 else if (MEM_P (op0))
16159 {
16160 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16161 {
16162 op0 = gen_lowpart (V16QImode, op0);
16163 op1 = gen_lowpart (V16QImode, op1);
16164 /* We will eventually emit movups based on insn attributes. */
16165 emit_insn (gen_sse2_movdqu (op0, op1));
16166 }
16167 else if (TARGET_SSE2 && mode == V2DFmode)
16168 {
16169 if (TARGET_AVX
16170 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16171 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16172 || optimize_function_for_size_p (cfun))
16173 /* We will eventually emit movups based on insn attributes. */
16174 emit_insn (gen_sse2_movupd (op0, op1));
16175 else
16176 {
16177 m = adjust_address (op0, DFmode, 0);
16178 emit_insn (gen_sse2_storelpd (m, op1));
16179 m = adjust_address (op0, DFmode, 8);
16180 emit_insn (gen_sse2_storehpd (m, op1));
16181 }
16182 }
16183 else
16184 {
16185 if (mode != V4SFmode)
16186 op1 = gen_lowpart (V4SFmode, op1);
16187
16188 if (TARGET_AVX
16189 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16190 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16191 || optimize_function_for_size_p (cfun))
16192 {
16193 op0 = gen_lowpart (V4SFmode, op0);
16194 emit_insn (gen_sse_movups (op0, op1));
16195 }
16196 else
16197 {
16198 m = adjust_address (op0, V2SFmode, 0);
16199 emit_insn (gen_sse_storelps (m, op1));
16200 m = adjust_address (op0, V2SFmode, 8);
16201 emit_insn (gen_sse_storehps (m, op1));
16202 }
16203 }
16204 }
16205 else
16206 gcc_unreachable ();
16207 }
16208
16209 /* Expand a push in MODE. This is some mode for which we do not support
16210 proper push instructions, at least from the registers that we expect
16211 the value to live in. */
16212
16213 void
16214 ix86_expand_push (enum machine_mode mode, rtx x)
16215 {
16216 rtx tmp;
16217
16218 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16219 GEN_INT (-GET_MODE_SIZE (mode)),
16220 stack_pointer_rtx, 1, OPTAB_DIRECT);
16221 if (tmp != stack_pointer_rtx)
16222 emit_move_insn (stack_pointer_rtx, tmp);
16223
16224 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16225
16226 /* When we push an operand onto stack, it has to be aligned at least
16227 at the function argument boundary. However since we don't have
16228 the argument type, we can't determine the actual argument
16229 boundary. */
16230 emit_move_insn (tmp, x);
16231 }
16232
16233 /* Helper function of ix86_fixup_binary_operands to canonicalize
16234 operand order. Returns true if the operands should be swapped. */
16235
16236 static bool
16237 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16238 rtx operands[])
16239 {
16240 rtx dst = operands[0];
16241 rtx src1 = operands[1];
16242 rtx src2 = operands[2];
16243
16244 /* If the operation is not commutative, we can't do anything. */
16245 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16246 return false;
16247
16248 /* Highest priority is that src1 should match dst. */
16249 if (rtx_equal_p (dst, src1))
16250 return false;
16251 if (rtx_equal_p (dst, src2))
16252 return true;
16253
16254 /* Next highest priority is that immediate constants come second. */
16255 if (immediate_operand (src2, mode))
16256 return false;
16257 if (immediate_operand (src1, mode))
16258 return true;
16259
16260 /* Lowest priority is that memory references should come second. */
16261 if (MEM_P (src2))
16262 return false;
16263 if (MEM_P (src1))
16264 return true;
16265
16266 return false;
16267 }
16268
16269
16270 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16271 destination to use for the operation. If different from the true
16272 destination in operands[0], a copy operation will be required. */
16273
16274 rtx
16275 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16276 rtx operands[])
16277 {
16278 rtx dst = operands[0];
16279 rtx src1 = operands[1];
16280 rtx src2 = operands[2];
16281
16282 /* Canonicalize operand order. */
16283 if (ix86_swap_binary_operands_p (code, mode, operands))
16284 {
16285 rtx temp;
16286
16287 /* It is invalid to swap operands of different modes. */
16288 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16289
16290 temp = src1;
16291 src1 = src2;
16292 src2 = temp;
16293 }
16294
16295 /* Both source operands cannot be in memory. */
16296 if (MEM_P (src1) && MEM_P (src2))
16297 {
16298 /* Optimization: Only read from memory once. */
16299 if (rtx_equal_p (src1, src2))
16300 {
16301 src2 = force_reg (mode, src2);
16302 src1 = src2;
16303 }
16304 else
16305 src2 = force_reg (mode, src2);
16306 }
16307
16308 /* If the destination is memory, and we do not have matching source
16309 operands, do things in registers. */
16310 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16311 dst = gen_reg_rtx (mode);
16312
16313 /* Source 1 cannot be a constant. */
16314 if (CONSTANT_P (src1))
16315 src1 = force_reg (mode, src1);
16316
16317 /* Source 1 cannot be a non-matching memory. */
16318 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16319 src1 = force_reg (mode, src1);
16320
16321 /* Improve address combine. */
16322 if (code == PLUS
16323 && GET_MODE_CLASS (mode) == MODE_INT
16324 && MEM_P (src2))
16325 src2 = force_reg (mode, src2);
16326
16327 operands[1] = src1;
16328 operands[2] = src2;
16329 return dst;
16330 }
16331
16332 /* Similarly, but assume that the destination has already been
16333 set up properly. */
16334
16335 void
16336 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16337 enum machine_mode mode, rtx operands[])
16338 {
16339 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16340 gcc_assert (dst == operands[0]);
16341 }
16342
16343 /* Attempt to expand a binary operator.  Make the expansion closer to the
16344 actual machine than just general_operand, which would allow 3 separate
16345 memory references (one output, two input) in a single insn.  */
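/* A sketch of the RTL this normally produces (derived from the code
   below, shown for orientation rather than as a guaranteed shape):

     (parallel [(set dst (code:MODE src1 src2))
                (clobber (reg:CC FLAGS_REG))])

   The reload_in_progress and post-reload PLUS cases instead emit the
   bare SET so that it can become an lea without a flags clobber.  */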
16346
16347 void
16348 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16349 rtx operands[])
16350 {
16351 rtx src1, src2, dst, op, clob;
16352
16353 dst = ix86_fixup_binary_operands (code, mode, operands);
16354 src1 = operands[1];
16355 src2 = operands[2];
16356
16357 /* Emit the instruction. */
16358
16359 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16360 if (reload_in_progress)
16361 {
16362 /* Reload doesn't know about the flags register, and doesn't know that
16363 it doesn't want to clobber it. We can only do this with PLUS. */
16364 gcc_assert (code == PLUS);
16365 emit_insn (op);
16366 }
16367 else if (reload_completed
16368 && code == PLUS
16369 && !rtx_equal_p (dst, src1))
16370 {
16371 /* This is going to be an LEA; avoid splitting it later. */
16372 emit_insn (op);
16373 }
16374 else
16375 {
16376 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16377 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16378 }
16379
16380 /* Fix up the destination if needed. */
16381 if (dst != operands[0])
16382 emit_move_insn (operands[0], dst);
16383 }
16384
16385 /* Return TRUE or FALSE depending on whether the binary operator meets the
16386 appropriate constraints. */
16387
16388 bool
16389 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16390 rtx operands[3])
16391 {
16392 rtx dst = operands[0];
16393 rtx src1 = operands[1];
16394 rtx src2 = operands[2];
16395
16396 /* Both source operands cannot be in memory. */
16397 if (MEM_P (src1) && MEM_P (src2))
16398 return false;
16399
16400 /* Canonicalize operand order for commutative operators. */
16401 if (ix86_swap_binary_operands_p (code, mode, operands))
16402 {
16403 rtx temp = src1;
16404 src1 = src2;
16405 src2 = temp;
16406 }
16407
16408 /* If the destination is memory, we must have a matching source operand. */
16409 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16410 return false;
16411
16412 /* Source 1 cannot be a constant. */
16413 if (CONSTANT_P (src1))
16414 return false;
16415
16416 /* Source 1 cannot be a non-matching memory. */
16417 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16418 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16419 return (code == AND
16420 && (mode == HImode
16421 || mode == SImode
16422 || (TARGET_64BIT && mode == DImode))
16423 && satisfies_constraint_L (src2));
16424
16425 return true;
16426 }
16427
16428 /* Attempt to expand a unary operator.  Make the expansion closer to the
16429 actual machine than just general_operand, which would allow 2 separate
16430 memory references (one output, one input) in a single insn.  */
16431
16432 void
16433 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16434 rtx operands[])
16435 {
16436 int matching_memory;
16437 rtx src, dst, op, clob;
16438
16439 dst = operands[0];
16440 src = operands[1];
16441
16442 /* If the destination is memory, and we do not have matching source
16443 operands, do things in registers. */
16444 matching_memory = 0;
16445 if (MEM_P (dst))
16446 {
16447 if (rtx_equal_p (dst, src))
16448 matching_memory = 1;
16449 else
16450 dst = gen_reg_rtx (mode);
16451 }
16452
16453 /* When source operand is memory, destination must match. */
16454 if (MEM_P (src) && !matching_memory)
16455 src = force_reg (mode, src);
16456
16457 /* Emit the instruction. */
16458
16459 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16460 if (reload_in_progress || code == NOT)
16461 {
16462 /* Reload doesn't know about the flags register, and doesn't know that
16463 it doesn't want to clobber it. */
16464 gcc_assert (code == NOT);
16465 emit_insn (op);
16466 }
16467 else
16468 {
16469 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16470 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16471 }
16472
16473 /* Fix up the destination if needed. */
16474 if (dst != operands[0])
16475 emit_move_insn (operands[0], dst);
16476 }
16477
16478 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16479 divisor are within the range [0-255]. */
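/* An illustrative sketch (register and label names are hypothetical,
   not taken from the expander): for a 32-bit unsigned division this
   emits roughly

	mov	dividend, scratch
	or	divisor, scratch
	test	$-0x100, scratch	# any bit above the low 8 set?
	je	.Lqimode
	<full 32-bit div: quotient and remainder as usual>
	jmp	.Lend
   .Lqimode:
	<divb: AL = quotient, AH = remainder, then zero-extend/extract>
   .Lend:  */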
16480
16481 void
16482 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16483 bool signed_p)
16484 {
16485 rtx end_label, qimode_label;
16486 rtx insn, div, mod;
16487 rtx scratch, tmp0, tmp1, tmp2;
16488 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16489 rtx (*gen_zero_extend) (rtx, rtx);
16490 rtx (*gen_test_ccno_1) (rtx, rtx);
16491
16492 switch (mode)
16493 {
16494 case SImode:
16495 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16496 gen_test_ccno_1 = gen_testsi_ccno_1;
16497 gen_zero_extend = gen_zero_extendqisi2;
16498 break;
16499 case DImode:
16500 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16501 gen_test_ccno_1 = gen_testdi_ccno_1;
16502 gen_zero_extend = gen_zero_extendqidi2;
16503 break;
16504 default:
16505 gcc_unreachable ();
16506 }
16507
16508 end_label = gen_label_rtx ();
16509 qimode_label = gen_label_rtx ();
16510
16511 scratch = gen_reg_rtx (mode);
16512
16513 /* Use 8bit unsigned divmod if dividend and divisor are within
16514 the range [0-255].  */
16515 emit_move_insn (scratch, operands[2]);
16516 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16517 scratch, 1, OPTAB_DIRECT);
16518 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16519 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16520 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16521 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16522 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16523 pc_rtx);
16524 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16525 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16526 JUMP_LABEL (insn) = qimode_label;
16527
16528 /* Generate original signed/unsigned divmod.  */
16529 div = gen_divmod4_1 (operands[0], operands[1],
16530 operands[2], operands[3]);
16531 emit_insn (div);
16532
16533 /* Branch to the end. */
16534 emit_jump_insn (gen_jump (end_label));
16535 emit_barrier ();
16536
16537 /* Generate 8bit unsigned divide. */
16538 emit_label (qimode_label);
16539 /* Don't use operands[0] for result of 8bit divide since not all
16540 registers support QImode ZERO_EXTRACT. */
16541 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16542 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16543 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16544 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16545
16546 if (signed_p)
16547 {
16548 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16549 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16550 }
16551 else
16552 {
16553 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16554 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16555 }
16556
16557 /* Extract remainder from AH. */
16558 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16559 if (REG_P (operands[1]))
16560 insn = emit_move_insn (operands[1], tmp1);
16561 else
16562 {
16563 /* Need a new scratch register since the old one has result
16564 of 8bit divide. */
16565 scratch = gen_reg_rtx (mode);
16566 emit_move_insn (scratch, tmp1);
16567 insn = emit_move_insn (operands[1], scratch);
16568 }
16569 set_unique_reg_note (insn, REG_EQUAL, mod);
16570
16571 /* Zero extend quotient from AL. */
16572 tmp1 = gen_lowpart (QImode, tmp0);
16573 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16574 set_unique_reg_note (insn, REG_EQUAL, div);
16575
16576 emit_label (end_label);
16577 }
16578
16579 #define LEA_MAX_STALL (3)
16580 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16581
16582 /* Increase given DISTANCE in half-cycles according to
16583 dependencies between PREV and NEXT instructions.
16584 Add 1 half-cycle if there is no dependency and
16585 go to next cycle if there is some dependency.  */
16586
16587 static unsigned int
16588 increase_distance (rtx prev, rtx next, unsigned int distance)
16589 {
16590 df_ref *use_rec;
16591 df_ref *def_rec;
16592
16593 if (!prev || !next)
16594 return distance + (distance & 1) + 2;
16595
16596 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16597 return distance + 1;
16598
16599 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16600 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16601 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16602 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16603 return distance + (distance & 1) + 2;
16604
16605 return distance + 1;
16606 }
16607
16608 /* Check whether instruction INSN defines register number
16609 REGNO1 or REGNO2.  */
16610
16611 static bool
16612 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16613 rtx insn)
16614 {
16615 df_ref *def_rec;
16616
16617 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16618 if (DF_REF_REG_DEF_P (*def_rec)
16619 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16620 && (regno1 == DF_REF_REGNO (*def_rec)
16621 || regno2 == DF_REF_REGNO (*def_rec)))
16622 {
16623 return true;
16624 }
16625
16626 return false;
16627 }
16628
16629 /* Check whether instruction INSN uses register number
16630 REGNO as part of an address expression.  */
16631
16632 static bool
16633 insn_uses_reg_mem (unsigned int regno, rtx insn)
16634 {
16635 df_ref *use_rec;
16636
16637 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16638 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16639 return true;
16640
16641 return false;
16642 }
16643
16644 /* Search backward for non-agu definition of register number REGNO1
16645 or register number REGNO2 in basic block starting from instruction
16646 START up to head of basic block or instruction INSN.
16647
16648 Put true into *FOUND if a definition was found
16649 and false otherwise.
16650
16651 Distance in half-cycles between START and found instruction or head
16652 of BB is added to DISTANCE and returned. */
16653
16654 static int
16655 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16656 rtx insn, int distance,
16657 rtx start, bool *found)
16658 {
16659 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16660 rtx prev = start;
16661 rtx next = NULL;
16662
16663 *found = false;
16664
16665 while (prev
16666 && prev != insn
16667 && distance < LEA_SEARCH_THRESHOLD)
16668 {
16669 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16670 {
16671 distance = increase_distance (prev, next, distance);
16672 if (insn_defines_reg (regno1, regno2, prev))
16673 {
16674 if (recog_memoized (prev) < 0
16675 || get_attr_type (prev) != TYPE_LEA)
16676 {
16677 *found = true;
16678 return distance;
16679 }
16680 }
16681
16682 next = prev;
16683 }
16684 if (prev == BB_HEAD (bb))
16685 break;
16686
16687 prev = PREV_INSN (prev);
16688 }
16689
16690 return distance;
16691 }
16692
16693 /* Search backward for non-agu definition of register number REGNO1
16694 or register number REGNO2 in INSN's basic block until
16695 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16696 2. Reach neighbour BBs boundary, or
16697 3. Reach agu definition.
16698 Returns the distance between the non-agu definition point and INSN.
16699 If no definition point, returns -1. */
16700
16701 static int
16702 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16703 rtx insn)
16704 {
16705 basic_block bb = BLOCK_FOR_INSN (insn);
16706 int distance = 0;
16707 bool found = false;
16708
16709 if (insn != BB_HEAD (bb))
16710 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16711 distance, PREV_INSN (insn),
16712 &found);
16713
16714 if (!found && distance < LEA_SEARCH_THRESHOLD)
16715 {
16716 edge e;
16717 edge_iterator ei;
16718 bool simple_loop = false;
16719
16720 FOR_EACH_EDGE (e, ei, bb->preds)
16721 if (e->src == bb)
16722 {
16723 simple_loop = true;
16724 break;
16725 }
16726
16727 if (simple_loop)
16728 distance = distance_non_agu_define_in_bb (regno1, regno2,
16729 insn, distance,
16730 BB_END (bb), &found);
16731 else
16732 {
16733 int shortest_dist = -1;
16734 bool found_in_bb = false;
16735
16736 FOR_EACH_EDGE (e, ei, bb->preds)
16737 {
16738 int bb_dist
16739 = distance_non_agu_define_in_bb (regno1, regno2,
16740 insn, distance,
16741 BB_END (e->src),
16742 &found_in_bb);
16743 if (found_in_bb)
16744 {
16745 if (shortest_dist < 0)
16746 shortest_dist = bb_dist;
16747 else if (bb_dist > 0)
16748 shortest_dist = MIN (bb_dist, shortest_dist);
16749
16750 found = true;
16751 }
16752 }
16753
16754 distance = shortest_dist;
16755 }
16756 }
16757
16758 /* get_attr_type may modify recog data. We want to make sure
16759 that recog data is valid for instruction INSN, on which
16760 distance_non_agu_define is called. INSN is unchanged here. */
16761 extract_insn_cached (insn);
16762
16763 if (!found)
16764 return -1;
16765
16766 return distance >> 1;
16767 }
16768
16769 /* Return the distance in half-cycles, added to DISTANCE, between INSN
16770 and the next insn that uses register number REGNO in a memory
16771 address.  Return -1 if REGNO is set.
16772
16773 Put true into *FOUND if a register usage was found and
16774 false otherwise.
16775 Put true into *REDEFINED if a register redefinition was
16776 found and false otherwise.  */
16777
16778 static int
16779 distance_agu_use_in_bb (unsigned int regno,
16780 rtx insn, int distance, rtx start,
16781 bool *found, bool *redefined)
16782 {
16783 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16784 rtx next = start;
16785 rtx prev = NULL;
16786
16787 *found = false;
16788 *redefined = false;
16789
16790 while (next
16791 && next != insn
16792 && distance < LEA_SEARCH_THRESHOLD)
16793 {
16794 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16795 {
16796 distance = increase_distance(prev, next, distance);
16797 if (insn_uses_reg_mem (regno, next))
16798 {
16799 /* Return DISTANCE if OP0 is used in memory
16800 address in NEXT. */
16801 *found = true;
16802 return distance;
16803 }
16804
16805 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16806 {
16807 /* Return -1 if OP0 is set in NEXT. */
16808 *redefined = true;
16809 return -1;
16810 }
16811
16812 prev = next;
16813 }
16814
16815 if (next == BB_END (bb))
16816 break;
16817
16818 next = NEXT_INSN (next);
16819 }
16820
16821 return distance;
16822 }
16823
16824 /* Return the distance between INSN and the next insn that uses
16825 register number REGNO0 in a memory address.  Return -1 if no such
16826 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set.  */
16827
16828 static int
16829 distance_agu_use (unsigned int regno0, rtx insn)
16830 {
16831 basic_block bb = BLOCK_FOR_INSN (insn);
16832 int distance = 0;
16833 bool found = false;
16834 bool redefined = false;
16835
16836 if (insn != BB_END (bb))
16837 distance = distance_agu_use_in_bb (regno0, insn, distance,
16838 NEXT_INSN (insn),
16839 &found, &redefined);
16840
16841 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16842 {
16843 edge e;
16844 edge_iterator ei;
16845 bool simple_loop = false;
16846
16847 FOR_EACH_EDGE (e, ei, bb->succs)
16848 if (e->dest == bb)
16849 {
16850 simple_loop = true;
16851 break;
16852 }
16853
16854 if (simple_loop)
16855 distance = distance_agu_use_in_bb (regno0, insn,
16856 distance, BB_HEAD (bb),
16857 &found, &redefined);
16858 else
16859 {
16860 int shortest_dist = -1;
16861 bool found_in_bb = false;
16862 bool redefined_in_bb = false;
16863
16864 FOR_EACH_EDGE (e, ei, bb->succs)
16865 {
16866 int bb_dist
16867 = distance_agu_use_in_bb (regno0, insn,
16868 distance, BB_HEAD (e->dest),
16869 &found_in_bb, &redefined_in_bb);
16870 if (found_in_bb)
16871 {
16872 if (shortest_dist < 0)
16873 shortest_dist = bb_dist;
16874 else if (bb_dist > 0)
16875 shortest_dist = MIN (bb_dist, shortest_dist);
16876
16877 found = true;
16878 }
16879 }
16880
16881 distance = shortest_dist;
16882 }
16883 }
16884
16885 if (!found || redefined)
16886 return -1;
16887
16888 return distance >> 1;
16889 }
16890
16891 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16892 there is a choice between LEA and ADD.
16893 Negative value: ADD is preferred over LEA
16894 Zero: Neutral
16895 Positive value: LEA is preferred over ADD  */
16896 #define IX86_LEA_PRIORITY 0
16897
16898 /* Return true if using lea INSN has a performance advantage
16899 over a sequence of instructions.  The instruction sequence has
16900 SPLIT_COST cycles higher latency than the lea latency.  */
16901
16902 static bool
16903 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16904 unsigned int regno2, int split_cost)
16905 {
16906 int dist_define, dist_use;
16907
16908 dist_define = distance_non_agu_define (regno1, regno2, insn);
16909 dist_use = distance_agu_use (regno0, insn);
16910
16911 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16912 {
16913 /* If there is no non-AGU operand definition, no AGU
16914 operand usage and the split cost is 0, then both the lea
16915 and non-lea variants have the same priority.  Currently
16916 we prefer lea for 64-bit code and non-lea for 32-bit
16917 code.  */
16918 if (dist_use < 0 && split_cost == 0)
16919 return TARGET_64BIT || IX86_LEA_PRIORITY;
16920 else
16921 return true;
16922 }
16923
16924 /* The longer the distance from the definition, the more preferable
16925 lea becomes.  Here we adjust it to take the splitting cost and
16926 lea priority into account.  */
16927 dist_define += split_cost + IX86_LEA_PRIORITY;
16928
16929 /* If there is no use in a memory address then we just check
16930 that the split cost does not exceed the AGU stall.  */
16931 if (dist_use < 0)
16932 return dist_define >= LEA_MAX_STALL;
16933
16934 /* If this insn has both backward non-agu dependence and forward
16935 agu dependence, the one with short distance takes effect. */
16936 return dist_define >= dist_use;
16937 }
16938
16939 /* Return true if it is legal for INSN to clobber the flags register,
16940 and false otherwise.  */
16941
16942 static bool
16943 ix86_ok_to_clobber_flags (rtx insn)
16944 {
16945 basic_block bb = BLOCK_FOR_INSN (insn);
16946 df_ref *use;
16947 bitmap live;
16948
16949 while (insn)
16950 {
16951 if (NONDEBUG_INSN_P (insn))
16952 {
16953 for (use = DF_INSN_USES (insn); *use; use++)
16954 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16955 return false;
16956
16957 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16958 return true;
16959 }
16960
16961 if (insn == BB_END (bb))
16962 break;
16963
16964 insn = NEXT_INSN (insn);
16965 }
16966
16967 live = df_get_live_out(bb);
16968 return !REGNO_REG_SET_P (live, FLAGS_REG);
16969 }
16970
16971 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16972 move and add to avoid AGU stalls. */
16973
16974 bool
16975 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16976 {
16977 unsigned int regno0, regno1, regno2;
16978
16979 /* Check if we need to optimize. */
16980 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16981 return false;
16982
16983 /* Check it is correct to split here. */
16984 if (!ix86_ok_to_clobber_flags(insn))
16985 return false;
16986
16987 regno0 = true_regnum (operands[0]);
16988 regno1 = true_regnum (operands[1]);
16989 regno2 = true_regnum (operands[2]);
16990
16991 /* We need to split only adds with a non-destructive
16992 destination operand.  */
16993 if (regno0 == regno1 || regno0 == regno2)
16994 return false;
16995 else
16996 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16997 }
16998
16999 /* Return true if we should emit lea instruction instead of mov
17000 instruction. */
17001
17002 bool
17003 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17004 {
17005 unsigned int regno0, regno1;
17006
17007 /* Check if we need to optimize. */
17008 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17009 return false;
17010
17011 /* Use lea for reg to reg moves only. */
17012 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17013 return false;
17014
17015 regno0 = true_regnum (operands[0]);
17016 regno1 = true_regnum (operands[1]);
17017
17018 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17019 }
17020
17021 /* Return true if we need to split lea into a sequence of
17022 instructions to avoid AGU stalls. */
17023
17024 bool
17025 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17026 {
17027 unsigned int regno0, regno1, regno2;
17028 int split_cost;
17029 struct ix86_address parts;
17030 int ok;
17031
17032 /* FIXME: Handle zero-extended addresses. */
17033 if (GET_CODE (operands[1]) == ZERO_EXTEND
17034 || GET_CODE (operands[1]) == AND)
17035 return false;
17036
17037 /* Check we need to optimize. */
17038 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17039 return false;
17040
17041 /* Check it is correct to split here. */
17042 if (!ix86_ok_to_clobber_flags(insn))
17043 return false;
17044
17045 ok = ix86_decompose_address (operands[1], &parts);
17046 gcc_assert (ok);
17047
17048 /* We should not split into add if a non-legitimate PIC
17049 operand is used as the displacement.  */
17050 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17051 return false;
17052
17053 regno0 = true_regnum (operands[0]);
17054 regno1 = INVALID_REGNUM;
17055 regno2 = INVALID_REGNUM;
17056
17057 if (parts.base)
17058 regno1 = true_regnum (parts.base);
17059 if (parts.index)
17060 regno2 = true_regnum (parts.index);
17061
17062 split_cost = 0;
17063
17064 /* Compute how many cycles we will add to execution time
17065 if we split the lea into a sequence of instructions.  */
17066 if (parts.base || parts.index)
17067 {
17068 /* Have to use a mov instruction if the non-destructive
17069 destination form is used.  */
17070 if (regno1 != regno0 && regno2 != regno0)
17071 split_cost += 1;
17072
17073 /* Have to add index to base if both exist. */
17074 if (parts.base && parts.index)
17075 split_cost += 1;
17076
17077 /* Have to use shift and adds if scale is 2 or greater. */
17078 if (parts.scale > 1)
17079 {
17080 if (regno0 != regno1)
17081 split_cost += 1;
17082 else if (regno2 == regno0)
17083 split_cost += 4;
17084 else
17085 split_cost += parts.scale;
17086 }
17087
17088 /* Have to use add instruction with immediate if
17089 disp is non zero. */
17090 if (parts.disp && parts.disp != const0_rtx)
17091 split_cost += 1;
17092
17093 /* Subtract the price of lea. */
17094 split_cost -= 1;
17095 }
17096
17097 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17098 }
17099
17100 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17101 matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
17102
17103 static void
17104 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17105 rtx dst, rtx src)
17106 {
17107 rtx op, clob;
17108
17109 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17110 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17111
17112 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17113 }
17114
17115 /* Split lea instructions into a sequence of instructions
17116 which are executed on ALU to avoid AGU stalls.
17117 It is assumed that it is allowed to clobber flags register
17118 at lea position. */
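/* A hedged example of the intent (the exact operand choices depend on
   the register-overlap cases handled below): a lea such as

	leal	12(%ebx,%ecx,4), %eax

   may be replaced by an ALU-only sequence along the lines of

	movl	%ecx, %eax
	sall	$2, %eax
	addl	%ebx, %eax
	addl	$12, %eax  */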
17119
17120 extern void
17121 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
17122 {
17123 unsigned int regno0, regno1, regno2;
17124 struct ix86_address parts;
17125 rtx target, tmp;
17126 int ok, adds;
17127
17128 ok = ix86_decompose_address (operands[1], &parts);
17129 gcc_assert (ok);
17130
17131 target = operands[0];
17132
17133 regno0 = true_regnum (target);
17134 regno1 = INVALID_REGNUM;
17135 regno2 = INVALID_REGNUM;
17136
17137 if (parts.base)
17138 {
17139 if (GET_MODE (parts.base) != mode)
17140 parts.base = gen_lowpart (mode, parts.base);
17141 regno1 = true_regnum (parts.base);
17142 }
17143
17144 if (parts.index)
17145 {
17146 if (GET_MODE (parts.index) != mode)
17147 parts.index = gen_lowpart (mode, parts.index);
17148 regno2 = true_regnum (parts.index);
17149 }
17150
17151 if (parts.scale > 1)
17152 {
17153 /* Case r1 = r1 + ... */
17154 if (regno1 == regno0)
17155 {
17156 /* If we have a case r1 = r1 + C * r1 then we
17157 would have to use multiplication, which is very
17158 expensive.  Assume the cost model is wrong if such
17159 a case reaches here.  */
17160 gcc_assert (regno2 != regno0);
17161
17162 for (adds = parts.scale; adds > 0; adds--)
17163 ix86_emit_binop (PLUS, mode, target, parts.index);
17164 }
17165 else
17166 {
17167 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17168 if (regno0 != regno2)
17169 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17170
17171 /* Use shift for scaling. */
17172 ix86_emit_binop (ASHIFT, mode, target,
17173 GEN_INT (exact_log2 (parts.scale)));
17174
17175 if (parts.base)
17176 ix86_emit_binop (PLUS, mode, target, parts.base);
17177
17178 if (parts.disp && parts.disp != const0_rtx)
17179 ix86_emit_binop (PLUS, mode, target, parts.disp);
17180 }
17181 }
17182 else if (!parts.base && !parts.index)
17183 {
17184 gcc_assert(parts.disp);
17185 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17186 }
17187 else
17188 {
17189 if (!parts.base)
17190 {
17191 if (regno0 != regno2)
17192 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17193 }
17194 else if (!parts.index)
17195 {
17196 if (regno0 != regno1)
17197 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17198 }
17199 else
17200 {
17201 if (regno0 == regno1)
17202 tmp = parts.index;
17203 else if (regno0 == regno2)
17204 tmp = parts.base;
17205 else
17206 {
17207 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17208 tmp = parts.index;
17209 }
17210
17211 ix86_emit_binop (PLUS, mode, target, tmp);
17212 }
17213
17214 if (parts.disp && parts.disp != const0_rtx)
17215 ix86_emit_binop (PLUS, mode, target, parts.disp);
17216 }
17217 }
17218
17219 /* Return true if it is ok to optimize an ADD operation to LEA
17220 operation to avoid flag register consumption.  For most processors,
17221 ADD is faster than LEA.  For processors like ATOM, if the
17222 destination register of LEA holds an actual address which will be
17223 used soon, LEA is better; otherwise ADD is better.  */
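/* Purely as an illustration: "addl %ebx, %eax" and
   "leal (%ebx,%eax), %eax" compute the same value, but the lea does not
   clobber the flags and, on AGU-based chips such as Atom, executes on
   the address-generation unit, so a result that soon feeds a memory
   address avoids an AGU stall.  */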
17224
17225 bool
17226 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17227 {
17228 unsigned int regno0 = true_regnum (operands[0]);
17229 unsigned int regno1 = true_regnum (operands[1]);
17230 unsigned int regno2 = true_regnum (operands[2]);
17231
17232 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17233 if (regno0 != regno1 && regno0 != regno2)
17234 return true;
17235
17236 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17237 return false;
17238
17239 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17240 }
17241
17242 /* Return true if destination reg of SET_BODY is shift count of
17243 USE_BODY. */
17244
17245 static bool
17246 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17247 {
17248 rtx set_dest;
17249 rtx shift_rtx;
17250 int i;
17251
17252 /* Retrieve destination of SET_BODY. */
17253 switch (GET_CODE (set_body))
17254 {
17255 case SET:
17256 set_dest = SET_DEST (set_body);
17257 if (!set_dest || !REG_P (set_dest))
17258 return false;
17259 break;
17260 case PARALLEL:
17261 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17262 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17263 use_body))
17264 return true;
17265 default:
17266 return false;
17267 break;
17268 }
17269
17270 /* Retrieve shift count of USE_BODY. */
17271 switch (GET_CODE (use_body))
17272 {
17273 case SET:
17274 shift_rtx = XEXP (use_body, 1);
17275 break;
17276 case PARALLEL:
17277 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17278 if (ix86_dep_by_shift_count_body (set_body,
17279 XVECEXP (use_body, 0, i)))
17280 return true;
17281 default:
17282 return false;
17283 break;
17284 }
17285
17286 if (shift_rtx
17287 && (GET_CODE (shift_rtx) == ASHIFT
17288 || GET_CODE (shift_rtx) == LSHIFTRT
17289 || GET_CODE (shift_rtx) == ASHIFTRT
17290 || GET_CODE (shift_rtx) == ROTATE
17291 || GET_CODE (shift_rtx) == ROTATERT))
17292 {
17293 rtx shift_count = XEXP (shift_rtx, 1);
17294
17295 /* Return true if shift count is dest of SET_BODY. */
17296 if (REG_P (shift_count)
17297 && true_regnum (set_dest) == true_regnum (shift_count))
17298 return true;
17299 }
17300
17301 return false;
17302 }
17303
17304 /* Return true if destination reg of SET_INSN is shift count of
17305 USE_INSN. */
17306
17307 bool
17308 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17309 {
17310 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17311 PATTERN (use_insn));
17312 }
17313
17314 /* Return TRUE or FALSE depending on whether the unary operator meets the
17315 appropriate constraints. */
17316
17317 bool
17318 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17319 enum machine_mode mode ATTRIBUTE_UNUSED,
17320 rtx operands[2] ATTRIBUTE_UNUSED)
17321 {
17322 /* If one of operands is memory, source and destination must match. */
17323 if ((MEM_P (operands[0])
17324 || MEM_P (operands[1]))
17325 && ! rtx_equal_p (operands[0], operands[1]))
17326 return false;
17327 return true;
17328 }
17329
17330 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17331 are ok, keeping in mind the possible movddup alternative. */
17332
17333 bool
17334 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17335 {
17336 if (MEM_P (operands[0]))
17337 return rtx_equal_p (operands[0], operands[1 + high]);
17338 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17339 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17340 return true;
17341 }
17342
17343 /* Post-reload splitter for converting an SF or DFmode value in an
17344 SSE register into an unsigned SImode. */
17345
17346 void
17347 ix86_split_convert_uns_si_sse (rtx operands[])
17348 {
17349 enum machine_mode vecmode;
17350 rtx value, large, zero_or_two31, input, two31, x;
17351
17352 large = operands[1];
17353 zero_or_two31 = operands[2];
17354 input = operands[3];
17355 two31 = operands[4];
17356 vecmode = GET_MODE (large);
17357 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17358
17359 /* Load up the value into the low element. We must ensure that the other
17360 elements are valid floats -- zero is the easiest such value. */
17361 if (MEM_P (input))
17362 {
17363 if (vecmode == V4SFmode)
17364 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17365 else
17366 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17367 }
17368 else
17369 {
17370 input = gen_rtx_REG (vecmode, REGNO (input));
17371 emit_move_insn (value, CONST0_RTX (vecmode));
17372 if (vecmode == V4SFmode)
17373 emit_insn (gen_sse_movss (value, value, input));
17374 else
17375 emit_insn (gen_sse2_movsd (value, value, input));
17376 }
17377
17378 emit_move_insn (large, two31);
17379 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17380
17381 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17382 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17383
17384 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17385 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17386
17387 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17388 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17389
17390 large = gen_rtx_REG (V4SImode, REGNO (large));
17391 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17392
17393 x = gen_rtx_REG (V4SImode, REGNO (value));
17394 if (vecmode == V4SFmode)
17395 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17396 else
17397 emit_insn (gen_sse2_cvttpd2dq (x, value));
17398 value = x;
17399
17400 emit_insn (gen_xorv4si3 (value, value, large));
17401 }
17402
17403 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17404 Expects the 64-bit DImode to be supplied in a pair of integral
17405 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17406 -mfpmath=sse, !optimize_size only. */
17407
17408 void
17409 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17410 {
17411 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17412 rtx int_xmm, fp_xmm;
17413 rtx biases, exponents;
17414 rtx x;
17415
17416 int_xmm = gen_reg_rtx (V4SImode);
17417 if (TARGET_INTER_UNIT_MOVES)
17418 emit_insn (gen_movdi_to_sse (int_xmm, input));
17419 else if (TARGET_SSE_SPLIT_REGS)
17420 {
17421 emit_clobber (int_xmm);
17422 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17423 }
17424 else
17425 {
17426 x = gen_reg_rtx (V2DImode);
17427 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17428 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17429 }
17430
17431 x = gen_rtx_CONST_VECTOR (V4SImode,
17432 gen_rtvec (4, GEN_INT (0x43300000UL),
17433 GEN_INT (0x45300000UL),
17434 const0_rtx, const0_rtx));
17435 exponents = validize_mem (force_const_mem (V4SImode, x));
17436
17437 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17438 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17439
17440 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17441 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17442 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17443 (0x1.0p84 + double(fp_value_hi_xmm)).
17444 Note these exponents differ by 32. */
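  /* Worked example (illustrative values): for input hi:lo =
     0x00000001:0x00000002 the two lanes hold 0x1.0p52 + 2 and
     0x1.0p84 + 0x1.0p32; after the bias subtraction below they hold
     2.0 and 4294967296.0, whose sum 4294967298.0 equals
     1 * 2**32 + 2, the original unsigned 64-bit value.  */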
17445
17446 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17447
17448 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17449 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17450 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17451 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17452 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17453 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17454 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17455 biases = validize_mem (force_const_mem (V2DFmode, biases));
17456 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17457
17458 /* Add the upper and lower DFmode values together. */
17459 if (TARGET_SSE3)
17460 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17461 else
17462 {
17463 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17464 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17465 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17466 }
17467
17468 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17469 }
17470
17471 /* Not used, but eases macroization of patterns. */
17472 void
17473 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17474 rtx input ATTRIBUTE_UNUSED)
17475 {
17476 gcc_unreachable ();
17477 }
17478
17479 /* Convert an unsigned SImode value into a DFmode. Only currently used
17480 for SSE, but applicable anywhere. */
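/* A minimal numeric sketch: for input 0xffffffff the PLUS with
   -2147483648 wraps to the signed value 2147483647, which converts
   exactly to 2147483647.0; adding back 0x1.0p31 gives 4294967295.0,
   the expected unsigned result.  */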
17481
17482 void
17483 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17484 {
17485 REAL_VALUE_TYPE TWO31r;
17486 rtx x, fp;
17487
17488 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17489 NULL, 1, OPTAB_DIRECT);
17490
17491 fp = gen_reg_rtx (DFmode);
17492 emit_insn (gen_floatsidf2 (fp, x));
17493
17494 real_ldexp (&TWO31r, &dconst1, 31);
17495 x = const_double_from_real_value (TWO31r, DFmode);
17496
17497 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17498 if (x != target)
17499 emit_move_insn (target, x);
17500 }
17501
17502 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17503 32-bit mode; otherwise we have a direct convert instruction. */
17504
17505 void
17506 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17507 {
17508 REAL_VALUE_TYPE TWO32r;
17509 rtx fp_lo, fp_hi, x;
17510
17511 fp_lo = gen_reg_rtx (DFmode);
17512 fp_hi = gen_reg_rtx (DFmode);
17513
17514 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17515
17516 real_ldexp (&TWO32r, &dconst1, 32);
17517 x = const_double_from_real_value (TWO32r, DFmode);
17518 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17519
17520 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17521
17522 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17523 0, OPTAB_DIRECT);
17524 if (x != target)
17525 emit_move_insn (target, x);
17526 }
17527
17528 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17529 For x86_32, -mfpmath=sse, !optimize_size only. */
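/* The computation below is essentially
   (float) (input >> 16) * 65536.0f + (float) (input & 0xffff),
   each half being small enough for an exact signed cvtsi2ss;
   e.g. (illustrative) 0x00030001 becomes 3.0f * 65536.0f + 1.0f
   = 196609.0f.  */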
17530 void
17531 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17532 {
17533 REAL_VALUE_TYPE ONE16r;
17534 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17535
17536 real_ldexp (&ONE16r, &dconst1, 16);
17537 x = const_double_from_real_value (ONE16r, SFmode);
17538 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17539 NULL, 0, OPTAB_DIRECT);
17540 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17541 NULL, 0, OPTAB_DIRECT);
17542 fp_hi = gen_reg_rtx (SFmode);
17543 fp_lo = gen_reg_rtx (SFmode);
17544 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17545 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17546 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17547 0, OPTAB_DIRECT);
17548 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17549 0, OPTAB_DIRECT);
17550 if (!rtx_equal_p (target, fp_hi))
17551 emit_move_insn (target, fp_hi);
17552 }
17553
17554 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17555 a vector of unsigned ints VAL to vector of floats TARGET. */
17556
17557 void
17558 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17559 {
17560 rtx tmp[8];
17561 REAL_VALUE_TYPE TWO16r;
17562 enum machine_mode intmode = GET_MODE (val);
17563 enum machine_mode fltmode = GET_MODE (target);
17564 rtx (*cvt) (rtx, rtx);
17565
17566 if (intmode == V4SImode)
17567 cvt = gen_floatv4siv4sf2;
17568 else
17569 cvt = gen_floatv8siv8sf2;
17570 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17571 tmp[0] = force_reg (intmode, tmp[0]);
17572 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17573 OPTAB_DIRECT);
17574 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17575 NULL_RTX, 1, OPTAB_DIRECT);
17576 tmp[3] = gen_reg_rtx (fltmode);
17577 emit_insn (cvt (tmp[3], tmp[1]));
17578 tmp[4] = gen_reg_rtx (fltmode);
17579 emit_insn (cvt (tmp[4], tmp[2]));
17580 real_ldexp (&TWO16r, &dconst1, 16);
17581 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17582 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17583 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17584 OPTAB_DIRECT);
17585 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17586 OPTAB_DIRECT);
17587 if (tmp[7] != target)
17588 emit_move_insn (target, tmp[7]);
17589 }
17590
17591 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17592 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17593 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17594 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
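/* Worked example (one lane, illustrative): for val = 3.5e9, which is
   >= 0x1p31, the code subtracts 0x1p31 giving 1352516352.0; the signed
   truncation of that is 1352516352, and xoring in 0x80000000 afterwards
   restores 3500000000, the correct unsigned result.  */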
17595
17596 rtx
17597 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17598 {
17599 REAL_VALUE_TYPE TWO31r;
17600 rtx two31r, tmp[4];
17601 enum machine_mode mode = GET_MODE (val);
17602 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17603 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17604 rtx (*cmp) (rtx, rtx, rtx, rtx);
17605 int i;
17606
17607 for (i = 0; i < 3; i++)
17608 tmp[i] = gen_reg_rtx (mode);
17609 real_ldexp (&TWO31r, &dconst1, 31);
17610 two31r = const_double_from_real_value (TWO31r, scalarmode);
17611 two31r = ix86_build_const_vector (mode, 1, two31r);
17612 two31r = force_reg (mode, two31r);
17613 switch (mode)
17614 {
17615 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17616 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17617 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17618 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17619 default: gcc_unreachable ();
17620 }
17621 tmp[3] = gen_rtx_LE (mode, two31r, val);
17622 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17623 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17624 0, OPTAB_DIRECT);
17625 if (intmode == V4SImode || TARGET_AVX2)
17626 *xorp = expand_simple_binop (intmode, ASHIFT,
17627 gen_lowpart (intmode, tmp[0]),
17628 GEN_INT (31), NULL_RTX, 0,
17629 OPTAB_DIRECT);
17630 else
17631 {
17632 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17633 two31 = ix86_build_const_vector (intmode, 1, two31);
17634 *xorp = expand_simple_binop (intmode, AND,
17635 gen_lowpart (intmode, tmp[0]),
17636 two31, NULL_RTX, 0,
17637 OPTAB_DIRECT);
17638 }
17639 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17640 0, OPTAB_DIRECT);
17641 }
17642
17643 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17644 then replicate the value for all elements of the vector
17645 register. */
17646
17647 rtx
17648 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17649 {
17650 int i, n_elt;
17651 rtvec v;
17652 enum machine_mode scalar_mode;
17653
17654 switch (mode)
17655 {
17656 case V32QImode:
17657 case V16QImode:
17658 case V16HImode:
17659 case V8HImode:
17660 case V8SImode:
17661 case V4SImode:
17662 case V4DImode:
17663 case V2DImode:
17664 gcc_assert (vect);
17665 case V8SFmode:
17666 case V4SFmode:
17667 case V4DFmode:
17668 case V2DFmode:
17669 n_elt = GET_MODE_NUNITS (mode);
17670 v = rtvec_alloc (n_elt);
17671 scalar_mode = GET_MODE_INNER (mode);
17672
17673 RTVEC_ELT (v, 0) = value;
17674
17675 for (i = 1; i < n_elt; ++i)
17676 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17677
17678 return gen_rtx_CONST_VECTOR (mode, v);
17679
17680 default:
17681 gcc_unreachable ();
17682 }
17683 }
17684
17685 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17686 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17687 for an SSE register. If VECT is true, then replicate the mask for
17688 all elements of the vector register. If INVERT is true, then create
17689 a mask excluding the sign bit. */
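/* For example, for V4SFmode with VECT set the mask is
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } reinterpreted as
   floats, and with INVERT it is { 0x7fffffff, ... }; ANDing with the
   inverted mask implements fabs, XORing with the plain mask negates.  */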
17690
17691 rtx
17692 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17693 {
17694 enum machine_mode vec_mode, imode;
17695 HOST_WIDE_INT hi, lo;
17696 int shift = 63;
17697 rtx v;
17698 rtx mask;
17699
17700 /* Find the sign bit, sign extended to 2*HWI. */
17701 switch (mode)
17702 {
17703 case V8SImode:
17704 case V4SImode:
17705 case V8SFmode:
17706 case V4SFmode:
17707 vec_mode = mode;
17708 mode = GET_MODE_INNER (mode);
17709 imode = SImode;
17710 lo = 0x80000000, hi = lo < 0;
17711 break;
17712
17713 case V4DImode:
17714 case V2DImode:
17715 case V4DFmode:
17716 case V2DFmode:
17717 vec_mode = mode;
17718 mode = GET_MODE_INNER (mode);
17719 imode = DImode;
17720 if (HOST_BITS_PER_WIDE_INT >= 64)
17721 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17722 else
17723 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17724 break;
17725
17726 case TImode:
17727 case TFmode:
17728 vec_mode = VOIDmode;
17729 if (HOST_BITS_PER_WIDE_INT >= 64)
17730 {
17731 imode = TImode;
17732 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17733 }
17734 else
17735 {
17736 rtvec vec;
17737
17738 imode = DImode;
17739 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17740
17741 if (invert)
17742 {
17743 lo = ~lo, hi = ~hi;
17744 v = constm1_rtx;
17745 }
17746 else
17747 v = const0_rtx;
17748
17749 mask = immed_double_const (lo, hi, imode);
17750
17751 vec = gen_rtvec (2, v, mask);
17752 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17753 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17754
17755 return v;
17756 }
17757 break;
17758
17759 default:
17760 gcc_unreachable ();
17761 }
17762
17763 if (invert)
17764 lo = ~lo, hi = ~hi;
17765
17766 /* Force this value into the low part of a fp vector constant. */
17767 mask = immed_double_const (lo, hi, imode);
17768 mask = gen_lowpart (mode, mask);
17769
17770 if (vec_mode == VOIDmode)
17771 return force_reg (mode, mask);
17772
17773 v = ix86_build_const_vector (vec_mode, vect, mask);
17774 return force_reg (vec_mode, v);
17775 }
17776
17777 /* Generate code for floating point ABS or NEG. */
17778
17779 void
17780 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17781 rtx operands[])
17782 {
17783 rtx mask, set, dst, src;
17784 bool use_sse = false;
17785 bool vector_mode = VECTOR_MODE_P (mode);
17786 enum machine_mode vmode = mode;
17787
17788 if (vector_mode)
17789 use_sse = true;
17790 else if (mode == TFmode)
17791 use_sse = true;
17792 else if (TARGET_SSE_MATH)
17793 {
17794 use_sse = SSE_FLOAT_MODE_P (mode);
17795 if (mode == SFmode)
17796 vmode = V4SFmode;
17797 else if (mode == DFmode)
17798 vmode = V2DFmode;
17799 }
17800
17801 /* NEG and ABS performed with SSE use bitwise mask operations.
17802 Create the appropriate mask now. */
17803 if (use_sse)
17804 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17805 else
17806 mask = NULL_RTX;
17807
17808 dst = operands[0];
17809 src = operands[1];
17810
17811 set = gen_rtx_fmt_e (code, mode, src);
17812 set = gen_rtx_SET (VOIDmode, dst, set);
17813
17814 if (mask)
17815 {
17816 rtx use, clob;
17817 rtvec par;
17818
17819 use = gen_rtx_USE (VOIDmode, mask);
17820 if (vector_mode)
17821 par = gen_rtvec (2, set, use);
17822 else
17823 {
17824 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17825 par = gen_rtvec (3, set, use, clob);
17826 }
17827 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17828 }
17829 else
17830 emit_insn (set);
17831 }
17832
17833 /* Expand a copysign operation. Special case operand 0 being a constant. */
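/* In bitwise terms the operation computed here is
     result = (op0 & ~signbit_mask) | (op1 & signbit_mask),
   i.e. the magnitude of op0 combined with the sign of op1; the split
   routines below materialize exactly these AND/IOR steps.  */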
17834
17835 void
17836 ix86_expand_copysign (rtx operands[])
17837 {
17838 enum machine_mode mode, vmode;
17839 rtx dest, op0, op1, mask, nmask;
17840
17841 dest = operands[0];
17842 op0 = operands[1];
17843 op1 = operands[2];
17844
17845 mode = GET_MODE (dest);
17846
17847 if (mode == SFmode)
17848 vmode = V4SFmode;
17849 else if (mode == DFmode)
17850 vmode = V2DFmode;
17851 else
17852 vmode = mode;
17853
17854 if (GET_CODE (op0) == CONST_DOUBLE)
17855 {
17856 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17857
17858 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17859 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17860
17861 if (mode == SFmode || mode == DFmode)
17862 {
17863 if (op0 == CONST0_RTX (mode))
17864 op0 = CONST0_RTX (vmode);
17865 else
17866 {
17867 rtx v = ix86_build_const_vector (vmode, false, op0);
17868
17869 op0 = force_reg (vmode, v);
17870 }
17871 }
17872 else if (op0 != CONST0_RTX (mode))
17873 op0 = force_reg (mode, op0);
17874
17875 mask = ix86_build_signbit_mask (vmode, 0, 0);
17876
17877 if (mode == SFmode)
17878 copysign_insn = gen_copysignsf3_const;
17879 else if (mode == DFmode)
17880 copysign_insn = gen_copysigndf3_const;
17881 else
17882 copysign_insn = gen_copysigntf3_const;
17883
17884 emit_insn (copysign_insn (dest, op0, op1, mask));
17885 }
17886 else
17887 {
17888 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17889
17890 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17891 mask = ix86_build_signbit_mask (vmode, 0, 0);
17892
17893 if (mode == SFmode)
17894 copysign_insn = gen_copysignsf3_var;
17895 else if (mode == DFmode)
17896 copysign_insn = gen_copysigndf3_var;
17897 else
17898 copysign_insn = gen_copysigntf3_var;
17899
17900 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17901 }
17902 }
17903
17904 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17905 be a constant, and so has already been expanded into a vector constant. */
17906
17907 void
17908 ix86_split_copysign_const (rtx operands[])
17909 {
17910 enum machine_mode mode, vmode;
17911 rtx dest, op0, mask, x;
17912
17913 dest = operands[0];
17914 op0 = operands[1];
17915 mask = operands[3];
17916
17917 mode = GET_MODE (dest);
17918 vmode = GET_MODE (mask);
17919
17920 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17921 x = gen_rtx_AND (vmode, dest, mask);
17922 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17923
17924 if (op0 != CONST0_RTX (vmode))
17925 {
17926 x = gen_rtx_IOR (vmode, dest, op0);
17927 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17928 }
17929 }
17930
17931 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17932 so we have to do two masks. */
17933
17934 void
17935 ix86_split_copysign_var (rtx operands[])
17936 {
17937 enum machine_mode mode, vmode;
17938 rtx dest, scratch, op0, op1, mask, nmask, x;
17939
17940 dest = operands[0];
17941 scratch = operands[1];
17942 op0 = operands[2];
17943 op1 = operands[3];
17944 nmask = operands[4];
17945 mask = operands[5];
17946
17947 mode = GET_MODE (dest);
17948 vmode = GET_MODE (mask);
17949
17950 if (rtx_equal_p (op0, op1))
17951 {
17952 /* Shouldn't happen often (it's useless, obviously), but when it does
17953 we'd generate incorrect code if we continue below. */
17954 emit_move_insn (dest, op0);
17955 return;
17956 }
17957
17958 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17959 {
17960 gcc_assert (REGNO (op1) == REGNO (scratch));
17961
17962 x = gen_rtx_AND (vmode, scratch, mask);
17963 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17964
17965 dest = mask;
17966 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17967 x = gen_rtx_NOT (vmode, dest);
17968 x = gen_rtx_AND (vmode, x, op0);
17969 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17970 }
17971 else
17972 {
17973 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17974 {
17975 x = gen_rtx_AND (vmode, scratch, mask);
17976 }
17977 else /* alternative 2,4 */
17978 {
17979 gcc_assert (REGNO (mask) == REGNO (scratch));
17980 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17981 x = gen_rtx_AND (vmode, scratch, op1);
17982 }
17983 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17984
17985 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17986 {
17987 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17988 x = gen_rtx_AND (vmode, dest, nmask);
17989 }
17990 else /* alternative 3,4 */
17991 {
17992 gcc_assert (REGNO (nmask) == REGNO (dest));
17993 dest = nmask;
17994 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17995 x = gen_rtx_AND (vmode, dest, op0);
17996 }
17997 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17998 }
17999
18000 x = gen_rtx_IOR (vmode, dest, scratch);
18001 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18002 }
18003
18004 /* Return TRUE or FALSE depending on whether the first SET in INSN
18005 has source and destination with matching CC modes, and that the
18006 CC mode is at least as constrained as REQ_MODE. */
18007
18008 bool
18009 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18010 {
18011 rtx set;
18012 enum machine_mode set_mode;
18013
18014 set = PATTERN (insn);
18015 if (GET_CODE (set) == PARALLEL)
18016 set = XVECEXP (set, 0, 0);
18017 gcc_assert (GET_CODE (set) == SET);
18018 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18019
18020 set_mode = GET_MODE (SET_DEST (set));
18021 switch (set_mode)
18022 {
18023 case CCNOmode:
18024 if (req_mode != CCNOmode
18025 && (req_mode != CCmode
18026 || XEXP (SET_SRC (set), 1) != const0_rtx))
18027 return false;
18028 break;
18029 case CCmode:
18030 if (req_mode == CCGCmode)
18031 return false;
18032 /* FALLTHRU */
18033 case CCGCmode:
18034 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18035 return false;
18036 /* FALLTHRU */
18037 case CCGOCmode:
18038 if (req_mode == CCZmode)
18039 return false;
18040 /* FALLTHRU */
18041 case CCZmode:
18042 break;
18043
18044 case CCAmode:
18045 case CCCmode:
18046 case CCOmode:
18047 case CCSmode:
18048 if (set_mode != req_mode)
18049 return false;
18050 break;
18051
18052 default:
18053 gcc_unreachable ();
18054 }
18055
18056 return GET_MODE (SET_SRC (set)) == set_mode;
18057 }
18058
18059 /* Generate insn patterns to do an integer compare of OPERANDS. */
18060
18061 static rtx
18062 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18063 {
18064 enum machine_mode cmpmode;
18065 rtx tmp, flags;
18066
18067 cmpmode = SELECT_CC_MODE (code, op0, op1);
18068 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18069
18070 /* This is very simple, but making the interface the same as in the
18071 FP case makes the rest of the code easier. */
18072 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18073 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18074
18075 /* Return the test that should be put into the flags user, i.e.
18076 the bcc, scc, or cmov instruction. */
18077 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18078 }
18079
18080 /* Figure out whether to use ordered or unordered fp comparisons.
18081 Return the appropriate mode to use. */
18082
18083 enum machine_mode
18084 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18085 {
18086   /* ??? In order to make all comparisons reversible, we do all comparisons
18087      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
18088      all forms of trapping and nontrapping comparisons, we can make inequality
18089      comparisons trapping again, since that results in better code when using
18090      FCOM based compares.  */
18091 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18092 }
18093
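/* Return the CC mode that should be used to compare OP0 and OP1 with
   comparison code CODE.  */
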
18094 enum machine_mode
18095 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18096 {
18097 enum machine_mode mode = GET_MODE (op0);
18098
18099 if (SCALAR_FLOAT_MODE_P (mode))
18100 {
18101 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18102 return ix86_fp_compare_mode (code);
18103 }
18104
18105 switch (code)
18106 {
18107 /* Only zero flag is needed. */
18108 case EQ: /* ZF=0 */
18109 case NE: /* ZF!=0 */
18110 return CCZmode;
18111 /* Codes needing carry flag. */
18112 case GEU: /* CF=0 */
18113 case LTU: /* CF=1 */
18114 /* Detect overflow checks. They need just the carry flag. */
18115 if (GET_CODE (op0) == PLUS
18116 && rtx_equal_p (op1, XEXP (op0, 0)))
18117 return CCCmode;
18118 else
18119 return CCmode;
18120 case GTU: /* CF=0 & ZF=0 */
18121 case LEU: /* CF=1 | ZF=1 */
18122 /* Detect overflow checks. They need just the carry flag. */
18123 if (GET_CODE (op0) == MINUS
18124 && rtx_equal_p (op1, XEXP (op0, 0)))
18125 return CCCmode;
18126 else
18127 return CCmode;
18128 /* Codes possibly doable only with sign flag when
18129 comparing against zero. */
18130 case GE: /* SF=OF or SF=0 */
18131 case LT: /* SF<>OF or SF=1 */
18132 if (op1 == const0_rtx)
18133 return CCGOCmode;
18134 else
18135 /* For other cases Carry flag is not required. */
18136 return CCGCmode;
18137       /* Codes doable only with the sign flag when comparing
18138          against zero, but for which we lack a jump instruction,
18139          so we need to use relational tests against overflow,
18140          which thus needs to be zero.  */
18141 case GT: /* ZF=0 & SF=OF */
18142 case LE: /* ZF=1 | SF<>OF */
18143 if (op1 == const0_rtx)
18144 return CCNOmode;
18145 else
18146 return CCGCmode;
18147       /* The strcmp pattern does (use flags), and combine may ask us for the
18148          proper mode.  */
18149 case USE:
18150 return CCmode;
18151 default:
18152 gcc_unreachable ();
18153 }
18154 }
18155
18156 /* Return the fixed registers used for condition codes. */
18157
18158 static bool
18159 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18160 {
18161 *p1 = FLAGS_REG;
18162 *p2 = FPSR_REG;
18163 return true;
18164 }
18165
18166 /* If two condition code modes are compatible, return a condition code
18167 mode which is compatible with both. Otherwise, return
18168 VOIDmode. */
18169
18170 static enum machine_mode
18171 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18172 {
18173 if (m1 == m2)
18174 return m1;
18175
18176 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18177 return VOIDmode;
18178
18179 if ((m1 == CCGCmode && m2 == CCGOCmode)
18180 || (m1 == CCGOCmode && m2 == CCGCmode))
18181 return CCGCmode;
18182
18183 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18184 return m2;
18185 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18186 return m1;
18187
18188 switch (m1)
18189 {
18190 default:
18191 gcc_unreachable ();
18192
18193 case CCmode:
18194 case CCGCmode:
18195 case CCGOCmode:
18196 case CCNOmode:
18197 case CCAmode:
18198 case CCCmode:
18199 case CCOmode:
18200 case CCSmode:
18201 case CCZmode:
18202 switch (m2)
18203 {
18204 default:
18205 return VOIDmode;
18206
18207 case CCmode:
18208 case CCGCmode:
18209 case CCGOCmode:
18210 case CCNOmode:
18211 case CCAmode:
18212 case CCCmode:
18213 case CCOmode:
18214 case CCSmode:
18215 case CCZmode:
18216 return CCmode;
18217 }
18218
18219 case CCFPmode:
18220 case CCFPUmode:
18221 /* These are only compatible with themselves, which we already
18222 checked above. */
18223 return VOIDmode;
18224 }
18225 }
18226
18227
18228 /* Return a comparison we can do that is equivalent to
18229    swap_condition (code), apart possibly from orderedness.
18230    But never change orderedness if TARGET_IEEE_FP, returning
18231    UNKNOWN in that case if necessary.  */
18232
18233 static enum rtx_code
18234 ix86_fp_swap_condition (enum rtx_code code)
18235 {
18236 switch (code)
18237 {
18238 case GT: /* GTU - CF=0 & ZF=0 */
18239 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18240 case GE: /* GEU - CF=0 */
18241 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18242 case UNLT: /* LTU - CF=1 */
18243 return TARGET_IEEE_FP ? UNKNOWN : GT;
18244 case UNLE: /* LEU - CF=1 | ZF=1 */
18245 return TARGET_IEEE_FP ? UNKNOWN : GE;
18246 default:
18247 return swap_condition (code);
18248 }
18249 }
18250
18251 /* Return the cost of comparison CODE using the best strategy for performance.
18252    All of the following functions use the number of instructions as the cost
18253    metric.  In the future this should be tweaked to compute bytes for
18254    optimize_size and to take into account the performance of instructions on various CPUs.  */
18255
18256 static int
18257 ix86_fp_comparison_cost (enum rtx_code code)
18258 {
18259 int arith_cost;
18260
18261 /* The cost of code using bit-twiddling on %ah. */
18262 switch (code)
18263 {
18264 case UNLE:
18265 case UNLT:
18266 case LTGT:
18267 case GT:
18268 case GE:
18269 case UNORDERED:
18270 case ORDERED:
18271 case UNEQ:
18272 arith_cost = 4;
18273 break;
18274 case LT:
18275 case NE:
18276 case EQ:
18277 case UNGE:
18278 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18279 break;
18280 case LE:
18281 case UNGT:
18282 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18283 break;
18284 default:
18285 gcc_unreachable ();
18286 }
18287
18288 switch (ix86_fp_comparison_strategy (code))
18289 {
18290 case IX86_FPCMP_COMI:
18291 return arith_cost > 4 ? 3 : 2;
18292 case IX86_FPCMP_SAHF:
18293 return arith_cost > 4 ? 4 : 3;
18294 default:
18295 return arith_cost;
18296 }
18297 }
18298
18299 /* Return the strategy to use for a floating-point comparison.  We assume that
18300    fcomi is always preferable where available, since that is also true when
18301    looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18302
18303 enum ix86_fpcmp_strategy
18304 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18305 {
18306 /* Do fcomi/sahf based test when profitable. */
18307
18308 if (TARGET_CMOVE)
18309 return IX86_FPCMP_COMI;
18310
18311 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18312 return IX86_FPCMP_SAHF;
18313
18314 return IX86_FPCMP_ARITH;
18315 }
18316
18317 /* Swap, force into registers, or otherwise massage the two operands
18318 to a fp comparison. The operands are updated in place; the new
18319 comparison code is returned. */
18320
18321 static enum rtx_code
18322 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18323 {
18324 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18325 rtx op0 = *pop0, op1 = *pop1;
18326 enum machine_mode op_mode = GET_MODE (op0);
18327 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18328
18329 /* All of the unordered compare instructions only work on registers.
18330 The same is true of the fcomi compare instructions. The XFmode
18331 compare instructions require registers except when comparing
18332 against zero or when converting operand 1 from fixed point to
18333 floating point. */
18334
18335 if (!is_sse
18336 && (fpcmp_mode == CCFPUmode
18337 || (op_mode == XFmode
18338 && ! (standard_80387_constant_p (op0) == 1
18339 || standard_80387_constant_p (op1) == 1)
18340 && GET_CODE (op1) != FLOAT)
18341 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18342 {
18343 op0 = force_reg (op_mode, op0);
18344 op1 = force_reg (op_mode, op1);
18345 }
18346 else
18347 {
18348 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18349 things around if they appear profitable, otherwise force op0
18350 into a register. */
18351
18352 if (standard_80387_constant_p (op0) == 0
18353 || (MEM_P (op0)
18354 && ! (standard_80387_constant_p (op1) == 0
18355 || MEM_P (op1))))
18356 {
18357 enum rtx_code new_code = ix86_fp_swap_condition (code);
18358 if (new_code != UNKNOWN)
18359 {
18360 rtx tmp;
18361 tmp = op0, op0 = op1, op1 = tmp;
18362 code = new_code;
18363 }
18364 }
18365
18366 if (!REG_P (op0))
18367 op0 = force_reg (op_mode, op0);
18368
18369 if (CONSTANT_P (op1))
18370 {
18371 int tmp = standard_80387_constant_p (op1);
18372 if (tmp == 0)
18373 op1 = validize_mem (force_const_mem (op_mode, op1));
18374 else if (tmp == 1)
18375 {
18376 if (TARGET_CMOVE)
18377 op1 = force_reg (op_mode, op1);
18378 }
18379 else
18380 op1 = force_reg (op_mode, op1);
18381 }
18382 }
18383
18384 /* Try to rearrange the comparison to make it cheaper. */
18385 if (ix86_fp_comparison_cost (code)
18386 > ix86_fp_comparison_cost (swap_condition (code))
18387 && (REG_P (op1) || can_create_pseudo_p ()))
18388 {
18389 rtx tmp;
18390 tmp = op0, op0 = op1, op1 = tmp;
18391 code = swap_condition (code);
18392 if (!REG_P (op0))
18393 op0 = force_reg (op_mode, op0);
18394 }
18395
18396 *pop0 = op0;
18397 *pop1 = op1;
18398 return code;
18399 }
18400
18401 /* Convert comparison codes we use to represent FP comparison to integer
18402 code that will result in proper branch. Return UNKNOWN if no such code
18403 is available. */
18404
18405 enum rtx_code
18406 ix86_fp_compare_code_to_integer (enum rtx_code code)
18407 {
18408 switch (code)
18409 {
18410 case GT:
18411 return GTU;
18412 case GE:
18413 return GEU;
18414 case ORDERED:
18415 case UNORDERED:
18416 return code;
18417 break;
18418 case UNEQ:
18419 return EQ;
18420 break;
18421 case UNLT:
18422 return LTU;
18423 break;
18424 case UNLE:
18425 return LEU;
18426 break;
18427 case LTGT:
18428 return NE;
18429 break;
18430 default:
18431 return UNKNOWN;
18432 }
18433 }
18434
18435 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18436
18437 static rtx
18438 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18439 {
18440 enum machine_mode fpcmp_mode, intcmp_mode;
18441 rtx tmp, tmp2;
18442
18443 fpcmp_mode = ix86_fp_compare_mode (code);
18444 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18445
18446 /* Do fcomi/sahf based test when profitable. */
18447 switch (ix86_fp_comparison_strategy (code))
18448 {
18449 case IX86_FPCMP_COMI:
18450 intcmp_mode = fpcmp_mode;
18451 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18452 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18453 tmp);
18454 emit_insn (tmp);
18455 break;
18456
18457 case IX86_FPCMP_SAHF:
18458 intcmp_mode = fpcmp_mode;
18459 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18460 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18461 tmp);
18462
18463 if (!scratch)
18464 scratch = gen_reg_rtx (HImode);
18465 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18466 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18467 break;
18468
18469 case IX86_FPCMP_ARITH:
18470 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18471 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18472 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18473 if (!scratch)
18474 scratch = gen_reg_rtx (HImode);
18475 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18476
18477 /* In the unordered case, we have to check C2 for NaN's, which
18478 doesn't happen to work out to anything nice combination-wise.
18479 So do some bit twiddling on the value we've got in AH to come
18480 up with an appropriate set of condition codes. */
18481
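      /* After the fnstsw above, the high byte of SCRATCH holds C0 in bit 0
         (0x01), C2 in bit 2 (0x04) and C3 in bit 6 (0x40); a mask of 0x45
         therefore tests C3, C2 and C0 together.  */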
18482 intcmp_mode = CCNOmode;
18483 switch (code)
18484 {
18485 case GT:
18486 case UNGT:
18487 if (code == GT || !TARGET_IEEE_FP)
18488 {
18489 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18490 code = EQ;
18491 }
18492 else
18493 {
18494 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18495 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18496 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18497 intcmp_mode = CCmode;
18498 code = GEU;
18499 }
18500 break;
18501 case LT:
18502 case UNLT:
18503 if (code == LT && TARGET_IEEE_FP)
18504 {
18505 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18506 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18507 intcmp_mode = CCmode;
18508 code = EQ;
18509 }
18510 else
18511 {
18512 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18513 code = NE;
18514 }
18515 break;
18516 case GE:
18517 case UNGE:
18518 if (code == GE || !TARGET_IEEE_FP)
18519 {
18520 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18521 code = EQ;
18522 }
18523 else
18524 {
18525 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18526 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18527 code = NE;
18528 }
18529 break;
18530 case LE:
18531 case UNLE:
18532 if (code == LE && TARGET_IEEE_FP)
18533 {
18534 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18535 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18536 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18537 intcmp_mode = CCmode;
18538 code = LTU;
18539 }
18540 else
18541 {
18542 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18543 code = NE;
18544 }
18545 break;
18546 case EQ:
18547 case UNEQ:
18548 if (code == EQ && TARGET_IEEE_FP)
18549 {
18550 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18551 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18552 intcmp_mode = CCmode;
18553 code = EQ;
18554 }
18555 else
18556 {
18557 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18558 code = NE;
18559 }
18560 break;
18561 case NE:
18562 case LTGT:
18563 if (code == NE && TARGET_IEEE_FP)
18564 {
18565 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18566 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18567 GEN_INT (0x40)));
18568 code = NE;
18569 }
18570 else
18571 {
18572 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18573 code = EQ;
18574 }
18575 break;
18576
18577 case UNORDERED:
18578 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18579 code = NE;
18580 break;
18581 case ORDERED:
18582 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18583 code = EQ;
18584 break;
18585
18586 default:
18587 gcc_unreachable ();
18588 }
18589 break;
18590
18591 default:
18592       gcc_unreachable ();
18593 }
18594
18595 /* Return the test that should be put into the flags user, i.e.
18596 the bcc, scc, or cmov instruction. */
18597 return gen_rtx_fmt_ee (code, VOIDmode,
18598 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18599 const0_rtx);
18600 }
18601
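/* Emit whatever insns are needed to compare OP0 and OP1 with code CODE,
   and return the rtx describing the resulting flags test.  */
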
18602 static rtx
18603 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18604 {
18605 rtx ret;
18606
18607 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18608 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18609
18610 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18611 {
18612 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18613 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18614 }
18615 else
18616 ret = ix86_expand_int_compare (code, op0, op1);
18617
18618 return ret;
18619 }
18620
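/* Expand a conditional branch: compare OP0 and OP1 with code CODE and
   jump to LABEL if the condition holds.  */
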
18621 void
18622 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18623 {
18624 enum machine_mode mode = GET_MODE (op0);
18625 rtx tmp;
18626
18627 switch (mode)
18628 {
18629 case SFmode:
18630 case DFmode:
18631 case XFmode:
18632 case QImode:
18633 case HImode:
18634 case SImode:
18635 simple:
18636 tmp = ix86_expand_compare (code, op0, op1);
18637 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18638 gen_rtx_LABEL_REF (VOIDmode, label),
18639 pc_rtx);
18640 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18641 return;
18642
18643 case DImode:
18644 if (TARGET_64BIT)
18645 goto simple;
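      /* FALLTHRU */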
18646 case TImode:
18647       /* Expand a double-word (DImode or TImode) branch into multiple compare+branch.  */
18648 {
18649 rtx lo[2], hi[2], label2;
18650 enum rtx_code code1, code2, code3;
18651 enum machine_mode submode;
18652
18653 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18654 {
18655 tmp = op0, op0 = op1, op1 = tmp;
18656 code = swap_condition (code);
18657 }
18658
18659 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18660 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18661
18662 submode = mode == DImode ? SImode : DImode;
18663
18664 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18665 avoid two branches. This costs one extra insn, so disable when
18666 optimizing for size. */
18667
18668 if ((code == EQ || code == NE)
18669 && (!optimize_insn_for_size_p ()
18670 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18671 {
18672 rtx xor0, xor1;
18673
18674 xor1 = hi[0];
18675 if (hi[1] != const0_rtx)
18676 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18677 NULL_RTX, 0, OPTAB_WIDEN);
18678
18679 xor0 = lo[0];
18680 if (lo[1] != const0_rtx)
18681 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18682 NULL_RTX, 0, OPTAB_WIDEN);
18683
18684 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18685 NULL_RTX, 0, OPTAB_WIDEN);
18686
18687 ix86_expand_branch (code, tmp, const0_rtx, label);
18688 return;
18689 }
18690
18691         /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
18692            op1 is a constant and its low word is zero, then we can just
18693            examine the high word.  Similarly for a low word of -1 with
18694            less-or-equal or greater-than.  */
18695
18696 if (CONST_INT_P (hi[1]))
18697 switch (code)
18698 {
18699 case LT: case LTU: case GE: case GEU:
18700 if (lo[1] == const0_rtx)
18701 {
18702 ix86_expand_branch (code, hi[0], hi[1], label);
18703 return;
18704 }
18705 break;
18706 case LE: case LEU: case GT: case GTU:
18707 if (lo[1] == constm1_rtx)
18708 {
18709 ix86_expand_branch (code, hi[0], hi[1], label);
18710 return;
18711 }
18712 break;
18713 default:
18714 break;
18715 }
18716
18717 /* Otherwise, we need two or three jumps. */
18718
18719 label2 = gen_label_rtx ();
18720
18721 code1 = code;
18722 code2 = swap_condition (code);
18723 code3 = unsigned_condition (code);
18724
18725 switch (code)
18726 {
18727 case LT: case GT: case LTU: case GTU:
18728 break;
18729
18730 case LE: code1 = LT; code2 = GT; break;
18731 case GE: code1 = GT; code2 = LT; break;
18732 case LEU: code1 = LTU; code2 = GTU; break;
18733 case GEU: code1 = GTU; code2 = LTU; break;
18734
18735 case EQ: code1 = UNKNOWN; code2 = NE; break;
18736 case NE: code2 = UNKNOWN; break;
18737
18738 default:
18739 gcc_unreachable ();
18740 }
18741
18742 /*
18743 * a < b =>
18744 * if (hi(a) < hi(b)) goto true;
18745 * if (hi(a) > hi(b)) goto false;
18746 * if (lo(a) < lo(b)) goto true;
18747 * false:
18748 */
18749
18750 if (code1 != UNKNOWN)
18751 ix86_expand_branch (code1, hi[0], hi[1], label);
18752 if (code2 != UNKNOWN)
18753 ix86_expand_branch (code2, hi[0], hi[1], label2);
18754
18755 ix86_expand_branch (code3, lo[0], lo[1], label);
18756
18757 if (code2 != UNKNOWN)
18758 emit_label (label2);
18759 return;
18760 }
18761
18762 default:
18763 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18764 goto simple;
18765 }
18766 }
18767
18768 /* Split branch based on floating point condition. */
18769 void
18770 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18771 rtx target1, rtx target2, rtx tmp, rtx pushed)
18772 {
18773 rtx condition;
18774 rtx i;
18775
18776 if (target2 != pc_rtx)
18777 {
18778 rtx tmp = target2;
18779 code = reverse_condition_maybe_unordered (code);
18780 target2 = target1;
18781 target1 = tmp;
18782 }
18783
18784 condition = ix86_expand_fp_compare (code, op1, op2,
18785 tmp);
18786
18787 /* Remove pushed operand from stack. */
18788 if (pushed)
18789 ix86_free_from_memory (GET_MODE (pushed));
18790
18791 i = emit_jump_insn (gen_rtx_SET
18792 (VOIDmode, pc_rtx,
18793 gen_rtx_IF_THEN_ELSE (VOIDmode,
18794 condition, target1, target2)));
18795 if (split_branch_probability >= 0)
18796 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18797 }
18798
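/* Expand a setcc: compare OP0 and OP1 with code CODE and store the
   QImode result in DEST.  */
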
18799 void
18800 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18801 {
18802 rtx ret;
18803
18804 gcc_assert (GET_MODE (dest) == QImode);
18805
18806 ret = ix86_expand_compare (code, op0, op1);
18807 PUT_MODE (ret, QImode);
18808 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18809 }
18810
18811 /* Expand a comparison setting or clearing the carry flag.  Return true when
18812    successful, setting *POP to the comparison operation.  */
18813 static bool
18814 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18815 {
18816 enum machine_mode mode =
18817 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18818
18819   /* Do not handle double-mode compares; they go through a special path.  */
18820 if (mode == (TARGET_64BIT ? TImode : DImode))
18821 return false;
18822
18823 if (SCALAR_FLOAT_MODE_P (mode))
18824 {
18825 rtx compare_op, compare_seq;
18826
18827 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18828
18829       /* Shortcut: the following common codes never translate
18830          into carry flag compares.  */
18831 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18832 || code == ORDERED || code == UNORDERED)
18833 return false;
18834
18835       /* These comparisons require the zero flag; swap the operands so they won't.  */
18836 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18837 && !TARGET_IEEE_FP)
18838 {
18839 rtx tmp = op0;
18840 op0 = op1;
18841 op1 = tmp;
18842 code = swap_condition (code);
18843 }
18844
18845       /* Try to expand the comparison and verify that we end up with
18846          a carry-flag-based comparison.  This fails only when we decide
18847          to expand the comparison using arithmetic, which is not
18848          a common scenario.  */
18849 start_sequence ();
18850 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18851 compare_seq = get_insns ();
18852 end_sequence ();
18853
18854 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18855 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18856 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18857 else
18858 code = GET_CODE (compare_op);
18859
18860 if (code != LTU && code != GEU)
18861 return false;
18862
18863 emit_insn (compare_seq);
18864 *pop = compare_op;
18865 return true;
18866 }
18867
18868 if (!INTEGRAL_MODE_P (mode))
18869 return false;
18870
18871 switch (code)
18872 {
18873 case LTU:
18874 case GEU:
18875 break;
18876
18877 /* Convert a==0 into (unsigned)a<1. */
18878 case EQ:
18879 case NE:
18880 if (op1 != const0_rtx)
18881 return false;
18882 op1 = const1_rtx;
18883 code = (code == EQ ? LTU : GEU);
18884 break;
18885
18886       /* Convert a>b into b<a or a>=b+1.  */
18887 case GTU:
18888 case LEU:
18889 if (CONST_INT_P (op1))
18890 {
18891 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18892           /* Bail out on overflow.  We could still swap the operands, but that
18893              would force loading the constant into a register.  */
18894 if (op1 == const0_rtx
18895 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18896 return false;
18897 code = (code == GTU ? GEU : LTU);
18898 }
18899 else
18900 {
18901 rtx tmp = op1;
18902 op1 = op0;
18903 op0 = tmp;
18904 code = (code == GTU ? LTU : GEU);
18905 }
18906 break;
18907
18908 /* Convert a>=0 into (unsigned)a<0x80000000. */
18909 case LT:
18910 case GE:
18911 if (mode == DImode || op1 != const0_rtx)
18912 return false;
18913 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18914 code = (code == LT ? GEU : LTU);
18915 break;
18916 case LE:
18917 case GT:
18918 if (mode == DImode || op1 != constm1_rtx)
18919 return false;
18920 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18921 code = (code == LE ? GEU : LTU);
18922 break;
18923
18924 default:
18925 return false;
18926 }
18927   /* Swapping operands may cause the constant to appear as the first operand.  */
18928 if (!nonimmediate_operand (op0, VOIDmode))
18929 {
18930 if (!can_create_pseudo_p ())
18931 return false;
18932 op0 = force_reg (mode, op0);
18933 }
18934 *pop = ix86_expand_compare (code, op0, op1);
18935 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18936 return true;
18937 }
18938
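/* Expand an integer conditional move.  OPERANDS[1] is the comparison,
   OPERANDS[2] and OPERANDS[3] are the values to select between and
   OPERANDS[0] is the destination.  Return true if successful.  */
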
18939 bool
18940 ix86_expand_int_movcc (rtx operands[])
18941 {
18942 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18943 rtx compare_seq, compare_op;
18944 enum machine_mode mode = GET_MODE (operands[0]);
18945 bool sign_bit_compare_p = false;
18946 rtx op0 = XEXP (operands[1], 0);
18947 rtx op1 = XEXP (operands[1], 1);
18948
18949 if (GET_MODE (op0) == TImode
18950 || (GET_MODE (op0) == DImode
18951 && !TARGET_64BIT))
18952 return false;
18953
18954 start_sequence ();
18955 compare_op = ix86_expand_compare (code, op0, op1);
18956 compare_seq = get_insns ();
18957 end_sequence ();
18958
18959 compare_code = GET_CODE (compare_op);
18960
18961 if ((op1 == const0_rtx && (code == GE || code == LT))
18962 || (op1 == constm1_rtx && (code == GT || code == LE)))
18963 sign_bit_compare_p = true;
18964
18965 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18966 HImode insns, we'd be swallowed in word prefix ops. */
18967
18968 if ((mode != HImode || TARGET_FAST_PREFIX)
18969 && (mode != (TARGET_64BIT ? TImode : DImode))
18970 && CONST_INT_P (operands[2])
18971 && CONST_INT_P (operands[3]))
18972 {
18973 rtx out = operands[0];
18974 HOST_WIDE_INT ct = INTVAL (operands[2]);
18975 HOST_WIDE_INT cf = INTVAL (operands[3]);
18976 HOST_WIDE_INT diff;
18977
18978 diff = ct - cf;
18979       /* Sign bit compares are better done using shifts than using
18980          sbb.  */
18981 if (sign_bit_compare_p
18982 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18983 {
18984 /* Detect overlap between destination and compare sources. */
18985 rtx tmp = out;
18986
18987 if (!sign_bit_compare_p)
18988 {
18989 rtx flags;
18990 bool fpcmp = false;
18991
18992 compare_code = GET_CODE (compare_op);
18993
18994 flags = XEXP (compare_op, 0);
18995
18996 if (GET_MODE (flags) == CCFPmode
18997 || GET_MODE (flags) == CCFPUmode)
18998 {
18999 fpcmp = true;
19000 compare_code
19001 = ix86_fp_compare_code_to_integer (compare_code);
19002 }
19003
19004               /* To simplify the rest of the code, restrict to the GEU case.  */
19005 if (compare_code == LTU)
19006 {
19007 HOST_WIDE_INT tmp = ct;
19008 ct = cf;
19009 cf = tmp;
19010 compare_code = reverse_condition (compare_code);
19011 code = reverse_condition (code);
19012 }
19013 else
19014 {
19015 if (fpcmp)
19016 PUT_CODE (compare_op,
19017 reverse_condition_maybe_unordered
19018 (GET_CODE (compare_op)));
19019 else
19020 PUT_CODE (compare_op,
19021 reverse_condition (GET_CODE (compare_op)));
19022 }
19023 diff = ct - cf;
19024
19025 if (reg_overlap_mentioned_p (out, op0)
19026 || reg_overlap_mentioned_p (out, op1))
19027 tmp = gen_reg_rtx (mode);
19028
19029 if (mode == DImode)
19030 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19031 else
19032 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19033 flags, compare_op));
19034 }
19035 else
19036 {
19037 if (code == GT || code == GE)
19038 code = reverse_condition (code);
19039 else
19040 {
19041 HOST_WIDE_INT tmp = ct;
19042 ct = cf;
19043 cf = tmp;
19044 diff = ct - cf;
19045 }
19046 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19047 }
19048
19049 if (diff == 1)
19050 {
19051 /*
19052 * cmpl op0,op1
19053 * sbbl dest,dest
19054 * [addl dest, ct]
19055 *
19056 * Size 5 - 8.
19057 */
19058 if (ct)
19059 tmp = expand_simple_binop (mode, PLUS,
19060 tmp, GEN_INT (ct),
19061 copy_rtx (tmp), 1, OPTAB_DIRECT);
19062 }
19063 else if (cf == -1)
19064 {
19065 /*
19066 * cmpl op0,op1
19067 * sbbl dest,dest
19068 * orl $ct, dest
19069 *
19070 * Size 8.
19071 */
19072 tmp = expand_simple_binop (mode, IOR,
19073 tmp, GEN_INT (ct),
19074 copy_rtx (tmp), 1, OPTAB_DIRECT);
19075 }
19076 else if (diff == -1 && ct)
19077 {
19078 /*
19079 * cmpl op0,op1
19080 * sbbl dest,dest
19081 * notl dest
19082 * [addl dest, cf]
19083 *
19084 * Size 8 - 11.
19085 */
19086 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19087 if (cf)
19088 tmp = expand_simple_binop (mode, PLUS,
19089 copy_rtx (tmp), GEN_INT (cf),
19090 copy_rtx (tmp), 1, OPTAB_DIRECT);
19091 }
19092 else
19093 {
19094 /*
19095 * cmpl op0,op1
19096 * sbbl dest,dest
19097 * [notl dest]
19098 * andl cf - ct, dest
19099 * [addl dest, ct]
19100 *
19101 * Size 8 - 11.
19102 */
19103
19104 if (cf == 0)
19105 {
19106 cf = ct;
19107 ct = 0;
19108 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19109 }
19110
19111 tmp = expand_simple_binop (mode, AND,
19112 copy_rtx (tmp),
19113 gen_int_mode (cf - ct, mode),
19114 copy_rtx (tmp), 1, OPTAB_DIRECT);
19115 if (ct)
19116 tmp = expand_simple_binop (mode, PLUS,
19117 copy_rtx (tmp), GEN_INT (ct),
19118 copy_rtx (tmp), 1, OPTAB_DIRECT);
19119 }
19120
19121 if (!rtx_equal_p (tmp, out))
19122 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19123
19124 return true;
19125 }
19126
19127 if (diff < 0)
19128 {
19129 enum machine_mode cmp_mode = GET_MODE (op0);
19130
19131 HOST_WIDE_INT tmp;
19132 tmp = ct, ct = cf, cf = tmp;
19133 diff = -diff;
19134
19135 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19136 {
19137 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19138
19139               /* We may be reversing an unordered compare to a normal compare, which
19140                  is not valid in general (we may convert a non-trapping condition
19141                  to a trapping one); however, on i386 we currently emit all
19142                  comparisons unordered.  */
19143 compare_code = reverse_condition_maybe_unordered (compare_code);
19144 code = reverse_condition_maybe_unordered (code);
19145 }
19146 else
19147 {
19148 compare_code = reverse_condition (compare_code);
19149 code = reverse_condition (code);
19150 }
19151 }
19152
19153 compare_code = UNKNOWN;
19154 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19155 && CONST_INT_P (op1))
19156 {
19157 if (op1 == const0_rtx
19158 && (code == LT || code == GE))
19159 compare_code = code;
19160 else if (op1 == constm1_rtx)
19161 {
19162 if (code == LE)
19163 compare_code = LT;
19164 else if (code == GT)
19165 compare_code = GE;
19166 }
19167 }
19168
19169 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19170 if (compare_code != UNKNOWN
19171 && GET_MODE (op0) == GET_MODE (out)
19172 && (cf == -1 || ct == -1))
19173 {
19174           /* If the lea code below could be used, only optimize
19175              if it results in a 2-insn sequence.  */
19176
19177 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19178 || diff == 3 || diff == 5 || diff == 9)
19179 || (compare_code == LT && ct == -1)
19180 || (compare_code == GE && cf == -1))
19181 {
19182 /*
19183 * notl op1 (if necessary)
19184 * sarl $31, op1
19185 * orl cf, op1
19186 */
19187 if (ct != -1)
19188 {
19189 cf = ct;
19190 ct = -1;
19191 code = reverse_condition (code);
19192 }
19193
19194 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19195
19196 out = expand_simple_binop (mode, IOR,
19197 out, GEN_INT (cf),
19198 out, 1, OPTAB_DIRECT);
19199 if (out != operands[0])
19200 emit_move_insn (operands[0], out);
19201
19202 return true;
19203 }
19204 }
19205
19206
19207 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19208 || diff == 3 || diff == 5 || diff == 9)
19209 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19210 && (mode != DImode
19211 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19212 {
19213 /*
19214 * xorl dest,dest
19215 * cmpl op1,op2
19216 * setcc dest
19217 * lea cf(dest*(ct-cf)),dest
19218 *
19219 * Size 14.
19220 *
19221 * This also catches the degenerate setcc-only case.
19222 */
19223
19224 rtx tmp;
19225 int nops;
19226
19227 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19228
19229 nops = 0;
19230           /* On x86_64 the lea instruction operates on Pmode, so we need
19231              to get the arithmetic done in the proper mode to match.  */
19232 if (diff == 1)
19233 tmp = copy_rtx (out);
19234 else
19235 {
19236 rtx out1;
19237 out1 = copy_rtx (out);
19238 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19239 nops++;
19240 if (diff & 1)
19241 {
19242 tmp = gen_rtx_PLUS (mode, tmp, out1);
19243 nops++;
19244 }
19245 }
19246 if (cf != 0)
19247 {
19248 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19249 nops++;
19250 }
19251 if (!rtx_equal_p (tmp, out))
19252 {
19253 if (nops == 1)
19254 out = force_operand (tmp, copy_rtx (out));
19255 else
19256 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19257 }
19258 if (!rtx_equal_p (out, operands[0]))
19259 emit_move_insn (operands[0], copy_rtx (out));
19260
19261 return true;
19262 }
19263
19264 /*
19265 * General case: Jumpful:
19266 * xorl dest,dest cmpl op1, op2
19267 * cmpl op1, op2 movl ct, dest
19268 * setcc dest jcc 1f
19269 * decl dest movl cf, dest
19270 * andl (cf-ct),dest 1:
19271 * addl ct,dest
19272 *
19273 * Size 20. Size 14.
19274 *
19275 * This is reasonably steep, but branch mispredict costs are
19276 * high on modern cpus, so consider failing only if optimizing
19277 * for space.
19278 */
19279
19280 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19281 && BRANCH_COST (optimize_insn_for_speed_p (),
19282 false) >= 2)
19283 {
19284 if (cf == 0)
19285 {
19286 enum machine_mode cmp_mode = GET_MODE (op0);
19287
19288 cf = ct;
19289 ct = 0;
19290
19291 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19292 {
19293 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19294
19295               /* We may be reversing an unordered compare to a normal compare,
19296                  which is not valid in general (we may convert a non-trapping
19297                  condition to a trapping one); however, on i386 we currently
19298                  emit all comparisons unordered.  */
19299 code = reverse_condition_maybe_unordered (code);
19300 }
19301 else
19302 {
19303 code = reverse_condition (code);
19304 if (compare_code != UNKNOWN)
19305 compare_code = reverse_condition (compare_code);
19306 }
19307 }
19308
19309 if (compare_code != UNKNOWN)
19310 {
19311 /* notl op1 (if needed)
19312 sarl $31, op1
19313 andl (cf-ct), op1
19314 addl ct, op1
19315
19316 For x < 0 (resp. x <= -1) there will be no notl,
19317 so if possible swap the constants to get rid of the
19318 complement.
19319 True/false will be -1/0 while code below (store flag
19320 followed by decrement) is 0/-1, so the constants need
19321 to be exchanged once more. */
19322
19323 if (compare_code == GE || !cf)
19324 {
19325 code = reverse_condition (code);
19326 compare_code = LT;
19327 }
19328 else
19329 {
19330 HOST_WIDE_INT tmp = cf;
19331 cf = ct;
19332 ct = tmp;
19333 }
19334
19335 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19336 }
19337 else
19338 {
19339 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19340
19341 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19342 constm1_rtx,
19343 copy_rtx (out), 1, OPTAB_DIRECT);
19344 }
19345
19346 out = expand_simple_binop (mode, AND, copy_rtx (out),
19347 gen_int_mode (cf - ct, mode),
19348 copy_rtx (out), 1, OPTAB_DIRECT);
19349 if (ct)
19350 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19351 copy_rtx (out), 1, OPTAB_DIRECT);
19352 if (!rtx_equal_p (out, operands[0]))
19353 emit_move_insn (operands[0], copy_rtx (out));
19354
19355 return true;
19356 }
19357 }
19358
19359 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19360 {
19361 /* Try a few things more with specific constants and a variable. */
19362
19363 optab op;
19364 rtx var, orig_out, out, tmp;
19365
19366 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19367 return false;
19368
19369       /* If one of the two operands is an interesting constant, load a matching
19370          all-zeros or all-ones constant and mask the variable in with a logical operation.  */
19371
19372 if (CONST_INT_P (operands[2]))
19373 {
19374 var = operands[3];
19375 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19376 operands[3] = constm1_rtx, op = and_optab;
19377 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19378 operands[3] = const0_rtx, op = ior_optab;
19379 else
19380 return false;
19381 }
19382 else if (CONST_INT_P (operands[3]))
19383 {
19384 var = operands[2];
19385 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19386 operands[2] = constm1_rtx, op = and_optab;
19387           else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19388 operands[2] = const0_rtx, op = ior_optab;
19389 else
19390 return false;
19391 }
19392 else
19393 return false;
19394
19395 orig_out = operands[0];
19396 tmp = gen_reg_rtx (mode);
19397 operands[0] = tmp;
19398
19399 /* Recurse to get the constant loaded. */
19400 if (ix86_expand_int_movcc (operands) == 0)
19401 return false;
19402
19403 /* Mask in the interesting variable. */
19404 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19405 OPTAB_WIDEN);
19406 if (!rtx_equal_p (out, orig_out))
19407 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19408
19409 return true;
19410 }
19411
19412 /*
19413 * For comparison with above,
19414 *
19415 * movl cf,dest
19416 * movl ct,tmp
19417 * cmpl op1,op2
19418 * cmovcc tmp,dest
19419 *
19420 * Size 15.
19421 */
19422
19423 if (! nonimmediate_operand (operands[2], mode))
19424 operands[2] = force_reg (mode, operands[2]);
19425 if (! nonimmediate_operand (operands[3], mode))
19426 operands[3] = force_reg (mode, operands[3]);
19427
19428 if (! register_operand (operands[2], VOIDmode)
19429 && (mode == QImode
19430 || ! register_operand (operands[3], VOIDmode)))
19431 operands[2] = force_reg (mode, operands[2]);
19432
19433 if (mode == QImode
19434 && ! register_operand (operands[3], VOIDmode))
19435 operands[3] = force_reg (mode, operands[3]);
19436
19437 emit_insn (compare_seq);
19438 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19439 gen_rtx_IF_THEN_ELSE (mode,
19440 compare_op, operands[2],
19441 operands[3])));
19442 return true;
19443 }
19444
19445 /* Swap, force into registers, or otherwise massage the two operands
19446 to an sse comparison with a mask result. Thus we differ a bit from
19447 ix86_prepare_fp_compare_args which expects to produce a flags result.
19448
19449 The DEST operand exists to help determine whether to commute commutative
19450 operators. The POP0/POP1 operands are updated in place. The new
19451 comparison code is returned, or UNKNOWN if not implementable. */
19452
19453 static enum rtx_code
19454 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19455 rtx *pop0, rtx *pop1)
19456 {
19457 rtx tmp;
19458
19459 switch (code)
19460 {
19461 case LTGT:
19462 case UNEQ:
19463 /* AVX supports all the needed comparisons. */
19464 if (TARGET_AVX)
19465 break;
19466 /* We have no LTGT as an operator. We could implement it with
19467 NE & ORDERED, but this requires an extra temporary. It's
19468 not clear that it's worth it. */
19469 return UNKNOWN;
19470
19471 case LT:
19472 case LE:
19473 case UNGT:
19474 case UNGE:
19475 /* These are supported directly. */
19476 break;
19477
19478 case EQ:
19479 case NE:
19480 case UNORDERED:
19481 case ORDERED:
19482 /* AVX has 3 operand comparisons, no need to swap anything. */
19483 if (TARGET_AVX)
19484 break;
19485 /* For commutative operators, try to canonicalize the destination
19486 operand to be first in the comparison - this helps reload to
19487 avoid extra moves. */
19488 if (!dest || !rtx_equal_p (dest, *pop1))
19489 break;
19490 /* FALLTHRU */
19491
19492 case GE:
19493 case GT:
19494 case UNLE:
19495 case UNLT:
19496 /* These are not supported directly before AVX, and furthermore
19497 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19498 comparison operands to transform into something that is
19499 supported. */
19500 tmp = *pop0;
19501 *pop0 = *pop1;
19502 *pop1 = tmp;
19503 code = swap_condition (code);
19504 break;
19505
19506 default:
19507 gcc_unreachable ();
19508 }
19509
19510 return code;
19511 }
19512
19513 /* Detect conditional moves that exactly match min/max operational
19514 semantics. Note that this is IEEE safe, as long as we don't
19515 interchange the operands.
19516
19517 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19518 and TRUE if the operation is successful and instructions are emitted. */
19519
19520 static bool
19521 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19522 rtx cmp_op1, rtx if_true, rtx if_false)
19523 {
19524 enum machine_mode mode;
19525 bool is_min;
19526 rtx tmp;
19527
19528 if (code == LT)
19529 ;
19530 else if (code == UNGE)
19531 {
19532 tmp = if_true;
19533 if_true = if_false;
19534 if_false = tmp;
19535 }
19536 else
19537 return false;
19538
19539 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19540 is_min = true;
19541 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19542 is_min = false;
19543 else
19544 return false;
19545
19546 mode = GET_MODE (dest);
19547
19548 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19549 but MODE may be a vector mode and thus not appropriate. */
19550 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19551 {
19552 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19553 rtvec v;
19554
19555 if_true = force_reg (mode, if_true);
19556 v = gen_rtvec (2, if_true, if_false);
19557 tmp = gen_rtx_UNSPEC (mode, v, u);
19558 }
19559 else
19560 {
19561 code = is_min ? SMIN : SMAX;
19562 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19563 }
19564
19565 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19566 return true;
19567 }
19568
19569 /* Expand an sse vector comparison. Return the register with the result. */
19570
19571 static rtx
19572 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19573 rtx op_true, rtx op_false)
19574 {
19575 enum machine_mode mode = GET_MODE (dest);
19576 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19577 rtx x;
19578
19579 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19580 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19581 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19582
19583 if (optimize
19584 || reg_overlap_mentioned_p (dest, op_true)
19585 || reg_overlap_mentioned_p (dest, op_false))
19586 dest = gen_reg_rtx (mode);
19587
19588 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19589 if (cmp_mode != mode)
19590 {
19591 x = force_reg (cmp_mode, x);
19592 convert_move (dest, x, false);
19593 }
19594 else
19595 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19596
19597 return dest;
19598 }
19599
19600 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19601 operations. This is used for both scalar and vector conditional moves. */
19602
19603 static void
19604 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19605 {
19606 enum machine_mode mode = GET_MODE (dest);
19607 rtx t2, t3, x;
19608
19609 if (vector_all_ones_operand (op_true, mode)
19610 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19611 {
19612 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19613 }
19614 else if (op_false == CONST0_RTX (mode))
19615 {
19616 op_true = force_reg (mode, op_true);
19617 x = gen_rtx_AND (mode, cmp, op_true);
19618 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19619 }
19620 else if (op_true == CONST0_RTX (mode))
19621 {
19622 op_false = force_reg (mode, op_false);
19623 x = gen_rtx_NOT (mode, cmp);
19624 x = gen_rtx_AND (mode, x, op_false);
19625 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19626 }
19627 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19628 {
19629 op_false = force_reg (mode, op_false);
19630 x = gen_rtx_IOR (mode, cmp, op_false);
19631 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19632 }
19633 else if (TARGET_XOP)
19634 {
19635 op_true = force_reg (mode, op_true);
19636
19637 if (!nonimmediate_operand (op_false, mode))
19638 op_false = force_reg (mode, op_false);
19639
19640 emit_insn (gen_rtx_SET (mode, dest,
19641 gen_rtx_IF_THEN_ELSE (mode, cmp,
19642 op_true,
19643 op_false)));
19644 }
19645 else
19646 {
19647 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19648
19649 if (!nonimmediate_operand (op_true, mode))
19650 op_true = force_reg (mode, op_true);
19651
19652 op_false = force_reg (mode, op_false);
19653
19654 switch (mode)
19655 {
19656 case V4SFmode:
19657 if (TARGET_SSE4_1)
19658 gen = gen_sse4_1_blendvps;
19659 break;
19660 case V2DFmode:
19661 if (TARGET_SSE4_1)
19662 gen = gen_sse4_1_blendvpd;
19663 break;
19664 case V16QImode:
19665 case V8HImode:
19666 case V4SImode:
19667 case V2DImode:
19668 if (TARGET_SSE4_1)
19669 {
19670 gen = gen_sse4_1_pblendvb;
19671 dest = gen_lowpart (V16QImode, dest);
19672 op_false = gen_lowpart (V16QImode, op_false);
19673 op_true = gen_lowpart (V16QImode, op_true);
19674 cmp = gen_lowpart (V16QImode, cmp);
19675 }
19676 break;
19677 case V8SFmode:
19678 if (TARGET_AVX)
19679 gen = gen_avx_blendvps256;
19680 break;
19681 case V4DFmode:
19682 if (TARGET_AVX)
19683 gen = gen_avx_blendvpd256;
19684 break;
19685 case V32QImode:
19686 case V16HImode:
19687 case V8SImode:
19688 case V4DImode:
19689 if (TARGET_AVX2)
19690 {
19691 gen = gen_avx2_pblendvb;
19692 dest = gen_lowpart (V32QImode, dest);
19693 op_false = gen_lowpart (V32QImode, op_false);
19694 op_true = gen_lowpart (V32QImode, op_true);
19695 cmp = gen_lowpart (V32QImode, cmp);
19696 }
19697 break;
19698 default:
19699 break;
19700 }
19701
19702 if (gen != NULL)
19703 emit_insn (gen (dest, op_false, op_true, cmp));
19704 else
19705 {
19706 op_true = force_reg (mode, op_true);
19707
19708 t2 = gen_reg_rtx (mode);
19709 if (optimize)
19710 t3 = gen_reg_rtx (mode);
19711 else
19712 t3 = dest;
19713
19714 x = gen_rtx_AND (mode, op_true, cmp);
19715 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19716
19717 x = gen_rtx_NOT (mode, cmp);
19718 x = gen_rtx_AND (mode, x, op_false);
19719 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19720
19721 x = gen_rtx_IOR (mode, t3, t2);
19722 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19723 }
19724 }
19725 }
19726
19727 /* Expand a floating-point conditional move. Return true if successful. */
19728
19729 bool
19730 ix86_expand_fp_movcc (rtx operands[])
19731 {
19732 enum machine_mode mode = GET_MODE (operands[0]);
19733 enum rtx_code code = GET_CODE (operands[1]);
19734 rtx tmp, compare_op;
19735 rtx op0 = XEXP (operands[1], 0);
19736 rtx op1 = XEXP (operands[1], 1);
19737
19738 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19739 {
19740 enum machine_mode cmode;
19741
19742       /* Since we have no cmove for sse registers, don't force bad register
19743          allocation just to gain access to it.  Deny movcc when the
19744          comparison mode doesn't match the move mode.  */
19745 cmode = GET_MODE (op0);
19746 if (cmode == VOIDmode)
19747 cmode = GET_MODE (op1);
19748 if (cmode != mode)
19749 return false;
19750
19751 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19752 if (code == UNKNOWN)
19753 return false;
19754
19755 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19756 operands[2], operands[3]))
19757 return true;
19758
19759 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19760 operands[2], operands[3]);
19761 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19762 return true;
19763 }
19764
19765 /* The floating point conditional move instructions don't directly
19766 support conditions resulting from a signed integer comparison. */
19767
19768 compare_op = ix86_expand_compare (code, op0, op1);
19769 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19770 {
19771 tmp = gen_reg_rtx (QImode);
19772 ix86_expand_setcc (tmp, code, op0, op1);
19773
19774 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19775 }
19776
19777 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19778 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19779 operands[2], operands[3])));
19780
19781 return true;
19782 }
19783
19784 /* Expand a floating-point vector conditional move; a vcond operation
19785 rather than a movcc operation. */
19786
19787 bool
19788 ix86_expand_fp_vcond (rtx operands[])
19789 {
19790 enum rtx_code code = GET_CODE (operands[3]);
19791 rtx cmp;
19792
19793 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19794 &operands[4], &operands[5]);
19795 if (code == UNKNOWN)
19796 {
19797 rtx temp;
19798 switch (GET_CODE (operands[3]))
19799 {
19800 case LTGT:
19801 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19802 operands[5], operands[0], operands[0]);
19803 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19804 operands[5], operands[1], operands[2]);
19805 code = AND;
19806 break;
19807 case UNEQ:
19808 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19809 operands[5], operands[0], operands[0]);
19810 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19811 operands[5], operands[1], operands[2]);
19812 code = IOR;
19813 break;
19814 default:
19815 gcc_unreachable ();
19816 }
19817 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19818 OPTAB_DIRECT);
19819 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19820 return true;
19821 }
19822
19823 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19824 operands[5], operands[1], operands[2]))
19825 return true;
19826
19827 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19828 operands[1], operands[2]);
19829 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19830 return true;
19831 }
19832
19833 /* Expand a signed/unsigned integral vector conditional move. */
19834
19835 bool
19836 ix86_expand_int_vcond (rtx operands[])
19837 {
19838 enum machine_mode data_mode = GET_MODE (operands[0]);
19839 enum machine_mode mode = GET_MODE (operands[4]);
19840 enum rtx_code code = GET_CODE (operands[3]);
19841 bool negate = false;
19842 rtx x, cop0, cop1;
19843
19844 cop0 = operands[4];
19845 cop1 = operands[5];
19846
19847 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19848 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19849 if ((code == LT || code == GE)
19850 && data_mode == mode
19851 && cop1 == CONST0_RTX (mode)
19852 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19853 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19854 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19855 && (GET_MODE_SIZE (data_mode) == 16
19856 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19857 {
19858 rtx negop = operands[2 - (code == LT)];
19859 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19860 if (negop == CONST1_RTX (data_mode))
19861 {
19862 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19863 operands[0], 1, OPTAB_DIRECT);
19864 if (res != operands[0])
19865 emit_move_insn (operands[0], res);
19866 return true;
19867 }
19868 else if (GET_MODE_INNER (data_mode) != DImode
19869 && vector_all_ones_operand (negop, data_mode))
19870 {
19871 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19872 operands[0], 0, OPTAB_DIRECT);
19873 if (res != operands[0])
19874 emit_move_insn (operands[0], res);
19875 return true;
19876 }
19877 }
19878
19879 if (!nonimmediate_operand (cop1, mode))
19880 cop1 = force_reg (mode, cop1);
19881 if (!general_operand (operands[1], data_mode))
19882 operands[1] = force_reg (data_mode, operands[1]);
19883 if (!general_operand (operands[2], data_mode))
19884 operands[2] = force_reg (data_mode, operands[2]);
19885
19886 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19887 if (TARGET_XOP
19888 && (mode == V16QImode || mode == V8HImode
19889 || mode == V4SImode || mode == V2DImode))
19890 ;
19891 else
19892 {
19893 /* Canonicalize the comparison to EQ, GT, GTU. */
19894 switch (code)
19895 {
19896 case EQ:
19897 case GT:
19898 case GTU:
19899 break;
19900
19901 case NE:
19902 case LE:
19903 case LEU:
19904 code = reverse_condition (code);
19905 negate = true;
19906 break;
19907
19908 case GE:
19909 case GEU:
19910 code = reverse_condition (code);
19911 negate = true;
19912 /* FALLTHRU */
19913
19914 case LT:
19915 case LTU:
19916 code = swap_condition (code);
19917 x = cop0, cop0 = cop1, cop1 = x;
19918 break;
19919
19920 default:
19921 gcc_unreachable ();
19922 }
19923
19924 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19925 if (mode == V2DImode)
19926 {
19927 switch (code)
19928 {
19929 case EQ:
19930 /* SSE4.1 supports EQ. */
19931 if (!TARGET_SSE4_1)
19932 return false;
19933 break;
19934
19935 case GT:
19936 case GTU:
19937 /* SSE4.2 supports GT/GTU. */
19938 if (!TARGET_SSE4_2)
19939 return false;
19940 break;
19941
19942 default:
19943 gcc_unreachable ();
19944 }
19945 }
19946
19947 /* Unsigned parallel compare is not supported by the hardware.
19948 Play some tricks to turn this into a signed comparison
19949 against 0. */
19950 if (code == GTU)
19951 {
19952 cop0 = force_reg (mode, cop0);
19953
19954 switch (mode)
19955 {
19956 case V8SImode:
19957 case V4DImode:
19958 case V4SImode:
19959 case V2DImode:
19960 {
19961 rtx t1, t2, mask;
19962 rtx (*gen_sub3) (rtx, rtx, rtx);
19963
19964 switch (mode)
19965 {
19966 case V8SImode: gen_sub3 = gen_subv8si3; break;
19967 case V4DImode: gen_sub3 = gen_subv4di3; break;
19968 case V4SImode: gen_sub3 = gen_subv4si3; break;
19969 case V2DImode: gen_sub3 = gen_subv2di3; break;
19970 default:
19971 gcc_unreachable ();
19972 }
19973 /* Subtract (-(INT MAX) - 1) from both operands to make
19974 them signed. */
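              /* E.g. for SImode elements, x >u y iff
                 (x - 0x80000000) >s (y - 0x80000000).  */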
19975 mask = ix86_build_signbit_mask (mode, true, false);
19976 t1 = gen_reg_rtx (mode);
19977 emit_insn (gen_sub3 (t1, cop0, mask));
19978
19979 t2 = gen_reg_rtx (mode);
19980 emit_insn (gen_sub3 (t2, cop1, mask));
19981
19982 cop0 = t1;
19983 cop1 = t2;
19984 code = GT;
19985 }
19986 break;
19987
19988 case V32QImode:
19989 case V16HImode:
19990 case V16QImode:
19991 case V8HImode:
19992 /* Perform a parallel unsigned saturating subtraction. */
19993 x = gen_reg_rtx (mode);
19994 emit_insn (gen_rtx_SET (VOIDmode, x,
19995 gen_rtx_US_MINUS (mode, cop0, cop1)));
19996
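            /* cop0 >u cop1 iff the saturating difference is nonzero, so
               test it for equality with zero and invert the result.  */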
19997 cop0 = x;
19998 cop1 = CONST0_RTX (mode);
19999 code = EQ;
20000 negate = !negate;
20001 break;
20002
20003 default:
20004 gcc_unreachable ();
20005 }
20006 }
20007 }
20008
20009 /* Allow the comparison to be done in one mode, but the movcc to
20010 happen in another mode. */
20011 if (data_mode == mode)
20012 {
20013 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20014 operands[1+negate], operands[2-negate]);
20015 }
20016 else
20017 {
20018 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20019 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20020 code, cop0, cop1,
20021 operands[1+negate], operands[2-negate]);
20022 x = gen_lowpart (data_mode, x);
20023 }
20024
20025 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20026 operands[2-negate]);
20027 return true;
20028 }
20029
20030 /* Expand a variable vector permutation. */
20031
20032 void
20033 ix86_expand_vec_perm (rtx operands[])
20034 {
20035 rtx target = operands[0];
20036 rtx op0 = operands[1];
20037 rtx op1 = operands[2];
20038 rtx mask = operands[3];
20039 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20040 enum machine_mode mode = GET_MODE (op0);
20041 enum machine_mode maskmode = GET_MODE (mask);
20042 int w, e, i;
20043 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20044
20045 /* Number of elements in the vector. */
20046 w = GET_MODE_NUNITS (mode);
20047 e = GET_MODE_UNIT_SIZE (mode);
20048 gcc_assert (w <= 32);
20049
20050 if (TARGET_AVX2)
20051 {
20052 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20053 {
20054       /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20055          a constant shuffle operand.  With a tiny bit of effort we can
20056          use VPERMD instead.  A re-interpretation stall for V4DFmode is
20057          unfortunate but there's no avoiding it.
20058          Similarly, V16HImode has no instructions for variable shuffling,
20059          while for V32QImode we can, after preparing suitable masks,
20060          use vpshufb; vpshufb; vpermq; vpor.  */
20061
20062 if (mode == V16HImode)
20063 {
20064 maskmode = mode = V32QImode;
20065 w = 32;
20066 e = 1;
20067 }
20068 else
20069 {
20070 maskmode = mode = V8SImode;
20071 w = 8;
20072 e = 4;
20073 }
20074 t1 = gen_reg_rtx (maskmode);
20075
20076 /* Replicate the low bits of the V4DImode mask into V8SImode:
20077 mask = { A B C D }
20078 t1 = { A A B B C C D D }. */
20079 for (i = 0; i < w / 2; ++i)
20080 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20081 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20082 vt = force_reg (maskmode, vt);
20083 mask = gen_lowpart (maskmode, mask);
20084 if (maskmode == V8SImode)
20085 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20086 else
20087 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20088
20089 /* Multiply the shuffle indices by two. */
20090 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20091 OPTAB_DIRECT);
20092
20093 /* Add one to the odd shuffle indices:
20094 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20095 for (i = 0; i < w / 2; ++i)
20096 {
20097 vec[i * 2] = const0_rtx;
20098 vec[i * 2 + 1] = const1_rtx;
20099 }
20100 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20101 vt = force_const_mem (maskmode, vt);
20102 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20103 OPTAB_DIRECT);
20104
20105 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20106 operands[3] = mask = t1;
20107 target = gen_lowpart (mode, target);
20108 op0 = gen_lowpart (mode, op0);
20109 op1 = gen_lowpart (mode, op1);
20110 }
20111
20112 switch (mode)
20113 {
20114 case V8SImode:
20115 /* The VPERMD and VPERMPS instructions already properly ignore
20116 the high bits of the shuffle elements. No need for us to
20117 perform an AND ourselves. */
20118 if (one_operand_shuffle)
20119 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20120 else
20121 {
20122 t1 = gen_reg_rtx (V8SImode);
20123 t2 = gen_reg_rtx (V8SImode);
20124 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20125 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20126 goto merge_two;
20127 }
20128 return;
20129
20130 case V8SFmode:
20131 mask = gen_lowpart (V8SFmode, mask);
20132 if (one_operand_shuffle)
20133 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20134 else
20135 {
20136 t1 = gen_reg_rtx (V8SFmode);
20137 t2 = gen_reg_rtx (V8SFmode);
20138 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20139 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20140 goto merge_two;
20141 }
20142 return;
20143
20144 case V4SImode:
20145 /* By combining the two 128-bit input vectors into one 256-bit
20146 input vector, we can use VPERMD and VPERMPS for the full
20147 two-operand shuffle. */
20148 t1 = gen_reg_rtx (V8SImode);
20149 t2 = gen_reg_rtx (V8SImode);
20150 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20151 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20152 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20153 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20154 return;
20155
20156 case V4SFmode:
20157 t1 = gen_reg_rtx (V8SFmode);
20158 t2 = gen_reg_rtx (V8SImode);
20159 mask = gen_lowpart (V4SImode, mask);
20160 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20161 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20162 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20163 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20164 return;
20165
20166 case V32QImode:
20167 t1 = gen_reg_rtx (V32QImode);
20168 t2 = gen_reg_rtx (V32QImode);
20169 t3 = gen_reg_rtx (V32QImode);
20170 vt2 = GEN_INT (128);
20171 for (i = 0; i < 32; i++)
20172 vec[i] = vt2;
20173 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20174 vt = force_reg (V32QImode, vt);
20175 for (i = 0; i < 32; i++)
20176 vec[i] = i < 16 ? vt2 : const0_rtx;
20177 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20178 vt2 = force_reg (V32QImode, vt2);
20179 /* From mask create two adjusted masks, which contain the same
20180 bits as mask in the low 7 bits of each vector element.
20181 The first mask will have the most significant bit clear
20182 if it requests an element from the same 128-bit lane
20183 and the MSB set if it requests an element from the other 128-bit lane.
20184 The second mask will have the opposite values of the MSB,
20185 and additionally will have its 128-bit lanes swapped.
20186 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20187 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20188 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20189 stands for the other 12 bytes. */
20190 /* The bit that tells whether an element comes from the same lane or the
20191 other lane is bit 4, so shift it up by 3 to the MSB position. */
20192 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20193 gen_lowpart (V4DImode, mask),
20194 GEN_INT (3)));
20195 /* Clear MSB bits from the mask just in case it had them set. */
20196 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20197 /* After this t1 will have MSB set for elements from other lane. */
20198 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20199 /* Clear bits other than MSB. */
20200 emit_insn (gen_andv32qi3 (t1, t1, vt));
20201 /* Or in the lower bits from mask into t3. */
20202 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20203 /* And invert MSB bits in t1, so MSB is set for elements from the same
20204 lane. */
20205 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20206 /* Swap 128-bit lanes in t3. */
20207 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20208 gen_lowpart (V4DImode, t3),
20209 const2_rtx, GEN_INT (3),
20210 const0_rtx, const1_rtx));
20211 /* And or in the lower bits from mask into t1. */
20212 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20213 if (one_operand_shuffle)
20214 {
20215 /* Each of these shuffles will put 0s in places where an
20216 element from the other 128-bit lane is needed, and otherwise
20217 will shuffle in the requested value. */
20218 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20219 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20220 /* For t3 the 128-bit lanes are swapped again. */
20221 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20222 gen_lowpart (V4DImode, t3),
20223 const2_rtx, GEN_INT (3),
20224 const0_rtx, const1_rtx));
20225 /* And oring both together leads to the result. */
20226 emit_insn (gen_iorv32qi3 (target, t1, t3));
20227 return;
20228 }
20229
20230 t4 = gen_reg_rtx (V32QImode);
20231 /* Similar to the one_operand_shuffle code above,
20232 just repeated twice for each operand. The merge_two:
20233 code below will merge the two results together. */
20234 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20235 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20236 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20237 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20238 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20239 gen_lowpart (V4DImode, t4),
20240 const2_rtx, GEN_INT (3),
20241 const0_rtx, const1_rtx));
20242 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20243 gen_lowpart (V4DImode, t3),
20244 const2_rtx, GEN_INT (3),
20245 const0_rtx, const1_rtx));
20246 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20247 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20248 t1 = t4;
20249 t2 = t3;
20250 goto merge_two;
20251
20252 default:
20253 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20254 break;
20255 }
20256 }
20257
20258 if (TARGET_XOP)
20259 {
20260 /* The XOP VPPERM insn supports three inputs. By ignoring the
20261 one_operand_shuffle special case, we avoid creating another
20262 set of constant vectors in memory. */
20263 one_operand_shuffle = false;
20264
20265 /* mask = mask & {2*w-1, ...} */
20266 vt = GEN_INT (2*w - 1);
20267 }
20268 else
20269 {
20270 /* mask = mask & {w-1, ...} */
20271 vt = GEN_INT (w - 1);
20272 }
20273
20274 for (i = 0; i < w; i++)
20275 vec[i] = vt;
20276 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20277 mask = expand_simple_binop (maskmode, AND, mask, vt,
20278 NULL_RTX, 0, OPTAB_DIRECT);
20279
20280 /* For non-QImode operations, convert the word permutation control
20281 into a byte permutation control. */
20282 if (mode != V16QImode)
20283 {
20284 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20285 GEN_INT (exact_log2 (e)),
20286 NULL_RTX, 0, OPTAB_DIRECT);
20287
20288 /* Convert mask to vector of chars. */
20289 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20290
20291 /* Replicate each of the input bytes into byte positions:
20292 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20293 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20294 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20295 for (i = 0; i < 16; ++i)
20296 vec[i] = GEN_INT (i/e * e);
20297 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20298 vt = force_const_mem (V16QImode, vt);
20299 if (TARGET_XOP)
20300 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20301 else
20302 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20303
20304 /* Convert it into the byte positions by doing
20305 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20306 for (i = 0; i < 16; ++i)
20307 vec[i] = GEN_INT (i % e);
20308 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20309 vt = force_const_mem (V16QImode, vt);
20310 emit_insn (gen_addv16qi3 (mask, mask, vt));
20311 }
20312
20313 /* The actual shuffle operations all operate on V16QImode. */
20314 op0 = gen_lowpart (V16QImode, op0);
20315 op1 = gen_lowpart (V16QImode, op1);
20316 target = gen_lowpart (V16QImode, target);
20317
20318 if (TARGET_XOP)
20319 {
20320 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20321 }
20322 else if (one_operand_shuffle)
20323 {
20324 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20325 }
20326 else
20327 {
20328 rtx xops[6];
20329 bool ok;
20330
20331 /* Shuffle the two input vectors independently. */
20332 t1 = gen_reg_rtx (V16QImode);
20333 t2 = gen_reg_rtx (V16QImode);
20334 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20335 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20336
20337 merge_two:
20338 /* Then merge them together. The key is whether any given control
20339 element contained a bit set that indicates the second word. */
20340 mask = operands[3];
20341 vt = GEN_INT (w);
20342 if (maskmode == V2DImode && !TARGET_SSE4_1)
20343 {
20344 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20345 more shuffle to convert the V2DI input mask into a V4SI
20346 input mask, at which point the masking that expand_int_vcond
20347 does will work as desired. */
20348 rtx t3 = gen_reg_rtx (V4SImode);
20349 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20350 const0_rtx, const0_rtx,
20351 const2_rtx, const2_rtx));
20352 mask = t3;
20353 maskmode = V4SImode;
20354 e = w = 4;
20355 }
20356
20357 for (i = 0; i < w; i++)
20358 vec[i] = vt;
20359 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20360 vt = force_reg (maskmode, vt);
20361 mask = expand_simple_binop (maskmode, AND, mask, vt,
20362 NULL_RTX, 0, OPTAB_DIRECT);
20363
20364 xops[0] = gen_lowpart (mode, operands[0]);
20365 xops[1] = gen_lowpart (mode, t2);
20366 xops[2] = gen_lowpart (mode, t1);
20367 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20368 xops[4] = mask;
20369 xops[5] = vt;
20370 ok = ix86_expand_int_vcond (xops);
20371 gcc_assert (ok);
20372 }
20373 }
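
/* Illustrative sketch, not part of i386.c: an element-wise model of the
   variable permutation expanded above for a two-operand shuffle of W
   elements.  Only the low bits of each control element are significant,
   matching the masking done before vpermd/pshufb.  Name is hypothetical.  */

static void
model_vec_perm (unsigned int *target, const unsigned int *op0,
                const unsigned int *op1, const unsigned int *mask, int w)
{
  int i;
  for (i = 0; i < w; i++)
    {
      /* Keep only the bits that select among the 2*W input elements.  */
      unsigned int sel = mask[i] & (unsigned int) (2 * w - 1);
      target[i] = sel < (unsigned int) w ? op0[sel] : op1[sel - w];
    }
}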
20374
20375 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
20376 true if we should do zero extension, else sign extension. HIGH_P is
20377 true if we want the N/2 high elements, else the low elements. */
20378
20379 void
20380 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20381 {
20382 enum machine_mode imode = GET_MODE (src);
20383 rtx tmp;
20384
20385 if (TARGET_SSE4_1)
20386 {
20387 rtx (*unpack)(rtx, rtx);
20388 rtx (*extract)(rtx, rtx) = NULL;
20389 enum machine_mode halfmode = BLKmode;
20390
20391 switch (imode)
20392 {
20393 case V32QImode:
20394 if (unsigned_p)
20395 unpack = gen_avx2_zero_extendv16qiv16hi2;
20396 else
20397 unpack = gen_avx2_sign_extendv16qiv16hi2;
20398 halfmode = V16QImode;
20399 extract
20400 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20401 break;
20402 case V16HImode:
20403 if (unsigned_p)
20404 unpack = gen_avx2_zero_extendv8hiv8si2;
20405 else
20406 unpack = gen_avx2_sign_extendv8hiv8si2;
20407 halfmode = V8HImode;
20408 extract
20409 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20410 break;
20411 case V8SImode:
20412 if (unsigned_p)
20413 unpack = gen_avx2_zero_extendv4siv4di2;
20414 else
20415 unpack = gen_avx2_sign_extendv4siv4di2;
20416 halfmode = V4SImode;
20417 extract
20418 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20419 break;
20420 case V16QImode:
20421 if (unsigned_p)
20422 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20423 else
20424 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20425 break;
20426 case V8HImode:
20427 if (unsigned_p)
20428 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20429 else
20430 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20431 break;
20432 case V4SImode:
20433 if (unsigned_p)
20434 unpack = gen_sse4_1_zero_extendv2siv2di2;
20435 else
20436 unpack = gen_sse4_1_sign_extendv2siv2di2;
20437 break;
20438 default:
20439 gcc_unreachable ();
20440 }
20441
20442 if (GET_MODE_SIZE (imode) == 32)
20443 {
20444 tmp = gen_reg_rtx (halfmode);
20445 emit_insn (extract (tmp, src));
20446 }
20447 else if (high_p)
20448 {
20449 /* Shift higher 8 bytes to lower 8 bytes. */
20450 tmp = gen_reg_rtx (imode);
20451 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20452 gen_lowpart (V1TImode, src),
20453 GEN_INT (64)));
20454 }
20455 else
20456 tmp = src;
20457
20458 emit_insn (unpack (dest, tmp));
20459 }
20460 else
20461 {
20462 rtx (*unpack)(rtx, rtx, rtx);
20463
20464 switch (imode)
20465 {
20466 case V16QImode:
20467 if (high_p)
20468 unpack = gen_vec_interleave_highv16qi;
20469 else
20470 unpack = gen_vec_interleave_lowv16qi;
20471 break;
20472 case V8HImode:
20473 if (high_p)
20474 unpack = gen_vec_interleave_highv8hi;
20475 else
20476 unpack = gen_vec_interleave_lowv8hi;
20477 break;
20478 case V4SImode:
20479 if (high_p)
20480 unpack = gen_vec_interleave_highv4si;
20481 else
20482 unpack = gen_vec_interleave_lowv4si;
20483 break;
20484 default:
20485 gcc_unreachable ();
20486 }
20487
20488 if (unsigned_p)
20489 tmp = force_reg (imode, CONST0_RTX (imode));
20490 else
20491 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20492 src, pc_rtx, pc_rtx);
20493
20494 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20495 }
20496 }
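
/* Illustrative sketch, not part of i386.c: a scalar model of
   ix86_expand_sse_unpack for the V16QI -> V8HI case - half of the source
   elements are widened with zero or sign extension.  Name is hypothetical.  */

static void
model_sse_unpack_v16qi (short dest[8], const unsigned char src[16],
                        int unsigned_p, int high_p)
{
  int i, base = high_p ? 8 : 0;   /* take the high or the low half */
  for (i = 0; i < 8; i++)
    dest[i] = unsigned_p
              ? (short) src[base + i]                  /* zero extend */
              : (short) (signed char) src[base + i];   /* sign extend */
}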
20497
20498 /* Expand conditional increment or decrement using adc/sbb instructions.
20499 The default case using setcc followed by the conditional move can be
20500 done by generic code. */
20501 bool
20502 ix86_expand_int_addcc (rtx operands[])
20503 {
20504 enum rtx_code code = GET_CODE (operands[1]);
20505 rtx flags;
20506 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20507 rtx compare_op;
20508 rtx val = const0_rtx;
20509 bool fpcmp = false;
20510 enum machine_mode mode;
20511 rtx op0 = XEXP (operands[1], 0);
20512 rtx op1 = XEXP (operands[1], 1);
20513
20514 if (operands[3] != const1_rtx
20515 && operands[3] != constm1_rtx)
20516 return false;
20517 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20518 return false;
20519 code = GET_CODE (compare_op);
20520
20521 flags = XEXP (compare_op, 0);
20522
20523 if (GET_MODE (flags) == CCFPmode
20524 || GET_MODE (flags) == CCFPUmode)
20525 {
20526 fpcmp = true;
20527 code = ix86_fp_compare_code_to_integer (code);
20528 }
20529
20530 if (code != LTU)
20531 {
20532 val = constm1_rtx;
20533 if (fpcmp)
20534 PUT_CODE (compare_op,
20535 reverse_condition_maybe_unordered
20536 (GET_CODE (compare_op)));
20537 else
20538 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20539 }
20540
20541 mode = GET_MODE (operands[0]);
20542
20543 /* Construct either adc or sbb insn. */
20544 if ((code == LTU) == (operands[3] == constm1_rtx))
20545 {
20546 switch (mode)
20547 {
20548 case QImode:
20549 insn = gen_subqi3_carry;
20550 break;
20551 case HImode:
20552 insn = gen_subhi3_carry;
20553 break;
20554 case SImode:
20555 insn = gen_subsi3_carry;
20556 break;
20557 case DImode:
20558 insn = gen_subdi3_carry;
20559 break;
20560 default:
20561 gcc_unreachable ();
20562 }
20563 }
20564 else
20565 {
20566 switch (mode)
20567 {
20568 case QImode:
20569 insn = gen_addqi3_carry;
20570 break;
20571 case HImode:
20572 insn = gen_addhi3_carry;
20573 break;
20574 case SImode:
20575 insn = gen_addsi3_carry;
20576 break;
20577 case DImode:
20578 insn = gen_adddi3_carry;
20579 break;
20580 default:
20581 gcc_unreachable ();
20582 }
20583 }
20584 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20585
20586 return true;
20587 }
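
/* Illustrative sketch, not part of i386.c: the kind of source pattern
   ix86_expand_int_addcc targets - a conditional +/-1 adjustment whose
   condition fits the carry flag, so it becomes a compare followed by adc
   (or sbb) instead of setcc + add or a branch.  Name is hypothetical.  */

static unsigned int
model_conditional_increment (unsigned int x, unsigned int a, unsigned int b)
{
  /* The unsigned compare "a < b" ends up in the carry flag, and adding
     that 0/1 value is then a single adc against zero.  */
  return x + (a < b);
}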
20588
20589
20590 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20591 but works for floating point parameters and non-offsettable memories.
20592 For pushes, it returns just stack offsets; the values will be saved
20593 in the right order. Maximally four parts are generated. */
20594
20595 static int
20596 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20597 {
20598 int size;
20599
20600 if (!TARGET_64BIT)
20601 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20602 else
20603 size = (GET_MODE_SIZE (mode) + 4) / 8;
20604
20605 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20606 gcc_assert (size >= 2 && size <= 4);
20607
20608 /* Optimize constant pool reference to immediates. This is used by fp
20609 moves, which force all constants to memory to allow combining. */
20610 if (MEM_P (operand) && MEM_READONLY_P (operand))
20611 {
20612 rtx tmp = maybe_get_pool_constant (operand);
20613 if (tmp)
20614 operand = tmp;
20615 }
20616
20617 if (MEM_P (operand) && !offsettable_memref_p (operand))
20618 {
20619 /* The only non-offsettable memories we handle are pushes. */
20620 int ok = push_operand (operand, VOIDmode);
20621
20622 gcc_assert (ok);
20623
20624 operand = copy_rtx (operand);
20625 PUT_MODE (operand, word_mode);
20626 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20627 return size;
20628 }
20629
20630 if (GET_CODE (operand) == CONST_VECTOR)
20631 {
20632 enum machine_mode imode = int_mode_for_mode (mode);
20633 /* Caution: if we looked through a constant pool memory above,
20634 the operand may actually have a different mode now. That's
20635 ok, since we want to pun this all the way back to an integer. */
20636 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20637 gcc_assert (operand != NULL);
20638 mode = imode;
20639 }
20640
20641 if (!TARGET_64BIT)
20642 {
20643 if (mode == DImode)
20644 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20645 else
20646 {
20647 int i;
20648
20649 if (REG_P (operand))
20650 {
20651 gcc_assert (reload_completed);
20652 for (i = 0; i < size; i++)
20653 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20654 }
20655 else if (offsettable_memref_p (operand))
20656 {
20657 operand = adjust_address (operand, SImode, 0);
20658 parts[0] = operand;
20659 for (i = 1; i < size; i++)
20660 parts[i] = adjust_address (operand, SImode, 4 * i);
20661 }
20662 else if (GET_CODE (operand) == CONST_DOUBLE)
20663 {
20664 REAL_VALUE_TYPE r;
20665 long l[4];
20666
20667 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20668 switch (mode)
20669 {
20670 case TFmode:
20671 real_to_target (l, &r, mode);
20672 parts[3] = gen_int_mode (l[3], SImode);
20673 parts[2] = gen_int_mode (l[2], SImode);
20674 break;
20675 case XFmode:
20676 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20677 parts[2] = gen_int_mode (l[2], SImode);
20678 break;
20679 case DFmode:
20680 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20681 break;
20682 default:
20683 gcc_unreachable ();
20684 }
20685 parts[1] = gen_int_mode (l[1], SImode);
20686 parts[0] = gen_int_mode (l[0], SImode);
20687 }
20688 else
20689 gcc_unreachable ();
20690 }
20691 }
20692 else
20693 {
20694 if (mode == TImode)
20695 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20696 if (mode == XFmode || mode == TFmode)
20697 {
20698 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20699 if (REG_P (operand))
20700 {
20701 gcc_assert (reload_completed);
20702 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20703 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20704 }
20705 else if (offsettable_memref_p (operand))
20706 {
20707 operand = adjust_address (operand, DImode, 0);
20708 parts[0] = operand;
20709 parts[1] = adjust_address (operand, upper_mode, 8);
20710 }
20711 else if (GET_CODE (operand) == CONST_DOUBLE)
20712 {
20713 REAL_VALUE_TYPE r;
20714 long l[4];
20715
20716 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20717 real_to_target (l, &r, mode);
20718
20719 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20720 if (HOST_BITS_PER_WIDE_INT >= 64)
20721 parts[0]
20722 = gen_int_mode
20723 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20724 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20725 DImode);
20726 else
20727 parts[0] = immed_double_const (l[0], l[1], DImode);
20728
20729 if (upper_mode == SImode)
20730 parts[1] = gen_int_mode (l[2], SImode);
20731 else if (HOST_BITS_PER_WIDE_INT >= 64)
20732 parts[1]
20733 = gen_int_mode
20734 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20735 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20736 DImode);
20737 else
20738 parts[1] = immed_double_const (l[2], l[3], DImode);
20739 }
20740 else
20741 gcc_unreachable ();
20742 }
20743 }
20744
20745 return size;
20746 }
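
/* Illustrative sketch, not part of i386.c: what the splitting above amounts
   to for a DImode constant on a 32-bit target - two SImode words with the
   low part first.  Name is hypothetical.  */

static void
model_split_dimode (unsigned int parts[2], unsigned long long value)
{
  parts[0] = (unsigned int) (value & 0xffffffffu);   /* low word  */
  parts[1] = (unsigned int) (value >> 32);           /* high word */
}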
20747
20748 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20749 Return false when normal moves are needed; true when all required
20750 insns have been emitted. Operands 2-4 contain the input values
20751 in the correct order; operands 5-7 contain the output values. */
20752
20753 void
20754 ix86_split_long_move (rtx operands[])
20755 {
20756 rtx part[2][4];
20757 int nparts, i, j;
20758 int push = 0;
20759 int collisions = 0;
20760 enum machine_mode mode = GET_MODE (operands[0]);
20761 bool collisionparts[4];
20762
20763 /* The DFmode expanders may ask us to move a double.
20764 For a 64-bit target this is a single move. By hiding the fact
20765 here we simplify i386.md splitters. */
20766 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20767 {
20768 /* Optimize constant pool reference to immediates. This is used by
20769 fp moves, which force all constants to memory to allow combining. */
20770
20771 if (MEM_P (operands[1])
20772 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20773 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20774 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20775 if (push_operand (operands[0], VOIDmode))
20776 {
20777 operands[0] = copy_rtx (operands[0]);
20778 PUT_MODE (operands[0], word_mode);
20779 }
20780 else
20781 operands[0] = gen_lowpart (DImode, operands[0]);
20782 operands[1] = gen_lowpart (DImode, operands[1]);
20783 emit_move_insn (operands[0], operands[1]);
20784 return;
20785 }
20786
20787 /* The only non-offsettable memory we handle is push. */
20788 if (push_operand (operands[0], VOIDmode))
20789 push = 1;
20790 else
20791 gcc_assert (!MEM_P (operands[0])
20792 || offsettable_memref_p (operands[0]));
20793
20794 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20795 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20796
20797 /* When emitting push, take care for source operands on the stack. */
20798 if (push && MEM_P (operands[1])
20799 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20800 {
20801 rtx src_base = XEXP (part[1][nparts - 1], 0);
20802
20803 /* Compensate for the stack decrement by 4. */
20804 if (!TARGET_64BIT && nparts == 3
20805 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20806 src_base = plus_constant (Pmode, src_base, 4);
20807
20808 /* src_base refers to the stack pointer and is
20809 automatically decreased by emitted push. */
20810 for (i = 0; i < nparts; i++)
20811 part[1][i] = change_address (part[1][i],
20812 GET_MODE (part[1][i]), src_base);
20813 }
20814
20815 /* We need to do the copy in the right order in case an address register
20816 of the source overlaps the destination. */
20817 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20818 {
20819 rtx tmp;
20820
20821 for (i = 0; i < nparts; i++)
20822 {
20823 collisionparts[i]
20824 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20825 if (collisionparts[i])
20826 collisions++;
20827 }
20828
20829 /* Collision in the middle part can be handled by reordering. */
20830 if (collisions == 1 && nparts == 3 && collisionparts [1])
20831 {
20832 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20833 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20834 }
20835 else if (collisions == 1
20836 && nparts == 4
20837 && (collisionparts [1] || collisionparts [2]))
20838 {
20839 if (collisionparts [1])
20840 {
20841 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20842 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20843 }
20844 else
20845 {
20846 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20847 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20848 }
20849 }
20850
20851 /* If there are more collisions, we can't handle it by reordering.
20852 Do an lea to the last part and use only one colliding move. */
20853 else if (collisions > 1)
20854 {
20855 rtx base;
20856
20857 collisions = 1;
20858
20859 base = part[0][nparts - 1];
20860
20861 /* Handle the case when the last part isn't valid for lea.
20862 Happens in 64-bit mode storing the 12-byte XFmode. */
20863 if (GET_MODE (base) != Pmode)
20864 base = gen_rtx_REG (Pmode, REGNO (base));
20865
20866 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20867 part[1][0] = replace_equiv_address (part[1][0], base);
20868 for (i = 1; i < nparts; i++)
20869 {
20870 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
20871 part[1][i] = replace_equiv_address (part[1][i], tmp);
20872 }
20873 }
20874 }
20875
20876 if (push)
20877 {
20878 if (!TARGET_64BIT)
20879 {
20880 if (nparts == 3)
20881 {
20882 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20883 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20884 stack_pointer_rtx, GEN_INT (-4)));
20885 emit_move_insn (part[0][2], part[1][2]);
20886 }
20887 else if (nparts == 4)
20888 {
20889 emit_move_insn (part[0][3], part[1][3]);
20890 emit_move_insn (part[0][2], part[1][2]);
20891 }
20892 }
20893 else
20894 {
20895 /* In 64-bit mode we don't have a 32-bit push available. If this is a
20896 register, that is OK - we will just use the larger counterpart. We also
20897 retype memory - this comes from an attempt to avoid the REX prefix on
20898 moving the second half of a TFmode value. */
20899 if (GET_MODE (part[1][1]) == SImode)
20900 {
20901 switch (GET_CODE (part[1][1]))
20902 {
20903 case MEM:
20904 part[1][1] = adjust_address (part[1][1], DImode, 0);
20905 break;
20906
20907 case REG:
20908 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20909 break;
20910
20911 default:
20912 gcc_unreachable ();
20913 }
20914
20915 if (GET_MODE (part[1][0]) == SImode)
20916 part[1][0] = part[1][1];
20917 }
20918 }
20919 emit_move_insn (part[0][1], part[1][1]);
20920 emit_move_insn (part[0][0], part[1][0]);
20921 return;
20922 }
20923
20924 /* Choose the correct order so as not to overwrite the source before it is copied. */
20925 if ((REG_P (part[0][0])
20926 && REG_P (part[1][1])
20927 && (REGNO (part[0][0]) == REGNO (part[1][1])
20928 || (nparts == 3
20929 && REGNO (part[0][0]) == REGNO (part[1][2]))
20930 || (nparts == 4
20931 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20932 || (collisions > 0
20933 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20934 {
20935 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20936 {
20937 operands[2 + i] = part[0][j];
20938 operands[6 + i] = part[1][j];
20939 }
20940 }
20941 else
20942 {
20943 for (i = 0; i < nparts; i++)
20944 {
20945 operands[2 + i] = part[0][i];
20946 operands[6 + i] = part[1][i];
20947 }
20948 }
20949
20950 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20951 if (optimize_insn_for_size_p ())
20952 {
20953 for (j = 0; j < nparts - 1; j++)
20954 if (CONST_INT_P (operands[6 + j])
20955 && operands[6 + j] != const0_rtx
20956 && REG_P (operands[2 + j]))
20957 for (i = j; i < nparts - 1; i++)
20958 if (CONST_INT_P (operands[7 + i])
20959 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20960 operands[7 + i] = operands[2 + j];
20961 }
20962
20963 for (i = 0; i < nparts; i++)
20964 emit_move_insn (operands[2 + i], operands[6 + i]);
20965
20966 return;
20967 }
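
/* Illustrative sketch, not part of i386.c: the ordering decision made above,
   reduced to a scalar model.  When a destination part overlaps a register
   still needed to address the source, the parts are copied from the highest
   to the lowest so nothing is clobbered early.  Name is hypothetical.  */

static void
model_copy_parts (unsigned int *dst, const unsigned int *src, int nparts,
                  int dst_overlaps_src_address)
{
  int i;
  if (dst_overlaps_src_address)
    for (i = nparts - 1; i >= 0; i--)   /* reversed order */
      dst[i] = src[i];
  else
    for (i = 0; i < nparts; i++)        /* natural order */
      dst[i] = src[i];
}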
20968
20969 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20970 left shift by a constant, either using a single shift or
20971 a sequence of add instructions. */
20972
20973 static void
20974 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20975 {
20976 rtx (*insn)(rtx, rtx, rtx);
20977
20978 if (count == 1
20979 || (count * ix86_cost->add <= ix86_cost->shift_const
20980 && !optimize_insn_for_size_p ()))
20981 {
20982 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20983 while (count-- > 0)
20984 emit_insn (insn (operand, operand, operand));
20985 }
20986 else
20987 {
20988 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20989 emit_insn (insn (operand, operand, GEN_INT (count)));
20990 }
20991 }
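
/* Illustrative sketch, not part of i386.c: the add-chain alternative chosen
   above when COUNT adds cost no more than one shift by a constant - each
   self-addition doubles the value once.  Name is hypothetical.  */

static unsigned int
model_ashl_by_adds (unsigned int x, int count)
{
  while (count-- > 0)
    x += x;             /* same effect as x <<= 1 */
  return x;
}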
20992
20993 void
20994 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20995 {
20996 rtx (*gen_ashl3)(rtx, rtx, rtx);
20997 rtx (*gen_shld)(rtx, rtx, rtx);
20998 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20999
21000 rtx low[2], high[2];
21001 int count;
21002
21003 if (CONST_INT_P (operands[2]))
21004 {
21005 split_double_mode (mode, operands, 2, low, high);
21006 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21007
21008 if (count >= half_width)
21009 {
21010 emit_move_insn (high[0], low[1]);
21011 emit_move_insn (low[0], const0_rtx);
21012
21013 if (count > half_width)
21014 ix86_expand_ashl_const (high[0], count - half_width, mode);
21015 }
21016 else
21017 {
21018 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21019
21020 if (!rtx_equal_p (operands[0], operands[1]))
21021 emit_move_insn (operands[0], operands[1]);
21022
21023 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21024 ix86_expand_ashl_const (low[0], count, mode);
21025 }
21026 return;
21027 }
21028
21029 split_double_mode (mode, operands, 1, low, high);
21030
21031 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21032
21033 if (operands[1] == const1_rtx)
21034 {
21035 /* Assuming we've chosen QImode-capable registers, then 1 << N
21036 can be done with two 32/64-bit shifts, no branches, no cmoves. */
21037 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21038 {
21039 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21040
21041 ix86_expand_clear (low[0]);
21042 ix86_expand_clear (high[0]);
21043 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21044
21045 d = gen_lowpart (QImode, low[0]);
21046 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21047 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21048 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21049
21050 d = gen_lowpart (QImode, high[0]);
21051 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21052 s = gen_rtx_NE (QImode, flags, const0_rtx);
21053 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21054 }
21055
21056 /* Otherwise, we can get the same results by manually performing
21057 a bit extract operation on bit 5/6, and then performing the two
21058 shifts. The two methods of getting 0/1 into low/high are exactly
21059 the same size. Avoiding the shift in the bit extract case helps
21060 pentium4 a bit; no one else seems to care much either way. */
21061 else
21062 {
21063 enum machine_mode half_mode;
21064 rtx (*gen_lshr3)(rtx, rtx, rtx);
21065 rtx (*gen_and3)(rtx, rtx, rtx);
21066 rtx (*gen_xor3)(rtx, rtx, rtx);
21067 HOST_WIDE_INT bits;
21068 rtx x;
21069
21070 if (mode == DImode)
21071 {
21072 half_mode = SImode;
21073 gen_lshr3 = gen_lshrsi3;
21074 gen_and3 = gen_andsi3;
21075 gen_xor3 = gen_xorsi3;
21076 bits = 5;
21077 }
21078 else
21079 {
21080 half_mode = DImode;
21081 gen_lshr3 = gen_lshrdi3;
21082 gen_and3 = gen_anddi3;
21083 gen_xor3 = gen_xordi3;
21084 bits = 6;
21085 }
21086
21087 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21088 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21089 else
21090 x = gen_lowpart (half_mode, operands[2]);
21091 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21092
21093 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21094 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21095 emit_move_insn (low[0], high[0]);
21096 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21097 }
21098
21099 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21100 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21101 return;
21102 }
21103
21104 if (operands[1] == constm1_rtx)
21105 {
21106 /* For -1 << N, we can avoid the shld instruction, because we
21107 know that we're shifting 0...31/63 ones into a -1. */
21108 emit_move_insn (low[0], constm1_rtx);
21109 if (optimize_insn_for_size_p ())
21110 emit_move_insn (high[0], low[0]);
21111 else
21112 emit_move_insn (high[0], constm1_rtx);
21113 }
21114 else
21115 {
21116 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21117
21118 if (!rtx_equal_p (operands[0], operands[1]))
21119 emit_move_insn (operands[0], operands[1]);
21120
21121 split_double_mode (mode, operands, 1, low, high);
21122 emit_insn (gen_shld (high[0], low[0], operands[2]));
21123 }
21124
21125 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21126
21127 if (TARGET_CMOVE && scratch)
21128 {
21129 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21130 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21131
21132 ix86_expand_clear (scratch);
21133 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21134 }
21135 else
21136 {
21137 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21138 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21139
21140 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21141 }
21142 }
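
/* Illustrative sketch, not part of i386.c: the double-word left shift that
   ix86_split_ashl open-codes, shown on two 32-bit halves (the DImode-on-
   32-bit case).  Name is hypothetical.  */

static void
model_double_word_shl (unsigned int *lo, unsigned int *hi, int count)
{
  count &= 63;
  if (count >= 32)
    {
      *hi = *lo << (count - 32);        /* low half moves to the high half */
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));     /* shld */
      *lo <<= count;
    }
}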
21143
21144 void
21145 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21146 {
21147 rtx (*gen_ashr3)(rtx, rtx, rtx)
21148 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21149 rtx (*gen_shrd)(rtx, rtx, rtx);
21150 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21151
21152 rtx low[2], high[2];
21153 int count;
21154
21155 if (CONST_INT_P (operands[2]))
21156 {
21157 split_double_mode (mode, operands, 2, low, high);
21158 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21159
21160 if (count == GET_MODE_BITSIZE (mode) - 1)
21161 {
21162 emit_move_insn (high[0], high[1]);
21163 emit_insn (gen_ashr3 (high[0], high[0],
21164 GEN_INT (half_width - 1)));
21165 emit_move_insn (low[0], high[0]);
21166
21167 }
21168 else if (count >= half_width)
21169 {
21170 emit_move_insn (low[0], high[1]);
21171 emit_move_insn (high[0], low[0]);
21172 emit_insn (gen_ashr3 (high[0], high[0],
21173 GEN_INT (half_width - 1)));
21174
21175 if (count > half_width)
21176 emit_insn (gen_ashr3 (low[0], low[0],
21177 GEN_INT (count - half_width)));
21178 }
21179 else
21180 {
21181 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21182
21183 if (!rtx_equal_p (operands[0], operands[1]))
21184 emit_move_insn (operands[0], operands[1]);
21185
21186 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21187 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21188 }
21189 }
21190 else
21191 {
21192 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21193
21194 if (!rtx_equal_p (operands[0], operands[1]))
21195 emit_move_insn (operands[0], operands[1]);
21196
21197 split_double_mode (mode, operands, 1, low, high);
21198
21199 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21200 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21201
21202 if (TARGET_CMOVE && scratch)
21203 {
21204 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21205 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21206
21207 emit_move_insn (scratch, high[0]);
21208 emit_insn (gen_ashr3 (scratch, scratch,
21209 GEN_INT (half_width - 1)));
21210 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21211 scratch));
21212 }
21213 else
21214 {
21215 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21216 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21217
21218 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21219 }
21220 }
21221 }
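
/* Illustrative sketch, not part of i386.c: the double-word arithmetic right
   shift open-coded above, on two 32-bit halves; the vacated high bits are
   filled with copies of the sign bit.  Assumes the usual arithmetic behaviour
   of ">>" on negative ints, as on x86 GCC.  Name is hypothetical.  */

static void
model_double_word_ashr (unsigned int *lo, int *hi, int count)
{
  count &= 63;
  if (count >= 32)
    {
      *lo = (unsigned int) (*hi >> (count - 32));
      *hi >>= 31;                       /* all copies of the sign bit */
    }
  else if (count > 0)
    {
      *lo = (*lo >> count) | ((unsigned int) *hi << (32 - count));  /* shrd */
      *hi >>= count;
    }
}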
21222
21223 void
21224 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21225 {
21226 rtx (*gen_lshr3)(rtx, rtx, rtx)
21227 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21228 rtx (*gen_shrd)(rtx, rtx, rtx);
21229 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21230
21231 rtx low[2], high[2];
21232 int count;
21233
21234 if (CONST_INT_P (operands[2]))
21235 {
21236 split_double_mode (mode, operands, 2, low, high);
21237 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21238
21239 if (count >= half_width)
21240 {
21241 emit_move_insn (low[0], high[1]);
21242 ix86_expand_clear (high[0]);
21243
21244 if (count > half_width)
21245 emit_insn (gen_lshr3 (low[0], low[0],
21246 GEN_INT (count - half_width)));
21247 }
21248 else
21249 {
21250 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21251
21252 if (!rtx_equal_p (operands[0], operands[1]))
21253 emit_move_insn (operands[0], operands[1]);
21254
21255 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21256 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21257 }
21258 }
21259 else
21260 {
21261 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21262
21263 if (!rtx_equal_p (operands[0], operands[1]))
21264 emit_move_insn (operands[0], operands[1]);
21265
21266 split_double_mode (mode, operands, 1, low, high);
21267
21268 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21269 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21270
21271 if (TARGET_CMOVE && scratch)
21272 {
21273 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21274 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21275
21276 ix86_expand_clear (scratch);
21277 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21278 scratch));
21279 }
21280 else
21281 {
21282 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21283 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21284
21285 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21286 }
21287 }
21288 }
21289
21290 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21291 static void
21292 predict_jump (int prob)
21293 {
21294 rtx insn = get_last_insn ();
21295 gcc_assert (JUMP_P (insn));
21296 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21297 }
21298
21299 /* Helper function for the string operations below. Test whether VARIABLE
21300 is aligned to VALUE bytes. If it is, jump to the label. */
21301 static rtx
21302 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21303 {
21304 rtx label = gen_label_rtx ();
21305 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21306 if (GET_MODE (variable) == DImode)
21307 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21308 else
21309 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21310 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21311 1, label);
21312 if (epilogue)
21313 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21314 else
21315 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21316 return label;
21317 }
21318
21319 /* Adjust COUNTREG by the VALUE. */
21320 static void
21321 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21322 {
21323 rtx (*gen_add)(rtx, rtx, rtx)
21324 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21325
21326 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21327 }
21328
21329 /* Zero extend possibly SImode EXP to Pmode register. */
21330 rtx
21331 ix86_zero_extend_to_Pmode (rtx exp)
21332 {
21333 if (GET_MODE (exp) != Pmode)
21334 exp = convert_to_mode (Pmode, exp, 1);
21335 return force_reg (Pmode, exp);
21336 }
21337
21338 /* Divide COUNTREG by SCALE. */
21339 static rtx
21340 scale_counter (rtx countreg, int scale)
21341 {
21342 rtx sc;
21343
21344 if (scale == 1)
21345 return countreg;
21346 if (CONST_INT_P (countreg))
21347 return GEN_INT (INTVAL (countreg) / scale);
21348 gcc_assert (REG_P (countreg));
21349
21350 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21351 GEN_INT (exact_log2 (scale)),
21352 NULL, 1, OPTAB_DIRECT);
21353 return sc;
21354 }
21355
21356 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21357 DImode for constant loop counts. */
21358
21359 static enum machine_mode
21360 counter_mode (rtx count_exp)
21361 {
21362 if (GET_MODE (count_exp) != VOIDmode)
21363 return GET_MODE (count_exp);
21364 if (!CONST_INT_P (count_exp))
21365 return Pmode;
21366 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21367 return DImode;
21368 return SImode;
21369 }
21370
21371 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21372 to by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the
21373 overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
21374 equivalent loop to set memory to VALUE (supposed to be in MODE).
21375
21376 The size is rounded down to a whole number of chunks moved at once.
21377 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
21378
21379
21380 static void
21381 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21382 rtx destptr, rtx srcptr, rtx value,
21383 rtx count, enum machine_mode mode, int unroll,
21384 int expected_size)
21385 {
21386 rtx out_label, top_label, iter, tmp;
21387 enum machine_mode iter_mode = counter_mode (count);
21388 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21389 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21390 rtx size;
21391 rtx x_addr;
21392 rtx y_addr;
21393 int i;
21394
21395 top_label = gen_label_rtx ();
21396 out_label = gen_label_rtx ();
21397 iter = gen_reg_rtx (iter_mode);
21398
21399 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21400 NULL, 1, OPTAB_DIRECT);
21401 /* Those two should combine. */
21402 if (piece_size == const1_rtx)
21403 {
21404 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21405 true, out_label);
21406 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21407 }
21408 emit_move_insn (iter, const0_rtx);
21409
21410 emit_label (top_label);
21411
21412 tmp = convert_modes (Pmode, iter_mode, iter, true);
21413 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21414 destmem = change_address (destmem, mode, x_addr);
21415
21416 if (srcmem)
21417 {
21418 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21419 srcmem = change_address (srcmem, mode, y_addr);
21420
21421 /* When unrolling for chips that reorder memory reads and writes,
21422 we can save registers by using a single temporary.
21423 Also, using 4 temporaries is overkill in 32-bit mode. */
21424 if (!TARGET_64BIT && 0)
21425 {
21426 for (i = 0; i < unroll; i++)
21427 {
21428 if (i)
21429 {
21430 destmem =
21431 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21432 srcmem =
21433 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21434 }
21435 emit_move_insn (destmem, srcmem);
21436 }
21437 }
21438 else
21439 {
21440 rtx tmpreg[4];
21441 gcc_assert (unroll <= 4);
21442 for (i = 0; i < unroll; i++)
21443 {
21444 tmpreg[i] = gen_reg_rtx (mode);
21445 if (i)
21446 {
21447 srcmem =
21448 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21449 }
21450 emit_move_insn (tmpreg[i], srcmem);
21451 }
21452 for (i = 0; i < unroll; i++)
21453 {
21454 if (i)
21455 {
21456 destmem =
21457 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21458 }
21459 emit_move_insn (destmem, tmpreg[i]);
21460 }
21461 }
21462 }
21463 else
21464 for (i = 0; i < unroll; i++)
21465 {
21466 if (i)
21467 destmem =
21468 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21469 emit_move_insn (destmem, value);
21470 }
21471
21472 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21473 true, OPTAB_LIB_WIDEN);
21474 if (tmp != iter)
21475 emit_move_insn (iter, tmp);
21476
21477 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21478 true, top_label);
21479 if (expected_size != -1)
21480 {
21481 expected_size /= GET_MODE_SIZE (mode) * unroll;
21482 if (expected_size == 0)
21483 predict_jump (0);
21484 else if (expected_size > REG_BR_PROB_BASE)
21485 predict_jump (REG_BR_PROB_BASE - 1);
21486 else
21487 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21488 }
21489 else
21490 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21491 iter = ix86_zero_extend_to_Pmode (iter);
21492 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21493 true, OPTAB_LIB_WIDEN);
21494 if (tmp != destptr)
21495 emit_move_insn (destptr, tmp);
21496 if (srcptr)
21497 {
21498 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21499 true, OPTAB_LIB_WIDEN);
21500 if (tmp != srcptr)
21501 emit_move_insn (srcptr, tmp);
21502 }
21503 emit_label (out_label);
21504 }
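
/* Illustrative sketch, not part of i386.c: the shape of the copy loop emitted
   above.  The byte count is rounded down to a multiple of the chunk size
   times the unroll factor (a power of two) and the remaining tail is left to
   the epilogue code.  Names are hypothetical.  */

static void
model_copy_loop (unsigned char *dest, const unsigned char *src,
                 unsigned long count, unsigned long chunk, unsigned long unroll)
{
  unsigned long piece = chunk * unroll;
  unsigned long size = count & ~(piece - 1);    /* piece_size_mask */
  unsigned long iter, i;

  for (iter = 0; iter < size; iter += piece)
    for (i = 0; i < piece; i++)                 /* unrolled in the real code */
      dest[iter + i] = src[iter + i];
  /* destptr/srcptr are then advanced by SIZE before the epilogue runs.  */
}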
21505
21506 /* Output a "rep; mov" instruction.
21507 Arguments have the same meaning as for the previous function. */
21508 static void
21509 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21510 rtx destptr, rtx srcptr,
21511 rtx count,
21512 enum machine_mode mode)
21513 {
21514 rtx destexp;
21515 rtx srcexp;
21516 rtx countreg;
21517 HOST_WIDE_INT rounded_count;
21518
21519 /* If the size is known, it is shorter to use rep movs. */
21520 if (mode == QImode && CONST_INT_P (count)
21521 && !(INTVAL (count) & 3))
21522 mode = SImode;
21523
21524 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21525 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21526 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21527 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21528 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21529 if (mode != QImode)
21530 {
21531 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21532 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21533 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21534 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21535 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21536 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21537 }
21538 else
21539 {
21540 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21541 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21542 }
21543 if (CONST_INT_P (count))
21544 {
21545 rounded_count = (INTVAL (count)
21546 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21547 destmem = shallow_copy_rtx (destmem);
21548 srcmem = shallow_copy_rtx (srcmem);
21549 set_mem_size (destmem, rounded_count);
21550 set_mem_size (srcmem, rounded_count);
21551 }
21552 else
21553 {
21554 if (MEM_SIZE_KNOWN_P (destmem))
21555 clear_mem_size (destmem);
21556 if (MEM_SIZE_KNOWN_P (srcmem))
21557 clear_mem_size (srcmem);
21558 }
21559 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21560 destexp, srcexp));
21561 }
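
/* Illustrative sketch, not part of i386.c: what the emitted "rep; mov"
   amounts to.  The byte count is scaled down to a chunk count (scale_counter)
   and both pointers advance by the bytes moved, which is what the
   DESTEXP/SRCEXP expressions describe to the rep_mov pattern.  Name is
   hypothetical.  */

static void
model_rep_movs (unsigned char *dest, const unsigned char *src,
                unsigned long count, unsigned long chunk_size)
{
  unsigned long chunks = count / chunk_size;    /* scale_counter */
  unsigned long i, bytes = chunks * chunk_size;

  for (i = 0; i < bytes; i++)
    dest[i] = src[i];
}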
21562
21563 /* Output a "rep; stos" instruction.
21564 Arguments have the same meaning as for the previous function. */
21565 static void
21566 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21567 rtx count, enum machine_mode mode,
21568 rtx orig_value)
21569 {
21570 rtx destexp;
21571 rtx countreg;
21572 HOST_WIDE_INT rounded_count;
21573
21574 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21575 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21576 value = force_reg (mode, gen_lowpart (mode, value));
21577 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21578 if (mode != QImode)
21579 {
21580 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21581 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21582 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21583 }
21584 else
21585 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21586 if (orig_value == const0_rtx && CONST_INT_P (count))
21587 {
21588 rounded_count = (INTVAL (count)
21589 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21590 destmem = shallow_copy_rtx (destmem);
21591 set_mem_size (destmem, rounded_count);
21592 }
21593 else if (MEM_SIZE_KNOWN_P (destmem))
21594 clear_mem_size (destmem);
21595 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21596 }
21597
21598 static void
21599 emit_strmov (rtx destmem, rtx srcmem,
21600 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21601 {
21602 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21603 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21604 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21605 }
21606
21607 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21608 static void
21609 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21610 rtx destptr, rtx srcptr, rtx count, int max_size)
21611 {
21612 rtx src, dest;
21613 if (CONST_INT_P (count))
21614 {
21615 HOST_WIDE_INT countval = INTVAL (count);
21616 int offset = 0;
21617
21618 if ((countval & 0x10) && max_size > 16)
21619 {
21620 if (TARGET_64BIT)
21621 {
21622 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21623 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21624 }
21625 else
21626 gcc_unreachable ();
21627 offset += 16;
21628 }
21629 if ((countval & 0x08) && max_size > 8)
21630 {
21631 if (TARGET_64BIT)
21632 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21633 else
21634 {
21635 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21636 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21637 }
21638 offset += 8;
21639 }
21640 if ((countval & 0x04) && max_size > 4)
21641 {
21642 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21643 offset += 4;
21644 }
21645 if ((countval & 0x02) && max_size > 2)
21646 {
21647 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21648 offset += 2;
21649 }
21650 if ((countval & 0x01) && max_size > 1)
21651 {
21652 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21653 offset += 1;
21654 }
21655 return;
21656 }
21657 if (max_size > 8)
21658 {
21659 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21660 count, 1, OPTAB_DIRECT);
21661 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21662 count, QImode, 1, 4);
21663 return;
21664 }
21665
21666 /* When there are stringops, we can cheaply increase dest and src pointers.
21667 Otherwise we save code size by maintaining an offset (zero is readily
21668 available from the preceding rep operation) and using x86 addressing modes.
21669 */
21670 if (TARGET_SINGLE_STRINGOP)
21671 {
21672 if (max_size > 4)
21673 {
21674 rtx label = ix86_expand_aligntest (count, 4, true);
21675 src = change_address (srcmem, SImode, srcptr);
21676 dest = change_address (destmem, SImode, destptr);
21677 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21678 emit_label (label);
21679 LABEL_NUSES (label) = 1;
21680 }
21681 if (max_size > 2)
21682 {
21683 rtx label = ix86_expand_aligntest (count, 2, true);
21684 src = change_address (srcmem, HImode, srcptr);
21685 dest = change_address (destmem, HImode, destptr);
21686 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21687 emit_label (label);
21688 LABEL_NUSES (label) = 1;
21689 }
21690 if (max_size > 1)
21691 {
21692 rtx label = ix86_expand_aligntest (count, 1, true);
21693 src = change_address (srcmem, QImode, srcptr);
21694 dest = change_address (destmem, QImode, destptr);
21695 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21696 emit_label (label);
21697 LABEL_NUSES (label) = 1;
21698 }
21699 }
21700 else
21701 {
21702 rtx offset = force_reg (Pmode, const0_rtx);
21703 rtx tmp;
21704
21705 if (max_size > 4)
21706 {
21707 rtx label = ix86_expand_aligntest (count, 4, true);
21708 src = change_address (srcmem, SImode, srcptr);
21709 dest = change_address (destmem, SImode, destptr);
21710 emit_move_insn (dest, src);
21711 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21712 true, OPTAB_LIB_WIDEN);
21713 if (tmp != offset)
21714 emit_move_insn (offset, tmp);
21715 emit_label (label);
21716 LABEL_NUSES (label) = 1;
21717 }
21718 if (max_size > 2)
21719 {
21720 rtx label = ix86_expand_aligntest (count, 2, true);
21721 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21722 src = change_address (srcmem, HImode, tmp);
21723 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21724 dest = change_address (destmem, HImode, tmp);
21725 emit_move_insn (dest, src);
21726 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21727 true, OPTAB_LIB_WIDEN);
21728 if (tmp != offset)
21729 emit_move_insn (offset, tmp);
21730 emit_label (label);
21731 LABEL_NUSES (label) = 1;
21732 }
21733 if (max_size > 1)
21734 {
21735 rtx label = ix86_expand_aligntest (count, 1, true);
21736 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21737 src = change_address (srcmem, QImode, tmp);
21738 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21739 dest = change_address (destmem, QImode, tmp);
21740 emit_move_insn (dest, src);
21741 emit_label (label);
21742 LABEL_NUSES (label) = 1;
21743 }
21744 }
21745 }
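
/* Illustrative sketch, not part of i386.c: the constant-COUNT branch of the
   epilogue above.  The low bits of the count say exactly which tail pieces
   remain, so they are copied unconditionally, largest first; in the real code
   the "if"s are resolved at expand time.  Name is hypothetical.  */

static void
model_movmem_epilogue (unsigned char *dest, const unsigned char *src,
                       unsigned long countval)
{
  unsigned long off = 0, i;

  if (countval & 4)
    {
      for (i = 0; i < 4; i++)
        dest[off + i] = src[off + i];
      off += 4;
    }
  if (countval & 2)
    {
      dest[off] = src[off];
      dest[off + 1] = src[off + 1];
      off += 2;
    }
  if (countval & 1)
    dest[off] = src[off];
}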
21746
21747 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21748 static void
21749 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21750 rtx count, int max_size)
21751 {
21752 count =
21753 expand_simple_binop (counter_mode (count), AND, count,
21754 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21755 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21756 gen_lowpart (QImode, value), count, QImode,
21757 1, max_size / 2);
21758 }
21759
21760 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21761 static void
21762 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21763 {
21764 rtx dest;
21765
21766 if (CONST_INT_P (count))
21767 {
21768 HOST_WIDE_INT countval = INTVAL (count);
21769 int offset = 0;
21770
21771 if ((countval & 0x10) && max_size > 16)
21772 {
21773 if (TARGET_64BIT)
21774 {
21775 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21776 emit_insn (gen_strset (destptr, dest, value));
21777 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21778 emit_insn (gen_strset (destptr, dest, value));
21779 }
21780 else
21781 gcc_unreachable ();
21782 offset += 16;
21783 }
21784 if ((countval & 0x08) && max_size > 8)
21785 {
21786 if (TARGET_64BIT)
21787 {
21788 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21789 emit_insn (gen_strset (destptr, dest, value));
21790 }
21791 else
21792 {
21793 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21794 emit_insn (gen_strset (destptr, dest, value));
21795 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21796 emit_insn (gen_strset (destptr, dest, value));
21797 }
21798 offset += 8;
21799 }
21800 if ((countval & 0x04) && max_size > 4)
21801 {
21802 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21803 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21804 offset += 4;
21805 }
21806 if ((countval & 0x02) && max_size > 2)
21807 {
21808 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21809 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21810 offset += 2;
21811 }
21812 if ((countval & 0x01) && max_size > 1)
21813 {
21814 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21815 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21816 offset += 1;
21817 }
21818 return;
21819 }
21820 if (max_size > 32)
21821 {
21822 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21823 return;
21824 }
21825 if (max_size > 16)
21826 {
21827 rtx label = ix86_expand_aligntest (count, 16, true);
21828 if (TARGET_64BIT)
21829 {
21830 dest = change_address (destmem, DImode, destptr);
21831 emit_insn (gen_strset (destptr, dest, value));
21832 emit_insn (gen_strset (destptr, dest, value));
21833 }
21834 else
21835 {
21836 dest = change_address (destmem, SImode, destptr);
21837 emit_insn (gen_strset (destptr, dest, value));
21838 emit_insn (gen_strset (destptr, dest, value));
21839 emit_insn (gen_strset (destptr, dest, value));
21840 emit_insn (gen_strset (destptr, dest, value));
21841 }
21842 emit_label (label);
21843 LABEL_NUSES (label) = 1;
21844 }
21845 if (max_size > 8)
21846 {
21847 rtx label = ix86_expand_aligntest (count, 8, true);
21848 if (TARGET_64BIT)
21849 {
21850 dest = change_address (destmem, DImode, destptr);
21851 emit_insn (gen_strset (destptr, dest, value));
21852 }
21853 else
21854 {
21855 dest = change_address (destmem, SImode, destptr);
21856 emit_insn (gen_strset (destptr, dest, value));
21857 emit_insn (gen_strset (destptr, dest, value));
21858 }
21859 emit_label (label);
21860 LABEL_NUSES (label) = 1;
21861 }
21862 if (max_size > 4)
21863 {
21864 rtx label = ix86_expand_aligntest (count, 4, true);
21865 dest = change_address (destmem, SImode, destptr);
21866 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21867 emit_label (label);
21868 LABEL_NUSES (label) = 1;
21869 }
21870 if (max_size > 2)
21871 {
21872 rtx label = ix86_expand_aligntest (count, 2, true);
21873 dest = change_address (destmem, HImode, destptr);
21874 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21875 emit_label (label);
21876 LABEL_NUSES (label) = 1;
21877 }
21878 if (max_size > 1)
21879 {
21880 rtx label = ix86_expand_aligntest (count, 1, true);
21881 dest = change_address (destmem, QImode, destptr);
21882 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21883 emit_label (label);
21884 LABEL_NUSES (label) = 1;
21885 }
21886 }
21887
21888 /* Copy enough bytes from SRC to DEST to align DEST, which is known to be
21889    aligned to ALIGN, up to DESIRED_ALIGNMENT. */
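/* For example, with ALIGN 1 and DESIRED_ALIGNMENT 8 this emits three guarded
   steps: a 1-byte copy if the destination is odd, a 2-byte copy if it is not
   yet 4-byte aligned, and a 4-byte copy if it is not yet 8-byte aligned,
   adjusting COUNT after each step.  */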
21890 static void
21891 expand_movmem_prologue (rtx destmem, rtx srcmem,
21892 rtx destptr, rtx srcptr, rtx count,
21893 int align, int desired_alignment)
21894 {
21895 if (align <= 1 && desired_alignment > 1)
21896 {
21897 rtx label = ix86_expand_aligntest (destptr, 1, false);
21898 srcmem = change_address (srcmem, QImode, srcptr);
21899 destmem = change_address (destmem, QImode, destptr);
21900 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21901 ix86_adjust_counter (count, 1);
21902 emit_label (label);
21903 LABEL_NUSES (label) = 1;
21904 }
21905 if (align <= 2 && desired_alignment > 2)
21906 {
21907 rtx label = ix86_expand_aligntest (destptr, 2, false);
21908 srcmem = change_address (srcmem, HImode, srcptr);
21909 destmem = change_address (destmem, HImode, destptr);
21910 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21911 ix86_adjust_counter (count, 2);
21912 emit_label (label);
21913 LABEL_NUSES (label) = 1;
21914 }
21915 if (align <= 4 && desired_alignment > 4)
21916 {
21917 rtx label = ix86_expand_aligntest (destptr, 4, false);
21918 srcmem = change_address (srcmem, SImode, srcptr);
21919 destmem = change_address (destmem, SImode, destptr);
21920 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21921 ix86_adjust_counter (count, 4);
21922 emit_label (label);
21923 LABEL_NUSES (label) = 1;
21924 }
21925 gcc_assert (desired_alignment <= 8);
21926 }
21927
21928 /* Copy enough bytes from *SRCP to DST to align DST to DESIRED_ALIGN.
21929    ALIGN_BYTES is how many bytes need to be copied. */
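/* For example, with DESIRED_ALIGN 8 and ALIGN_BYTES 5 (bits 0 and 2 set),
   one 1-byte and one 4-byte copy are emitted below, and the remaining BLKmode
   references are then marked as 8-byte aligned.  */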
21930 static rtx
21931 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21932 int desired_align, int align_bytes)
21933 {
21934 rtx src = *srcp;
21935 rtx orig_dst = dst;
21936 rtx orig_src = src;
21937 int off = 0;
21938 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21939 if (src_align_bytes >= 0)
21940 src_align_bytes = desired_align - src_align_bytes;
21941 if (align_bytes & 1)
21942 {
21943 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21944 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21945 off = 1;
21946 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21947 }
21948 if (align_bytes & 2)
21949 {
21950 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21951 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21952 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21953 set_mem_align (dst, 2 * BITS_PER_UNIT);
21954 if (src_align_bytes >= 0
21955 && (src_align_bytes & 1) == (align_bytes & 1)
21956 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21957 set_mem_align (src, 2 * BITS_PER_UNIT);
21958 off = 2;
21959 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21960 }
21961 if (align_bytes & 4)
21962 {
21963 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21964 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21965 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21966 set_mem_align (dst, 4 * BITS_PER_UNIT);
21967 if (src_align_bytes >= 0)
21968 {
21969 unsigned int src_align = 0;
21970 if ((src_align_bytes & 3) == (align_bytes & 3))
21971 src_align = 4;
21972 else if ((src_align_bytes & 1) == (align_bytes & 1))
21973 src_align = 2;
21974 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21975 set_mem_align (src, src_align * BITS_PER_UNIT);
21976 }
21977 off = 4;
21978 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21979 }
21980 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21981 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21982 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21983 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21984 if (src_align_bytes >= 0)
21985 {
21986 unsigned int src_align = 0;
21987 if ((src_align_bytes & 7) == (align_bytes & 7))
21988 src_align = 8;
21989 else if ((src_align_bytes & 3) == (align_bytes & 3))
21990 src_align = 4;
21991 else if ((src_align_bytes & 1) == (align_bytes & 1))
21992 src_align = 2;
21993 if (src_align > (unsigned int) desired_align)
21994 src_align = desired_align;
21995 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21996 set_mem_align (src, src_align * BITS_PER_UNIT);
21997 }
21998 if (MEM_SIZE_KNOWN_P (orig_dst))
21999 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22000 if (MEM_SIZE_KNOWN_P (orig_src))
22001 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22002 *srcp = src;
22003 return dst;
22004 }
22005
22006 /* Set enough bytes of DEST to align DEST, which is known to be aligned
22007    to ALIGN, up to DESIRED_ALIGNMENT. */
22008 static void
22009 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22010 int align, int desired_alignment)
22011 {
22012 if (align <= 1 && desired_alignment > 1)
22013 {
22014 rtx label = ix86_expand_aligntest (destptr, 1, false);
22015 destmem = change_address (destmem, QImode, destptr);
22016 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22017 ix86_adjust_counter (count, 1);
22018 emit_label (label);
22019 LABEL_NUSES (label) = 1;
22020 }
22021 if (align <= 2 && desired_alignment > 2)
22022 {
22023 rtx label = ix86_expand_aligntest (destptr, 2, false);
22024 destmem = change_address (destmem, HImode, destptr);
22025 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22026 ix86_adjust_counter (count, 2);
22027 emit_label (label);
22028 LABEL_NUSES (label) = 1;
22029 }
22030 if (align <= 4 && desired_alignment > 4)
22031 {
22032 rtx label = ix86_expand_aligntest (destptr, 4, false);
22033 destmem = change_address (destmem, SImode, destptr);
22034 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22035 ix86_adjust_counter (count, 4);
22036 emit_label (label);
22037 LABEL_NUSES (label) = 1;
22038 }
22039 gcc_assert (desired_alignment <= 8);
22040 }
22041
22042 /* Set enough bytes of DST to align DST to DESIRED_ALIGN.  ALIGN_BYTES is
22043    how many bytes need to be stored. */
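/* For example, ALIGN_BYTES 7 yields a 1-byte, a 2-byte and a 4-byte store
   below, after which DST is marked as aligned to DESIRED_ALIGN.  */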
22044 static rtx
22045 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22046 int desired_align, int align_bytes)
22047 {
22048 int off = 0;
22049 rtx orig_dst = dst;
22050 if (align_bytes & 1)
22051 {
22052 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22053 off = 1;
22054 emit_insn (gen_strset (destreg, dst,
22055 gen_lowpart (QImode, value)));
22056 }
22057 if (align_bytes & 2)
22058 {
22059 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22060 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22061 set_mem_align (dst, 2 * BITS_PER_UNIT);
22062 off = 2;
22063 emit_insn (gen_strset (destreg, dst,
22064 gen_lowpart (HImode, value)));
22065 }
22066 if (align_bytes & 4)
22067 {
22068 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22069 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22070 set_mem_align (dst, 4 * BITS_PER_UNIT);
22071 off = 4;
22072 emit_insn (gen_strset (destreg, dst,
22073 gen_lowpart (SImode, value)));
22074 }
22075 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22076 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22077 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22078 if (MEM_SIZE_KNOWN_P (orig_dst))
22079 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22080 return dst;
22081 }
22082
22083 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22084 static enum stringop_alg
22085 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22086 int *dynamic_check)
22087 {
22088 const struct stringop_algs * algs;
22089 bool optimize_for_speed;
22090 /* Algorithms using the rep prefix want at least edi and ecx;
22091 additionally, memset wants eax and memcpy wants esi. Don't
22092 consider such algorithms if the user has appropriated those
22093 registers for their own purposes. */
22094 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22095 || (memset
22096 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22097
22098 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22099 || (alg != rep_prefix_1_byte \
22100 && alg != rep_prefix_4_byte \
22101 && alg != rep_prefix_8_byte))
22102 const struct processor_costs *cost;
22103
22104 /* Even if the string operation call is cold, we still might spend a lot
22105 of time processing large blocks. */
22106 if (optimize_function_for_size_p (cfun)
22107 || (optimize_insn_for_size_p ()
22108 && expected_size != -1 && expected_size < 256))
22109 optimize_for_speed = false;
22110 else
22111 optimize_for_speed = true;
22112
22113 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22114
22115 *dynamic_check = -1;
22116 if (memset)
22117 algs = &cost->memset[TARGET_64BIT != 0];
22118 else
22119 algs = &cost->memcpy[TARGET_64BIT != 0];
22120 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22121 return ix86_stringop_alg;
22122 /* rep; movq or rep; movl is the smallest variant. */
22123 else if (!optimize_for_speed)
22124 {
22125 if (!count || (count & 3))
22126 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22127 else
22128 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22129 }
22130 /* Very tiny blocks are best handled via the loop; REP is expensive to
22131    set up. */
22132 else if (expected_size != -1 && expected_size < 4)
22133 return loop_1_byte;
22134 else if (expected_size != -1)
22135 {
22136 unsigned int i;
22137 enum stringop_alg alg = libcall;
22138 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22139 {
22140 /* We get here if the algorithms that were not libcall-based
22141 were rep-prefix based and we are unable to use rep prefixes
22142 based on global register usage. Break out of the loop and
22143 use the heuristic below. */
22144 if (algs->size[i].max == 0)
22145 break;
22146 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22147 {
22148 enum stringop_alg candidate = algs->size[i].alg;
22149
22150 if (candidate != libcall && ALG_USABLE_P (candidate))
22151 alg = candidate;
22152 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22153 last non-libcall inline algorithm. */
22154 if (TARGET_INLINE_ALL_STRINGOPS)
22155 {
22156 /* When the current size is best to be copied by a libcall,
22157 but we are still forced to inline, run the heuristic below
22158 that will pick code for medium sized blocks. */
22159 if (alg != libcall)
22160 return alg;
22161 break;
22162 }
22163 else if (ALG_USABLE_P (candidate))
22164 return candidate;
22165 }
22166 }
22167 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22168 }
22169 /* When asked to inline the call anyway, try to pick a meaningful choice.
22170    We look for the maximal block size that is faster to copy by hand and
22171    take blocks of at most that size, guessing that the average size will
22172    be roughly half of the block.
22173
22174 If this turns out to be bad, we might simply specify the preferred
22175 choice in ix86_costs. */
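  /* As a rough example: if the cost table prefers unrolled_loop for blocks up
     to 256 bytes and libcall beyond that, MAX below ends up as 256, the
     recursive call picks an algorithm for an expected size of 128, and with
     -minline-stringops-dynamically DYNAMIC_CHECK is set to 256 so that larger
     blocks still go through the library call at run time.  */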
22176 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22177 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22178 {
22179 int max = -1;
22180 enum stringop_alg alg;
22181 int i;
22182 bool any_alg_usable_p = true;
22183
22184 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22185 {
22186 enum stringop_alg candidate = algs->size[i].alg;
22187 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22188
22189 if (candidate != libcall && candidate
22190 && ALG_USABLE_P (candidate))
22191 max = algs->size[i].max;
22192 }
22193 /* If there aren't any usable algorithms, then recursing on
22194 smaller sizes isn't going to find anything. Just return the
22195 simple byte-at-a-time copy loop. */
22196 if (!any_alg_usable_p)
22197 {
22198 /* Pick something reasonable. */
22199 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22200 *dynamic_check = 128;
22201 return loop_1_byte;
22202 }
22203 if (max == -1)
22204 max = 4096;
22205 alg = decide_alg (count, max / 2, memset, dynamic_check);
22206 gcc_assert (*dynamic_check == -1);
22207 gcc_assert (alg != libcall);
22208 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22209 *dynamic_check = max;
22210 return alg;
22211 }
22212 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22213 #undef ALG_USABLE_P
22214 }
22215
22216 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22217 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22218 static int
22219 decide_alignment (int align,
22220 enum stringop_alg alg,
22221 int expected_size)
22222 {
22223 int desired_align = 0;
22224 switch (alg)
22225 {
22226 case no_stringop:
22227 gcc_unreachable ();
22228 case loop:
22229 case unrolled_loop:
22230 desired_align = GET_MODE_SIZE (Pmode);
22231 break;
22232 case rep_prefix_8_byte:
22233 desired_align = 8;
22234 break;
22235 case rep_prefix_4_byte:
22236       /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22237 	 copying whole cachelines at once. */
22238 if (TARGET_PENTIUMPRO)
22239 desired_align = 8;
22240 else
22241 desired_align = 4;
22242 break;
22243 case rep_prefix_1_byte:
22244       /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22245 	 copying whole cachelines at once. */
22246 if (TARGET_PENTIUMPRO)
22247 desired_align = 8;
22248 else
22249 desired_align = 1;
22250 break;
22251 case loop_1_byte:
22252 desired_align = 1;
22253 break;
22254 case libcall:
22255 return 0;
22256 }
22257
22258 if (optimize_size)
22259 desired_align = 1;
22260 if (desired_align < align)
22261 desired_align = align;
22262 if (expected_size != -1 && expected_size < 4)
22263 desired_align = align;
22264 return desired_align;
22265 }
22266
22267 /* Return the smallest power of 2 greater than VAL. */
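/* E.g. smallest_pow2_greater_than (5) == 8 and
   smallest_pow2_greater_than (8) == 16; the result is always strictly
   greater than VAL.  */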
22268 static int
22269 smallest_pow2_greater_than (int val)
22270 {
22271 int ret = 1;
22272 while (ret <= val)
22273 ret <<= 1;
22274 return ret;
22275 }
22276
22277 /* Expand string move (memcpy) operation. Use i386 string operations
22278 when profitable. expand_setmem contains similar code. The code
22279 depends upon architecture, block size and alignment, but always has
22280 the same overall structure:
22281
22282 1) Prologue guard: Conditional that jumps up to epilogues for small
22283 blocks that can be handled by epilogue alone. This is faster
22284       but also needed for correctness, since the prologue assumes the block
22285 is larger than the desired alignment.
22286
22287 Optional dynamic check for size and libcall for large
22288 blocks is emitted here too, with -minline-stringops-dynamically.
22289
22290 2) Prologue: copy first few bytes in order to get destination
22291 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22292 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22293 copied. We emit either a jump tree on power of two sized
22294 blocks, or a byte loop.
22295
22296 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22297 with specified algorithm.
22298
22299 4) Epilogue: code copying tail of the block that is too small to be
22300 handled by main body (or up to size guarded by prologue guard). */
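/* Roughly, for a copy whose size is not known at compile time the emitted
   code has this shape:

       cmp  count, epilogue_size_needed   (1) prologue guard
       jb   epilogue
       ...                                (2) align destination, adjust count
       ...                                (3) main loop or rep-prefixed move
   epilogue:
       ...                                (4) copy the remaining tail bytes  */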
22301
22302 bool
22303 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22304 rtx expected_align_exp, rtx expected_size_exp)
22305 {
22306 rtx destreg;
22307 rtx srcreg;
22308 rtx label = NULL;
22309 rtx tmp;
22310 rtx jump_around_label = NULL;
22311 HOST_WIDE_INT align = 1;
22312 unsigned HOST_WIDE_INT count = 0;
22313 HOST_WIDE_INT expected_size = -1;
22314 int size_needed = 0, epilogue_size_needed;
22315 int desired_align = 0, align_bytes = 0;
22316 enum stringop_alg alg;
22317 int dynamic_check;
22318 bool need_zero_guard = false;
22319
22320 if (CONST_INT_P (align_exp))
22321 align = INTVAL (align_exp);
22322   /* i386 can do misaligned access at a reasonably increased cost. */
22323 if (CONST_INT_P (expected_align_exp)
22324 && INTVAL (expected_align_exp) > align)
22325 align = INTVAL (expected_align_exp);
22326 /* ALIGN is the minimum of destination and source alignment, but we care here
22327 just about destination alignment. */
22328 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22329 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22330
22331 if (CONST_INT_P (count_exp))
22332 count = expected_size = INTVAL (count_exp);
22333 if (CONST_INT_P (expected_size_exp) && count == 0)
22334 expected_size = INTVAL (expected_size_exp);
22335
22336 /* Make sure we don't need to care about overflow later on. */
22337 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22338 return false;
22339
22340 /* Step 0: Decide on preferred algorithm, desired alignment and
22341 size of chunks to be copied by main loop. */
22342
22343 alg = decide_alg (count, expected_size, false, &dynamic_check);
22344 desired_align = decide_alignment (align, alg, expected_size);
22345
22346 if (!TARGET_ALIGN_STRINGOPS)
22347 align = desired_align;
22348
22349 if (alg == libcall)
22350 return false;
22351 gcc_assert (alg != no_stringop);
22352 if (!count)
22353 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22354 destreg = copy_addr_to_reg (XEXP (dst, 0));
22355 srcreg = copy_addr_to_reg (XEXP (src, 0));
22356 switch (alg)
22357 {
22358 case libcall:
22359 case no_stringop:
22360 gcc_unreachable ();
22361 case loop:
22362 need_zero_guard = true;
22363 size_needed = GET_MODE_SIZE (word_mode);
22364 break;
22365 case unrolled_loop:
22366 need_zero_guard = true;
22367 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22368 break;
22369 case rep_prefix_8_byte:
22370 size_needed = 8;
22371 break;
22372 case rep_prefix_4_byte:
22373 size_needed = 4;
22374 break;
22375 case rep_prefix_1_byte:
22376 size_needed = 1;
22377 break;
22378 case loop_1_byte:
22379 need_zero_guard = true;
22380 size_needed = 1;
22381 break;
22382 }
22383
22384 epilogue_size_needed = size_needed;
22385
22386 /* Step 1: Prologue guard. */
22387
22388 /* Alignment code needs count to be in register. */
22389 if (CONST_INT_P (count_exp) && desired_align > align)
22390 {
22391 if (INTVAL (count_exp) > desired_align
22392 && INTVAL (count_exp) > size_needed)
22393 {
22394 align_bytes
22395 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22396 if (align_bytes <= 0)
22397 align_bytes = 0;
22398 else
22399 align_bytes = desired_align - align_bytes;
22400 }
22401 if (align_bytes == 0)
22402 count_exp = force_reg (counter_mode (count_exp), count_exp);
22403 }
22404 gcc_assert (desired_align >= 1 && align >= 1);
22405
22406 /* Ensure that alignment prologue won't copy past end of block. */
22407 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22408 {
22409 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22410       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22411 	 Make sure it is a power of 2. */
22412 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22413
22414 if (count)
22415 {
22416 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22417 {
22418 /* If main algorithm works on QImode, no epilogue is needed.
22419 For small sizes just don't align anything. */
22420 if (size_needed == 1)
22421 desired_align = align;
22422 else
22423 goto epilogue;
22424 }
22425 }
22426 else
22427 {
22428 label = gen_label_rtx ();
22429 emit_cmp_and_jump_insns (count_exp,
22430 GEN_INT (epilogue_size_needed),
22431 LTU, 0, counter_mode (count_exp), 1, label);
22432 if (expected_size == -1 || expected_size < epilogue_size_needed)
22433 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22434 else
22435 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22436 }
22437 }
22438
22439   /* Emit code to decide at runtime whether a library call or inline code
22440      should be used. */
22441 if (dynamic_check != -1)
22442 {
22443 if (CONST_INT_P (count_exp))
22444 {
22445 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22446 {
22447 emit_block_move_via_libcall (dst, src, count_exp, false);
22448 count_exp = const0_rtx;
22449 goto epilogue;
22450 }
22451 }
22452 else
22453 {
22454 rtx hot_label = gen_label_rtx ();
22455 jump_around_label = gen_label_rtx ();
22456 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22457 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22458 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22459 emit_block_move_via_libcall (dst, src, count_exp, false);
22460 emit_jump (jump_around_label);
22461 emit_label (hot_label);
22462 }
22463 }
22464
22465 /* Step 2: Alignment prologue. */
22466
22467 if (desired_align > align)
22468 {
22469 if (align_bytes == 0)
22470 {
22471 	  /* Except for the first move in the epilogue, we no longer know the
22472 	     constant offset in aliasing info.  It doesn't seem worth the
22473 	     pain to maintain it for the first move, so throw away
22474 	     the info early. */
22475 src = change_address (src, BLKmode, srcreg);
22476 dst = change_address (dst, BLKmode, destreg);
22477 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22478 desired_align);
22479 }
22480 else
22481 {
22482 /* If we know how many bytes need to be stored before dst is
22483 sufficiently aligned, maintain aliasing info accurately. */
22484 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22485 desired_align, align_bytes);
22486 count_exp = plus_constant (counter_mode (count_exp),
22487 count_exp, -align_bytes);
22488 count -= align_bytes;
22489 }
22490 if (need_zero_guard
22491 && (count < (unsigned HOST_WIDE_INT) size_needed
22492 || (align_bytes == 0
22493 && count < ((unsigned HOST_WIDE_INT) size_needed
22494 + desired_align - align))))
22495 {
22496 /* It is possible that we copied enough so the main loop will not
22497 execute. */
22498 gcc_assert (size_needed > 1);
22499 if (label == NULL_RTX)
22500 label = gen_label_rtx ();
22501 emit_cmp_and_jump_insns (count_exp,
22502 GEN_INT (size_needed),
22503 LTU, 0, counter_mode (count_exp), 1, label);
22504 if (expected_size == -1
22505 || expected_size < (desired_align - align) / 2 + size_needed)
22506 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22507 else
22508 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22509 }
22510 }
22511 if (label && size_needed == 1)
22512 {
22513 emit_label (label);
22514 LABEL_NUSES (label) = 1;
22515 label = NULL;
22516 epilogue_size_needed = 1;
22517 }
22518 else if (label == NULL_RTX)
22519 epilogue_size_needed = size_needed;
22520
22521 /* Step 3: Main loop. */
22522
22523 switch (alg)
22524 {
22525 case libcall:
22526 case no_stringop:
22527 gcc_unreachable ();
22528 case loop_1_byte:
22529 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22530 count_exp, QImode, 1, expected_size);
22531 break;
22532 case loop:
22533 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22534 count_exp, word_mode, 1, expected_size);
22535 break;
22536 case unrolled_loop:
22537 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22538 registers for 4 temporaries anyway. */
22539 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22540 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22541 expected_size);
22542 break;
22543 case rep_prefix_8_byte:
22544 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22545 DImode);
22546 break;
22547 case rep_prefix_4_byte:
22548 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22549 SImode);
22550 break;
22551 case rep_prefix_1_byte:
22552 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22553 QImode);
22554 break;
22555 }
22556   /* Properly adjust the offsets of the src and dest memory for aliasing. */
22557 if (CONST_INT_P (count_exp))
22558 {
22559 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22560 (count / size_needed) * size_needed);
22561 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22562 (count / size_needed) * size_needed);
22563 }
22564 else
22565 {
22566 src = change_address (src, BLKmode, srcreg);
22567 dst = change_address (dst, BLKmode, destreg);
22568 }
22569
22570 /* Step 4: Epilogue to copy the remaining bytes. */
22571 epilogue:
22572 if (label)
22573 {
22574       /* When the main loop is done, COUNT_EXP might hold the original count,
22575 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22576 	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22577 	 bytes.  Compensate if needed. */
22578
22579 if (size_needed < epilogue_size_needed)
22580 {
22581 tmp =
22582 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22583 GEN_INT (size_needed - 1), count_exp, 1,
22584 OPTAB_DIRECT);
22585 if (tmp != count_exp)
22586 emit_move_insn (count_exp, tmp);
22587 }
22588 emit_label (label);
22589 LABEL_NUSES (label) = 1;
22590 }
22591
22592 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22593 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22594 epilogue_size_needed);
22595 if (jump_around_label)
22596 emit_label (jump_around_label);
22597 return true;
22598 }
22599
22600 /* Helper function for memset.  For the QImode value 0xXY produce
22601    0xXYXYXYXY of the width specified by MODE.  This is essentially
22602    a * 0x01010101, but we can do slightly better than
22603 synth_mult by unwinding the sequence by hand on CPUs with
22604 slow multiply. */
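/* For example, promoting the constant 0x41 to DImode yields the immediate
   0x4141414141414141 directly, while a non-constant QImode value is widened
   and then replicated with a few insv/shift-or steps:
     reg |= reg << 8;  reg |= reg << 16;  reg |= reg << 32;  */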
22605 static rtx
22606 promote_duplicated_reg (enum machine_mode mode, rtx val)
22607 {
22608 enum machine_mode valmode = GET_MODE (val);
22609 rtx tmp;
22610 int nops = mode == DImode ? 3 : 2;
22611
22612 gcc_assert (mode == SImode || mode == DImode);
22613 if (val == const0_rtx)
22614 return copy_to_mode_reg (mode, const0_rtx);
22615 if (CONST_INT_P (val))
22616 {
22617 HOST_WIDE_INT v = INTVAL (val) & 255;
22618
22619 v |= v << 8;
22620 v |= v << 16;
22621 if (mode == DImode)
22622 v |= (v << 16) << 16;
22623 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22624 }
22625
22626 if (valmode == VOIDmode)
22627 valmode = QImode;
22628 if (valmode != QImode)
22629 val = gen_lowpart (QImode, val);
22630 if (mode == QImode)
22631 return val;
22632 if (!TARGET_PARTIAL_REG_STALL)
22633 nops--;
22634 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22635 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22636 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22637 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22638 {
22639 rtx reg = convert_modes (mode, QImode, val, true);
22640 tmp = promote_duplicated_reg (mode, const1_rtx);
22641 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22642 OPTAB_DIRECT);
22643 }
22644 else
22645 {
22646 rtx reg = convert_modes (mode, QImode, val, true);
22647
22648 if (!TARGET_PARTIAL_REG_STALL)
22649 if (mode == SImode)
22650 emit_insn (gen_movsi_insv_1 (reg, reg));
22651 else
22652 emit_insn (gen_movdi_insv_1 (reg, reg));
22653 else
22654 {
22655 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22656 NULL, 1, OPTAB_DIRECT);
22657 reg =
22658 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22659 }
22660 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22661 NULL, 1, OPTAB_DIRECT);
22662 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22663 if (mode == SImode)
22664 return reg;
22665 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22666 NULL, 1, OPTAB_DIRECT);
22667 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22668 return reg;
22669 }
22670 }
22671
22672 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22673 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22674 alignment from ALIGN to DESIRED_ALIGN. */
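/* For instance, a main loop storing 8-byte chunks on a 64-bit target gets a
   DImode replica of VAL, while a QImode main loop whose prologue only needs
   to reach 2-byte alignment gets an HImode replica.  */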
22675 static rtx
22676 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22677 {
22678 rtx promoted_val;
22679
22680 if (TARGET_64BIT
22681 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22682 promoted_val = promote_duplicated_reg (DImode, val);
22683 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22684 promoted_val = promote_duplicated_reg (SImode, val);
22685 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22686 promoted_val = promote_duplicated_reg (HImode, val);
22687 else
22688 promoted_val = val;
22689
22690 return promoted_val;
22691 }
22692
22693 /* Expand string set operation (memset).  Use i386 string operations when
22694 profitable. See expand_movmem comment for explanation of individual
22695 steps performed. */
22696 bool
22697 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22698 rtx expected_align_exp, rtx expected_size_exp)
22699 {
22700 rtx destreg;
22701 rtx label = NULL;
22702 rtx tmp;
22703 rtx jump_around_label = NULL;
22704 HOST_WIDE_INT align = 1;
22705 unsigned HOST_WIDE_INT count = 0;
22706 HOST_WIDE_INT expected_size = -1;
22707 int size_needed = 0, epilogue_size_needed;
22708 int desired_align = 0, align_bytes = 0;
22709 enum stringop_alg alg;
22710 rtx promoted_val = NULL;
22711 bool force_loopy_epilogue = false;
22712 int dynamic_check;
22713 bool need_zero_guard = false;
22714
22715 if (CONST_INT_P (align_exp))
22716 align = INTVAL (align_exp);
22717   /* i386 can do misaligned access at a reasonably increased cost. */
22718 if (CONST_INT_P (expected_align_exp)
22719 && INTVAL (expected_align_exp) > align)
22720 align = INTVAL (expected_align_exp);
22721 if (CONST_INT_P (count_exp))
22722 count = expected_size = INTVAL (count_exp);
22723 if (CONST_INT_P (expected_size_exp) && count == 0)
22724 expected_size = INTVAL (expected_size_exp);
22725
22726 /* Make sure we don't need to care about overflow later on. */
22727 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22728 return false;
22729
22730 /* Step 0: Decide on preferred algorithm, desired alignment and
22731 size of chunks to be copied by main loop. */
22732
22733 alg = decide_alg (count, expected_size, true, &dynamic_check);
22734 desired_align = decide_alignment (align, alg, expected_size);
22735
22736 if (!TARGET_ALIGN_STRINGOPS)
22737 align = desired_align;
22738
22739 if (alg == libcall)
22740 return false;
22741 gcc_assert (alg != no_stringop);
22742 if (!count)
22743 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22744 destreg = copy_addr_to_reg (XEXP (dst, 0));
22745 switch (alg)
22746 {
22747 case libcall:
22748 case no_stringop:
22749 gcc_unreachable ();
22750 case loop:
22751 need_zero_guard = true;
22752 size_needed = GET_MODE_SIZE (word_mode);
22753 break;
22754 case unrolled_loop:
22755 need_zero_guard = true;
22756 size_needed = GET_MODE_SIZE (word_mode) * 4;
22757 break;
22758 case rep_prefix_8_byte:
22759 size_needed = 8;
22760 break;
22761 case rep_prefix_4_byte:
22762 size_needed = 4;
22763 break;
22764 case rep_prefix_1_byte:
22765 size_needed = 1;
22766 break;
22767 case loop_1_byte:
22768 need_zero_guard = true;
22769 size_needed = 1;
22770 break;
22771 }
22772 epilogue_size_needed = size_needed;
22773
22774 /* Step 1: Prologue guard. */
22775
22776 /* Alignment code needs count to be in register. */
22777 if (CONST_INT_P (count_exp) && desired_align > align)
22778 {
22779 if (INTVAL (count_exp) > desired_align
22780 && INTVAL (count_exp) > size_needed)
22781 {
22782 align_bytes
22783 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22784 if (align_bytes <= 0)
22785 align_bytes = 0;
22786 else
22787 align_bytes = desired_align - align_bytes;
22788 }
22789 if (align_bytes == 0)
22790 {
22791 enum machine_mode mode = SImode;
22792 if (TARGET_64BIT && (count & ~0xffffffff))
22793 mode = DImode;
22794 count_exp = force_reg (mode, count_exp);
22795 }
22796 }
22797 /* Do the cheap promotion to allow better CSE across the
22798      main loop and epilogue (i.e. one load of the big constant in
22799      front of all the code). */
22800 if (CONST_INT_P (val_exp))
22801 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22802 desired_align, align);
22803 /* Ensure that alignment prologue won't copy past end of block. */
22804 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22805 {
22806 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22807 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22808 	 Make sure it is a power of 2. */
22809 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22810
22811 /* To improve performance of small blocks, we jump around the VAL
22812 	 promotion code.  This means that if the promoted VAL is not constant,
22813 	 we might not use it in the epilogue and have to use the byte
22814 	 loop variant. */
22815 if (epilogue_size_needed > 2 && !promoted_val)
22816 force_loopy_epilogue = true;
22817 if (count)
22818 {
22819 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22820 {
22821 /* If main algorithm works on QImode, no epilogue is needed.
22822 For small sizes just don't align anything. */
22823 if (size_needed == 1)
22824 desired_align = align;
22825 else
22826 goto epilogue;
22827 }
22828 }
22829 else
22830 {
22831 label = gen_label_rtx ();
22832 emit_cmp_and_jump_insns (count_exp,
22833 GEN_INT (epilogue_size_needed),
22834 LTU, 0, counter_mode (count_exp), 1, label);
22835 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22836 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22837 else
22838 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22839 }
22840 }
22841 if (dynamic_check != -1)
22842 {
22843 rtx hot_label = gen_label_rtx ();
22844 jump_around_label = gen_label_rtx ();
22845 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22846 LEU, 0, counter_mode (count_exp), 1, hot_label);
22847 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22848 set_storage_via_libcall (dst, count_exp, val_exp, false);
22849 emit_jump (jump_around_label);
22850 emit_label (hot_label);
22851 }
22852
22853 /* Step 2: Alignment prologue. */
22854
22855 /* Do the expensive promotion once we branched off the small blocks. */
22856 if (!promoted_val)
22857 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22858 desired_align, align);
22859 gcc_assert (desired_align >= 1 && align >= 1);
22860
22861 if (desired_align > align)
22862 {
22863 if (align_bytes == 0)
22864 {
22865 	  /* Except for the first move in the epilogue, we no longer know the
22866 	     constant offset in aliasing info.  It doesn't seem worth the
22867 	     pain to maintain it for the first move, so throw away
22868 	     the info early. */
22869 dst = change_address (dst, BLKmode, destreg);
22870 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22871 desired_align);
22872 }
22873 else
22874 {
22875 /* If we know how many bytes need to be stored before dst is
22876 sufficiently aligned, maintain aliasing info accurately. */
22877 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22878 desired_align, align_bytes);
22879 count_exp = plus_constant (counter_mode (count_exp),
22880 count_exp, -align_bytes);
22881 count -= align_bytes;
22882 }
22883 if (need_zero_guard
22884 && (count < (unsigned HOST_WIDE_INT) size_needed
22885 || (align_bytes == 0
22886 && count < ((unsigned HOST_WIDE_INT) size_needed
22887 + desired_align - align))))
22888 {
22889 /* It is possible that we copied enough so the main loop will not
22890 execute. */
22891 gcc_assert (size_needed > 1);
22892 if (label == NULL_RTX)
22893 label = gen_label_rtx ();
22894 emit_cmp_and_jump_insns (count_exp,
22895 GEN_INT (size_needed),
22896 LTU, 0, counter_mode (count_exp), 1, label);
22897 if (expected_size == -1
22898 || expected_size < (desired_align - align) / 2 + size_needed)
22899 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22900 else
22901 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22902 }
22903 }
22904 if (label && size_needed == 1)
22905 {
22906 emit_label (label);
22907 LABEL_NUSES (label) = 1;
22908 label = NULL;
22909 promoted_val = val_exp;
22910 epilogue_size_needed = 1;
22911 }
22912 else if (label == NULL_RTX)
22913 epilogue_size_needed = size_needed;
22914
22915 /* Step 3: Main loop. */
22916
22917 switch (alg)
22918 {
22919 case libcall:
22920 case no_stringop:
22921 gcc_unreachable ();
22922 case loop_1_byte:
22923 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22924 count_exp, QImode, 1, expected_size);
22925 break;
22926 case loop:
22927 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22928 count_exp, word_mode, 1, expected_size);
22929 break;
22930 case unrolled_loop:
22931 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22932 count_exp, word_mode, 4, expected_size);
22933 break;
22934 case rep_prefix_8_byte:
22935 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22936 DImode, val_exp);
22937 break;
22938 case rep_prefix_4_byte:
22939 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22940 SImode, val_exp);
22941 break;
22942 case rep_prefix_1_byte:
22943 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22944 QImode, val_exp);
22945 break;
22946 }
22947   /* Properly adjust the offset of the dest memory for aliasing. */
22948 if (CONST_INT_P (count_exp))
22949 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22950 (count / size_needed) * size_needed);
22951 else
22952 dst = change_address (dst, BLKmode, destreg);
22953
22954 /* Step 4: Epilogue to copy the remaining bytes. */
22955
22956 if (label)
22957 {
22958       /* When the main loop is done, COUNT_EXP might hold the original count,
22959 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22960 	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22961 	 bytes.  Compensate if needed. */
22962
22963 if (size_needed < epilogue_size_needed)
22964 {
22965 tmp =
22966 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22967 GEN_INT (size_needed - 1), count_exp, 1,
22968 OPTAB_DIRECT);
22969 if (tmp != count_exp)
22970 emit_move_insn (count_exp, tmp);
22971 }
22972 emit_label (label);
22973 LABEL_NUSES (label) = 1;
22974 }
22975 epilogue:
22976 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22977 {
22978 if (force_loopy_epilogue)
22979 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22980 epilogue_size_needed);
22981 else
22982 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22983 epilogue_size_needed);
22984 }
22985 if (jump_around_label)
22986 emit_label (jump_around_label);
22987 return true;
22988 }
22989
22990 /* Expand the appropriate insns for doing strlen if not just doing
22991 repnz; scasb
22992
22993 out = result, initialized with the start address
22994 align_rtx = alignment of the address.
22995    scratch = scratch register, initialized with the start address when
22996 not aligned, otherwise undefined
22997
22998 This is just the body. It needs the initializations mentioned above and
22999 some address computing at the end. These things are done in i386.md. */
23000
23001 static void
23002 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23003 {
23004 int align;
23005 rtx tmp;
23006 rtx align_2_label = NULL_RTX;
23007 rtx align_3_label = NULL_RTX;
23008 rtx align_4_label = gen_label_rtx ();
23009 rtx end_0_label = gen_label_rtx ();
23010 rtx mem;
23011 rtx tmpreg = gen_reg_rtx (SImode);
23012 rtx scratch = gen_reg_rtx (SImode);
23013 rtx cmp;
23014
23015 align = 0;
23016 if (CONST_INT_P (align_rtx))
23017 align = INTVAL (align_rtx);
23018
23019 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23020
23021 /* Is there a known alignment and is it less than 4? */
23022 if (align < 4)
23023 {
23024 rtx scratch1 = gen_reg_rtx (Pmode);
23025 emit_move_insn (scratch1, out);
23026 /* Is there a known alignment and is it not 2? */
23027 if (align != 2)
23028 {
23029 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23030 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23031
23032 /* Leave just the 3 lower bits. */
23033 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23034 NULL_RTX, 0, OPTAB_WIDEN);
23035
23036 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23037 Pmode, 1, align_4_label);
23038 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23039 Pmode, 1, align_2_label);
23040 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23041 Pmode, 1, align_3_label);
23042 }
23043 else
23044 {
23045 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23046 	     check whether it is aligned to 4 bytes. */
23047
23048 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23049 NULL_RTX, 0, OPTAB_WIDEN);
23050
23051 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23052 Pmode, 1, align_4_label);
23053 }
23054
23055 mem = change_address (src, QImode, out);
23056
23057 /* Now compare the bytes. */
23058
23059       /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23060 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23061 QImode, 1, end_0_label);
23062
23063 /* Increment the address. */
23064 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23065
23066 /* Not needed with an alignment of 2 */
23067 if (align != 2)
23068 {
23069 emit_label (align_2_label);
23070
23071 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23072 end_0_label);
23073
23074 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23075
23076 emit_label (align_3_label);
23077 }
23078
23079 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23080 end_0_label);
23081
23082 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23083 }
23084
23085   /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
23086      align this loop; doing so only makes the program larger and does not
23087      speed it up. */
23088 emit_label (align_4_label);
23089
23090 mem = change_address (src, SImode, out);
23091 emit_move_insn (scratch, mem);
23092 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23093
23094 /* This formula yields a nonzero result iff one of the bytes is zero.
23095      This saves three branches inside the loop and many cycles. */
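  /* Written out, the test below computes
       (x - 0x01010101) & ~x & 0x80808080
     which is nonzero exactly when some byte of x is zero.  E.g. for
     x == 0x12005634 it yields 0x00800000 (the 0x80 marks the zero byte),
     while for x == 0x12345678 it yields 0.  */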
23096
23097 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23098 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23099 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23100 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23101 gen_int_mode (0x80808080, SImode)));
23102 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23103 align_4_label);
23104
23105 if (TARGET_CMOVE)
23106 {
23107 rtx reg = gen_reg_rtx (SImode);
23108 rtx reg2 = gen_reg_rtx (Pmode);
23109 emit_move_insn (reg, tmpreg);
23110 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23111
23112 /* If zero is not in the first two bytes, move two bytes forward. */
23113 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23114 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23115 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23116 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23117 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23118 reg,
23119 tmpreg)));
23120 /* Emit lea manually to avoid clobbering of flags. */
23121 emit_insn (gen_rtx_SET (SImode, reg2,
23122 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23123
23124 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23125 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23126 emit_insn (gen_rtx_SET (VOIDmode, out,
23127 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23128 reg2,
23129 out)));
23130 }
23131 else
23132 {
23133 rtx end_2_label = gen_label_rtx ();
23134 /* Is zero in the first two bytes? */
23135
23136 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23137 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23138 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23139 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23140 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23141 pc_rtx);
23142 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23143 JUMP_LABEL (tmp) = end_2_label;
23144
23145 /* Not in the first two. Move two bytes forward. */
23146 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23147 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23148
23149 emit_label (end_2_label);
23150
23151 }
23152
23153 /* Avoid branch in fixing the byte. */
23154 tmpreg = gen_lowpart (QImode, tmpreg);
23155 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23156 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23157 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23158 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23159
23160 emit_label (end_0_label);
23161 }
23162
23163 /* Expand strlen. */
23164
23165 bool
23166 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23167 {
23168 rtx addr, scratch1, scratch2, scratch3, scratch4;
23169
23170   /* The generic case of the strlen expander is long.  Avoid expanding it
23171      unless TARGET_INLINE_ALL_STRINGOPS. */
23172
23173 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23174 && !TARGET_INLINE_ALL_STRINGOPS
23175 && !optimize_insn_for_size_p ()
23176 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23177 return false;
23178
23179 addr = force_reg (Pmode, XEXP (src, 0));
23180 scratch1 = gen_reg_rtx (Pmode);
23181
23182 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23183 && !optimize_insn_for_size_p ())
23184 {
23185       /* Well, it seems that some optimizers do not combine a call like
23186 foo(strlen(bar), strlen(bar));
23187 	 when the move and the subtraction are done here.  It does calculate
23188 the length just once when these instructions are done inside of
23189 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23190 often used and I use one fewer register for the lifetime of
23191 output_strlen_unroll() this is better. */
23192
23193 emit_move_insn (out, addr);
23194
23195 ix86_expand_strlensi_unroll_1 (out, src, align);
23196
23197 /* strlensi_unroll_1 returns the address of the zero at the end of
23198 the string, like memchr(), so compute the length by subtracting
23199 the start address. */
23200 emit_insn (ix86_gen_sub3 (out, out, addr));
23201 }
23202 else
23203 {
23204 rtx unspec;
23205
23206 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23207 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23208 return false;
23209
23210 scratch2 = gen_reg_rtx (Pmode);
23211 scratch3 = gen_reg_rtx (Pmode);
23212 scratch4 = force_reg (Pmode, constm1_rtx);
23213
23214 emit_move_insn (scratch3, addr);
23215 eoschar = force_reg (QImode, eoschar);
23216
23217 src = replace_equiv_address_nv (src, scratch3);
23218
23219 /* If .md starts supporting :P, this can be done in .md. */
23220 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23221 scratch4), UNSPEC_SCAS);
23222 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23223 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23224 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23225 }
23226 return true;
23227 }
23228
23229 /* For a given symbol (function), construct code to compute the address of
23230    its PLT entry in the large x86-64 PIC model. */
23231 static rtx
23232 construct_plt_address (rtx symbol)
23233 {
23234 rtx tmp, unspec;
23235
23236 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23237 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23238 gcc_assert (Pmode == DImode);
23239
23240 tmp = gen_reg_rtx (Pmode);
23241 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23242
23243 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23244 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23245 return tmp;
23246 }
23247
23248 rtx
23249 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23250 rtx callarg2,
23251 rtx pop, bool sibcall)
23252 {
23253 /* We need to represent that SI and DI registers are clobbered
23254 by SYSV calls. */
23255 static int clobbered_registers[] = {
23256 XMM6_REG, XMM7_REG, XMM8_REG,
23257 XMM9_REG, XMM10_REG, XMM11_REG,
23258 XMM12_REG, XMM13_REG, XMM14_REG,
23259 XMM15_REG, SI_REG, DI_REG
23260 };
23261 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23262 rtx use = NULL, call;
23263 unsigned int vec_len;
23264
23265 if (pop == const0_rtx)
23266 pop = NULL;
23267 gcc_assert (!TARGET_64BIT || !pop);
23268
23269 if (TARGET_MACHO && !TARGET_64BIT)
23270 {
23271 #if TARGET_MACHO
23272 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23273 fnaddr = machopic_indirect_call_target (fnaddr);
23274 #endif
23275 }
23276 else
23277 {
23278 /* Static functions and indirect calls don't need the pic register. */
23279 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23280 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23281 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23282 use_reg (&use, pic_offset_table_rtx);
23283 }
23284
23285 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23286 {
23287 rtx al = gen_rtx_REG (QImode, AX_REG);
23288 emit_move_insn (al, callarg2);
23289 use_reg (&use, al);
23290 }
23291
23292 if (ix86_cmodel == CM_LARGE_PIC
23293 && MEM_P (fnaddr)
23294 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23295 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23296 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23297 else if (sibcall
23298 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23299 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23300 {
23301 fnaddr = XEXP (fnaddr, 0);
23302 if (GET_MODE (fnaddr) != word_mode)
23303 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23304 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23305 }
23306
23307 vec_len = 0;
23308 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23309 if (retval)
23310 call = gen_rtx_SET (VOIDmode, retval, call);
23311 vec[vec_len++] = call;
23312
23313 if (pop)
23314 {
23315 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23316 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23317 vec[vec_len++] = pop;
23318 }
23319
23320 if (TARGET_64BIT_MS_ABI
23321 && (!callarg2 || INTVAL (callarg2) != -2))
23322 {
23323 unsigned i;
23324
23325 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23326 UNSPEC_MS_TO_SYSV_CALL);
23327
23328 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23329 vec[vec_len++]
23330 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23331 ? TImode : DImode,
23332 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23333 ? TImode : DImode,
23334 clobbered_registers[i]));
23335 }
23336
23337 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23338 if (TARGET_VZEROUPPER)
23339 {
23340 int avx256;
23341 if (cfun->machine->callee_pass_avx256_p)
23342 {
23343 if (cfun->machine->callee_return_avx256_p)
23344 avx256 = callee_return_pass_avx256;
23345 else
23346 avx256 = callee_pass_avx256;
23347 }
23348 else if (cfun->machine->callee_return_avx256_p)
23349 avx256 = callee_return_avx256;
23350 else
23351 avx256 = call_no_avx256;
23352
23353 if (reload_completed)
23354 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23355 else
23356 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23357 gen_rtvec (1, GEN_INT (avx256)),
23358 UNSPEC_CALL_NEEDS_VZEROUPPER);
23359 }
23360
23361 if (vec_len > 1)
23362 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23363 call = emit_call_insn (call);
23364 if (use)
23365 CALL_INSN_FUNCTION_USAGE (call) = use;
23366
23367 return call;
23368 }
23369
23370 void
23371 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23372 {
23373 rtx pat = PATTERN (insn);
23374 rtvec vec = XVEC (pat, 0);
23375 int len = GET_NUM_ELEM (vec) - 1;
23376
23377 /* Strip off the last entry of the parallel. */
23378 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23379 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23380 if (len == 1)
23381 pat = RTVEC_ELT (vec, 0);
23382 else
23383 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23384
23385 emit_insn (gen_avx_vzeroupper (vzeroupper));
23386 emit_call_insn (pat);
23387 }
23388
23389 /* Output the assembly for a call instruction. */
23390
23391 const char *
23392 ix86_output_call_insn (rtx insn, rtx call_op)
23393 {
23394 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23395 bool seh_nop_p = false;
23396 const char *xasm;
23397
23398 if (SIBLING_CALL_P (insn))
23399 {
23400 if (direct_p)
23401 xasm = "jmp\t%P0";
23402 /* SEH epilogue detection requires the indirect branch case
23403 to include REX.W. */
23404 else if (TARGET_SEH)
23405 xasm = "rex.W jmp %A0";
23406 else
23407 xasm = "jmp\t%A0";
23408
23409 output_asm_insn (xasm, &call_op);
23410 return "";
23411 }
23412
23413 /* SEH unwinding can require an extra nop to be emitted in several
23414 circumstances. Determine if we have one of those. */
23415 if (TARGET_SEH)
23416 {
23417 rtx i;
23418
23419 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23420 {
23421 /* If we get to another real insn, we don't need the nop. */
23422 if (INSN_P (i))
23423 break;
23424
23425 /* If we get to the epilogue note, prevent a catch region from
23426 being adjacent to the standard epilogue sequence. If non-
23427 call-exceptions, we'll have done this during epilogue emission. */
23428 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23429 && !flag_non_call_exceptions
23430 && !can_throw_internal (insn))
23431 {
23432 seh_nop_p = true;
23433 break;
23434 }
23435 }
23436
23437 /* If we didn't find a real insn following the call, prevent the
23438 unwinder from looking into the next function. */
23439 if (i == NULL)
23440 seh_nop_p = true;
23441 }
23442
23443 if (direct_p)
23444 xasm = "call\t%P0";
23445 else
23446 xasm = "call\t%A0";
23447
23448 output_asm_insn (xasm, &call_op);
23449
23450 if (seh_nop_p)
23451 return "nop";
23452
23453 return "";
23454 }
23455 \f
23456 /* Clear stack slot assignments remembered from previous functions.
23457 This is called from INIT_EXPANDERS once before RTL is emitted for each
23458 function. */
23459
23460 static struct machine_function *
23461 ix86_init_machine_status (void)
23462 {
23463 struct machine_function *f;
23464
23465 f = ggc_alloc_cleared_machine_function ();
23466 f->use_fast_prologue_epilogue_nregs = -1;
23467 f->tls_descriptor_call_expanded_p = 0;
23468 f->call_abi = ix86_abi;
23469
23470 return f;
23471 }
23472
23473 /* Return a MEM corresponding to a stack slot with mode MODE.
23474 Allocate a new slot if necessary.
23475
23476 The RTL for a function can have several slots available: N is
23477 which slot to use. */
23478
23479 rtx
23480 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23481 {
23482 struct stack_local_entry *s;
23483
23484 gcc_assert (n < MAX_386_STACK_LOCALS);
23485
23486 /* Virtual slot is valid only before vregs are instantiated. */
23487 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23488
23489 for (s = ix86_stack_locals; s; s = s->next)
23490 if (s->mode == mode && s->n == n)
23491 return validize_mem (copy_rtx (s->rtl));
23492
23493 s = ggc_alloc_stack_local_entry ();
23494 s->n = n;
23495 s->mode = mode;
23496 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23497
23498 s->next = ix86_stack_locals;
23499 ix86_stack_locals = s;
23500 return validize_mem (s->rtl);
23501 }
23502 \f
23503 /* Calculate the length of the memory address in the instruction encoding.
23504 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23505 or other prefixes. */
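/* For instance, (%eax) needs no extra bytes, (%esp) and 8(%ebp) each need one
   (a SIB byte resp. a disp8), a bare 32-bit displacement needs four (plus one
   in 64-bit mode when it cannot be encoded rip-relative), and an indexed form
   such as 0x1000(%eax,%ebx,4) needs five (SIB + disp32).  */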
23506
23507 int
23508 memory_address_length (rtx addr)
23509 {
23510 struct ix86_address parts;
23511 rtx base, index, disp;
23512 int len;
23513 int ok;
23514
23515 if (GET_CODE (addr) == PRE_DEC
23516 || GET_CODE (addr) == POST_INC
23517 || GET_CODE (addr) == PRE_MODIFY
23518 || GET_CODE (addr) == POST_MODIFY)
23519 return 0;
23520
23521 ok = ix86_decompose_address (addr, &parts);
23522 gcc_assert (ok);
23523
23524 if (parts.base && GET_CODE (parts.base) == SUBREG)
23525 parts.base = SUBREG_REG (parts.base);
23526 if (parts.index && GET_CODE (parts.index) == SUBREG)
23527 parts.index = SUBREG_REG (parts.index);
23528
23529 base = parts.base;
23530 index = parts.index;
23531 disp = parts.disp;
23532
23533 /* Add length of addr32 prefix. */
23534 len = (GET_CODE (addr) == ZERO_EXTEND
23535 || GET_CODE (addr) == AND);
23536
23537 /* Rule of thumb:
23538 - esp as the base always wants an index,
23539 - ebp as the base always wants a displacement,
23540 - r12 as the base always wants an index,
23541 - r13 as the base always wants a displacement. */
23542
23543 /* Register Indirect. */
23544 if (base && !index && !disp)
23545 {
23546 /* esp (for its index) and ebp (for its displacement) need
23547 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23548 code. */
23549 if (REG_P (addr)
23550 && (addr == arg_pointer_rtx
23551 || addr == frame_pointer_rtx
23552 || REGNO (addr) == SP_REG
23553 || REGNO (addr) == BP_REG
23554 || REGNO (addr) == R12_REG
23555 || REGNO (addr) == R13_REG))
23556 len = 1;
23557 }
23558
23559 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23560 is not disp32, but disp32(%rip), so for disp32
23561 SIB byte is needed, unless print_operand_address
23562 optimizes it into disp32(%rip) or (%rip) is implied
23563 by UNSPEC. */
23564 else if (disp && !base && !index)
23565 {
23566 len = 4;
23567 if (TARGET_64BIT)
23568 {
23569 rtx symbol = disp;
23570
23571 if (GET_CODE (disp) == CONST)
23572 symbol = XEXP (disp, 0);
23573 if (GET_CODE (symbol) == PLUS
23574 && CONST_INT_P (XEXP (symbol, 1)))
23575 symbol = XEXP (symbol, 0);
23576
23577 if (GET_CODE (symbol) != LABEL_REF
23578 && (GET_CODE (symbol) != SYMBOL_REF
23579 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23580 && (GET_CODE (symbol) != UNSPEC
23581 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23582 && XINT (symbol, 1) != UNSPEC_PCREL
23583 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23584 len += 1;
23585 }
23586 }
23587
23588 else
23589 {
23590 /* Find the length of the displacement constant. */
23591 if (disp)
23592 {
23593 if (base && satisfies_constraint_K (disp))
23594 len = 1;
23595 else
23596 len = 4;
23597 }
23598 /* ebp always wants a displacement. Similarly r13. */
23599 else if (base && REG_P (base)
23600 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23601 len = 1;
23602
23603 /* An index requires the two-byte modrm form.... */
23604 if (index
23605 /* ...like esp (or r12), which always wants an index. */
23606 || base == arg_pointer_rtx
23607 || base == frame_pointer_rtx
23608 || (base && REG_P (base)
23609 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23610 len += 1;
23611 }
23612
23613 switch (parts.seg)
23614 {
23615 case SEG_FS:
23616 case SEG_GS:
23617 len += 1;
23618 break;
23619 default:
23620 break;
23621 }
23622
23623 return len;
23624 }
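
/* Editorial worked examples (not derived from any particular insn): with
   the rules above, ignoring segment overrides and the addr32 prefix,
   typical 64-bit memory operands contribute the following lengths:

     (%rax)            -> 0   register indirect, modrm byte only
     (%rsp), (%r12)    -> 1   SIB byte required
     (%rbp), (%r13)    -> 1   disp8 required
     8(%rax)           -> 1   disp8 (satisfies_constraint_K)
     4096(%rax)        -> 4   disp32
     (%rax,%rbx,4)     -> 1   index forces a SIB byte
     sym(%rip)         -> 4   disp32, no SIB
     constant address  -> 5   disp32 plus SIB in 64-bit mode

   An %fs or %gs segment override adds one further byte.  */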
23625
23626 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23627 is set, expect that the insn has an 8-bit immediate alternative. */
23628 int
23629 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23630 {
23631 int len = 0;
23632 int i;
23633 extract_insn_cached (insn);
23634 for (i = recog_data.n_operands - 1; i >= 0; --i)
23635 if (CONSTANT_P (recog_data.operand[i]))
23636 {
23637 enum attr_mode mode = get_attr_mode (insn);
23638
23639 gcc_assert (!len);
23640 if (shortform && CONST_INT_P (recog_data.operand[i]))
23641 {
23642 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23643 switch (mode)
23644 {
23645 case MODE_QI:
23646 len = 1;
23647 continue;
23648 case MODE_HI:
23649 ival = trunc_int_for_mode (ival, HImode);
23650 break;
23651 case MODE_SI:
23652 ival = trunc_int_for_mode (ival, SImode);
23653 break;
23654 default:
23655 break;
23656 }
23657 if (IN_RANGE (ival, -128, 127))
23658 {
23659 len = 1;
23660 continue;
23661 }
23662 }
23663 switch (mode)
23664 {
23665 case MODE_QI:
23666 len = 1;
23667 break;
23668 case MODE_HI:
23669 len = 2;
23670 break;
23671 case MODE_SI:
23672 len = 4;
23673 break;
23674 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23675 case MODE_DI:
23676 len = 4;
23677 break;
23678 default:
23679 fatal_insn ("unknown insn mode", insn);
23680 }
23681 }
23682 return len;
23683 }
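
/* Editorial worked example: with SHORTFORM set, "addl $100, %eax" reports
   an immediate length of 1 because 100 fits in a sign-extended 8-bit
   immediate, while "addl $1000, %eax" reports 4.  Without a short form the
   length follows the operand mode: 1 for MODE_QI, 2 for MODE_HI, 4 for
   MODE_SI, and 4 for MODE_DI, since 64-bit immediates are encoded as
   32-bit sign-extended values.  */
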
23684 /* Compute default value for "length_address" attribute. */
23685 int
23686 ix86_attr_length_address_default (rtx insn)
23687 {
23688 int i;
23689
23690 if (get_attr_type (insn) == TYPE_LEA)
23691 {
23692 rtx set = PATTERN (insn), addr;
23693
23694 if (GET_CODE (set) == PARALLEL)
23695 set = XVECEXP (set, 0, 0);
23696
23697 gcc_assert (GET_CODE (set) == SET);
23698
23699 addr = SET_SRC (set);
23700 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23701 {
23702 if (GET_CODE (addr) == ZERO_EXTEND)
23703 addr = XEXP (addr, 0);
23704 if (GET_CODE (addr) == SUBREG)
23705 addr = SUBREG_REG (addr);
23706 }
23707
23708 return memory_address_length (addr);
23709 }
23710
23711 extract_insn_cached (insn);
23712 for (i = recog_data.n_operands - 1; i >= 0; --i)
23713 if (MEM_P (recog_data.operand[i]))
23714 {
23715 constrain_operands_cached (reload_completed);
23716 if (which_alternative != -1)
23717 {
23718 const char *constraints = recog_data.constraints[i];
23719 int alt = which_alternative;
23720
23721 while (*constraints == '=' || *constraints == '+')
23722 constraints++;
23723 while (alt-- > 0)
23724 while (*constraints++ != ',')
23725 ;
23726 /* Skip ignored operands. */
23727 if (*constraints == 'X')
23728 continue;
23729 }
23730 return memory_address_length (XEXP (recog_data.operand[i], 0));
23731 }
23732 return 0;
23733 }
23734
23735 /* Compute default value for "length_vex" attribute. It includes
23736 2 or 3 byte VEX prefix and 1 opcode byte. */
23737
23738 int
23739 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23740 {
23741 int i;
23742
23743 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
23744 requires the 3-byte VEX prefix. */
23745 if (!has_0f_opcode || has_vex_w)
23746 return 3 + 1;
23747
23748 /* We can always use 2 byte VEX prefix in 32bit. */
23749 if (!TARGET_64BIT)
23750 return 2 + 1;
23751
23752 extract_insn_cached (insn);
23753
23754 for (i = recog_data.n_operands - 1; i >= 0; --i)
23755 if (REG_P (recog_data.operand[i]))
23756 {
23757 /* REX.W bit uses 3 byte VEX prefix. */
23758 if (GET_MODE (recog_data.operand[i]) == DImode
23759 && GENERAL_REG_P (recog_data.operand[i]))
23760 return 3 + 1;
23761 }
23762 else
23763 {
23764 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23765 if (MEM_P (recog_data.operand[i])
23766 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23767 return 3 + 1;
23768 }
23769
23770 return 2 + 1;
23771 }
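
/* Editorial worked example: in 64-bit code "vaddps %xmm1, %xmm2, %xmm3"
   needs only the 2-byte VEX prefix, so the value is 2 + 1.  A DImode
   general register operand (REX.W) or a memory operand whose address uses
   %r8-%r15 (REX.X or REX.B) forces the 3-byte VEX prefix, giving 3 + 1.  */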
23772 \f
23773 /* Return the maximum number of instructions a cpu can issue. */
23774
23775 static int
23776 ix86_issue_rate (void)
23777 {
23778 switch (ix86_tune)
23779 {
23780 case PROCESSOR_PENTIUM:
23781 case PROCESSOR_ATOM:
23782 case PROCESSOR_K6:
23783 case PROCESSOR_BTVER2:
23784 return 2;
23785
23786 case PROCESSOR_PENTIUMPRO:
23787 case PROCESSOR_PENTIUM4:
23788 case PROCESSOR_CORE2_32:
23789 case PROCESSOR_CORE2_64:
23790 case PROCESSOR_COREI7_32:
23791 case PROCESSOR_COREI7_64:
23792 case PROCESSOR_ATHLON:
23793 case PROCESSOR_K8:
23794 case PROCESSOR_AMDFAM10:
23795 case PROCESSOR_NOCONA:
23796 case PROCESSOR_GENERIC32:
23797 case PROCESSOR_GENERIC64:
23798 case PROCESSOR_BDVER1:
23799 case PROCESSOR_BDVER2:
23800 case PROCESSOR_BTVER1:
23801 return 3;
23802
23803 default:
23804 return 1;
23805 }
23806 }
23807
23808 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23809 by DEP_INSN and nothing else set by DEP_INSN. */
23810
23811 static bool
23812 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23813 {
23814 rtx set, set2;
23815
23816 /* Simplify the test for uninteresting insns. */
23817 if (insn_type != TYPE_SETCC
23818 && insn_type != TYPE_ICMOV
23819 && insn_type != TYPE_FCMOV
23820 && insn_type != TYPE_IBR)
23821 return false;
23822
23823 if ((set = single_set (dep_insn)) != 0)
23824 {
23825 set = SET_DEST (set);
23826 set2 = NULL_RTX;
23827 }
23828 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23829 && XVECLEN (PATTERN (dep_insn), 0) == 2
23830 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23831 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23832 {
23833 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23834 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23835 }
23836 else
23837 return false;
23838
23839 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23840 return false;
23841
23842 /* This test is true if the dependent insn reads the flags but
23843 not any other potentially set register. */
23844 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23845 return false;
23846
23847 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23848 return false;
23849
23850 return true;
23851 }
23852
23853 /* Return true iff USE_INSN has a memory address with operands set by
23854 SET_INSN. */
23855
23856 bool
23857 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23858 {
23859 int i;
23860 extract_insn_cached (use_insn);
23861 for (i = recog_data.n_operands - 1; i >= 0; --i)
23862 if (MEM_P (recog_data.operand[i]))
23863 {
23864 rtx addr = XEXP (recog_data.operand[i], 0);
23865 return modified_in_p (addr, set_insn) != 0;
23866 }
23867 return false;
23868 }
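
/* Editorial example: on the original Pentium this catches address
   generation interlocks such as

       addl $4, %ebx
       movl (%ebx), %eax

   where the load uses %ebx in its address while %ebx is still being
   written by the previous insn; ix86_adjust_cost then adds one cycle of
   latency for the pair.  */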
23869
23870 static int
23871 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23872 {
23873 enum attr_type insn_type, dep_insn_type;
23874 enum attr_memory memory;
23875 rtx set, set2;
23876 int dep_insn_code_number;
23877
23878 /* Anti and output dependencies have zero cost on all CPUs. */
23879 if (REG_NOTE_KIND (link) != 0)
23880 return 0;
23881
23882 dep_insn_code_number = recog_memoized (dep_insn);
23883
23884 /* If we can't recognize the insns, we can't really do anything. */
23885 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23886 return cost;
23887
23888 insn_type = get_attr_type (insn);
23889 dep_insn_type = get_attr_type (dep_insn);
23890
23891 switch (ix86_tune)
23892 {
23893 case PROCESSOR_PENTIUM:
23894 /* Address Generation Interlock adds a cycle of latency. */
23895 if (insn_type == TYPE_LEA)
23896 {
23897 rtx addr = PATTERN (insn);
23898
23899 if (GET_CODE (addr) == PARALLEL)
23900 addr = XVECEXP (addr, 0, 0);
23901
23902 gcc_assert (GET_CODE (addr) == SET);
23903
23904 addr = SET_SRC (addr);
23905 if (modified_in_p (addr, dep_insn))
23906 cost += 1;
23907 }
23908 else if (ix86_agi_dependent (dep_insn, insn))
23909 cost += 1;
23910
23911 /* ??? Compares pair with jump/setcc. */
23912 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23913 cost = 0;
23914
23915 /* Floating point stores require value to be ready one cycle earlier. */
23916 if (insn_type == TYPE_FMOV
23917 && get_attr_memory (insn) == MEMORY_STORE
23918 && !ix86_agi_dependent (dep_insn, insn))
23919 cost += 1;
23920 break;
23921
23922 case PROCESSOR_PENTIUMPRO:
23923 memory = get_attr_memory (insn);
23924
23925 /* INT->FP conversion is expensive. */
23926 if (get_attr_fp_int_src (dep_insn))
23927 cost += 5;
23928
23929 /* There is one cycle extra latency between an FP op and a store. */
23930 if (insn_type == TYPE_FMOV
23931 && (set = single_set (dep_insn)) != NULL_RTX
23932 && (set2 = single_set (insn)) != NULL_RTX
23933 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23934 && MEM_P (SET_DEST (set2)))
23935 cost += 1;
23936
23937 /* Show the ability of the reorder buffer to hide the latency of a load
23938 by executing it in parallel with the previous instruction, when the
23939 previous instruction is not needed to compute the address. */
23940 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23941 && !ix86_agi_dependent (dep_insn, insn))
23942 {
23943 /* Claim moves take one cycle, as the core can issue one load at a
23944 time and the next load can start a cycle later. */
23945 if (dep_insn_type == TYPE_IMOV
23946 || dep_insn_type == TYPE_FMOV)
23947 cost = 1;
23948 else if (cost > 1)
23949 cost--;
23950 }
23951 break;
23952
23953 case PROCESSOR_K6:
23954 memory = get_attr_memory (insn);
23955
23956 /* The esp dependency is resolved before the instruction is really
23957 finished. */
23958 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23959 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23960 return 1;
23961
23962 /* INT->FP conversion is expensive. */
23963 if (get_attr_fp_int_src (dep_insn))
23964 cost += 5;
23965
23966 /* Show the ability of the reorder buffer to hide the latency of a load
23967 by executing it in parallel with the previous instruction, when the
23968 previous instruction is not needed to compute the address. */
23969 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23970 && !ix86_agi_dependent (dep_insn, insn))
23971 {
23972 /* Claim moves take one cycle, as the core can issue one load at a
23973 time and the next load can start a cycle later. */
23974 if (dep_insn_type == TYPE_IMOV
23975 || dep_insn_type == TYPE_FMOV)
23976 cost = 1;
23977 else if (cost > 2)
23978 cost -= 2;
23979 else
23980 cost = 1;
23981 }
23982 break;
23983
23984 case PROCESSOR_ATHLON:
23985 case PROCESSOR_K8:
23986 case PROCESSOR_AMDFAM10:
23987 case PROCESSOR_BDVER1:
23988 case PROCESSOR_BDVER2:
23989 case PROCESSOR_BTVER1:
23990 case PROCESSOR_BTVER2:
23991 case PROCESSOR_ATOM:
23992 case PROCESSOR_GENERIC32:
23993 case PROCESSOR_GENERIC64:
23994 memory = get_attr_memory (insn);
23995
23996 /* Show the ability of the reorder buffer to hide the latency of a load
23997 by executing it in parallel with the previous instruction, when the
23998 previous instruction is not needed to compute the address. */
23999 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24000 && !ix86_agi_dependent (dep_insn, insn))
24001 {
24002 enum attr_unit unit = get_attr_unit (insn);
24003 int loadcost = 3;
24004
24005 /* Because of the difference between the length of integer and
24006 floating unit pipeline preparation stages, the memory operands
24007 for floating point are cheaper.
24008
24009 ??? For Athlon the difference is most probably 2. */
24010 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24011 loadcost = 3;
24012 else
24013 loadcost = TARGET_ATHLON ? 2 : 0;
24014
24015 if (cost >= loadcost)
24016 cost -= loadcost;
24017 else
24018 cost = 0;
24019 }
24020
24021 default:
24022 break;
24023 }
24024
24025 return cost;
24026 }
24027
24028 /* How many alternative schedules to try. This should be as wide as the
24029 scheduling freedom in the DFA, but no wider. Making this value too
24030 large results in extra work for the scheduler. */
24031
24032 static int
24033 ia32_multipass_dfa_lookahead (void)
24034 {
24035 switch (ix86_tune)
24036 {
24037 case PROCESSOR_PENTIUM:
24038 return 2;
24039
24040 case PROCESSOR_PENTIUMPRO:
24041 case PROCESSOR_K6:
24042 return 1;
24043
24044 case PROCESSOR_CORE2_32:
24045 case PROCESSOR_CORE2_64:
24046 case PROCESSOR_COREI7_32:
24047 case PROCESSOR_COREI7_64:
24048 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24049 as many instructions can be executed on a cycle, i.e.,
24050 issue_rate. I wonder why tuning for many CPUs does not do this. */
24051 return ix86_issue_rate ();
24052
24053 default:
24054 return 0;
24055 }
24056 }
24057
24058 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24059 execution. The reordering is applied if
24060 (1) an IMUL instruction is at the top of the ready list, and
24061 (2) the ready list contains exactly one producer of an independent IMUL
24062 instruction;
24063 in that case the producer found is moved to the top of the ready list.
24064 Returns the issue rate. */
24065
24066 static int
24067 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24068 int clock_var ATTRIBUTE_UNUSED)
24069 {
24070 static int issue_rate = -1;
24071 int n_ready = *pn_ready;
24072 rtx insn, insn1, insn2;
24073 int i;
24074 sd_iterator_def sd_it;
24075 dep_t dep;
24076 int index = -1;
24077
24078 /* Set up issue rate. */
24079 issue_rate = ix86_issue_rate ();
24080
24081 /* Do reordering for Atom only. */
24082 if (ix86_tune != PROCESSOR_ATOM)
24083 return issue_rate;
24084 /* Nothing to do if ready list contains only 1 instruction. */
24085 if (n_ready <= 1)
24086 return issue_rate;
24087
24088 /* Check that IMUL instruction is on the top of ready list. */
24089 insn = ready[n_ready - 1];
24090 if (!NONDEBUG_INSN_P (insn))
24091 return issue_rate;
24092 insn = PATTERN (insn);
24093 if (GET_CODE (insn) == PARALLEL)
24094 insn = XVECEXP (insn, 0, 0);
24095 if (GET_CODE (insn) != SET)
24096 return issue_rate;
24097 if (!(GET_CODE (SET_SRC (insn)) == MULT
24098 && GET_MODE (SET_SRC (insn)) == SImode))
24099 return issue_rate;
24100
24101 /* Search for producer of independent IMUL instruction. */
24102 for (i = n_ready - 2; i >= 0; i--)
24103 {
24104 insn = ready[i];
24105 if (!NONDEBUG_INSN_P (insn))
24106 continue;
24107 /* Skip IMUL instruction. */
24108 insn2 = PATTERN (insn);
24109 if (GET_CODE (insn2) == PARALLEL)
24110 insn2 = XVECEXP (insn2, 0, 0);
24111 if (GET_CODE (insn2) == SET
24112 && GET_CODE (SET_SRC (insn2)) == MULT
24113 && GET_MODE (SET_SRC (insn2)) == SImode)
24114 continue;
24115
24116 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24117 {
24118 rtx con;
24119 con = DEP_CON (dep);
24120 if (!NONDEBUG_INSN_P (con))
24121 continue;
24122 insn1 = PATTERN (con);
24123 if (GET_CODE (insn1) == PARALLEL)
24124 insn1 = XVECEXP (insn1, 0, 0);
24125
24126 if (GET_CODE (insn1) == SET
24127 && GET_CODE (SET_SRC (insn1)) == MULT
24128 && GET_MODE (SET_SRC (insn1)) == SImode)
24129 {
24130 sd_iterator_def sd_it1;
24131 dep_t dep1;
24132 /* Check that there is no other producer the IMUL depends on. */
24133 index = i;
24134 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24135 {
24136 rtx pro;
24137 pro = DEP_PRO (dep1);
24138 if (!NONDEBUG_INSN_P (pro))
24139 continue;
24140 if (pro != insn)
24141 index = -1;
24142 }
24143 if (index >= 0)
24144 break;
24145 }
24146 }
24147 if (index >= 0)
24148 break;
24149 }
24150 if (index < 0)
24151 return issue_rate; /* Didn't find IMUL producer. */
24152
24153 if (sched_verbose > 1)
24154 fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24155 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24156
24157 /* Put IMUL producer (ready[index]) at the top of ready list. */
24158 insn1 = ready[index];
24159 for (i = index; i < n_ready - 1; i++)
24160 ready[i] = ready[i + 1];
24161 ready[n_ready - 1] = insn1;
24162
24163 return issue_rate;
24164 }
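
/* Editorial example: if the ready list is { A, P, B, IMUL }, with the IMUL
   at the top and P its only producer, the function above rotates it into
   { A, B, IMUL, P } so that P issues first and the dependent IMUL can
   follow it down Atom's pipelined IMUL unit.  */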
24165
24166 \f
24167
24168 /* Model decoder of Core 2/i7.
24169 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24170 track the instruction fetch block boundaries and make sure that long
24171 (9+ bytes) instructions are assigned to D0. */
24172
24173 /* Maximum length of an insn that can be handled by
24174 a secondary decoder unit. '8' for Core 2/i7. */
24175 static int core2i7_secondary_decoder_max_insn_size;
24176
24177 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24178 '16' for Core 2/i7. */
24179 static int core2i7_ifetch_block_size;
24180
24181 /* Maximum number of instructions decoder can handle per cycle.
24182 '6' for Core 2/i7. */
24183 static int core2i7_ifetch_block_max_insns;
24184
24185 typedef struct ix86_first_cycle_multipass_data_ *
24186 ix86_first_cycle_multipass_data_t;
24187 typedef const struct ix86_first_cycle_multipass_data_ *
24188 const_ix86_first_cycle_multipass_data_t;
24189
24190 /* A variable to store target state across calls to max_issue within
24191 one cycle. */
24192 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24193 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24194
24195 /* Initialize DATA. */
24196 static void
24197 core2i7_first_cycle_multipass_init (void *_data)
24198 {
24199 ix86_first_cycle_multipass_data_t data
24200 = (ix86_first_cycle_multipass_data_t) _data;
24201
24202 data->ifetch_block_len = 0;
24203 data->ifetch_block_n_insns = 0;
24204 data->ready_try_change = NULL;
24205 data->ready_try_change_size = 0;
24206 }
24207
24208 /* Advancing the cycle; reset ifetch block counts. */
24209 static void
24210 core2i7_dfa_post_advance_cycle (void)
24211 {
24212 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24213
24214 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24215
24216 data->ifetch_block_len = 0;
24217 data->ifetch_block_n_insns = 0;
24218 }
24219
24220 static int min_insn_size (rtx);
24221
24222 /* Filter out insns from ready_try that the core will not be able to issue
24223 on current cycle due to decoder. */
24224 static void
24225 core2i7_first_cycle_multipass_filter_ready_try
24226 (const_ix86_first_cycle_multipass_data_t data,
24227 char *ready_try, int n_ready, bool first_cycle_insn_p)
24228 {
24229 while (n_ready--)
24230 {
24231 rtx insn;
24232 int insn_size;
24233
24234 if (ready_try[n_ready])
24235 continue;
24236
24237 insn = get_ready_element (n_ready);
24238 insn_size = min_insn_size (insn);
24239
24240 if (/* If this insn is too long for a secondary decoder ... */
24241 (!first_cycle_insn_p
24242 && insn_size > core2i7_secondary_decoder_max_insn_size)
24243 /* ... or it would not fit into the ifetch block ... */
24244 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24245 /* ... or the decoder is full already ... */
24246 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24247 /* ... mask the insn out. */
24248 {
24249 ready_try[n_ready] = 1;
24250
24251 if (data->ready_try_change)
24252 SET_BIT (data->ready_try_change, n_ready);
24253 }
24254 }
24255 }
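
/* Editorial example: with the Core 2/i7 parameters set up below (16-byte
   ifetch block, at most 6 insns per block, 8-byte limit for the secondary
   decoders), an insn is masked out if, say, 11 bytes of the current block
   are already used and the insn is 7 bytes long (11 + 7 > 16), or if it is
   9 bytes long and is not the first insn of the cycle, because only the
   first decoder handles insns longer than 8 bytes.  */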
24256
24257 /* Prepare for a new round of multipass lookahead scheduling. */
24258 static void
24259 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24260 bool first_cycle_insn_p)
24261 {
24262 ix86_first_cycle_multipass_data_t data
24263 = (ix86_first_cycle_multipass_data_t) _data;
24264 const_ix86_first_cycle_multipass_data_t prev_data
24265 = ix86_first_cycle_multipass_data;
24266
24267 /* Restore the state from the end of the previous round. */
24268 data->ifetch_block_len = prev_data->ifetch_block_len;
24269 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24270
24271 /* Filter instructions that cannot be issued on current cycle due to
24272 decoder restrictions. */
24273 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24274 first_cycle_insn_p);
24275 }
24276
24277 /* INSN is being issued in current solution. Account for its impact on
24278 the decoder model. */
24279 static void
24280 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24281 rtx insn, const void *_prev_data)
24282 {
24283 ix86_first_cycle_multipass_data_t data
24284 = (ix86_first_cycle_multipass_data_t) _data;
24285 const_ix86_first_cycle_multipass_data_t prev_data
24286 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24287
24288 int insn_size = min_insn_size (insn);
24289
24290 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24291 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24292 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24293 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24294
24295 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24296 if (!data->ready_try_change)
24297 {
24298 data->ready_try_change = sbitmap_alloc (n_ready);
24299 data->ready_try_change_size = n_ready;
24300 }
24301 else if (data->ready_try_change_size < n_ready)
24302 {
24303 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24304 n_ready, 0);
24305 data->ready_try_change_size = n_ready;
24306 }
24307 sbitmap_zero (data->ready_try_change);
24308
24309 /* Filter out insns from ready_try that the core will not be able to issue
24310 on current cycle due to decoder. */
24311 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24312 false);
24313 }
24314
24315 /* Revert the effect on ready_try. */
24316 static void
24317 core2i7_first_cycle_multipass_backtrack (const void *_data,
24318 char *ready_try,
24319 int n_ready ATTRIBUTE_UNUSED)
24320 {
24321 const_ix86_first_cycle_multipass_data_t data
24322 = (const_ix86_first_cycle_multipass_data_t) _data;
24323 unsigned int i = 0;
24324 sbitmap_iterator sbi;
24325
24326 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24327 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24328 {
24329 ready_try[i] = 0;
24330 }
24331 }
24332
24333 /* Save the result of multipass lookahead scheduling for the next round. */
24334 static void
24335 core2i7_first_cycle_multipass_end (const void *_data)
24336 {
24337 const_ix86_first_cycle_multipass_data_t data
24338 = (const_ix86_first_cycle_multipass_data_t) _data;
24339 ix86_first_cycle_multipass_data_t next_data
24340 = ix86_first_cycle_multipass_data;
24341
24342 if (data != NULL)
24343 {
24344 next_data->ifetch_block_len = data->ifetch_block_len;
24345 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24346 }
24347 }
24348
24349 /* Deallocate target data. */
24350 static void
24351 core2i7_first_cycle_multipass_fini (void *_data)
24352 {
24353 ix86_first_cycle_multipass_data_t data
24354 = (ix86_first_cycle_multipass_data_t) _data;
24355
24356 if (data->ready_try_change)
24357 {
24358 sbitmap_free (data->ready_try_change);
24359 data->ready_try_change = NULL;
24360 data->ready_try_change_size = 0;
24361 }
24362 }
24363
24364 /* Prepare for scheduling pass. */
24365 static void
24366 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24367 int verbose ATTRIBUTE_UNUSED,
24368 int max_uid ATTRIBUTE_UNUSED)
24369 {
24370 /* Install scheduling hooks for current CPU. Some of these hooks are used
24371 in time-critical parts of the scheduler, so we only set them up when
24372 they are actually used. */
24373 switch (ix86_tune)
24374 {
24375 case PROCESSOR_CORE2_32:
24376 case PROCESSOR_CORE2_64:
24377 case PROCESSOR_COREI7_32:
24378 case PROCESSOR_COREI7_64:
24379 targetm.sched.dfa_post_advance_cycle
24380 = core2i7_dfa_post_advance_cycle;
24381 targetm.sched.first_cycle_multipass_init
24382 = core2i7_first_cycle_multipass_init;
24383 targetm.sched.first_cycle_multipass_begin
24384 = core2i7_first_cycle_multipass_begin;
24385 targetm.sched.first_cycle_multipass_issue
24386 = core2i7_first_cycle_multipass_issue;
24387 targetm.sched.first_cycle_multipass_backtrack
24388 = core2i7_first_cycle_multipass_backtrack;
24389 targetm.sched.first_cycle_multipass_end
24390 = core2i7_first_cycle_multipass_end;
24391 targetm.sched.first_cycle_multipass_fini
24392 = core2i7_first_cycle_multipass_fini;
24393
24394 /* Set decoder parameters. */
24395 core2i7_secondary_decoder_max_insn_size = 8;
24396 core2i7_ifetch_block_size = 16;
24397 core2i7_ifetch_block_max_insns = 6;
24398 break;
24399
24400 default:
24401 targetm.sched.dfa_post_advance_cycle = NULL;
24402 targetm.sched.first_cycle_multipass_init = NULL;
24403 targetm.sched.first_cycle_multipass_begin = NULL;
24404 targetm.sched.first_cycle_multipass_issue = NULL;
24405 targetm.sched.first_cycle_multipass_backtrack = NULL;
24406 targetm.sched.first_cycle_multipass_end = NULL;
24407 targetm.sched.first_cycle_multipass_fini = NULL;
24408 break;
24409 }
24410 }
24411
24412 \f
24413 /* Compute the alignment given to a constant that is being placed in memory.
24414 EXP is the constant and ALIGN is the alignment that the object would
24415 ordinarily have.
24416 The value of this function is used instead of that alignment to align
24417 the object. */
24418
24419 int
24420 ix86_constant_alignment (tree exp, int align)
24421 {
24422 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24423 || TREE_CODE (exp) == INTEGER_CST)
24424 {
24425 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24426 return 64;
24427 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24428 return 128;
24429 }
24430 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24431 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24432 return BITS_PER_WORD;
24433
24434 return align;
24435 }
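
/* Editorial example: a DFmode constant whose natural alignment is only
   32 bits is bumped to 64 bits by the code above, and a string constant of
   31 bytes or more gets word alignment unless we optimize for size.  */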
24436
24437 /* Compute the alignment for a static variable.
24438 TYPE is the data type, and ALIGN is the alignment that
24439 the object would ordinarily have. The value of this function is used
24440 instead of that alignment to align the object. */
24441
24442 int
24443 ix86_data_alignment (tree type, int align)
24444 {
24445 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24446
24447 if (AGGREGATE_TYPE_P (type)
24448 && TYPE_SIZE (type)
24449 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24450 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24451 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24452 && align < max_align)
24453 align = max_align;
24454
24455 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24456 to a 16-byte boundary. */
24457 if (TARGET_64BIT)
24458 {
24459 if (AGGREGATE_TYPE_P (type)
24460 && TYPE_SIZE (type)
24461 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24462 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24463 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24464 return 128;
24465 }
24466
24467 if (TREE_CODE (type) == ARRAY_TYPE)
24468 {
24469 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24470 return 64;
24471 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24472 return 128;
24473 }
24474 else if (TREE_CODE (type) == COMPLEX_TYPE)
24475 {
24476
24477 if (TYPE_MODE (type) == DCmode && align < 64)
24478 return 64;
24479 if ((TYPE_MODE (type) == XCmode
24480 || TYPE_MODE (type) == TCmode) && align < 128)
24481 return 128;
24482 }
24483 else if ((TREE_CODE (type) == RECORD_TYPE
24484 || TREE_CODE (type) == UNION_TYPE
24485 || TREE_CODE (type) == QUAL_UNION_TYPE)
24486 && TYPE_FIELDS (type))
24487 {
24488 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24489 return 64;
24490 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24491 return 128;
24492 }
24493 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24494 || TREE_CODE (type) == INTEGER_TYPE)
24495 {
24496 if (TYPE_MODE (type) == DFmode && align < 64)
24497 return 64;
24498 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24499 return 128;
24500 }
24501
24502 return align;
24503 }
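
/* Editorial example: under the rules above, a 24-byte "static double a[3]"
   receives 128-bit alignment on x86-64 (the ABI array rule), while a lone
   "static double d" keeps its 64-bit mode alignment.  */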
24504
24505 /* Compute the alignment for a local variable or a stack slot. EXP is
24506 the data type or decl itself, MODE is the widest mode available and
24507 ALIGN is the alignment that the object would ordinarily have. The
24508 value of this macro is used instead of that alignment to align the
24509 object. */
24510
24511 unsigned int
24512 ix86_local_alignment (tree exp, enum machine_mode mode,
24513 unsigned int align)
24514 {
24515 tree type, decl;
24516
24517 if (exp && DECL_P (exp))
24518 {
24519 type = TREE_TYPE (exp);
24520 decl = exp;
24521 }
24522 else
24523 {
24524 type = exp;
24525 decl = NULL;
24526 }
24527
24528 /* Don't do dynamic stack realignment for long long objects with
24529 -mpreferred-stack-boundary=2. */
24530 if (!TARGET_64BIT
24531 && align == 64
24532 && ix86_preferred_stack_boundary < 64
24533 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24534 && (!type || !TYPE_USER_ALIGN (type))
24535 && (!decl || !DECL_USER_ALIGN (decl)))
24536 align = 32;
24537
24538 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24539 register in MODE. We will return the largest alignment of XF
24540 and DF. */
24541 if (!type)
24542 {
24543 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24544 align = GET_MODE_ALIGNMENT (DFmode);
24545 return align;
24546 }
24547
24548 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24549 to a 16-byte boundary. The exact wording is:
24550
24551 An array uses the same alignment as its elements, except that a local or
24552 global array variable of length at least 16 bytes or
24553 a C99 variable-length array variable always has alignment of at least 16 bytes.
24554
24555 This was added to allow use of aligned SSE instructions on arrays. The
24556 rule is meant for static storage (where the compiler cannot do the analysis
24557 by itself). We follow it for automatic variables only when convenient:
24558 we fully control everything in the function being compiled, and functions
24559 from other units cannot rely on the alignment.
24560
24561 Exclude the va_list type. It is the common case of a local array where
24562 we cannot benefit from the alignment. */
24563 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24564 && TARGET_SSE)
24565 {
24566 if (AGGREGATE_TYPE_P (type)
24567 && (va_list_type_node == NULL_TREE
24568 || (TYPE_MAIN_VARIANT (type)
24569 != TYPE_MAIN_VARIANT (va_list_type_node)))
24570 && TYPE_SIZE (type)
24571 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24572 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24573 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24574 return 128;
24575 }
24576 if (TREE_CODE (type) == ARRAY_TYPE)
24577 {
24578 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24579 return 64;
24580 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24581 return 128;
24582 }
24583 else if (TREE_CODE (type) == COMPLEX_TYPE)
24584 {
24585 if (TYPE_MODE (type) == DCmode && align < 64)
24586 return 64;
24587 if ((TYPE_MODE (type) == XCmode
24588 || TYPE_MODE (type) == TCmode) && align < 128)
24589 return 128;
24590 }
24591 else if ((TREE_CODE (type) == RECORD_TYPE
24592 || TREE_CODE (type) == UNION_TYPE
24593 || TREE_CODE (type) == QUAL_UNION_TYPE)
24594 && TYPE_FIELDS (type))
24595 {
24596 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24597 return 64;
24598 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24599 return 128;
24600 }
24601 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24602 || TREE_CODE (type) == INTEGER_TYPE)
24603 {
24604
24605 if (TYPE_MODE (type) == DFmode && align < 64)
24606 return 64;
24607 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24608 return 128;
24609 }
24610 return align;
24611 }
24612
24613 /* Compute the minimum required alignment for dynamic stack realignment
24614 purposes for a local variable, parameter or a stack slot. EXP is
24615 the data type or decl itself, MODE is its mode and ALIGN is the
24616 alignment that the object would ordinarily have. */
24617
24618 unsigned int
24619 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24620 unsigned int align)
24621 {
24622 tree type, decl;
24623
24624 if (exp && DECL_P (exp))
24625 {
24626 type = TREE_TYPE (exp);
24627 decl = exp;
24628 }
24629 else
24630 {
24631 type = exp;
24632 decl = NULL;
24633 }
24634
24635 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24636 return align;
24637
24638 /* Don't do dynamic stack realignment for long long objects with
24639 -mpreferred-stack-boundary=2. */
24640 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24641 && (!type || !TYPE_USER_ALIGN (type))
24642 && (!decl || !DECL_USER_ALIGN (decl)))
24643 return 32;
24644
24645 return align;
24646 }
24647 \f
24648 /* Find a location for the static chain incoming to a nested function.
24649 This is a register, unless all free registers are used by arguments. */
24650
24651 static rtx
24652 ix86_static_chain (const_tree fndecl, bool incoming_p)
24653 {
24654 unsigned regno;
24655
24656 if (!DECL_STATIC_CHAIN (fndecl))
24657 return NULL;
24658
24659 if (TARGET_64BIT)
24660 {
24661 /* We always use R10 in 64-bit mode. */
24662 regno = R10_REG;
24663 }
24664 else
24665 {
24666 tree fntype;
24667 unsigned int ccvt;
24668
24669 /* By default in 32-bit mode we use ECX to pass the static chain. */
24670 regno = CX_REG;
24671
24672 fntype = TREE_TYPE (fndecl);
24673 ccvt = ix86_get_callcvt (fntype);
24674 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24675 {
24676 /* Fastcall functions use ecx/edx for arguments, which leaves
24677 us with EAX for the static chain.
24678 Thiscall functions use ecx for arguments, which also
24679 leaves us with EAX for the static chain. */
24680 regno = AX_REG;
24681 }
24682 else if (ix86_function_regparm (fntype, fndecl) == 3)
24683 {
24684 /* For regparm 3, we have no free call-clobbered registers in
24685 which to store the static chain. In order to implement this,
24686 we have the trampoline push the static chain to the stack.
24687 However, we can't push a value below the return address when
24688 we call the nested function directly, so we have to use an
24689 alternate entry point. For this we use ESI, and have the
24690 alternate entry point push ESI, so that things appear the
24691 same once we're executing the nested function. */
24692 if (incoming_p)
24693 {
24694 if (fndecl == current_function_decl)
24695 ix86_static_chain_on_stack = true;
24696 return gen_frame_mem (SImode,
24697 plus_constant (Pmode,
24698 arg_pointer_rtx, -8));
24699 }
24700 regno = SI_REG;
24701 }
24702 }
24703
24704 return gen_rtx_REG (Pmode, regno);
24705 }
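
/* Editorial summary of the cases above: 64-bit code uses %r10; 32-bit code
   uses %ecx by default and %eax for fastcall/thiscall functions (whose
   arguments occupy %ecx/%edx); for regparm(3) functions the incoming chain
   lives 8 bytes below the argument pointer and the outgoing chain is
   passed in %esi to the alternate entry point.  */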
24706
24707 /* Emit RTL insns to initialize the variable parts of a trampoline.
24708 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24709 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24710 to be passed to the target function. */
24711
24712 static void
24713 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24714 {
24715 rtx mem, fnaddr;
24716 int opcode;
24717 int offset = 0;
24718
24719 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24720
24721 if (TARGET_64BIT)
24722 {
24723 int size;
24724
24725 /* Load the function address to r11. Try to load address using
24726 the shorter movl instead of movabs. We may want to support
24727 movq for kernel mode, but kernel does not use trampolines at
24728 the moment. FNADDR is a 32bit address and may not be in
24729 DImode when ptr_mode == SImode. Always use movl in this
24730 case. */
24731 if (ptr_mode == SImode
24732 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24733 {
24734 fnaddr = copy_addr_to_reg (fnaddr);
24735
24736 mem = adjust_address (m_tramp, HImode, offset);
24737 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24738
24739 mem = adjust_address (m_tramp, SImode, offset + 2);
24740 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24741 offset += 6;
24742 }
24743 else
24744 {
24745 mem = adjust_address (m_tramp, HImode, offset);
24746 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24747
24748 mem = adjust_address (m_tramp, DImode, offset + 2);
24749 emit_move_insn (mem, fnaddr);
24750 offset += 10;
24751 }
24752
24753 /* Load static chain using movabs to r10. Use the shorter movl
24754 instead of movabs when ptr_mode == SImode. */
24755 if (ptr_mode == SImode)
24756 {
24757 opcode = 0xba41;
24758 size = 6;
24759 }
24760 else
24761 {
24762 opcode = 0xba49;
24763 size = 10;
24764 }
24765
24766 mem = adjust_address (m_tramp, HImode, offset);
24767 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24768
24769 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24770 emit_move_insn (mem, chain_value);
24771 offset += size;
24772
24773 /* Jump to r11; the last (unused) byte is a nop, only there to
24774 pad the write out to a single 32-bit store. */
24775 mem = adjust_address (m_tramp, SImode, offset);
24776 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24777 offset += 4;
24778 }
24779 else
24780 {
24781 rtx disp, chain;
24782
24783 /* Depending on the static chain location, either load a register
24784 with a constant, or push the constant to the stack. All of the
24785 instructions are the same size. */
24786 chain = ix86_static_chain (fndecl, true);
24787 if (REG_P (chain))
24788 {
24789 switch (REGNO (chain))
24790 {
24791 case AX_REG:
24792 opcode = 0xb8; break;
24793 case CX_REG:
24794 opcode = 0xb9; break;
24795 default:
24796 gcc_unreachable ();
24797 }
24798 }
24799 else
24800 opcode = 0x68;
24801
24802 mem = adjust_address (m_tramp, QImode, offset);
24803 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24804
24805 mem = adjust_address (m_tramp, SImode, offset + 1);
24806 emit_move_insn (mem, chain_value);
24807 offset += 5;
24808
24809 mem = adjust_address (m_tramp, QImode, offset);
24810 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24811
24812 mem = adjust_address (m_tramp, SImode, offset + 1);
24813
24814 /* Compute offset from the end of the jmp to the target function.
24815 In the case in which the trampoline stores the static chain on
24816 the stack, we need to skip the first insn which pushes the
24817 (call-saved) register static chain; this push is 1 byte. */
24818 offset += 5;
24819 disp = expand_binop (SImode, sub_optab, fnaddr,
24820 plus_constant (Pmode, XEXP (m_tramp, 0),
24821 offset - (MEM_P (chain) ? 1 : 0)),
24822 NULL_RTX, 1, OPTAB_DIRECT);
24823 emit_move_insn (mem, disp);
24824 }
24825
24826 gcc_assert (offset <= TRAMPOLINE_SIZE);
24827
24828 #ifdef HAVE_ENABLE_EXECUTE_STACK
24829 #ifdef CHECK_EXECUTE_STACK_ENABLED
24830 if (CHECK_EXECUTE_STACK_ENABLED)
24831 #endif
24832 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24833 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24834 #endif
24835 }
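
/* Editorial sketch (not used anywhere in GCC): the 24 bytes that the
   64-bit movabs path above writes, with the two 8-byte immediates shown as
   zero placeholders.  The identifier is hypothetical and exists purely to
   make the layout concrete.  */

static const unsigned char example_x86_64_trampoline[24] ATTRIBUTE_UNUSED =
{
  0x49, 0xbb,                   /* movabs $<fnaddr>, %r11 */
  0, 0, 0, 0, 0, 0, 0, 0,       /* 8-byte target address */
  0x49, 0xba,                   /* movabs $<chain>, %r10 */
  0, 0, 0, 0, 0, 0, 0, 0,       /* 8-byte static chain value */
  0x49, 0xff, 0xe3,             /* jmp *%r11 */
  0x90                          /* nop; pads the final 4-byte store */
};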
24836 \f
24837 /* The following file contains several enumerations and data structures
24838 built from the definitions in i386-builtin-types.def. */
24839
24840 #include "i386-builtin-types.inc"
24841
24842 /* Table for the ix86 builtin non-function types. */
24843 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24844
24845 /* Retrieve an element from the above table, building some of
24846 the types lazily. */
24847
24848 static tree
24849 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24850 {
24851 unsigned int index;
24852 tree type, itype;
24853
24854 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24855
24856 type = ix86_builtin_type_tab[(int) tcode];
24857 if (type != NULL)
24858 return type;
24859
24860 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24861 if (tcode <= IX86_BT_LAST_VECT)
24862 {
24863 enum machine_mode mode;
24864
24865 index = tcode - IX86_BT_LAST_PRIM - 1;
24866 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24867 mode = ix86_builtin_type_vect_mode[index];
24868
24869 type = build_vector_type_for_mode (itype, mode);
24870 }
24871 else
24872 {
24873 int quals;
24874
24875 index = tcode - IX86_BT_LAST_VECT - 1;
24876 if (tcode <= IX86_BT_LAST_PTR)
24877 quals = TYPE_UNQUALIFIED;
24878 else
24879 quals = TYPE_QUAL_CONST;
24880
24881 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24882 if (quals != TYPE_UNQUALIFIED)
24883 itype = build_qualified_type (itype, quals);
24884
24885 type = build_pointer_type (itype);
24886 }
24887
24888 ix86_builtin_type_tab[(int) tcode] = type;
24889 return type;
24890 }
24891
24892 /* Table for the ix86 builtin function types. */
24893 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24894
24895 /* Retrieve an element from the above table, building some of
24896 the types lazily. */
24897
24898 static tree
24899 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24900 {
24901 tree type;
24902
24903 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24904
24905 type = ix86_builtin_func_type_tab[(int) tcode];
24906 if (type != NULL)
24907 return type;
24908
24909 if (tcode <= IX86_BT_LAST_FUNC)
24910 {
24911 unsigned start = ix86_builtin_func_start[(int) tcode];
24912 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24913 tree rtype, atype, args = void_list_node;
24914 unsigned i;
24915
24916 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24917 for (i = after - 1; i > start; --i)
24918 {
24919 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24920 args = tree_cons (NULL, atype, args);
24921 }
24922
24923 type = build_function_type (rtype, args);
24924 }
24925 else
24926 {
24927 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24928 enum ix86_builtin_func_type icode;
24929
24930 icode = ix86_builtin_func_alias_base[index];
24931 type = ix86_get_builtin_func_type (icode);
24932 }
24933
24934 ix86_builtin_func_type_tab[(int) tcode] = type;
24935 return type;
24936 }
24937
24938
24939 /* Codes for all the SSE/MMX builtins. */
24940 enum ix86_builtins
24941 {
24942 IX86_BUILTIN_ADDPS,
24943 IX86_BUILTIN_ADDSS,
24944 IX86_BUILTIN_DIVPS,
24945 IX86_BUILTIN_DIVSS,
24946 IX86_BUILTIN_MULPS,
24947 IX86_BUILTIN_MULSS,
24948 IX86_BUILTIN_SUBPS,
24949 IX86_BUILTIN_SUBSS,
24950
24951 IX86_BUILTIN_CMPEQPS,
24952 IX86_BUILTIN_CMPLTPS,
24953 IX86_BUILTIN_CMPLEPS,
24954 IX86_BUILTIN_CMPGTPS,
24955 IX86_BUILTIN_CMPGEPS,
24956 IX86_BUILTIN_CMPNEQPS,
24957 IX86_BUILTIN_CMPNLTPS,
24958 IX86_BUILTIN_CMPNLEPS,
24959 IX86_BUILTIN_CMPNGTPS,
24960 IX86_BUILTIN_CMPNGEPS,
24961 IX86_BUILTIN_CMPORDPS,
24962 IX86_BUILTIN_CMPUNORDPS,
24963 IX86_BUILTIN_CMPEQSS,
24964 IX86_BUILTIN_CMPLTSS,
24965 IX86_BUILTIN_CMPLESS,
24966 IX86_BUILTIN_CMPNEQSS,
24967 IX86_BUILTIN_CMPNLTSS,
24968 IX86_BUILTIN_CMPNLESS,
24969 IX86_BUILTIN_CMPNGTSS,
24970 IX86_BUILTIN_CMPNGESS,
24971 IX86_BUILTIN_CMPORDSS,
24972 IX86_BUILTIN_CMPUNORDSS,
24973
24974 IX86_BUILTIN_COMIEQSS,
24975 IX86_BUILTIN_COMILTSS,
24976 IX86_BUILTIN_COMILESS,
24977 IX86_BUILTIN_COMIGTSS,
24978 IX86_BUILTIN_COMIGESS,
24979 IX86_BUILTIN_COMINEQSS,
24980 IX86_BUILTIN_UCOMIEQSS,
24981 IX86_BUILTIN_UCOMILTSS,
24982 IX86_BUILTIN_UCOMILESS,
24983 IX86_BUILTIN_UCOMIGTSS,
24984 IX86_BUILTIN_UCOMIGESS,
24985 IX86_BUILTIN_UCOMINEQSS,
24986
24987 IX86_BUILTIN_CVTPI2PS,
24988 IX86_BUILTIN_CVTPS2PI,
24989 IX86_BUILTIN_CVTSI2SS,
24990 IX86_BUILTIN_CVTSI642SS,
24991 IX86_BUILTIN_CVTSS2SI,
24992 IX86_BUILTIN_CVTSS2SI64,
24993 IX86_BUILTIN_CVTTPS2PI,
24994 IX86_BUILTIN_CVTTSS2SI,
24995 IX86_BUILTIN_CVTTSS2SI64,
24996
24997 IX86_BUILTIN_MAXPS,
24998 IX86_BUILTIN_MAXSS,
24999 IX86_BUILTIN_MINPS,
25000 IX86_BUILTIN_MINSS,
25001
25002 IX86_BUILTIN_LOADUPS,
25003 IX86_BUILTIN_STOREUPS,
25004 IX86_BUILTIN_MOVSS,
25005
25006 IX86_BUILTIN_MOVHLPS,
25007 IX86_BUILTIN_MOVLHPS,
25008 IX86_BUILTIN_LOADHPS,
25009 IX86_BUILTIN_LOADLPS,
25010 IX86_BUILTIN_STOREHPS,
25011 IX86_BUILTIN_STORELPS,
25012
25013 IX86_BUILTIN_MASKMOVQ,
25014 IX86_BUILTIN_MOVMSKPS,
25015 IX86_BUILTIN_PMOVMSKB,
25016
25017 IX86_BUILTIN_MOVNTPS,
25018 IX86_BUILTIN_MOVNTQ,
25019
25020 IX86_BUILTIN_LOADDQU,
25021 IX86_BUILTIN_STOREDQU,
25022
25023 IX86_BUILTIN_PACKSSWB,
25024 IX86_BUILTIN_PACKSSDW,
25025 IX86_BUILTIN_PACKUSWB,
25026
25027 IX86_BUILTIN_PADDB,
25028 IX86_BUILTIN_PADDW,
25029 IX86_BUILTIN_PADDD,
25030 IX86_BUILTIN_PADDQ,
25031 IX86_BUILTIN_PADDSB,
25032 IX86_BUILTIN_PADDSW,
25033 IX86_BUILTIN_PADDUSB,
25034 IX86_BUILTIN_PADDUSW,
25035 IX86_BUILTIN_PSUBB,
25036 IX86_BUILTIN_PSUBW,
25037 IX86_BUILTIN_PSUBD,
25038 IX86_BUILTIN_PSUBQ,
25039 IX86_BUILTIN_PSUBSB,
25040 IX86_BUILTIN_PSUBSW,
25041 IX86_BUILTIN_PSUBUSB,
25042 IX86_BUILTIN_PSUBUSW,
25043
25044 IX86_BUILTIN_PAND,
25045 IX86_BUILTIN_PANDN,
25046 IX86_BUILTIN_POR,
25047 IX86_BUILTIN_PXOR,
25048
25049 IX86_BUILTIN_PAVGB,
25050 IX86_BUILTIN_PAVGW,
25051
25052 IX86_BUILTIN_PCMPEQB,
25053 IX86_BUILTIN_PCMPEQW,
25054 IX86_BUILTIN_PCMPEQD,
25055 IX86_BUILTIN_PCMPGTB,
25056 IX86_BUILTIN_PCMPGTW,
25057 IX86_BUILTIN_PCMPGTD,
25058
25059 IX86_BUILTIN_PMADDWD,
25060
25061 IX86_BUILTIN_PMAXSW,
25062 IX86_BUILTIN_PMAXUB,
25063 IX86_BUILTIN_PMINSW,
25064 IX86_BUILTIN_PMINUB,
25065
25066 IX86_BUILTIN_PMULHUW,
25067 IX86_BUILTIN_PMULHW,
25068 IX86_BUILTIN_PMULLW,
25069
25070 IX86_BUILTIN_PSADBW,
25071 IX86_BUILTIN_PSHUFW,
25072
25073 IX86_BUILTIN_PSLLW,
25074 IX86_BUILTIN_PSLLD,
25075 IX86_BUILTIN_PSLLQ,
25076 IX86_BUILTIN_PSRAW,
25077 IX86_BUILTIN_PSRAD,
25078 IX86_BUILTIN_PSRLW,
25079 IX86_BUILTIN_PSRLD,
25080 IX86_BUILTIN_PSRLQ,
25081 IX86_BUILTIN_PSLLWI,
25082 IX86_BUILTIN_PSLLDI,
25083 IX86_BUILTIN_PSLLQI,
25084 IX86_BUILTIN_PSRAWI,
25085 IX86_BUILTIN_PSRADI,
25086 IX86_BUILTIN_PSRLWI,
25087 IX86_BUILTIN_PSRLDI,
25088 IX86_BUILTIN_PSRLQI,
25089
25090 IX86_BUILTIN_PUNPCKHBW,
25091 IX86_BUILTIN_PUNPCKHWD,
25092 IX86_BUILTIN_PUNPCKHDQ,
25093 IX86_BUILTIN_PUNPCKLBW,
25094 IX86_BUILTIN_PUNPCKLWD,
25095 IX86_BUILTIN_PUNPCKLDQ,
25096
25097 IX86_BUILTIN_SHUFPS,
25098
25099 IX86_BUILTIN_RCPPS,
25100 IX86_BUILTIN_RCPSS,
25101 IX86_BUILTIN_RSQRTPS,
25102 IX86_BUILTIN_RSQRTPS_NR,
25103 IX86_BUILTIN_RSQRTSS,
25104 IX86_BUILTIN_RSQRTF,
25105 IX86_BUILTIN_SQRTPS,
25106 IX86_BUILTIN_SQRTPS_NR,
25107 IX86_BUILTIN_SQRTSS,
25108
25109 IX86_BUILTIN_UNPCKHPS,
25110 IX86_BUILTIN_UNPCKLPS,
25111
25112 IX86_BUILTIN_ANDPS,
25113 IX86_BUILTIN_ANDNPS,
25114 IX86_BUILTIN_ORPS,
25115 IX86_BUILTIN_XORPS,
25116
25117 IX86_BUILTIN_EMMS,
25118 IX86_BUILTIN_LDMXCSR,
25119 IX86_BUILTIN_STMXCSR,
25120 IX86_BUILTIN_SFENCE,
25121
25122 /* 3DNow! Original */
25123 IX86_BUILTIN_FEMMS,
25124 IX86_BUILTIN_PAVGUSB,
25125 IX86_BUILTIN_PF2ID,
25126 IX86_BUILTIN_PFACC,
25127 IX86_BUILTIN_PFADD,
25128 IX86_BUILTIN_PFCMPEQ,
25129 IX86_BUILTIN_PFCMPGE,
25130 IX86_BUILTIN_PFCMPGT,
25131 IX86_BUILTIN_PFMAX,
25132 IX86_BUILTIN_PFMIN,
25133 IX86_BUILTIN_PFMUL,
25134 IX86_BUILTIN_PFRCP,
25135 IX86_BUILTIN_PFRCPIT1,
25136 IX86_BUILTIN_PFRCPIT2,
25137 IX86_BUILTIN_PFRSQIT1,
25138 IX86_BUILTIN_PFRSQRT,
25139 IX86_BUILTIN_PFSUB,
25140 IX86_BUILTIN_PFSUBR,
25141 IX86_BUILTIN_PI2FD,
25142 IX86_BUILTIN_PMULHRW,
25143
25144 /* 3DNow! Athlon Extensions */
25145 IX86_BUILTIN_PF2IW,
25146 IX86_BUILTIN_PFNACC,
25147 IX86_BUILTIN_PFPNACC,
25148 IX86_BUILTIN_PI2FW,
25149 IX86_BUILTIN_PSWAPDSI,
25150 IX86_BUILTIN_PSWAPDSF,
25151
25152 /* SSE2 */
25153 IX86_BUILTIN_ADDPD,
25154 IX86_BUILTIN_ADDSD,
25155 IX86_BUILTIN_DIVPD,
25156 IX86_BUILTIN_DIVSD,
25157 IX86_BUILTIN_MULPD,
25158 IX86_BUILTIN_MULSD,
25159 IX86_BUILTIN_SUBPD,
25160 IX86_BUILTIN_SUBSD,
25161
25162 IX86_BUILTIN_CMPEQPD,
25163 IX86_BUILTIN_CMPLTPD,
25164 IX86_BUILTIN_CMPLEPD,
25165 IX86_BUILTIN_CMPGTPD,
25166 IX86_BUILTIN_CMPGEPD,
25167 IX86_BUILTIN_CMPNEQPD,
25168 IX86_BUILTIN_CMPNLTPD,
25169 IX86_BUILTIN_CMPNLEPD,
25170 IX86_BUILTIN_CMPNGTPD,
25171 IX86_BUILTIN_CMPNGEPD,
25172 IX86_BUILTIN_CMPORDPD,
25173 IX86_BUILTIN_CMPUNORDPD,
25174 IX86_BUILTIN_CMPEQSD,
25175 IX86_BUILTIN_CMPLTSD,
25176 IX86_BUILTIN_CMPLESD,
25177 IX86_BUILTIN_CMPNEQSD,
25178 IX86_BUILTIN_CMPNLTSD,
25179 IX86_BUILTIN_CMPNLESD,
25180 IX86_BUILTIN_CMPORDSD,
25181 IX86_BUILTIN_CMPUNORDSD,
25182
25183 IX86_BUILTIN_COMIEQSD,
25184 IX86_BUILTIN_COMILTSD,
25185 IX86_BUILTIN_COMILESD,
25186 IX86_BUILTIN_COMIGTSD,
25187 IX86_BUILTIN_COMIGESD,
25188 IX86_BUILTIN_COMINEQSD,
25189 IX86_BUILTIN_UCOMIEQSD,
25190 IX86_BUILTIN_UCOMILTSD,
25191 IX86_BUILTIN_UCOMILESD,
25192 IX86_BUILTIN_UCOMIGTSD,
25193 IX86_BUILTIN_UCOMIGESD,
25194 IX86_BUILTIN_UCOMINEQSD,
25195
25196 IX86_BUILTIN_MAXPD,
25197 IX86_BUILTIN_MAXSD,
25198 IX86_BUILTIN_MINPD,
25199 IX86_BUILTIN_MINSD,
25200
25201 IX86_BUILTIN_ANDPD,
25202 IX86_BUILTIN_ANDNPD,
25203 IX86_BUILTIN_ORPD,
25204 IX86_BUILTIN_XORPD,
25205
25206 IX86_BUILTIN_SQRTPD,
25207 IX86_BUILTIN_SQRTSD,
25208
25209 IX86_BUILTIN_UNPCKHPD,
25210 IX86_BUILTIN_UNPCKLPD,
25211
25212 IX86_BUILTIN_SHUFPD,
25213
25214 IX86_BUILTIN_LOADUPD,
25215 IX86_BUILTIN_STOREUPD,
25216 IX86_BUILTIN_MOVSD,
25217
25218 IX86_BUILTIN_LOADHPD,
25219 IX86_BUILTIN_LOADLPD,
25220
25221 IX86_BUILTIN_CVTDQ2PD,
25222 IX86_BUILTIN_CVTDQ2PS,
25223
25224 IX86_BUILTIN_CVTPD2DQ,
25225 IX86_BUILTIN_CVTPD2PI,
25226 IX86_BUILTIN_CVTPD2PS,
25227 IX86_BUILTIN_CVTTPD2DQ,
25228 IX86_BUILTIN_CVTTPD2PI,
25229
25230 IX86_BUILTIN_CVTPI2PD,
25231 IX86_BUILTIN_CVTSI2SD,
25232 IX86_BUILTIN_CVTSI642SD,
25233
25234 IX86_BUILTIN_CVTSD2SI,
25235 IX86_BUILTIN_CVTSD2SI64,
25236 IX86_BUILTIN_CVTSD2SS,
25237 IX86_BUILTIN_CVTSS2SD,
25238 IX86_BUILTIN_CVTTSD2SI,
25239 IX86_BUILTIN_CVTTSD2SI64,
25240
25241 IX86_BUILTIN_CVTPS2DQ,
25242 IX86_BUILTIN_CVTPS2PD,
25243 IX86_BUILTIN_CVTTPS2DQ,
25244
25245 IX86_BUILTIN_MOVNTI,
25246 IX86_BUILTIN_MOVNTI64,
25247 IX86_BUILTIN_MOVNTPD,
25248 IX86_BUILTIN_MOVNTDQ,
25249
25250 IX86_BUILTIN_MOVQ128,
25251
25252 /* SSE2 MMX */
25253 IX86_BUILTIN_MASKMOVDQU,
25254 IX86_BUILTIN_MOVMSKPD,
25255 IX86_BUILTIN_PMOVMSKB128,
25256
25257 IX86_BUILTIN_PACKSSWB128,
25258 IX86_BUILTIN_PACKSSDW128,
25259 IX86_BUILTIN_PACKUSWB128,
25260
25261 IX86_BUILTIN_PADDB128,
25262 IX86_BUILTIN_PADDW128,
25263 IX86_BUILTIN_PADDD128,
25264 IX86_BUILTIN_PADDQ128,
25265 IX86_BUILTIN_PADDSB128,
25266 IX86_BUILTIN_PADDSW128,
25267 IX86_BUILTIN_PADDUSB128,
25268 IX86_BUILTIN_PADDUSW128,
25269 IX86_BUILTIN_PSUBB128,
25270 IX86_BUILTIN_PSUBW128,
25271 IX86_BUILTIN_PSUBD128,
25272 IX86_BUILTIN_PSUBQ128,
25273 IX86_BUILTIN_PSUBSB128,
25274 IX86_BUILTIN_PSUBSW128,
25275 IX86_BUILTIN_PSUBUSB128,
25276 IX86_BUILTIN_PSUBUSW128,
25277
25278 IX86_BUILTIN_PAND128,
25279 IX86_BUILTIN_PANDN128,
25280 IX86_BUILTIN_POR128,
25281 IX86_BUILTIN_PXOR128,
25282
25283 IX86_BUILTIN_PAVGB128,
25284 IX86_BUILTIN_PAVGW128,
25285
25286 IX86_BUILTIN_PCMPEQB128,
25287 IX86_BUILTIN_PCMPEQW128,
25288 IX86_BUILTIN_PCMPEQD128,
25289 IX86_BUILTIN_PCMPGTB128,
25290 IX86_BUILTIN_PCMPGTW128,
25291 IX86_BUILTIN_PCMPGTD128,
25292
25293 IX86_BUILTIN_PMADDWD128,
25294
25295 IX86_BUILTIN_PMAXSW128,
25296 IX86_BUILTIN_PMAXUB128,
25297 IX86_BUILTIN_PMINSW128,
25298 IX86_BUILTIN_PMINUB128,
25299
25300 IX86_BUILTIN_PMULUDQ,
25301 IX86_BUILTIN_PMULUDQ128,
25302 IX86_BUILTIN_PMULHUW128,
25303 IX86_BUILTIN_PMULHW128,
25304 IX86_BUILTIN_PMULLW128,
25305
25306 IX86_BUILTIN_PSADBW128,
25307 IX86_BUILTIN_PSHUFHW,
25308 IX86_BUILTIN_PSHUFLW,
25309 IX86_BUILTIN_PSHUFD,
25310
25311 IX86_BUILTIN_PSLLDQI128,
25312 IX86_BUILTIN_PSLLWI128,
25313 IX86_BUILTIN_PSLLDI128,
25314 IX86_BUILTIN_PSLLQI128,
25315 IX86_BUILTIN_PSRAWI128,
25316 IX86_BUILTIN_PSRADI128,
25317 IX86_BUILTIN_PSRLDQI128,
25318 IX86_BUILTIN_PSRLWI128,
25319 IX86_BUILTIN_PSRLDI128,
25320 IX86_BUILTIN_PSRLQI128,
25321
25322 IX86_BUILTIN_PSLLDQ128,
25323 IX86_BUILTIN_PSLLW128,
25324 IX86_BUILTIN_PSLLD128,
25325 IX86_BUILTIN_PSLLQ128,
25326 IX86_BUILTIN_PSRAW128,
25327 IX86_BUILTIN_PSRAD128,
25328 IX86_BUILTIN_PSRLW128,
25329 IX86_BUILTIN_PSRLD128,
25330 IX86_BUILTIN_PSRLQ128,
25331
25332 IX86_BUILTIN_PUNPCKHBW128,
25333 IX86_BUILTIN_PUNPCKHWD128,
25334 IX86_BUILTIN_PUNPCKHDQ128,
25335 IX86_BUILTIN_PUNPCKHQDQ128,
25336 IX86_BUILTIN_PUNPCKLBW128,
25337 IX86_BUILTIN_PUNPCKLWD128,
25338 IX86_BUILTIN_PUNPCKLDQ128,
25339 IX86_BUILTIN_PUNPCKLQDQ128,
25340
25341 IX86_BUILTIN_CLFLUSH,
25342 IX86_BUILTIN_MFENCE,
25343 IX86_BUILTIN_LFENCE,
25344 IX86_BUILTIN_PAUSE,
25345
25346 IX86_BUILTIN_BSRSI,
25347 IX86_BUILTIN_BSRDI,
25348 IX86_BUILTIN_RDPMC,
25349 IX86_BUILTIN_RDTSC,
25350 IX86_BUILTIN_RDTSCP,
25351 IX86_BUILTIN_ROLQI,
25352 IX86_BUILTIN_ROLHI,
25353 IX86_BUILTIN_RORQI,
25354 IX86_BUILTIN_RORHI,
25355
25356 /* SSE3. */
25357 IX86_BUILTIN_ADDSUBPS,
25358 IX86_BUILTIN_HADDPS,
25359 IX86_BUILTIN_HSUBPS,
25360 IX86_BUILTIN_MOVSHDUP,
25361 IX86_BUILTIN_MOVSLDUP,
25362 IX86_BUILTIN_ADDSUBPD,
25363 IX86_BUILTIN_HADDPD,
25364 IX86_BUILTIN_HSUBPD,
25365 IX86_BUILTIN_LDDQU,
25366
25367 IX86_BUILTIN_MONITOR,
25368 IX86_BUILTIN_MWAIT,
25369
25370 /* SSSE3. */
25371 IX86_BUILTIN_PHADDW,
25372 IX86_BUILTIN_PHADDD,
25373 IX86_BUILTIN_PHADDSW,
25374 IX86_BUILTIN_PHSUBW,
25375 IX86_BUILTIN_PHSUBD,
25376 IX86_BUILTIN_PHSUBSW,
25377 IX86_BUILTIN_PMADDUBSW,
25378 IX86_BUILTIN_PMULHRSW,
25379 IX86_BUILTIN_PSHUFB,
25380 IX86_BUILTIN_PSIGNB,
25381 IX86_BUILTIN_PSIGNW,
25382 IX86_BUILTIN_PSIGND,
25383 IX86_BUILTIN_PALIGNR,
25384 IX86_BUILTIN_PABSB,
25385 IX86_BUILTIN_PABSW,
25386 IX86_BUILTIN_PABSD,
25387
25388 IX86_BUILTIN_PHADDW128,
25389 IX86_BUILTIN_PHADDD128,
25390 IX86_BUILTIN_PHADDSW128,
25391 IX86_BUILTIN_PHSUBW128,
25392 IX86_BUILTIN_PHSUBD128,
25393 IX86_BUILTIN_PHSUBSW128,
25394 IX86_BUILTIN_PMADDUBSW128,
25395 IX86_BUILTIN_PMULHRSW128,
25396 IX86_BUILTIN_PSHUFB128,
25397 IX86_BUILTIN_PSIGNB128,
25398 IX86_BUILTIN_PSIGNW128,
25399 IX86_BUILTIN_PSIGND128,
25400 IX86_BUILTIN_PALIGNR128,
25401 IX86_BUILTIN_PABSB128,
25402 IX86_BUILTIN_PABSW128,
25403 IX86_BUILTIN_PABSD128,
25404
25405 /* AMDFAM10 - SSE4A New Instructions. */
25406 IX86_BUILTIN_MOVNTSD,
25407 IX86_BUILTIN_MOVNTSS,
25408 IX86_BUILTIN_EXTRQI,
25409 IX86_BUILTIN_EXTRQ,
25410 IX86_BUILTIN_INSERTQI,
25411 IX86_BUILTIN_INSERTQ,
25412
25413 /* SSE4.1. */
25414 IX86_BUILTIN_BLENDPD,
25415 IX86_BUILTIN_BLENDPS,
25416 IX86_BUILTIN_BLENDVPD,
25417 IX86_BUILTIN_BLENDVPS,
25418 IX86_BUILTIN_PBLENDVB128,
25419 IX86_BUILTIN_PBLENDW128,
25420
25421 IX86_BUILTIN_DPPD,
25422 IX86_BUILTIN_DPPS,
25423
25424 IX86_BUILTIN_INSERTPS128,
25425
25426 IX86_BUILTIN_MOVNTDQA,
25427 IX86_BUILTIN_MPSADBW128,
25428 IX86_BUILTIN_PACKUSDW128,
25429 IX86_BUILTIN_PCMPEQQ,
25430 IX86_BUILTIN_PHMINPOSUW128,
25431
25432 IX86_BUILTIN_PMAXSB128,
25433 IX86_BUILTIN_PMAXSD128,
25434 IX86_BUILTIN_PMAXUD128,
25435 IX86_BUILTIN_PMAXUW128,
25436
25437 IX86_BUILTIN_PMINSB128,
25438 IX86_BUILTIN_PMINSD128,
25439 IX86_BUILTIN_PMINUD128,
25440 IX86_BUILTIN_PMINUW128,
25441
25442 IX86_BUILTIN_PMOVSXBW128,
25443 IX86_BUILTIN_PMOVSXBD128,
25444 IX86_BUILTIN_PMOVSXBQ128,
25445 IX86_BUILTIN_PMOVSXWD128,
25446 IX86_BUILTIN_PMOVSXWQ128,
25447 IX86_BUILTIN_PMOVSXDQ128,
25448
25449 IX86_BUILTIN_PMOVZXBW128,
25450 IX86_BUILTIN_PMOVZXBD128,
25451 IX86_BUILTIN_PMOVZXBQ128,
25452 IX86_BUILTIN_PMOVZXWD128,
25453 IX86_BUILTIN_PMOVZXWQ128,
25454 IX86_BUILTIN_PMOVZXDQ128,
25455
25456 IX86_BUILTIN_PMULDQ128,
25457 IX86_BUILTIN_PMULLD128,
25458
25459 IX86_BUILTIN_ROUNDSD,
25460 IX86_BUILTIN_ROUNDSS,
25461
25462 IX86_BUILTIN_ROUNDPD,
25463 IX86_BUILTIN_ROUNDPS,
25464
25465 IX86_BUILTIN_FLOORPD,
25466 IX86_BUILTIN_CEILPD,
25467 IX86_BUILTIN_TRUNCPD,
25468 IX86_BUILTIN_RINTPD,
25469 IX86_BUILTIN_ROUNDPD_AZ,
25470
25471 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25472 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25473 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25474
25475 IX86_BUILTIN_FLOORPS,
25476 IX86_BUILTIN_CEILPS,
25477 IX86_BUILTIN_TRUNCPS,
25478 IX86_BUILTIN_RINTPS,
25479 IX86_BUILTIN_ROUNDPS_AZ,
25480
25481 IX86_BUILTIN_FLOORPS_SFIX,
25482 IX86_BUILTIN_CEILPS_SFIX,
25483 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25484
25485 IX86_BUILTIN_PTESTZ,
25486 IX86_BUILTIN_PTESTC,
25487 IX86_BUILTIN_PTESTNZC,
25488
25489 IX86_BUILTIN_VEC_INIT_V2SI,
25490 IX86_BUILTIN_VEC_INIT_V4HI,
25491 IX86_BUILTIN_VEC_INIT_V8QI,
25492 IX86_BUILTIN_VEC_EXT_V2DF,
25493 IX86_BUILTIN_VEC_EXT_V2DI,
25494 IX86_BUILTIN_VEC_EXT_V4SF,
25495 IX86_BUILTIN_VEC_EXT_V4SI,
25496 IX86_BUILTIN_VEC_EXT_V8HI,
25497 IX86_BUILTIN_VEC_EXT_V2SI,
25498 IX86_BUILTIN_VEC_EXT_V4HI,
25499 IX86_BUILTIN_VEC_EXT_V16QI,
25500 IX86_BUILTIN_VEC_SET_V2DI,
25501 IX86_BUILTIN_VEC_SET_V4SF,
25502 IX86_BUILTIN_VEC_SET_V4SI,
25503 IX86_BUILTIN_VEC_SET_V8HI,
25504 IX86_BUILTIN_VEC_SET_V4HI,
25505 IX86_BUILTIN_VEC_SET_V16QI,
25506
25507 IX86_BUILTIN_VEC_PACK_SFIX,
25508 IX86_BUILTIN_VEC_PACK_SFIX256,
25509
25510 /* SSE4.2. */
25511 IX86_BUILTIN_CRC32QI,
25512 IX86_BUILTIN_CRC32HI,
25513 IX86_BUILTIN_CRC32SI,
25514 IX86_BUILTIN_CRC32DI,
25515
25516 IX86_BUILTIN_PCMPESTRI128,
25517 IX86_BUILTIN_PCMPESTRM128,
25518 IX86_BUILTIN_PCMPESTRA128,
25519 IX86_BUILTIN_PCMPESTRC128,
25520 IX86_BUILTIN_PCMPESTRO128,
25521 IX86_BUILTIN_PCMPESTRS128,
25522 IX86_BUILTIN_PCMPESTRZ128,
25523 IX86_BUILTIN_PCMPISTRI128,
25524 IX86_BUILTIN_PCMPISTRM128,
25525 IX86_BUILTIN_PCMPISTRA128,
25526 IX86_BUILTIN_PCMPISTRC128,
25527 IX86_BUILTIN_PCMPISTRO128,
25528 IX86_BUILTIN_PCMPISTRS128,
25529 IX86_BUILTIN_PCMPISTRZ128,
25530
25531 IX86_BUILTIN_PCMPGTQ,
25532
25533 /* AES instructions */
25534 IX86_BUILTIN_AESENC128,
25535 IX86_BUILTIN_AESENCLAST128,
25536 IX86_BUILTIN_AESDEC128,
25537 IX86_BUILTIN_AESDECLAST128,
25538 IX86_BUILTIN_AESIMC128,
25539 IX86_BUILTIN_AESKEYGENASSIST128,
25540
25541 /* PCLMUL instruction */
25542 IX86_BUILTIN_PCLMULQDQ128,
25543
25544 /* AVX */
25545 IX86_BUILTIN_ADDPD256,
25546 IX86_BUILTIN_ADDPS256,
25547 IX86_BUILTIN_ADDSUBPD256,
25548 IX86_BUILTIN_ADDSUBPS256,
25549 IX86_BUILTIN_ANDPD256,
25550 IX86_BUILTIN_ANDPS256,
25551 IX86_BUILTIN_ANDNPD256,
25552 IX86_BUILTIN_ANDNPS256,
25553 IX86_BUILTIN_BLENDPD256,
25554 IX86_BUILTIN_BLENDPS256,
25555 IX86_BUILTIN_BLENDVPD256,
25556 IX86_BUILTIN_BLENDVPS256,
25557 IX86_BUILTIN_DIVPD256,
25558 IX86_BUILTIN_DIVPS256,
25559 IX86_BUILTIN_DPPS256,
25560 IX86_BUILTIN_HADDPD256,
25561 IX86_BUILTIN_HADDPS256,
25562 IX86_BUILTIN_HSUBPD256,
25563 IX86_BUILTIN_HSUBPS256,
25564 IX86_BUILTIN_MAXPD256,
25565 IX86_BUILTIN_MAXPS256,
25566 IX86_BUILTIN_MINPD256,
25567 IX86_BUILTIN_MINPS256,
25568 IX86_BUILTIN_MULPD256,
25569 IX86_BUILTIN_MULPS256,
25570 IX86_BUILTIN_ORPD256,
25571 IX86_BUILTIN_ORPS256,
25572 IX86_BUILTIN_SHUFPD256,
25573 IX86_BUILTIN_SHUFPS256,
25574 IX86_BUILTIN_SUBPD256,
25575 IX86_BUILTIN_SUBPS256,
25576 IX86_BUILTIN_XORPD256,
25577 IX86_BUILTIN_XORPS256,
25578 IX86_BUILTIN_CMPSD,
25579 IX86_BUILTIN_CMPSS,
25580 IX86_BUILTIN_CMPPD,
25581 IX86_BUILTIN_CMPPS,
25582 IX86_BUILTIN_CMPPD256,
25583 IX86_BUILTIN_CMPPS256,
25584 IX86_BUILTIN_CVTDQ2PD256,
25585 IX86_BUILTIN_CVTDQ2PS256,
25586 IX86_BUILTIN_CVTPD2PS256,
25587 IX86_BUILTIN_CVTPS2DQ256,
25588 IX86_BUILTIN_CVTPS2PD256,
25589 IX86_BUILTIN_CVTTPD2DQ256,
25590 IX86_BUILTIN_CVTPD2DQ256,
25591 IX86_BUILTIN_CVTTPS2DQ256,
25592 IX86_BUILTIN_EXTRACTF128PD256,
25593 IX86_BUILTIN_EXTRACTF128PS256,
25594 IX86_BUILTIN_EXTRACTF128SI256,
25595 IX86_BUILTIN_VZEROALL,
25596 IX86_BUILTIN_VZEROUPPER,
25597 IX86_BUILTIN_VPERMILVARPD,
25598 IX86_BUILTIN_VPERMILVARPS,
25599 IX86_BUILTIN_VPERMILVARPD256,
25600 IX86_BUILTIN_VPERMILVARPS256,
25601 IX86_BUILTIN_VPERMILPD,
25602 IX86_BUILTIN_VPERMILPS,
25603 IX86_BUILTIN_VPERMILPD256,
25604 IX86_BUILTIN_VPERMILPS256,
25605 IX86_BUILTIN_VPERMIL2PD,
25606 IX86_BUILTIN_VPERMIL2PS,
25607 IX86_BUILTIN_VPERMIL2PD256,
25608 IX86_BUILTIN_VPERMIL2PS256,
25609 IX86_BUILTIN_VPERM2F128PD256,
25610 IX86_BUILTIN_VPERM2F128PS256,
25611 IX86_BUILTIN_VPERM2F128SI256,
25612 IX86_BUILTIN_VBROADCASTSS,
25613 IX86_BUILTIN_VBROADCASTSD256,
25614 IX86_BUILTIN_VBROADCASTSS256,
25615 IX86_BUILTIN_VBROADCASTPD256,
25616 IX86_BUILTIN_VBROADCASTPS256,
25617 IX86_BUILTIN_VINSERTF128PD256,
25618 IX86_BUILTIN_VINSERTF128PS256,
25619 IX86_BUILTIN_VINSERTF128SI256,
25620 IX86_BUILTIN_LOADUPD256,
25621 IX86_BUILTIN_LOADUPS256,
25622 IX86_BUILTIN_STOREUPD256,
25623 IX86_BUILTIN_STOREUPS256,
25624 IX86_BUILTIN_LDDQU256,
25625 IX86_BUILTIN_MOVNTDQ256,
25626 IX86_BUILTIN_MOVNTPD256,
25627 IX86_BUILTIN_MOVNTPS256,
25628 IX86_BUILTIN_LOADDQU256,
25629 IX86_BUILTIN_STOREDQU256,
25630 IX86_BUILTIN_MASKLOADPD,
25631 IX86_BUILTIN_MASKLOADPS,
25632 IX86_BUILTIN_MASKSTOREPD,
25633 IX86_BUILTIN_MASKSTOREPS,
25634 IX86_BUILTIN_MASKLOADPD256,
25635 IX86_BUILTIN_MASKLOADPS256,
25636 IX86_BUILTIN_MASKSTOREPD256,
25637 IX86_BUILTIN_MASKSTOREPS256,
25638 IX86_BUILTIN_MOVSHDUP256,
25639 IX86_BUILTIN_MOVSLDUP256,
25640 IX86_BUILTIN_MOVDDUP256,
25641
25642 IX86_BUILTIN_SQRTPD256,
25643 IX86_BUILTIN_SQRTPS256,
25644 IX86_BUILTIN_SQRTPS_NR256,
25645 IX86_BUILTIN_RSQRTPS256,
25646 IX86_BUILTIN_RSQRTPS_NR256,
25647
25648 IX86_BUILTIN_RCPPS256,
25649
25650 IX86_BUILTIN_ROUNDPD256,
25651 IX86_BUILTIN_ROUNDPS256,
25652
25653 IX86_BUILTIN_FLOORPD256,
25654 IX86_BUILTIN_CEILPD256,
25655 IX86_BUILTIN_TRUNCPD256,
25656 IX86_BUILTIN_RINTPD256,
25657 IX86_BUILTIN_ROUNDPD_AZ256,
25658
25659 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25660 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25661 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25662
25663 IX86_BUILTIN_FLOORPS256,
25664 IX86_BUILTIN_CEILPS256,
25665 IX86_BUILTIN_TRUNCPS256,
25666 IX86_BUILTIN_RINTPS256,
25667 IX86_BUILTIN_ROUNDPS_AZ256,
25668
25669 IX86_BUILTIN_FLOORPS_SFIX256,
25670 IX86_BUILTIN_CEILPS_SFIX256,
25671 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25672
25673 IX86_BUILTIN_UNPCKHPD256,
25674 IX86_BUILTIN_UNPCKLPD256,
25675 IX86_BUILTIN_UNPCKHPS256,
25676 IX86_BUILTIN_UNPCKLPS256,
25677
25678 IX86_BUILTIN_SI256_SI,
25679 IX86_BUILTIN_PS256_PS,
25680 IX86_BUILTIN_PD256_PD,
25681 IX86_BUILTIN_SI_SI256,
25682 IX86_BUILTIN_PS_PS256,
25683 IX86_BUILTIN_PD_PD256,
25684
25685 IX86_BUILTIN_VTESTZPD,
25686 IX86_BUILTIN_VTESTCPD,
25687 IX86_BUILTIN_VTESTNZCPD,
25688 IX86_BUILTIN_VTESTZPS,
25689 IX86_BUILTIN_VTESTCPS,
25690 IX86_BUILTIN_VTESTNZCPS,
25691 IX86_BUILTIN_VTESTZPD256,
25692 IX86_BUILTIN_VTESTCPD256,
25693 IX86_BUILTIN_VTESTNZCPD256,
25694 IX86_BUILTIN_VTESTZPS256,
25695 IX86_BUILTIN_VTESTCPS256,
25696 IX86_BUILTIN_VTESTNZCPS256,
25697 IX86_BUILTIN_PTESTZ256,
25698 IX86_BUILTIN_PTESTC256,
25699 IX86_BUILTIN_PTESTNZC256,
25700
25701 IX86_BUILTIN_MOVMSKPD256,
25702 IX86_BUILTIN_MOVMSKPS256,
25703
25704 /* AVX2 */
25705 IX86_BUILTIN_MPSADBW256,
25706 IX86_BUILTIN_PABSB256,
25707 IX86_BUILTIN_PABSW256,
25708 IX86_BUILTIN_PABSD256,
25709 IX86_BUILTIN_PACKSSDW256,
25710 IX86_BUILTIN_PACKSSWB256,
25711 IX86_BUILTIN_PACKUSDW256,
25712 IX86_BUILTIN_PACKUSWB256,
25713 IX86_BUILTIN_PADDB256,
25714 IX86_BUILTIN_PADDW256,
25715 IX86_BUILTIN_PADDD256,
25716 IX86_BUILTIN_PADDQ256,
25717 IX86_BUILTIN_PADDSB256,
25718 IX86_BUILTIN_PADDSW256,
25719 IX86_BUILTIN_PADDUSB256,
25720 IX86_BUILTIN_PADDUSW256,
25721 IX86_BUILTIN_PALIGNR256,
25722 IX86_BUILTIN_AND256I,
25723 IX86_BUILTIN_ANDNOT256I,
25724 IX86_BUILTIN_PAVGB256,
25725 IX86_BUILTIN_PAVGW256,
25726 IX86_BUILTIN_PBLENDVB256,
25727 IX86_BUILTIN_PBLENDVW256,
25728 IX86_BUILTIN_PCMPEQB256,
25729 IX86_BUILTIN_PCMPEQW256,
25730 IX86_BUILTIN_PCMPEQD256,
25731 IX86_BUILTIN_PCMPEQQ256,
25732 IX86_BUILTIN_PCMPGTB256,
25733 IX86_BUILTIN_PCMPGTW256,
25734 IX86_BUILTIN_PCMPGTD256,
25735 IX86_BUILTIN_PCMPGTQ256,
25736 IX86_BUILTIN_PHADDW256,
25737 IX86_BUILTIN_PHADDD256,
25738 IX86_BUILTIN_PHADDSW256,
25739 IX86_BUILTIN_PHSUBW256,
25740 IX86_BUILTIN_PHSUBD256,
25741 IX86_BUILTIN_PHSUBSW256,
25742 IX86_BUILTIN_PMADDUBSW256,
25743 IX86_BUILTIN_PMADDWD256,
25744 IX86_BUILTIN_PMAXSB256,
25745 IX86_BUILTIN_PMAXSW256,
25746 IX86_BUILTIN_PMAXSD256,
25747 IX86_BUILTIN_PMAXUB256,
25748 IX86_BUILTIN_PMAXUW256,
25749 IX86_BUILTIN_PMAXUD256,
25750 IX86_BUILTIN_PMINSB256,
25751 IX86_BUILTIN_PMINSW256,
25752 IX86_BUILTIN_PMINSD256,
25753 IX86_BUILTIN_PMINUB256,
25754 IX86_BUILTIN_PMINUW256,
25755 IX86_BUILTIN_PMINUD256,
25756 IX86_BUILTIN_PMOVMSKB256,
25757 IX86_BUILTIN_PMOVSXBW256,
25758 IX86_BUILTIN_PMOVSXBD256,
25759 IX86_BUILTIN_PMOVSXBQ256,
25760 IX86_BUILTIN_PMOVSXWD256,
25761 IX86_BUILTIN_PMOVSXWQ256,
25762 IX86_BUILTIN_PMOVSXDQ256,
25763 IX86_BUILTIN_PMOVZXBW256,
25764 IX86_BUILTIN_PMOVZXBD256,
25765 IX86_BUILTIN_PMOVZXBQ256,
25766 IX86_BUILTIN_PMOVZXWD256,
25767 IX86_BUILTIN_PMOVZXWQ256,
25768 IX86_BUILTIN_PMOVZXDQ256,
25769 IX86_BUILTIN_PMULDQ256,
25770 IX86_BUILTIN_PMULHRSW256,
25771 IX86_BUILTIN_PMULHUW256,
25772 IX86_BUILTIN_PMULHW256,
25773 IX86_BUILTIN_PMULLW256,
25774 IX86_BUILTIN_PMULLD256,
25775 IX86_BUILTIN_PMULUDQ256,
25776 IX86_BUILTIN_POR256,
25777 IX86_BUILTIN_PSADBW256,
25778 IX86_BUILTIN_PSHUFB256,
25779 IX86_BUILTIN_PSHUFD256,
25780 IX86_BUILTIN_PSHUFHW256,
25781 IX86_BUILTIN_PSHUFLW256,
25782 IX86_BUILTIN_PSIGNB256,
25783 IX86_BUILTIN_PSIGNW256,
25784 IX86_BUILTIN_PSIGND256,
25785 IX86_BUILTIN_PSLLDQI256,
25786 IX86_BUILTIN_PSLLWI256,
25787 IX86_BUILTIN_PSLLW256,
25788 IX86_BUILTIN_PSLLDI256,
25789 IX86_BUILTIN_PSLLD256,
25790 IX86_BUILTIN_PSLLQI256,
25791 IX86_BUILTIN_PSLLQ256,
25792 IX86_BUILTIN_PSRAWI256,
25793 IX86_BUILTIN_PSRAW256,
25794 IX86_BUILTIN_PSRADI256,
25795 IX86_BUILTIN_PSRAD256,
25796 IX86_BUILTIN_PSRLDQI256,
25797 IX86_BUILTIN_PSRLWI256,
25798 IX86_BUILTIN_PSRLW256,
25799 IX86_BUILTIN_PSRLDI256,
25800 IX86_BUILTIN_PSRLD256,
25801 IX86_BUILTIN_PSRLQI256,
25802 IX86_BUILTIN_PSRLQ256,
25803 IX86_BUILTIN_PSUBB256,
25804 IX86_BUILTIN_PSUBW256,
25805 IX86_BUILTIN_PSUBD256,
25806 IX86_BUILTIN_PSUBQ256,
25807 IX86_BUILTIN_PSUBSB256,
25808 IX86_BUILTIN_PSUBSW256,
25809 IX86_BUILTIN_PSUBUSB256,
25810 IX86_BUILTIN_PSUBUSW256,
25811 IX86_BUILTIN_PUNPCKHBW256,
25812 IX86_BUILTIN_PUNPCKHWD256,
25813 IX86_BUILTIN_PUNPCKHDQ256,
25814 IX86_BUILTIN_PUNPCKHQDQ256,
25815 IX86_BUILTIN_PUNPCKLBW256,
25816 IX86_BUILTIN_PUNPCKLWD256,
25817 IX86_BUILTIN_PUNPCKLDQ256,
25818 IX86_BUILTIN_PUNPCKLQDQ256,
25819 IX86_BUILTIN_PXOR256,
25820 IX86_BUILTIN_MOVNTDQA256,
25821 IX86_BUILTIN_VBROADCASTSS_PS,
25822 IX86_BUILTIN_VBROADCASTSS_PS256,
25823 IX86_BUILTIN_VBROADCASTSD_PD256,
25824 IX86_BUILTIN_VBROADCASTSI256,
25825 IX86_BUILTIN_PBLENDD256,
25826 IX86_BUILTIN_PBLENDD128,
25827 IX86_BUILTIN_PBROADCASTB256,
25828 IX86_BUILTIN_PBROADCASTW256,
25829 IX86_BUILTIN_PBROADCASTD256,
25830 IX86_BUILTIN_PBROADCASTQ256,
25831 IX86_BUILTIN_PBROADCASTB128,
25832 IX86_BUILTIN_PBROADCASTW128,
25833 IX86_BUILTIN_PBROADCASTD128,
25834 IX86_BUILTIN_PBROADCASTQ128,
25835 IX86_BUILTIN_VPERMVARSI256,
25836 IX86_BUILTIN_VPERMDF256,
25837 IX86_BUILTIN_VPERMVARSF256,
25838 IX86_BUILTIN_VPERMDI256,
25839 IX86_BUILTIN_VPERMTI256,
25840 IX86_BUILTIN_VEXTRACT128I256,
25841 IX86_BUILTIN_VINSERT128I256,
25842 IX86_BUILTIN_MASKLOADD,
25843 IX86_BUILTIN_MASKLOADQ,
25844 IX86_BUILTIN_MASKLOADD256,
25845 IX86_BUILTIN_MASKLOADQ256,
25846 IX86_BUILTIN_MASKSTORED,
25847 IX86_BUILTIN_MASKSTOREQ,
25848 IX86_BUILTIN_MASKSTORED256,
25849 IX86_BUILTIN_MASKSTOREQ256,
25850 IX86_BUILTIN_PSLLVV4DI,
25851 IX86_BUILTIN_PSLLVV2DI,
25852 IX86_BUILTIN_PSLLVV8SI,
25853 IX86_BUILTIN_PSLLVV4SI,
25854 IX86_BUILTIN_PSRAVV8SI,
25855 IX86_BUILTIN_PSRAVV4SI,
25856 IX86_BUILTIN_PSRLVV4DI,
25857 IX86_BUILTIN_PSRLVV2DI,
25858 IX86_BUILTIN_PSRLVV8SI,
25859 IX86_BUILTIN_PSRLVV4SI,
25860
25861 IX86_BUILTIN_GATHERSIV2DF,
25862 IX86_BUILTIN_GATHERSIV4DF,
25863 IX86_BUILTIN_GATHERDIV2DF,
25864 IX86_BUILTIN_GATHERDIV4DF,
25865 IX86_BUILTIN_GATHERSIV4SF,
25866 IX86_BUILTIN_GATHERSIV8SF,
25867 IX86_BUILTIN_GATHERDIV4SF,
25868 IX86_BUILTIN_GATHERDIV8SF,
25869 IX86_BUILTIN_GATHERSIV2DI,
25870 IX86_BUILTIN_GATHERSIV4DI,
25871 IX86_BUILTIN_GATHERDIV2DI,
25872 IX86_BUILTIN_GATHERDIV4DI,
25873 IX86_BUILTIN_GATHERSIV4SI,
25874 IX86_BUILTIN_GATHERSIV8SI,
25875 IX86_BUILTIN_GATHERDIV4SI,
25876 IX86_BUILTIN_GATHERDIV8SI,
25877
25878 /* Alternate 4 element gather for the vectorizer where
25879 all operands are 32-byte wide. */
25880 IX86_BUILTIN_GATHERALTSIV4DF,
25881 IX86_BUILTIN_GATHERALTDIV8SF,
25882 IX86_BUILTIN_GATHERALTSIV4DI,
25883 IX86_BUILTIN_GATHERALTDIV8SI,
25884
25885 /* TFmode support builtins. */
25886 IX86_BUILTIN_INFQ,
25887 IX86_BUILTIN_HUGE_VALQ,
25888 IX86_BUILTIN_FABSQ,
25889 IX86_BUILTIN_COPYSIGNQ,
25890
25891 /* Vectorizer support builtins. */
25892 IX86_BUILTIN_CPYSGNPS,
25893 IX86_BUILTIN_CPYSGNPD,
25894 IX86_BUILTIN_CPYSGNPS256,
25895 IX86_BUILTIN_CPYSGNPD256,
25896
25897 /* FMA4 instructions. */
25898 IX86_BUILTIN_VFMADDSS,
25899 IX86_BUILTIN_VFMADDSD,
25900 IX86_BUILTIN_VFMADDPS,
25901 IX86_BUILTIN_VFMADDPD,
25902 IX86_BUILTIN_VFMADDPS256,
25903 IX86_BUILTIN_VFMADDPD256,
25904 IX86_BUILTIN_VFMADDSUBPS,
25905 IX86_BUILTIN_VFMADDSUBPD,
25906 IX86_BUILTIN_VFMADDSUBPS256,
25907 IX86_BUILTIN_VFMADDSUBPD256,
25908
25909 /* FMA3 instructions. */
25910 IX86_BUILTIN_VFMADDSS3,
25911 IX86_BUILTIN_VFMADDSD3,
25912
25913 /* XOP instructions. */
25914 IX86_BUILTIN_VPCMOV,
25915 IX86_BUILTIN_VPCMOV_V2DI,
25916 IX86_BUILTIN_VPCMOV_V4SI,
25917 IX86_BUILTIN_VPCMOV_V8HI,
25918 IX86_BUILTIN_VPCMOV_V16QI,
25919 IX86_BUILTIN_VPCMOV_V4SF,
25920 IX86_BUILTIN_VPCMOV_V2DF,
25921 IX86_BUILTIN_VPCMOV256,
25922 IX86_BUILTIN_VPCMOV_V4DI256,
25923 IX86_BUILTIN_VPCMOV_V8SI256,
25924 IX86_BUILTIN_VPCMOV_V16HI256,
25925 IX86_BUILTIN_VPCMOV_V32QI256,
25926 IX86_BUILTIN_VPCMOV_V8SF256,
25927 IX86_BUILTIN_VPCMOV_V4DF256,
25928
25929 IX86_BUILTIN_VPPERM,
25930
25931 IX86_BUILTIN_VPMACSSWW,
25932 IX86_BUILTIN_VPMACSWW,
25933 IX86_BUILTIN_VPMACSSWD,
25934 IX86_BUILTIN_VPMACSWD,
25935 IX86_BUILTIN_VPMACSSDD,
25936 IX86_BUILTIN_VPMACSDD,
25937 IX86_BUILTIN_VPMACSSDQL,
25938 IX86_BUILTIN_VPMACSSDQH,
25939 IX86_BUILTIN_VPMACSDQL,
25940 IX86_BUILTIN_VPMACSDQH,
25941 IX86_BUILTIN_VPMADCSSWD,
25942 IX86_BUILTIN_VPMADCSWD,
25943
25944 IX86_BUILTIN_VPHADDBW,
25945 IX86_BUILTIN_VPHADDBD,
25946 IX86_BUILTIN_VPHADDBQ,
25947 IX86_BUILTIN_VPHADDWD,
25948 IX86_BUILTIN_VPHADDWQ,
25949 IX86_BUILTIN_VPHADDDQ,
25950 IX86_BUILTIN_VPHADDUBW,
25951 IX86_BUILTIN_VPHADDUBD,
25952 IX86_BUILTIN_VPHADDUBQ,
25953 IX86_BUILTIN_VPHADDUWD,
25954 IX86_BUILTIN_VPHADDUWQ,
25955 IX86_BUILTIN_VPHADDUDQ,
25956 IX86_BUILTIN_VPHSUBBW,
25957 IX86_BUILTIN_VPHSUBWD,
25958 IX86_BUILTIN_VPHSUBDQ,
25959
25960 IX86_BUILTIN_VPROTB,
25961 IX86_BUILTIN_VPROTW,
25962 IX86_BUILTIN_VPROTD,
25963 IX86_BUILTIN_VPROTQ,
25964 IX86_BUILTIN_VPROTB_IMM,
25965 IX86_BUILTIN_VPROTW_IMM,
25966 IX86_BUILTIN_VPROTD_IMM,
25967 IX86_BUILTIN_VPROTQ_IMM,
25968
25969 IX86_BUILTIN_VPSHLB,
25970 IX86_BUILTIN_VPSHLW,
25971 IX86_BUILTIN_VPSHLD,
25972 IX86_BUILTIN_VPSHLQ,
25973 IX86_BUILTIN_VPSHAB,
25974 IX86_BUILTIN_VPSHAW,
25975 IX86_BUILTIN_VPSHAD,
25976 IX86_BUILTIN_VPSHAQ,
25977
25978 IX86_BUILTIN_VFRCZSS,
25979 IX86_BUILTIN_VFRCZSD,
25980 IX86_BUILTIN_VFRCZPS,
25981 IX86_BUILTIN_VFRCZPD,
25982 IX86_BUILTIN_VFRCZPS256,
25983 IX86_BUILTIN_VFRCZPD256,
25984
25985 IX86_BUILTIN_VPCOMEQUB,
25986 IX86_BUILTIN_VPCOMNEUB,
25987 IX86_BUILTIN_VPCOMLTUB,
25988 IX86_BUILTIN_VPCOMLEUB,
25989 IX86_BUILTIN_VPCOMGTUB,
25990 IX86_BUILTIN_VPCOMGEUB,
25991 IX86_BUILTIN_VPCOMFALSEUB,
25992 IX86_BUILTIN_VPCOMTRUEUB,
25993
25994 IX86_BUILTIN_VPCOMEQUW,
25995 IX86_BUILTIN_VPCOMNEUW,
25996 IX86_BUILTIN_VPCOMLTUW,
25997 IX86_BUILTIN_VPCOMLEUW,
25998 IX86_BUILTIN_VPCOMGTUW,
25999 IX86_BUILTIN_VPCOMGEUW,
26000 IX86_BUILTIN_VPCOMFALSEUW,
26001 IX86_BUILTIN_VPCOMTRUEUW,
26002
26003 IX86_BUILTIN_VPCOMEQUD,
26004 IX86_BUILTIN_VPCOMNEUD,
26005 IX86_BUILTIN_VPCOMLTUD,
26006 IX86_BUILTIN_VPCOMLEUD,
26007 IX86_BUILTIN_VPCOMGTUD,
26008 IX86_BUILTIN_VPCOMGEUD,
26009 IX86_BUILTIN_VPCOMFALSEUD,
26010 IX86_BUILTIN_VPCOMTRUEUD,
26011
26012 IX86_BUILTIN_VPCOMEQUQ,
26013 IX86_BUILTIN_VPCOMNEUQ,
26014 IX86_BUILTIN_VPCOMLTUQ,
26015 IX86_BUILTIN_VPCOMLEUQ,
26016 IX86_BUILTIN_VPCOMGTUQ,
26017 IX86_BUILTIN_VPCOMGEUQ,
26018 IX86_BUILTIN_VPCOMFALSEUQ,
26019 IX86_BUILTIN_VPCOMTRUEUQ,
26020
26021 IX86_BUILTIN_VPCOMEQB,
26022 IX86_BUILTIN_VPCOMNEB,
26023 IX86_BUILTIN_VPCOMLTB,
26024 IX86_BUILTIN_VPCOMLEB,
26025 IX86_BUILTIN_VPCOMGTB,
26026 IX86_BUILTIN_VPCOMGEB,
26027 IX86_BUILTIN_VPCOMFALSEB,
26028 IX86_BUILTIN_VPCOMTRUEB,
26029
26030 IX86_BUILTIN_VPCOMEQW,
26031 IX86_BUILTIN_VPCOMNEW,
26032 IX86_BUILTIN_VPCOMLTW,
26033 IX86_BUILTIN_VPCOMLEW,
26034 IX86_BUILTIN_VPCOMGTW,
26035 IX86_BUILTIN_VPCOMGEW,
26036 IX86_BUILTIN_VPCOMFALSEW,
26037 IX86_BUILTIN_VPCOMTRUEW,
26038
26039 IX86_BUILTIN_VPCOMEQD,
26040 IX86_BUILTIN_VPCOMNED,
26041 IX86_BUILTIN_VPCOMLTD,
26042 IX86_BUILTIN_VPCOMLED,
26043 IX86_BUILTIN_VPCOMGTD,
26044 IX86_BUILTIN_VPCOMGED,
26045 IX86_BUILTIN_VPCOMFALSED,
26046 IX86_BUILTIN_VPCOMTRUED,
26047
26048 IX86_BUILTIN_VPCOMEQQ,
26049 IX86_BUILTIN_VPCOMNEQ,
26050 IX86_BUILTIN_VPCOMLTQ,
26051 IX86_BUILTIN_VPCOMLEQ,
26052 IX86_BUILTIN_VPCOMGTQ,
26053 IX86_BUILTIN_VPCOMGEQ,
26054 IX86_BUILTIN_VPCOMFALSEQ,
26055 IX86_BUILTIN_VPCOMTRUEQ,
26056
26057 /* LWP instructions. */
26058 IX86_BUILTIN_LLWPCB,
26059 IX86_BUILTIN_SLWPCB,
26060 IX86_BUILTIN_LWPVAL32,
26061 IX86_BUILTIN_LWPVAL64,
26062 IX86_BUILTIN_LWPINS32,
26063 IX86_BUILTIN_LWPINS64,
26064
26065 IX86_BUILTIN_CLZS,
26066
26067 /* RTM */
26068 IX86_BUILTIN_XBEGIN,
26069 IX86_BUILTIN_XEND,
26070 IX86_BUILTIN_XABORT,
26071 IX86_BUILTIN_XTEST,
26072
26073 /* BMI instructions. */
26074 IX86_BUILTIN_BEXTR32,
26075 IX86_BUILTIN_BEXTR64,
26076 IX86_BUILTIN_CTZS,
26077
26078 /* TBM instructions. */
26079 IX86_BUILTIN_BEXTRI32,
26080 IX86_BUILTIN_BEXTRI64,
26081
26082 /* BMI2 instructions. */
26083 IX86_BUILTIN_BZHI32,
26084 IX86_BUILTIN_BZHI64,
26085 IX86_BUILTIN_PDEP32,
26086 IX86_BUILTIN_PDEP64,
26087 IX86_BUILTIN_PEXT32,
26088 IX86_BUILTIN_PEXT64,
26089
26090 /* FSGSBASE instructions. */
26091 IX86_BUILTIN_RDFSBASE32,
26092 IX86_BUILTIN_RDFSBASE64,
26093 IX86_BUILTIN_RDGSBASE32,
26094 IX86_BUILTIN_RDGSBASE64,
26095 IX86_BUILTIN_WRFSBASE32,
26096 IX86_BUILTIN_WRFSBASE64,
26097 IX86_BUILTIN_WRGSBASE32,
26098 IX86_BUILTIN_WRGSBASE64,
26099
26100 /* RDRND instructions. */
26101 IX86_BUILTIN_RDRAND16_STEP,
26102 IX86_BUILTIN_RDRAND32_STEP,
26103 IX86_BUILTIN_RDRAND64_STEP,
26104
26105 /* F16C instructions. */
26106 IX86_BUILTIN_CVTPH2PS,
26107 IX86_BUILTIN_CVTPH2PS256,
26108 IX86_BUILTIN_CVTPS2PH,
26109 IX86_BUILTIN_CVTPS2PH256,
26110
26111 /* CFString built-in for darwin */
26112 IX86_BUILTIN_CFSTRING,
26113
26114 /* Builtins to get CPU type and supported features. */
26115 IX86_BUILTIN_CPU_INIT,
26116 IX86_BUILTIN_CPU_IS,
26117 IX86_BUILTIN_CPU_SUPPORTS,
26118
26119 IX86_BUILTIN_MAX
26120 };
26121
26122 /* Table for the ix86 builtin decls. */
26123 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26124
26125 /* Table of all of the builtin functions that are possible with different ISAs
26126 but are waiting to be built until a function is declared to use that
26127 ISA. */
26128 struct builtin_isa {
26129 const char *name; /* function name */
26130 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26131 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26132 bool const_p; /* true if the declaration is constant */
26133 bool set_and_not_built_p; /* true if the decl was deferred and not yet built */
26134 };
26135
26136 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26137
26138
26139 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26140 of which isa_flags to use in the ix86_builtins_isa array. Stores the
26141 function decl in the ix86_builtins array. Returns the function decl or
26142 NULL_TREE, if the builtin was not added.
26143
26144 If the front end has a special hook for builtin functions, delay adding
26145 builtin functions that aren't in the current ISA until the ISA is changed
26146 with function specific optimization. Doing so can save about 300K for the
26147 default compiler. When the builtin is expanded, check at that time whether
26148 it is valid.
26149
26150 If the front end doesn't have a special hook, record all builtins, even
26151 those whose instruction set isn't in the current ISA, in case the user uses
26152 function specific options for a different ISA; that way we don't get scope
26153 errors if a builtin is added in the middle of a function scope. */
26154
26155 static inline tree
26156 def_builtin (HOST_WIDE_INT mask, const char *name,
26157 enum ix86_builtin_func_type tcode,
26158 enum ix86_builtins code)
26159 {
26160 tree decl = NULL_TREE;
26161
26162 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26163 {
26164 ix86_builtins_isa[(int) code].isa = mask;
26165
26166 mask &= ~OPTION_MASK_ISA_64BIT;
26167 if (mask == 0
26168 || (mask & ix86_isa_flags) != 0
26169 || (lang_hooks.builtin_function
26170 == lang_hooks.builtin_function_ext_scope))
26172 {
26173 tree type = ix86_get_builtin_func_type (tcode);
26174 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26175 NULL, NULL_TREE);
26176 ix86_builtins[(int) code] = decl;
26177 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26178 }
26179 else
26180 {
26181 ix86_builtins[(int) code] = NULL_TREE;
26182 ix86_builtins_isa[(int) code].tcode = tcode;
26183 ix86_builtins_isa[(int) code].name = name;
26184 ix86_builtins_isa[(int) code].const_p = false;
26185 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26186 }
26187 }
26188
26189 return decl;
26190 }
26191
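/* A minimal illustrative sketch (deliberately excluded from compilation)
   of the deferral behaviour described above: if the requested ISA is not
   currently enabled and the front end has no extended-scope builtin hook,
   def_builtin returns NULL_TREE and only records the builtin in
   ix86_builtins_isa, so ix86_add_new_builtins can create the decl later.
   The wrapper name below is hypothetical; the mask, name, type and code
   are taken from the RTM entries later in this file.  */
#if 0
static void
example_def_builtin_deferral (void)
{
  tree decl = def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xtest",
			   INT_FTYPE_VOID, IX86_BUILTIN_XTEST);
  if (decl == NULL_TREE)
    {
      /* RTM is not in ix86_isa_flags: the builtin was only recorded and
	 ix86_builtins_isa[IX86_BUILTIN_XTEST].set_and_not_built_p is now
	 true.  */
    }
}
#endif
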
26192 /* Like def_builtin, but also marks the function decl "const". */
26193
26194 static inline tree
26195 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26196 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26197 {
26198 tree decl = def_builtin (mask, name, tcode, code);
26199 if (decl)
26200 TREE_READONLY (decl) = 1;
26201 else
26202 ix86_builtins_isa[(int) code].const_p = true;
26203
26204 return decl;
26205 }
26206
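/* Illustrative sketch only (the wrapper is hypothetical): side-effect-free
   intrinsics are registered through def_builtin_const so the decl becomes
   TREE_READONLY and calls to it can be optimized as pure computations.
   The mask, name, type and code below match the __builtin_ia32_addps entry
   in bdesc_args further down.  */
#if 0
static void
example_def_builtin_const_usage (void)
{
  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
		     V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);
}
#endif
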
26207 /* Add any new builtin functions for a given ISA that may not have been
26208 declared. This saves a bit of space compared to adding all of the
26209 declarations to the tree, even if they are never used. */
26210
26211 static void
26212 ix86_add_new_builtins (HOST_WIDE_INT isa)
26213 {
26214 int i;
26215
26216 for (i = 0; i < (int) IX86_BUILTIN_MAX; i++)
26217 {
26218 if ((ix86_builtins_isa[i].isa & isa) != 0
26219 && ix86_builtins_isa[i].set_and_not_built_p)
26220 {
26221 tree decl, type;
26222
26223 /* Don't define the builtin again. */
26224 ix86_builtins_isa[i].set_and_not_built_p = false;
26225
26226 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26227 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26228 type, i, BUILT_IN_MD, NULL,
26229 NULL_TREE);
26230
26231 ix86_builtins[i] = decl;
26232 if (ix86_builtins_isa[i].const_p)
26233 TREE_READONLY (decl) = 1;
26234 }
26235 }
26236 }
26237
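/* Hedged sketch of the intended use, not a copy of any real call site:
   once function-specific options enable additional ISA bits, the builtins
   that def_builtin deferred for those bits can be exposed by handing the
   new flag set to ix86_add_new_builtins.  The wrapper name is
   hypothetical.  */
#if 0
static void
example_expose_avx2_builtins (void)
{
  ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
  ix86_add_new_builtins (ix86_isa_flags);
}
#endif
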
26238 /* Bits for builtin_description.flag. */
26239
26240 /* Set when we don't support the comparison natively, and should
26241 swap_comparison in order to support it. */
26242 #define BUILTIN_DESC_SWAP_OPERANDS 1
26243
26244 struct builtin_description
26245 {
26246 const HOST_WIDE_INT mask;
26247 const enum insn_code icode;
26248 const char *const name;
26249 const enum ix86_builtins code;
26250 const enum rtx_code comparison;
26251 const int flag;
26252 };
26253
26254 static const struct builtin_description bdesc_comi[] =
26255 {
26256 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26257 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26258 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26259 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26260 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26261 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26262 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26263 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26264 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26265 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26266 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26267 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26268 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26269 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26278 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26280 };
26281
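/* User-level illustration of one entry from the table above; the typedef
   and wrapper are hypothetical, but __builtin_ia32_comieq and its two
   V4SF operands correspond to the IX86_BUILTIN_COMIEQSS entry.  The
   builtin returns nonzero when the low single-precision elements compare
   equal.  */
#if 0
typedef float __example_v4sf __attribute__ ((__vector_size__ (16)));

static int
example_comieq (__example_v4sf a, __example_v4sf b)
{
  return __builtin_ia32_comieq (a, b);
}
#endif
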
26282 static const struct builtin_description bdesc_pcmpestr[] =
26283 {
26284 /* SSE4.2 */
26285 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26286 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26287 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26288 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26289 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26290 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26291 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26292 };
26293
26294 static const struct builtin_description bdesc_pcmpistr[] =
26295 {
26296 /* SSE4.2 */
26297 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26298 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26299 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26300 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26301 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26302 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26303 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26304 };
26305
26306 /* Special builtins with variable number of arguments. */
26307 static const struct builtin_description bdesc_special_args[] =
26308 {
26309 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26310 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26311 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26312
26313 /* MMX */
26314 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26315
26316 /* 3DNow! */
26317 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26318
26319 /* SSE */
26320 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26321 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26322 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26323
26324 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26325 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26326 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26327 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26328
26329 /* SSE or 3DNow!A */
26330 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26331 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26332
26333 /* SSE2 */
26334 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26335 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26341 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26344
26345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26346 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26347
26348 /* SSE3 */
26349 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26350
26351 /* SSE4.1 */
26352 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26353
26354 /* SSE4A */
26355 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26356 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26357
26358 /* AVX */
26359 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26360 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26361
26362 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26363 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26364 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26365 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26366 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26367
26368 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26369 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26370 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26371 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26372 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26373 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26374 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26375
26376 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26377 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26378 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26379
26380 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26381 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26382 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26383 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26384 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26385 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26386 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26387 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26388
26389 /* AVX2 */
26390 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26391 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26392 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26393 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26394 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26395 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26396 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26397 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26398 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26399
26400 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26401 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26402 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26403 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26404 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26405 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26406
26407 /* FSGSBASE */
26408 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26409 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26410 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26411 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26412 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26413 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26414 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26415 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26416
26417 /* RTM */
26418 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26419 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26420 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26421 };
26422
26423 /* Builtins with variable number of arguments. */
26424 static const struct builtin_description bdesc_args[] =
26425 {
26426 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26427 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26428 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26429 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26430 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26431 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26432 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26433
26434 /* MMX */
26435 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26436 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26437 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26438 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26439 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26440 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26441
26442 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26443 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26444 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26445 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26446 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26447 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26448 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26449 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26450
26451 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26452 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26453
26454 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26455 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26456 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26457 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26458
26459 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26460 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26461 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26462 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26463 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26464 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26465
26466 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26467 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26468 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26469 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26470 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26471 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26472
26473 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26474 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26475 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26476
26477 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26478
26479 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26480 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26481 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26482 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26483 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26484 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26485
26486 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26487 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26488 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26489 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26490 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26491 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26492
26493 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26494 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26495 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26496 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26497
26498 /* 3DNow! */
26499 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26500 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26501 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26502 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26503
26504 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26505 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26506 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26507 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26508 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26509 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26510 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26511 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26512 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26513 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26514 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26515 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26516 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26517 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26518 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26519
26520 /* 3DNow!A */
26521 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26522 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26523 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26524 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26525 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26526 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26527
26528 /* SSE */
26529 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26530 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26531 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26532 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26533 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26534 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26535 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26536 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26537 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26538 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26539 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26540 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26541
26542 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26543
26544 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26545 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26546 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26547 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26548 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26549 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26550 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26551 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26552
26553 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26554 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26555 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26556 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26557 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26558 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26559 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26560 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26561 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26562 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26563 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26564 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26565 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26566 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26567 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26568 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26569 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26570 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26571 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26572 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26573 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26574 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26575
26576 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26577 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26578 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26579 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26580
26581 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26582 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26583 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26584 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26585
26586 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26587
26588 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26589 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26590 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26591 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26592 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26593
26594 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26595 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26596 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26597
26598 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26599
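  /* The *_VEC_MERGE prototypes below mark scalar operations (sqrtss and
     friends) whose result merges into the upper elements of the destination;
     the expander appears to pass the single source operand twice so the vm*
     patterns receive both the element being operated on and the pass-through
     vector.  */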
26600 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26601 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26602 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26603
26604 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26605 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26606
26607 /* SSE MMX or 3DNow!A */
26608 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26609 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26610 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26611
26612 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26613 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26614 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26615 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26616
26617 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26618 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26619
26620 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26621
26622 /* SSE2 */
26623 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26624
26625 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26626 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26627 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26628 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26629 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26630
26631 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26632 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26633 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26634 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26635 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26636
26637 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26638
26639 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26640 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26641 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26642 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26643
26644 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26645 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26646 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26647
26648 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26649 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26650 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26651 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26652 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26653 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26654 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26655 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26656
26657 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26658 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26659 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26660 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26661 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26662 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26663 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26664 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26665 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26666 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26667 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26668 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26669 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26670 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26671 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26672 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26673 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26674 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26675 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26676 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26677
26678 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26679 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26680 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26681 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26682
26683 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26684 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26685 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26686 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26687
26688 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26689
26690 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26691 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26692 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26693
26694 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26695
26696 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26697 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26698 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26699 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26700 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26701 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26702 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26703 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26704
26705 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26706 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26707 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26708 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26709 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26710 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26711 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26712 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26713
26714 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26715 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26716
26717 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26718 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26719 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26720 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26721
26722 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26723 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26724
26725 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26726 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26727 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26728 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26729 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26730 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26731
26732 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26733 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26734 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26735 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26736
26737 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26738 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26739 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26740 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26741 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26742 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26743 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26744 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26745
26746 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26747 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26748 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26749
26750 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26751 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26752
26753 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26754 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26755
26756 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26757
26758 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26759 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26760 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26761 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26762
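  /* Shift builtins: the *_SI_COUNT prototypes are the immediate/scalar count
     forms (psllwi and friends) while the *_V*_COUNT prototypes take the count
     in the low quadword of an XMM register (psllw and friends); both map onto
     the same ashl/lshr/ashr patterns.  The *_INT_CONVERT prototypes (the
     pslldq/psrldq whole-register shifts) flag entries whose builtin operand
     modes differ from the insn pattern's modes (V2DI vs. V1TI here), so the
     expander converts operands with gen_lowpart before emitting the insn.  */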
26763 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26764 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26765 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26766 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26767 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26768 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26769 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26770
26771 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26772 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26773 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26774 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26775 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26776 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26777 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26778
26779 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26783
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26787
26788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26789
26790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26791
26792 /* SSE2 MMX */
26793 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26794 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26795
26796 /* SSE3 */
26797 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26798 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26799
26800 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26801 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26802 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26803 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26804 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26805 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26806
26807 /* SSSE3 */
26808 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26809 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26810 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26811 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26812 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26813 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26814
26815 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26816 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26817 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26818 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26819 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26820 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26821 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26822 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26823 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26824 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26825 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26826 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26827 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26828 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26829 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26830 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26831 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26832 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26833 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26834 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26835 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26836 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26837 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26838 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26839
26840 /* SSSE3 palignr */
26841 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26842 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26843
26844 /* SSE4.1 */
26845 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26846 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26847 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26848 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26849 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26850 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26851 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26852 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26853 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26854 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26855
26856 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26857 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26858 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26859 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26860 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26861 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26862 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26863 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26864 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26865 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26866 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26867 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26868 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26869
26870 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26871 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26872 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26873 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26874 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26875 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26876 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26877 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26878 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26879 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26880 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26881 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26882
26883 /* SSE4.1 rounding and ptest */
26884 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26885 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26886 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26887 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26888
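  /* floor/ceil/trunc/rint are not separate instructions: the entries below
     reuse the roundpd/roundps patterns and stash the rounding-mode immediate
     (ROUND_FLOOR, ROUND_CEIL, ...) in the comparison field, which the *_ROUND
     prototypes appear to make the expander append as the constant operand of
     the round insn.  */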
26889 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26890 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26891 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26892 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26893
26894 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26895 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26896
26897 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26898 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26899
26900 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26901 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26902 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26903 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26904
26905 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26906 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26907
26908 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26909 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26910
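  /* ptestz/ptestc/ptestnzc all emit the same PTEST pattern; the EQ / LTU /
     GTU codes stored below roughly select which flag the expander turns into
     the integer result (ZF, CF, and "neither ZF nor CF" respectively) via a
     setcc on the flags register.  */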
26911 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26912 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26913 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26914
26915 /* SSE4.2 */
26916 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26917 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26918 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26919 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26920 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26921
26922 /* SSE4A */
26923 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26924 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26925 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26926 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26927
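  /* The AES and PCLMUL entries (like the FABSQ/COPYSIGNQ ones above) have a
     zero name field: entries without a name appear to be skipped when builtins
     are declared from this table, and the user-visible __builtin_ia32_aes* and
     __builtin_ia32_pclmulqdq128 names are declared separately with the AES and
     PCLMUL ISA masks; only the expansion information here is used.  */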
26928 /* AES */
26929 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26930 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26931
26932 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26933 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26934 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26935 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26936
26937 /* PCLMUL */
26938 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26939
26940 /* AVX */
26941 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26942 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26943 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26944 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26945 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26946 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26947 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26948 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26949 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26950 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26951 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26952 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26953 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26954 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26955 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26956 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26957 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26958 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26959 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26960 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26961 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26962 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26963 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26964 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26965 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26966 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26967
26968 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26969 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26970 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26971 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26972
26973 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26974 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26975 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26976 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26977 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26978 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26979 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26980 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26981 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26982 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26983 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26984 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26985 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26986 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26987 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26988 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26989 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26990 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26991 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26992 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26993 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26994 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26995 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26996 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26997 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26998 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26999 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27000 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27001 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27002 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27003 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27004 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27007
27008 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27009 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27011
27012 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27013 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27014 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27015 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27016 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27017
27018 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27019
27020 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27021 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27022
27023 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27024 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27025 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27026 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27027
27028 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27029 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27030
27031 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27032 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27033
27034 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27035 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27036 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27037 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27038
27039 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27040 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27041
27042 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27043 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27044
27045 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27046 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27047 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27048 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27049
27050 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27051 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27052 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27053 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27054 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27055 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
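 /* The si256_si / ps256_ps / pd256_pd entries and the vec_extract_lo_*
    entries above presumably back the 128<->256-bit cast intrinsics
    (_mm256_castsi128_si256, _mm256_castsi256_si128 and friends in
    avxintrin.h); the widening direction leaves the upper 128 bits
    undefined.  A minimal usage sketch, assuming those wrappers:

	__m128i lo = _mm_set1_epi32 (1);
	__m256i wide = _mm256_castsi128_si256 (lo);
	__m128i back = _mm256_castsi256_si128 (wide);  */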
27056
27057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27058 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27062 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27065 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27068 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27069 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27070 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27071 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
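 /* For the vtest and ptest entries above, the comparison-code field
    (EQ, LTU, GTU) appears to select which flag the expander reads back
    after the VTESTPS/VTESTPD/VPTEST instruction: EQ for the "testz"
    forms, LTU for "testc" and GTU for "testnzc".  A usage sketch,
    assuming the avxintrin.h wrappers built on these builtins:

	__m256i a = _mm256_set1_epi32 (0);
	__m256i b = _mm256_set1_epi32 (-1);
	int all_zero = _mm256_testz_si256 (a, b);

    where _mm256_testz_si256 presumably expands to
    __builtin_ia32_ptestz256.  */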
27072
27073 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27074 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27075
27076 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27077 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27078
27079 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27080
27081 /* AVX2 */
27082 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27083 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27084 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27085 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27086 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27087 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27088 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27089 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27090 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27091 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27092 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27093 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27094 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27095 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27096 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27097 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27098 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27099 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27100 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27101 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27102 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27103 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27104 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27105 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27106 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27107 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27108 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27109 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27110 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27111 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27112 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27113 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27114 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27115 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27116 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27117 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27118 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27119 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27120 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27121 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27122 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27123 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27124 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27125 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27126 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27127 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27128 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27129 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27130 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27131 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27132 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27133 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27134 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27135 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27136 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27137 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27138 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27139 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27140 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27141 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27142 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27143 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27144 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27145 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27146 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27147 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27148 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27149 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27150 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27151 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27152 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27153 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27154 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27155 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27156 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27157 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27158 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27159 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27160 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27161 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27162 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27163 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27164 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27165 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27166 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27167 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27168 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27169 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27170 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27171 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27172 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27173 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27174 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27175 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27176 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27177 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27178 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27179 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27180 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27181 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27182 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27183 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27184 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27185 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27186 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27187 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27188 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27189 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27190 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27191 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27192 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27193 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27194 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27195 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27196 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27197 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27200 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27201 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27202 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27203 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27204 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27205 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27206 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27207 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27208 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27209 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27210 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27211 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27212 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27213 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27214 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27215 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27216 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27217 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27218 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27219 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27220 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27221 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27222 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27223 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27224 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27225 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27226 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27227 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
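 /* The psllv/psrav/psrlv entries above provide the AVX2 per-element
    variable shifts; they presumably back the _mm256_sllv_epi32/epi64,
    _mm256_srav_epi32 and _mm256_srlv_epi32/epi64 wrappers in
    avx2intrin.h.  A minimal sketch, assuming those wrappers:

	__m256i v = _mm256_set1_epi32 (16);
	__m256i c = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
	__m256i r = _mm256_sllv_epi32 (v, c);  */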
27228
27229 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27230
27231 /* BMI */
27232 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27233 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27234 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27235
27236 /* TBM */
27237 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27238 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27239
27240 /* F16C */
27241 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27242 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27243 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27244 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
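 /* The F16C entries convert between single and half precision;
    __builtin_ia32_vcvtps2ph takes a rounding-mode immediate as its
    last operand.  A minimal sketch, assuming the f16cintrin.h
    wrappers presumably built on these:

	float f = 1.5f;
	unsigned short h = _cvtss_sh (f, 0);
	float back = _cvtsh_ss (h);  */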
27245
27246 /* BMI2 */
27247 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27248 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27249 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27250 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27251 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27252 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27253 };
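 /* The scalar BMI/BMI2 entries at the end of the table above are
    presumably what the bmiintrin.h/bmi2intrin.h wrappers expand to;
    a sketch, assuming those wrappers:

	unsigned bits = _pdep_u32 (0xf, 0xf0f0);
	unsigned field = _bextr_u32 (bits, 4, 8);  */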
27254
27255 /* FMA4 and XOP. */
27256 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27257 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27258 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27259 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27260 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27261 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27262 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27263 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27264 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27265 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27266 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27267 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27268 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27269 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27270 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27271 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27272 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27273 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27274 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27275 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27276 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27277 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27278 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27279 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27280 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27281 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27282 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27283 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27284 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27285 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27286 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27287 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27288 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27289 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27290 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27291 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27292 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27293 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27294 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27295 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27296 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27297 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27298 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27299 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27300 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27301 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27302 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27303 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27304 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27305 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27306 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27307 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
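 /* The MULTI_ARG_* names above are shorthand aliases for the
    V*_FTYPE_* signature enumerators, kept short so the
    bdesc_multi_arg initializers below stay readable.  For instance,
    the FMA4 entry for __builtin_ia32_vfmaddss below uses
    MULTI_ARG_3_SF, i.e. V4SF_FTYPE_V4SF_V4SF_V4SF: three V4SF
    operands returning V4SF.  */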
27308
27309 static const struct builtin_description bdesc_multi_arg[] =
27310 {
27311 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27312 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27313 UNKNOWN, (int)MULTI_ARG_3_SF },
27314 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27315 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27316 UNKNOWN, (int)MULTI_ARG_3_DF },
27317
27318 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27319 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27320 UNKNOWN, (int)MULTI_ARG_3_SF },
27321 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27322 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27323 UNKNOWN, (int)MULTI_ARG_3_DF },
27324
27325 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27326 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27327 UNKNOWN, (int)MULTI_ARG_3_SF },
27328 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27329 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27330 UNKNOWN, (int)MULTI_ARG_3_DF },
27331 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27332 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27333 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27334 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27335 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27336 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27337
27338 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27339 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27340 UNKNOWN, (int)MULTI_ARG_3_SF },
27341 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27342 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27343 UNKNOWN, (int)MULTI_ARG_3_DF },
27344 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27345 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27346 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27347 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27348 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27349 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27350
27351 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27352 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27353 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27354 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27355 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27356 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27357 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27358
27359 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27360 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27361 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27362 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27363 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27364 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27365 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27366
27367 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27368
27369 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27370 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27371 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27372 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27373 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27374 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27375 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27376 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27377 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27378 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27379 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27380 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27381
27382 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27383 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27384 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27385 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27386 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27387 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27388 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27389 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27390 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27391 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27392 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27393 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27394 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27395 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27396 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27397 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27398
27399 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27400 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27401 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27402 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27403 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27404 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27405
27406 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27407 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27408 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27409 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27410 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27411 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27412 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27413 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27414 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27415 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27416 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27417 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27418 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27419 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27420 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27421
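 /* Each XOP vpcom comparison below is registered under two spellings
    ("ne" and "neq", e.g. __builtin_ia32_vpcomneb and
    __builtin_ia32_vpcomneqb) that share the same IX86_BUILTIN code;
    the rtx_code field (EQ, NE, LT, ...) supplies the comparison the
    expander emits.  */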
27422 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27423 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27424 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27425 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27426 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27427 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27428 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27429
27430 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27431 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27432 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27433 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27434 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27435 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27436 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27437
27438 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27439 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27440 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27441 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27442 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27443 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27444 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27445
27446 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27447 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27448 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27449 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27450 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27451 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27452 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27453
27454 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27455 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27456 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27457 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27458 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27459 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27460 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27461
27462 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27463 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27464 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27465 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27466 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27467 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27468 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27469
27470 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27471 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27472 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27473 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27474 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27475 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27476 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27477
27478 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27479 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27480 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27481 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27482 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27483 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27484 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27485
27486 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27487 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27488 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27489 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27490 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27491 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27492 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27493 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27494
27495 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27496 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27497 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27498 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27499 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27500 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27501 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27502 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27503
27504 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27505 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27506 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27507 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27508
27509 };
27510 \f
27511 /* TM vector builtins. */
27512
27513 /* Reuse the existing x86-specific `struct builtin_description' because
27514 we're lazy. Add casts to make them fit. */
27515 static const struct builtin_description bdesc_tm[] =
27516 {
27517 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27518 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27519 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27520 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27521 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27522 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27523 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27524
27525 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27526 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27527 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27528 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27529 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27530 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27531 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27532
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27540
27541 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27542 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27544 };
27545
27546 /* TM callbacks. */
27547
27548 /* Return the builtin decl needed to load a vector of TYPE. */
27549
27550 static tree
27551 ix86_builtin_tm_load (tree type)
27552 {
27553 if (TREE_CODE (type) == VECTOR_TYPE)
27554 {
27555 switch (tree_low_cst (TYPE_SIZE (type), 1))
27556 {
27557 case 64:
27558 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27559 case 128:
27560 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27561 case 256:
27562 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27563 }
27564 }
27565 return NULL_TREE;
27566 }
27567
27568 /* Return the builtin decl needed to store a vector of TYPE. */
27569
27570 static tree
27571 ix86_builtin_tm_store (tree type)
27572 {
27573 if (TREE_CODE (type) == VECTOR_TYPE)
27574 {
27575 switch (tree_low_cst (TYPE_SIZE (type), 1))
27576 {
27577 case 64:
27578 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27579 case 128:
27580 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27581 case 256:
27582 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27583 }
27584 }
27585 return NULL_TREE;
27586 }
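/* For illustration (not part of the original sources): a 128-bit vector
   type such as a 4 x float vector maps to BUILT_IN_TM_LOAD_M128 /
   BUILT_IN_TM_STORE_M128 above, while a type whose size is not 64, 128 or
   256 bits yields NULL_TREE, leaving the caller to fall back on its
   default (non-vector) handling.  */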
27587 \f
27588 /* Initialize the transactional memory vector load/store builtins. */
27589
27590 static void
27591 ix86_init_tm_builtins (void)
27592 {
27593 enum ix86_builtin_func_type ftype;
27594 const struct builtin_description *d;
27595 size_t i;
27596 tree decl;
27597 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27598 tree attrs_log, attrs_type_log;
27599
27600 if (!flag_tm)
27601 return;
27602
27603 /* If there are no builtins defined, we must be compiling in a
27604 language without trans-mem support. */
27605 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27606 return;
27607
27608 /* Use whatever attributes a normal TM load has. */
27609 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27610 attrs_load = DECL_ATTRIBUTES (decl);
27611 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27612 /* Use whatever attributes a normal TM store has. */
27613 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27614 attrs_store = DECL_ATTRIBUTES (decl);
27615 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27616 /* Use whatever attributes a normal TM log has. */
27617 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27618 attrs_log = DECL_ATTRIBUTES (decl);
27619 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27620
27621 for (i = 0, d = bdesc_tm;
27622 i < ARRAY_SIZE (bdesc_tm);
27623 i++, d++)
27624 {
27625 if ((d->mask & ix86_isa_flags) != 0
27626 || (lang_hooks.builtin_function
27627 == lang_hooks.builtin_function_ext_scope))
27628 {
27629 tree type, attrs, attrs_type;
27630 enum built_in_function code = (enum built_in_function) d->code;
27631
27632 ftype = (enum ix86_builtin_func_type) d->flag;
27633 type = ix86_get_builtin_func_type (ftype);
27634
27635 if (BUILTIN_TM_LOAD_P (code))
27636 {
27637 attrs = attrs_load;
27638 attrs_type = attrs_type_load;
27639 }
27640 else if (BUILTIN_TM_STORE_P (code))
27641 {
27642 attrs = attrs_store;
27643 attrs_type = attrs_type_store;
27644 }
27645 else
27646 {
27647 attrs = attrs_log;
27648 attrs_type = attrs_type_log;
27649 }
27650 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27651 /* The builtin without the prefix for
27652 calling it directly. */
27653 d->name + strlen ("__builtin_"),
27654 attrs);
27655 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27656 set the TYPE_ATTRIBUTES. */
27657 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27658
27659 set_builtin_decl (code, decl, false);
27660 }
27661 }
27662 }
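/* Illustrative sketch of the registration above (names taken from bdesc_tm):
   the entry "__builtin__ITM_RM128" is added with its "__builtin_" prefix
   stripped as the direct-call name, so transactional code may call it as

     _ITM_RM128 (ptr);

   while the builtin machinery still refers to it as BUILT_IN_TM_LOAD_M128.  */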
27663
27664 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
27665    in the current target ISA, to allow the user to compile particular modules
27666    with target-specific options that differ from the command-line
27667    options. */
27668 static void
27669 ix86_init_mmx_sse_builtins (void)
27670 {
27671 const struct builtin_description * d;
27672 enum ix86_builtin_func_type ftype;
27673 size_t i;
27674
27675 /* Add all special builtins with variable number of operands. */
27676 for (i = 0, d = bdesc_special_args;
27677 i < ARRAY_SIZE (bdesc_special_args);
27678 i++, d++)
27679 {
27680 if (d->name == 0)
27681 continue;
27682
27683 ftype = (enum ix86_builtin_func_type) d->flag;
27684 def_builtin (d->mask, d->name, ftype, d->code);
27685 }
27686
27687 /* Add all builtins with variable number of operands. */
27688 for (i = 0, d = bdesc_args;
27689 i < ARRAY_SIZE (bdesc_args);
27690 i++, d++)
27691 {
27692 if (d->name == 0)
27693 continue;
27694
27695 ftype = (enum ix86_builtin_func_type) d->flag;
27696 def_builtin_const (d->mask, d->name, ftype, d->code);
27697 }
27698
27699 /* pcmpestr[im] insns. */
27700 for (i = 0, d = bdesc_pcmpestr;
27701 i < ARRAY_SIZE (bdesc_pcmpestr);
27702 i++, d++)
27703 {
27704 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27705 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27706 else
27707 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27708 def_builtin_const (d->mask, d->name, ftype, d->code);
27709 }
27710
27711 /* pcmpistr[im] insns. */
27712 for (i = 0, d = bdesc_pcmpistr;
27713 i < ARRAY_SIZE (bdesc_pcmpistr);
27714 i++, d++)
27715 {
27716 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27717 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27718 else
27719 ftype = INT_FTYPE_V16QI_V16QI_INT;
27720 def_builtin_const (d->mask, d->name, ftype, d->code);
27721 }
27722
27723 /* comi/ucomi insns. */
27724 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27725 {
27726 if (d->mask == OPTION_MASK_ISA_SSE2)
27727 ftype = INT_FTYPE_V2DF_V2DF;
27728 else
27729 ftype = INT_FTYPE_V4SF_V4SF;
27730 def_builtin_const (d->mask, d->name, ftype, d->code);
27731 }
27732
27733 /* SSE */
27734 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27735 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27736 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27737 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27738
27739 /* SSE or 3DNow!A */
27740 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27741 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27742 IX86_BUILTIN_MASKMOVQ);
27743
27744 /* SSE2 */
27745 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27746 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27747
27748 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27749 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27750 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27751 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27752
27753 /* SSE3. */
27754 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27755 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27756 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27757 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27758
27759 /* AES */
27760 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27761 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27762 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27763 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27764 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27765 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27766 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27767 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27768 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27769 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27770 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27771 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27772
27773 /* PCLMUL */
27774 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27775 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27776
27777 /* RDRND */
27778 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27779 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27780 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27781 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27782 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27783 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27784 IX86_BUILTIN_RDRAND64_STEP);
27785
27786 /* AVX2 */
27787 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27788 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27789 IX86_BUILTIN_GATHERSIV2DF);
27790
27791 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27792 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27793 IX86_BUILTIN_GATHERSIV4DF);
27794
27795 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27796 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27797 IX86_BUILTIN_GATHERDIV2DF);
27798
27799 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27800 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27801 IX86_BUILTIN_GATHERDIV4DF);
27802
27803 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27804 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27805 IX86_BUILTIN_GATHERSIV4SF);
27806
27807 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27808 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27809 IX86_BUILTIN_GATHERSIV8SF);
27810
27811 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27812 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27813 IX86_BUILTIN_GATHERDIV4SF);
27814
27815 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27816 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27817 IX86_BUILTIN_GATHERDIV8SF);
27818
27819 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27820 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27821 IX86_BUILTIN_GATHERSIV2DI);
27822
27823 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27824 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27825 IX86_BUILTIN_GATHERSIV4DI);
27826
27827 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27828 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27829 IX86_BUILTIN_GATHERDIV2DI);
27830
27831 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27832 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27833 IX86_BUILTIN_GATHERDIV4DI);
27834
27835 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27836 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27837 IX86_BUILTIN_GATHERSIV4SI);
27838
27839 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27840 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27841 IX86_BUILTIN_GATHERSIV8SI);
27842
27843 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27844 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27845 IX86_BUILTIN_GATHERDIV4SI);
27846
27847 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27848 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27849 IX86_BUILTIN_GATHERDIV8SI);
27850
27851   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27852 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27853 IX86_BUILTIN_GATHERALTSIV4DF);
27854
27855   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27856 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27857 IX86_BUILTIN_GATHERALTDIV8SF);
27858
27859   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27860 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27861 IX86_BUILTIN_GATHERALTSIV4DI);
27862
27863   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27864 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27865 IX86_BUILTIN_GATHERALTDIV8SI);
27866
27867 /* RTM. */
27868 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27869 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27870
27871 /* MMX access to the vec_init patterns. */
27872 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27873 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27874
27875 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27876 V4HI_FTYPE_HI_HI_HI_HI,
27877 IX86_BUILTIN_VEC_INIT_V4HI);
27878
27879 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27880 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27881 IX86_BUILTIN_VEC_INIT_V8QI);
27882
27883 /* Access to the vec_extract patterns. */
27884 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27885 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27886 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27887 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27888 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27889 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27890 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27891 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27892 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27893 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27894
27895 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27896 "__builtin_ia32_vec_ext_v4hi",
27897 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27898
27899 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27900 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27901
27902 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27903 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27904
27905 /* Access to the vec_set patterns. */
27906 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27907 "__builtin_ia32_vec_set_v2di",
27908 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27909
27910 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27911 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27912
27913 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27914 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27915
27916 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27917 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27918
27919 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27920 "__builtin_ia32_vec_set_v4hi",
27921 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27922
27923 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27924 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27925
27926   /* Add FMA4 multi-arg instructions.  */
27927 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27928 {
27929 if (d->name == 0)
27930 continue;
27931
27932 ftype = (enum ix86_builtin_func_type) d->flag;
27933 def_builtin_const (d->mask, d->name, ftype, d->code);
27934 }
27935 }
27936
27937 /* This builds the processor_model struct type defined in
27938 libgcc/config/i386/cpuinfo.c */
27939
27940 static tree
27941 build_processor_model_struct (void)
27942 {
27943 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
27944 "__cpu_features"};
27945 tree field = NULL_TREE, field_chain = NULL_TREE;
27946 int i;
27947 tree type = make_node (RECORD_TYPE);
27948
27949 /* The first 3 fields are unsigned int. */
27950 for (i = 0; i < 3; ++i)
27951 {
27952 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27953 get_identifier (field_name[i]), unsigned_type_node);
27954 if (field_chain != NULL_TREE)
27955 DECL_CHAIN (field) = field_chain;
27956 field_chain = field;
27957 }
27958
27959 /* The last field is an array of unsigned integers of size one. */
27960 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
27961 get_identifier (field_name[3]),
27962 build_array_type (unsigned_type_node,
27963 build_index_type (size_one_node)));
27964 if (field_chain != NULL_TREE)
27965 DECL_CHAIN (field) = field_chain;
27966 field_chain = field;
27967
27968 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
27969 return type;
27970 }
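/* A sketch (assumed, mirroring the field names built above) of the
   corresponding declaration in libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/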
27971
27972 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
27973
27974 static tree
27975 make_var_decl (tree type, const char *name)
27976 {
27977 tree new_decl;
27978
27979 new_decl = build_decl (UNKNOWN_LOCATION,
27980 VAR_DECL,
27981 get_identifier(name),
27982 type);
27983
27984 DECL_EXTERNAL (new_decl) = 1;
27985 TREE_STATIC (new_decl) = 1;
27986 TREE_PUBLIC (new_decl) = 1;
27987 DECL_INITIAL (new_decl) = 0;
27988 DECL_ARTIFICIAL (new_decl) = 0;
27989 DECL_PRESERVE_P (new_decl) = 1;
27990
27991 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
27992 assemble_variable (new_decl, 0, 0, 0);
27993
27994 return new_decl;
27995 }
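/* For example (illustrative): fold_builtin_cpu below uses this to create a
   reference to the libgcc-defined model variable, roughly equivalent to
   writing

     extern struct __processor_model __cpu_model;

   except that the decl is additionally marked one-only (comdat) and
   preserved, so the reference survives even without an explicit use in
   the source.  */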
27996
27997 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
27998    into a check against __cpu_model, defined in libgcc/config/i386/cpuinfo.c.  */
27999
28000 static tree
28001 fold_builtin_cpu (tree fndecl, tree *args)
28002 {
28003 unsigned int i;
28004 enum ix86_builtins fn_code = (enum ix86_builtins)
28005 DECL_FUNCTION_CODE (fndecl);
28006 tree param_string_cst = NULL;
28007
28008 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
28009 enum processor_features
28010 {
28011 F_CMOV = 0,
28012 F_MMX,
28013 F_POPCNT,
28014 F_SSE,
28015 F_SSE2,
28016 F_SSE3,
28017 F_SSSE3,
28018 F_SSE4_1,
28019 F_SSE4_2,
28020 F_AVX,
28021 F_AVX2,
28022 F_MAX
28023 };
28024
28025   /* These are the values for vendor types, cpu types and cpu subtypes
28026      in cpuinfo.c.  Cpu type and subtype values must have the
28027      corresponding start value subtracted from them. */
28028 enum processor_model
28029 {
28030 M_INTEL = 1,
28031 M_AMD,
28032 M_CPU_TYPE_START,
28033 M_INTEL_ATOM,
28034 M_INTEL_CORE2,
28035 M_INTEL_COREI7,
28036 M_AMDFAM10H,
28037 M_AMDFAM15H,
28038 M_CPU_SUBTYPE_START,
28039 M_INTEL_COREI7_NEHALEM,
28040 M_INTEL_COREI7_WESTMERE,
28041 M_INTEL_COREI7_SANDYBRIDGE,
28042 M_AMDFAM10H_BARCELONA,
28043 M_AMDFAM10H_SHANGHAI,
28044 M_AMDFAM10H_ISTANBUL,
28045 M_AMDFAM15H_BDVER1,
28046 M_AMDFAM15H_BDVER2
28047 };
28048
28049 static struct _arch_names_table
28050 {
28051 const char *const name;
28052 const enum processor_model model;
28053 }
28054 const arch_names_table[] =
28055 {
28056 {"amd", M_AMD},
28057 {"intel", M_INTEL},
28058 {"atom", M_INTEL_ATOM},
28059 {"core2", M_INTEL_CORE2},
28060 {"corei7", M_INTEL_COREI7},
28061 {"nehalem", M_INTEL_COREI7_NEHALEM},
28062 {"westmere", M_INTEL_COREI7_WESTMERE},
28063 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
28064 {"amdfam10h", M_AMDFAM10H},
28065 {"barcelona", M_AMDFAM10H_BARCELONA},
28066 {"shanghai", M_AMDFAM10H_SHANGHAI},
28067 {"istanbul", M_AMDFAM10H_ISTANBUL},
28068 {"amdfam15h", M_AMDFAM15H},
28069 {"bdver1", M_AMDFAM15H_BDVER1},
28070 {"bdver2", M_AMDFAM15H_BDVER2},
28071 };
28072
28073 static struct _isa_names_table
28074 {
28075 const char *const name;
28076 const enum processor_features feature;
28077 }
28078 const isa_names_table[] =
28079 {
28080 {"cmov", F_CMOV},
28081 {"mmx", F_MMX},
28082 {"popcnt", F_POPCNT},
28083 {"sse", F_SSE},
28084 {"sse2", F_SSE2},
28085 {"sse3", F_SSE3},
28086 {"ssse3", F_SSSE3},
28087 {"sse4.1", F_SSE4_1},
28088 {"sse4.2", F_SSE4_2},
28089 {"avx", F_AVX},
28090 {"avx2", F_AVX2}
28091 };
28092
28093 static tree __processor_model_type = NULL_TREE;
28094 static tree __cpu_model_var = NULL_TREE;
28095
28096 if (__processor_model_type == NULL_TREE)
28097 __processor_model_type = build_processor_model_struct ();
28098
28099 if (__cpu_model_var == NULL_TREE)
28100 __cpu_model_var = make_var_decl (__processor_model_type,
28101 "__cpu_model");
28102
28103 gcc_assert ((args != NULL) && (*args != NULL));
28104
28105 param_string_cst = *args;
28106 while (param_string_cst
28107 && TREE_CODE (param_string_cst) != STRING_CST)
28108 {
28109       /* *args must be an expr that can contain other EXPRs leading to a
28110          STRING_CST.  */
28111 if (!EXPR_P (param_string_cst))
28112 {
28113 error ("Parameter to builtin must be a string constant or literal");
28114 return integer_zero_node;
28115 }
28116 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
28117 }
28118
28119 gcc_assert (param_string_cst);
28120
28121 if (fn_code == IX86_BUILTIN_CPU_IS)
28122 {
28123 tree ref;
28124 tree field;
28125 unsigned int field_val = 0;
28126 unsigned int NUM_ARCH_NAMES
28127 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
28128
28129 for (i = 0; i < NUM_ARCH_NAMES; i++)
28130 if (strcmp (arch_names_table[i].name,
28131 TREE_STRING_POINTER (param_string_cst)) == 0)
28132 break;
28133
28134 if (i == NUM_ARCH_NAMES)
28135 {
28136 error ("Parameter to builtin not valid: %s",
28137 TREE_STRING_POINTER (param_string_cst));
28138 return integer_zero_node;
28139 }
28140
28141 field = TYPE_FIELDS (__processor_model_type);
28142 field_val = arch_names_table[i].model;
28143
28144 /* CPU types are stored in the next field. */
28145 if (field_val > M_CPU_TYPE_START
28146 && field_val < M_CPU_SUBTYPE_START)
28147 {
28148 field = DECL_CHAIN (field);
28149 field_val -= M_CPU_TYPE_START;
28150 }
28151
28152 /* CPU subtypes are stored in the next field. */
28153 if (field_val > M_CPU_SUBTYPE_START)
28154 {
28155 	  field = DECL_CHAIN (DECL_CHAIN (field));
28156 field_val -= M_CPU_SUBTYPE_START;
28157 }
28158
28159 /* Get the appropriate field in __cpu_model. */
28160 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28161 field, NULL_TREE);
28162
28163 /* Check the value. */
28164 return build2 (EQ_EXPR, unsigned_type_node, ref,
28165 build_int_cstu (unsigned_type_node, field_val));
28166 }
28167 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28168 {
28169 tree ref;
28170 tree array_elt;
28171 tree field;
28172 unsigned int field_val = 0;
28173 unsigned int NUM_ISA_NAMES
28174 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
28175
28176 for (i = 0; i < NUM_ISA_NAMES; i++)
28177 if (strcmp (isa_names_table[i].name,
28178 TREE_STRING_POINTER (param_string_cst)) == 0)
28179 break;
28180
28181 if (i == NUM_ISA_NAMES)
28182 {
28183 error ("Parameter to builtin not valid: %s",
28184 TREE_STRING_POINTER (param_string_cst));
28185 return integer_zero_node;
28186 }
28187
28188 field = TYPE_FIELDS (__processor_model_type);
28189 /* Get the last field, which is __cpu_features. */
28190 while (DECL_CHAIN (field))
28191 field = DECL_CHAIN (field);
28192
28193 /* Get the appropriate field: __cpu_model.__cpu_features */
28194 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28195 field, NULL_TREE);
28196
28197 /* Access the 0th element of __cpu_features array. */
28198 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28199 integer_zero_node, NULL_TREE, NULL_TREE);
28200
28201 field_val = (1 << isa_names_table[i].feature);
28202 /* Return __cpu_model.__cpu_features[0] & field_val */
28203 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28204 build_int_cstu (unsigned_type_node, field_val));
28205 }
28206 gcc_unreachable ();
28207 }
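/* Illustrative sketch (arch and ISA names taken from the tables above) of
   the folding performed by fold_builtin_cpu:

     __builtin_cpu_is ("amd")        -> __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx2") -> __cpu_model.__cpu_features[0]
                                        & (1 << F_AVX2)

   Invalid string arguments are diagnosed and folded to 0.  */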
28208
28209 static tree
28210 ix86_fold_builtin (tree fndecl, int n_args,
28211 tree *args, bool ignore ATTRIBUTE_UNUSED)
28212 {
28213 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28214 {
28215 enum ix86_builtins fn_code = (enum ix86_builtins)
28216 DECL_FUNCTION_CODE (fndecl);
28217 if (fn_code == IX86_BUILTIN_CPU_IS
28218 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28219 {
28220 gcc_assert (n_args == 1);
28221 return fold_builtin_cpu (fndecl, args);
28222 }
28223 }
28224
28225 #ifdef SUBTARGET_FOLD_BUILTIN
28226 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
28227 #endif
28228
28229 return NULL_TREE;
28230 }
28231
28232 /* Make builtins to detect cpu type and features supported. NAME is
28233 the builtin name, CODE is the builtin code, and FTYPE is the function
28234 type of the builtin. */
28235
28236 static void
28237 make_cpu_type_builtin (const char* name, int code,
28238 enum ix86_builtin_func_type ftype, bool is_const)
28239 {
28240 tree decl;
28241 tree type;
28242
28243 type = ix86_get_builtin_func_type (ftype);
28244 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28245 NULL, NULL_TREE);
28246 gcc_assert (decl != NULL_TREE);
28247 ix86_builtins[(int) code] = decl;
28248 TREE_READONLY (decl) = is_const;
28249 }
28250
28251 /* Make builtins to get CPU type and features supported. The created
28252    builtins are:
28253
28254 __builtin_cpu_init (), to detect cpu type and features,
28255 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28256 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28257 */
28258
28259 static void
28260 ix86_init_platform_type_builtins (void)
28261 {
28262 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28263 INT_FTYPE_VOID, false);
28264 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28265 INT_FTYPE_PCCHAR, true);
28266 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28267 INT_FTYPE_PCCHAR, true);
28268 }
28269
28270 /* Internal method for ix86_init_builtins. */
28271
28272 static void
28273 ix86_init_builtins_va_builtins_abi (void)
28274 {
28275 tree ms_va_ref, sysv_va_ref;
28276 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28277 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28278 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28279 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28280
28281 if (!TARGET_64BIT)
28282 return;
28283 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28284 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28285 ms_va_ref = build_reference_type (ms_va_list_type_node);
28286 sysv_va_ref =
28287 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28288
28289 fnvoid_va_end_ms =
28290 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28291 fnvoid_va_start_ms =
28292 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28293 fnvoid_va_end_sysv =
28294 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28295 fnvoid_va_start_sysv =
28296 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28297 NULL_TREE);
28298 fnvoid_va_copy_ms =
28299 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28300 NULL_TREE);
28301 fnvoid_va_copy_sysv =
28302 build_function_type_list (void_type_node, sysv_va_ref,
28303 sysv_va_ref, NULL_TREE);
28304
28305 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28306 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28307 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28308 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28309 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28310 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28311 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28312 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28313 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28314 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28315 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28316 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28317 }
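/* Assumed usage sketch (not part of the original sources): these builtins
   let 64-bit code take varargs in the non-default ABI, e.g.

     void __attribute__((ms_abi)) f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ...
       __builtin_ms_va_end (ap);
     }
*/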
28318
28319 static void
28320 ix86_init_builtin_types (void)
28321 {
28322 tree float128_type_node, float80_type_node;
28323
28324 /* The __float80 type. */
28325 float80_type_node = long_double_type_node;
28326 if (TYPE_MODE (float80_type_node) != XFmode)
28327 {
28328 /* The __float80 type. */
28329 float80_type_node = make_node (REAL_TYPE);
28330
28331 TYPE_PRECISION (float80_type_node) = 80;
28332 layout_type (float80_type_node);
28333 }
28334 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28335
28336 /* The __float128 type. */
28337 float128_type_node = make_node (REAL_TYPE);
28338 TYPE_PRECISION (float128_type_node) = 128;
28339 layout_type (float128_type_node);
28340 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28341
28342 /* This macro is built by i386-builtin-types.awk. */
28343 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28344 }
28345
28346 static void
28347 ix86_init_builtins (void)
28348 {
28349 tree t;
28350
28351 ix86_init_builtin_types ();
28352
28353 /* Builtins to get CPU type and features. */
28354 ix86_init_platform_type_builtins ();
28355
28356 /* TFmode support builtins. */
28357 def_builtin_const (0, "__builtin_infq",
28358 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28359 def_builtin_const (0, "__builtin_huge_valq",
28360 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28361
28362   /* We will expand them to a normal call if SSE isn't available, since
28363      they are used by libgcc. */
28364 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28365 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28366 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28367 TREE_READONLY (t) = 1;
28368 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28369
28370 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28371 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28372 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28373 TREE_READONLY (t) = 1;
28374 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28375
28376 ix86_init_tm_builtins ();
28377 ix86_init_mmx_sse_builtins ();
28378
28379 if (TARGET_LP64)
28380 ix86_init_builtins_va_builtins_abi ();
28381
28382 #ifdef SUBTARGET_INIT_BUILTINS
28383 SUBTARGET_INIT_BUILTINS;
28384 #endif
28385 }
28386
28387 /* Return the ix86 builtin for CODE. */
28388
28389 static tree
28390 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28391 {
28392 if (code >= IX86_BUILTIN_MAX)
28393 return error_mark_node;
28394
28395 return ix86_builtins[code];
28396 }
28397
28398 /* Errors in the source file can cause expand_expr to return const0_rtx
28399 where we expect a vector. To avoid crashing, use one of the vector
28400 clear instructions. */
28401 static rtx
28402 safe_vector_operand (rtx x, enum machine_mode mode)
28403 {
28404 if (x == const0_rtx)
28405 x = CONST0_RTX (mode);
28406 return x;
28407 }
28408
28409 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28410
28411 static rtx
28412 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28413 {
28414 rtx pat;
28415 tree arg0 = CALL_EXPR_ARG (exp, 0);
28416 tree arg1 = CALL_EXPR_ARG (exp, 1);
28417 rtx op0 = expand_normal (arg0);
28418 rtx op1 = expand_normal (arg1);
28419 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28420 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28421 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28422
28423 if (VECTOR_MODE_P (mode0))
28424 op0 = safe_vector_operand (op0, mode0);
28425 if (VECTOR_MODE_P (mode1))
28426 op1 = safe_vector_operand (op1, mode1);
28427
28428 if (optimize || !target
28429 || GET_MODE (target) != tmode
28430 || !insn_data[icode].operand[0].predicate (target, tmode))
28431 target = gen_reg_rtx (tmode);
28432
28433 if (GET_MODE (op1) == SImode && mode1 == TImode)
28434 {
28435 rtx x = gen_reg_rtx (V4SImode);
28436 emit_insn (gen_sse2_loadd (x, op1));
28437 op1 = gen_lowpart (TImode, x);
28438 }
28439
28440 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28441 op0 = copy_to_mode_reg (mode0, op0);
28442 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28443 op1 = copy_to_mode_reg (mode1, op1);
28444
28445 pat = GEN_FCN (icode) (target, op0, op1);
28446 if (! pat)
28447 return 0;
28448
28449 emit_insn (pat);
28450
28451 return target;
28452 }
28453
28454 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28455
28456 static rtx
28457 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28458 enum ix86_builtin_func_type m_type,
28459 enum rtx_code sub_code)
28460 {
28461 rtx pat;
28462 int i;
28463 int nargs;
28464 bool comparison_p = false;
28465 bool tf_p = false;
28466 bool last_arg_constant = false;
28467 int num_memory = 0;
28468 struct {
28469 rtx op;
28470 enum machine_mode mode;
28471 } args[4];
28472
28473 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28474
28475 switch (m_type)
28476 {
28477 case MULTI_ARG_4_DF2_DI_I:
28478 case MULTI_ARG_4_DF2_DI_I1:
28479 case MULTI_ARG_4_SF2_SI_I:
28480 case MULTI_ARG_4_SF2_SI_I1:
28481 nargs = 4;
28482 last_arg_constant = true;
28483 break;
28484
28485 case MULTI_ARG_3_SF:
28486 case MULTI_ARG_3_DF:
28487 case MULTI_ARG_3_SF2:
28488 case MULTI_ARG_3_DF2:
28489 case MULTI_ARG_3_DI:
28490 case MULTI_ARG_3_SI:
28491 case MULTI_ARG_3_SI_DI:
28492 case MULTI_ARG_3_HI:
28493 case MULTI_ARG_3_HI_SI:
28494 case MULTI_ARG_3_QI:
28495 case MULTI_ARG_3_DI2:
28496 case MULTI_ARG_3_SI2:
28497 case MULTI_ARG_3_HI2:
28498 case MULTI_ARG_3_QI2:
28499 nargs = 3;
28500 break;
28501
28502 case MULTI_ARG_2_SF:
28503 case MULTI_ARG_2_DF:
28504 case MULTI_ARG_2_DI:
28505 case MULTI_ARG_2_SI:
28506 case MULTI_ARG_2_HI:
28507 case MULTI_ARG_2_QI:
28508 nargs = 2;
28509 break;
28510
28511 case MULTI_ARG_2_DI_IMM:
28512 case MULTI_ARG_2_SI_IMM:
28513 case MULTI_ARG_2_HI_IMM:
28514 case MULTI_ARG_2_QI_IMM:
28515 nargs = 2;
28516 last_arg_constant = true;
28517 break;
28518
28519 case MULTI_ARG_1_SF:
28520 case MULTI_ARG_1_DF:
28521 case MULTI_ARG_1_SF2:
28522 case MULTI_ARG_1_DF2:
28523 case MULTI_ARG_1_DI:
28524 case MULTI_ARG_1_SI:
28525 case MULTI_ARG_1_HI:
28526 case MULTI_ARG_1_QI:
28527 case MULTI_ARG_1_SI_DI:
28528 case MULTI_ARG_1_HI_DI:
28529 case MULTI_ARG_1_HI_SI:
28530 case MULTI_ARG_1_QI_DI:
28531 case MULTI_ARG_1_QI_SI:
28532 case MULTI_ARG_1_QI_HI:
28533 nargs = 1;
28534 break;
28535
28536 case MULTI_ARG_2_DI_CMP:
28537 case MULTI_ARG_2_SI_CMP:
28538 case MULTI_ARG_2_HI_CMP:
28539 case MULTI_ARG_2_QI_CMP:
28540 nargs = 2;
28541 comparison_p = true;
28542 break;
28543
28544 case MULTI_ARG_2_SF_TF:
28545 case MULTI_ARG_2_DF_TF:
28546 case MULTI_ARG_2_DI_TF:
28547 case MULTI_ARG_2_SI_TF:
28548 case MULTI_ARG_2_HI_TF:
28549 case MULTI_ARG_2_QI_TF:
28550 nargs = 2;
28551 tf_p = true;
28552 break;
28553
28554 default:
28555 gcc_unreachable ();
28556 }
28557
28558 if (optimize || !target
28559 || GET_MODE (target) != tmode
28560 || !insn_data[icode].operand[0].predicate (target, tmode))
28561 target = gen_reg_rtx (tmode);
28562
28563 gcc_assert (nargs <= 4);
28564
28565 for (i = 0; i < nargs; i++)
28566 {
28567 tree arg = CALL_EXPR_ARG (exp, i);
28568 rtx op = expand_normal (arg);
28569 int adjust = (comparison_p) ? 1 : 0;
28570 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28571
28572 if (last_arg_constant && i == nargs - 1)
28573 {
28574 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28575 {
28576 enum insn_code new_icode = icode;
28577 switch (icode)
28578 {
28579 case CODE_FOR_xop_vpermil2v2df3:
28580 case CODE_FOR_xop_vpermil2v4sf3:
28581 case CODE_FOR_xop_vpermil2v4df3:
28582 case CODE_FOR_xop_vpermil2v8sf3:
28583 error ("the last argument must be a 2-bit immediate");
28584 return gen_reg_rtx (tmode);
28585 case CODE_FOR_xop_rotlv2di3:
28586 new_icode = CODE_FOR_rotlv2di3;
28587 goto xop_rotl;
28588 case CODE_FOR_xop_rotlv4si3:
28589 new_icode = CODE_FOR_rotlv4si3;
28590 goto xop_rotl;
28591 case CODE_FOR_xop_rotlv8hi3:
28592 new_icode = CODE_FOR_rotlv8hi3;
28593 goto xop_rotl;
28594 case CODE_FOR_xop_rotlv16qi3:
28595 new_icode = CODE_FOR_rotlv16qi3;
28596 xop_rotl:
28597 if (CONST_INT_P (op))
28598 {
28599 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28600 op = GEN_INT (INTVAL (op) & mask);
28601 gcc_checking_assert
28602 (insn_data[icode].operand[i + 1].predicate (op, mode));
28603 }
28604 else
28605 {
28606 gcc_checking_assert
28607 (nargs == 2
28608 && insn_data[new_icode].operand[0].mode == tmode
28609 && insn_data[new_icode].operand[1].mode == tmode
28610 && insn_data[new_icode].operand[2].mode == mode
28611 && insn_data[new_icode].operand[0].predicate
28612 == insn_data[icode].operand[0].predicate
28613 && insn_data[new_icode].operand[1].predicate
28614 == insn_data[icode].operand[1].predicate);
28615 icode = new_icode;
28616 goto non_constant;
28617 }
28618 break;
28619 default:
28620 gcc_unreachable ();
28621 }
28622 }
28623 }
28624 else
28625 {
28626 non_constant:
28627 if (VECTOR_MODE_P (mode))
28628 op = safe_vector_operand (op, mode);
28629
28630 /* If we aren't optimizing, only allow one memory operand to be
28631 generated. */
28632 if (memory_operand (op, mode))
28633 num_memory++;
28634
28635 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28636
28637 if (optimize
28638 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28639 || num_memory > 1)
28640 op = force_reg (mode, op);
28641 }
28642
28643 args[i].op = op;
28644 args[i].mode = mode;
28645 }
28646
28647 switch (nargs)
28648 {
28649 case 1:
28650 pat = GEN_FCN (icode) (target, args[0].op);
28651 break;
28652
28653 case 2:
28654 if (tf_p)
28655 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28656 GEN_INT ((int)sub_code));
28657 else if (! comparison_p)
28658 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28659 else
28660 {
28661 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28662 args[0].op,
28663 args[1].op);
28664
28665 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28666 }
28667 break;
28668
28669 case 3:
28670 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28671 break;
28672
28673 case 4:
28674 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28675 break;
28676
28677 default:
28678 gcc_unreachable ();
28679 }
28680
28681 if (! pat)
28682 return 0;
28683
28684 emit_insn (pat);
28685 return target;
28686 }
28687
28688 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28689 insns with vec_merge. */
28690
28691 static rtx
28692 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28693 rtx target)
28694 {
28695 rtx pat;
28696 tree arg0 = CALL_EXPR_ARG (exp, 0);
28697 rtx op1, op0 = expand_normal (arg0);
28698 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28699 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28700
28701 if (optimize || !target
28702 || GET_MODE (target) != tmode
28703 || !insn_data[icode].operand[0].predicate (target, tmode))
28704 target = gen_reg_rtx (tmode);
28705
28706 if (VECTOR_MODE_P (mode0))
28707 op0 = safe_vector_operand (op0, mode0);
28708
28709 if ((optimize && !register_operand (op0, mode0))
28710 || !insn_data[icode].operand[1].predicate (op0, mode0))
28711 op0 = copy_to_mode_reg (mode0, op0);
28712
28713 op1 = op0;
28714 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28715 op1 = copy_to_mode_reg (mode0, op1);
28716
28717 pat = GEN_FCN (icode) (target, op0, op1);
28718 if (! pat)
28719 return 0;
28720 emit_insn (pat);
28721 return target;
28722 }
28723
28724 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28725
28726 static rtx
28727 ix86_expand_sse_compare (const struct builtin_description *d,
28728 tree exp, rtx target, bool swap)
28729 {
28730 rtx pat;
28731 tree arg0 = CALL_EXPR_ARG (exp, 0);
28732 tree arg1 = CALL_EXPR_ARG (exp, 1);
28733 rtx op0 = expand_normal (arg0);
28734 rtx op1 = expand_normal (arg1);
28735 rtx op2;
28736 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28737 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28738 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28739 enum rtx_code comparison = d->comparison;
28740
28741 if (VECTOR_MODE_P (mode0))
28742 op0 = safe_vector_operand (op0, mode0);
28743 if (VECTOR_MODE_P (mode1))
28744 op1 = safe_vector_operand (op1, mode1);
28745
28746 /* Swap operands if we have a comparison that isn't available in
28747 hardware. */
28748 if (swap)
28749 {
28750 rtx tmp = gen_reg_rtx (mode1);
28751 emit_move_insn (tmp, op1);
28752 op1 = op0;
28753 op0 = tmp;
28754 }
28755
28756 if (optimize || !target
28757 || GET_MODE (target) != tmode
28758 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28759 target = gen_reg_rtx (tmode);
28760
28761 if ((optimize && !register_operand (op0, mode0))
28762 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28763 op0 = copy_to_mode_reg (mode0, op0);
28764 if ((optimize && !register_operand (op1, mode1))
28765 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28766 op1 = copy_to_mode_reg (mode1, op1);
28767
28768 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28769 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28770 if (! pat)
28771 return 0;
28772 emit_insn (pat);
28773 return target;
28774 }
28775
28776 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28777
28778 static rtx
28779 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28780 rtx target)
28781 {
28782 rtx pat;
28783 tree arg0 = CALL_EXPR_ARG (exp, 0);
28784 tree arg1 = CALL_EXPR_ARG (exp, 1);
28785 rtx op0 = expand_normal (arg0);
28786 rtx op1 = expand_normal (arg1);
28787 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28788 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28789 enum rtx_code comparison = d->comparison;
28790
28791 if (VECTOR_MODE_P (mode0))
28792 op0 = safe_vector_operand (op0, mode0);
28793 if (VECTOR_MODE_P (mode1))
28794 op1 = safe_vector_operand (op1, mode1);
28795
28796 /* Swap operands if we have a comparison that isn't available in
28797 hardware. */
28798 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28799 {
28800 rtx tmp = op1;
28801 op1 = op0;
28802 op0 = tmp;
28803 }
28804
28805 target = gen_reg_rtx (SImode);
28806 emit_move_insn (target, const0_rtx);
28807 target = gen_rtx_SUBREG (QImode, target, 0);
28808
28809 if ((optimize && !register_operand (op0, mode0))
28810 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28811 op0 = copy_to_mode_reg (mode0, op0);
28812 if ((optimize && !register_operand (op1, mode1))
28813 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28814 op1 = copy_to_mode_reg (mode1, op1);
28815
28816 pat = GEN_FCN (d->icode) (op0, op1);
28817 if (! pat)
28818 return 0;
28819 emit_insn (pat);
28820 emit_insn (gen_rtx_SET (VOIDmode,
28821 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28822 gen_rtx_fmt_ee (comparison, QImode,
28823 SET_DEST (pat),
28824 const0_rtx)));
28825
28826 return SUBREG_REG (target);
28827 }
28828
28829 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28830
28831 static rtx
28832 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28833 rtx target)
28834 {
28835 rtx pat;
28836 tree arg0 = CALL_EXPR_ARG (exp, 0);
28837 rtx op1, op0 = expand_normal (arg0);
28838 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28839 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28840
28841 if (optimize || target == 0
28842 || GET_MODE (target) != tmode
28843 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28844 target = gen_reg_rtx (tmode);
28845
28846 if (VECTOR_MODE_P (mode0))
28847 op0 = safe_vector_operand (op0, mode0);
28848
28849 if ((optimize && !register_operand (op0, mode0))
28850 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28851 op0 = copy_to_mode_reg (mode0, op0);
28852
28853 op1 = GEN_INT (d->comparison);
28854
28855 pat = GEN_FCN (d->icode) (target, op0, op1);
28856 if (! pat)
28857 return 0;
28858 emit_insn (pat);
28859 return target;
28860 }
28861
28862 static rtx
28863 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28864 tree exp, rtx target)
28865 {
28866 rtx pat;
28867 tree arg0 = CALL_EXPR_ARG (exp, 0);
28868 tree arg1 = CALL_EXPR_ARG (exp, 1);
28869 rtx op0 = expand_normal (arg0);
28870 rtx op1 = expand_normal (arg1);
28871 rtx op2;
28872 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28873 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28874 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28875
28876 if (optimize || target == 0
28877 || GET_MODE (target) != tmode
28878 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28879 target = gen_reg_rtx (tmode);
28880
28881 op0 = safe_vector_operand (op0, mode0);
28882 op1 = safe_vector_operand (op1, mode1);
28883
28884 if ((optimize && !register_operand (op0, mode0))
28885 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28886 op0 = copy_to_mode_reg (mode0, op0);
28887 if ((optimize && !register_operand (op1, mode1))
28888 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28889 op1 = copy_to_mode_reg (mode1, op1);
28890
28891 op2 = GEN_INT (d->comparison);
28892
28893 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28894 if (! pat)
28895 return 0;
28896 emit_insn (pat);
28897 return target;
28898 }
28899
28900 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28901
28902 static rtx
28903 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28904 rtx target)
28905 {
28906 rtx pat;
28907 tree arg0 = CALL_EXPR_ARG (exp, 0);
28908 tree arg1 = CALL_EXPR_ARG (exp, 1);
28909 rtx op0 = expand_normal (arg0);
28910 rtx op1 = expand_normal (arg1);
28911 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28912 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28913 enum rtx_code comparison = d->comparison;
28914
28915 if (VECTOR_MODE_P (mode0))
28916 op0 = safe_vector_operand (op0, mode0);
28917 if (VECTOR_MODE_P (mode1))
28918 op1 = safe_vector_operand (op1, mode1);
28919
28920 target = gen_reg_rtx (SImode);
28921 emit_move_insn (target, const0_rtx);
28922 target = gen_rtx_SUBREG (QImode, target, 0);
28923
28924 if ((optimize && !register_operand (op0, mode0))
28925 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28926 op0 = copy_to_mode_reg (mode0, op0);
28927 if ((optimize && !register_operand (op1, mode1))
28928 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28929 op1 = copy_to_mode_reg (mode1, op1);
28930
28931 pat = GEN_FCN (d->icode) (op0, op1);
28932 if (! pat)
28933 return 0;
28934 emit_insn (pat);
28935 emit_insn (gen_rtx_SET (VOIDmode,
28936 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28937 gen_rtx_fmt_ee (comparison, QImode,
28938 SET_DEST (pat),
28939 const0_rtx)));
28940
28941 return SUBREG_REG (target);
28942 }
28943
28944 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28945
28946 static rtx
28947 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28948 tree exp, rtx target)
28949 {
28950 rtx pat;
28951 tree arg0 = CALL_EXPR_ARG (exp, 0);
28952 tree arg1 = CALL_EXPR_ARG (exp, 1);
28953 tree arg2 = CALL_EXPR_ARG (exp, 2);
28954 tree arg3 = CALL_EXPR_ARG (exp, 3);
28955 tree arg4 = CALL_EXPR_ARG (exp, 4);
28956 rtx scratch0, scratch1;
28957 rtx op0 = expand_normal (arg0);
28958 rtx op1 = expand_normal (arg1);
28959 rtx op2 = expand_normal (arg2);
28960 rtx op3 = expand_normal (arg3);
28961 rtx op4 = expand_normal (arg4);
28962 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28963
28964 tmode0 = insn_data[d->icode].operand[0].mode;
28965 tmode1 = insn_data[d->icode].operand[1].mode;
28966 modev2 = insn_data[d->icode].operand[2].mode;
28967 modei3 = insn_data[d->icode].operand[3].mode;
28968 modev4 = insn_data[d->icode].operand[4].mode;
28969 modei5 = insn_data[d->icode].operand[5].mode;
28970 modeimm = insn_data[d->icode].operand[6].mode;
28971
28972 if (VECTOR_MODE_P (modev2))
28973 op0 = safe_vector_operand (op0, modev2);
28974 if (VECTOR_MODE_P (modev4))
28975 op2 = safe_vector_operand (op2, modev4);
28976
28977 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28978 op0 = copy_to_mode_reg (modev2, op0);
28979 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28980 op1 = copy_to_mode_reg (modei3, op1);
28981 if ((optimize && !register_operand (op2, modev4))
28982 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28983 op2 = copy_to_mode_reg (modev4, op2);
28984 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28985 op3 = copy_to_mode_reg (modei5, op3);
28986
28987 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28988 {
28989 error ("the fifth argument must be an 8-bit immediate");
28990 return const0_rtx;
28991 }
28992
28993 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28994 {
28995 if (optimize || !target
28996 || GET_MODE (target) != tmode0
28997 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28998 target = gen_reg_rtx (tmode0);
28999
29000 scratch1 = gen_reg_rtx (tmode1);
29001
29002 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
29003 }
29004 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
29005 {
29006 if (optimize || !target
29007 || GET_MODE (target) != tmode1
29008 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29009 target = gen_reg_rtx (tmode1);
29010
29011 scratch0 = gen_reg_rtx (tmode0);
29012
29013 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
29014 }
29015 else
29016 {
29017 gcc_assert (d->flag);
29018
29019 scratch0 = gen_reg_rtx (tmode0);
29020 scratch1 = gen_reg_rtx (tmode1);
29021
29022 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
29023 }
29024
29025 if (! pat)
29026 return 0;
29027
29028 emit_insn (pat);
29029
29030 if (d->flag)
29031 {
29032 target = gen_reg_rtx (SImode);
29033 emit_move_insn (target, const0_rtx);
29034 target = gen_rtx_SUBREG (QImode, target, 0);
29035
29036 emit_insn
29037 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29038 gen_rtx_fmt_ee (EQ, QImode,
29039 gen_rtx_REG ((enum machine_mode) d->flag,
29040 FLAGS_REG),
29041 const0_rtx)));
29042 return SUBREG_REG (target);
29043 }
29044 else
29045 return target;
29046 }
29047
29048
29049 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
29050
29051 static rtx
29052 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
29053 tree exp, rtx target)
29054 {
29055 rtx pat;
29056 tree arg0 = CALL_EXPR_ARG (exp, 0);
29057 tree arg1 = CALL_EXPR_ARG (exp, 1);
29058 tree arg2 = CALL_EXPR_ARG (exp, 2);
29059 rtx scratch0, scratch1;
29060 rtx op0 = expand_normal (arg0);
29061 rtx op1 = expand_normal (arg1);
29062 rtx op2 = expand_normal (arg2);
29063 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
29064
29065 tmode0 = insn_data[d->icode].operand[0].mode;
29066 tmode1 = insn_data[d->icode].operand[1].mode;
29067 modev2 = insn_data[d->icode].operand[2].mode;
29068 modev3 = insn_data[d->icode].operand[3].mode;
29069 modeimm = insn_data[d->icode].operand[4].mode;
29070
29071 if (VECTOR_MODE_P (modev2))
29072 op0 = safe_vector_operand (op0, modev2);
29073 if (VECTOR_MODE_P (modev3))
29074 op1 = safe_vector_operand (op1, modev3);
29075
29076 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29077 op0 = copy_to_mode_reg (modev2, op0);
29078 if ((optimize && !register_operand (op1, modev3))
29079 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
29080 op1 = copy_to_mode_reg (modev3, op1);
29081
29082 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
29083 {
29084 error ("the third argument must be an 8-bit immediate");
29085 return const0_rtx;
29086 }
29087
29088 if (d->code == IX86_BUILTIN_PCMPISTRI128)
29089 {
29090 if (optimize || !target
29091 || GET_MODE (target) != tmode0
29092 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29093 target = gen_reg_rtx (tmode0);
29094
29095 scratch1 = gen_reg_rtx (tmode1);
29096
29097 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
29098 }
29099 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
29100 {
29101 if (optimize || !target
29102 || GET_MODE (target) != tmode1
29103 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29104 target = gen_reg_rtx (tmode1);
29105
29106 scratch0 = gen_reg_rtx (tmode0);
29107
29108 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
29109 }
29110 else
29111 {
29112 gcc_assert (d->flag);
29113
29114 scratch0 = gen_reg_rtx (tmode0);
29115 scratch1 = gen_reg_rtx (tmode1);
29116
29117 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
29118 }
29119
29120 if (! pat)
29121 return 0;
29122
29123 emit_insn (pat);
29124
29125 if (d->flag)
29126 {
29127 target = gen_reg_rtx (SImode);
29128 emit_move_insn (target, const0_rtx);
29129 target = gen_rtx_SUBREG (QImode, target, 0);
29130
29131 emit_insn
29132 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29133 gen_rtx_fmt_ee (EQ, QImode,
29134 gen_rtx_REG ((enum machine_mode) d->flag,
29135 FLAGS_REG),
29136 const0_rtx)));
29137 return SUBREG_REG (target);
29138 }
29139 else
29140 return target;
29141 }
29142
29143 /* Subroutine of ix86_expand_builtin to take care of insns with
29144 variable number of operands. */
29145
29146 static rtx
29147 ix86_expand_args_builtin (const struct builtin_description *d,
29148 tree exp, rtx target)
29149 {
29150 rtx pat, real_target;
29151 unsigned int i, nargs;
29152 unsigned int nargs_constant = 0;
29153 int num_memory = 0;
29154 struct
29155 {
29156 rtx op;
29157 enum machine_mode mode;
29158 } args[4];
29159 bool last_arg_count = false;
29160 enum insn_code icode = d->icode;
29161 const struct insn_data_d *insn_p = &insn_data[icode];
29162 enum machine_mode tmode = insn_p->operand[0].mode;
29163 enum machine_mode rmode = VOIDmode;
29164 bool swap = false;
29165 enum rtx_code comparison = d->comparison;
29166
29167 switch ((enum ix86_builtin_func_type) d->flag)
29168 {
29169 case V2DF_FTYPE_V2DF_ROUND:
29170 case V4DF_FTYPE_V4DF_ROUND:
29171 case V4SF_FTYPE_V4SF_ROUND:
29172 case V8SF_FTYPE_V8SF_ROUND:
29173 case V4SI_FTYPE_V4SF_ROUND:
29174 case V8SI_FTYPE_V8SF_ROUND:
29175 return ix86_expand_sse_round (d, exp, target);
29176 case V4SI_FTYPE_V2DF_V2DF_ROUND:
29177 case V8SI_FTYPE_V4DF_V4DF_ROUND:
29178 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
29179 case INT_FTYPE_V8SF_V8SF_PTEST:
29180 case INT_FTYPE_V4DI_V4DI_PTEST:
29181 case INT_FTYPE_V4DF_V4DF_PTEST:
29182 case INT_FTYPE_V4SF_V4SF_PTEST:
29183 case INT_FTYPE_V2DI_V2DI_PTEST:
29184 case INT_FTYPE_V2DF_V2DF_PTEST:
29185 return ix86_expand_sse_ptest (d, exp, target);
29186 case FLOAT128_FTYPE_FLOAT128:
29187 case FLOAT_FTYPE_FLOAT:
29188 case INT_FTYPE_INT:
29189 case UINT64_FTYPE_INT:
29190 case UINT16_FTYPE_UINT16:
29191 case INT64_FTYPE_INT64:
29192 case INT64_FTYPE_V4SF:
29193 case INT64_FTYPE_V2DF:
29194 case INT_FTYPE_V16QI:
29195 case INT_FTYPE_V8QI:
29196 case INT_FTYPE_V8SF:
29197 case INT_FTYPE_V4DF:
29198 case INT_FTYPE_V4SF:
29199 case INT_FTYPE_V2DF:
29200 case INT_FTYPE_V32QI:
29201 case V16QI_FTYPE_V16QI:
29202 case V8SI_FTYPE_V8SF:
29203 case V8SI_FTYPE_V4SI:
29204 case V8HI_FTYPE_V8HI:
29205 case V8HI_FTYPE_V16QI:
29206 case V8QI_FTYPE_V8QI:
29207 case V8SF_FTYPE_V8SF:
29208 case V8SF_FTYPE_V8SI:
29209 case V8SF_FTYPE_V4SF:
29210 case V8SF_FTYPE_V8HI:
29211 case V4SI_FTYPE_V4SI:
29212 case V4SI_FTYPE_V16QI:
29213 case V4SI_FTYPE_V4SF:
29214 case V4SI_FTYPE_V8SI:
29215 case V4SI_FTYPE_V8HI:
29216 case V4SI_FTYPE_V4DF:
29217 case V4SI_FTYPE_V2DF:
29218 case V4HI_FTYPE_V4HI:
29219 case V4DF_FTYPE_V4DF:
29220 case V4DF_FTYPE_V4SI:
29221 case V4DF_FTYPE_V4SF:
29222 case V4DF_FTYPE_V2DF:
29223 case V4SF_FTYPE_V4SF:
29224 case V4SF_FTYPE_V4SI:
29225 case V4SF_FTYPE_V8SF:
29226 case V4SF_FTYPE_V4DF:
29227 case V4SF_FTYPE_V8HI:
29228 case V4SF_FTYPE_V2DF:
29229 case V2DI_FTYPE_V2DI:
29230 case V2DI_FTYPE_V16QI:
29231 case V2DI_FTYPE_V8HI:
29232 case V2DI_FTYPE_V4SI:
29233 case V2DF_FTYPE_V2DF:
29234 case V2DF_FTYPE_V4SI:
29235 case V2DF_FTYPE_V4DF:
29236 case V2DF_FTYPE_V4SF:
29237 case V2DF_FTYPE_V2SI:
29238 case V2SI_FTYPE_V2SI:
29239 case V2SI_FTYPE_V4SF:
29240 case V2SI_FTYPE_V2SF:
29241 case V2SI_FTYPE_V2DF:
29242 case V2SF_FTYPE_V2SF:
29243 case V2SF_FTYPE_V2SI:
29244 case V32QI_FTYPE_V32QI:
29245 case V32QI_FTYPE_V16QI:
29246 case V16HI_FTYPE_V16HI:
29247 case V16HI_FTYPE_V8HI:
29248 case V8SI_FTYPE_V8SI:
29249 case V16HI_FTYPE_V16QI:
29250 case V8SI_FTYPE_V16QI:
29251 case V4DI_FTYPE_V16QI:
29252 case V8SI_FTYPE_V8HI:
29253 case V4DI_FTYPE_V8HI:
29254 case V4DI_FTYPE_V4SI:
29255 case V4DI_FTYPE_V2DI:
29256 nargs = 1;
29257 break;
29258 case V4SF_FTYPE_V4SF_VEC_MERGE:
29259 case V2DF_FTYPE_V2DF_VEC_MERGE:
29260 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29261 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29262 case V16QI_FTYPE_V16QI_V16QI:
29263 case V16QI_FTYPE_V8HI_V8HI:
29264 case V8QI_FTYPE_V8QI_V8QI:
29265 case V8QI_FTYPE_V4HI_V4HI:
29266 case V8HI_FTYPE_V8HI_V8HI:
29267 case V8HI_FTYPE_V16QI_V16QI:
29268 case V8HI_FTYPE_V4SI_V4SI:
29269 case V8SF_FTYPE_V8SF_V8SF:
29270 case V8SF_FTYPE_V8SF_V8SI:
29271 case V4SI_FTYPE_V4SI_V4SI:
29272 case V4SI_FTYPE_V8HI_V8HI:
29273 case V4SI_FTYPE_V4SF_V4SF:
29274 case V4SI_FTYPE_V2DF_V2DF:
29275 case V4HI_FTYPE_V4HI_V4HI:
29276 case V4HI_FTYPE_V8QI_V8QI:
29277 case V4HI_FTYPE_V2SI_V2SI:
29278 case V4DF_FTYPE_V4DF_V4DF:
29279 case V4DF_FTYPE_V4DF_V4DI:
29280 case V4SF_FTYPE_V4SF_V4SF:
29281 case V4SF_FTYPE_V4SF_V4SI:
29282 case V4SF_FTYPE_V4SF_V2SI:
29283 case V4SF_FTYPE_V4SF_V2DF:
29284 case V4SF_FTYPE_V4SF_DI:
29285 case V4SF_FTYPE_V4SF_SI:
29286 case V2DI_FTYPE_V2DI_V2DI:
29287 case V2DI_FTYPE_V16QI_V16QI:
29288 case V2DI_FTYPE_V4SI_V4SI:
29289 case V2UDI_FTYPE_V4USI_V4USI:
29290 case V2DI_FTYPE_V2DI_V16QI:
29291 case V2DI_FTYPE_V2DF_V2DF:
29292 case V2SI_FTYPE_V2SI_V2SI:
29293 case V2SI_FTYPE_V4HI_V4HI:
29294 case V2SI_FTYPE_V2SF_V2SF:
29295 case V2DF_FTYPE_V2DF_V2DF:
29296 case V2DF_FTYPE_V2DF_V4SF:
29297 case V2DF_FTYPE_V2DF_V2DI:
29298 case V2DF_FTYPE_V2DF_DI:
29299 case V2DF_FTYPE_V2DF_SI:
29300 case V2SF_FTYPE_V2SF_V2SF:
29301 case V1DI_FTYPE_V1DI_V1DI:
29302 case V1DI_FTYPE_V8QI_V8QI:
29303 case V1DI_FTYPE_V2SI_V2SI:
29304 case V32QI_FTYPE_V16HI_V16HI:
29305 case V16HI_FTYPE_V8SI_V8SI:
29306 case V32QI_FTYPE_V32QI_V32QI:
29307 case V16HI_FTYPE_V32QI_V32QI:
29308 case V16HI_FTYPE_V16HI_V16HI:
29309 case V8SI_FTYPE_V4DF_V4DF:
29310 case V8SI_FTYPE_V8SI_V8SI:
29311 case V8SI_FTYPE_V16HI_V16HI:
29312 case V4DI_FTYPE_V4DI_V4DI:
29313 case V4DI_FTYPE_V8SI_V8SI:
29314 case V4UDI_FTYPE_V8USI_V8USI:
29315 if (comparison == UNKNOWN)
29316 return ix86_expand_binop_builtin (icode, exp, target);
29317 nargs = 2;
29318 break;
29319 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29320 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29321 gcc_assert (comparison != UNKNOWN);
29322 nargs = 2;
29323 swap = true;
29324 break;
29325 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29326 case V16HI_FTYPE_V16HI_SI_COUNT:
29327 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29328 case V8SI_FTYPE_V8SI_SI_COUNT:
29329 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29330 case V4DI_FTYPE_V4DI_INT_COUNT:
29331 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29332 case V8HI_FTYPE_V8HI_SI_COUNT:
29333 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29334 case V4SI_FTYPE_V4SI_SI_COUNT:
29335 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29336 case V4HI_FTYPE_V4HI_SI_COUNT:
29337 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29338 case V2DI_FTYPE_V2DI_SI_COUNT:
29339 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29340 case V2SI_FTYPE_V2SI_SI_COUNT:
29341 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29342 case V1DI_FTYPE_V1DI_SI_COUNT:
29343 nargs = 2;
29344 last_arg_count = true;
29345 break;
29346 case UINT64_FTYPE_UINT64_UINT64:
29347 case UINT_FTYPE_UINT_UINT:
29348 case UINT_FTYPE_UINT_USHORT:
29349 case UINT_FTYPE_UINT_UCHAR:
29350 case UINT16_FTYPE_UINT16_INT:
29351 case UINT8_FTYPE_UINT8_INT:
29352 nargs = 2;
29353 break;
29354 case V2DI_FTYPE_V2DI_INT_CONVERT:
29355 nargs = 2;
29356 rmode = V1TImode;
29357 nargs_constant = 1;
29358 break;
29359 case V4DI_FTYPE_V4DI_INT_CONVERT:
29360 nargs = 2;
29361 rmode = V2TImode;
29362 nargs_constant = 1;
29363 break;
29364 case V8HI_FTYPE_V8HI_INT:
29365 case V8HI_FTYPE_V8SF_INT:
29366 case V8HI_FTYPE_V4SF_INT:
29367 case V8SF_FTYPE_V8SF_INT:
29368 case V4SI_FTYPE_V4SI_INT:
29369 case V4SI_FTYPE_V8SI_INT:
29370 case V4HI_FTYPE_V4HI_INT:
29371 case V4DF_FTYPE_V4DF_INT:
29372 case V4SF_FTYPE_V4SF_INT:
29373 case V4SF_FTYPE_V8SF_INT:
29374 case V2DI_FTYPE_V2DI_INT:
29375 case V2DF_FTYPE_V2DF_INT:
29376 case V2DF_FTYPE_V4DF_INT:
29377 case V16HI_FTYPE_V16HI_INT:
29378 case V8SI_FTYPE_V8SI_INT:
29379 case V4DI_FTYPE_V4DI_INT:
29380 case V2DI_FTYPE_V4DI_INT:
29381 nargs = 2;
29382 nargs_constant = 1;
29383 break;
29384 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29385 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29386 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29387 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29388 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29389 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29390 nargs = 3;
29391 break;
29392 case V32QI_FTYPE_V32QI_V32QI_INT:
29393 case V16HI_FTYPE_V16HI_V16HI_INT:
29394 case V16QI_FTYPE_V16QI_V16QI_INT:
29395 case V4DI_FTYPE_V4DI_V4DI_INT:
29396 case V8HI_FTYPE_V8HI_V8HI_INT:
29397 case V8SI_FTYPE_V8SI_V8SI_INT:
29398 case V8SI_FTYPE_V8SI_V4SI_INT:
29399 case V8SF_FTYPE_V8SF_V8SF_INT:
29400 case V8SF_FTYPE_V8SF_V4SF_INT:
29401 case V4SI_FTYPE_V4SI_V4SI_INT:
29402 case V4DF_FTYPE_V4DF_V4DF_INT:
29403 case V4DF_FTYPE_V4DF_V2DF_INT:
29404 case V4SF_FTYPE_V4SF_V4SF_INT:
29405 case V2DI_FTYPE_V2DI_V2DI_INT:
29406 case V4DI_FTYPE_V4DI_V2DI_INT:
29407 case V2DF_FTYPE_V2DF_V2DF_INT:
29408 nargs = 3;
29409 nargs_constant = 1;
29410 break;
29411 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29412 nargs = 3;
29413 rmode = V4DImode;
29414 nargs_constant = 1;
29415 break;
29416 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29417 nargs = 3;
29418 rmode = V2DImode;
29419 nargs_constant = 1;
29420 break;
29421 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29422 nargs = 3;
29423 rmode = DImode;
29424 nargs_constant = 1;
29425 break;
29426 case V2DI_FTYPE_V2DI_UINT_UINT:
29427 nargs = 3;
29428 nargs_constant = 2;
29429 break;
29430 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29431 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29432 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29433 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29434 nargs = 4;
29435 nargs_constant = 1;
29436 break;
29437 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29438 nargs = 4;
29439 nargs_constant = 2;
29440 break;
29441 default:
29442 gcc_unreachable ();
29443 }
29444
29445 gcc_assert (nargs <= ARRAY_SIZE (args));
29446
29447 if (comparison != UNKNOWN)
29448 {
29449 gcc_assert (nargs == 2);
29450 return ix86_expand_sse_compare (d, exp, target, swap);
29451 }
29452
29453 if (rmode == VOIDmode || rmode == tmode)
29454 {
29455 if (optimize
29456 || target == 0
29457 || GET_MODE (target) != tmode
29458 || !insn_p->operand[0].predicate (target, tmode))
29459 target = gen_reg_rtx (tmode);
29460 real_target = target;
29461 }
29462 else
29463 {
29464 target = gen_reg_rtx (rmode);
29465 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29466 }
29467
29468 for (i = 0; i < nargs; i++)
29469 {
29470 tree arg = CALL_EXPR_ARG (exp, i);
29471 rtx op = expand_normal (arg);
29472 enum machine_mode mode = insn_p->operand[i + 1].mode;
29473 bool match = insn_p->operand[i + 1].predicate (op, mode);
29474
29475 if (last_arg_count && (i + 1) == nargs)
29476 {
29477 /* SIMD shift insns take either an 8-bit immediate or a
29478 register as the count.  But the builtin functions take an int as
29479 the count.  If the count doesn't match, we put it in a register. */
29480 if (!match)
29481 {
29482 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29483 if (!insn_p->operand[i + 1].predicate (op, mode))
29484 op = copy_to_reg (op);
29485 }
29486 }
29487 else if ((nargs - i) <= nargs_constant)
29488 {
29489 if (!match)
29490 switch (icode)
29491 {
29492 case CODE_FOR_avx2_inserti128:
29493 case CODE_FOR_avx2_extracti128:
29494 error ("the last argument must be a 1-bit immediate");
29495 return const0_rtx;
29496
29497 case CODE_FOR_sse4_1_roundsd:
29498 case CODE_FOR_sse4_1_roundss:
29499
29500 case CODE_FOR_sse4_1_roundpd:
29501 case CODE_FOR_sse4_1_roundps:
29502 case CODE_FOR_avx_roundpd256:
29503 case CODE_FOR_avx_roundps256:
29504
29505 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29506 case CODE_FOR_sse4_1_roundps_sfix:
29507 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29508 case CODE_FOR_avx_roundps_sfix256:
29509
29510 case CODE_FOR_sse4_1_blendps:
29511 case CODE_FOR_avx_blendpd256:
29512 case CODE_FOR_avx_vpermilv4df:
29513 error ("the last argument must be a 4-bit immediate");
29514 return const0_rtx;
29515
29516 case CODE_FOR_sse4_1_blendpd:
29517 case CODE_FOR_avx_vpermilv2df:
29518 case CODE_FOR_xop_vpermil2v2df3:
29519 case CODE_FOR_xop_vpermil2v4sf3:
29520 case CODE_FOR_xop_vpermil2v4df3:
29521 case CODE_FOR_xop_vpermil2v8sf3:
29522 error ("the last argument must be a 2-bit immediate");
29523 return const0_rtx;
29524
29525 case CODE_FOR_avx_vextractf128v4df:
29526 case CODE_FOR_avx_vextractf128v8sf:
29527 case CODE_FOR_avx_vextractf128v8si:
29528 case CODE_FOR_avx_vinsertf128v4df:
29529 case CODE_FOR_avx_vinsertf128v8sf:
29530 case CODE_FOR_avx_vinsertf128v8si:
29531 error ("the last argument must be a 1-bit immediate");
29532 return const0_rtx;
29533
29534 case CODE_FOR_avx_vmcmpv2df3:
29535 case CODE_FOR_avx_vmcmpv4sf3:
29536 case CODE_FOR_avx_cmpv2df3:
29537 case CODE_FOR_avx_cmpv4sf3:
29538 case CODE_FOR_avx_cmpv4df3:
29539 case CODE_FOR_avx_cmpv8sf3:
29540 error ("the last argument must be a 5-bit immediate");
29541 return const0_rtx;
29542
29543 default:
29544 switch (nargs_constant)
29545 {
29546 case 2:
29547 if ((nargs - i) == nargs_constant)
29548 {
29549 error ("the next to last argument must be an 8-bit immediate");
29550 break;
29551 }
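	      /* FALLTHRU */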
29552 case 1:
29553 error ("the last argument must be an 8-bit immediate");
29554 break;
29555 default:
29556 gcc_unreachable ();
29557 }
29558 return const0_rtx;
29559 }
29560 }
29561 else
29562 {
29563 if (VECTOR_MODE_P (mode))
29564 op = safe_vector_operand (op, mode);
29565
29566 /* If we aren't optimizing, only allow one memory operand to
29567 be generated. */
29568 if (memory_operand (op, mode))
29569 num_memory++;
29570
29571 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29572 {
29573 if (optimize || !match || num_memory > 1)
29574 op = copy_to_mode_reg (mode, op);
29575 }
29576 else
29577 {
29578 op = copy_to_reg (op);
29579 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29580 }
29581 }
29582
29583 args[i].op = op;
29584 args[i].mode = mode;
29585 }
29586
29587 switch (nargs)
29588 {
29589 case 1:
29590 pat = GEN_FCN (icode) (real_target, args[0].op);
29591 break;
29592 case 2:
29593 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29594 break;
29595 case 3:
29596 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29597 args[2].op);
29598 break;
29599 case 4:
29600 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29601 args[2].op, args[3].op);
29602 break;
29603 default:
29604 gcc_unreachable ();
29605 }
29606
29607 if (! pat)
29608 return 0;
29609
29610 emit_insn (pat);
29611 return target;
29612 }
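/* As an illustration (a sketch; the intrinsic wrapper shown is an
   assumption about how the SSE headers use the builtin, not a quote of
   them), a plain two-operand builtin classified as V8HI_FTYPE_V8HI_V8HI,
   e.g.

     __m128i
     add16 (__m128i a, __m128i b)
     {
       return (__m128i) __builtin_ia32_paddw128 ((__v8hi) a, (__v8hi) b);
     }

   has no comparison attached and is handed straight to
   ix86_expand_binop_builtin above, while a shift builtin classified as
   V8HI_FTYPE_V8HI_SI_COUNT takes the last_arg_count path so that a count
   which is neither an 8-bit immediate nor already in the right mode is
   copied into a register first.  */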
29613
29614 /* Subroutine of ix86_expand_builtin to take care of special insns
29615 with variable number of operands. */
29616
29617 static rtx
29618 ix86_expand_special_args_builtin (const struct builtin_description *d,
29619 tree exp, rtx target)
29620 {
29621 tree arg;
29622 rtx pat, op;
29623 unsigned int i, nargs, arg_adjust, memory;
29624 struct
29625 {
29626 rtx op;
29627 enum machine_mode mode;
29628 } args[3];
29629 enum insn_code icode = d->icode;
29630 bool last_arg_constant = false;
29631 const struct insn_data_d *insn_p = &insn_data[icode];
29632 enum machine_mode tmode = insn_p->operand[0].mode;
29633 enum { load, store } klass;
29634
29635 switch ((enum ix86_builtin_func_type) d->flag)
29636 {
29637 case VOID_FTYPE_VOID:
29638 if (icode == CODE_FOR_avx_vzeroupper)
29639 target = GEN_INT (vzeroupper_intrinsic);
29640 emit_insn (GEN_FCN (icode) (target));
29641 return 0;
29642 case VOID_FTYPE_UINT64:
29643 case VOID_FTYPE_UNSIGNED:
29644 nargs = 0;
29645 klass = store;
29646 memory = 0;
29647 break;
29648
29649 case INT_FTYPE_VOID:
29650 case UINT64_FTYPE_VOID:
29651 case UNSIGNED_FTYPE_VOID:
29652 nargs = 0;
29653 klass = load;
29654 memory = 0;
29655 break;
29656 case UINT64_FTYPE_PUNSIGNED:
29657 case V2DI_FTYPE_PV2DI:
29658 case V4DI_FTYPE_PV4DI:
29659 case V32QI_FTYPE_PCCHAR:
29660 case V16QI_FTYPE_PCCHAR:
29661 case V8SF_FTYPE_PCV4SF:
29662 case V8SF_FTYPE_PCFLOAT:
29663 case V4SF_FTYPE_PCFLOAT:
29664 case V4DF_FTYPE_PCV2DF:
29665 case V4DF_FTYPE_PCDOUBLE:
29666 case V2DF_FTYPE_PCDOUBLE:
29667 case VOID_FTYPE_PVOID:
29668 nargs = 1;
29669 klass = load;
29670 memory = 0;
29671 break;
29672 case VOID_FTYPE_PV2SF_V4SF:
29673 case VOID_FTYPE_PV4DI_V4DI:
29674 case VOID_FTYPE_PV2DI_V2DI:
29675 case VOID_FTYPE_PCHAR_V32QI:
29676 case VOID_FTYPE_PCHAR_V16QI:
29677 case VOID_FTYPE_PFLOAT_V8SF:
29678 case VOID_FTYPE_PFLOAT_V4SF:
29679 case VOID_FTYPE_PDOUBLE_V4DF:
29680 case VOID_FTYPE_PDOUBLE_V2DF:
29681 case VOID_FTYPE_PLONGLONG_LONGLONG:
29682 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29683 case VOID_FTYPE_PINT_INT:
29684 nargs = 1;
29685 klass = store;
29686 /* Reserve memory operand for target. */
29687 memory = ARRAY_SIZE (args);
29688 break;
29689 case V4SF_FTYPE_V4SF_PCV2SF:
29690 case V2DF_FTYPE_V2DF_PCDOUBLE:
29691 nargs = 2;
29692 klass = load;
29693 memory = 1;
29694 break;
29695 case V8SF_FTYPE_PCV8SF_V8SI:
29696 case V4DF_FTYPE_PCV4DF_V4DI:
29697 case V4SF_FTYPE_PCV4SF_V4SI:
29698 case V2DF_FTYPE_PCV2DF_V2DI:
29699 case V8SI_FTYPE_PCV8SI_V8SI:
29700 case V4DI_FTYPE_PCV4DI_V4DI:
29701 case V4SI_FTYPE_PCV4SI_V4SI:
29702 case V2DI_FTYPE_PCV2DI_V2DI:
29703 nargs = 2;
29704 klass = load;
29705 memory = 0;
29706 break;
29707 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29708 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29709 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29710 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29711 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29712 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29713 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29714 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29715 nargs = 2;
29716 klass = store;
29717 /* Reserve memory operand for target. */
29718 memory = ARRAY_SIZE (args);
29719 break;
29720 case VOID_FTYPE_UINT_UINT_UINT:
29721 case VOID_FTYPE_UINT64_UINT_UINT:
29722 case UCHAR_FTYPE_UINT_UINT_UINT:
29723 case UCHAR_FTYPE_UINT64_UINT_UINT:
29724 nargs = 3;
29725 klass = load;
29726 memory = ARRAY_SIZE (args);
29727 last_arg_constant = true;
29728 break;
29729 default:
29730 gcc_unreachable ();
29731 }
29732
29733 gcc_assert (nargs <= ARRAY_SIZE (args));
29734
29735 if (klass == store)
29736 {
29737 arg = CALL_EXPR_ARG (exp, 0);
29738 op = expand_normal (arg);
29739 gcc_assert (target == 0);
29740 if (memory)
29741 {
29742 if (GET_MODE (op) != Pmode)
29743 op = convert_to_mode (Pmode, op, 1);
29744 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29745 }
29746 else
29747 target = force_reg (tmode, op);
29748 arg_adjust = 1;
29749 }
29750 else
29751 {
29752 arg_adjust = 0;
29753 if (optimize
29754 || target == 0
29755 || !register_operand (target, tmode)
29756 || GET_MODE (target) != tmode)
29757 target = gen_reg_rtx (tmode);
29758 }
29759
29760 for (i = 0; i < nargs; i++)
29761 {
29762 enum machine_mode mode = insn_p->operand[i + 1].mode;
29763 bool match;
29764
29765 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29766 op = expand_normal (arg);
29767 match = insn_p->operand[i + 1].predicate (op, mode);
29768
29769 if (last_arg_constant && (i + 1) == nargs)
29770 {
29771 if (!match)
29772 {
29773 if (icode == CODE_FOR_lwp_lwpvalsi3
29774 || icode == CODE_FOR_lwp_lwpinssi3
29775 || icode == CODE_FOR_lwp_lwpvaldi3
29776 || icode == CODE_FOR_lwp_lwpinsdi3)
29777 error ("the last argument must be a 32-bit immediate");
29778 else
29779 error ("the last argument must be an 8-bit immediate");
29780 return const0_rtx;
29781 }
29782 }
29783 else
29784 {
29785 if (i == memory)
29786 {
29787 /* This must be the memory operand. */
29788 if (GET_MODE (op) != Pmode)
29789 op = convert_to_mode (Pmode, op, 1);
29790 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29791 gcc_assert (GET_MODE (op) == mode
29792 || GET_MODE (op) == VOIDmode);
29793 }
29794 else
29795 {
29796 /* This must be a register. */
29797 if (VECTOR_MODE_P (mode))
29798 op = safe_vector_operand (op, mode);
29799
29800 gcc_assert (GET_MODE (op) == mode
29801 || GET_MODE (op) == VOIDmode);
29802 op = copy_to_mode_reg (mode, op);
29803 }
29804 }
29805
29806 args[i].op = op;
29807 args[i].mode = mode;
29808 }
29809
29810 switch (nargs)
29811 {
29812 case 0:
29813 pat = GEN_FCN (icode) (target);
29814 break;
29815 case 1:
29816 pat = GEN_FCN (icode) (target, args[0].op);
29817 break;
29818 case 2:
29819 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29820 break;
29821 case 3:
29822 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29823 break;
29824 default:
29825 gcc_unreachable ();
29826 }
29827
29828 if (! pat)
29829 return 0;
29830 emit_insn (pat);
29831 return klass == store ? 0 : target;
29832 }
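/* A rough sketch of the store class (the builtin name is an assumption,
   not checked against bdesc_special_args here): a non-temporal store
   such as

     __builtin_ia32_movntdq ((__v2di *) p, (__v2di) v);

   is classified as VOID_FTYPE_PV2DI_V2DI above, so klass == store, the
   pointer argument becomes the MEM target, and the routine returns 0
   because the insn produces no value.  */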
29833
29834 /* Return the integer constant in ARG. Constrain it to be in the range
29835 of the subparts of VEC_TYPE; issue an error if not. */
29836
29837 static int
29838 get_element_number (tree vec_type, tree arg)
29839 {
29840 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29841
29842 if (!host_integerp (arg, 1)
29843 || (elt = tree_low_cst (arg, 1), elt > max))
29844 {
29845 error ("selector must be an integer constant in the range 0..%wi", max);
29846 return 0;
29847 }
29848
29849 return elt;
29850 }
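/* For example, for a vector type with four subparts (say V4SF) MAX is 3:
   a constant selector in 0..3 is returned unchanged, while a selector of
   4 or a non-constant argument triggers the error above and yields 0.  */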
29851
29852 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29853 ix86_expand_vector_init. We DO have language-level syntax for this, in
29854 the form of (type){ init-list }. Except that since we can't place emms
29855 instructions from inside the compiler, we can't allow the use of MMX
29856 registers unless the user explicitly asks for it. So we do *not* define
29857 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29858 we have builtins invoked by mmintrin.h that give us license to emit
29859 these sorts of instructions. */
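/* For instance, mmintrin.h wraps the V2SI initializer roughly like this
   (a sketch; the exact wrapper in the header may differ slightly):

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     }

   and that builtin reaches ix86_expand_vec_init_builtin below via
   IX86_BUILTIN_VEC_INIT_V2SI.  */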
29860
29861 static rtx
29862 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29863 {
29864 enum machine_mode tmode = TYPE_MODE (type);
29865 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29866 int i, n_elt = GET_MODE_NUNITS (tmode);
29867 rtvec v = rtvec_alloc (n_elt);
29868
29869 gcc_assert (VECTOR_MODE_P (tmode));
29870 gcc_assert (call_expr_nargs (exp) == n_elt);
29871
29872 for (i = 0; i < n_elt; ++i)
29873 {
29874 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29875 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29876 }
29877
29878 if (!target || !register_operand (target, tmode))
29879 target = gen_reg_rtx (tmode);
29880
29881 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29882 return target;
29883 }
29884
29885 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29886 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29887 had a language-level syntax for referencing vector elements. */
29888
29889 static rtx
29890 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29891 {
29892 enum machine_mode tmode, mode0;
29893 tree arg0, arg1;
29894 int elt;
29895 rtx op0;
29896
29897 arg0 = CALL_EXPR_ARG (exp, 0);
29898 arg1 = CALL_EXPR_ARG (exp, 1);
29899
29900 op0 = expand_normal (arg0);
29901 elt = get_element_number (TREE_TYPE (arg0), arg1);
29902
29903 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29904 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29905 gcc_assert (VECTOR_MODE_P (mode0));
29906
29907 op0 = force_reg (mode0, op0);
29908
29909 if (optimize || !target || !register_operand (target, tmode))
29910 target = gen_reg_rtx (tmode);
29911
29912 ix86_expand_vector_extract (true, target, op0, elt);
29913
29914 return target;
29915 }
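/* A sketch of a caller (the wrapper name is hypothetical; it merely
   assumes the usual vec_ext builtin spelling):

     extern __inline int
     my_extract_epi32 (__m128i __X, const int __N)
     {
       return __builtin_ia32_vec_ext_v4si ((__v4si) __X, __N);
     }

   The selector __N must be a constant in range; get_element_number
   above enforces that before the extraction is expanded.  */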
29916
29917 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29918 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29919 a language-level syntax for referencing vector elements. */
29920
29921 static rtx
29922 ix86_expand_vec_set_builtin (tree exp)
29923 {
29924 enum machine_mode tmode, mode1;
29925 tree arg0, arg1, arg2;
29926 int elt;
29927 rtx op0, op1, target;
29928
29929 arg0 = CALL_EXPR_ARG (exp, 0);
29930 arg1 = CALL_EXPR_ARG (exp, 1);
29931 arg2 = CALL_EXPR_ARG (exp, 2);
29932
29933 tmode = TYPE_MODE (TREE_TYPE (arg0));
29934 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29935 gcc_assert (VECTOR_MODE_P (tmode));
29936
29937 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29938 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29939 elt = get_element_number (TREE_TYPE (arg0), arg2);
29940
29941 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29942 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29943
29944 op0 = force_reg (tmode, op0);
29945 op1 = force_reg (mode1, op1);
29946
29947 /* OP0 is the source of these builtin functions and shouldn't be
29948 modified. Create a copy, use it and return it as target. */
29949 target = gen_reg_rtx (tmode);
29950 emit_move_insn (target, op0);
29951 ix86_expand_vector_set (true, target, op1, elt);
29952
29953 return target;
29954 }
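/* Correspondingly, a vec_set builtin call such as (the wrapper name is
   hypothetical):

     extern __inline __m128i
     my_insert_epi16 (__m128i __X, int __D, const int __N)
     {
       return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __X, __D, __N);
     }

   goes through IX86_BUILTIN_VEC_SET_V8HI; the routine above copies __X
   into a fresh register and returns that copy, so the source operand
   itself is never clobbered.  */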
29955
29956 /* Expand an expression EXP that calls a built-in function,
29957 with result going to TARGET if that's convenient
29958 (and in mode MODE if that's convenient).
29959 SUBTARGET may be used as the target for computing one of EXP's operands.
29960 IGNORE is nonzero if the value is to be ignored. */
29961
29962 static rtx
29963 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29964 enum machine_mode mode ATTRIBUTE_UNUSED,
29965 int ignore ATTRIBUTE_UNUSED)
29966 {
29967 const struct builtin_description *d;
29968 size_t i;
29969 enum insn_code icode;
29970 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29971 tree arg0, arg1, arg2, arg3, arg4;
29972 rtx op0, op1, op2, op3, op4, pat;
29973 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29974 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29975
29976 /* For CPU builtins that can be folded, fold first and expand the fold. */
29977 switch (fcode)
29978 {
29979 case IX86_BUILTIN_CPU_INIT:
29980 {
29981 /* Make it call __cpu_indicator_init in libgcc. */
29982 tree call_expr, fndecl, type;
29983 type = build_function_type_list (integer_type_node, NULL_TREE);
29984 fndecl = build_fn_decl ("__cpu_indicator_init", type);
29985 call_expr = build_call_expr (fndecl, 0);
29986 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
29987 }
29988 case IX86_BUILTIN_CPU_IS:
29989 case IX86_BUILTIN_CPU_SUPPORTS:
29990 {
29991 tree arg0 = CALL_EXPR_ARG (exp, 0);
29992 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
29993 gcc_assert (fold_expr != NULL_TREE);
29994 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
29995 }
29996 }
29997
29998 /* Determine whether the builtin function is available under the current ISA.
29999 Originally the builtin was not created if it wasn't applicable to the
30000 current ISA based on the command line switches. With function specific
30001 options, we need to check in the context of the function making the call
30002 whether it is supported. */
30003 if (ix86_builtins_isa[fcode].isa
30004 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
30005 {
30006 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
30007 NULL, (enum fpmath_unit) 0, false);
30008
30009 if (!opts)
30010 error ("%qE needs unknown isa option", fndecl);
30011 else
30012 {
30013 gcc_assert (opts != NULL);
30014 error ("%qE needs isa option %s", fndecl, opts);
30015 free (opts);
30016 }
30017 return const0_rtx;
30018 }
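/* As a sketch of the effect (the exact option spelling comes from
   ix86_target_string and is not reproduced here): calling a 256-bit AVX
   builtin from a function compiled without AVX enabled is rejected at
   expansion time with the "%qE needs isa option %s" error above rather
   than silently emitting an unsupported instruction.  */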
30019
30020 switch (fcode)
30021 {
30022 case IX86_BUILTIN_MASKMOVQ:
30023 case IX86_BUILTIN_MASKMOVDQU:
30024 icode = (fcode == IX86_BUILTIN_MASKMOVQ
30025 ? CODE_FOR_mmx_maskmovq
30026 : CODE_FOR_sse2_maskmovdqu);
30027 /* Note the arg order is different from the operand order. */
30028 arg1 = CALL_EXPR_ARG (exp, 0);
30029 arg2 = CALL_EXPR_ARG (exp, 1);
30030 arg0 = CALL_EXPR_ARG (exp, 2);
30031 op0 = expand_normal (arg0);
30032 op1 = expand_normal (arg1);
30033 op2 = expand_normal (arg2);
30034 mode0 = insn_data[icode].operand[0].mode;
30035 mode1 = insn_data[icode].operand[1].mode;
30036 mode2 = insn_data[icode].operand[2].mode;
30037
30038 if (GET_MODE (op0) != Pmode)
30039 op0 = convert_to_mode (Pmode, op0, 1);
30040 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
30041
30042 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30043 op0 = copy_to_mode_reg (mode0, op0);
30044 if (!insn_data[icode].operand[1].predicate (op1, mode1))
30045 op1 = copy_to_mode_reg (mode1, op1);
30046 if (!insn_data[icode].operand[2].predicate (op2, mode2))
30047 op2 = copy_to_mode_reg (mode2, op2);
30048 pat = GEN_FCN (icode) (op0, op1, op2);
30049 if (! pat)
30050 return 0;
30051 emit_insn (pat);
30052 return 0;
30053
30054 case IX86_BUILTIN_LDMXCSR:
30055 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
30056 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30057 emit_move_insn (target, op0);
30058 emit_insn (gen_sse_ldmxcsr (target));
30059 return 0;
30060
30061 case IX86_BUILTIN_STMXCSR:
30062 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30063 emit_insn (gen_sse_stmxcsr (target));
30064 return copy_to_mode_reg (SImode, target);
30065
30066 case IX86_BUILTIN_CLFLUSH:
30067 arg0 = CALL_EXPR_ARG (exp, 0);
30068 op0 = expand_normal (arg0);
30069 icode = CODE_FOR_sse2_clflush;
30070 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30071 {
30072 if (GET_MODE (op0) != Pmode)
30073 op0 = convert_to_mode (Pmode, op0, 1);
30074 op0 = force_reg (Pmode, op0);
30075 }
30076
30077 emit_insn (gen_sse2_clflush (op0));
30078 return 0;
30079
30080 case IX86_BUILTIN_MONITOR:
30081 arg0 = CALL_EXPR_ARG (exp, 0);
30082 arg1 = CALL_EXPR_ARG (exp, 1);
30083 arg2 = CALL_EXPR_ARG (exp, 2);
30084 op0 = expand_normal (arg0);
30085 op1 = expand_normal (arg1);
30086 op2 = expand_normal (arg2);
30087 if (!REG_P (op0))
30088 {
30089 if (GET_MODE (op0) != Pmode)
30090 op0 = convert_to_mode (Pmode, op0, 1);
30091 op0 = force_reg (Pmode, op0);
30092 }
30093 if (!REG_P (op1))
30094 op1 = copy_to_mode_reg (SImode, op1);
30095 if (!REG_P (op2))
30096 op2 = copy_to_mode_reg (SImode, op2);
30097 emit_insn (ix86_gen_monitor (op0, op1, op2));
30098 return 0;
30099
30100 case IX86_BUILTIN_MWAIT:
30101 arg0 = CALL_EXPR_ARG (exp, 0);
30102 arg1 = CALL_EXPR_ARG (exp, 1);
30103 op0 = expand_normal (arg0);
30104 op1 = expand_normal (arg1);
30105 if (!REG_P (op0))
30106 op0 = copy_to_mode_reg (SImode, op0);
30107 if (!REG_P (op1))
30108 op1 = copy_to_mode_reg (SImode, op1);
30109 emit_insn (gen_sse3_mwait (op0, op1));
30110 return 0;
30111
30112 case IX86_BUILTIN_VEC_INIT_V2SI:
30113 case IX86_BUILTIN_VEC_INIT_V4HI:
30114 case IX86_BUILTIN_VEC_INIT_V8QI:
30115 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
30116
30117 case IX86_BUILTIN_VEC_EXT_V2DF:
30118 case IX86_BUILTIN_VEC_EXT_V2DI:
30119 case IX86_BUILTIN_VEC_EXT_V4SF:
30120 case IX86_BUILTIN_VEC_EXT_V4SI:
30121 case IX86_BUILTIN_VEC_EXT_V8HI:
30122 case IX86_BUILTIN_VEC_EXT_V2SI:
30123 case IX86_BUILTIN_VEC_EXT_V4HI:
30124 case IX86_BUILTIN_VEC_EXT_V16QI:
30125 return ix86_expand_vec_ext_builtin (exp, target);
30126
30127 case IX86_BUILTIN_VEC_SET_V2DI:
30128 case IX86_BUILTIN_VEC_SET_V4SF:
30129 case IX86_BUILTIN_VEC_SET_V4SI:
30130 case IX86_BUILTIN_VEC_SET_V8HI:
30131 case IX86_BUILTIN_VEC_SET_V4HI:
30132 case IX86_BUILTIN_VEC_SET_V16QI:
30133 return ix86_expand_vec_set_builtin (exp);
30134
30135 case IX86_BUILTIN_INFQ:
30136 case IX86_BUILTIN_HUGE_VALQ:
30137 {
30138 REAL_VALUE_TYPE inf;
30139 rtx tmp;
30140
30141 real_inf (&inf);
30142 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
30143
30144 tmp = validize_mem (force_const_mem (mode, tmp));
30145
30146 if (target == 0)
30147 target = gen_reg_rtx (mode);
30148
30149 emit_move_insn (target, tmp);
30150 return target;
30151 }
30152
30153 case IX86_BUILTIN_LLWPCB:
30154 arg0 = CALL_EXPR_ARG (exp, 0);
30155 op0 = expand_normal (arg0);
30156 icode = CODE_FOR_lwp_llwpcb;
30157 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30158 {
30159 if (GET_MODE (op0) != Pmode)
30160 op0 = convert_to_mode (Pmode, op0, 1);
30161 op0 = force_reg (Pmode, op0);
30162 }
30163 emit_insn (gen_lwp_llwpcb (op0));
30164 return 0;
30165
30166 case IX86_BUILTIN_SLWPCB:
30167 icode = CODE_FOR_lwp_slwpcb;
30168 if (!target
30169 || !insn_data[icode].operand[0].predicate (target, Pmode))
30170 target = gen_reg_rtx (Pmode);
30171 emit_insn (gen_lwp_slwpcb (target));
30172 return target;
30173
30174 case IX86_BUILTIN_BEXTRI32:
30175 case IX86_BUILTIN_BEXTRI64:
30176 arg0 = CALL_EXPR_ARG (exp, 0);
30177 arg1 = CALL_EXPR_ARG (exp, 1);
30178 op0 = expand_normal (arg0);
30179 op1 = expand_normal (arg1);
30180 icode = (fcode == IX86_BUILTIN_BEXTRI32
30181 ? CODE_FOR_tbm_bextri_si
30182 : CODE_FOR_tbm_bextri_di);
30183 if (!CONST_INT_P (op1))
30184 {
30185 error ("last argument must be an immediate");
30186 return const0_rtx;
30187 }
30188 else
30189 {
30190 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
30191 unsigned char lsb_index = INTVAL (op1) & 0xFF;
30192 op1 = GEN_INT (length);
30193 op2 = GEN_INT (lsb_index);
30194 pat = GEN_FCN (icode) (target, op0, op1, op2);
30195 if (pat)
30196 emit_insn (pat);
30197 return target;
30198 }
30199
30200 case IX86_BUILTIN_RDRAND16_STEP:
30201 icode = CODE_FOR_rdrandhi_1;
30202 mode0 = HImode;
30203 goto rdrand_step;
30204
30205 case IX86_BUILTIN_RDRAND32_STEP:
30206 icode = CODE_FOR_rdrandsi_1;
30207 mode0 = SImode;
30208 goto rdrand_step;
30209
30210 case IX86_BUILTIN_RDRAND64_STEP:
30211 icode = CODE_FOR_rdranddi_1;
30212 mode0 = DImode;
30213
30214 rdrand_step:
30215 op0 = gen_reg_rtx (mode0);
30216 emit_insn (GEN_FCN (icode) (op0));
30217
30218 arg0 = CALL_EXPR_ARG (exp, 0);
30219 op1 = expand_normal (arg0);
30220 if (!address_operand (op1, VOIDmode))
30221 {
30222 op1 = convert_memory_address (Pmode, op1);
30223 op1 = copy_addr_to_reg (op1);
30224 }
30225 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30226
30227 op1 = gen_reg_rtx (SImode);
30228 emit_move_insn (op1, CONST1_RTX (SImode));
30229
30230 /* Emit SImode conditional move. */
30231 if (mode0 == HImode)
30232 {
30233 op2 = gen_reg_rtx (SImode);
30234 emit_insn (gen_zero_extendhisi2 (op2, op0));
30235 }
30236 else if (mode0 == SImode)
30237 op2 = op0;
30238 else
30239 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30240
30241 if (target == 0)
30242 target = gen_reg_rtx (SImode);
30243
30244 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30245 const0_rtx);
30246 emit_insn (gen_rtx_SET (VOIDmode, target,
30247 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30248 return target;
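/* In other words, for a call like (the intrinsic wrapper is an
   assumption about immintrin.h, not quoted from it):

     unsigned int r;
     int ok = _rdrand32_step (&r);

   the expansion above stores the hardware random value through the
   pointer and, via the CCCmode conditional move, yields 1 when the
   carry flag reports success and the zero-extended (failing) result
   otherwise.  */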
30249
30250 case IX86_BUILTIN_GATHERSIV2DF:
30251 icode = CODE_FOR_avx2_gathersiv2df;
30252 goto gather_gen;
30253 case IX86_BUILTIN_GATHERSIV4DF:
30254 icode = CODE_FOR_avx2_gathersiv4df;
30255 goto gather_gen;
30256 case IX86_BUILTIN_GATHERDIV2DF:
30257 icode = CODE_FOR_avx2_gatherdiv2df;
30258 goto gather_gen;
30259 case IX86_BUILTIN_GATHERDIV4DF:
30260 icode = CODE_FOR_avx2_gatherdiv4df;
30261 goto gather_gen;
30262 case IX86_BUILTIN_GATHERSIV4SF:
30263 icode = CODE_FOR_avx2_gathersiv4sf;
30264 goto gather_gen;
30265 case IX86_BUILTIN_GATHERSIV8SF:
30266 icode = CODE_FOR_avx2_gathersiv8sf;
30267 goto gather_gen;
30268 case IX86_BUILTIN_GATHERDIV4SF:
30269 icode = CODE_FOR_avx2_gatherdiv4sf;
30270 goto gather_gen;
30271 case IX86_BUILTIN_GATHERDIV8SF:
30272 icode = CODE_FOR_avx2_gatherdiv8sf;
30273 goto gather_gen;
30274 case IX86_BUILTIN_GATHERSIV2DI:
30275 icode = CODE_FOR_avx2_gathersiv2di;
30276 goto gather_gen;
30277 case IX86_BUILTIN_GATHERSIV4DI:
30278 icode = CODE_FOR_avx2_gathersiv4di;
30279 goto gather_gen;
30280 case IX86_BUILTIN_GATHERDIV2DI:
30281 icode = CODE_FOR_avx2_gatherdiv2di;
30282 goto gather_gen;
30283 case IX86_BUILTIN_GATHERDIV4DI:
30284 icode = CODE_FOR_avx2_gatherdiv4di;
30285 goto gather_gen;
30286 case IX86_BUILTIN_GATHERSIV4SI:
30287 icode = CODE_FOR_avx2_gathersiv4si;
30288 goto gather_gen;
30289 case IX86_BUILTIN_GATHERSIV8SI:
30290 icode = CODE_FOR_avx2_gathersiv8si;
30291 goto gather_gen;
30292 case IX86_BUILTIN_GATHERDIV4SI:
30293 icode = CODE_FOR_avx2_gatherdiv4si;
30294 goto gather_gen;
30295 case IX86_BUILTIN_GATHERDIV8SI:
30296 icode = CODE_FOR_avx2_gatherdiv8si;
30297 goto gather_gen;
30298 case IX86_BUILTIN_GATHERALTSIV4DF:
30299 icode = CODE_FOR_avx2_gathersiv4df;
30300 goto gather_gen;
30301 case IX86_BUILTIN_GATHERALTDIV8SF:
30302 icode = CODE_FOR_avx2_gatherdiv8sf;
30303 goto gather_gen;
30304 case IX86_BUILTIN_GATHERALTSIV4DI:
30305 icode = CODE_FOR_avx2_gathersiv4di;
30306 goto gather_gen;
30307 case IX86_BUILTIN_GATHERALTDIV8SI:
30308 icode = CODE_FOR_avx2_gatherdiv8si;
30309 goto gather_gen;
30310
30311 gather_gen:
30312 arg0 = CALL_EXPR_ARG (exp, 0);
30313 arg1 = CALL_EXPR_ARG (exp, 1);
30314 arg2 = CALL_EXPR_ARG (exp, 2);
30315 arg3 = CALL_EXPR_ARG (exp, 3);
30316 arg4 = CALL_EXPR_ARG (exp, 4);
30317 op0 = expand_normal (arg0);
30318 op1 = expand_normal (arg1);
30319 op2 = expand_normal (arg2);
30320 op3 = expand_normal (arg3);
30321 op4 = expand_normal (arg4);
30322 /* Note the arg order is different from the operand order. */
30323 mode0 = insn_data[icode].operand[1].mode;
30324 mode2 = insn_data[icode].operand[3].mode;
30325 mode3 = insn_data[icode].operand[4].mode;
30326 mode4 = insn_data[icode].operand[5].mode;
30327
30328 if (target == NULL_RTX
30329 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30330 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30331 else
30332 subtarget = target;
30333
30334 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30335 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30336 {
30337 rtx half = gen_reg_rtx (V4SImode);
30338 if (!nonimmediate_operand (op2, V8SImode))
30339 op2 = copy_to_mode_reg (V8SImode, op2);
30340 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30341 op2 = half;
30342 }
30343 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30344 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30345 {
30346 rtx (*gen) (rtx, rtx);
30347 rtx half = gen_reg_rtx (mode0);
30348 if (mode0 == V4SFmode)
30349 gen = gen_vec_extract_lo_v8sf;
30350 else
30351 gen = gen_vec_extract_lo_v8si;
30352 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30353 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30354 emit_insn (gen (half, op0));
30355 op0 = half;
30356 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30357 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30358 emit_insn (gen (half, op3));
30359 op3 = half;
30360 }
30361
30362 /* Force the memory operand to be addressed through a base register
30363 here.  We don't want to do this for the memory operands of other
30364 builtin functions. */
30365 if (GET_MODE (op1) != Pmode)
30366 op1 = convert_to_mode (Pmode, op1, 1);
30367 op1 = force_reg (Pmode, op1);
30368
30369 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30370 op0 = copy_to_mode_reg (mode0, op0);
30371 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30372 op1 = copy_to_mode_reg (Pmode, op1);
30373 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30374 op2 = copy_to_mode_reg (mode2, op2);
30375 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30376 op3 = copy_to_mode_reg (mode3, op3);
30377 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30378 {
30379 error ("the last argument must be scale 1, 2, 4 or 8");
30380 return const0_rtx;
30381 }
30382
30383 /* Optimize. If mask is known to have all high bits set,
30384 replace op0 with pc_rtx to signal that the instruction
30385 overwrites the whole destination and doesn't use its
30386 previous contents. */
30387 if (optimize)
30388 {
30389 if (TREE_CODE (arg3) == VECTOR_CST)
30390 {
30391 unsigned int negative = 0;
30392 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
30393 {
30394 tree cst = VECTOR_CST_ELT (arg3, i);
30395 if (TREE_CODE (cst) == INTEGER_CST
30396 && tree_int_cst_sign_bit (cst))
30397 negative++;
30398 else if (TREE_CODE (cst) == REAL_CST
30399 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30400 negative++;
30401 }
30402 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30403 op0 = pc_rtx;
30404 }
30405 else if (TREE_CODE (arg3) == SSA_NAME)
30406 {
30407 /* Recognize also when mask is like:
30408 __v2df src = _mm_setzero_pd ();
30409 __v2df mask = _mm_cmpeq_pd (src, src);
30410 or
30411 __v8sf src = _mm256_setzero_ps ();
30412 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30413 as that is a cheaper way to load all ones into
30414 a register than having to load a constant from
30415 memory. */
30416 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30417 if (is_gimple_call (def_stmt))
30418 {
30419 tree fndecl = gimple_call_fndecl (def_stmt);
30420 if (fndecl
30421 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30422 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30423 {
30424 case IX86_BUILTIN_CMPPD:
30425 case IX86_BUILTIN_CMPPS:
30426 case IX86_BUILTIN_CMPPD256:
30427 case IX86_BUILTIN_CMPPS256:
30428 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30429 break;
30430 /* FALLTHRU */
30431 case IX86_BUILTIN_CMPEQPD:
30432 case IX86_BUILTIN_CMPEQPS:
30433 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30434 && initializer_zerop (gimple_call_arg (def_stmt,
30435 1)))
30436 op0 = pc_rtx;
30437 break;
30438 default:
30439 break;
30440 }
30441 }
30442 }
30443 }
30444
30445 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30446 if (! pat)
30447 return const0_rtx;
30448 emit_insn (pat);
30449
30450 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30451 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30452 {
30453 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30454 ? V4SFmode : V4SImode;
30455 if (target == NULL_RTX)
30456 target = gen_reg_rtx (tmode);
30457 if (tmode == V4SFmode)
30458 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30459 else
30460 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30461 }
30462 else
30463 target = subtarget;
30464
30465 return target;
30466
30467 case IX86_BUILTIN_XABORT:
30468 icode = CODE_FOR_xabort;
30469 arg0 = CALL_EXPR_ARG (exp, 0);
30470 op0 = expand_normal (arg0);
30471 mode0 = insn_data[icode].operand[0].mode;
30472 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30473 {
30474 error ("the argument to xabort must be an 8-bit immediate");
30475 return const0_rtx;
30476 }
30477 emit_insn (gen_xabort (op0));
30478 return 0;
30479
30480 default:
30481 break;
30482 }
30483
30484 for (i = 0, d = bdesc_special_args;
30485 i < ARRAY_SIZE (bdesc_special_args);
30486 i++, d++)
30487 if (d->code == fcode)
30488 return ix86_expand_special_args_builtin (d, exp, target);
30489
30490 for (i = 0, d = bdesc_args;
30491 i < ARRAY_SIZE (bdesc_args);
30492 i++, d++)
30493 if (d->code == fcode)
30494 switch (fcode)
30495 {
30496 case IX86_BUILTIN_FABSQ:
30497 case IX86_BUILTIN_COPYSIGNQ:
30498 if (!TARGET_SSE)
30499 /* Emit a normal call if SSE isn't available. */
30500 return expand_call (exp, target, ignore);
30501 default:
30502 return ix86_expand_args_builtin (d, exp, target);
30503 }
30504
30505 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30506 if (d->code == fcode)
30507 return ix86_expand_sse_comi (d, exp, target);
30508
30509 for (i = 0, d = bdesc_pcmpestr;
30510 i < ARRAY_SIZE (bdesc_pcmpestr);
30511 i++, d++)
30512 if (d->code == fcode)
30513 return ix86_expand_sse_pcmpestr (d, exp, target);
30514
30515 for (i = 0, d = bdesc_pcmpistr;
30516 i < ARRAY_SIZE (bdesc_pcmpistr);
30517 i++, d++)
30518 if (d->code == fcode)
30519 return ix86_expand_sse_pcmpistr (d, exp, target);
30520
30521 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30522 if (d->code == fcode)
30523 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30524 (enum ix86_builtin_func_type)
30525 d->flag, d->comparison);
30526
30527 gcc_unreachable ();
30528 }
30529
30530 /* Returns a function decl for a vectorized version of the builtin function
30531 FNDECL, with result vector type TYPE_OUT and argument vector type TYPE_IN,
30532 or NULL_TREE if it is not available. */
30533
30534 static tree
30535 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30536 tree type_in)
30537 {
30538 enum machine_mode in_mode, out_mode;
30539 int in_n, out_n;
30540 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30541
30542 if (TREE_CODE (type_out) != VECTOR_TYPE
30543 || TREE_CODE (type_in) != VECTOR_TYPE
30544 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30545 return NULL_TREE;
30546
30547 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30548 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30549 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30550 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30551
30552 switch (fn)
30553 {
30554 case BUILT_IN_SQRT:
30555 if (out_mode == DFmode && in_mode == DFmode)
30556 {
30557 if (out_n == 2 && in_n == 2)
30558 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30559 else if (out_n == 4 && in_n == 4)
30560 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30561 }
30562 break;
30563
30564 case BUILT_IN_SQRTF:
30565 if (out_mode == SFmode && in_mode == SFmode)
30566 {
30567 if (out_n == 4 && in_n == 4)
30568 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30569 else if (out_n == 8 && in_n == 8)
30570 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30571 }
30572 break;
30573
30574 case BUILT_IN_IFLOOR:
30575 case BUILT_IN_LFLOOR:
30576 case BUILT_IN_LLFLOOR:
30577 /* The round insn does not trap on denormals. */
30578 if (flag_trapping_math || !TARGET_ROUND)
30579 break;
30580
30581 if (out_mode == SImode && in_mode == DFmode)
30582 {
30583 if (out_n == 4 && in_n == 2)
30584 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30585 else if (out_n == 8 && in_n == 4)
30586 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30587 }
30588 break;
30589
30590 case BUILT_IN_IFLOORF:
30591 case BUILT_IN_LFLOORF:
30592 case BUILT_IN_LLFLOORF:
30593 /* The round insn does not trap on denormals. */
30594 if (flag_trapping_math || !TARGET_ROUND)
30595 break;
30596
30597 if (out_mode == SImode && in_mode == SFmode)
30598 {
30599 if (out_n == 4 && in_n == 4)
30600 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30601 else if (out_n == 8 && in_n == 8)
30602 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30603 }
30604 break;
30605
30606 case BUILT_IN_ICEIL:
30607 case BUILT_IN_LCEIL:
30608 case BUILT_IN_LLCEIL:
30609 /* The round insn does not trap on denormals. */
30610 if (flag_trapping_math || !TARGET_ROUND)
30611 break;
30612
30613 if (out_mode == SImode && in_mode == DFmode)
30614 {
30615 if (out_n == 4 && in_n == 2)
30616 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30617 else if (out_n == 8 && in_n == 4)
30618 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30619 }
30620 break;
30621
30622 case BUILT_IN_ICEILF:
30623 case BUILT_IN_LCEILF:
30624 case BUILT_IN_LLCEILF:
30625 /* The round insn does not trap on denormals. */
30626 if (flag_trapping_math || !TARGET_ROUND)
30627 break;
30628
30629 if (out_mode == SImode && in_mode == SFmode)
30630 {
30631 if (out_n == 4 && in_n == 4)
30632 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30633 else if (out_n == 8 && in_n == 8)
30634 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30635 }
30636 break;
30637
30638 case BUILT_IN_IRINT:
30639 case BUILT_IN_LRINT:
30640 case BUILT_IN_LLRINT:
30641 if (out_mode == SImode && in_mode == DFmode)
30642 {
30643 if (out_n == 4 && in_n == 2)
30644 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30645 else if (out_n == 8 && in_n == 4)
30646 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30647 }
30648 break;
30649
30650 case BUILT_IN_IRINTF:
30651 case BUILT_IN_LRINTF:
30652 case BUILT_IN_LLRINTF:
30653 if (out_mode == SImode && in_mode == SFmode)
30654 {
30655 if (out_n == 4 && in_n == 4)
30656 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30657 else if (out_n == 8 && in_n == 8)
30658 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30659 }
30660 break;
30661
30662 case BUILT_IN_IROUND:
30663 case BUILT_IN_LROUND:
30664 case BUILT_IN_LLROUND:
30665 /* The round insn does not trap on denormals. */
30666 if (flag_trapping_math || !TARGET_ROUND)
30667 break;
30668
30669 if (out_mode == SImode && in_mode == DFmode)
30670 {
30671 if (out_n == 4 && in_n == 2)
30672 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30673 else if (out_n == 8 && in_n == 4)
30674 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30675 }
30676 break;
30677
30678 case BUILT_IN_IROUNDF:
30679 case BUILT_IN_LROUNDF:
30680 case BUILT_IN_LLROUNDF:
30681 /* The round insn does not trap on denormals. */
30682 if (flag_trapping_math || !TARGET_ROUND)
30683 break;
30684
30685 if (out_mode == SImode && in_mode == SFmode)
30686 {
30687 if (out_n == 4 && in_n == 4)
30688 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30689 else if (out_n == 8 && in_n == 8)
30690 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30691 }
30692 break;
30693
30694 case BUILT_IN_COPYSIGN:
30695 if (out_mode == DFmode && in_mode == DFmode)
30696 {
30697 if (out_n == 2 && in_n == 2)
30698 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30699 else if (out_n == 4 && in_n == 4)
30700 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30701 }
30702 break;
30703
30704 case BUILT_IN_COPYSIGNF:
30705 if (out_mode == SFmode && in_mode == SFmode)
30706 {
30707 if (out_n == 4 && in_n == 4)
30708 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30709 else if (out_n == 8 && in_n == 8)
30710 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30711 }
30712 break;
30713
30714 case BUILT_IN_FLOOR:
30715 /* The round insn does not trap on denormals. */
30716 if (flag_trapping_math || !TARGET_ROUND)
30717 break;
30718
30719 if (out_mode == DFmode && in_mode == DFmode)
30720 {
30721 if (out_n == 2 && in_n == 2)
30722 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30723 else if (out_n == 4 && in_n == 4)
30724 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30725 }
30726 break;
30727
30728 case BUILT_IN_FLOORF:
30729 /* The round insn does not trap on denormals. */
30730 if (flag_trapping_math || !TARGET_ROUND)
30731 break;
30732
30733 if (out_mode == SFmode && in_mode == SFmode)
30734 {
30735 if (out_n == 4 && in_n == 4)
30736 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30737 else if (out_n == 8 && in_n == 8)
30738 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30739 }
30740 break;
30741
30742 case BUILT_IN_CEIL:
30743 /* The round insn does not trap on denormals. */
30744 if (flag_trapping_math || !TARGET_ROUND)
30745 break;
30746
30747 if (out_mode == DFmode && in_mode == DFmode)
30748 {
30749 if (out_n == 2 && in_n == 2)
30750 return ix86_builtins[IX86_BUILTIN_CEILPD];
30751 else if (out_n == 4 && in_n == 4)
30752 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30753 }
30754 break;
30755
30756 case BUILT_IN_CEILF:
30757 /* The round insn does not trap on denormals. */
30758 if (flag_trapping_math || !TARGET_ROUND)
30759 break;
30760
30761 if (out_mode == SFmode && in_mode == SFmode)
30762 {
30763 if (out_n == 4 && in_n == 4)
30764 return ix86_builtins[IX86_BUILTIN_CEILPS];
30765 else if (out_n == 8 && in_n == 8)
30766 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30767 }
30768 break;
30769
30770 case BUILT_IN_TRUNC:
30771 /* The round insn does not trap on denormals. */
30772 if (flag_trapping_math || !TARGET_ROUND)
30773 break;
30774
30775 if (out_mode == DFmode && in_mode == DFmode)
30776 {
30777 if (out_n == 2 && in_n == 2)
30778 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30779 else if (out_n == 4 && in_n == 4)
30780 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30781 }
30782 break;
30783
30784 case BUILT_IN_TRUNCF:
30785 /* The round insn does not trap on denormals. */
30786 if (flag_trapping_math || !TARGET_ROUND)
30787 break;
30788
30789 if (out_mode == SFmode && in_mode == SFmode)
30790 {
30791 if (out_n == 4 && in_n == 4)
30792 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30793 else if (out_n == 8 && in_n == 8)
30794 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30795 }
30796 break;
30797
30798 case BUILT_IN_RINT:
30799 /* The round insn does not trap on denormals. */
30800 if (flag_trapping_math || !TARGET_ROUND)
30801 break;
30802
30803 if (out_mode == DFmode && in_mode == DFmode)
30804 {
30805 if (out_n == 2 && in_n == 2)
30806 return ix86_builtins[IX86_BUILTIN_RINTPD];
30807 else if (out_n == 4 && in_n == 4)
30808 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30809 }
30810 break;
30811
30812 case BUILT_IN_RINTF:
30813 /* The round insn does not trap on denormals. */
30814 if (flag_trapping_math || !TARGET_ROUND)
30815 break;
30816
30817 if (out_mode == SFmode && in_mode == SFmode)
30818 {
30819 if (out_n == 4 && in_n == 4)
30820 return ix86_builtins[IX86_BUILTIN_RINTPS];
30821 else if (out_n == 8 && in_n == 8)
30822 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30823 }
30824 break;
30825
30826 case BUILT_IN_ROUND:
30827 /* The round insn does not trap on denormals. */
30828 if (flag_trapping_math || !TARGET_ROUND)
30829 break;
30830
30831 if (out_mode == DFmode && in_mode == DFmode)
30832 {
30833 if (out_n == 2 && in_n == 2)
30834 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30835 else if (out_n == 4 && in_n == 4)
30836 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30837 }
30838 break;
30839
30840 case BUILT_IN_ROUNDF:
30841 /* The round insn does not trap on denormals. */
30842 if (flag_trapping_math || !TARGET_ROUND)
30843 break;
30844
30845 if (out_mode == SFmode && in_mode == SFmode)
30846 {
30847 if (out_n == 4 && in_n == 4)
30848 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30849 else if (out_n == 8 && in_n == 8)
30850 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30851 }
30852 break;
30853
30854 case BUILT_IN_FMA:
30855 if (out_mode == DFmode && in_mode == DFmode)
30856 {
30857 if (out_n == 2 && in_n == 2)
30858 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30859 if (out_n == 4 && in_n == 4)
30860 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30861 }
30862 break;
30863
30864 case BUILT_IN_FMAF:
30865 if (out_mode == SFmode && in_mode == SFmode)
30866 {
30867 if (out_n == 4 && in_n == 4)
30868 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30869 if (out_n == 8 && in_n == 8)
30870 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30871 }
30872 break;
30873
30874 default:
30875 break;
30876 }
30877
30878 /* Dispatch to a handler for a vectorization library. */
30879 if (ix86_veclib_handler)
30880 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30881 type_in);
30882
30883 return NULL_TREE;
30884 }
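/* For example (assuming -ffast-math, so flag_trapping_math is clear, on
   a target with TARGET_ROUND): a loop applying floor () to doubles in a
   2-element vector is vectorized through IX86_BUILTIN_FLOORPD, and the
   4-element case through IX86_BUILTIN_FLOORPD256, exactly as the
   BUILT_IN_FLOOR arm above selects by out_n/in_n.  */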
30885
30886 /* Handler for an SVML-style interface to
30887 a library with vectorized intrinsics. */
30888
30889 static tree
30890 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30891 {
30892 char name[20];
30893 tree fntype, new_fndecl, args;
30894 unsigned arity;
30895 const char *bname;
30896 enum machine_mode el_mode, in_mode;
30897 int n, in_n;
30898
30899 /* The SVML library is suitable for unsafe math only. */
30900 if (!flag_unsafe_math_optimizations)
30901 return NULL_TREE;
30902
30903 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30904 n = TYPE_VECTOR_SUBPARTS (type_out);
30905 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30906 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30907 if (el_mode != in_mode
30908 || n != in_n)
30909 return NULL_TREE;
30910
30911 switch (fn)
30912 {
30913 case BUILT_IN_EXP:
30914 case BUILT_IN_LOG:
30915 case BUILT_IN_LOG10:
30916 case BUILT_IN_POW:
30917 case BUILT_IN_TANH:
30918 case BUILT_IN_TAN:
30919 case BUILT_IN_ATAN:
30920 case BUILT_IN_ATAN2:
30921 case BUILT_IN_ATANH:
30922 case BUILT_IN_CBRT:
30923 case BUILT_IN_SINH:
30924 case BUILT_IN_SIN:
30925 case BUILT_IN_ASINH:
30926 case BUILT_IN_ASIN:
30927 case BUILT_IN_COSH:
30928 case BUILT_IN_COS:
30929 case BUILT_IN_ACOSH:
30930 case BUILT_IN_ACOS:
30931 if (el_mode != DFmode || n != 2)
30932 return NULL_TREE;
30933 break;
30934
30935 case BUILT_IN_EXPF:
30936 case BUILT_IN_LOGF:
30937 case BUILT_IN_LOG10F:
30938 case BUILT_IN_POWF:
30939 case BUILT_IN_TANHF:
30940 case BUILT_IN_TANF:
30941 case BUILT_IN_ATANF:
30942 case BUILT_IN_ATAN2F:
30943 case BUILT_IN_ATANHF:
30944 case BUILT_IN_CBRTF:
30945 case BUILT_IN_SINHF:
30946 case BUILT_IN_SINF:
30947 case BUILT_IN_ASINHF:
30948 case BUILT_IN_ASINF:
30949 case BUILT_IN_COSHF:
30950 case BUILT_IN_COSF:
30951 case BUILT_IN_ACOSHF:
30952 case BUILT_IN_ACOSF:
30953 if (el_mode != SFmode || n != 4)
30954 return NULL_TREE;
30955 break;
30956
30957 default:
30958 return NULL_TREE;
30959 }
30960
30961 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30962
30963 if (fn == BUILT_IN_LOGF)
30964 strcpy (name, "vmlsLn4");
30965 else if (fn == BUILT_IN_LOG)
30966 strcpy (name, "vmldLn2");
30967 else if (n == 4)
30968 {
30969 sprintf (name, "vmls%s", bname+10);
30970 name[strlen (name)-1] = '4';
30971 }
30972 else
30973 sprintf (name, "vmld%s2", bname+10);
30974
30975 /* Convert to uppercase. */
30976 name[4] &= ~0x20;
30977
30978 arity = 0;
30979 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30980 args;
30981 args = TREE_CHAIN (args))
30982 arity++;
30983
30984 if (arity == 1)
30985 fntype = build_function_type_list (type_out, type_in, NULL);
30986 else
30987 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30988
30989 /* Build a function declaration for the vectorized function. */
30990 new_fndecl = build_decl (BUILTINS_LOCATION,
30991 FUNCTION_DECL, get_identifier (name), fntype);
30992 TREE_PUBLIC (new_fndecl) = 1;
30993 DECL_EXTERNAL (new_fndecl) = 1;
30994 DECL_IS_NOVOPS (new_fndecl) = 1;
30995 TREE_READONLY (new_fndecl) = 1;
30996
30997 return new_fndecl;
30998 }
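/* Worked example of the mangling above: for BUILT_IN_SINF with four
   SFmode lanes, bname is "__builtin_sinf", so bname+10 is "sinf"; the
   sprintf and trailing-character overwrite produce "vmlssin4", and the
   uppercase fixup of name[4] yields "vmlsSin4".  The two-lane DFmode
   variant of sin comes out as "vmldSin2", and log is special-cased to
   "vmlsLn4" / "vmldLn2".  */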
30999
31000 /* Handler for an ACML-style interface to
31001 a library with vectorized intrinsics. */
31002
31003 static tree
31004 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
31005 {
31006 char name[20] = "__vr.._";
31007 tree fntype, new_fndecl, args;
31008 unsigned arity;
31009 const char *bname;
31010 enum machine_mode el_mode, in_mode;
31011 int n, in_n;
31012
31013 /* The ACML library is 64-bit only and suitable for unsafe math only,
31014 as it does not correctly support parts of IEEE (such as denormals)
31015 with the required precision. */
31016 if (!TARGET_64BIT
31017 || !flag_unsafe_math_optimizations)
31018 return NULL_TREE;
31019
31020 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31021 n = TYPE_VECTOR_SUBPARTS (type_out);
31022 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31023 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31024 if (el_mode != in_mode
31025 || n != in_n)
31026 return NULL_TREE;
31027
31028 switch (fn)
31029 {
31030 case BUILT_IN_SIN:
31031 case BUILT_IN_COS:
31032 case BUILT_IN_EXP:
31033 case BUILT_IN_LOG:
31034 case BUILT_IN_LOG2:
31035 case BUILT_IN_LOG10:
31036 name[4] = 'd';
31037 name[5] = '2';
31038 if (el_mode != DFmode
31039 || n != 2)
31040 return NULL_TREE;
31041 break;
31042
31043 case BUILT_IN_SINF:
31044 case BUILT_IN_COSF:
31045 case BUILT_IN_EXPF:
31046 case BUILT_IN_POWF:
31047 case BUILT_IN_LOGF:
31048 case BUILT_IN_LOG2F:
31049 case BUILT_IN_LOG10F:
31050 name[4] = 's';
31051 name[5] = '4';
31052 if (el_mode != SFmode
31053 || n != 4)
31054 return NULL_TREE;
31055 break;
31056
31057 default:
31058 return NULL_TREE;
31059 }
31060
31061 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31062 sprintf (name + 7, "%s", bname+10);
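      /* For example (derived from the name mangling above), BUILT_IN_LOGF
         yields "__vrs4_logf" and BUILT_IN_LOG yields "__vrd2_log".  */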
31063
31064 arity = 0;
31065 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31066 args;
31067 args = TREE_CHAIN (args))
31068 arity++;
31069
31070 if (arity == 1)
31071 fntype = build_function_type_list (type_out, type_in, NULL);
31072 else
31073 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31074
31075 /* Build a function declaration for the vectorized function. */
31076 new_fndecl = build_decl (BUILTINS_LOCATION,
31077 FUNCTION_DECL, get_identifier (name), fntype);
31078 TREE_PUBLIC (new_fndecl) = 1;
31079 DECL_EXTERNAL (new_fndecl) = 1;
31080 DECL_IS_NOVOPS (new_fndecl) = 1;
31081 TREE_READONLY (new_fndecl) = 1;
31082
31083 return new_fndecl;
31084 }
31085
31086 /* Returns a decl of a function that implements a gather load with
31087 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
31088 Return NULL_TREE if it is not available. */
31089
31090 static tree
31091 ix86_vectorize_builtin_gather (const_tree mem_vectype,
31092 const_tree index_type, int scale)
31093 {
31094 bool si;
31095 enum ix86_builtins code;
31096
31097 if (! TARGET_AVX2)
31098 return NULL_TREE;
31099
31100 if ((TREE_CODE (index_type) != INTEGER_TYPE
31101 && !POINTER_TYPE_P (index_type))
31102 || (TYPE_MODE (index_type) != SImode
31103 && TYPE_MODE (index_type) != DImode))
31104 return NULL_TREE;
31105
31106 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
31107 return NULL_TREE;
31108
31109 /* v*gather* insn sign extends index to pointer mode. */
31110 if (TYPE_PRECISION (index_type) < POINTER_SIZE
31111 && TYPE_UNSIGNED (index_type))
31112 return NULL_TREE;
31113
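  /* The gather instructions encode SCALE in the 2-bit SIB scale field,
     so only 1, 2, 4 and 8 are representable.  */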
31114 if (scale <= 0
31115 || scale > 8
31116 || (scale & (scale - 1)) != 0)
31117 return NULL_TREE;
31118
31119 si = TYPE_MODE (index_type) == SImode;
31120 switch (TYPE_MODE (mem_vectype))
31121 {
31122 case V2DFmode:
31123 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
31124 break;
31125 case V4DFmode:
31126 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
31127 break;
31128 case V2DImode:
31129 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
31130 break;
31131 case V4DImode:
31132 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
31133 break;
31134 case V4SFmode:
31135 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
31136 break;
31137 case V8SFmode:
31138 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
31139 break;
31140 case V4SImode:
31141 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
31142 break;
31143 case V8SImode:
31144 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
31145 break;
31146 default:
31147 return NULL_TREE;
31148 }
31149
31150 return ix86_builtins[code];
31151 }
31152
31153 /* Returns a decl of a target-specific builtin that implements the
31154 reciprocal of the function FN, or NULL_TREE if not available. */
31155
31156 static tree
31157 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
31158 bool sqrt ATTRIBUTE_UNUSED)
31159 {
31160 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
31161 && flag_finite_math_only && !flag_trapping_math
31162 && flag_unsafe_math_optimizations))
31163 return NULL_TREE;
31164
31165 if (md_fn)
31166 /* Machine dependent builtins. */
31167 switch (fn)
31168 {
31169 /* Vectorized version of sqrt to rsqrt conversion. */
31170 case IX86_BUILTIN_SQRTPS_NR:
31171 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
31172
31173 case IX86_BUILTIN_SQRTPS_NR256:
31174 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
31175
31176 default:
31177 return NULL_TREE;
31178 }
31179 else
31180 /* Normal builtins. */
31181 switch (fn)
31182 {
31183 /* Sqrt to rsqrt conversion. */
31184 case BUILT_IN_SQRTF:
31185 return ix86_builtins[IX86_BUILTIN_RSQRTF];
31186
31187 default:
31188 return NULL_TREE;
31189 }
31190 }
31191 \f
31192 /* Helper for avx_vpermilps256_operand et al. This is also used by
31193 the expansion functions to turn the parallel back into a mask.
31194 The return value is 0 for no match and the imm8+1 for a match. */
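/* For example, in V4SFmode the parallel [3 2 1 0] encodes imm8 0x1b
   (the usual element-reversing shuffle), so this function returns 0x1c.  */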
31195
31196 int
31197 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
31198 {
31199 unsigned i, nelt = GET_MODE_NUNITS (mode);
31200 unsigned mask = 0;
31201 unsigned char ipar[8];
31202
31203 if (XVECLEN (par, 0) != (int) nelt)
31204 return 0;
31205
31206 /* Validate that all of the elements are constants, and not totally
31207 out of range. Copy the data into an integral array to make the
31208 subsequent checks easier. */
31209 for (i = 0; i < nelt; ++i)
31210 {
31211 rtx er = XVECEXP (par, 0, i);
31212 unsigned HOST_WIDE_INT ei;
31213
31214 if (!CONST_INT_P (er))
31215 return 0;
31216 ei = INTVAL (er);
31217 if (ei >= nelt)
31218 return 0;
31219 ipar[i] = ei;
31220 }
31221
31222 switch (mode)
31223 {
31224 case V4DFmode:
31225 /* In the 256-bit DFmode case, we can only move elements within
31226 a 128-bit lane. */
31227 for (i = 0; i < 2; ++i)
31228 {
31229 if (ipar[i] >= 2)
31230 return 0;
31231 mask |= ipar[i] << i;
31232 }
31233 for (i = 2; i < 4; ++i)
31234 {
31235 if (ipar[i] < 2)
31236 return 0;
31237 mask |= (ipar[i] - 2) << i;
31238 }
31239 break;
31240
31241 case V8SFmode:
31242 /* In the 256-bit SFmode case, we have full freedom of movement
31243 within the low 128-bit lane, but the high 128-bit lane must
31244 mirror the exact same pattern. */
31245 for (i = 0; i < 4; ++i)
31246 if (ipar[i] + 4 != ipar[i + 4])
31247 return 0;
31248 nelt = 4;
31249 /* FALLTHRU */
31250
31251 case V2DFmode:
31252 case V4SFmode:
31253 /* In the 128-bit case, we've full freedom in the placement of
31254 the elements from the source operand. */
31255 for (i = 0; i < nelt; ++i)
31256 mask |= ipar[i] << (i * (nelt / 2));
31257 break;
31258
31259 default:
31260 gcc_unreachable ();
31261 }
31262
31263 /* Make sure success has a non-zero value by adding one. */
31264 return mask + 1;
31265 }
31266
31267 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
31268 the expansion functions to turn the parallel back into a mask.
31269 The return value is 0 for no match and the imm8+1 for a match. */
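/* For example, in V8SFmode the parallel [4 5 6 7 12 13 14 15] selects the
   high 128-bit lane of each operand; it encodes imm8 0x31, so this
   function returns 0x32.  */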
31270
31271 int
31272 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
31273 {
31274 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
31275 unsigned mask = 0;
31276 unsigned char ipar[8];
31277
31278 if (XVECLEN (par, 0) != (int) nelt)
31279 return 0;
31280
31281 /* Validate that all of the elements are constants, and not totally
31282 out of range. Copy the data into an integral array to make the
31283 subsequent checks easier. */
31284 for (i = 0; i < nelt; ++i)
31285 {
31286 rtx er = XVECEXP (par, 0, i);
31287 unsigned HOST_WIDE_INT ei;
31288
31289 if (!CONST_INT_P (er))
31290 return 0;
31291 ei = INTVAL (er);
31292 if (ei >= 2 * nelt)
31293 return 0;
31294 ipar[i] = ei;
31295 }
31296
31297 /* Validate that the elements within each half of the permute are consecutive. */
31298 for (i = 0; i < nelt2 - 1; ++i)
31299 if (ipar[i] + 1 != ipar[i + 1])
31300 return 0;
31301 for (i = nelt2; i < nelt - 1; ++i)
31302 if (ipar[i] + 1 != ipar[i + 1])
31303 return 0;
31304
31305 /* Reconstruct the mask. */
31306 for (i = 0; i < 2; ++i)
31307 {
31308 unsigned e = ipar[i * nelt2];
31309 if (e % nelt2)
31310 return 0;
31311 e /= nelt2;
31312 mask |= e << (i * 4);
31313 }
31314
31315 /* Make sure success has a non-zero value by adding one. */
31316 return mask + 1;
31317 }
31318 \f
31319 /* Store OPERAND to the memory after reload is completed. This means
31320 that we can't easily use assign_stack_local. */
31321 rtx
31322 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31323 {
31324 rtx result;
31325
31326 gcc_assert (reload_completed);
31327 if (ix86_using_red_zone ())
31328 {
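      /* The x86-64 SysV ABI guarantees that the 128 bytes below the stack
         pointer (the red zone) are not clobbered by signal or interrupt
         handlers, so we can store there without adjusting the stack
         pointer first.  */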
31329 result = gen_rtx_MEM (mode,
31330 gen_rtx_PLUS (Pmode,
31331 stack_pointer_rtx,
31332 GEN_INT (-RED_ZONE_SIZE)));
31333 emit_move_insn (result, operand);
31334 }
31335 else if (TARGET_64BIT)
31336 {
31337 switch (mode)
31338 {
31339 case HImode:
31340 case SImode:
31341 operand = gen_lowpart (DImode, operand);
31342 /* FALLTHRU */
31343 case DImode:
31344 emit_insn (
31345 gen_rtx_SET (VOIDmode,
31346 gen_rtx_MEM (DImode,
31347 gen_rtx_PRE_DEC (DImode,
31348 stack_pointer_rtx)),
31349 operand));
31350 break;
31351 default:
31352 gcc_unreachable ();
31353 }
31354 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31355 }
31356 else
31357 {
31358 switch (mode)
31359 {
31360 case DImode:
31361 {
31362 rtx operands[2];
31363 split_double_mode (mode, &operand, 1, operands, operands + 1);
31364 emit_insn (
31365 gen_rtx_SET (VOIDmode,
31366 gen_rtx_MEM (SImode,
31367 gen_rtx_PRE_DEC (Pmode,
31368 stack_pointer_rtx)),
31369 operands[1]));
31370 emit_insn (
31371 gen_rtx_SET (VOIDmode,
31372 gen_rtx_MEM (SImode,
31373 gen_rtx_PRE_DEC (Pmode,
31374 stack_pointer_rtx)),
31375 operands[0]));
31376 }
31377 break;
31378 case HImode:
31379 /* Store HImodes as SImodes. */
31380 operand = gen_lowpart (SImode, operand);
31381 /* FALLTHRU */
31382 case SImode:
31383 emit_insn (
31384 gen_rtx_SET (VOIDmode,
31385 gen_rtx_MEM (GET_MODE (operand),
31386 gen_rtx_PRE_DEC (SImode,
31387 stack_pointer_rtx)),
31388 operand));
31389 break;
31390 default:
31391 gcc_unreachable ();
31392 }
31393 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31394 }
31395 return result;
31396 }
31397
31398 /* Free operand from the memory. */
31399 void
31400 ix86_free_from_memory (enum machine_mode mode)
31401 {
31402 if (!ix86_using_red_zone ())
31403 {
31404 int size;
31405
31406 if (mode == DImode || TARGET_64BIT)
31407 size = 8;
31408 else
31409 size = 4;
31410 /* Use LEA to deallocate stack space. In peephole2 it will be converted
31411 to a pop or add instruction if registers are available. */
31412 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31413 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31414 GEN_INT (size))));
31415 }
31416 }
31417
31418 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31419
31420 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31421 QImode must go into class Q_REGS.
31422 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31423 movdf to do mem-to-mem moves through integer regs. */
31424
31425 static reg_class_t
31426 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31427 {
31428 enum machine_mode mode = GET_MODE (x);
31429
31430 /* We're only allowed to return a subclass of CLASS. Many of the
31431 following checks fail for NO_REGS, so eliminate that early. */
31432 if (regclass == NO_REGS)
31433 return NO_REGS;
31434
31435 /* All classes can load zeros. */
31436 if (x == CONST0_RTX (mode))
31437 return regclass;
31438
31439 /* Force constants into memory if we are loading a (nonzero) constant into
31440 an MMX or SSE register. This is because there are no MMX/SSE instructions
31441 to load from a constant. */
31442 if (CONSTANT_P (x)
31443 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31444 return NO_REGS;
31445
31446 /* Prefer SSE regs only, if we can use them for math. */
31447 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31448 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31449
31450 /* Floating-point constants need more complex checks. */
31451 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31452 {
31453 /* General regs can load everything. */
31454 if (reg_class_subset_p (regclass, GENERAL_REGS))
31455 return regclass;
31456
31457 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31458 zero above. We only want to wind up preferring 80387 registers if
31459 we plan on doing computation with them. */
31460 if (TARGET_80387
31461 && standard_80387_constant_p (x) > 0)
31462 {
31463 /* Limit class to non-sse. */
31464 if (regclass == FLOAT_SSE_REGS)
31465 return FLOAT_REGS;
31466 if (regclass == FP_TOP_SSE_REGS)
31467 return FP_TOP_REG;
31468 if (regclass == FP_SECOND_SSE_REGS)
31469 return FP_SECOND_REG;
31470 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31471 return regclass;
31472 }
31473
31474 return NO_REGS;
31475 }
31476
31477 /* Generally when we see PLUS here, it's the function invariant
31478 (plus soft-fp const_int), which can only be computed into general
31479 regs. */
31480 if (GET_CODE (x) == PLUS)
31481 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31482
31483 /* QImode constants are easy to load, but non-constant QImode data
31484 must go into Q_REGS. */
31485 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31486 {
31487 if (reg_class_subset_p (regclass, Q_REGS))
31488 return regclass;
31489 if (reg_class_subset_p (Q_REGS, regclass))
31490 return Q_REGS;
31491 return NO_REGS;
31492 }
31493
31494 return regclass;
31495 }
31496
31497 /* Discourage putting floating-point values in SSE registers unless
31498 SSE math is being used, and likewise for the 387 registers. */
31499 static reg_class_t
31500 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31501 {
31502 enum machine_mode mode = GET_MODE (x);
31503
31504 /* Restrict the output reload class to the register bank that we are doing
31505 math on. If we would like not to return a subset of CLASS, reject this
31506 alternative: if reload cannot do this, it will still use its choice. */
31508 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31509 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31510
31511 if (X87_FLOAT_MODE_P (mode))
31512 {
31513 if (regclass == FP_TOP_SSE_REGS)
31514 return FP_TOP_REG;
31515 else if (regclass == FP_SECOND_SSE_REGS)
31516 return FP_SECOND_REG;
31517 else
31518 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31519 }
31520
31521 return regclass;
31522 }
31523
31524 static reg_class_t
31525 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31526 enum machine_mode mode, secondary_reload_info *sri)
31527 {
31528 /* Double-word spills from general registers to non-offsettable memory
31529 references (zero-extended addresses) require special handling. */
31530 if (TARGET_64BIT
31531 && MEM_P (x)
31532 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31533 && rclass == GENERAL_REGS
31534 && !offsettable_memref_p (x))
31535 {
31536 sri->icode = (in_p
31537 ? CODE_FOR_reload_noff_load
31538 : CODE_FOR_reload_noff_store);
31539 /* Add the cost of moving address to a temporary. */
31540 sri->extra_cost = 1;
31541
31542 return NO_REGS;
31543 }
31544
31545 /* QImode spills from non-QI registers require an
31546 intermediate register on 32-bit targets. */
31547 if (!TARGET_64BIT
31548 && !in_p && mode == QImode
31549 && (rclass == GENERAL_REGS
31550 || rclass == LEGACY_REGS
31551 || rclass == INDEX_REGS))
31552 {
31553 int regno;
31554
31555 if (REG_P (x))
31556 regno = REGNO (x);
31557 else
31558 regno = -1;
31559
31560 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31561 regno = true_regnum (x);
31562
31563 /* Return Q_REGS if the operand is in memory. */
31564 if (regno == -1)
31565 return Q_REGS;
31566 }
31567
31568 /* This condition handles the corner case where an expression involving
31569 pointers gets vectorized. We're trying to use the address of a
31570 stack slot as a vector initializer.
31571
31572 (set (reg:V2DI 74 [ vect_cst_.2 ])
31573 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31574
31575 Eventually frame gets turned into sp+offset like this:
31576
31577 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31578 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31579 (const_int 392 [0x188]))))
31580
31581 That later gets turned into:
31582
31583 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31584 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31585 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31586
31587 We'll have the following reload recorded:
31588
31589 Reload 0: reload_in (DI) =
31590 (plus:DI (reg/f:DI 7 sp)
31591 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31592 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31593 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31594 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31595 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31596 reload_reg_rtx: (reg:V2DI 22 xmm1)
31597
31598 This isn't going to work since SSE instructions can't handle scalar
31599 additions. Returning GENERAL_REGS forces the addition into an integer
31600 register, and reload can handle subsequent reloads without problems. */
31601
31602 if (in_p && GET_CODE (x) == PLUS
31603 && SSE_CLASS_P (rclass)
31604 && SCALAR_INT_MODE_P (mode))
31605 return GENERAL_REGS;
31606
31607 return NO_REGS;
31608 }
31609
31610 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31611
31612 static bool
31613 ix86_class_likely_spilled_p (reg_class_t rclass)
31614 {
31615 switch (rclass)
31616 {
31617 case AREG:
31618 case DREG:
31619 case CREG:
31620 case BREG:
31621 case AD_REGS:
31622 case SIREG:
31623 case DIREG:
31624 case SSE_FIRST_REG:
31625 case FP_TOP_REG:
31626 case FP_SECOND_REG:
31627 return true;
31628
31629 default:
31630 break;
31631 }
31632
31633 return false;
31634 }
31635
31636 /* If we are copying between general and FP registers, we need a memory
31637 location. The same is true for SSE and MMX registers.
31638
31639 To optimize register_move_cost performance, allow inline variant.
31640
31641 The macro can't work reliably when one of the CLASSES is a class containing
31642 registers from multiple units (SSE, MMX, integer). We avoid this by never
31643 combining those units in a single alternative in the machine description.
31644 Ensure that this constraint holds to avoid unexpected surprises.
31645
31646 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31647 enforce these sanity checks. */
31648
31649 static inline bool
31650 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31651 enum machine_mode mode, int strict)
31652 {
31653 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31654 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31655 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31656 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31657 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31658 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31659 {
31660 gcc_assert (!strict);
31661 return true;
31662 }
31663
31664 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31665 return true;
31666
31667 /* ??? This is a lie. We do have moves between mmx/general, and for
31668 mmx/sse2. But by saying we need secondary memory we discourage the
31669 register allocator from using the mmx registers unless needed. */
31670 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31671 return true;
31672
31673 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31674 {
31675 /* SSE1 doesn't have any direct moves from other classes. */
31676 if (!TARGET_SSE2)
31677 return true;
31678
31679 /* If the target says that inter-unit moves are more expensive
31680 than moving through memory, then don't generate them. */
31681 if (!TARGET_INTER_UNIT_MOVES)
31682 return true;
31683
31684 /* Between SSE and general, we have moves no larger than word size. */
31685 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31686 return true;
31687 }
31688
31689 return false;
31690 }
31691
31692 bool
31693 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31694 enum machine_mode mode, int strict)
31695 {
31696 return inline_secondary_memory_needed (class1, class2, mode, strict);
31697 }
31698
31699 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31700
31701 On the 80386, this is the size of MODE in words,
31702 except in the FP regs, where a single reg is always enough. */
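/* For example, XFmode occupies three word-sized integer registers on ia32
   (two on x86-64), but only a single x87 register.  */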
31703
31704 static unsigned char
31705 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31706 {
31707 if (MAYBE_INTEGER_CLASS_P (rclass))
31708 {
31709 if (mode == XFmode)
31710 return (TARGET_64BIT ? 2 : 3);
31711 else if (mode == XCmode)
31712 return (TARGET_64BIT ? 4 : 6);
31713 else
31714 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31715 }
31716 else
31717 {
31718 if (COMPLEX_MODE_P (mode))
31719 return 2;
31720 else
31721 return 1;
31722 }
31723 }
31724
31725 /* Return true if the registers in CLASS cannot represent the change from
31726 modes FROM to TO. */
31727
31728 bool
31729 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31730 enum reg_class regclass)
31731 {
31732 if (from == to)
31733 return false;
31734
31735 /* x87 registers can't do subreg at all, as all values are reformatted
31736 to extended precision. */
31737 if (MAYBE_FLOAT_CLASS_P (regclass))
31738 return true;
31739
31740 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31741 {
31742 /* Vector registers do not support QI or HImode loads. If we don't
31743 disallow a change to these modes, reload will assume it's ok to
31744 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31745 the vec_dupv4hi pattern. */
31746 if (GET_MODE_SIZE (from) < 4)
31747 return true;
31748
31749 /* Vector registers do not support subreg with nonzero offsets, which
31750 are otherwise valid for integer registers. Since we can't see
31751 whether we have a nonzero offset from here, prohibit all
31752 nonparadoxical subregs changing size. */
31753 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31754 return true;
31755 }
31756
31757 return false;
31758 }
31759
31760 /* Return the cost of moving data of mode M between a
31761 register and memory. A value of 2 is the default; this cost is
31762 relative to those in `REGISTER_MOVE_COST'.
31763
31764 This function is used extensively by register_move_cost, which is used to
31765 build tables at startup, so keep it inline. When IN is 2, return the
31766 maximum of the in and out move costs.
31767
31768 If moving between registers and memory is more expensive than
31769 between two registers, you should define this macro to express the
31770 relative cost.
31771
31772 Also model the increased cost of moving QImode registers in
31773 non-Q_REGS classes.
31774 */
31775 static inline int
31776 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31777 int in)
31778 {
31779 int cost;
31780 if (FLOAT_CLASS_P (regclass))
31781 {
31782 int index;
31783 switch (mode)
31784 {
31785 case SFmode:
31786 index = 0;
31787 break;
31788 case DFmode:
31789 index = 1;
31790 break;
31791 case XFmode:
31792 index = 2;
31793 break;
31794 default:
31795 return 100;
31796 }
31797 if (in == 2)
31798 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31799 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31800 }
31801 if (SSE_CLASS_P (regclass))
31802 {
31803 int index;
31804 switch (GET_MODE_SIZE (mode))
31805 {
31806 case 4:
31807 index = 0;
31808 break;
31809 case 8:
31810 index = 1;
31811 break;
31812 case 16:
31813 index = 2;
31814 break;
31815 default:
31816 return 100;
31817 }
31818 if (in == 2)
31819 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31820 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31821 }
31822 if (MMX_CLASS_P (regclass))
31823 {
31824 int index;
31825 switch (GET_MODE_SIZE (mode))
31826 {
31827 case 4:
31828 index = 0;
31829 break;
31830 case 8:
31831 index = 1;
31832 break;
31833 default:
31834 return 100;
31835 }
31836 if (in == 2)
31837 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31838 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31839 }
31840 switch (GET_MODE_SIZE (mode))
31841 {
31842 case 1:
31843 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31844 {
31845 if (!in)
31846 return ix86_cost->int_store[0];
31847 if (TARGET_PARTIAL_REG_DEPENDENCY
31848 && optimize_function_for_speed_p (cfun))
31849 cost = ix86_cost->movzbl_load;
31850 else
31851 cost = ix86_cost->int_load[0];
31852 if (in == 2)
31853 return MAX (cost, ix86_cost->int_store[0]);
31854 return cost;
31855 }
31856 else
31857 {
31858 if (in == 2)
31859 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31860 if (in)
31861 return ix86_cost->movzbl_load;
31862 else
31863 return ix86_cost->int_store[0] + 4;
31864 }
31865 break;
31866 case 2:
31867 if (in == 2)
31868 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31869 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31870 default:
31871 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31872 if (mode == TFmode)
31873 mode = XFmode;
31874 if (in == 2)
31875 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31876 else if (in)
31877 cost = ix86_cost->int_load[2];
31878 else
31879 cost = ix86_cost->int_store[2];
31880 return (cost * (((int) GET_MODE_SIZE (mode)
31881 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31882 }
31883 }
31884
31885 static int
31886 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31887 bool in)
31888 {
31889 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31890 }
31891
31892
31893 /* Return the cost of moving data from a register in class CLASS1 to
31894 one in class CLASS2.
31895
31896 It is not required that the cost always equal 2 when FROM is the same as TO;
31897 on some machines it is expensive to move between registers if they are not
31898 general registers. */
31899
31900 static int
31901 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31902 reg_class_t class2_i)
31903 {
31904 enum reg_class class1 = (enum reg_class) class1_i;
31905 enum reg_class class2 = (enum reg_class) class2_i;
31906
31907 /* In case we require secondary memory, compute the cost of the store
31908 followed by the load. In order to avoid bad register allocation choices,
31909 we need this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31910
31911 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31912 {
31913 int cost = 1;
31914
31915 cost += inline_memory_move_cost (mode, class1, 2);
31916 cost += inline_memory_move_cost (mode, class2, 2);
31917
31918 /* When copying from a general purpose register we may emit multiple
31919 stores followed by a single load, causing a memory size mismatch stall.
31920 Count this as an arbitrarily high cost of 20. */
31921 if (targetm.class_max_nregs (class1, mode)
31922 > targetm.class_max_nregs (class2, mode))
31923 cost += 20;
31924
31925 /* In the case of FP/MMX moves, the registers actually overlap, and we
31926 have to switch modes in order to treat them differently. */
31927 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31928 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31929 cost += 20;
31930
31931 return cost;
31932 }
31933
31934 /* Moves between SSE/MMX and integer unit are expensive. */
31935 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31936 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31937
31938 /* ??? By keeping the returned value relatively high, we limit the number
31939 of moves between integer and MMX/SSE registers for all targets.
31940 Additionally, a high value prevents a problem with x86_modes_tieable_p (),
31941 where integer modes in MMX/SSE registers are not tieable
31942 because of missing QImode and HImode moves to, from or between
31943 MMX/SSE registers. */
31944 return MAX (8, ix86_cost->mmxsse_to_integer);
31945
31946 if (MAYBE_FLOAT_CLASS_P (class1))
31947 return ix86_cost->fp_move;
31948 if (MAYBE_SSE_CLASS_P (class1))
31949 return ix86_cost->sse_move;
31950 if (MAYBE_MMX_CLASS_P (class1))
31951 return ix86_cost->mmx_move;
31952 return 2;
31953 }
31954
31955 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31956 MODE. */
31957
31958 bool
31959 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31960 {
31961 /* Flags, and only flags, can hold CCmode values. */
31962 if (CC_REGNO_P (regno))
31963 return GET_MODE_CLASS (mode) == MODE_CC;
31964 if (GET_MODE_CLASS (mode) == MODE_CC
31965 || GET_MODE_CLASS (mode) == MODE_RANDOM
31966 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31967 return false;
31968 if (FP_REGNO_P (regno))
31969 return VALID_FP_MODE_P (mode);
31970 if (SSE_REGNO_P (regno))
31971 {
31972 /* We implement the move patterns for all vector modes into and
31973 out of SSE registers, even when no operation instructions
31974 are available. OImode move is available only when AVX is
31975 enabled. */
31976 return ((TARGET_AVX && mode == OImode)
31977 || VALID_AVX256_REG_MODE (mode)
31978 || VALID_SSE_REG_MODE (mode)
31979 || VALID_SSE2_REG_MODE (mode)
31980 || VALID_MMX_REG_MODE (mode)
31981 || VALID_MMX_REG_MODE_3DNOW (mode));
31982 }
31983 if (MMX_REGNO_P (regno))
31984 {
31985 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31986 so if the register is available at all, then we can move data of
31987 the given mode into or out of it. */
31988 return (VALID_MMX_REG_MODE (mode)
31989 || VALID_MMX_REG_MODE_3DNOW (mode));
31990 }
31991
31992 if (mode == QImode)
31993 {
31994 /* Take care with QImode values - they can be in non-QI regs,
31995 but then they do cause partial register stalls. */
31996 if (regno <= BX_REG || TARGET_64BIT)
31997 return true;
31998 if (!TARGET_PARTIAL_REG_STALL)
31999 return true;
32000 return !can_create_pseudo_p ();
32001 }
32002 /* We handle both integers and floats in the general purpose registers. */
32003 else if (VALID_INT_MODE_P (mode))
32004 return true;
32005 else if (VALID_FP_MODE_P (mode))
32006 return true;
32007 else if (VALID_DFP_MODE_P (mode))
32008 return true;
32009 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
32010 on to use that value in smaller contexts, this can easily force a
32011 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
32012 supporting DImode, allow it. */
32013 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
32014 return true;
32015
32016 return false;
32017 }
32018
32019 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
32020 tieable integer mode. */
32021
32022 static bool
32023 ix86_tieable_integer_mode_p (enum machine_mode mode)
32024 {
32025 switch (mode)
32026 {
32027 case HImode:
32028 case SImode:
32029 return true;
32030
32031 case QImode:
32032 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
32033
32034 case DImode:
32035 return TARGET_64BIT;
32036
32037 default:
32038 return false;
32039 }
32040 }
32041
32042 /* Return true if MODE1 is accessible in a register that can hold MODE2
32043 without copying. That is, all register classes that can hold MODE2
32044 can also hold MODE1. */
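/* For example, SFmode is tieable with DFmode: every class that can hold
   DFmode (x87, general and SSE registers) can also hold SFmode.  */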
32045
32046 bool
32047 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
32048 {
32049 if (mode1 == mode2)
32050 return true;
32051
32052 if (ix86_tieable_integer_mode_p (mode1)
32053 && ix86_tieable_integer_mode_p (mode2))
32054 return true;
32055
32056 /* MODE2 being XFmode implies fp stack or general regs, which means we
32057 can tie any smaller floating point modes to it. Note that we do not
32058 tie this with TFmode. */
32059 if (mode2 == XFmode)
32060 return mode1 == SFmode || mode1 == DFmode;
32061
32062 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
32063 that we can tie it with SFmode. */
32064 if (mode2 == DFmode)
32065 return mode1 == SFmode;
32066
32067 /* If MODE2 is only appropriate for an SSE register, then tie with
32068 any other mode acceptable to SSE registers. */
32069 if (GET_MODE_SIZE (mode2) == 32
32070 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32071 return (GET_MODE_SIZE (mode1) == 32
32072 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32073 if (GET_MODE_SIZE (mode2) == 16
32074 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32075 return (GET_MODE_SIZE (mode1) == 16
32076 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32077
32078 /* If MODE2 is appropriate for an MMX register, then tie
32079 with any other mode acceptable to MMX registers. */
32080 if (GET_MODE_SIZE (mode2) == 8
32081 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
32082 return (GET_MODE_SIZE (mode1) == 8
32083 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
32084
32085 return false;
32086 }
32087
32088 /* Return the cost of moving between two registers of mode MODE. */
32089
32090 static int
32091 ix86_set_reg_reg_cost (enum machine_mode mode)
32092 {
32093 unsigned int units = UNITS_PER_WORD;
32094
32095 switch (GET_MODE_CLASS (mode))
32096 {
32097 default:
32098 break;
32099
32100 case MODE_CC:
32101 units = GET_MODE_SIZE (CCmode);
32102 break;
32103
32104 case MODE_FLOAT:
32105 if ((TARGET_SSE && mode == TFmode)
32106 || (TARGET_80387 && mode == XFmode)
32107 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
32108 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
32109 units = GET_MODE_SIZE (mode);
32110 break;
32111
32112 case MODE_COMPLEX_FLOAT:
32113 if ((TARGET_SSE && mode == TCmode)
32114 || (TARGET_80387 && mode == XCmode)
32115 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
32116 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
32117 units = GET_MODE_SIZE (mode);
32118 break;
32119
32120 case MODE_VECTOR_INT:
32121 case MODE_VECTOR_FLOAT:
32122 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
32123 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
32124 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
32125 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
32126 units = GET_MODE_SIZE (mode);
32127 }
32128
32129 /* Return the cost of moving between two registers of mode MODE,
32130 assuming that the move will be in pieces of at most UNITS bytes. */
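  /* For example, a DImode register move on a 32-bit target is carried out
     in two word-sized pieces and is therefore costed as COSTS_N_INSNS (2).  */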
32131 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
32132 }
32133
32134 /* Compute a (partial) cost for rtx X. Return true if the complete
32135 cost has been computed, and false if subexpressions should be
32136 scanned. In either case, *TOTAL contains the cost result. */
32137
32138 static bool
32139 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
32140 bool speed)
32141 {
32142 enum rtx_code code = (enum rtx_code) code_i;
32143 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
32144 enum machine_mode mode = GET_MODE (x);
32145 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
32146
32147 switch (code)
32148 {
32149 case SET:
32150 if (register_operand (SET_DEST (x), VOIDmode)
32151 && reg_or_0_operand (SET_SRC (x), VOIDmode))
32152 {
32153 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
32154 return true;
32155 }
32156 return false;
32157
32158 case CONST_INT:
32159 case CONST:
32160 case LABEL_REF:
32161 case SYMBOL_REF:
32162 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
32163 *total = 3;
32164 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
32165 *total = 2;
32166 else if (flag_pic && SYMBOLIC_CONST (x)
32167 && (!TARGET_64BIT
32168 || (GET_CODE (x) != LABEL_REF
32169 && (GET_CODE (x) != SYMBOL_REF
32170 || !SYMBOL_REF_LOCAL_P (x)))))
32171 *total = 1;
32172 else
32173 *total = 0;
32174 return true;
32175
32176 case CONST_DOUBLE:
32177 if (mode == VOIDmode)
32178 {
32179 *total = 0;
32180 return true;
32181 }
32182 switch (standard_80387_constant_p (x))
32183 {
32184 case 1: /* 0.0 */
32185 *total = 1;
32186 return true;
32187 default: /* Other constants */
32188 *total = 2;
32189 return true;
32190 case 0:
32191 case -1:
32192 break;
32193 }
32194 if (SSE_FLOAT_MODE_P (mode))
32195 {
32196 case CONST_VECTOR:
32197 switch (standard_sse_constant_p (x))
32198 {
32199 case 0:
32200 break;
32201 case 1: /* 0: xor eliminates false dependency */
32202 *total = 0;
32203 return true;
32204 default: /* -1: cmp contains false dependency */
32205 *total = 1;
32206 return true;
32207 }
32208 }
32209 /* Fall back to (MEM (SYMBOL_REF)), since that's where
32210 it'll probably end up. Add a penalty for size. */
32211 *total = (COSTS_N_INSNS (1)
32212 + (flag_pic != 0 && !TARGET_64BIT)
32213 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
32214 return true;
32215
32216 case ZERO_EXTEND:
32217 /* The zero extension is often completely free on x86_64, so make
32218 it as cheap as possible. */
32219 if (TARGET_64BIT && mode == DImode
32220 && GET_MODE (XEXP (x, 0)) == SImode)
32221 *total = 1;
32222 else if (TARGET_ZERO_EXTEND_WITH_AND)
32223 *total = cost->add;
32224 else
32225 *total = cost->movzx;
32226 return false;
32227
32228 case SIGN_EXTEND:
32229 *total = cost->movsx;
32230 return false;
32231
32232 case ASHIFT:
32233 if (SCALAR_INT_MODE_P (mode)
32234 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
32235 && CONST_INT_P (XEXP (x, 1)))
32236 {
32237 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
32238 if (value == 1)
32239 {
32240 *total = cost->add;
32241 return false;
32242 }
32243 if ((value == 2 || value == 3)
32244 && cost->lea <= cost->shift_const)
32245 {
32246 *total = cost->lea;
32247 return false;
32248 }
32249 }
32250 /* FALLTHRU */
32251
32252 case ROTATE:
32253 case ASHIFTRT:
32254 case LSHIFTRT:
32255 case ROTATERT:
32256 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32257 {
32258 /* ??? Should be SSE vector operation cost. */
32259 /* At least for published AMD latencies, this really is the same
32260 as the latency for a simple fpu operation like fabs. */
32261 /* V*QImode is emulated with 1-11 insns. */
32262 if (mode == V16QImode || mode == V32QImode)
32263 {
32264 int count = 11;
32265 if (TARGET_XOP && mode == V16QImode)
32266 {
32267 /* For XOP we use vpshab, which requires a broadcast of the
32268 value to the variable shift insn. For constants this
32269 means a V16QImode constant in memory; even when we can perform
32270 the shift with one insn, set the cost so as to prefer paddb. */
32271 if (CONSTANT_P (XEXP (x, 1)))
32272 {
32273 *total = (cost->fabs
32274 + rtx_cost (XEXP (x, 0), code, 0, speed)
32275 + (speed ? 2 : COSTS_N_BYTES (16)));
32276 return true;
32277 }
32278 count = 3;
32279 }
32280 else if (TARGET_SSSE3)
32281 count = 7;
32282 *total = cost->fabs * count;
32283 }
32284 else
32285 *total = cost->fabs;
32286 }
32287 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32288 {
32289 if (CONST_INT_P (XEXP (x, 1)))
32290 {
32291 if (INTVAL (XEXP (x, 1)) > 32)
32292 *total = cost->shift_const + COSTS_N_INSNS (2);
32293 else
32294 *total = cost->shift_const * 2;
32295 }
32296 else
32297 {
32298 if (GET_CODE (XEXP (x, 1)) == AND)
32299 *total = cost->shift_var * 2;
32300 else
32301 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
32302 }
32303 }
32304 else
32305 {
32306 if (CONST_INT_P (XEXP (x, 1)))
32307 *total = cost->shift_const;
32308 else
32309 *total = cost->shift_var;
32310 }
32311 return false;
32312
32313 case FMA:
32314 {
32315 rtx sub;
32316
32317 gcc_assert (FLOAT_MODE_P (mode));
32318 gcc_assert (TARGET_FMA || TARGET_FMA4);
32319
32320 /* ??? SSE scalar/vector cost should be used here. */
32321 /* ??? Bald assumption that fma has the same cost as fmul. */
32322 *total = cost->fmul;
32323 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
32324
32325 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
32326 sub = XEXP (x, 0);
32327 if (GET_CODE (sub) == NEG)
32328 sub = XEXP (sub, 0);
32329 *total += rtx_cost (sub, FMA, 0, speed);
32330
32331 sub = XEXP (x, 2);
32332 if (GET_CODE (sub) == NEG)
32333 sub = XEXP (sub, 0);
32334 *total += rtx_cost (sub, FMA, 2, speed);
32335 return true;
32336 }
32337
32338 case MULT:
32339 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32340 {
32341 /* ??? SSE scalar cost should be used here. */
32342 *total = cost->fmul;
32343 return false;
32344 }
32345 else if (X87_FLOAT_MODE_P (mode))
32346 {
32347 *total = cost->fmul;
32348 return false;
32349 }
32350 else if (FLOAT_MODE_P (mode))
32351 {
32352 /* ??? SSE vector cost should be used here. */
32353 *total = cost->fmul;
32354 return false;
32355 }
32356 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32357 {
32358 /* V*QImode is emulated with 7-13 insns. */
32359 if (mode == V16QImode || mode == V32QImode)
32360 {
32361 int extra = 11;
32362 if (TARGET_XOP && mode == V16QImode)
32363 extra = 5;
32364 else if (TARGET_SSSE3)
32365 extra = 6;
32366 *total = cost->fmul * 2 + cost->fabs * extra;
32367 }
32368 /* V*DImode is emulated with 5-8 insns. */
32369 else if (mode == V2DImode || mode == V4DImode)
32370 {
32371 if (TARGET_XOP && mode == V2DImode)
32372 *total = cost->fmul * 2 + cost->fabs * 3;
32373 else
32374 *total = cost->fmul * 3 + cost->fabs * 5;
32375 }
32376 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
32377 insns, including two PMULUDQ. */
32378 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
32379 *total = cost->fmul * 2 + cost->fabs * 5;
32380 else
32381 *total = cost->fmul;
32382 return false;
32383 }
32384 else
32385 {
32386 rtx op0 = XEXP (x, 0);
32387 rtx op1 = XEXP (x, 1);
32388 int nbits;
32389 if (CONST_INT_P (XEXP (x, 1)))
32390 {
32391 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
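	      /* value &= value - 1 clears the lowest set bit, so this loop
		 counts the 1 bits in the constant multiplier; each set bit
		 is costed as one mult_bit step below.  */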
32392 for (nbits = 0; value != 0; value &= value - 1)
32393 nbits++;
32394 }
32395 else
32396 /* This is arbitrary. */
32397 nbits = 7;
32398
32399 /* Compute costs correctly for widening multiplication. */
32400 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
32401 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
32402 == GET_MODE_SIZE (mode))
32403 {
32404 int is_mulwiden = 0;
32405 enum machine_mode inner_mode = GET_MODE (op0);
32406
32407 if (GET_CODE (op0) == GET_CODE (op1))
32408 is_mulwiden = 1, op1 = XEXP (op1, 0);
32409 else if (CONST_INT_P (op1))
32410 {
32411 if (GET_CODE (op0) == SIGN_EXTEND)
32412 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
32413 == INTVAL (op1);
32414 else
32415 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
32416 }
32417
32418 if (is_mulwiden)
32419 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
32420 }
32421
32422 *total = (cost->mult_init[MODE_INDEX (mode)]
32423 + nbits * cost->mult_bit
32424 + rtx_cost (op0, outer_code, opno, speed)
32425 + rtx_cost (op1, outer_code, opno, speed));
32426
32427 return true;
32428 }
32429
32430 case DIV:
32431 case UDIV:
32432 case MOD:
32433 case UMOD:
32434 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32435 /* ??? SSE cost should be used here. */
32436 *total = cost->fdiv;
32437 else if (X87_FLOAT_MODE_P (mode))
32438 *total = cost->fdiv;
32439 else if (FLOAT_MODE_P (mode))
32440 /* ??? SSE vector cost should be used here. */
32441 *total = cost->fdiv;
32442 else
32443 *total = cost->divide[MODE_INDEX (mode)];
32444 return false;
32445
32446 case PLUS:
32447 if (GET_MODE_CLASS (mode) == MODE_INT
32448 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
32449 {
32450 if (GET_CODE (XEXP (x, 0)) == PLUS
32451 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32452 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32453 && CONSTANT_P (XEXP (x, 1)))
32454 {
32455 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32456 if (val == 2 || val == 4 || val == 8)
32457 {
32458 *total = cost->lea;
32459 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32460 outer_code, opno, speed);
32461 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32462 outer_code, opno, speed);
32463 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32464 return true;
32465 }
32466 }
32467 else if (GET_CODE (XEXP (x, 0)) == MULT
32468 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32469 {
32470 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32471 if (val == 2 || val == 4 || val == 8)
32472 {
32473 *total = cost->lea;
32474 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32475 outer_code, opno, speed);
32476 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32477 return true;
32478 }
32479 }
32480 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32481 {
32482 *total = cost->lea;
32483 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32484 outer_code, opno, speed);
32485 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32486 outer_code, opno, speed);
32487 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32488 return true;
32489 }
32490 }
32491 /* FALLTHRU */
32492
32493 case MINUS:
32494 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32495 {
32496 /* ??? SSE cost should be used here. */
32497 *total = cost->fadd;
32498 return false;
32499 }
32500 else if (X87_FLOAT_MODE_P (mode))
32501 {
32502 *total = cost->fadd;
32503 return false;
32504 }
32505 else if (FLOAT_MODE_P (mode))
32506 {
32507 /* ??? SSE vector cost should be used here. */
32508 *total = cost->fadd;
32509 return false;
32510 }
32511 /* FALLTHRU */
32512
32513 case AND:
32514 case IOR:
32515 case XOR:
32516 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32517 {
32518 *total = (cost->add * 2
32519 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32520 << (GET_MODE (XEXP (x, 0)) != DImode))
32521 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32522 << (GET_MODE (XEXP (x, 1)) != DImode)));
32523 return true;
32524 }
32525 /* FALLTHRU */
32526
32527 case NEG:
32528 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32529 {
32530 /* ??? SSE cost should be used here. */
32531 *total = cost->fchs;
32532 return false;
32533 }
32534 else if (X87_FLOAT_MODE_P (mode))
32535 {
32536 *total = cost->fchs;
32537 return false;
32538 }
32539 else if (FLOAT_MODE_P (mode))
32540 {
32541 /* ??? SSE vector cost should be used here. */
32542 *total = cost->fchs;
32543 return false;
32544 }
32545 /* FALLTHRU */
32546
32547 case NOT:
32548 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
32549 {
32550 /* ??? Should be SSE vector operation cost. */
32551 /* At least for published AMD latencies, this really is the same
32552 as the latency for a simple fpu operation like fabs. */
32553 *total = cost->fabs;
32554 }
32555 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32556 *total = cost->add * 2;
32557 else
32558 *total = cost->add;
32559 return false;
32560
32561 case COMPARE:
32562 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32563 && XEXP (XEXP (x, 0), 1) == const1_rtx
32564 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32565 && XEXP (x, 1) == const0_rtx)
32566 {
32567 /* This kind of construct is implemented using test[bwl].
32568 Treat it as if we had an AND. */
32569 *total = (cost->add
32570 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32571 + rtx_cost (const1_rtx, outer_code, opno, speed));
32572 return true;
32573 }
32574 return false;
32575
32576 case FLOAT_EXTEND:
32577 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32578 *total = 0;
32579 return false;
32580
32581 case ABS:
32582 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32583 /* ??? SSE cost should be used here. */
32584 *total = cost->fabs;
32585 else if (X87_FLOAT_MODE_P (mode))
32586 *total = cost->fabs;
32587 else if (FLOAT_MODE_P (mode))
32588 /* ??? SSE vector cost should be used here. */
32589 *total = cost->fabs;
32590 return false;
32591
32592 case SQRT:
32593 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32594 /* ??? SSE cost should be used here. */
32595 *total = cost->fsqrt;
32596 else if (X87_FLOAT_MODE_P (mode))
32597 *total = cost->fsqrt;
32598 else if (FLOAT_MODE_P (mode))
32599 /* ??? SSE vector cost should be used here. */
32600 *total = cost->fsqrt;
32601 return false;
32602
32603 case UNSPEC:
32604 if (XINT (x, 1) == UNSPEC_TP)
32605 *total = 0;
32606 return false;
32607
32608 case VEC_SELECT:
32609 case VEC_CONCAT:
32610 case VEC_MERGE:
32611 case VEC_DUPLICATE:
32612 /* ??? Assume all of these vector manipulation patterns are
32613 recognizable, in which case they all pretty much have the
32614 same cost. */
32615 *total = cost->fabs;
32616 return true;
32617
32618 default:
32619 return false;
32620 }
32621 }
32622
32623 #if TARGET_MACHO
32624
32625 static int current_machopic_label_num;
32626
32627 /* Given a symbol name and its associated stub, write out the
32628 definition of the stub. */
32629
32630 void
32631 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32632 {
32633 unsigned int length;
32634 char *binder_name, *symbol_name, lazy_ptr_name[32];
32635 int label = ++current_machopic_label_num;
32636
32637 /* For 64-bit we shouldn't get here. */
32638 gcc_assert (!TARGET_64BIT);
32639
32640 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32641 symb = targetm.strip_name_encoding (symb);
32642
32643 length = strlen (stub);
32644 binder_name = XALLOCAVEC (char, length + 32);
32645 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32646
32647 length = strlen (symb);
32648 symbol_name = XALLOCAVEC (char, length + 32);
32649 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32650
32651 sprintf (lazy_ptr_name, "L%d$lz", label);
32652
32653 if (MACHOPIC_ATT_STUB)
32654 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32655 else if (MACHOPIC_PURE)
32656 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32657 else
32658 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32659
32660 fprintf (file, "%s:\n", stub);
32661 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32662
32663 if (MACHOPIC_ATT_STUB)
32664 {
32665 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32666 }
32667 else if (MACHOPIC_PURE)
32668 {
32669 /* PIC stub. */
32670 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32671 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32672 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32673 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32674 label, lazy_ptr_name, label);
32675 fprintf (file, "\tjmp\t*%%ecx\n");
32676 }
32677 else
32678 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32679
32680 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32681 it needs no stub-binding-helper. */
32682 if (MACHOPIC_ATT_STUB)
32683 return;
32684
32685 fprintf (file, "%s:\n", binder_name);
32686
32687 if (MACHOPIC_PURE)
32688 {
32689 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32690 fprintf (file, "\tpushl\t%%ecx\n");
32691 }
32692 else
32693 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32694
32695 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32696
32697 /* N.B. Keep the correspondence of these
32698 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32699 old-pic/new-pic/non-pic stubs; altering this will break
32700 compatibility with existing dylibs. */
32701 if (MACHOPIC_PURE)
32702 {
32703 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32704 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32705 }
32706 else
32707 /* 16-byte -mdynamic-no-pic stub. */
32708 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32709
32710 fprintf (file, "%s:\n", lazy_ptr_name);
32711 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32712 fprintf (file, ASM_LONG "%s\n", binder_name);
32713 }
32714 #endif /* TARGET_MACHO */
32715
32716 /* Order the registers for register allocator. */
32717
32718 void
32719 x86_order_regs_for_local_alloc (void)
32720 {
32721 int pos = 0;
32722 int i;
32723
32724 /* First allocate the local general purpose registers. */
32725 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32726 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32727 reg_alloc_order [pos++] = i;
32728
32729 /* Global general purpose registers. */
32730 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32731 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32732 reg_alloc_order [pos++] = i;
32733
32734 /* x87 registers come first in case we are doing FP math
32735 using them. */
32736 if (!TARGET_SSE_MATH)
32737 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32738 reg_alloc_order [pos++] = i;
32739
32740 /* SSE registers. */
32741 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32742 reg_alloc_order [pos++] = i;
32743 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32744 reg_alloc_order [pos++] = i;
32745
32746 /* x87 registers. */
32747 if (TARGET_SSE_MATH)
32748 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32749 reg_alloc_order [pos++] = i;
32750
32751 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32752 reg_alloc_order [pos++] = i;
32753
32754 /* Initialize the rest of the array, as we do not allocate some registers
32755 at all. */
32756 while (pos < FIRST_PSEUDO_REGISTER)
32757 reg_alloc_order [pos++] = 0;
32758 }
32759
32760 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32761 in struct attribute_spec.handler. */
32762 static tree
32763 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32764 tree args,
32765 int flags ATTRIBUTE_UNUSED,
32766 bool *no_add_attrs)
32767 {
32768 if (TREE_CODE (*node) != FUNCTION_TYPE
32769 && TREE_CODE (*node) != METHOD_TYPE
32770 && TREE_CODE (*node) != FIELD_DECL
32771 && TREE_CODE (*node) != TYPE_DECL)
32772 {
32773 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32774 name);
32775 *no_add_attrs = true;
32776 return NULL_TREE;
32777 }
32778 if (TARGET_64BIT)
32779 {
32780 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32781 name);
32782 *no_add_attrs = true;
32783 return NULL_TREE;
32784 }
32785 if (is_attribute_p ("callee_pop_aggregate_return", name))
32786 {
32787 tree cst;
32788
32789 cst = TREE_VALUE (args);
32790 if (TREE_CODE (cst) != INTEGER_CST)
32791 {
32792 warning (OPT_Wattributes,
32793 "%qE attribute requires an integer constant argument",
32794 name);
32795 *no_add_attrs = true;
32796 }
32797 else if (compare_tree_int (cst, 0) != 0
32798 && compare_tree_int (cst, 1) != 0)
32799 {
32800 warning (OPT_Wattributes,
32801 "argument to %qE attribute is neither zero, nor one",
32802 name);
32803 *no_add_attrs = true;
32804 }
32805
32806 return NULL_TREE;
32807 }
32808
32809 return NULL_TREE;
32810 }
32811
32812 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
32813 struct attribute_spec.handler. */
32814 static tree
32815 ix86_handle_abi_attribute (tree *node, tree name,
32816 tree args ATTRIBUTE_UNUSED,
32817 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32818 {
32819 if (TREE_CODE (*node) != FUNCTION_TYPE
32820 && TREE_CODE (*node) != METHOD_TYPE
32821 && TREE_CODE (*node) != FIELD_DECL
32822 && TREE_CODE (*node) != TYPE_DECL)
32823 {
32824 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32825 name);
32826 *no_add_attrs = true;
32827 return NULL_TREE;
32828 }
32829
32830 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
32831 if (is_attribute_p ("ms_abi", name))
32832 {
32833 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32834 {
32835 error ("ms_abi and sysv_abi attributes are not compatible");
32836 }
32837
32838 return NULL_TREE;
32839 }
32840 else if (is_attribute_p ("sysv_abi", name))
32841 {
32842 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32843 {
32844 error ("ms_abi and sysv_abi attributes are not compatible");
32845 }
32846
32847 return NULL_TREE;
32848 }
32849
32850 return NULL_TREE;
32851 }
32852
32853 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32854 struct attribute_spec.handler. */
32855 static tree
32856 ix86_handle_struct_attribute (tree *node, tree name,
32857 tree args ATTRIBUTE_UNUSED,
32858 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32859 {
32860 tree *type = NULL;
32861 if (DECL_P (*node))
32862 {
32863 if (TREE_CODE (*node) == TYPE_DECL)
32864 type = &TREE_TYPE (*node);
32865 }
32866 else
32867 type = node;
32868
32869 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
32870 {
32871 warning (OPT_Wattributes, "%qE attribute ignored",
32872 name);
32873 *no_add_attrs = true;
32874 }
32875
32876 else if ((is_attribute_p ("ms_struct", name)
32877 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32878 || ((is_attribute_p ("gcc_struct", name)
32879 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32880 {
32881 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32882 name);
32883 *no_add_attrs = true;
32884 }
32885
32886 return NULL_TREE;
32887 }
32888
32889 static tree
32890 ix86_handle_fndecl_attribute (tree *node, tree name,
32891 tree args ATTRIBUTE_UNUSED,
32892 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32893 {
32894 if (TREE_CODE (*node) != FUNCTION_DECL)
32895 {
32896 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32897 name);
32898 *no_add_attrs = true;
32899 }
32900 return NULL_TREE;
32901 }
32902
32903 static bool
32904 ix86_ms_bitfield_layout_p (const_tree record_type)
32905 {
32906 return ((TARGET_MS_BITFIELD_LAYOUT
32907 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32908 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32909 }
32910
32911 /* Returns an expression indicating where the this parameter is
32912 located on entry to the FUNCTION. */
32913
32914 static rtx
32915 x86_this_parameter (tree function)
32916 {
32917 tree type = TREE_TYPE (function);
32918 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32919 int nregs;
32920
32921 if (TARGET_64BIT)
32922 {
32923 const int *parm_regs;
32924
32925 if (ix86_function_type_abi (type) == MS_ABI)
32926 parm_regs = x86_64_ms_abi_int_parameter_registers;
32927 else
32928 parm_regs = x86_64_int_parameter_registers;
32929 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32930 }
32931
32932 nregs = ix86_function_regparm (type, function);
32933
32934 if (nregs > 0 && !stdarg_p (type))
32935 {
32936 int regno;
32937 unsigned int ccvt = ix86_get_callcvt (type);
32938
32939 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32940 regno = aggr ? DX_REG : CX_REG;
32941 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32942 {
32943 regno = CX_REG;
32944 if (aggr)
32945 return gen_rtx_MEM (SImode,
32946 plus_constant (Pmode, stack_pointer_rtx, 4));
32947 }
32948 else
32949 {
32950 regno = AX_REG;
32951 if (aggr)
32952 {
32953 regno = DX_REG;
32954 if (nregs == 1)
32955 return gen_rtx_MEM (SImode,
32956 plus_constant (Pmode,
32957 stack_pointer_rtx, 4));
32958 }
32959 }
32960 return gen_rtx_REG (SImode, regno);
32961 }
32962
32963 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
32964 aggr ? 8 : 4));
32965 }
32966
32967 /* Determine whether x86_output_mi_thunk can succeed. */
32968
32969 static bool
32970 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32971 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32972 HOST_WIDE_INT vcall_offset, const_tree function)
32973 {
32974 /* 64-bit can handle anything. */
32975 if (TARGET_64BIT)
32976 return true;
32977
32978 /* For 32-bit, everything's fine if we have one free register. */
32979 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32980 return true;
32981
32982 /* Need a free register for vcall_offset. */
32983 if (vcall_offset)
32984 return false;
32985
32986 /* Need a free register for GOT references. */
32987 if (flag_pic && !targetm.binds_local_p (function))
32988 return false;
32989
32990 /* Otherwise ok. */
32991 return true;
32992 }
32993
32994 /* Output the assembler code for a thunk function. THUNK_DECL is the
32995 declaration for the thunk function itself, FUNCTION is the decl for
32996 the target function. DELTA is an immediate constant offset to be
32997 added to THIS. If VCALL_OFFSET is nonzero, the word at
32998 *(*this + vcall_offset) should be added to THIS. */
32999
33000 static void
33001 x86_output_mi_thunk (FILE *file,
33002 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
33003 HOST_WIDE_INT vcall_offset, tree function)
33004 {
33005 rtx this_param = x86_this_parameter (function);
33006 rtx this_reg, tmp, fnaddr;
33007 unsigned int tmp_regno;
33008
33009 if (TARGET_64BIT)
33010 tmp_regno = R10_REG;
33011 else
33012 {
33013 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
33014 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
33015 tmp_regno = AX_REG;
33016 else
33017 tmp_regno = CX_REG;
33018 }
33019
33020 emit_note (NOTE_INSN_PROLOGUE_END);
33021
33022 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
33023 pull it in now and let DELTA benefit. */
33024 if (REG_P (this_param))
33025 this_reg = this_param;
33026 else if (vcall_offset)
33027 {
33028 /* Put the this parameter into %eax. */
33029 this_reg = gen_rtx_REG (Pmode, AX_REG);
33030 emit_move_insn (this_reg, this_param);
33031 }
33032 else
33033 this_reg = NULL_RTX;
33034
33035 /* Adjust the this parameter by a fixed constant. */
33036 if (delta)
33037 {
33038 rtx delta_rtx = GEN_INT (delta);
33039 rtx delta_dst = this_reg ? this_reg : this_param;
33040
33041 if (TARGET_64BIT)
33042 {
33043 if (!x86_64_general_operand (delta_rtx, Pmode))
33044 {
33045 tmp = gen_rtx_REG (Pmode, tmp_regno);
33046 emit_move_insn (tmp, delta_rtx);
33047 delta_rtx = tmp;
33048 }
33049 }
33050
33051 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
33052 }
33053
33054 /* Adjust the this parameter by a value stored in the vtable. */
33055 if (vcall_offset)
33056 {
33057 rtx vcall_addr, vcall_mem, this_mem;
33058
33059 tmp = gen_rtx_REG (Pmode, tmp_regno);
33060
33061 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
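/* If Pmode is wider than ptr_mode (e.g. the x32 ABI), the pointer
   loaded from *THIS must be zero-extended to Pmode.  */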
33062 if (Pmode != ptr_mode)
33063 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
33064 emit_move_insn (tmp, this_mem);
33065
33066 /* Adjust the this parameter. */
33067 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
33068 if (TARGET_64BIT
33069 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
33070 {
33071 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
33072 emit_move_insn (tmp2, GEN_INT (vcall_offset));
33073 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
33074 }
33075
33076 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
33077 if (Pmode != ptr_mode)
33078 emit_insn (gen_addsi_1_zext (this_reg,
33079 gen_rtx_REG (ptr_mode,
33080 REGNO (this_reg)),
33081 vcall_mem));
33082 else
33083 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
33084 }
33085
33086 /* If necessary, drop THIS back to its stack slot. */
33087 if (this_reg && this_reg != this_param)
33088 emit_move_insn (this_param, this_reg);
33089
33090 fnaddr = XEXP (DECL_RTL (function), 0);
33091 if (TARGET_64BIT)
33092 {
33093 if (!flag_pic || targetm.binds_local_p (function)
33094 || cfun->machine->call_abi == MS_ABI)
33095 ;
33096 else
33097 {
33098 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
33099 tmp = gen_rtx_CONST (Pmode, tmp);
33100 fnaddr = gen_rtx_MEM (Pmode, tmp);
33101 }
33102 }
33103 else
33104 {
33105 if (!flag_pic || targetm.binds_local_p (function))
33106 ;
33107 #if TARGET_MACHO
33108 else if (TARGET_MACHO)
33109 {
33110 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
33111 fnaddr = XEXP (fnaddr, 0);
33112 }
33113 #endif /* TARGET_MACHO */
33114 else
33115 {
33116 tmp = gen_rtx_REG (Pmode, CX_REG);
33117 output_set_got (tmp, NULL_RTX);
33118
33119 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
33120 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
33121 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
33122 }
33123 }
33124
33125 /* Our sibling call patterns do not allow memories, because we have no
33126 predicate that can distinguish between frame and non-frame memory.
33127 For our purposes here, we can get away with (ab)using a jump pattern,
33128 because we're going to do no optimization. */
33129 if (MEM_P (fnaddr))
33130 emit_jump_insn (gen_indirect_jump (fnaddr));
33131 else
33132 {
33133 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
33134 fnaddr = legitimize_pic_address (fnaddr,
33135 gen_rtx_REG (Pmode, tmp_regno));
33136
33137 if (!sibcall_insn_operand (fnaddr, word_mode))
33138 {
33139 tmp = gen_rtx_REG (word_mode, tmp_regno);
33140 if (GET_MODE (fnaddr) != word_mode)
33141 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
33142 emit_move_insn (tmp, fnaddr);
33143 fnaddr = tmp;
33144 }
33145
33146 tmp = gen_rtx_MEM (QImode, fnaddr);
33147 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
33148 tmp = emit_call_insn (tmp);
33149 SIBLING_CALL_P (tmp) = 1;
33150 }
33151 emit_barrier ();
33152
33153 /* Emit just enough of rest_of_compilation to get the insns emitted.
33154 Note that use_thunk calls assemble_start_function et al. */
33155 tmp = get_insns ();
33156 insn_locators_alloc ();
33157 shorten_branches (tmp);
33158 final_start_function (tmp, file, 1);
33159 final (tmp, file, 1);
33160 final_end_function ();
33161 }
33162
33163 static void
33164 x86_file_start (void)
33165 {
33166 default_file_start ();
33167 #if TARGET_MACHO
33168 darwin_file_start ();
33169 #endif
33170 if (X86_FILE_START_VERSION_DIRECTIVE)
33171 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
33172 if (X86_FILE_START_FLTUSED)
33173 fputs ("\t.global\t__fltused\n", asm_out_file);
33174 if (ix86_asm_dialect == ASM_INTEL)
33175 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
33176 }
33177
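/* Return the alignment to use for FIELD whose natural alignment is
   COMPUTED bits.  On 32-bit targets without -malign-double, fields of
   DFmode, DCmode, integer and complex-integer modes are capped at
   32-bit alignment.  */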
33178 int
33179 x86_field_alignment (tree field, int computed)
33180 {
33181 enum machine_mode mode;
33182 tree type = TREE_TYPE (field);
33183
33184 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
33185 return computed;
33186 mode = TYPE_MODE (strip_array_types (type));
33187 if (mode == DFmode || mode == DCmode
33188 || GET_MODE_CLASS (mode) == MODE_INT
33189 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
33190 return MIN (32, computed);
33191 return computed;
33192 }
33193
33194 /* Output assembler code to FILE to increment profiler label # LABELNO
33195 for profiling a function entry. */
33196 void
33197 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
33198 {
33199 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
33200 : MCOUNT_NAME);
33201
33202 if (TARGET_64BIT)
33203 {
33204 #ifndef NO_PROFILE_COUNTERS
33205 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
33206 #endif
33207
33208 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
33209 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
33210 else
33211 fprintf (file, "\tcall\t%s\n", mcount_name);
33212 }
33213 else if (flag_pic)
33214 {
33215 #ifndef NO_PROFILE_COUNTERS
33216 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
33217 LPREFIX, labelno);
33218 #endif
33219 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
33220 }
33221 else
33222 {
33223 #ifndef NO_PROFILE_COUNTERS
33224 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
33225 LPREFIX, labelno);
33226 #endif
33227 fprintf (file, "\tcall\t%s\n", mcount_name);
33228 }
33229 }
33230
33231 /* We don't have exact information about the insn sizes, but we may assume
33232 quite safely that we are informed about all 1 byte insns and memory
33233 address sizes. This is enough to eliminate unnecessary padding in
33234 99% of cases. */
33235
33236 static int
33237 min_insn_size (rtx insn)
33238 {
33239 int l = 0, len;
33240
33241 if (!INSN_P (insn) || !active_insn_p (insn))
33242 return 0;
33243
33244 /* Discard alignments we've emitted and jump table data. */
33245 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
33246 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
33247 return 0;
33248 if (JUMP_TABLE_DATA_P (insn))
33249 return 0;
33250
33251 /* Important case - calls are always 5 bytes.
33252 It is common to have many calls in a row. */
33253 if (CALL_P (insn)
33254 && symbolic_reference_mentioned_p (PATTERN (insn))
33255 && !SIBLING_CALL_P (insn))
33256 return 5;
33257 len = get_attr_length (insn);
33258 if (len <= 1)
33259 return 1;
33260
33261 /* For normal instructions we rely on get_attr_length being exact,
33262 with a few exceptions. */
33263 if (!JUMP_P (insn))
33264 {
33265 enum attr_type type = get_attr_type (insn);
33266
33267 switch (type)
33268 {
33269 case TYPE_MULTI:
33270 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
33271 || asm_noperands (PATTERN (insn)) >= 0)
33272 return 0;
33273 break;
33274 case TYPE_OTHER:
33275 case TYPE_FCMP:
33276 break;
33277 default:
33278 /* Otherwise trust get_attr_length. */
33279 return len;
33280 }
33281
33282 l = get_attr_length_address (insn);
33283 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
33284 l = 4;
33285 }
33286 if (l)
33287 return 1+l;
33288 else
33289 return 2;
33290 }
33291
33292 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33293
33294 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
33295 window. */
33296
33297 static void
33298 ix86_avoid_jump_mispredicts (void)
33299 {
33300 rtx insn, start = get_insns ();
33301 int nbytes = 0, njumps = 0;
33302 int isjump = 0;
33303
33304 /* Look for all minimal intervals of instructions containing 4 jumps.
33305 The intervals are bounded by START and INSN. NBYTES is the total
33306 size of the instructions in the interval including INSN and not
33307 including START. When NBYTES is smaller than 16 bytes, it is possible
33308 that the end of START and INSN end up in the same 16-byte window.
33309
33310 The smallest offset at which INSN can start within that window is the
33311 case where START ends at offset 0; the offset of INSN is then
33312 NBYTES - sizeof (INSN). We emit a p2align to the 16-byte window with
33313 maxskip 15 - NBYTES + sizeof (INSN). */
33314 for (insn = start; insn; insn = NEXT_INSN (insn))
33315 {
33316 int min_size;
33317
33318 if (LABEL_P (insn))
33319 {
33320 int align = label_to_alignment (insn);
33321 int max_skip = label_to_max_skip (insn);
33322
33323 if (max_skip > 15)
33324 max_skip = 15;
33325 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
33326 already in the current 16 byte page, because otherwise
33327 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
33328 bytes to reach 16 byte boundary. */
33329 if (align <= 0
33330 || (align <= 3 && max_skip != (1 << align) - 1))
33331 max_skip = 0;
33332 if (dump_file)
33333 fprintf (dump_file, "Label %i with max_skip %i\n",
33334 INSN_UID (insn), max_skip);
33335 if (max_skip)
33336 {
33337 while (nbytes + max_skip >= 16)
33338 {
33339 start = NEXT_INSN (start);
33340 if ((JUMP_P (start)
33341 && GET_CODE (PATTERN (start)) != ADDR_VEC
33342 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33343 || CALL_P (start))
33344 njumps--, isjump = 1;
33345 else
33346 isjump = 0;
33347 nbytes -= min_insn_size (start);
33348 }
33349 }
33350 continue;
33351 }
33352
33353 min_size = min_insn_size (insn);
33354 nbytes += min_size;
33355 if (dump_file)
33356 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
33357 INSN_UID (insn), min_size);
33358 if ((JUMP_P (insn)
33359 && GET_CODE (PATTERN (insn)) != ADDR_VEC
33360 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
33361 || CALL_P (insn))
33362 njumps++;
33363 else
33364 continue;
33365
33366 while (njumps > 3)
33367 {
33368 start = NEXT_INSN (start);
33369 if ((JUMP_P (start)
33370 && GET_CODE (PATTERN (start)) != ADDR_VEC
33371 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
33372 || CALL_P (start))
33373 njumps--, isjump = 1;
33374 else
33375 isjump = 0;
33376 nbytes -= min_insn_size (start);
33377 }
33378 gcc_assert (njumps >= 0);
33379 if (dump_file)
33380 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
33381 INSN_UID (start), INSN_UID (insn), nbytes);
33382
33383 if (njumps == 3 && isjump && nbytes < 16)
33384 {
33385 int padsize = 15 - nbytes + min_insn_size (insn);
33386
33387 if (dump_file)
33388 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
33389 INSN_UID (insn), padsize);
33390 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
33391 }
33392 }
33393 }
33394 #endif
33395
33396 /* AMD Athlon works faster
33397 when RET is not the destination of a conditional jump or directly
33398 preceded by another jump instruction. We avoid the penalty by inserting
33399 a NOP just before the RET instruction in such cases. */
33400 static void
33401 ix86_pad_returns (void)
33402 {
33403 edge e;
33404 edge_iterator ei;
33405
33406 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33407 {
33408 basic_block bb = e->src;
33409 rtx ret = BB_END (bb);
33410 rtx prev;
33411 bool replace = false;
33412
33413 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
33414 || optimize_bb_for_size_p (bb))
33415 continue;
33416 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
33417 if (active_insn_p (prev) || LABEL_P (prev))
33418 break;
33419 if (prev && LABEL_P (prev))
33420 {
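/* The RET follows a label, i.e. it is a jump target: replace it if
   any predecessor reaches this block by a jump rather than by
   falling through.  */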
33421 edge e;
33422 edge_iterator ei;
33423
33424 FOR_EACH_EDGE (e, ei, bb->preds)
33425 if (EDGE_FREQUENCY (e) && e->src->index >= 0
33426 && !(e->flags & EDGE_FALLTHRU))
33427 replace = true;
33428 }
33429 if (!replace)
33430 {
33431 prev = prev_active_insn (ret);
33432 if (prev
33433 && ((JUMP_P (prev) && any_condjump_p (prev))
33434 || CALL_P (prev)))
33435 replace = true;
33436 /* Empty functions get a branch mispredict even when
33437 the jump destination is not visible to us. */
33438 if (!prev && !optimize_function_for_size_p (cfun))
33439 replace = true;
33440 }
33441 if (replace)
33442 {
33443 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
33444 delete_insn (ret);
33445 }
33446 }
33447 }
33448
33449 /* Count the minimum number of instructions in BB. Return 4 if the
33450 number of instructions >= 4. */
33451
33452 static int
33453 ix86_count_insn_bb (basic_block bb)
33454 {
33455 rtx insn;
33456 int insn_count = 0;
33457
33458 /* Count number of instructions in this block. Return 4 if the number
33459 of instructions >= 4. */
33460 FOR_BB_INSNS (bb, insn)
33461 {
33462 /* This only happens in exit blocks. */
33463 if (JUMP_P (insn)
33464 && ANY_RETURN_P (PATTERN (insn)))
33465 break;
33466
33467 if (NONDEBUG_INSN_P (insn)
33468 && GET_CODE (PATTERN (insn)) != USE
33469 && GET_CODE (PATTERN (insn)) != CLOBBER)
33470 {
33471 insn_count++;
33472 if (insn_count >= 4)
33473 return insn_count;
33474 }
33475 }
33476
33477 return insn_count;
33478 }
33479
33480
33481 /* Count the minimum number of instructions in a code path through BB.
33482 Return 4 if the number of instructions >= 4. */
33483
33484 static int
33485 ix86_count_insn (basic_block bb)
33486 {
33487 edge e;
33488 edge_iterator ei;
33489 int min_prev_count;
33490
33491 /* Only bother counting instructions along paths with no
33492 more than 2 basic blocks between entry and exit. Given
33493 that BB has an edge to exit, determine if a predecessor
33494 of BB has an edge from entry. If so, compute the number
33495 of instructions in the predecessor block. If there
33496 happen to be multiple such blocks, compute the minimum. */
33497 min_prev_count = 4;
33498 FOR_EACH_EDGE (e, ei, bb->preds)
33499 {
33500 edge prev_e;
33501 edge_iterator prev_ei;
33502
33503 if (e->src == ENTRY_BLOCK_PTR)
33504 {
33505 min_prev_count = 0;
33506 break;
33507 }
33508 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33509 {
33510 if (prev_e->src == ENTRY_BLOCK_PTR)
33511 {
33512 int count = ix86_count_insn_bb (e->src);
33513 if (count < min_prev_count)
33514 min_prev_count = count;
33515 break;
33516 }
33517 }
33518 }
33519
33520 if (min_prev_count < 4)
33521 min_prev_count += ix86_count_insn_bb (bb);
33522
33523 return min_prev_count;
33524 }
33525
33526 /* Pad short functions to 4 instructions. */
33527
33528 static void
33529 ix86_pad_short_function (void)
33530 {
33531 edge e;
33532 edge_iterator ei;
33533
33534 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33535 {
33536 rtx ret = BB_END (e->src);
33537 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33538 {
33539 int insn_count = ix86_count_insn (e->src);
33540
33541 /* Pad short function. */
33542 if (insn_count < 4)
33543 {
33544 rtx insn = ret;
33545
33546 /* Find epilogue. */
33547 while (insn
33548 && (!NOTE_P (insn)
33549 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33550 insn = PREV_INSN (insn);
33551
33552 if (!insn)
33553 insn = ret;
33554
33555 /* Two NOPs count as one instruction. */
33556 insn_count = 2 * (4 - insn_count);
33557 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33558 }
33559 }
33560 }
33561 }
33562
33563 /* Implement machine specific optimizations. We implement padding of returns
33564 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
33565 static void
33566 ix86_reorg (void)
33567 {
33568 /* We are freeing block_for_insn in the toplev to keep compatibility
33569 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33570 compute_bb_for_insn ();
33571
33572 /* Run the vzeroupper optimization if needed. */
33573 if (TARGET_VZEROUPPER)
33574 move_or_delete_vzeroupper ();
33575
33576 if (optimize && optimize_function_for_speed_p (cfun))
33577 {
33578 if (TARGET_PAD_SHORT_FUNCTION)
33579 ix86_pad_short_function ();
33580 else if (TARGET_PAD_RETURNS)
33581 ix86_pad_returns ();
33582 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33583 if (TARGET_FOUR_JUMP_LIMIT)
33584 ix86_avoid_jump_mispredicts ();
33585 #endif
33586 }
33587 }
33588
33589 /* Return nonzero when a QImode register that must be represented via a REX prefix
33590 is used. */
33591 bool
33592 x86_extended_QIreg_mentioned_p (rtx insn)
33593 {
33594 int i;
33595 extract_insn_cached (insn);
33596 for (i = 0; i < recog_data.n_operands; i++)
33597 if (REG_P (recog_data.operand[i])
33598 && REGNO (recog_data.operand[i]) > BX_REG)
33599 return true;
33600 return false;
33601 }
33602
33603 /* Return nonzero when P points to a register encoded via a REX prefix.
33604 Called via for_each_rtx. */
33605 static int
33606 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33607 {
33608 unsigned int regno;
33609 if (!REG_P (*p))
33610 return 0;
33611 regno = REGNO (*p);
33612 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33613 }
33614
33615 /* Return true when INSN mentions register that must be encoded using REX
33616 prefix. */
33617 bool
33618 x86_extended_reg_mentioned_p (rtx insn)
33619 {
33620 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33621 extended_reg_mentioned_1, NULL);
33622 }
33623
33624 /* If profitable, negate (without causing overflow) integer constant
33625 of mode MODE at location LOC. Return true in this case. */
33626 bool
33627 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33628 {
33629 HOST_WIDE_INT val;
33630
33631 if (!CONST_INT_P (*loc))
33632 return false;
33633
33634 switch (mode)
33635 {
33636 case DImode:
33637 /* DImode x86_64 constants must fit in 32 bits. */
33638 gcc_assert (x86_64_immediate_operand (*loc, mode));
33639
33640 mode = SImode;
33641 break;
33642
33643 case SImode:
33644 case HImode:
33645 case QImode:
33646 break;
33647
33648 default:
33649 gcc_unreachable ();
33650 }
33651
33652 /* Avoid overflows. */
33653 if (mode_signbit_p (mode, *loc))
33654 return false;
33655
33656 val = INTVAL (*loc);
33657
33658 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
33659 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
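/* For example, 128 does not fit into a signed 8-bit immediate while
   -128 does, so `addl $128, %eax' is better emitted as
   `subl $-128, %eax'.  */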
33660 if ((val < 0 && val != -128)
33661 || val == 128)
33662 {
33663 *loc = GEN_INT (-val);
33664 return true;
33665 }
33666
33667 return false;
33668 }
33669
33670 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33671 optabs would emit if we didn't have TFmode patterns. */
33672
33673 void
33674 x86_emit_floatuns (rtx operands[2])
33675 {
33676 rtx neglab, donelab, i0, i1, f0, in, out;
33677 enum machine_mode mode, inmode;
33678
33679 inmode = GET_MODE (operands[1]);
33680 gcc_assert (inmode == SImode || inmode == DImode);
33681
33682 out = operands[0];
33683 in = force_reg (inmode, operands[1]);
33684 mode = GET_MODE (out);
33685 neglab = gen_label_rtx ();
33686 donelab = gen_label_rtx ();
33687 f0 = gen_reg_rtx (mode);
33688
33689 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33690
33691 expand_float (out, in, 0);
33692
33693 emit_jump_insn (gen_jump (donelab));
33694 emit_barrier ();
33695
33696 emit_label (neglab);
33697
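/* The value has the sign bit set: compute (IN >> 1) | (IN & 1) so the
   halved value stays odd whenever the low bit was set (preserving the
   correct rounding), convert that to FP, and double the result.  */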
33698 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33699 1, OPTAB_DIRECT);
33700 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33701 1, OPTAB_DIRECT);
33702 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33703
33704 expand_float (f0, i0, 0);
33705
33706 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33707
33708 emit_label (donelab);
33709 }
33710 \f
33711 /* AVX2 does support 32-byte integer vector operations,
33712 thus the longest vector we are faced with is V32QImode. */
33713 #define MAX_VECT_LEN 32
33714
33715 struct expand_vec_perm_d
33716 {
33717 rtx target, op0, op1;
33718 unsigned char perm[MAX_VECT_LEN];
33719 enum machine_mode vmode;
33720 unsigned char nelt;
33721 bool one_operand_p;
33722 bool testing_p;
33723 };
33724
33725 static bool canonicalize_perm (struct expand_vec_perm_d *d);
33726 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33727 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33728
33729 /* Get a vector mode of the same size as the original but with elements
33730 twice as wide. This is only guaranteed to apply to integral vectors. */
33731
33732 static inline enum machine_mode
33733 get_mode_wider_vector (enum machine_mode o)
33734 {
33735 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33736 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33737 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33738 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33739 return n;
33740 }
33741
33742 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33743 with all elements equal to VAR. Return true if successful. */
33744
33745 static bool
33746 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33747 rtx target, rtx val)
33748 {
33749 bool ok;
33750
33751 switch (mode)
33752 {
33753 case V2SImode:
33754 case V2SFmode:
33755 if (!mmx_ok)
33756 return false;
33757 /* FALLTHRU */
33758
33759 case V4DFmode:
33760 case V4DImode:
33761 case V8SFmode:
33762 case V8SImode:
33763 case V2DFmode:
33764 case V2DImode:
33765 case V4SFmode:
33766 case V4SImode:
33767 {
33768 rtx insn, dup;
33769
33770 /* First attempt to recognize VAL as-is. */
33771 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33772 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33773 if (recog_memoized (insn) < 0)
33774 {
33775 rtx seq;
33776 /* If that fails, force VAL into a register. */
33777
33778 start_sequence ();
33779 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33780 seq = get_insns ();
33781 end_sequence ();
33782 if (seq)
33783 emit_insn_before (seq, insn);
33784
33785 ok = recog_memoized (insn) >= 0;
33786 gcc_assert (ok);
33787 }
33788 }
33789 return true;
33790
33791 case V4HImode:
33792 if (!mmx_ok)
33793 return false;
33794 if (TARGET_SSE || TARGET_3DNOW_A)
33795 {
33796 rtx x;
33797
33798 val = gen_lowpart (SImode, val);
33799 x = gen_rtx_TRUNCATE (HImode, val);
33800 x = gen_rtx_VEC_DUPLICATE (mode, x);
33801 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33802 return true;
33803 }
33804 goto widen;
33805
33806 case V8QImode:
33807 if (!mmx_ok)
33808 return false;
33809 goto widen;
33810
33811 case V8HImode:
33812 if (TARGET_SSE2)
33813 {
33814 struct expand_vec_perm_d dperm;
33815 rtx tmp1, tmp2;
33816
33817 permute:
33818 memset (&dperm, 0, sizeof (dperm));
33819 dperm.target = target;
33820 dperm.vmode = mode;
33821 dperm.nelt = GET_MODE_NUNITS (mode);
33822 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33823 dperm.one_operand_p = true;
33824
33825 /* Extend to SImode using a paradoxical SUBREG. */
33826 tmp1 = gen_reg_rtx (SImode);
33827 emit_move_insn (tmp1, gen_lowpart (SImode, val));
33828
33829 /* Insert the SImode value as low element of a V4SImode vector. */
33830 tmp2 = gen_lowpart (V4SImode, dperm.op0);
33831 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33832
33833 ok = (expand_vec_perm_1 (&dperm)
33834 || expand_vec_perm_broadcast_1 (&dperm));
33835 gcc_assert (ok);
33836 return ok;
33837 }
33838 goto widen;
33839
33840 case V16QImode:
33841 if (TARGET_SSE2)
33842 goto permute;
33843 goto widen;
33844
33845 widen:
33846 /* Replicate the value once into the next wider mode and recurse. */
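/* E.g. a V8QImode broadcast of VAL builds the HImode value
   (VAL << 8) | VAL and broadcasts that in V4HImode.  */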
33847 {
33848 enum machine_mode smode, wsmode, wvmode;
33849 rtx x;
33850
33851 smode = GET_MODE_INNER (mode);
33852 wvmode = get_mode_wider_vector (mode);
33853 wsmode = GET_MODE_INNER (wvmode);
33854
33855 val = convert_modes (wsmode, smode, val, true);
33856 x = expand_simple_binop (wsmode, ASHIFT, val,
33857 GEN_INT (GET_MODE_BITSIZE (smode)),
33858 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33859 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33860
33861 x = gen_lowpart (wvmode, target);
33862 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33863 gcc_assert (ok);
33864 return ok;
33865 }
33866
33867 case V16HImode:
33868 case V32QImode:
33869 {
33870 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33871 rtx x = gen_reg_rtx (hvmode);
33872
33873 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33874 gcc_assert (ok);
33875
33876 x = gen_rtx_VEC_CONCAT (mode, x, x);
33877 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33878 }
33879 return true;
33880
33881 default:
33882 return false;
33883 }
33884 }
33885
33886 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33887 whose ONE_VAR element is VAR, and other elements are zero. Return true
33888 if successful. */
33889
33890 static bool
33891 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33892 rtx target, rtx var, int one_var)
33893 {
33894 enum machine_mode vsimode;
33895 rtx new_target;
33896 rtx x, tmp;
33897 bool use_vector_set = false;
33898
33899 switch (mode)
33900 {
33901 case V2DImode:
33902 /* For SSE4.1, we normally use vector set. But if the second
33903 element is zero and inter-unit moves are OK, we use movq
33904 instead. */
33905 use_vector_set = (TARGET_64BIT
33906 && TARGET_SSE4_1
33907 && !(TARGET_INTER_UNIT_MOVES
33908 && one_var == 0));
33909 break;
33910 case V16QImode:
33911 case V4SImode:
33912 case V4SFmode:
33913 use_vector_set = TARGET_SSE4_1;
33914 break;
33915 case V8HImode:
33916 use_vector_set = TARGET_SSE2;
33917 break;
33918 case V4HImode:
33919 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33920 break;
33921 case V32QImode:
33922 case V16HImode:
33923 case V8SImode:
33924 case V8SFmode:
33925 case V4DFmode:
33926 use_vector_set = TARGET_AVX;
33927 break;
33928 case V4DImode:
33929 /* Use ix86_expand_vector_set in 64bit mode only. */
33930 use_vector_set = TARGET_AVX && TARGET_64BIT;
33931 break;
33932 default:
33933 break;
33934 }
33935
33936 if (use_vector_set)
33937 {
33938 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33939 var = force_reg (GET_MODE_INNER (mode), var);
33940 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33941 return true;
33942 }
33943
33944 switch (mode)
33945 {
33946 case V2SFmode:
33947 case V2SImode:
33948 if (!mmx_ok)
33949 return false;
33950 /* FALLTHRU */
33951
33952 case V2DFmode:
33953 case V2DImode:
33954 if (one_var != 0)
33955 return false;
33956 var = force_reg (GET_MODE_INNER (mode), var);
33957 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33958 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33959 return true;
33960
33961 case V4SFmode:
33962 case V4SImode:
33963 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33964 new_target = gen_reg_rtx (mode);
33965 else
33966 new_target = target;
33967 var = force_reg (GET_MODE_INNER (mode), var);
33968 x = gen_rtx_VEC_DUPLICATE (mode, var);
33969 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33970 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33971 if (one_var != 0)
33972 {
33973 /* We need to shuffle the value to the correct position, so
33974 create a new pseudo to store the intermediate result. */
33975
33976 /* With SSE2, we can use the integer shuffle insns. */
33977 if (mode != V4SFmode && TARGET_SSE2)
33978 {
33979 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33980 const1_rtx,
33981 GEN_INT (one_var == 1 ? 0 : 1),
33982 GEN_INT (one_var == 2 ? 0 : 1),
33983 GEN_INT (one_var == 3 ? 0 : 1)));
33984 if (target != new_target)
33985 emit_move_insn (target, new_target);
33986 return true;
33987 }
33988
33989 /* Otherwise convert the intermediate result to V4SFmode and
33990 use the SSE1 shuffle instructions. */
33991 if (mode != V4SFmode)
33992 {
33993 tmp = gen_reg_rtx (V4SFmode);
33994 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33995 }
33996 else
33997 tmp = new_target;
33998
33999 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
34000 const1_rtx,
34001 GEN_INT (one_var == 1 ? 0 : 1),
34002 GEN_INT (one_var == 2 ? 0+4 : 1+4),
34003 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
34004
34005 if (mode != V4SFmode)
34006 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
34007 else if (tmp != target)
34008 emit_move_insn (target, tmp);
34009 }
34010 else if (target != new_target)
34011 emit_move_insn (target, new_target);
34012 return true;
34013
34014 case V8HImode:
34015 case V16QImode:
34016 vsimode = V4SImode;
34017 goto widen;
34018 case V4HImode:
34019 case V8QImode:
34020 if (!mmx_ok)
34021 return false;
34022 vsimode = V2SImode;
34023 goto widen;
34024 widen:
34025 if (one_var != 0)
34026 return false;
34027
34028 /* Zero extend the variable element to SImode and recurse. */
34029 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
34030
34031 x = gen_reg_rtx (vsimode);
34032 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
34033 var, one_var))
34034 gcc_unreachable ();
34035
34036 emit_move_insn (target, gen_lowpart (mode, x));
34037 return true;
34038
34039 default:
34040 return false;
34041 }
34042 }
34043
34044 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34045 consisting of the values in VALS. It is known that all elements
34046 except ONE_VAR are constants. Return true if successful. */
34047
34048 static bool
34049 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
34050 rtx target, rtx vals, int one_var)
34051 {
34052 rtx var = XVECEXP (vals, 0, one_var);
34053 enum machine_mode wmode;
34054 rtx const_vec, x;
34055
34056 const_vec = copy_rtx (vals);
34057 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
34058 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
34059
34060 switch (mode)
34061 {
34062 case V2DFmode:
34063 case V2DImode:
34064 case V2SFmode:
34065 case V2SImode:
34066 /* For the two element vectors, it's just as easy to use
34067 the general case. */
34068 return false;
34069
34070 case V4DImode:
34071 /* Use ix86_expand_vector_set in 64bit mode only. */
34072 if (!TARGET_64BIT)
34073 return false;
34074 case V4DFmode:
34075 case V8SFmode:
34076 case V8SImode:
34077 case V16HImode:
34078 case V32QImode:
34079 case V4SFmode:
34080 case V4SImode:
34081 case V8HImode:
34082 case V4HImode:
34083 break;
34084
34085 case V16QImode:
34086 if (TARGET_SSE4_1)
34087 break;
34088 wmode = V8HImode;
34089 goto widen;
34090 case V8QImode:
34091 wmode = V4HImode;
34092 goto widen;
34093 widen:
34094 /* There's no way to set one QImode entry easily. Combine
34095 the variable value with its adjacent constant value, and
34096 promote to an HImode set. */
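/* E.g. to set byte 3 of the vector, VAR is shifted left by 8, combined
   with constant byte 2, and the result is stored as HImode element 1.  */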
34097 x = XVECEXP (vals, 0, one_var ^ 1);
34098 if (one_var & 1)
34099 {
34100 var = convert_modes (HImode, QImode, var, true);
34101 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
34102 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34103 x = GEN_INT (INTVAL (x) & 0xff);
34104 }
34105 else
34106 {
34107 var = convert_modes (HImode, QImode, var, true);
34108 x = gen_int_mode (INTVAL (x) << 8, HImode);
34109 }
34110 if (x != const0_rtx)
34111 var = expand_simple_binop (HImode, IOR, var, x, var,
34112 1, OPTAB_LIB_WIDEN);
34113
34114 x = gen_reg_rtx (wmode);
34115 emit_move_insn (x, gen_lowpart (wmode, const_vec));
34116 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
34117
34118 emit_move_insn (target, gen_lowpart (mode, x));
34119 return true;
34120
34121 default:
34122 return false;
34123 }
34124
34125 emit_move_insn (target, const_vec);
34126 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34127 return true;
34128 }
34129
34130 /* A subroutine of ix86_expand_vector_init_general. Use vector
34131 concatenate to handle the most general case: all values variable,
34132 and none identical. */
34133
34134 static void
34135 ix86_expand_vector_init_concat (enum machine_mode mode,
34136 rtx target, rtx *ops, int n)
34137 {
34138 enum machine_mode cmode, hmode = VOIDmode;
34139 rtx first[8], second[4];
34140 rtvec v;
34141 int i, j;
34142
34143 switch (n)
34144 {
34145 case 2:
34146 switch (mode)
34147 {
34148 case V8SImode:
34149 cmode = V4SImode;
34150 break;
34151 case V8SFmode:
34152 cmode = V4SFmode;
34153 break;
34154 case V4DImode:
34155 cmode = V2DImode;
34156 break;
34157 case V4DFmode:
34158 cmode = V2DFmode;
34159 break;
34160 case V4SImode:
34161 cmode = V2SImode;
34162 break;
34163 case V4SFmode:
34164 cmode = V2SFmode;
34165 break;
34166 case V2DImode:
34167 cmode = DImode;
34168 break;
34169 case V2SImode:
34170 cmode = SImode;
34171 break;
34172 case V2DFmode:
34173 cmode = DFmode;
34174 break;
34175 case V2SFmode:
34176 cmode = SFmode;
34177 break;
34178 default:
34179 gcc_unreachable ();
34180 }
34181
34182 if (!register_operand (ops[1], cmode))
34183 ops[1] = force_reg (cmode, ops[1]);
34184 if (!register_operand (ops[0], cmode))
34185 ops[0] = force_reg (cmode, ops[0]);
34186 emit_insn (gen_rtx_SET (VOIDmode, target,
34187 gen_rtx_VEC_CONCAT (mode, ops[0],
34188 ops[1])));
34189 break;
34190
34191 case 4:
34192 switch (mode)
34193 {
34194 case V4DImode:
34195 cmode = V2DImode;
34196 break;
34197 case V4DFmode:
34198 cmode = V2DFmode;
34199 break;
34200 case V4SImode:
34201 cmode = V2SImode;
34202 break;
34203 case V4SFmode:
34204 cmode = V2SFmode;
34205 break;
34206 default:
34207 gcc_unreachable ();
34208 }
34209 goto half;
34210
34211 case 8:
34212 switch (mode)
34213 {
34214 case V8SImode:
34215 cmode = V2SImode;
34216 hmode = V4SImode;
34217 break;
34218 case V8SFmode:
34219 cmode = V2SFmode;
34220 hmode = V4SFmode;
34221 break;
34222 default:
34223 gcc_unreachable ();
34224 }
34225 goto half;
34226
34227 half:
34228 /* FIXME: We process inputs backward to help RA. PR 36222. */
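/* E.g. for an 8-element vector the inputs are first concatenated
   pairwise into four 2-element vectors, those into two half-width
   vectors, and finally into the full-width vector.  */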
34229 i = n - 1;
34230 j = (n >> 1) - 1;
34231 for (; i > 0; i -= 2, j--)
34232 {
34233 first[j] = gen_reg_rtx (cmode);
34234 v = gen_rtvec (2, ops[i - 1], ops[i]);
34235 ix86_expand_vector_init (false, first[j],
34236 gen_rtx_PARALLEL (cmode, v));
34237 }
34238
34239 n >>= 1;
34240 if (n > 2)
34241 {
34242 gcc_assert (hmode != VOIDmode);
34243 for (i = j = 0; i < n; i += 2, j++)
34244 {
34245 second[j] = gen_reg_rtx (hmode);
34246 ix86_expand_vector_init_concat (hmode, second [j],
34247 &first [i], 2);
34248 }
34249 n >>= 1;
34250 ix86_expand_vector_init_concat (mode, target, second, n);
34251 }
34252 else
34253 ix86_expand_vector_init_concat (mode, target, first, n);
34254 break;
34255
34256 default:
34257 gcc_unreachable ();
34258 }
34259 }
34260
34261 /* A subroutine of ix86_expand_vector_init_general. Use vector
34262 interleave to handle the most general case: all values variable,
34263 and none identical. */
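/* Pairs of scalar elements are first packed into FIRST_IMODE lanes,
   then merged pairwise with interleave-low operations of increasing
   width until a single full-width vector remains.  */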
34264
34265 static void
34266 ix86_expand_vector_init_interleave (enum machine_mode mode,
34267 rtx target, rtx *ops, int n)
34268 {
34269 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
34270 int i, j;
34271 rtx op0, op1;
34272 rtx (*gen_load_even) (rtx, rtx, rtx);
34273 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
34274 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
34275
34276 switch (mode)
34277 {
34278 case V8HImode:
34279 gen_load_even = gen_vec_setv8hi;
34280 gen_interleave_first_low = gen_vec_interleave_lowv4si;
34281 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34282 inner_mode = HImode;
34283 first_imode = V4SImode;
34284 second_imode = V2DImode;
34285 third_imode = VOIDmode;
34286 break;
34287 case V16QImode:
34288 gen_load_even = gen_vec_setv16qi;
34289 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
34290 gen_interleave_second_low = gen_vec_interleave_lowv4si;
34291 inner_mode = QImode;
34292 first_imode = V8HImode;
34293 second_imode = V4SImode;
34294 third_imode = V2DImode;
34295 break;
34296 default:
34297 gcc_unreachable ();
34298 }
34299
34300 for (i = 0; i < n; i++)
34301 {
34302 /* Extend the odd element to SImode using a paradoxical SUBREG. */
34303 op0 = gen_reg_rtx (SImode);
34304 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
34305
34306 /* Insert the SImode value as low element of V4SImode vector. */
34307 op1 = gen_reg_rtx (V4SImode);
34308 op0 = gen_rtx_VEC_MERGE (V4SImode,
34309 gen_rtx_VEC_DUPLICATE (V4SImode,
34310 op0),
34311 CONST0_RTX (V4SImode),
34312 const1_rtx);
34313 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
34314
34315 /* Cast the V4SImode vector back to a vector in the original mode. */
34316 op0 = gen_reg_rtx (mode);
34317 emit_move_insn (op0, gen_lowpart (mode, op1));
34318
34319 /* Load even elements into the second position. */
34320 emit_insn (gen_load_even (op0,
34321 force_reg (inner_mode,
34322 ops [i + i + 1]),
34323 const1_rtx));
34324
34325 /* Cast vector to FIRST_IMODE vector. */
34326 ops[i] = gen_reg_rtx (first_imode);
34327 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
34328 }
34329
34330 /* Interleave low FIRST_IMODE vectors. */
34331 for (i = j = 0; i < n; i += 2, j++)
34332 {
34333 op0 = gen_reg_rtx (first_imode);
34334 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
34335
34336 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
34337 ops[j] = gen_reg_rtx (second_imode);
34338 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
34339 }
34340
34341 /* Interleave low SECOND_IMODE vectors. */
34342 switch (second_imode)
34343 {
34344 case V4SImode:
34345 for (i = j = 0; i < n / 2; i += 2, j++)
34346 {
34347 op0 = gen_reg_rtx (second_imode);
34348 emit_insn (gen_interleave_second_low (op0, ops[i],
34349 ops[i + 1]));
34350
34351 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
34352 vector. */
34353 ops[j] = gen_reg_rtx (third_imode);
34354 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
34355 }
34356 second_imode = V2DImode;
34357 gen_interleave_second_low = gen_vec_interleave_lowv2di;
34358 /* FALLTHRU */
34359
34360 case V2DImode:
34361 op0 = gen_reg_rtx (second_imode);
34362 emit_insn (gen_interleave_second_low (op0, ops[0],
34363 ops[1]));
34364
34365 /* Cast the SECOND_IMODE vector back to a vector in the original
34366 mode. */
34367 emit_insn (gen_rtx_SET (VOIDmode, target,
34368 gen_lowpart (mode, op0)));
34369 break;
34370
34371 default:
34372 gcc_unreachable ();
34373 }
34374 }
34375
34376 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
34377 all values variable, and none identical. */
34378
34379 static void
34380 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
34381 rtx target, rtx vals)
34382 {
34383 rtx ops[32], op0, op1;
34384 enum machine_mode half_mode = VOIDmode;
34385 int n, i;
34386
34387 switch (mode)
34388 {
34389 case V2SFmode:
34390 case V2SImode:
34391 if (!mmx_ok && !TARGET_SSE)
34392 break;
34393 /* FALLTHRU */
34394
34395 case V8SFmode:
34396 case V8SImode:
34397 case V4DFmode:
34398 case V4DImode:
34399 case V4SFmode:
34400 case V4SImode:
34401 case V2DFmode:
34402 case V2DImode:
34403 n = GET_MODE_NUNITS (mode);
34404 for (i = 0; i < n; i++)
34405 ops[i] = XVECEXP (vals, 0, i);
34406 ix86_expand_vector_init_concat (mode, target, ops, n);
34407 return;
34408
34409 case V32QImode:
34410 half_mode = V16QImode;
34411 goto half;
34412
34413 case V16HImode:
34414 half_mode = V8HImode;
34415 goto half;
34416
34417 half:
34418 n = GET_MODE_NUNITS (mode);
34419 for (i = 0; i < n; i++)
34420 ops[i] = XVECEXP (vals, 0, i);
34421 op0 = gen_reg_rtx (half_mode);
34422 op1 = gen_reg_rtx (half_mode);
34423 ix86_expand_vector_init_interleave (half_mode, op0, ops,
34424 n >> 2);
34425 ix86_expand_vector_init_interleave (half_mode, op1,
34426 &ops [n >> 1], n >> 2);
34427 emit_insn (gen_rtx_SET (VOIDmode, target,
34428 gen_rtx_VEC_CONCAT (mode, op0, op1)));
34429 return;
34430
34431 case V16QImode:
34432 if (!TARGET_SSE4_1)
34433 break;
34434 /* FALLTHRU */
34435
34436 case V8HImode:
34437 if (!TARGET_SSE2)
34438 break;
34439
34440 /* Don't use ix86_expand_vector_init_interleave if we can't
34441 move from GPR to SSE register directly. */
34442 if (!TARGET_INTER_UNIT_MOVES)
34443 break;
34444
34445 n = GET_MODE_NUNITS (mode);
34446 for (i = 0; i < n; i++)
34447 ops[i] = XVECEXP (vals, 0, i);
34448 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
34449 return;
34450
34451 case V4HImode:
34452 case V8QImode:
34453 break;
34454
34455 default:
34456 gcc_unreachable ();
34457 }
34458
34459 {
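/* Fallback for the remaining MMX/SSE1 cases: pack the vector elements
   into word_mode integers and assemble the vector from those words
   with integer moves.  */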
34460 int i, j, n_elts, n_words, n_elt_per_word;
34461 enum machine_mode inner_mode;
34462 rtx words[4], shift;
34463
34464 inner_mode = GET_MODE_INNER (mode);
34465 n_elts = GET_MODE_NUNITS (mode);
34466 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34467 n_elt_per_word = n_elts / n_words;
34468 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34469
34470 for (i = 0; i < n_words; ++i)
34471 {
34472 rtx word = NULL_RTX;
34473
34474 for (j = 0; j < n_elt_per_word; ++j)
34475 {
34476 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34477 elt = convert_modes (word_mode, inner_mode, elt, true);
34478
34479 if (j == 0)
34480 word = elt;
34481 else
34482 {
34483 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34484 word, 1, OPTAB_LIB_WIDEN);
34485 word = expand_simple_binop (word_mode, IOR, word, elt,
34486 word, 1, OPTAB_LIB_WIDEN);
34487 }
34488 }
34489
34490 words[i] = word;
34491 }
34492
34493 if (n_words == 1)
34494 emit_move_insn (target, gen_lowpart (mode, words[0]));
34495 else if (n_words == 2)
34496 {
34497 rtx tmp = gen_reg_rtx (mode);
34498 emit_clobber (tmp);
34499 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34500 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34501 emit_move_insn (target, tmp);
34502 }
34503 else if (n_words == 4)
34504 {
34505 rtx tmp = gen_reg_rtx (V4SImode);
34506 gcc_assert (word_mode == SImode);
34507 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34508 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34509 emit_move_insn (target, gen_lowpart (mode, tmp));
34510 }
34511 else
34512 gcc_unreachable ();
34513 }
34514 }
34515
34516 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34517 instructions unless MMX_OK is true. */
34518
34519 void
34520 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34521 {
34522 enum machine_mode mode = GET_MODE (target);
34523 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34524 int n_elts = GET_MODE_NUNITS (mode);
34525 int n_var = 0, one_var = -1;
34526 bool all_same = true, all_const_zero = true;
34527 int i;
34528 rtx x;
34529
34530 for (i = 0; i < n_elts; ++i)
34531 {
34532 x = XVECEXP (vals, 0, i);
34533 if (!(CONST_INT_P (x)
34534 || GET_CODE (x) == CONST_DOUBLE
34535 || GET_CODE (x) == CONST_FIXED))
34536 n_var++, one_var = i;
34537 else if (x != CONST0_RTX (inner_mode))
34538 all_const_zero = false;
34539 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34540 all_same = false;
34541 }
34542
34543 /* Constants are best loaded from the constant pool. */
34544 if (n_var == 0)
34545 {
34546 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34547 return;
34548 }
34549
34550 /* If all values are identical, broadcast the value. */
34551 if (all_same
34552 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34553 XVECEXP (vals, 0, 0)))
34554 return;
34555
34556 /* Values where only one field is non-constant are best loaded from
34557 the pool and overwritten via move later. */
34558 if (n_var == 1)
34559 {
34560 if (all_const_zero
34561 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34562 XVECEXP (vals, 0, one_var),
34563 one_var))
34564 return;
34565
34566 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34567 return;
34568 }
34569
34570 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34571 }
34572
34573 void
34574 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34575 {
34576 enum machine_mode mode = GET_MODE (target);
34577 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34578 enum machine_mode half_mode;
34579 bool use_vec_merge = false;
34580 rtx tmp;
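/* For 256-bit AVX modes the element is set by extracting the containing
   128-bit half, updating it, and inserting it back; these tables are
   indexed by the J and I values computed in the `half' case below.  */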
34581 static rtx (*gen_extract[6][2]) (rtx, rtx)
34582 = {
34583 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34584 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34585 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34586 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34587 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34588 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34589 };
34590 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34591 = {
34592 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34593 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34594 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34595 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34596 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34597 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34598 };
34599 int i, j, n;
34600
34601 switch (mode)
34602 {
34603 case V2SFmode:
34604 case V2SImode:
34605 if (mmx_ok)
34606 {
34607 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34608 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34609 if (elt == 0)
34610 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34611 else
34612 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34613 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34614 return;
34615 }
34616 break;
34617
34618 case V2DImode:
34619 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34620 if (use_vec_merge)
34621 break;
34622
34623 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34624 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34625 if (elt == 0)
34626 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34627 else
34628 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34629 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34630 return;
34631
34632 case V2DFmode:
34633 {
34634 rtx op0, op1;
34635
34636 /* For the two element vectors, we implement a VEC_CONCAT with
34637 the extraction of the other element. */
34638
34639 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34640 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34641
34642 if (elt == 0)
34643 op0 = val, op1 = tmp;
34644 else
34645 op0 = tmp, op1 = val;
34646
34647 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34648 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34649 }
34650 return;
34651
34652 case V4SFmode:
34653 use_vec_merge = TARGET_SSE4_1;
34654 if (use_vec_merge)
34655 break;
34656
34657 switch (elt)
34658 {
34659 case 0:
34660 use_vec_merge = true;
34661 break;
34662
34663 case 1:
34664 /* tmp = target = A B C D */
34665 tmp = copy_to_reg (target);
34666 /* target = A A B B */
34667 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34668 /* target = X A B B */
34669 ix86_expand_vector_set (false, target, val, 0);
34670 /* target = A X C D */
34671 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34672 const1_rtx, const0_rtx,
34673 GEN_INT (2+4), GEN_INT (3+4)));
34674 return;
34675
34676 case 2:
34677 /* tmp = target = A B C D */
34678 tmp = copy_to_reg (target);
34679 /* tmp = X B C D */
34680 ix86_expand_vector_set (false, tmp, val, 0);
34681 /* target = A B X D */
34682 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34683 const0_rtx, const1_rtx,
34684 GEN_INT (0+4), GEN_INT (3+4)));
34685 return;
34686
34687 case 3:
34688 /* tmp = target = A B C D */
34689 tmp = copy_to_reg (target);
34690 /* tmp = X B C D */
34691 ix86_expand_vector_set (false, tmp, val, 0);
34692 /* target = A B C X */
34693 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34694 const0_rtx, const1_rtx,
34695 GEN_INT (2+4), GEN_INT (0+4)));
34696 return;
34697
34698 default:
34699 gcc_unreachable ();
34700 }
34701 break;
34702
34703 case V4SImode:
34704 use_vec_merge = TARGET_SSE4_1;
34705 if (use_vec_merge)
34706 break;
34707
34708 /* Element 0 handled by vec_merge below. */
34709 if (elt == 0)
34710 {
34711 use_vec_merge = true;
34712 break;
34713 }
34714
34715 if (TARGET_SSE2)
34716 {
34717 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34718 store into element 0, then shuffle them back. */
34719
34720 rtx order[4];
34721
34722 order[0] = GEN_INT (elt);
34723 order[1] = const1_rtx;
34724 order[2] = const2_rtx;
34725 order[3] = GEN_INT (3);
34726 order[elt] = const0_rtx;
34727
34728 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34729 order[1], order[2], order[3]));
34730
34731 ix86_expand_vector_set (false, target, val, 0);
34732
34733 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34734 order[1], order[2], order[3]));
34735 }
34736 else
34737 {
34738 /* For SSE1, we have to reuse the V4SF code. */
34739 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34740 gen_lowpart (SFmode, val), elt);
34741 }
34742 return;
34743
34744 case V8HImode:
34745 use_vec_merge = TARGET_SSE2;
34746 break;
34747 case V4HImode:
34748 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34749 break;
34750
34751 case V16QImode:
34752 use_vec_merge = TARGET_SSE4_1;
34753 break;
34754
34755 case V8QImode:
34756 break;
34757
34758 case V32QImode:
34759 half_mode = V16QImode;
34760 j = 0;
34761 n = 16;
34762 goto half;
34763
34764 case V16HImode:
34765 half_mode = V8HImode;
34766 j = 1;
34767 n = 8;
34768 goto half;
34769
34770 case V8SImode:
34771 half_mode = V4SImode;
34772 j = 2;
34773 n = 4;
34774 goto half;
34775
34776 case V4DImode:
34777 half_mode = V2DImode;
34778 j = 3;
34779 n = 2;
34780 goto half;
34781
34782 case V8SFmode:
34783 half_mode = V4SFmode;
34784 j = 4;
34785 n = 4;
34786 goto half;
34787
34788 case V4DFmode:
34789 half_mode = V2DFmode;
34790 j = 5;
34791 n = 2;
34792 goto half;
34793
34794 half:
34795 /* Compute offset. */
34796 i = elt / n;
34797 elt %= n;
34798
34799 gcc_assert (i <= 1);
34800
34801 /* Extract the half. */
34802 tmp = gen_reg_rtx (half_mode);
34803 emit_insn (gen_extract[j][i] (tmp, target));
34804
34805 /* Put val in tmp at elt. */
34806 ix86_expand_vector_set (false, tmp, val, elt);
34807
34808 /* Put it back. */
34809 emit_insn (gen_insert[j][i] (target, target, tmp));
34810 return;
34811
34812 default:
34813 break;
34814 }
34815
34816 if (use_vec_merge)
34817 {
34818 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34819 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34820 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34821 }
34822 else
34823 {
34824 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
34825
34826 emit_move_insn (mem, target);
34827
34828 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34829 emit_move_insn (tmp, val);
34830
34831 emit_move_insn (target, mem);
34832 }
34833 }
34834
34835 void
34836 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34837 {
34838 enum machine_mode mode = GET_MODE (vec);
34839 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34840 bool use_vec_extr = false;
34841 rtx tmp;
34842
34843 switch (mode)
34844 {
34845 case V2SImode:
34846 case V2SFmode:
34847 if (!mmx_ok)
34848 break;
34849 /* FALLTHRU */
34850
34851 case V2DFmode:
34852 case V2DImode:
34853 use_vec_extr = true;
34854 break;
34855
34856 case V4SFmode:
34857 use_vec_extr = TARGET_SSE4_1;
34858 if (use_vec_extr)
34859 break;
34860
34861 switch (elt)
34862 {
34863 case 0:
34864 tmp = vec;
34865 break;
34866
34867 case 1:
34868 case 3:
34869 tmp = gen_reg_rtx (mode);
34870 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34871 GEN_INT (elt), GEN_INT (elt),
34872 GEN_INT (elt+4), GEN_INT (elt+4)));
34873 break;
34874
34875 case 2:
34876 tmp = gen_reg_rtx (mode);
34877 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34878 break;
34879
34880 default:
34881 gcc_unreachable ();
34882 }
34883 vec = tmp;
34884 use_vec_extr = true;
34885 elt = 0;
34886 break;
34887
34888 case V4SImode:
34889 use_vec_extr = TARGET_SSE4_1;
34890 if (use_vec_extr)
34891 break;
34892
34893 if (TARGET_SSE2)
34894 {
34895 switch (elt)
34896 {
34897 case 0:
34898 tmp = vec;
34899 break;
34900
34901 case 1:
34902 case 3:
34903 tmp = gen_reg_rtx (mode);
34904 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34905 GEN_INT (elt), GEN_INT (elt),
34906 GEN_INT (elt), GEN_INT (elt)));
34907 break;
34908
34909 case 2:
34910 tmp = gen_reg_rtx (mode);
34911 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34912 break;
34913
34914 default:
34915 gcc_unreachable ();
34916 }
34917 vec = tmp;
34918 use_vec_extr = true;
34919 elt = 0;
34920 }
34921 else
34922 {
34923 /* For SSE1, we have to reuse the V4SF code. */
34924 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34925 gen_lowpart (V4SFmode, vec), elt);
34926 return;
34927 }
34928 break;
34929
34930 case V8HImode:
34931 use_vec_extr = TARGET_SSE2;
34932 break;
34933 case V4HImode:
34934 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34935 break;
34936
34937 case V16QImode:
34938 use_vec_extr = TARGET_SSE4_1;
34939 break;
34940
34941 case V8SFmode:
34942 if (TARGET_AVX)
34943 {
34944 tmp = gen_reg_rtx (V4SFmode);
34945 if (elt < 4)
34946 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34947 else
34948 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34949 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34950 return;
34951 }
34952 break;
34953
34954 case V4DFmode:
34955 if (TARGET_AVX)
34956 {
34957 tmp = gen_reg_rtx (V2DFmode);
34958 if (elt < 2)
34959 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34960 else
34961 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34962 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34963 return;
34964 }
34965 break;
34966
34967 case V32QImode:
34968 if (TARGET_AVX)
34969 {
34970 tmp = gen_reg_rtx (V16QImode);
34971 if (elt < 16)
34972 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34973 else
34974 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34975 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34976 return;
34977 }
34978 break;
34979
34980 case V16HImode:
34981 if (TARGET_AVX)
34982 {
34983 tmp = gen_reg_rtx (V8HImode);
34984 if (elt < 8)
34985 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34986 else
34987 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34988 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34989 return;
34990 }
34991 break;
34992
34993 case V8SImode:
34994 if (TARGET_AVX)
34995 {
34996 tmp = gen_reg_rtx (V4SImode);
34997 if (elt < 4)
34998 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34999 else
35000 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
35001 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35002 return;
35003 }
35004 break;
35005
35006 case V4DImode:
35007 if (TARGET_AVX)
35008 {
35009 tmp = gen_reg_rtx (V2DImode);
35010 if (elt < 2)
35011 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
35012 else
35013 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
35014 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35015 return;
35016 }
35017 break;
35018
35019 case V8QImode:
35020 /* ??? Could extract the appropriate HImode element and shift. */
35021 default:
35022 break;
35023 }
35024
35025 if (use_vec_extr)
35026 {
35027 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
35028 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
35029
35030 /* Let the rtl optimizers know about the zero extension performed. */
35031 if (inner_mode == QImode || inner_mode == HImode)
35032 {
35033 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
35034 target = gen_lowpart (SImode, target);
35035 }
35036
35037 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35038 }
35039 else
35040 {
35041 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35042
35043 emit_move_insn (mem, vec);
35044
35045 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35046 emit_move_insn (target, tmp);
35047 }
35048 }
35049
35050 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
35051 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
35052 The upper bits of DEST are undefined, though they shouldn't cause
35053 exceptions (some bits from src or all zeros are ok). */
35054
35055 static void
35056 emit_reduc_half (rtx dest, rtx src, int i)
35057 {
35058 rtx tem;
35059 switch (GET_MODE (src))
35060 {
35061 case V4SFmode:
35062 if (i == 128)
35063 tem = gen_sse_movhlps (dest, src, src);
35064 else
35065 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
35066 GEN_INT (1 + 4), GEN_INT (1 + 4));
35067 break;
35068 case V2DFmode:
35069 tem = gen_vec_interleave_highv2df (dest, src, src);
35070 break;
35071 case V16QImode:
35072 case V8HImode:
35073 case V4SImode:
35074 case V2DImode:
35075 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
35076 gen_lowpart (V1TImode, src),
35077 GEN_INT (i / 2));
35078 break;
35079 case V8SFmode:
35080 if (i == 256)
35081 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
35082 else
35083 tem = gen_avx_shufps256 (dest, src, src,
35084 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
35085 break;
35086 case V4DFmode:
35087 if (i == 256)
35088 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
35089 else
35090 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
35091 break;
35092 case V32QImode:
35093 case V16HImode:
35094 case V8SImode:
35095 case V4DImode:
35096 if (i == 256)
35097 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
35098 gen_lowpart (V4DImode, src),
35099 gen_lowpart (V4DImode, src),
35100 const1_rtx);
35101 else
35102 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
35103 gen_lowpart (V2TImode, src),
35104 GEN_INT (i / 2));
35105 break;
35106 default:
35107 gcc_unreachable ();
35108 }
35109 emit_insn (tem);
35110 }
35111
35112 /* Expand a vector reduction. FN is the binary pattern to reduce;
35113 DEST is the destination; IN is the input vector. */
35114
35115 void
35116 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
35117 {
35118 rtx half, dst, vec = in;
35119 enum machine_mode mode = GET_MODE (in);
35120 int i;
35121
35122 /* SSE4.1 has a special instruction for V8HImode UMIN reduction (phminposuw). */
35123 if (TARGET_SSE4_1
35124 && mode == V8HImode
35125 && fn == gen_uminv8hi3)
35126 {
35127 emit_insn (gen_sse4_1_phminposuw (dest, in));
35128 return;
35129 }
35130
35131 for (i = GET_MODE_BITSIZE (mode);
35132 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
35133 i >>= 1)
35134 {
35135 half = gen_reg_rtx (mode);
35136 emit_reduc_half (half, vec, i);
35137 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
35138 dst = dest;
35139 else
35140 dst = gen_reg_rtx (mode);
35141 emit_insn (fn (dst, half, vec));
35142 vec = dst;
35143 }
35144 }
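
/* Illustrative only -- a scalar C model of the halving reduction that
   ix86_expand_reduc emits, assuming a four-element float vector and PLUS
   as the reduction pattern.  The name reduc_model_v4sf_plus is
   hypothetical and is not used anywhere else in the compiler.  */

static float ATTRIBUTE_UNUSED
reduc_model_v4sf_plus (const float *in)
{
  float vec[4], half[4], dst[4];
  int j, width;

  for (j = 0; j < 4; j++)
    vec[j] = in[j];

  /* The width starts at the whole vector and is halved on every step,
     mirroring the GET_MODE_BITSIZE loop in ix86_expand_reduc (expressed
     here in elements rather than bits).  */
  for (width = 4; width > 1; width >>= 1)
    {
      /* emit_reduc_half: bring the upper width/2 elements down.  */
      for (j = 0; j < 4; j++)
        half[j] = (j < width / 2) ? vec[j + width / 2] : vec[j];
      /* Apply the binary pattern FN element-wise.  */
      for (j = 0; j < 4; j++)
        dst[j] = half[j] + vec[j];
      for (j = 0; j < 4; j++)
        vec[j] = dst[j];
    }

  /* The reduced value ends up in element 0 of the destination.  */
  return vec[0];
}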
35145 \f
35146 /* Target hook for scalar_mode_supported_p. */
35147 static bool
35148 ix86_scalar_mode_supported_p (enum machine_mode mode)
35149 {
35150 if (DECIMAL_FLOAT_MODE_P (mode))
35151 return default_decimal_float_supported_p ();
35152 else if (mode == TFmode)
35153 return true;
35154 else
35155 return default_scalar_mode_supported_p (mode);
35156 }
35157
35158 /* Implements target hook vector_mode_supported_p. */
35159 static bool
35160 ix86_vector_mode_supported_p (enum machine_mode mode)
35161 {
35162 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35163 return true;
35164 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35165 return true;
35166 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35167 return true;
35168 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
35169 return true;
35170 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
35171 return true;
35172 return false;
35173 }
35174
35175 /* Target hook for c_mode_for_suffix. */
35176 static enum machine_mode
35177 ix86_c_mode_for_suffix (char suffix)
35178 {
35179 if (suffix == 'q')
35180 return TFmode;
35181 if (suffix == 'w')
35182 return XFmode;
35183
35184 return VOIDmode;
35185 }
35186
35187 /* Worker function for TARGET_MD_ASM_CLOBBERS.
35188
35189 We do this in the new i386 backend to maintain source compatibility
35190 with the old cc0-based compiler. */
35191
35192 static tree
35193 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
35194 tree inputs ATTRIBUTE_UNUSED,
35195 tree clobbers)
35196 {
35197 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
35198 clobbers);
35199 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
35200 clobbers);
35201 return clobbers;
35202 }
35203
35204 /* Implements the target hook targetm.asm.encode_section_info. */
35205
35206 static void ATTRIBUTE_UNUSED
35207 ix86_encode_section_info (tree decl, rtx rtl, int first)
35208 {
35209 default_encode_section_info (decl, rtl, first);
35210
35211 if (TREE_CODE (decl) == VAR_DECL
35212 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
35213 && ix86_in_large_data_p (decl))
35214 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
35215 }
35216
35217 /* Worker function for REVERSE_CONDITION. */
35218
35219 enum rtx_code
35220 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
35221 {
35222 return (mode != CCFPmode && mode != CCFPUmode
35223 ? reverse_condition (code)
35224 : reverse_condition_maybe_unordered (code));
35225 }
35226
35227 /* Output code to perform an x87 FP register move, from OPERANDS[1]
35228 to OPERANDS[0]. */
35229
35230 const char *
35231 output_387_reg_move (rtx insn, rtx *operands)
35232 {
35233 if (REG_P (operands[0]))
35234 {
35235 if (REG_P (operands[1])
35236 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35237 {
35238 if (REGNO (operands[0]) == FIRST_STACK_REG)
35239 return output_387_ffreep (operands, 0);
35240 return "fstp\t%y0";
35241 }
35242 if (STACK_TOP_P (operands[0]))
35243 return "fld%Z1\t%y1";
35244 return "fst\t%y0";
35245 }
35246 else if (MEM_P (operands[0]))
35247 {
35248 gcc_assert (REG_P (operands[1]));
35249 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
35250 return "fstp%Z0\t%y0";
35251 else
35252 {
35253 /* There is no non-popping store to memory for XFmode.
35254 So if we need one, follow the store with a load. */
35255 if (GET_MODE (operands[0]) == XFmode)
35256 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
35257 else
35258 return "fst%Z0\t%y0";
35259 }
35260 }
35261 else
35262 gcc_unreachable();
35263 }
35264
35265 /* Output code to perform a conditional jump to LABEL if the C2 flag in the
35266 x87 FP status register is set. */
35267
35268 void
35269 ix86_emit_fp_unordered_jump (rtx label)
35270 {
35271 rtx reg = gen_reg_rtx (HImode);
35272 rtx temp;
35273
35274 emit_insn (gen_x86_fnstsw_1 (reg));
35275
35276 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
35277 {
35278 emit_insn (gen_x86_sahf_1 (reg));
35279
35280 temp = gen_rtx_REG (CCmode, FLAGS_REG);
35281 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
35282 }
35283 else
35284 {
35285 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
35286
35287 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
35288 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
35289 }
35290
35291 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
35292 gen_rtx_LABEL_REF (VOIDmode, label),
35293 pc_rtx);
35294 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
35295
35296 emit_jump_insn (temp);
35297 predict_jump (REG_BR_PROB_BASE * 10 / 100);
35298 }
35299
35300 /* Output code to perform a log1p XFmode calculation. */
35301
35302 void ix86_emit_i387_log1p (rtx op0, rtx op1)
35303 {
35304 rtx label1 = gen_label_rtx ();
35305 rtx label2 = gen_label_rtx ();
35306
35307 rtx tmp = gen_reg_rtx (XFmode);
35308 rtx tmp2 = gen_reg_rtx (XFmode);
35309 rtx test;
35310
35311 emit_insn (gen_absxf2 (tmp, op1));
35312 test = gen_rtx_GE (VOIDmode, tmp,
35313 CONST_DOUBLE_FROM_REAL_VALUE (
35314 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
35315 XFmode));
35316 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
35317
35318 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35319 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
35320 emit_jump (label2);
35321
35322 emit_label (label1);
35323 emit_move_insn (tmp, CONST1_RTX (XFmode));
35324 emit_insn (gen_addxf3 (tmp, op1, tmp));
35325 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
35326 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
35327
35328 emit_label (label2);
35329 }
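
/* Illustrative only -- a C sketch of the strategy used by
   ix86_emit_i387_log1p.  The x87 fyl2xp1 instruction is only specified for
   |x| < 1 - sqrt(2)/2 (about 0.2929), so above that threshold the expander
   forms 1 + x explicitly and uses fyl2x instead.  log1p_model is a
   hypothetical name; fyl2xp1 and fyl2x here are function-pointer stand-ins
   for the instructions of the same name.  */

static long double ATTRIBUTE_UNUSED
log1p_model (long double x,
             long double (*fyl2xp1) (long double, long double),
             long double (*fyl2x) (long double, long double))
{
  /* fldln2: log_e (2), so that y * log2 (...) becomes a natural log.  */
  const long double ln2 = 0.6931471805599453094172321214581766L;
  long double ax = x < 0 ? -x : x;

  if (ax < 0.29289321881345247561810596348408353L)
    /* Small |x|: fyl2xp1 computes y * log2 (x + 1) without forming 1 + x,
       which would lose low-order bits.  */
    return fyl2xp1 (ln2, x);
  else
    /* Otherwise 1 + x is well conditioned; use plain fyl2x.  */
    return fyl2x (ln2, 1.0L + x);
}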
35330
35331 /* Emit x87 code for rounding OP1 to the nearest integer, halfway cases away from zero, storing the result in OP0. */
35332 void ix86_emit_i387_round (rtx op0, rtx op1)
35333 {
35334 enum machine_mode inmode = GET_MODE (op1);
35335 enum machine_mode outmode = GET_MODE (op0);
35336 rtx e1, e2, res, tmp, tmp1, half;
35337 rtx scratch = gen_reg_rtx (HImode);
35338 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
35339 rtx jump_label = gen_label_rtx ();
35340 rtx insn;
35341 rtx (*gen_abs) (rtx, rtx);
35342 rtx (*gen_neg) (rtx, rtx);
35343
35344 switch (inmode)
35345 {
35346 case SFmode:
35347 gen_abs = gen_abssf2;
35348 break;
35349 case DFmode:
35350 gen_abs = gen_absdf2;
35351 break;
35352 case XFmode:
35353 gen_abs = gen_absxf2;
35354 break;
35355 default:
35356 gcc_unreachable ();
35357 }
35358
35359 switch (outmode)
35360 {
35361 case SFmode:
35362 gen_neg = gen_negsf2;
35363 break;
35364 case DFmode:
35365 gen_neg = gen_negdf2;
35366 break;
35367 case XFmode:
35368 gen_neg = gen_negxf2;
35369 break;
35370 case HImode:
35371 gen_neg = gen_neghi2;
35372 break;
35373 case SImode:
35374 gen_neg = gen_negsi2;
35375 break;
35376 case DImode:
35377 gen_neg = gen_negdi2;
35378 break;
35379 default:
35380 gcc_unreachable ();
35381 }
35382
35383 e1 = gen_reg_rtx (inmode);
35384 e2 = gen_reg_rtx (inmode);
35385 res = gen_reg_rtx (outmode);
35386
35387 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
35388
35389 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
35390
35391 /* scratch = fxam(op1) */
35392 emit_insn (gen_rtx_SET (VOIDmode, scratch,
35393 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
35394 UNSPEC_FXAM)));
35395 /* e1 = fabs(op1) */
35396 emit_insn (gen_abs (e1, op1));
35397
35398 /* e2 = e1 + 0.5 */
35399 half = force_reg (inmode, half);
35400 emit_insn (gen_rtx_SET (VOIDmode, e2,
35401 gen_rtx_PLUS (inmode, e1, half)));
35402
35403 /* res = floor(e2) */
35404 if (inmode != XFmode)
35405 {
35406 tmp1 = gen_reg_rtx (XFmode);
35407
35408 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
35409 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
35410 }
35411 else
35412 tmp1 = e2;
35413
35414 switch (outmode)
35415 {
35416 case SFmode:
35417 case DFmode:
35418 {
35419 rtx tmp0 = gen_reg_rtx (XFmode);
35420
35421 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
35422
35423 emit_insn (gen_rtx_SET (VOIDmode, res,
35424 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
35425 UNSPEC_TRUNC_NOOP)));
35426 }
35427 break;
35428 case XFmode:
35429 emit_insn (gen_frndintxf2_floor (res, tmp1));
35430 break;
35431 case HImode:
35432 emit_insn (gen_lfloorxfhi2 (res, tmp1));
35433 break;
35434 case SImode:
35435 emit_insn (gen_lfloorxfsi2 (res, tmp1));
35436 break;
35437 case DImode:
35438 emit_insn (gen_lfloorxfdi2 (res, tmp1));
35439 break;
35440 default:
35441 gcc_unreachable ();
35442 }
35443
35444 /* flags = signbit(a) */
35445 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
35446
35447 /* if (flags) then res = -res */
35448 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
35449 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
35450 gen_rtx_LABEL_REF (VOIDmode, jump_label),
35451 pc_rtx);
35452 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35453 predict_jump (REG_BR_PROB_BASE * 50 / 100);
35454 JUMP_LABEL (insn) = jump_label;
35455
35456 emit_insn (gen_neg (res, res));
35457
35458 emit_label (jump_label);
35459 LABEL_NUSES (jump_label) = 1;
35460
35461 emit_move_insn (op0, res);
35462 }
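
/* Illustrative only -- a C model of the identity implemented by
   ix86_emit_i387_round above: round (a) = sgn (a) * floor (fabs (a) + 0.5).
   round_model and floor_fn are hypothetical names; floor_fn stands in for
   the frndint_floor/lfloor sequence emitted by the expander.  */

static double ATTRIBUTE_UNUSED
round_model (double a, double (*floor_fn) (double))
{
  double abs_a = a < 0 ? -a : a;        /* e1 = fabs (op1) */
  double res = floor_fn (abs_a + 0.5);  /* res = floor (e1 + 0.5) */

  /* The fxam/test sequence checks the sign bit of the original input and
     conditionally negates; comparing against zero here is a simplification
     that ignores -0.0.  */
  return a < 0 ? -res : res;
}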
35463
35464 /* Output code to perform a Newton-Raphson approximation of a single precision
35465 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
35466
35467 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35468 {
35469 rtx x0, x1, e0, e1;
35470
35471 x0 = gen_reg_rtx (mode);
35472 e0 = gen_reg_rtx (mode);
35473 e1 = gen_reg_rtx (mode);
35474 x1 = gen_reg_rtx (mode);
35475
35476 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
35477
35478 b = force_reg (mode, b);
35479
35480 /* x0 = rcp(b) estimate */
35481 emit_insn (gen_rtx_SET (VOIDmode, x0,
35482 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35483 UNSPEC_RCP)));
35484 /* e0 = x0 * b */
35485 emit_insn (gen_rtx_SET (VOIDmode, e0,
35486 gen_rtx_MULT (mode, x0, b)));
35487
35488 /* e0 = x0 * e0 */
35489 emit_insn (gen_rtx_SET (VOIDmode, e0,
35490 gen_rtx_MULT (mode, x0, e0)));
35491
35492 /* e1 = x0 + x0 */
35493 emit_insn (gen_rtx_SET (VOIDmode, e1,
35494 gen_rtx_PLUS (mode, x0, x0)));
35495
35496 /* x1 = e1 - e0 */
35497 emit_insn (gen_rtx_SET (VOIDmode, x1,
35498 gen_rtx_MINUS (mode, e1, e0)));
35499
35500 /* res = a * x1 */
35501 emit_insn (gen_rtx_SET (VOIDmode, res,
35502 gen_rtx_MULT (mode, a, x1)));
35503 }
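
/* Illustrative only -- a scalar C model of the Newton-Raphson refinement
   emitted by ix86_emit_swdivsf: one iteration applied to a hardware
   reciprocal estimate, a / b ~= a * (2*x0 - b*x0*x0).  swdiv_model and
   rcp_estimate are hypothetical names; rcp_estimate stands in for the
   rcpps/rcpss instruction (roughly 12 bits of precision).  */

static float ATTRIBUTE_UNUSED
swdiv_model (float a, float b, float (*rcp_estimate) (float))
{
  float x0, e0, e1, x1;

  x0 = rcp_estimate (b);   /* x0 = rcp(b) estimate */
  e0 = x0 * b;             /* e0 = x0 * b */
  e0 = x0 * e0;            /* e0 = b * x0 * x0 */
  e1 = x0 + x0;            /* e1 = 2 * x0 */
  x1 = e1 - e0;            /* x1 = 2*x0 - b*x0*x0, the refined 1/b */
  return a * x1;           /* res = a * x1 */
}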
35504
35505 /* Output code to perform a Newton-Raphson approximation of a
35506 single precision floating point [reciprocal] square root. */
35507
35508 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35509 bool recip)
35510 {
35511 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35512 REAL_VALUE_TYPE r;
35513
35514 x0 = gen_reg_rtx (mode);
35515 e0 = gen_reg_rtx (mode);
35516 e1 = gen_reg_rtx (mode);
35517 e2 = gen_reg_rtx (mode);
35518 e3 = gen_reg_rtx (mode);
35519
35520 real_from_integer (&r, VOIDmode, -3, -1, 0);
35521 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35522
35523 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35524 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35525
35526 if (VECTOR_MODE_P (mode))
35527 {
35528 mthree = ix86_build_const_vector (mode, true, mthree);
35529 mhalf = ix86_build_const_vector (mode, true, mhalf);
35530 }
35531
35532 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35533 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
35534
35535 a = force_reg (mode, a);
35536
35537 /* x0 = rsqrt(a) estimate */
35538 emit_insn (gen_rtx_SET (VOIDmode, x0,
35539 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35540 UNSPEC_RSQRT)));
35541
35542 /* If a == 0.0, mask out the infinite rsqrt estimate to avoid a NaN for sqrt (0.0). */
35543 if (!recip)
35544 {
35545 rtx zero, mask;
35546
35547 zero = gen_reg_rtx (mode);
35548 mask = gen_reg_rtx (mode);
35549
35550 zero = force_reg (mode, CONST0_RTX(mode));
35551 emit_insn (gen_rtx_SET (VOIDmode, mask,
35552 gen_rtx_NE (mode, zero, a)));
35553
35554 emit_insn (gen_rtx_SET (VOIDmode, x0,
35555 gen_rtx_AND (mode, x0, mask)));
35556 }
35557
35558 /* e0 = x0 * a */
35559 emit_insn (gen_rtx_SET (VOIDmode, e0,
35560 gen_rtx_MULT (mode, x0, a)));
35561 /* e1 = e0 * x0 */
35562 emit_insn (gen_rtx_SET (VOIDmode, e1,
35563 gen_rtx_MULT (mode, e0, x0)));
35564
35565 /* e2 = e1 - 3. */
35566 mthree = force_reg (mode, mthree);
35567 emit_insn (gen_rtx_SET (VOIDmode, e2,
35568 gen_rtx_PLUS (mode, e1, mthree)));
35569
35570 mhalf = force_reg (mode, mhalf);
35571 if (recip)
35572 /* e3 = -.5 * x0 */
35573 emit_insn (gen_rtx_SET (VOIDmode, e3,
35574 gen_rtx_MULT (mode, x0, mhalf)));
35575 else
35576 /* e3 = -.5 * e0 */
35577 emit_insn (gen_rtx_SET (VOIDmode, e3,
35578 gen_rtx_MULT (mode, e0, mhalf)));
35579 /* ret = e2 * e3 */
35580 emit_insn (gen_rtx_SET (VOIDmode, res,
35581 gen_rtx_MULT (mode, e2, e3)));
35582 }
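
/* Illustrative only -- a scalar C model of the Newton-Raphson step emitted
   by ix86_emit_swsqrtsf.  swsqrt_model and rsqrt_estimate are hypothetical
   names; rsqrt_estimate stands in for the rsqrtps/rsqrtss instruction, and
   the zero masking done above for the non-reciprocal case is omitted.  */

static float ATTRIBUTE_UNUSED
swsqrt_model (float a, int recip, float (*rsqrt_estimate) (float))
{
  float x0, e0, e1, e2, e3;

  x0 = rsqrt_estimate (a);          /* x0 = rsqrt(a) estimate */
  e0 = x0 * a;                      /* e0 = x0 * a */
  e1 = e0 * x0;                     /* e1 = a * x0 * x0 */
  e2 = e1 - 3.0f;                   /* e2 = a*x0*x0 - 3.0 */
  e3 = (recip ? x0 : e0) * -0.5f;   /* e3 = -0.5 * (x0 or a*x0) */
  return e2 * e3;                   /* sqrt(a) or rsqrt(a) */
}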
35583
35584 #ifdef TARGET_SOLARIS
35585 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35586
35587 static void
35588 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35589 tree decl)
35590 {
35591 /* With Binutils 2.15, the "@unwind" marker must be specified on
35592 every occurrence of the ".eh_frame" section, not just the first
35593 one. */
35594 if (TARGET_64BIT
35595 && strcmp (name, ".eh_frame") == 0)
35596 {
35597 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35598 flags & SECTION_WRITE ? "aw" : "a");
35599 return;
35600 }
35601
35602 #ifndef USE_GAS
35603 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35604 {
35605 solaris_elf_asm_comdat_section (name, flags, decl);
35606 return;
35607 }
35608 #endif
35609
35610 default_elf_asm_named_section (name, flags, decl);
35611 }
35612 #endif /* TARGET_SOLARIS */
35613
35614 /* Return the mangling of TYPE if it is an extended fundamental type. */
35615
35616 static const char *
35617 ix86_mangle_type (const_tree type)
35618 {
35619 type = TYPE_MAIN_VARIANT (type);
35620
35621 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35622 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35623 return NULL;
35624
35625 switch (TYPE_MODE (type))
35626 {
35627 case TFmode:
35628 /* __float128 is "g". */
35629 return "g";
35630 case XFmode:
35631 /* "long double" or __float80 is "e". */
35632 return "e";
35633 default:
35634 return NULL;
35635 }
35636 }
35637
35638 /* For 32-bit code we can save PIC register setup by using the hidden
35639 __stack_chk_fail_local function instead of calling __stack_chk_fail
35640 directly. 64-bit code doesn't need to set up any PIC register, so it
35641 is better to call __stack_chk_fail directly. */
35642
35643 static tree ATTRIBUTE_UNUSED
35644 ix86_stack_protect_fail (void)
35645 {
35646 return TARGET_64BIT
35647 ? default_external_stack_protect_fail ()
35648 : default_hidden_stack_protect_fail ();
35649 }
35650
35651 /* Select a format to encode pointers in exception handling data. CODE
35652 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35653 true if the symbol may be affected by dynamic relocations.
35654
35655 ??? All x86 object file formats are capable of representing this.
35656 After all, the relocation needed is the same as for the call insn.
35657 Whether or not a particular assembler allows us to enter such, I
35658 guess we'll have to see. */
35659 int
35660 asm_preferred_eh_data_format (int code, int global)
35661 {
35662 if (flag_pic)
35663 {
35664 int type = DW_EH_PE_sdata8;
35665 if (!TARGET_64BIT
35666 || ix86_cmodel == CM_SMALL_PIC
35667 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35668 type = DW_EH_PE_sdata4;
35669 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35670 }
35671 if (ix86_cmodel == CM_SMALL
35672 || (ix86_cmodel == CM_MEDIUM && code))
35673 return DW_EH_PE_udata4;
35674 return DW_EH_PE_absptr;
35675 }
35676 \f
35677 /* Expand copysign from SIGN to the positive value ABS_VALUE
35678 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
35679 the sign-bit. */
35680 static void
35681 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35682 {
35683 enum machine_mode mode = GET_MODE (sign);
35684 rtx sgn = gen_reg_rtx (mode);
35685 if (mask == NULL_RTX)
35686 {
35687 enum machine_mode vmode;
35688
35689 if (mode == SFmode)
35690 vmode = V4SFmode;
35691 else if (mode == DFmode)
35692 vmode = V2DFmode;
35693 else
35694 vmode = mode;
35695
35696 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35697 if (!VECTOR_MODE_P (mode))
35698 {
35699 /* We need to generate a scalar mode mask in this case. */
35700 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35701 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35702 mask = gen_reg_rtx (mode);
35703 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35704 }
35705 }
35706 else
35707 mask = gen_rtx_NOT (mode, mask);
35708 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35709 gen_rtx_AND (mode, mask, sign)));
35710 emit_insn (gen_rtx_SET (VOIDmode, result,
35711 gen_rtx_IOR (mode, abs_value, sgn)));
35712 }
35713
35714 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35715 mask for masking out the sign-bit is stored in *SMASK, if that is
35716 non-null. */
35717 static rtx
35718 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35719 {
35720 enum machine_mode vmode, mode = GET_MODE (op0);
35721 rtx xa, mask;
35722
35723 xa = gen_reg_rtx (mode);
35724 if (mode == SFmode)
35725 vmode = V4SFmode;
35726 else if (mode == DFmode)
35727 vmode = V2DFmode;
35728 else
35729 vmode = mode;
35730 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35731 if (!VECTOR_MODE_P (mode))
35732 {
35733 /* We need to generate a scalar mode mask in this case. */
35734 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35735 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35736 mask = gen_reg_rtx (mode);
35737 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35738 }
35739 emit_insn (gen_rtx_SET (VOIDmode, xa,
35740 gen_rtx_AND (mode, op0, mask)));
35741
35742 if (smask)
35743 *smask = mask;
35744
35745 return xa;
35746 }
35747
35748 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35749 swapping the operands if SWAP_OPERANDS is true. The expanded
35750 code is a forward jump to a newly created label in case the
35751 comparison is true. The generated label rtx is returned. */
35752 static rtx
35753 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35754 bool swap_operands)
35755 {
35756 rtx label, tmp;
35757
35758 if (swap_operands)
35759 {
35760 tmp = op0;
35761 op0 = op1;
35762 op1 = tmp;
35763 }
35764
35765 label = gen_label_rtx ();
35766 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35767 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35768 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35769 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35770 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35771 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35772 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35773 JUMP_LABEL (tmp) = label;
35774
35775 return label;
35776 }
35777
35778 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35779 using comparison code CODE. Operands are swapped for the comparison if
35780 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35781 static rtx
35782 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35783 bool swap_operands)
35784 {
35785 rtx (*insn)(rtx, rtx, rtx, rtx);
35786 enum machine_mode mode = GET_MODE (op0);
35787 rtx mask = gen_reg_rtx (mode);
35788
35789 if (swap_operands)
35790 {
35791 rtx tmp = op0;
35792 op0 = op1;
35793 op1 = tmp;
35794 }
35795
35796 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35797
35798 emit_insn (insn (mask, op0, op1,
35799 gen_rtx_fmt_ee (code, mode, op0, op1)));
35800 return mask;
35801 }
35802
35803 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35804 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
35805 static rtx
35806 ix86_gen_TWO52 (enum machine_mode mode)
35807 {
35808 REAL_VALUE_TYPE TWO52r;
35809 rtx TWO52;
35810
35811 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35812 TWO52 = const_double_from_real_value (TWO52r, mode);
35813 TWO52 = force_reg (mode, TWO52);
35814
35815 return TWO52;
35816 }
35817
35818 /* Expand SSE sequence for computing lround from OP1 storing
35819 into OP0. */
35820 void
35821 ix86_expand_lround (rtx op0, rtx op1)
35822 {
35823 /* C code for the stuff we're doing below:
35824 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35825 return (long)tmp;
35826 */
35827 enum machine_mode mode = GET_MODE (op1);
35828 const struct real_format *fmt;
35829 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35830 rtx adj;
35831
35832 /* load nextafter (0.5, 0.0) */
35833 fmt = REAL_MODE_FORMAT (mode);
35834 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35835 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35836
35837 /* adj = copysign (0.5, op1) */
35838 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35839 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35840
35841 /* adj = op1 + adj */
35842 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35843
35844 /* op0 = (imode)adj */
35845 expand_fix (op0, adj, 0);
35846 }
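
/* Illustrative only -- a C model of the sequence emitted by
   ix86_expand_lround, following the comment above: add to the input a copy
   of nextafter (0.5, 0.0) carrying the input's sign, then truncate.  Using
   the value just below 0.5 keeps inputs slightly below one half from being
   rounded up.  lround_model is a hypothetical name; DFmode and a 64-bit
   long are assumed, and the sign is taken from an ordinary comparison
   rather than the sign bit.  */

static long ATTRIBUTE_UNUSED
lround_model (double x)
{
  /* nextafter (0.5, 0.0) for IEEE double: 0.5 - 2**-54.  */
  const double pred_half = 0.49999999999999994;
  double adj = x < 0 ? -pred_half : pred_half;  /* copysign (pred_half, x) */

  return (long) (x + adj);                      /* truncating conversion */
}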
35847
35848 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
35849 into OPERAND0. */
35850 void
35851 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35852 {
35853 /* C code for the stuff we're doing below (for do_floor):
35854 xi = (long)op1;
35855 xi -= (double)xi > op1 ? 1 : 0;
35856 return xi;
35857 */
35858 enum machine_mode fmode = GET_MODE (op1);
35859 enum machine_mode imode = GET_MODE (op0);
35860 rtx ireg, freg, label, tmp;
35861
35862 /* reg = (long)op1 */
35863 ireg = gen_reg_rtx (imode);
35864 expand_fix (ireg, op1, 0);
35865
35866 /* freg = (double)reg */
35867 freg = gen_reg_rtx (fmode);
35868 expand_float (freg, ireg, 0);
35869
35870 /* ireg = (freg > op1) ? ireg - 1 : ireg */
35871 label = ix86_expand_sse_compare_and_jump (UNLE,
35872 freg, op1, !do_floor);
35873 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35874 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35875 emit_move_insn (ireg, tmp);
35876
35877 emit_label (label);
35878 LABEL_NUSES (label) = 1;
35879
35880 emit_move_insn (op0, ireg);
35881 }
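
/* Illustrative only -- a C model of ix86_expand_lfloorceil, following the
   C sketch in the comment above: truncate, then compensate by one if the
   truncated value lies on the wrong side of the input.  lfloorceil_model
   is a hypothetical name; DFmode and a 64-bit long are assumed.  */

static long ATTRIBUTE_UNUSED
lfloorceil_model (double x, int do_floor)
{
  long xi = (long) x;           /* truncating conversion */
  double fx = (double) xi;

  if (do_floor)
    return fx > x ? xi - 1 : xi;   /* floor: correct downwards */
  else
    return fx < x ? xi + 1 : xi;   /* ceil: correct upwards */
}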
35882
35883 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35884 result in OPERAND0. */
35885 void
35886 ix86_expand_rint (rtx operand0, rtx operand1)
35887 {
35888 /* C code for the stuff we're doing below:
35889 xa = fabs (operand1);
35890 if (!isless (xa, 2**52))
35891 return operand1;
35892 xa = xa + 2**52 - 2**52;
35893 return copysign (xa, operand1);
35894 */
35895 enum machine_mode mode = GET_MODE (operand0);
35896 rtx res, xa, label, TWO52, mask;
35897
35898 res = gen_reg_rtx (mode);
35899 emit_move_insn (res, operand1);
35900
35901 /* xa = abs (operand1) */
35902 xa = ix86_expand_sse_fabs (res, &mask);
35903
35904 /* if (!isless (xa, TWO52)) goto label; */
35905 TWO52 = ix86_gen_TWO52 (mode);
35906 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35907
35908 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35909 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35910
35911 ix86_sse_copysign_to_positive (res, xa, res, mask);
35912
35913 emit_label (label);
35914 LABEL_NUSES (label) = 1;
35915
35916 emit_move_insn (operand0, res);
35917 }
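
/* Illustrative only -- a C model of the 2^52 trick used by ix86_expand_rint,
   assuming DFmode and the default round-to-nearest-even rounding mode: for
   |x| < 2^52, adding and then subtracting 2^52 pushes the fraction bits out,
   so the value is rounded to an integer; larger magnitudes and NaNs pass
   through unchanged.  rint_model is a hypothetical name; the real sequence
   restores the sign through the sign-bit mask, so -0.0 is preserved, which
   the comparison below simplifies away.  */

static double ATTRIBUTE_UNUSED
rint_model (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */
  /* volatile keeps the add/sub pair from being folded at compile time.  */
  volatile double xa = x < 0 ? -x : x;

  if (!(xa < two52))           /* !isless (xa, TWO52): NaN or large */
    return x;

  xa = xa + two52;             /* rounded to an integer here */
  xa = xa - two52;

  return x < 0 ? -xa : xa;     /* copysign (xa, x) */
}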
35918
35919 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35920 into OPERAND0. */
35921 void
35922 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35923 {
35924 /* C code for the stuff we expand below.
35925 double xa = fabs (x), x2;
35926 if (!isless (xa, TWO52))
35927 return x;
35928 xa = xa + TWO52 - TWO52;
35929 x2 = copysign (xa, x);
35930 Compensate. Floor:
35931 if (x2 > x)
35932 x2 -= 1;
35933 Compensate. Ceil:
35934 if (x2 < x)
35935 x2 -= -1;
35936 return x2;
35937 */
35938 enum machine_mode mode = GET_MODE (operand0);
35939 rtx xa, TWO52, tmp, label, one, res, mask;
35940
35941 TWO52 = ix86_gen_TWO52 (mode);
35942
35943 /* Temporary for holding the result, initialized to the input
35944 operand to ease control flow. */
35945 res = gen_reg_rtx (mode);
35946 emit_move_insn (res, operand1);
35947
35948 /* xa = abs (operand1) */
35949 xa = ix86_expand_sse_fabs (res, &mask);
35950
35951 /* if (!isless (xa, TWO52)) goto label; */
35952 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35953
35954 /* xa = xa + TWO52 - TWO52; */
35955 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35956 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35957
35958 /* xa = copysign (xa, operand1) */
35959 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35960
35961 /* generate 1.0 or -1.0 */
35962 one = force_reg (mode,
35963 const_double_from_real_value (do_floor
35964 ? dconst1 : dconstm1, mode));
35965
35966 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35967 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35968 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35969 gen_rtx_AND (mode, one, tmp)));
35970 /* We always need to subtract here to preserve signed zero. */
35971 tmp = expand_simple_binop (mode, MINUS,
35972 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35973 emit_move_insn (res, tmp);
35974
35975 emit_label (label);
35976 LABEL_NUSES (label) = 1;
35977
35978 emit_move_insn (operand0, res);
35979 }
35980
35981 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35982 into OPERAND0. */
35983 void
35984 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35985 {
35986 /* C code for the stuff we expand below.
35987 double xa = fabs (x), x2;
35988 if (!isless (xa, TWO52))
35989 return x;
35990 x2 = (double)(long)x;
35991 Compensate. Floor:
35992 if (x2 > x)
35993 x2 -= 1;
35994 Compensate. Ceil:
35995 if (x2 < x)
35996 x2 += 1;
35997 if (HONOR_SIGNED_ZEROS (mode))
35998 return copysign (x2, x);
35999 return x2;
36000 */
36001 enum machine_mode mode = GET_MODE (operand0);
36002 rtx xa, xi, TWO52, tmp, label, one, res, mask;
36003
36004 TWO52 = ix86_gen_TWO52 (mode);
36005
36006 /* Temporary for holding the result, initialized to the input
36007 operand to ease control flow. */
36008 res = gen_reg_rtx (mode);
36009 emit_move_insn (res, operand1);
36010
36011 /* xa = abs (operand1) */
36012 xa = ix86_expand_sse_fabs (res, &mask);
36013
36014 /* if (!isless (xa, TWO52)) goto label; */
36015 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36016
36017 /* xa = (double)(long)x */
36018 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36019 expand_fix (xi, res, 0);
36020 expand_float (xa, xi, 0);
36021
36022 /* generate 1.0 */
36023 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36024
36025 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36026 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36027 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36028 gen_rtx_AND (mode, one, tmp)));
36029 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
36030 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36031 emit_move_insn (res, tmp);
36032
36033 if (HONOR_SIGNED_ZEROS (mode))
36034 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36035
36036 emit_label (label);
36037 LABEL_NUSES (label) = 1;
36038
36039 emit_move_insn (operand0, res);
36040 }
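
/* Illustrative only -- a branch-free view of the compensation step used by
   ix86_expand_floorceil: the UNGT compare produces an all-ones mask that is
   ANDed with 1.0, giving exactly 1.0 when a correction is needed; here that
   is modelled with an ordinary conditional.  floorceil_model and trunc_fn
   are hypothetical names; trunc_fn stands in for the cvttsd2si/cvtsi2sd
   pair, and DFmode is assumed.  */

static double ATTRIBUTE_UNUSED
floorceil_model (double x, int do_floor, double (*trunc_fn) (double))
{
  double xa = x < 0 ? -x : x;
  double x2, step;

  if (!(xa < 4503599627370496.0))   /* |x| >= 2^52: already integral */
    return x;

  x2 = trunc_fn (x);                /* (double)(long)x in the sequence above */

  if (do_floor)
    {
      step = (x2 > x) ? 1.0 : 0.0;  /* mask & 1.0 */
      x2 = x2 - step;
    }
  else
    {
      step = (x2 < x) ? 1.0 : 0.0;
      x2 = x2 + step;
    }

  return x2;
}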
36041
36042 /* Expand SSE sequence for computing round from OPERAND1 storing
36043 into OPERAND0. This sequence works without relying on DImode truncation
36044 via cvttsd2siq, which is only available on 64-bit targets. */
36045 void
36046 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
36047 {
36048 /* C code for the stuff we expand below.
36049 double xa = fabs (x), xa2, x2;
36050 if (!isless (xa, TWO52))
36051 return x;
36052 Using the absolute value and copying back sign makes
36053 -0.0 -> -0.0 correct.
36054 xa2 = xa + TWO52 - TWO52;
36055 Compensate.
36056 dxa = xa2 - xa;
36057 if (dxa <= -0.5)
36058 xa2 += 1;
36059 else if (dxa > 0.5)
36060 xa2 -= 1;
36061 x2 = copysign (xa2, x);
36062 return x2;
36063 */
36064 enum machine_mode mode = GET_MODE (operand0);
36065 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
36066
36067 TWO52 = ix86_gen_TWO52 (mode);
36068
36069 /* Temporary for holding the result, initialized to the input
36070 operand to ease control flow. */
36071 res = gen_reg_rtx (mode);
36072 emit_move_insn (res, operand1);
36073
36074 /* xa = abs (operand1) */
36075 xa = ix86_expand_sse_fabs (res, &mask);
36076
36077 /* if (!isless (xa, TWO52)) goto label; */
36078 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36079
36080 /* xa2 = xa + TWO52 - TWO52; */
36081 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36082 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
36083
36084 /* dxa = xa2 - xa; */
36085 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
36086
36087 /* generate 0.5, 1.0 and -0.5 */
36088 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
36089 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
36090 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
36091 0, OPTAB_DIRECT);
36092
36093 /* Compensate. */
36094 tmp = gen_reg_rtx (mode);
36095 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
36096 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
36097 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36098 gen_rtx_AND (mode, one, tmp)));
36099 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36100 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
36101 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
36102 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36103 gen_rtx_AND (mode, one, tmp)));
36104 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36105
36106 /* res = copysign (xa2, operand1) */
36107 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
36108
36109 emit_label (label);
36110 LABEL_NUSES (label) = 1;
36111
36112 emit_move_insn (operand0, res);
36113 }
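
/* Illustrative only -- a C model of ix86_expand_rounddf_32, which rounds
   without any float-to-DImode conversion: the 2^52 trick rounds to nearest
   even, then the rounding error dxa is compared against +/-0.5 so that
   halfway cases end up rounded away from zero.  rounddf32_model is a
   hypothetical name; DFmode, the default rounding mode, and sign handling
   by comparison instead of the sign bit are assumed.  */

static double ATTRIBUTE_UNUSED
rounddf32_model (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */
  double xa = x < 0 ? -x : x;
  /* volatile keeps the add/sub of 2^52 from being folded away.  */
  volatile double xa2;
  double dxa;

  if (!(xa < two52))
    return x;

  xa2 = xa + two52;            /* round-to-nearest-even happens here */
  xa2 = xa2 - two52;

  dxa = xa2 - xa;              /* rounding error */
  if (dxa > 0.5)               /* rounded up past the halfway point */
    xa2 = xa2 - 1.0;
  else if (dxa <= -0.5)        /* rounded down to or past the halfway point */
    xa2 = xa2 + 1.0;

  return x < 0 ? -xa2 : xa2;   /* copysign (xa2, x) */
}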
36114
36115 /* Expand SSE sequence for computing trunc from OPERAND1 storing
36116 into OPERAND0. */
36117 void
36118 ix86_expand_trunc (rtx operand0, rtx operand1)
36119 {
36120 /* C code for SSE variant we expand below.
36121 double xa = fabs (x), x2;
36122 if (!isless (xa, TWO52))
36123 return x;
36124 x2 = (double)(long)x;
36125 if (HONOR_SIGNED_ZEROS (mode))
36126 return copysign (x2, x);
36127 return x2;
36128 */
36129 enum machine_mode mode = GET_MODE (operand0);
36130 rtx xa, xi, TWO52, label, res, mask;
36131
36132 TWO52 = ix86_gen_TWO52 (mode);
36133
36134 /* Temporary for holding the result, initialized to the input
36135 operand to ease control flow. */
36136 res = gen_reg_rtx (mode);
36137 emit_move_insn (res, operand1);
36138
36139 /* xa = abs (operand1) */
36140 xa = ix86_expand_sse_fabs (res, &mask);
36141
36142 /* if (!isless (xa, TWO52)) goto label; */
36143 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36144
36145 /* x = (double)(long)x */
36146 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36147 expand_fix (xi, res, 0);
36148 expand_float (res, xi, 0);
36149
36150 if (HONOR_SIGNED_ZEROS (mode))
36151 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36152
36153 emit_label (label);
36154 LABEL_NUSES (label) = 1;
36155
36156 emit_move_insn (operand0, res);
36157 }
36158
36159 /* Expand SSE sequence for computing trunc from OPERAND1 storing
36160 into OPERAND0. */
36161 void
36162 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
36163 {
36164 enum machine_mode mode = GET_MODE (operand0);
36165 rtx xa, mask, TWO52, label, one, res, smask, tmp;
36166
36167 /* C code for SSE variant we expand below.
36168 double xa = fabs (x), x2;
36169 if (!isless (xa, TWO52))
36170 return x;
36171 xa2 = xa + TWO52 - TWO52;
36172 Compensate:
36173 if (xa2 > xa)
36174 xa2 -= 1.0;
36175 x2 = copysign (xa2, x);
36176 return x2;
36177 */
36178
36179 TWO52 = ix86_gen_TWO52 (mode);
36180
36181 /* Temporary for holding the result, initialized to the input
36182 operand to ease control flow. */
36183 res = gen_reg_rtx (mode);
36184 emit_move_insn (res, operand1);
36185
36186 /* xa = abs (operand1) */
36187 xa = ix86_expand_sse_fabs (res, &smask);
36188
36189 /* if (!isless (xa, TWO52)) goto label; */
36190 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36191
36192 /* res = xa + TWO52 - TWO52; */
36193 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36194 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
36195 emit_move_insn (res, tmp);
36196
36197 /* generate 1.0 */
36198 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36199
36200 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
36201 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
36202 emit_insn (gen_rtx_SET (VOIDmode, mask,
36203 gen_rtx_AND (mode, mask, one)));
36204 tmp = expand_simple_binop (mode, MINUS,
36205 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
36206 emit_move_insn (res, tmp);
36207
36208 /* res = copysign (res, operand1) */
36209 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
36210
36211 emit_label (label);
36212 LABEL_NUSES (label) = 1;
36213
36214 emit_move_insn (operand0, res);
36215 }
36216
36217 /* Expand SSE sequence for computing round from OPERAND1 storing
36218 into OPERAND0. */
36219 void
36220 ix86_expand_round (rtx operand0, rtx operand1)
36221 {
36222 /* C code for the stuff we're doing below:
36223 double xa = fabs (x);
36224 if (!isless (xa, TWO52))
36225 return x;
36226 xa = (double)(long)(xa + nextafter (0.5, 0.0));
36227 return copysign (xa, x);
36228 */
36229 enum machine_mode mode = GET_MODE (operand0);
36230 rtx res, TWO52, xa, label, xi, half, mask;
36231 const struct real_format *fmt;
36232 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36233
36234 /* Temporary for holding the result, initialized to the input
36235 operand to ease control flow. */
36236 res = gen_reg_rtx (mode);
36237 emit_move_insn (res, operand1);
36238
36239 TWO52 = ix86_gen_TWO52 (mode);
36240 xa = ix86_expand_sse_fabs (res, &mask);
36241 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36242
36243 /* load nextafter (0.5, 0.0) */
36244 fmt = REAL_MODE_FORMAT (mode);
36245 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36246 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36247
36248 /* xa = xa + 0.5 */
36249 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
36250 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
36251
36252 /* xa = (double)(int64_t)xa */
36253 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36254 expand_fix (xi, xa, 0);
36255 expand_float (xa, xi, 0);
36256
36257 /* res = copysign (xa, operand1) */
36258 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
36259
36260 emit_label (label);
36261 LABEL_NUSES (label) = 1;
36262
36263 emit_move_insn (operand0, res);
36264 }
36265
36266 /* Expand SSE sequence for computing round
36267 from OP1 storing into OP0 using the SSE4.1 round insn. */
36268 void
36269 ix86_expand_round_sse4 (rtx op0, rtx op1)
36270 {
36271 enum machine_mode mode = GET_MODE (op0);
36272 rtx e1, e2, res, half;
36273 const struct real_format *fmt;
36274 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36275 rtx (*gen_copysign) (rtx, rtx, rtx);
36276 rtx (*gen_round) (rtx, rtx, rtx);
36277
36278 switch (mode)
36279 {
36280 case SFmode:
36281 gen_copysign = gen_copysignsf3;
36282 gen_round = gen_sse4_1_roundsf2;
36283 break;
36284 case DFmode:
36285 gen_copysign = gen_copysigndf3;
36286 gen_round = gen_sse4_1_rounddf2;
36287 break;
36288 default:
36289 gcc_unreachable ();
36290 }
36291
36292 /* round (a) = trunc (a + copysign (0.5, a)) */
36293
36294 /* load nextafter (0.5, 0.0) */
36295 fmt = REAL_MODE_FORMAT (mode);
36296 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36297 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36298 half = const_double_from_real_value (pred_half, mode);
36299
36300 /* e1 = copysign (0.5, op1) */
36301 e1 = gen_reg_rtx (mode);
36302 emit_insn (gen_copysign (e1, half, op1));
36303
36304 /* e2 = op1 + e1 */
36305 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
36306
36307 /* res = trunc (e2) */
36308 res = gen_reg_rtx (mode);
36309 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
36310
36311 emit_move_insn (op0, res);
36312 }
36313 \f
36314
36315 /* Table of valid machine attributes. */
36316 static const struct attribute_spec ix86_attribute_table[] =
36317 {
36318 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
36319 affects_type_identity } */
36320 /* Stdcall attribute says callee is responsible for popping arguments
36321 if they are not variable. */
36322 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36323 true },
36324 /* Fastcall attribute says callee is responsible for popping arguments
36325 if they are not variable. */
36326 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36327 true },
36328 /* Thiscall attribute says callee is responsible for popping arguments
36329 if they are not variable. */
36330 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36331 true },
36332 /* Cdecl attribute says the callee is a normal C declaration */
36333 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36334 true },
36335 /* Regparm attribute specifies how many integer arguments are to be
36336 passed in registers. */
36337 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
36338 true },
36339 /* Sseregparm attribute says we are using x86_64 calling conventions
36340 for FP arguments. */
36341 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
36342 true },
36343 /* The transactional memory builtins are implicitly regparm or fastcall
36344 depending on the ABI. Override the generic do-nothing attribute that
36345 these builtins were declared with. */
36346 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
36347 true },
36348 /* force_align_arg_pointer says this function realigns the stack at entry. */
36349 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
36350 false, true, true, ix86_handle_cconv_attribute, false },
36351 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
36352 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
36353 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
36354 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
36355 false },
36356 #endif
36357 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36358 false },
36359 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
36360 false },
36361 #ifdef SUBTARGET_ATTRIBUTE_TABLE
36362 SUBTARGET_ATTRIBUTE_TABLE,
36363 #endif
36364 /* ms_abi and sysv_abi calling convention function attributes. */
36365 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36366 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
36367 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
36368 false },
36369 { "callee_pop_aggregate_return", 1, 1, false, true, true,
36370 ix86_handle_callee_pop_aggregate_return, true },
36371 /* End element. */
36372 { NULL, 0, 0, false, false, false, NULL, false }
36373 };
36374
36375 /* Implement targetm.vectorize.builtin_vectorization_cost. */
36376 static int
36377 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
36378 tree vectype,
36379 int misalign ATTRIBUTE_UNUSED)
36380 {
36381 unsigned elements;
36382
36383 switch (type_of_cost)
36384 {
36385 case scalar_stmt:
36386 return ix86_cost->scalar_stmt_cost;
36387
36388 case scalar_load:
36389 return ix86_cost->scalar_load_cost;
36390
36391 case scalar_store:
36392 return ix86_cost->scalar_store_cost;
36393
36394 case vector_stmt:
36395 return ix86_cost->vec_stmt_cost;
36396
36397 case vector_load:
36398 return ix86_cost->vec_align_load_cost;
36399
36400 case vector_store:
36401 return ix86_cost->vec_store_cost;
36402
36403 case vec_to_scalar:
36404 return ix86_cost->vec_to_scalar_cost;
36405
36406 case scalar_to_vec:
36407 return ix86_cost->scalar_to_vec_cost;
36408
36409 case unaligned_load:
36410 case unaligned_store:
36411 return ix86_cost->vec_unalign_load_cost;
36412
36413 case cond_branch_taken:
36414 return ix86_cost->cond_taken_branch_cost;
36415
36416 case cond_branch_not_taken:
36417 return ix86_cost->cond_not_taken_branch_cost;
36418
36419 case vec_perm:
36420 case vec_promote_demote:
36421 return ix86_cost->vec_stmt_cost;
36422
36423 case vec_construct:
36424 elements = TYPE_VECTOR_SUBPARTS (vectype);
36425 return elements / 2 + 1;
36426
36427 default:
36428 gcc_unreachable ();
36429 }
36430 }
36431
36432 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
36433 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
36434 insn every time. */
36435
36436 static GTY(()) rtx vselect_insn;
36437
36438 /* Initialize vselect_insn. */
36439
36440 static void
36441 init_vselect_insn (void)
36442 {
36443 unsigned i;
36444 rtx x;
36445
36446 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
36447 for (i = 0; i < MAX_VECT_LEN; ++i)
36448 XVECEXP (x, 0, i) = const0_rtx;
36449 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
36450 const0_rtx), x);
36451 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
36452 start_sequence ();
36453 vselect_insn = emit_insn (x);
36454 end_sequence ();
36455 }
36456
36457 /* Construct (set target (vec_select op0 (parallel perm))) and
36458 return true if that's a valid instruction in the active ISA. */
36459
36460 static bool
36461 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
36462 unsigned nelt, bool testing_p)
36463 {
36464 unsigned int i;
36465 rtx x, save_vconcat;
36466 int icode;
36467
36468 if (vselect_insn == NULL_RTX)
36469 init_vselect_insn ();
36470
36471 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
36472 PUT_NUM_ELEM (XVEC (x, 0), nelt);
36473 for (i = 0; i < nelt; ++i)
36474 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
36475 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36476 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
36477 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
36478 SET_DEST (PATTERN (vselect_insn)) = target;
36479 icode = recog_memoized (vselect_insn);
36480
36481 if (icode >= 0 && !testing_p)
36482 emit_insn (copy_rtx (PATTERN (vselect_insn)));
36483
36484 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
36485 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
36486 INSN_CODE (vselect_insn) = -1;
36487
36488 return icode >= 0;
36489 }
36490
36491 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36492
36493 static bool
36494 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36495 const unsigned char *perm, unsigned nelt,
36496 bool testing_p)
36497 {
36498 enum machine_mode v2mode;
36499 rtx x;
36500 bool ok;
36501
36502 if (vselect_insn == NULL_RTX)
36503 init_vselect_insn ();
36504
36505 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
36506 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
36507 PUT_MODE (x, v2mode);
36508 XEXP (x, 0) = op0;
36509 XEXP (x, 1) = op1;
36510 ok = expand_vselect (target, x, perm, nelt, testing_p);
36511 XEXP (x, 0) = const0_rtx;
36512 XEXP (x, 1) = const0_rtx;
36513 return ok;
36514 }
36515
36516 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36517 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36518
36519 static bool
36520 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36521 {
36522 enum machine_mode vmode = d->vmode;
36523 unsigned i, mask, nelt = d->nelt;
36524 rtx target, op0, op1, x;
36525 rtx rperm[32], vperm;
36526
36527 if (d->one_operand_p)
36528 return false;
36529 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36530 ;
36531 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36532 ;
36533 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36534 ;
36535 else
36536 return false;
36537
36538 /* This is a blend, not a permute. Elements must stay in their
36539 respective lanes. */
36540 for (i = 0; i < nelt; ++i)
36541 {
36542 unsigned e = d->perm[i];
36543 if (!(e == i || e == i + nelt))
36544 return false;
36545 }
36546
36547 if (d->testing_p)
36548 return true;
36549
36550 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36551 decision should be extracted elsewhere, so that we only try that
36552 sequence once all budget==3 options have been tried. */
36553 target = d->target;
36554 op0 = d->op0;
36555 op1 = d->op1;
36556 mask = 0;
36557
36558 switch (vmode)
36559 {
36560 case V4DFmode:
36561 case V8SFmode:
36562 case V2DFmode:
36563 case V4SFmode:
36564 case V8HImode:
36565 case V8SImode:
36566 for (i = 0; i < nelt; ++i)
36567 mask |= (d->perm[i] >= nelt) << i;
36568 break;
36569
36570 case V2DImode:
36571 for (i = 0; i < 2; ++i)
36572 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36573 vmode = V8HImode;
36574 goto do_subreg;
36575
36576 case V4SImode:
36577 for (i = 0; i < 4; ++i)
36578 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36579 vmode = V8HImode;
36580 goto do_subreg;
36581
36582 case V16QImode:
36583 /* See if bytes move in pairs so we can use pblendw with
36584 an immediate argument, rather than pblendvb with a vector
36585 argument. */
36586 for (i = 0; i < 16; i += 2)
36587 if (d->perm[i] + 1 != d->perm[i + 1])
36588 {
36589 use_pblendvb:
36590 for (i = 0; i < nelt; ++i)
36591 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36592
36593 finish_pblendvb:
36594 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36595 vperm = force_reg (vmode, vperm);
36596
36597 if (GET_MODE_SIZE (vmode) == 16)
36598 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36599 else
36600 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36601 return true;
36602 }
36603
36604 for (i = 0; i < 8; ++i)
36605 mask |= (d->perm[i * 2] >= 16) << i;
36606 vmode = V8HImode;
36607 /* FALLTHRU */
36608
36609 do_subreg:
36610 target = gen_lowpart (vmode, target);
36611 op0 = gen_lowpart (vmode, op0);
36612 op1 = gen_lowpart (vmode, op1);
36613 break;
36614
36615 case V32QImode:
36616 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36617 for (i = 0; i < 32; i += 2)
36618 if (d->perm[i] + 1 != d->perm[i + 1])
36619 goto use_pblendvb;
36620 /* See if bytes move in quadruplets. If yes, vpblendd
36621 with immediate can be used. */
36622 for (i = 0; i < 32; i += 4)
36623 if (d->perm[i] + 2 != d->perm[i + 2])
36624 break;
36625 if (i < 32)
36626 {
36627 /* See if bytes move the same in both lanes. If yes,
36628 vpblendw with immediate can be used. */
36629 for (i = 0; i < 16; i += 2)
36630 if (d->perm[i] + 16 != d->perm[i + 16])
36631 goto use_pblendvb;
36632
36633 /* Use vpblendw. */
36634 for (i = 0; i < 16; ++i)
36635 mask |= (d->perm[i * 2] >= 32) << i;
36636 vmode = V16HImode;
36637 goto do_subreg;
36638 }
36639
36640 /* Use vpblendd. */
36641 for (i = 0; i < 8; ++i)
36642 mask |= (d->perm[i * 4] >= 32) << i;
36643 vmode = V8SImode;
36644 goto do_subreg;
36645
36646 case V16HImode:
36647 /* See if words move in pairs. If yes, vpblendd can be used. */
36648 for (i = 0; i < 16; i += 2)
36649 if (d->perm[i] + 1 != d->perm[i + 1])
36650 break;
36651 if (i < 16)
36652 {
36653 /* See if words move the same in both lanes. If not,
36654 vpblendvb must be used. */
36655 for (i = 0; i < 8; i++)
36656 if (d->perm[i] + 8 != d->perm[i + 8])
36657 {
36658 /* Use vpblendvb. */
36659 for (i = 0; i < 32; ++i)
36660 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36661
36662 vmode = V32QImode;
36663 nelt = 32;
36664 target = gen_lowpart (vmode, target);
36665 op0 = gen_lowpart (vmode, op0);
36666 op1 = gen_lowpart (vmode, op1);
36667 goto finish_pblendvb;
36668 }
36669
36670 /* Use vpblendw. */
36671 for (i = 0; i < 16; ++i)
36672 mask |= (d->perm[i] >= 16) << i;
36673 break;
36674 }
36675
36676 /* Use vpblendd. */
36677 for (i = 0; i < 8; ++i)
36678 mask |= (d->perm[i * 2] >= 16) << i;
36679 vmode = V8SImode;
36680 goto do_subreg;
36681
36682 case V4DImode:
36683 /* Use vpblendd. */
36684 for (i = 0; i < 4; ++i)
36685 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36686 vmode = V8SImode;
36687 goto do_subreg;
36688
36689 default:
36690 gcc_unreachable ();
36691 }
36692
36693 /* This VEC_MERGE form matches five different blend patterns, depending on the mode. */
36694 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36695 x = gen_rtx_SET (VOIDmode, target, x);
36696 emit_insn (x);
36697
36698 return true;
36699 }
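
/* Illustrative only -- a C model of how expand_vec_perm_blend derives the
   blend immediate for the element-sized cases above: bit I of the mask
   selects OP1 for result element I exactly when perm[I] >= nelt.
   blend_mask_model is a hypothetical name.  For example, the V8HImode
   permutation {0,9,2,11,4,13,6,15} yields the pblendw immediate 0xaa.  */

static unsigned int ATTRIBUTE_UNUSED
blend_mask_model (const unsigned char *perm, unsigned int nelt)
{
  unsigned int i, mask = 0;

  for (i = 0; i < nelt; ++i)
    mask |= (unsigned int) (perm[i] >= nelt) << i;

  return mask;
}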
36700
36701 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36702 in terms of the variable form of vpermilps.
36703
36704 Note that we will have already failed the immediate input vpermilps,
36705 which requires that the high and low part shuffle be identical; the
36706 variable form doesn't require that. */
36707
36708 static bool
36709 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36710 {
36711 rtx rperm[8], vperm;
36712 unsigned i;
36713
36714 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
36715 return false;
36716
36717 /* We can only permute within the 128-bit lane. */
36718 for (i = 0; i < 8; ++i)
36719 {
36720 unsigned e = d->perm[i];
36721 if (i < 4 ? e >= 4 : e < 4)
36722 return false;
36723 }
36724
36725 if (d->testing_p)
36726 return true;
36727
36728 for (i = 0; i < 8; ++i)
36729 {
36730 unsigned e = d->perm[i];
36731
36732 /* Within each 128-bit lane, the elements of op0 are numbered
36733 from 0 and the elements of op1 are numbered from 4. */
36734 if (e >= 8 + 4)
36735 e -= 8;
36736 else if (e >= 4)
36737 e -= 4;
36738
36739 rperm[i] = GEN_INT (e);
36740 }
36741
36742 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36743 vperm = force_reg (V8SImode, vperm);
36744 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36745
36746 return true;
36747 }
36748
36749 /* Return true if permutation D can be performed as a VMODE permutation
36750 instead. */
36751
36752 static bool
36753 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36754 {
36755 unsigned int i, j, chunk;
36756
36757 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36758 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36759 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36760 return false;
36761
36762 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36763 return true;
36764
36765 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36766 for (i = 0; i < d->nelt; i += chunk)
36767 if (d->perm[i] & (chunk - 1))
36768 return false;
36769 else
36770 for (j = 1; j < chunk; ++j)
36771 if (d->perm[i] + j != d->perm[i + j])
36772 return false;
36773
36774 return true;
36775 }
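
/* Illustrative only -- a C model of the test valid_perm_using_mode_p
   performs: a permutation of NELT elements can be re-expressed with wider
   elements covering CHUNK original elements each iff every chunk starts on
   a chunk boundary and its indices stay consecutive.  perm_chunks_ok is a
   hypothetical name; for instance a 32-byte shuffle that only moves whole
   64-bit groups passes with chunk == 8 and can then be done as a V4DImode
   vpermq.  */

static int ATTRIBUTE_UNUSED
perm_chunks_ok (const unsigned char *perm, unsigned int nelt,
                unsigned int chunk)
{
  unsigned int i, j;

  for (i = 0; i < nelt; i += chunk)
    {
      if (perm[i] & (chunk - 1))             /* must start chunk-aligned */
        return 0;
      for (j = 1; j < chunk; ++j)
        if (perm[i] + j != perm[i + j])      /* must stay consecutive */
          return 0;
    }

  return 1;
}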
36776
36777 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36778 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
36779
36780 static bool
36781 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36782 {
36783 unsigned i, nelt, eltsz, mask;
36784 unsigned char perm[32];
36785 enum machine_mode vmode = V16QImode;
36786 rtx rperm[32], vperm, target, op0, op1;
36787
36788 nelt = d->nelt;
36789
36790 if (!d->one_operand_p)
36791 {
36792 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36793 {
36794 if (TARGET_AVX2
36795 && valid_perm_using_mode_p (V2TImode, d))
36796 {
36797 if (d->testing_p)
36798 return true;
36799
36800 /* Use vperm2i128 insn. The pattern uses
36801 V4DImode instead of V2TImode. */
36802 target = gen_lowpart (V4DImode, d->target);
36803 op0 = gen_lowpart (V4DImode, d->op0);
36804 op1 = gen_lowpart (V4DImode, d->op1);
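/* Illustration of the immediate: bits [1:0] select the source of the
   low 128-bit half of the result and bits [5:4] the high half, where
   0/1 mean the low/high half of op0 and 2/3 the low/high half of op1.
   E.g. for V32QImode, d->perm[0] == 32 and d->perm[16] == 16 give
   (32 / 16) | ((16 / 16) * 16) == 0x12.  */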
36805 rperm[0]
36806 = GEN_INT ((d->perm[0] / (nelt / 2))
36807 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
36808 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36809 return true;
36810 }
36811 return false;
36812 }
36813 }
36814 else
36815 {
36816 if (GET_MODE_SIZE (d->vmode) == 16)
36817 {
36818 if (!TARGET_SSSE3)
36819 return false;
36820 }
36821 else if (GET_MODE_SIZE (d->vmode) == 32)
36822 {
36823 if (!TARGET_AVX2)
36824 return false;
36825
36826 /* V4DImode should be already handled through
36827 expand_vselect by vpermq instruction. */
36828 gcc_assert (d->vmode != V4DImode);
36829
36830 vmode = V32QImode;
36831 if (d->vmode == V8SImode
36832 || d->vmode == V16HImode
36833 || d->vmode == V32QImode)
36834 {
36835 /* First see if vpermq can be used for
36836 V8SImode/V16HImode/V32QImode. */
36837 if (valid_perm_using_mode_p (V4DImode, d))
36838 {
36839 for (i = 0; i < 4; i++)
36840 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36841 if (d->testing_p)
36842 return true;
36843 return expand_vselect (gen_lowpart (V4DImode, d->target),
36844 gen_lowpart (V4DImode, d->op0),
36845 perm, 4, false);
36846 }
36847
36848 /* Next see if vpermd can be used. */
36849 if (valid_perm_using_mode_p (V8SImode, d))
36850 vmode = V8SImode;
36851 }
36852 /* Or if vpermps can be used. */
36853 else if (d->vmode == V8SFmode)
36854 vmode = V8SImode;
36855
36856 if (vmode == V32QImode)
36857 {
36858 /* vpshufb only works within 128-bit lanes; it cannot
36859 shuffle bytes between the lanes. */
36860 for (i = 0; i < nelt; ++i)
36861 if ((d->perm[i] ^ i) & (nelt / 2))
36862 return false;
36863 }
36864 }
36865 else
36866 return false;
36867 }
36868
36869 if (d->testing_p)
36870 return true;
36871
36872 if (vmode == V8SImode)
36873 for (i = 0; i < 8; ++i)
36874 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36875 else
36876 {
36877 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36878 if (!d->one_operand_p)
36879 mask = 2 * nelt - 1;
36880 else if (vmode == V16QImode)
36881 mask = nelt - 1;
36882 else
36883 mask = nelt / 2 - 1;
36884
36885 for (i = 0; i < nelt; ++i)
36886 {
36887 unsigned j, e = d->perm[i] & mask;
36888 for (j = 0; j < eltsz; ++j)
36889 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36890 }
36891 }
36892
36893 vperm = gen_rtx_CONST_VECTOR (vmode,
36894 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36895 vperm = force_reg (vmode, vperm);
36896
36897 target = gen_lowpart (vmode, d->target);
36898 op0 = gen_lowpart (vmode, d->op0);
36899 if (d->one_operand_p)
36900 {
36901 if (vmode == V16QImode)
36902 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36903 else if (vmode == V32QImode)
36904 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36905 else if (vmode == V8SFmode)
36906 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
36907 else
36908 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
36909 }
36910 else
36911 {
36912 op1 = gen_lowpart (vmode, d->op1);
36913 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36914 }
36915
36916 return true;
36917 }
36918
36919 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36920 in a single instruction. */
36921
36922 static bool
36923 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36924 {
36925 unsigned i, nelt = d->nelt;
36926 unsigned char perm2[MAX_VECT_LEN];
36927
36928 /* Check plain VEC_SELECT first, because AVX has instructions that could
36929 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36930 input where SEL+CONCAT may not. */
36931 if (d->one_operand_p)
36932 {
36933 int mask = nelt - 1;
36934 bool identity_perm = true;
36935 bool broadcast_perm = true;
36936
36937 for (i = 0; i < nelt; i++)
36938 {
36939 perm2[i] = d->perm[i] & mask;
36940 if (perm2[i] != i)
36941 identity_perm = false;
36942 if (perm2[i])
36943 broadcast_perm = false;
36944 }
36945
36946 if (identity_perm)
36947 {
36948 if (!d->testing_p)
36949 emit_move_insn (d->target, d->op0);
36950 return true;
36951 }
36952 else if (broadcast_perm && TARGET_AVX2)
36953 {
36954 /* Use vpbroadcast{b,w,d}. */
36955 rtx (*gen) (rtx, rtx) = NULL;
36956 switch (d->vmode)
36957 {
36958 case V32QImode:
36959 gen = gen_avx2_pbroadcastv32qi_1;
36960 break;
36961 case V16HImode:
36962 gen = gen_avx2_pbroadcastv16hi_1;
36963 break;
36964 case V8SImode:
36965 gen = gen_avx2_pbroadcastv8si_1;
36966 break;
36967 case V16QImode:
36968 gen = gen_avx2_pbroadcastv16qi;
36969 break;
36970 case V8HImode:
36971 gen = gen_avx2_pbroadcastv8hi;
36972 break;
36973 case V8SFmode:
36974 gen = gen_avx2_vec_dupv8sf_1;
36975 break;
36976 /* For other modes, prefer the other shuffles this function creates. */
36977 default: break;
36978 }
36979 if (gen != NULL)
36980 {
36981 if (!d->testing_p)
36982 emit_insn (gen (d->target, d->op0));
36983 return true;
36984 }
36985 }
36986
36987 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36988 return true;
36989
36990 /* There are plenty of patterns in sse.md that are written for
36991 SEL+CONCAT and are not replicated for a single op. Perhaps
36992 that should be changed, to avoid the nastiness here. */
36993
36994 /* Recognize interleave style patterns, which means incrementing
36995 every other permutation operand. */
36996 for (i = 0; i < nelt; i += 2)
36997 {
36998 perm2[i] = d->perm[i] & mask;
36999 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
37000 }
37001 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37002 d->testing_p))
37003 return true;
37004
37005 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
37006 if (nelt >= 4)
37007 {
37008 for (i = 0; i < nelt; i += 4)
37009 {
37010 perm2[i + 0] = d->perm[i + 0] & mask;
37011 perm2[i + 1] = d->perm[i + 1] & mask;
37012 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
37013 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
37014 }
37015
37016 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37017 d->testing_p))
37018 return true;
37019 }
37020 }
37021
37022 /* Finally, try the fully general two operand permute. */
37023 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
37024 d->testing_p))
37025 return true;
37026
37027 /* Recognize interleave style patterns with reversed operands. */
37028 if (!d->one_operand_p)
37029 {
37030 for (i = 0; i < nelt; ++i)
37031 {
37032 unsigned e = d->perm[i];
37033 if (e >= nelt)
37034 e -= nelt;
37035 else
37036 e += nelt;
37037 perm2[i] = e;
37038 }
37039
37040 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
37041 d->testing_p))
37042 return true;
37043 }
37044
37045 /* Try the SSE4.1 blend variable merge instructions. */
37046 if (expand_vec_perm_blend (d))
37047 return true;
37048
37049 /* Try one of the AVX vpermil variable permutations. */
37050 if (expand_vec_perm_vpermil (d))
37051 return true;
37052
37053 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
37054 vpshufb, vpermd, vpermps or vpermq variable permutation. */
37055 if (expand_vec_perm_pshufb (d))
37056 return true;
37057
37058 return false;
37059 }
37060
37061 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37062 in terms of a pair of pshuflw + pshufhw instructions. */
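/* For instance, the selector { 2, 1, 3, 0, 7, 5, 6, 4 } keeps its first
   four elements below 4 and its last four at 4 or above, so it can be
   done as pshuflw with { 2, 1, 3, 0, 4, 5, 6, 7 } followed by pshufhw
   with { 0, 1, 2, 3, 7, 5, 6, 4 }.  */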
37063
37064 static bool
37065 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
37066 {
37067 unsigned char perm2[MAX_VECT_LEN];
37068 unsigned i;
37069 bool ok;
37070
37071 if (d->vmode != V8HImode || !d->one_operand_p)
37072 return false;
37073
37074 /* The two permutations only operate in 64-bit lanes. */
37075 for (i = 0; i < 4; ++i)
37076 if (d->perm[i] >= 4)
37077 return false;
37078 for (i = 4; i < 8; ++i)
37079 if (d->perm[i] < 4)
37080 return false;
37081
37082 if (d->testing_p)
37083 return true;
37084
37085 /* Emit the pshuflw. */
37086 memcpy (perm2, d->perm, 4);
37087 for (i = 4; i < 8; ++i)
37088 perm2[i] = i;
37089 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
37090 gcc_assert (ok);
37091
37092 /* Emit the pshufhw. */
37093 memcpy (perm2 + 4, d->perm + 4, 4);
37094 for (i = 0; i < 4; ++i)
37095 perm2[i] = i;
37096 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
37097 gcc_assert (ok);
37098
37099 return true;
37100 }
37101
37102 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37103 the permutation using the SSSE3 palignr instruction. This succeeds
37104 when all of the elements in PERM fit within one vector and we merely
37105 need to shift them down so that a single vector permutation has a
37106 chance to succeed. */
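/* A small example: for V16QImode with d->perm == { 3, 4, ..., 18 },
   min == 3, so palignr by 3 bytes shifts the concatenated operands down
   and the remaining single-operand permutation is the identity.  */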
37107
37108 static bool
37109 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
37110 {
37111 unsigned i, nelt = d->nelt;
37112 unsigned min, max;
37113 bool in_order, ok;
37114 rtx shift;
37115
37116 /* Even with AVX, palignr only operates on 128-bit vectors. */
37117 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37118 return false;
37119
37120 min = nelt, max = 0;
37121 for (i = 0; i < nelt; ++i)
37122 {
37123 unsigned e = d->perm[i];
37124 if (e < min)
37125 min = e;
37126 if (e > max)
37127 max = e;
37128 }
37129 if (min == 0 || max - min >= nelt)
37130 return false;
37131
37132 /* Given that we have SSSE3, we know we'll be able to implement the
37133 single operand permutation after the palignr with pshufb. */
37134 if (d->testing_p)
37135 return true;
37136
37137 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
37138 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
37139 gen_lowpart (TImode, d->op1),
37140 gen_lowpart (TImode, d->op0), shift));
37141
37142 d->op0 = d->op1 = d->target;
37143 d->one_operand_p = true;
37144
37145 in_order = true;
37146 for (i = 0; i < nelt; ++i)
37147 {
37148 unsigned e = d->perm[i] - min;
37149 if (e != i)
37150 in_order = false;
37151 d->perm[i] = e;
37152 }
37153
37154 /* Test for the degenerate case where the alignment by itself
37155 produces the desired permutation. */
37156 if (in_order)
37157 return true;
37158
37159 ok = expand_vec_perm_1 (d);
37160 gcc_assert (ok);
37161
37162 return ok;
37163 }
37164
37165 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
37166
37167 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37168 a two vector permutation into a single vector permutation by using
37169 an interleave operation to merge the vectors. */
37170
37171 static bool
37172 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
37173 {
37174 struct expand_vec_perm_d dremap, dfinal;
37175 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
37176 unsigned HOST_WIDE_INT contents;
37177 unsigned char remap[2 * MAX_VECT_LEN];
37178 rtx seq;
37179 bool ok, same_halves = false;
37180
37181 if (GET_MODE_SIZE (d->vmode) == 16)
37182 {
37183 if (d->one_operand_p)
37184 return false;
37185 }
37186 else if (GET_MODE_SIZE (d->vmode) == 32)
37187 {
37188 if (!TARGET_AVX)
37189 return false;
37190 /* For 32-byte modes allow even d->one_operand_p.
37191 The lack of cross-lane shuffling in some instructions
37192 might prevent a single insn shuffle. */
37193 dfinal = *d;
37194 dfinal.testing_p = true;
37195 /* If expand_vec_perm_interleave3 can expand this into
37196 a 3 insn sequence, give up and let it be expanded that
37197 way instead.  While that is one insn longer, it doesn't
37198 need a memory operand, and in the common case where the
37199 interleave low and interleave high permutations with the
37200 same operands are adjacent, only 4 insns are needed for
37201 both of them after CSE. */
37202 if (expand_vec_perm_interleave3 (&dfinal))
37203 return false;
37204 }
37205 else
37206 return false;
37207
37208 /* Examine from whence the elements come. */
37209 contents = 0;
37210 for (i = 0; i < nelt; ++i)
37211 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
37212
37213 memset (remap, 0xff, sizeof (remap));
37214 dremap = *d;
37215
37216 if (GET_MODE_SIZE (d->vmode) == 16)
37217 {
37218 unsigned HOST_WIDE_INT h1, h2, h3, h4;
37219
37220 /* Split the two input vectors into 4 halves. */
37221 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
37222 h2 = h1 << nelt2;
37223 h3 = h2 << nelt2;
37224 h4 = h3 << nelt2;
37225
37226 /* If the elements are all from the low halves, use interleave low; similarly
37227 for interleave high.  If the elements come from mismatched halves, we
37228 can use shufps for V4SF/V4SI or do a DImode shuffle. */
37229 if ((contents & (h1 | h3)) == contents)
37230 {
37231 /* punpckl* */
37232 for (i = 0; i < nelt2; ++i)
37233 {
37234 remap[i] = i * 2;
37235 remap[i + nelt] = i * 2 + 1;
37236 dremap.perm[i * 2] = i;
37237 dremap.perm[i * 2 + 1] = i + nelt;
37238 }
37239 if (!TARGET_SSE2 && d->vmode == V4SImode)
37240 dremap.vmode = V4SFmode;
37241 }
37242 else if ((contents & (h2 | h4)) == contents)
37243 {
37244 /* punpckh* */
37245 for (i = 0; i < nelt2; ++i)
37246 {
37247 remap[i + nelt2] = i * 2;
37248 remap[i + nelt + nelt2] = i * 2 + 1;
37249 dremap.perm[i * 2] = i + nelt2;
37250 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
37251 }
37252 if (!TARGET_SSE2 && d->vmode == V4SImode)
37253 dremap.vmode = V4SFmode;
37254 }
37255 else if ((contents & (h1 | h4)) == contents)
37256 {
37257 /* shufps */
37258 for (i = 0; i < nelt2; ++i)
37259 {
37260 remap[i] = i;
37261 remap[i + nelt + nelt2] = i + nelt2;
37262 dremap.perm[i] = i;
37263 dremap.perm[i + nelt2] = i + nelt + nelt2;
37264 }
37265 if (nelt != 4)
37266 {
37267 /* shufpd */
37268 dremap.vmode = V2DImode;
37269 dremap.nelt = 2;
37270 dremap.perm[0] = 0;
37271 dremap.perm[1] = 3;
37272 }
37273 }
37274 else if ((contents & (h2 | h3)) == contents)
37275 {
37276 /* shufps */
37277 for (i = 0; i < nelt2; ++i)
37278 {
37279 remap[i + nelt2] = i;
37280 remap[i + nelt] = i + nelt2;
37281 dremap.perm[i] = i + nelt2;
37282 dremap.perm[i + nelt2] = i + nelt;
37283 }
37284 if (nelt != 4)
37285 {
37286 /* shufpd */
37287 dremap.vmode = V2DImode;
37288 dremap.nelt = 2;
37289 dremap.perm[0] = 1;
37290 dremap.perm[1] = 2;
37291 }
37292 }
37293 else
37294 return false;
37295 }
37296 else
37297 {
37298 unsigned int nelt4 = nelt / 4, nzcnt = 0;
37299 unsigned HOST_WIDE_INT q[8];
37300 unsigned int nonzero_halves[4];
37301
37302 /* Split the two input vectors into 8 quarters. */
37303 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
37304 for (i = 1; i < 8; ++i)
37305 q[i] = q[0] << (nelt4 * i);
37306 for (i = 0; i < 4; ++i)
37307 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
37308 {
37309 nonzero_halves[nzcnt] = i;
37310 ++nzcnt;
37311 }
37312
37313 if (nzcnt == 1)
37314 {
37315 gcc_assert (d->one_operand_p);
37316 nonzero_halves[1] = nonzero_halves[0];
37317 same_halves = true;
37318 }
37319 else if (d->one_operand_p)
37320 {
37321 gcc_assert (nonzero_halves[0] == 0);
37322 gcc_assert (nonzero_halves[1] == 1);
37323 }
37324
37325 if (nzcnt <= 2)
37326 {
37327 if (d->perm[0] / nelt2 == nonzero_halves[1])
37328 {
37329 /* Attempt to increase the likelihood that dfinal
37330 shuffle will be intra-lane. */
37331 char tmph = nonzero_halves[0];
37332 nonzero_halves[0] = nonzero_halves[1];
37333 nonzero_halves[1] = tmph;
37334 }
37335
37336 /* vperm2f128 or vperm2i128. */
37337 for (i = 0; i < nelt2; ++i)
37338 {
37339 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
37340 remap[i + nonzero_halves[0] * nelt2] = i;
37341 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
37342 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
37343 }
37344
37345 if (d->vmode != V8SFmode
37346 && d->vmode != V4DFmode
37347 && d->vmode != V8SImode)
37348 {
37349 dremap.vmode = V8SImode;
37350 dremap.nelt = 8;
37351 for (i = 0; i < 4; ++i)
37352 {
37353 dremap.perm[i] = i + nonzero_halves[0] * 4;
37354 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
37355 }
37356 }
37357 }
37358 else if (d->one_operand_p)
37359 return false;
37360 else if (TARGET_AVX2
37361 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
37362 {
37363 /* vpunpckl* */
37364 for (i = 0; i < nelt4; ++i)
37365 {
37366 remap[i] = i * 2;
37367 remap[i + nelt] = i * 2 + 1;
37368 remap[i + nelt2] = i * 2 + nelt2;
37369 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
37370 dremap.perm[i * 2] = i;
37371 dremap.perm[i * 2 + 1] = i + nelt;
37372 dremap.perm[i * 2 + nelt2] = i + nelt2;
37373 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
37374 }
37375 }
37376 else if (TARGET_AVX2
37377 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
37378 {
37379 /* vpunpckh* */
37380 for (i = 0; i < nelt4; ++i)
37381 {
37382 remap[i + nelt4] = i * 2;
37383 remap[i + nelt + nelt4] = i * 2 + 1;
37384 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
37385 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
37386 dremap.perm[i * 2] = i + nelt4;
37387 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
37388 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
37389 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
37390 }
37391 }
37392 else
37393 return false;
37394 }
37395
37396 /* Use the remapping array set up above to move the elements from their
37397 swizzled locations into their final destinations. */
37398 dfinal = *d;
37399 for (i = 0; i < nelt; ++i)
37400 {
37401 unsigned e = remap[d->perm[i]];
37402 gcc_assert (e < nelt);
37403 /* If same_halves is true, both halves of the remapped vector are the
37404 same. Avoid cross-lane accesses if possible. */
37405 if (same_halves && i >= nelt2)
37406 {
37407 gcc_assert (e < nelt2);
37408 dfinal.perm[i] = e + nelt2;
37409 }
37410 else
37411 dfinal.perm[i] = e;
37412 }
37413 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
37414 dfinal.op1 = dfinal.op0;
37415 dfinal.one_operand_p = true;
37416 dremap.target = dfinal.op0;
37417
37418 /* Test if the final remap can be done with a single insn. For V4SFmode or
37419 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
37420 start_sequence ();
37421 ok = expand_vec_perm_1 (&dfinal);
37422 seq = get_insns ();
37423 end_sequence ();
37424
37425 if (!ok)
37426 return false;
37427
37428 if (d->testing_p)
37429 return true;
37430
37431 if (dremap.vmode != dfinal.vmode)
37432 {
37433 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
37434 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
37435 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
37436 }
37437
37438 ok = expand_vec_perm_1 (&dremap);
37439 gcc_assert (ok);
37440
37441 emit_insn (seq);
37442 return true;
37443 }
37444
37445 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37446 a single vector cross-lane permutation into vpermq followed
37447 by any of the single insn permutations. */
37448
37449 static bool
37450 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
37451 {
37452 struct expand_vec_perm_d dremap, dfinal;
37453 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
37454 unsigned contents[2];
37455 bool ok;
37456
37457 if (!(TARGET_AVX2
37458 && (d->vmode == V32QImode || d->vmode == V16HImode)
37459 && d->one_operand_p))
37460 return false;
37461
37462 contents[0] = 0;
37463 contents[1] = 0;
37464 for (i = 0; i < nelt2; ++i)
37465 {
37466 contents[0] |= 1u << (d->perm[i] / nelt4);
37467 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
37468 }
37469
37470 for (i = 0; i < 2; ++i)
37471 {
37472 unsigned int cnt = 0;
37473 for (j = 0; j < 4; ++j)
37474 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
37475 return false;
37476 }
37477
37478 if (d->testing_p)
37479 return true;
37480
37481 dremap = *d;
37482 dremap.vmode = V4DImode;
37483 dremap.nelt = 4;
37484 dremap.target = gen_reg_rtx (V4DImode);
37485 dremap.op0 = gen_lowpart (V4DImode, d->op0);
37486 dremap.op1 = dremap.op0;
37487 dremap.one_operand_p = true;
37488 for (i = 0; i < 2; ++i)
37489 {
37490 unsigned int cnt = 0;
37491 for (j = 0; j < 4; ++j)
37492 if ((contents[i] & (1u << j)) != 0)
37493 dremap.perm[2 * i + cnt++] = j;
37494 for (; cnt < 2; ++cnt)
37495 dremap.perm[2 * i + cnt] = 0;
37496 }
37497
37498 dfinal = *d;
37499 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
37500 dfinal.op1 = dfinal.op0;
37501 dfinal.one_operand_p = true;
37502 for (i = 0, j = 0; i < nelt; ++i)
37503 {
37504 if (i == nelt2)
37505 j = 2;
37506 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37507 if ((d->perm[i] / nelt4) == dremap.perm[j])
37508 ;
37509 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37510 dfinal.perm[i] |= nelt4;
37511 else
37512 gcc_unreachable ();
37513 }
37514
37515 ok = expand_vec_perm_1 (&dremap);
37516 gcc_assert (ok);
37517
37518 ok = expand_vec_perm_1 (&dfinal);
37519 gcc_assert (ok);
37520
37521 return true;
37522 }
37523
37524 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
37525 a vector permutation using two instructions: vperm2f128 (or
37526 vperm2i128) followed by any single in-lane permutation. */
37527
37528 static bool
37529 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
37530 {
37531 struct expand_vec_perm_d dfirst, dsecond;
37532 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
37533 bool ok;
37534
37535 if (!TARGET_AVX
37536 || GET_MODE_SIZE (d->vmode) != 32
37537 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
37538 return false;
37539
37540 dsecond = *d;
37541 dsecond.one_operand_p = false;
37542 dsecond.testing_p = true;
37543
37544 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
37545 immediate.  For perm < 16 the second permutation uses
37546 d->op0 as its first operand; for perm >= 16 it uses d->op1
37547 as its first operand.  The second operand is the result of
37548 vperm2[fi]128. */
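/* For example, perm == 6 encodes the immediate ((6 << 2) | 6) & 0x33
   == 0x12: numbering the four 128-bit lanes of the concatenated
   operands 0-3, the low lane of the shuffle result is taken from
   lane 2 and the high lane from lane 1.  */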
37549 for (perm = 0; perm < 32; perm++)
37550 {
37551 /* Ignore permutations which do not move anything cross-lane. */
37552 if (perm < 16)
37553 {
37554 /* The second shuffle for e.g. V4DFmode has
37555 0123 and ABCD operands.
37556 Ignore AB23, as 23 is already in the second lane
37557 of the first operand. */
37558 if ((perm & 0xc) == (1 << 2)) continue;
37559 /* And 01CD, as 01 is in the first lane of the first
37560 operand. */
37561 if ((perm & 3) == 0) continue;
37562 /* And 4567, as then the vperm2[fi]128 doesn't change
37563 anything on the original 4567 second operand. */
37564 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
37565 }
37566 else
37567 {
37568 /* The second shuffle for e.g. V4DFmode has
37569 4567 and ABCD operands.
37570 Ignore AB67, as 67 is already in the second lane
37571 of the first operand. */
37572 if ((perm & 0xc) == (3 << 2)) continue;
37573 /* And 45CD, as 45 is in the first lane of the first
37574 operand. */
37575 if ((perm & 3) == 2) continue;
37576 /* And 0123, as then the vperm2[fi]128 doesn't change
37577 anything on the original 0123 first operand. */
37578 if ((perm & 0xf) == (1 << 2)) continue;
37579 }
37580
37581 for (i = 0; i < nelt; i++)
37582 {
37583 j = d->perm[i] / nelt2;
37584 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
37585 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
37586 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
37587 dsecond.perm[i] = d->perm[i] & (nelt - 1);
37588 else
37589 break;
37590 }
37591
37592 if (i == nelt)
37593 {
37594 start_sequence ();
37595 ok = expand_vec_perm_1 (&dsecond);
37596 end_sequence ();
37597 }
37598 else
37599 ok = false;
37600
37601 if (ok)
37602 {
37603 if (d->testing_p)
37604 return true;
37605
37606 /* Found a usable second shuffle. dfirst will be
37607 vperm2f128 on d->op0 and d->op1. */
37608 dsecond.testing_p = false;
37609 dfirst = *d;
37610 dfirst.target = gen_reg_rtx (d->vmode);
37611 for (i = 0; i < nelt; i++)
37612 dfirst.perm[i] = (i & (nelt2 - 1))
37613 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
37614
37615 ok = expand_vec_perm_1 (&dfirst);
37616 gcc_assert (ok);
37617
37618 /* And dsecond is some single insn shuffle, taking
37619 d->op0 and result of vperm2f128 (if perm < 16) or
37620 d->op1 and result of vperm2f128 (otherwise). */
37621 dsecond.op1 = dfirst.target;
37622 if (perm >= 16)
37623 dsecond.op0 = dfirst.op1;
37624
37625 ok = expand_vec_perm_1 (&dsecond);
37626 gcc_assert (ok);
37627
37628 return true;
37629 }
37630
37631 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
37632 if (d->one_operand_p)
37633 return false;
37634 }
37635
37636 return false;
37637 }
37638
37639 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37640 a two vector permutation using 2 intra-lane interleave insns
37641 and cross-lane shuffle for 32-byte vectors. */
37642
37643 static bool
37644 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37645 {
37646 unsigned i, nelt;
37647 rtx (*gen) (rtx, rtx, rtx);
37648
37649 if (d->one_operand_p)
37650 return false;
37651 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37652 ;
37653 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37654 ;
37655 else
37656 return false;
37657
37658 nelt = d->nelt;
37659 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37660 return false;
37661 for (i = 0; i < nelt; i += 2)
37662 if (d->perm[i] != d->perm[0] + i / 2
37663 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37664 return false;
37665
37666 if (d->testing_p)
37667 return true;
37668
37669 switch (d->vmode)
37670 {
37671 case V32QImode:
37672 if (d->perm[0])
37673 gen = gen_vec_interleave_highv32qi;
37674 else
37675 gen = gen_vec_interleave_lowv32qi;
37676 break;
37677 case V16HImode:
37678 if (d->perm[0])
37679 gen = gen_vec_interleave_highv16hi;
37680 else
37681 gen = gen_vec_interleave_lowv16hi;
37682 break;
37683 case V8SImode:
37684 if (d->perm[0])
37685 gen = gen_vec_interleave_highv8si;
37686 else
37687 gen = gen_vec_interleave_lowv8si;
37688 break;
37689 case V4DImode:
37690 if (d->perm[0])
37691 gen = gen_vec_interleave_highv4di;
37692 else
37693 gen = gen_vec_interleave_lowv4di;
37694 break;
37695 case V8SFmode:
37696 if (d->perm[0])
37697 gen = gen_vec_interleave_highv8sf;
37698 else
37699 gen = gen_vec_interleave_lowv8sf;
37700 break;
37701 case V4DFmode:
37702 if (d->perm[0])
37703 gen = gen_vec_interleave_highv4df;
37704 else
37705 gen = gen_vec_interleave_lowv4df;
37706 break;
37707 default:
37708 gcc_unreachable ();
37709 }
37710
37711 emit_insn (gen (d->target, d->op0, d->op1));
37712 return true;
37713 }
37714
37715 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
37716 a single vector permutation using a single intra-lane vector
37717 permutation, vperm2f128 swapping the lanes and vblend* insn blending
37718 the non-swapped and swapped vectors together. */
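/* A sketch of the idea on V4DFmode: for d->perm == { 2, 1, 0, 3 } the
   in-lane shuffle is the identity, the vperm2f128 produces { 2, 3, 0, 1 },
   and vblendpd with mask 0b0101 takes elements 0 and 2 from the swapped
   copy, giving { 2, 1, 0, 3 }.  */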
37719
37720 static bool
37721 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
37722 {
37723 struct expand_vec_perm_d dfirst, dsecond;
37724 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
37725 rtx seq;
37726 bool ok;
37727 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
37728
37729 if (!TARGET_AVX
37730 || TARGET_AVX2
37731 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
37732 || !d->one_operand_p)
37733 return false;
37734
37735 dfirst = *d;
37736 for (i = 0; i < nelt; i++)
37737 dfirst.perm[i] = 0xff;
37738 for (i = 0, msk = 0; i < nelt; i++)
37739 {
37740 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
37741 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
37742 return false;
37743 dfirst.perm[j] = d->perm[i];
37744 if (j != i)
37745 msk |= (1 << i);
37746 }
37747 for (i = 0; i < nelt; i++)
37748 if (dfirst.perm[i] == 0xff)
37749 dfirst.perm[i] = i;
37750
37751 if (!d->testing_p)
37752 dfirst.target = gen_reg_rtx (dfirst.vmode);
37753
37754 start_sequence ();
37755 ok = expand_vec_perm_1 (&dfirst);
37756 seq = get_insns ();
37757 end_sequence ();
37758
37759 if (!ok)
37760 return false;
37761
37762 if (d->testing_p)
37763 return true;
37764
37765 emit_insn (seq);
37766
37767 dsecond = *d;
37768 dsecond.op0 = dfirst.target;
37769 dsecond.op1 = dfirst.target;
37770 dsecond.one_operand_p = true;
37771 dsecond.target = gen_reg_rtx (dsecond.vmode);
37772 for (i = 0; i < nelt; i++)
37773 dsecond.perm[i] = i ^ nelt2;
37774
37775 ok = expand_vec_perm_1 (&dsecond);
37776 gcc_assert (ok);
37777
37778 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
37779 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
37780 return true;
37781 }
37782
37783 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
37784 permutation using two vperm2f128, followed by a vshufpd insn blending
37785 the two vectors together. */
37786
37787 static bool
37788 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
37789 {
37790 struct expand_vec_perm_d dfirst, dsecond, dthird;
37791 bool ok;
37792
37793 if (!TARGET_AVX || (d->vmode != V4DFmode))
37794 return false;
37795
37796 if (d->testing_p)
37797 return true;
37798
37799 dfirst = *d;
37800 dsecond = *d;
37801 dthird = *d;
37802
37803 dfirst.perm[0] = (d->perm[0] & ~1);
37804 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
37805 dfirst.perm[2] = (d->perm[2] & ~1);
37806 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
37807 dsecond.perm[0] = (d->perm[1] & ~1);
37808 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
37809 dsecond.perm[2] = (d->perm[3] & ~1);
37810 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
37811 dthird.perm[0] = (d->perm[0] % 2);
37812 dthird.perm[1] = (d->perm[1] % 2) + 4;
37813 dthird.perm[2] = (d->perm[2] % 2) + 2;
37814 dthird.perm[3] = (d->perm[3] % 2) + 6;
37815
37816 dfirst.target = gen_reg_rtx (dfirst.vmode);
37817 dsecond.target = gen_reg_rtx (dsecond.vmode);
37818 dthird.op0 = dfirst.target;
37819 dthird.op1 = dsecond.target;
37820 dthird.one_operand_p = false;
37821
37822 canonicalize_perm (&dfirst);
37823 canonicalize_perm (&dsecond);
37824
37825 ok = expand_vec_perm_1 (&dfirst)
37826 && expand_vec_perm_1 (&dsecond)
37827 && expand_vec_perm_1 (&dthird);
37828
37829 gcc_assert (ok);
37830
37831 return true;
37832 }
37833
37834 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
37835 permutation with two pshufb insns and an ior. We should have already
37836 failed all two instruction sequences. */
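/* For instance, if output byte 0 should be byte 4 of op1, the first mask
   gets -128 in byte 0 (so the pshufb of op0 produces 0 there) and the
   second mask gets 4 in byte 0 (so the pshufb of op1 supplies the byte);
   the final ior merges the two partial results.  */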
37837
37838 static bool
37839 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
37840 {
37841 rtx rperm[2][16], vperm, l, h, op, m128;
37842 unsigned int i, nelt, eltsz;
37843
37844 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37845 return false;
37846 gcc_assert (!d->one_operand_p);
37847
37848 nelt = d->nelt;
37849 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37850
37851 /* Generate two permutation masks. If the required element is within
37852 the given vector it is shuffled into the proper lane. If the required
37853 element is in the other vector, force a zero into the lane by setting
37854 bit 7 in the permutation mask. */
37855 m128 = GEN_INT (-128);
37856 for (i = 0; i < nelt; ++i)
37857 {
37858 unsigned j, e = d->perm[i];
37859 unsigned which = (e >= nelt);
37860 if (e >= nelt)
37861 e -= nelt;
37862
37863 for (j = 0; j < eltsz; ++j)
37864 {
37865 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
37866 rperm[1-which][i*eltsz + j] = m128;
37867 }
37868 }
37869
37870 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
37871 vperm = force_reg (V16QImode, vperm);
37872
37873 l = gen_reg_rtx (V16QImode);
37874 op = gen_lowpart (V16QImode, d->op0);
37875 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
37876
37877 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
37878 vperm = force_reg (V16QImode, vperm);
37879
37880 h = gen_reg_rtx (V16QImode);
37881 op = gen_lowpart (V16QImode, d->op1);
37882 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
37883
37884 op = gen_lowpart (V16QImode, d->target);
37885 emit_insn (gen_iorv16qi3 (op, l, h));
37886
37887 return true;
37888 }
37889
37890 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37891 with two vpshufb insns, vpermq and vpor. We should have already failed
37892 all two or three instruction sequences. */
37893
37894 static bool
37895 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37896 {
37897 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37898 unsigned int i, nelt, eltsz;
37899
37900 if (!TARGET_AVX2
37901 || !d->one_operand_p
37902 || (d->vmode != V32QImode && d->vmode != V16HImode))
37903 return false;
37904
37905 if (d->testing_p)
37906 return true;
37907
37908 nelt = d->nelt;
37909 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37910
37911 /* Generate two permutation masks.  If the required element is within
37912 the same lane, it is shuffled in.  If the required element is from the
37913 other lane, force a zero by setting bit 7 in the permutation mask.
37914 The other mask has a non-negative element wherever an element is
37915 requested from the other lane; such elements are also moved to the
37916 other lane, so that the result of vpshufb can have its two V2TImode
37917 halves swapped. */
37918 m128 = GEN_INT (-128);
37919 for (i = 0; i < nelt; ++i)
37920 {
37921 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37922 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37923
37924 for (j = 0; j < eltsz; ++j)
37925 {
37926 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37927 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37928 }
37929 }
37930
37931 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37932 vperm = force_reg (V32QImode, vperm);
37933
37934 h = gen_reg_rtx (V32QImode);
37935 op = gen_lowpart (V32QImode, d->op0);
37936 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37937
37938 /* Swap the 128-bit lanes of h into hp. */
37939 hp = gen_reg_rtx (V4DImode);
37940 op = gen_lowpart (V4DImode, h);
37941 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37942 const1_rtx));
37943
37944 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37945 vperm = force_reg (V32QImode, vperm);
37946
37947 l = gen_reg_rtx (V32QImode);
37948 op = gen_lowpart (V32QImode, d->op0);
37949 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37950
37951 op = gen_lowpart (V32QImode, d->target);
37952 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37953
37954 return true;
37955 }
37956
37957 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
37958 and extract-odd permutations of two V32QImode or V16HImode operands
37959 with two vpshufb insns, vpor and vpermq. We should have already
37960 failed all two or three instruction sequences. */
37961
37962 static bool
37963 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37964 {
37965 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37966 unsigned int i, nelt, eltsz;
37967
37968 if (!TARGET_AVX2
37969 || d->one_operand_p
37970 || (d->vmode != V32QImode && d->vmode != V16HImode))
37971 return false;
37972
37973 for (i = 0; i < d->nelt; ++i)
37974 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37975 return false;
37976
37977 if (d->testing_p)
37978 return true;
37979
37980 nelt = d->nelt;
37981 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37982
37983 /* Generate two permutation masks. In the first permutation mask
37984 the first quarter will contain indexes for the first half
37985 of op0, the second quarter will contain bit 7 set, the third quarter
37986 will contain indexes for the second half of op0 and the
37987 last quarter bit 7 set.  In the second permutation mask
37988 the first quarter will contain bit 7 set, the second quarter
37989 indexes for the first half of op1, the third quarter bit 7 set
37990 and the last quarter indexes for the second half of op1.
37991 I.e. the first mask e.g. for V32QImode extract even will be:
37992 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37993 (all values masked with 0xf except for -128) and second mask
37994 for extract even will be
37995 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37996 m128 = GEN_INT (-128);
37997 for (i = 0; i < nelt; ++i)
37998 {
37999 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38000 unsigned which = d->perm[i] >= nelt;
38001 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
38002
38003 for (j = 0; j < eltsz; ++j)
38004 {
38005 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
38006 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
38007 }
38008 }
38009
38010 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38011 vperm = force_reg (V32QImode, vperm);
38012
38013 l = gen_reg_rtx (V32QImode);
38014 op = gen_lowpart (V32QImode, d->op0);
38015 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38016
38017 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38018 vperm = force_reg (V32QImode, vperm);
38019
38020 h = gen_reg_rtx (V32QImode);
38021 op = gen_lowpart (V32QImode, d->op1);
38022 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38023
38024 ior = gen_reg_rtx (V32QImode);
38025 emit_insn (gen_iorv32qi3 (ior, l, h));
38026
38027 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
38028 op = gen_lowpart (V4DImode, d->target);
38029 ior = gen_lowpart (V4DImode, ior);
38030 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
38031 const1_rtx, GEN_INT (3)));
38032
38033 return true;
38034 }
38035
38036 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
38037 and extract-odd permutations. */
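/* E.g. for two V4DFmode operands the extract-even permutation is
   { 0, 2, 4, 6 } and the extract-odd permutation is { 1, 3, 5, 7 }.  */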
38038
38039 static bool
38040 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
38041 {
38042 rtx t1, t2, t3;
38043
38044 switch (d->vmode)
38045 {
38046 case V4DFmode:
38047 t1 = gen_reg_rtx (V4DFmode);
38048 t2 = gen_reg_rtx (V4DFmode);
38049
38050 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38051 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
38052 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
38053
38054 /* Now an unpck[lh]pd will produce the result required. */
38055 if (odd)
38056 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
38057 else
38058 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
38059 emit_insn (t3);
38060 break;
38061
38062 case V8SFmode:
38063 {
38064 int mask = odd ? 0xdd : 0x88;
38065
38066 t1 = gen_reg_rtx (V8SFmode);
38067 t2 = gen_reg_rtx (V8SFmode);
38068 t3 = gen_reg_rtx (V8SFmode);
38069
38070 /* Shuffle within the 128-bit lanes to produce:
38071 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
38072 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
38073 GEN_INT (mask)));
38074
38075 /* Shuffle the lanes around to produce:
38076 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
38077 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
38078 GEN_INT (0x3)));
38079
38080 /* Shuffle within the 128-bit lanes to produce:
38081 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
38082 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
38083
38084 /* Shuffle within the 128-bit lanes to produce:
38085 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
38086 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
38087
38088 /* Shuffle the lanes around to produce:
38089 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
38090 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
38091 GEN_INT (0x20)));
38092 }
38093 break;
38094
38095 case V2DFmode:
38096 case V4SFmode:
38097 case V2DImode:
38098 case V4SImode:
38099 /* These are always directly implementable by expand_vec_perm_1. */
38100 gcc_unreachable ();
38101
38102 case V8HImode:
38103 if (TARGET_SSSE3)
38104 return expand_vec_perm_pshufb2 (d);
38105 else
38106 {
38107 /* We need 2*log2(N)-1 operations to achieve odd/even
38108 with interleave. */
38109 t1 = gen_reg_rtx (V8HImode);
38110 t2 = gen_reg_rtx (V8HImode);
38111 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
38112 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
38113 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
38114 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
38115 if (odd)
38116 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
38117 else
38118 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
38119 emit_insn (t3);
38120 }
38121 break;
38122
38123 case V16QImode:
38124 if (TARGET_SSSE3)
38125 return expand_vec_perm_pshufb2 (d);
38126 else
38127 {
38128 t1 = gen_reg_rtx (V16QImode);
38129 t2 = gen_reg_rtx (V16QImode);
38130 t3 = gen_reg_rtx (V16QImode);
38131 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
38132 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
38133 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
38134 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
38135 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
38136 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
38137 if (odd)
38138 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
38139 else
38140 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
38141 emit_insn (t3);
38142 }
38143 break;
38144
38145 case V16HImode:
38146 case V32QImode:
38147 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
38148
38149 case V4DImode:
38150 if (!TARGET_AVX2)
38151 {
38152 struct expand_vec_perm_d d_copy = *d;
38153 d_copy.vmode = V4DFmode;
38154 d_copy.target = gen_lowpart (V4DFmode, d->target);
38155 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
38156 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
38157 return expand_vec_perm_even_odd_1 (&d_copy, odd);
38158 }
38159
38160 t1 = gen_reg_rtx (V4DImode);
38161 t2 = gen_reg_rtx (V4DImode);
38162
38163 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38164 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
38165 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
38166
38167 /* Now a vpunpck[lh]qdq will produce the result required. */
38168 if (odd)
38169 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
38170 else
38171 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
38172 emit_insn (t3);
38173 break;
38174
38175 case V8SImode:
38176 if (!TARGET_AVX2)
38177 {
38178 struct expand_vec_perm_d d_copy = *d;
38179 d_copy.vmode = V8SFmode;
38180 d_copy.target = gen_lowpart (V8SFmode, d->target);
38181 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
38182 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
38183 return expand_vec_perm_even_odd_1 (&d_copy, odd);
38184 }
38185
38186 t1 = gen_reg_rtx (V8SImode);
38187 t2 = gen_reg_rtx (V8SImode);
38188
38189 /* Shuffle the lanes around into
38190 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
38191 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
38192 gen_lowpart (V4DImode, d->op0),
38193 gen_lowpart (V4DImode, d->op1),
38194 GEN_INT (0x20)));
38195 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
38196 gen_lowpart (V4DImode, d->op0),
38197 gen_lowpart (V4DImode, d->op1),
38198 GEN_INT (0x31)));
38199
38200 /* Swap the 2nd and 3rd position in each lane into
38201 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
38202 emit_insn (gen_avx2_pshufdv3 (t1, t1,
38203 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
38204 emit_insn (gen_avx2_pshufdv3 (t2, t2,
38205 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
38206
38207 /* Now a vpunpck[lh]qdq will produce
38208 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
38209 if (odd)
38210 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
38211 gen_lowpart (V4DImode, t1),
38212 gen_lowpart (V4DImode, t2));
38213 else
38214 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
38215 gen_lowpart (V4DImode, t1),
38216 gen_lowpart (V4DImode, t2));
38217 emit_insn (t3);
38218 break;
38219
38220 default:
38221 gcc_unreachable ();
38222 }
38223
38224 return true;
38225 }
38226
38227 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
38228 extract-even and extract-odd permutations. */
38229
38230 static bool
38231 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
38232 {
38233 unsigned i, odd, nelt = d->nelt;
38234
38235 odd = d->perm[0];
38236 if (odd != 0 && odd != 1)
38237 return false;
38238
38239 for (i = 1; i < nelt; ++i)
38240 if (d->perm[i] != 2 * i + odd)
38241 return false;
38242
38243 return expand_vec_perm_even_odd_1 (d, odd);
38244 }
38245
38246 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
38247 permutations. We assume that expand_vec_perm_1 has already failed. */
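/* As an illustration of the V16QImode path below: to broadcast byte 5,
   punpcklbw duplicates it into word 5, punpckhwd then moves that pair
   into dword 1, and the final pshufd replicates dword 1 across the
   whole vector.  */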
38248
38249 static bool
38250 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
38251 {
38252 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
38253 enum machine_mode vmode = d->vmode;
38254 unsigned char perm2[4];
38255 rtx op0 = d->op0;
38256 bool ok;
38257
38258 switch (vmode)
38259 {
38260 case V4DFmode:
38261 case V8SFmode:
38262 /* These are special-cased in sse.md so that we can optionally
38263 use the vbroadcast instruction. They expand to two insns
38264 if the input happens to be in a register. */
38265 gcc_unreachable ();
38266
38267 case V2DFmode:
38268 case V2DImode:
38269 case V4SFmode:
38270 case V4SImode:
38271 /* These are always implementable using standard shuffle patterns. */
38272 gcc_unreachable ();
38273
38274 case V8HImode:
38275 case V16QImode:
38276 /* These can be implemented via interleave. We save one insn by
38277 stopping once we have promoted to V4SImode and then use pshufd. */
38278 do
38279 {
38280 rtx dest;
38281 rtx (*gen) (rtx, rtx, rtx)
38282 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
38283 : gen_vec_interleave_lowv8hi;
38284
38285 if (elt >= nelt2)
38286 {
38287 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
38288 : gen_vec_interleave_highv8hi;
38289 elt -= nelt2;
38290 }
38291 nelt2 /= 2;
38292
38293 dest = gen_reg_rtx (vmode);
38294 emit_insn (gen (dest, op0, op0));
38295 vmode = get_mode_wider_vector (vmode);
38296 op0 = gen_lowpart (vmode, dest);
38297 }
38298 while (vmode != V4SImode);
38299
38300 memset (perm2, elt, 4);
38301 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
38302 d->testing_p);
38303 gcc_assert (ok);
38304 return true;
38305
38306 case V32QImode:
38307 case V16HImode:
38308 case V8SImode:
38309 case V4DImode:
38310 /* For AVX2 broadcasts of the first element vpbroadcast* or
38311 vpermq should be used by expand_vec_perm_1. */
38312 gcc_assert (!TARGET_AVX2 || d->perm[0]);
38313 return false;
38314
38315 default:
38316 gcc_unreachable ();
38317 }
38318 }
38319
38320 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
38321 broadcast permutations. */
38322
38323 static bool
38324 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
38325 {
38326 unsigned i, elt, nelt = d->nelt;
38327
38328 if (!d->one_operand_p)
38329 return false;
38330
38331 elt = d->perm[0];
38332 for (i = 1; i < nelt; ++i)
38333 if (d->perm[i] != elt)
38334 return false;
38335
38336 return expand_vec_perm_broadcast_1 (d);
38337 }
38338
38339 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
38340 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
38341 all the shorter instruction sequences. */
38342
38343 static bool
38344 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
38345 {
38346 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
38347 unsigned int i, nelt, eltsz;
38348 bool used[4];
38349
38350 if (!TARGET_AVX2
38351 || d->one_operand_p
38352 || (d->vmode != V32QImode && d->vmode != V16HImode))
38353 return false;
38354
38355 if (d->testing_p)
38356 return true;
38357
38358 nelt = d->nelt;
38359 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38360
38361 /* Generate 4 permutation masks.  If the required element is within
38362 the same lane, it is shuffled in.  If the required element is from the
38363 other lane, force a zero by setting bit 7 in the permutation mask.
38364 The cross-lane masks have non-negative elements wherever an element is
38365 requested from the other lane; such elements are also moved to the
38366 other lane, so that the result of vpshufb can have its two V2TImode
38367 halves swapped. */
38368 m128 = GEN_INT (-128);
38369 for (i = 0; i < 32; ++i)
38370 {
38371 rperm[0][i] = m128;
38372 rperm[1][i] = m128;
38373 rperm[2][i] = m128;
38374 rperm[3][i] = m128;
38375 }
38376 used[0] = false;
38377 used[1] = false;
38378 used[2] = false;
38379 used[3] = false;
38380 for (i = 0; i < nelt; ++i)
38381 {
38382 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38383 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38384 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
38385
38386 for (j = 0; j < eltsz; ++j)
38387 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
38388 used[which] = true;
38389 }
38390
38391 for (i = 0; i < 2; ++i)
38392 {
38393 if (!used[2 * i + 1])
38394 {
38395 h[i] = NULL_RTX;
38396 continue;
38397 }
38398 vperm = gen_rtx_CONST_VECTOR (V32QImode,
38399 gen_rtvec_v (32, rperm[2 * i + 1]));
38400 vperm = force_reg (V32QImode, vperm);
38401 h[i] = gen_reg_rtx (V32QImode);
38402 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38403 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
38404 }
38405
38406 /* Swap the 128-bit lanes of h[X]. */
38407 for (i = 0; i < 2; ++i)
38408 {
38409 if (h[i] == NULL_RTX)
38410 continue;
38411 op = gen_reg_rtx (V4DImode);
38412 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
38413 const2_rtx, GEN_INT (3), const0_rtx,
38414 const1_rtx));
38415 h[i] = gen_lowpart (V32QImode, op);
38416 }
38417
38418 for (i = 0; i < 2; ++i)
38419 {
38420 if (!used[2 * i])
38421 {
38422 l[i] = NULL_RTX;
38423 continue;
38424 }
38425 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
38426 vperm = force_reg (V32QImode, vperm);
38427 l[i] = gen_reg_rtx (V32QImode);
38428 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
38429 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
38430 }
38431
38432 for (i = 0; i < 2; ++i)
38433 {
38434 if (h[i] && l[i])
38435 {
38436 op = gen_reg_rtx (V32QImode);
38437 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
38438 l[i] = op;
38439 }
38440 else if (h[i])
38441 l[i] = h[i];
38442 }
38443
38444 gcc_assert (l[0] && l[1]);
38445 op = gen_lowpart (V32QImode, d->target);
38446 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
38447 return true;
38448 }
38449
38450 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
38451 With all of the interface bits taken care of, perform the expansion
38452 in D and return true on success. */
38453
38454 static bool
38455 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
38456 {
38457 /* Try a single instruction expansion. */
38458 if (expand_vec_perm_1 (d))
38459 return true;
38460
38461 /* Try sequences of two instructions. */
38462
38463 if (expand_vec_perm_pshuflw_pshufhw (d))
38464 return true;
38465
38466 if (expand_vec_perm_palignr (d))
38467 return true;
38468
38469 if (expand_vec_perm_interleave2 (d))
38470 return true;
38471
38472 if (expand_vec_perm_broadcast (d))
38473 return true;
38474
38475 if (expand_vec_perm_vpermq_perm_1 (d))
38476 return true;
38477
38478 if (expand_vec_perm_vperm2f128 (d))
38479 return true;
38480
38481 /* Try sequences of three instructions. */
38482
38483 if (expand_vec_perm_2vperm2f128_vshuf (d))
38484 return true;
38485
38486 if (expand_vec_perm_pshufb2 (d))
38487 return true;
38488
38489 if (expand_vec_perm_interleave3 (d))
38490 return true;
38491
38492 if (expand_vec_perm_vperm2f128_vblend (d))
38493 return true;
38494
38495 /* Try sequences of four instructions. */
38496
38497 if (expand_vec_perm_vpshufb2_vpermq (d))
38498 return true;
38499
38500 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
38501 return true;
38502
38503 /* ??? Look for narrow permutations whose element orderings would
38504 allow the promotion to a wider mode. */
38505
38506 /* ??? Look for sequences of interleave or a wider permute that place
38507 the data into the correct lanes for a half-vector shuffle like
38508 pshuf[lh]w or vpermilps. */
38509
38510 /* ??? Look for sequences of interleave that produce the desired results.
38511 The combinatorics of punpck[lh] get pretty ugly... */
38512
38513 if (expand_vec_perm_even_odd (d))
38514 return true;
38515
38516 /* Even longer sequences. */
38517 if (expand_vec_perm_vpshufb4_vpermq2 (d))
38518 return true;
38519
38520 return false;
38521 }
38522
38523 /* If a permutation only uses one operand, make it clear. Returns true
38524 if the permutation references both operands. */
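/* For example, a V4SImode selector { 4, 5, 6, 7 } references only the
   second operand; it is rewritten as { 0, 1, 2, 3 } with op0 replaced
   by op1, so later code only needs to match single-operand forms.  */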
38525
38526 static bool
38527 canonicalize_perm (struct expand_vec_perm_d *d)
38528 {
38529 int i, which, nelt = d->nelt;
38530
38531 for (i = which = 0; i < nelt; ++i)
38532 which |= (d->perm[i] < nelt ? 1 : 2);
38533
38534 d->one_operand_p = true;
38535 switch (which)
38536 {
38537 default:
38538 gcc_unreachable();
38539
38540 case 3:
38541 if (!rtx_equal_p (d->op0, d->op1))
38542 {
38543 d->one_operand_p = false;
38544 break;
38545 }
38546 /* The elements of PERM do not suggest that only the first operand
38547 is used, but both operands are identical. Allow easier matching
38548 of the permutation by folding the permutation into the single
38549 input vector. */
38550 /* FALLTHRU */
38551
38552 case 2:
38553 for (i = 0; i < nelt; ++i)
38554 d->perm[i] &= nelt - 1;
38555 d->op0 = d->op1;
38556 break;
38557
38558 case 1:
38559 d->op1 = d->op0;
38560 break;
38561 }
38562
38563 return (which == 3);
38564 }
38565
38566 bool
38567 ix86_expand_vec_perm_const (rtx operands[4])
38568 {
38569 struct expand_vec_perm_d d;
38570 unsigned char perm[MAX_VECT_LEN];
38571 int i, nelt;
38572 bool two_args;
38573 rtx sel;
38574
38575 d.target = operands[0];
38576 d.op0 = operands[1];
38577 d.op1 = operands[2];
38578 sel = operands[3];
38579
38580 d.vmode = GET_MODE (d.target);
38581 gcc_assert (VECTOR_MODE_P (d.vmode));
38582 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38583 d.testing_p = false;
38584
38585 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
38586 gcc_assert (XVECLEN (sel, 0) == nelt);
38587 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
38588
38589 for (i = 0; i < nelt; ++i)
38590 {
38591 rtx e = XVECEXP (sel, 0, i);
38592 int ei = INTVAL (e) & (2 * nelt - 1);
38593 d.perm[i] = ei;
38594 perm[i] = ei;
38595 }
38596
38597 two_args = canonicalize_perm (&d);
38598
38599 if (ix86_expand_vec_perm_const_1 (&d))
38600 return true;
38601
38602 /* If the selector says both arguments are needed, but the operands are the
38603 same, the above tried to expand with one_operand_p and flattened selector.
38604 If that didn't work, retry without one_operand_p; we succeeded with that
38605 during testing. */
38606 if (two_args && d.one_operand_p)
38607 {
38608 d.one_operand_p = false;
38609 memcpy (d.perm, perm, sizeof (perm));
38610 return ix86_expand_vec_perm_const_1 (&d);
38611 }
38612
38613 return false;
38614 }
38615
38616 /* Implement targetm.vectorize.vec_perm_const_ok. */
38617
38618 static bool
38619 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
38620 const unsigned char *sel)
38621 {
38622 struct expand_vec_perm_d d;
38623 unsigned int i, nelt, which;
38624 bool ret;
38625
38626 d.vmode = vmode;
38627 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38628 d.testing_p = true;
38629
38630 /* Given sufficient ISA support we can just return true here
38631 for selected vector modes. */
38632 if (GET_MODE_SIZE (d.vmode) == 16)
38633 {
38634 /* All implementable with a single vpperm insn. */
38635 if (TARGET_XOP)
38636 return true;
38637 /* All implementable with 2 pshufb + 1 ior. */
38638 if (TARGET_SSSE3)
38639 return true;
38640 /* All implementable with shufpd or unpck[lh]pd. */
38641 if (d.nelt == 2)
38642 return true;
38643 }
38644
38645 /* Extract the values from the vector CST into the permutation
38646 array in D. */
38647 memcpy (d.perm, sel, nelt);
38648 for (i = which = 0; i < nelt; ++i)
38649 {
38650 unsigned char e = d.perm[i];
38651 gcc_assert (e < 2 * nelt);
38652 which |= (e < nelt ? 1 : 2);
38653 }
38654
38655 /* For all elements from second vector, fold the elements to first. */
38656 if (which == 2)
38657 for (i = 0; i < nelt; ++i)
38658 d.perm[i] -= nelt;
38659
38660 /* Check whether the mask can be applied to the vector type. */
38661 d.one_operand_p = (which != 3);
38662
38663 /* Implementable with shufps or pshufd. */
38664 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
38665 return true;
38666
38667 /* Otherwise we have to go through the motions and see if we can
38668 figure out how to generate the requested permutation. */
38669 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
38670 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
38671 if (!d.one_operand_p)
38672 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
38673
38674 start_sequence ();
38675 ret = ix86_expand_vec_perm_const_1 (&d);
38676 end_sequence ();
38677
38678 return ret;
38679 }
38680
38681 void
38682 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
38683 {
38684 struct expand_vec_perm_d d;
38685 unsigned i, nelt;
38686
38687 d.target = targ;
38688 d.op0 = op0;
38689 d.op1 = op1;
38690 d.vmode = GET_MODE (targ);
38691 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38692 d.one_operand_p = false;
38693 d.testing_p = false;
38694
38695 for (i = 0; i < nelt; ++i)
38696 d.perm[i] = i * 2 + odd;
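  /* E.g. for V4SImode, ODD = 0 builds the selector { 0, 2, 4, 6 } (the even
     elements of the OP0/OP1 concatenation) and ODD = 1 builds { 1, 3, 5, 7 }.  */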
38697
38698 /* We'll either be able to implement the permutation directly... */
38699 if (expand_vec_perm_1 (&d))
38700 return;
38701
38702 /* ... or we use the special-case patterns. */
38703 expand_vec_perm_even_odd_1 (&d, odd);
38704 }
38705
38706 static void
38707 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
38708 {
38709 struct expand_vec_perm_d d;
38710 unsigned i, nelt, base;
38711 bool ok;
38712
38713 d.target = targ;
38714 d.op0 = op0;
38715 d.op1 = op1;
38716 d.vmode = GET_MODE (targ);
38717 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
38718 d.one_operand_p = false;
38719 d.testing_p = false;
38720
38721 base = high_p ? nelt / 2 : 0;
38722 for (i = 0; i < nelt / 2; ++i)
38723 {
38724 d.perm[i * 2] = i + base;
38725 d.perm[i * 2 + 1] = i + base + nelt;
38726 }
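  /* E.g. for V4SImode this builds { 0, 4, 1, 5 } when HIGH_P is false (the low
     halves of OP0 and OP1 interleaved) and { 2, 6, 3, 7 } when HIGH_P is true.  */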
38727
38728 /* Note that for AVX this isn't one instruction. */
38729 ok = ix86_expand_vec_perm_const_1 (&d);
38730 gcc_assert (ok);
38731 }
38732
38733
38734 /* Expand a vector operation CODE for a V*QImode in terms of the
38735 same operation on V*HImode. */
38736
38737 void
38738 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
38739 {
38740 enum machine_mode qimode = GET_MODE (dest);
38741 enum machine_mode himode;
38742 rtx (*gen_il) (rtx, rtx, rtx);
38743 rtx (*gen_ih) (rtx, rtx, rtx);
38744 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
38745 struct expand_vec_perm_d d;
38746 bool ok, full_interleave;
38747 bool uns_p = false;
38748 int i;
38749
38750 switch (qimode)
38751 {
38752 case V16QImode:
38753 himode = V8HImode;
38754 gen_il = gen_vec_interleave_lowv16qi;
38755 gen_ih = gen_vec_interleave_highv16qi;
38756 break;
38757 case V32QImode:
38758 himode = V16HImode;
38759 gen_il = gen_avx2_interleave_lowv32qi;
38760 gen_ih = gen_avx2_interleave_highv32qi;
38761 break;
38762 default:
38763 gcc_unreachable ();
38764 }
38765
38766 op2_l = op2_h = op2;
38767 switch (code)
38768 {
38769 case MULT:
38770 /* Unpack data such that we've got a source byte in each low byte of
38771 each word. We don't care what goes into the high byte of each word.
38772 Rather than trying to get zero in there, most convenient is to let
38773 it be a copy of the low byte. */
38774 op2_l = gen_reg_rtx (qimode);
38775 op2_h = gen_reg_rtx (qimode);
38776 emit_insn (gen_il (op2_l, op2, op2));
38777 emit_insn (gen_ih (op2_h, op2, op2));
38778 /* FALLTHRU */
38779
38780 op1_l = gen_reg_rtx (qimode);
38781 op1_h = gen_reg_rtx (qimode);
38782 emit_insn (gen_il (op1_l, op1, op1));
38783 emit_insn (gen_ih (op1_h, op1, op1));
38784 full_interleave = qimode == V16QImode;
38785 break;
38786
38787 case ASHIFT:
38788 case LSHIFTRT:
38789 uns_p = true;
38790 /* FALLTHRU */
38791 case ASHIFTRT:
38792 op1_l = gen_reg_rtx (himode);
38793 op1_h = gen_reg_rtx (himode);
38794 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
38795 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
38796 full_interleave = true;
38797 break;
38798 default:
38799 gcc_unreachable ();
38800 }
38801
38802 /* Perform the operation. */
38803 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
38804 1, OPTAB_DIRECT);
38805 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
38806 1, OPTAB_DIRECT);
38807 gcc_assert (res_l && res_h);
38808
38809 /* Merge the data back into the right place. */
38810 d.target = dest;
38811 d.op0 = gen_lowpart (qimode, res_l);
38812 d.op1 = gen_lowpart (qimode, res_h);
38813 d.vmode = qimode;
38814 d.nelt = GET_MODE_NUNITS (qimode);
38815 d.one_operand_p = false;
38816 d.testing_p = false;
38817
38818 if (full_interleave)
38819 {
38820 	      /* For SSE2, we used a full interleave, so the desired
38821 		 results are in the even elements.  */
38822 for (i = 0; i < 32; ++i)
38823 d.perm[i] = i * 2;
38824 }
38825 else
38826 {
38827 /* For AVX, the interleave used above was not cross-lane. So the
38828 extraction is evens but with the second and third quarter swapped.
38829 Happily, that is even one insn shorter than even extraction. */
38830 for (i = 0; i < 32; ++i)
38831 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
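      /* Concretely: quarters 0-3 of the result are taken from the even bytes of
	 res_l's low lane, res_h's low lane, res_l's high lane and res_h's high
	 lane respectively, undoing the per-lane interleave done above.  */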
38832 }
38833
38834 ok = ix86_expand_vec_perm_const_1 (&d);
38835 gcc_assert (ok);
38836
38837 set_unique_reg_note (get_last_insn (), REG_EQUAL,
38838 gen_rtx_fmt_ee (code, qimode, op1, op2));
38839 }
38840
38841 void
38842 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
38843 bool uns_p, bool odd_p)
38844 {
38845 enum machine_mode mode = GET_MODE (op1);
38846 enum machine_mode wmode = GET_MODE (dest);
38847 rtx x;
38848
38849 /* We only play even/odd games with vectors of SImode. */
38850 gcc_assert (mode == V4SImode || mode == V8SImode);
38851
38852 /* If we're looking for the odd results, shift those members down to
38853 the even slots. For some cpus this is faster than a PSHUFD. */
38854 if (odd_p)
38855 {
38856 if (TARGET_XOP && mode == V4SImode)
38857 {
38858 x = force_reg (wmode, CONST0_RTX (wmode));
38859 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
38860 return;
38861 }
38862
38863 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
38864 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
38865 x, NULL, 1, OPTAB_DIRECT);
38866 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
38867 x, NULL, 1, OPTAB_DIRECT);
38868 op1 = gen_lowpart (mode, op1);
38869 op2 = gen_lowpart (mode, op2);
38870 }
38871
38872 if (mode == V8SImode)
38873 {
38874 if (uns_p)
38875 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
38876 else
38877 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
38878 }
38879 else if (uns_p)
38880 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
38881 else if (TARGET_SSE4_1)
38882 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
38883 else
38884 {
38885 rtx s1, s2, t0, t1, t2;
38886
38887 	  /* The easiest way to implement this without PMULDQ is to go through
38888 	     the motions as if we were performing a full 64-bit multiply, except
38889 	     that we need to do less shuffling of the elements.  */
38890
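	  /* Sketch of the identity used below (mod 2^64, for each even element):
	       signed(a) * signed(b) = unsigned(a) * unsigned(b)
				       - 2^32 * (a < 0 ? b : 0)
				       - 2^32 * (b < 0 ? a : 0)
	     The subtractions are realized by multiplying the all-ones compare
	     masks by the other operand, summing, and shifting left by 32.  */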
38891 /* Compute the sign-extension, aka highparts, of the two operands. */
38892 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
38893 op1, pc_rtx, pc_rtx);
38894 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
38895 op2, pc_rtx, pc_rtx);
38896
38897 /* Multiply LO(A) * HI(B), and vice-versa. */
38898 t1 = gen_reg_rtx (wmode);
38899 t2 = gen_reg_rtx (wmode);
38900 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
38901 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
38902
38903 /* Multiply LO(A) * LO(B). */
38904 t0 = gen_reg_rtx (wmode);
38905 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
38906
38907 /* Combine and shift the highparts into place. */
38908 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
38909 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
38910 1, OPTAB_DIRECT);
38911
38912 /* Combine high and low parts. */
38913 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
38914 return;
38915 }
38916 emit_insn (x);
38917 }
38918
38919 void
38920 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
38921 bool uns_p, bool high_p)
38922 {
38923 enum machine_mode wmode = GET_MODE (dest);
38924 enum machine_mode mode = GET_MODE (op1);
38925 rtx t1, t2, t3, t4, mask;
38926
38927 switch (mode)
38928 {
38929 case V4SImode:
38930 t1 = gen_reg_rtx (mode);
38931 t2 = gen_reg_rtx (mode);
38932 if (TARGET_XOP && !uns_p)
38933 {
38934 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
38935 shuffle the elements once so that all elements are in the right
38936 place for immediate use: { A C B D }. */
38937 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
38938 const1_rtx, GEN_INT (3)));
38939 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
38940 const1_rtx, GEN_INT (3)));
38941 }
38942 else
38943 {
38944 /* Put the elements into place for the multiply. */
38945 ix86_expand_vec_interleave (t1, op1, op1, high_p);
38946 ix86_expand_vec_interleave (t2, op2, op2, high_p);
38947 high_p = false;
38948 }
38949 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
38950 break;
38951
38952 case V8SImode:
38953 /* Shuffle the elements between the lanes. After this we
38954 have { A B E F | C D G H } for each operand. */
38955 t1 = gen_reg_rtx (V4DImode);
38956 t2 = gen_reg_rtx (V4DImode);
38957 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
38958 const0_rtx, const2_rtx,
38959 const1_rtx, GEN_INT (3)));
38960 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
38961 const0_rtx, const2_rtx,
38962 const1_rtx, GEN_INT (3)));
38963
38964 /* Shuffle the elements within the lanes. After this we
38965 have { A A B B | C C D D } or { E E F F | G G H H }. */
38966 t3 = gen_reg_rtx (V8SImode);
38967 t4 = gen_reg_rtx (V8SImode);
38968 mask = GEN_INT (high_p
38969 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
38970 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
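      /* Each 2-bit field of the pshufd immediate selects one dword within a
	 lane: HIGH_P picks { 2, 2, 3, 3 }, otherwise { 0, 0, 1, 1 }.  */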
38971 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
38972 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
38973
38974 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
38975 break;
38976
38977 case V8HImode:
38978 case V16HImode:
38979 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
38980 uns_p, OPTAB_DIRECT);
38981 t2 = expand_binop (mode,
38982 uns_p ? umul_highpart_optab : smul_highpart_optab,
38983 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
38984 gcc_assert (t1 && t2);
38985
38986 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
38987 break;
38988
38989 case V16QImode:
38990 case V32QImode:
38991 t1 = gen_reg_rtx (wmode);
38992 t2 = gen_reg_rtx (wmode);
38993 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
38994 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
38995
38996 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
38997 break;
38998
38999 default:
39000 gcc_unreachable ();
39001 }
39002 }
39003
39004 void
39005 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
39006 {
39007 rtx res_1, res_2;
39008
39009 res_1 = gen_reg_rtx (V4SImode);
39010 res_2 = gen_reg_rtx (V4SImode);
39011 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
39012 op1, op2, true, false);
39013 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
39014 op1, op2, true, true);
39015
39016 /* Move the results in element 2 down to element 1; we don't care
39017 what goes in elements 2 and 3. Then we can merge the parts
39018 back together with an interleave.
39019
39020 Note that two other sequences were tried:
39021 (1) Use interleaves at the start instead of psrldq, which allows
39022 us to use a single shufps to merge things back at the end.
39023 (2) Use shufps here to combine the two vectors, then pshufd to
39024 put the elements in the correct order.
39025 In both cases the cost of the reformatting stall was too high
39026 and the overall sequence slower. */
39027
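  /* Viewed as V4SImode, res_1 is { lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2) }
     and res_2 likewise for the odd elements.  The { 0, 2, x, x } pshufd below
     moves the two low words down, and the final interleave yields
     { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }, i.e. the modular products.  */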
39028 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
39029 const0_rtx, const0_rtx));
39030 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
39031 const0_rtx, const0_rtx));
39032 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
39033
39034 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
39035 }
39036
39037 void
39038 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
39039 {
39040 enum machine_mode mode = GET_MODE (op0);
39041 rtx t1, t2, t3, t4, t5, t6;
39042
39043 if (TARGET_XOP && mode == V2DImode)
39044 {
39045 /* op1: A,B,C,D, op2: E,F,G,H */
39046 op1 = gen_lowpart (V4SImode, op1);
39047 op2 = gen_lowpart (V4SImode, op2);
39048
39049 t1 = gen_reg_rtx (V4SImode);
39050 t2 = gen_reg_rtx (V4SImode);
39051 t3 = gen_reg_rtx (V2DImode);
39052 t4 = gen_reg_rtx (V2DImode);
39053
39054 /* t1: B,A,D,C */
39055 emit_insn (gen_sse2_pshufd_1 (t1, op1,
39056 GEN_INT (1),
39057 GEN_INT (0),
39058 GEN_INT (3),
39059 GEN_INT (2)));
39060
39061 /* t2: (B*E),(A*F),(D*G),(C*H) */
39062 emit_insn (gen_mulv4si3 (t2, t1, op2));
39063
39064 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
39065 emit_insn (gen_xop_phadddq (t3, t2));
39066
39067 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
39068 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
39069
39070 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
39071 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
39072 }
39073 else
39074 {
39075 enum machine_mode nmode;
39076 rtx (*umul) (rtx, rtx, rtx);
39077
39078 if (mode == V2DImode)
39079 {
39080 umul = gen_vec_widen_umult_even_v4si;
39081 nmode = V4SImode;
39082 }
39083 else if (mode == V4DImode)
39084 {
39085 umul = gen_vec_widen_umult_even_v8si;
39086 nmode = V8SImode;
39087 }
39088 else
39089 gcc_unreachable ();
39090
39091
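      /* Sketch of the decomposition used below, writing each element as
	 A = aH*2^32 + aL and B = bH*2^32 + bL:
	   A * B mod 2^64 = aL*bL + ((aH*bL + aL*bH) << 32)
	 t1 holds aL*bL, while t4 and t5 hold the two cross products.  */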
39092 /* Multiply low parts. */
39093 t1 = gen_reg_rtx (mode);
39094 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
39095
39096 /* Shift input vectors right 32 bits so we can multiply high parts. */
39097 t6 = GEN_INT (32);
39098 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
39099 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
39100
39101 /* Multiply high parts by low parts. */
39102 t4 = gen_reg_rtx (mode);
39103 t5 = gen_reg_rtx (mode);
39104 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
39105 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
39106
39107 /* Combine and shift the highparts back. */
39108 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
39109 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
39110
39111 /* Combine high and low parts. */
39112 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
39113 }
39114
39115 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39116 gen_rtx_MULT (mode, op1, op2));
39117 }
39118
39119 /* Expand an insert into a vector register through pinsr insn.
39120 Return true if successful. */
39121
39122 bool
39123 ix86_expand_pinsr (rtx *operands)
39124 {
39125 rtx dst = operands[0];
39126 rtx src = operands[3];
39127
39128 unsigned int size = INTVAL (operands[1]);
39129 unsigned int pos = INTVAL (operands[2]);
39130
39131 if (GET_CODE (dst) == SUBREG)
39132 {
39133 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
39134 dst = SUBREG_REG (dst);
39135 }
39136
39137 if (GET_CODE (src) == SUBREG)
39138 src = SUBREG_REG (src);
39139
39140 switch (GET_MODE (dst))
39141 {
39142 case V16QImode:
39143 case V8HImode:
39144 case V4SImode:
39145 case V2DImode:
39146 {
39147 enum machine_mode srcmode, dstmode;
39148 rtx (*pinsr)(rtx, rtx, rtx, rtx);
39149
39150 srcmode = mode_for_size (size, MODE_INT, 0);
39151
39152 switch (srcmode)
39153 {
39154 case QImode:
39155 if (!TARGET_SSE4_1)
39156 return false;
39157 dstmode = V16QImode;
39158 pinsr = gen_sse4_1_pinsrb;
39159 break;
39160
39161 case HImode:
39162 if (!TARGET_SSE2)
39163 return false;
39164 dstmode = V8HImode;
39165 pinsr = gen_sse2_pinsrw;
39166 break;
39167
39168 case SImode:
39169 if (!TARGET_SSE4_1)
39170 return false;
39171 dstmode = V4SImode;
39172 pinsr = gen_sse4_1_pinsrd;
39173 break;
39174
39175 case DImode:
39176 gcc_assert (TARGET_64BIT);
39177 if (!TARGET_SSE4_1)
39178 return false;
39179 dstmode = V2DImode;
39180 pinsr = gen_sse4_1_pinsrq;
39181 break;
39182
39183 default:
39184 return false;
39185 }
39186
39187 dst = gen_lowpart (dstmode, dst);
39188 src = gen_lowpart (srcmode, src);
39189
39190 pos /= size;
39191
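	/* E.g. inserting a 16-bit SRC at bit position 32 of a V8HImode DST gives
	   POS / SIZE = 2, and the (1 << 2) immediate marks element 2 in the
	   vec_merge performed by the pinsr pattern.  */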
39192 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
39193 return true;
39194 }
39195
39196 default:
39197 return false;
39198 }
39199 }
39200 \f
39201 /* This function returns the calling abi specific va_list type node.
39202 It returns the FNDECL specific va_list type. */
39203
39204 static tree
39205 ix86_fn_abi_va_list (tree fndecl)
39206 {
39207 if (!TARGET_64BIT)
39208 return va_list_type_node;
39209 gcc_assert (fndecl != NULL_TREE);
39210
39211 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
39212 return ms_va_list_type_node;
39213 else
39214 return sysv_va_list_type_node;
39215 }
39216
39217 /* Returns the canonical va_list type specified by TYPE. If there
39218 is no valid TYPE provided, it returns NULL_TREE. */
39219
39220 static tree
39221 ix86_canonical_va_list_type (tree type)
39222 {
39223 tree wtype, htype;
39224
39225 /* Resolve references and pointers to va_list type. */
39226 if (TREE_CODE (type) == MEM_REF)
39227 type = TREE_TYPE (type);
39228 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
39229 type = TREE_TYPE (type);
39230 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
39231 type = TREE_TYPE (type);
39232
39233 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
39234 {
39235 wtype = va_list_type_node;
39236 gcc_assert (wtype != NULL_TREE);
39237 htype = type;
39238 if (TREE_CODE (wtype) == ARRAY_TYPE)
39239 {
39240 /* If va_list is an array type, the argument may have decayed
39241 to a pointer type, e.g. by being passed to another function.
39242 In that case, unwrap both types so that we can compare the
39243 underlying records. */
39244 if (TREE_CODE (htype) == ARRAY_TYPE
39245 || POINTER_TYPE_P (htype))
39246 {
39247 wtype = TREE_TYPE (wtype);
39248 htype = TREE_TYPE (htype);
39249 }
39250 }
39251 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
39252 return va_list_type_node;
39253 wtype = sysv_va_list_type_node;
39254 gcc_assert (wtype != NULL_TREE);
39255 htype = type;
39256 if (TREE_CODE (wtype) == ARRAY_TYPE)
39257 {
39258 /* If va_list is an array type, the argument may have decayed
39259 to a pointer type, e.g. by being passed to another function.
39260 In that case, unwrap both types so that we can compare the
39261 underlying records. */
39262 if (TREE_CODE (htype) == ARRAY_TYPE
39263 || POINTER_TYPE_P (htype))
39264 {
39265 wtype = TREE_TYPE (wtype);
39266 htype = TREE_TYPE (htype);
39267 }
39268 }
39269 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
39270 return sysv_va_list_type_node;
39271 wtype = ms_va_list_type_node;
39272 gcc_assert (wtype != NULL_TREE);
39273 htype = type;
39274 if (TREE_CODE (wtype) == ARRAY_TYPE)
39275 {
39276 /* If va_list is an array type, the argument may have decayed
39277 to a pointer type, e.g. by being passed to another function.
39278 In that case, unwrap both types so that we can compare the
39279 underlying records. */
39280 if (TREE_CODE (htype) == ARRAY_TYPE
39281 || POINTER_TYPE_P (htype))
39282 {
39283 wtype = TREE_TYPE (wtype);
39284 htype = TREE_TYPE (htype);
39285 }
39286 }
39287 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
39288 return ms_va_list_type_node;
39289 return NULL_TREE;
39290 }
39291 return std_canonical_va_list_type (type);
39292 }
39293
39294 /* Iterate through the target-specific builtin types for va_list.
39295 IDX denotes the iterator, *PTREE is set to the result type of
39296 the va_list builtin, and *PNAME to its internal type.
39297 Returns zero if there is no element for this index, otherwise
39298 IDX should be increased upon the next call.
39299 Note, do not iterate a base builtin's name like __builtin_va_list.
39300 Used from c_common_nodes_and_builtins. */
39301
39302 static int
39303 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
39304 {
39305 if (TARGET_64BIT)
39306 {
39307 switch (idx)
39308 {
39309 default:
39310 break;
39311
39312 case 0:
39313 *ptree = ms_va_list_type_node;
39314 *pname = "__builtin_ms_va_list";
39315 return 1;
39316
39317 case 1:
39318 *ptree = sysv_va_list_type_node;
39319 *pname = "__builtin_sysv_va_list";
39320 return 1;
39321 }
39322 }
39323
39324 return 0;
39325 }
39326
39327 #undef TARGET_SCHED_DISPATCH
39328 #define TARGET_SCHED_DISPATCH has_dispatch
39329 #undef TARGET_SCHED_DISPATCH_DO
39330 #define TARGET_SCHED_DISPATCH_DO do_dispatch
39331 #undef TARGET_SCHED_REASSOCIATION_WIDTH
39332 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
39333 #undef TARGET_SCHED_REORDER
39334 #define TARGET_SCHED_REORDER ix86_sched_reorder
39335
39336 /* The size of the dispatch window is the total number of bytes of
39337 object code allowed in a window. */
39338 #define DISPATCH_WINDOW_SIZE 16
39339
39340 /* Number of dispatch windows considered for scheduling. */
39341 #define MAX_DISPATCH_WINDOWS 3
39342
39343 /* Maximum number of instructions in a window. */
39344 #define MAX_INSN 4
39345
39346 /* Maximum number of immediate operands in a window. */
39347 #define MAX_IMM 4
39348
39349 /* Maximum number of immediate bits allowed in a window. */
39350 #define MAX_IMM_SIZE 128
39351
39352 /* Maximum number of 32 bit immediates allowed in a window. */
39353 #define MAX_IMM_32 4
39354
39355 /* Maximum number of 64 bit immediates allowed in a window. */
39356 #define MAX_IMM_64 2
39357
39358 /* Maximum total of loads or prefetches allowed in a window. */
39359 #define MAX_LOAD 2
39360
39361 /* Maximum total of stores allowed in a window. */
39362 #define MAX_STORE 1
39363
39364 #undef BIG
39365 #define BIG 100
39366
39367
39368 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
39369 enum dispatch_group {
39370 disp_no_group = 0,
39371 disp_load,
39372 disp_store,
39373 disp_load_store,
39374 disp_prefetch,
39375 disp_imm,
39376 disp_imm_32,
39377 disp_imm_64,
39378 disp_branch,
39379 disp_cmp,
39380 disp_jcc,
39381 disp_last
39382 };
39383
39384 /* Number of allowable groups in a dispatch window. It is an array
39385 indexed by dispatch_group enum. 100 is used as a big number,
39386 because the number of these kinds of operations does not have any
39387 effect in a dispatch window, but we need them for other reasons in
39388 the table. */
39389 static unsigned int num_allowable_groups[disp_last] = {
39390 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
39391 };
39392
39393 char group_name[disp_last + 1][16] = {
39394 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
39395 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
39396 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
39397 };
39398
39399 /* Instruction path. */
39400 enum insn_path {
39401 no_path = 0,
39402 path_single, /* Single micro op. */
39403 path_double, /* Double micro op. */
39404 path_multi, /* Instructions with more than 2 micro ops. */
39405 last_path
39406 };
39407
39408 /* sched_insn_info defines a window to the instructions scheduled in
39409 the basic block. It contains a pointer to the insn_info table and
39410 the instruction scheduled.
39411
39412 Windows are allocated for each basic block and are linked
39413 together. */
39414 typedef struct sched_insn_info_s {
39415 rtx insn;
39416 enum dispatch_group group;
39417 enum insn_path path;
39418 int byte_len;
39419 int imm_bytes;
39420 } sched_insn_info;
39421
39422 /* Linked list of dispatch windows. This is a two way list of
39423 dispatch windows of a basic block. It contains information about
39424 the number of uops in the window and the total number of
39425 instructions and of bytes in the object code for this dispatch
39426 window. */
39427 typedef struct dispatch_windows_s {
39428 int num_insn; /* Number of insn in the window. */
39429 int num_uops; /* Number of uops in the window. */
39430 int window_size; /* Number of bytes in the window. */
39431 int window_num; /* Window number, either 0 or 1. */
39432 int num_imm; /* Number of immediates in an insn. */
39433 int num_imm_32; /* Number of 32 bit immediates in an insn. */
39434 int num_imm_64; /* Number of 64 bit immediates in an insn. */
39435 int imm_size; /* Total immediates in the window. */
39436 int num_loads; /* Total memory loads in the window. */
39437 int num_stores; /* Total memory stores in the window. */
39438 int violation; /* Violation exists in window. */
39439 sched_insn_info *window; /* Pointer to the window. */
39440 struct dispatch_windows_s *next;
39441 struct dispatch_windows_s *prev;
39442 } dispatch_windows;
39443
39444 /* Immediate values used in an insn. */
39445 typedef struct imm_info_s
39446 {
39447 int imm;
39448 int imm32;
39449 int imm64;
39450 } imm_info;
39451
39452 static dispatch_windows *dispatch_window_list;
39453 static dispatch_windows *dispatch_window_list1;
39454
39455 /* Get dispatch group of insn. */
39456
39457 static enum dispatch_group
39458 get_mem_group (rtx insn)
39459 {
39460 enum attr_memory memory;
39461
39462 if (INSN_CODE (insn) < 0)
39463 return disp_no_group;
39464 memory = get_attr_memory (insn);
39465 if (memory == MEMORY_STORE)
39466 return disp_store;
39467
39468 if (memory == MEMORY_LOAD)
39469 return disp_load;
39470
39471 if (memory == MEMORY_BOTH)
39472 return disp_load_store;
39473
39474 return disp_no_group;
39475 }
39476
39477 /* Return true if insn is a compare instruction. */
39478
39479 static bool
39480 is_cmp (rtx insn)
39481 {
39482 enum attr_type type;
39483
39484 type = get_attr_type (insn);
39485 return (type == TYPE_TEST
39486 || type == TYPE_ICMP
39487 || type == TYPE_FCMP
39488 || GET_CODE (PATTERN (insn)) == COMPARE);
39489 }
39490
39491 /* Return true if a dispatch violation was encountered. */
39492
39493 static bool
39494 dispatch_violation (void)
39495 {
39496 if (dispatch_window_list->next)
39497 return dispatch_window_list->next->violation;
39498 return dispatch_window_list->violation;
39499 }
39500
39501 /* Return true if insn is a branch instruction. */
39502
39503 static bool
39504 is_branch (rtx insn)
39505 {
39506 return (CALL_P (insn) || JUMP_P (insn));
39507 }
39508
39509 /* Return true if insn is a prefetch instruction. */
39510
39511 static bool
39512 is_prefetch (rtx insn)
39513 {
39514 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
39515 }
39516
39517 /* This function initializes a dispatch window and the list container holding a
39518 pointer to the window. */
39519
39520 static void
39521 init_window (int window_num)
39522 {
39523 int i;
39524 dispatch_windows *new_list;
39525
39526 if (window_num == 0)
39527 new_list = dispatch_window_list;
39528 else
39529 new_list = dispatch_window_list1;
39530
39531 new_list->num_insn = 0;
39532 new_list->num_uops = 0;
39533 new_list->window_size = 0;
39534 new_list->next = NULL;
39535 new_list->prev = NULL;
39536 new_list->window_num = window_num;
39537 new_list->num_imm = 0;
39538 new_list->num_imm_32 = 0;
39539 new_list->num_imm_64 = 0;
39540 new_list->imm_size = 0;
39541 new_list->num_loads = 0;
39542 new_list->num_stores = 0;
39543 new_list->violation = false;
39544
39545 for (i = 0; i < MAX_INSN; i++)
39546 {
39547 new_list->window[i].insn = NULL;
39548 new_list->window[i].group = disp_no_group;
39549 new_list->window[i].path = no_path;
39550 new_list->window[i].byte_len = 0;
39551 new_list->window[i].imm_bytes = 0;
39552 }
39553 return;
39554 }
39555
39556 /* This function allocates and initializes a dispatch window and the
39557 list container holding a pointer to the window. */
39558
39559 static dispatch_windows *
39560 allocate_window (void)
39561 {
39562 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
39563 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
39564
39565 return new_list;
39566 }
39567
39568 /* This routine initializes the dispatch scheduling information. It
39569 initiates building dispatch scheduler tables and constructs the
39570 first dispatch window. */
39571
39572 static void
39573 init_dispatch_sched (void)
39574 {
39575 /* Allocate a dispatch list and a window. */
39576 dispatch_window_list = allocate_window ();
39577 dispatch_window_list1 = allocate_window ();
39578 init_window (0);
39579 init_window (1);
39580 }
39581
39582 /* This function returns true if a branch is detected. End of a basic block
39583 does not have to be a branch, but here we assume only branches end a
39584 window. */
39585
39586 static bool
39587 is_end_basic_block (enum dispatch_group group)
39588 {
39589 return group == disp_branch;
39590 }
39591
39592 /* This function is called when the end of a window processing is reached. */
39593
39594 static void
39595 process_end_window (void)
39596 {
39597 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
39598 if (dispatch_window_list->next)
39599 {
39600 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
39601 gcc_assert (dispatch_window_list->window_size
39602 + dispatch_window_list1->window_size <= 48);
39603 init_window (1);
39604 }
39605 init_window (0);
39606 }
39607
39608 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
39609 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
39610 for 48 bytes of instructions. Note that these windows are not dispatch
39611 windows whose sizes are DISPATCH_WINDOW_SIZE. */
39612
39613 static dispatch_windows *
39614 allocate_next_window (int window_num)
39615 {
39616 if (window_num == 0)
39617 {
39618 if (dispatch_window_list->next)
39619 init_window (1);
39620 init_window (0);
39621 return dispatch_window_list;
39622 }
39623
39624 dispatch_window_list->next = dispatch_window_list1;
39625 dispatch_window_list1->prev = dispatch_window_list;
39626
39627 return dispatch_window_list1;
39628 }
39629
39630 /* Increment the number of immediate operands of an instruction. */
39631
39632 static int
39633 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
39634 {
39635 if (*in_rtx == 0)
39636 return 0;
39637
39638 switch ( GET_CODE (*in_rtx))
39639 {
39640 case CONST:
39641 case SYMBOL_REF:
39642 case CONST_INT:
39643 (imm_values->imm)++;
39644 if (x86_64_immediate_operand (*in_rtx, SImode))
39645 (imm_values->imm32)++;
39646 else
39647 (imm_values->imm64)++;
39648 break;
39649
39650 case CONST_DOUBLE:
39651 (imm_values->imm)++;
39652 (imm_values->imm64)++;
39653 break;
39654
39655 case CODE_LABEL:
39656 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
39657 {
39658 (imm_values->imm)++;
39659 (imm_values->imm32)++;
39660 }
39661 break;
39662
39663 default:
39664 break;
39665 }
39666
39667 return 0;
39668 }
39669
39670 /* Compute number of immediate operands of an instruction. */
39671
39672 static void
39673 find_constant (rtx in_rtx, imm_info *imm_values)
39674 {
39675 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
39676 (rtx_function) find_constant_1, (void *) imm_values);
39677 }
39678
39679 /* Return total size of immediate operands of an instruction along with number
39680 of corresponding immediate operands. It initializes its parameters to zero
39681 before calling FIND_CONSTANT.
39682 INSN is the input instruction. IMM is the total of immediates.
39683 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
39684 bit immediates. */
39685
39686 static int
39687 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
39688 {
39689 imm_info imm_values = {0, 0, 0};
39690
39691 find_constant (insn, &imm_values);
39692 *imm = imm_values.imm;
39693 *imm32 = imm_values.imm32;
39694 *imm64 = imm_values.imm64;
39695 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
39696 }
39697
39698 /* This function indicates whether an instruction has any immediate
39699 operands. */
39700
39701 static bool
39702 has_immediate (rtx insn)
39703 {
39704 int num_imm_operand;
39705 int num_imm32_operand;
39706 int num_imm64_operand;
39707
39708 if (insn)
39709 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39710 &num_imm64_operand);
39711 return false;
39712 }
39713
39714 /* Return single or double path for instructions. */
39715
39716 static enum insn_path
39717 get_insn_path (rtx insn)
39718 {
39719 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
39720
39721 if ((int)path == 0)
39722 return path_single;
39723
39724 if ((int)path == 1)
39725 return path_double;
39726
39727 return path_multi;
39728 }
39729
39730 /* Return insn dispatch group. */
39731
39732 static enum dispatch_group
39733 get_insn_group (rtx insn)
39734 {
39735 enum dispatch_group group = get_mem_group (insn);
39736 if (group)
39737 return group;
39738
39739 if (is_branch (insn))
39740 return disp_branch;
39741
39742 if (is_cmp (insn))
39743 return disp_cmp;
39744
39745 if (has_immediate (insn))
39746 return disp_imm;
39747
39748 if (is_prefetch (insn))
39749 return disp_prefetch;
39750
39751 return disp_no_group;
39752 }
39753
39754 /* Count number of GROUP restricted instructions in a dispatch
39755 window WINDOW_LIST. */
39756
39757 static int
39758 count_num_restricted (rtx insn, dispatch_windows *window_list)
39759 {
39760 enum dispatch_group group = get_insn_group (insn);
39761 int imm_size;
39762 int num_imm_operand;
39763 int num_imm32_operand;
39764 int num_imm64_operand;
39765
39766 if (group == disp_no_group)
39767 return 0;
39768
39769 if (group == disp_imm)
39770 {
39771 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39772 &num_imm64_operand);
39773 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
39774 || num_imm_operand + window_list->num_imm > MAX_IMM
39775 || (num_imm32_operand > 0
39776 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
39777 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
39778 || (num_imm64_operand > 0
39779 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
39780 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
39781 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
39782 && num_imm64_operand > 0
39783 && ((window_list->num_imm_64 > 0
39784 && window_list->num_insn >= 2)
39785 || window_list->num_insn >= 3)))
39786 return BIG;
39787
39788 return 1;
39789 }
39790
39791 if ((group == disp_load_store
39792 && (window_list->num_loads >= MAX_LOAD
39793 || window_list->num_stores >= MAX_STORE))
39794 || ((group == disp_load
39795 || group == disp_prefetch)
39796 && window_list->num_loads >= MAX_LOAD)
39797 || (group == disp_store
39798 && window_list->num_stores >= MAX_STORE))
39799 return BIG;
39800
39801 return 1;
39802 }
39803
39804 /* This function returns true if insn satisfies dispatch rules on the
39805 last window scheduled. */
39806
39807 static bool
39808 fits_dispatch_window (rtx insn)
39809 {
39810 dispatch_windows *window_list = dispatch_window_list;
39811 dispatch_windows *window_list_next = dispatch_window_list->next;
39812 unsigned int num_restrict;
39813 enum dispatch_group group = get_insn_group (insn);
39814 enum insn_path path = get_insn_path (insn);
39815 int sum;
39816
39817 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
39818 instructions should be given the lowest priority in the
39819 scheduling process in the Haifa scheduler to make sure they will be
39820 scheduled in the same dispatch window as the reference to them. */
39821 if (group == disp_jcc || group == disp_cmp)
39822 return false;
39823
39824 /* Check nonrestricted. */
39825 if (group == disp_no_group || group == disp_branch)
39826 return true;
39827
39828 /* Get last dispatch window. */
39829 if (window_list_next)
39830 window_list = window_list_next;
39831
39832 if (window_list->window_num == 1)
39833 {
39834 sum = window_list->prev->window_size + window_list->window_size;
39835
39836 if (sum == 32
39837 || (min_insn_size (insn) + sum) >= 48)
39838 /* Window 1 is full. Go for next window. */
39839 return true;
39840 }
39841
39842 num_restrict = count_num_restricted (insn, window_list);
39843
39844 if (num_restrict > num_allowable_groups[group])
39845 return false;
39846
39847 /* See if it fits in the first window. */
39848 if (window_list->window_num == 0)
39849 {
39850 /* The first window should have only single and double path
39851 uops. */
39852 if (path == path_double
39853 && (window_list->num_uops + 2) > MAX_INSN)
39854 return false;
39855 else if (path != path_single)
39856 return false;
39857 }
39858 return true;
39859 }
39860
39861 /* Add an instruction INSN with NUM_UOPS micro-operations to the
39862 dispatch window WINDOW_LIST. */
39863
39864 static void
39865 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
39866 {
39867 int byte_len = min_insn_size (insn);
39868 int num_insn = window_list->num_insn;
39869 int imm_size;
39870 sched_insn_info *window = window_list->window;
39871 enum dispatch_group group = get_insn_group (insn);
39872 enum insn_path path = get_insn_path (insn);
39873 int num_imm_operand;
39874 int num_imm32_operand;
39875 int num_imm64_operand;
39876
39877 if (!window_list->violation && group != disp_cmp
39878 && !fits_dispatch_window (insn))
39879 window_list->violation = true;
39880
39881 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
39882 &num_imm64_operand);
39883
39884 /* Initialize window with new instruction. */
39885 window[num_insn].insn = insn;
39886 window[num_insn].byte_len = byte_len;
39887 window[num_insn].group = group;
39888 window[num_insn].path = path;
39889 window[num_insn].imm_bytes = imm_size;
39890
39891 window_list->window_size += byte_len;
39892 window_list->num_insn = num_insn + 1;
39893 window_list->num_uops = window_list->num_uops + num_uops;
39894 window_list->imm_size += imm_size;
39895 window_list->num_imm += num_imm_operand;
39896 window_list->num_imm_32 += num_imm32_operand;
39897 window_list->num_imm_64 += num_imm64_operand;
39898
39899 if (group == disp_store)
39900 window_list->num_stores += 1;
39901 else if (group == disp_load
39902 || group == disp_prefetch)
39903 window_list->num_loads += 1;
39904 else if (group == disp_load_store)
39905 {
39906 window_list->num_stores += 1;
39907 window_list->num_loads += 1;
39908 }
39909 }
39910
39911 /* Adds a scheduled instruction, INSN, to the current dispatch window.
39912 If the total bytes of instructions or the number of instructions in
39913 the window exceed the allowable limits, it allocates a new window.
39914
39915 static void
39916 add_to_dispatch_window (rtx insn)
39917 {
39918 int byte_len;
39919 dispatch_windows *window_list;
39920 dispatch_windows *next_list;
39921 dispatch_windows *window0_list;
39922 enum insn_path path;
39923 enum dispatch_group insn_group;
39924 bool insn_fits;
39925 int num_insn;
39926 int num_uops;
39927 int window_num;
39928 int insn_num_uops;
39929 int sum;
39930
39931 if (INSN_CODE (insn) < 0)
39932 return;
39933
39934 byte_len = min_insn_size (insn);
39935 window_list = dispatch_window_list;
39936 next_list = window_list->next;
39937 path = get_insn_path (insn);
39938 insn_group = get_insn_group (insn);
39939
39940 /* Get the last dispatch window. */
39941 if (next_list)
39942 window_list = dispatch_window_list->next;
39943
39944 if (path == path_single)
39945 insn_num_uops = 1;
39946 else if (path == path_double)
39947 insn_num_uops = 2;
39948 else
39949 insn_num_uops = (int) path;
39950
39951 /* If the current window is full, get a new window.
39952 Window number zero is full if MAX_INSN uops are scheduled in it.
39953 Window number one is full if window zero's bytes plus window
39954 one's bytes equal 32, or if adding the bytes of the new instruction
39955 makes the total greater than 48, or if it already has MAX_INSN
39956 instructions in it. */
39957 num_insn = window_list->num_insn;
39958 num_uops = window_list->num_uops;
39959 window_num = window_list->window_num;
39960 insn_fits = fits_dispatch_window (insn);
39961
39962 if (num_insn >= MAX_INSN
39963 || num_uops + insn_num_uops > MAX_INSN
39964 || !(insn_fits))
39965 {
39966 window_num = ~window_num & 1;
39967 window_list = allocate_next_window (window_num);
39968 }
39969
39970 if (window_num == 0)
39971 {
39972 add_insn_window (insn, window_list, insn_num_uops);
39973 if (window_list->num_insn >= MAX_INSN
39974 && insn_group == disp_branch)
39975 {
39976 process_end_window ();
39977 return;
39978 }
39979 }
39980 else if (window_num == 1)
39981 {
39982 window0_list = window_list->prev;
39983 sum = window0_list->window_size + window_list->window_size;
39984 if (sum == 32
39985 || (byte_len + sum) >= 48)
39986 {
39987 process_end_window ();
39988 window_list = dispatch_window_list;
39989 }
39990
39991 add_insn_window (insn, window_list, insn_num_uops);
39992 }
39993 else
39994 gcc_unreachable ();
39995
39996 if (is_end_basic_block (insn_group))
39997 {
39998 /* End of basic block is reached; do end-basic-block processing. */
39999 process_end_window ();
40000 return;
40001 }
40002 }
40003
40004 /* Print the dispatch window, WINDOW_NUM, to FILE. */
40005
40006 DEBUG_FUNCTION static void
40007 debug_dispatch_window_file (FILE *file, int window_num)
40008 {
40009 dispatch_windows *list;
40010 int i;
40011
40012 if (window_num == 0)
40013 list = dispatch_window_list;
40014 else
40015 list = dispatch_window_list1;
40016
40017 fprintf (file, "Window #%d:\n", list->window_num);
40018 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
40019 list->num_insn, list->num_uops, list->window_size);
40020 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40021 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
40022
40023 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
40024 list->num_stores);
40025 fprintf (file, " insn info:\n");
40026
40027 for (i = 0; i < MAX_INSN; i++)
40028 {
40029 if (!list->window[i].insn)
40030 break;
40031 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
40032 i, group_name[list->window[i].group],
40033 i, (void *)list->window[i].insn,
40034 i, list->window[i].path,
40035 i, list->window[i].byte_len,
40036 i, list->window[i].imm_bytes);
40037 }
40038 }
40039
40040 /* Print to stdout a dispatch window. */
40041
40042 DEBUG_FUNCTION void
40043 debug_dispatch_window (int window_num)
40044 {
40045 debug_dispatch_window_file (stdout, window_num);
40046 }
40047
40048 /* Print INSN dispatch information to FILE. */
40049
40050 DEBUG_FUNCTION static void
40051 debug_insn_dispatch_info_file (FILE *file, rtx insn)
40052 {
40053 int byte_len;
40054 enum insn_path path;
40055 enum dispatch_group group;
40056 int imm_size;
40057 int num_imm_operand;
40058 int num_imm32_operand;
40059 int num_imm64_operand;
40060
40061 if (INSN_CODE (insn) < 0)
40062 return;
40063
40064 byte_len = min_insn_size (insn);
40065 path = get_insn_path (insn);
40066 group = get_insn_group (insn);
40067 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40068 &num_imm64_operand);
40069
40070 fprintf (file, " insn info:\n");
40071 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
40072 group_name[group], path, byte_len);
40073 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40074 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
40075 }
40076
40077 /* Print to STDOUT the status of the ready list with respect to
40078 dispatch windows. */
40079
40080 DEBUG_FUNCTION void
40081 debug_ready_dispatch (void)
40082 {
40083 int i;
40084 int no_ready = number_in_ready ();
40085
40086 fprintf (stdout, "Number of ready: %d\n", no_ready);
40087
40088 for (i = 0; i < no_ready; i++)
40089 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
40090 }
40091
40092 /* This routine is the driver of the dispatch scheduler. */
40093
40094 static void
40095 do_dispatch (rtx insn, int mode)
40096 {
40097 if (mode == DISPATCH_INIT)
40098 init_dispatch_sched ();
40099 else if (mode == ADD_TO_DISPATCH_WINDOW)
40100 add_to_dispatch_window (insn);
40101 }
40102
40103 /* Return TRUE if Dispatch Scheduling is supported. */
40104
40105 static bool
40106 has_dispatch (rtx insn, int action)
40107 {
40108 if ((TARGET_BDVER1 || TARGET_BDVER2)
40109 && flag_dispatch_scheduler)
40110 switch (action)
40111 {
40112 default:
40113 return false;
40114
40115 case IS_DISPATCH_ON:
40116 return true;
40117 break;
40118
40119 case IS_CMP:
40120 return is_cmp (insn);
40121
40122 case DISPATCH_VIOLATION:
40123 return dispatch_violation ();
40124
40125 case FITS_DISPATCH_WINDOW:
40126 return fits_dispatch_window (insn);
40127 }
40128
40129 return false;
40130 }
40131
40132 /* Implementation of the reassociation_width target hook used by
40133 the reassoc phase to identify the parallelism level in a reassociated
40134 tree. The statement's tree_code is passed in OPC. The arguments'
40135 type is passed in MODE.
40136
40137 Currently parallel reassociation is enabled for Atom
40138 processors only and we set reassociation width to be 2
40139 because Atom may issue up to 2 instructions per cycle.
40140
40141 Return value should be fixed if parallel reassociation is
40142 enabled for other processors. */
40143
40144 static int
40145 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
40146 enum machine_mode mode)
40147 {
40148 int res = 1;
40149
40150 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
40151 res = 2;
40152 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
40153 res = 2;
40154
40155 return res;
40156 }
40157
40158 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
40159 place emms and femms instructions. */
40160
40161 static enum machine_mode
40162 ix86_preferred_simd_mode (enum machine_mode mode)
40163 {
40164 if (!TARGET_SSE)
40165 return word_mode;
40166
40167 switch (mode)
40168 {
40169 case QImode:
40170 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
40171 case HImode:
40172 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
40173 case SImode:
40174 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
40175 case DImode:
40176 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
40177
40178 case SFmode:
40179 if (TARGET_AVX && !TARGET_PREFER_AVX128)
40180 return V8SFmode;
40181 else
40182 return V4SFmode;
40183
40184 case DFmode:
40185 if (!TARGET_VECTORIZE_DOUBLE)
40186 return word_mode;
40187 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
40188 return V4DFmode;
40189 else if (TARGET_SSE2)
40190 return V2DFmode;
40191 /* FALLTHRU */
40192
40193 default:
40194 return word_mode;
40195 }
40196 }
40197
40198 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
40199 vectors. */
40200
40201 static unsigned int
40202 ix86_autovectorize_vector_sizes (void)
40203 {
40204 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
40205 }
40206
40207 /* Implement targetm.vectorize.init_cost. */
40208
40209 static void *
40210 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
40211 {
40212 unsigned *cost = XNEWVEC (unsigned, 3);
40213 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
40214 return cost;
40215 }
40216
40217 /* Implement targetm.vectorize.add_stmt_cost. */
40218
40219 static unsigned
40220 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
40221 struct _stmt_vec_info *stmt_info, int misalign,
40222 enum vect_cost_model_location where)
40223 {
40224 unsigned *cost = (unsigned *) data;
40225 unsigned retval = 0;
40226
40227 if (flag_vect_cost_model)
40228 {
40229 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
40230 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
40231
40232 /* Statements in an inner loop relative to the loop being
40233 vectorized are weighted more heavily. The value here is
40234 arbitrary and could potentially be improved with analysis. */
40235 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
40236 count *= 50; /* FIXME. */
40237
40238 retval = (unsigned) (count * stmt_cost);
40239 cost[where] += retval;
40240 }
40241
40242 return retval;
40243 }
40244
40245 /* Implement targetm.vectorize.finish_cost. */
40246
40247 static void
40248 ix86_finish_cost (void *data, unsigned *prologue_cost,
40249 unsigned *body_cost, unsigned *epilogue_cost)
40250 {
40251 unsigned *cost = (unsigned *) data;
40252 *prologue_cost = cost[vect_prologue];
40253 *body_cost = cost[vect_body];
40254 *epilogue_cost = cost[vect_epilogue];
40255 }
40256
40257 /* Implement targetm.vectorize.destroy_cost_data. */
40258
40259 static void
40260 ix86_destroy_cost_data (void *data)
40261 {
40262 free (data);
40263 }
40264
40265 /* Validate target specific memory model bits in VAL. */
40266
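/* From the user's side the HLE bits are ORed into the atomic memory model,
   e.g. (illustrative only):

     __atomic_fetch_add (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);

   The check below warns about combinations such as HLE_ACQUIRE with a model
   weaker than ACQUIRE and falls back to SEQ_CST in that case.  */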
40267 static unsigned HOST_WIDE_INT
40268 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
40269 {
40270 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
40271 unsigned HOST_WIDE_INT strong;
40272
40273 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
40274 |MEMMODEL_MASK)
40275 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
40276 {
40277 warning (OPT_Winvalid_memory_model,
40278 "Unknown architecture specific memory model");
40279 return MEMMODEL_SEQ_CST;
40280 }
40281 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
40282 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
40283 {
40284 warning (OPT_Winvalid_memory_model,
40285 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
40286 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
40287 }
40288 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
40289 {
40290 warning (OPT_Winvalid_memory_model,
40291 "HLE_RELEASE not used with RELEASE or stronger memory model");
40292 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
40293 }
40294 return val;
40295 }
40296
40297 /* Initialize the GCC target structure. */
40298 #undef TARGET_RETURN_IN_MEMORY
40299 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
40300
40301 #undef TARGET_LEGITIMIZE_ADDRESS
40302 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
40303
40304 #undef TARGET_ATTRIBUTE_TABLE
40305 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
40306 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
40307 # undef TARGET_MERGE_DECL_ATTRIBUTES
40308 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
40309 #endif
40310
40311 #undef TARGET_COMP_TYPE_ATTRIBUTES
40312 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
40313
40314 #undef TARGET_INIT_BUILTINS
40315 #define TARGET_INIT_BUILTINS ix86_init_builtins
40316 #undef TARGET_BUILTIN_DECL
40317 #define TARGET_BUILTIN_DECL ix86_builtin_decl
40318 #undef TARGET_EXPAND_BUILTIN
40319 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
40320
40321 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
40322 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
40323 ix86_builtin_vectorized_function
40324
40325 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
40326 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
40327
40328 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
40329 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
40330
40331 #undef TARGET_VECTORIZE_BUILTIN_GATHER
40332 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
40333
40334 #undef TARGET_BUILTIN_RECIPROCAL
40335 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
40336
40337 #undef TARGET_ASM_FUNCTION_EPILOGUE
40338 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
40339
40340 #undef TARGET_ENCODE_SECTION_INFO
40341 #ifndef SUBTARGET_ENCODE_SECTION_INFO
40342 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
40343 #else
40344 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
40345 #endif
40346
40347 #undef TARGET_ASM_OPEN_PAREN
40348 #define TARGET_ASM_OPEN_PAREN ""
40349 #undef TARGET_ASM_CLOSE_PAREN
40350 #define TARGET_ASM_CLOSE_PAREN ""
40351
40352 #undef TARGET_ASM_BYTE_OP
40353 #define TARGET_ASM_BYTE_OP ASM_BYTE
40354
40355 #undef TARGET_ASM_ALIGNED_HI_OP
40356 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
40357 #undef TARGET_ASM_ALIGNED_SI_OP
40358 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
40359 #ifdef ASM_QUAD
40360 #undef TARGET_ASM_ALIGNED_DI_OP
40361 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
40362 #endif
40363
40364 #undef TARGET_PROFILE_BEFORE_PROLOGUE
40365 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
40366
40367 #undef TARGET_ASM_UNALIGNED_HI_OP
40368 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
40369 #undef TARGET_ASM_UNALIGNED_SI_OP
40370 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
40371 #undef TARGET_ASM_UNALIGNED_DI_OP
40372 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
40373
40374 #undef TARGET_PRINT_OPERAND
40375 #define TARGET_PRINT_OPERAND ix86_print_operand
40376 #undef TARGET_PRINT_OPERAND_ADDRESS
40377 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
40378 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
40379 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
40380 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
40381 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
40382
40383 #undef TARGET_SCHED_INIT_GLOBAL
40384 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
40385 #undef TARGET_SCHED_ADJUST_COST
40386 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
40387 #undef TARGET_SCHED_ISSUE_RATE
40388 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
40389 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
40390 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
40391 ia32_multipass_dfa_lookahead
40392
40393 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
40394 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
40395
40396 #undef TARGET_MEMMODEL_CHECK
40397 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
40398
40399 #ifdef HAVE_AS_TLS
40400 #undef TARGET_HAVE_TLS
40401 #define TARGET_HAVE_TLS true
40402 #endif
40403 #undef TARGET_CANNOT_FORCE_CONST_MEM
40404 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
40405 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
40406 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
40407
40408 #undef TARGET_DELEGITIMIZE_ADDRESS
40409 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
40410
40411 #undef TARGET_MS_BITFIELD_LAYOUT_P
40412 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
40413
40414 #if TARGET_MACHO
40415 #undef TARGET_BINDS_LOCAL_P
40416 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
40417 #endif
40418 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
40419 #undef TARGET_BINDS_LOCAL_P
40420 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
40421 #endif
40422
40423 #undef TARGET_ASM_OUTPUT_MI_THUNK
40424 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
40425 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
40426 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
40427
40428 #undef TARGET_ASM_FILE_START
40429 #define TARGET_ASM_FILE_START x86_file_start
40430
40431 #undef TARGET_OPTION_OVERRIDE
40432 #define TARGET_OPTION_OVERRIDE ix86_option_override
40433
40434 #undef TARGET_REGISTER_MOVE_COST
40435 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
40436 #undef TARGET_MEMORY_MOVE_COST
40437 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
40438 #undef TARGET_RTX_COSTS
40439 #define TARGET_RTX_COSTS ix86_rtx_costs
40440 #undef TARGET_ADDRESS_COST
40441 #define TARGET_ADDRESS_COST ix86_address_cost
40442
40443 #undef TARGET_FIXED_CONDITION_CODE_REGS
40444 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
40445 #undef TARGET_CC_MODES_COMPATIBLE
40446 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
40447
40448 #undef TARGET_MACHINE_DEPENDENT_REORG
40449 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
40450
40451 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
40452 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
40453
40454 #undef TARGET_BUILD_BUILTIN_VA_LIST
40455 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
40456
40457 #undef TARGET_FOLD_BUILTIN
40458 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
40459
40460 #undef TARGET_ENUM_VA_LIST_P
40461 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
40462
40463 #undef TARGET_FN_ABI_VA_LIST
40464 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
40465
40466 #undef TARGET_CANONICAL_VA_LIST_TYPE
40467 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
40468
40469 #undef TARGET_EXPAND_BUILTIN_VA_START
40470 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
40471
40472 #undef TARGET_MD_ASM_CLOBBERS
40473 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
40474
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

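/* Extra scalar and vector modes the back end can handle, and the mode
   used for floating constants with a non-standard suffix.  */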
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

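/* Where function return values live and how narrow return values are
   promoted.  */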
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

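/* Register-class queries used by reload and the register allocator.  */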
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

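/* Vectorizer capability and cost-model hooks.  */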
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

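/* Support for __attribute__ ((target ("..."))): validating the attribute,
   saving, restoring and printing the per-function option state, and
   deciding whether a callee with different options may be inlined.  */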
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

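/* Validity checks for addresses and constants during RTL generation.  */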
#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

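/* The target vector itself.  TARGET_INITIALIZER is provided by
   target-def.h and expands to an aggregate initializer whose fields are
   filled in from the TARGET_* macros defined above, falling back to the
   generic default hooks where no override was given.  Schematically
   (for illustration only; the real expansion names every hook):

     struct gcc_target targetm = { ..., ix86_option_override, ... };  */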
struct gcc_target targetm = TARGET_INITIALIZER;
\f
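/* Garbage-collector roots and marking routines generated by gengtype for
   the GTY-annotated data in this file.  */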
#include "gt-i386.h"