1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65
66 enum upper_128bits_state
67 {
68 unknown = 0,
69 unused,
70 used
71 };
72
73 typedef struct block_info_def
74 {
75 /* State of the upper 128bits of AVX registers at exit. */
76 enum upper_128bits_state state;
77 /* TRUE if state of the upper 128bits of AVX registers is unchanged
78 in this block. */
79 bool unchanged;
80 /* TRUE if block has been processed. */
81 bool processed;
82 /* TRUE if block has been scanned. */
83 bool scanned;
84 /* Previous state of the upper 128bits of AVX registers at entry. */
85 enum upper_128bits_state prev;
86 } *block_info;
87
88 #define BLOCK_INFO(B) ((block_info) (B)->aux)
89
90 enum call_avx256_state
91 {
92 /* Callee returns 256bit AVX register. */
93 callee_return_avx256 = -1,
94 /* Callee returns and passes 256bit AVX register. */
95 callee_return_pass_avx256,
96 /* Callee passes 256bit AVX register. */
97 callee_pass_avx256,
98      /* Callee doesn't return nor pass a 256bit AVX register, or no
99         256bit AVX register in function return.  */
100 call_no_avx256,
101 /* vzeroupper intrinsic. */
102 vzeroupper_intrinsic
103 };
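/* The chosen call_avx256_state value is carried as operand 0 of the
   vzeroupper UNSPEC_VOLATILE pattern; move_or_delete_vzeroupper_2 below
   reads it back with INTVAL (XVECEXP (pat, 0, 0)) to decide whether the
   vzeroupper can be deleted or must be kept.  */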
104
105 /* Check if a 256bit AVX register is referenced in stores. */
106
107 static void
108 check_avx256_stores (rtx dest, const_rtx set, void *data)
109 {
110 if ((REG_P (dest)
111 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
112 || (GET_CODE (set) == SET
113 && REG_P (SET_SRC (set))
114 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
115 {
116 enum upper_128bits_state *state
117 = (enum upper_128bits_state *) data;
118 *state = used;
119 }
120 }
121
122 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
123 in basic block BB. Delete it if upper 128bit AVX registers are
124    unused.  If it isn't deleted, move it to just before a jump or call insn.
125
126 STATE is state of the upper 128bits of AVX registers at entry. */
127
128 static void
129 move_or_delete_vzeroupper_2 (basic_block bb,
130 enum upper_128bits_state state)
131 {
132 rtx insn, bb_end;
133 rtx vzeroupper_insn = NULL_RTX;
134 rtx pat;
135 int avx256;
136 bool unchanged;
137
138 if (BLOCK_INFO (bb)->unchanged)
139 {
140 if (dump_file)
141 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
142 bb->index, state);
143
144 BLOCK_INFO (bb)->state = state;
145 return;
146 }
147
148 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
149 {
150 if (dump_file)
151 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
152 bb->index, BLOCK_INFO (bb)->state);
153 return;
154 }
155
156 BLOCK_INFO (bb)->prev = state;
157
158 if (dump_file)
159 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
160 bb->index, state);
161
162 unchanged = true;
163
164 /* BB_END changes when it is deleted. */
165 bb_end = BB_END (bb);
166 insn = BB_HEAD (bb);
167 while (insn != bb_end)
168 {
169 insn = NEXT_INSN (insn);
170
171 if (!NONDEBUG_INSN_P (insn))
172 continue;
173
174 /* Move vzeroupper before jump/call. */
175 if (JUMP_P (insn) || CALL_P (insn))
176 {
177 if (!vzeroupper_insn)
178 continue;
179
180 if (PREV_INSN (insn) != vzeroupper_insn)
181 {
182 if (dump_file)
183 {
184 fprintf (dump_file, "Move vzeroupper after:\n");
185 print_rtl_single (dump_file, PREV_INSN (insn));
186 fprintf (dump_file, "before:\n");
187 print_rtl_single (dump_file, insn);
188 }
189 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
190 PREV_INSN (insn));
191 }
192 vzeroupper_insn = NULL_RTX;
193 continue;
194 }
195
196 pat = PATTERN (insn);
197
198 /* Check insn for vzeroupper intrinsic. */
199 if (GET_CODE (pat) == UNSPEC_VOLATILE
200 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
201 {
202 if (dump_file)
203 {
204 /* Found vzeroupper intrinsic. */
205 fprintf (dump_file, "Found vzeroupper:\n");
206 print_rtl_single (dump_file, insn);
207 }
208 }
209 else
210 {
211 /* Check insn for vzeroall intrinsic. */
212 if (GET_CODE (pat) == PARALLEL
213 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
214 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
215 {
216 state = unused;
217 unchanged = false;
218
219 /* Delete pending vzeroupper insertion. */
220 if (vzeroupper_insn)
221 {
222 delete_insn (vzeroupper_insn);
223 vzeroupper_insn = NULL_RTX;
224 }
225 }
226 else if (state != used)
227 {
228 note_stores (pat, check_avx256_stores, &state);
229 if (state == used)
230 unchanged = false;
231 }
232 continue;
233 }
234
235 /* Process vzeroupper intrinsic. */
236 avx256 = INTVAL (XVECEXP (pat, 0, 0));
237
238 if (state == unused)
239 {
240          /* Since the upper 128bits are cleared, the callee must not pass a
241             256bit AVX register.  We only need to check whether the callee
242             returns a 256bit AVX register.  */
243 if (avx256 == callee_return_avx256)
244 {
245 state = used;
246 unchanged = false;
247 }
248
249 /* Remove unnecessary vzeroupper since upper 128bits are
250 cleared. */
251 if (dump_file)
252 {
253 fprintf (dump_file, "Delete redundant vzeroupper:\n");
254 print_rtl_single (dump_file, insn);
255 }
256 delete_insn (insn);
257 }
258 else
259 {
260 /* Set state to UNUSED if callee doesn't return 256bit AVX
261 register. */
262 if (avx256 != callee_return_pass_avx256)
263 state = unused;
264
265 if (avx256 == callee_return_pass_avx256
266 || avx256 == callee_pass_avx256)
267 {
268              /* Must remove vzeroupper since the callee passes a 256bit
269                 AVX register.  */
270 if (dump_file)
271 {
272 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
273 print_rtl_single (dump_file, insn);
274 }
275 delete_insn (insn);
276 }
277 else
278 {
279 vzeroupper_insn = insn;
280 unchanged = false;
281 }
282 }
283 }
284
285 BLOCK_INFO (bb)->state = state;
286 BLOCK_INFO (bb)->unchanged = unchanged;
287 BLOCK_INFO (bb)->scanned = true;
288
289 if (dump_file)
290 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
291 bb->index, unchanged ? "unchanged" : "changed",
292 state);
293 }
294
295 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
296 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
297    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
298 state is changed. */
299
300 static bool
301 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
302 {
303 edge e;
304 edge_iterator ei;
305 enum upper_128bits_state state, old_state, new_state;
306 bool seen_unknown;
307
308 if (dump_file)
309 fprintf (dump_file, " Process [bb %i]: status: %d\n",
310 block->index, BLOCK_INFO (block)->processed);
311
312 if (BLOCK_INFO (block)->processed)
313 return false;
314
315 state = unused;
316
317 /* Check all predecessor edges of this block. */
318 seen_unknown = false;
319 FOR_EACH_EDGE (e, ei, block->preds)
320 {
321 if (e->src == block)
322 continue;
323 switch (BLOCK_INFO (e->src)->state)
324 {
325 case unknown:
326 if (!unknown_is_unused)
327 seen_unknown = true;
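	    /* FALLTHRU */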
328 case unused:
329 break;
330 case used:
331 state = used;
332 goto done;
333 }
334 }
335
336 if (seen_unknown)
337 state = unknown;
338
339 done:
340 old_state = BLOCK_INFO (block)->state;
341 move_or_delete_vzeroupper_2 (block, state);
342 new_state = BLOCK_INFO (block)->state;
343
344 if (state != unknown || new_state == used)
345 BLOCK_INFO (block)->processed = true;
346
347 /* Need to rescan if the upper 128bits of AVX registers are changed
348 to USED at exit. */
349 if (new_state != old_state)
350 {
351 if (new_state == used)
352 cfun->machine->rescan_vzeroupper_p = 1;
353 return true;
354 }
355 else
356 return false;
357 }
358
359 /* Go through the instruction stream looking for vzeroupper. Delete
360 it if upper 128bit AVX registers are unused. If it isn't deleted,
361    move it to just before a jump or call insn.  */
362
363 static void
364 move_or_delete_vzeroupper (void)
365 {
366 edge e;
367 edge_iterator ei;
368 basic_block bb;
369 fibheap_t worklist, pending, fibheap_swap;
370 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
371 int *bb_order;
372 int *rc_order;
373 int i;
374
375 /* Set up block info for each basic block. */
376 alloc_aux_for_blocks (sizeof (struct block_info_def));
377
378 /* Process outgoing edges of entry point. */
379 if (dump_file)
380 fprintf (dump_file, "Process outgoing edges of entry point\n");
381
382 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
383 {
384 move_or_delete_vzeroupper_2 (e->dest,
385 cfun->machine->caller_pass_avx256_p
386 ? used : unused);
387 BLOCK_INFO (e->dest)->processed = true;
388 }
389
390 /* Compute reverse completion order of depth first search of the CFG
391 so that the data-flow runs faster. */
392 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
393 bb_order = XNEWVEC (int, last_basic_block);
394 pre_and_rev_post_order_compute (NULL, rc_order, false);
395 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
396 bb_order[rc_order[i]] = i;
397 free (rc_order);
398
399 worklist = fibheap_new ();
400 pending = fibheap_new ();
401 visited = sbitmap_alloc (last_basic_block);
402 in_worklist = sbitmap_alloc (last_basic_block);
403 in_pending = sbitmap_alloc (last_basic_block);
404 sbitmap_zero (in_worklist);
405
406 /* Don't check outgoing edges of entry point. */
407 sbitmap_ones (in_pending);
408 FOR_EACH_BB (bb)
409 if (BLOCK_INFO (bb)->processed)
410 RESET_BIT (in_pending, bb->index);
411 else
412 {
413 move_or_delete_vzeroupper_1 (bb, false);
414 fibheap_insert (pending, bb_order[bb->index], bb);
415 }
416
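  /* Iterative data-flow over the CFG: WORKLIST holds the blocks of the
     current round, PENDING the blocks queued for the next one.  Iterate
     until PENDING is empty or a round completes without any block's exit
     state changing to USED, i.e. rescan_vzeroupper_p stays clear.  */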
417 if (dump_file)
418 fprintf (dump_file, "Check remaining basic blocks\n");
419
420 while (!fibheap_empty (pending))
421 {
422 fibheap_swap = pending;
423 pending = worklist;
424 worklist = fibheap_swap;
425 sbitmap_swap = in_pending;
426 in_pending = in_worklist;
427 in_worklist = sbitmap_swap;
428
429 sbitmap_zero (visited);
430
431 cfun->machine->rescan_vzeroupper_p = 0;
432
433 while (!fibheap_empty (worklist))
434 {
435 bb = (basic_block) fibheap_extract_min (worklist);
436 RESET_BIT (in_worklist, bb->index);
437 gcc_assert (!TEST_BIT (visited, bb->index));
438 if (!TEST_BIT (visited, bb->index))
439 {
440 edge_iterator ei;
441
442 SET_BIT (visited, bb->index);
443
444 if (move_or_delete_vzeroupper_1 (bb, false))
445 FOR_EACH_EDGE (e, ei, bb->succs)
446 {
447 if (e->dest == EXIT_BLOCK_PTR
448 || BLOCK_INFO (e->dest)->processed)
449 continue;
450
451 if (TEST_BIT (visited, e->dest->index))
452 {
453 if (!TEST_BIT (in_pending, e->dest->index))
454 {
455 /* Send E->DEST to next round. */
456 SET_BIT (in_pending, e->dest->index);
457 fibheap_insert (pending,
458 bb_order[e->dest->index],
459 e->dest);
460 }
461 }
462 else if (!TEST_BIT (in_worklist, e->dest->index))
463 {
464 /* Add E->DEST to current round. */
465 SET_BIT (in_worklist, e->dest->index);
466 fibheap_insert (worklist, bb_order[e->dest->index],
467 e->dest);
468 }
469 }
470 }
471 }
472
473 if (!cfun->machine->rescan_vzeroupper_p)
474 break;
475 }
476
477 free (bb_order);
478 fibheap_delete (worklist);
479 fibheap_delete (pending);
480 sbitmap_free (visited);
481 sbitmap_free (in_worklist);
482 sbitmap_free (in_pending);
483
484 if (dump_file)
485 fprintf (dump_file, "Process remaining basic blocks\n");
486
487 FOR_EACH_BB (bb)
488 move_or_delete_vzeroupper_1 (bb, true);
489
490 free_aux_for_blocks ();
491 }
492
493 static rtx legitimize_dllimport_symbol (rtx, bool);
494
495 #ifndef CHECK_STACK_LIMIT
496 #define CHECK_STACK_LIMIT (-1)
497 #endif
498
499 /* Return index of given mode in mult and division cost tables. */
500 #define MODE_INDEX(mode) \
501 ((mode) == QImode ? 0 \
502 : (mode) == HImode ? 1 \
503 : (mode) == SImode ? 2 \
504 : (mode) == DImode ? 3 \
505 : 4)
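/* For example, MODE_INDEX (SImode) is 2, selecting the SImode entry of the
   mult and divide cost arrays in the tables below.  */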
506
507 /* Processor costs (relative to an add) */
508 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
509 #define COSTS_N_BYTES(N) ((N) * 2)
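/* Given COSTS_N_INSNS (N) == (N) * 4, COSTS_N_BYTES (2) equals
   COSTS_N_INSNS (1), so a two-byte addition is costed like one average
   instruction when tuning for size.  */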
510
511 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
512
513 const
514 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
515 COSTS_N_BYTES (2), /* cost of an add instruction */
516 COSTS_N_BYTES (3), /* cost of a lea instruction */
517 COSTS_N_BYTES (2), /* variable shift costs */
518 COSTS_N_BYTES (3), /* constant shift costs */
519 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
520 COSTS_N_BYTES (3), /* HI */
521 COSTS_N_BYTES (3), /* SI */
522 COSTS_N_BYTES (3), /* DI */
523 COSTS_N_BYTES (5)}, /* other */
524 0, /* cost of multiply per each bit set */
525 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
526 COSTS_N_BYTES (3), /* HI */
527 COSTS_N_BYTES (3), /* SI */
528 COSTS_N_BYTES (3), /* DI */
529 COSTS_N_BYTES (5)}, /* other */
530 COSTS_N_BYTES (3), /* cost of movsx */
531 COSTS_N_BYTES (3), /* cost of movzx */
532 0, /* "large" insn */
533 2, /* MOVE_RATIO */
534 2, /* cost for loading QImode using movzbl */
535 {2, 2, 2}, /* cost of loading integer registers
536 in QImode, HImode and SImode.
537 Relative to reg-reg move (2). */
538 {2, 2, 2}, /* cost of storing integer registers */
539 2, /* cost of reg,reg fld/fst */
540 {2, 2, 2}, /* cost of loading fp registers
541 in SFmode, DFmode and XFmode */
542 {2, 2, 2}, /* cost of storing fp registers
543 in SFmode, DFmode and XFmode */
544 3, /* cost of moving MMX register */
545 {3, 3}, /* cost of loading MMX registers
546 in SImode and DImode */
547 {3, 3}, /* cost of storing MMX registers
548 in SImode and DImode */
549 3, /* cost of moving SSE register */
550 {3, 3, 3}, /* cost of loading SSE registers
551 in SImode, DImode and TImode */
552 {3, 3, 3}, /* cost of storing SSE registers
553 in SImode, DImode and TImode */
554 3, /* MMX or SSE register to integer */
555 0, /* size of l1 cache */
556 0, /* size of l2 cache */
557 0, /* size of prefetch block */
558 0, /* number of parallel prefetches */
559 2, /* Branch cost */
560 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
561 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
562 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
563 COSTS_N_BYTES (2), /* cost of FABS instruction. */
564 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
565 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
569 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
570 1, /* scalar_stmt_cost. */
571 1, /* scalar load_cost. */
572 1, /* scalar_store_cost. */
573 1, /* vec_stmt_cost. */
574 1, /* vec_to_scalar_cost. */
575 1, /* scalar_to_vec_cost. */
576 1, /* vec_align_load_cost. */
577 1, /* vec_unalign_load_cost. */
578 1, /* vec_store_cost. */
579 1, /* cond_taken_branch_cost. */
580 1, /* cond_not_taken_branch_cost. */
581 };
582
583 /* Processor costs (relative to an add) */
584 static const
585 struct processor_costs i386_cost = { /* 386 specific costs */
586 COSTS_N_INSNS (1), /* cost of an add instruction */
587 COSTS_N_INSNS (1), /* cost of a lea instruction */
588 COSTS_N_INSNS (3), /* variable shift costs */
589 COSTS_N_INSNS (2), /* constant shift costs */
590 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
591 COSTS_N_INSNS (6), /* HI */
592 COSTS_N_INSNS (6), /* SI */
593 COSTS_N_INSNS (6), /* DI */
594 COSTS_N_INSNS (6)}, /* other */
595 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
596 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
597 COSTS_N_INSNS (23), /* HI */
598 COSTS_N_INSNS (23), /* SI */
599 COSTS_N_INSNS (23), /* DI */
600 COSTS_N_INSNS (23)}, /* other */
601 COSTS_N_INSNS (3), /* cost of movsx */
602 COSTS_N_INSNS (2), /* cost of movzx */
603 15, /* "large" insn */
604 3, /* MOVE_RATIO */
605 4, /* cost for loading QImode using movzbl */
606 {2, 4, 2}, /* cost of loading integer registers
607 in QImode, HImode and SImode.
608 Relative to reg-reg move (2). */
609 {2, 4, 2}, /* cost of storing integer registers */
610 2, /* cost of reg,reg fld/fst */
611 {8, 8, 8}, /* cost of loading fp registers
612 in SFmode, DFmode and XFmode */
613 {8, 8, 8}, /* cost of storing fp registers
614 in SFmode, DFmode and XFmode */
615 2, /* cost of moving MMX register */
616 {4, 8}, /* cost of loading MMX registers
617 in SImode and DImode */
618 {4, 8}, /* cost of storing MMX registers
619 in SImode and DImode */
620 2, /* cost of moving SSE register */
621 {4, 8, 16}, /* cost of loading SSE registers
622 in SImode, DImode and TImode */
623 {4, 8, 16}, /* cost of storing SSE registers
624 in SImode, DImode and TImode */
625 3, /* MMX or SSE register to integer */
626 0, /* size of l1 cache */
627 0, /* size of l2 cache */
628 0, /* size of prefetch block */
629 0, /* number of parallel prefetches */
630 1, /* Branch cost */
631 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
632 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
633 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
634 COSTS_N_INSNS (22), /* cost of FABS instruction. */
635 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
636 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
640 DUMMY_STRINGOP_ALGS},
641 1, /* scalar_stmt_cost. */
642 1, /* scalar load_cost. */
643 1, /* scalar_store_cost. */
644 1, /* vec_stmt_cost. */
645 1, /* vec_to_scalar_cost. */
646 1, /* scalar_to_vec_cost. */
647 1, /* vec_align_load_cost. */
648 2, /* vec_unalign_load_cost. */
649 1, /* vec_store_cost. */
650 3, /* cond_taken_branch_cost. */
651 1, /* cond_not_taken_branch_cost. */
652 };
653
654 static const
655 struct processor_costs i486_cost = { /* 486 specific costs */
656 COSTS_N_INSNS (1), /* cost of an add instruction */
657 COSTS_N_INSNS (1), /* cost of a lea instruction */
658 COSTS_N_INSNS (3), /* variable shift costs */
659 COSTS_N_INSNS (2), /* constant shift costs */
660 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
661 COSTS_N_INSNS (12), /* HI */
662 COSTS_N_INSNS (12), /* SI */
663 COSTS_N_INSNS (12), /* DI */
664 COSTS_N_INSNS (12)}, /* other */
665 1, /* cost of multiply per each bit set */
666 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
667 COSTS_N_INSNS (40), /* HI */
668 COSTS_N_INSNS (40), /* SI */
669 COSTS_N_INSNS (40), /* DI */
670 COSTS_N_INSNS (40)}, /* other */
671 COSTS_N_INSNS (3), /* cost of movsx */
672 COSTS_N_INSNS (2), /* cost of movzx */
673 15, /* "large" insn */
674 3, /* MOVE_RATIO */
675 4, /* cost for loading QImode using movzbl */
676 {2, 4, 2}, /* cost of loading integer registers
677 in QImode, HImode and SImode.
678 Relative to reg-reg move (2). */
679 {2, 4, 2}, /* cost of storing integer registers */
680 2, /* cost of reg,reg fld/fst */
681 {8, 8, 8}, /* cost of loading fp registers
682 in SFmode, DFmode and XFmode */
683 {8, 8, 8}, /* cost of storing fp registers
684 in SFmode, DFmode and XFmode */
685 2, /* cost of moving MMX register */
686 {4, 8}, /* cost of loading MMX registers
687 in SImode and DImode */
688 {4, 8}, /* cost of storing MMX registers
689 in SImode and DImode */
690 2, /* cost of moving SSE register */
691 {4, 8, 16}, /* cost of loading SSE registers
692 in SImode, DImode and TImode */
693 {4, 8, 16}, /* cost of storing SSE registers
694 in SImode, DImode and TImode */
695 3, /* MMX or SSE register to integer */
696 4, /* size of l1 cache. 486 has 8kB cache
697 shared for code and data, so 4kB is
698 not really precise. */
699 4, /* size of l2 cache */
700 0, /* size of prefetch block */
701 0, /* number of parallel prefetches */
702 1, /* Branch cost */
703 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
704 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
705 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
706 COSTS_N_INSNS (3), /* cost of FABS instruction. */
707 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
708 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
712 DUMMY_STRINGOP_ALGS},
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
724 };
725
726 static const
727 struct processor_costs pentium_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (1), /* cost of a lea instruction */
730 COSTS_N_INSNS (4), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (11), /* HI */
734 COSTS_N_INSNS (11), /* SI */
735 COSTS_N_INSNS (11), /* DI */
736 COSTS_N_INSNS (11)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (25), /* HI */
740 COSTS_N_INSNS (25), /* SI */
741 COSTS_N_INSNS (25), /* DI */
742 COSTS_N_INSNS (25)}, /* other */
743 COSTS_N_INSNS (3), /* cost of movsx */
744 COSTS_N_INSNS (2), /* cost of movzx */
745 8, /* "large" insn */
746 6, /* MOVE_RATIO */
747 6, /* cost for loading QImode using movzbl */
748 {2, 4, 2}, /* cost of loading integer registers
749 in QImode, HImode and SImode.
750 Relative to reg-reg move (2). */
751 {2, 4, 2}, /* cost of storing integer registers */
752 2, /* cost of reg,reg fld/fst */
753 {2, 2, 6}, /* cost of loading fp registers
754 in SFmode, DFmode and XFmode */
755 {4, 4, 6}, /* cost of storing fp registers
756 in SFmode, DFmode and XFmode */
757 8, /* cost of moving MMX register */
758 {8, 8}, /* cost of loading MMX registers
759 in SImode and DImode */
760 {8, 8}, /* cost of storing MMX registers
761 in SImode and DImode */
762 2, /* cost of moving SSE register */
763 {4, 8, 16}, /* cost of loading SSE registers
764 in SImode, DImode and TImode */
765 {4, 8, 16}, /* cost of storing SSE registers
766 in SImode, DImode and TImode */
767 3, /* MMX or SSE register to integer */
768 8, /* size of l1 cache. */
769 8, /* size of l2 cache */
770 0, /* size of prefetch block */
771 0, /* number of parallel prefetches */
772 2, /* Branch cost */
773 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
774 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
775 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
776 COSTS_N_INSNS (1), /* cost of FABS instruction. */
777 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
778 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
779 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
780 DUMMY_STRINGOP_ALGS},
781 {{libcall, {{-1, rep_prefix_4_byte}}},
782 DUMMY_STRINGOP_ALGS},
783 1, /* scalar_stmt_cost. */
784 1, /* scalar load_cost. */
785 1, /* scalar_store_cost. */
786 1, /* vec_stmt_cost. */
787 1, /* vec_to_scalar_cost. */
788 1, /* scalar_to_vec_cost. */
789 1, /* vec_align_load_cost. */
790 2, /* vec_unalign_load_cost. */
791 1, /* vec_store_cost. */
792 3, /* cond_taken_branch_cost. */
793 1, /* cond_not_taken_branch_cost. */
794 };
795
796 static const
797 struct processor_costs pentiumpro_cost = {
798 COSTS_N_INSNS (1), /* cost of an add instruction */
799 COSTS_N_INSNS (1), /* cost of a lea instruction */
800 COSTS_N_INSNS (1), /* variable shift costs */
801 COSTS_N_INSNS (1), /* constant shift costs */
802 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
803 COSTS_N_INSNS (4), /* HI */
804 COSTS_N_INSNS (4), /* SI */
805 COSTS_N_INSNS (4), /* DI */
806 COSTS_N_INSNS (4)}, /* other */
807 0, /* cost of multiply per each bit set */
808 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
809 COSTS_N_INSNS (17), /* HI */
810 COSTS_N_INSNS (17), /* SI */
811 COSTS_N_INSNS (17), /* DI */
812 COSTS_N_INSNS (17)}, /* other */
813 COSTS_N_INSNS (1), /* cost of movsx */
814 COSTS_N_INSNS (1), /* cost of movzx */
815 8, /* "large" insn */
816 6, /* MOVE_RATIO */
817 2, /* cost for loading QImode using movzbl */
818 {4, 4, 4}, /* cost of loading integer registers
819 in QImode, HImode and SImode.
820 Relative to reg-reg move (2). */
821 {2, 2, 2}, /* cost of storing integer registers */
822 2, /* cost of reg,reg fld/fst */
823 {2, 2, 6}, /* cost of loading fp registers
824 in SFmode, DFmode and XFmode */
825 {4, 4, 6}, /* cost of storing fp registers
826 in SFmode, DFmode and XFmode */
827 2, /* cost of moving MMX register */
828 {2, 2}, /* cost of loading MMX registers
829 in SImode and DImode */
830 {2, 2}, /* cost of storing MMX registers
831 in SImode and DImode */
832 2, /* cost of moving SSE register */
833 {2, 2, 8}, /* cost of loading SSE registers
834 in SImode, DImode and TImode */
835 {2, 2, 8}, /* cost of storing SSE registers
836 in SImode, DImode and TImode */
837 3, /* MMX or SSE register to integer */
838 8, /* size of l1 cache. */
839 256, /* size of l2 cache */
840 32, /* size of prefetch block */
841 6, /* number of parallel prefetches */
842 2, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (2), /* cost of FABS instruction. */
847 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
849 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
850      (we ensure the alignment).  For small blocks an inline loop is still a
851      noticeable win; for bigger blocks either rep movsl or rep movsb is the
852      way to go.  Rep movsb apparently has a more expensive startup time in the CPU,
853 but after 4K the difference is down in the noise. */
854 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
855 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
856 DUMMY_STRINGOP_ALGS},
857 {{rep_prefix_4_byte, {{1024, unrolled_loop},
858 {8192, rep_prefix_4_byte}, {-1, libcall}}},
859 DUMMY_STRINGOP_ALGS},
860 1, /* scalar_stmt_cost. */
861 1, /* scalar load_cost. */
862 1, /* scalar_store_cost. */
863 1, /* vec_stmt_cost. */
864 1, /* vec_to_scalar_cost. */
865 1, /* scalar_to_vec_cost. */
866 1, /* vec_align_load_cost. */
867 2, /* vec_unalign_load_cost. */
868 1, /* vec_store_cost. */
869 3, /* cond_taken_branch_cost. */
870 1, /* cond_not_taken_branch_cost. */
871 };
872
873 static const
874 struct processor_costs geode_cost = {
875 COSTS_N_INSNS (1), /* cost of an add instruction */
876 COSTS_N_INSNS (1), /* cost of a lea instruction */
877 COSTS_N_INSNS (2), /* variable shift costs */
878 COSTS_N_INSNS (1), /* constant shift costs */
879 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
880 COSTS_N_INSNS (4), /* HI */
881 COSTS_N_INSNS (7), /* SI */
882 COSTS_N_INSNS (7), /* DI */
883 COSTS_N_INSNS (7)}, /* other */
884 0, /* cost of multiply per each bit set */
885 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
886 COSTS_N_INSNS (23), /* HI */
887 COSTS_N_INSNS (39), /* SI */
888 COSTS_N_INSNS (39), /* DI */
889 COSTS_N_INSNS (39)}, /* other */
890 COSTS_N_INSNS (1), /* cost of movsx */
891 COSTS_N_INSNS (1), /* cost of movzx */
892 8, /* "large" insn */
893 4, /* MOVE_RATIO */
894 1, /* cost for loading QImode using movzbl */
895 {1, 1, 1}, /* cost of loading integer registers
896 in QImode, HImode and SImode.
897 Relative to reg-reg move (2). */
898 {1, 1, 1}, /* cost of storing integer registers */
899 1, /* cost of reg,reg fld/fst */
900 {1, 1, 1}, /* cost of loading fp registers
901 in SFmode, DFmode and XFmode */
902 {4, 6, 6}, /* cost of storing fp registers
903 in SFmode, DFmode and XFmode */
904
905 1, /* cost of moving MMX register */
906 {1, 1}, /* cost of loading MMX registers
907 in SImode and DImode */
908 {1, 1}, /* cost of storing MMX registers
909 in SImode and DImode */
910 1, /* cost of moving SSE register */
911 {1, 1, 1}, /* cost of loading SSE registers
912 in SImode, DImode and TImode */
913 {1, 1, 1}, /* cost of storing SSE registers
914 in SImode, DImode and TImode */
915 1, /* MMX or SSE register to integer */
916 64, /* size of l1 cache. */
917 128, /* size of l2 cache. */
918 32, /* size of prefetch block */
919 1, /* number of parallel prefetches */
920 1, /* Branch cost */
921 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
922 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
923 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
924 COSTS_N_INSNS (1), /* cost of FABS instruction. */
925 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
926 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
930 DUMMY_STRINGOP_ALGS},
931 1, /* scalar_stmt_cost. */
932 1, /* scalar load_cost. */
933 1, /* scalar_store_cost. */
934 1, /* vec_stmt_cost. */
935 1, /* vec_to_scalar_cost. */
936 1, /* scalar_to_vec_cost. */
937 1, /* vec_align_load_cost. */
938 2, /* vec_unalign_load_cost. */
939 1, /* vec_store_cost. */
940 3, /* cond_taken_branch_cost. */
941 1, /* cond_not_taken_branch_cost. */
942 };
943
944 static const
945 struct processor_costs k6_cost = {
946 COSTS_N_INSNS (1), /* cost of an add instruction */
947 COSTS_N_INSNS (2), /* cost of a lea instruction */
948 COSTS_N_INSNS (1), /* variable shift costs */
949 COSTS_N_INSNS (1), /* constant shift costs */
950 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
951 COSTS_N_INSNS (3), /* HI */
952 COSTS_N_INSNS (3), /* SI */
953 COSTS_N_INSNS (3), /* DI */
954 COSTS_N_INSNS (3)}, /* other */
955 0, /* cost of multiply per each bit set */
956 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
957 COSTS_N_INSNS (18), /* HI */
958 COSTS_N_INSNS (18), /* SI */
959 COSTS_N_INSNS (18), /* DI */
960 COSTS_N_INSNS (18)}, /* other */
961 COSTS_N_INSNS (2), /* cost of movsx */
962 COSTS_N_INSNS (2), /* cost of movzx */
963 8, /* "large" insn */
964 4, /* MOVE_RATIO */
965 3, /* cost for loading QImode using movzbl */
966 {4, 5, 4}, /* cost of loading integer registers
967 in QImode, HImode and SImode.
968 Relative to reg-reg move (2). */
969 {2, 3, 2}, /* cost of storing integer registers */
970 4, /* cost of reg,reg fld/fst */
971 {6, 6, 6}, /* cost of loading fp registers
972 in SFmode, DFmode and XFmode */
973 {4, 4, 4}, /* cost of storing fp registers
974 in SFmode, DFmode and XFmode */
975 2, /* cost of moving MMX register */
976 {2, 2}, /* cost of loading MMX registers
977 in SImode and DImode */
978 {2, 2}, /* cost of storing MMX registers
979 in SImode and DImode */
980 2, /* cost of moving SSE register */
981 {2, 2, 8}, /* cost of loading SSE registers
982 in SImode, DImode and TImode */
983 {2, 2, 8}, /* cost of storing SSE registers
984 in SImode, DImode and TImode */
985 6, /* MMX or SSE register to integer */
986 32, /* size of l1 cache. */
987 32, /* size of l2 cache. Some models
988 have integrated l2 cache, but
989 optimizing for k6 is not important
990 enough to worry about that. */
991 32, /* size of prefetch block */
992 1, /* number of parallel prefetches */
993 1, /* Branch cost */
994 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
995 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
996 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
997 COSTS_N_INSNS (2), /* cost of FABS instruction. */
998 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
999 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1003 DUMMY_STRINGOP_ALGS},
1004 1, /* scalar_stmt_cost. */
1005 1, /* scalar load_cost. */
1006 1, /* scalar_store_cost. */
1007 1, /* vec_stmt_cost. */
1008 1, /* vec_to_scalar_cost. */
1009 1, /* scalar_to_vec_cost. */
1010 1, /* vec_align_load_cost. */
1011 2, /* vec_unalign_load_cost. */
1012 1, /* vec_store_cost. */
1013 3, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1016
1017 static const
1018 struct processor_costs athlon_cost = {
1019 COSTS_N_INSNS (1), /* cost of an add instruction */
1020 COSTS_N_INSNS (2), /* cost of a lea instruction */
1021 COSTS_N_INSNS (1), /* variable shift costs */
1022 COSTS_N_INSNS (1), /* constant shift costs */
1023 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1024 COSTS_N_INSNS (5), /* HI */
1025 COSTS_N_INSNS (5), /* SI */
1026 COSTS_N_INSNS (5), /* DI */
1027 COSTS_N_INSNS (5)}, /* other */
1028 0, /* cost of multiply per each bit set */
1029 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1030 COSTS_N_INSNS (26), /* HI */
1031 COSTS_N_INSNS (42), /* SI */
1032 COSTS_N_INSNS (74), /* DI */
1033 COSTS_N_INSNS (74)}, /* other */
1034 COSTS_N_INSNS (1), /* cost of movsx */
1035 COSTS_N_INSNS (1), /* cost of movzx */
1036 8, /* "large" insn */
1037 9, /* MOVE_RATIO */
1038 4, /* cost for loading QImode using movzbl */
1039 {3, 4, 3}, /* cost of loading integer registers
1040 in QImode, HImode and SImode.
1041 Relative to reg-reg move (2). */
1042 {3, 4, 3}, /* cost of storing integer registers */
1043 4, /* cost of reg,reg fld/fst */
1044 {4, 4, 12}, /* cost of loading fp registers
1045 in SFmode, DFmode and XFmode */
1046 {6, 6, 8}, /* cost of storing fp registers
1047 in SFmode, DFmode and XFmode */
1048 2, /* cost of moving MMX register */
1049 {4, 4}, /* cost of loading MMX registers
1050 in SImode and DImode */
1051 {4, 4}, /* cost of storing MMX registers
1052 in SImode and DImode */
1053 2, /* cost of moving SSE register */
1054 {4, 4, 6}, /* cost of loading SSE registers
1055 in SImode, DImode and TImode */
1056 {4, 4, 5}, /* cost of storing SSE registers
1057 in SImode, DImode and TImode */
1058 5, /* MMX or SSE register to integer */
1059 64, /* size of l1 cache. */
1060 256, /* size of l2 cache. */
1061 64, /* size of prefetch block */
1062 6, /* number of parallel prefetches */
1063 5, /* Branch cost */
1064 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1065 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1066 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1067 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1068 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1069 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1070   /* For some reason, Athlon deals better with the REP prefix (relative to loops)
1071 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1072 128 bytes for memset. */
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1076 DUMMY_STRINGOP_ALGS},
1077 1, /* scalar_stmt_cost. */
1078 1, /* scalar load_cost. */
1079 1, /* scalar_store_cost. */
1080 1, /* vec_stmt_cost. */
1081 1, /* vec_to_scalar_cost. */
1082 1, /* scalar_to_vec_cost. */
1083 1, /* vec_align_load_cost. */
1084 2, /* vec_unalign_load_cost. */
1085 1, /* vec_store_cost. */
1086 3, /* cond_taken_branch_cost. */
1087 1, /* cond_not_taken_branch_cost. */
1088 };
1089
1090 static const
1091 struct processor_costs k8_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (2), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (3), /* SI */
1099 COSTS_N_INSNS (4), /* DI */
1100 COSTS_N_INSNS (5)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (26), /* HI */
1104 COSTS_N_INSNS (42), /* SI */
1105 COSTS_N_INSNS (74), /* DI */
1106 COSTS_N_INSNS (74)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {3, 4, 3}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {3, 4, 3}, /* cost of storing integer registers */
1116 4, /* cost of reg,reg fld/fst */
1117 {4, 4, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {6, 6, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {3, 3}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 3, 6}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 5}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 5, /* MMX or SSE register to integer */
1132 64, /* size of l1 cache. */
1133 512, /* size of l2 cache. */
1134 64, /* size of prefetch block */
1135 /* New AMD processors never drop prefetches; if they cannot be performed
1136 immediately, they are queued. We set number of simultaneous prefetches
1137 to a large constant to reflect this (it probably is not a good idea not
1138 to limit number of prefetches at all, as their execution also takes some
1139 time). */
1140 100, /* number of parallel prefetches */
1141 3, /* Branch cost */
1142 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1143 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1144 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1145 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1146 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1147 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1148 /* K8 has optimized REP instruction for medium sized blocks, but for very
1149 small blocks it is better to use loop. For large blocks, libcall can
1150      do nontemporal accesses and beat inline considerably.  */
1151 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1152 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 {{libcall, {{8, loop}, {24, unrolled_loop},
1154 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1155 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1156 4, /* scalar_stmt_cost. */
1157 2, /* scalar load_cost. */
1158 2, /* scalar_store_cost. */
1159 5, /* vec_stmt_cost. */
1160 0, /* vec_to_scalar_cost. */
1161 2, /* scalar_to_vec_cost. */
1162 2, /* vec_align_load_cost. */
1163 3, /* vec_unalign_load_cost. */
1164 3, /* vec_store_cost. */
1165 3, /* cond_taken_branch_cost. */
1166 2, /* cond_not_taken_branch_cost. */
1167 };
1168
1169 struct processor_costs amdfam10_cost = {
1170 COSTS_N_INSNS (1), /* cost of an add instruction */
1171 COSTS_N_INSNS (2), /* cost of a lea instruction */
1172 COSTS_N_INSNS (1), /* variable shift costs */
1173 COSTS_N_INSNS (1), /* constant shift costs */
1174 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1175 COSTS_N_INSNS (4), /* HI */
1176 COSTS_N_INSNS (3), /* SI */
1177 COSTS_N_INSNS (4), /* DI */
1178 COSTS_N_INSNS (5)}, /* other */
1179 0, /* cost of multiply per each bit set */
1180 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1181 COSTS_N_INSNS (35), /* HI */
1182 COSTS_N_INSNS (51), /* SI */
1183 COSTS_N_INSNS (83), /* DI */
1184 COSTS_N_INSNS (83)}, /* other */
1185 COSTS_N_INSNS (1), /* cost of movsx */
1186 COSTS_N_INSNS (1), /* cost of movzx */
1187 8, /* "large" insn */
1188 9, /* MOVE_RATIO */
1189 4, /* cost for loading QImode using movzbl */
1190 {3, 4, 3}, /* cost of loading integer registers
1191 in QImode, HImode and SImode.
1192 Relative to reg-reg move (2). */
1193 {3, 4, 3}, /* cost of storing integer registers */
1194 4, /* cost of reg,reg fld/fst */
1195 {4, 4, 12}, /* cost of loading fp registers
1196 in SFmode, DFmode and XFmode */
1197 {6, 6, 8}, /* cost of storing fp registers
1198 in SFmode, DFmode and XFmode */
1199 2, /* cost of moving MMX register */
1200 {3, 3}, /* cost of loading MMX registers
1201 in SImode and DImode */
1202 {4, 4}, /* cost of storing MMX registers
1203 in SImode and DImode */
1204 2, /* cost of moving SSE register */
1205 {4, 4, 3}, /* cost of loading SSE registers
1206 in SImode, DImode and TImode */
1207 {4, 4, 5}, /* cost of storing SSE registers
1208 in SImode, DImode and TImode */
1209 3, /* MMX or SSE register to integer */
1210 /* On K8:
1211 MOVD reg64, xmmreg Double FSTORE 4
1212 MOVD reg32, xmmreg Double FSTORE 4
1213 On AMDFAM10:
1214 MOVD reg64, xmmreg Double FADD 3
1215 1/1 1/1
1216 MOVD reg32, xmmreg Double FADD 3
1217 1/1 1/1 */
1218 64, /* size of l1 cache. */
1219 512, /* size of l2 cache. */
1220 64, /* size of prefetch block */
1221 /* New AMD processors never drop prefetches; if they cannot be performed
1222 immediately, they are queued. We set number of simultaneous prefetches
1223 to a large constant to reflect this (it probably is not a good idea not
1224 to limit number of prefetches at all, as their execution also takes some
1225 time). */
1226 100, /* number of parallel prefetches */
1227 2, /* Branch cost */
1228 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1229 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1230 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1231 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1232 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1233 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1234
1235 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1236 very small blocks it is better to use loop. For large blocks, libcall can
1237      do nontemporal accesses and beat inline considerably.  */
1238 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1239 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1240 {{libcall, {{8, loop}, {24, unrolled_loop},
1241 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1242 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1243 4, /* scalar_stmt_cost. */
1244 2, /* scalar load_cost. */
1245 2, /* scalar_store_cost. */
1246 6, /* vec_stmt_cost. */
1247 0, /* vec_to_scalar_cost. */
1248 2, /* scalar_to_vec_cost. */
1249 2, /* vec_align_load_cost. */
1250 2, /* vec_unalign_load_cost. */
1251 2, /* vec_store_cost. */
1252 2, /* cond_taken_branch_cost. */
1253 1, /* cond_not_taken_branch_cost. */
1254 };
1255
1256 struct processor_costs bdver1_cost = {
1257 COSTS_N_INSNS (1), /* cost of an add instruction */
1258 COSTS_N_INSNS (1), /* cost of a lea instruction */
1259 COSTS_N_INSNS (1), /* variable shift costs */
1260 COSTS_N_INSNS (1), /* constant shift costs */
1261 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1262 COSTS_N_INSNS (4), /* HI */
1263 COSTS_N_INSNS (4), /* SI */
1264 COSTS_N_INSNS (6), /* DI */
1265 COSTS_N_INSNS (6)}, /* other */
1266 0, /* cost of multiply per each bit set */
1267 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1268 COSTS_N_INSNS (35), /* HI */
1269 COSTS_N_INSNS (51), /* SI */
1270 COSTS_N_INSNS (83), /* DI */
1271 COSTS_N_INSNS (83)}, /* other */
1272 COSTS_N_INSNS (1), /* cost of movsx */
1273 COSTS_N_INSNS (1), /* cost of movzx */
1274 8, /* "large" insn */
1275 9, /* MOVE_RATIO */
1276 4, /* cost for loading QImode using movzbl */
1277 {5, 5, 4}, /* cost of loading integer registers
1278 in QImode, HImode and SImode.
1279 Relative to reg-reg move (2). */
1280 {4, 4, 4}, /* cost of storing integer registers */
1281 2, /* cost of reg,reg fld/fst */
1282 {5, 5, 12}, /* cost of loading fp registers
1283 in SFmode, DFmode and XFmode */
1284 {4, 4, 8}, /* cost of storing fp registers
1285 in SFmode, DFmode and XFmode */
1286 2, /* cost of moving MMX register */
1287 {4, 4}, /* cost of loading MMX registers
1288 in SImode and DImode */
1289 {4, 4}, /* cost of storing MMX registers
1290 in SImode and DImode */
1291 2, /* cost of moving SSE register */
1292 {4, 4, 4}, /* cost of loading SSE registers
1293 in SImode, DImode and TImode */
1294 {4, 4, 4}, /* cost of storing SSE registers
1295 in SImode, DImode and TImode */
1296 2, /* MMX or SSE register to integer */
1297 /* On K8:
1298 MOVD reg64, xmmreg Double FSTORE 4
1299 MOVD reg32, xmmreg Double FSTORE 4
1300 On AMDFAM10:
1301 MOVD reg64, xmmreg Double FADD 3
1302 1/1 1/1
1303 MOVD reg32, xmmreg Double FADD 3
1304 1/1 1/1 */
1305 16, /* size of l1 cache. */
1306 2048, /* size of l2 cache. */
1307 64, /* size of prefetch block */
1308 /* New AMD processors never drop prefetches; if they cannot be performed
1309 immediately, they are queued. We set number of simultaneous prefetches
1310 to a large constant to reflect this (it probably is not a good idea not
1311 to limit number of prefetches at all, as their execution also takes some
1312 time). */
1313 100, /* number of parallel prefetches */
1314 2, /* Branch cost */
1315 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1316 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1317 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1318 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1319 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1320 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1321
1322 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1323 very small blocks it is better to use loop. For large blocks, libcall
1324      can do nontemporal accesses and beat inline considerably.  */
1325 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1326 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1327 {{libcall, {{8, loop}, {24, unrolled_loop},
1328 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1329 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1330 6, /* scalar_stmt_cost. */
1331 4, /* scalar load_cost. */
1332 4, /* scalar_store_cost. */
1333 6, /* vec_stmt_cost. */
1334 0, /* vec_to_scalar_cost. */
1335 2, /* scalar_to_vec_cost. */
1336 4, /* vec_align_load_cost. */
1337 4, /* vec_unalign_load_cost. */
1338 4, /* vec_store_cost. */
1339 2, /* cond_taken_branch_cost. */
1340 1, /* cond_not_taken_branch_cost. */
1341 };
1342
1343 struct processor_costs bdver2_cost = {
1344 COSTS_N_INSNS (1), /* cost of an add instruction */
1345 COSTS_N_INSNS (1), /* cost of a lea instruction */
1346 COSTS_N_INSNS (1), /* variable shift costs */
1347 COSTS_N_INSNS (1), /* constant shift costs */
1348 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1349 COSTS_N_INSNS (4), /* HI */
1350 COSTS_N_INSNS (4), /* SI */
1351 COSTS_N_INSNS (6), /* DI */
1352 COSTS_N_INSNS (6)}, /* other */
1353 0, /* cost of multiply per each bit set */
1354 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1355 COSTS_N_INSNS (35), /* HI */
1356 COSTS_N_INSNS (51), /* SI */
1357 COSTS_N_INSNS (83), /* DI */
1358 COSTS_N_INSNS (83)}, /* other */
1359 COSTS_N_INSNS (1), /* cost of movsx */
1360 COSTS_N_INSNS (1), /* cost of movzx */
1361 8, /* "large" insn */
1362 9, /* MOVE_RATIO */
1363 4, /* cost for loading QImode using movzbl */
1364 {5, 5, 4}, /* cost of loading integer registers
1365 in QImode, HImode and SImode.
1366 Relative to reg-reg move (2). */
1367 {4, 4, 4}, /* cost of storing integer registers */
1368 2, /* cost of reg,reg fld/fst */
1369 {5, 5, 12}, /* cost of loading fp registers
1370 in SFmode, DFmode and XFmode */
1371 {4, 4, 8}, /* cost of storing fp registers
1372 in SFmode, DFmode and XFmode */
1373 2, /* cost of moving MMX register */
1374 {4, 4}, /* cost of loading MMX registers
1375 in SImode and DImode */
1376 {4, 4}, /* cost of storing MMX registers
1377 in SImode and DImode */
1378 2, /* cost of moving SSE register */
1379 {4, 4, 4}, /* cost of loading SSE registers
1380 in SImode, DImode and TImode */
1381 {4, 4, 4}, /* cost of storing SSE registers
1382 in SImode, DImode and TImode */
1383 2, /* MMX or SSE register to integer */
1384 /* On K8:
1385 MOVD reg64, xmmreg Double FSTORE 4
1386 MOVD reg32, xmmreg Double FSTORE 4
1387 On AMDFAM10:
1388 MOVD reg64, xmmreg Double FADD 3
1389 1/1 1/1
1390 MOVD reg32, xmmreg Double FADD 3
1391 1/1 1/1 */
1392 16, /* size of l1 cache. */
1393 2048, /* size of l2 cache. */
1394 64, /* size of prefetch block */
1395 /* New AMD processors never drop prefetches; if they cannot be performed
1396 immediately, they are queued. We set number of simultaneous prefetches
1397 to a large constant to reflect this (it probably is not a good idea not
1398 to limit number of prefetches at all, as their execution also takes some
1399 time). */
1400 100, /* number of parallel prefetches */
1401 2, /* Branch cost */
1402 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1403 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1404 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1405 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1406 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1407 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1408
1409 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1410 very small blocks it is better to use loop. For large blocks, libcall
1411      can do nontemporal accesses and beat inline considerably.  */
1412 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1413 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1414 {{libcall, {{8, loop}, {24, unrolled_loop},
1415 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1416 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1417 6, /* scalar_stmt_cost. */
1418 4, /* scalar load_cost. */
1419 4, /* scalar_store_cost. */
1420 6, /* vec_stmt_cost. */
1421 0, /* vec_to_scalar_cost. */
1422 2, /* scalar_to_vec_cost. */
1423 4, /* vec_align_load_cost. */
1424 4, /* vec_unalign_load_cost. */
1425 4, /* vec_store_cost. */
1426 2, /* cond_taken_branch_cost. */
1427 1, /* cond_not_taken_branch_cost. */
1428 };
1429
1430 struct processor_costs btver1_cost = {
1431 COSTS_N_INSNS (1), /* cost of an add instruction */
1432 COSTS_N_INSNS (2), /* cost of a lea instruction */
1433 COSTS_N_INSNS (1), /* variable shift costs */
1434 COSTS_N_INSNS (1), /* constant shift costs */
1435 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1436 COSTS_N_INSNS (4), /* HI */
1437 COSTS_N_INSNS (3), /* SI */
1438 COSTS_N_INSNS (4), /* DI */
1439 COSTS_N_INSNS (5)}, /* other */
1440 0, /* cost of multiply per each bit set */
1441 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1442 COSTS_N_INSNS (35), /* HI */
1443 COSTS_N_INSNS (51), /* SI */
1444 COSTS_N_INSNS (83), /* DI */
1445 COSTS_N_INSNS (83)}, /* other */
1446 COSTS_N_INSNS (1), /* cost of movsx */
1447 COSTS_N_INSNS (1), /* cost of movzx */
1448 8, /* "large" insn */
1449 9, /* MOVE_RATIO */
1450 4, /* cost for loading QImode using movzbl */
1451 {3, 4, 3}, /* cost of loading integer registers
1452 in QImode, HImode and SImode.
1453 Relative to reg-reg move (2). */
1454 {3, 4, 3}, /* cost of storing integer registers */
1455 4, /* cost of reg,reg fld/fst */
1456 {4, 4, 12}, /* cost of loading fp registers
1457 in SFmode, DFmode and XFmode */
1458 {6, 6, 8}, /* cost of storing fp registers
1459 in SFmode, DFmode and XFmode */
1460 2, /* cost of moving MMX register */
1461 {3, 3}, /* cost of loading MMX registers
1462 in SImode and DImode */
1463 {4, 4}, /* cost of storing MMX registers
1464 in SImode and DImode */
1465 2, /* cost of moving SSE register */
1466 {4, 4, 3}, /* cost of loading SSE registers
1467 in SImode, DImode and TImode */
1468 {4, 4, 5}, /* cost of storing SSE registers
1469 in SImode, DImode and TImode */
1470 3, /* MMX or SSE register to integer */
1471 /* On K8:
1472 MOVD reg64, xmmreg Double FSTORE 4
1473 MOVD reg32, xmmreg Double FSTORE 4
1474 On AMDFAM10:
1475 MOVD reg64, xmmreg Double FADD 3
1476 1/1 1/1
1477 MOVD reg32, xmmreg Double FADD 3
1478 1/1 1/1 */
1479 32, /* size of l1 cache. */
1480 512, /* size of l2 cache. */
1481 64, /* size of prefetch block */
1482 100, /* number of parallel prefetches */
1483 2, /* Branch cost */
1484 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1485 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1486 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1489 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1490
1491 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1492 very small blocks it is better to use loop. For large blocks, libcall can
1493      do nontemporal accesses and beat inline considerably.  */
1494 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1495 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1496 {{libcall, {{8, loop}, {24, unrolled_loop},
1497 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1498 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1499 4, /* scalar_stmt_cost. */
1500 2, /* scalar load_cost. */
1501 2, /* scalar_store_cost. */
1502 6, /* vec_stmt_cost. */
1503 0, /* vec_to_scalar_cost. */
1504 2, /* scalar_to_vec_cost. */
1505 2, /* vec_align_load_cost. */
1506 2, /* vec_unalign_load_cost. */
1507 2, /* vec_store_cost. */
1508 2, /* cond_taken_branch_cost. */
1509 1, /* cond_not_taken_branch_cost. */
1510 };
1511
1512 struct processor_costs btver2_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (2), /* cost of a lea instruction */
1515 COSTS_N_INSNS (1), /* variable shift costs */
1516 COSTS_N_INSNS (1), /* constant shift costs */
1517 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (4), /* HI */
1519 COSTS_N_INSNS (3), /* SI */
1520 COSTS_N_INSNS (4), /* DI */
1521 COSTS_N_INSNS (5)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (35), /* HI */
1525 COSTS_N_INSNS (51), /* SI */
1526 COSTS_N_INSNS (83), /* DI */
1527 COSTS_N_INSNS (83)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 8, /* "large" insn */
1531 9, /* MOVE_RATIO */
1532 4, /* cost for loading QImode using movzbl */
1533 {3, 4, 3}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {3, 4, 3}, /* cost of storing integer registers */
1537 4, /* cost of reg,reg fld/fst */
1538 {4, 4, 12}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {6, 6, 8}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {3, 3}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {4, 4}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 2, /* cost of moving SSE register */
1548 {4, 4, 3}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {4, 4, 5}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 3, /* MMX or SSE register to integer */
1553 /* On K8:
1554 MOVD reg64, xmmreg Double FSTORE 4
1555 MOVD reg32, xmmreg Double FSTORE 4
1556 On AMDFAM10:
1557 MOVD reg64, xmmreg Double FADD 3
1558 1/1 1/1
1559 MOVD reg32, xmmreg Double FADD 3
1560 1/1 1/1 */
1561 32, /* size of l1 cache. */
1562 2048, /* size of l2 cache. */
1563 64, /* size of prefetch block */
1564 100, /* number of parallel prefetches */
1565 2, /* Branch cost */
1566 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1567 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1568 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1569 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1570 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1571 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1572
1573 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1574 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1575 {{libcall, {{8, loop}, {24, unrolled_loop},
1576 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1577 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1578 4, /* scalar_stmt_cost. */
1579 2, /* scalar load_cost. */
1580 2, /* scalar_store_cost. */
1581 6, /* vec_stmt_cost. */
1582 0, /* vec_to_scalar_cost. */
1583 2, /* scalar_to_vec_cost. */
1584 2, /* vec_align_load_cost. */
1585 2, /* vec_unalign_load_cost. */
1586 2, /* vec_store_cost. */
1587 2, /* cond_taken_branch_cost. */
1588 1, /* cond_not_taken_branch_cost. */
1589 };
1590
1591 static const
1592 struct processor_costs pentium4_cost = {
1593 COSTS_N_INSNS (1), /* cost of an add instruction */
1594 COSTS_N_INSNS (3), /* cost of a lea instruction */
1595 COSTS_N_INSNS (4), /* variable shift costs */
1596 COSTS_N_INSNS (4), /* constant shift costs */
1597 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1598 COSTS_N_INSNS (15), /* HI */
1599 COSTS_N_INSNS (15), /* SI */
1600 COSTS_N_INSNS (15), /* DI */
1601 COSTS_N_INSNS (15)}, /* other */
1602 0, /* cost of multiply per each bit set */
1603 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1604 COSTS_N_INSNS (56), /* HI */
1605 COSTS_N_INSNS (56), /* SI */
1606 COSTS_N_INSNS (56), /* DI */
1607 COSTS_N_INSNS (56)}, /* other */
1608 COSTS_N_INSNS (1), /* cost of movsx */
1609 COSTS_N_INSNS (1), /* cost of movzx */
1610 16, /* "large" insn */
1611 6, /* MOVE_RATIO */
1612 2, /* cost for loading QImode using movzbl */
1613 {4, 5, 4}, /* cost of loading integer registers
1614 in QImode, HImode and SImode.
1615 Relative to reg-reg move (2). */
1616 {2, 3, 2}, /* cost of storing integer registers */
1617 2, /* cost of reg,reg fld/fst */
1618 {2, 2, 6}, /* cost of loading fp registers
1619 in SFmode, DFmode and XFmode */
1620 {4, 4, 6}, /* cost of storing fp registers
1621 in SFmode, DFmode and XFmode */
1622 2, /* cost of moving MMX register */
1623 {2, 2}, /* cost of loading MMX registers
1624 in SImode and DImode */
1625 {2, 2}, /* cost of storing MMX registers
1626 in SImode and DImode */
1627 12, /* cost of moving SSE register */
1628 {12, 12, 12}, /* cost of loading SSE registers
1629 in SImode, DImode and TImode */
1630 {2, 2, 8}, /* cost of storing SSE registers
1631 in SImode, DImode and TImode */
1632 10, /* MMX or SSE register to integer */
1633 8, /* size of l1 cache. */
1634 256, /* size of l2 cache. */
1635 64, /* size of prefetch block */
1636 6, /* number of parallel prefetches */
1637 2, /* Branch cost */
1638 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1639 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1640 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1641 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1642 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1643 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1644 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1645 DUMMY_STRINGOP_ALGS},
1646 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1647 {-1, libcall}}},
1648 DUMMY_STRINGOP_ALGS},
1649 1, /* scalar_stmt_cost. */
1650 1, /* scalar load_cost. */
1651 1, /* scalar_store_cost. */
1652 1, /* vec_stmt_cost. */
1653 1, /* vec_to_scalar_cost. */
1654 1, /* scalar_to_vec_cost. */
1655 1, /* vec_align_load_cost. */
1656 2, /* vec_unalign_load_cost. */
1657 1, /* vec_store_cost. */
1658 3, /* cond_taken_branch_cost. */
1659 1, /* cond_not_taken_branch_cost. */
1660 };
1661
1662 static const
1663 struct processor_costs nocona_cost = {
1664 COSTS_N_INSNS (1), /* cost of an add instruction */
1665 COSTS_N_INSNS (1), /* cost of a lea instruction */
1666 COSTS_N_INSNS (1), /* variable shift costs */
1667 COSTS_N_INSNS (1), /* constant shift costs */
1668 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1669 COSTS_N_INSNS (10), /* HI */
1670 COSTS_N_INSNS (10), /* SI */
1671 COSTS_N_INSNS (10), /* DI */
1672 COSTS_N_INSNS (10)}, /* other */
1673 0, /* cost of multiply per each bit set */
1674 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1675 COSTS_N_INSNS (66), /* HI */
1676 COSTS_N_INSNS (66), /* SI */
1677 COSTS_N_INSNS (66), /* DI */
1678 COSTS_N_INSNS (66)}, /* other */
1679 COSTS_N_INSNS (1), /* cost of movsx */
1680 COSTS_N_INSNS (1), /* cost of movzx */
1681 16, /* "large" insn */
1682 17, /* MOVE_RATIO */
1683 4, /* cost for loading QImode using movzbl */
1684 {4, 4, 4}, /* cost of loading integer registers
1685 in QImode, HImode and SImode.
1686 Relative to reg-reg move (2). */
1687 {4, 4, 4}, /* cost of storing integer registers */
1688 3, /* cost of reg,reg fld/fst */
1689 {12, 12, 12}, /* cost of loading fp registers
1690 in SFmode, DFmode and XFmode */
1691 {4, 4, 4}, /* cost of storing fp registers
1692 in SFmode, DFmode and XFmode */
1693 6, /* cost of moving MMX register */
1694 {12, 12}, /* cost of loading MMX registers
1695 in SImode and DImode */
1696 {12, 12}, /* cost of storing MMX registers
1697 in SImode and DImode */
1698 6, /* cost of moving SSE register */
1699 {12, 12, 12}, /* cost of loading SSE registers
1700 in SImode, DImode and TImode */
1701 {12, 12, 12}, /* cost of storing SSE registers
1702 in SImode, DImode and TImode */
1703 8, /* MMX or SSE register to integer */
1704 8, /* size of l1 cache. */
1705 1024, /* size of l2 cache. */
1706 128, /* size of prefetch block */
1707 8, /* number of parallel prefetches */
1708 1, /* Branch cost */
1709 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1710 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1711 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1712 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1713 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1714 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1715 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1716 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1717 {100000, unrolled_loop}, {-1, libcall}}}},
1718 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1719 {-1, libcall}}},
1720 {libcall, {{24, loop}, {64, unrolled_loop},
1721 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1722 1, /* scalar_stmt_cost. */
1723 1, /* scalar load_cost. */
1724 1, /* scalar_store_cost. */
1725 1, /* vec_stmt_cost. */
1726 1, /* vec_to_scalar_cost. */
1727 1, /* scalar_to_vec_cost. */
1728 1, /* vec_align_load_cost. */
1729 2, /* vec_unalign_load_cost. */
1730 1, /* vec_store_cost. */
1731 3, /* cond_taken_branch_cost. */
1732 1, /* cond_not_taken_branch_cost. */
1733 };
1734
1735 static const
1736 struct processor_costs atom_cost = {
1737 COSTS_N_INSNS (1), /* cost of an add instruction */
1738 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1739 COSTS_N_INSNS (1), /* variable shift costs */
1740 COSTS_N_INSNS (1), /* constant shift costs */
1741 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1742 COSTS_N_INSNS (4), /* HI */
1743 COSTS_N_INSNS (3), /* SI */
1744 COSTS_N_INSNS (4), /* DI */
1745 COSTS_N_INSNS (2)}, /* other */
1746 0, /* cost of multiply per each bit set */
1747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1748 COSTS_N_INSNS (26), /* HI */
1749 COSTS_N_INSNS (42), /* SI */
1750 COSTS_N_INSNS (74), /* DI */
1751 COSTS_N_INSNS (74)}, /* other */
1752 COSTS_N_INSNS (1), /* cost of movsx */
1753 COSTS_N_INSNS (1), /* cost of movzx */
1754 8, /* "large" insn */
1755 17, /* MOVE_RATIO */
1756 4, /* cost for loading QImode using movzbl */
1757 {4, 4, 4}, /* cost of loading integer registers
1758 in QImode, HImode and SImode.
1759 Relative to reg-reg move (2). */
1760 {4, 4, 4}, /* cost of storing integer registers */
1761 4, /* cost of reg,reg fld/fst */
1762 {12, 12, 12}, /* cost of loading fp registers
1763 in SFmode, DFmode and XFmode */
1764 {6, 6, 8}, /* cost of storing fp registers
1765 in SFmode, DFmode and XFmode */
1766 2, /* cost of moving MMX register */
1767 {8, 8}, /* cost of loading MMX registers
1768 in SImode and DImode */
1769 {8, 8}, /* cost of storing MMX registers
1770 in SImode and DImode */
1771 2, /* cost of moving SSE register */
1772 {8, 8, 8}, /* cost of loading SSE registers
1773 in SImode, DImode and TImode */
1774 {8, 8, 8}, /* cost of storing SSE registers
1775 in SImode, DImode and TImode */
1776 5, /* MMX or SSE register to integer */
1777 32, /* size of l1 cache. */
1778 256, /* size of l2 cache. */
1779 64, /* size of prefetch block */
1780 6, /* number of parallel prefetches */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1789 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1790 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 {{libcall, {{8, loop}, {15, unrolled_loop},
1792 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1793 {libcall, {{24, loop}, {32, unrolled_loop},
1794 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1795 1, /* scalar_stmt_cost. */
1796 1, /* scalar load_cost. */
1797 1, /* scalar_store_cost. */
1798 1, /* vec_stmt_cost. */
1799 1, /* vec_to_scalar_cost. */
1800 1, /* scalar_to_vec_cost. */
1801 1, /* vec_align_load_cost. */
1802 2, /* vec_unalign_load_cost. */
1803 1, /* vec_store_cost. */
1804 3, /* cond_taken_branch_cost. */
1805 1, /* cond_not_taken_branch_cost. */
1806 };
1807
1808 /* Generic64 should produce code tuned for Nocona and K8. */
1809 static const
1810 struct processor_costs generic64_cost = {
1811 COSTS_N_INSNS (1), /* cost of an add instruction */
1812 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1813 this cost, however, our current implementation of synth_mult results in
1814 the use of unnecessary temporary registers, causing regressions on several
1815 SPECfp benchmarks. */
1816 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1817 COSTS_N_INSNS (1), /* variable shift costs */
1818 COSTS_N_INSNS (1), /* constant shift costs */
1819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1820 COSTS_N_INSNS (4), /* HI */
1821 COSTS_N_INSNS (3), /* SI */
1822 COSTS_N_INSNS (4), /* DI */
1823 COSTS_N_INSNS (2)}, /* other */
1824 0, /* cost of multiply per each bit set */
1825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1826 COSTS_N_INSNS (26), /* HI */
1827 COSTS_N_INSNS (42), /* SI */
1828 COSTS_N_INSNS (74), /* DI */
1829 COSTS_N_INSNS (74)}, /* other */
1830 COSTS_N_INSNS (1), /* cost of movsx */
1831 COSTS_N_INSNS (1), /* cost of movzx */
1832 8, /* "large" insn */
1833 17, /* MOVE_RATIO */
1834 4, /* cost for loading QImode using movzbl */
1835 {4, 4, 4}, /* cost of loading integer registers
1836 in QImode, HImode and SImode.
1837 Relative to reg-reg move (2). */
1838 {4, 4, 4}, /* cost of storing integer registers */
1839 4, /* cost of reg,reg fld/fst */
1840 {12, 12, 12}, /* cost of loading fp registers
1841 in SFmode, DFmode and XFmode */
1842 {6, 6, 8}, /* cost of storing fp registers
1843 in SFmode, DFmode and XFmode */
1844 2, /* cost of moving MMX register */
1845 {8, 8}, /* cost of loading MMX registers
1846 in SImode and DImode */
1847 {8, 8}, /* cost of storing MMX registers
1848 in SImode and DImode */
1849 2, /* cost of moving SSE register */
1850 {8, 8, 8}, /* cost of loading SSE registers
1851 in SImode, DImode and TImode */
1852 {8, 8, 8}, /* cost of storing SSE registers
1853 in SImode, DImode and TImode */
1854 5, /* MMX or SSE register to integer */
1855 32, /* size of l1 cache. */
1856 512, /* size of l2 cache. */
1857 64, /* size of prefetch block */
1858 6, /* number of parallel prefetches */
1859 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1860 value is increased to the perhaps more appropriate value of 5. */
1861 3, /* Branch cost */
1862 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1863 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1864 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1865 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1866 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1867 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1868 {DUMMY_STRINGOP_ALGS,
1869 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1870 {DUMMY_STRINGOP_ALGS,
1871 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1872 1, /* scalar_stmt_cost. */
1873 1, /* scalar load_cost. */
1874 1, /* scalar_store_cost. */
1875 1, /* vec_stmt_cost. */
1876 1, /* vec_to_scalar_cost. */
1877 1, /* scalar_to_vec_cost. */
1878 1, /* vec_align_load_cost. */
1879 2, /* vec_unalign_load_cost. */
1880 1, /* vec_store_cost. */
1881 3, /* cond_taken_branch_cost. */
1882 1, /* cond_not_taken_branch_cost. */
1883 };
1884
1885 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1886 Athlon and K8. */
1887 static const
1888 struct processor_costs generic32_cost = {
1889 COSTS_N_INSNS (1), /* cost of an add instruction */
1890 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1891 COSTS_N_INSNS (1), /* variable shift costs */
1892 COSTS_N_INSNS (1), /* constant shift costs */
1893 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1894 COSTS_N_INSNS (4), /* HI */
1895 COSTS_N_INSNS (3), /* SI */
1896 COSTS_N_INSNS (4), /* DI */
1897 COSTS_N_INSNS (2)}, /* other */
1898 0, /* cost of multiply per each bit set */
1899 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1900 COSTS_N_INSNS (26), /* HI */
1901 COSTS_N_INSNS (42), /* SI */
1902 COSTS_N_INSNS (74), /* DI */
1903 COSTS_N_INSNS (74)}, /* other */
1904 COSTS_N_INSNS (1), /* cost of movsx */
1905 COSTS_N_INSNS (1), /* cost of movzx */
1906 8, /* "large" insn */
1907 17, /* MOVE_RATIO */
1908 4, /* cost for loading QImode using movzbl */
1909 {4, 4, 4}, /* cost of loading integer registers
1910 in QImode, HImode and SImode.
1911 Relative to reg-reg move (2). */
1912 {4, 4, 4}, /* cost of storing integer registers */
1913 4, /* cost of reg,reg fld/fst */
1914 {12, 12, 12}, /* cost of loading fp registers
1915 in SFmode, DFmode and XFmode */
1916 {6, 6, 8}, /* cost of storing fp registers
1917 in SFmode, DFmode and XFmode */
1918 2, /* cost of moving MMX register */
1919 {8, 8}, /* cost of loading MMX registers
1920 in SImode and DImode */
1921 {8, 8}, /* cost of storing MMX registers
1922 in SImode and DImode */
1923 2, /* cost of moving SSE register */
1924 {8, 8, 8}, /* cost of loading SSE registers
1925 in SImode, DImode and TImode */
1926 {8, 8, 8}, /* cost of storing SSE registers
1927 in SImode, DImode and TImode */
1928 5, /* MMX or SSE register to integer */
1929 32, /* size of l1 cache. */
1930 256, /* size of l2 cache. */
1931 64, /* size of prefetch block */
1932 6, /* number of parallel prefetches */
1933 3, /* Branch cost */
1934 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1935 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1936 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1937 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1938 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1939 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1940 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1941 DUMMY_STRINGOP_ALGS},
1942 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1943 DUMMY_STRINGOP_ALGS},
1944 1, /* scalar_stmt_cost. */
1945 1, /* scalar load_cost. */
1946 1, /* scalar_store_cost. */
1947 1, /* vec_stmt_cost. */
1948 1, /* vec_to_scalar_cost. */
1949 1, /* scalar_to_vec_cost. */
1950 1, /* vec_align_load_cost. */
1951 2, /* vec_unalign_load_cost. */
1952 1, /* vec_store_cost. */
1953 3, /* cond_taken_branch_cost. */
1954 1, /* cond_not_taken_branch_cost. */
1955 };
1956
1957 /* Set by -mtune. */
1958 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1959
1960 /* Set by -mtune or -Os. */
1961 const struct processor_costs *ix86_cost = &pentium_cost;
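/* Both pointers start out at pentium_cost; during option processing they are
   re-pointed at the selected entry of processor_target_table (roughly
   ix86_tune_cost = processor_target_table[ix86_tune].cost), and -Os then
   substitutes a size-oriented table for ix86_cost.  This is a sketch of the
   intended flow; see ix86_option_override_internal below.  */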
1962
1963 /* Processor feature/optimization bitmasks. */
1964 #define m_386 (1<<PROCESSOR_I386)
1965 #define m_486 (1<<PROCESSOR_I486)
1966 #define m_PENT (1<<PROCESSOR_PENTIUM)
1967 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1968 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1969 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1970 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1971 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1972 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1973 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1974 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1975 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1976 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1977 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1978 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1979 #define m_ATOM (1<<PROCESSOR_ATOM)
1980
1981 #define m_GEODE (1<<PROCESSOR_GEODE)
1982 #define m_K6 (1<<PROCESSOR_K6)
1983 #define m_K6_GEODE (m_K6 | m_GEODE)
1984 #define m_K8 (1<<PROCESSOR_K8)
1985 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1986 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1987 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1988 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1989 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1990 #define m_BDVER (m_BDVER1 | m_BDVER2)
1991 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1992 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1993 #define m_BTVER (m_BTVER1 | m_BTVER2)
1994 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1995
1996 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1997 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1998
1999 /* Generic instruction choice should be a common subset of supported CPUs
2000 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
2001 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
2002
2003 /* Feature tests against the various tunings. */
2004 unsigned char ix86_tune_features[X86_TUNE_LAST];
2005
2006 /* Feature tests against the various tunings used to create ix86_tune_features
2007 based on the processor mask. */
2008 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2009 /* X86_TUNE_USE_LEAVE: leave does not affect Nocona SPEC2000 results
2010 negatively, so enabling it for Generic64 seems like a good code-size
2011 tradeoff.  We can't enable it for 32-bit generic because it does not
2012 work well with PPro-based chips. */
2013 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
2014
2015 /* X86_TUNE_PUSH_MEMORY */
2016 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2017
2018 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
2019 m_486 | m_PENT,
2020
2021 /* X86_TUNE_UNROLL_STRLEN */
2022 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
2023
2024 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to P4 based
2025 on simulation results, but once P4 shipped no performance benefit
2026 was observed from them, and they increase code size.
2027 As a result, icc never generates branch hints. */
2028 0,
2029
2030 /* X86_TUNE_DOUBLE_WITH_ADD */
2031 ~m_386,
2032
2033 /* X86_TUNE_USE_SAHF */
2034 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
2035
2036 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
2037 partial dependencies. */
2038 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2039
2040 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
2041 register stalls on the Generic32 compilation setting as well.  However,
2042 in the current implementation partial register stalls are not eliminated
2043 very well - they can be introduced via subregs synthesized by combine
2044 and can happen in caller/callee saving sequences.  Because this option
2045 pays off little on PPro-based chips and conflicts with the partial-reg
2046 dependencies used by Athlon/P4-based chips, it is better to leave it off
2047 for generic32 for now. */
2048 m_PPRO,
2049
2050 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
2051 m_CORE2I7 | m_GENERIC,
2052
2053 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
2054 on 16-bit immediate moves into memory on Core2 and Corei7. */
2055 m_CORE2I7 | m_GENERIC,
2056
2057 /* X86_TUNE_USE_HIMODE_FIOP */
2058 m_386 | m_486 | m_K6_GEODE,
2059
2060 /* X86_TUNE_USE_SIMODE_FIOP */
2061 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
2062
2063 /* X86_TUNE_USE_MOV0 */
2064 m_K6,
2065
2066 /* X86_TUNE_USE_CLTD */
2067 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
2068
2069 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2070 m_PENT4,
2071
2072 /* X86_TUNE_SPLIT_LONG_MOVES */
2073 m_PPRO,
2074
2075 /* X86_TUNE_READ_MODIFY_WRITE */
2076 ~m_PENT,
2077
2078 /* X86_TUNE_READ_MODIFY */
2079 ~(m_PENT | m_PPRO),
2080
2081 /* X86_TUNE_PROMOTE_QIMODE */
2082 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2083
2084 /* X86_TUNE_FAST_PREFIX */
2085 ~(m_386 | m_486 | m_PENT),
2086
2087 /* X86_TUNE_SINGLE_STRINGOP */
2088 m_386 | m_P4_NOCONA,
2089
2090 /* X86_TUNE_QIMODE_MATH */
2091 ~0,
2092
2093 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2094 register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
2095 might be considered for Generic32 if our scheme for avoiding partial
2096 stalls were more effective. */
2097 ~m_PPRO,
2098
2099 /* X86_TUNE_PROMOTE_QI_REGS */
2100 0,
2101
2102 /* X86_TUNE_PROMOTE_HI_REGS */
2103 m_PPRO,
2104
2105 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2106 over esp addition. */
2107 m_386 | m_486 | m_PENT | m_PPRO,
2108
2109 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2110 over esp addition. */
2111 m_PENT,
2112
2113 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2114 over esp subtraction. */
2115 m_386 | m_486 | m_PENT | m_K6_GEODE,
2116
2117 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2118 over esp subtraction. */
2119 m_PENT | m_K6_GEODE,
2120
2121 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2122 for DFmode copies */
2123 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2124
2125 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2126 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2127
2128 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2129 conflict here between PPro/Pentium4-based chips that treat 128-bit
2130 SSE registers as single units and K8-based chips that divide SSE
2131 registers into two 64-bit halves.  This knob promotes all store destinations
2132 to 128 bits to allow register renaming on 128-bit SSE units, but usually
2133 results in one extra micro-op on 64-bit SSE units.  Experimental results
2134 show that disabling this option on P4 causes an over 20% SPECfp regression,
2135 while enabling it on K8 causes roughly a 2.4% regression that can be partly
2136 masked by careful scheduling of moves. */
2137 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2138
2139 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2140 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
2141
2142 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2143 m_COREI7 | m_BDVER,
2144
2145 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2146 m_BDVER,
2147
2148 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where types and dependencies
2149 are resolved on SSE register parts instead of whole registers, so we may
2150 maintain just the lower part of scalar values in the proper format, leaving
2151 the upper part undefined. */
2152 m_ATHLON_K8,
2153
2154 /* X86_TUNE_SSE_TYPELESS_STORES */
2155 m_AMD_MULTIPLE,
2156
2157 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2158 m_PPRO | m_P4_NOCONA,
2159
2160 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2161 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2162
2163 /* X86_TUNE_PROLOGUE_USING_MOVE */
2164 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2165
2166 /* X86_TUNE_EPILOGUE_USING_MOVE */
2167 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2168
2169 /* X86_TUNE_SHIFT1 */
2170 ~m_486,
2171
2172 /* X86_TUNE_USE_FFREEP */
2173 m_AMD_MULTIPLE,
2174
2175 /* X86_TUNE_INTER_UNIT_MOVES */
2176 ~(m_AMD_MULTIPLE | m_GENERIC),
2177
2178 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2179 ~(m_AMDFAM10 | m_BDVER),
2180
2181 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2182 than 4 branch instructions in the 16 byte window. */
2183 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2184
2185 /* X86_TUNE_SCHEDULE */
2186 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2187
2188 /* X86_TUNE_USE_BT */
2189 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2190
2191 /* X86_TUNE_USE_INCDEC */
2192 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2193
2194 /* X86_TUNE_PAD_RETURNS */
2195 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2196
2197 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2198 m_ATOM,
2199
2200 /* X86_TUNE_EXT_80387_CONSTANTS */
2201 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2202
2203 /* X86_TUNE_SHORTEN_X87_SSE */
2204 ~m_K8,
2205
2206 /* X86_TUNE_AVOID_VECTOR_DECODE */
2207 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2208
2209 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2210 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2211 ~(m_386 | m_486),
2212
2213 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: An imul of a 32-bit constant and memory
2214 takes the vector path on AMD machines. */
2215 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2216
2217 /* X86_TUNE_SLOW_IMUL_IMM8: An imul of an 8-bit constant takes the vector
2218 path on AMD machines. */
2219 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
2220
2221 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2222 than a MOV. */
2223 m_PENT,
2224
2225 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2226 but one byte longer. */
2227 m_PENT,
2228
2229 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2230 operand that cannot be represented using a modRM byte.  The XOR
2231 replacement is long decoded, so this split helps here as well. */
2232 m_K6,
2233
2234 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2235 from FP to FP. */
2236 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2237
2238 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2239 from integer to FP. */
2240 m_AMDFAM10,
2241
2242 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2243 with a subsequent conditional jump instruction into a single
2244 compare-and-branch uop. */
2245 m_BDVER,
2246
2247 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2248 will impact LEA instruction selection. */
2249 m_ATOM,
2250
2251 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2252 instructions. */
2253 ~m_ATOM,
2254
2255 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2256 at -O3. For the moment, the prefetching seems badly tuned for Intel
2257 chips. */
2258 m_K6_GEODE | m_AMD_MULTIPLE,
2259
2260 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2261 the auto-vectorizer. */
2262 m_BDVER,
2263
2264 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2265 during reassociation of integer computation. */
2266 m_ATOM,
2267
2268 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2269 during reassociation of fp computation. */
2270 m_ATOM,
2271
2272 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2273 regs instead of memory. */
2274 m_COREI7 | m_CORE2I7
2275 };
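/* A minimal sketch of how the table above is consumed once -mtune has been
   resolved (see ix86_option_override_internal):

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   i.e. a tuning flag is enabled exactly when the bit for the selected
   processor is set in its mask.  */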
2276
2277 /* Feature tests against the various architecture variations. */
2278 unsigned char ix86_arch_features[X86_ARCH_LAST];
2279
2280 /* Feature tests against the various architecture variations, used to create
2281 ix86_arch_features based on the processor mask. */
2282 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2283 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2284 ~(m_386 | m_486 | m_PENT | m_K6),
2285
2286 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2287 ~m_386,
2288
2289 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2290 ~(m_386 | m_486),
2291
2292 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2293 ~m_386,
2294
2295 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2296 ~m_386,
2297 };
2298
2299 static const unsigned int x86_accumulate_outgoing_args
2300 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2301
2302 static const unsigned int x86_arch_always_fancy_math_387
2303 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2304
2305 static const unsigned int x86_avx256_split_unaligned_load
2306 = m_COREI7 | m_GENERIC;
2307
2308 static const unsigned int x86_avx256_split_unaligned_store
2309 = m_COREI7 | m_BDVER | m_GENERIC;
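/* These masks are tested the same way as the tuning masks above: for example,
   (x86_avx256_split_unaligned_load & ix86_tune_mask) being nonzero turns on
   MASK_AVX256_SPLIT_UNALIGNED_LOAD by default, unless the user specified the
   option explicitly (a sketch of the intended use, not the full handling).  */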
2310
2311 /* If the average insn count for a single function invocation is
2312 lower than this constant, emit fast (but longer) prologue and
2313 epilogue code. */
2314 #define FAST_PROLOGUE_INSN_COUNT 20
2315
2316 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2317 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2318 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2319 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2320
2321 /* Array of the smallest class containing reg number REGNO, indexed by
2322 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2323
2324 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2325 {
2326 /* ax, dx, cx, bx */
2327 AREG, DREG, CREG, BREG,
2328 /* si, di, bp, sp */
2329 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2330 /* FP registers */
2331 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2332 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2333 /* arg pointer */
2334 NON_Q_REGS,
2335 /* flags, fpsr, fpcr, frame */
2336 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2337 /* SSE registers */
2338 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2339 SSE_REGS, SSE_REGS,
2340 /* MMX registers */
2341 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2342 MMX_REGS, MMX_REGS,
2343 /* REX registers */
2344 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2345 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2346 /* SSE REX registers */
2347 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2348 SSE_REGS, SSE_REGS,
2349 };
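/* For example, REGNO_REG_CLASS (AX_REG) is AREG, while %esp and %ebp fall in
   NON_Q_REGS because they have no addressable 8-bit subregisters.  */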
2350
2351 /* The "default" register map used in 32bit mode. */
2352
2353 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2354 {
2355 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2356 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2357 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2358 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2359 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2360 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2361 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2362 };
2363
2364 /* The "default" register map used in 64bit mode. */
2365
2366 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2367 {
2368 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2369 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2370 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2371 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2372 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2373 8,9,10,11,12,13,14,15, /* extended integer registers */
2374 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2375 };
2376
2377 /* Define the register numbers to be used in Dwarf debugging information.
2378 The SVR4 reference port C compiler uses the following register numbers
2379 in its Dwarf output code:
2380 0 for %eax (gcc regno = 0)
2381 1 for %ecx (gcc regno = 2)
2382 2 for %edx (gcc regno = 1)
2383 3 for %ebx (gcc regno = 3)
2384 4 for %esp (gcc regno = 7)
2385 5 for %ebp (gcc regno = 6)
2386 6 for %esi (gcc regno = 4)
2387 7 for %edi (gcc regno = 5)
2388 The following three DWARF register numbers are never generated by
2389 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2390 believes these numbers have these meanings.
2391 8 for %eip (no gcc equivalent)
2392 9 for %eflags (gcc regno = 17)
2393 10 for %trapno (no gcc equivalent)
2394 It is not at all clear how we should number the FP stack registers
2395 for the x86 architecture. If the version of SDB on x86/svr4 were
2396 a bit less brain dead with respect to floating-point then we would
2397 have a precedent to follow with respect to DWARF register numbers
2398 for x86 FP registers, but the SDB on x86/svr4 is so completely
2399 broken with respect to FP registers that it is hardly worth thinking
2400 of it as something to strive for compatibility with.
2401 The version of x86/svr4 SDB I have at the moment does (partially)
2402 seem to believe that DWARF register number 11 is associated with
2403 the x86 register %st(0), but that's about all. Higher DWARF
2404 register numbers don't seem to be associated with anything in
2405 particular, and even for DWARF regno 11, SDB only seems to under-
2406 stand that it should say that a variable lives in %st(0) (when
2407 asked via an `=' command) if we said it was in DWARF regno 11,
2408 but SDB still prints garbage when asked for the value of the
2409 variable in question (via a `/' command).
2410 (Also note that the labels SDB prints for various FP stack regs
2411 when doing an `x' command are all wrong.)
2412 Note that these problems generally don't affect the native SVR4
2413 C compiler because it doesn't allow the use of -O with -g and
2414 because when it is *not* optimizing, it allocates a memory
2415 location for each floating-point variable, and the memory
2416 location is what gets described in the DWARF AT_location
2417 attribute for the variable in question.
2418 Regardless of the severe mental illness of the x86/svr4 SDB, we
2419 do something sensible here and we use the following DWARF
2420 register numbers. Note that these are all stack-top-relative
2421 numbers.
2422 11 for %st(0) (gcc regno = 8)
2423 12 for %st(1) (gcc regno = 9)
2424 13 for %st(2) (gcc regno = 10)
2425 14 for %st(3) (gcc regno = 11)
2426 15 for %st(4) (gcc regno = 12)
2427 16 for %st(5) (gcc regno = 13)
2428 17 for %st(6) (gcc regno = 14)
2429 18 for %st(7) (gcc regno = 15)
2430 */
2431 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2432 {
2433 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2434 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2435 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2436 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2437 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2438 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2439 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2440 };
2441
2442 /* Define parameter passing and return registers. */
2443
2444 static int const x86_64_int_parameter_registers[6] =
2445 {
2446 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2447 };
2448
2449 static int const x86_64_ms_abi_int_parameter_registers[4] =
2450 {
2451 CX_REG, DX_REG, R8_REG, R9_REG
2452 };
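/* The SysV AMD64 ABI passes the first six integer arguments in rdi, rsi, rdx,
   rcx, r8 and r9; the Microsoft x64 ABI passes the first four in rcx, rdx, r8
   and r9, as the two tables above reflect.  */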
2453
2454 static int const x86_64_int_return_registers[4] =
2455 {
2456 AX_REG, DX_REG, DI_REG, SI_REG
2457 };
2458
2459 /* Define the structure for the machine field in struct function. */
2460
2461 struct GTY(()) stack_local_entry {
2462 unsigned short mode;
2463 unsigned short n;
2464 rtx rtl;
2465 struct stack_local_entry *next;
2466 };
2467
2468 /* Structure describing stack frame layout.
2469 Stack grows downward:
2470
2471 [arguments]
2472 <- ARG_POINTER
2473 saved pc
2474
2475 saved static chain if ix86_static_chain_on_stack
2476
2477 saved frame pointer if frame_pointer_needed
2478 <- HARD_FRAME_POINTER
2479 [saved regs]
2480 <- regs_save_offset
2481 [padding0]
2482
2483 [saved SSE regs]
2484 <- sse_regs_save_offset
2485 [padding1] |
2486 | <- FRAME_POINTER
2487 [va_arg registers] |
2488 |
2489 [frame] |
2490 |
2491 [padding2] | = to_allocate
2492 <- STACK_POINTER
2493 */
2494 struct ix86_frame
2495 {
2496 int nsseregs;
2497 int nregs;
2498 int va_arg_size;
2499 int red_zone_size;
2500 int outgoing_arguments_size;
2501
2502 /* The offsets relative to ARG_POINTER. */
2503 HOST_WIDE_INT frame_pointer_offset;
2504 HOST_WIDE_INT hard_frame_pointer_offset;
2505 HOST_WIDE_INT stack_pointer_offset;
2506 HOST_WIDE_INT hfp_save_offset;
2507 HOST_WIDE_INT reg_save_offset;
2508 HOST_WIDE_INT sse_reg_save_offset;
2509
2510 /* When save_regs_using_mov is set, emit prologue using
2511 move instead of push instructions. */
2512 bool save_regs_using_mov;
2513 };
2514
2515 /* Which cpu are we scheduling for. */
2516 enum attr_cpu ix86_schedule;
2517
2518 /* Which cpu are we optimizing for. */
2519 enum processor_type ix86_tune;
2520
2521 /* Which instruction set architecture to use. */
2522 enum processor_type ix86_arch;
2523
2524 /* True if processor has SSE prefetch instruction. */
2525 unsigned char x86_prefetch_sse;
2526
2527 /* -mstackrealign option */
2528 static const char ix86_force_align_arg_pointer_string[]
2529 = "force_align_arg_pointer";
2530
2531 static rtx (*ix86_gen_leave) (void);
2532 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2533 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2534 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2535 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2536 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2537 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2538 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2539 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2540 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2541 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2542 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2543
2544 /* Preferred alignment for stack boundary in bits. */
2545 unsigned int ix86_preferred_stack_boundary;
2546
2547 /* Alignment for incoming stack boundary in bits specified at
2548 command line. */
2549 static unsigned int ix86_user_incoming_stack_boundary;
2550
2551 /* Default alignment for incoming stack boundary in bits. */
2552 static unsigned int ix86_default_incoming_stack_boundary;
2553
2554 /* Alignment for incoming stack boundary in bits. */
2555 unsigned int ix86_incoming_stack_boundary;
2556
2557 /* Calling abi specific va_list type nodes. */
2558 static GTY(()) tree sysv_va_list_type_node;
2559 static GTY(()) tree ms_va_list_type_node;
2560
2561 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2562 char internal_label_prefix[16];
2563 int internal_label_prefix_len;
2564
2565 /* Fence to use after loop using movnt. */
2566 tree x86_mfence;
2567
2568 /* Register class used for passing a given 64-bit part of the argument.
2569 These represent classes as documented by the psABI, with the exception
2570 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2571 uses SF or DFmode moves instead of DImode ones to avoid reformatting penalties.
2572 
2573 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2574 whenever possible (the upper half contains padding). */
2575 enum x86_64_reg_class
2576 {
2577 X86_64_NO_CLASS,
2578 X86_64_INTEGER_CLASS,
2579 X86_64_INTEGERSI_CLASS,
2580 X86_64_SSE_CLASS,
2581 X86_64_SSESF_CLASS,
2582 X86_64_SSEDF_CLASS,
2583 X86_64_SSEUP_CLASS,
2584 X86_64_X87_CLASS,
2585 X86_64_X87UP_CLASS,
2586 X86_64_COMPLEX_X87_CLASS,
2587 X86_64_MEMORY_CLASS
2588 };
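/* Illustrative example of the psABI classification these values implement:
   a struct { double d; int i; } spans two eightbytes, classified roughly as
   X86_64_SSEDF_CLASS and X86_64_INTEGERSI_CLASS, so it is passed in one SSE
   register and one integer register.  */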
2589
2590 #define MAX_CLASSES 4
2591
2592 /* Table of constants used by fldpi, fldln2, etc.... */
2593 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2594 static bool ext_80387_constants_init = 0;
2595
2596 \f
2597 static struct machine_function * ix86_init_machine_status (void);
2598 static rtx ix86_function_value (const_tree, const_tree, bool);
2599 static bool ix86_function_value_regno_p (const unsigned int);
2600 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2601 const_tree);
2602 static rtx ix86_static_chain (const_tree, bool);
2603 static int ix86_function_regparm (const_tree, const_tree);
2604 static void ix86_compute_frame_layout (struct ix86_frame *);
2605 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2606 rtx, rtx, int);
2607 static void ix86_add_new_builtins (HOST_WIDE_INT);
2608 static tree ix86_canonical_va_list_type (tree);
2609 static void predict_jump (int);
2610 static unsigned int split_stack_prologue_scratch_regno (void);
2611 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2612
2613 enum ix86_function_specific_strings
2614 {
2615 IX86_FUNCTION_SPECIFIC_ARCH,
2616 IX86_FUNCTION_SPECIFIC_TUNE,
2617 IX86_FUNCTION_SPECIFIC_MAX
2618 };
2619
2620 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2621 const char *, enum fpmath_unit, bool);
2622 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2623 static void ix86_function_specific_save (struct cl_target_option *);
2624 static void ix86_function_specific_restore (struct cl_target_option *);
2625 static void ix86_function_specific_print (FILE *, int,
2626 struct cl_target_option *);
2627 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2628 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2629 struct gcc_options *);
2630 static bool ix86_can_inline_p (tree, tree);
2631 static void ix86_set_current_function (tree);
2632 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2633
2634 static enum calling_abi ix86_function_abi (const_tree);
2635
2636 \f
2637 #ifndef SUBTARGET32_DEFAULT_CPU
2638 #define SUBTARGET32_DEFAULT_CPU "i386"
2639 #endif
2640
2641 /* The svr4 ABI for the i386 says that records and unions are returned
2642 in memory. */
2643 #ifndef DEFAULT_PCC_STRUCT_RETURN
2644 #define DEFAULT_PCC_STRUCT_RETURN 1
2645 #endif
2646
2647 /* Whether -mtune= or -march= were specified */
2648 static int ix86_tune_defaulted;
2649 static int ix86_arch_specified;
2650
2651 /* Vectorization library interface and handlers. */
2652 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2653
2654 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2655 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2656
2657 /* Processor target table, indexed by processor number */
2658 struct ptt
2659 {
2660 const struct processor_costs *cost; /* Processor costs */
2661 const int align_loop; /* Default alignments. */
2662 const int align_loop_max_skip;
2663 const int align_jump;
2664 const int align_jump_max_skip;
2665 const int align_func;
2666 };
2667
2668 static const struct ptt processor_target_table[PROCESSOR_max] =
2669 {
2670 {&i386_cost, 4, 3, 4, 3, 4},
2671 {&i486_cost, 16, 15, 16, 15, 16},
2672 {&pentium_cost, 16, 7, 16, 7, 16},
2673 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2674 {&geode_cost, 0, 0, 0, 0, 0},
2675 {&k6_cost, 32, 7, 32, 7, 32},
2676 {&athlon_cost, 16, 7, 16, 7, 16},
2677 {&pentium4_cost, 0, 0, 0, 0, 0},
2678 {&k8_cost, 16, 7, 16, 7, 16},
2679 {&nocona_cost, 0, 0, 0, 0, 0},
2680 /* Core 2 32-bit. */
2681 {&generic32_cost, 16, 10, 16, 10, 16},
2682 /* Core 2 64-bit. */
2683 {&generic64_cost, 16, 10, 16, 10, 16},
2684 /* Core i7 32-bit. */
2685 {&generic32_cost, 16, 10, 16, 10, 16},
2686 /* Core i7 64-bit. */
2687 {&generic64_cost, 16, 10, 16, 10, 16},
2688 {&generic32_cost, 16, 7, 16, 7, 16},
2689 {&generic64_cost, 16, 10, 16, 10, 16},
2690 {&amdfam10_cost, 32, 24, 32, 7, 32},
2691 {&bdver1_cost, 32, 24, 32, 7, 32},
2692 {&bdver2_cost, 32, 24, 32, 7, 32},
2693 {&btver1_cost, 32, 24, 32, 7, 32},
2694 {&btver2_cost, 32, 24, 32, 7, 32},
2695 {&atom_cost, 16, 15, 16, 7, 16}
2696 };
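/* Each row above follows struct ptt: the cost table, then align_loop,
   align_loop_max_skip, align_jump, align_jump_max_skip and align_func,
   indexed by processor number.  */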
2697
2698 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2699 {
2700 "generic",
2701 "i386",
2702 "i486",
2703 "pentium",
2704 "pentium-mmx",
2705 "pentiumpro",
2706 "pentium2",
2707 "pentium3",
2708 "pentium4",
2709 "pentium-m",
2710 "prescott",
2711 "nocona",
2712 "core2",
2713 "corei7",
2714 "atom",
2715 "geode",
2716 "k6",
2717 "k6-2",
2718 "k6-3",
2719 "athlon",
2720 "athlon-4",
2721 "k8",
2722 "amdfam10",
2723 "bdver1",
2724 "bdver2",
2725 "btver1",
2726 "btver2"
2727 };
2728 \f
2729 /* Return true if a red-zone is in use. */
2730
2731 static inline bool
2732 ix86_using_red_zone (void)
2733 {
2734 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2735 }
2736 \f
2737 /* Return a string that documents the current -m options. The caller is
2738 responsible for freeing the string. */
2739
2740 static char *
2741 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2742 const char *tune, enum fpmath_unit fpmath,
2743 bool add_nl_p)
2744 {
2745 struct ix86_target_opts
2746 {
2747 const char *option; /* option string */
2748 HOST_WIDE_INT mask; /* isa mask options */
2749 };
2750
2751 /* This table is ordered so that options like -msse4.2 that imply
2752 preceding options are matched first. */
2753 static struct ix86_target_opts isa_opts[] =
2754 {
2755 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2756 { "-mfma", OPTION_MASK_ISA_FMA },
2757 { "-mxop", OPTION_MASK_ISA_XOP },
2758 { "-mlwp", OPTION_MASK_ISA_LWP },
2759 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2760 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2761 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2762 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2763 { "-msse3", OPTION_MASK_ISA_SSE3 },
2764 { "-msse2", OPTION_MASK_ISA_SSE2 },
2765 { "-msse", OPTION_MASK_ISA_SSE },
2766 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2767 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2768 { "-mmmx", OPTION_MASK_ISA_MMX },
2769 { "-mabm", OPTION_MASK_ISA_ABM },
2770 { "-mbmi", OPTION_MASK_ISA_BMI },
2771 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2772 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2773 { "-mhle", OPTION_MASK_ISA_HLE },
2774 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2775 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2776 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2777 { "-madx", OPTION_MASK_ISA_ADX },
2778 { "-mtbm", OPTION_MASK_ISA_TBM },
2779 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2780 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2781 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2782 { "-maes", OPTION_MASK_ISA_AES },
2783 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2784 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2785 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2786 { "-mf16c", OPTION_MASK_ISA_F16C },
2787 { "-mrtm", OPTION_MASK_ISA_RTM },
2788 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2789 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2790 };
2791
2792 /* Flag options. */
2793 static struct ix86_target_opts flag_opts[] =
2794 {
2795 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2796 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2797 { "-m80387", MASK_80387 },
2798 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2799 { "-malign-double", MASK_ALIGN_DOUBLE },
2800 { "-mcld", MASK_CLD },
2801 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2802 { "-mieee-fp", MASK_IEEE_FP },
2803 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2804 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2805 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2806 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2807 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2808 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2809 { "-mno-red-zone", MASK_NO_RED_ZONE },
2810 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2811 { "-mrecip", MASK_RECIP },
2812 { "-mrtd", MASK_RTD },
2813 { "-msseregparm", MASK_SSEREGPARM },
2814 { "-mstack-arg-probe", MASK_STACK_PROBE },
2815 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2816 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2817 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2818 { "-mvzeroupper", MASK_VZEROUPPER },
2819 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2820 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2821 { "-mprefer-avx128", MASK_PREFER_AVX128},
2822 };
2823
2824 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
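  /* The "+ 6" reserves slots for the -march=, -mtune=, ABI and -mfpmath=
     entries plus the two "(other ...)" buffers filled in below.  */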
2825
2826 char isa_other[40];
2827 char target_other[40];
2828 unsigned num = 0;
2829 unsigned i, j;
2830 char *ret;
2831 char *ptr;
2832 size_t len;
2833 size_t line_len;
2834 size_t sep_len;
2835 const char *abi;
2836
2837 memset (opts, '\0', sizeof (opts));
2838
2839 /* Add -march= option. */
2840 if (arch)
2841 {
2842 opts[num][0] = "-march=";
2843 opts[num++][1] = arch;
2844 }
2845
2846 /* Add -mtune= option. */
2847 if (tune)
2848 {
2849 opts[num][0] = "-mtune=";
2850 opts[num++][1] = tune;
2851 }
2852
2853 /* Add -m32/-m64/-mx32. */
2854 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2855 {
2856 if ((isa & OPTION_MASK_ABI_64) != 0)
2857 abi = "-m64";
2858 else
2859 abi = "-mx32";
2860 isa &= ~ (OPTION_MASK_ISA_64BIT
2861 | OPTION_MASK_ABI_64
2862 | OPTION_MASK_ABI_X32);
2863 }
2864 else
2865 abi = "-m32";
2866 opts[num++][0] = abi;
2867
2868 /* Pick out the options in isa options. */
2869 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2870 {
2871 if ((isa & isa_opts[i].mask) != 0)
2872 {
2873 opts[num++][0] = isa_opts[i].option;
2874 isa &= ~ isa_opts[i].mask;
2875 }
2876 }
2877
2878 if (isa && add_nl_p)
2879 {
2880 opts[num++][0] = isa_other;
2881 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2882 isa);
2883 }
2884
2885 /* Add flag options. */
2886 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2887 {
2888 if ((flags & flag_opts[i].mask) != 0)
2889 {
2890 opts[num++][0] = flag_opts[i].option;
2891 flags &= ~ flag_opts[i].mask;
2892 }
2893 }
2894
2895 if (flags && add_nl_p)
2896 {
2897 opts[num++][0] = target_other;
2898 sprintf (target_other, "(other flags: %#x)", flags);
2899 }
2900
2901 /* Add -fpmath= option. */
2902 if (fpmath)
2903 {
2904 opts[num][0] = "-mfpmath=";
2905 switch ((int) fpmath)
2906 {
2907 case FPMATH_387:
2908 opts[num++][1] = "387";
2909 break;
2910
2911 case FPMATH_SSE:
2912 opts[num++][1] = "sse";
2913 break;
2914
2915 case FPMATH_387 | FPMATH_SSE:
2916 opts[num++][1] = "sse+387";
2917 break;
2918
2919 default:
2920 gcc_unreachable ();
2921 }
2922 }
2923
2924 /* Any options? */
2925 if (num == 0)
2926 return NULL;
2927
2928 gcc_assert (num < ARRAY_SIZE (opts));
2929
2930 /* Size the string. */
2931 len = 0;
2932 sep_len = (add_nl_p) ? 3 : 1;
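  /* Three separator characters (space, backslash, newline) when wrapping
     with add_nl_p; otherwise just the single space emitted between options.  */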
2933 for (i = 0; i < num; i++)
2934 {
2935 len += sep_len;
2936 for (j = 0; j < 2; j++)
2937 if (opts[i][j])
2938 len += strlen (opts[i][j]);
2939 }
2940
2941 /* Build the string. */
2942 ret = ptr = (char *) xmalloc (len);
2943 line_len = 0;
2944
2945 for (i = 0; i < num; i++)
2946 {
2947 size_t len2[2];
2948
2949 for (j = 0; j < 2; j++)
2950 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2951
2952 if (i != 0)
2953 {
2954 *ptr++ = ' ';
2955 line_len++;
2956
2957 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2958 {
2959 *ptr++ = '\\';
2960 *ptr++ = '\n';
2961 line_len = 0;
2962 }
2963 }
2964
2965 for (j = 0; j < 2; j++)
2966 if (opts[i][j])
2967 {
2968 memcpy (ptr, opts[i][j], len2[j]);
2969 ptr += len2[j];
2970 line_len += len2[j];
2971 }
2972 }
2973
2974 *ptr = '\0';
2975 gcc_assert (ret + len >= ptr);
2976
2977 return ret;
2978 }
2979
2980 /* Return true if profiling code should be emitted before the
2981 prologue, false otherwise.
2982 Note: on x86 this is the case only when -mfentry is in effect. */
2983 static bool
2984 ix86_profile_before_prologue (void)
2985 {
2986 return flag_fentry != 0;
2987 }
2988
2989 /* Function that is callable from the debugger to print the current
2990 options. */
2991 void
2992 ix86_debug_options (void)
2993 {
2994 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2995 ix86_arch_string, ix86_tune_string,
2996 ix86_fpmath, true);
2997
2998 if (opts)
2999 {
3000 fprintf (stderr, "%s\n\n", opts);
3001 free (opts);
3002 }
3003 else
3004 fputs ("<no options>\n\n", stderr);
3005
3006 return;
3007 }
3008 \f
3009 /* Override various settings based on options. If MAIN_ARGS_P, the
3010 options are from the command line, otherwise they are from
3011 attributes. */
3012
3013 static void
3014 ix86_option_override_internal (bool main_args_p)
3015 {
3016 int i;
3017 unsigned int ix86_arch_mask, ix86_tune_mask;
3018 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3019 const char *prefix;
3020 const char *suffix;
3021 const char *sw;
3022
3023 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3024 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3025 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3026 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3027 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3028 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3029 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3030 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3031 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3032 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3033 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3034 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3035 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3036 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3037 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3038 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3039 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3040 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3041 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3042 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3043 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3044 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3045 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3046 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3047 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3048 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3049 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3050 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3051 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3052 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3053 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3054 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3055 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3056 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3057 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3058 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3059 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3060 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3061 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3062 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
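/* A sketch of how these bits are meant to be tested against an entry of the
   processor_alias_table below (illustrative only; the real handling also
   checks which options the user set explicitly):

     if (processor_alias_table[i].flags & PTA_SSE2)
       ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
*/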
3063
3064 /* If this reaches 64, the flags field in struct pta below needs to be widened. */
3065
3066 static struct pta
3067 {
3068 const char *const name; /* processor name or nickname. */
3069 const enum processor_type processor;
3070 const enum attr_cpu schedule;
3071 const unsigned HOST_WIDE_INT flags;
3072 }
3073 const processor_alias_table[] =
3074 {
3075 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3076 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3077 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3078 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3079 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3080 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3081 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3082 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3083 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
3084 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3085 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3086 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3087 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3088 PTA_MMX | PTA_SSE | PTA_FXSR},
3089 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3090 PTA_MMX | PTA_SSE | PTA_FXSR},
3091 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3092 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3093 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3094 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3095 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3096 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3097 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3098 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3099 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3100 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3101 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3102 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3103 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3104 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3105 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3106 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3107 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
3108 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3109 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3110 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3111 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3112 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3113 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3116 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3117 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3118 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3119 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3120 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3121 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3122 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3123 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3124 | PTA_XSAVEOPT},
3125 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3126 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3127 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3128 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3129 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3130 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3131 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3132 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3133 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3134 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3135 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3136 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3137 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3138 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3139 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3140 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3141 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3142 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3143 {"x86-64", PROCESSOR_K8, CPU_K8,
3144 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3145 {"k8", PROCESSOR_K8, CPU_K8,
3146 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3147 | PTA_SSE2 | PTA_NO_SAHF},
3148 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3149 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3150 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3151 {"opteron", PROCESSOR_K8, CPU_K8,
3152 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3153 | PTA_SSE2 | PTA_NO_SAHF},
3154 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3155 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3156 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3157 {"athlon64", PROCESSOR_K8, CPU_K8,
3158 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3159 | PTA_SSE2 | PTA_NO_SAHF},
3160 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3161 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3162 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3163 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3164 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3165 | PTA_SSE2 | PTA_NO_SAHF},
3166 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3167 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3168 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3169 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3172 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3173 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3174 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3175 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3176 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3177 | PTA_XSAVEOPT},
3178 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3179 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3180 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3181 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3182 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3183 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3184 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3185 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3186 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3187 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3188 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
3189 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3190 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3191 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3192 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3193 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3194
3195 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3196 PTA_HLE /* flags are only used for -march switch. */ },
3197 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3198 PTA_64BIT
3199 | PTA_HLE /* flags are only used for -march switch. */ },
3200 };
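/* For illustration (added note, not from the original code): the option-override
   code further down looks up the -march=/-mtune= strings in this table.
   For example, -march=core2 matches the "core2" entry, which sets ix86_arch
   to PROCESSOR_CORE2_64 and ix86_schedule to CPU_CORE2, and each PTA_* bit
   (MMX, SSE..SSSE3, CX16, FXSR) is turned into the corresponding
   OPTION_MASK_ISA_* bit unless the user set that ISA flag explicitly.  */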
3201
3202 /* -mrecip options. */
3203 static struct
3204 {
3205 const char *string; /* option name */
3206 unsigned int mask; /* mask bits to set */
3207 }
3208 const recip_options[] =
3209 {
3210 { "all", RECIP_MASK_ALL },
3211 { "none", RECIP_MASK_NONE },
3212 { "div", RECIP_MASK_DIV },
3213 { "sqrt", RECIP_MASK_SQRT },
3214 { "vec-div", RECIP_MASK_VEC_DIV },
3215 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3216 };
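/* For illustration (added note, not from the original code): each entry maps
   one comma-separated token accepted by -mrecip= to the RECIP_MASK_* bits it
   controls; the -mrecip parsing loop later in this function consumes this
   table.  */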
3217
3218 int const pta_size = ARRAY_SIZE (processor_alias_table);
3219
3220 /* Set up prefix/suffix so the error messages refer to either the command
3221 line argument, or the attribute(target). */
3222 if (main_args_p)
3223 {
3224 prefix = "-m";
3225 suffix = "";
3226 sw = "switch";
3227 }
3228 else
3229 {
3230 prefix = "option(\"";
3231 suffix = "\")";
3232 sw = "attribute";
3233 }
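/* For illustration (added note, not from the original code): with main_args_p
   the diagnostics below read like "-mtune=... switch", while for the target
   attribute the same format strings read like "option("tune=...") attribute".  */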
3234
3235 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3236 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3237 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3238 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3239 #ifdef TARGET_BI_ARCH
3240 else
3241 {
3242 #if TARGET_BI_ARCH == 1
3243 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3244 is on and OPTION_MASK_ABI_X32 is off. We turn off
3245 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3246 -mx32. */
3247 if (TARGET_X32)
3248 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3249 #else
3250 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3251 on and OPTION_MASK_ABI_64 is off. We turn off
3252 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3253 -m64. */
3254 if (TARGET_LP64)
3255 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3256 #endif
3257 }
3258 #endif
3259
3260 if (TARGET_X32)
3261 {
3262 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3263 OPTION_MASK_ABI_64 for TARGET_X32. */
3264 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3265 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3266 }
3267 else if (TARGET_LP64)
3268 {
3269 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3270 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3271 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3272 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3273 }
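/* For illustration (added note, not from the original code): after this point
   code compiled for a 64-bit target has exactly one of the two 64-bit ABIs
   selected, LP64 (-m64) or x32 (-mx32); -m32 selects neither.  */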
3274
3275 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3276 SUBTARGET_OVERRIDE_OPTIONS;
3277 #endif
3278
3279 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3280 SUBSUBTARGET_OVERRIDE_OPTIONS;
3281 #endif
3282
3283 /* -fPIC is the default for 64-bit Darwin (Mach-O). */
3284 if (TARGET_MACHO && TARGET_64BIT)
3285 flag_pic = 2;
3286
3287 /* Need to check -mtune=generic first. */
3288 if (ix86_tune_string)
3289 {
3290 if (!strcmp (ix86_tune_string, "generic")
3291 || !strcmp (ix86_tune_string, "i686")
3292 /* As special support for cross compilers we read -mtune=native
3293 as -mtune=generic. With native compilers we won't see the
3294 -mtune=native, as it was changed by the driver. */
3295 || !strcmp (ix86_tune_string, "native"))
3296 {
3297 if (TARGET_64BIT)
3298 ix86_tune_string = "generic64";
3299 else
3300 ix86_tune_string = "generic32";
3301 }
3302 /* If this call is for setting the option attribute, allow the
3303 generic32/generic64 that was previously set. */
3304 else if (!main_args_p
3305 && (!strcmp (ix86_tune_string, "generic32")
3306 || !strcmp (ix86_tune_string, "generic64")))
3307 ;
3308 else if (!strncmp (ix86_tune_string, "generic", 7))
3309 error ("bad value (%s) for %stune=%s %s",
3310 ix86_tune_string, prefix, suffix, sw);
3311 else if (!strcmp (ix86_tune_string, "x86-64"))
3312 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3313 "%stune=k8%s or %stune=generic%s instead as appropriate",
3314 prefix, suffix, prefix, suffix, prefix, suffix);
3315 }
3316 else
3317 {
3318 if (ix86_arch_string)
3319 ix86_tune_string = ix86_arch_string;
3320 if (!ix86_tune_string)
3321 {
3322 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3323 ix86_tune_defaulted = 1;
3324 }
3325
3326 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3327 need to use a sensible tune option. */
3328 if (!strcmp (ix86_tune_string, "generic")
3329 || !strcmp (ix86_tune_string, "x86-64")
3330 || !strcmp (ix86_tune_string, "i686"))
3331 {
3332 if (TARGET_64BIT)
3333 ix86_tune_string = "generic64";
3334 else
3335 ix86_tune_string = "generic32";
3336 }
3337 }
3338
3339 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3340 {
3341 /* rep; movq isn't available in 32-bit code. */
3342 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3343 ix86_stringop_alg = no_stringop;
3344 }
3345
3346 if (!ix86_arch_string)
3347 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3348 else
3349 ix86_arch_specified = 1;
3350
3351 if (global_options_set.x_ix86_pmode)
3352 {
3353 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3354 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3355 error ("address mode %qs not supported in the %s bit mode",
3356 TARGET_64BIT ? "short" : "long",
3357 TARGET_64BIT ? "64" : "32");
3358 }
3359 else
3360 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3361
3362 if (!global_options_set.x_ix86_abi)
3363 ix86_abi = DEFAULT_ABI;
3364
3365 if (global_options_set.x_ix86_cmodel)
3366 {
3367 switch (ix86_cmodel)
3368 {
3369 case CM_SMALL:
3370 case CM_SMALL_PIC:
3371 if (flag_pic)
3372 ix86_cmodel = CM_SMALL_PIC;
3373 if (!TARGET_64BIT)
3374 error ("code model %qs not supported in the %s bit mode",
3375 "small", "32");
3376 break;
3377
3378 case CM_MEDIUM:
3379 case CM_MEDIUM_PIC:
3380 if (flag_pic)
3381 ix86_cmodel = CM_MEDIUM_PIC;
3382 if (!TARGET_64BIT)
3383 error ("code model %qs not supported in the %s bit mode",
3384 "medium", "32");
3385 else if (TARGET_X32)
3386 error ("code model %qs not supported in x32 mode",
3387 "medium");
3388 break;
3389
3390 case CM_LARGE:
3391 case CM_LARGE_PIC:
3392 if (flag_pic)
3393 ix86_cmodel = CM_LARGE_PIC;
3394 if (!TARGET_64BIT)
3395 error ("code model %qs not supported in the %s bit mode",
3396 "large", "32");
3397 else if (TARGET_X32)
3398 error ("code model %qs not supported in x32 mode",
3399 "large");
3400 break;
3401
3402 case CM_32:
3403 if (flag_pic)
3404 error ("code model %s does not support PIC mode", "32");
3405 if (TARGET_64BIT)
3406 error ("code model %qs not supported in the %s bit mode",
3407 "32", "64");
3408 break;
3409
3410 case CM_KERNEL:
3411 if (flag_pic)
3412 {
3413 error ("code model %s does not support PIC mode", "kernel");
3414 ix86_cmodel = CM_32;
3415 }
3416 if (!TARGET_64BIT)
3417 error ("code model %qs not supported in the %s bit mode",
3418 "kernel", "32");
3419 break;
3420
3421 default:
3422 gcc_unreachable ();
3423 }
3424 }
3425 else
3426 {
3427 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3428 use of rip-relative addressing. This eliminates fixups that
3429 would otherwise be needed if this object is to be placed in a
3430 DLL, and is essentially just as efficient as direct addressing. */
3431 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3432 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3433 else if (TARGET_64BIT)
3434 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3435 else
3436 ix86_cmodel = CM_32;
3437 }
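/* For illustration (added note, not from the original code): without an
   explicit -mcmodel=, plain -m64 ends up as CM_SMALL, -m64 -fpic as
   CM_SMALL_PIC, 32-bit compilation as CM_32, and the 64-bit MS ABI forces
   CM_SMALL_PIC with PIC enabled.  */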
3438 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3439 {
3440 error ("-masm=intel not supported in this configuration");
3441 ix86_asm_dialect = ASM_ATT;
3442 }
3443 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3444 sorry ("%i-bit mode not compiled in",
3445 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3446
3447 for (i = 0; i < pta_size; i++)
3448 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3449 {
3450 ix86_schedule = processor_alias_table[i].schedule;
3451 ix86_arch = processor_alias_table[i].processor;
3452 /* Default cpu tuning to the architecture. */
3453 ix86_tune = ix86_arch;
3454
3455 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3456 error ("CPU you selected does not support x86-64 "
3457 "instruction set");
3458
3459 if (processor_alias_table[i].flags & PTA_MMX
3460 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3461 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3462 if (processor_alias_table[i].flags & PTA_3DNOW
3463 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3464 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3465 if (processor_alias_table[i].flags & PTA_3DNOW_A
3466 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3467 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3468 if (processor_alias_table[i].flags & PTA_SSE
3469 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3470 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3471 if (processor_alias_table[i].flags & PTA_SSE2
3472 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3473 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3474 if (processor_alias_table[i].flags & PTA_SSE3
3475 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3476 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3477 if (processor_alias_table[i].flags & PTA_SSSE3
3478 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3479 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3480 if (processor_alias_table[i].flags & PTA_SSE4_1
3481 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3482 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3483 if (processor_alias_table[i].flags & PTA_SSE4_2
3484 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3485 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3486 if (processor_alias_table[i].flags & PTA_AVX
3487 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3488 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3489 if (processor_alias_table[i].flags & PTA_AVX2
3490 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3491 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3492 if (processor_alias_table[i].flags & PTA_FMA
3493 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3494 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3495 if (processor_alias_table[i].flags & PTA_SSE4A
3496 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3497 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3498 if (processor_alias_table[i].flags & PTA_FMA4
3499 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3500 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3501 if (processor_alias_table[i].flags & PTA_XOP
3502 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3503 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3504 if (processor_alias_table[i].flags & PTA_LWP
3505 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3506 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3507 if (processor_alias_table[i].flags & PTA_ABM
3508 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3509 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3510 if (processor_alias_table[i].flags & PTA_BMI
3511 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3512 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3513 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3514 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3515 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3516 if (processor_alias_table[i].flags & PTA_TBM
3517 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3518 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3519 if (processor_alias_table[i].flags & PTA_BMI2
3520 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3521 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3522 if (processor_alias_table[i].flags & PTA_CX16
3523 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3524 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3525 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3526 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3527 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3528 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3529 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3530 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3531 if (processor_alias_table[i].flags & PTA_MOVBE
3532 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3533 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3534 if (processor_alias_table[i].flags & PTA_AES
3535 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3536 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3537 if (processor_alias_table[i].flags & PTA_PCLMUL
3538 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3539 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3540 if (processor_alias_table[i].flags & PTA_FSGSBASE
3541 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3542 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3543 if (processor_alias_table[i].flags & PTA_RDRND
3544 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3545 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3546 if (processor_alias_table[i].flags & PTA_F16C
3547 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3548 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3549 if (processor_alias_table[i].flags & PTA_RTM
3550 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3551 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3552 if (processor_alias_table[i].flags & PTA_HLE
3553 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3554 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3555 if (processor_alias_table[i].flags & PTA_PRFCHW
3556 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3557 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3558 if (processor_alias_table[i].flags & PTA_RDSEED
3559 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3560 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3561 if (processor_alias_table[i].flags & PTA_ADX
3562 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3563 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3564 if (processor_alias_table[i].flags & PTA_FXSR
3565 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3566 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3567 if (processor_alias_table[i].flags & PTA_XSAVE
3568 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3569 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3570 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3571 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3572 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3573 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3574 x86_prefetch_sse = true;
3575
3576 break;
3577 }
3578
3579 if (!strcmp (ix86_arch_string, "generic"))
3580 error ("generic CPU can be used only for %stune=%s %s",
3581 prefix, suffix, sw);
3582 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3583 error ("bad value (%s) for %sarch=%s %s",
3584 ix86_arch_string, prefix, suffix, sw);
3585
3586 ix86_arch_mask = 1u << ix86_arch;
3587 for (i = 0; i < X86_ARCH_LAST; ++i)
3588 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3589
3590 for (i = 0; i < pta_size; i++)
3591 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3592 {
3593 ix86_schedule = processor_alias_table[i].schedule;
3594 ix86_tune = processor_alias_table[i].processor;
3595 if (TARGET_64BIT)
3596 {
3597 if (!(processor_alias_table[i].flags & PTA_64BIT))
3598 {
3599 if (ix86_tune_defaulted)
3600 {
3601 ix86_tune_string = "x86-64";
3602 for (i = 0; i < pta_size; i++)
3603 if (! strcmp (ix86_tune_string,
3604 processor_alias_table[i].name))
3605 break;
3606 ix86_schedule = processor_alias_table[i].schedule;
3607 ix86_tune = processor_alias_table[i].processor;
3608 }
3609 else
3610 error ("CPU you selected does not support x86-64 "
3611 "instruction set");
3612 }
3613 }
3614 else
3615 {
3616 /* Adjust tuning when compiling for 32-bit ABI. */
3617 switch (ix86_tune)
3618 {
3619 case PROCESSOR_GENERIC64:
3620 ix86_tune = PROCESSOR_GENERIC32;
3621 ix86_schedule = CPU_PENTIUMPRO;
3622 break;
3623
3624 case PROCESSOR_CORE2_64:
3625 ix86_tune = PROCESSOR_CORE2_32;
3626 break;
3627
3628 case PROCESSOR_COREI7_64:
3629 ix86_tune = PROCESSOR_COREI7_32;
3630 break;
3631
3632 default:
3633 break;
3634 }
3635 }
3636 /* Intel CPUs have always interpreted SSE prefetch instructions as
3637 NOPs; so, we can enable SSE prefetch instructions even when
3638 -mtune (rather than -march) points us to a processor that has them.
3639 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3640 higher processors. */
3641 if (TARGET_CMOV
3642 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3643 x86_prefetch_sse = true;
3644 break;
3645 }
3646
3647 if (ix86_tune_specified && i == pta_size)
3648 error ("bad value (%s) for %stune=%s %s",
3649 ix86_tune_string, prefix, suffix, sw);
3650
3651 ix86_tune_mask = 1u << ix86_tune;
3652 for (i = 0; i < X86_TUNE_LAST; ++i)
3653 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3654
3655 #ifndef USE_IX86_FRAME_POINTER
3656 #define USE_IX86_FRAME_POINTER 0
3657 #endif
3658
3659 #ifndef USE_X86_64_FRAME_POINTER
3660 #define USE_X86_64_FRAME_POINTER 0
3661 #endif
3662
3663 /* Set the default values for switches whose default depends on TARGET_64BIT
3664 in case they weren't overwritten by command line options. */
3665 if (TARGET_64BIT)
3666 {
3667 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3668 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3669 if (flag_asynchronous_unwind_tables == 2)
3670 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3671 if (flag_pcc_struct_return == 2)
3672 flag_pcc_struct_return = 0;
3673 }
3674 else
3675 {
3676 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3677 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3678 if (flag_asynchronous_unwind_tables == 2)
3679 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3680 if (flag_pcc_struct_return == 2)
3681 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3682 }
3683
3684 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3685 if (optimize_size)
3686 ix86_cost = &ix86_size_cost;
3687 else
3688 ix86_cost = ix86_tune_cost;
3689
3690 /* Arrange to set up i386_stack_locals for all functions. */
3691 init_machine_status = ix86_init_machine_status;
3692
3693 /* Validate -mregparm= value. */
3694 if (global_options_set.x_ix86_regparm)
3695 {
3696 if (TARGET_64BIT)
3697 warning (0, "-mregparm is ignored in 64-bit mode");
3698 if (ix86_regparm > REGPARM_MAX)
3699 {
3700 error ("-mregparm=%d is not between 0 and %d",
3701 ix86_regparm, REGPARM_MAX);
3702 ix86_regparm = 0;
3703 }
3704 }
3705 if (TARGET_64BIT)
3706 ix86_regparm = REGPARM_MAX;
3707
3708 /* Default align_* from the processor table. */
3709 if (align_loops == 0)
3710 {
3711 align_loops = processor_target_table[ix86_tune].align_loop;
3712 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3713 }
3714 if (align_jumps == 0)
3715 {
3716 align_jumps = processor_target_table[ix86_tune].align_jump;
3717 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3718 }
3719 if (align_functions == 0)
3720 {
3721 align_functions = processor_target_table[ix86_tune].align_func;
3722 }
3723
3724 /* Provide default for -mbranch-cost= value. */
3725 if (!global_options_set.x_ix86_branch_cost)
3726 ix86_branch_cost = ix86_cost->branch_cost;
3727
3728 if (TARGET_64BIT)
3729 {
3730 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3731
3732 /* Enable by default the SSE and MMX builtins. Do allow the user to
3733 explicitly disable any of these. In particular, disabling SSE and
3734 MMX for kernel code is extremely useful. */
3735 if (!ix86_arch_specified)
3736 ix86_isa_flags
3737 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3738 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3739
3740 if (TARGET_RTD)
3741 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3742 }
3743 else
3744 {
3745 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3746
3747 if (!ix86_arch_specified)
3748 ix86_isa_flags
3749 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3750
3751 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3752 when the programmer takes care to keep the stack from being destroyed. */
3753 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3754 target_flags |= MASK_NO_RED_ZONE;
3755 }
3756
3757 /* Keep nonleaf frame pointers. */
3758 if (flag_omit_frame_pointer)
3759 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3760 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3761 flag_omit_frame_pointer = 1;
3762
3763 /* If we're doing fast math, we don't care about comparison order
3764 wrt NaNs. This lets us use a shorter comparison sequence. */
3765 if (flag_finite_math_only)
3766 target_flags &= ~MASK_IEEE_FP;
3767
3768 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3769 since the insns won't need emulation. */
3770 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3771 target_flags &= ~MASK_NO_FANCY_MATH_387;
3772
3773 /* Likewise, if the target doesn't have a 387, or we've specified
3774 software floating point, don't use 387 inline intrinsics. */
3775 if (!TARGET_80387)
3776 target_flags |= MASK_NO_FANCY_MATH_387;
3777
3778 /* Turn on MMX builtins for -msse. */
3779 if (TARGET_SSE)
3780 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3781
3782 /* Enable SSE prefetch. */
3783 if (TARGET_SSE || TARGET_PRFCHW)
3784 x86_prefetch_sse = true;
3785
3786 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3787 if (TARGET_SSE4_2 || TARGET_ABM)
3788 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3789
3790 /* Turn on lzcnt instruction for -mabm. */
3791 if (TARGET_ABM)
3792 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3793
3794 /* Validate -mpreferred-stack-boundary= value or default it to
3795 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3796 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3797 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3798 {
3799 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3800 int max = (TARGET_SEH ? 4 : 12);
3801
3802 if (ix86_preferred_stack_boundary_arg < min
3803 || ix86_preferred_stack_boundary_arg > max)
3804 {
3805 if (min == max)
3806 error ("-mpreferred-stack-boundary is not supported "
3807 "for this target");
3808 else
3809 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3810 ix86_preferred_stack_boundary_arg, min, max);
3811 }
3812 else
3813 ix86_preferred_stack_boundary
3814 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3815 }
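/* For illustration (added note, not from the original code): with
   BITS_PER_UNIT == 8, -mpreferred-stack-boundary=4 yields
   (1 << 4) * 8 == 128 bits, i.e. a 16-byte stack alignment.  */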
3816
3817 /* Set the default value for -mstackrealign. */
3818 if (ix86_force_align_arg_pointer == -1)
3819 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3820
3821 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3822
3823 /* Validate -mincoming-stack-boundary= value or default it to
3824 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3825 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3826 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3827 {
3828 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3829 || ix86_incoming_stack_boundary_arg > 12)
3830 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3831 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3832 else
3833 {
3834 ix86_user_incoming_stack_boundary
3835 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3836 ix86_incoming_stack_boundary
3837 = ix86_user_incoming_stack_boundary;
3838 }
3839 }
3840
3841 /* Accept -msseregparm only if at least SSE support is enabled. */
3842 if (TARGET_SSEREGPARM
3843 && ! TARGET_SSE)
3844 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3845
3846 if (global_options_set.x_ix86_fpmath)
3847 {
3848 if (ix86_fpmath & FPMATH_SSE)
3849 {
3850 if (!TARGET_SSE)
3851 {
3852 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3853 ix86_fpmath = FPMATH_387;
3854 }
3855 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3856 {
3857 warning (0, "387 instruction set disabled, using SSE arithmetics");
3858 ix86_fpmath = FPMATH_SSE;
3859 }
3860 }
3861 }
3862 else
3863 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3864
3865 /* If the i387 is disabled, then do not return values in it. */
3866 if (!TARGET_80387)
3867 target_flags &= ~MASK_FLOAT_RETURNS;
3868
3869 /* Use an external vectorized library when vectorizing intrinsics. */
3870 if (global_options_set.x_ix86_veclibabi_type)
3871 switch (ix86_veclibabi_type)
3872 {
3873 case ix86_veclibabi_type_svml:
3874 ix86_veclib_handler = ix86_veclibabi_svml;
3875 break;
3876
3877 case ix86_veclibabi_type_acml:
3878 ix86_veclib_handler = ix86_veclibabi_acml;
3879 break;
3880
3881 default:
3882 gcc_unreachable ();
3883 }
3884
3885 if ((!USE_IX86_FRAME_POINTER
3886 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3887 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3888 && !optimize_size)
3889 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3890
3891 /* ??? Unwind info is not correct around the CFG unless either a frame
3892 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3893 unwind info generation to be aware of the CFG and propagating states
3894 around edges. */
3895 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3896 || flag_exceptions || flag_non_call_exceptions)
3897 && flag_omit_frame_pointer
3898 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3899 {
3900 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3901 warning (0, "unwind tables currently require either a frame pointer "
3902 "or %saccumulate-outgoing-args%s for correctness",
3903 prefix, suffix);
3904 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3905 }
3906
3907 /* If stack probes are required, the space used for large function
3908 arguments on the stack must also be probed, so enable
3909 -maccumulate-outgoing-args so this happens in the prologue. */
3910 if (TARGET_STACK_PROBE
3911 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3912 {
3913 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3914 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3915 "for correctness", prefix, suffix);
3916 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3917 }
3918
3919 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3920 {
3921 char *p;
3922 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3923 p = strchr (internal_label_prefix, 'X');
3924 internal_label_prefix_len = p - internal_label_prefix;
3925 *p = '\0';
3926 }
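/* For illustration (added note, not from the original code): if the macro
   expands the ("LX", 0) request to something like "*.LX0", as on typical ELF
   configurations, then internal_label_prefix becomes "*.L" with length 3.  */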
3927
3928 /* When no scheduling description is available, disable the scheduler pass
3929 so it won't slow down compilation or make x87 code slower. */
3930 if (!TARGET_SCHEDULE)
3931 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3932
3933 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3934 ix86_tune_cost->simultaneous_prefetches,
3935 global_options.x_param_values,
3936 global_options_set.x_param_values);
3937 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3938 ix86_tune_cost->prefetch_block,
3939 global_options.x_param_values,
3940 global_options_set.x_param_values);
3941 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3942 ix86_tune_cost->l1_cache_size,
3943 global_options.x_param_values,
3944 global_options_set.x_param_values);
3945 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3946 ix86_tune_cost->l2_cache_size,
3947 global_options.x_param_values,
3948 global_options_set.x_param_values);
3949
3950 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3951 if (flag_prefetch_loop_arrays < 0
3952 && HAVE_prefetch
3953 && optimize >= 3
3954 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3955 flag_prefetch_loop_arrays = 1;
3956
3957 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3958 can be optimized to ap = __builtin_next_arg (0). */
3959 if (!TARGET_64BIT && !flag_split_stack)
3960 targetm.expand_builtin_va_start = NULL;
3961
3962 if (TARGET_64BIT)
3963 {
3964 ix86_gen_leave = gen_leave_rex64;
3965 if (Pmode == DImode)
3966 {
3967 ix86_gen_monitor = gen_sse3_monitor64_di;
3968 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3969 ix86_gen_tls_local_dynamic_base_64
3970 = gen_tls_local_dynamic_base_64_di;
3971 }
3972 else
3973 {
3974 ix86_gen_monitor = gen_sse3_monitor64_si;
3975 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3976 ix86_gen_tls_local_dynamic_base_64
3977 = gen_tls_local_dynamic_base_64_si;
3978 }
3979 }
3980 else
3981 {
3982 ix86_gen_leave = gen_leave;
3983 ix86_gen_monitor = gen_sse3_monitor;
3984 }
3985
3986 if (Pmode == DImode)
3987 {
3988 ix86_gen_add3 = gen_adddi3;
3989 ix86_gen_sub3 = gen_subdi3;
3990 ix86_gen_sub3_carry = gen_subdi3_carry;
3991 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3992 ix86_gen_andsp = gen_anddi3;
3993 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3994 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3995 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3996 }
3997 else
3998 {
3999 ix86_gen_add3 = gen_addsi3;
4000 ix86_gen_sub3 = gen_subsi3;
4001 ix86_gen_sub3_carry = gen_subsi3_carry;
4002 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4003 ix86_gen_andsp = gen_andsi3;
4004 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4005 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4006 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4007 }
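/* For illustration (added note, not from the original code): caching the
   SImode/DImode generators here lets later code emit something like
   emit_insn (ix86_gen_add3 (dest, src, offset)) without re-checking Pmode
   at every use.  */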
4008
4009 #ifdef USE_IX86_CLD
4010 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4011 if (!TARGET_64BIT)
4012 target_flags |= MASK_CLD & ~target_flags_explicit;
4013 #endif
4014
4015 if (!TARGET_64BIT && flag_pic)
4016 {
4017 if (flag_fentry > 0)
4018 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4019 "with -fpic");
4020 flag_fentry = 0;
4021 }
4022 else if (TARGET_SEH)
4023 {
4024 if (flag_fentry == 0)
4025 sorry ("-mno-fentry isn%'t compatible with SEH");
4026 flag_fentry = 1;
4027 }
4028 else if (flag_fentry < 0)
4029 {
4030 #if defined(PROFILE_BEFORE_PROLOGUE)
4031 flag_fentry = 1;
4032 #else
4033 flag_fentry = 0;
4034 #endif
4035 }
4036
4037 if (TARGET_AVX)
4038 {
4039 /* When not optimizing for size, enable the vzeroupper optimization for
4040 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4041 AVX unaligned loads/stores. */
4042 if (!optimize_size)
4043 {
4044 if (flag_expensive_optimizations
4045 && !(target_flags_explicit & MASK_VZEROUPPER))
4046 target_flags |= MASK_VZEROUPPER;
4047 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4048 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4049 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4050 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4051 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4052 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4053 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
4054 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
4055 target_flags |= MASK_PREFER_AVX128;
4056 }
4057 }
4058 else
4059 {
4060 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
4061 target_flags &= ~MASK_VZEROUPPER;
4062 }
4063
4064 if (ix86_recip_name)
4065 {
4066 char *p = ASTRDUP (ix86_recip_name);
4067 char *q;
4068 unsigned int mask, i;
4069 bool invert;
4070
4071 while ((q = strtok (p, ",")) != NULL)
4072 {
4073 p = NULL;
4074 if (*q == '!')
4075 {
4076 invert = true;
4077 q++;
4078 }
4079 else
4080 invert = false;
4081
4082 if (!strcmp (q, "default"))
4083 mask = RECIP_MASK_ALL;
4084 else
4085 {
4086 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4087 if (!strcmp (q, recip_options[i].string))
4088 {
4089 mask = recip_options[i].mask;
4090 break;
4091 }
4092
4093 if (i == ARRAY_SIZE (recip_options))
4094 {
4095 error ("unknown option for -mrecip=%s", q);
4096 invert = false;
4097 mask = RECIP_MASK_NONE;
4098 }
4099 }
4100
4101 recip_mask_explicit |= mask;
4102 if (invert)
4103 recip_mask &= ~mask;
4104 else
4105 recip_mask |= mask;
4106 }
4107 }
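/* For illustration (added note, not from the original code): for
   -mrecip=all,!sqrt the loop above first ORs RECIP_MASK_ALL into recip_mask
   and then clears RECIP_MASK_SQRT, so every reciprocal approximation except
   scalar square root stays enabled, and all touched bits are recorded in
   recip_mask_explicit.  */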
4108
4109 if (TARGET_RECIP)
4110 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4111 else if (target_flags_explicit & MASK_RECIP)
4112 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4113
4114 /* Default long double to 64-bit for Bionic. */
4115 if (TARGET_HAS_BIONIC
4116 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
4117 target_flags |= MASK_LONG_DOUBLE_64;
4118
4119 /* Save the initial options in case the user uses function-specific
4120 options. */
4121 if (main_args_p)
4122 target_option_default_node = target_option_current_node
4123 = build_target_option_node ();
4124 }
4125
4126 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
4127
4128 static bool
4129 function_pass_avx256_p (const_rtx val)
4130 {
4131 if (!val)
4132 return false;
4133
4134 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
4135 return true;
4136
4137 if (GET_CODE (val) == PARALLEL)
4138 {
4139 int i;
4140 rtx r;
4141
4142 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4143 {
4144 r = XVECEXP (val, 0, i);
4145 if (GET_CODE (r) == EXPR_LIST
4146 && XEXP (r, 0)
4147 && REG_P (XEXP (r, 0))
4148 && (GET_MODE (XEXP (r, 0)) == OImode
4149 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4150 return true;
4151 }
4152 }
4153
4154 return false;
4155 }
4156
4157 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4158
4159 static void
4160 ix86_option_override (void)
4161 {
4162 ix86_option_override_internal (true);
4163 }
4164
4165 /* Update register usage after having seen the compiler flags. */
4166
4167 static void
4168 ix86_conditional_register_usage (void)
4169 {
4170 int i, c_mask;
4171 unsigned int j;
4172
4173 /* The PIC register, if it exists, is fixed. */
4174 j = PIC_OFFSET_TABLE_REGNUM;
4175 if (j != INVALID_REGNUM)
4176 fixed_regs[j] = call_used_regs[j] = 1;
4177
4178 /* For 32-bit targets, squash the REX registers. */
4179 if (! TARGET_64BIT)
4180 {
4181 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4182 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4183 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4184 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4185 }
4186
4187 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4188 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4189 : TARGET_64BIT ? (1 << 2)
4190 : (1 << 1));
4191
4192 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4193
4194 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4195 {
4196 /* Set/reset conditionally defined registers from
4197 CALL_USED_REGISTERS initializer. */
4198 if (call_used_regs[i] > 1)
4199 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4200
4201 /* Collect the call-used registers of the GENERAL_REGS register set
4202 into the CLOBBERED_REGS register set. */
4203 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4204 && call_used_regs[i])
4205 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4206 }
4207
4208 /* If MMX is disabled, squash the registers. */
4209 if (! TARGET_MMX)
4210 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4211 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4212 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4213
4214 /* If SSE is disabled, squash the registers. */
4215 if (! TARGET_SSE)
4216 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4217 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4218 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4219
4220 /* If the FPU is disabled, squash the registers. */
4221 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4222 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4223 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4224 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4225 }
4226
4227 \f
4228 /* Save the current options */
4229
4230 static void
4231 ix86_function_specific_save (struct cl_target_option *ptr)
4232 {
4233 ptr->arch = ix86_arch;
4234 ptr->schedule = ix86_schedule;
4235 ptr->tune = ix86_tune;
4236 ptr->branch_cost = ix86_branch_cost;
4237 ptr->tune_defaulted = ix86_tune_defaulted;
4238 ptr->arch_specified = ix86_arch_specified;
4239 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4240 ptr->ix86_target_flags_explicit = target_flags_explicit;
4241 ptr->x_recip_mask_explicit = recip_mask_explicit;
4242
4243 /* The fields are char but the variables are not; make sure the
4244 values fit in the fields. */
4245 gcc_assert (ptr->arch == ix86_arch);
4246 gcc_assert (ptr->schedule == ix86_schedule);
4247 gcc_assert (ptr->tune == ix86_tune);
4248 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4249 }
4250
4251 /* Restore the current options */
4252
4253 static void
4254 ix86_function_specific_restore (struct cl_target_option *ptr)
4255 {
4256 enum processor_type old_tune = ix86_tune;
4257 enum processor_type old_arch = ix86_arch;
4258 unsigned int ix86_arch_mask, ix86_tune_mask;
4259 int i;
4260
4261 ix86_arch = (enum processor_type) ptr->arch;
4262 ix86_schedule = (enum attr_cpu) ptr->schedule;
4263 ix86_tune = (enum processor_type) ptr->tune;
4264 ix86_branch_cost = ptr->branch_cost;
4265 ix86_tune_defaulted = ptr->tune_defaulted;
4266 ix86_arch_specified = ptr->arch_specified;
4267 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4268 target_flags_explicit = ptr->ix86_target_flags_explicit;
4269 recip_mask_explicit = ptr->x_recip_mask_explicit;
4270
4271 /* Recreate the arch feature tests if the arch changed */
4272 if (old_arch != ix86_arch)
4273 {
4274 ix86_arch_mask = 1u << ix86_arch;
4275 for (i = 0; i < X86_ARCH_LAST; ++i)
4276 ix86_arch_features[i]
4277 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4278 }
4279
4280 /* Recreate the tune optimization tests */
4281 if (old_tune != ix86_tune)
4282 {
4283 ix86_tune_mask = 1u << ix86_tune;
4284 for (i = 0; i < X86_TUNE_LAST; ++i)
4285 ix86_tune_features[i]
4286 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4287 }
4288 }
4289
4290 /* Print the current options */
4291
4292 static void
4293 ix86_function_specific_print (FILE *file, int indent,
4294 struct cl_target_option *ptr)
4295 {
4296 char *target_string
4297 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4298 NULL, NULL, ptr->x_ix86_fpmath, false);
4299
4300 fprintf (file, "%*sarch = %d (%s)\n",
4301 indent, "",
4302 ptr->arch,
4303 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4304 ? cpu_names[ptr->arch]
4305 : "<unknown>"));
4306
4307 fprintf (file, "%*stune = %d (%s)\n",
4308 indent, "",
4309 ptr->tune,
4310 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4311 ? cpu_names[ptr->tune]
4312 : "<unknown>"));
4313
4314 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4315
4316 if (target_string)
4317 {
4318 fprintf (file, "%*s%s\n", indent, "", target_string);
4319 free (target_string);
4320 }
4321 }
4322
4323 \f
4324 /* Inner function to process the attribute((target(...))); it takes an
4325 argument and sets the current options from that argument. If we have
4326 a list, recursively go over the list. */
4327
4328 static bool
4329 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4330 struct gcc_options *enum_opts_set)
4331 {
4332 char *next_optstr;
4333 bool ret = true;
4334
4335 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4336 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4337 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4338 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4339 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4340
4341 enum ix86_opt_type
4342 {
4343 ix86_opt_unknown,
4344 ix86_opt_yes,
4345 ix86_opt_no,
4346 ix86_opt_str,
4347 ix86_opt_enum,
4348 ix86_opt_isa
4349 };
4350
4351 static const struct
4352 {
4353 const char *string;
4354 size_t len;
4355 enum ix86_opt_type type;
4356 int opt;
4357 int mask;
4358 } attrs[] = {
4359 /* isa options */
4360 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4361 IX86_ATTR_ISA ("abm", OPT_mabm),
4362 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4363 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4364 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4365 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4366 IX86_ATTR_ISA ("aes", OPT_maes),
4367 IX86_ATTR_ISA ("avx", OPT_mavx),
4368 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4369 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4370 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4371 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4372 IX86_ATTR_ISA ("sse", OPT_msse),
4373 IX86_ATTR_ISA ("sse2", OPT_msse2),
4374 IX86_ATTR_ISA ("sse3", OPT_msse3),
4375 IX86_ATTR_ISA ("sse4", OPT_msse4),
4376 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4377 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4378 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4379 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4380 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4381 IX86_ATTR_ISA ("fma", OPT_mfma),
4382 IX86_ATTR_ISA ("xop", OPT_mxop),
4383 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4384 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4385 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4386 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4387 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4388 IX86_ATTR_ISA ("hle", OPT_mhle),
4389 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4390 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4391 IX86_ATTR_ISA ("adx", OPT_madx),
4392 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4393 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4394 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4395
4396 /* enum options */
4397 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4398
4399 /* string options */
4400 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4401 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4402
4403 /* flag options */
4404 IX86_ATTR_YES ("cld",
4405 OPT_mcld,
4406 MASK_CLD),
4407
4408 IX86_ATTR_NO ("fancy-math-387",
4409 OPT_mfancy_math_387,
4410 MASK_NO_FANCY_MATH_387),
4411
4412 IX86_ATTR_YES ("ieee-fp",
4413 OPT_mieee_fp,
4414 MASK_IEEE_FP),
4415
4416 IX86_ATTR_YES ("inline-all-stringops",
4417 OPT_minline_all_stringops,
4418 MASK_INLINE_ALL_STRINGOPS),
4419
4420 IX86_ATTR_YES ("inline-stringops-dynamically",
4421 OPT_minline_stringops_dynamically,
4422 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4423
4424 IX86_ATTR_NO ("align-stringops",
4425 OPT_mno_align_stringops,
4426 MASK_NO_ALIGN_STRINGOPS),
4427
4428 IX86_ATTR_YES ("recip",
4429 OPT_mrecip,
4430 MASK_RECIP),
4431
4432 };
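/* For illustration (added note, not from the original code): for a
   hypothetical declaration such as
     __attribute__((target("no-sse4.1,arch=core2,fpmath=387")))
   the loop below strips the "no-" prefix and disables the sse4.1 ISA flag,
   stores "core2" in p_strings for the arch= string option, and routes
   fpmath=387 through the enum handling.  */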
4433
4434 /* If this is a list, recurse to get the options. */
4435 if (TREE_CODE (args) == TREE_LIST)
4436 {
4437 bool ret = true;
4438
4439 for (; args; args = TREE_CHAIN (args))
4440 if (TREE_VALUE (args)
4441 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4442 p_strings, enum_opts_set))
4443 ret = false;
4444
4445 return ret;
4446 }
4447
4448 else if (TREE_CODE (args) != STRING_CST)
4449 gcc_unreachable ();
4450
4451 /* Handle multiple arguments separated by commas. */
4452 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4453
4454 while (next_optstr && *next_optstr != '\0')
4455 {
4456 char *p = next_optstr;
4457 char *orig_p = p;
4458 char *comma = strchr (next_optstr, ',');
4459 const char *opt_string;
4460 size_t len, opt_len;
4461 int opt;
4462 bool opt_set_p;
4463 char ch;
4464 unsigned i;
4465 enum ix86_opt_type type = ix86_opt_unknown;
4466 int mask = 0;
4467
4468 if (comma)
4469 {
4470 *comma = '\0';
4471 len = comma - next_optstr;
4472 next_optstr = comma + 1;
4473 }
4474 else
4475 {
4476 len = strlen (p);
4477 next_optstr = NULL;
4478 }
4479
4480 /* Recognize no-xxx. */
4481 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4482 {
4483 opt_set_p = false;
4484 p += 3;
4485 len -= 3;
4486 }
4487 else
4488 opt_set_p = true;
4489
4490 /* Find the option. */
4491 ch = *p;
4492 opt = N_OPTS;
4493 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4494 {
4495 type = attrs[i].type;
4496 opt_len = attrs[i].len;
4497 if (ch == attrs[i].string[0]
4498 && ((type != ix86_opt_str && type != ix86_opt_enum)
4499 ? len == opt_len
4500 : len > opt_len)
4501 && memcmp (p, attrs[i].string, opt_len) == 0)
4502 {
4503 opt = attrs[i].opt;
4504 mask = attrs[i].mask;
4505 opt_string = attrs[i].string;
4506 break;
4507 }
4508 }
4509
4510 /* Process the option. */
4511 if (opt == N_OPTS)
4512 {
4513 error ("attribute(target(\"%s\")) is unknown", orig_p);
4514 ret = false;
4515 }
4516
4517 else if (type == ix86_opt_isa)
4518 {
4519 struct cl_decoded_option decoded;
4520
4521 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4522 ix86_handle_option (&global_options, &global_options_set,
4523 &decoded, input_location);
4524 }
4525
4526 else if (type == ix86_opt_yes || type == ix86_opt_no)
4527 {
4528 if (type == ix86_opt_no)
4529 opt_set_p = !opt_set_p;
4530
4531 if (opt_set_p)
4532 target_flags |= mask;
4533 else
4534 target_flags &= ~mask;
4535 }
4536
4537 else if (type == ix86_opt_str)
4538 {
4539 if (p_strings[opt])
4540 {
4541 error ("option(\"%s\") was already specified", opt_string);
4542 ret = false;
4543 }
4544 else
4545 p_strings[opt] = xstrdup (p + opt_len);
4546 }
4547
4548 else if (type == ix86_opt_enum)
4549 {
4550 bool arg_ok;
4551 int value;
4552
4553 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4554 if (arg_ok)
4555 set_option (&global_options, enum_opts_set, opt, value,
4556 p + opt_len, DK_UNSPECIFIED, input_location,
4557 global_dc);
4558 else
4559 {
4560 error ("attribute(target(\"%s\")) is unknown", orig_p);
4561 ret = false;
4562 }
4563 }
4564
4565 else
4566 gcc_unreachable ();
4567 }
4568
4569 return ret;
4570 }
4571
4572 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4573
4574 tree
4575 ix86_valid_target_attribute_tree (tree args)
4576 {
4577 const char *orig_arch_string = ix86_arch_string;
4578 const char *orig_tune_string = ix86_tune_string;
4579 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4580 int orig_tune_defaulted = ix86_tune_defaulted;
4581 int orig_arch_specified = ix86_arch_specified;
4582 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4583 tree t = NULL_TREE;
4584 int i;
4585 struct cl_target_option *def
4586 = TREE_TARGET_OPTION (target_option_default_node);
4587 struct gcc_options enum_opts_set;
4588
4589 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4590
4591 /* Process each of the options on the chain. */
4592 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4593 &enum_opts_set))
4594 return NULL_TREE;
4595
4596 /* If the changed options are different from the default, rerun
4597 ix86_option_override_internal, and then save the options away.
4598 The string options are attribute options, and will be undone
4599 when we copy the save structure. */
4600 if (ix86_isa_flags != def->x_ix86_isa_flags
4601 || target_flags != def->x_target_flags
4602 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4603 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4604 || enum_opts_set.x_ix86_fpmath)
4605 {
4606 /* If we are using the default tune= or arch=, undo the string assigned,
4607 and use the default. */
4608 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4609 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4610 else if (!orig_arch_specified)
4611 ix86_arch_string = NULL;
4612
4613 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4614 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4615 else if (orig_tune_defaulted)
4616 ix86_tune_string = NULL;
4617
4618 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4619 if (enum_opts_set.x_ix86_fpmath)
4620 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4621 else if (!TARGET_64BIT && TARGET_SSE)
4622 {
4623 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4624 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4625 }
4626
4627 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4628 ix86_option_override_internal (false);
4629
4630 /* Add any builtin functions with the new isa if any. */
4631 ix86_add_new_builtins (ix86_isa_flags);
4632
4633 /* Save the current options unless we are validating options for
4634 #pragma. */
4635 t = build_target_option_node ();
4636
4637 ix86_arch_string = orig_arch_string;
4638 ix86_tune_string = orig_tune_string;
4639 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4640
4641 /* Free up memory allocated to hold the strings */
4642 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4643 free (option_strings[i]);
4644 }
4645
4646 return t;
4647 }
4648
4649 /* Hook to validate attribute((target("string"))). */
4650
4651 static bool
4652 ix86_valid_target_attribute_p (tree fndecl,
4653 tree ARG_UNUSED (name),
4654 tree args,
4655 int ARG_UNUSED (flags))
4656 {
4657 struct cl_target_option cur_target;
4658 bool ret = true;
4659 tree old_optimize = build_optimization_node ();
4660 tree new_target, new_optimize;
4661 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4662
4663 /* If the function changed the optimization levels as well as setting target
4664 options, start with the optimizations specified. */
4665 if (func_optimize && func_optimize != old_optimize)
4666 cl_optimization_restore (&global_options,
4667 TREE_OPTIMIZATION (func_optimize));
4668
4669 /* The target attributes may also change some optimization flags, so update
4670 the optimization options if necessary. */
4671 cl_target_option_save (&cur_target, &global_options);
4672 new_target = ix86_valid_target_attribute_tree (args);
4673 new_optimize = build_optimization_node ();
4674
4675 if (!new_target)
4676 ret = false;
4677
4678 else if (fndecl)
4679 {
4680 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4681
4682 if (old_optimize != new_optimize)
4683 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4684 }
4685
4686 cl_target_option_restore (&global_options, &cur_target);
4687
4688 if (old_optimize != new_optimize)
4689 cl_optimization_restore (&global_options,
4690 TREE_OPTIMIZATION (old_optimize));
4691
4692 return ret;
4693 }
4694
4695 \f
4696 /* Hook to determine if one function can safely inline another. */
4697
4698 static bool
4699 ix86_can_inline_p (tree caller, tree callee)
4700 {
4701 bool ret = false;
4702 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4703 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4704
4705 /* If callee has no option attributes, then it is ok to inline. */
4706 if (!callee_tree)
4707 ret = true;
4708
4709 /* If caller has no option attributes, but callee does then it is not ok to
4710 inline. */
4711 else if (!caller_tree)
4712 ret = false;
4713
4714 else
4715 {
4716 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4717 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4718
4719 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4720 function can inline an SSE2 function but an SSE2 function can't inline
4721 an SSE4 function. */
4722 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4723 != callee_opts->x_ix86_isa_flags)
4724 ret = false;
4725
4726 /* See if we have the same non-isa options. */
4727 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4728 ret = false;
4729
4730 /* See if arch, tune, etc. are the same. */
4731 else if (caller_opts->arch != callee_opts->arch)
4732 ret = false;
4733
4734 else if (caller_opts->tune != callee_opts->tune)
4735 ret = false;
4736
4737 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4738 ret = false;
4739
4740 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4741 ret = false;
4742
4743 else
4744 ret = true;
4745 }
4746
4747 return ret;
4748 }
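/* For illustration (added note, not from the original code), with hypothetical
   user code:

     __attribute__((target("sse2"))) static int f (int x) { return x + 1; }
     __attribute__((target("sse4.2"))) int g (int x) { return f (x); }

   f's ISA flags are a subset of g's, so the checks above allow inlining f
   into g; the reverse direction would be rejected.  */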
4749
4750 \f
4751 /* Remember the last target of ix86_set_current_function. */
4752 static GTY(()) tree ix86_previous_fndecl;
4753
4754 /* Establish appropriate back-end context for processing the function
4755 FNDECL. The argument might be NULL to indicate processing at top
4756 level, outside of any function scope. */
4757 static void
4758 ix86_set_current_function (tree fndecl)
4759 {
4760 /* Only change the context if the function changes. This hook is called
4761 several times in the course of compiling a function, and we don't want to
4762 slow things down too much or call target_reinit when it isn't safe. */
4763 if (fndecl && fndecl != ix86_previous_fndecl)
4764 {
4765 tree old_tree = (ix86_previous_fndecl
4766 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4767 : NULL_TREE);
4768
4769 tree new_tree = (fndecl
4770 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4771 : NULL_TREE);
4772
4773 ix86_previous_fndecl = fndecl;
4774 if (old_tree == new_tree)
4775 ;
4776
4777 else if (new_tree)
4778 {
4779 cl_target_option_restore (&global_options,
4780 TREE_TARGET_OPTION (new_tree));
4781 target_reinit ();
4782 }
4783
4784 else if (old_tree)
4785 {
4786 struct cl_target_option *def
4787 = TREE_TARGET_OPTION (target_option_current_node);
4788
4789 cl_target_option_restore (&global_options, def);
4790 target_reinit ();
4791 }
4792 }
4793 }
4794
4795 \f
4796 /* Return true if this goes in large data/bss. */
4797
4798 static bool
4799 ix86_in_large_data_p (tree exp)
4800 {
4801 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4802 return false;
4803
4804 /* Functions are never large data. */
4805 if (TREE_CODE (exp) == FUNCTION_DECL)
4806 return false;
4807
4808 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4809 {
4810 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4811 if (strcmp (section, ".ldata") == 0
4812 || strcmp (section, ".lbss") == 0)
4813 return true;
4814 return false;
4815 }
4816 else
4817 {
4818 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4819
4820 /* If this is an incomplete type with size 0, then we can't put it
4821 in data because it might be too big when completed. */
4822 if (!size || size > ix86_section_threshold)
4823 return true;
4824 }
4825
4826 return false;
4827 }
4828
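/* Illustrative sketch (editor's addition, not part of GCC): with
   -mcmodel=medium, objects larger than -mlarge-data-threshold (the
   ix86_section_threshold tested above) count as large data, e.g.

     static char big_buffer[1 << 20];    placed in .lbss
     static const char msg[] = "hi";     stays in the ordinary sections

   assuming the default threshold.  */
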
4829 /* Switch to the appropriate section for output of DECL.
4830 DECL is either a `VAR_DECL' node or a constant of some sort.
4831 RELOC indicates whether forming the initial value of DECL requires
4832 link-time relocations. */
4833
4834 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4835 ATTRIBUTE_UNUSED;
4836
4837 static section *
4838 x86_64_elf_select_section (tree decl, int reloc,
4839 unsigned HOST_WIDE_INT align)
4840 {
4841 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4842 && ix86_in_large_data_p (decl))
4843 {
4844 const char *sname = NULL;
4845 unsigned int flags = SECTION_WRITE;
4846 switch (categorize_decl_for_section (decl, reloc))
4847 {
4848 case SECCAT_DATA:
4849 sname = ".ldata";
4850 break;
4851 case SECCAT_DATA_REL:
4852 sname = ".ldata.rel";
4853 break;
4854 case SECCAT_DATA_REL_LOCAL:
4855 sname = ".ldata.rel.local";
4856 break;
4857 case SECCAT_DATA_REL_RO:
4858 sname = ".ldata.rel.ro";
4859 break;
4860 case SECCAT_DATA_REL_RO_LOCAL:
4861 sname = ".ldata.rel.ro.local";
4862 break;
4863 case SECCAT_BSS:
4864 sname = ".lbss";
4865 flags |= SECTION_BSS;
4866 break;
4867 case SECCAT_RODATA:
4868 case SECCAT_RODATA_MERGE_STR:
4869 case SECCAT_RODATA_MERGE_STR_INIT:
4870 case SECCAT_RODATA_MERGE_CONST:
4871 sname = ".lrodata";
4872 flags = 0;
4873 break;
4874 case SECCAT_SRODATA:
4875 case SECCAT_SDATA:
4876 case SECCAT_SBSS:
4877 gcc_unreachable ();
4878 case SECCAT_TEXT:
4879 case SECCAT_TDATA:
4880 case SECCAT_TBSS:
4881 /* We don't split these for the medium model. Place them into
4882 default sections and hope for the best. */
4883 break;
4884 }
4885 if (sname)
4886 {
4887 /* We might get called with string constants, but get_named_section
4888 doesn't like them as they are not DECLs. Also, we need to set
4889 flags in that case. */
4890 if (!DECL_P (decl))
4891 return get_section (sname, flags, NULL);
4892 return get_named_section (decl, sname, reloc);
4893 }
4894 }
4895 return default_elf_select_section (decl, reloc, align);
4896 }
4897
4898 /* Build up a unique section name, expressed as a
4899 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4900 RELOC indicates whether the initial value of EXP requires
4901 link-time relocations. */
4902
4903 static void ATTRIBUTE_UNUSED
4904 x86_64_elf_unique_section (tree decl, int reloc)
4905 {
4906 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4907 && ix86_in_large_data_p (decl))
4908 {
4909 const char *prefix = NULL;
4910 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4911 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4912
4913 switch (categorize_decl_for_section (decl, reloc))
4914 {
4915 case SECCAT_DATA:
4916 case SECCAT_DATA_REL:
4917 case SECCAT_DATA_REL_LOCAL:
4918 case SECCAT_DATA_REL_RO:
4919 case SECCAT_DATA_REL_RO_LOCAL:
4920 prefix = one_only ? ".ld" : ".ldata";
4921 break;
4922 case SECCAT_BSS:
4923 prefix = one_only ? ".lb" : ".lbss";
4924 break;
4925 case SECCAT_RODATA:
4926 case SECCAT_RODATA_MERGE_STR:
4927 case SECCAT_RODATA_MERGE_STR_INIT:
4928 case SECCAT_RODATA_MERGE_CONST:
4929 prefix = one_only ? ".lr" : ".lrodata";
4930 break;
4931 case SECCAT_SRODATA:
4932 case SECCAT_SDATA:
4933 case SECCAT_SBSS:
4934 gcc_unreachable ();
4935 case SECCAT_TEXT:
4936 case SECCAT_TDATA:
4937 case SECCAT_TBSS:
4938 /* We don't split these for the medium model. Place them into
4939 default sections and hope for the best. */
4940 break;
4941 }
4942 if (prefix)
4943 {
4944 const char *name, *linkonce;
4945 char *string;
4946
4947 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4948 name = targetm.strip_name_encoding (name);
4949
4950 /* If we're using one_only, then there needs to be a .gnu.linkonce
4951 prefix to the section name. */
4952 linkonce = one_only ? ".gnu.linkonce" : "";
4953
4954 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4955
4956 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4957 return;
4958 }
4959 }
4960 default_unique_section (decl, reloc);
4961 }
4962
4963 #ifdef COMMON_ASM_OP
4964 /* This says how to output assembler code to declare an
4965 uninitialized external linkage data object.
4966
4967 For medium model x86-64 we need to use .largecomm opcode for
4968 large objects. */
4969 void
4970 x86_elf_aligned_common (FILE *file,
4971 const char *name, unsigned HOST_WIDE_INT size,
4972 int align)
4973 {
4974 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4975 && size > (unsigned int)ix86_section_threshold)
4976 fputs (".largecomm\t", file);
4977 else
4978 fputs (COMMON_ASM_OP, file);
4979 assemble_name (file, name);
4980 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4981 size, align / BITS_PER_UNIT);
4982 }
4983 #endif
4984
4985 /* Utility function for targets to use in implementing
4986 ASM_OUTPUT_ALIGNED_BSS. */
4987
4988 void
4989 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4990 const char *name, unsigned HOST_WIDE_INT size,
4991 int align)
4992 {
4993 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4994 && size > (unsigned int)ix86_section_threshold)
4995 switch_to_section (get_named_section (decl, ".lbss", 0));
4996 else
4997 switch_to_section (bss_section);
4998 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4999 #ifdef ASM_DECLARE_OBJECT_NAME
5000 last_assemble_variable_decl = decl;
5001 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5002 #else
5003 /* Standard thing is just output label for the object. */
5004 ASM_OUTPUT_LABEL (file, name);
5005 #endif /* ASM_DECLARE_OBJECT_NAME */
5006 ASM_OUTPUT_SKIP (file, size ? size : 1);
5007 }
5008 \f
5009 /* Decide whether we must probe the stack before any space allocation
5010 on this target. It's essentially TARGET_STACK_PROBE except when
5011 -fstack-check causes the stack to be already probed differently. */
5012
5013 bool
5014 ix86_target_stack_probe (void)
5015 {
5016 /* Do not probe the stack twice if static stack checking is enabled. */
5017 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5018 return false;
5019
5020 return TARGET_STACK_PROBE;
5021 }
5022 \f
5023 /* Decide whether we can make a sibling call to a function. DECL is the
5024 declaration of the function being targeted by the call and EXP is the
5025 CALL_EXPR representing the call. */
5026
5027 static bool
5028 ix86_function_ok_for_sibcall (tree decl, tree exp)
5029 {
5030 tree type, decl_or_type;
5031 rtx a, b;
5032
5033 /* If we are generating position-independent code, we cannot sibcall
5034 optimize any indirect call, or a direct call to a global function,
5035 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5036 if (!TARGET_MACHO
5037 && !TARGET_64BIT
5038 && flag_pic
5039 && (!decl || !targetm.binds_local_p (decl)))
5040 return false;
5041
5042 /* If we need to align the outgoing stack, then sibcalling would
5043 unalign the stack, which may break the called function. */
5044 if (ix86_minimum_incoming_stack_boundary (true)
5045 < PREFERRED_STACK_BOUNDARY)
5046 return false;
5047
5048 if (decl)
5049 {
5050 decl_or_type = decl;
5051 type = TREE_TYPE (decl);
5052 }
5053 else
5054 {
5055 /* We're looking at the CALL_EXPR, we need the type of the function. */
5056 type = CALL_EXPR_FN (exp); /* pointer expression */
5057 type = TREE_TYPE (type); /* pointer type */
5058 type = TREE_TYPE (type); /* function type */
5059 decl_or_type = type;
5060 }
5061
5062 /* Check that the return value locations are the same. For example,
5063 if we are returning floats on the 80387 register stack, we cannot
5064 make a sibcall from a function that doesn't return a float to a
5065 function that does or, conversely, from a function that does return
5066 a float to a function that doesn't; the necessary stack adjustment
5067 would not be executed. This is also the place we notice
5068 differences in the return value ABI. Note that it is ok for one
5069 of the functions to have void return type as long as the return
5070 value of the other is passed in a register. */
5071 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5072 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5073 cfun->decl, false);
5074 if (STACK_REG_P (a) || STACK_REG_P (b))
5075 {
5076 if (!rtx_equal_p (a, b))
5077 return false;
5078 }
5079 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5080 {
5081 /* Disable sibcall if we need to generate vzeroupper after
5082 callee returns. */
5083 if (TARGET_VZEROUPPER
5084 && cfun->machine->callee_return_avx256_p
5085 && !cfun->machine->caller_return_avx256_p)
5086 return false;
5087 }
5088 else if (!rtx_equal_p (a, b))
5089 return false;
5090
5091 if (TARGET_64BIT)
5092 {
5093 /* The SYSV ABI has more call-clobbered registers;
5094 disallow sibcalls from MS to SYSV. */
5095 if (cfun->machine->call_abi == MS_ABI
5096 && ix86_function_type_abi (type) == SYSV_ABI)
5097 return false;
5098 }
5099 else
5100 {
5101 /* If this call is indirect, we'll need to be able to use a
5102 call-clobbered register for the address of the target function.
5103 Make sure that all such registers are not used for passing
5104 parameters. Note that DLLIMPORT functions are indirect. */
5105 if (!decl
5106 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5107 {
5108 if (ix86_function_regparm (type, NULL) >= 3)
5109 {
5110 /* ??? Need to count the actual number of registers to be used,
5111 not the possible number of registers. Fix later. */
5112 return false;
5113 }
5114 }
5115 }
5116
5117 /* Otherwise okay. That also includes certain types of indirect calls. */
5118 return true;
5119 }
5120
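/* Illustrative sketch (editor's addition, not part of GCC): a tail call such
   as

     extern int g (int);
     int f (int x) { return g (x); }

   can be emitted as a direct "jmp g" when the checks above pass.  With
   -m32 -fpic and a callee that does not bind locally, the sibcall is
   rejected because the PLT call requires %ebx to be live.  */
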
5121 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5122 and "sseregparm" calling convention attributes;
5123 arguments as in struct attribute_spec.handler. */
5124
5125 static tree
5126 ix86_handle_cconv_attribute (tree *node, tree name,
5127 tree args,
5128 int flags ATTRIBUTE_UNUSED,
5129 bool *no_add_attrs)
5130 {
5131 if (TREE_CODE (*node) != FUNCTION_TYPE
5132 && TREE_CODE (*node) != METHOD_TYPE
5133 && TREE_CODE (*node) != FIELD_DECL
5134 && TREE_CODE (*node) != TYPE_DECL)
5135 {
5136 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5137 name);
5138 *no_add_attrs = true;
5139 return NULL_TREE;
5140 }
5141
5142 /* Can combine regparm with all attributes but fastcall and thiscall. */
5143 if (is_attribute_p ("regparm", name))
5144 {
5145 tree cst;
5146
5147 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5148 {
5149 error ("fastcall and regparm attributes are not compatible");
5150 }
5151
5152 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5153 {
5154 error ("regparam and thiscall attributes are not compatible");
5155 }
5156
5157 cst = TREE_VALUE (args);
5158 if (TREE_CODE (cst) != INTEGER_CST)
5159 {
5160 warning (OPT_Wattributes,
5161 "%qE attribute requires an integer constant argument",
5162 name);
5163 *no_add_attrs = true;
5164 }
5165 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5166 {
5167 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5168 name, REGPARM_MAX);
5169 *no_add_attrs = true;
5170 }
5171
5172 return NULL_TREE;
5173 }
5174
5175 if (TARGET_64BIT)
5176 {
5177 /* Do not warn when emulating the MS ABI. */
5178 if ((TREE_CODE (*node) != FUNCTION_TYPE
5179 && TREE_CODE (*node) != METHOD_TYPE)
5180 || ix86_function_type_abi (*node) != MS_ABI)
5181 warning (OPT_Wattributes, "%qE attribute ignored",
5182 name);
5183 *no_add_attrs = true;
5184 return NULL_TREE;
5185 }
5186
5187 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5188 if (is_attribute_p ("fastcall", name))
5189 {
5190 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5191 {
5192 error ("fastcall and cdecl attributes are not compatible");
5193 }
5194 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5195 {
5196 error ("fastcall and stdcall attributes are not compatible");
5197 }
5198 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5199 {
5200 error ("fastcall and regparm attributes are not compatible");
5201 }
5202 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5203 {
5204 error ("fastcall and thiscall attributes are not compatible");
5205 }
5206 }
5207
5208 /* Can combine stdcall with fastcall (redundant), regparm and
5209 sseregparm. */
5210 else if (is_attribute_p ("stdcall", name))
5211 {
5212 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5213 {
5214 error ("stdcall and cdecl attributes are not compatible");
5215 }
5216 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5217 {
5218 error ("stdcall and fastcall attributes are not compatible");
5219 }
5220 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5221 {
5222 error ("stdcall and thiscall attributes are not compatible");
5223 }
5224 }
5225
5226 /* Can combine cdecl with regparm and sseregparm. */
5227 else if (is_attribute_p ("cdecl", name))
5228 {
5229 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5230 {
5231 error ("stdcall and cdecl attributes are not compatible");
5232 }
5233 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5234 {
5235 error ("fastcall and cdecl attributes are not compatible");
5236 }
5237 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5238 {
5239 error ("cdecl and thiscall attributes are not compatible");
5240 }
5241 }
5242 else if (is_attribute_p ("thiscall", name))
5243 {
5244 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5245 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5246 name);
5247 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5248 {
5249 error ("stdcall and thiscall attributes are not compatible");
5250 }
5251 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5252 {
5253 error ("fastcall and thiscall attributes are not compatible");
5254 }
5255 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5256 {
5257 error ("cdecl and thiscall attributes are not compatible");
5258 }
5259 }
5260
5261 /* Can combine sseregparm with all attributes. */
5262
5263 return NULL_TREE;
5264 }
5265
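/* Illustrative sketch (editor's addition, not part of GCC): typical uses of
   the attributes validated above, in hypothetical user code:

     void __attribute__((fastcall)) f1 (int a, int b);
         a in %ecx, b in %edx, the callee pops any stack arguments
     void __attribute__((regparm (3))) f2 (int a, int b, int c);
         a, b, c in %eax, %edx, %ecx

   Combining the two, e.g. __attribute__((fastcall, regparm (2))), is
   rejected with the "not compatible" error above.  */
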
5266 /* The transactional memory builtins are implicitly regparm or fastcall
5267 depending on the ABI. Override the generic do-nothing attribute that
5268 these builtins were declared with, and replace it with one of the two
5269 attributes that we expect elsewhere. */
5270
5271 static tree
5272 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5273 tree args ATTRIBUTE_UNUSED,
5274 int flags ATTRIBUTE_UNUSED,
5275 bool *no_add_attrs)
5276 {
5277 tree alt;
5278
5279 /* In no case do we want to add the placeholder attribute. */
5280 *no_add_attrs = true;
5281
5282 /* The 64-bit ABI is unchanged for transactional memory. */
5283 if (TARGET_64BIT)
5284 return NULL_TREE;
5285
5286 /* ??? Is there a better way to validate 32-bit windows? We have
5287 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5288 if (CHECK_STACK_LIMIT > 0)
5289 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5290 else
5291 {
5292 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5293 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5294 }
5295 decl_attributes (node, alt, flags);
5296
5297 return NULL_TREE;
5298 }
5299
5300 /* This function determines from TYPE the calling-convention. */
5301
5302 unsigned int
5303 ix86_get_callcvt (const_tree type)
5304 {
5305 unsigned int ret = 0;
5306 bool is_stdarg;
5307 tree attrs;
5308
5309 if (TARGET_64BIT)
5310 return IX86_CALLCVT_CDECL;
5311
5312 attrs = TYPE_ATTRIBUTES (type);
5313 if (attrs != NULL_TREE)
5314 {
5315 if (lookup_attribute ("cdecl", attrs))
5316 ret |= IX86_CALLCVT_CDECL;
5317 else if (lookup_attribute ("stdcall", attrs))
5318 ret |= IX86_CALLCVT_STDCALL;
5319 else if (lookup_attribute ("fastcall", attrs))
5320 ret |= IX86_CALLCVT_FASTCALL;
5321 else if (lookup_attribute ("thiscall", attrs))
5322 ret |= IX86_CALLCVT_THISCALL;
5323
5324 /* Regparm isn't allowed for thiscall and fastcall. */
5325 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5326 {
5327 if (lookup_attribute ("regparm", attrs))
5328 ret |= IX86_CALLCVT_REGPARM;
5329 if (lookup_attribute ("sseregparm", attrs))
5330 ret |= IX86_CALLCVT_SSEREGPARM;
5331 }
5332
5333 if (IX86_BASE_CALLCVT(ret) != 0)
5334 return ret;
5335 }
5336
5337 is_stdarg = stdarg_p (type);
5338 if (TARGET_RTD && !is_stdarg)
5339 return IX86_CALLCVT_STDCALL | ret;
5340
5341 if (ret != 0
5342 || is_stdarg
5343 || TREE_CODE (type) != METHOD_TYPE
5344 || ix86_function_type_abi (type) != MS_ABI)
5345 return IX86_CALLCVT_CDECL | ret;
5346
5347 return IX86_CALLCVT_THISCALL;
5348 }
5349
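/* Illustrative note (editor's addition, not part of GCC): per the logic
   above, compiling with -mrtd makes a non-variadic function without any
   convention attribute default to IX86_CALLCVT_STDCALL, while a C++
   member function under the 32-bit MS ABI with no attribute defaults to
   IX86_CALLCVT_THISCALL.  Variadic functions always stay cdecl.  */
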
5350 /* Return 0 if the attributes for two types are incompatible, 1 if they
5351 are compatible, and 2 if they are nearly compatible (which causes a
5352 warning to be generated). */
5353
5354 static int
5355 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5356 {
5357 unsigned int ccvt1, ccvt2;
5358
5359 if (TREE_CODE (type1) != FUNCTION_TYPE
5360 && TREE_CODE (type1) != METHOD_TYPE)
5361 return 1;
5362
5363 ccvt1 = ix86_get_callcvt (type1);
5364 ccvt2 = ix86_get_callcvt (type2);
5365 if (ccvt1 != ccvt2)
5366 return 0;
5367 if (ix86_function_regparm (type1, NULL)
5368 != ix86_function_regparm (type2, NULL))
5369 return 0;
5370
5371 return 1;
5372 }
5373 \f
5374 /* Return the regparm value for a function with the indicated TYPE and DECL.
5375 DECL may be NULL when calling function indirectly
5376 or considering a libcall. */
5377
5378 static int
5379 ix86_function_regparm (const_tree type, const_tree decl)
5380 {
5381 tree attr;
5382 int regparm;
5383 unsigned int ccvt;
5384
5385 if (TARGET_64BIT)
5386 return (ix86_function_type_abi (type) == SYSV_ABI
5387 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5388 ccvt = ix86_get_callcvt (type);
5389 regparm = ix86_regparm;
5390
5391 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5392 {
5393 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5394 if (attr)
5395 {
5396 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5397 return regparm;
5398 }
5399 }
5400 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5401 return 2;
5402 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5403 return 1;
5404
5405 /* Use register calling convention for local functions when possible. */
5406 if (decl
5407 && TREE_CODE (decl) == FUNCTION_DECL
5408 && optimize
5409 && !(profile_flag && !flag_fentry))
5410 {
5411 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5412 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5413 if (i && i->local && i->can_change_signature)
5414 {
5415 int local_regparm, globals = 0, regno;
5416
5417 /* Make sure no regparm register is taken by a
5418 fixed register variable. */
5419 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5420 if (fixed_regs[local_regparm])
5421 break;
5422
5423 /* We don't want to use regparm(3) for nested functions as
5424 these use a static chain pointer in the third argument. */
5425 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5426 local_regparm = 2;
5427
5428 /* In 32-bit mode save a register for the split stack. */
5429 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5430 local_regparm = 2;
5431
5432 /* Each fixed register usage increases register pressure,
5433 so fewer registers should be used for argument passing.
5434 This functionality can be overridden by an explicit
5435 regparm value. */
5436 for (regno = AX_REG; regno <= DI_REG; regno++)
5437 if (fixed_regs[regno])
5438 globals++;
5439
5440 local_regparm
5441 = globals < local_regparm ? local_regparm - globals : 0;
5442
5443 if (local_regparm > regparm)
5444 regparm = local_regparm;
5445 }
5446 }
5447
5448 return regparm;
5449 }
5450
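/* Illustrative sketch (editor's addition, not part of GCC):

     __attribute__((regparm (3))) int add3 (int a, int b, int c);

   passes a, b and c in %eax, %edx and %ecx, so ix86_function_regparm
   returns 3 here.  For a static function only called locally, the logic
   above may promote regparm automatically when optimizing, minus any
   registers claimed by fixed register variables.  */
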
5451 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5452 DFmode (2) arguments in SSE registers for a function with the
5453 indicated TYPE and DECL. DECL may be NULL when calling function
5454 indirectly or considering a libcall. Otherwise return 0. */
5455
5456 static int
5457 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5458 {
5459 gcc_assert (!TARGET_64BIT);
5460
5461 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5462 by the sseregparm attribute. */
5463 if (TARGET_SSEREGPARM
5464 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5465 {
5466 if (!TARGET_SSE)
5467 {
5468 if (warn)
5469 {
5470 if (decl)
5471 error ("calling %qD with attribute sseregparm without "
5472 "SSE/SSE2 enabled", decl);
5473 else
5474 error ("calling %qT with attribute sseregparm without "
5475 "SSE/SSE2 enabled", type);
5476 }
5477 return 0;
5478 }
5479
5480 return 2;
5481 }
5482
5483 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5484 (and DFmode for SSE2) arguments in SSE registers. */
5485 if (decl && TARGET_SSE_MATH && optimize
5486 && !(profile_flag && !flag_fentry))
5487 {
5488 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5489 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5490 if (i && i->local && i->can_change_signature)
5491 return TARGET_SSE2 ? 2 : 1;
5492 }
5493
5494 return 0;
5495 }
5496
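/* Illustrative sketch (editor's addition, not part of GCC):

     __attribute__((sseregparm)) double scale (double x, double y);

   compiled with -m32 -msse2 passes x and y in %xmm0 and %xmm1 instead of
   on the stack; without SSE enabled the error above is emitted.  */
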
5497 /* Return true if EAX is live at the start of the function. Used by
5498 ix86_expand_prologue to determine if we need special help before
5499 calling allocate_stack_worker. */
5500
5501 static bool
5502 ix86_eax_live_at_start_p (void)
5503 {
5504 /* Cheat. Don't bother working forward from ix86_function_regparm
5505 to the function type to whether an actual argument is located in
5506 eax. Instead just look at cfg info, which is still close enough
5507 to correct at this point. This gives false positives for broken
5508 functions that might use uninitialized data that happens to be
5509 allocated in eax, but who cares? */
5510 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5511 }
5512
5513 static bool
5514 ix86_keep_aggregate_return_pointer (tree fntype)
5515 {
5516 tree attr;
5517
5518 if (!TARGET_64BIT)
5519 {
5520 attr = lookup_attribute ("callee_pop_aggregate_return",
5521 TYPE_ATTRIBUTES (fntype));
5522 if (attr)
5523 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5524
5525 /* For 32-bit MS-ABI the default is to keep aggregate
5526 return pointer. */
5527 if (ix86_function_type_abi (fntype) == MS_ABI)
5528 return true;
5529 }
5530 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5531 }
5532
5533 /* Value is the number of bytes of arguments automatically
5534 popped when returning from a subroutine call.
5535 FUNDECL is the declaration node of the function (as a tree),
5536 FUNTYPE is the data type of the function (as a tree),
5537 or for a library call it is an identifier node for the subroutine name.
5538 SIZE is the number of bytes of arguments passed on the stack.
5539
5540 On the 80386, the RTD insn may be used to pop them if the number
5541 of args is fixed, but if the number is variable then the caller
5542 must pop them all. RTD can't be used for library calls now
5543 because the library is compiled with the Unix compiler.
5544 Use of RTD is a selectable option, since it is incompatible with
5545 standard Unix calling sequences. If the option is not selected,
5546 the caller must always pop the args.
5547
5548 The attribute stdcall is equivalent to RTD on a per module basis. */
5549
5550 static int
5551 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5552 {
5553 unsigned int ccvt;
5554
5555 /* None of the 64-bit ABIs pop arguments. */
5556 if (TARGET_64BIT)
5557 return 0;
5558
5559 ccvt = ix86_get_callcvt (funtype);
5560
5561 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5562 | IX86_CALLCVT_THISCALL)) != 0
5563 && ! stdarg_p (funtype))
5564 return size;
5565
5566 /* Lose any fake structure return argument if it is passed on the stack. */
5567 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5568 && !ix86_keep_aggregate_return_pointer (funtype))
5569 {
5570 int nregs = ix86_function_regparm (funtype, fundecl);
5571 if (nregs == 0)
5572 return GET_MODE_SIZE (Pmode);
5573 }
5574
5575 return 0;
5576 }
5577
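/* Illustrative sketch (editor's addition, not part of GCC):

     int __attribute__((stdcall)) sum (int a, int b);

   has 8 bytes of stack arguments, so ix86_return_pops_args returns 8 and
   the callee returns with "ret $8".  A variadic or plain cdecl function
   returns 0 here and the caller pops its own arguments.  */
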
5578 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5579
5580 static bool
5581 ix86_legitimate_combined_insn (rtx insn)
5582 {
5583 /* Check operand constraints in case hard registers were propagated
5584 into insn pattern. This check prevents combine pass from
5585 generating insn patterns with invalid hard register operands.
5586 These invalid insns can eventually confuse reload to error out
5587 with a spill failure. See also PRs 46829 and 46843. */
5588 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5589 {
5590 int i;
5591
5592 extract_insn (insn);
5593 preprocess_constraints ();
5594
5595 for (i = 0; i < recog_data.n_operands; i++)
5596 {
5597 rtx op = recog_data.operand[i];
5598 enum machine_mode mode = GET_MODE (op);
5599 struct operand_alternative *op_alt;
5600 int offset = 0;
5601 bool win;
5602 int j;
5603
5604 /* A unary operator may be accepted by the predicate, but it
5605 is irrelevant for matching constraints. */
5606 if (UNARY_P (op))
5607 op = XEXP (op, 0);
5608
5609 if (GET_CODE (op) == SUBREG)
5610 {
5611 if (REG_P (SUBREG_REG (op))
5612 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5613 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5614 GET_MODE (SUBREG_REG (op)),
5615 SUBREG_BYTE (op),
5616 GET_MODE (op));
5617 op = SUBREG_REG (op);
5618 }
5619
5620 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5621 continue;
5622
5623 op_alt = recog_op_alt[i];
5624
5625 /* Operand has no constraints, anything is OK. */
5626 win = !recog_data.n_alternatives;
5627
5628 for (j = 0; j < recog_data.n_alternatives; j++)
5629 {
5630 if (op_alt[j].anything_ok
5631 || (op_alt[j].matches != -1
5632 && operands_match_p
5633 (recog_data.operand[i],
5634 recog_data.operand[op_alt[j].matches]))
5635 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5636 {
5637 win = true;
5638 break;
5639 }
5640 }
5641
5642 if (!win)
5643 return false;
5644 }
5645 }
5646
5647 return true;
5648 }
5649 \f
5650 /* Argument support functions. */
5651
5652 /* Return true when register may be used to pass function parameters. */
5653 bool
5654 ix86_function_arg_regno_p (int regno)
5655 {
5656 int i;
5657 const int *parm_regs;
5658
5659 if (!TARGET_64BIT)
5660 {
5661 if (TARGET_MACHO)
5662 return (regno < REGPARM_MAX
5663 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5664 else
5665 return (regno < REGPARM_MAX
5666 || (TARGET_MMX && MMX_REGNO_P (regno)
5667 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5668 || (TARGET_SSE && SSE_REGNO_P (regno)
5669 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5670 }
5671
5672 if (TARGET_MACHO)
5673 {
5674 if (SSE_REGNO_P (regno) && TARGET_SSE)
5675 return true;
5676 }
5677 else
5678 {
5679 if (TARGET_SSE && SSE_REGNO_P (regno)
5680 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5681 return true;
5682 }
5683
5684 /* TODO: The function should depend on the current function's ABI, but
5685 builtins.c would then need updating. Therefore we use the
5686 default ABI. */
5687
5688 /* RAX is used as hidden argument to va_arg functions. */
5689 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5690 return true;
5691
5692 if (ix86_abi == MS_ABI)
5693 parm_regs = x86_64_ms_abi_int_parameter_registers;
5694 else
5695 parm_regs = x86_64_int_parameter_registers;
5696 for (i = 0; i < (ix86_abi == MS_ABI
5697 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5698 if (regno == parm_regs[i])
5699 return true;
5700 return false;
5701 }
5702
5703 /* Return true if we do not know how to pass TYPE solely in registers. */
5704
5705 static bool
5706 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5707 {
5708 if (must_pass_in_stack_var_size_or_pad (mode, type))
5709 return true;
5710
5711 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5712 The layout_type routine is crafty and tries to trick us into passing
5713 currently unsupported vector types on the stack by using TImode. */
5714 return (!TARGET_64BIT && mode == TImode
5715 && type && TREE_CODE (type) != VECTOR_TYPE);
5716 }
5717
5718 /* Return the size, in bytes, of the area reserved for arguments passed
5719 in registers for the function represented by FNDECL, depending on the
5720 ABI used. */
5721 int
5722 ix86_reg_parm_stack_space (const_tree fndecl)
5723 {
5724 enum calling_abi call_abi = SYSV_ABI;
5725 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5726 call_abi = ix86_function_abi (fndecl);
5727 else
5728 call_abi = ix86_function_type_abi (fndecl);
5729 if (TARGET_64BIT && call_abi == MS_ABI)
5730 return 32;
5731 return 0;
5732 }
5733
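/* Illustrative note (editor's addition, not part of GCC): for the 64-bit
   MS ABI the 32 bytes returned above are the "shadow" (home) area the
   caller must reserve for the four register parameters, e.g. a caller
   typically emits "subq $32, %rsp" (plus any stack arguments) before the
   call.  The SYSV ABI reserves no such area.  */
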
5734 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5735 call abi used. */
5736 enum calling_abi
5737 ix86_function_type_abi (const_tree fntype)
5738 {
5739 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5740 {
5741 enum calling_abi abi = ix86_abi;
5742 if (abi == SYSV_ABI)
5743 {
5744 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5745 abi = MS_ABI;
5746 }
5747 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5748 abi = SYSV_ABI;
5749 return abi;
5750 }
5751 return ix86_abi;
5752 }
5753
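/* Illustrative sketch (editor's addition, not part of GCC): on a SYSV
   x86-64 target,

     void __attribute__((ms_abi)) win_call (int a, int b, int c, int d);

   is called with a..d in %rcx, %rdx, %r8 and %r9 (MS_ABI), whereas the
   default SYSV convention would use %rdi, %rsi, %rdx and %rcx.  */
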
5754 static bool
5755 ix86_function_ms_hook_prologue (const_tree fn)
5756 {
5757 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5758 {
5759 if (decl_function_context (fn) != NULL_TREE)
5760 error_at (DECL_SOURCE_LOCATION (fn),
5761 "ms_hook_prologue is not compatible with nested function");
5762 else
5763 return true;
5764 }
5765 return false;
5766 }
5767
5768 static enum calling_abi
5769 ix86_function_abi (const_tree fndecl)
5770 {
5771 if (! fndecl)
5772 return ix86_abi;
5773 return ix86_function_type_abi (TREE_TYPE (fndecl));
5774 }
5775
5776 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5777 call abi used. */
5778 enum calling_abi
5779 ix86_cfun_abi (void)
5780 {
5781 if (! cfun)
5782 return ix86_abi;
5783 return cfun->machine->call_abi;
5784 }
5785
5786 /* Write the extra assembler code needed to declare a function properly. */
5787
5788 void
5789 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5790 tree decl)
5791 {
5792 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5793
5794 if (is_ms_hook)
5795 {
5796 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5797 unsigned int filler_cc = 0xcccccccc;
5798
5799 for (i = 0; i < filler_count; i += 4)
5800 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5801 }
5802
5803 #ifdef SUBTARGET_ASM_UNWIND_INIT
5804 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5805 #endif
5806
5807 ASM_OUTPUT_LABEL (asm_out_file, fname);
5808
5809 /* Output magic byte marker, if hot-patch attribute is set. */
5810 if (is_ms_hook)
5811 {
5812 if (TARGET_64BIT)
5813 {
5814 /* leaq [%rsp + 0], %rsp */
5815 asm_fprintf (asm_out_file, ASM_BYTE
5816 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5817 }
5818 else
5819 {
5820 /* movl.s %edi, %edi
5821 push %ebp
5822 movl.s %esp, %ebp */
5823 asm_fprintf (asm_out_file, ASM_BYTE
5824 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5825 }
5826 }
5827 }
5828
5829 /* regclass.c */
5830 extern void init_regs (void);
5831
5832 /* Implementation of the call ABI switching target hook. The call
5833 register sets specific to FNDECL are selected. See also
5834 ix86_conditional_register_usage for more details. */
5835 void
5836 ix86_call_abi_override (const_tree fndecl)
5837 {
5838 if (fndecl == NULL_TREE)
5839 cfun->machine->call_abi = ix86_abi;
5840 else
5841 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5842 }
5843
5844 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5845 Avoid expensive re-initialization of init_regs each time we switch function
5846 context since this is needed only during RTL expansion. */
5847 static void
5848 ix86_maybe_switch_abi (void)
5849 {
5850 if (TARGET_64BIT &&
5851 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5852 reinit_regs ();
5853 }
5854
5855 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5856 for a call to a function whose data type is FNTYPE.
5857 For a library call, FNTYPE is 0. */
5858
5859 void
5860 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5861 tree fntype, /* tree ptr for function decl */
5862 rtx libname, /* SYMBOL_REF of library name or 0 */
5863 tree fndecl,
5864 int caller)
5865 {
5866 struct cgraph_local_info *i;
5867 tree fnret_type;
5868
5869 memset (cum, 0, sizeof (*cum));
5870
5871 /* Initialize for the current callee. */
5872 if (caller)
5873 {
5874 cfun->machine->callee_pass_avx256_p = false;
5875 cfun->machine->callee_return_avx256_p = false;
5876 }
5877
5878 if (fndecl)
5879 {
5880 i = cgraph_local_info (fndecl);
5881 cum->call_abi = ix86_function_abi (fndecl);
5882 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5883 }
5884 else
5885 {
5886 i = NULL;
5887 cum->call_abi = ix86_function_type_abi (fntype);
5888 if (fntype)
5889 fnret_type = TREE_TYPE (fntype);
5890 else
5891 fnret_type = NULL;
5892 }
5893
5894 if (TARGET_VZEROUPPER && fnret_type)
5895 {
5896 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5897 false);
5898 if (function_pass_avx256_p (fnret_value))
5899 {
5900 /* The return value of this function uses 256bit AVX modes. */
5901 if (caller)
5902 cfun->machine->callee_return_avx256_p = true;
5903 else
5904 cfun->machine->caller_return_avx256_p = true;
5905 }
5906 }
5907
5908 cum->caller = caller;
5909
5910 /* Set up the number of registers to use for passing arguments. */
5911
5912 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5913 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5914 "or subtarget optimization implying it");
5915 cum->nregs = ix86_regparm;
5916 if (TARGET_64BIT)
5917 {
5918 cum->nregs = (cum->call_abi == SYSV_ABI
5919 ? X86_64_REGPARM_MAX
5920 : X86_64_MS_REGPARM_MAX);
5921 }
5922 if (TARGET_SSE)
5923 {
5924 cum->sse_nregs = SSE_REGPARM_MAX;
5925 if (TARGET_64BIT)
5926 {
5927 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5928 ? X86_64_SSE_REGPARM_MAX
5929 : X86_64_MS_SSE_REGPARM_MAX);
5930 }
5931 }
5932 if (TARGET_MMX)
5933 cum->mmx_nregs = MMX_REGPARM_MAX;
5934 cum->warn_avx = true;
5935 cum->warn_sse = true;
5936 cum->warn_mmx = true;
5937
5938 /* Because types might mismatch between the caller and the callee, we need
5939 to use the actual type of the function for local calls.
5940 FIXME: cgraph_analyze can be told to actually record if a function uses
5941 va_start, so for local functions maybe_vaarg can be made more aggressive,
5942 helping K&R code.
5943 FIXME: once the type system is fixed, we won't need this code anymore. */
5944 if (i && i->local && i->can_change_signature)
5945 fntype = TREE_TYPE (fndecl);
5946 cum->maybe_vaarg = (fntype
5947 ? (!prototype_p (fntype) || stdarg_p (fntype))
5948 : !libname);
5949
5950 if (!TARGET_64BIT)
5951 {
5952 /* If there are variable arguments, then we won't pass anything
5953 in registers in 32-bit mode. */
5954 if (stdarg_p (fntype))
5955 {
5956 cum->nregs = 0;
5957 cum->sse_nregs = 0;
5958 cum->mmx_nregs = 0;
5959 cum->warn_avx = 0;
5960 cum->warn_sse = 0;
5961 cum->warn_mmx = 0;
5962 return;
5963 }
5964
5965 /* Use ecx and edx registers if function has fastcall attribute,
5966 else look for regparm information. */
5967 if (fntype)
5968 {
5969 unsigned int ccvt = ix86_get_callcvt (fntype);
5970 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5971 {
5972 cum->nregs = 1;
5973 cum->fastcall = 1; /* Same first register as in fastcall. */
5974 }
5975 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5976 {
5977 cum->nregs = 2;
5978 cum->fastcall = 1;
5979 }
5980 else
5981 cum->nregs = ix86_function_regparm (fntype, fndecl);
5982 }
5983
5984 /* Set up the number of SSE registers used for passing SFmode
5985 and DFmode arguments. Warn for mismatching ABI. */
5986 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5987 }
5988 }
5989
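/* Illustrative note (editor's addition, not part of GCC): in 32-bit mode a
   variadic prototype such as

     int log_msg (const char *fmt, ...);

   gets cum->nregs == 0 above, so everything goes on the stack, while a
   fastcall prototype gets cum->nregs == 2 and a thiscall prototype
   cum->nregs == 1.  */
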
5990 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5991 But in the case of vector types, it is some vector mode.
5992
5993 When we have only some of our vector isa extensions enabled, then there
5994 are some modes for which vector_mode_supported_p is false. For these
5995 modes, the generic vector support in gcc will choose some non-vector mode
5996 in order to implement the type. By computing the natural mode, we'll
5997 select the proper ABI location for the operand and not depend on whatever
5998 the middle-end decides to do with these vector types.
5999
6000 The middle-end can't deal with vector types larger than 16 bytes. In this
6001 case, we return the original mode and warn about the ABI change if CUM isn't
6002 NULL. */
6003
6004 static enum machine_mode
6005 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6006 {
6007 enum machine_mode mode = TYPE_MODE (type);
6008
6009 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6010 {
6011 HOST_WIDE_INT size = int_size_in_bytes (type);
6012 if ((size == 8 || size == 16 || size == 32)
6013 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6014 && TYPE_VECTOR_SUBPARTS (type) > 1)
6015 {
6016 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6017
6018 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6019 mode = MIN_MODE_VECTOR_FLOAT;
6020 else
6021 mode = MIN_MODE_VECTOR_INT;
6022
6023 /* Get the mode which has this inner mode and number of units. */
6024 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6025 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6026 && GET_MODE_INNER (mode) == innermode)
6027 {
6028 if (size == 32 && !TARGET_AVX)
6029 {
6030 static bool warnedavx;
6031
6032 if (cum
6033 && !warnedavx
6034 && cum->warn_avx)
6035 {
6036 warnedavx = true;
6037 warning (0, "AVX vector argument without AVX "
6038 "enabled changes the ABI");
6039 }
6040 return TYPE_MODE (type);
6041 }
6042 else if ((size == 8 || size == 16) && !TARGET_SSE)
6043 {
6044 static bool warnedsse;
6045
6046 if (cum
6047 && !warnedsse
6048 && cum->warn_sse)
6049 {
6050 warnedsse = true;
6051 warning (0, "SSE vector argument without SSE "
6052 "enabled changes the ABI");
6053 }
6054 return mode;
6055 }
6056 else
6057 return mode;
6058 }
6059
6060 gcc_unreachable ();
6061 }
6062 }
6063
6064 return mode;
6065 }
6066
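/* Illustrative sketch (editor's addition, not part of GCC):

     typedef int v8si __attribute__((vector_size (32)));
     void use (v8si v);

   compiled without -mavx triggers the "AVX vector argument without AVX
   enabled changes the ABI" warning above, and the argument keeps the mode
   the middle-end chose for the type instead of V8SImode.  */
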
6067 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6068 this may not agree with the mode that the type system has chosen for the
6069 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6070 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6071
6072 static rtx
6073 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6074 unsigned int regno)
6075 {
6076 rtx tmp;
6077
6078 if (orig_mode != BLKmode)
6079 tmp = gen_rtx_REG (orig_mode, regno);
6080 else
6081 {
6082 tmp = gen_rtx_REG (mode, regno);
6083 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6084 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6085 }
6086
6087 return tmp;
6088 }
6089
6090 /* x86-64 register passing implementation. See the x86-64 ABI for details.
6091 The goal of this code is to classify each 8 bytes of an incoming argument
6092 by register class and assign registers accordingly. */
6093
6094 /* Return the union class of CLASS1 and CLASS2.
6095 See the x86-64 PS ABI for details. */
6096
6097 static enum x86_64_reg_class
6098 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6099 {
6100 /* Rule #1: If both classes are equal, this is the resulting class. */
6101 if (class1 == class2)
6102 return class1;
6103
6104 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6105 the other class. */
6106 if (class1 == X86_64_NO_CLASS)
6107 return class2;
6108 if (class2 == X86_64_NO_CLASS)
6109 return class1;
6110
6111 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6112 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6113 return X86_64_MEMORY_CLASS;
6114
6115 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6116 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6117 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6118 return X86_64_INTEGERSI_CLASS;
6119 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6120 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6121 return X86_64_INTEGER_CLASS;
6122
6123 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6124 MEMORY is used. */
6125 if (class1 == X86_64_X87_CLASS
6126 || class1 == X86_64_X87UP_CLASS
6127 || class1 == X86_64_COMPLEX_X87_CLASS
6128 || class2 == X86_64_X87_CLASS
6129 || class2 == X86_64_X87UP_CLASS
6130 || class2 == X86_64_COMPLEX_X87_CLASS)
6131 return X86_64_MEMORY_CLASS;
6132
6133 /* Rule #6: Otherwise class SSE is used. */
6134 return X86_64_SSE_CLASS;
6135 }
6136
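/* Illustrative note (editor's addition, not part of GCC): for

     struct s { int i; float f; };

   both fields fall in the same eightbyte; the int yields
   X86_64_INTEGERSI_CLASS, the float at offset 4 yields X86_64_SSE_CLASS,
   and rule #4 above merges them to X86_64_INTEGER_CLASS, so the whole
   struct is passed in a general purpose register.  */
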
6137 /* Classify the argument of type TYPE and mode MODE.
6138 CLASSES will be filled by the register class used to pass each word
6139 of the operand. The number of words is returned. In case the parameter
6140 should be passed in memory, 0 is returned. As a special case for zero
6141 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6142
6143 BIT_OFFSET is used internally for handling records and specifies the
6144 offset in bits modulo 256, to avoid overflow cases.
6145
6146 See the x86-64 PS ABI for details.
6147 */
6148
6149 static int
6150 classify_argument (enum machine_mode mode, const_tree type,
6151 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6152 {
6153 HOST_WIDE_INT bytes =
6154 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6155 int words
6156 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6157
6158 /* Variable sized entities are always passed/returned in memory. */
6159 if (bytes < 0)
6160 return 0;
6161
6162 if (mode != VOIDmode
6163 && targetm.calls.must_pass_in_stack (mode, type))
6164 return 0;
6165
6166 if (type && AGGREGATE_TYPE_P (type))
6167 {
6168 int i;
6169 tree field;
6170 enum x86_64_reg_class subclasses[MAX_CLASSES];
6171
6172 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6173 if (bytes > 32)
6174 return 0;
6175
6176 for (i = 0; i < words; i++)
6177 classes[i] = X86_64_NO_CLASS;
6178
6179 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
6180 signal the memory class, so handle it as a special case. */
6181 if (!words)
6182 {
6183 classes[0] = X86_64_NO_CLASS;
6184 return 1;
6185 }
6186
6187 /* Classify each field of record and merge classes. */
6188 switch (TREE_CODE (type))
6189 {
6190 case RECORD_TYPE:
6191 /* And now merge the fields of structure. */
6192 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6193 {
6194 if (TREE_CODE (field) == FIELD_DECL)
6195 {
6196 int num;
6197
6198 if (TREE_TYPE (field) == error_mark_node)
6199 continue;
6200
6201 /* Bitfields are always classified as integer. Handle them
6202 early, since later code would consider them to be
6203 misaligned integers. */
6204 if (DECL_BIT_FIELD (field))
6205 {
6206 for (i = (int_bit_position (field)
6207 + (bit_offset % 64)) / 8 / 8;
6208 i < ((int_bit_position (field) + (bit_offset % 64))
6209 + tree_low_cst (DECL_SIZE (field), 0)
6210 + 63) / 8 / 8; i++)
6211 classes[i] =
6212 merge_classes (X86_64_INTEGER_CLASS,
6213 classes[i]);
6214 }
6215 else
6216 {
6217 int pos;
6218
6219 type = TREE_TYPE (field);
6220
6221 /* Flexible array member is ignored. */
6222 if (TYPE_MODE (type) == BLKmode
6223 && TREE_CODE (type) == ARRAY_TYPE
6224 && TYPE_SIZE (type) == NULL_TREE
6225 && TYPE_DOMAIN (type) != NULL_TREE
6226 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6227 == NULL_TREE))
6228 {
6229 static bool warned;
6230
6231 if (!warned && warn_psabi)
6232 {
6233 warned = true;
6234 inform (input_location,
6235 "the ABI of passing struct with"
6236 " a flexible array member has"
6237 " changed in GCC 4.4");
6238 }
6239 continue;
6240 }
6241 num = classify_argument (TYPE_MODE (type), type,
6242 subclasses,
6243 (int_bit_position (field)
6244 + bit_offset) % 256);
6245 if (!num)
6246 return 0;
6247 pos = (int_bit_position (field)
6248 + (bit_offset % 64)) / 8 / 8;
6249 for (i = 0; i < num && (i + pos) < words; i++)
6250 classes[i + pos] =
6251 merge_classes (subclasses[i], classes[i + pos]);
6252 }
6253 }
6254 }
6255 break;
6256
6257 case ARRAY_TYPE:
6258 /* Arrays are handled as small records. */
6259 {
6260 int num;
6261 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6262 TREE_TYPE (type), subclasses, bit_offset);
6263 if (!num)
6264 return 0;
6265
6266 /* The partial classes are now full classes. */
6267 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6268 subclasses[0] = X86_64_SSE_CLASS;
6269 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6270 && !((bit_offset % 64) == 0 && bytes == 4))
6271 subclasses[0] = X86_64_INTEGER_CLASS;
6272
6273 for (i = 0; i < words; i++)
6274 classes[i] = subclasses[i % num];
6275
6276 break;
6277 }
6278 case UNION_TYPE:
6279 case QUAL_UNION_TYPE:
6280 /* Unions are similar to RECORD_TYPE but offset is always 0.
6281 */
6282 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6283 {
6284 if (TREE_CODE (field) == FIELD_DECL)
6285 {
6286 int num;
6287
6288 if (TREE_TYPE (field) == error_mark_node)
6289 continue;
6290
6291 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6292 TREE_TYPE (field), subclasses,
6293 bit_offset);
6294 if (!num)
6295 return 0;
6296 for (i = 0; i < num; i++)
6297 classes[i] = merge_classes (subclasses[i], classes[i]);
6298 }
6299 }
6300 break;
6301
6302 default:
6303 gcc_unreachable ();
6304 }
6305
6306 if (words > 2)
6307 {
6308 /* When the size is > 16 bytes, if the first class isn't
6309 X86_64_SSE_CLASS or any of the other classes aren't
6310 X86_64_SSEUP_CLASS, everything should be passed in
6311 memory. */
6312 if (classes[0] != X86_64_SSE_CLASS)
6313 return 0;
6314
6315 for (i = 1; i < words; i++)
6316 if (classes[i] != X86_64_SSEUP_CLASS)
6317 return 0;
6318 }
6319
6320 /* Final merger cleanup. */
6321 for (i = 0; i < words; i++)
6322 {
6323 /* If one class is MEMORY, everything should be passed in
6324 memory. */
6325 if (classes[i] == X86_64_MEMORY_CLASS)
6326 return 0;
6327
6328 /* The X86_64_SSEUP_CLASS should be always preceded by
6329 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6330 if (classes[i] == X86_64_SSEUP_CLASS
6331 && classes[i - 1] != X86_64_SSE_CLASS
6332 && classes[i - 1] != X86_64_SSEUP_CLASS)
6333 {
6334 /* The first one should never be X86_64_SSEUP_CLASS. */
6335 gcc_assert (i != 0);
6336 classes[i] = X86_64_SSE_CLASS;
6337 }
6338
6339 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6340 everything should be passed in memory. */
6341 if (classes[i] == X86_64_X87UP_CLASS
6342 && (classes[i - 1] != X86_64_X87_CLASS))
6343 {
6344 static bool warned;
6345
6346 /* The first one should never be X86_64_X87UP_CLASS. */
6347 gcc_assert (i != 0);
6348 if (!warned && warn_psabi)
6349 {
6350 warned = true;
6351 inform (input_location,
6352 "the ABI of passing union with long double"
6353 " has changed in GCC 4.4");
6354 }
6355 return 0;
6356 }
6357 }
6358 return words;
6359 }
6360
6361 /* Compute alignment needed. We align all types to natural boundaries, with
6362 the exception of XFmode, which is aligned to 64 bits. */
6363 if (mode != VOIDmode && mode != BLKmode)
6364 {
6365 int mode_alignment = GET_MODE_BITSIZE (mode);
6366
6367 if (mode == XFmode)
6368 mode_alignment = 128;
6369 else if (mode == XCmode)
6370 mode_alignment = 256;
6371 if (COMPLEX_MODE_P (mode))
6372 mode_alignment /= 2;
6373 /* Misaligned fields are always returned in memory. */
6374 if (bit_offset % mode_alignment)
6375 return 0;
6376 }
6377
6378 /* For V1xx modes, just use the base mode. */
6379 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6380 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6381 mode = GET_MODE_INNER (mode);
6382
6383 /* Classification of atomic types. */
6384 switch (mode)
6385 {
6386 case SDmode:
6387 case DDmode:
6388 classes[0] = X86_64_SSE_CLASS;
6389 return 1;
6390 case TDmode:
6391 classes[0] = X86_64_SSE_CLASS;
6392 classes[1] = X86_64_SSEUP_CLASS;
6393 return 2;
6394 case DImode:
6395 case SImode:
6396 case HImode:
6397 case QImode:
6398 case CSImode:
6399 case CHImode:
6400 case CQImode:
6401 {
6402 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6403
6404 if (size <= 32)
6405 {
6406 classes[0] = X86_64_INTEGERSI_CLASS;
6407 return 1;
6408 }
6409 else if (size <= 64)
6410 {
6411 classes[0] = X86_64_INTEGER_CLASS;
6412 return 1;
6413 }
6414 else if (size <= 64+32)
6415 {
6416 classes[0] = X86_64_INTEGER_CLASS;
6417 classes[1] = X86_64_INTEGERSI_CLASS;
6418 return 2;
6419 }
6420 else if (size <= 64+64)
6421 {
6422 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6423 return 2;
6424 }
6425 else
6426 gcc_unreachable ();
6427 }
6428 case CDImode:
6429 case TImode:
6430 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6431 return 2;
6432 case COImode:
6433 case OImode:
6434 /* OImode shouldn't be used directly. */
6435 gcc_unreachable ();
6436 case CTImode:
6437 return 0;
6438 case SFmode:
6439 if (!(bit_offset % 64))
6440 classes[0] = X86_64_SSESF_CLASS;
6441 else
6442 classes[0] = X86_64_SSE_CLASS;
6443 return 1;
6444 case DFmode:
6445 classes[0] = X86_64_SSEDF_CLASS;
6446 return 1;
6447 case XFmode:
6448 classes[0] = X86_64_X87_CLASS;
6449 classes[1] = X86_64_X87UP_CLASS;
6450 return 2;
6451 case TFmode:
6452 classes[0] = X86_64_SSE_CLASS;
6453 classes[1] = X86_64_SSEUP_CLASS;
6454 return 2;
6455 case SCmode:
6456 classes[0] = X86_64_SSE_CLASS;
6457 if (!(bit_offset % 64))
6458 return 1;
6459 else
6460 {
6461 static bool warned;
6462
6463 if (!warned && warn_psabi)
6464 {
6465 warned = true;
6466 inform (input_location,
6467 "the ABI of passing structure with complex float"
6468 " member has changed in GCC 4.4");
6469 }
6470 classes[1] = X86_64_SSESF_CLASS;
6471 return 2;
6472 }
6473 case DCmode:
6474 classes[0] = X86_64_SSEDF_CLASS;
6475 classes[1] = X86_64_SSEDF_CLASS;
6476 return 2;
6477 case XCmode:
6478 classes[0] = X86_64_COMPLEX_X87_CLASS;
6479 return 1;
6480 case TCmode:
6481 /* This mode is larger than 16 bytes. */
6482 return 0;
6483 case V8SFmode:
6484 case V8SImode:
6485 case V32QImode:
6486 case V16HImode:
6487 case V4DFmode:
6488 case V4DImode:
6489 classes[0] = X86_64_SSE_CLASS;
6490 classes[1] = X86_64_SSEUP_CLASS;
6491 classes[2] = X86_64_SSEUP_CLASS;
6492 classes[3] = X86_64_SSEUP_CLASS;
6493 return 4;
6494 case V4SFmode:
6495 case V4SImode:
6496 case V16QImode:
6497 case V8HImode:
6498 case V2DFmode:
6499 case V2DImode:
6500 classes[0] = X86_64_SSE_CLASS;
6501 classes[1] = X86_64_SSEUP_CLASS;
6502 return 2;
6503 case V1TImode:
6504 case V1DImode:
6505 case V2SFmode:
6506 case V2SImode:
6507 case V4HImode:
6508 case V8QImode:
6509 classes[0] = X86_64_SSE_CLASS;
6510 return 1;
6511 case BLKmode:
6512 case VOIDmode:
6513 return 0;
6514 default:
6515 gcc_assert (VECTOR_MODE_P (mode));
6516
6517 if (bytes > 16)
6518 return 0;
6519
6520 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6521
6522 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6523 classes[0] = X86_64_INTEGERSI_CLASS;
6524 else
6525 classes[0] = X86_64_INTEGER_CLASS;
6526 classes[1] = X86_64_INTEGER_CLASS;
6527 return 1 + (bytes > 8);
6528 }
6529 }
6530
6531 /* Examine the argument and set the number of registers required in each
6532 class. Return 0 iff the parameter should be passed in memory. */
6533 static int
6534 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6535 int *int_nregs, int *sse_nregs)
6536 {
6537 enum x86_64_reg_class regclass[MAX_CLASSES];
6538 int n = classify_argument (mode, type, regclass, 0);
6539
6540 *int_nregs = 0;
6541 *sse_nregs = 0;
6542 if (!n)
6543 return 0;
6544 for (n--; n >= 0; n--)
6545 switch (regclass[n])
6546 {
6547 case X86_64_INTEGER_CLASS:
6548 case X86_64_INTEGERSI_CLASS:
6549 (*int_nregs)++;
6550 break;
6551 case X86_64_SSE_CLASS:
6552 case X86_64_SSESF_CLASS:
6553 case X86_64_SSEDF_CLASS:
6554 (*sse_nregs)++;
6555 break;
6556 case X86_64_NO_CLASS:
6557 case X86_64_SSEUP_CLASS:
6558 break;
6559 case X86_64_X87_CLASS:
6560 case X86_64_X87UP_CLASS:
6561 if (!in_return)
6562 return 0;
6563 break;
6564 case X86_64_COMPLEX_X87_CLASS:
6565 return in_return ? 2 : 0;
6566 case X86_64_MEMORY_CLASS:
6567 gcc_unreachable ();
6568 }
6569 return 1;
6570 }
6571
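/* Illustrative note (editor's addition, not part of GCC): for

     struct p { long l; double d; };

   classify_argument yields X86_64_INTEGER_CLASS and X86_64_SSEDF_CLASS,
   so examine_argument reports one integer and one SSE register; as an
   argument the struct is passed in %rdi and %xmm0.  */
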
6572 /* Construct container for the argument used by GCC interface. See
6573 FUNCTION_ARG for the detailed description. */
6574
6575 static rtx
6576 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6577 const_tree type, int in_return, int nintregs, int nsseregs,
6578 const int *intreg, int sse_regno)
6579 {
6580 /* The following variables hold the static issued_error state. */
6581 static bool issued_sse_arg_error;
6582 static bool issued_sse_ret_error;
6583 static bool issued_x87_ret_error;
6584
6585 enum machine_mode tmpmode;
6586 int bytes =
6587 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6588 enum x86_64_reg_class regclass[MAX_CLASSES];
6589 int n;
6590 int i;
6591 int nexps = 0;
6592 int needed_sseregs, needed_intregs;
6593 rtx exp[MAX_CLASSES];
6594 rtx ret;
6595
6596 n = classify_argument (mode, type, regclass, 0);
6597 if (!n)
6598 return NULL;
6599 if (!examine_argument (mode, type, in_return, &needed_intregs,
6600 &needed_sseregs))
6601 return NULL;
6602 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6603 return NULL;
6604
6605 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6606 some less clueful developer tries to use floating-point anyway. */
6607 if (needed_sseregs && !TARGET_SSE)
6608 {
6609 if (in_return)
6610 {
6611 if (!issued_sse_ret_error)
6612 {
6613 error ("SSE register return with SSE disabled");
6614 issued_sse_ret_error = true;
6615 }
6616 }
6617 else if (!issued_sse_arg_error)
6618 {
6619 error ("SSE register argument with SSE disabled");
6620 issued_sse_arg_error = true;
6621 }
6622 return NULL;
6623 }
6624
6625 /* Likewise, error if the ABI requires us to return values in the
6626 x87 registers and the user specified -mno-80387. */
6627 if (!TARGET_80387 && in_return)
6628 for (i = 0; i < n; i++)
6629 if (regclass[i] == X86_64_X87_CLASS
6630 || regclass[i] == X86_64_X87UP_CLASS
6631 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6632 {
6633 if (!issued_x87_ret_error)
6634 {
6635 error ("x87 register return with x87 disabled");
6636 issued_x87_ret_error = true;
6637 }
6638 return NULL;
6639 }
6640
6641 /* First construct simple cases. Avoid SCmode, since we want to use
6642 a single register to pass this type. */
6643 if (n == 1 && mode != SCmode)
6644 switch (regclass[0])
6645 {
6646 case X86_64_INTEGER_CLASS:
6647 case X86_64_INTEGERSI_CLASS:
6648 return gen_rtx_REG (mode, intreg[0]);
6649 case X86_64_SSE_CLASS:
6650 case X86_64_SSESF_CLASS:
6651 case X86_64_SSEDF_CLASS:
6652 if (mode != BLKmode)
6653 return gen_reg_or_parallel (mode, orig_mode,
6654 SSE_REGNO (sse_regno));
6655 break;
6656 case X86_64_X87_CLASS:
6657 case X86_64_COMPLEX_X87_CLASS:
6658 return gen_rtx_REG (mode, FIRST_STACK_REG);
6659 case X86_64_NO_CLASS:
6660 /* Zero sized array, struct or class. */
6661 return NULL;
6662 default:
6663 gcc_unreachable ();
6664 }
6665 if (n == 2
6666 && regclass[0] == X86_64_SSE_CLASS
6667 && regclass[1] == X86_64_SSEUP_CLASS
6668 && mode != BLKmode)
6669 return gen_reg_or_parallel (mode, orig_mode,
6670 SSE_REGNO (sse_regno));
6671 if (n == 4
6672 && regclass[0] == X86_64_SSE_CLASS
6673 && regclass[1] == X86_64_SSEUP_CLASS
6674 && regclass[2] == X86_64_SSEUP_CLASS
6675 && regclass[3] == X86_64_SSEUP_CLASS
6676 && mode != BLKmode)
6677 return gen_reg_or_parallel (mode, orig_mode,
6678 SSE_REGNO (sse_regno));
6679 if (n == 2
6680 && regclass[0] == X86_64_X87_CLASS
6681 && regclass[1] == X86_64_X87UP_CLASS)
6682 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6683
6684 if (n == 2
6685 && regclass[0] == X86_64_INTEGER_CLASS
6686 && regclass[1] == X86_64_INTEGER_CLASS
6687 && (mode == CDImode || mode == TImode || mode == TFmode)
6688 && intreg[0] + 1 == intreg[1])
6689 return gen_rtx_REG (mode, intreg[0]);
6690
6691 /* Otherwise figure out the entries of the PARALLEL. */
6692 for (i = 0; i < n; i++)
6693 {
6694 int pos;
6695
6696 switch (regclass[i])
6697 {
6698 case X86_64_NO_CLASS:
6699 break;
6700 case X86_64_INTEGER_CLASS:
6701 case X86_64_INTEGERSI_CLASS:
6702 /* Merge TImodes on aligned occasions here too. */
6703 if (i * 8 + 8 > bytes)
6704 tmpmode
6705 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6706 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6707 tmpmode = SImode;
6708 else
6709 tmpmode = DImode;
6710 /* We've requested 24 bytes we
6711 don't have a mode for. Use DImode. */
6712 if (tmpmode == BLKmode)
6713 tmpmode = DImode;
6714 exp [nexps++]
6715 = gen_rtx_EXPR_LIST (VOIDmode,
6716 gen_rtx_REG (tmpmode, *intreg),
6717 GEN_INT (i*8));
6718 intreg++;
6719 break;
6720 case X86_64_SSESF_CLASS:
6721 exp [nexps++]
6722 = gen_rtx_EXPR_LIST (VOIDmode,
6723 gen_rtx_REG (SFmode,
6724 SSE_REGNO (sse_regno)),
6725 GEN_INT (i*8));
6726 sse_regno++;
6727 break;
6728 case X86_64_SSEDF_CLASS:
6729 exp [nexps++]
6730 = gen_rtx_EXPR_LIST (VOIDmode,
6731 gen_rtx_REG (DFmode,
6732 SSE_REGNO (sse_regno)),
6733 GEN_INT (i*8));
6734 sse_regno++;
6735 break;
6736 case X86_64_SSE_CLASS:
6737 pos = i;
6738 switch (n)
6739 {
6740 case 1:
6741 tmpmode = DImode;
6742 break;
6743 case 2:
6744 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6745 {
6746 tmpmode = TImode;
6747 i++;
6748 }
6749 else
6750 tmpmode = DImode;
6751 break;
6752 case 4:
6753 gcc_assert (i == 0
6754 && regclass[1] == X86_64_SSEUP_CLASS
6755 && regclass[2] == X86_64_SSEUP_CLASS
6756 && regclass[3] == X86_64_SSEUP_CLASS);
6757 tmpmode = OImode;
6758 i += 3;
6759 break;
6760 default:
6761 gcc_unreachable ();
6762 }
6763 exp [nexps++]
6764 = gen_rtx_EXPR_LIST (VOIDmode,
6765 gen_rtx_REG (tmpmode,
6766 SSE_REGNO (sse_regno)),
6767 GEN_INT (pos*8));
6768 sse_regno++;
6769 break;
6770 default:
6771 gcc_unreachable ();
6772 }
6773 }
6774
6775 /* Empty aligned struct, union or class. */
6776 if (nexps == 0)
6777 return NULL;
6778
6779 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6780 for (i = 0; i < nexps; i++)
6781 XVECEXP (ret, 0, i) = exp [i];
6782 return ret;
6783 }
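
/* For illustration (a rough sketch): for the hypothetical
   struct { double d; long l; } mentioned above, none of the simple cases
   match, so the PARALLEL path builds roughly

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di)   (const_int 8))])

   assuming it is the first argument, i.e. sse_regno and intreg still
   point at the first available registers.  */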
6784
6785 /* Update the data in CUM to advance over an argument of mode MODE
6786 and data type TYPE. (TYPE is null for libcalls where that information
6787 may not be available.) */
6788
6789 static void
6790 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6791 const_tree type, HOST_WIDE_INT bytes,
6792 HOST_WIDE_INT words)
6793 {
6794 switch (mode)
6795 {
6796 default:
6797 break;
6798
6799 case BLKmode:
6800 if (bytes < 0)
6801 break;
6802 /* FALLTHRU */
6803
6804 case DImode:
6805 case SImode:
6806 case HImode:
6807 case QImode:
6808 cum->words += words;
6809 cum->nregs -= words;
6810 cum->regno += words;
6811
6812 if (cum->nregs <= 0)
6813 {
6814 cum->nregs = 0;
6815 cum->regno = 0;
6816 }
6817 break;
6818
6819 case OImode:
6820 /* OImode shouldn't be used directly. */
6821 gcc_unreachable ();
6822
6823 case DFmode:
6824 if (cum->float_in_sse < 2)
6825 break;
6826 case SFmode:
6827 if (cum->float_in_sse < 1)
6828 break;
6829 /* FALLTHRU */
6830
6831 case V8SFmode:
6832 case V8SImode:
6833 case V32QImode:
6834 case V16HImode:
6835 case V4DFmode:
6836 case V4DImode:
6837 case TImode:
6838 case V16QImode:
6839 case V8HImode:
6840 case V4SImode:
6841 case V2DImode:
6842 case V4SFmode:
6843 case V2DFmode:
6844 if (!type || !AGGREGATE_TYPE_P (type))
6845 {
6846 cum->sse_words += words;
6847 cum->sse_nregs -= 1;
6848 cum->sse_regno += 1;
6849 if (cum->sse_nregs <= 0)
6850 {
6851 cum->sse_nregs = 0;
6852 cum->sse_regno = 0;
6853 }
6854 }
6855 break;
6856
6857 case V8QImode:
6858 case V4HImode:
6859 case V2SImode:
6860 case V2SFmode:
6861 case V1TImode:
6862 case V1DImode:
6863 if (!type || !AGGREGATE_TYPE_P (type))
6864 {
6865 cum->mmx_words += words;
6866 cum->mmx_nregs -= 1;
6867 cum->mmx_regno += 1;
6868 if (cum->mmx_nregs <= 0)
6869 {
6870 cum->mmx_nregs = 0;
6871 cum->mmx_regno = 0;
6872 }
6873 }
6874 break;
6875 }
6876 }
6877
6878 static void
6879 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6880 const_tree type, HOST_WIDE_INT words, bool named)
6881 {
6882 int int_nregs, sse_nregs;
6883
6884 /* Unnamed 256bit vector mode parameters are passed on stack. */
6885 if (!named && VALID_AVX256_REG_MODE (mode))
6886 return;
6887
6888 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6889 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6890 {
6891 cum->nregs -= int_nregs;
6892 cum->sse_nregs -= sse_nregs;
6893 cum->regno += int_nregs;
6894 cum->sse_regno += sse_nregs;
6895 }
6896 else
6897 {
6898 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6899 cum->words = (cum->words + align - 1) & ~(align - 1);
6900 cum->words += words;
6901 }
6902 }
6903
6904 static void
6905 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6906 HOST_WIDE_INT words)
6907 {
6908 /* Otherwise, this should be passed indirectly. */
6909 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6910
6911 cum->words += words;
6912 if (cum->nregs > 0)
6913 {
6914 cum->nregs -= 1;
6915 cum->regno += 1;
6916 }
6917 }
6918
6919 /* Update the data in CUM to advance over an argument of mode MODE and
6920 data type TYPE. (TYPE is null for libcalls where that information
6921 may not be available.) */
6922
6923 static void
6924 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6925 const_tree type, bool named)
6926 {
6927 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6928 HOST_WIDE_INT bytes, words;
6929
6930 if (mode == BLKmode)
6931 bytes = int_size_in_bytes (type);
6932 else
6933 bytes = GET_MODE_SIZE (mode);
6934 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6935
6936 if (type)
6937 mode = type_natural_mode (type, NULL);
6938
6939 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6940 function_arg_advance_ms_64 (cum, bytes, words);
6941 else if (TARGET_64BIT)
6942 function_arg_advance_64 (cum, mode, type, words, named);
6943 else
6944 function_arg_advance_32 (cum, mode, type, bytes, words);
6945 }
6946
6947 /* Define where to put the arguments to a function.
6948 Value is zero to push the argument on the stack,
6949 or a hard register in which to store the argument.
6950
6951 MODE is the argument's machine mode.
6952 TYPE is the data type of the argument (as a tree).
6953 This is null for libcalls where that information may
6954 not be available.
6955 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6956 the preceding args and about the function being called.
6957 NAMED is nonzero if this argument is a named parameter
6958 (otherwise it is an extra parameter matching an ellipsis). */
6959
6960 static rtx
6961 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6962 enum machine_mode orig_mode, const_tree type,
6963 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6964 {
6965 static bool warnedsse, warnedmmx;
6966
6967 /* Avoid the AL settings for the Unix64 ABI. */
6968 if (mode == VOIDmode)
6969 return constm1_rtx;
6970
6971 switch (mode)
6972 {
6973 default:
6974 break;
6975
6976 case BLKmode:
6977 if (bytes < 0)
6978 break;
6979 /* FALLTHRU */
6980 case DImode:
6981 case SImode:
6982 case HImode:
6983 case QImode:
6984 if (words <= cum->nregs)
6985 {
6986 int regno = cum->regno;
6987
6988 /* Fastcall allocates the first two DWORD (SImode) or
6989 smaller arguments to ECX and EDX if it isn't an
6990 aggregate type. */
6991 if (cum->fastcall)
6992 {
6993 if (mode == BLKmode
6994 || mode == DImode
6995 || (type && AGGREGATE_TYPE_P (type)))
6996 break;
6997
6998 /* ECX, not EAX, is the first allocated register. */
6999 if (regno == AX_REG)
7000 regno = CX_REG;
7001 }
7002 return gen_rtx_REG (mode, regno);
7003 }
7004 break;
7005
7006 case DFmode:
7007 if (cum->float_in_sse < 2)
7008 break;
7009 case SFmode:
7010 if (cum->float_in_sse < 1)
7011 break;
7012 /* FALLTHRU */
7013 case TImode:
7014 /* In 32bit, we pass TImode in xmm registers. */
7015 case V16QImode:
7016 case V8HImode:
7017 case V4SImode:
7018 case V2DImode:
7019 case V4SFmode:
7020 case V2DFmode:
7021 if (!type || !AGGREGATE_TYPE_P (type))
7022 {
7023 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7024 {
7025 warnedsse = true;
7026 warning (0, "SSE vector argument without SSE enabled "
7027 "changes the ABI");
7028 }
7029 if (cum->sse_nregs)
7030 return gen_reg_or_parallel (mode, orig_mode,
7031 cum->sse_regno + FIRST_SSE_REG);
7032 }
7033 break;
7034
7035 case OImode:
7036 /* OImode shouldn't be used directly. */
7037 gcc_unreachable ();
7038
7039 case V8SFmode:
7040 case V8SImode:
7041 case V32QImode:
7042 case V16HImode:
7043 case V4DFmode:
7044 case V4DImode:
7045 if (!type || !AGGREGATE_TYPE_P (type))
7046 {
7047 if (cum->sse_nregs)
7048 return gen_reg_or_parallel (mode, orig_mode,
7049 cum->sse_regno + FIRST_SSE_REG);
7050 }
7051 break;
7052
7053 case V8QImode:
7054 case V4HImode:
7055 case V2SImode:
7056 case V2SFmode:
7057 case V1TImode:
7058 case V1DImode:
7059 if (!type || !AGGREGATE_TYPE_P (type))
7060 {
7061 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7062 {
7063 warnedmmx = true;
7064 warning (0, "MMX vector argument without MMX enabled "
7065 "changes the ABI");
7066 }
7067 if (cum->mmx_nregs)
7068 return gen_reg_or_parallel (mode, orig_mode,
7069 cum->mmx_regno + FIRST_MMX_REG);
7070 }
7071 break;
7072 }
7073
7074 return NULL_RTX;
7075 }
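
/* For illustration (a rough sketch; the declaration is hypothetical user
   code): with the 32-bit rules above,

       void __attribute__ ((fastcall)) f (int a, int b, int c);

   receives A in %ecx and B in %edx while C goes on the stack, and a
   non-aggregate __m128 argument is handed to an XMM register as long as
   cum->sse_nregs still has room (with a one-time warning if SSE is
   disabled).  */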
7076
7077 static rtx
7078 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7079 enum machine_mode orig_mode, const_tree type, bool named)
7080 {
7081 /* Handle a hidden AL argument containing the number of registers
7082 for varargs x86-64 functions. */
7083 if (mode == VOIDmode)
7084 return GEN_INT (cum->maybe_vaarg
7085 ? (cum->sse_nregs < 0
7086 ? X86_64_SSE_REGPARM_MAX
7087 : cum->sse_regno)
7088 : -1);
7089
7090 switch (mode)
7091 {
7092 default:
7093 break;
7094
7095 case V8SFmode:
7096 case V8SImode:
7097 case V32QImode:
7098 case V16HImode:
7099 case V4DFmode:
7100 case V4DImode:
7101 /* Unnamed 256bit vector mode parameters are passed on stack. */
7102 if (!named)
7103 return NULL;
7104 break;
7105 }
7106
7107 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7108 cum->sse_nregs,
7109 &x86_64_int_parameter_registers [cum->regno],
7110 cum->sse_regno);
7111 }
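
/* For illustration (a rough sketch): the VOIDmode case above provides the
   hidden %al value for variadic SysV calls.  E.g. for printf ("%f", 1.5)
   the double lands in %xmm0 and cum->sse_regno becomes 1, so the value
   returned here is 1 and ends up in %al; setup_incoming_varargs_64 below
   only tests it against zero to decide whether the SSE save area needs to
   be filled.  */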
7112
7113 static rtx
7114 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7115 enum machine_mode orig_mode, bool named,
7116 HOST_WIDE_INT bytes)
7117 {
7118 unsigned int regno;
7119
7120 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7121 We use a value of -2 to specify that the current function call is MSABI. */
7122 if (mode == VOIDmode)
7123 return GEN_INT (-2);
7124
7125 /* If we've run out of registers, it goes on the stack. */
7126 if (cum->nregs == 0)
7127 return NULL_RTX;
7128
7129 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7130
7131 /* Only floating point modes are passed in anything but integer regs. */
7132 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7133 {
7134 if (named)
7135 regno = cum->regno + FIRST_SSE_REG;
7136 else
7137 {
7138 rtx t1, t2;
7139
7140 /* Unnamed floating parameters are passed in both the
7141 SSE and integer registers. */
7142 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7143 t2 = gen_rtx_REG (mode, regno);
7144 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7145 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7146 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7147 }
7148 }
7149 /* Handle aggregate types passed in registers. */
7150 if (orig_mode == BLKmode)
7151 {
7152 if (bytes > 0 && bytes <= 8)
7153 mode = (bytes > 4 ? DImode : SImode);
7154 if (mode == BLKmode)
7155 mode = DImode;
7156 }
7157
7158 return gen_reg_or_parallel (mode, orig_mode, regno);
7159 }
7160
7161 /* Return where to put the arguments to a function.
7162 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7163
7164 MODE is the argument's machine mode. TYPE is the data type of the
7165 argument. It is null for libcalls where that information may not be
7166 available. CUM gives information about the preceding args and about
7167 the function being called. NAMED is nonzero if this argument is a
7168 named parameter (otherwise it is an extra parameter matching an
7169 ellipsis). */
7170
7171 static rtx
7172 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7173 const_tree type, bool named)
7174 {
7175 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7176 enum machine_mode mode = omode;
7177 HOST_WIDE_INT bytes, words;
7178 rtx arg;
7179
7180 if (mode == BLKmode)
7181 bytes = int_size_in_bytes (type);
7182 else
7183 bytes = GET_MODE_SIZE (mode);
7184 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7185
7186 /* To simplify the code below, represent vector types with a vector mode
7187 even if MMX/SSE are not active. */
7188 if (type && TREE_CODE (type) == VECTOR_TYPE)
7189 mode = type_natural_mode (type, cum);
7190
7191 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7192 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7193 else if (TARGET_64BIT)
7194 arg = function_arg_64 (cum, mode, omode, type, named);
7195 else
7196 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7197
7198 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
7199 {
7200 /* This argument uses 256bit AVX modes. */
7201 if (cum->caller)
7202 cfun->machine->callee_pass_avx256_p = true;
7203 else
7204 cfun->machine->caller_pass_avx256_p = true;
7205 }
7206
7207 return arg;
7208 }
7209
7210 /* A C expression that indicates when an argument must be passed by
7211 reference. If nonzero for an argument, a copy of that argument is
7212 made in memory and a pointer to the argument is passed instead of
7213 the argument itself. The pointer is passed in whatever way is
7214 appropriate for passing a pointer to that type. */
7215
7216 static bool
7217 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7218 enum machine_mode mode ATTRIBUTE_UNUSED,
7219 const_tree type, bool named ATTRIBUTE_UNUSED)
7220 {
7221 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7222
7223 /* See Windows x64 Software Convention. */
7224 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7225 {
7226 int msize = (int) GET_MODE_SIZE (mode);
7227 if (type)
7228 {
7229 /* Arrays are passed by reference. */
7230 if (TREE_CODE (type) == ARRAY_TYPE)
7231 return true;
7232
7233 if (AGGREGATE_TYPE_P (type))
7234 {
7235 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7236 are passed by reference. */
7237 msize = int_size_in_bytes (type);
7238 }
7239 }
7240
7241 /* __m128 is passed by reference. */
7242 switch (msize) {
7243 case 1: case 2: case 4: case 8:
7244 break;
7245 default:
7246 return true;
7247 }
7248 }
7249 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7250 return 1;
7251
7252 return 0;
7253 }
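
/* For illustration (a rough sketch): under the MS ABI check above, a
   hypothetical 24-byte struct or an __m128 (16 bytes) is passed by
   reference, while an 8-byte struct travels by value in a register; on
   the SysV side only variable-sized types (int_size_in_bytes == -1) are
   forced through this hook.  */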
7254
7255 /* Return true when TYPE should be 128bit aligned for 32bit argument
7256 passing ABI. XXX: This function is obsolete and is only used for
7257 checking psABI compatibility with previous versions of GCC. */
7258
7259 static bool
7260 ix86_compat_aligned_value_p (const_tree type)
7261 {
7262 enum machine_mode mode = TYPE_MODE (type);
7263 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7264 || mode == TDmode
7265 || mode == TFmode
7266 || mode == TCmode)
7267 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7268 return true;
7269 if (TYPE_ALIGN (type) < 128)
7270 return false;
7271
7272 if (AGGREGATE_TYPE_P (type))
7273 {
7274 /* Walk the aggregates recursively. */
7275 switch (TREE_CODE (type))
7276 {
7277 case RECORD_TYPE:
7278 case UNION_TYPE:
7279 case QUAL_UNION_TYPE:
7280 {
7281 tree field;
7282
7283 /* Walk all the structure fields. */
7284 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7285 {
7286 if (TREE_CODE (field) == FIELD_DECL
7287 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7288 return true;
7289 }
7290 break;
7291 }
7292
7293 case ARRAY_TYPE:
7294 /* Just for use if some languages pass arrays by value. */
7295 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7296 return true;
7297 break;
7298
7299 default:
7300 gcc_unreachable ();
7301 }
7302 }
7303 return false;
7304 }
7305
7306 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7307 XXX: This function is obsolete and is only used for checking psABI
7308 compatibility with previous versions of GCC. */
7309
7310 static unsigned int
7311 ix86_compat_function_arg_boundary (enum machine_mode mode,
7312 const_tree type, unsigned int align)
7313 {
7314 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7315 natural boundaries. */
7316 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7317 {
7318 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7319 make an exception for SSE modes since these require 128bit
7320 alignment.
7321
7322 The handling here differs from field_alignment. ICC aligns MMX
7323 arguments to 4 byte boundaries, while structure fields are aligned
7324 to 8 byte boundaries. */
7325 if (!type)
7326 {
7327 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7328 align = PARM_BOUNDARY;
7329 }
7330 else
7331 {
7332 if (!ix86_compat_aligned_value_p (type))
7333 align = PARM_BOUNDARY;
7334 }
7335 }
7336 if (align > BIGGEST_ALIGNMENT)
7337 align = BIGGEST_ALIGNMENT;
7338 return align;
7339 }
7340
7341 /* Return true when TYPE should be 128bit aligned for 32bit argument
7342 passing ABI. */
7343
7344 static bool
7345 ix86_contains_aligned_value_p (const_tree type)
7346 {
7347 enum machine_mode mode = TYPE_MODE (type);
7348
7349 if (mode == XFmode || mode == XCmode)
7350 return false;
7351
7352 if (TYPE_ALIGN (type) < 128)
7353 return false;
7354
7355 if (AGGREGATE_TYPE_P (type))
7356 {
7357 /* Walk the aggregates recursively. */
7358 switch (TREE_CODE (type))
7359 {
7360 case RECORD_TYPE:
7361 case UNION_TYPE:
7362 case QUAL_UNION_TYPE:
7363 {
7364 tree field;
7365
7366 /* Walk all the structure fields. */
7367 for (field = TYPE_FIELDS (type);
7368 field;
7369 field = DECL_CHAIN (field))
7370 {
7371 if (TREE_CODE (field) == FIELD_DECL
7372 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7373 return true;
7374 }
7375 break;
7376 }
7377
7378 case ARRAY_TYPE:
7379 /* Just for use if some languages pass arrays by value. */
7380 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7381 return true;
7382 break;
7383
7384 default:
7385 gcc_unreachable ();
7386 }
7387 }
7388 else
7389 return TYPE_ALIGN (type) >= 128;
7390
7391 return false;
7392 }
7393
7394 /* Gives the alignment boundary, in bits, of an argument with the
7395 specified mode and type. */
7396
7397 static unsigned int
7398 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7399 {
7400 unsigned int align;
7401 if (type)
7402 {
7403 /* Since the main variant type is used for the call, convert TYPE
7404 to its main variant here as well. */
7405 type = TYPE_MAIN_VARIANT (type);
7406 align = TYPE_ALIGN (type);
7407 }
7408 else
7409 align = GET_MODE_ALIGNMENT (mode);
7410 if (align < PARM_BOUNDARY)
7411 align = PARM_BOUNDARY;
7412 else
7413 {
7414 static bool warned;
7415 unsigned int saved_align = align;
7416
7417 if (!TARGET_64BIT)
7418 {
7419 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7420 if (!type)
7421 {
7422 if (mode == XFmode || mode == XCmode)
7423 align = PARM_BOUNDARY;
7424 }
7425 else if (!ix86_contains_aligned_value_p (type))
7426 align = PARM_BOUNDARY;
7427
7428 if (align < 128)
7429 align = PARM_BOUNDARY;
7430 }
7431
7432 if (warn_psabi
7433 && !warned
7434 && align != ix86_compat_function_arg_boundary (mode, type,
7435 saved_align))
7436 {
7437 warned = true;
7438 inform (input_location,
7439 "The ABI for passing parameters with %d-byte"
7440 " alignment has changed in GCC 4.6",
7441 align / BITS_PER_UNIT);
7442 }
7443 }
7444
7445 return align;
7446 }
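
/* For illustration (a rough sketch): in 32-bit mode the net effect of the
   code above is that a 16-byte-aligned type such as __m128 keeps its
   128-bit boundary, whereas double and long double arguments drop to
   PARM_BOUNDARY (32 bits); the note guarded by warn_psabi fires once if
   the result differs from the pre-GCC-4.6 rule computed by
   ix86_compat_function_arg_boundary.  */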
7447
7448 /* Return true if N is a possible register number of function value. */
7449
7450 static bool
7451 ix86_function_value_regno_p (const unsigned int regno)
7452 {
7453 switch (regno)
7454 {
7455 case AX_REG:
7456 return true;
7457
7458 case FIRST_FLOAT_REG:
7459 /* TODO: The function should depend on current function ABI but
7460 builtins.c would need updating then. Therefore we use the
7461 default ABI. */
7462 if (TARGET_64BIT && ix86_abi == MS_ABI)
7463 return false;
7464 return TARGET_FLOAT_RETURNS_IN_80387;
7465
7466 case FIRST_SSE_REG:
7467 return TARGET_SSE;
7468
7469 case FIRST_MMX_REG:
7470 if (TARGET_MACHO || TARGET_64BIT)
7471 return false;
7472 return TARGET_MMX;
7473 }
7474
7475 return false;
7476 }
7477
7478 /* Define how to find the value returned by a function.
7479 VALTYPE is the data type of the value (as a tree).
7480 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7481 otherwise, FUNC is 0. */
7482
7483 static rtx
7484 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7485 const_tree fntype, const_tree fn)
7486 {
7487 unsigned int regno;
7488
7489 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7490 we normally prevent this case when mmx is not available. However
7491 some ABIs may require the result to be returned like DImode. */
7492 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7493 regno = FIRST_MMX_REG;
7494
7495 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7496 we prevent this case when sse is not available. However some ABIs
7497 may require the result to be returned like integer TImode. */
7498 else if (mode == TImode
7499 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7500 regno = FIRST_SSE_REG;
7501
7502 /* 32-byte vector modes in %ymm0. */
7503 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7504 regno = FIRST_SSE_REG;
7505
7506 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7507 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7508 regno = FIRST_FLOAT_REG;
7509 else
7510 /* Most things go in %eax. */
7511 regno = AX_REG;
7512
7513 /* Override FP return register with %xmm0 for local functions when
7514 SSE math is enabled or for functions with sseregparm attribute. */
7515 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7516 {
7517 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7518 if ((sse_level >= 1 && mode == SFmode)
7519 || (sse_level == 2 && mode == DFmode))
7520 regno = FIRST_SSE_REG;
7521 }
7522
7523 /* OImode shouldn't be used directly. */
7524 gcc_assert (mode != OImode);
7525
7526 return gen_rtx_REG (orig_mode, regno);
7527 }
7528
7529 static rtx
7530 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7531 const_tree valtype)
7532 {
7533 rtx ret;
7534
7535 /* Handle libcalls, which don't provide a type node. */
7536 if (valtype == NULL)
7537 {
7538 unsigned int regno;
7539
7540 switch (mode)
7541 {
7542 case SFmode:
7543 case SCmode:
7544 case DFmode:
7545 case DCmode:
7546 case TFmode:
7547 case SDmode:
7548 case DDmode:
7549 case TDmode:
7550 regno = FIRST_SSE_REG;
7551 break;
7552 case XFmode:
7553 case XCmode:
7554 regno = FIRST_FLOAT_REG;
7555 break;
7556 case TCmode:
7557 return NULL;
7558 default:
7559 regno = AX_REG;
7560 }
7561
7562 return gen_rtx_REG (mode, regno);
7563 }
7564 else if (POINTER_TYPE_P (valtype))
7565 {
7566 /* Pointers are always returned in word_mode. */
7567 mode = word_mode;
7568 }
7569
7570 ret = construct_container (mode, orig_mode, valtype, 1,
7571 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7572 x86_64_int_return_registers, 0);
7573
7574 /* For zero sized structures, construct_container returns NULL, but we
7575 need to keep the rest of the compiler happy by returning a meaningful value. */
7576 if (!ret)
7577 ret = gen_rtx_REG (orig_mode, AX_REG);
7578
7579 return ret;
7580 }
7581
7582 static rtx
7583 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7584 {
7585 unsigned int regno = AX_REG;
7586
7587 if (TARGET_SSE)
7588 {
7589 switch (GET_MODE_SIZE (mode))
7590 {
7591 case 16:
7592 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7593 && !COMPLEX_MODE_P (mode))
7594 regno = FIRST_SSE_REG;
7595 break;
7596 case 8:
7597 case 4:
7598 if (mode == SFmode || mode == DFmode)
7599 regno = FIRST_SSE_REG;
7600 break;
7601 default:
7602 break;
7603 }
7604 }
7605 return gen_rtx_REG (orig_mode, regno);
7606 }
7607
7608 static rtx
7609 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7610 enum machine_mode orig_mode, enum machine_mode mode)
7611 {
7612 const_tree fn, fntype;
7613
7614 fn = NULL_TREE;
7615 if (fntype_or_decl && DECL_P (fntype_or_decl))
7616 fn = fntype_or_decl;
7617 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7618
7619 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7620 return function_value_ms_64 (orig_mode, mode);
7621 else if (TARGET_64BIT)
7622 return function_value_64 (orig_mode, mode, valtype);
7623 else
7624 return function_value_32 (orig_mode, mode, fntype, fn);
7625 }
7626
7627 static rtx
7628 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7629 bool outgoing ATTRIBUTE_UNUSED)
7630 {
7631 enum machine_mode mode, orig_mode;
7632
7633 orig_mode = TYPE_MODE (valtype);
7634 mode = type_natural_mode (valtype, NULL);
7635 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7636 }
7637
7638 /* Pointer function arguments and return values are promoted to
7639 word_mode. */
7640
7641 static enum machine_mode
7642 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7643 int *punsignedp, const_tree fntype,
7644 int for_return)
7645 {
7646 if (type != NULL_TREE && POINTER_TYPE_P (type))
7647 {
7648 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7649 return word_mode;
7650 }
7651 return default_promote_function_mode (type, mode, punsignedp, fntype,
7652 for_return);
7653 }
7654
7655 /* Return true if a structure, union or array with MODE containing FIELD
7656 should be accessed using BLKmode. */
7657
7658 static bool
7659 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7660 {
7661 /* Union with XFmode must be in BLKmode. */
7662 return (mode == XFmode
7663 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7664 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7665 }
7666
7667 rtx
7668 ix86_libcall_value (enum machine_mode mode)
7669 {
7670 return ix86_function_value_1 (NULL, NULL, mode, mode);
7671 }
7672
7673 /* Return true iff type is returned in memory. */
7674
7675 static bool ATTRIBUTE_UNUSED
7676 return_in_memory_32 (const_tree type, enum machine_mode mode)
7677 {
7678 HOST_WIDE_INT size;
7679
7680 if (mode == BLKmode)
7681 return true;
7682
7683 size = int_size_in_bytes (type);
7684
7685 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7686 return false;
7687
7688 if (VECTOR_MODE_P (mode) || mode == TImode)
7689 {
7690 /* User-created vectors small enough to fit in EAX. */
7691 if (size < 8)
7692 return false;
7693
7694 /* MMX/3dNow values are returned in MM0,
7695 except when it doesn't exist or the ABI prescribes otherwise. */
7696 if (size == 8)
7697 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7698
7699 /* SSE values are returned in XMM0, except when it doesn't exist. */
7700 if (size == 16)
7701 return !TARGET_SSE;
7702
7703 /* AVX values are returned in YMM0, except when it doesn't exist. */
7704 if (size == 32)
7705 return !TARGET_AVX;
7706 }
7707
7708 if (mode == XFmode)
7709 return false;
7710
7711 if (size > 12)
7712 return true;
7713
7714 /* OImode shouldn't be used directly. */
7715 gcc_assert (mode != OImode);
7716
7717 return false;
7718 }
7719
7720 static bool ATTRIBUTE_UNUSED
7721 return_in_memory_64 (const_tree type, enum machine_mode mode)
7722 {
7723 int needed_intregs, needed_sseregs;
7724 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7725 }
7726
7727 static bool ATTRIBUTE_UNUSED
7728 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7729 {
7730 HOST_WIDE_INT size = int_size_in_bytes (type);
7731
7732 /* __m128 is returned in xmm0. */
7733 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7734 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7735 return false;
7736
7737 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7738 return size != 1 && size != 2 && size != 4 && size != 8;
7739 }
7740
7741 static bool
7742 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7743 {
7744 #ifdef SUBTARGET_RETURN_IN_MEMORY
7745 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7746 #else
7747 const enum machine_mode mode = type_natural_mode (type, NULL);
7748
7749 if (TARGET_64BIT)
7750 {
7751 if (ix86_function_type_abi (fntype) == MS_ABI)
7752 return return_in_memory_ms_64 (type, mode);
7753 else
7754 return return_in_memory_64 (type, mode);
7755 }
7756 else
7757 return return_in_memory_32 (type, mode);
7758 #endif
7759 }
7760
7761 /* When returning SSE vector types, we have a choice of either
7762 (1) being ABI incompatible with a -march switch, or
7763 (2) generating an error.
7764 Given no good solution, I think the safest thing is one warning.
7765 The user won't be able to use -Werror, but....
7766
7767 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7768 called in response to actually generating a caller or callee that
7769 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7770 via aggregate_value_p for general type probing from tree-ssa. */
7771
7772 static rtx
7773 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7774 {
7775 static bool warnedsse, warnedmmx;
7776
7777 if (!TARGET_64BIT && type)
7778 {
7779 /* Look at the return type of the function, not the function type. */
7780 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7781
7782 if (!TARGET_SSE && !warnedsse)
7783 {
7784 if (mode == TImode
7785 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7786 {
7787 warnedsse = true;
7788 warning (0, "SSE vector return without SSE enabled "
7789 "changes the ABI");
7790 }
7791 }
7792
7793 if (!TARGET_MMX && !warnedmmx)
7794 {
7795 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7796 {
7797 warnedmmx = true;
7798 warning (0, "MMX vector return without MMX enabled "
7799 "changes the ABI");
7800 }
7801 }
7802 }
7803
7804 return NULL;
7805 }
7806
7807 \f
7808 /* Create the va_list data type. */
7809
7810 /* Returns the calling convention specific va_list data type.
7811 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7812
7813 static tree
7814 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7815 {
7816 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7817
7818 /* For i386 we use plain pointer to argument area. */
7819 if (!TARGET_64BIT || abi == MS_ABI)
7820 return build_pointer_type (char_type_node);
7821
7822 record = lang_hooks.types.make_type (RECORD_TYPE);
7823 type_decl = build_decl (BUILTINS_LOCATION,
7824 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7825
7826 f_gpr = build_decl (BUILTINS_LOCATION,
7827 FIELD_DECL, get_identifier ("gp_offset"),
7828 unsigned_type_node);
7829 f_fpr = build_decl (BUILTINS_LOCATION,
7830 FIELD_DECL, get_identifier ("fp_offset"),
7831 unsigned_type_node);
7832 f_ovf = build_decl (BUILTINS_LOCATION,
7833 FIELD_DECL, get_identifier ("overflow_arg_area"),
7834 ptr_type_node);
7835 f_sav = build_decl (BUILTINS_LOCATION,
7836 FIELD_DECL, get_identifier ("reg_save_area"),
7837 ptr_type_node);
7838
7839 va_list_gpr_counter_field = f_gpr;
7840 va_list_fpr_counter_field = f_fpr;
7841
7842 DECL_FIELD_CONTEXT (f_gpr) = record;
7843 DECL_FIELD_CONTEXT (f_fpr) = record;
7844 DECL_FIELD_CONTEXT (f_ovf) = record;
7845 DECL_FIELD_CONTEXT (f_sav) = record;
7846
7847 TYPE_STUB_DECL (record) = type_decl;
7848 TYPE_NAME (record) = type_decl;
7849 TYPE_FIELDS (record) = f_gpr;
7850 DECL_CHAIN (f_gpr) = f_fpr;
7851 DECL_CHAIN (f_fpr) = f_ovf;
7852 DECL_CHAIN (f_ovf) = f_sav;
7853
7854 layout_type (record);
7855
7856 /* The correct type is an array type of one element. */
7857 return build_array_type (record, build_index_type (size_zero_node));
7858 }
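
/* For reference (a sketch of the layout, not a new definition): the
   record built above matches the SysV x86-64 psABI's

       typedef struct {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __builtin_va_list[1];

   while 32-bit targets and the MS ABI use a plain `char *'.  */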
7859
7860 /* Set up the builtin va_list data type and, for 64-bit, the additional
7861 calling convention specific va_list data types. */
7862
7863 static tree
7864 ix86_build_builtin_va_list (void)
7865 {
7866 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7867
7868 /* Initialize abi specific va_list builtin types. */
7869 if (TARGET_64BIT)
7870 {
7871 tree t;
7872 if (ix86_abi == MS_ABI)
7873 {
7874 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7875 if (TREE_CODE (t) != RECORD_TYPE)
7876 t = build_variant_type_copy (t);
7877 sysv_va_list_type_node = t;
7878 }
7879 else
7880 {
7881 t = ret;
7882 if (TREE_CODE (t) != RECORD_TYPE)
7883 t = build_variant_type_copy (t);
7884 sysv_va_list_type_node = t;
7885 }
7886 if (ix86_abi != MS_ABI)
7887 {
7888 t = ix86_build_builtin_va_list_abi (MS_ABI);
7889 if (TREE_CODE (t) != RECORD_TYPE)
7890 t = build_variant_type_copy (t);
7891 ms_va_list_type_node = t;
7892 }
7893 else
7894 {
7895 t = ret;
7896 if (TREE_CODE (t) != RECORD_TYPE)
7897 t = build_variant_type_copy (t);
7898 ms_va_list_type_node = t;
7899 }
7900 }
7901
7902 return ret;
7903 }
7904
7905 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7906
7907 static void
7908 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7909 {
7910 rtx save_area, mem;
7911 alias_set_type set;
7912 int i, max;
7913
7914 /* GPR size of varargs save area. */
7915 if (cfun->va_list_gpr_size)
7916 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7917 else
7918 ix86_varargs_gpr_size = 0;
7919
7920 /* FPR size of varargs save area. We don't need it if we don't pass
7921 anything in SSE registers. */
7922 if (TARGET_SSE && cfun->va_list_fpr_size)
7923 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7924 else
7925 ix86_varargs_fpr_size = 0;
7926
7927 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7928 return;
7929
7930 save_area = frame_pointer_rtx;
7931 set = get_varargs_alias_set ();
7932
7933 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7934 if (max > X86_64_REGPARM_MAX)
7935 max = X86_64_REGPARM_MAX;
7936
7937 for (i = cum->regno; i < max; i++)
7938 {
7939 mem = gen_rtx_MEM (word_mode,
7940 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7941 MEM_NOTRAP_P (mem) = 1;
7942 set_mem_alias_set (mem, set);
7943 emit_move_insn (mem,
7944 gen_rtx_REG (word_mode,
7945 x86_64_int_parameter_registers[i]));
7946 }
7947
7948 if (ix86_varargs_fpr_size)
7949 {
7950 enum machine_mode smode;
7951 rtx label, test;
7952
7953 /* Now emit code to save SSE registers. The AX parameter contains the number
7954 of SSE parameter registers used to call this function, though all we
7955 actually check here is the zero/non-zero status. */
7956
7957 label = gen_label_rtx ();
7958 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7959 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7960 label));
7961
7962 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7963 we used movdqa (i.e. TImode) instead? Perhaps even better would
7964 be if we could determine the real mode of the data, via a hook
7965 into pass_stdarg. Ignore all that for now. */
7966 smode = V4SFmode;
7967 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7968 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7969
7970 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7971 if (max > X86_64_SSE_REGPARM_MAX)
7972 max = X86_64_SSE_REGPARM_MAX;
7973
7974 for (i = cum->sse_regno; i < max; ++i)
7975 {
7976 mem = plus_constant (Pmode, save_area,
7977 i * 16 + ix86_varargs_gpr_size);
7978 mem = gen_rtx_MEM (smode, mem);
7979 MEM_NOTRAP_P (mem) = 1;
7980 set_mem_alias_set (mem, set);
7981 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7982
7983 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7984 }
7985
7986 emit_label (label);
7987 }
7988 }
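
/* A note on the layout produced above (descriptive sketch): the register
   save area starts with X86_64_REGPARM_MAX (6) general registers of
   UNITS_PER_WORD bytes each (48 bytes), followed by up to
   X86_64_SSE_REGPARM_MAX (8) 16-byte SSE slots; the SSE stores are
   guarded by the run-time test of %al, so they are skipped entirely for
   calls that pass no vector arguments.  */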
7989
7990 static void
7991 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7992 {
7993 alias_set_type set = get_varargs_alias_set ();
7994 int i;
7995
7996 /* Reset to zero, as there might be a sysv vaarg used
7997 before. */
7998 ix86_varargs_gpr_size = 0;
7999 ix86_varargs_fpr_size = 0;
8000
8001 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8002 {
8003 rtx reg, mem;
8004
8005 mem = gen_rtx_MEM (Pmode,
8006 plus_constant (Pmode, virtual_incoming_args_rtx,
8007 i * UNITS_PER_WORD));
8008 MEM_NOTRAP_P (mem) = 1;
8009 set_mem_alias_set (mem, set);
8010
8011 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8012 emit_move_insn (mem, reg);
8013 }
8014 }
8015
8016 static void
8017 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8018 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8019 int no_rtl)
8020 {
8021 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8022 CUMULATIVE_ARGS next_cum;
8023 tree fntype;
8024
8025 /* This argument doesn't appear to be used anymore, which is good,
8026 because the old code here didn't suppress rtl generation. */
8027 gcc_assert (!no_rtl);
8028
8029 if (!TARGET_64BIT)
8030 return;
8031
8032 fntype = TREE_TYPE (current_function_decl);
8033
8034 /* For varargs, we do not want to skip the dummy va_dcl argument.
8035 For stdargs, we do want to skip the last named argument. */
8036 next_cum = *cum;
8037 if (stdarg_p (fntype))
8038 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8039 true);
8040
8041 if (cum->call_abi == MS_ABI)
8042 setup_incoming_varargs_ms_64 (&next_cum);
8043 else
8044 setup_incoming_varargs_64 (&next_cum);
8045 }
8046
8047 /* Check whether TYPE is the plain `char *' kind of va_list. */
8048
8049 static bool
8050 is_va_list_char_pointer (tree type)
8051 {
8052 tree canonic;
8053
8054 /* For 32-bit it is always true. */
8055 if (!TARGET_64BIT)
8056 return true;
8057 canonic = ix86_canonical_va_list_type (type);
8058 return (canonic == ms_va_list_type_node
8059 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8060 }
8061
8062 /* Implement va_start. */
8063
8064 static void
8065 ix86_va_start (tree valist, rtx nextarg)
8066 {
8067 HOST_WIDE_INT words, n_gpr, n_fpr;
8068 tree f_gpr, f_fpr, f_ovf, f_sav;
8069 tree gpr, fpr, ovf, sav, t;
8070 tree type;
8071 rtx ovf_rtx;
8072
8073 if (flag_split_stack
8074 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8075 {
8076 unsigned int scratch_regno;
8077
8078 /* When we are splitting the stack, we can't refer to the stack
8079 arguments using internal_arg_pointer, because they may be on
8080 the old stack. The split stack prologue will arrange to
8081 leave a pointer to the old stack arguments in a scratch
8082 register, which we here copy to a pseudo-register. The split
8083 stack prologue can't set the pseudo-register directly because
8084 it (the prologue) runs before any registers have been saved. */
8085
8086 scratch_regno = split_stack_prologue_scratch_regno ();
8087 if (scratch_regno != INVALID_REGNUM)
8088 {
8089 rtx reg, seq;
8090
8091 reg = gen_reg_rtx (Pmode);
8092 cfun->machine->split_stack_varargs_pointer = reg;
8093
8094 start_sequence ();
8095 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8096 seq = get_insns ();
8097 end_sequence ();
8098
8099 push_topmost_sequence ();
8100 emit_insn_after (seq, entry_of_function ());
8101 pop_topmost_sequence ();
8102 }
8103 }
8104
8105 /* Only 64bit target needs something special. */
8106 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8107 {
8108 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8109 std_expand_builtin_va_start (valist, nextarg);
8110 else
8111 {
8112 rtx va_r, next;
8113
8114 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8115 next = expand_binop (ptr_mode, add_optab,
8116 cfun->machine->split_stack_varargs_pointer,
8117 crtl->args.arg_offset_rtx,
8118 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8119 convert_move (va_r, next, 0);
8120 }
8121 return;
8122 }
8123
8124 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8125 f_fpr = DECL_CHAIN (f_gpr);
8126 f_ovf = DECL_CHAIN (f_fpr);
8127 f_sav = DECL_CHAIN (f_ovf);
8128
8129 valist = build_simple_mem_ref (valist);
8130 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8131 /* The following should be folded into the MEM_REF offset. */
8132 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8133 f_gpr, NULL_TREE);
8134 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8135 f_fpr, NULL_TREE);
8136 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8137 f_ovf, NULL_TREE);
8138 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8139 f_sav, NULL_TREE);
8140
8141 /* Count number of gp and fp argument registers used. */
8142 words = crtl->args.info.words;
8143 n_gpr = crtl->args.info.regno;
8144 n_fpr = crtl->args.info.sse_regno;
8145
8146 if (cfun->va_list_gpr_size)
8147 {
8148 type = TREE_TYPE (gpr);
8149 t = build2 (MODIFY_EXPR, type,
8150 gpr, build_int_cst (type, n_gpr * 8));
8151 TREE_SIDE_EFFECTS (t) = 1;
8152 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8153 }
8154
8155 if (TARGET_SSE && cfun->va_list_fpr_size)
8156 {
8157 type = TREE_TYPE (fpr);
8158 t = build2 (MODIFY_EXPR, type, fpr,
8159 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8160 TREE_SIDE_EFFECTS (t) = 1;
8161 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8162 }
8163
8164 /* Find the overflow area. */
8165 type = TREE_TYPE (ovf);
8166 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8167 ovf_rtx = crtl->args.internal_arg_pointer;
8168 else
8169 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8170 t = make_tree (type, ovf_rtx);
8171 if (words != 0)
8172 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8173 t = build2 (MODIFY_EXPR, type, ovf, t);
8174 TREE_SIDE_EFFECTS (t) = 1;
8175 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8176
8177 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8178 {
8179 /* Find the register save area.
8180 The function prologue saves it right above the stack frame. */
8181 type = TREE_TYPE (sav);
8182 t = make_tree (type, frame_pointer_rtx);
8183 if (!ix86_varargs_gpr_size)
8184 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8185 t = build2 (MODIFY_EXPR, type, sav, t);
8186 TREE_SIDE_EFFECTS (t) = 1;
8187 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8188 }
8189 }
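
/* Worked example for the code above (a sketch; the declaration is
   hypothetical user code): in void f (int a, ...), one general register
   is consumed by the named argument, so va_start initializes
   gp_offset = 8 and fp_offset = 48 (8 * X86_64_REGPARM_MAX), points
   overflow_arg_area at the first stack argument, and points
   reg_save_area at the block saved by the prologue.  */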
8190
8191 /* Implement va_arg. */
8192
8193 static tree
8194 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8195 gimple_seq *post_p)
8196 {
8197 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8198 tree f_gpr, f_fpr, f_ovf, f_sav;
8199 tree gpr, fpr, ovf, sav, t;
8200 int size, rsize;
8201 tree lab_false, lab_over = NULL_TREE;
8202 tree addr, t2;
8203 rtx container;
8204 int indirect_p = 0;
8205 tree ptrtype;
8206 enum machine_mode nat_mode;
8207 unsigned int arg_boundary;
8208
8209 /* Only 64bit target needs something special. */
8210 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8211 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8212
8213 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8214 f_fpr = DECL_CHAIN (f_gpr);
8215 f_ovf = DECL_CHAIN (f_fpr);
8216 f_sav = DECL_CHAIN (f_ovf);
8217
8218 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8219 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8220 valist = build_va_arg_indirect_ref (valist);
8221 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8222 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8223 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8224
8225 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8226 if (indirect_p)
8227 type = build_pointer_type (type);
8228 size = int_size_in_bytes (type);
8229 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8230
8231 nat_mode = type_natural_mode (type, NULL);
8232 switch (nat_mode)
8233 {
8234 case V8SFmode:
8235 case V8SImode:
8236 case V32QImode:
8237 case V16HImode:
8238 case V4DFmode:
8239 case V4DImode:
8240 /* Unnamed 256bit vector mode parameters are passed on stack. */
8241 if (!TARGET_64BIT_MS_ABI)
8242 {
8243 container = NULL;
8244 break;
8245 }
8246
8247 default:
8248 container = construct_container (nat_mode, TYPE_MODE (type),
8249 type, 0, X86_64_REGPARM_MAX,
8250 X86_64_SSE_REGPARM_MAX, intreg,
8251 0);
8252 break;
8253 }
8254
8255 /* Pull the value out of the saved registers. */
8256
8257 addr = create_tmp_var (ptr_type_node, "addr");
8258
8259 if (container)
8260 {
8261 int needed_intregs, needed_sseregs;
8262 bool need_temp;
8263 tree int_addr, sse_addr;
8264
8265 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8266 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8267
8268 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8269
8270 need_temp = (!REG_P (container)
8271 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8272 || TYPE_ALIGN (type) > 128));
8273
8274 /* In case we are passing a structure, verify that it is a consecutive
8275 block in the register save area. If not, we need to do moves. */
8276 if (!need_temp && !REG_P (container))
8277 {
8278 /* Verify that all registers are strictly consecutive. */
8279 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8280 {
8281 int i;
8282
8283 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8284 {
8285 rtx slot = XVECEXP (container, 0, i);
8286 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8287 || INTVAL (XEXP (slot, 1)) != i * 16)
8288 need_temp = 1;
8289 }
8290 }
8291 else
8292 {
8293 int i;
8294
8295 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8296 {
8297 rtx slot = XVECEXP (container, 0, i);
8298 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8299 || INTVAL (XEXP (slot, 1)) != i * 8)
8300 need_temp = 1;
8301 }
8302 }
8303 }
8304 if (!need_temp)
8305 {
8306 int_addr = addr;
8307 sse_addr = addr;
8308 }
8309 else
8310 {
8311 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8312 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8313 }
8314
8315 /* First ensure that we fit completely in registers. */
8316 if (needed_intregs)
8317 {
8318 t = build_int_cst (TREE_TYPE (gpr),
8319 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8320 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8321 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8322 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8323 gimplify_and_add (t, pre_p);
8324 }
8325 if (needed_sseregs)
8326 {
8327 t = build_int_cst (TREE_TYPE (fpr),
8328 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8329 + X86_64_REGPARM_MAX * 8);
8330 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8331 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8332 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8333 gimplify_and_add (t, pre_p);
8334 }
8335
8336 /* Compute index to start of area used for integer regs. */
8337 if (needed_intregs)
8338 {
8339 /* int_addr = gpr + sav; */
8340 t = fold_build_pointer_plus (sav, gpr);
8341 gimplify_assign (int_addr, t, pre_p);
8342 }
8343 if (needed_sseregs)
8344 {
8345 /* sse_addr = fpr + sav; */
8346 t = fold_build_pointer_plus (sav, fpr);
8347 gimplify_assign (sse_addr, t, pre_p);
8348 }
8349 if (need_temp)
8350 {
8351 int i, prev_size = 0;
8352 tree temp = create_tmp_var (type, "va_arg_tmp");
8353
8354 /* addr = &temp; */
8355 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8356 gimplify_assign (addr, t, pre_p);
8357
8358 for (i = 0; i < XVECLEN (container, 0); i++)
8359 {
8360 rtx slot = XVECEXP (container, 0, i);
8361 rtx reg = XEXP (slot, 0);
8362 enum machine_mode mode = GET_MODE (reg);
8363 tree piece_type;
8364 tree addr_type;
8365 tree daddr_type;
8366 tree src_addr, src;
8367 int src_offset;
8368 tree dest_addr, dest;
8369 int cur_size = GET_MODE_SIZE (mode);
8370
8371 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8372 prev_size = INTVAL (XEXP (slot, 1));
8373 if (prev_size + cur_size > size)
8374 {
8375 cur_size = size - prev_size;
8376 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8377 if (mode == BLKmode)
8378 mode = QImode;
8379 }
8380 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8381 if (mode == GET_MODE (reg))
8382 addr_type = build_pointer_type (piece_type);
8383 else
8384 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8385 true);
8386 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8387 true);
8388
8389 if (SSE_REGNO_P (REGNO (reg)))
8390 {
8391 src_addr = sse_addr;
8392 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8393 }
8394 else
8395 {
8396 src_addr = int_addr;
8397 src_offset = REGNO (reg) * 8;
8398 }
8399 src_addr = fold_convert (addr_type, src_addr);
8400 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8401
8402 dest_addr = fold_convert (daddr_type, addr);
8403 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8404 if (cur_size == GET_MODE_SIZE (mode))
8405 {
8406 src = build_va_arg_indirect_ref (src_addr);
8407 dest = build_va_arg_indirect_ref (dest_addr);
8408
8409 gimplify_assign (dest, src, pre_p);
8410 }
8411 else
8412 {
8413 tree copy
8414 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8415 3, dest_addr, src_addr,
8416 size_int (cur_size));
8417 gimplify_and_add (copy, pre_p);
8418 }
8419 prev_size += cur_size;
8420 }
8421 }
8422
8423 if (needed_intregs)
8424 {
8425 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8426 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8427 gimplify_assign (gpr, t, pre_p);
8428 }
8429
8430 if (needed_sseregs)
8431 {
8432 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8433 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8434 gimplify_assign (fpr, t, pre_p);
8435 }
8436
8437 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8438
8439 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8440 }
8441
8442 /* ... otherwise out of the overflow area. */
8443
8444 /* When the caller aligns a parameter on the stack, any alignment
8445 beyond MAX_SUPPORTED_STACK_ALIGNMENT is capped at
8446 MAX_SUPPORTED_STACK_ALIGNMENT. Match the caller's behavior here
8447 in the callee. */
8448 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8449 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8450 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8451
8452 /* Care for on-stack alignment if needed. */
8453 if (arg_boundary <= 64 || size == 0)
8454 t = ovf;
8455 else
8456 {
8457 HOST_WIDE_INT align = arg_boundary / 8;
8458 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8459 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8460 build_int_cst (TREE_TYPE (t), -align));
8461 }
8462
8463 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8464 gimplify_assign (addr, t, pre_p);
8465
8466 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8467 gimplify_assign (unshare_expr (ovf), t, pre_p);
8468
8469 if (container)
8470 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8471
8472 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8473 addr = fold_convert (ptrtype, addr);
8474
8475 if (indirect_p)
8476 addr = build_va_arg_indirect_ref (addr);
8477 return build_va_arg_indirect_ref (addr);
8478 }
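
/* A sketch of the GIMPLE emitted above for va_arg (ap, int), written as
   pseudo C (illustrative only):

       if (ap->gp_offset >= 48) goto overflow;
       addr = ap->reg_save_area + ap->gp_offset;
       ap->gp_offset += 8;
       goto done;
     overflow:
       addr = ap->overflow_arg_area;      // aligned first if the type needs it
       ap->overflow_arg_area += 8;
     done:
       result = *(int *) addr;

   Aggregates that straddle register classes are additionally copied into
   a temporary, as handled by the need_temp path.  */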
8479 \f
8480 /* Return true if OPNUM's MEM should be matched
8481 in movabs* patterns. */
8482
8483 bool
8484 ix86_check_movabs (rtx insn, int opnum)
8485 {
8486 rtx set, mem;
8487
8488 set = PATTERN (insn);
8489 if (GET_CODE (set) == PARALLEL)
8490 set = XVECEXP (set, 0, 0);
8491 gcc_assert (GET_CODE (set) == SET);
8492 mem = XEXP (set, opnum);
8493 while (GET_CODE (mem) == SUBREG)
8494 mem = SUBREG_REG (mem);
8495 gcc_assert (MEM_P (mem));
8496 return volatile_ok || !MEM_VOLATILE_P (mem);
8497 }
8498 \f
8499 /* Initialize the table of extra 80387 mathematical constants. */
8500
8501 static void
8502 init_ext_80387_constants (void)
8503 {
8504 static const char * cst[5] =
8505 {
8506 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8507 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8508 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8509 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8510 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8511 };
8512 int i;
8513
8514 for (i = 0; i < 5; i++)
8515 {
8516 real_from_string (&ext_80387_constants_table[i], cst[i]);
8517 /* Ensure each constant is rounded to XFmode precision. */
8518 real_convert (&ext_80387_constants_table[i],
8519 XFmode, &ext_80387_constants_table[i]);
8520 }
8521
8522 ext_80387_constants_init = 1;
8523 }
8524
8525 /* Return non-zero if the constant is something that
8526 can be loaded with a special instruction. */
8527
8528 int
8529 standard_80387_constant_p (rtx x)
8530 {
8531 enum machine_mode mode = GET_MODE (x);
8532
8533 REAL_VALUE_TYPE r;
8534
8535 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8536 return -1;
8537
8538 if (x == CONST0_RTX (mode))
8539 return 1;
8540 if (x == CONST1_RTX (mode))
8541 return 2;
8542
8543 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8544
8545 /* For XFmode constants, try to find a special 80387 instruction when
8546 optimizing for size or on those CPUs that benefit from them. */
8547 if (mode == XFmode
8548 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8549 {
8550 int i;
8551
8552 if (! ext_80387_constants_init)
8553 init_ext_80387_constants ();
8554
8555 for (i = 0; i < 5; i++)
8556 if (real_identical (&r, &ext_80387_constants_table[i]))
8557 return i + 3;
8558 }
8559
8560 /* A load of the constant -0.0 or -1.0 will be split into an
8561 fldz;fchs or fld1;fchs sequence. */
8562 if (real_isnegzero (&r))
8563 return 8;
8564 if (real_identical (&r, &dconstm1))
8565 return 9;
8566
8567 return 0;
8568 }
8569
8570 /* Return the opcode of the special instruction to be used to load
8571 the constant X. */
8572
8573 const char *
8574 standard_80387_constant_opcode (rtx x)
8575 {
8576 switch (standard_80387_constant_p (x))
8577 {
8578 case 1:
8579 return "fldz";
8580 case 2:
8581 return "fld1";
8582 case 3:
8583 return "fldlg2";
8584 case 4:
8585 return "fldln2";
8586 case 5:
8587 return "fldl2e";
8588 case 6:
8589 return "fldl2t";
8590 case 7:
8591 return "fldpi";
8592 case 8:
8593 case 9:
8594 return "#";
8595 default:
8596 gcc_unreachable ();
8597 }
8598 }
8599
8600 /* Return the CONST_DOUBLE representing the 80387 constant that is
8601 loaded by the specified special instruction. The argument IDX
8602 matches the return value from standard_80387_constant_p. */
8603
8604 rtx
8605 standard_80387_constant_rtx (int idx)
8606 {
8607 int i;
8608
8609 if (! ext_80387_constants_init)
8610 init_ext_80387_constants ();
8611
8612 switch (idx)
8613 {
8614 case 3:
8615 case 4:
8616 case 5:
8617 case 6:
8618 case 7:
8619 i = idx - 3;
8620 break;
8621
8622 default:
8623 gcc_unreachable ();
8624 }
8625
8626 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8627 XFmode);
8628 }
8629
8630 /* Return 1 if X is all 0s and 2 if X is all 1s
8631 in a supported SSE/AVX vector mode. */
8632
8633 int
8634 standard_sse_constant_p (rtx x)
8635 {
8636 enum machine_mode mode = GET_MODE (x);
8637
8638 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8639 return 1;
8640 if (vector_all_ones_operand (x, mode))
8641 switch (mode)
8642 {
8643 case V16QImode:
8644 case V8HImode:
8645 case V4SImode:
8646 case V2DImode:
8647 if (TARGET_SSE2)
8648 return 2;
8649 case V32QImode:
8650 case V16HImode:
8651 case V8SImode:
8652 case V4DImode:
8653 if (TARGET_AVX2)
8654 return 2;
8655 default:
8656 break;
8657 }
8658
8659 return 0;
8660 }
8661
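/* Illustrative sketch (disabled): distinguishing the two special SSE/AVX
   constants recognized above. The helper name is hypothetical. */
#if 0
static bool
example_is_special_sse_constant (rtx x)
{
  switch (standard_sse_constant_p (x))
    {
    case 1:	/* All zeros: loadable with [v]pxor / [v]xorps / [v]xorpd. */
    case 2:	/* All ones: loadable with [v]pcmpeqd. */
      return true;
    default:
      return false;
    }
}
#endif
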
8662 /* Return the opcode of the special instruction to be used to load
8663 the constant X. */
8664
8665 const char *
8666 standard_sse_constant_opcode (rtx insn, rtx x)
8667 {
8668 switch (standard_sse_constant_p (x))
8669 {
8670 case 1:
8671 switch (get_attr_mode (insn))
8672 {
8673 case MODE_TI:
8674 return "%vpxor\t%0, %d0";
8675 case MODE_V2DF:
8676 return "%vxorpd\t%0, %d0";
8677 case MODE_V4SF:
8678 return "%vxorps\t%0, %d0";
8679
8680 case MODE_OI:
8681 return "vpxor\t%x0, %x0, %x0";
8682 case MODE_V4DF:
8683 return "vxorpd\t%x0, %x0, %x0";
8684 case MODE_V8SF:
8685 return "vxorps\t%x0, %x0, %x0";
8686
8687 default:
8688 break;
8689 }
8690
8691 case 2:
8692 if (TARGET_AVX)
8693 return "vpcmpeqd\t%0, %0, %0";
8694 else
8695 return "pcmpeqd\t%0, %0";
8696
8697 default:
8698 break;
8699 }
8700 gcc_unreachable ();
8701 }
8702
8703 /* Returns true if OP contains a symbol reference. */
8704
8705 bool
8706 symbolic_reference_mentioned_p (rtx op)
8707 {
8708 const char *fmt;
8709 int i;
8710
8711 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8712 return true;
8713
8714 fmt = GET_RTX_FORMAT (GET_CODE (op));
8715 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8716 {
8717 if (fmt[i] == 'E')
8718 {
8719 int j;
8720
8721 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8722 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8723 return true;
8724 }
8725
8726 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8727 return true;
8728 }
8729
8730 return false;
8731 }
8732
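/* Example (illustrative only): the walk above returns true for an address
   such as (plus:SI (symbol_ref:SI ("foo")) (const_int 4)), because the
   SYMBOL_REF is found while recursing through the sub-rtxes, and false for
   (plus:SI (reg:SI 0) (const_int 4)), which mentions no symbol at all. */
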
8733 /* Return true if it is appropriate to emit `ret' instructions in the
8734 body of a function. Do this only if the epilogue is simple, needing a
8735 couple of insns. Prior to reloading, we can't tell how many registers
8736 must be saved, so return false then. Return false if there is no frame
8737 marker to de-allocate. */
8738
8739 bool
8740 ix86_can_use_return_insn_p (void)
8741 {
8742 struct ix86_frame frame;
8743
8744 if (! reload_completed || frame_pointer_needed)
8745 return 0;
8746
8747 /* Don't allow more than 32k pop, since that's all we can do
8748 with one instruction. */
8749 if (crtl->args.pops_args && crtl->args.size >= 32768)
8750 return 0;
8751
8752 ix86_compute_frame_layout (&frame);
8753 return (frame.stack_pointer_offset == UNITS_PER_WORD
8754 && (frame.nregs + frame.nsseregs) == 0);
8755 }
8756 \f
8757 /* Value should be nonzero if functions must have frame pointers.
8758 Zero means the frame pointer need not be set up (and parms may
8759 be accessed via the stack pointer) in functions that seem suitable. */
8760
8761 static bool
8762 ix86_frame_pointer_required (void)
8763 {
8764 /* If we accessed previous frames, then the generated code expects
8765 to be able to access the saved ebp value in our frame. */
8766 if (cfun->machine->accesses_prev_frame)
8767 return true;
8768
8769 /* Several x86 OSes need a frame pointer for other reasons,
8770 usually pertaining to setjmp. */
8771 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8772 return true;
8773
8774 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8775 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8776 return true;
8777
8778 /* Under Win64 SEH, very large frames need a frame pointer, as the maximum
8779 stack allocation is 4GB. */
8780 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8781 return true;
8782
8783 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8784 turns off the frame pointer by default. Turn it back on now if
8785 we've not got a leaf function. */
8786 if (TARGET_OMIT_LEAF_FRAME_POINTER
8787 && (!crtl->is_leaf
8788 || ix86_current_function_calls_tls_descriptor))
8789 return true;
8790
8791 if (crtl->profile && !flag_fentry)
8792 return true;
8793
8794 return false;
8795 }
8796
8797 /* Record that the current function accesses previous call frames. */
8798
8799 void
8800 ix86_setup_frame_addresses (void)
8801 {
8802 cfun->machine->accesses_prev_frame = 1;
8803 }
8804 \f
8805 #ifndef USE_HIDDEN_LINKONCE
8806 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8807 # define USE_HIDDEN_LINKONCE 1
8808 # else
8809 # define USE_HIDDEN_LINKONCE 0
8810 # endif
8811 #endif
8812
8813 static int pic_labels_used;
8814
8815 /* Fills in the label name that should be used for a pc thunk for
8816 the given register. */
8817
8818 static void
8819 get_pc_thunk_name (char name[32], unsigned int regno)
8820 {
8821 gcc_assert (!TARGET_64BIT);
8822
8823 if (USE_HIDDEN_LINKONCE)
8824 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8825 else
8826 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8827 }
8828
8829
8830 /* This function generates code for -fpic that loads %ebx with
8831 the return address of the caller and then returns. */
8832
8833 static void
8834 ix86_code_end (void)
8835 {
8836 rtx xops[2];
8837 int regno;
8838
8839 for (regno = AX_REG; regno <= SP_REG; regno++)
8840 {
8841 char name[32];
8842 tree decl;
8843
8844 if (!(pic_labels_used & (1 << regno)))
8845 continue;
8846
8847 get_pc_thunk_name (name, regno);
8848
8849 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8850 get_identifier (name),
8851 build_function_type_list (void_type_node, NULL_TREE));
8852 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8853 NULL_TREE, void_type_node);
8854 TREE_PUBLIC (decl) = 1;
8855 TREE_STATIC (decl) = 1;
8856 DECL_IGNORED_P (decl) = 1;
8857
8858 #if TARGET_MACHO
8859 if (TARGET_MACHO)
8860 {
8861 switch_to_section (darwin_sections[text_coal_section]);
8862 fputs ("\t.weak_definition\t", asm_out_file);
8863 assemble_name (asm_out_file, name);
8864 fputs ("\n\t.private_extern\t", asm_out_file);
8865 assemble_name (asm_out_file, name);
8866 putc ('\n', asm_out_file);
8867 ASM_OUTPUT_LABEL (asm_out_file, name);
8868 DECL_WEAK (decl) = 1;
8869 }
8870 else
8871 #endif
8872 if (USE_HIDDEN_LINKONCE)
8873 {
8874 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8875
8876 targetm.asm_out.unique_section (decl, 0);
8877 switch_to_section (get_named_section (decl, NULL, 0));
8878
8879 targetm.asm_out.globalize_label (asm_out_file, name);
8880 fputs ("\t.hidden\t", asm_out_file);
8881 assemble_name (asm_out_file, name);
8882 putc ('\n', asm_out_file);
8883 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8884 }
8885 else
8886 {
8887 switch_to_section (text_section);
8888 ASM_OUTPUT_LABEL (asm_out_file, name);
8889 }
8890
8891 DECL_INITIAL (decl) = make_node (BLOCK);
8892 current_function_decl = decl;
8893 init_function_start (decl);
8894 first_function_block_is_cold = false;
8895 /* Make sure unwind info is emitted for the thunk if needed. */
8896 final_start_function (emit_barrier (), asm_out_file, 1);
8897
8898 /* Pad stack IP move with 4 instructions (two NOPs count
8899 as one instruction). */
8900 if (TARGET_PAD_SHORT_FUNCTION)
8901 {
8902 int i = 8;
8903
8904 while (i--)
8905 fputs ("\tnop\n", asm_out_file);
8906 }
8907
8908 xops[0] = gen_rtx_REG (Pmode, regno);
8909 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8910 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8911 fputs ("\tret\n", asm_out_file);
8912 final_end_function ();
8913 init_insn_lengths ();
8914 free_after_compilation (cfun);
8915 set_cfun (NULL);
8916 current_function_decl = NULL;
8917 }
8918
8919 if (flag_split_stack)
8920 file_end_indicate_split_stack ();
8921 }
8922
8923 /* Emit code for the SET_GOT patterns. */
8924
8925 const char *
8926 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8927 {
8928 rtx xops[3];
8929
8930 xops[0] = dest;
8931
8932 if (TARGET_VXWORKS_RTP && flag_pic)
8933 {
8934 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8935 xops[2] = gen_rtx_MEM (Pmode,
8936 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8937 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8938
8939 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8940 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8941 an unadorned address. */
8942 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8943 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8944 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8945 return "";
8946 }
8947
8948 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8949
8950 if (!flag_pic)
8951 {
8952 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8953
8954 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8955
8956 #if TARGET_MACHO
8957 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8958 is what will be referenced by the Mach-O PIC subsystem. */
8959 if (!label)
8960 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8961 #endif
8962
8963 targetm.asm_out.internal_label (asm_out_file, "L",
8964 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8965 }
8966 else
8967 {
8968 char name[32];
8969 get_pc_thunk_name (name, REGNO (dest));
8970 pic_labels_used |= 1 << REGNO (dest);
8971
8972 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8973 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8974 output_asm_insn ("call\t%X2", xops);
8975 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8976 is what will be referenced by the Mach-O PIC subsystem. */
8977 #if TARGET_MACHO
8978 if (!label)
8979 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8980 else
8981 targetm.asm_out.internal_label (asm_out_file, "L",
8982 CODE_LABEL_NUMBER (label));
8983 #endif
8984 }
8985
8986 if (!TARGET_MACHO)
8987 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8988
8989 return "";
8990 }
8991
8992 /* Generate a "push" pattern for input ARG. */
8993
8994 static rtx
8995 gen_push (rtx arg)
8996 {
8997 struct machine_function *m = cfun->machine;
8998
8999 if (m->fs.cfa_reg == stack_pointer_rtx)
9000 m->fs.cfa_offset += UNITS_PER_WORD;
9001 m->fs.sp_offset += UNITS_PER_WORD;
9002
9003 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9004 arg = gen_rtx_REG (word_mode, REGNO (arg));
9005
9006 return gen_rtx_SET (VOIDmode,
9007 gen_rtx_MEM (word_mode,
9008 gen_rtx_PRE_DEC (Pmode,
9009 stack_pointer_rtx)),
9010 arg);
9011 }
9012
9013 /* Generate a "pop" pattern for input ARG. */
9014
9015 static rtx
9016 gen_pop (rtx arg)
9017 {
9018 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9019 arg = gen_rtx_REG (word_mode, REGNO (arg));
9020
9021 return gen_rtx_SET (VOIDmode,
9022 arg,
9023 gen_rtx_MEM (word_mode,
9024 gen_rtx_POST_INC (Pmode,
9025 stack_pointer_rtx)));
9026 }
9027
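/* Illustrative sketch (disabled): how gen_push and gen_pop are typically
   consumed. Each returns only a SET pattern; it still has to be emitted,
   and prologue pushes are marked frame-related so unwind info is produced
   for them. The function name below is hypothetical. */
#if 0
static void
example_push_pop_ebx (void)
{
  rtx reg = gen_rtx_REG (word_mode, BX_REG);
  rtx insn = emit_insn (gen_push (reg));  /* (set (mem (pre_dec sp)) reg) */
  RTX_FRAME_RELATED_P (insn) = 1;

  /* ... later, e.g. in the epilogue ... */
  emit_insn (gen_pop (reg));              /* (set reg (mem (post_inc sp))) */
}
#endif
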
9028 /* Return >= 0 if there is an unused call-clobbered register available
9029 for the entire function. */
9030
9031 static unsigned int
9032 ix86_select_alt_pic_regnum (void)
9033 {
9034 if (crtl->is_leaf
9035 && !crtl->profile
9036 && !ix86_current_function_calls_tls_descriptor)
9037 {
9038 int i, drap;
9039 /* Can't use the same register for both PIC and DRAP. */
9040 if (crtl->drap_reg)
9041 drap = REGNO (crtl->drap_reg);
9042 else
9043 drap = -1;
9044 for (i = 2; i >= 0; --i)
9045 if (i != drap && !df_regs_ever_live_p (i))
9046 return i;
9047 }
9048
9049 return INVALID_REGNUM;
9050 }
9051
9052 /* Return TRUE if we need to save REGNO. */
9053
9054 static bool
9055 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9056 {
9057 if (pic_offset_table_rtx
9058 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9059 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9060 || crtl->profile
9061 || crtl->calls_eh_return
9062 || crtl->uses_const_pool))
9063 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9064
9065 if (crtl->calls_eh_return && maybe_eh_return)
9066 {
9067 unsigned i;
9068 for (i = 0; ; i++)
9069 {
9070 unsigned test = EH_RETURN_DATA_REGNO (i);
9071 if (test == INVALID_REGNUM)
9072 break;
9073 if (test == regno)
9074 return true;
9075 }
9076 }
9077
9078 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9079 return true;
9080
9081 return (df_regs_ever_live_p (regno)
9082 && !call_used_regs[regno]
9083 && !fixed_regs[regno]
9084 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9085 }
9086
9087 /* Return the number of saved general purpose registers. */
9088
9089 static int
9090 ix86_nsaved_regs (void)
9091 {
9092 int nregs = 0;
9093 int regno;
9094
9095 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9096 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9097 nregs ++;
9098 return nregs;
9099 }
9100
9101 /* Return the number of saved SSE registers. */
9102
9103 static int
9104 ix86_nsaved_sseregs (void)
9105 {
9106 int nregs = 0;
9107 int regno;
9108
9109 if (!TARGET_64BIT_MS_ABI)
9110 return 0;
9111 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9112 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9113 nregs ++;
9114 return nregs;
9115 }
9116
9117 /* Given FROM and TO register numbers, say whether this elimination is
9118 allowed. If stack alignment is needed, we can only replace argument
9119 pointer with hard frame pointer, or replace frame pointer with stack
9120 pointer. Otherwise, frame pointer elimination is automatically
9121 handled and all other eliminations are valid. */
9122
9123 static bool
9124 ix86_can_eliminate (const int from, const int to)
9125 {
9126 if (stack_realign_fp)
9127 return ((from == ARG_POINTER_REGNUM
9128 && to == HARD_FRAME_POINTER_REGNUM)
9129 || (from == FRAME_POINTER_REGNUM
9130 && to == STACK_POINTER_REGNUM));
9131 else
9132 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9133 }
9134
9135 /* Return the offset between two registers, one to be eliminated, and the other
9136 its replacement, at the start of a routine. */
9137
9138 HOST_WIDE_INT
9139 ix86_initial_elimination_offset (int from, int to)
9140 {
9141 struct ix86_frame frame;
9142 ix86_compute_frame_layout (&frame);
9143
9144 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9145 return frame.hard_frame_pointer_offset;
9146 else if (from == FRAME_POINTER_REGNUM
9147 && to == HARD_FRAME_POINTER_REGNUM)
9148 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9149 else
9150 {
9151 gcc_assert (to == STACK_POINTER_REGNUM);
9152
9153 if (from == ARG_POINTER_REGNUM)
9154 return frame.stack_pointer_offset;
9155
9156 gcc_assert (from == FRAME_POINTER_REGNUM);
9157 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9158 }
9159 }
9160
9161 /* In a dynamically-aligned function, we can't know the offset from
9162 stack pointer to frame pointer, so we must ensure that setjmp
9163 eliminates fp against the hard fp (%ebp) rather than trying to
9164 index from %esp up to the top of the frame across a gap that is
9165 of unknown (at compile-time) size. */
9166 static rtx
9167 ix86_builtin_setjmp_frame_value (void)
9168 {
9169 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9170 }
9171
9172 /* When using -fsplit-stack, the allocation routines set a field in
9173 the TCB to the bottom of the stack plus this much space, measured
9174 in bytes. */
9175
9176 #define SPLIT_STACK_AVAILABLE 256
9177
9178 /* Fill structure ix86_frame about frame of currently computed function. */
9179
9180 static void
9181 ix86_compute_frame_layout (struct ix86_frame *frame)
9182 {
9183 unsigned HOST_WIDE_INT stack_alignment_needed;
9184 HOST_WIDE_INT offset;
9185 unsigned HOST_WIDE_INT preferred_alignment;
9186 HOST_WIDE_INT size = get_frame_size ();
9187 HOST_WIDE_INT to_allocate;
9188
9189 frame->nregs = ix86_nsaved_regs ();
9190 frame->nsseregs = ix86_nsaved_sseregs ();
9191
9192 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9193 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9194
9195 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
9196 except for function prologues and leaf functions. */
9197 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9198 && (!crtl->is_leaf || cfun->calls_alloca != 0
9199 || ix86_current_function_calls_tls_descriptor))
9200 {
9201 preferred_alignment = 16;
9202 stack_alignment_needed = 16;
9203 crtl->preferred_stack_boundary = 128;
9204 crtl->stack_alignment_needed = 128;
9205 }
9206
9207 gcc_assert (!size || stack_alignment_needed);
9208 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9209 gcc_assert (preferred_alignment <= stack_alignment_needed);
9210
9211 /* For SEH we have to limit the amount of code movement into the prologue.
9212 At present we do this via a BLOCKAGE, at which point there's very little
9213 scheduling that can be done, which means that there's very little point
9214 in doing anything except PUSHs. */
9215 if (TARGET_SEH)
9216 cfun->machine->use_fast_prologue_epilogue = false;
9217
9218 /* During reload iterations the number of registers saved can change.
9219 Recompute the value as needed. Do not recompute when the number of
9220 registers didn't change, as reload calls this function multiple times
9221 and does not expect the decision to change within a single iteration. */
9222 else if (!optimize_function_for_size_p (cfun)
9223 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9224 {
9225 int count = frame->nregs;
9226 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9227
9228 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9229
9230 /* The fast prologue uses move instead of push to save registers. This
9231 is significantly longer, but also executes faster as modern hardware
9232 can execute the moves in parallel, but can't do that for push/pop.
9233
9234 Be careful about choosing which prologue to emit: when the function
9235 takes many instructions to execute we may use the slow version, as well
9236 as when the function is known to be outside a hot spot (this is known
9237 only with feedback). Weight the size of the function by the number of
9238 registers to save, as it is cheap to use one or two push instructions
9239 but very slow to use many of them. */
9240 if (count)
9241 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9242 if (node->frequency < NODE_FREQUENCY_NORMAL
9243 || (flag_branch_probabilities
9244 && node->frequency < NODE_FREQUENCY_HOT))
9245 cfun->machine->use_fast_prologue_epilogue = false;
9246 else
9247 cfun->machine->use_fast_prologue_epilogue
9248 = !expensive_function_p (count);
9249 }
9250
9251 frame->save_regs_using_mov
9252 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9253 /* If static stack checking is enabled and done with probes,
9254 the registers need to be saved before allocating the frame. */
9255 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9256
9257 /* Skip return address. */
9258 offset = UNITS_PER_WORD;
9259
9260 /* Skip pushed static chain. */
9261 if (ix86_static_chain_on_stack)
9262 offset += UNITS_PER_WORD;
9263
9264 /* Skip saved base pointer. */
9265 if (frame_pointer_needed)
9266 offset += UNITS_PER_WORD;
9267 frame->hfp_save_offset = offset;
9268
9269 /* The traditional frame pointer location is at the top of the frame. */
9270 frame->hard_frame_pointer_offset = offset;
9271
9272 /* Register save area */
9273 offset += frame->nregs * UNITS_PER_WORD;
9274 frame->reg_save_offset = offset;
9275
9276 /* On SEH target, registers are pushed just before the frame pointer
9277 location. */
9278 if (TARGET_SEH)
9279 frame->hard_frame_pointer_offset = offset;
9280
9281 /* Align and set SSE register save area. */
9282 if (frame->nsseregs)
9283 {
9284 /* The only ABI that has saved SSE registers (Win64) also has a
9285 16-byte aligned default stack, and thus we don't need to be
9286 within the re-aligned local stack frame to save them. */
9287 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9288 offset = (offset + 16 - 1) & -16;
9289 offset += frame->nsseregs * 16;
9290 }
9291 frame->sse_reg_save_offset = offset;
9292
9293 /* The re-aligned stack starts here. Values before this point are not
9294 directly comparable with values below this point. In order to make
9295 sure that no value happens to be the same before and after, force
9296 the alignment computation below to add a non-zero value. */
9297 if (stack_realign_fp)
9298 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9299
9300 /* Va-arg area */
9301 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9302 offset += frame->va_arg_size;
9303
9304 /* Align start of frame for local function. */
9305 if (stack_realign_fp
9306 || offset != frame->sse_reg_save_offset
9307 || size != 0
9308 || !crtl->is_leaf
9309 || cfun->calls_alloca
9310 || ix86_current_function_calls_tls_descriptor)
9311 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9312
9313 /* Frame pointer points here. */
9314 frame->frame_pointer_offset = offset;
9315
9316 offset += size;
9317
9318 /* Add outgoing arguments area. Can be skipped if we eliminated
9319 all the function calls as dead code.
9320 Skipping is however impossible when the function calls alloca: the
9321 alloca expander assumes that the last crtl->outgoing_args_size bytes
9322 of the stack frame are unused. */
9323 if (ACCUMULATE_OUTGOING_ARGS
9324 && (!crtl->is_leaf || cfun->calls_alloca
9325 || ix86_current_function_calls_tls_descriptor))
9326 {
9327 offset += crtl->outgoing_args_size;
9328 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9329 }
9330 else
9331 frame->outgoing_arguments_size = 0;
9332
9333 /* Align stack boundary. Only needed if we're calling another function
9334 or using alloca. */
9335 if (!crtl->is_leaf || cfun->calls_alloca
9336 || ix86_current_function_calls_tls_descriptor)
9337 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9338
9339 /* We've reached end of stack frame. */
9340 frame->stack_pointer_offset = offset;
9341
9342 /* Size prologue needs to allocate. */
9343 to_allocate = offset - frame->sse_reg_save_offset;
9344
9345 if ((!to_allocate && frame->nregs <= 1)
9346 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9347 frame->save_regs_using_mov = false;
9348
9349 if (ix86_using_red_zone ()
9350 && crtl->sp_is_unchanging
9351 && crtl->is_leaf
9352 && !ix86_current_function_calls_tls_descriptor)
9353 {
9354 frame->red_zone_size = to_allocate;
9355 if (frame->save_regs_using_mov)
9356 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9357 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9358 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9359 }
9360 else
9361 frame->red_zone_size = 0;
9362 frame->stack_pointer_offset -= frame->red_zone_size;
9363
9364 /* The SEH frame pointer location is near the bottom of the frame.
9365 This is enforced by the fact that the difference between the
9366 stack pointer and the frame pointer is limited to 240 bytes in
9367 the unwind data structure. */
9368 if (TARGET_SEH)
9369 {
9370 HOST_WIDE_INT diff;
9371
9372 /* If we can leave the frame pointer where it is, do so. This also
9373 returns the establisher frame for __builtin_frame_address (0). */
9374 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9375 if (diff <= SEH_MAX_FRAME_SIZE
9376 && (diff > 240 || (diff & 15) != 0)
9377 && !crtl->accesses_prior_frames)
9378 {
9379 /* Ideally we'd determine what portion of the local stack frame
9380 (within the constraint of the lowest 240) is most heavily used.
9381 But without that complication, simply bias the frame pointer
9382 by 128 bytes so as to maximize the amount of the local stack
9383 frame that is addressable with 8-bit offsets. */
9384 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9385 }
9386 }
9387 }
9388
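/* Summary of the layout computed above (illustrative, offsets measured from
   the CFA downwards): return address, then the optional pushed static chain,
   the saved frame pointer, the GP register save area, the 16-byte aligned
   SSE register save area, the va_arg register save area, the (re-aligned)
   local variables, and finally the outgoing argument area; the red zone,
   when usable, is carved out of the allocation at the very bottom. */
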
9389 /* This is semi-inlined memory_address_length, but simplified
9390 since we know that we're always dealing with reg+offset, and
9391 to avoid having to create and discard all that rtl. */
9392
9393 static inline int
9394 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9395 {
9396 int len = 4;
9397
9398 if (offset == 0)
9399 {
9400 /* EBP and R13 cannot be encoded without an offset. */
9401 len = (regno == BP_REG || regno == R13_REG);
9402 }
9403 else if (IN_RANGE (offset, -128, 127))
9404 len = 1;
9405
9406 /* ESP and R12 must be encoded with a SIB byte. */
9407 if (regno == SP_REG || regno == R12_REG)
9408 len++;
9409
9410 return len;
9411 }
9412
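/* Illustrative sketch (disabled): a few sample values of the address-length
   heuristic above. The helper name is hypothetical. */
#if 0
static void
example_baseaddr_len_values (void)
{
  gcc_checking_assert (choose_baseaddr_len (AX_REG, 0) == 0);    /* no disp */
  gcc_checking_assert (choose_baseaddr_len (BP_REG, 0) == 1);    /* disp8 forced */
  gcc_checking_assert (choose_baseaddr_len (SP_REG, 0) == 1);    /* SIB byte */
  gcc_checking_assert (choose_baseaddr_len (AX_REG, 100) == 1);  /* disp8 */
  gcc_checking_assert (choose_baseaddr_len (R12_REG, 4096) == 5);/* SIB + disp32 */
}
#endif
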
9413 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9414 The valid base registers are taken from CFUN->MACHINE->FS. */
9415
9416 static rtx
9417 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9418 {
9419 const struct machine_function *m = cfun->machine;
9420 rtx base_reg = NULL;
9421 HOST_WIDE_INT base_offset = 0;
9422
9423 if (m->use_fast_prologue_epilogue)
9424 {
9425 /* Choose the base register most likely to allow the most scheduling
9426 opportunities. Generally FP is valid throughout the function,
9427 while DRAP must be reloaded within the epilogue. But choose either
9428 over the SP due to increased encoding size. */
9429
9430 if (m->fs.fp_valid)
9431 {
9432 base_reg = hard_frame_pointer_rtx;
9433 base_offset = m->fs.fp_offset - cfa_offset;
9434 }
9435 else if (m->fs.drap_valid)
9436 {
9437 base_reg = crtl->drap_reg;
9438 base_offset = 0 - cfa_offset;
9439 }
9440 else if (m->fs.sp_valid)
9441 {
9442 base_reg = stack_pointer_rtx;
9443 base_offset = m->fs.sp_offset - cfa_offset;
9444 }
9445 }
9446 else
9447 {
9448 HOST_WIDE_INT toffset;
9449 int len = 16, tlen;
9450
9451 /* Choose the base register with the smallest address encoding.
9452 With a tie, choose FP > DRAP > SP. */
9453 if (m->fs.sp_valid)
9454 {
9455 base_reg = stack_pointer_rtx;
9456 base_offset = m->fs.sp_offset - cfa_offset;
9457 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9458 }
9459 if (m->fs.drap_valid)
9460 {
9461 toffset = 0 - cfa_offset;
9462 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9463 if (tlen <= len)
9464 {
9465 base_reg = crtl->drap_reg;
9466 base_offset = toffset;
9467 len = tlen;
9468 }
9469 }
9470 if (m->fs.fp_valid)
9471 {
9472 toffset = m->fs.fp_offset - cfa_offset;
9473 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9474 if (tlen <= len)
9475 {
9476 base_reg = hard_frame_pointer_rtx;
9477 base_offset = toffset;
9478 len = tlen;
9479 }
9480 }
9481 }
9482 gcc_assert (base_reg != NULL);
9483
9484 return plus_constant (Pmode, base_reg, base_offset);
9485 }
9486
9487 /* Emit code to save registers in the prologue. */
9488
9489 static void
9490 ix86_emit_save_regs (void)
9491 {
9492 unsigned int regno;
9493 rtx insn;
9494
9495 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9496 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9497 {
9498 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9499 RTX_FRAME_RELATED_P (insn) = 1;
9500 }
9501 }
9502
9503 /* Emit a single register save at CFA - CFA_OFFSET. */
9504
9505 static void
9506 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9507 HOST_WIDE_INT cfa_offset)
9508 {
9509 struct machine_function *m = cfun->machine;
9510 rtx reg = gen_rtx_REG (mode, regno);
9511 rtx mem, addr, base, insn;
9512
9513 addr = choose_baseaddr (cfa_offset);
9514 mem = gen_frame_mem (mode, addr);
9515
9516 /* For SSE saves, we need to indicate the 128-bit alignment. */
9517 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9518
9519 insn = emit_move_insn (mem, reg);
9520 RTX_FRAME_RELATED_P (insn) = 1;
9521
9522 base = addr;
9523 if (GET_CODE (base) == PLUS)
9524 base = XEXP (base, 0);
9525 gcc_checking_assert (REG_P (base));
9526
9527 /* When saving registers into a re-aligned local stack frame, avoid
9528 any tricky guessing by dwarf2out. */
9529 if (m->fs.realigned)
9530 {
9531 gcc_checking_assert (stack_realign_drap);
9532
9533 if (regno == REGNO (crtl->drap_reg))
9534 {
9535 /* A bit of a hack. We force the DRAP register to be saved in
9536 the re-aligned stack frame, which provides us with a copy
9537 of the CFA that will last past the prologue. Install it. */
9538 gcc_checking_assert (cfun->machine->fs.fp_valid);
9539 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9540 cfun->machine->fs.fp_offset - cfa_offset);
9541 mem = gen_rtx_MEM (mode, addr);
9542 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9543 }
9544 else
9545 {
9546 /* The frame pointer is a stable reference within the
9547 aligned frame. Use it. */
9548 gcc_checking_assert (cfun->machine->fs.fp_valid);
9549 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9550 cfun->machine->fs.fp_offset - cfa_offset);
9551 mem = gen_rtx_MEM (mode, addr);
9552 add_reg_note (insn, REG_CFA_EXPRESSION,
9553 gen_rtx_SET (VOIDmode, mem, reg));
9554 }
9555 }
9556
9557 /* The memory may not be relative to the current CFA register,
9558 which means that we may need to generate a new pattern for
9559 use by the unwind info. */
9560 else if (base != m->fs.cfa_reg)
9561 {
9562 addr = plus_constant (Pmode, m->fs.cfa_reg,
9563 m->fs.cfa_offset - cfa_offset);
9564 mem = gen_rtx_MEM (mode, addr);
9565 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9566 }
9567 }
9568
9569 /* Emit code to save registers using MOV insns.
9570 First register is stored at CFA - CFA_OFFSET. */
9571 static void
9572 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9573 {
9574 unsigned int regno;
9575
9576 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9577 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9578 {
9579 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9580 cfa_offset -= UNITS_PER_WORD;
9581 }
9582 }
9583
9584 /* Emit code to save SSE registers using MOV insns.
9585 First register is stored at CFA - CFA_OFFSET. */
9586 static void
9587 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9588 {
9589 unsigned int regno;
9590
9591 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9592 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9593 {
9594 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9595 cfa_offset -= 16;
9596 }
9597 }
9598
9599 static GTY(()) rtx queued_cfa_restores;
9600
9601 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9602 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9603 Don't add the note if the previously saved value will be left untouched
9604 within stack red-zone till return, as unwinders can find the same value
9605 in the register and on the stack. */
9606
9607 static void
9608 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9609 {
9610 if (!crtl->shrink_wrapped
9611 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9612 return;
9613
9614 if (insn)
9615 {
9616 add_reg_note (insn, REG_CFA_RESTORE, reg);
9617 RTX_FRAME_RELATED_P (insn) = 1;
9618 }
9619 else
9620 queued_cfa_restores
9621 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9622 }
9623
9624 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9625
9626 static void
9627 ix86_add_queued_cfa_restore_notes (rtx insn)
9628 {
9629 rtx last;
9630 if (!queued_cfa_restores)
9631 return;
9632 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9633 ;
9634 XEXP (last, 1) = REG_NOTES (insn);
9635 REG_NOTES (insn) = queued_cfa_restores;
9636 queued_cfa_restores = NULL_RTX;
9637 RTX_FRAME_RELATED_P (insn) = 1;
9638 }
9639
9640 /* Expand prologue or epilogue stack adjustment.
9641 The pattern exists to put a dependency on all ebp-based memory accesses.
9642 STYLE should be negative if instructions should be marked as frame related,
9643 zero if %r11 register is live and cannot be freely used and positive
9644 otherwise. */
9645
9646 static void
9647 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9648 int style, bool set_cfa)
9649 {
9650 struct machine_function *m = cfun->machine;
9651 rtx insn;
9652 bool add_frame_related_expr = false;
9653
9654 if (Pmode == SImode)
9655 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9656 else if (x86_64_immediate_operand (offset, DImode))
9657 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9658 else
9659 {
9660 rtx tmp;
9661 /* r11 is used by indirect sibcall return as well, set before the
9662 epilogue and used after the epilogue. */
9663 if (style)
9664 tmp = gen_rtx_REG (DImode, R11_REG);
9665 else
9666 {
9667 gcc_assert (src != hard_frame_pointer_rtx
9668 && dest != hard_frame_pointer_rtx);
9669 tmp = hard_frame_pointer_rtx;
9670 }
9671 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9672 if (style < 0)
9673 add_frame_related_expr = true;
9674
9675 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9676 }
9677
9678 insn = emit_insn (insn);
9679 if (style >= 0)
9680 ix86_add_queued_cfa_restore_notes (insn);
9681
9682 if (set_cfa)
9683 {
9684 rtx r;
9685
9686 gcc_assert (m->fs.cfa_reg == src);
9687 m->fs.cfa_offset += INTVAL (offset);
9688 m->fs.cfa_reg = dest;
9689
9690 r = gen_rtx_PLUS (Pmode, src, offset);
9691 r = gen_rtx_SET (VOIDmode, dest, r);
9692 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9693 RTX_FRAME_RELATED_P (insn) = 1;
9694 }
9695 else if (style < 0)
9696 {
9697 RTX_FRAME_RELATED_P (insn) = 1;
9698 if (add_frame_related_expr)
9699 {
9700 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9701 r = gen_rtx_SET (VOIDmode, dest, r);
9702 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9703 }
9704 }
9705
9706 if (dest == stack_pointer_rtx)
9707 {
9708 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9709 bool valid = m->fs.sp_valid;
9710
9711 if (src == hard_frame_pointer_rtx)
9712 {
9713 valid = m->fs.fp_valid;
9714 ooffset = m->fs.fp_offset;
9715 }
9716 else if (src == crtl->drap_reg)
9717 {
9718 valid = m->fs.drap_valid;
9719 ooffset = 0;
9720 }
9721 else
9722 {
9723 /* Else there are two possibilities: SP itself, which we set
9724 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9725 taken care of by hand along the eh_return path. */
9726 gcc_checking_assert (src == stack_pointer_rtx
9727 || offset == const0_rtx);
9728 }
9729
9730 m->fs.sp_offset = ooffset - INTVAL (offset);
9731 m->fs.sp_valid = valid;
9732 }
9733 }
9734
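/* Illustrative sketch (disabled): a typical prologue-side use of the helper
   above, allocating ALLOCATE bytes below the stack pointer and marking the
   insn frame-related (style -1). The variable names are assumed from the
   prologue expander and are shown here only as an example. */
#if 0
  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                             GEN_INT (-allocate), -1,
                             m->fs.cfa_reg == stack_pointer_rtx);
#endif
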
9735 /* Find an available register to be used as the dynamic realign argument
9736 pointer register. Such a register will be written in the prologue and
9737 used at the beginning of the body, so it must not be
9738 1. a parameter passing register.
9739 2. the GOT pointer.
9740 We reuse the static-chain register if it is available. Otherwise, we
9741 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9742 shorter encoding.
9743
9744 Return: the regno of chosen register. */
9745
9746 static unsigned int
9747 find_drap_reg (void)
9748 {
9749 tree decl = cfun->decl;
9750
9751 if (TARGET_64BIT)
9752 {
9753 /* Use R13 for nested functions or functions that need a static chain.
9754 Since a function with a tail call may use any caller-saved
9755 register in the epilogue, DRAP must not use a caller-saved
9756 register in such a case. */
9757 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9758 return R13_REG;
9759
9760 return R10_REG;
9761 }
9762 else
9763 {
9764 /* Use DI for nested functions or functions that need a static chain.
9765 Since a function with a tail call may use any caller-saved
9766 register in the epilogue, DRAP must not use a caller-saved
9767 register in such a case. */
9768 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9769 return DI_REG;
9770
9771 /* Reuse static chain register if it isn't used for parameter
9772 passing. */
9773 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9774 {
9775 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9776 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9777 return CX_REG;
9778 }
9779 return DI_REG;
9780 }
9781 }
9782
9783 /* Return minimum incoming stack alignment. */
9784
9785 static unsigned int
9786 ix86_minimum_incoming_stack_boundary (bool sibcall)
9787 {
9788 unsigned int incoming_stack_boundary;
9789
9790 /* Prefer the one specified at command line. */
9791 if (ix86_user_incoming_stack_boundary)
9792 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9793 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9794 if -mstackrealign is used, this is not a sibcall check, and the
9795 estimated stack alignment is 128 bits. */
9796 else if (!sibcall
9797 && !TARGET_64BIT
9798 && ix86_force_align_arg_pointer
9799 && crtl->stack_alignment_estimated == 128)
9800 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9801 else
9802 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9803
9804 /* Incoming stack alignment can be changed on individual functions
9805 via force_align_arg_pointer attribute. We use the smallest
9806 incoming stack boundary. */
9807 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9808 && lookup_attribute (ix86_force_align_arg_pointer_string,
9809 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9810 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9811
9812 /* The incoming stack frame has to be aligned at least at
9813 parm_stack_boundary. */
9814 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9815 incoming_stack_boundary = crtl->parm_stack_boundary;
9816
9817 /* Stack at entrance of main is aligned by runtime. We use the
9818 smallest incoming stack boundary. */
9819 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9820 && DECL_NAME (current_function_decl)
9821 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9822 && DECL_FILE_SCOPE_P (current_function_decl))
9823 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9824
9825 return incoming_stack_boundary;
9826 }
9827
9828 /* Update incoming stack boundary and estimated stack alignment. */
9829
9830 static void
9831 ix86_update_stack_boundary (void)
9832 {
9833 ix86_incoming_stack_boundary
9834 = ix86_minimum_incoming_stack_boundary (false);
9835
9836 /* x86_64 varargs need 16-byte stack alignment for the register save
9837 area. */
9838 if (TARGET_64BIT
9839 && cfun->stdarg
9840 && crtl->stack_alignment_estimated < 128)
9841 crtl->stack_alignment_estimated = 128;
9842 }
9843
9844 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9845 needed or an rtx for DRAP otherwise. */
9846
9847 static rtx
9848 ix86_get_drap_rtx (void)
9849 {
9850 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9851 crtl->need_drap = true;
9852
9853 if (stack_realign_drap)
9854 {
9855 /* Assign DRAP to vDRAP and return vDRAP. */
9856 unsigned int regno = find_drap_reg ();
9857 rtx drap_vreg;
9858 rtx arg_ptr;
9859 rtx seq, insn;
9860
9861 arg_ptr = gen_rtx_REG (Pmode, regno);
9862 crtl->drap_reg = arg_ptr;
9863
9864 start_sequence ();
9865 drap_vreg = copy_to_reg (arg_ptr);
9866 seq = get_insns ();
9867 end_sequence ();
9868
9869 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9870 if (!optimize)
9871 {
9872 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9873 RTX_FRAME_RELATED_P (insn) = 1;
9874 }
9875 return drap_vreg;
9876 }
9877 else
9878 return NULL;
9879 }
9880
9881 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9882
9883 static rtx
9884 ix86_internal_arg_pointer (void)
9885 {
9886 return virtual_incoming_args_rtx;
9887 }
9888
9889 struct scratch_reg {
9890 rtx reg;
9891 bool saved;
9892 };
9893
9894 /* Return a short-lived scratch register for use on function entry.
9895 In 32-bit mode, it is valid only after the registers are saved
9896 in the prologue. This register must be released by means of
9897 release_scratch_register_on_entry once it is dead. */
9898
9899 static void
9900 get_scratch_register_on_entry (struct scratch_reg *sr)
9901 {
9902 int regno;
9903
9904 sr->saved = false;
9905
9906 if (TARGET_64BIT)
9907 {
9908 /* We always use R11 in 64-bit mode. */
9909 regno = R11_REG;
9910 }
9911 else
9912 {
9913 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9914 bool fastcall_p
9915 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9916 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9917 int regparm = ix86_function_regparm (fntype, decl);
9918 int drap_regno
9919 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9920
9921 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9922 for the static chain register. */
9923 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9924 && drap_regno != AX_REG)
9925 regno = AX_REG;
9926 else if (regparm < 2 && drap_regno != DX_REG)
9927 regno = DX_REG;
9928 /* ecx is the static chain register. */
9929 else if (regparm < 3 && !fastcall_p && !static_chain_p
9930 && drap_regno != CX_REG)
9931 regno = CX_REG;
9932 else if (ix86_save_reg (BX_REG, true))
9933 regno = BX_REG;
9934 /* esi is the static chain register. */
9935 else if (!(regparm == 3 && static_chain_p)
9936 && ix86_save_reg (SI_REG, true))
9937 regno = SI_REG;
9938 else if (ix86_save_reg (DI_REG, true))
9939 regno = DI_REG;
9940 else
9941 {
9942 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9943 sr->saved = true;
9944 }
9945 }
9946
9947 sr->reg = gen_rtx_REG (Pmode, regno);
9948 if (sr->saved)
9949 {
9950 rtx insn = emit_insn (gen_push (sr->reg));
9951 RTX_FRAME_RELATED_P (insn) = 1;
9952 }
9953 }
9954
9955 /* Release a scratch register obtained from the preceding function. */
9956
9957 static void
9958 release_scratch_register_on_entry (struct scratch_reg *sr)
9959 {
9960 if (sr->saved)
9961 {
9962 rtx x, insn = emit_insn (gen_pop (sr->reg));
9963
9964 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9965 RTX_FRAME_RELATED_P (insn) = 1;
9966 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9967 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9968 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9969 }
9970 }
9971
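/* Illustrative sketch (disabled): the expected pairing of the two helpers
   above, as used by the stack-probing code that follows. */
#if 0
  {
    struct scratch_reg sr;

    get_scratch_register_on_entry (&sr);
    /* ... emit code that clobbers sr.reg ... */
    release_scratch_register_on_entry (&sr);
  }
#endif
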
9972 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9973
9974 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9975
9976 static void
9977 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9978 {
9979 /* We skip the probe for the first interval + a small dope of 4 words and
9980 probe that many bytes past the specified size to maintain a protection
9981 area at the bottom of the stack. */
9982 const int dope = 4 * UNITS_PER_WORD;
9983 rtx size_rtx = GEN_INT (size), last;
9984
9985 /* See if we have a constant small number of probes to generate. If so,
9986 that's the easy case. The run-time loop is made up of 11 insns in the
9987 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9988 for n # of intervals. */
9989 if (size <= 5 * PROBE_INTERVAL)
9990 {
9991 HOST_WIDE_INT i, adjust;
9992 bool first_probe = true;
9993
9994 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9995 values of N from 1 until it exceeds SIZE. If only one probe is
9996 needed, this will not generate any code. Then adjust and probe
9997 to PROBE_INTERVAL + SIZE. */
9998 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9999 {
10000 if (first_probe)
10001 {
10002 adjust = 2 * PROBE_INTERVAL + dope;
10003 first_probe = false;
10004 }
10005 else
10006 adjust = PROBE_INTERVAL;
10007
10008 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10009 plus_constant (Pmode, stack_pointer_rtx,
10010 -adjust)));
10011 emit_stack_probe (stack_pointer_rtx);
10012 }
10013
10014 if (first_probe)
10015 adjust = size + PROBE_INTERVAL + dope;
10016 else
10017 adjust = size + PROBE_INTERVAL - i;
10018
10019 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10020 plus_constant (Pmode, stack_pointer_rtx,
10021 -adjust)));
10022 emit_stack_probe (stack_pointer_rtx);
10023
10024 /* Adjust back to account for the additional first interval. */
10025 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10026 plus_constant (Pmode, stack_pointer_rtx,
10027 PROBE_INTERVAL + dope)));
10028 }
10029
10030 /* Otherwise, do the same as above, but in a loop. Note that we must be
10031 extra careful with variables wrapping around because we might be at
10032 the very top (or the very bottom) of the address space and we have
10033 to be able to handle this case properly; in particular, we use an
10034 equality test for the loop condition. */
10035 else
10036 {
10037 HOST_WIDE_INT rounded_size;
10038 struct scratch_reg sr;
10039
10040 get_scratch_register_on_entry (&sr);
10041
10042
10043 /* Step 1: round SIZE to the previous multiple of the interval. */
10044
10045 rounded_size = size & -PROBE_INTERVAL;
10046
10047
10048 /* Step 2: compute initial and final value of the loop counter. */
10049
10050 /* SP = SP_0 + PROBE_INTERVAL. */
10051 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10052 plus_constant (Pmode, stack_pointer_rtx,
10053 - (PROBE_INTERVAL + dope))));
10054
10055 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10056 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10057 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10058 gen_rtx_PLUS (Pmode, sr.reg,
10059 stack_pointer_rtx)));
10060
10061
10062 /* Step 3: the loop
10063
10064 while (SP != LAST_ADDR)
10065 {
10066 SP = SP + PROBE_INTERVAL
10067 probe at SP
10068 }
10069
10070 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10071 values of N from 1 until it is equal to ROUNDED_SIZE. */
10072
10073 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10074
10075
10076 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10077 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10078
10079 if (size != rounded_size)
10080 {
10081 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10082 plus_constant (Pmode, stack_pointer_rtx,
10083 rounded_size - size)));
10084 emit_stack_probe (stack_pointer_rtx);
10085 }
10086
10087 /* Adjust back to account for the additional first interval. */
10088 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10089 plus_constant (Pmode, stack_pointer_rtx,
10090 PROBE_INTERVAL + dope)));
10091
10092 release_scratch_register_on_entry (&sr);
10093 }
10094
10095 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10096
10097 /* Even if the stack pointer isn't the CFA register, we need to correctly
10098 describe the adjustments made to it, in particular differentiate the
10099 frame-related ones from the frame-unrelated ones. */
10100 if (size > 0)
10101 {
10102 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10103 XVECEXP (expr, 0, 0)
10104 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10105 plus_constant (Pmode, stack_pointer_rtx, -size));
10106 XVECEXP (expr, 0, 1)
10107 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10108 plus_constant (Pmode, stack_pointer_rtx,
10109 PROBE_INTERVAL + dope + size));
10110 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10111 RTX_FRAME_RELATED_P (last) = 1;
10112
10113 cfun->machine->fs.sp_offset += size;
10114 }
10115
10116 /* Make sure nothing is scheduled before we are done. */
10117 emit_insn (gen_blockage ());
10118 }
10119
10120 /* Adjust the stack pointer up to REG while probing it. */
10121
10122 const char *
10123 output_adjust_stack_and_probe (rtx reg)
10124 {
10125 static int labelno = 0;
10126 char loop_lab[32], end_lab[32];
10127 rtx xops[2];
10128
10129 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10130 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10131
10132 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10133
10134 /* Jump to END_LAB if SP == LAST_ADDR. */
10135 xops[0] = stack_pointer_rtx;
10136 xops[1] = reg;
10137 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10138 fputs ("\tje\t", asm_out_file);
10139 assemble_name_raw (asm_out_file, end_lab);
10140 fputc ('\n', asm_out_file);
10141
10142 /* SP = SP + PROBE_INTERVAL. */
10143 xops[1] = GEN_INT (PROBE_INTERVAL);
10144 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10145
10146 /* Probe at SP. */
10147 xops[1] = const0_rtx;
10148 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10149
10150 fprintf (asm_out_file, "\tjmp\t");
10151 assemble_name_raw (asm_out_file, loop_lab);
10152 fputc ('\n', asm_out_file);
10153
10154 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10155
10156 return "";
10157 }
10158
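/* Illustrative only: on a 32-bit target with the default probe interval
   (typically 4096 bytes), the loop emitted above looks roughly like

	.LPSRL0: cmpl %reg, %esp
		 je   .LPSRE0
		 subl $4096, %esp
		 orl  $0, (%esp)
		 jmp  .LPSRL0
	.LPSRE0:

   i.e. the stack pointer is walked down one interval at a time and each
   newly exposed page is touched with a harmless "or". */
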
10159 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10160 inclusive. These are offsets from the current stack pointer. */
10161
10162 static void
10163 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10164 {
10165 /* See if we have a constant small number of probes to generate. If so,
10166 that's the easy case. The run-time loop is made up of 7 insns in the
10167 generic case while the compile-time loop is made up of n insns for n #
10168 of intervals. */
10169 if (size <= 7 * PROBE_INTERVAL)
10170 {
10171 HOST_WIDE_INT i;
10172
10173 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10174 it exceeds SIZE. If only one probe is needed, this will not
10175 generate any code. Then probe at FIRST + SIZE. */
10176 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10177 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10178 -(first + i)));
10179
10180 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10181 -(first + size)));
10182 }
10183
10184 /* Otherwise, do the same as above, but in a loop. Note that we must be
10185 extra careful with variables wrapping around because we might be at
10186 the very top (or the very bottom) of the address space and we have
10187 to be able to handle this case properly; in particular, we use an
10188 equality test for the loop condition. */
10189 else
10190 {
10191 HOST_WIDE_INT rounded_size, last;
10192 struct scratch_reg sr;
10193
10194 get_scratch_register_on_entry (&sr);
10195
10196
10197 /* Step 1: round SIZE to the previous multiple of the interval. */
10198
10199 rounded_size = size & -PROBE_INTERVAL;
10200
10201
10202 /* Step 2: compute initial and final value of the loop counter. */
10203
10204 /* TEST_OFFSET = FIRST. */
10205 emit_move_insn (sr.reg, GEN_INT (-first));
10206
10207 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10208 last = first + rounded_size;
10209
10210
10211 /* Step 3: the loop
10212
10213 while (TEST_ADDR != LAST_ADDR)
10214 {
10215 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10216 probe at TEST_ADDR
10217 }
10218
10219 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10220 until it is equal to ROUNDED_SIZE. */
10221
10222 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10223
10224
10225 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10226 that SIZE is equal to ROUNDED_SIZE. */
10227
10228 if (size != rounded_size)
10229 emit_stack_probe (plus_constant (Pmode,
10230 gen_rtx_PLUS (Pmode,
10231 stack_pointer_rtx,
10232 sr.reg),
10233 rounded_size - size));
10234
10235 release_scratch_register_on_entry (&sr);
10236 }
10237
10238 /* Make sure nothing is scheduled before we are done. */
10239 emit_insn (gen_blockage ());
10240 }
10241
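/* Worked example (illustrative only): with a 4096-byte probe interval,
   FIRST = 4096 and SIZE = 10000, the unrolled case above emits probes at
   sp - 8192, sp - 12288 and finally sp - 14096 (= FIRST + SIZE below the
   stack pointer), so every page of the new allocation is touched. */
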
10242 /* Probe a range of stack addresses from REG to END, inclusive. These are
10243 offsets from the current stack pointer. */
10244
10245 const char *
10246 output_probe_stack_range (rtx reg, rtx end)
10247 {
10248 static int labelno = 0;
10249 char loop_lab[32], end_lab[32];
10250 rtx xops[3];
10251
10252 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10253 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10254
10255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10256
10257 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10258 xops[0] = reg;
10259 xops[1] = end;
10260 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10261 fputs ("\tje\t", asm_out_file);
10262 assemble_name_raw (asm_out_file, end_lab);
10263 fputc ('\n', asm_out_file);
10264
10265 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10266 xops[1] = GEN_INT (PROBE_INTERVAL);
10267 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10268
10269 /* Probe at TEST_ADDR. */
10270 xops[0] = stack_pointer_rtx;
10271 xops[1] = reg;
10272 xops[2] = const0_rtx;
10273 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10274
10275 fprintf (asm_out_file, "\tjmp\t");
10276 assemble_name_raw (asm_out_file, loop_lab);
10277 fputc ('\n', asm_out_file);
10278
10279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10280
10281 return "";
10282 }
10283
10284 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10285 to be generated in correct form. */
10286 static void
10287 ix86_finalize_stack_realign_flags (void)
10288 {
10289 /* Check if stack realignment is really needed after reload, and
10290 store the result in cfun. */
10291 unsigned int incoming_stack_boundary
10292 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10293 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10294 unsigned int stack_realign = (incoming_stack_boundary
10295 < (crtl->is_leaf
10296 ? crtl->max_used_stack_slot_alignment
10297 : crtl->stack_alignment_needed));
10298
10299 if (crtl->stack_realign_finalized)
10300 {
10301 /* After stack_realign_needed is finalized, we can no longer
10302 change it. */
10303 gcc_assert (crtl->stack_realign_needed == stack_realign);
10304 return;
10305 }
10306
10307 /* If the only reason for frame_pointer_needed is that we conservatively
10308 assumed stack realignment might be needed, but in the end nothing that
10309 needed the stack alignment had been spilled, clear frame_pointer_needed
10310 and say we don't need stack realignment. */
10311 if (stack_realign
10312 && !crtl->need_drap
10313 && frame_pointer_needed
10314 && crtl->is_leaf
10315 && flag_omit_frame_pointer
10316 && crtl->sp_is_unchanging
10317 && !ix86_current_function_calls_tls_descriptor
10318 && !crtl->accesses_prior_frames
10319 && !cfun->calls_alloca
10320 && !crtl->calls_eh_return
10321 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10322 && !ix86_frame_pointer_required ()
10323 && get_frame_size () == 0
10324 && ix86_nsaved_sseregs () == 0
10325 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10326 {
10327 HARD_REG_SET set_up_by_prologue, prologue_used;
10328 basic_block bb;
10329
10330 CLEAR_HARD_REG_SET (prologue_used);
10331 CLEAR_HARD_REG_SET (set_up_by_prologue);
10332 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10333 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10334 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10335 HARD_FRAME_POINTER_REGNUM);
10336 FOR_EACH_BB (bb)
10337 {
10338 rtx insn;
10339 FOR_BB_INSNS (bb, insn)
10340 if (NONDEBUG_INSN_P (insn)
10341 && requires_stack_frame_p (insn, prologue_used,
10342 set_up_by_prologue))
10343 {
10344 crtl->stack_realign_needed = stack_realign;
10345 crtl->stack_realign_finalized = true;
10346 return;
10347 }
10348 }
10349
10350 frame_pointer_needed = false;
10351 stack_realign = false;
10352 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10353 crtl->stack_alignment_needed = incoming_stack_boundary;
10354 crtl->stack_alignment_estimated = incoming_stack_boundary;
10355 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10356 crtl->preferred_stack_boundary = incoming_stack_boundary;
10357 df_finish_pass (true);
10358 df_scan_alloc (NULL);
10359 df_scan_blocks ();
10360 df_compute_regs_ever_live (true);
10361 df_analyze ();
10362 }
10363
10364 crtl->stack_realign_needed = stack_realign;
10365 crtl->stack_realign_finalized = true;
10366 }
10367
10368 /* Expand the prologue into a bunch of separate insns. */
10369
10370 void
10371 ix86_expand_prologue (void)
10372 {
10373 struct machine_function *m = cfun->machine;
10374 rtx insn, t;
10375 bool pic_reg_used;
10376 struct ix86_frame frame;
10377 HOST_WIDE_INT allocate;
10378 bool int_registers_saved;
10379 bool sse_registers_saved;
10380
10381 ix86_finalize_stack_realign_flags ();
10382
10383 /* DRAP should not coexist with stack_realign_fp */
10384 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10385
10386 memset (&m->fs, 0, sizeof (m->fs));
10387
10388 /* Initialize CFA state for before the prologue. */
10389 m->fs.cfa_reg = stack_pointer_rtx;
10390 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10391
10392 /* Track SP offset to the CFA. We continue tracking this after we've
10393 swapped the CFA register away from SP. In the case of re-alignment
10394 this is fudged; we're interested in offsets within the local frame. */
10395 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10396 m->fs.sp_valid = true;
10397
10398 ix86_compute_frame_layout (&frame);
10399
10400 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10401 {
10402 /* We should have already generated an error for any use of
10403 ms_hook on a nested function. */
10404 gcc_checking_assert (!ix86_static_chain_on_stack);
10405
10406 /* Check if profiling is active and we shall use the profiling-before-
10407 prologue variant. If so, sorry. */
10408 if (crtl->profile && flag_fentry != 0)
10409 sorry ("ms_hook_prologue attribute isn%'t compatible "
10410 "with -mfentry for 32-bit");
10411
10412 /* In ix86_asm_output_function_label we emitted:
10413 8b ff movl.s %edi,%edi
10414 55 push %ebp
10415 8b ec movl.s %esp,%ebp
10416
10417 This matches the hookable function prologue in Win32 API
10418 functions in Microsoft Windows XP Service Pack 2 and newer.
10419 Wine uses this to enable Windows apps to hook the Win32 API
10420 functions provided by Wine.
10421
10422 What that means is that we've already set up the frame pointer. */
10423
10424 if (frame_pointer_needed
10425 && !(crtl->drap_reg && crtl->stack_realign_needed))
10426 {
10427 rtx push, mov;
10428
10429 /* We've decided to use the frame pointer already set up.
10430 Describe this to the unwinder by pretending that both
10431 push and mov insns happen right here.
10432
10433 Putting the unwind info here at the end of the ms_hook
10434 is done so that we can make absolutely certain we get
10435 the required byte sequence at the start of the function,
10436 rather than relying on an assembler that can produce
10437 the exact encoding required.
10438
10439 However it does mean (in the unpatched case) that we have
10440 a 1 insn window where the asynchronous unwind info is
10441 incorrect. However, if we placed the unwind info at
10442 its correct location we would have incorrect unwind info
10443 in the patched case. Which is probably all moot since
10444 I don't expect Wine generates dwarf2 unwind info for the
10445 system libraries that use this feature. */
10446
10447 insn = emit_insn (gen_blockage ());
10448
10449 push = gen_push (hard_frame_pointer_rtx);
10450 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10451 stack_pointer_rtx);
10452 RTX_FRAME_RELATED_P (push) = 1;
10453 RTX_FRAME_RELATED_P (mov) = 1;
10454
10455 RTX_FRAME_RELATED_P (insn) = 1;
10456 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10457 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10458
10459 /* Note that gen_push incremented m->fs.cfa_offset, even
10460 though we didn't emit the push insn here. */
10461 m->fs.cfa_reg = hard_frame_pointer_rtx;
10462 m->fs.fp_offset = m->fs.cfa_offset;
10463 m->fs.fp_valid = true;
10464 }
10465 else
10466 {
10467 /* The frame pointer is not needed so pop %ebp again.
10468 This leaves us with a pristine state. */
10469 emit_insn (gen_pop (hard_frame_pointer_rtx));
10470 }
10471 }
10472
10473 /* The first insn of a function that accepts its static chain on the
10474 stack is to push the register that would be filled in by a direct
10475 call. This insn will be skipped by the trampoline. */
10476 else if (ix86_static_chain_on_stack)
10477 {
10478 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10479 emit_insn (gen_blockage ());
10480
10481 /* We don't want to interpret this push insn as a register save,
10482 only as a stack adjustment. The real copy of the register as
10483 a save will be done later, if needed. */
10484 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10485 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10486 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10487 RTX_FRAME_RELATED_P (insn) = 1;
10488 }
10489
10490 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10491 DRAP is needed and stack realignment is really needed after reload. */
10492 if (stack_realign_drap)
10493 {
10494 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10495
10496 /* Only need to push parameter pointer reg if it is caller saved. */
10497 if (!call_used_regs[REGNO (crtl->drap_reg)])
10498 {
10499 /* Push arg pointer reg */
10500 insn = emit_insn (gen_push (crtl->drap_reg));
10501 RTX_FRAME_RELATED_P (insn) = 1;
10502 }
10503
10504 /* Grab the argument pointer. */
10505 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10506 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10507 RTX_FRAME_RELATED_P (insn) = 1;
10508 m->fs.cfa_reg = crtl->drap_reg;
10509 m->fs.cfa_offset = 0;
10510
10511 /* Align the stack. */
10512 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10513 stack_pointer_rtx,
10514 GEN_INT (-align_bytes)));
10515 RTX_FRAME_RELATED_P (insn) = 1;
10516
10517 /* Replicate the return address on the stack so that the return
10518 address can be reached via the (argp - 1) slot. This is needed
10519 to implement macro RETURN_ADDR_RTX and intrinsic function
10520 expand_builtin_return_addr etc. */
10521 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10522 t = gen_frame_mem (word_mode, t);
10523 insn = emit_insn (gen_push (t));
10524 RTX_FRAME_RELATED_P (insn) = 1;
10525
10526 /* For the purposes of frame and register save area addressing,
10527 we've started over with a new frame. */
10528 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10529 m->fs.realigned = true;
10530 }
10531
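  /* A register class with nothing to save counts as already saved.  */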
10532 int_registers_saved = (frame.nregs == 0);
10533 sse_registers_saved = (frame.nsseregs == 0);
10534
10535 if (frame_pointer_needed && !m->fs.fp_valid)
10536 {
10537 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10538 slower on all targets. Also sdb doesn't like it. */
10539 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10540 RTX_FRAME_RELATED_P (insn) = 1;
10541
10542 /* Push registers now, before setting the frame pointer
10543 on SEH target. */
10544 if (!int_registers_saved
10545 && TARGET_SEH
10546 && !frame.save_regs_using_mov)
10547 {
10548 ix86_emit_save_regs ();
10549 int_registers_saved = true;
10550 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10551 }
10552
10553 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10554 {
10555 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10556 RTX_FRAME_RELATED_P (insn) = 1;
10557
10558 if (m->fs.cfa_reg == stack_pointer_rtx)
10559 m->fs.cfa_reg = hard_frame_pointer_rtx;
10560 m->fs.fp_offset = m->fs.sp_offset;
10561 m->fs.fp_valid = true;
10562 }
10563 }
10564
10565 if (!int_registers_saved)
10566 {
10567 /* If saving registers via PUSH, do so now. */
10568 if (!frame.save_regs_using_mov)
10569 {
10570 ix86_emit_save_regs ();
10571 int_registers_saved = true;
10572 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10573 }
10574
10575 /* When using the red zone we may start register saving before allocating
10576 the stack frame, saving one cycle of the prologue. However, avoid
10577 doing this if we have to probe the stack; at least on x86_64 the
10578 stack probe can turn into a call that clobbers a red zone location. */
10579 else if (ix86_using_red_zone ()
10580 && (! TARGET_STACK_PROBE
10581 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10582 {
10583 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10584 int_registers_saved = true;
10585 }
10586 }
10587
10588 if (stack_realign_fp)
10589 {
10590 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10591 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10592
10593 /* The computation of the size of the re-aligned stack frame means
10594 that we must allocate the size of the register save area before
10595 performing the actual alignment. Otherwise we cannot guarantee
10596 that there's enough storage above the realignment point. */
10597 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10598 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10599 GEN_INT (m->fs.sp_offset
10600 - frame.sse_reg_save_offset),
10601 -1, false);
10602
10603 /* Align the stack. */
10604 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10605 stack_pointer_rtx,
10606 GEN_INT (-align_bytes)));
10607
10608 /* For the purposes of register save area addressing, the stack
10609 pointer is no longer valid. As for the value of sp_offset,
10610 see ix86_compute_frame_layout, which we need to match in order
10611 to pass verification of stack_pointer_offset at the end. */
10612 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10613 m->fs.sp_valid = false;
10614 }
10615
10616 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10617
10618 if (flag_stack_usage_info)
10619 {
10620 /* We start to count from ARG_POINTER. */
10621 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10622
10623 /* If it was realigned, take into account the fake frame. */
10624 if (stack_realign_drap)
10625 {
10626 if (ix86_static_chain_on_stack)
10627 stack_size += UNITS_PER_WORD;
10628
10629 if (!call_used_regs[REGNO (crtl->drap_reg)])
10630 stack_size += UNITS_PER_WORD;
10631
10632 /* This over-estimates by 1 minimal-stack-alignment-unit but
10633 mitigates that by counting in the new return address slot. */
10634 current_function_dynamic_stack_size
10635 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10636 }
10637
10638 current_function_static_stack_size = stack_size;
10639 }
10640
10641 /* On SEH target with very large frame size, allocate an area to save
10642 SSE registers (as the very large allocation won't be described). */
10643 if (TARGET_SEH
10644 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10645 && !sse_registers_saved)
10646 {
10647 HOST_WIDE_INT sse_size =
10648 frame.sse_reg_save_offset - frame.reg_save_offset;
10649
10650 gcc_assert (int_registers_saved);
10651
10652 /* No need to do stack checking as the area will be immediately
10653 written. */
10654 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10655 GEN_INT (-sse_size), -1,
10656 m->fs.cfa_reg == stack_pointer_rtx);
10657 allocate -= sse_size;
10658 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10659 sse_registers_saved = true;
10660 }
10661
10662 /* The stack has already been decremented by the instruction calling us,
10663 so probe if the size is non-negative to preserve the protection area. */
10664 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10665 {
10666 /* We expect the registers to be saved when probes are used. */
10667 gcc_assert (int_registers_saved);
10668
10669 if (STACK_CHECK_MOVING_SP)
10670 {
10671 ix86_adjust_stack_and_probe (allocate);
10672 allocate = 0;
10673 }
10674 else
10675 {
10676 HOST_WIDE_INT size = allocate;
10677
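	  /* Clamp very large 64-bit requests so the probed offsets stay
	     within a 32-bit signed range.  */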
10678 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10679 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10680
10681 if (TARGET_STACK_PROBE)
10682 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10683 else
10684 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10685 }
10686 }
10687
10688 if (allocate == 0)
10689 ;
10690 else if (!ix86_target_stack_probe ()
10691 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10692 {
10693 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10694 GEN_INT (-allocate), -1,
10695 m->fs.cfa_reg == stack_pointer_rtx);
10696 }
10697 else
10698 {
10699 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10700 rtx r10 = NULL;
10701 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10702
10703 bool eax_live = false;
10704 bool r10_live = false;
10705
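      /* The allocation is done through a stack-probe helper that receives
	 the size in EAX; in 64-bit mode R10 holds the static chain and must
	 survive the allocation.  If either register is live at function
	 entry, save it with a push here and reload it from the frame once
	 the allocation is done.  */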
10706 if (TARGET_64BIT)
10707 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10708 if (!TARGET_64BIT_MS_ABI)
10709 eax_live = ix86_eax_live_at_start_p ();
10710
10711 if (eax_live)
10712 {
10713 emit_insn (gen_push (eax));
10714 allocate -= UNITS_PER_WORD;
10715 }
10716 if (r10_live)
10717 {
10718 r10 = gen_rtx_REG (Pmode, R10_REG);
10719 emit_insn (gen_push (r10));
10720 allocate -= UNITS_PER_WORD;
10721 }
10722
10723 emit_move_insn (eax, GEN_INT (allocate));
10724 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10725
10726 /* Use the fact that AX still contains ALLOCATE. */
10727 adjust_stack_insn = (Pmode == DImode
10728 ? gen_pro_epilogue_adjust_stack_di_sub
10729 : gen_pro_epilogue_adjust_stack_si_sub);
10730
10731 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10732 stack_pointer_rtx, eax));
10733
10734 /* Note that SEH directives need to continue tracking the stack
10735 pointer even after the frame pointer has been set up. */
10736 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10737 {
10738 if (m->fs.cfa_reg == stack_pointer_rtx)
10739 m->fs.cfa_offset += allocate;
10740
10741 RTX_FRAME_RELATED_P (insn) = 1;
10742 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10743 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10744 plus_constant (Pmode, stack_pointer_rtx,
10745 -allocate)));
10746 }
10747 m->fs.sp_offset += allocate;
10748
10749 if (r10_live && eax_live)
10750 {
10751 t = choose_baseaddr (m->fs.sp_offset - allocate);
10752 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10753 gen_frame_mem (word_mode, t));
10754 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10755 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10756 gen_frame_mem (word_mode, t));
10757 }
10758 else if (eax_live || r10_live)
10759 {
10760 t = choose_baseaddr (m->fs.sp_offset - allocate);
10761 emit_move_insn (gen_rtx_REG (word_mode,
10762 (eax_live ? AX_REG : R10_REG)),
10763 gen_frame_mem (word_mode, t));
10764 }
10765 }
10766 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10767
10768 /* If we haven't already set up the frame pointer, do so now. */
10769 if (frame_pointer_needed && !m->fs.fp_valid)
10770 {
10771 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10772 GEN_INT (frame.stack_pointer_offset
10773 - frame.hard_frame_pointer_offset));
10774 insn = emit_insn (insn);
10775 RTX_FRAME_RELATED_P (insn) = 1;
10776 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10777
10778 if (m->fs.cfa_reg == stack_pointer_rtx)
10779 m->fs.cfa_reg = hard_frame_pointer_rtx;
10780 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10781 m->fs.fp_valid = true;
10782 }
10783
10784 if (!int_registers_saved)
10785 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10786 if (!sse_registers_saved)
10787 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10788
10789 pic_reg_used = false;
10790 if (pic_offset_table_rtx
10791 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10792 || crtl->profile))
10793 {
10794 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10795
10796 if (alt_pic_reg_used != INVALID_REGNUM)
10797 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10798
10799 pic_reg_used = true;
10800 }
10801
10802 if (pic_reg_used)
10803 {
10804 if (TARGET_64BIT)
10805 {
10806 if (ix86_cmodel == CM_LARGE_PIC)
10807 {
10808 rtx label, tmp_reg;
10809
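	      /* With the large PIC model the GOT cannot be reached with a
		 32-bit displacement, so build its address explicitly: take
		 the address of a local label via %rip, then add the offset
		 from that label to the GOT, using R11 as a scratch.  */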
10810 gcc_assert (Pmode == DImode);
10811 label = gen_label_rtx ();
10812 emit_label (label);
10813 LABEL_PRESERVE_P (label) = 1;
10814 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10815 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10816 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10817 label));
10818 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10819 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10820 pic_offset_table_rtx, tmp_reg));
10821 }
10822 else
10823 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10824 }
10825 else
10826 {
10827 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10828 RTX_FRAME_RELATED_P (insn) = 1;
10829 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10830 }
10831 }
10832
10833 /* In the pic_reg_used case, make sure that the got load isn't deleted
10834 when mcount needs it. Blockage to avoid call movement across mcount
10835 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10836 note. */
10837 if (crtl->profile && !flag_fentry && pic_reg_used)
10838 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10839
10840 if (crtl->drap_reg && !crtl->stack_realign_needed)
10841 {
10842 /* vDRAP is set up, but after reload it turns out stack realignment
10843 isn't necessary; here we emit prologue code to set up DRAP
10844 without the stack realignment adjustment. */
10845 t = choose_baseaddr (0);
10846 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10847 }
10848
10849 /* Prevent instructions from being scheduled into register save push
10850 sequence when access to the redzone area is done through frame pointer.
10851 The offset between the frame pointer and the stack pointer is calculated
10852 relative to the value of the stack pointer at the end of the function
10853 prologue, and moving instructions that access redzone area via frame
10854 pointer inside push sequence violates this assumption. */
10855 if (frame_pointer_needed && frame.red_zone_size)
10856 emit_insn (gen_memory_blockage ());
10857
10858 /* Emit cld instruction if stringops are used in the function. */
10859 if (TARGET_CLD && ix86_current_function_needs_cld)
10860 emit_insn (gen_cld ());
10861
10862 /* SEH requires that the prologue end within 256 bytes of the start of
10863 the function. Prevent instruction schedules that would extend that.
10864 Further, prevent alloca modifications to the stack pointer from being
10865 combined with prologue modifications. */
10866 if (TARGET_SEH)
10867 emit_insn (gen_prologue_use (stack_pointer_rtx));
10868 }
10869
10870 /* Emit code to restore REG using a POP insn. */
10871
10872 static void
10873 ix86_emit_restore_reg_using_pop (rtx reg)
10874 {
10875 struct machine_function *m = cfun->machine;
10876 rtx insn = emit_insn (gen_pop (reg));
10877
10878 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10879 m->fs.sp_offset -= UNITS_PER_WORD;
10880
10881 if (m->fs.cfa_reg == crtl->drap_reg
10882 && REGNO (reg) == REGNO (crtl->drap_reg))
10883 {
10884 /* Previously we'd represented the CFA as an expression
10885 like *(%ebp - 8). We've just popped that value from
10886 the stack, which means we need to reset the CFA to
10887 the drap register. This will remain until we restore
10888 the stack pointer. */
10889 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10890 RTX_FRAME_RELATED_P (insn) = 1;
10891
10892 /* This means that the DRAP register is valid for addressing too. */
10893 m->fs.drap_valid = true;
10894 return;
10895 }
10896
10897 if (m->fs.cfa_reg == stack_pointer_rtx)
10898 {
10899 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10900 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10901 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10902 RTX_FRAME_RELATED_P (insn) = 1;
10903
10904 m->fs.cfa_offset -= UNITS_PER_WORD;
10905 }
10906
10907 /* When the frame pointer is the CFA, and we pop it, we are
10908 swapping back to the stack pointer as the CFA. This happens
10909 for stack frames that don't allocate other data, so we assume
10910 the stack pointer is now pointing at the return address, i.e.
10911 the function entry state, which makes the offset be 1 word. */
10912 if (reg == hard_frame_pointer_rtx)
10913 {
10914 m->fs.fp_valid = false;
10915 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10916 {
10917 m->fs.cfa_reg = stack_pointer_rtx;
10918 m->fs.cfa_offset -= UNITS_PER_WORD;
10919
10920 add_reg_note (insn, REG_CFA_DEF_CFA,
10921 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10922 GEN_INT (m->fs.cfa_offset)));
10923 RTX_FRAME_RELATED_P (insn) = 1;
10924 }
10925 }
10926 }
10927
10928 /* Emit code to restore saved registers using POP insns. */
10929
10930 static void
10931 ix86_emit_restore_regs_using_pop (void)
10932 {
10933 unsigned int regno;
10934
10935 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10936 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10937 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10938 }
10939
10940 /* Emit code and notes for the LEAVE instruction. */
10941
10942 static void
10943 ix86_emit_leave (void)
10944 {
10945 struct machine_function *m = cfun->machine;
10946 rtx insn = emit_insn (ix86_gen_leave ());
10947
10948 ix86_add_queued_cfa_restore_notes (insn);
10949
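  /* leave restores the stack pointer from the frame pointer and pops the
     saved frame pointer, so SP becomes valid again, one word above the
     slot where the frame pointer was stored.  */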
10950 gcc_assert (m->fs.fp_valid);
10951 m->fs.sp_valid = true;
10952 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10953 m->fs.fp_valid = false;
10954
10955 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10956 {
10957 m->fs.cfa_reg = stack_pointer_rtx;
10958 m->fs.cfa_offset = m->fs.sp_offset;
10959
10960 add_reg_note (insn, REG_CFA_DEF_CFA,
10961 plus_constant (Pmode, stack_pointer_rtx,
10962 m->fs.sp_offset));
10963 RTX_FRAME_RELATED_P (insn) = 1;
10964 }
10965 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10966 m->fs.fp_offset);
10967 }
10968
10969 /* Emit code to restore saved registers using MOV insns.
10970 First register is restored from CFA - CFA_OFFSET. */
10971 static void
10972 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10973 bool maybe_eh_return)
10974 {
10975 struct machine_function *m = cfun->machine;
10976 unsigned int regno;
10977
10978 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10979 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10980 {
10981 rtx reg = gen_rtx_REG (word_mode, regno);
10982 rtx insn, mem;
10983
10984 mem = choose_baseaddr (cfa_offset);
10985 mem = gen_frame_mem (word_mode, mem);
10986 insn = emit_move_insn (reg, mem);
10987
10988 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10989 {
10990 /* Previously we'd represented the CFA as an expression
10991 like *(%ebp - 8). We've just popped that value from
10992 the stack, which means we need to reset the CFA to
10993 the drap register. This will remain until we restore
10994 the stack pointer. */
10995 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10996 RTX_FRAME_RELATED_P (insn) = 1;
10997
10998 /* This means that the DRAP register is valid for addressing. */
10999 m->fs.drap_valid = true;
11000 }
11001 else
11002 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11003
11004 cfa_offset -= UNITS_PER_WORD;
11005 }
11006 }
11007
11008 /* Emit code to restore saved SSE registers using MOV insns.
11009 First register is restored from CFA - CFA_OFFSET. */
11010 static void
11011 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11012 bool maybe_eh_return)
11013 {
11014 unsigned int regno;
11015
11016 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11017 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11018 {
11019 rtx reg = gen_rtx_REG (V4SFmode, regno);
11020 rtx mem;
11021
11022 mem = choose_baseaddr (cfa_offset);
11023 mem = gen_rtx_MEM (V4SFmode, mem);
11024 set_mem_align (mem, 128);
11025 emit_move_insn (reg, mem);
11026
11027 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11028
11029 cfa_offset -= 16;
11030 }
11031 }
11032
11033 /* Emit vzeroupper if needed. */
11034
11035 void
11036 ix86_maybe_emit_epilogue_vzeroupper (void)
11037 {
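  /* TREE_THIS_VOLATILE on the function decl means the function never
     returns, so its return path needs no vzeroupper.  */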
11038 if (TARGET_VZEROUPPER
11039 && !TREE_THIS_VOLATILE (cfun->decl)
11040 && !cfun->machine->caller_return_avx256_p)
11041 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
11042 }
11043
11044 /* Restore function stack, frame, and registers. */
11045
11046 void
11047 ix86_expand_epilogue (int style)
11048 {
11049 struct machine_function *m = cfun->machine;
11050 struct machine_frame_state frame_state_save = m->fs;
11051 struct ix86_frame frame;
11052 bool restore_regs_via_mov;
11053 bool using_drap;
11054
11055 ix86_finalize_stack_realign_flags ();
11056 ix86_compute_frame_layout (&frame);
11057
11058 m->fs.sp_valid = (!frame_pointer_needed
11059 || (crtl->sp_is_unchanging
11060 && !stack_realign_fp));
11061 gcc_assert (!m->fs.sp_valid
11062 || m->fs.sp_offset == frame.stack_pointer_offset);
11063
11064 /* The FP must be valid if the frame pointer is present. */
11065 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11066 gcc_assert (!m->fs.fp_valid
11067 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11068
11069 /* We must have *some* valid pointer to the stack frame. */
11070 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11071
11072 /* The DRAP is never valid at this point. */
11073 gcc_assert (!m->fs.drap_valid);
11074
11075 /* See the comment about red zone and frame
11076 pointer usage in ix86_expand_prologue. */
11077 if (frame_pointer_needed && frame.red_zone_size)
11078 emit_insn (gen_memory_blockage ());
11079
11080 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11081 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11082
11083 /* Determine the CFA offset of the end of the red-zone. */
11084 m->fs.red_zone_offset = 0;
11085 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11086 {
11087 /* The red-zone begins below the return address. */
11088 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11089
11090 /* When the register save area is in the aligned portion of
11091 the stack, determine the maximum runtime displacement that
11092 matches up with the aligned frame. */
11093 if (stack_realign_drap)
11094 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11095 + UNITS_PER_WORD);
11096 }
11097
11098 /* Special care must be taken for the normal return case of a function
11099 using eh_return: the eax and edx registers are marked as saved, but
11100 not restored along this path. Adjust the save location to match. */
11101 if (crtl->calls_eh_return && style != 2)
11102 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11103
11104 /* EH_RETURN requires the use of moves to function properly. */
11105 if (crtl->calls_eh_return)
11106 restore_regs_via_mov = true;
11107 /* SEH requires the use of pops to identify the epilogue. */
11108 else if (TARGET_SEH)
11109 restore_regs_via_mov = false;
11110 /* If we're only restoring one register and sp is not valid then
11111 use a move instruction to restore the register, since it's
11112 less work than reloading sp and popping the register. */
11113 else if (!m->fs.sp_valid && frame.nregs <= 1)
11114 restore_regs_via_mov = true;
11115 else if (TARGET_EPILOGUE_USING_MOVE
11116 && cfun->machine->use_fast_prologue_epilogue
11117 && (frame.nregs > 1
11118 || m->fs.sp_offset != frame.reg_save_offset))
11119 restore_regs_via_mov = true;
11120 else if (frame_pointer_needed
11121 && !frame.nregs
11122 && m->fs.sp_offset != frame.reg_save_offset)
11123 restore_regs_via_mov = true;
11124 else if (frame_pointer_needed
11125 && TARGET_USE_LEAVE
11126 && cfun->machine->use_fast_prologue_epilogue
11127 && frame.nregs == 1)
11128 restore_regs_via_mov = true;
11129 else
11130 restore_regs_via_mov = false;
11131
11132 if (restore_regs_via_mov || frame.nsseregs)
11133 {
11134 /* Ensure that the entire register save area is addressable via
11135 the stack pointer, if we will restore via sp. */
11136 if (TARGET_64BIT
11137 && m->fs.sp_offset > 0x7fffffff
11138 && !(m->fs.fp_valid || m->fs.drap_valid)
11139 && (frame.nsseregs + frame.nregs) != 0)
11140 {
11141 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11142 GEN_INT (m->fs.sp_offset
11143 - frame.sse_reg_save_offset),
11144 style,
11145 m->fs.cfa_reg == stack_pointer_rtx);
11146 }
11147 }
11148
11149 /* If there are any SSE registers to restore, then we have to do it
11150 via moves, since there's obviously no pop for SSE regs. */
11151 if (frame.nsseregs)
11152 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11153 style == 2);
11154
11155 if (restore_regs_via_mov)
11156 {
11157 rtx t;
11158
11159 if (frame.nregs)
11160 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11161
11162 /* eh_return epilogues need %ecx added to the stack pointer. */
11163 if (style == 2)
11164 {
11165 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11166
11167 /* Stack align doesn't work with eh_return. */
11168 gcc_assert (!stack_realign_drap);
11169 /* Neither do regparm nested functions. */
11170 gcc_assert (!ix86_static_chain_on_stack);
11171
11172 if (frame_pointer_needed)
11173 {
11174 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11175 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11176 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11177
11178 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11179 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11180
11181 /* Note that we use SA as a temporary CFA, as the return
11182 address is at the proper place relative to it. We
11183 pretend this happens at the FP restore insn because
11184 prior to this insn the FP would be stored at the wrong
11185 offset relative to SA, and after this insn we have no
11186 other reasonable register to use for the CFA. We don't
11187 bother resetting the CFA to the SP for the duration of
11188 the return insn. */
11189 add_reg_note (insn, REG_CFA_DEF_CFA,
11190 plus_constant (Pmode, sa, UNITS_PER_WORD));
11191 ix86_add_queued_cfa_restore_notes (insn);
11192 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11193 RTX_FRAME_RELATED_P (insn) = 1;
11194
11195 m->fs.cfa_reg = sa;
11196 m->fs.cfa_offset = UNITS_PER_WORD;
11197 m->fs.fp_valid = false;
11198
11199 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11200 const0_rtx, style, false);
11201 }
11202 else
11203 {
11204 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11205 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11206 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11207 ix86_add_queued_cfa_restore_notes (insn);
11208
11209 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11210 if (m->fs.cfa_offset != UNITS_PER_WORD)
11211 {
11212 m->fs.cfa_offset = UNITS_PER_WORD;
11213 add_reg_note (insn, REG_CFA_DEF_CFA,
11214 plus_constant (Pmode, stack_pointer_rtx,
11215 UNITS_PER_WORD));
11216 RTX_FRAME_RELATED_P (insn) = 1;
11217 }
11218 }
11219 m->fs.sp_offset = UNITS_PER_WORD;
11220 m->fs.sp_valid = true;
11221 }
11222 }
11223 else
11224 {
11225 /* SEH requires that the function end with (1) a stack adjustment
11226 if necessary, (2) a sequence of pops, and (3) a return or
11227 jump instruction. Prevent insns from the function body from
11228 being scheduled into this sequence. */
11229 if (TARGET_SEH)
11230 {
11231 /* Prevent a catch region from being adjacent to the standard
11232 epilogue sequence. Unfortunately crtl->uses_eh_lsda and
11233 several other flags that would be interesting to test are
11234 not yet set up. */
11235 if (flag_non_call_exceptions)
11236 emit_insn (gen_nops (const1_rtx));
11237 else
11238 emit_insn (gen_blockage ());
11239 }
11240
11241 /* First step is to deallocate the stack frame so that we can
11242 pop the registers. Also do it on SEH target for very large
11243 frame as the emitted instructions aren't allowed by the ABI in
11244 epilogues. */
11245 if (!m->fs.sp_valid
11246 || (TARGET_SEH
11247 && (m->fs.sp_offset - frame.reg_save_offset
11248 >= SEH_MAX_FRAME_SIZE)))
11249 {
11250 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11251 GEN_INT (m->fs.fp_offset
11252 - frame.reg_save_offset),
11253 style, false);
11254 }
11255 else if (m->fs.sp_offset != frame.reg_save_offset)
11256 {
11257 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11258 GEN_INT (m->fs.sp_offset
11259 - frame.reg_save_offset),
11260 style,
11261 m->fs.cfa_reg == stack_pointer_rtx);
11262 }
11263
11264 ix86_emit_restore_regs_using_pop ();
11265 }
11266
11267 /* If we used a frame pointer and haven't already got rid of it,
11268 then do so now. */
11269 if (m->fs.fp_valid)
11270 {
11271 /* If the stack pointer is valid and pointing at the frame
11272 pointer store address, then we only need a pop. */
11273 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11274 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11275 /* Leave results in shorter dependency chains on CPUs that are
11276 able to grok it fast. */
11277 else if (TARGET_USE_LEAVE
11278 || optimize_function_for_size_p (cfun)
11279 || !cfun->machine->use_fast_prologue_epilogue)
11280 ix86_emit_leave ();
11281 else
11282 {
11283 pro_epilogue_adjust_stack (stack_pointer_rtx,
11284 hard_frame_pointer_rtx,
11285 const0_rtx, style, !using_drap);
11286 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11287 }
11288 }
11289
11290 if (using_drap)
11291 {
11292 int param_ptr_offset = UNITS_PER_WORD;
11293 rtx insn;
11294
11295 gcc_assert (stack_realign_drap);
11296
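      /* Recover the stack pointer from the DRAP register, which still
	 points at the incoming argument area: step over the return address
	 and, if present, the saved static chain and saved DRAP slots.  */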
11297 if (ix86_static_chain_on_stack)
11298 param_ptr_offset += UNITS_PER_WORD;
11299 if (!call_used_regs[REGNO (crtl->drap_reg)])
11300 param_ptr_offset += UNITS_PER_WORD;
11301
11302 insn = emit_insn (gen_rtx_SET
11303 (VOIDmode, stack_pointer_rtx,
11304 gen_rtx_PLUS (Pmode,
11305 crtl->drap_reg,
11306 GEN_INT (-param_ptr_offset))));
11307 m->fs.cfa_reg = stack_pointer_rtx;
11308 m->fs.cfa_offset = param_ptr_offset;
11309 m->fs.sp_offset = param_ptr_offset;
11310 m->fs.realigned = false;
11311
11312 add_reg_note (insn, REG_CFA_DEF_CFA,
11313 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11314 GEN_INT (param_ptr_offset)));
11315 RTX_FRAME_RELATED_P (insn) = 1;
11316
11317 if (!call_used_regs[REGNO (crtl->drap_reg)])
11318 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11319 }
11320
11321 /* At this point the stack pointer must be valid, and we must have
11322 restored all of the registers. We may not have deallocated the
11323 entire stack frame. We've delayed this until now because it may
11324 be possible to merge the local stack deallocation with the
11325 deallocation forced by ix86_static_chain_on_stack. */
11326 gcc_assert (m->fs.sp_valid);
11327 gcc_assert (!m->fs.fp_valid);
11328 gcc_assert (!m->fs.realigned);
11329 if (m->fs.sp_offset != UNITS_PER_WORD)
11330 {
11331 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11332 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11333 style, true);
11334 }
11335 else
11336 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11337
11338 /* Sibcall epilogues don't want a return instruction. */
11339 if (style == 0)
11340 {
11341 m->fs = frame_state_save;
11342 return;
11343 }
11344
11345 /* Emit vzeroupper if needed. */
11346 ix86_maybe_emit_epilogue_vzeroupper ();
11347
11348 if (crtl->args.pops_args && crtl->args.size)
11349 {
11350 rtx popc = GEN_INT (crtl->args.pops_args);
11351
11352 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11353 address, do explicit add, and jump indirectly to the caller. */
11354
11355 if (crtl->args.pops_args >= 65536)
11356 {
11357 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11358 rtx insn;
11359
11360 /* There is no "pascal" calling convention in any 64bit ABI. */
11361 gcc_assert (!TARGET_64BIT);
11362
11363 insn = emit_insn (gen_pop (ecx));
11364 m->fs.cfa_offset -= UNITS_PER_WORD;
11365 m->fs.sp_offset -= UNITS_PER_WORD;
11366
11367 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11368 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11369 add_reg_note (insn, REG_CFA_REGISTER,
11370 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11371 RTX_FRAME_RELATED_P (insn) = 1;
11372
11373 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11374 popc, -1, true);
11375 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11376 }
11377 else
11378 emit_jump_insn (gen_simple_return_pop_internal (popc));
11379 }
11380 else
11381 emit_jump_insn (gen_simple_return_internal ());
11382
11383 /* Restore the state back to the state from the prologue,
11384 so that it's correct for the next epilogue. */
11385 m->fs = frame_state_save;
11386 }
11387
11388 /* Reset from the function's potential modifications. */
11389
11390 static void
11391 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11392 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11393 {
11394 if (pic_offset_table_rtx)
11395 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11396 #if TARGET_MACHO
11397 /* Mach-O doesn't support labels at the end of objects, so if
11398 it looks like we might want one, insert a NOP. */
11399 {
11400 rtx insn = get_last_insn ();
11401 rtx deleted_debug_label = NULL_RTX;
11402 while (insn
11403 && NOTE_P (insn)
11404 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11405 {
11406 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11407 notes only, instead set their CODE_LABEL_NUMBER to -1,
11408 otherwise there would be code generation differences
11409 in between -g and -g0. */
11410 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11411 deleted_debug_label = insn;
11412 insn = PREV_INSN (insn);
11413 }
11414 if (insn
11415 && (LABEL_P (insn)
11416 || (NOTE_P (insn)
11417 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11418 fputs ("\tnop\n", file);
11419 else if (deleted_debug_label)
11420 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11421 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11422 CODE_LABEL_NUMBER (insn) = -1;
11423 }
11424 #endif
11425
11426 }
11427
11428 /* Return a scratch register to use in the split stack prologue. The
11429 split stack prologue is used for -fsplit-stack. It is the first
11430 instructions in the function, even before the regular prologue.
11431 The scratch register can be any caller-saved register which is not
11432 used for parameters or for the static chain. */
11433
11434 static unsigned int
11435 split_stack_prologue_scratch_regno (void)
11436 {
11437 if (TARGET_64BIT)
11438 return R11_REG;
11439 else
11440 {
11441 bool is_fastcall;
11442 int regparm;
11443
11444 is_fastcall = (lookup_attribute ("fastcall",
11445 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11446 != NULL);
11447 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11448
11449 if (is_fastcall)
11450 {
11451 if (DECL_STATIC_CHAIN (cfun->decl))
11452 {
11453 sorry ("-fsplit-stack does not support fastcall with "
11454 "nested function");
11455 return INVALID_REGNUM;
11456 }
11457 return AX_REG;
11458 }
11459 else if (regparm < 3)
11460 {
11461 if (!DECL_STATIC_CHAIN (cfun->decl))
11462 return CX_REG;
11463 else
11464 {
11465 if (regparm >= 2)
11466 {
11467 sorry ("-fsplit-stack does not support 2 register "
11468 " parameters for a nested function");
11469 return INVALID_REGNUM;
11470 }
11471 return DX_REG;
11472 }
11473 }
11474 else
11475 {
11476 /* FIXME: We could make this work by pushing a register
11477 around the addition and comparison. */
11478 sorry ("-fsplit-stack does not support 3 register parameters");
11479 return INVALID_REGNUM;
11480 }
11481 }
11482 }
11483
11484 /* A SYMBOL_REF for the function which allocates new stack space for
11485 -fsplit-stack. */
11486
11487 static GTY(()) rtx split_stack_fn;
11488
11489 /* A SYMBOL_REF for the function which allocates new stack space when
11490 using the large model. */
11491
11492 static GTY(()) rtx split_stack_fn_large;
11493
11494 /* Handle -fsplit-stack. These are the first instructions in the
11495 function, even before the regular prologue. */
11496
11497 void
11498 ix86_expand_split_stack_prologue (void)
11499 {
11500 struct ix86_frame frame;
11501 HOST_WIDE_INT allocate;
11502 unsigned HOST_WIDE_INT args_size;
11503 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11504 rtx scratch_reg = NULL_RTX;
11505 rtx varargs_label = NULL_RTX;
11506 rtx fn;
11507
11508 gcc_assert (flag_split_stack && reload_completed);
11509
11510 ix86_finalize_stack_realign_flags ();
11511 ix86_compute_frame_layout (&frame);
11512 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11513
11514 /* This is the label we will branch to if we have enough stack
11515 space. We expect the basic block reordering pass to reverse this
11516 branch if optimizing, so that we branch in the unlikely case. */
11517 label = gen_label_rtx ();
11518
11519 /* We need to compare the stack pointer minus the frame size with
11520 the stack boundary in the TCB. The stack boundary always gives
11521 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11522 can compare directly. Otherwise we need to do an addition. */
11523
11524 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11525 UNSPEC_STACK_CHECK);
11526 limit = gen_rtx_CONST (Pmode, limit);
11527 limit = gen_rtx_MEM (Pmode, limit);
11528 if (allocate < SPLIT_STACK_AVAILABLE)
11529 current = stack_pointer_rtx;
11530 else
11531 {
11532 unsigned int scratch_regno;
11533 rtx offset;
11534
11535 /* We need a scratch register to hold the stack pointer minus
11536 the required frame size. Since this is the very start of the
11537 function, the scratch register can be any caller-saved
11538 register which is not used for parameters. */
11539 offset = GEN_INT (- allocate);
11540 scratch_regno = split_stack_prologue_scratch_regno ();
11541 if (scratch_regno == INVALID_REGNUM)
11542 return;
11543 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11544 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11545 {
11546 /* We don't use ix86_gen_add3 in this case because it will
11547 want to split to lea, but when not optimizing the insn
11548 will not be split after this point. */
11549 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11550 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11551 offset)));
11552 }
11553 else
11554 {
11555 emit_move_insn (scratch_reg, offset);
11556 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11557 stack_pointer_rtx));
11558 }
11559 current = scratch_reg;
11560 }
11561
11562 ix86_expand_branch (GEU, current, limit, label);
11563 jump_insn = get_last_insn ();
11564 JUMP_LABEL (jump_insn) = label;
11565
11566 /* Mark the jump as very likely to be taken. */
11567 add_reg_note (jump_insn, REG_BR_PROB,
11568 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11569
11570 if (split_stack_fn == NULL_RTX)
11571 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11572 fn = split_stack_fn;
11573
11574 /* Get more stack space. We pass in the desired stack space and the
11575 size of the arguments to copy to the new stack. In 32-bit mode
11576 we push the parameters; __morestack will return on a new stack
11577 anyhow. In 64-bit mode we pass the parameters in r10 and
11578 r11. */
11579 allocate_rtx = GEN_INT (allocate);
11580 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11581 call_fusage = NULL_RTX;
11582 if (TARGET_64BIT)
11583 {
11584 rtx reg10, reg11;
11585
11586 reg10 = gen_rtx_REG (Pmode, R10_REG);
11587 reg11 = gen_rtx_REG (Pmode, R11_REG);
11588
11589 /* If this function uses a static chain, it will be in %r10.
11590 Preserve it across the call to __morestack. */
11591 if (DECL_STATIC_CHAIN (cfun->decl))
11592 {
11593 rtx rax;
11594
11595 rax = gen_rtx_REG (word_mode, AX_REG);
11596 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11597 use_reg (&call_fusage, rax);
11598 }
11599
11600 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11601 {
11602 HOST_WIDE_INT argval;
11603
11604 gcc_assert (Pmode == DImode);
11605 /* When using the large model we need to load the address
11606 into a register, and we've run out of registers. So we
11607 switch to a different calling convention, and we call a
11608 different function: __morestack_large. We pass the
11609 argument size in the upper 32 bits of r10 and pass the
11610 frame size in the lower 32 bits. */
11611 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11612 gcc_assert ((args_size & 0xffffffff) == args_size);
11613
11614 if (split_stack_fn_large == NULL_RTX)
11615 split_stack_fn_large =
11616 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11617
11618 if (ix86_cmodel == CM_LARGE_PIC)
11619 {
11620 rtx label, x;
11621
11622 label = gen_label_rtx ();
11623 emit_label (label);
11624 LABEL_PRESERVE_P (label) = 1;
11625 emit_insn (gen_set_rip_rex64 (reg10, label));
11626 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11627 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11628 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11629 UNSPEC_GOT);
11630 x = gen_rtx_CONST (Pmode, x);
11631 emit_move_insn (reg11, x);
11632 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11633 x = gen_const_mem (Pmode, x);
11634 emit_move_insn (reg11, x);
11635 }
11636 else
11637 emit_move_insn (reg11, split_stack_fn_large);
11638
11639 fn = reg11;
11640
11641 argval = ((args_size << 16) << 16) + allocate;
11642 emit_move_insn (reg10, GEN_INT (argval));
11643 }
11644 else
11645 {
11646 emit_move_insn (reg10, allocate_rtx);
11647 emit_move_insn (reg11, GEN_INT (args_size));
11648 use_reg (&call_fusage, reg11);
11649 }
11650
11651 use_reg (&call_fusage, reg10);
11652 }
11653 else
11654 {
11655 emit_insn (gen_push (GEN_INT (args_size)));
11656 emit_insn (gen_push (allocate_rtx));
11657 }
11658 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11659 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11660 NULL_RTX, false);
11661 add_function_usage_to (call_insn, call_fusage);
11662
11663 /* In order to make call/return prediction work right, we now need
11664 to execute a return instruction. See
11665 libgcc/config/i386/morestack.S for the details on how this works.
11666
11667 For flow purposes gcc must not see this as a return
11668 instruction--we need control flow to continue at the subsequent
11669 label. Therefore, we use an unspec. */
11670 gcc_assert (crtl->args.pops_args < 65536);
11671 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11672
11673 /* If we are in 64-bit mode and this function uses a static chain,
11674 we saved %r10 in %rax before calling __morestack. */
11675 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11676 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11677 gen_rtx_REG (word_mode, AX_REG));
11678
11679 /* If this function calls va_start, we need to store a pointer to
11680 the arguments on the old stack, because they may not have been
11681 all copied to the new stack. At this point the old stack can be
11682 found at the frame pointer value used by __morestack, because
11683 __morestack has set that up before calling back to us. Here we
11684 store that pointer in a scratch register, and in
11685 ix86_expand_prologue we store the scratch register in a stack
11686 slot. */
11687 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11688 {
11689 unsigned int scratch_regno;
11690 rtx frame_reg;
11691 int words;
11692
11693 scratch_regno = split_stack_prologue_scratch_regno ();
11694 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11695 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11696
11697 /* 64-bit:
11698 fp -> old fp value
11699 return address within this function
11700 return address of caller of this function
11701 stack arguments
11702 So we add three words to get to the stack arguments.
11703
11704 32-bit:
11705 fp -> old fp value
11706 return address within this function
11707 first argument to __morestack
11708 second argument to __morestack
11709 return address of caller of this function
11710 stack arguments
11711 So we add five words to get to the stack arguments.
11712 */
11713 words = TARGET_64BIT ? 3 : 5;
11714 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11715 gen_rtx_PLUS (Pmode, frame_reg,
11716 GEN_INT (words * UNITS_PER_WORD))));
11717
11718 varargs_label = gen_label_rtx ();
11719 emit_jump_insn (gen_jump (varargs_label));
11720 JUMP_LABEL (get_last_insn ()) = varargs_label;
11721
11722 emit_barrier ();
11723 }
11724
11725 emit_label (label);
11726 LABEL_NUSES (label) = 1;
11727
11728 /* If this function calls va_start, we now have to set the scratch
11729 register for the case where we do not call __morestack. In this
11730 case we need to set it based on the stack pointer. */
11731 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11732 {
11733 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11734 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11735 GEN_INT (UNITS_PER_WORD))));
11736
11737 emit_label (varargs_label);
11738 LABEL_NUSES (varargs_label) = 1;
11739 }
11740 }
11741
11742 /* We may have to tell the dataflow pass that the split stack prologue
11743 is initializing a scratch register. */
11744
11745 static void
11746 ix86_live_on_entry (bitmap regs)
11747 {
11748 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11749 {
11750 gcc_assert (flag_split_stack);
11751 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11752 }
11753 }
11754 \f
11755 /* Determine whether OP is a suitable SUBREG RTX for an address. */
11756
11757 static bool
11758 ix86_address_subreg_operand (rtx op)
11759 {
11760 enum machine_mode mode;
11761
11762 if (!REG_P (op))
11763 return false;
11764
11765 mode = GET_MODE (op);
11766
11767 if (GET_MODE_CLASS (mode) != MODE_INT)
11768 return false;
11769
11770 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11771 failures when the register is one word out of a two word structure. */
11772 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11773 return false;
11774
11775 /* simplify_subreg does not handle stack pointer. */
11776 if (REGNO (op) == STACK_POINTER_REGNUM)
11777 return false;
11778
11779 /* Allow only SUBREGs of non-eliminable hard registers. */
11780 return register_no_elim_operand (op, mode);
11781 }
11782
11783 /* Extract the parts of an RTL expression that is a valid memory address
11784 for an instruction. Return 0 if the structure of the address is
11785 grossly off. Return -1 if the address contains ASHIFT, so it is not
11786 strictly valid, but is still used for computing the length of a lea instruction. */
11787
11788 int
11789 ix86_decompose_address (rtx addr, struct ix86_address *out)
11790 {
11791 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11792 rtx base_reg, index_reg;
11793 HOST_WIDE_INT scale = 1;
11794 rtx scale_rtx = NULL_RTX;
11795 rtx tmp;
11796 int retval = 1;
11797 enum ix86_address_seg seg = SEG_DEFAULT;
11798
11799 /* Allow zero-extended SImode addresses,
11800 they will be emitted with addr32 prefix. */
11801 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11802 {
11803 if (GET_CODE (addr) == ZERO_EXTEND
11804 && GET_MODE (XEXP (addr, 0)) == SImode)
11805 {
11806 addr = XEXP (addr, 0);
11807 if (CONST_INT_P (addr))
11808 return 0;
11809 }
11810 else if (GET_CODE (addr) == AND
11811 && const_32bit_mask (XEXP (addr, 1), DImode))
11812 {
11813 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11814 if (addr == NULL_RTX)
11815 return 0;
11816
11817 if (CONST_INT_P (addr))
11818 return 0;
11819 }
11820 }
11821
11822 /* Allow SImode subregs of DImode addresses,
11823 they will be emitted with addr32 prefix. */
11824 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11825 {
11826 if (GET_CODE (addr) == SUBREG
11827 && GET_MODE (SUBREG_REG (addr)) == DImode)
11828 {
11829 addr = SUBREG_REG (addr);
11830 if (CONST_INT_P (addr))
11831 return 0;
11832 }
11833 }
11834
11835 if (REG_P (addr))
11836 base = addr;
11837 else if (GET_CODE (addr) == SUBREG)
11838 {
11839 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11840 base = addr;
11841 else
11842 return 0;
11843 }
11844 else if (GET_CODE (addr) == PLUS)
11845 {
11846 rtx addends[4], op;
11847 int n = 0, i;
11848
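      /* Flatten the nested PLUS chain into at most four addends; a longer
	 chain cannot form a valid x86 address.  */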
11849 op = addr;
11850 do
11851 {
11852 if (n >= 4)
11853 return 0;
11854 addends[n++] = XEXP (op, 1);
11855 op = XEXP (op, 0);
11856 }
11857 while (GET_CODE (op) == PLUS);
11858 if (n >= 4)
11859 return 0;
11860 addends[n] = op;
11861
11862 for (i = n; i >= 0; --i)
11863 {
11864 op = addends[i];
11865 switch (GET_CODE (op))
11866 {
11867 case MULT:
11868 if (index)
11869 return 0;
11870 index = XEXP (op, 0);
11871 scale_rtx = XEXP (op, 1);
11872 break;
11873
11874 case ASHIFT:
11875 if (index)
11876 return 0;
11877 index = XEXP (op, 0);
11878 tmp = XEXP (op, 1);
11879 if (!CONST_INT_P (tmp))
11880 return 0;
11881 scale = INTVAL (tmp);
11882 if ((unsigned HOST_WIDE_INT) scale > 3)
11883 return 0;
11884 scale = 1 << scale;
11885 break;
11886
11887 case ZERO_EXTEND:
11888 op = XEXP (op, 0);
11889 if (GET_CODE (op) != UNSPEC)
11890 return 0;
11891 /* FALLTHRU */
11892
11893 case UNSPEC:
11894 if (XINT (op, 1) == UNSPEC_TP
11895 && TARGET_TLS_DIRECT_SEG_REFS
11896 && seg == SEG_DEFAULT)
11897 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11898 else
11899 return 0;
11900 break;
11901
11902 case SUBREG:
11903 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11904 return 0;
11905 /* FALLTHRU */
11906
11907 case REG:
11908 if (!base)
11909 base = op;
11910 else if (!index)
11911 index = op;
11912 else
11913 return 0;
11914 break;
11915
11916 case CONST:
11917 case CONST_INT:
11918 case SYMBOL_REF:
11919 case LABEL_REF:
11920 if (disp)
11921 return 0;
11922 disp = op;
11923 break;
11924
11925 default:
11926 return 0;
11927 }
11928 }
11929 }
11930 else if (GET_CODE (addr) == MULT)
11931 {
11932 index = XEXP (addr, 0); /* index*scale */
11933 scale_rtx = XEXP (addr, 1);
11934 }
11935 else if (GET_CODE (addr) == ASHIFT)
11936 {
11937 /* We're called for lea too, which implements ashift on occasion. */
11938 index = XEXP (addr, 0);
11939 tmp = XEXP (addr, 1);
11940 if (!CONST_INT_P (tmp))
11941 return 0;
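      /* Shift counts 0..3 correspond to the representable index scales
	 of 1, 2, 4 and 8.  */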
11942 scale = INTVAL (tmp);
11943 if ((unsigned HOST_WIDE_INT) scale > 3)
11944 return 0;
11945 scale = 1 << scale;
11946 retval = -1;
11947 }
11948 else if (CONST_INT_P (addr))
11949 {
11950 if (!x86_64_immediate_operand (addr, VOIDmode))
11951 return 0;
11952
11953 /* Constant addresses are sign-extended to 64 bits; we have to
11954 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11955 if (TARGET_X32
11956 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11957 return 0;
11958
11959 disp = addr;
11960 }
11961 else
11962 disp = addr; /* displacement */
11963
11964 if (index)
11965 {
11966 if (REG_P (index))
11967 ;
11968 else if (GET_CODE (index) == SUBREG
11969 && ix86_address_subreg_operand (SUBREG_REG (index)))
11970 ;
11971 else
11972 return 0;
11973 }
11974
11975 /* Address override works only on the (%reg) part of %fs:(%reg). */
11976 if (seg != SEG_DEFAULT
11977 && ((base && GET_MODE (base) != word_mode)
11978 || (index && GET_MODE (index) != word_mode)))
11979 return 0;
11980
11981 /* Extract the integral value of scale. */
11982 if (scale_rtx)
11983 {
11984 if (!CONST_INT_P (scale_rtx))
11985 return 0;
11986 scale = INTVAL (scale_rtx);
11987 }
11988
11989 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11990 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11991
11992 /* Avoid useless 0 displacement. */
11993 if (disp == const0_rtx && (base || index))
11994 disp = NULL_RTX;
11995
11996 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11997 if (base_reg && index_reg && scale == 1
11998 && (index_reg == arg_pointer_rtx
11999 || index_reg == frame_pointer_rtx
12000 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12001 {
12002 rtx tmp;
12003 tmp = base, base = index, index = tmp;
12004 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12005 }
12006
12007 /* Special case: %ebp cannot be encoded as a base without a displacement.
12008 Similarly %r13. */
12009 if (!disp
12010 && base_reg
12011 && (base_reg == hard_frame_pointer_rtx
12012 || base_reg == frame_pointer_rtx
12013 || base_reg == arg_pointer_rtx
12014 || (REG_P (base_reg)
12015 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12016 || REGNO (base_reg) == R13_REG))))
12017 disp = const0_rtx;
12018
12019 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
12020 Avoid this by transforming to [%esi+0].
12021 Reload calls address legitimization without cfun defined, so we need
12022 to test cfun for being non-NULL. */
12023 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12024 && base_reg && !index_reg && !disp
12025 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12026 disp = const0_rtx;
12027
12028 /* Special case: encode reg+reg instead of reg*2. */
12029 if (!base && index && scale == 2)
12030 base = index, base_reg = index_reg, scale = 1;
12031
12032 /* Special case: scaling cannot be encoded without base or displacement. */
12033 if (!base && !disp && index && scale != 1)
12034 disp = const0_rtx;
12035
12036 out->base = base;
12037 out->index = index;
12038 out->disp = disp;
12039 out->scale = scale;
12040 out->seg = seg;
12041
12042 return retval;
12043 }
12044 \f
12045 /* Return the cost of the memory address X.
12046 For i386, it is better to use a complex address than let gcc copy
12047 the address into a reg and make a new pseudo. But not if the address
12048 requires two regs - that would mean more pseudos with longer
12049 lifetimes. */
12050 static int
12051 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12052 addr_space_t as ATTRIBUTE_UNUSED,
12053 bool speed ATTRIBUTE_UNUSED)
12054 {
12055 struct ix86_address parts;
12056 int cost = 1;
12057 int ok = ix86_decompose_address (x, &parts);
12058
12059 gcc_assert (ok);
12060
12061 if (parts.base && GET_CODE (parts.base) == SUBREG)
12062 parts.base = SUBREG_REG (parts.base);
12063 if (parts.index && GET_CODE (parts.index) == SUBREG)
12064 parts.index = SUBREG_REG (parts.index);
12065
12066 /* Attempt to minimize number of registers in the address. */
12067 if ((parts.base
12068 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12069 || (parts.index
12070 && (!REG_P (parts.index)
12071 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12072 cost++;
12073
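  /* If the base and index are distinct and neither is a hard register,
     the address ties up two registers, so charge one more.  */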
12074 if (parts.base
12075 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12076 && parts.index
12077 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12078 && parts.base != parts.index)
12079 cost++;
12080
12081 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12082 since its predecode logic can't detect the length of instructions
12083 and decoding degenerates to vector decoded. Increase cost of such
12084 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
12085 to split such addresses or even refuse such addresses at all.
12086
12087 Following addressing modes are affected:
12088 [base+scale*index]
12089 [scale*index+disp]
12090 [base+index]
12091
12092 The first and last case may be avoidable by explicitly coding the zero in
12093 memory address, but I don't have AMD-K6 machine handy to check this
12094 theory. */
12095
12096 if (TARGET_K6
12097 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12098 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12099 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12100 cost += 10;
12101
12102 return cost;
12103 }
12104 \f
12105 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
12106 this is used to form addresses to local data when -fPIC is in
12107 use. */
12108
12109 static bool
12110 darwin_local_data_pic (rtx disp)
12111 {
12112 return (GET_CODE (disp) == UNSPEC
12113 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12114 }
12115
12116 /* Determine if a given RTX is a valid constant. We already know this
12117 satisfies CONSTANT_P. */
12118
12119 static bool
12120 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12121 {
12122 switch (GET_CODE (x))
12123 {
12124 case CONST:
12125 x = XEXP (x, 0);
12126
12127 if (GET_CODE (x) == PLUS)
12128 {
12129 if (!CONST_INT_P (XEXP (x, 1)))
12130 return false;
12131 x = XEXP (x, 0);
12132 }
12133
12134 if (TARGET_MACHO && darwin_local_data_pic (x))
12135 return true;
12136
12137 /* Only some unspecs are valid as "constants". */
12138 if (GET_CODE (x) == UNSPEC)
12139 switch (XINT (x, 1))
12140 {
12141 case UNSPEC_GOT:
12142 case UNSPEC_GOTOFF:
12143 case UNSPEC_PLTOFF:
12144 return TARGET_64BIT;
12145 case UNSPEC_TPOFF:
12146 case UNSPEC_NTPOFF:
12147 x = XVECEXP (x, 0, 0);
12148 return (GET_CODE (x) == SYMBOL_REF
12149 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12150 case UNSPEC_DTPOFF:
12151 x = XVECEXP (x, 0, 0);
12152 return (GET_CODE (x) == SYMBOL_REF
12153 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12154 default:
12155 return false;
12156 }
12157
12158 /* We must have drilled down to a symbol. */
12159 if (GET_CODE (x) == LABEL_REF)
12160 return true;
12161 if (GET_CODE (x) != SYMBOL_REF)
12162 return false;
12163 /* FALLTHRU */
12164
12165 case SYMBOL_REF:
12166 /* TLS symbols are never valid. */
12167 if (SYMBOL_REF_TLS_MODEL (x))
12168 return false;
12169
12170 /* DLLIMPORT symbols are never valid. */
12171 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12172 && SYMBOL_REF_DLLIMPORT_P (x))
12173 return false;
12174
12175 #if TARGET_MACHO
12176 /* mdynamic-no-pic */
12177 if (MACHO_DYNAMIC_NO_PIC_P)
12178 return machopic_symbol_defined_p (x);
12179 #endif
12180 break;
12181
12182 case CONST_DOUBLE:
12183 if (GET_MODE (x) == TImode
12184 && x != CONST0_RTX (TImode)
12185 && !TARGET_64BIT)
12186 return false;
12187 break;
12188
12189 case CONST_VECTOR:
12190 if (!standard_sse_constant_p (x))
12191 return false;
12192
12193 default:
12194 break;
12195 }
12196
12197 /* Otherwise we handle everything else in the move patterns. */
12198 return true;
12199 }
12200
12201 /* Determine if it's legal to put X into the constant pool. This
12202 is not possible for the address of thread-local symbols, which
12203 is checked above. */
12204
12205 static bool
12206 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12207 {
12208 /* We can always put integral constants and vectors in memory. */
12209 switch (GET_CODE (x))
12210 {
12211 case CONST_INT:
12212 case CONST_DOUBLE:
12213 case CONST_VECTOR:
12214 return false;
12215
12216 default:
12217 break;
12218 }
12219 return !ix86_legitimate_constant_p (mode, x);
12220 }
12221
12222
12223 /* Nonzero if the constant value X is a legitimate general operand
12224 when generating PIC code. It is given that flag_pic is on and
12225 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12226
12227 bool
12228 legitimate_pic_operand_p (rtx x)
12229 {
12230 rtx inner;
12231
12232 switch (GET_CODE (x))
12233 {
12234 case CONST:
12235 inner = XEXP (x, 0);
12236 if (GET_CODE (inner) == PLUS
12237 && CONST_INT_P (XEXP (inner, 1)))
12238 inner = XEXP (inner, 0);
12239
12240 /* Only some unspecs are valid as "constants". */
12241 if (GET_CODE (inner) == UNSPEC)
12242 switch (XINT (inner, 1))
12243 {
12244 case UNSPEC_GOT:
12245 case UNSPEC_GOTOFF:
12246 case UNSPEC_PLTOFF:
12247 return TARGET_64BIT;
12248 case UNSPEC_TPOFF:
12249 x = XVECEXP (inner, 0, 0);
12250 return (GET_CODE (x) == SYMBOL_REF
12251 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12252 case UNSPEC_MACHOPIC_OFFSET:
12253 return legitimate_pic_address_disp_p (x);
12254 default:
12255 return false;
12256 }
12257 /* FALLTHRU */
12258
12259 case SYMBOL_REF:
12260 case LABEL_REF:
12261 return legitimate_pic_address_disp_p (x);
12262
12263 default:
12264 return true;
12265 }
12266 }
12267
12268 /* Determine if a given CONST RTX is a valid memory displacement
12269 in PIC mode. */
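/* For illustration: in 32-bit PIC code a displacement like
 (const (unspec [foo] UNSPEC_GOTOFF)), possibly plus a CONST_INT, can be
 accepted (when gotoff_operand allows it), while a bare SYMBOL_REF is
 not; the 64-bit cases are handled separately below.  */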
12270
12271 bool
12272 legitimate_pic_address_disp_p (rtx disp)
12273 {
12274 bool saw_plus;
12275
12276 /* In 64bit mode we can allow direct addresses of symbols and labels
12277 when they are not dynamic symbols. */
12278 if (TARGET_64BIT)
12279 {
12280 rtx op0 = disp, op1;
12281
12282 switch (GET_CODE (disp))
12283 {
12284 case LABEL_REF:
12285 return true;
12286
12287 case CONST:
12288 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12289 break;
12290 op0 = XEXP (XEXP (disp, 0), 0);
12291 op1 = XEXP (XEXP (disp, 0), 1);
12292 if (!CONST_INT_P (op1)
12293 || INTVAL (op1) >= 16*1024*1024
12294 || INTVAL (op1) < -16*1024*1024)
12295 break;
12296 if (GET_CODE (op0) == LABEL_REF)
12297 return true;
12298 if (GET_CODE (op0) == CONST
12299 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12300 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12301 return true;
12302 if (GET_CODE (op0) == UNSPEC
12303 && XINT (op0, 1) == UNSPEC_PCREL)
12304 return true;
12305 if (GET_CODE (op0) != SYMBOL_REF)
12306 break;
12307 /* FALLTHRU */
12308
12309 case SYMBOL_REF:
12310 /* TLS references should always be enclosed in UNSPEC. */
12311 if (SYMBOL_REF_TLS_MODEL (op0))
12312 return false;
12313 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12314 && ix86_cmodel != CM_LARGE_PIC)
12315 return true;
12316 break;
12317
12318 default:
12319 break;
12320 }
12321 }
12322 if (GET_CODE (disp) != CONST)
12323 return false;
12324 disp = XEXP (disp, 0);
12325
12326 if (TARGET_64BIT)
12327 {
12328 /* It is unsafe to allow PLUS expressions here; this limits the allowed
12329 distance of GOT references. We should not need them anyway. */
12330 if (GET_CODE (disp) != UNSPEC
12331 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12332 && XINT (disp, 1) != UNSPEC_GOTOFF
12333 && XINT (disp, 1) != UNSPEC_PCREL
12334 && XINT (disp, 1) != UNSPEC_PLTOFF))
12335 return false;
12336
12337 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12338 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12339 return false;
12340 return true;
12341 }
12342
12343 saw_plus = false;
12344 if (GET_CODE (disp) == PLUS)
12345 {
12346 if (!CONST_INT_P (XEXP (disp, 1)))
12347 return false;
12348 disp = XEXP (disp, 0);
12349 saw_plus = true;
12350 }
12351
12352 if (TARGET_MACHO && darwin_local_data_pic (disp))
12353 return true;
12354
12355 if (GET_CODE (disp) != UNSPEC)
12356 return false;
12357
12358 switch (XINT (disp, 1))
12359 {
12360 case UNSPEC_GOT:
12361 if (saw_plus)
12362 return false;
12363 /* We need to check for both symbols and labels because VxWorks loads
12364 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12365 details. */
12366 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12367 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12368 case UNSPEC_GOTOFF:
12369 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12370 While the ABI also specifies a 32bit relocation, we don't produce
12371 it in the small PIC model at all. */
12372 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12373 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12374 && !TARGET_64BIT)
12375 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12376 return false;
12377 case UNSPEC_GOTTPOFF:
12378 case UNSPEC_GOTNTPOFF:
12379 case UNSPEC_INDNTPOFF:
12380 if (saw_plus)
12381 return false;
12382 disp = XVECEXP (disp, 0, 0);
12383 return (GET_CODE (disp) == SYMBOL_REF
12384 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12385 case UNSPEC_NTPOFF:
12386 disp = XVECEXP (disp, 0, 0);
12387 return (GET_CODE (disp) == SYMBOL_REF
12388 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12389 case UNSPEC_DTPOFF:
12390 disp = XVECEXP (disp, 0, 0);
12391 return (GET_CODE (disp) == SYMBOL_REF
12392 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12393 }
12394
12395 return false;
12396 }
12397
12398 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns true if parts
12399 of the address X have been reloaded and the calling macro should goto
12400 WIN, false if the original X should be used unchanged. */
12402
12403 bool
12404 ix86_legitimize_reload_address (rtx x,
12405 enum machine_mode mode ATTRIBUTE_UNUSED,
12406 int opnum, int type,
12407 int ind_levels ATTRIBUTE_UNUSED)
12408 {
12409 /* Reload can generate:
12410
12411 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12412 (reg:DI 97))
12413 (reg:DI 2 cx))
12414
12415 This RTX is rejected by ix86_legitimate_address_p due to
12416 non-strictness of base register 97. Following this rejection,
12417 reload pushes all three components into separate registers,
12418 creating an invalid memory address RTX.
12419
12420 The following code reloads only the invalid part of the
12421 memory address RTX. */
12422
12423 if (GET_CODE (x) == PLUS
12424 && REG_P (XEXP (x, 1))
12425 && GET_CODE (XEXP (x, 0)) == PLUS
12426 && REG_P (XEXP (XEXP (x, 0), 1)))
12427 {
12428 rtx base, index;
12429 bool something_reloaded = false;
12430
12431 base = XEXP (XEXP (x, 0), 1);
12432 if (!REG_OK_FOR_BASE_STRICT_P (base))
12433 {
12434 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12435 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12436 opnum, (enum reload_type) type);
12437 something_reloaded = true;
12438 }
12439
12440 index = XEXP (x, 1);
12441 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12442 {
12443 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12444 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12445 opnum, (enum reload_type) type);
12446 something_reloaded = true;
12447 }
12448
12449 gcc_assert (something_reloaded);
12450 return true;
12451 }
12452
12453 return false;
12454 }
12455
12456 /* Recognizes RTL expressions that are valid memory addresses for an
12457 instruction. The MODE argument is the machine mode for the MEM
12458 expression that wants to use this address.
12459
12460 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12461 convert common non-canonical forms to canonical form so that they will
12462 be recognized. */
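/* For reference, an x86 address in canonical form is

 base + index*scale + disp

 where BASE and INDEX are optional registers, SCALE is 1, 2, 4 or 8, and
 DISP is an optional constant or symbolic displacement; the checks below
 validate each component in turn.  */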
12463
12464 static bool
12465 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12466 rtx addr, bool strict)
12467 {
12468 struct ix86_address parts;
12469 rtx base, index, disp;
12470 HOST_WIDE_INT scale;
12471
12472 if (ix86_decompose_address (addr, &parts) <= 0)
12473 /* Decomposition failed. */
12474 return false;
12475
12476 base = parts.base;
12477 index = parts.index;
12478 disp = parts.disp;
12479 scale = parts.scale;
12480
12481 /* Validate base register. */
12482 if (base)
12483 {
12484 rtx reg;
12485
12486 if (REG_P (base))
12487 reg = base;
12488 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12489 reg = SUBREG_REG (base);
12490 else
12491 /* Base is not a register. */
12492 return false;
12493
12494 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12495 return false;
12496
12497 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12498 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12499 /* Base is not valid. */
12500 return false;
12501 }
12502
12503 /* Validate index register. */
12504 if (index)
12505 {
12506 rtx reg;
12507
12508 if (REG_P (index))
12509 reg = index;
12510 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12511 reg = SUBREG_REG (index);
12512 else
12513 /* Index is not a register. */
12514 return false;
12515
12516 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12517 return false;
12518
12519 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12520 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12521 /* Index is not valid. */
12522 return false;
12523 }
12524
12525 /* Index and base should have the same mode. */
12526 if (base && index
12527 && GET_MODE (base) != GET_MODE (index))
12528 return false;
12529
12530 /* Validate scale factor. */
12531 if (scale != 1)
12532 {
12533 if (!index)
12534 /* Scale without index. */
12535 return false;
12536
12537 if (scale != 2 && scale != 4 && scale != 8)
12538 /* Scale is not a valid multiplier. */
12539 return false;
12540 }
12541
12542 /* Validate displacement. */
12543 if (disp)
12544 {
12545 if (GET_CODE (disp) == CONST
12546 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12547 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12548 switch (XINT (XEXP (disp, 0), 1))
12549 {
12550 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12551 used. While the ABI also specifies 32bit relocations, we don't produce
12552 them at all and use IP-relative addressing instead. */
12553 case UNSPEC_GOT:
12554 case UNSPEC_GOTOFF:
12555 gcc_assert (flag_pic);
12556 if (!TARGET_64BIT)
12557 goto is_legitimate_pic;
12558
12559 /* 64bit address unspec. */
12560 return false;
12561
12562 case UNSPEC_GOTPCREL:
12563 case UNSPEC_PCREL:
12564 gcc_assert (flag_pic);
12565 goto is_legitimate_pic;
12566
12567 case UNSPEC_GOTTPOFF:
12568 case UNSPEC_GOTNTPOFF:
12569 case UNSPEC_INDNTPOFF:
12570 case UNSPEC_NTPOFF:
12571 case UNSPEC_DTPOFF:
12572 break;
12573
12574 case UNSPEC_STACK_CHECK:
12575 gcc_assert (flag_split_stack);
12576 break;
12577
12578 default:
12579 /* Invalid address unspec. */
12580 return false;
12581 }
12582
12583 else if (SYMBOLIC_CONST (disp)
12584 && (flag_pic
12585 || (TARGET_MACHO
12586 #if TARGET_MACHO
12587 && MACHOPIC_INDIRECT
12588 && !machopic_operand_p (disp)
12589 #endif
12590 )))
12591 {
12592
12593 is_legitimate_pic:
12594 if (TARGET_64BIT && (index || base))
12595 {
12596 /* foo@dtpoff(%rX) is ok. */
12597 if (GET_CODE (disp) != CONST
12598 || GET_CODE (XEXP (disp, 0)) != PLUS
12599 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12600 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12601 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12602 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12603 /* Non-constant pic memory reference. */
12604 return false;
12605 }
12606 else if ((!TARGET_MACHO || flag_pic)
12607 && ! legitimate_pic_address_disp_p (disp))
12608 /* Displacement is an invalid pic construct. */
12609 return false;
12610 #if TARGET_MACHO
12611 else if (MACHO_DYNAMIC_NO_PIC_P
12612 && !ix86_legitimate_constant_p (Pmode, disp))
12613 /* displacement must be referenced via non_lazy_pointer */
12614 return false;
12615 #endif
12616
12617 /* This code used to verify that a symbolic pic displacement
12618 includes the pic_offset_table_rtx register.
12619
12620 While this is a good idea, unfortunately these constructs may
12621 be created by the "adds using lea" optimization for incorrect
12622 code like:
12623
12624 int a;
12625 int foo(int i)
12626 {
12627 return *(&a+i);
12628 }
12629
12630 This code is nonsensical, but results in addressing the
12631 GOT table with a pic_offset_table_rtx base. We can't
12632 just refuse it easily, since it gets matched by the
12633 "addsi3" pattern, which later gets split to an lea when
12634 the output register differs from the input. While this
12635 could be handled by a separate addsi pattern for this case
12636 that never results in an lea, disabling this test seems to
12637 be the easier and correct fix for the crash. */
12638 }
12639 else if (GET_CODE (disp) != LABEL_REF
12640 && !CONST_INT_P (disp)
12641 && (GET_CODE (disp) != CONST
12642 || !ix86_legitimate_constant_p (Pmode, disp))
12643 && (GET_CODE (disp) != SYMBOL_REF
12644 || !ix86_legitimate_constant_p (Pmode, disp)))
12645 /* Displacement is not constant. */
12646 return false;
12647 else if (TARGET_64BIT
12648 && !x86_64_immediate_operand (disp, VOIDmode))
12649 /* Displacement is out of range. */
12650 return false;
12651 }
12652
12653 /* Everything looks valid. */
12654 return true;
12655 }
12656
12657 /* Determine if a given RTX is a valid constant address. */
12658
12659 bool
12660 constant_address_p (rtx x)
12661 {
12662 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12663 }
12664 \f
12665 /* Return a unique alias set for the GOT. */
12666
12667 static alias_set_type
12668 ix86_GOT_alias_set (void)
12669 {
12670 static alias_set_type set = -1;
12671 if (set == -1)
12672 set = new_alias_set ();
12673 return set;
12674 }
12675
12676 /* Return a legitimate reference for ORIG (an address) using the
12677 register REG. If REG is 0, a new pseudo is generated.
12678
12679 There are two types of references that must be handled:
12680
12681 1. Global data references must load the address from the GOT, via
12682 the PIC reg. An insn is emitted to do this load, and the reg is
12683 returned.
12684
12685 2. Static data references, constant pool addresses, and code labels
12686 compute the address as an offset from the GOT, whose base is in
12687 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12688 differentiate them from global data objects. The returned
12689 address is the PIC reg + an unspec constant.
12690
12691 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12692 reg also appears in the address. */
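/* Illustrative 32-bit examples (the 64-bit variants below use @GOTPCREL
 or direct RIP-relative addresses instead): a global symbol is loaded
 with something like "movl foo@GOT(%ebx), %reg", while a local symbol is
 formed as "leal foo@GOTOFF(%ebx), %reg".  */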
12693
12694 static rtx
12695 legitimize_pic_address (rtx orig, rtx reg)
12696 {
12697 rtx addr = orig;
12698 rtx new_rtx = orig;
12699 rtx base;
12700
12701 #if TARGET_MACHO
12702 if (TARGET_MACHO && !TARGET_64BIT)
12703 {
12704 if (reg == 0)
12705 reg = gen_reg_rtx (Pmode);
12706 /* Use the generic Mach-O PIC machinery. */
12707 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12708 }
12709 #endif
12710
12711 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12712 new_rtx = addr;
12713 else if (TARGET_64BIT
12714 && ix86_cmodel != CM_SMALL_PIC
12715 && gotoff_operand (addr, Pmode))
12716 {
12717 rtx tmpreg;
12718 /* This symbol may be referenced via a displacement from the PIC
12719 base address (@GOTOFF). */
12720
12721 if (reload_in_progress)
12722 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12723 if (GET_CODE (addr) == CONST)
12724 addr = XEXP (addr, 0);
12725 if (GET_CODE (addr) == PLUS)
12726 {
12727 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12728 UNSPEC_GOTOFF);
12729 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12730 }
12731 else
12732 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12733 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12734 if (!reg)
12735 tmpreg = gen_reg_rtx (Pmode);
12736 else
12737 tmpreg = reg;
12738 emit_move_insn (tmpreg, new_rtx);
12739
12740 if (reg != 0)
12741 {
12742 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12743 tmpreg, 1, OPTAB_DIRECT);
12744 new_rtx = reg;
12745 }
12746 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12747 }
12748 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12749 {
12750 /* This symbol may be referenced via a displacement from the PIC
12751 base address (@GOTOFF). */
12752
12753 if (reload_in_progress)
12754 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12755 if (GET_CODE (addr) == CONST)
12756 addr = XEXP (addr, 0);
12757 if (GET_CODE (addr) == PLUS)
12758 {
12759 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12760 UNSPEC_GOTOFF);
12761 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12762 }
12763 else
12764 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12765 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12766 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12767
12768 if (reg != 0)
12769 {
12770 emit_move_insn (reg, new_rtx);
12771 new_rtx = reg;
12772 }
12773 }
12774 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12775 /* We can't use @GOTOFF for text labels on VxWorks;
12776 see gotoff_operand. */
12777 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12778 {
12779 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12780 {
12781 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12782 return legitimize_dllimport_symbol (addr, true);
12783 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12784 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12785 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12786 {
12787 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12788 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12789 }
12790 }
12791
12792 /* For x64 PE-COFF there is no GOT table, so we use the address
12793 directly. */
12794 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12795 {
12796 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12797 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12798
12799 if (reg == 0)
12800 reg = gen_reg_rtx (Pmode);
12801 emit_move_insn (reg, new_rtx);
12802 new_rtx = reg;
12803 }
12804 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12805 {
12806 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12807 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12808 new_rtx = gen_const_mem (Pmode, new_rtx);
12809 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12810
12811 if (reg == 0)
12812 reg = gen_reg_rtx (Pmode);
12813 /* Use gen_movsi directly, otherwise the address is loaded
12814 into a register for CSE. We don't want to CSE these addresses;
12815 instead we CSE addresses from the GOT table, so skip this. */
12816 emit_insn (gen_movsi (reg, new_rtx));
12817 new_rtx = reg;
12818 }
12819 else
12820 {
12821 /* This symbol must be referenced via a load from the
12822 Global Offset Table (@GOT). */
12823
12824 if (reload_in_progress)
12825 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12826 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12827 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12828 if (TARGET_64BIT)
12829 new_rtx = force_reg (Pmode, new_rtx);
12830 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12831 new_rtx = gen_const_mem (Pmode, new_rtx);
12832 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12833
12834 if (reg == 0)
12835 reg = gen_reg_rtx (Pmode);
12836 emit_move_insn (reg, new_rtx);
12837 new_rtx = reg;
12838 }
12839 }
12840 else
12841 {
12842 if (CONST_INT_P (addr)
12843 && !x86_64_immediate_operand (addr, VOIDmode))
12844 {
12845 if (reg)
12846 {
12847 emit_move_insn (reg, addr);
12848 new_rtx = reg;
12849 }
12850 else
12851 new_rtx = force_reg (Pmode, addr);
12852 }
12853 else if (GET_CODE (addr) == CONST)
12854 {
12855 addr = XEXP (addr, 0);
12856
12857 /* We must match stuff we generate before. Assume the only
12858 unspecs that can get here are ours. Not that we could do
12859 anything with them anyway.... */
12860 if (GET_CODE (addr) == UNSPEC
12861 || (GET_CODE (addr) == PLUS
12862 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12863 return orig;
12864 gcc_assert (GET_CODE (addr) == PLUS);
12865 }
12866 if (GET_CODE (addr) == PLUS)
12867 {
12868 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12869
12870 /* Check first to see if this is a constant offset from a @GOTOFF
12871 symbol reference. */
12872 if (gotoff_operand (op0, Pmode)
12873 && CONST_INT_P (op1))
12874 {
12875 if (!TARGET_64BIT)
12876 {
12877 if (reload_in_progress)
12878 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12879 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12880 UNSPEC_GOTOFF);
12881 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12882 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12883 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12884
12885 if (reg != 0)
12886 {
12887 emit_move_insn (reg, new_rtx);
12888 new_rtx = reg;
12889 }
12890 }
12891 else
12892 {
12893 if (INTVAL (op1) < -16*1024*1024
12894 || INTVAL (op1) >= 16*1024*1024)
12895 {
12896 if (!x86_64_immediate_operand (op1, Pmode))
12897 op1 = force_reg (Pmode, op1);
12898 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12899 }
12900 }
12901 }
12902 else
12903 {
12904 base = legitimize_pic_address (XEXP (addr, 0), reg);
12905 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12906 base == reg ? NULL_RTX : reg);
12907
12908 if (CONST_INT_P (new_rtx))
12909 new_rtx = plus_constant (Pmode, base, INTVAL (new_rtx));
12910 else
12911 {
12912 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12913 {
12914 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12915 new_rtx = XEXP (new_rtx, 1);
12916 }
12917 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12918 }
12919 }
12920 }
12921 }
12922 return new_rtx;
12923 }
12924 \f
12925 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12926
12927 static rtx
12928 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12929 {
12930 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12931
12932 if (GET_MODE (tp) != tp_mode)
12933 {
12934 gcc_assert (GET_MODE (tp) == SImode);
12935 gcc_assert (tp_mode == DImode);
12936
12937 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12938 }
12939
12940 if (to_reg)
12941 tp = copy_to_mode_reg (tp_mode, tp);
12942
12943 return tp;
12944 }
12945
12946 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12947
12948 static GTY(()) rtx ix86_tls_symbol;
12949
12950 static rtx
12951 ix86_tls_get_addr (void)
12952 {
12953 if (!ix86_tls_symbol)
12954 {
12955 const char *sym
12956 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12957 ? "___tls_get_addr" : "__tls_get_addr");
12958
12959 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12960 }
12961
12962 return ix86_tls_symbol;
12963 }
12964
12965 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12966
12967 static GTY(()) rtx ix86_tls_module_base_symbol;
12968
12969 rtx
12970 ix86_tls_module_base (void)
12971 {
12972 if (!ix86_tls_module_base_symbol)
12973 {
12974 ix86_tls_module_base_symbol
12975 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12976
12977 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12978 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12979 }
12980
12981 return ix86_tls_module_base_symbol;
12982 }
12983
12984 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12985 false if we expect this to be used for a memory address and true if
12986 we expect to load the address into a register. */
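/* Roughly (see the cases below for the details): GLOBAL_DYNAMIC and
 LOCAL_DYNAMIC call __tls_get_addr (or use the GNU2 descriptor scheme
 with TARGET_GNU2_TLS), while INITIAL_EXEC and LOCAL_EXEC compute the
 address as an offset from the thread pointer in %fs/%gs.  */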
12987
12988 static rtx
12989 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12990 {
12991 rtx dest, base, off;
12992 rtx pic = NULL_RTX, tp = NULL_RTX;
12993 enum machine_mode tp_mode = Pmode;
12994 int type;
12995
12996 switch (model)
12997 {
12998 case TLS_MODEL_GLOBAL_DYNAMIC:
12999 dest = gen_reg_rtx (Pmode);
13000
13001 if (!TARGET_64BIT)
13002 {
13003 if (flag_pic)
13004 pic = pic_offset_table_rtx;
13005 else
13006 {
13007 pic = gen_reg_rtx (Pmode);
13008 emit_insn (gen_set_got (pic));
13009 }
13010 }
13011
13012 if (TARGET_GNU2_TLS)
13013 {
13014 if (TARGET_64BIT)
13015 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13016 else
13017 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13018
13019 tp = get_thread_pointer (Pmode, true);
13020 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13021
13022 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13023 }
13024 else
13025 {
13026 rtx caddr = ix86_tls_get_addr ();
13027
13028 if (TARGET_64BIT)
13029 {
13030 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
13031
13032 start_sequence ();
13033 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
13034 caddr));
13035 insns = get_insns ();
13036 end_sequence ();
13037
13038 RTL_CONST_CALL_P (insns) = 1;
13039 emit_libcall_block (insns, dest, rax, x);
13040 }
13041 else
13042 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13043 }
13044 break;
13045
13046 case TLS_MODEL_LOCAL_DYNAMIC:
13047 base = gen_reg_rtx (Pmode);
13048
13049 if (!TARGET_64BIT)
13050 {
13051 if (flag_pic)
13052 pic = pic_offset_table_rtx;
13053 else
13054 {
13055 pic = gen_reg_rtx (Pmode);
13056 emit_insn (gen_set_got (pic));
13057 }
13058 }
13059
13060 if (TARGET_GNU2_TLS)
13061 {
13062 rtx tmp = ix86_tls_module_base ();
13063
13064 if (TARGET_64BIT)
13065 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13066 else
13067 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13068
13069 tp = get_thread_pointer (Pmode, true);
13070 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13071 gen_rtx_MINUS (Pmode, tmp, tp));
13072 }
13073 else
13074 {
13075 rtx caddr = ix86_tls_get_addr ();
13076
13077 if (TARGET_64BIT)
13078 {
13079 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
13080
13081 start_sequence ();
13082 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
13083 caddr));
13084 insns = get_insns ();
13085 end_sequence ();
13086
13087 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13088 share the LD_BASE result with other LD model accesses. */
13089 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13090 UNSPEC_TLS_LD_BASE);
13091
13092 RTL_CONST_CALL_P (insns) = 1;
13093 emit_libcall_block (insns, base, rax, eqv);
13094 }
13095 else
13096 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13097 }
13098
13099 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13100 off = gen_rtx_CONST (Pmode, off);
13101
13102 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13103
13104 if (TARGET_GNU2_TLS)
13105 {
13106 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13107
13108 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13109 }
13110 break;
13111
13112 case TLS_MODEL_INITIAL_EXEC:
13113 if (TARGET_64BIT)
13114 {
13115 if (TARGET_SUN_TLS && !TARGET_X32)
13116 {
13117 /* The Sun linker took the AMD64 TLS spec literally
13118 and can only handle %rax as destination of the
13119 initial executable code sequence. */
13120
13121 dest = gen_reg_rtx (DImode);
13122 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13123 return dest;
13124 }
13125
13126 /* Generate DImode references to avoid %fs:(%reg32)
13127 problems and the linker IE->LE relaxation bug. */
13128 tp_mode = DImode;
13129 pic = NULL;
13130 type = UNSPEC_GOTNTPOFF;
13131 }
13132 else if (flag_pic)
13133 {
13134 if (reload_in_progress)
13135 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13136 pic = pic_offset_table_rtx;
13137 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13138 }
13139 else if (!TARGET_ANY_GNU_TLS)
13140 {
13141 pic = gen_reg_rtx (Pmode);
13142 emit_insn (gen_set_got (pic));
13143 type = UNSPEC_GOTTPOFF;
13144 }
13145 else
13146 {
13147 pic = NULL;
13148 type = UNSPEC_INDNTPOFF;
13149 }
13150
13151 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13152 off = gen_rtx_CONST (tp_mode, off);
13153 if (pic)
13154 off = gen_rtx_PLUS (tp_mode, pic, off);
13155 off = gen_const_mem (tp_mode, off);
13156 set_mem_alias_set (off, ix86_GOT_alias_set ());
13157
13158 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13159 {
13160 base = get_thread_pointer (tp_mode,
13161 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13162 off = force_reg (tp_mode, off);
13163 return gen_rtx_PLUS (tp_mode, base, off);
13164 }
13165 else
13166 {
13167 base = get_thread_pointer (Pmode, true);
13168 dest = gen_reg_rtx (Pmode);
13169 emit_insn (ix86_gen_sub3 (dest, base, off));
13170 }
13171 break;
13172
13173 case TLS_MODEL_LOCAL_EXEC:
13174 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13175 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13176 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13177 off = gen_rtx_CONST (Pmode, off);
13178
13179 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13180 {
13181 base = get_thread_pointer (Pmode,
13182 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13183 return gen_rtx_PLUS (Pmode, base, off);
13184 }
13185 else
13186 {
13187 base = get_thread_pointer (Pmode, true);
13188 dest = gen_reg_rtx (Pmode);
13189 emit_insn (ix86_gen_sub3 (dest, base, off));
13190 }
13191 break;
13192
13193 default:
13194 gcc_unreachable ();
13195 }
13196
13197 return dest;
13198 }
13199
13200 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13201 to symbol DECL. */
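/* For example, for a dllimported function or variable named "foo" this
 builds an artificial read-only pointer VAR_DECL whose RTL is a memory
 reference to "__imp_foo" (or "__imp__foo" when the target uses an
 underscore user label prefix).  */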
13202
13203 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13204 htab_t dllimport_map;
13205
13206 static tree
13207 get_dllimport_decl (tree decl)
13208 {
13209 struct tree_map *h, in;
13210 void **loc;
13211 const char *name;
13212 const char *prefix;
13213 size_t namelen, prefixlen;
13214 char *imp_name;
13215 tree to;
13216 rtx rtl;
13217
13218 if (!dllimport_map)
13219 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13220
13221 in.hash = htab_hash_pointer (decl);
13222 in.base.from = decl;
13223 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13224 h = (struct tree_map *) *loc;
13225 if (h)
13226 return h->to;
13227
13228 *loc = h = ggc_alloc_tree_map ();
13229 h->hash = in.hash;
13230 h->base.from = decl;
13231 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13232 VAR_DECL, NULL, ptr_type_node);
13233 DECL_ARTIFICIAL (to) = 1;
13234 DECL_IGNORED_P (to) = 1;
13235 DECL_EXTERNAL (to) = 1;
13236 TREE_READONLY (to) = 1;
13237
13238 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13239 name = targetm.strip_name_encoding (name);
13240 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13241 ? "*__imp_" : "*__imp__";
13242 namelen = strlen (name);
13243 prefixlen = strlen (prefix);
13244 imp_name = (char *) alloca (namelen + prefixlen + 1);
13245 memcpy (imp_name, prefix, prefixlen);
13246 memcpy (imp_name + prefixlen, name, namelen + 1);
13247
13248 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13249 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13250 SET_SYMBOL_REF_DECL (rtl, to);
13251 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13252
13253 rtl = gen_const_mem (Pmode, rtl);
13254 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13255
13256 SET_DECL_RTL (to, rtl);
13257 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13258
13259 return to;
13260 }
13261
13262 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13263 true if we require the result to be in a register. */
13264
13265 static rtx
13266 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13267 {
13268 tree imp_decl;
13269 rtx x;
13270
13271 gcc_assert (SYMBOL_REF_DECL (symbol));
13272 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13273
13274 x = DECL_RTL (imp_decl);
13275 if (want_reg)
13276 x = force_reg (Pmode, x);
13277 return x;
13278 }
13279
13280 /* Try machine-dependent ways of modifying an illegitimate address
13281 to be legitimate. If we find one, return the new, valid address.
13282 This macro is used in only one place: `memory_address' in explow.c.
13283
13284 OLDX is the address as it was before break_out_memory_refs was called.
13285 In some cases it is useful to look at this to decide what needs to be done.
13286
13287 It is always safe for this macro to do nothing. It exists to recognize
13288 opportunities to optimize the output.
13289
13290 For the 80386, we handle X+REG by loading X into a register R and
13291 using R+REG. R will go in a general reg and indexing will be used.
13292 However, if REG is a broken-out memory address or multiplication,
13293 nothing needs to be done because REG can certainly go in a general reg.
13294
13295 When -fpic is used, special handling is needed for symbolic references.
13296 See comments by legitimize_pic_address in i386.c for details. */
13297
13298 static rtx
13299 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13300 enum machine_mode mode)
13301 {
13302 int changed = 0;
13303 unsigned log;
13304
13305 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13306 if (log)
13307 return legitimize_tls_address (x, (enum tls_model) log, false);
13308 if (GET_CODE (x) == CONST
13309 && GET_CODE (XEXP (x, 0)) == PLUS
13310 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13311 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13312 {
13313 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13314 (enum tls_model) log, false);
13315 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13316 }
13317
13318 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13319 {
13320 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13321 return legitimize_dllimport_symbol (x, true);
13322 if (GET_CODE (x) == CONST
13323 && GET_CODE (XEXP (x, 0)) == PLUS
13324 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13325 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13326 {
13327 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13328 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13329 }
13330 }
13331
13332 if (flag_pic && SYMBOLIC_CONST (x))
13333 return legitimize_pic_address (x, 0);
13334
13335 #if TARGET_MACHO
13336 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13337 return machopic_indirect_data_reference (x, 0);
13338 #endif
13339
13340 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
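 /* For example, (ashift (reg) (const_int 2)) becomes
 (mult (reg) (const_int 4)), which matches the scaled-index part of an
 address.  */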
13341 if (GET_CODE (x) == ASHIFT
13342 && CONST_INT_P (XEXP (x, 1))
13343 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13344 {
13345 changed = 1;
13346 log = INTVAL (XEXP (x, 1));
13347 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13348 GEN_INT (1 << log));
13349 }
13350
13351 if (GET_CODE (x) == PLUS)
13352 {
13353 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13354
13355 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13356 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13357 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13358 {
13359 changed = 1;
13360 log = INTVAL (XEXP (XEXP (x, 0), 1));
13361 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13362 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13363 GEN_INT (1 << log));
13364 }
13365
13366 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13367 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13368 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13369 {
13370 changed = 1;
13371 log = INTVAL (XEXP (XEXP (x, 1), 1));
13372 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13373 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13374 GEN_INT (1 << log));
13375 }
13376
13377 /* Put multiply first if it isn't already. */
13378 if (GET_CODE (XEXP (x, 1)) == MULT)
13379 {
13380 rtx tmp = XEXP (x, 0);
13381 XEXP (x, 0) = XEXP (x, 1);
13382 XEXP (x, 1) = tmp;
13383 changed = 1;
13384 }
13385
13386 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13387 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13388 created by virtual register instantiation, register elimination, and
13389 similar optimizations. */
13390 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13391 {
13392 changed = 1;
13393 x = gen_rtx_PLUS (Pmode,
13394 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13395 XEXP (XEXP (x, 1), 0)),
13396 XEXP (XEXP (x, 1), 1));
13397 }
13398
13399 /* Canonicalize
13400 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13401 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13402 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13403 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13404 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13405 && CONSTANT_P (XEXP (x, 1)))
13406 {
13407 rtx constant;
13408 rtx other = NULL_RTX;
13409
13410 if (CONST_INT_P (XEXP (x, 1)))
13411 {
13412 constant = XEXP (x, 1);
13413 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13414 }
13415 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13416 {
13417 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13418 other = XEXP (x, 1);
13419 }
13420 else
13421 constant = 0;
13422
13423 if (constant)
13424 {
13425 changed = 1;
13426 x = gen_rtx_PLUS (Pmode,
13427 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13428 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13429 plus_constant (Pmode, other,
13430 INTVAL (constant)));
13431 }
13432 }
13433
13434 if (changed && ix86_legitimate_address_p (mode, x, false))
13435 return x;
13436
13437 if (GET_CODE (XEXP (x, 0)) == MULT)
13438 {
13439 changed = 1;
13440 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13441 }
13442
13443 if (GET_CODE (XEXP (x, 1)) == MULT)
13444 {
13445 changed = 1;
13446 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13447 }
13448
13449 if (changed
13450 && REG_P (XEXP (x, 1))
13451 && REG_P (XEXP (x, 0)))
13452 return x;
13453
13454 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13455 {
13456 changed = 1;
13457 x = legitimize_pic_address (x, 0);
13458 }
13459
13460 if (changed && ix86_legitimate_address_p (mode, x, false))
13461 return x;
13462
13463 if (REG_P (XEXP (x, 0)))
13464 {
13465 rtx temp = gen_reg_rtx (Pmode);
13466 rtx val = force_operand (XEXP (x, 1), temp);
13467 if (val != temp)
13468 {
13469 if (GET_MODE (val) != Pmode)
13470 val = convert_to_mode (Pmode, val, 1);
13471 emit_move_insn (temp, val);
13472 }
13473
13474 XEXP (x, 1) = temp;
13475 return x;
13476 }
13477
13478 else if (REG_P (XEXP (x, 1)))
13479 {
13480 rtx temp = gen_reg_rtx (Pmode);
13481 rtx val = force_operand (XEXP (x, 0), temp);
13482 if (val != temp)
13483 {
13484 if (GET_MODE (val) != Pmode)
13485 val = convert_to_mode (Pmode, val, 1);
13486 emit_move_insn (temp, val);
13487 }
13488
13489 XEXP (x, 0) = temp;
13490 return x;
13491 }
13492 }
13493
13494 return x;
13495 }
13496 \f
13497 /* Print an integer constant expression in assembler syntax. Addition
13498 and subtraction are the only arithmetic that may appear in these
13499 expressions. FILE is the stdio stream to write to, X is the rtx, and
13500 CODE is the operand print code from the output string. */
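/* For example (AT&T syntax): a plain SYMBOL_REF prints as "foo", a CONST
 PLUS of a symbol and 4 prints as "foo+4", and a GOT-relative UNSPEC
 prints as "foo@GOTOFF"; see the UNSPEC cases below for the full list of
 relocation suffixes.  */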
13501
13502 static void
13503 output_pic_addr_const (FILE *file, rtx x, int code)
13504 {
13505 char buf[256];
13506
13507 switch (GET_CODE (x))
13508 {
13509 case PC:
13510 gcc_assert (flag_pic);
13511 putc ('.', file);
13512 break;
13513
13514 case SYMBOL_REF:
13515 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13516 output_addr_const (file, x);
13517 else
13518 {
13519 const char *name = XSTR (x, 0);
13520
13521 /* Mark the decl as referenced so that cgraph will
13522 output the function. */
13523 if (SYMBOL_REF_DECL (x))
13524 mark_decl_referenced (SYMBOL_REF_DECL (x));
13525
13526 #if TARGET_MACHO
13527 if (MACHOPIC_INDIRECT
13528 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13529 name = machopic_indirection_name (x, /*stub_p=*/true);
13530 #endif
13531 assemble_name (file, name);
13532 }
13533 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13534 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13535 fputs ("@PLT", file);
13536 break;
13537
13538 case LABEL_REF:
13539 x = XEXP (x, 0);
13540 /* FALLTHRU */
13541 case CODE_LABEL:
13542 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13543 assemble_name (asm_out_file, buf);
13544 break;
13545
13546 case CONST_INT:
13547 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13548 break;
13549
13550 case CONST:
13551 /* This used to output parentheses around the expression,
13552 but that does not work on the 386 (either ATT or BSD assembler). */
13553 output_pic_addr_const (file, XEXP (x, 0), code);
13554 break;
13555
13556 case CONST_DOUBLE:
13557 if (GET_MODE (x) == VOIDmode)
13558 {
13559 /* We can use %d if the number is <32 bits and positive. */
13560 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13561 fprintf (file, "0x%lx%08lx",
13562 (unsigned long) CONST_DOUBLE_HIGH (x),
13563 (unsigned long) CONST_DOUBLE_LOW (x));
13564 else
13565 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13566 }
13567 else
13568 /* We can't handle floating point constants;
13569 TARGET_PRINT_OPERAND must handle them. */
13570 output_operand_lossage ("floating constant misused");
13571 break;
13572
13573 case PLUS:
13574 /* Some assemblers need integer constants to appear first. */
13575 if (CONST_INT_P (XEXP (x, 0)))
13576 {
13577 output_pic_addr_const (file, XEXP (x, 0), code);
13578 putc ('+', file);
13579 output_pic_addr_const (file, XEXP (x, 1), code);
13580 }
13581 else
13582 {
13583 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13584 output_pic_addr_const (file, XEXP (x, 1), code);
13585 putc ('+', file);
13586 output_pic_addr_const (file, XEXP (x, 0), code);
13587 }
13588 break;
13589
13590 case MINUS:
13591 if (!TARGET_MACHO)
13592 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13593 output_pic_addr_const (file, XEXP (x, 0), code);
13594 putc ('-', file);
13595 output_pic_addr_const (file, XEXP (x, 1), code);
13596 if (!TARGET_MACHO)
13597 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13598 break;
13599
13600 case UNSPEC:
13601 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13602 {
13603 bool f = i386_asm_output_addr_const_extra (file, x);
13604 gcc_assert (f);
13605 break;
13606 }
13607
13608 gcc_assert (XVECLEN (x, 0) == 1);
13609 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13610 switch (XINT (x, 1))
13611 {
13612 case UNSPEC_GOT:
13613 fputs ("@GOT", file);
13614 break;
13615 case UNSPEC_GOTOFF:
13616 fputs ("@GOTOFF", file);
13617 break;
13618 case UNSPEC_PLTOFF:
13619 fputs ("@PLTOFF", file);
13620 break;
13621 case UNSPEC_PCREL:
13622 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13623 "(%rip)" : "[rip]", file);
13624 break;
13625 case UNSPEC_GOTPCREL:
13626 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13627 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13628 break;
13629 case UNSPEC_GOTTPOFF:
13630 /* FIXME: This might be @TPOFF in Sun ld too. */
13631 fputs ("@gottpoff", file);
13632 break;
13633 case UNSPEC_TPOFF:
13634 fputs ("@tpoff", file);
13635 break;
13636 case UNSPEC_NTPOFF:
13637 if (TARGET_64BIT)
13638 fputs ("@tpoff", file);
13639 else
13640 fputs ("@ntpoff", file);
13641 break;
13642 case UNSPEC_DTPOFF:
13643 fputs ("@dtpoff", file);
13644 break;
13645 case UNSPEC_GOTNTPOFF:
13646 if (TARGET_64BIT)
13647 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13648 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13649 else
13650 fputs ("@gotntpoff", file);
13651 break;
13652 case UNSPEC_INDNTPOFF:
13653 fputs ("@indntpoff", file);
13654 break;
13655 #if TARGET_MACHO
13656 case UNSPEC_MACHOPIC_OFFSET:
13657 putc ('-', file);
13658 machopic_output_function_base_name (file);
13659 break;
13660 #endif
13661 default:
13662 output_operand_lossage ("invalid UNSPEC as operand");
13663 break;
13664 }
13665 break;
13666
13667 default:
13668 output_operand_lossage ("invalid expression as operand");
13669 }
13670 }
13671
13672 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13673 We need to emit DTP-relative relocations. */
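/* Assuming ASM_LONG expands to ".long", a 4-byte entry for symbol foo is
 emitted as ".long foo@dtpoff" and an 8-byte entry as
 ".long foo@dtpoff, 0".  */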
13674
13675 static void ATTRIBUTE_UNUSED
13676 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13677 {
13678 fputs (ASM_LONG, file);
13679 output_addr_const (file, x);
13680 fputs ("@dtpoff", file);
13681 switch (size)
13682 {
13683 case 4:
13684 break;
13685 case 8:
13686 fputs (", 0", file);
13687 break;
13688 default:
13689 gcc_unreachable ();
13690 }
13691 }
13692
13693 /* Return true if X is a representation of the PIC register. This copes
13694 with calls from ix86_find_base_term, where the register might have
13695 been replaced by a cselib value. */
13696
13697 static bool
13698 ix86_pic_register_p (rtx x)
13699 {
13700 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13701 return (pic_offset_table_rtx
13702 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13703 else
13704 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13705 }
13706
13707 /* Helper function for ix86_delegitimize_address.
13708 Attempt to delegitimize TLS local-exec accesses. */
13709
13710 static rtx
13711 ix86_delegitimize_tls_address (rtx orig_x)
13712 {
13713 rtx x = orig_x, unspec;
13714 struct ix86_address addr;
13715
13716 if (!TARGET_TLS_DIRECT_SEG_REFS)
13717 return orig_x;
13718 if (MEM_P (x))
13719 x = XEXP (x, 0);
13720 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13721 return orig_x;
13722 if (ix86_decompose_address (x, &addr) == 0
13723 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13724 || addr.disp == NULL_RTX
13725 || GET_CODE (addr.disp) != CONST)
13726 return orig_x;
13727 unspec = XEXP (addr.disp, 0);
13728 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13729 unspec = XEXP (unspec, 0);
13730 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13731 return orig_x;
13732 x = XVECEXP (unspec, 0, 0);
13733 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13734 if (unspec != XEXP (addr.disp, 0))
13735 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13736 if (addr.index)
13737 {
13738 rtx idx = addr.index;
13739 if (addr.scale != 1)
13740 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13741 x = gen_rtx_PLUS (Pmode, idx, x);
13742 }
13743 if (addr.base)
13744 x = gen_rtx_PLUS (Pmode, addr.base, x);
13745 if (MEM_P (orig_x))
13746 x = replace_equiv_address_nv (orig_x, x);
13747 return x;
13748 }
13749
13750 /* In the name of slightly smaller debug output, and to cater to
13751 general assembler lossage, recognize PIC+GOTOFF and turn it back
13752 into a direct symbol reference.
13753
13754 On Darwin, this is necessary to avoid a crash, because Darwin
13755 has a different PIC label for each routine but the DWARF debugging
13756 information is not associated with any particular routine, so it's
13757 necessary to remove references to the PIC label from RTL stored by
13758 the DWARF output code. */
13759
13760 static rtx
13761 ix86_delegitimize_address (rtx x)
13762 {
13763 rtx orig_x = delegitimize_mem_from_attrs (x);
13764 /* addend is NULL or some rtx if x is something+GOTOFF where
13765 something doesn't include the PIC register. */
13766 rtx addend = NULL_RTX;
13767 /* reg_addend is NULL or a multiple of some register. */
13768 rtx reg_addend = NULL_RTX;
13769 /* const_addend is NULL or a const_int. */
13770 rtx const_addend = NULL_RTX;
13771 /* This is the result, or NULL. */
13772 rtx result = NULL_RTX;
13773
13774 x = orig_x;
13775
13776 if (MEM_P (x))
13777 x = XEXP (x, 0);
13778
13779 if (TARGET_64BIT)
13780 {
13781 if (GET_CODE (x) == CONST
13782 && GET_CODE (XEXP (x, 0)) == PLUS
13783 && GET_MODE (XEXP (x, 0)) == Pmode
13784 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13785 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13786 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13787 {
13788 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13789 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13790 if (MEM_P (orig_x))
13791 x = replace_equiv_address_nv (orig_x, x);
13792 return x;
13793 }
13794 if (GET_CODE (x) != CONST
13795 || GET_CODE (XEXP (x, 0)) != UNSPEC
13796 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13797 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13798 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13799 return ix86_delegitimize_tls_address (orig_x);
13800 x = XVECEXP (XEXP (x, 0), 0, 0);
13801 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13802 {
13803 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13804 GET_MODE (x), 0);
13805 if (x == NULL_RTX)
13806 return orig_x;
13807 }
13808 return x;
13809 }
13810
13811 if (GET_CODE (x) != PLUS
13812 || GET_CODE (XEXP (x, 1)) != CONST)
13813 return ix86_delegitimize_tls_address (orig_x);
13814
13815 if (ix86_pic_register_p (XEXP (x, 0)))
13816 /* %ebx + GOT/GOTOFF */
13817 ;
13818 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13819 {
13820 /* %ebx + %reg * scale + GOT/GOTOFF */
13821 reg_addend = XEXP (x, 0);
13822 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13823 reg_addend = XEXP (reg_addend, 1);
13824 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13825 reg_addend = XEXP (reg_addend, 0);
13826 else
13827 {
13828 reg_addend = NULL_RTX;
13829 addend = XEXP (x, 0);
13830 }
13831 }
13832 else
13833 addend = XEXP (x, 0);
13834
13835 x = XEXP (XEXP (x, 1), 0);
13836 if (GET_CODE (x) == PLUS
13837 && CONST_INT_P (XEXP (x, 1)))
13838 {
13839 const_addend = XEXP (x, 1);
13840 x = XEXP (x, 0);
13841 }
13842
13843 if (GET_CODE (x) == UNSPEC
13844 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13845 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13846 result = XVECEXP (x, 0, 0);
13847
13848 if (TARGET_MACHO && darwin_local_data_pic (x)
13849 && !MEM_P (orig_x))
13850 result = XVECEXP (x, 0, 0);
13851
13852 if (! result)
13853 return ix86_delegitimize_tls_address (orig_x);
13854
13855 if (const_addend)
13856 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13857 if (reg_addend)
13858 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13859 if (addend)
13860 {
13861 /* If the rest of the original X doesn't involve the PIC register, add
13862 addend and subtract pic_offset_table_rtx. This can happen e.g.
13863 for code like:
13864 leal (%ebx, %ecx, 4), %ecx
13865 ...
13866 movl foo@GOTOFF(%ecx), %edx
13867 in which case we return (%ecx - %ebx) + foo. */
13868 if (pic_offset_table_rtx)
13869 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13870 pic_offset_table_rtx),
13871 result);
13872 else
13873 return orig_x;
13874 }
13875 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13876 {
13877 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13878 if (result == NULL_RTX)
13879 return orig_x;
13880 }
13881 return result;
13882 }
13883
13884 /* If X is a machine specific address (i.e. a symbol or label being
13885 referenced as a displacement from the GOT implemented using an
13886 UNSPEC), then return the base term. Otherwise return X. */
13887
13888 rtx
13889 ix86_find_base_term (rtx x)
13890 {
13891 rtx term;
13892
13893 if (TARGET_64BIT)
13894 {
13895 if (GET_CODE (x) != CONST)
13896 return x;
13897 term = XEXP (x, 0);
13898 if (GET_CODE (term) == PLUS
13899 && (CONST_INT_P (XEXP (term, 1))
13900 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13901 term = XEXP (term, 0);
13902 if (GET_CODE (term) != UNSPEC
13903 || (XINT (term, 1) != UNSPEC_GOTPCREL
13904 && XINT (term, 1) != UNSPEC_PCREL))
13905 return x;
13906
13907 return XVECEXP (term, 0, 0);
13908 }
13909
13910 return ix86_delegitimize_address (x);
13911 }
13912 \f
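/* Print to FILE the suffix of the condition-code mnemonic ("e", "ne",
 "g", "b", ...) corresponding to comparison CODE in MODE. If REVERSE is
 true the condition is reversed first; FP selects the alternate
 spellings (e.g. "nbe" instead of "a") used to work around assembler
 lossage with fcmov.  */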
13913 static void
13914 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13915 bool fp, FILE *file)
13916 {
13917 const char *suffix;
13918
13919 if (mode == CCFPmode || mode == CCFPUmode)
13920 {
13921 code = ix86_fp_compare_code_to_integer (code);
13922 mode = CCmode;
13923 }
13924 if (reverse)
13925 code = reverse_condition (code);
13926
13927 switch (code)
13928 {
13929 case EQ:
13930 switch (mode)
13931 {
13932 case CCAmode:
13933 suffix = "a";
13934 break;
13935
13936 case CCCmode:
13937 suffix = "c";
13938 break;
13939
13940 case CCOmode:
13941 suffix = "o";
13942 break;
13943
13944 case CCSmode:
13945 suffix = "s";
13946 break;
13947
13948 default:
13949 suffix = "e";
13950 }
13951 break;
13952 case NE:
13953 switch (mode)
13954 {
13955 case CCAmode:
13956 suffix = "na";
13957 break;
13958
13959 case CCCmode:
13960 suffix = "nc";
13961 break;
13962
13963 case CCOmode:
13964 suffix = "no";
13965 break;
13966
13967 case CCSmode:
13968 suffix = "ns";
13969 break;
13970
13971 default:
13972 suffix = "ne";
13973 }
13974 break;
13975 case GT:
13976 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13977 suffix = "g";
13978 break;
13979 case GTU:
13980 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13981 Those same assemblers have the same but opposite lossage on cmov. */
13982 if (mode == CCmode)
13983 suffix = fp ? "nbe" : "a";
13984 else if (mode == CCCmode)
13985 suffix = "b";
13986 else
13987 gcc_unreachable ();
13988 break;
13989 case LT:
13990 switch (mode)
13991 {
13992 case CCNOmode:
13993 case CCGOCmode:
13994 suffix = "s";
13995 break;
13996
13997 case CCmode:
13998 case CCGCmode:
13999 suffix = "l";
14000 break;
14001
14002 default:
14003 gcc_unreachable ();
14004 }
14005 break;
14006 case LTU:
14007 gcc_assert (mode == CCmode || mode == CCCmode);
14008 suffix = "b";
14009 break;
14010 case GE:
14011 switch (mode)
14012 {
14013 case CCNOmode:
14014 case CCGOCmode:
14015 suffix = "ns";
14016 break;
14017
14018 case CCmode:
14019 case CCGCmode:
14020 suffix = "ge";
14021 break;
14022
14023 default:
14024 gcc_unreachable ();
14025 }
14026 break;
14027 case GEU:
14028 /* ??? As above. */
14029 gcc_assert (mode == CCmode || mode == CCCmode);
14030 suffix = fp ? "nb" : "ae";
14031 break;
14032 case LE:
14033 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14034 suffix = "le";
14035 break;
14036 case LEU:
14037 /* ??? As above. */
14038 if (mode == CCmode)
14039 suffix = "be";
14040 else if (mode == CCCmode)
14041 suffix = fp ? "nb" : "ae";
14042 else
14043 gcc_unreachable ();
14044 break;
14045 case UNORDERED:
14046 suffix = fp ? "u" : "p";
14047 break;
14048 case ORDERED:
14049 suffix = fp ? "nu" : "np";
14050 break;
14051 default:
14052 gcc_unreachable ();
14053 }
14054 fputs (suffix, file);
14055 }
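/* Illustrative examples (not part of the original sources): an EQ
   comparison in CCZmode falls into the default case and prints "e",
   so a setcc pattern becomes "sete"; a GTU comparison in CCmode with
   FP set prints "nbe" rather than "a" for the fcmov case; and with
   REVERSE true an EQ is first turned into NE and prints "ne".  */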
14056
14057 /* Print the name of register X to FILE based on its machine mode and number.
14058 If CODE is 'w', pretend the mode is HImode.
14059 If CODE is 'b', pretend the mode is QImode.
14060 If CODE is 'k', pretend the mode is SImode.
14061 If CODE is 'q', pretend the mode is DImode.
14062 If CODE is 'x', pretend the mode is V4SFmode.
14063 If CODE is 't', pretend the mode is V8SFmode.
14064 If CODE is 'h', pretend the reg is the 'high' byte register.
14065 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
14066 If CODE is 'd', duplicate the operand for an AVX instruction.
14067 */
14068
14069 void
14070 print_reg (rtx x, int code, FILE *file)
14071 {
14072 const char *reg;
14073 bool duplicated = code == 'd' && TARGET_AVX;
14074
14075 gcc_assert (x == pc_rtx
14076 || (REGNO (x) != ARG_POINTER_REGNUM
14077 && REGNO (x) != FRAME_POINTER_REGNUM
14078 && REGNO (x) != FLAGS_REG
14079 && REGNO (x) != FPSR_REG
14080 && REGNO (x) != FPCR_REG));
14081
14082 if (ASSEMBLER_DIALECT == ASM_ATT)
14083 putc ('%', file);
14084
14085 if (x == pc_rtx)
14086 {
14087 gcc_assert (TARGET_64BIT);
14088 fputs ("rip", file);
14089 return;
14090 }
14091
14092 if (code == 'w' || MMX_REG_P (x))
14093 code = 2;
14094 else if (code == 'b')
14095 code = 1;
14096 else if (code == 'k')
14097 code = 4;
14098 else if (code == 'q')
14099 code = 8;
14100 else if (code == 'y')
14101 code = 3;
14102 else if (code == 'h')
14103 code = 0;
14104 else if (code == 'x')
14105 code = 16;
14106 else if (code == 't')
14107 code = 32;
14108 else
14109 code = GET_MODE_SIZE (GET_MODE (x));
14110
14111 /* Irritatingly, the AMD extended registers use a different naming
14112 convention from the normal registers: "r%d[bwd]". */
14113 if (REX_INT_REG_P (x))
14114 {
14115 gcc_assert (TARGET_64BIT);
14116 putc ('r', file);
14117 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
14118 switch (code)
14119 {
14120 case 0:
14121 error ("extended registers have no high halves");
14122 break;
14123 case 1:
14124 putc ('b', file);
14125 break;
14126 case 2:
14127 putc ('w', file);
14128 break;
14129 case 4:
14130 putc ('d', file);
14131 break;
14132 case 8:
14133 /* no suffix */
14134 break;
14135 default:
14136 error ("unsupported operand size for extended register");
14137 break;
14138 }
14139 return;
14140 }
14141
14142 reg = NULL;
14143 switch (code)
14144 {
14145 case 3:
14146 if (STACK_TOP_P (x))
14147 {
14148 reg = "st(0)";
14149 break;
14150 }
14151 /* FALLTHRU */
14152 case 8:
14153 case 4:
14154 case 12:
14155 if (! ANY_FP_REG_P (x))
14156 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14157 /* FALLTHRU */
14158 case 16:
14159 case 2:
14160 normal:
14161 reg = hi_reg_name[REGNO (x)];
14162 break;
14163 case 1:
14164 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
14165 goto normal;
14166 reg = qi_reg_name[REGNO (x)];
14167 break;
14168 case 0:
14169 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
14170 goto normal;
14171 reg = qi_high_reg_name[REGNO (x)];
14172 break;
14173 case 32:
14174 if (SSE_REG_P (x))
14175 {
14176 gcc_assert (!duplicated);
14177 putc ('y', file);
14178 fputs (hi_reg_name[REGNO (x)] + 1, file);
14179 return;
14180 }
14181 break;
14182 default:
14183 gcc_unreachable ();
14184 }
14185
14186 fputs (reg, file);
14187 if (duplicated)
14188 {
14189 if (ASSEMBLER_DIALECT == ASM_ATT)
14190 fprintf (file, ", %%%s", reg);
14191 else
14192 fprintf (file, ", %s", reg);
14193 }
14194 }
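/* Worked example (illustrative, not from the original sources): for
   hard register 0 ("ax"), print_reg emits "al" for code 'b', "ax" for
   'w', "eax" for 'k', "ah" for 'h' and, when TARGET_64BIT, "rax" for
   'q'; in the AT&T dialect each name is preceded by '%'.  For an
   extended register such as r8, code 'k' yields "r8d".  */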
14195
14196 /* Locate some local-dynamic symbol still in use by this function
14197 so that we can print its name in some tls_local_dynamic_base
14198 pattern. */
14199
14200 static int
14201 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14202 {
14203 rtx x = *px;
14204
14205 if (GET_CODE (x) == SYMBOL_REF
14206 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14207 {
14208 cfun->machine->some_ld_name = XSTR (x, 0);
14209 return 1;
14210 }
14211
14212 return 0;
14213 }
14214
14215 static const char *
14216 get_some_local_dynamic_name (void)
14217 {
14218 rtx insn;
14219
14220 if (cfun->machine->some_ld_name)
14221 return cfun->machine->some_ld_name;
14222
14223 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14224 if (NONDEBUG_INSN_P (insn)
14225 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14226 return cfun->machine->some_ld_name;
14227
14228 return NULL;
14229 }
14230
14231 /* Meaning of CODE:
14232 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14233 C -- print opcode suffix for set/cmov insn.
14234 c -- like C, but print reversed condition
14235 F,f -- likewise, but for floating-point.
14236 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14237 otherwise nothing
14238 R -- print the prefix for register names.
14239 z -- print the opcode suffix for the size of the current operand.
14240 Z -- likewise, with special suffixes for x87 instructions.
14241 * -- print a star (in certain assembler syntax)
14242 A -- print an absolute memory reference.
14243 E -- print address with DImode register names if TARGET_64BIT.
14244 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14245 s -- print a shift double count, followed by the assembler's argument
14246 delimiter.
14247 b -- print the QImode name of the register for the indicated operand.
14248 %b0 would print %al if operands[0] is reg 0.
14249 w -- likewise, print the HImode name of the register.
14250 k -- likewise, print the SImode name of the register.
14251 q -- likewise, print the DImode name of the register.
14252 x -- likewise, print the V4SFmode name of the register.
14253 t -- likewise, print the V8SFmode name of the register.
14254 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14255 y -- print "st(0)" instead of "st" as a register.
14256 d -- print duplicated register operand for AVX instruction.
14257 D -- print condition for SSE cmp instruction.
14258 P -- if PIC, print an @PLT suffix.
14259 p -- print raw symbol name.
14260 X -- don't print any sort of PIC '@' suffix for a symbol.
14261 & -- print some in-use local-dynamic symbol name.
14262 H -- print a memory address offset by 8; used for sse high-parts
14263 Y -- print condition for XOP pcom* instruction.
14264 + -- print a branch hint as 'cs' or 'ds' prefix
14265 ; -- print a semicolon (after prefixes due to bug in older gas).
14266 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14267 @ -- print a segment register of thread base pointer load
14268 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14269 */
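/* Example (illustrative only; the template below is hypothetical and
   not taken from i386.md): in a template such as
       "add%z0\t{%1, %0|%0, %1}"
   the %z0 prints "l" for an SImode operand 0 in AT&T syntax and
   nothing in Intel syntax, while "%k1" forces the SImode register
   name of operand 1 regardless of the operand's actual mode.  */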
14270
14271 void
14272 ix86_print_operand (FILE *file, rtx x, int code)
14273 {
14274 if (code)
14275 {
14276 switch (code)
14277 {
14278 case 'A':
14279 switch (ASSEMBLER_DIALECT)
14280 {
14281 case ASM_ATT:
14282 putc ('*', file);
14283 break;
14284
14285 case ASM_INTEL:
14286 /* Intel syntax. For absolute addresses, registers should not
14287 be surrounded by brackets. */
14288 if (!REG_P (x))
14289 {
14290 putc ('[', file);
14291 ix86_print_operand (file, x, 0);
14292 putc (']', file);
14293 return;
14294 }
14295 break;
14296
14297 default:
14298 gcc_unreachable ();
14299 }
14300
14301 ix86_print_operand (file, x, 0);
14302 return;
14303
14304 case 'E':
14305 /* Wrap address in an UNSPEC to declare special handling. */
14306 if (TARGET_64BIT)
14307 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14308
14309 output_address (x);
14310 return;
14311
14312 case 'L':
14313 if (ASSEMBLER_DIALECT == ASM_ATT)
14314 putc ('l', file);
14315 return;
14316
14317 case 'W':
14318 if (ASSEMBLER_DIALECT == ASM_ATT)
14319 putc ('w', file);
14320 return;
14321
14322 case 'B':
14323 if (ASSEMBLER_DIALECT == ASM_ATT)
14324 putc ('b', file);
14325 return;
14326
14327 case 'Q':
14328 if (ASSEMBLER_DIALECT == ASM_ATT)
14329 putc ('l', file);
14330 return;
14331
14332 case 'S':
14333 if (ASSEMBLER_DIALECT == ASM_ATT)
14334 putc ('s', file);
14335 return;
14336
14337 case 'T':
14338 if (ASSEMBLER_DIALECT == ASM_ATT)
14339 putc ('t', file);
14340 return;
14341
14342 case 'O':
14343 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14344 if (ASSEMBLER_DIALECT != ASM_ATT)
14345 return;
14346
14347 switch (GET_MODE_SIZE (GET_MODE (x)))
14348 {
14349 case 2:
14350 putc ('w', file);
14351 break;
14352
14353 case 4:
14354 putc ('l', file);
14355 break;
14356
14357 case 8:
14358 putc ('q', file);
14359 break;
14360
14361 default:
14362 output_operand_lossage
14363 ("invalid operand size for operand code 'O'");
14364 return;
14365 }
14366
14367 putc ('.', file);
14368 #endif
14369 return;
14370
14371 case 'z':
14372 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14373 {
14374 /* Opcodes don't get size suffixes if using Intel syntax. */
14375 if (ASSEMBLER_DIALECT == ASM_INTEL)
14376 return;
14377
14378 switch (GET_MODE_SIZE (GET_MODE (x)))
14379 {
14380 case 1:
14381 putc ('b', file);
14382 return;
14383
14384 case 2:
14385 putc ('w', file);
14386 return;
14387
14388 case 4:
14389 putc ('l', file);
14390 return;
14391
14392 case 8:
14393 putc ('q', file);
14394 return;
14395
14396 default:
14397 output_operand_lossage
14398 ("invalid operand size for operand code 'z'");
14399 return;
14400 }
14401 }
14402
14403 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14404 warning
14405 (0, "non-integer operand used with operand code 'z'");
14406 /* FALLTHRU */
14407
14408 case 'Z':
14409 /* 387 opcodes don't get size suffixes if using Intel syntax. */
14410 if (ASSEMBLER_DIALECT == ASM_INTEL)
14411 return;
14412
14413 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14414 {
14415 switch (GET_MODE_SIZE (GET_MODE (x)))
14416 {
14417 case 2:
14418 #ifdef HAVE_AS_IX86_FILDS
14419 putc ('s', file);
14420 #endif
14421 return;
14422
14423 case 4:
14424 putc ('l', file);
14425 return;
14426
14427 case 8:
14428 #ifdef HAVE_AS_IX86_FILDQ
14429 putc ('q', file);
14430 #else
14431 fputs ("ll", file);
14432 #endif
14433 return;
14434
14435 default:
14436 break;
14437 }
14438 }
14439 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14440 {
14441 /* 387 opcodes don't get size suffixes
14442 if the operands are registers. */
14443 if (STACK_REG_P (x))
14444 return;
14445
14446 switch (GET_MODE_SIZE (GET_MODE (x)))
14447 {
14448 case 4:
14449 putc ('s', file);
14450 return;
14451
14452 case 8:
14453 putc ('l', file);
14454 return;
14455
14456 case 12:
14457 case 16:
14458 putc ('t', file);
14459 return;
14460
14461 default:
14462 break;
14463 }
14464 }
14465 else
14466 {
14467 output_operand_lossage
14468 ("invalid operand type used with operand code 'Z'");
14469 return;
14470 }
14471
14472 output_operand_lossage
14473 ("invalid operand size for operand code 'Z'");
14474 return;
14475
14476 case 'd':
14477 case 'b':
14478 case 'w':
14479 case 'k':
14480 case 'q':
14481 case 'h':
14482 case 't':
14483 case 'y':
14484 case 'x':
14485 case 'X':
14486 case 'P':
14487 case 'p':
14488 break;
14489
14490 case 's':
14491 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14492 {
14493 ix86_print_operand (file, x, 0);
14494 fputs (", ", file);
14495 }
14496 return;
14497
14498 case 'Y':
14499 switch (GET_CODE (x))
14500 {
14501 case NE:
14502 fputs ("neq", file);
14503 break;
14504 case EQ:
14505 fputs ("eq", file);
14506 break;
14507 case GE:
14508 case GEU:
14509 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14510 break;
14511 case GT:
14512 case GTU:
14513 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14514 break;
14515 case LE:
14516 case LEU:
14517 fputs ("le", file);
14518 break;
14519 case LT:
14520 case LTU:
14521 fputs ("lt", file);
14522 break;
14523 case UNORDERED:
14524 fputs ("unord", file);
14525 break;
14526 case ORDERED:
14527 fputs ("ord", file);
14528 break;
14529 case UNEQ:
14530 fputs ("ueq", file);
14531 break;
14532 case UNGE:
14533 fputs ("nlt", file);
14534 break;
14535 case UNGT:
14536 fputs ("nle", file);
14537 break;
14538 case UNLE:
14539 fputs ("ule", file);
14540 break;
14541 case UNLT:
14542 fputs ("ult", file);
14543 break;
14544 case LTGT:
14545 fputs ("une", file);
14546 break;
14547 default:
14548 output_operand_lossage ("operand is not a condition code, "
14549 "invalid operand code 'Y'");
14550 return;
14551 }
14552 return;
14553
14554 case 'D':
14555 /* Little bit of braindamage here. The SSE compare instructions
14556 use completely different names for the comparisons than the
14557 fp conditional moves do. */
14558 switch (GET_CODE (x))
14559 {
14560 case UNEQ:
14561 if (TARGET_AVX)
14562 {
14563 fputs ("eq_us", file);
14564 break;
14565 }
14566 case EQ:
14567 fputs ("eq", file);
14568 break;
14569 case UNLT:
14570 if (TARGET_AVX)
14571 {
14572 fputs ("nge", file);
14573 break;
14574 }
14575 case LT:
14576 fputs ("lt", file);
14577 break;
14578 case UNLE:
14579 if (TARGET_AVX)
14580 {
14581 fputs ("ngt", file);
14582 break;
14583 }
14584 case LE:
14585 fputs ("le", file);
14586 break;
14587 case UNORDERED:
14588 fputs ("unord", file);
14589 break;
14590 case LTGT:
14591 if (TARGET_AVX)
14592 {
14593 fputs ("neq_oq", file);
14594 break;
14595 }
14596 case NE:
14597 fputs ("neq", file);
14598 break;
14599 case GE:
14600 if (TARGET_AVX)
14601 {
14602 fputs ("ge", file);
14603 break;
14604 }
14605 case UNGE:
14606 fputs ("nlt", file);
14607 break;
14608 case GT:
14609 if (TARGET_AVX)
14610 {
14611 fputs ("gt", file);
14612 break;
14613 }
14614 case UNGT:
14615 fputs ("nle", file);
14616 break;
14617 case ORDERED:
14618 fputs ("ord", file);
14619 break;
14620 default:
14621 output_operand_lossage ("operand is not a condition code, "
14622 "invalid operand code 'D'");
14623 return;
14624 }
14625 return;
14626
14627 case 'F':
14628 case 'f':
14629 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14630 if (ASSEMBLER_DIALECT == ASM_ATT)
14631 putc ('.', file);
14632 #endif
14633
14634 case 'C':
14635 case 'c':
14636 if (!COMPARISON_P (x))
14637 {
14638 output_operand_lossage ("operand is not a condition code, "
14639 "invalid operand code '%c'", code);
14640 return;
14641 }
14642 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14643 code == 'c' || code == 'f',
14644 code == 'F' || code == 'f',
14645 file);
14646 return;
14647
14648 case 'H':
14649 if (!offsettable_memref_p (x))
14650 {
14651 output_operand_lossage ("operand is not an offsettable memory "
14652 "reference, invalid operand code 'H'");
14653 return;
14654 }
14655 /* It doesn't actually matter what mode we use here, as we're
14656 only going to use this for printing. */
14657 x = adjust_address_nv (x, DImode, 8);
14658 break;
14659
14660 case 'K':
14661 gcc_assert (CONST_INT_P (x));
14662
14663 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14664 #ifdef HAVE_AS_IX86_HLE
14665 fputs ("xacquire ", file);
14666 #else
14667 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14668 #endif
14669 else if (INTVAL (x) & IX86_HLE_RELEASE)
14670 #ifdef HAVE_AS_IX86_HLE
14671 fputs ("xrelease ", file);
14672 #else
14673 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14674 #endif
14675 /* We do not want to print the value of the operand. */
14676 return;
14677
14678 case '*':
14679 if (ASSEMBLER_DIALECT == ASM_ATT)
14680 putc ('*', file);
14681 return;
14682
14683 case '&':
14684 {
14685 const char *name = get_some_local_dynamic_name ();
14686 if (name == NULL)
14687 output_operand_lossage ("'%%&' used without any "
14688 "local dynamic TLS references");
14689 else
14690 assemble_name (file, name);
14691 return;
14692 }
14693
14694 case '+':
14695 {
14696 rtx x;
14697
14698 if (!optimize
14699 || optimize_function_for_size_p (cfun)
14700 || !TARGET_BRANCH_PREDICTION_HINTS)
14701 return;
14702
14703 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14704 if (x)
14705 {
14706 int pred_val = INTVAL (XEXP (x, 0));
14707
14708 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14709 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14710 {
14711 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14712 bool cputaken
14713 = final_forward_branch_p (current_output_insn) == 0;
14714
14715 /* Emit hints only in the case default branch prediction
14716 heuristics would fail. */
14717 if (taken != cputaken)
14718 {
14719 /* We use 3e (DS) prefix for taken branches and
14720 2e (CS) prefix for not taken branches. */
14721 if (taken)
14722 fputs ("ds ; ", file);
14723 else
14724 fputs ("cs ; ", file);
14725 }
14726 }
14727 }
14728 return;
14729 }
14730
14731 case ';':
14732 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14733 putc (';', file);
14734 #endif
14735 return;
14736
14737 case '@':
14738 if (ASSEMBLER_DIALECT == ASM_ATT)
14739 putc ('%', file);
14740
14741 /* The kernel uses a different segment register for performance
14742 reasons; a system call would not have to trash the userspace
14743 segment register, which would be expensive. */
14744 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14745 fputs ("fs", file);
14746 else
14747 fputs ("gs", file);
14748 return;
14749
14750 case '~':
14751 putc (TARGET_AVX2 ? 'i' : 'f', file);
14752 return;
14753
14754 case '^':
14755 if (TARGET_64BIT && Pmode != word_mode)
14756 fputs ("addr32 ", file);
14757 return;
14758
14759 default:
14760 output_operand_lossage ("invalid operand code '%c'", code);
14761 }
14762 }
14763
14764 if (REG_P (x))
14765 print_reg (x, code, file);
14766
14767 else if (MEM_P (x))
14768 {
14769 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14770 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14771 && GET_MODE (x) != BLKmode)
14772 {
14773 const char * size;
14774 switch (GET_MODE_SIZE (GET_MODE (x)))
14775 {
14776 case 1: size = "BYTE"; break;
14777 case 2: size = "WORD"; break;
14778 case 4: size = "DWORD"; break;
14779 case 8: size = "QWORD"; break;
14780 case 12: size = "TBYTE"; break;
14781 case 16:
14782 if (GET_MODE (x) == XFmode)
14783 size = "TBYTE";
14784 else
14785 size = "XMMWORD";
14786 break;
14787 case 32: size = "YMMWORD"; break;
14788 default:
14789 gcc_unreachable ();
14790 }
14791
14792 /* Check for explicit size override (codes 'b', 'w', 'k',
14793 'q' and 'x') */
14794 if (code == 'b')
14795 size = "BYTE";
14796 else if (code == 'w')
14797 size = "WORD";
14798 else if (code == 'k')
14799 size = "DWORD";
14800 else if (code == 'q')
14801 size = "QWORD";
14802 else if (code == 'x')
14803 size = "XMMWORD";
14804
14805 fputs (size, file);
14806 fputs (" PTR ", file);
14807 }
14808
14809 x = XEXP (x, 0);
14810 /* Avoid (%rip) for call operands. */
14811 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14812 && !CONST_INT_P (x))
14813 output_addr_const (file, x);
14814 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14815 output_operand_lossage ("invalid constraints for operand");
14816 else
14817 output_address (x);
14818 }
14819
14820 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14821 {
14822 REAL_VALUE_TYPE r;
14823 long l;
14824
14825 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14826 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14827
14828 if (ASSEMBLER_DIALECT == ASM_ATT)
14829 putc ('$', file);
14830 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14831 if (code == 'q')
14832 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14833 else
14834 fprintf (file, "0x%08x", (unsigned int) l);
14835 }
14836
14837 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14838 {
14839 REAL_VALUE_TYPE r;
14840 long l[2];
14841
14842 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14843 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14844
14845 if (ASSEMBLER_DIALECT == ASM_ATT)
14846 putc ('$', file);
14847 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14848 }
14849
14850 /* These float cases don't actually occur as immediate operands. */
14851 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14852 {
14853 char dstr[30];
14854
14855 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14856 fputs (dstr, file);
14857 }
14858
14859 else
14860 {
14861 /* We have patterns that allow zero sets of memory, for instance.
14862 In 64-bit mode, we should probably support all 8-byte vectors,
14863 since we can in fact encode that into an immediate. */
14864 if (GET_CODE (x) == CONST_VECTOR)
14865 {
14866 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14867 x = const0_rtx;
14868 }
14869
14870 if (code != 'P' && code != 'p')
14871 {
14872 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14873 {
14874 if (ASSEMBLER_DIALECT == ASM_ATT)
14875 putc ('$', file);
14876 }
14877 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14878 || GET_CODE (x) == LABEL_REF)
14879 {
14880 if (ASSEMBLER_DIALECT == ASM_ATT)
14881 putc ('$', file);
14882 else
14883 fputs ("OFFSET FLAT:", file);
14884 }
14885 }
14886 if (CONST_INT_P (x))
14887 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14888 else if (flag_pic || MACHOPIC_INDIRECT)
14889 output_pic_addr_const (file, x, code);
14890 else
14891 output_addr_const (file, x);
14892 }
14893 }
14894
14895 static bool
14896 ix86_print_operand_punct_valid_p (unsigned char code)
14897 {
14898 return (code == '@' || code == '*' || code == '+' || code == '&'
14899 || code == ';' || code == '~' || code == '^');
14900 }
14901 \f
14902 /* Print a memory operand whose address is ADDR. */
14903
14904 static void
14905 ix86_print_operand_address (FILE *file, rtx addr)
14906 {
14907 struct ix86_address parts;
14908 rtx base, index, disp;
14909 int scale;
14910 int ok;
14911 bool vsib = false;
14912 int code = 0;
14913
14914 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14915 {
14916 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14917 gcc_assert (parts.index == NULL_RTX);
14918 parts.index = XVECEXP (addr, 0, 1);
14919 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14920 addr = XVECEXP (addr, 0, 0);
14921 vsib = true;
14922 }
14923 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14924 {
14925 gcc_assert (TARGET_64BIT);
14926 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14927 code = 'q';
14928 }
14929 else
14930 ok = ix86_decompose_address (addr, &parts);
14931
14932 gcc_assert (ok);
14933
14934 if (parts.base && GET_CODE (parts.base) == SUBREG)
14935 {
14936 rtx tmp = SUBREG_REG (parts.base);
14937 parts.base = simplify_subreg (GET_MODE (parts.base),
14938 tmp, GET_MODE (tmp), 0);
14939 gcc_assert (parts.base != NULL_RTX);
14940 }
14941
14942 if (parts.index && GET_CODE (parts.index) == SUBREG)
14943 {
14944 rtx tmp = SUBREG_REG (parts.index);
14945 parts.index = simplify_subreg (GET_MODE (parts.index),
14946 tmp, GET_MODE (tmp), 0);
14947 gcc_assert (parts.index != NULL_RTX);
14948 }
14949
14950 base = parts.base;
14951 index = parts.index;
14952 disp = parts.disp;
14953 scale = parts.scale;
14954
14955 switch (parts.seg)
14956 {
14957 case SEG_DEFAULT:
14958 break;
14959 case SEG_FS:
14960 case SEG_GS:
14961 if (ASSEMBLER_DIALECT == ASM_ATT)
14962 putc ('%', file);
14963 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14964 break;
14965 default:
14966 gcc_unreachable ();
14967 }
14968
14969 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14970 if (TARGET_64BIT && !base && !index)
14971 {
14972 rtx symbol = disp;
14973
14974 if (GET_CODE (disp) == CONST
14975 && GET_CODE (XEXP (disp, 0)) == PLUS
14976 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14977 symbol = XEXP (XEXP (disp, 0), 0);
14978
14979 if (GET_CODE (symbol) == LABEL_REF
14980 || (GET_CODE (symbol) == SYMBOL_REF
14981 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14982 base = pc_rtx;
14983 }
14984 if (!base && !index)
14985 {
14986 /* A displacement-only address requires special attention. */
14987
14988 if (CONST_INT_P (disp))
14989 {
14990 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14991 fputs ("ds:", file);
14992 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14993 }
14994 else if (flag_pic)
14995 output_pic_addr_const (file, disp, 0);
14996 else
14997 output_addr_const (file, disp);
14998 }
14999 else
15000 {
15001 /* Print SImode register names to force addr32 prefix. */
15002 if (SImode_address_operand (addr, VOIDmode))
15003 {
15004 #ifdef ENABLE_CHECKING
15005 gcc_assert (TARGET_64BIT);
15006 switch (GET_CODE (addr))
15007 {
15008 case SUBREG:
15009 gcc_assert (GET_MODE (addr) == SImode);
15010 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15011 break;
15012 case ZERO_EXTEND:
15013 case AND:
15014 gcc_assert (GET_MODE (addr) == DImode);
15015 break;
15016 default:
15017 gcc_unreachable ();
15018 }
15019 #endif
15020 gcc_assert (!code);
15021 code = 'l';
15022 }
15023
15024 if (ASSEMBLER_DIALECT == ASM_ATT)
15025 {
15026 if (disp)
15027 {
15028 if (flag_pic)
15029 output_pic_addr_const (file, disp, 0);
15030 else if (GET_CODE (disp) == LABEL_REF)
15031 output_asm_label (disp);
15032 else
15033 output_addr_const (file, disp);
15034 }
15035
15036 putc ('(', file);
15037 if (base)
15038 print_reg (base, code, file);
15039 if (index)
15040 {
15041 putc (',', file);
15042 print_reg (index, vsib ? 0 : code, file);
15043 if (scale != 1 || vsib)
15044 fprintf (file, ",%d", scale);
15045 }
15046 putc (')', file);
15047 }
15048 else
15049 {
15050 rtx offset = NULL_RTX;
15051
15052 if (disp)
15053 {
15054 /* Pull out the offset of a symbol; print any symbol itself. */
15055 if (GET_CODE (disp) == CONST
15056 && GET_CODE (XEXP (disp, 0)) == PLUS
15057 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15058 {
15059 offset = XEXP (XEXP (disp, 0), 1);
15060 disp = gen_rtx_CONST (VOIDmode,
15061 XEXP (XEXP (disp, 0), 0));
15062 }
15063
15064 if (flag_pic)
15065 output_pic_addr_const (file, disp, 0);
15066 else if (GET_CODE (disp) == LABEL_REF)
15067 output_asm_label (disp);
15068 else if (CONST_INT_P (disp))
15069 offset = disp;
15070 else
15071 output_addr_const (file, disp);
15072 }
15073
15074 putc ('[', file);
15075 if (base)
15076 {
15077 print_reg (base, code, file);
15078 if (offset)
15079 {
15080 if (INTVAL (offset) >= 0)
15081 putc ('+', file);
15082 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15083 }
15084 }
15085 else if (offset)
15086 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15087 else
15088 putc ('0', file);
15089
15090 if (index)
15091 {
15092 putc ('+', file);
15093 print_reg (index, vsib ? 0 : code, file);
15094 if (scale != 1 || vsib)
15095 fprintf (file, "*%d", scale);
15096 }
15097 putc (']', file);
15098 }
15099 }
15100 }
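/* Worked example (illustrative, not from the original sources): for
   an address decomposed into base %ebx, index %ecx, scale 4 and
   displacement 12, the AT&T branch above prints "12(%ebx,%ecx,4)"
   while the Intel branch prints "[ebx+12+ecx*4]".  */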
15101
15102 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15103
15104 static bool
15105 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15106 {
15107 rtx op;
15108
15109 if (GET_CODE (x) != UNSPEC)
15110 return false;
15111
15112 op = XVECEXP (x, 0, 0);
15113 switch (XINT (x, 1))
15114 {
15115 case UNSPEC_GOTTPOFF:
15116 output_addr_const (file, op);
15117 /* FIXME: This might be @TPOFF in Sun ld. */
15118 fputs ("@gottpoff", file);
15119 break;
15120 case UNSPEC_TPOFF:
15121 output_addr_const (file, op);
15122 fputs ("@tpoff", file);
15123 break;
15124 case UNSPEC_NTPOFF:
15125 output_addr_const (file, op);
15126 if (TARGET_64BIT)
15127 fputs ("@tpoff", file);
15128 else
15129 fputs ("@ntpoff", file);
15130 break;
15131 case UNSPEC_DTPOFF:
15132 output_addr_const (file, op);
15133 fputs ("@dtpoff", file);
15134 break;
15135 case UNSPEC_GOTNTPOFF:
15136 output_addr_const (file, op);
15137 if (TARGET_64BIT)
15138 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15139 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15140 else
15141 fputs ("@gotntpoff", file);
15142 break;
15143 case UNSPEC_INDNTPOFF:
15144 output_addr_const (file, op);
15145 fputs ("@indntpoff", file);
15146 break;
15147 #if TARGET_MACHO
15148 case UNSPEC_MACHOPIC_OFFSET:
15149 output_addr_const (file, op);
15150 putc ('-', file);
15151 machopic_output_function_base_name (file);
15152 break;
15153 #endif
15154
15155 case UNSPEC_STACK_CHECK:
15156 {
15157 int offset;
15158
15159 gcc_assert (flag_split_stack);
15160
15161 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15162 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15163 #else
15164 gcc_unreachable ();
15165 #endif
15166
15167 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15168 }
15169 break;
15170
15171 default:
15172 return false;
15173 }
15174
15175 return true;
15176 }
15177 \f
15178 /* Split one or more double-mode RTL references into pairs of half-mode
15179 references. The RTL can be REG, offsettable MEM, integer constant, or
15180 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15181 split and "num" is its length. lo_half and hi_half are output arrays
15182 that parallel "operands". */
15183
15184 void
15185 split_double_mode (enum machine_mode mode, rtx operands[],
15186 int num, rtx lo_half[], rtx hi_half[])
15187 {
15188 enum machine_mode half_mode;
15189 unsigned int byte;
15190
15191 switch (mode)
15192 {
15193 case TImode:
15194 half_mode = DImode;
15195 break;
15196 case DImode:
15197 half_mode = SImode;
15198 break;
15199 default:
15200 gcc_unreachable ();
15201 }
15202
15203 byte = GET_MODE_SIZE (half_mode);
15204
15205 while (num--)
15206 {
15207 rtx op = operands[num];
15208
15209 /* simplify_subreg refuses to split volatile memory addresses,
15210 but we still have to handle them. */
15211 if (MEM_P (op))
15212 {
15213 lo_half[num] = adjust_address (op, half_mode, 0);
15214 hi_half[num] = adjust_address (op, half_mode, byte);
15215 }
15216 else
15217 {
15218 lo_half[num] = simplify_gen_subreg (half_mode, op,
15219 GET_MODE (op) == VOIDmode
15220 ? mode : GET_MODE (op), 0);
15221 hi_half[num] = simplify_gen_subreg (half_mode, op,
15222 GET_MODE (op) == VOIDmode
15223 ? mode : GET_MODE (op), byte);
15224 }
15225 }
15226 }
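/* Example (illustrative, not from the original sources): splitting a
   DImode register gives two SImode subregs at byte offsets 0 and 4,
   and splitting the DImode constant 0x0000000100000002 gives
   lo_half = (const_int 2) and hi_half = (const_int 1), matching the
   little-endian layout used for the MEM case above.  */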
15227 \f
15228 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15229 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15230 is the expression of the binary operation. The output may either be
15231 emitted here, or returned to the caller, like all output_* functions.
15232
15233 There is no guarantee that the operands are the same mode, as they
15234 might be within FLOAT or FLOAT_EXTEND expressions. */
15235
15236 #ifndef SYSV386_COMPAT
15237 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15238 wants to fix the assemblers because that causes incompatibility
15239 with gcc. No-one wants to fix gcc because that causes
15240 incompatibility with assemblers... You can use the option of
15241 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15242 #define SYSV386_COMPAT 1
15243 #endif
15244
15245 const char *
15246 output_387_binary_op (rtx insn, rtx *operands)
15247 {
15248 static char buf[40];
15249 const char *p;
15250 const char *ssep;
15251 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15252
15253 #ifdef ENABLE_CHECKING
15254 /* Even if we do not want to check the inputs, this documents the input
15255 constraints, which helps in understanding the following code. */
15256 if (STACK_REG_P (operands[0])
15257 && ((REG_P (operands[1])
15258 && REGNO (operands[0]) == REGNO (operands[1])
15259 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15260 || (REG_P (operands[2])
15261 && REGNO (operands[0]) == REGNO (operands[2])
15262 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15263 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15264 ; /* ok */
15265 else
15266 gcc_assert (is_sse);
15267 #endif
15268
15269 switch (GET_CODE (operands[3]))
15270 {
15271 case PLUS:
15272 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15273 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15274 p = "fiadd";
15275 else
15276 p = "fadd";
15277 ssep = "vadd";
15278 break;
15279
15280 case MINUS:
15281 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15282 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15283 p = "fisub";
15284 else
15285 p = "fsub";
15286 ssep = "vsub";
15287 break;
15288
15289 case MULT:
15290 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15291 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15292 p = "fimul";
15293 else
15294 p = "fmul";
15295 ssep = "vmul";
15296 break;
15297
15298 case DIV:
15299 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15300 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15301 p = "fidiv";
15302 else
15303 p = "fdiv";
15304 ssep = "vdiv";
15305 break;
15306
15307 default:
15308 gcc_unreachable ();
15309 }
15310
15311 if (is_sse)
15312 {
15313 if (TARGET_AVX)
15314 {
15315 strcpy (buf, ssep);
15316 if (GET_MODE (operands[0]) == SFmode)
15317 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15318 else
15319 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15320 }
15321 else
15322 {
15323 strcpy (buf, ssep + 1);
15324 if (GET_MODE (operands[0]) == SFmode)
15325 strcat (buf, "ss\t{%2, %0|%0, %2}");
15326 else
15327 strcat (buf, "sd\t{%2, %0|%0, %2}");
15328 }
15329 return buf;
15330 }
15331 strcpy (buf, p);
15332
15333 switch (GET_CODE (operands[3]))
15334 {
15335 case MULT:
15336 case PLUS:
15337 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15338 {
15339 rtx temp = operands[2];
15340 operands[2] = operands[1];
15341 operands[1] = temp;
15342 }
15343
15344 /* We know operands[0] == operands[1]. */
15345
15346 if (MEM_P (operands[2]))
15347 {
15348 p = "%Z2\t%2";
15349 break;
15350 }
15351
15352 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15353 {
15354 if (STACK_TOP_P (operands[0]))
15355 /* How is it that we are storing to a dead operand[2]?
15356 Well, presumably operands[1] is dead too. We can't
15357 store the result to st(0) as st(0) gets popped on this
15358 instruction. Instead store to operands[2] (which I
15359 think has to be st(1)). st(1) will be popped later.
15360 gcc <= 2.8.1 didn't have this check and generated
15361 assembly code that the Unixware assembler rejected. */
15362 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15363 else
15364 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15365 break;
15366 }
15367
15368 if (STACK_TOP_P (operands[0]))
15369 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15370 else
15371 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15372 break;
15373
15374 case MINUS:
15375 case DIV:
15376 if (MEM_P (operands[1]))
15377 {
15378 p = "r%Z1\t%1";
15379 break;
15380 }
15381
15382 if (MEM_P (operands[2]))
15383 {
15384 p = "%Z2\t%2";
15385 break;
15386 }
15387
15388 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15389 {
15390 #if SYSV386_COMPAT
15391 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15392 derived assemblers, confusingly reverse the direction of
15393 the operation for fsub{r} and fdiv{r} when the
15394 destination register is not st(0). The Intel assembler
15395 doesn't have this brain damage. Read !SYSV386_COMPAT to
15396 figure out what the hardware really does. */
15397 if (STACK_TOP_P (operands[0]))
15398 p = "{p\t%0, %2|rp\t%2, %0}";
15399 else
15400 p = "{rp\t%2, %0|p\t%0, %2}";
15401 #else
15402 if (STACK_TOP_P (operands[0]))
15403 /* As above for fmul/fadd, we can't store to st(0). */
15404 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15405 else
15406 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15407 #endif
15408 break;
15409 }
15410
15411 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15412 {
15413 #if SYSV386_COMPAT
15414 if (STACK_TOP_P (operands[0]))
15415 p = "{rp\t%0, %1|p\t%1, %0}";
15416 else
15417 p = "{p\t%1, %0|rp\t%0, %1}";
15418 #else
15419 if (STACK_TOP_P (operands[0]))
15420 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15421 else
15422 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15423 #endif
15424 break;
15425 }
15426
15427 if (STACK_TOP_P (operands[0]))
15428 {
15429 if (STACK_TOP_P (operands[1]))
15430 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15431 else
15432 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15433 break;
15434 }
15435 else if (STACK_TOP_P (operands[1]))
15436 {
15437 #if SYSV386_COMPAT
15438 p = "{\t%1, %0|r\t%0, %1}";
15439 #else
15440 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15441 #endif
15442 }
15443 else
15444 {
15445 #if SYSV386_COMPAT
15446 p = "{r\t%2, %0|\t%0, %2}";
15447 #else
15448 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15449 #endif
15450 }
15451 break;
15452
15453 default:
15454 gcc_unreachable ();
15455 }
15456
15457 strcat (buf, p);
15458 return buf;
15459 }
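/* Example (illustrative, not from the original sources): for an
   SFmode PLUS with SSE operands the function returns
       "vaddss\t{%2, %1, %0|%0, %1, %2}"  when TARGET_AVX, and
       "addss\t{%2, %0|%0, %2}"           otherwise;
   for the x87 case with operands[0] == operands[1] == st(0) and a
   memory operands[2] it returns "fadd%Z2\t%2".  */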
15460
15461 /* Return needed mode for entity in optimize_mode_switching pass. */
15462
15463 int
15464 ix86_mode_needed (int entity, rtx insn)
15465 {
15466 enum attr_i387_cw mode;
15467
15468 /* The mode UNINITIALIZED is used to store the control word after a
15469 function call or ASM pattern. The mode ANY specifies that the function
15470 has no requirements on the control word and makes no changes in the
15471 bits we are interested in. */
15472
15473 if (CALL_P (insn)
15474 || (NONJUMP_INSN_P (insn)
15475 && (asm_noperands (PATTERN (insn)) >= 0
15476 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15477 return I387_CW_UNINITIALIZED;
15478
15479 if (recog_memoized (insn) < 0)
15480 return I387_CW_ANY;
15481
15482 mode = get_attr_i387_cw (insn);
15483
15484 switch (entity)
15485 {
15486 case I387_TRUNC:
15487 if (mode == I387_CW_TRUNC)
15488 return mode;
15489 break;
15490
15491 case I387_FLOOR:
15492 if (mode == I387_CW_FLOOR)
15493 return mode;
15494 break;
15495
15496 case I387_CEIL:
15497 if (mode == I387_CW_CEIL)
15498 return mode;
15499 break;
15500
15501 case I387_MASK_PM:
15502 if (mode == I387_CW_MASK_PM)
15503 return mode;
15504 break;
15505
15506 default:
15507 gcc_unreachable ();
15508 }
15509
15510 return I387_CW_ANY;
15511 }
15512
15513 /* Output code to initialize control word copies used by trunc?f?i and
15514 rounding patterns. CURRENT_MODE is set to current control word,
15515 while NEW_MODE is set to new control word. */
15516
15517 void
15518 emit_i387_cw_initialization (int mode)
15519 {
15520 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15521 rtx new_mode;
15522
15523 enum ix86_stack_slot slot;
15524
15525 rtx reg = gen_reg_rtx (HImode);
15526
15527 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15528 emit_move_insn (reg, copy_rtx (stored_mode));
15529
15530 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15531 || optimize_function_for_size_p (cfun))
15532 {
15533 switch (mode)
15534 {
15535 case I387_CW_TRUNC:
15536 /* round toward zero (truncate) */
15537 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15538 slot = SLOT_CW_TRUNC;
15539 break;
15540
15541 case I387_CW_FLOOR:
15542 /* round down toward -oo */
15543 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15544 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15545 slot = SLOT_CW_FLOOR;
15546 break;
15547
15548 case I387_CW_CEIL:
15549 /* round up toward +oo */
15550 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15551 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15552 slot = SLOT_CW_CEIL;
15553 break;
15554
15555 case I387_CW_MASK_PM:
15556 /* mask precision exception for nearbyint() */
15557 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15558 slot = SLOT_CW_MASK_PM;
15559 break;
15560
15561 default:
15562 gcc_unreachable ();
15563 }
15564 }
15565 else
15566 {
15567 switch (mode)
15568 {
15569 case I387_CW_TRUNC:
15570 /* round toward zero (truncate) */
15571 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15572 slot = SLOT_CW_TRUNC;
15573 break;
15574
15575 case I387_CW_FLOOR:
15576 /* round down toward -oo */
15577 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15578 slot = SLOT_CW_FLOOR;
15579 break;
15580
15581 case I387_CW_CEIL:
15582 /* round up toward +oo */
15583 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15584 slot = SLOT_CW_CEIL;
15585 break;
15586
15587 case I387_CW_MASK_PM:
15588 /* mask precision exception for nearbyint() */
15589 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15590 slot = SLOT_CW_MASK_PM;
15591 break;
15592
15593 default:
15594 gcc_unreachable ();
15595 }
15596 }
15597
15598 gcc_assert (slot < MAX_386_STACK_LOCALS);
15599
15600 new_mode = assign_386_stack_local (HImode, slot);
15601 emit_move_insn (new_mode, reg);
15602 }
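/* For reference (summary of standard x87 facts, not from the original
   sources): bits 10-11 of the x87 control word form the rounding
   control field, so 0x0c00 selects truncation, 0x0400 round-down and
   0x0800 round-up, while 0x0020 sets the precision exception mask
   used for nearbyint.  */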
15603
15604 /* Output code for INSN to convert a float to a signed int. OPERANDS
15605 are the insn operands. The output may be [HSD]Imode and the input
15606 operand may be [SDX]Fmode. */
15607
15608 const char *
15609 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15610 {
15611 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15612 int dimode_p = GET_MODE (operands[0]) == DImode;
15613 int round_mode = get_attr_i387_cw (insn);
15614
15615 /* Jump through a hoop or two for DImode, since the hardware has no
15616 non-popping instruction. We used to do this a different way, but
15617 that was somewhat fragile and broke with post-reload splitters. */
15618 if ((dimode_p || fisttp) && !stack_top_dies)
15619 output_asm_insn ("fld\t%y1", operands);
15620
15621 gcc_assert (STACK_TOP_P (operands[1]));
15622 gcc_assert (MEM_P (operands[0]));
15623 gcc_assert (GET_MODE (operands[1]) != TFmode);
15624
15625 if (fisttp)
15626 output_asm_insn ("fisttp%Z0\t%0", operands);
15627 else
15628 {
15629 if (round_mode != I387_CW_ANY)
15630 output_asm_insn ("fldcw\t%3", operands);
15631 if (stack_top_dies || dimode_p)
15632 output_asm_insn ("fistp%Z0\t%0", operands);
15633 else
15634 output_asm_insn ("fist%Z0\t%0", operands);
15635 if (round_mode != I387_CW_ANY)
15636 output_asm_insn ("fldcw\t%2", operands);
15637 }
15638
15639 return "";
15640 }
15641
15642 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15643 have the values zero or one, indicates the ffreep insn's operand
15644 from the OPERANDS array. */
15645
15646 static const char *
15647 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15648 {
15649 if (TARGET_USE_FFREEP)
15650 #ifdef HAVE_AS_IX86_FFREEP
15651 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15652 #else
15653 {
15654 static char retval[32];
15655 int regno = REGNO (operands[opno]);
15656
15657 gcc_assert (STACK_REGNO_P (regno));
15658
15659 regno -= FIRST_STACK_REG;
15660
15661 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15662 return retval;
15663 }
15664 #endif
15665
15666 return opno ? "fstp\t%y1" : "fstp\t%y0";
15667 }
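/* Example (illustrative, not from the original sources): without
   assembler support for ffreep, freeing st(1) emits ASM_SHORT
   "0xc1df", whose little-endian byte sequence df c1 is the encoding
   of "ffreep %st(1)" (opcode 0xdf 0xc0+i).  */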
15668
15669
15670 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15671 should be used. UNORDERED_P is true when fucom should be used. */
15672
15673 const char *
15674 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15675 {
15676 int stack_top_dies;
15677 rtx cmp_op0, cmp_op1;
15678 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15679
15680 if (eflags_p)
15681 {
15682 cmp_op0 = operands[0];
15683 cmp_op1 = operands[1];
15684 }
15685 else
15686 {
15687 cmp_op0 = operands[1];
15688 cmp_op1 = operands[2];
15689 }
15690
15691 if (is_sse)
15692 {
15693 if (GET_MODE (operands[0]) == SFmode)
15694 if (unordered_p)
15695 return "%vucomiss\t{%1, %0|%0, %1}";
15696 else
15697 return "%vcomiss\t{%1, %0|%0, %1}";
15698 else
15699 if (unordered_p)
15700 return "%vucomisd\t{%1, %0|%0, %1}";
15701 else
15702 return "%vcomisd\t{%1, %0|%0, %1}";
15703 }
15704
15705 gcc_assert (STACK_TOP_P (cmp_op0));
15706
15707 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15708
15709 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15710 {
15711 if (stack_top_dies)
15712 {
15713 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15714 return output_387_ffreep (operands, 1);
15715 }
15716 else
15717 return "ftst\n\tfnstsw\t%0";
15718 }
15719
15720 if (STACK_REG_P (cmp_op1)
15721 && stack_top_dies
15722 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15723 && REGNO (cmp_op1) != FIRST_STACK_REG)
15724 {
15725 /* If both the top of the 387 stack and the other operand (also a
15726 stack register) die, then this must be a `fcompp' float
15727 compare. */
15729 if (eflags_p)
15730 {
15731 /* There is no double popping fcomi variant. Fortunately,
15732 eflags is immune from the fstp's cc clobbering. */
15733 if (unordered_p)
15734 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15735 else
15736 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15737 return output_387_ffreep (operands, 0);
15738 }
15739 else
15740 {
15741 if (unordered_p)
15742 return "fucompp\n\tfnstsw\t%0";
15743 else
15744 return "fcompp\n\tfnstsw\t%0";
15745 }
15746 }
15747 else
15748 {
15749 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15750
15751 static const char * const alt[16] =
15752 {
15753 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15754 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15755 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15756 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15757
15758 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15759 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15760 NULL,
15761 NULL,
15762
15763 "fcomi\t{%y1, %0|%0, %y1}",
15764 "fcomip\t{%y1, %0|%0, %y1}",
15765 "fucomi\t{%y1, %0|%0, %y1}",
15766 "fucomip\t{%y1, %0|%0, %y1}",
15767
15768 NULL,
15769 NULL,
15770 NULL,
15771 NULL
15772 };
15773
15774 int mask;
15775 const char *ret;
15776
15777 mask = eflags_p << 3;
15778 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15779 mask |= unordered_p << 1;
15780 mask |= stack_top_dies;
15781
15782 gcc_assert (mask < 16);
15783 ret = alt[mask];
15784 gcc_assert (ret);
15785
15786 return ret;
15787 }
15788 }
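/* Example (illustrative, not from the original sources): with
   eflags_p = 1, a floating-point cmp_op1, unordered_p = 1 and a dying
   stack top, the mask is (1<<3) | (1<<1) | 1 = 11, selecting
   "fucomip\t{%y1, %0|%0, %y1}" from the table above.  */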
15789
15790 void
15791 ix86_output_addr_vec_elt (FILE *file, int value)
15792 {
15793 const char *directive = ASM_LONG;
15794
15795 #ifdef ASM_QUAD
15796 if (TARGET_LP64)
15797 directive = ASM_QUAD;
15798 #else
15799 gcc_assert (!TARGET_64BIT);
15800 #endif
15801
15802 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15803 }
15804
15805 void
15806 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15807 {
15808 const char *directive = ASM_LONG;
15809
15810 #ifdef ASM_QUAD
15811 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15812 directive = ASM_QUAD;
15813 #else
15814 gcc_assert (!TARGET_64BIT);
15815 #endif
15816 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15817 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15818 fprintf (file, "%s%s%d-%s%d\n",
15819 directive, LPREFIX, value, LPREFIX, rel);
15820 else if (HAVE_AS_GOTOFF_IN_DATA)
15821 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15822 #if TARGET_MACHO
15823 else if (TARGET_MACHO)
15824 {
15825 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15826 machopic_output_function_base_name (file);
15827 putc ('\n', file);
15828 }
15829 #endif
15830 else
15831 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15832 GOT_SYMBOL_NAME, LPREFIX, value);
15833 }
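/* Example (illustrative; assumes a typical ELF target where LPREFIX is
   ".L"): a 64-bit DImode case vector entry comes out as
   ".quad .L5-.L2", while the 32-bit PIC form with GOTOFF supported in
   data comes out as ".long .L5@GOTOFF".  */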
15834 \f
15835 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15836 for the target. */
15837
15838 void
15839 ix86_expand_clear (rtx dest)
15840 {
15841 rtx tmp;
15842
15843 /* We play register width games, which are only valid after reload. */
15844 gcc_assert (reload_completed);
15845
15846 /* Avoid HImode and its attendant prefix byte. */
15847 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15848 dest = gen_rtx_REG (SImode, REGNO (dest));
15849 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15850
15851 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15852 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15853 {
15854 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15855 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15856 }
15857
15858 emit_insn (tmp);
15859 }
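/* Example (illustrative, not from the original sources): clearing an
   SImode register when the insn is optimized for speed uses the xor
   form with a flags clobber, typically assembling to
       xorl %eax, %eax
   whereas with TARGET_USE_MOV0 and no speed preference the plain
   "mov $0, reg" set is kept.  */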
15860
15861 /* X is an unchanging MEM. If it is a constant pool reference, return
15862 the constant pool rtx, else NULL. */
15863
15864 rtx
15865 maybe_get_pool_constant (rtx x)
15866 {
15867 x = ix86_delegitimize_address (XEXP (x, 0));
15868
15869 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15870 return get_pool_constant (x);
15871
15872 return NULL_RTX;
15873 }
15874
15875 void
15876 ix86_expand_move (enum machine_mode mode, rtx operands[])
15877 {
15878 rtx op0, op1;
15879 enum tls_model model;
15880
15881 op0 = operands[0];
15882 op1 = operands[1];
15883
15884 if (GET_CODE (op1) == SYMBOL_REF)
15885 {
15886 model = SYMBOL_REF_TLS_MODEL (op1);
15887 if (model)
15888 {
15889 op1 = legitimize_tls_address (op1, model, true);
15890 op1 = force_operand (op1, op0);
15891 if (op1 == op0)
15892 return;
15893 if (GET_MODE (op1) != mode)
15894 op1 = convert_to_mode (mode, op1, 1);
15895 }
15896 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15897 && SYMBOL_REF_DLLIMPORT_P (op1))
15898 op1 = legitimize_dllimport_symbol (op1, false);
15899 }
15900 else if (GET_CODE (op1) == CONST
15901 && GET_CODE (XEXP (op1, 0)) == PLUS
15902 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15903 {
15904 rtx addend = XEXP (XEXP (op1, 0), 1);
15905 rtx symbol = XEXP (XEXP (op1, 0), 0);
15906 rtx tmp = NULL;
15907
15908 model = SYMBOL_REF_TLS_MODEL (symbol);
15909 if (model)
15910 tmp = legitimize_tls_address (symbol, model, true);
15911 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15912 && SYMBOL_REF_DLLIMPORT_P (symbol))
15913 tmp = legitimize_dllimport_symbol (symbol, true);
15914
15915 if (tmp)
15916 {
15917 tmp = force_operand (tmp, NULL);
15918 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15919 op0, 1, OPTAB_DIRECT);
15920 if (tmp == op0)
15921 return;
15922 if (GET_MODE (tmp) != mode)
15923 op1 = convert_to_mode (mode, tmp, 1);
15924 }
15925 }
15926
15927 if ((flag_pic || MACHOPIC_INDIRECT)
15928 && symbolic_operand (op1, mode))
15929 {
15930 if (TARGET_MACHO && !TARGET_64BIT)
15931 {
15932 #if TARGET_MACHO
15933 /* dynamic-no-pic */
15934 if (MACHOPIC_INDIRECT)
15935 {
15936 rtx temp = ((reload_in_progress
15937 || ((op0 && REG_P (op0))
15938 && mode == Pmode))
15939 ? op0 : gen_reg_rtx (Pmode));
15940 op1 = machopic_indirect_data_reference (op1, temp);
15941 if (MACHOPIC_PURE)
15942 op1 = machopic_legitimize_pic_address (op1, mode,
15943 temp == op1 ? 0 : temp);
15944 }
15945 if (op0 != op1 && GET_CODE (op0) != MEM)
15946 {
15947 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15948 emit_insn (insn);
15949 return;
15950 }
15951 if (GET_CODE (op0) == MEM)
15952 op1 = force_reg (Pmode, op1);
15953 else
15954 {
15955 rtx temp = op0;
15956 if (GET_CODE (temp) != REG)
15957 temp = gen_reg_rtx (Pmode);
15958 temp = legitimize_pic_address (op1, temp);
15959 if (temp == op0)
15960 return;
15961 op1 = temp;
15962 }
15963 /* dynamic-no-pic */
15964 #endif
15965 }
15966 else
15967 {
15968 if (MEM_P (op0))
15969 op1 = force_reg (mode, op1);
15970 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15971 {
15972 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15973 op1 = legitimize_pic_address (op1, reg);
15974 if (op0 == op1)
15975 return;
15976 if (GET_MODE (op1) != mode)
15977 op1 = convert_to_mode (mode, op1, 1);
15978 }
15979 }
15980 }
15981 else
15982 {
15983 if (MEM_P (op0)
15984 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15985 || !push_operand (op0, mode))
15986 && MEM_P (op1))
15987 op1 = force_reg (mode, op1);
15988
15989 if (push_operand (op0, mode)
15990 && ! general_no_elim_operand (op1, mode))
15991 op1 = copy_to_mode_reg (mode, op1);
15992
15993 /* Force large constants in 64-bit compilation into a register
15994 to get them CSEed. */
15995 if (can_create_pseudo_p ()
15996 && (mode == DImode) && TARGET_64BIT
15997 && immediate_operand (op1, mode)
15998 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15999 && !register_operand (op0, mode)
16000 && optimize)
16001 op1 = copy_to_mode_reg (mode, op1);
16002
16003 if (can_create_pseudo_p ()
16004 && FLOAT_MODE_P (mode)
16005 && GET_CODE (op1) == CONST_DOUBLE)
16006 {
16007 /* If we are loading a floating point constant to a register,
16008 force the value to memory now, since we'll get better code
16009 out of the back end. */
16010
16011 op1 = validize_mem (force_const_mem (mode, op1));
16012 if (!register_operand (op0, mode))
16013 {
16014 rtx temp = gen_reg_rtx (mode);
16015 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16016 emit_move_insn (op0, temp);
16017 return;
16018 }
16019 }
16020 }
16021
16022 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16023 }
16024
16025 void
16026 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16027 {
16028 rtx op0 = operands[0], op1 = operands[1];
16029 unsigned int align = GET_MODE_ALIGNMENT (mode);
16030
16031 /* Force constants other than zero into memory. We do not know how
16032 the instructions used to build constants modify the upper 64 bits
16033 of the register; once we have that information we may be able
16034 to handle some of them more efficiently. */
16035 if (can_create_pseudo_p ()
16036 && register_operand (op0, mode)
16037 && (CONSTANT_P (op1)
16038 || (GET_CODE (op1) == SUBREG
16039 && CONSTANT_P (SUBREG_REG (op1))))
16040 && !standard_sse_constant_p (op1))
16041 op1 = validize_mem (force_const_mem (mode, op1));
16042
16043 /* We need to check memory alignment for SSE modes since attributes
16044 can make operands unaligned. */
16045 if (can_create_pseudo_p ()
16046 && SSE_REG_MODE_P (mode)
16047 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16048 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16049 {
16050 rtx tmp[2];
16051
16052 /* ix86_expand_vector_move_misalign() does not like constants ... */
16053 if (CONSTANT_P (op1)
16054 || (GET_CODE (op1) == SUBREG
16055 && CONSTANT_P (SUBREG_REG (op1))))
16056 op1 = validize_mem (force_const_mem (mode, op1));
16057
16058 /* ... nor both arguments in memory. */
16059 if (!register_operand (op0, mode)
16060 && !register_operand (op1, mode))
16061 op1 = force_reg (mode, op1);
16062
16063 tmp[0] = op0; tmp[1] = op1;
16064 ix86_expand_vector_move_misalign (mode, tmp);
16065 return;
16066 }
16067
16068 /* Make operand1 a register if it isn't already. */
16069 if (can_create_pseudo_p ()
16070 && !register_operand (op0, mode)
16071 && !register_operand (op1, mode))
16072 {
16073 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16074 return;
16075 }
16076
16077 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16078 }
16079
16080 /* Split 32-byte AVX unaligned load and store if needed. */
16081
16082 static void
16083 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16084 {
16085 rtx m;
16086 rtx (*extract) (rtx, rtx, rtx);
16087 rtx (*load_unaligned) (rtx, rtx);
16088 rtx (*store_unaligned) (rtx, rtx);
16089 enum machine_mode mode;
16090
16091 switch (GET_MODE (op0))
16092 {
16093 default:
16094 gcc_unreachable ();
16095 case V32QImode:
16096 extract = gen_avx_vextractf128v32qi;
16097 load_unaligned = gen_avx_loaddqu256;
16098 store_unaligned = gen_avx_storedqu256;
16099 mode = V16QImode;
16100 break;
16101 case V8SFmode:
16102 extract = gen_avx_vextractf128v8sf;
16103 load_unaligned = gen_avx_loadups256;
16104 store_unaligned = gen_avx_storeups256;
16105 mode = V4SFmode;
16106 break;
16107 case V4DFmode:
16108 extract = gen_avx_vextractf128v4df;
16109 load_unaligned = gen_avx_loadupd256;
16110 store_unaligned = gen_avx_storeupd256;
16111 mode = V2DFmode;
16112 break;
16113 }
16114
16115 if (MEM_P (op1))
16116 {
16117 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16118 {
16119 rtx r = gen_reg_rtx (mode);
16120 m = adjust_address (op1, mode, 0);
16121 emit_move_insn (r, m);
16122 m = adjust_address (op1, mode, 16);
16123 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16124 emit_move_insn (op0, r);
16125 }
16126 else
16127 emit_insn (load_unaligned (op0, op1));
16128 }
16129 else if (MEM_P (op0))
16130 {
16131 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16132 {
16133 m = adjust_address (op0, mode, 0);
16134 emit_insn (extract (m, op1, const0_rtx));
16135 m = adjust_address (op0, mode, 16);
16136 emit_insn (extract (m, op1, const1_rtx));
16137 }
16138 else
16139 emit_insn (store_unaligned (op0, op1));
16140 }
16141 else
16142 gcc_unreachable ();
16143 }
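/* Example (illustrative, not from the original sources): with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD, an unaligned V8SFmode load is
   split into a V4SFmode load of mem+0 followed by a VEC_CONCAT with
   mem+16, typically assembling to a vmovups/vinsertf128 pair; without
   the tuning flag a single 32-byte vmovups is emitted.  */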
16144
16145 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16146 straight to ix86_expand_vector_move. */
16147 /* Code generation for scalar reg-reg moves of single and double precision data:
16148 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16149 movaps reg, reg
16150 else
16151 movss reg, reg
16152 if (x86_sse_partial_reg_dependency == true)
16153 movapd reg, reg
16154 else
16155 movsd reg, reg
16156
16157 Code generation for scalar loads of double precision data:
16158 if (x86_sse_split_regs == true)
16159 movlpd mem, reg (gas syntax)
16160 else
16161 movsd mem, reg
16162
16163 Code generation for unaligned packed loads of single precision data
16164 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16165 if (x86_sse_unaligned_move_optimal)
16166 movups mem, reg
16167
16168 if (x86_sse_partial_reg_dependency == true)
16169 {
16170 xorps reg, reg
16171 movlps mem, reg
16172 movhps mem+8, reg
16173 }
16174 else
16175 {
16176 movlps mem, reg
16177 movhps mem+8, reg
16178 }
16179
16180 Code generation for unaligned packed loads of double precision data
16181 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16182 if (x86_sse_unaligned_move_optimal)
16183 movupd mem, reg
16184
16185 if (x86_sse_split_regs == true)
16186 {
16187 movlpd mem, reg
16188 movhpd mem+8, reg
16189 }
16190 else
16191 {
16192 movsd mem, reg
16193 movhpd mem+8, reg
16194 }
16195 */
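/* A rough source-level illustration (hypothetical example, not taken
   from the testsuite): a load such as

       typedef float v4sf __attribute__ ((vector_size (16)));
       typedef v4sf unaligned_v4sf __attribute__ ((aligned (1)));

       v4sf
       load_v4sf (const unaligned_v4sf *p)
       {
         return *p;
       }

   may reach this expander because *p is not known to be 16-byte
   aligned; which of the sequences above is then used depends on the
   tuning flags listed.  */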
16196
16197 void
16198 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16199 {
16200 rtx op0, op1, m;
16201
16202 op0 = operands[0];
16203 op1 = operands[1];
16204
16205 if (TARGET_AVX
16206 && GET_MODE_SIZE (mode) == 32)
16207 {
16208 switch (GET_MODE_CLASS (mode))
16209 {
16210 case MODE_VECTOR_INT:
16211 case MODE_INT:
16212 op0 = gen_lowpart (V32QImode, op0);
16213 op1 = gen_lowpart (V32QImode, op1);
16214 /* FALLTHRU */
16215
16216 case MODE_VECTOR_FLOAT:
16217 ix86_avx256_split_vector_move_misalign (op0, op1);
16218 break;
16219
16220 default:
16221 gcc_unreachable ();
16222 }
16223
16224 return;
16225 }
16226
16227 if (MEM_P (op1))
16228 {
16229 /* ??? If we have typed data, then it would appear that using
16230 movdqu is the only way to get unaligned data loaded with
16231 integer type. */
16232 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16233 {
16234 op0 = gen_lowpart (V16QImode, op0);
16235 op1 = gen_lowpart (V16QImode, op1);
16236 /* We will eventually emit movups based on insn attributes. */
16237 emit_insn (gen_sse2_loaddqu (op0, op1));
16238 }
16239 else if (TARGET_SSE2 && mode == V2DFmode)
16240 {
16241 rtx zero;
16242
16243 if (TARGET_AVX
16244 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16245 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16246 || optimize_function_for_size_p (cfun))
16247 {
16248 /* We will eventually emit movups based on insn attributes. */
16249 emit_insn (gen_sse2_loadupd (op0, op1));
16250 return;
16251 }
16252
16253 /* When SSE registers are split into halves, we can avoid
16254 writing to the top half twice. */
16255 if (TARGET_SSE_SPLIT_REGS)
16256 {
16257 emit_clobber (op0);
16258 zero = op0;
16259 }
16260 else
16261 {
16262 /* ??? Not sure about the best option for the Intel chips.
16263 The following would seem to satisfy; the register is
16264 entirely cleared, breaking the dependency chain. We
16265 then store to the upper half, with a dependency depth
16266 of one. A rumor has it that Intel recommends two movsd
16267 followed by an unpacklpd, but this is unconfirmed. And
16268 given that the dependency depth of the unpacklpd would
16269 still be one, I'm not sure why this would be better. */
16270 zero = CONST0_RTX (V2DFmode);
16271 }
16272
16273 m = adjust_address (op1, DFmode, 0);
16274 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16275 m = adjust_address (op1, DFmode, 8);
16276 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16277 }
16278 else
16279 {
16280 if (TARGET_AVX
16281 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16282 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16283 || optimize_function_for_size_p (cfun))
16284 {
16285 op0 = gen_lowpart (V4SFmode, op0);
16286 op1 = gen_lowpart (V4SFmode, op1);
16287 emit_insn (gen_sse_loadups (op0, op1));
16288 return;
16289 }
16290
16291 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16292 emit_move_insn (op0, CONST0_RTX (mode));
16293 else
16294 emit_clobber (op0);
16295
16296 if (mode != V4SFmode)
16297 op0 = gen_lowpart (V4SFmode, op0);
16298
16299 m = adjust_address (op1, V2SFmode, 0);
16300 emit_insn (gen_sse_loadlps (op0, op0, m));
16301 m = adjust_address (op1, V2SFmode, 8);
16302 emit_insn (gen_sse_loadhps (op0, op0, m));
16303 }
16304 }
16305 else if (MEM_P (op0))
16306 {
16307 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16308 {
16309 op0 = gen_lowpart (V16QImode, op0);
16310 op1 = gen_lowpart (V16QImode, op1);
16311 /* We will eventually emit movups based on insn attributes. */
16312 emit_insn (gen_sse2_storedqu (op0, op1));
16313 }
16314 else if (TARGET_SSE2 && mode == V2DFmode)
16315 {
16316 if (TARGET_AVX
16317 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16318 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16319 || optimize_function_for_size_p (cfun))
16320 /* We will eventually emit movups based on insn attributes. */
16321 emit_insn (gen_sse2_storeupd (op0, op1));
16322 else
16323 {
16324 m = adjust_address (op0, DFmode, 0);
16325 emit_insn (gen_sse2_storelpd (m, op1));
16326 m = adjust_address (op0, DFmode, 8);
16327 emit_insn (gen_sse2_storehpd (m, op1));
16328 }
16329 }
16330 else
16331 {
16332 if (mode != V4SFmode)
16333 op1 = gen_lowpart (V4SFmode, op1);
16334
16335 if (TARGET_AVX
16336 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16337 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16338 || optimize_function_for_size_p (cfun))
16339 {
16340 op0 = gen_lowpart (V4SFmode, op0);
16341 emit_insn (gen_sse_storeups (op0, op1));
16342 }
16343 else
16344 {
16345 m = adjust_address (op0, V2SFmode, 0);
16346 emit_insn (gen_sse_storelps (m, op1));
16347 m = adjust_address (op0, V2SFmode, 8);
16348 emit_insn (gen_sse_storehps (m, op1));
16349 }
16350 }
16351 }
16352 else
16353 gcc_unreachable ();
16354 }
16355
16356 /* Expand a push in MODE. This is some mode for which we do not support
16357 proper push instructions, at least from the registers that we expect
16358 the value to live in. */
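/* A minimal sketch of what gets emitted, assuming Pmode == DImode and a
   16-byte MODE (illustrative only; the actual move instruction depends
   on MODE and the register class):

       sub    $16, %rsp
       mov    ..., (%rsp)   */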
16359
16360 void
16361 ix86_expand_push (enum machine_mode mode, rtx x)
16362 {
16363 rtx tmp;
16364
16365 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16366 GEN_INT (-GET_MODE_SIZE (mode)),
16367 stack_pointer_rtx, 1, OPTAB_DIRECT);
16368 if (tmp != stack_pointer_rtx)
16369 emit_move_insn (stack_pointer_rtx, tmp);
16370
16371 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16372
16373 /* When we push an operand onto the stack, it has to be aligned at least
16374 at the function argument boundary.  However, since we don't have
16375 the argument type, we can't determine the actual argument
16376 boundary. */
16377 emit_move_insn (tmp, x);
16378 }
16379
16380 /* Helper function of ix86_fixup_binary_operands to canonicalize
16381 operand order. Returns true if the operands should be swapped. */
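/* For example (informal): for dst = src1 + src2 with src1 an immediate
   or a memory reference and src2 a register, this returns true so that
   after the swap the register operand comes first and the immediate or
   memory operand second.  */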
16382
16383 static bool
16384 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16385 rtx operands[])
16386 {
16387 rtx dst = operands[0];
16388 rtx src1 = operands[1];
16389 rtx src2 = operands[2];
16390
16391 /* If the operation is not commutative, we can't do anything. */
16392 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16393 return false;
16394
16395 /* Highest priority is that src1 should match dst. */
16396 if (rtx_equal_p (dst, src1))
16397 return false;
16398 if (rtx_equal_p (dst, src2))
16399 return true;
16400
16401 /* Next highest priority is that immediate constants come second. */
16402 if (immediate_operand (src2, mode))
16403 return false;
16404 if (immediate_operand (src1, mode))
16405 return true;
16406
16407 /* Lowest priority is that memory references should come second. */
16408 if (MEM_P (src2))
16409 return false;
16410 if (MEM_P (src1))
16411 return true;
16412
16413 return false;
16414 }
16415
16416
16417 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16418 destination to use for the operation. If different from the true
16419 destination in operands[0], a copy operation will be required. */
16420
16421 rtx
16422 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16423 rtx operands[])
16424 {
16425 rtx dst = operands[0];
16426 rtx src1 = operands[1];
16427 rtx src2 = operands[2];
16428
16429 /* Canonicalize operand order. */
16430 if (ix86_swap_binary_operands_p (code, mode, operands))
16431 {
16432 rtx temp;
16433
16434 /* It is invalid to swap operands of different modes. */
16435 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16436
16437 temp = src1;
16438 src1 = src2;
16439 src2 = temp;
16440 }
16441
16442 /* Both source operands cannot be in memory. */
16443 if (MEM_P (src1) && MEM_P (src2))
16444 {
16445 /* Optimization: Only read from memory once. */
16446 if (rtx_equal_p (src1, src2))
16447 {
16448 src2 = force_reg (mode, src2);
16449 src1 = src2;
16450 }
16451 else
16452 src2 = force_reg (mode, src2);
16453 }
16454
16455 /* If the destination is memory, and we do not have matching source
16456 operands, do things in registers. */
16457 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16458 dst = gen_reg_rtx (mode);
16459
16460 /* Source 1 cannot be a constant. */
16461 if (CONSTANT_P (src1))
16462 src1 = force_reg (mode, src1);
16463
16464 /* Source 1 cannot be a non-matching memory. */
16465 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16466 src1 = force_reg (mode, src1);
16467
16468 /* Improve address combine. */
16469 if (code == PLUS
16470 && GET_MODE_CLASS (mode) == MODE_INT
16471 && MEM_P (src2))
16472 src2 = force_reg (mode, src2);
16473
16474 operands[1] = src1;
16475 operands[2] = src2;
16476 return dst;
16477 }
16478
16479 /* Similarly, but assume that the destination has already been
16480 set up properly. */
16481
16482 void
16483 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16484 enum machine_mode mode, rtx operands[])
16485 {
16486 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16487 gcc_assert (dst == operands[0]);
16488 }
16489
16490 /* Attempt to expand a binary operator. Make the expansion closer to the
16491 actual machine than just general_operand, which will allow 3 separate
16492 memory references (one output, two input) in a single insn. */
16493
16494 void
16495 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16496 rtx operands[])
16497 {
16498 rtx src1, src2, dst, op, clob;
16499
16500 dst = ix86_fixup_binary_operands (code, mode, operands);
16501 src1 = operands[1];
16502 src2 = operands[2];
16503
16504 /* Emit the instruction. */
16505
16506 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16507 if (reload_in_progress)
16508 {
16509 /* Reload doesn't know about the flags register, and doesn't know that
16510 it doesn't want to clobber it. We can only do this with PLUS. */
16511 gcc_assert (code == PLUS);
16512 emit_insn (op);
16513 }
16514 else if (reload_completed
16515 && code == PLUS
16516 && !rtx_equal_p (dst, src1))
16517 {
16518 /* This is going to be an LEA; avoid splitting it later. */
16519 emit_insn (op);
16520 }
16521 else
16522 {
16523 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16524 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16525 }
16526
16527 /* Fix up the destination if needed. */
16528 if (dst != operands[0])
16529 emit_move_insn (operands[0], dst);
16530 }
16531
16532 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16533 the given OPERANDS. */
16534
16535 void
16536 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16537 rtx operands[])
16538 {
16539 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16540 if (GET_CODE (operands[1]) == SUBREG)
16541 {
16542 op1 = operands[1];
16543 op2 = operands[2];
16544 }
16545 else if (GET_CODE (operands[2]) == SUBREG)
16546 {
16547 op1 = operands[2];
16548 op2 = operands[1];
16549 }
16550 /* Optimize (__m128i) d | (__m128i) e and similar code
16551 when d and e are float vectors into float vector logical
16552 insn. In C/C++ without using intrinsics there is no other way
16553 to express vector logical operation on float vectors than
16554 to cast them temporarily to integer vectors. */
16555 if (op1
16556 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16557 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16558 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16559 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16560 && SUBREG_BYTE (op1) == 0
16561 && (GET_CODE (op2) == CONST_VECTOR
16562 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16563 && SUBREG_BYTE (op2) == 0))
16564 && can_create_pseudo_p ())
16565 {
16566 rtx dst;
16567 switch (GET_MODE (SUBREG_REG (op1)))
16568 {
16569 case V4SFmode:
16570 case V8SFmode:
16571 case V2DFmode:
16572 case V4DFmode:
16573 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16574 if (GET_CODE (op2) == CONST_VECTOR)
16575 {
16576 op2 = gen_lowpart (GET_MODE (dst), op2);
16577 op2 = force_reg (GET_MODE (dst), op2);
16578 }
16579 else
16580 {
16581 op1 = operands[1];
16582 op2 = SUBREG_REG (operands[2]);
16583 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16584 op2 = force_reg (GET_MODE (dst), op2);
16585 }
16586 op1 = SUBREG_REG (op1);
16587 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16588 op1 = force_reg (GET_MODE (dst), op1);
16589 emit_insn (gen_rtx_SET (VOIDmode, dst,
16590 gen_rtx_fmt_ee (code, GET_MODE (dst),
16591 op1, op2)));
16592 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16593 return;
16594 default:
16595 break;
16596 }
16597 }
16598 if (!nonimmediate_operand (operands[1], mode))
16599 operands[1] = force_reg (mode, operands[1]);
16600 if (!nonimmediate_operand (operands[2], mode))
16601 operands[2] = force_reg (mode, operands[2]);
16602 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16603 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16604 gen_rtx_fmt_ee (code, mode, operands[1],
16605 operands[2])));
16606 }
16607
16608 /* Return TRUE or FALSE depending on whether the binary operator meets the
16609 appropriate constraints. */
16610
16611 bool
16612 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16613 rtx operands[3])
16614 {
16615 rtx dst = operands[0];
16616 rtx src1 = operands[1];
16617 rtx src2 = operands[2];
16618
16619 /* Both source operands cannot be in memory. */
16620 if (MEM_P (src1) && MEM_P (src2))
16621 return false;
16622
16623 /* Canonicalize operand order for commutative operators. */
16624 if (ix86_swap_binary_operands_p (code, mode, operands))
16625 {
16626 rtx temp = src1;
16627 src1 = src2;
16628 src2 = temp;
16629 }
16630
16631 /* If the destination is memory, we must have a matching source operand. */
16632 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16633 return false;
16634
16635 /* Source 1 cannot be a constant. */
16636 if (CONSTANT_P (src1))
16637 return false;
16638
16639 /* Source 1 cannot be a non-matching memory. */
16640 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16641 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16642 return (code == AND
16643 && (mode == HImode
16644 || mode == SImode
16645 || (TARGET_64BIT && mode == DImode))
16646 && satisfies_constraint_L (src2));
16647
16648 return true;
16649 }
16650
16651 /* Attempt to expand a unary operator. Make the expansion closer to the
16652 actual machine than just general_operand, which will allow 2 separate
16653 memory references (one output, one input) in a single insn. */
16654
16655 void
16656 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16657 rtx operands[])
16658 {
16659 int matching_memory;
16660 rtx src, dst, op, clob;
16661
16662 dst = operands[0];
16663 src = operands[1];
16664
16665 /* If the destination is memory, and we do not have matching source
16666 operands, do things in registers. */
16667 matching_memory = 0;
16668 if (MEM_P (dst))
16669 {
16670 if (rtx_equal_p (dst, src))
16671 matching_memory = 1;
16672 else
16673 dst = gen_reg_rtx (mode);
16674 }
16675
16676 /* When source operand is memory, destination must match. */
16677 if (MEM_P (src) && !matching_memory)
16678 src = force_reg (mode, src);
16679
16680 /* Emit the instruction. */
16681
16682 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16683 if (reload_in_progress || code == NOT)
16684 {
16685 /* Reload doesn't know about the flags register, and doesn't know that
16686 it doesn't want to clobber it. */
16687 gcc_assert (code == NOT);
16688 emit_insn (op);
16689 }
16690 else
16691 {
16692 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16693 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16694 }
16695
16696 /* Fix up the destination if needed. */
16697 if (dst != operands[0])
16698 emit_move_insn (operands[0], dst);
16699 }
16700
16701 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16702 divisor are within the range [0-255]. */
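/* An informal sketch of the emitted control flow (register and label
   names are illustrative):

         mov     dividend, scratch
         or      divisor, scratch
         test    $-0x100, scratch
         je      .Lqimode
         <full 32-bit/64-bit divide>
         jmp     .Lend
     .Lqimode:
         <8-bit unsigned divide; AL = quotient, AH = remainder>
     .Lend:                                                        */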
16703
16704 void
16705 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16706 bool signed_p)
16707 {
16708 rtx end_label, qimode_label;
16709 rtx insn, div, mod;
16710 rtx scratch, tmp0, tmp1, tmp2;
16711 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16712 rtx (*gen_zero_extend) (rtx, rtx);
16713 rtx (*gen_test_ccno_1) (rtx, rtx);
16714
16715 switch (mode)
16716 {
16717 case SImode:
16718 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16719 gen_test_ccno_1 = gen_testsi_ccno_1;
16720 gen_zero_extend = gen_zero_extendqisi2;
16721 break;
16722 case DImode:
16723 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16724 gen_test_ccno_1 = gen_testdi_ccno_1;
16725 gen_zero_extend = gen_zero_extendqidi2;
16726 break;
16727 default:
16728 gcc_unreachable ();
16729 }
16730
16731 end_label = gen_label_rtx ();
16732 qimode_label = gen_label_rtx ();
16733
16734 scratch = gen_reg_rtx (mode);
16735
16736 /* Use 8bit unsigned divmod if dividend and divisor are within
16737 the range [0-255]. */
16738 emit_move_insn (scratch, operands[2]);
16739 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16740 scratch, 1, OPTAB_DIRECT);
16741 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16742 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16743 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16744 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16745 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16746 pc_rtx);
16747 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16748 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16749 JUMP_LABEL (insn) = qimode_label;
16750
16751 /* Generate original signed/unsigned divmod. */
16752 div = gen_divmod4_1 (operands[0], operands[1],
16753 operands[2], operands[3]);
16754 emit_insn (div);
16755
16756 /* Branch to the end. */
16757 emit_jump_insn (gen_jump (end_label));
16758 emit_barrier ();
16759
16760 /* Generate 8bit unsigned divide. */
16761 emit_label (qimode_label);
16762 /* Don't use operands[0] for result of 8bit divide since not all
16763 registers support QImode ZERO_EXTRACT. */
16764 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16765 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16766 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16767 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16768
16769 if (signed_p)
16770 {
16771 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16772 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16773 }
16774 else
16775 {
16776 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16777 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16778 }
16779
16780 /* Extract remainder from AH. */
16781 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16782 if (REG_P (operands[1]))
16783 insn = emit_move_insn (operands[1], tmp1);
16784 else
16785 {
16786 /* Need a new scratch register since the old one has result
16787 of 8bit divide. */
16788 scratch = gen_reg_rtx (mode);
16789 emit_move_insn (scratch, tmp1);
16790 insn = emit_move_insn (operands[1], scratch);
16791 }
16792 set_unique_reg_note (insn, REG_EQUAL, mod);
16793
16794 /* Zero extend quotient from AL. */
16795 tmp1 = gen_lowpart (QImode, tmp0);
16796 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16797 set_unique_reg_note (insn, REG_EQUAL, div);
16798
16799 emit_label (end_label);
16800 }
16801
16802 #define LEA_MAX_STALL (3)
16803 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16804
16805 /* Increase given DISTANCE in half-cycles according to
16806 dependencies between PREV and NEXT instructions.
16807 Add 1 half-cycle if there is no dependency and
16808 go to the next cycle if there is a dependency. */
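/* Worked example (DISTANCE is in half-cycles): for DISTANCE == 3, an
   independent pair yields 3 + 1 = 4, while a dependent pair yields
   3 + (3 & 1) + 2 = 6, i.e. the distance is rounded up to a cycle
   boundary and a full extra cycle is added.  */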
16809
16810 static unsigned int
16811 increase_distance (rtx prev, rtx next, unsigned int distance)
16812 {
16813 df_ref *use_rec;
16814 df_ref *def_rec;
16815
16816 if (!prev || !next)
16817 return distance + (distance & 1) + 2;
16818
16819 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16820 return distance + 1;
16821
16822 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16823 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16824 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16825 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16826 return distance + (distance & 1) + 2;
16827
16828 return distance + 1;
16829 }
16830
16831 /* Function checks if instruction INSN defines register number
16832 REGNO1 or REGNO2. */
16833
16834 static bool
16835 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16836 rtx insn)
16837 {
16838 df_ref *def_rec;
16839
16840 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16841 if (DF_REF_REG_DEF_P (*def_rec)
16842 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16843 && (regno1 == DF_REF_REGNO (*def_rec)
16844 || regno2 == DF_REF_REGNO (*def_rec)))
16845 {
16846 return true;
16847 }
16848
16849 return false;
16850 }
16851
16852 /* Function checks if instruction INSN uses register number
16853 REGNO as a part of address expression. */
16854
16855 static bool
16856 insn_uses_reg_mem (unsigned int regno, rtx insn)
16857 {
16858 df_ref *use_rec;
16859
16860 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16861 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16862 return true;
16863
16864 return false;
16865 }
16866
16867 /* Search backward for non-agu definition of register number REGNO1
16868 or register number REGNO2 in basic block starting from instruction
16869 START up to head of basic block or instruction INSN.
16870
16871 The function stores true in *FOUND if a definition was found
16872 and false otherwise.
16873
16874 Distance in half-cycles between START and found instruction or head
16875 of BB is added to DISTANCE and returned. */
16876
16877 static int
16878 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16879 rtx insn, int distance,
16880 rtx start, bool *found)
16881 {
16882 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16883 rtx prev = start;
16884 rtx next = NULL;
16885
16886 *found = false;
16887
16888 while (prev
16889 && prev != insn
16890 && distance < LEA_SEARCH_THRESHOLD)
16891 {
16892 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16893 {
16894 distance = increase_distance (prev, next, distance);
16895 if (insn_defines_reg (regno1, regno2, prev))
16896 {
16897 if (recog_memoized (prev) < 0
16898 || get_attr_type (prev) != TYPE_LEA)
16899 {
16900 *found = true;
16901 return distance;
16902 }
16903 }
16904
16905 next = prev;
16906 }
16907 if (prev == BB_HEAD (bb))
16908 break;
16909
16910 prev = PREV_INSN (prev);
16911 }
16912
16913 return distance;
16914 }
16915
16916 /* Search backward for non-agu definition of register number REGNO1
16917 or register number REGNO2 in INSN's basic block until
16918 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16919 2. Reach a neighbour BB's boundary, or
16920 3. Reach agu definition.
16921 Returns the distance between the non-agu definition point and INSN.
16922 If no definition point, returns -1. */
16923
16924 static int
16925 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16926 rtx insn)
16927 {
16928 basic_block bb = BLOCK_FOR_INSN (insn);
16929 int distance = 0;
16930 bool found = false;
16931
16932 if (insn != BB_HEAD (bb))
16933 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16934 distance, PREV_INSN (insn),
16935 &found);
16936
16937 if (!found && distance < LEA_SEARCH_THRESHOLD)
16938 {
16939 edge e;
16940 edge_iterator ei;
16941 bool simple_loop = false;
16942
16943 FOR_EACH_EDGE (e, ei, bb->preds)
16944 if (e->src == bb)
16945 {
16946 simple_loop = true;
16947 break;
16948 }
16949
16950 if (simple_loop)
16951 distance = distance_non_agu_define_in_bb (regno1, regno2,
16952 insn, distance,
16953 BB_END (bb), &found);
16954 else
16955 {
16956 int shortest_dist = -1;
16957 bool found_in_bb = false;
16958
16959 FOR_EACH_EDGE (e, ei, bb->preds)
16960 {
16961 int bb_dist
16962 = distance_non_agu_define_in_bb (regno1, regno2,
16963 insn, distance,
16964 BB_END (e->src),
16965 &found_in_bb);
16966 if (found_in_bb)
16967 {
16968 if (shortest_dist < 0)
16969 shortest_dist = bb_dist;
16970 else if (bb_dist > 0)
16971 shortest_dist = MIN (bb_dist, shortest_dist);
16972
16973 found = true;
16974 }
16975 }
16976
16977 distance = shortest_dist;
16978 }
16979 }
16980
16981 /* get_attr_type may modify recog data. We want to make sure
16982 that recog data is valid for instruction INSN, on which
16983 distance_non_agu_define is called. INSN is unchanged here. */
16984 extract_insn_cached (insn);
16985
16986 if (!found)
16987 return -1;
16988
16989 return distance >> 1;
16990 }
16991
16992 /* Return the distance in half-cycles between INSN and the next
16993 insn that uses register number REGNO in a memory address, added
16994 to DISTANCE.  Return -1 if REGNO is set first.
16995
16996 Put true value into *FOUND if register usage was found and
16997 false otherwise.
16998 Put true value into *REDEFINED if register redefinition was
16999 found and false otherwise. */
17000
17001 static int
17002 distance_agu_use_in_bb (unsigned int regno,
17003 rtx insn, int distance, rtx start,
17004 bool *found, bool *redefined)
17005 {
17006 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17007 rtx next = start;
17008 rtx prev = NULL;
17009
17010 *found = false;
17011 *redefined = false;
17012
17013 while (next
17014 && next != insn
17015 && distance < LEA_SEARCH_THRESHOLD)
17016 {
17017 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17018 {
17019 distance = increase_distance (prev, next, distance);
17020 if (insn_uses_reg_mem (regno, next))
17021 {
17022 /* Return DISTANCE if REGNO is used in a memory
17023 address in NEXT. */
17024 *found = true;
17025 return distance;
17026 }
17027
17028 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17029 {
17030 /* Return -1 if REGNO is set in NEXT. */
17031 *redefined = true;
17032 return -1;
17033 }
17034
17035 prev = next;
17036 }
17037
17038 if (next == BB_END (bb))
17039 break;
17040
17041 next = NEXT_INSN (next);
17042 }
17043
17044 return distance;
17045 }
17046
17047 /* Return the distance between INSN and the next insn that uses
17048 register number REGNO0 in a memory address.  Return -1 if no such
17049 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set first. */
17050
17051 static int
17052 distance_agu_use (unsigned int regno0, rtx insn)
17053 {
17054 basic_block bb = BLOCK_FOR_INSN (insn);
17055 int distance = 0;
17056 bool found = false;
17057 bool redefined = false;
17058
17059 if (insn != BB_END (bb))
17060 distance = distance_agu_use_in_bb (regno0, insn, distance,
17061 NEXT_INSN (insn),
17062 &found, &redefined);
17063
17064 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17065 {
17066 edge e;
17067 edge_iterator ei;
17068 bool simple_loop = false;
17069
17070 FOR_EACH_EDGE (e, ei, bb->succs)
17071 if (e->dest == bb)
17072 {
17073 simple_loop = true;
17074 break;
17075 }
17076
17077 if (simple_loop)
17078 distance = distance_agu_use_in_bb (regno0, insn,
17079 distance, BB_HEAD (bb),
17080 &found, &redefined);
17081 else
17082 {
17083 int shortest_dist = -1;
17084 bool found_in_bb = false;
17085 bool redefined_in_bb = false;
17086
17087 FOR_EACH_EDGE (e, ei, bb->succs)
17088 {
17089 int bb_dist
17090 = distance_agu_use_in_bb (regno0, insn,
17091 distance, BB_HEAD (e->dest),
17092 &found_in_bb, &redefined_in_bb);
17093 if (found_in_bb)
17094 {
17095 if (shortest_dist < 0)
17096 shortest_dist = bb_dist;
17097 else if (bb_dist > 0)
17098 shortest_dist = MIN (bb_dist, shortest_dist);
17099
17100 found = true;
17101 }
17102 }
17103
17104 distance = shortest_dist;
17105 }
17106 }
17107
17108 if (!found || redefined)
17109 return -1;
17110
17111 return distance >> 1;
17112 }
17113
17114 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17115 there is a dilemma of choosing LEA or ADD.
17116 Negative value: ADD is more preferred than LEA
17117 Zero: Neutral
17118 Positive value: LEA is more preferred than ADD.  */
17119 #define IX86_LEA_PRIORITY 0
17120
17121 /* Return true if usage of lea INSN has performance advantage
17122 over a sequence of instructions.  The instruction sequence has
17123 SPLIT_COST cycles higher latency than the lea latency. */
17124
17125 static bool
17126 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17127 unsigned int regno2, int split_cost)
17128 {
17129 int dist_define, dist_use;
17130
17131 dist_define = distance_non_agu_define (regno1, regno2, insn);
17132 dist_use = distance_agu_use (regno0, insn);
17133
17134 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17135 {
17136 /* If there is no non-AGU operand definition, no AGU
17137 operand usage and the split cost is 0, then both the lea
17138 and non-lea variants have the same priority.  Currently
17139 we prefer lea for 64-bit code and non-lea for 32-bit
17140 code. */
17141 if (dist_use < 0 && split_cost == 0)
17142 return TARGET_64BIT || IX86_LEA_PRIORITY;
17143 else
17144 return true;
17145 }
17146
17147 /* The longer the definition distance, the more preferable lea is.
17148 Adjust it here to take the splitting cost and lea priority
17149 into account. */
17150 dist_define += split_cost + IX86_LEA_PRIORITY;
17151
17152 /* If there is no use in a memory address then we just check
17153 that split cost exceeds AGU stall. */
17154 if (dist_use < 0)
17155 return dist_define > LEA_MAX_STALL;
17156
17157 /* If this insn has both backward non-agu dependence and forward
17158 agu dependence, the one with the shorter distance takes effect. */
17159 return dist_define >= dist_use;
17160 }
17161
17162 /* Return true if it is legal to clobber flags by INSN and
17163 false otherwise. */
17164
17165 static bool
17166 ix86_ok_to_clobber_flags (rtx insn)
17167 {
17168 basic_block bb = BLOCK_FOR_INSN (insn);
17169 df_ref *use;
17170 bitmap live;
17171
17172 while (insn)
17173 {
17174 if (NONDEBUG_INSN_P (insn))
17175 {
17176 for (use = DF_INSN_USES (insn); *use; use++)
17177 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17178 return false;
17179
17180 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17181 return true;
17182 }
17183
17184 if (insn == BB_END (bb))
17185 break;
17186
17187 insn = NEXT_INSN (insn);
17188 }
17189
17190 live = df_get_live_out (bb);
17191 return !REGNO_REG_SET_P (live, FLAGS_REG);
17192 }
17193
17194 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17195 move and add to avoid AGU stalls. */
17196
17197 bool
17198 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17199 {
17200 unsigned int regno0, regno1, regno2;
17201
17202 /* Check if we need to optimize. */
17203 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17204 return false;
17205
17206 /* Check it is correct to split here. */
17207 if (!ix86_ok_to_clobber_flags (insn))
17208 return false;
17209
17210 regno0 = true_regnum (operands[0]);
17211 regno1 = true_regnum (operands[1]);
17212 regno2 = true_regnum (operands[2]);
17213
17214 /* We need to split only adds with a non-destructive
17215 destination operand. */
17216 if (regno0 == regno1 || regno0 == regno2)
17217 return false;
17218 else
17219 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17220 }
17221
17222 /* Return true if we should emit lea instruction instead of mov
17223 instruction. */
17224
17225 bool
17226 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17227 {
17228 unsigned int regno0, regno1;
17229
17230 /* Check if we need to optimize. */
17231 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17232 return false;
17233
17234 /* Use lea for reg to reg moves only. */
17235 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17236 return false;
17237
17238 regno0 = true_regnum (operands[0]);
17239 regno1 = true_regnum (operands[1]);
17240
17241 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17242 }
17243
17244 /* Return true if we need to split lea into a sequence of
17245 instructions to avoid AGU stalls. */
17246
17247 bool
17248 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17249 {
17250 unsigned int regno0, regno1, regno2;
17251 int split_cost;
17252 struct ix86_address parts;
17253 int ok;
17254
17255 /* Check we need to optimize. */
17256 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17257 return false;
17258
17259 /* Check it is correct to split here. */
17260 if (!ix86_ok_to_clobber_flags (insn))
17261 return false;
17262
17263 ok = ix86_decompose_address (operands[1], &parts);
17264 gcc_assert (ok);
17265
17266 /* There should be at least two components in the address. */
17267 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17268 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17269 return false;
17270
17271 /* We should not split into add if a non-legitimate PIC
17272 operand is used as the displacement. */
17273 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17274 return false;
17275
17276 regno0 = true_regnum (operands[0]);
17277 regno1 = INVALID_REGNUM;
17278 regno2 = INVALID_REGNUM;
17279
17280 if (parts.base)
17281 regno1 = true_regnum (parts.base);
17282 if (parts.index)
17283 regno2 = true_regnum (parts.index);
17284
17285 split_cost = 0;
17286
17287 /* Compute how many cycles we will add to execution time
17288 if we split the lea into a sequence of instructions. */
17289 if (parts.base || parts.index)
17290 {
17291 /* Have to use a mov instruction if the non-destructive
17292 destination form is used. */
17293 if (regno1 != regno0 && regno2 != regno0)
17294 split_cost += 1;
17295
17296 /* Have to add index to base if both exist. */
17297 if (parts.base && parts.index)
17298 split_cost += 1;
17299
17300 /* Have to use shift and adds if scale is 2 or greater. */
17301 if (parts.scale > 1)
17302 {
17303 if (regno0 != regno1)
17304 split_cost += 1;
17305 else if (regno2 == regno0)
17306 split_cost += 4;
17307 else
17308 split_cost += parts.scale;
17309 }
17310
17311 /* Have to use add instruction with immediate if
17312 disp is non-zero. */
17313 if (parts.disp && parts.disp != const0_rtx)
17314 split_cost += 1;
17315
17316 /* Subtract the price of lea. */
17317 split_cost -= 1;
17318 }
17319
17320 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17321 }
17322
17323 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17324 matches destination. RTX includes clobber of FLAGS_REG. */
17325
17326 static void
17327 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17328 rtx dst, rtx src)
17329 {
17330 rtx op, clob;
17331
17332 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17333 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17334
17335 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17336 }
17337
17338 /* Return true if the definition of REGNO1 is nearest to INSN. */
17339
17340 static bool
17341 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17342 {
17343 rtx prev = insn;
17344 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17345
17346 if (insn == start)
17347 return false;
17348 while (prev && prev != start)
17349 {
17350 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17351 {
17352 prev = PREV_INSN (prev);
17353 continue;
17354 }
17355 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17356 return true;
17357 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17358 return false;
17359 prev = PREV_INSN (prev);
17360 }
17361
17362 /* None of the regs is defined in the bb. */
17363 return false;
17364 }
17365
17366 /* Split lea instructions into a sequence of instructions
17367 which are executed on the ALU to avoid AGU stalls.
17368 It is assumed that the flags register may be clobbered
17369 at the lea's position. */
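/* For illustration only (the exact sequence depends on the operands):
   a lea such as

       lea    4(%rbx,%rcx,4), %rax

   may be replaced by an ALU sequence along the lines of

       mov    %rcx, %rax
       shl    $2, %rax
       add    %rbx, %rax
       add    $4, %rax  */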
17370
17371 void
17372 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17373 {
17374 unsigned int regno0, regno1, regno2;
17375 struct ix86_address parts;
17376 rtx target, tmp;
17377 int ok, adds;
17378
17379 ok = ix86_decompose_address (operands[1], &parts);
17380 gcc_assert (ok);
17381
17382 target = gen_lowpart (mode, operands[0]);
17383
17384 regno0 = true_regnum (target);
17385 regno1 = INVALID_REGNUM;
17386 regno2 = INVALID_REGNUM;
17387
17388 if (parts.base)
17389 {
17390 parts.base = gen_lowpart (mode, parts.base);
17391 regno1 = true_regnum (parts.base);
17392 }
17393
17394 if (parts.index)
17395 {
17396 parts.index = gen_lowpart (mode, parts.index);
17397 regno2 = true_regnum (parts.index);
17398 }
17399
17400 if (parts.disp)
17401 parts.disp = gen_lowpart (mode, parts.disp);
17402
17403 if (parts.scale > 1)
17404 {
17405 /* Case r1 = r1 + ... */
17406 if (regno1 == regno0)
17407 {
17408 /* If we have a case r1 = r1 + C * r1 then we
17409 would have to use multiplication, which is very
17410 expensive.  Assume the cost model is wrong if we
17411 have such a case here. */
17412 gcc_assert (regno2 != regno0);
17413
17414 for (adds = parts.scale; adds > 0; adds--)
17415 ix86_emit_binop (PLUS, mode, target, parts.index);
17416 }
17417 else
17418 {
17419 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17420 if (regno0 != regno2)
17421 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17422
17423 /* Use shift for scaling. */
17424 ix86_emit_binop (ASHIFT, mode, target,
17425 GEN_INT (exact_log2 (parts.scale)));
17426
17427 if (parts.base)
17428 ix86_emit_binop (PLUS, mode, target, parts.base);
17429
17430 if (parts.disp && parts.disp != const0_rtx)
17431 ix86_emit_binop (PLUS, mode, target, parts.disp);
17432 }
17433 }
17434 else if (!parts.base && !parts.index)
17435 {
17436 gcc_assert (parts.disp);
17437 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17438 }
17439 else
17440 {
17441 if (!parts.base)
17442 {
17443 if (regno0 != regno2)
17444 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17445 }
17446 else if (!parts.index)
17447 {
17448 if (regno0 != regno1)
17449 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17450 }
17451 else
17452 {
17453 if (regno0 == regno1)
17454 tmp = parts.index;
17455 else if (regno0 == regno2)
17456 tmp = parts.base;
17457 else
17458 {
17459 rtx tmp1;
17460
17461 /* Find better operand for SET instruction, depending
17462 on which definition is farther from the insn. */
17463 if (find_nearest_reg_def (insn, regno1, regno2))
17464 tmp = parts.index, tmp1 = parts.base;
17465 else
17466 tmp = parts.base, tmp1 = parts.index;
17467
17468 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17469
17470 if (parts.disp && parts.disp != const0_rtx)
17471 ix86_emit_binop (PLUS, mode, target, parts.disp);
17472
17473 ix86_emit_binop (PLUS, mode, target, tmp1);
17474 return;
17475 }
17476
17477 ix86_emit_binop (PLUS, mode, target, tmp);
17478 }
17479
17480 if (parts.disp && parts.disp != const0_rtx)
17481 ix86_emit_binop (PLUS, mode, target, parts.disp);
17482 }
17483 }
17484
17485 /* Return true if it is ok to optimize an ADD operation to LEA
17486 operation to avoid flag register consumption.  For most processors,
17487 ADD is faster than LEA.  For processors like Atom, if the
17488 destination register of LEA holds an actual address which will be
17489 used soon, LEA is better; otherwise ADD is better. */
17490
17491 bool
17492 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17493 {
17494 unsigned int regno0 = true_regnum (operands[0]);
17495 unsigned int regno1 = true_regnum (operands[1]);
17496 unsigned int regno2 = true_regnum (operands[2]);
17497
17498 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17499 if (regno0 != regno1 && regno0 != regno2)
17500 return true;
17501
17502 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17503 return false;
17504
17505 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17506 }
17507
17508 /* Return true if destination reg of SET_BODY is shift count of
17509 USE_BODY. */
17510
17511 static bool
17512 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17513 {
17514 rtx set_dest;
17515 rtx shift_rtx;
17516 int i;
17517
17518 /* Retrieve destination of SET_BODY. */
17519 switch (GET_CODE (set_body))
17520 {
17521 case SET:
17522 set_dest = SET_DEST (set_body);
17523 if (!set_dest || !REG_P (set_dest))
17524 return false;
17525 break;
17526 case PARALLEL:
17527 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17528 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17529 use_body))
17530 return true;
17531 default:
17532 return false;
17533 break;
17534 }
17535
17536 /* Retrieve shift count of USE_BODY. */
17537 switch (GET_CODE (use_body))
17538 {
17539 case SET:
17540 shift_rtx = XEXP (use_body, 1);
17541 break;
17542 case PARALLEL:
17543 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17544 if (ix86_dep_by_shift_count_body (set_body,
17545 XVECEXP (use_body, 0, i)))
17546 return true;
17547 default:
17548 return false;
17549 break;
17550 }
17551
17552 if (shift_rtx
17553 && (GET_CODE (shift_rtx) == ASHIFT
17554 || GET_CODE (shift_rtx) == LSHIFTRT
17555 || GET_CODE (shift_rtx) == ASHIFTRT
17556 || GET_CODE (shift_rtx) == ROTATE
17557 || GET_CODE (shift_rtx) == ROTATERT))
17558 {
17559 rtx shift_count = XEXP (shift_rtx, 1);
17560
17561 /* Return true if shift count is dest of SET_BODY. */
17562 if (REG_P (shift_count))
17563 {
17564 /* Add this check since this can be invoked before register
17565 allocation by the pre-reload scheduler. */
17566 if (reload_completed
17567 && true_regnum (set_dest) == true_regnum (shift_count))
17568 return true;
17569 else if (REGNO (set_dest) == REGNO (shift_count))
17570 return true;
17571 }
17572 }
17573
17574 return false;
17575 }
17576
17577 /* Return true if destination reg of SET_INSN is shift count of
17578 USE_INSN. */
17579
17580 bool
17581 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17582 {
17583 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17584 PATTERN (use_insn));
17585 }
17586
17587 /* Return TRUE or FALSE depending on whether the unary operator meets the
17588 appropriate constraints. */
17589
17590 bool
17591 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17592 enum machine_mode mode ATTRIBUTE_UNUSED,
17593 rtx operands[2] ATTRIBUTE_UNUSED)
17594 {
17595 /* If one of the operands is memory, source and destination must match. */
17596 if ((MEM_P (operands[0])
17597 || MEM_P (operands[1]))
17598 && ! rtx_equal_p (operands[0], operands[1]))
17599 return false;
17600 return true;
17601 }
17602
17603 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17604 are ok, keeping in mind the possible movddup alternative. */
17605
17606 bool
17607 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17608 {
17609 if (MEM_P (operands[0]))
17610 return rtx_equal_p (operands[0], operands[1 + high]);
17611 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17612 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17613 return true;
17614 }
17615
17616 /* Post-reload splitter for converting an SF or DFmode value in an
17617 SSE register into an unsigned SImode. */
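/* Informally: if the value is below 0x1p31 it is converted directly
   (the subtracted amount is zero); otherwise 0x1p31 is subtracted
   before the signed conversion and bit 31 of the integer result is
   flipped afterwards.  E.g. 3e9 -> 3e9 - 0x1p31 = 852516352.0
   -> (int) 852516352 -> 852516352 ^ 0x80000000 == 3000000000.  */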
17618
17619 void
17620 ix86_split_convert_uns_si_sse (rtx operands[])
17621 {
17622 enum machine_mode vecmode;
17623 rtx value, large, zero_or_two31, input, two31, x;
17624
17625 large = operands[1];
17626 zero_or_two31 = operands[2];
17627 input = operands[3];
17628 two31 = operands[4];
17629 vecmode = GET_MODE (large);
17630 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17631
17632 /* Load up the value into the low element. We must ensure that the other
17633 elements are valid floats -- zero is the easiest such value. */
17634 if (MEM_P (input))
17635 {
17636 if (vecmode == V4SFmode)
17637 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17638 else
17639 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17640 }
17641 else
17642 {
17643 input = gen_rtx_REG (vecmode, REGNO (input));
17644 emit_move_insn (value, CONST0_RTX (vecmode));
17645 if (vecmode == V4SFmode)
17646 emit_insn (gen_sse_movss (value, value, input));
17647 else
17648 emit_insn (gen_sse2_movsd (value, value, input));
17649 }
17650
17651 emit_move_insn (large, two31);
17652 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17653
17654 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17655 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17656
17657 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17658 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17659
17660 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17661 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17662
17663 large = gen_rtx_REG (V4SImode, REGNO (large));
17664 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17665
17666 x = gen_rtx_REG (V4SImode, REGNO (value));
17667 if (vecmode == V4SFmode)
17668 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17669 else
17670 emit_insn (gen_sse2_cvttpd2dq (x, value));
17671 value = x;
17672
17673 emit_insn (gen_xorv4si3 (value, value, large));
17674 }
17675
17676 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17677 Expects the 64-bit DImode to be supplied in a pair of integral
17678 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17679 -mfpmath=sse, !optimize_size only. */
17680
17681 void
17682 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17683 {
17684 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17685 rtx int_xmm, fp_xmm;
17686 rtx biases, exponents;
17687 rtx x;
17688
17689 int_xmm = gen_reg_rtx (V4SImode);
17690 if (TARGET_INTER_UNIT_MOVES)
17691 emit_insn (gen_movdi_to_sse (int_xmm, input));
17692 else if (TARGET_SSE_SPLIT_REGS)
17693 {
17694 emit_clobber (int_xmm);
17695 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17696 }
17697 else
17698 {
17699 x = gen_reg_rtx (V2DImode);
17700 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17701 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17702 }
17703
17704 x = gen_rtx_CONST_VECTOR (V4SImode,
17705 gen_rtvec (4, GEN_INT (0x43300000UL),
17706 GEN_INT (0x45300000UL),
17707 const0_rtx, const0_rtx));
17708 exponents = validize_mem (force_const_mem (V4SImode, x));
17709
17710 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17711 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17712
17713 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17714 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17715 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17716 (0x1.0p84 + double(fp_value_hi_xmm)).
17717 Note these exponents differ by 32. */
17718
17719 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17720
17721 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17722 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17723 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17724 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17725 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17726 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17727 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17728 biases = validize_mem (force_const_mem (V2DFmode, biases));
17729 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17730
17731 /* Add the upper and lower DFmode values together. */
17732 if (TARGET_SSE3)
17733 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17734 else
17735 {
17736 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17737 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17738 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17739 }
17740
17741 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17742 }
17743
17744 /* Not used, but eases macroization of patterns. */
17745 void
17746 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17747 rtx input ATTRIBUTE_UNUSED)
17748 {
17749 gcc_unreachable ();
17750 }
17751
17752 /* Convert an unsigned SImode value into a DFmode. Only currently used
17753 for SSE, but applicable anywhere. */
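/* Worked example (informal): for INPUT == 0xffffffff the PLUS with
   -2147483648 wraps to 0x7fffffff == 2147483647, the signed conversion
   gives 2147483647.0, and adding 0x1p31 back yields 4294967295.0.  */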
17754
17755 void
17756 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17757 {
17758 REAL_VALUE_TYPE TWO31r;
17759 rtx x, fp;
17760
17761 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17762 NULL, 1, OPTAB_DIRECT);
17763
17764 fp = gen_reg_rtx (DFmode);
17765 emit_insn (gen_floatsidf2 (fp, x));
17766
17767 real_ldexp (&TWO31r, &dconst1, 31);
17768 x = const_double_from_real_value (TWO31r, DFmode);
17769
17770 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17771 if (x != target)
17772 emit_move_insn (target, x);
17773 }
17774
17775 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17776 32-bit mode; otherwise we have a direct convert instruction. */
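/* In effect this computes (double) (signed) hi * 0x1p32
   + (double) (unsigned) lo, with the second term produced by
   ix86_expand_convert_uns_sidf_sse above.  */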
17777
17778 void
17779 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17780 {
17781 REAL_VALUE_TYPE TWO32r;
17782 rtx fp_lo, fp_hi, x;
17783
17784 fp_lo = gen_reg_rtx (DFmode);
17785 fp_hi = gen_reg_rtx (DFmode);
17786
17787 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17788
17789 real_ldexp (&TWO32r, &dconst1, 32);
17790 x = const_double_from_real_value (TWO32r, DFmode);
17791 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17792
17793 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17794
17795 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17796 0, OPTAB_DIRECT);
17797 if (x != target)
17798 emit_move_insn (target, x);
17799 }
17800
17801 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17802 For x86_32, -mfpmath=sse, !optimize_size only. */
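/* In effect this computes (float) (input >> 16) * 0x1p16
   + (float) (input & 0xffff); each 16-bit chunk converts exactly,
   so only the final addition can round.  */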
17803 void
17804 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17805 {
17806 REAL_VALUE_TYPE ONE16r;
17807 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17808
17809 real_ldexp (&ONE16r, &dconst1, 16);
17810 x = const_double_from_real_value (ONE16r, SFmode);
17811 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17812 NULL, 0, OPTAB_DIRECT);
17813 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17814 NULL, 0, OPTAB_DIRECT);
17815 fp_hi = gen_reg_rtx (SFmode);
17816 fp_lo = gen_reg_rtx (SFmode);
17817 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17818 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17819 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17820 0, OPTAB_DIRECT);
17821 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17822 0, OPTAB_DIRECT);
17823 if (!rtx_equal_p (target, fp_hi))
17824 emit_move_insn (target, fp_hi);
17825 }
17826
17827 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17828 a vector of unsigned ints VAL to vector of floats TARGET. */
17829
17830 void
17831 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17832 {
17833 rtx tmp[8];
17834 REAL_VALUE_TYPE TWO16r;
17835 enum machine_mode intmode = GET_MODE (val);
17836 enum machine_mode fltmode = GET_MODE (target);
17837 rtx (*cvt) (rtx, rtx);
17838
17839 if (intmode == V4SImode)
17840 cvt = gen_floatv4siv4sf2;
17841 else
17842 cvt = gen_floatv8siv8sf2;
17843 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17844 tmp[0] = force_reg (intmode, tmp[0]);
17845 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17846 OPTAB_DIRECT);
17847 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17848 NULL_RTX, 1, OPTAB_DIRECT);
17849 tmp[3] = gen_reg_rtx (fltmode);
17850 emit_insn (cvt (tmp[3], tmp[1]));
17851 tmp[4] = gen_reg_rtx (fltmode);
17852 emit_insn (cvt (tmp[4], tmp[2]));
17853 real_ldexp (&TWO16r, &dconst1, 16);
17854 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17855 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17856 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17857 OPTAB_DIRECT);
17858 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17859 OPTAB_DIRECT);
17860 if (tmp[7] != target)
17861 emit_move_insn (target, tmp[7]);
17862 }
17863
17864 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17865 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17866 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17867 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17868
17869 rtx
17870 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17871 {
17872 REAL_VALUE_TYPE TWO31r;
17873 rtx two31r, tmp[4];
17874 enum machine_mode mode = GET_MODE (val);
17875 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17876 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17877 rtx (*cmp) (rtx, rtx, rtx, rtx);
17878 int i;
17879
17880 for (i = 0; i < 3; i++)
17881 tmp[i] = gen_reg_rtx (mode);
17882 real_ldexp (&TWO31r, &dconst1, 31);
17883 two31r = const_double_from_real_value (TWO31r, scalarmode);
17884 two31r = ix86_build_const_vector (mode, 1, two31r);
17885 two31r = force_reg (mode, two31r);
17886 switch (mode)
17887 {
17888 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17889 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17890 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17891 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17892 default: gcc_unreachable ();
17893 }
17894 tmp[3] = gen_rtx_LE (mode, two31r, val);
17895 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17896 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17897 0, OPTAB_DIRECT);
17898 if (intmode == V4SImode || TARGET_AVX2)
17899 *xorp = expand_simple_binop (intmode, ASHIFT,
17900 gen_lowpart (intmode, tmp[0]),
17901 GEN_INT (31), NULL_RTX, 0,
17902 OPTAB_DIRECT);
17903 else
17904 {
17905 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17906 two31 = ix86_build_const_vector (intmode, 1, two31);
17907 *xorp = expand_simple_binop (intmode, AND,
17908 gen_lowpart (intmode, tmp[0]),
17909 two31, NULL_RTX, 0,
17910 OPTAB_DIRECT);
17911 }
17912 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17913 0, OPTAB_DIRECT);
17914 }
17915
17916 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17917 then replicate the value for all elements of the vector
17918 register. */
17919
17920 rtx
17921 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17922 {
17923 int i, n_elt;
17924 rtvec v;
17925 enum machine_mode scalar_mode;
17926
17927 switch (mode)
17928 {
17929 case V32QImode:
17930 case V16QImode:
17931 case V16HImode:
17932 case V8HImode:
17933 case V8SImode:
17934 case V4SImode:
17935 case V4DImode:
17936 case V2DImode:
17937 gcc_assert (vect);
17938 case V8SFmode:
17939 case V4SFmode:
17940 case V4DFmode:
17941 case V2DFmode:
17942 n_elt = GET_MODE_NUNITS (mode);
17943 v = rtvec_alloc (n_elt);
17944 scalar_mode = GET_MODE_INNER (mode);
17945
17946 RTVEC_ELT (v, 0) = value;
17947
17948 for (i = 1; i < n_elt; ++i)
17949 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17950
17951 return gen_rtx_CONST_VECTOR (mode, v);
17952
17953 default:
17954 gcc_unreachable ();
17955 }
17956 }
17957
17958 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17959 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17960 for an SSE register. If VECT is true, then replicate the mask for
17961 all elements of the vector register. If INVERT is true, then create
17962 a mask excluding the sign bit. */
17963
17964 rtx
17965 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17966 {
17967 enum machine_mode vec_mode, imode;
17968 HOST_WIDE_INT hi, lo;
17969 int shift = 63;
17970 rtx v;
17971 rtx mask;
17972
17973 /* Find the sign bit, sign extended to 2*HWI. */
17974 switch (mode)
17975 {
17976 case V8SImode:
17977 case V4SImode:
17978 case V8SFmode:
17979 case V4SFmode:
17980 vec_mode = mode;
17981 mode = GET_MODE_INNER (mode);
17982 imode = SImode;
17983 lo = 0x80000000, hi = lo < 0;
17984 break;
17985
17986 case V4DImode:
17987 case V2DImode:
17988 case V4DFmode:
17989 case V2DFmode:
17990 vec_mode = mode;
17991 mode = GET_MODE_INNER (mode);
17992 imode = DImode;
17993 if (HOST_BITS_PER_WIDE_INT >= 64)
17994 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17995 else
17996 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17997 break;
17998
17999 case TImode:
18000 case TFmode:
18001 vec_mode = VOIDmode;
18002 if (HOST_BITS_PER_WIDE_INT >= 64)
18003 {
18004 imode = TImode;
18005 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18006 }
18007 else
18008 {
18009 rtvec vec;
18010
18011 imode = DImode;
18012 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18013
18014 if (invert)
18015 {
18016 lo = ~lo, hi = ~hi;
18017 v = constm1_rtx;
18018 }
18019 else
18020 v = const0_rtx;
18021
18022 mask = immed_double_const (lo, hi, imode);
18023
18024 vec = gen_rtvec (2, v, mask);
18025 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18026 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18027
18028 return v;
18029 }
18030 break;
18031
18032 default:
18033 gcc_unreachable ();
18034 }
18035
18036 if (invert)
18037 lo = ~lo, hi = ~hi;
18038
18039 /* Force this value into the low part of a fp vector constant. */
18040 mask = immed_double_const (lo, hi, imode);
18041 mask = gen_lowpart (mode, mask);
18042
18043 if (vec_mode == VOIDmode)
18044 return force_reg (mode, mask);
18045
18046 v = ix86_build_const_vector (vec_mode, vect, mask);
18047 return force_reg (vec_mode, v);
18048 }
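
/* Illustrative note: for V4SFmode the mask built above is the vector
   {0x80000000, ...} viewed as floats, and with INVERT it is
   {0x7fffffff, ...}.  For example:

     rtx sign_mask = ix86_build_signbit_mask (V4SFmode, true, false);
     rtx abs_mask  = ix86_build_signbit_mask (V4SFmode, true, true);  */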
18049
18050 /* Generate code for floating point ABS or NEG. */
18051
18052 void
18053 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18054 rtx operands[])
18055 {
18056 rtx mask, set, dst, src;
18057 bool use_sse = false;
18058 bool vector_mode = VECTOR_MODE_P (mode);
18059 enum machine_mode vmode = mode;
18060
18061 if (vector_mode)
18062 use_sse = true;
18063 else if (mode == TFmode)
18064 use_sse = true;
18065 else if (TARGET_SSE_MATH)
18066 {
18067 use_sse = SSE_FLOAT_MODE_P (mode);
18068 if (mode == SFmode)
18069 vmode = V4SFmode;
18070 else if (mode == DFmode)
18071 vmode = V2DFmode;
18072 }
18073
18074 /* NEG and ABS performed with SSE use bitwise mask operations.
18075 Create the appropriate mask now. */
18076 if (use_sse)
18077 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18078 else
18079 mask = NULL_RTX;
18080
18081 dst = operands[0];
18082 src = operands[1];
18083
18084 set = gen_rtx_fmt_e (code, mode, src);
18085 set = gen_rtx_SET (VOIDmode, dst, set);
18086
18087 if (mask)
18088 {
18089 rtx use, clob;
18090 rtvec par;
18091
18092 use = gen_rtx_USE (VOIDmode, mask);
18093 if (vector_mode)
18094 par = gen_rtvec (2, set, use);
18095 else
18096 {
18097 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18098 par = gen_rtvec (3, set, use, clob);
18099 }
18100 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18101 }
18102 else
18103 emit_insn (set);
18104 }
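
/* Rough sketch of what the SSE form emitted above eventually splits into:

     NEG:  dest = src ^ mask   (mask has only the sign bits set; xorps/xorpd)
     ABS:  dest = src & mask   (mask has the sign bits clear;    andps/andpd)

   The x87 path (no mask) simply uses the fchs/fabs patterns.  */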
18105
18106 /* Expand a copysign operation. Special case operand 0 being a constant. */
18107
18108 void
18109 ix86_expand_copysign (rtx operands[])
18110 {
18111 enum machine_mode mode, vmode;
18112 rtx dest, op0, op1, mask, nmask;
18113
18114 dest = operands[0];
18115 op0 = operands[1];
18116 op1 = operands[2];
18117
18118 mode = GET_MODE (dest);
18119
18120 if (mode == SFmode)
18121 vmode = V4SFmode;
18122 else if (mode == DFmode)
18123 vmode = V2DFmode;
18124 else
18125 vmode = mode;
18126
18127 if (GET_CODE (op0) == CONST_DOUBLE)
18128 {
18129 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18130
18131 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18132 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18133
18134 if (mode == SFmode || mode == DFmode)
18135 {
18136 if (op0 == CONST0_RTX (mode))
18137 op0 = CONST0_RTX (vmode);
18138 else
18139 {
18140 rtx v = ix86_build_const_vector (vmode, false, op0);
18141
18142 op0 = force_reg (vmode, v);
18143 }
18144 }
18145 else if (op0 != CONST0_RTX (mode))
18146 op0 = force_reg (mode, op0);
18147
18148 mask = ix86_build_signbit_mask (vmode, 0, 0);
18149
18150 if (mode == SFmode)
18151 copysign_insn = gen_copysignsf3_const;
18152 else if (mode == DFmode)
18153 copysign_insn = gen_copysigndf3_const;
18154 else
18155 copysign_insn = gen_copysigntf3_const;
18156
18157 emit_insn (copysign_insn (dest, op0, op1, mask));
18158 }
18159 else
18160 {
18161 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18162
18163 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18164 mask = ix86_build_signbit_mask (vmode, 0, 0);
18165
18166 if (mode == SFmode)
18167 copysign_insn = gen_copysignsf3_var;
18168 else if (mode == DFmode)
18169 copysign_insn = gen_copysigndf3_var;
18170 else
18171 copysign_insn = gen_copysigntf3_var;
18172
18173 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18174 }
18175 }
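
/* Informal note: both paths above implement the usual bit-mask identity

     copysign (x, y) = (x & ~sign_mask) | (y & sign_mask)

   with x = operands[1] and y = operands[2].  In the constant case the
   (x & ~sign_mask) term is just the absolute value computed above, so the
   splitter below only needs an AND and an IOR.  */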
18176
18177 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18178 be a constant, and so has already been expanded into a vector constant. */
18179
18180 void
18181 ix86_split_copysign_const (rtx operands[])
18182 {
18183 enum machine_mode mode, vmode;
18184 rtx dest, op0, mask, x;
18185
18186 dest = operands[0];
18187 op0 = operands[1];
18188 mask = operands[3];
18189
18190 mode = GET_MODE (dest);
18191 vmode = GET_MODE (mask);
18192
18193 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18194 x = gen_rtx_AND (vmode, dest, mask);
18195 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18196
18197 if (op0 != CONST0_RTX (vmode))
18198 {
18199 x = gen_rtx_IOR (vmode, dest, op0);
18200 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18201 }
18202 }
18203
18204 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18205 so we have to do two masks. */
18206
18207 void
18208 ix86_split_copysign_var (rtx operands[])
18209 {
18210 enum machine_mode mode, vmode;
18211 rtx dest, scratch, op0, op1, mask, nmask, x;
18212
18213 dest = operands[0];
18214 scratch = operands[1];
18215 op0 = operands[2];
18216 op1 = operands[3];
18217 nmask = operands[4];
18218 mask = operands[5];
18219
18220 mode = GET_MODE (dest);
18221 vmode = GET_MODE (mask);
18222
18223 if (rtx_equal_p (op0, op1))
18224 {
18225 /* Shouldn't happen often (it's useless, obviously), but when it does
18226 we'd generate incorrect code if we continue below. */
18227 emit_move_insn (dest, op0);
18228 return;
18229 }
18230
18231 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18232 {
18233 gcc_assert (REGNO (op1) == REGNO (scratch));
18234
18235 x = gen_rtx_AND (vmode, scratch, mask);
18236 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18237
18238 dest = mask;
18239 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18240 x = gen_rtx_NOT (vmode, dest);
18241 x = gen_rtx_AND (vmode, x, op0);
18242 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18243 }
18244 else
18245 {
18246 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18247 {
18248 x = gen_rtx_AND (vmode, scratch, mask);
18249 }
18250 else /* alternative 2,4 */
18251 {
18252 gcc_assert (REGNO (mask) == REGNO (scratch));
18253 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18254 x = gen_rtx_AND (vmode, scratch, op1);
18255 }
18256 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18257
18258 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18259 {
18260 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18261 x = gen_rtx_AND (vmode, dest, nmask);
18262 }
18263 else /* alternative 3,4 */
18264 {
18265 gcc_assert (REGNO (nmask) == REGNO (dest));
18266 dest = nmask;
18267 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18268 x = gen_rtx_AND (vmode, dest, op0);
18269 }
18270 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18271 }
18272
18273 x = gen_rtx_IOR (vmode, dest, scratch);
18274 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18275 }
18276
18277 /* Return TRUE or FALSE depending on whether the first SET in INSN
18278 has source and destination with matching CC modes and whether the
18279 CC mode is at least as constrained as REQ_MODE. */
18280
18281 bool
18282 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18283 {
18284 rtx set;
18285 enum machine_mode set_mode;
18286
18287 set = PATTERN (insn);
18288 if (GET_CODE (set) == PARALLEL)
18289 set = XVECEXP (set, 0, 0);
18290 gcc_assert (GET_CODE (set) == SET);
18291 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18292
18293 set_mode = GET_MODE (SET_DEST (set));
18294 switch (set_mode)
18295 {
18296 case CCNOmode:
18297 if (req_mode != CCNOmode
18298 && (req_mode != CCmode
18299 || XEXP (SET_SRC (set), 1) != const0_rtx))
18300 return false;
18301 break;
18302 case CCmode:
18303 if (req_mode == CCGCmode)
18304 return false;
18305 /* FALLTHRU */
18306 case CCGCmode:
18307 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18308 return false;
18309 /* FALLTHRU */
18310 case CCGOCmode:
18311 if (req_mode == CCZmode)
18312 return false;
18313 /* FALLTHRU */
18314 case CCZmode:
18315 break;
18316
18317 case CCAmode:
18318 case CCCmode:
18319 case CCOmode:
18320 case CCSmode:
18321 if (set_mode != req_mode)
18322 return false;
18323 break;
18324
18325 default:
18326 gcc_unreachable ();
18327 }
18328
18329 return GET_MODE (SET_SRC (set)) == set_mode;
18330 }
18331
18332 /* Generate insn patterns to do an integer compare of OPERANDS. */
18333
18334 static rtx
18335 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18336 {
18337 enum machine_mode cmpmode;
18338 rtx tmp, flags;
18339
18340 cmpmode = SELECT_CC_MODE (code, op0, op1);
18341 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18342
18343 /* This is very simple, but making the interface the same as in the
18344 FP case makes the rest of the code easier. */
18345 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18346 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18347
18348 /* Return the test that should be put into the flags user, i.e.
18349 the bcc, scc, or cmov instruction. */
18350 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18351 }
18352
18353 /* Figure out whether to use ordered or unordered fp comparisons.
18354 Return the appropriate mode to use. */
18355
18356 enum machine_mode
18357 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18358 {
18359 /* ??? In order to make all comparisons reversible, we do all comparisons
18360 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18361 between all forms of trapping and nontrapping comparisons, we can make
18362 inequality comparisons trapping again, since that results in better code
18363 when using FCOM based compares. */
18364 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18365 }
18366
18367 enum machine_mode
18368 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18369 {
18370 enum machine_mode mode = GET_MODE (op0);
18371
18372 if (SCALAR_FLOAT_MODE_P (mode))
18373 {
18374 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18375 return ix86_fp_compare_mode (code);
18376 }
18377
18378 switch (code)
18379 {
18380 /* Only zero flag is needed. */
18381 case EQ: /* ZF=0 */
18382 case NE: /* ZF!=0 */
18383 return CCZmode;
18384 /* Codes needing carry flag. */
18385 case GEU: /* CF=0 */
18386 case LTU: /* CF=1 */
18387 /* Detect overflow checks. They need just the carry flag. */
18388 if (GET_CODE (op0) == PLUS
18389 && rtx_equal_p (op1, XEXP (op0, 0)))
18390 return CCCmode;
18391 else
18392 return CCmode;
18393 case GTU: /* CF=0 & ZF=0 */
18394 case LEU: /* CF=1 | ZF=1 */
18395 /* Detect overflow checks. They need just the carry flag. */
18396 if (GET_CODE (op0) == MINUS
18397 && rtx_equal_p (op1, XEXP (op0, 0)))
18398 return CCCmode;
18399 else
18400 return CCmode;
18401 /* Codes possibly doable only with sign flag when
18402 comparing against zero. */
18403 case GE: /* SF=OF or SF=0 */
18404 case LT: /* SF<>OF or SF=1 */
18405 if (op1 == const0_rtx)
18406 return CCGOCmode;
18407 else
18408 /* For other cases Carry flag is not required. */
18409 return CCGCmode;
18410 /* Codes doable only with sign flag when comparing
18411 against zero, but we miss jump instruction for it
18412 so we need to use relational tests against overflow
18413 that thus needs to be zero. */
18414 case GT: /* ZF=0 & SF=OF */
18415 case LE: /* ZF=1 | SF<>OF */
18416 if (op1 == const0_rtx)
18417 return CCNOmode;
18418 else
18419 return CCGCmode;
18420 /* The strcmp pattern does (use flags), and combine may ask us for the
18421 proper mode. */
18422 case USE:
18423 return CCmode;
18424 default:
18425 gcc_unreachable ();
18426 }
18427 }
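
/* A few example mappings, for reference: ix86_cc_mode (EQ, a, b) returns
   CCZmode (only ZF is inspected), ix86_cc_mode (LTU, a, b) returns CCmode,
   and the overflow-check idiom ix86_cc_mode (LTU, (plus a b), a) returns
   CCCmode, since only the carry flag matters there.  */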
18428
18429 /* Return the fixed registers used for condition codes. */
18430
18431 static bool
18432 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18433 {
18434 *p1 = FLAGS_REG;
18435 *p2 = FPSR_REG;
18436 return true;
18437 }
18438
18439 /* If two condition code modes are compatible, return a condition code
18440 mode which is compatible with both. Otherwise, return
18441 VOIDmode. */
18442
18443 static enum machine_mode
18444 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18445 {
18446 if (m1 == m2)
18447 return m1;
18448
18449 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18450 return VOIDmode;
18451
18452 if ((m1 == CCGCmode && m2 == CCGOCmode)
18453 || (m1 == CCGOCmode && m2 == CCGCmode))
18454 return CCGCmode;
18455
18456 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18457 return m2;
18458 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18459 return m1;
18460
18461 switch (m1)
18462 {
18463 default:
18464 gcc_unreachable ();
18465
18466 case CCmode:
18467 case CCGCmode:
18468 case CCGOCmode:
18469 case CCNOmode:
18470 case CCAmode:
18471 case CCCmode:
18472 case CCOmode:
18473 case CCSmode:
18474 case CCZmode:
18475 switch (m2)
18476 {
18477 default:
18478 return VOIDmode;
18479
18480 case CCmode:
18481 case CCGCmode:
18482 case CCGOCmode:
18483 case CCNOmode:
18484 case CCAmode:
18485 case CCCmode:
18486 case CCOmode:
18487 case CCSmode:
18488 case CCZmode:
18489 return CCmode;
18490 }
18491
18492 case CCFPmode:
18493 case CCFPUmode:
18494 /* These are only compatible with themselves, which we already
18495 checked above. */
18496 return VOIDmode;
18497 }
18498 }
18499
18500
18501 /* Return a comparison code we can do that is equivalent to
18502 swap_condition (code), except possibly for orderedness.
18503 Never change orderedness if TARGET_IEEE_FP; return
18504 UNKNOWN in that case if necessary. */
18505
18506 static enum rtx_code
18507 ix86_fp_swap_condition (enum rtx_code code)
18508 {
18509 switch (code)
18510 {
18511 case GT: /* GTU - CF=0 & ZF=0 */
18512 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18513 case GE: /* GEU - CF=0 */
18514 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18515 case UNLT: /* LTU - CF=1 */
18516 return TARGET_IEEE_FP ? UNKNOWN : GT;
18517 case UNLE: /* LEU - CF=1 | ZF=1 */
18518 return TARGET_IEEE_FP ? UNKNOWN : GE;
18519 default:
18520 return swap_condition (code);
18521 }
18522 }
18523
18524 /* Return the cost of comparison CODE using the best strategy for performance.
18525 All of the following functions use the number of instructions as the cost metric.
18526 In the future this should be tweaked to compute bytes for optimize_size and to
18527 take into account the performance of various instructions on various CPUs. */
18528
18529 static int
18530 ix86_fp_comparison_cost (enum rtx_code code)
18531 {
18532 int arith_cost;
18533
18534 /* The cost of code using bit-twiddling on %ah. */
18535 switch (code)
18536 {
18537 case UNLE:
18538 case UNLT:
18539 case LTGT:
18540 case GT:
18541 case GE:
18542 case UNORDERED:
18543 case ORDERED:
18544 case UNEQ:
18545 arith_cost = 4;
18546 break;
18547 case LT:
18548 case NE:
18549 case EQ:
18550 case UNGE:
18551 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18552 break;
18553 case LE:
18554 case UNGT:
18555 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18556 break;
18557 default:
18558 gcc_unreachable ();
18559 }
18560
18561 switch (ix86_fp_comparison_strategy (code))
18562 {
18563 case IX86_FPCMP_COMI:
18564 return arith_cost > 4 ? 3 : 2;
18565 case IX86_FPCMP_SAHF:
18566 return arith_cost > 4 ? 4 : 3;
18567 default:
18568 return arith_cost;
18569 }
18570 }
18571
18572 /* Return the strategy to use for floating-point comparisons. We assume that
18573 fcomi is always preferable where available, since that is also true when looking
18574 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18575
18576 enum ix86_fpcmp_strategy
18577 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18578 {
18579 /* Do fcomi/sahf based test when profitable. */
18580
18581 if (TARGET_CMOVE)
18582 return IX86_FPCMP_COMI;
18583
18584 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18585 return IX86_FPCMP_SAHF;
18586
18587 return IX86_FPCMP_ARITH;
18588 }
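
/* For reference: IX86_FPCMP_COMI compares straight into the flags register
   (fcomi/fucomi), IX86_FPCMP_SAHF stores the FPU status word with fnstsw %ax
   and copies AH into the flags with sahf, and IX86_FPCMP_ARITH falls back to
   fnstsw plus test/and bit twiddling on AH (see ix86_expand_fp_compare
   below).  */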
18589
18590 /* Swap, force into registers, or otherwise massage the two operands
18591 to a fp comparison. The operands are updated in place; the new
18592 comparison code is returned. */
18593
18594 static enum rtx_code
18595 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18596 {
18597 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18598 rtx op0 = *pop0, op1 = *pop1;
18599 enum machine_mode op_mode = GET_MODE (op0);
18600 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18601
18602 /* All of the unordered compare instructions only work on registers.
18603 The same is true of the fcomi compare instructions. The XFmode
18604 compare instructions require registers except when comparing
18605 against zero or when converting operand 1 from fixed point to
18606 floating point. */
18607
18608 if (!is_sse
18609 && (fpcmp_mode == CCFPUmode
18610 || (op_mode == XFmode
18611 && ! (standard_80387_constant_p (op0) == 1
18612 || standard_80387_constant_p (op1) == 1)
18613 && GET_CODE (op1) != FLOAT)
18614 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18615 {
18616 op0 = force_reg (op_mode, op0);
18617 op1 = force_reg (op_mode, op1);
18618 }
18619 else
18620 {
18621 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18622 things around if they appear profitable, otherwise force op0
18623 into a register. */
18624
18625 if (standard_80387_constant_p (op0) == 0
18626 || (MEM_P (op0)
18627 && ! (standard_80387_constant_p (op1) == 0
18628 || MEM_P (op1))))
18629 {
18630 enum rtx_code new_code = ix86_fp_swap_condition (code);
18631 if (new_code != UNKNOWN)
18632 {
18633 rtx tmp;
18634 tmp = op0, op0 = op1, op1 = tmp;
18635 code = new_code;
18636 }
18637 }
18638
18639 if (!REG_P (op0))
18640 op0 = force_reg (op_mode, op0);
18641
18642 if (CONSTANT_P (op1))
18643 {
18644 int tmp = standard_80387_constant_p (op1);
18645 if (tmp == 0)
18646 op1 = validize_mem (force_const_mem (op_mode, op1));
18647 else if (tmp == 1)
18648 {
18649 if (TARGET_CMOVE)
18650 op1 = force_reg (op_mode, op1);
18651 }
18652 else
18653 op1 = force_reg (op_mode, op1);
18654 }
18655 }
18656
18657 /* Try to rearrange the comparison to make it cheaper. */
18658 if (ix86_fp_comparison_cost (code)
18659 > ix86_fp_comparison_cost (swap_condition (code))
18660 && (REG_P (op1) || can_create_pseudo_p ()))
18661 {
18662 rtx tmp;
18663 tmp = op0, op0 = op1, op1 = tmp;
18664 code = swap_condition (code);
18665 if (!REG_P (op0))
18666 op0 = force_reg (op_mode, op0);
18667 }
18668
18669 *pop0 = op0;
18670 *pop1 = op1;
18671 return code;
18672 }
18673
18674 /* Convert a comparison code used to represent an FP comparison into the
18675 integer code that will result in a proper branch. Return UNKNOWN if no
18676 such code is available. */
18677
18678 enum rtx_code
18679 ix86_fp_compare_code_to_integer (enum rtx_code code)
18680 {
18681 switch (code)
18682 {
18683 case GT:
18684 return GTU;
18685 case GE:
18686 return GEU;
18687 case ORDERED:
18688 case UNORDERED:
18689 return code;
18690 break;
18691 case UNEQ:
18692 return EQ;
18693 break;
18694 case UNLT:
18695 return LTU;
18696 break;
18697 case UNLE:
18698 return LEU;
18699 break;
18700 case LTGT:
18701 return NE;
18702 break;
18703 default:
18704 return UNKNOWN;
18705 }
18706 }
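
/* Rationale for the mapping above: fcomi/fucomi and fnstsw+sahf leave the
   C3/C2/C0 condition bits in ZF/PF/CF, so an ordered FP comparison looks to
   the flags user exactly like an unsigned integer comparison; hence GT maps
   to GTU, GE to GEU, and so on.  */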
18707
18708 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18709
18710 static rtx
18711 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18712 {
18713 enum machine_mode fpcmp_mode, intcmp_mode;
18714 rtx tmp, tmp2;
18715
18716 fpcmp_mode = ix86_fp_compare_mode (code);
18717 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18718
18719 /* Do fcomi/sahf based test when profitable. */
18720 switch (ix86_fp_comparison_strategy (code))
18721 {
18722 case IX86_FPCMP_COMI:
18723 intcmp_mode = fpcmp_mode;
18724 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18725 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18726 tmp);
18727 emit_insn (tmp);
18728 break;
18729
18730 case IX86_FPCMP_SAHF:
18731 intcmp_mode = fpcmp_mode;
18732 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18733 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18734 tmp);
18735
18736 if (!scratch)
18737 scratch = gen_reg_rtx (HImode);
18738 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18739 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18740 break;
18741
18742 case IX86_FPCMP_ARITH:
18743 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18744 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18745 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18746 if (!scratch)
18747 scratch = gen_reg_rtx (HImode);
18748 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18749
18750 /* In the unordered case, we have to check C2 for NaNs, which
18751 doesn't happen to work out to anything nice combination-wise.
18752 So do some bit twiddling on the value we've got in AH to come
18753 up with an appropriate set of condition codes. */
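/* Note on the magic constants used below: after fnstsw, AH holds bits 8-15
   of the FPU status word, so C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3
   is bit 6 (0x40); thus 0x45 tests C0|C2|C3 and 0x44 tests C2|C3.  */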
18754
18755 intcmp_mode = CCNOmode;
18756 switch (code)
18757 {
18758 case GT:
18759 case UNGT:
18760 if (code == GT || !TARGET_IEEE_FP)
18761 {
18762 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18763 code = EQ;
18764 }
18765 else
18766 {
18767 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18768 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18769 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18770 intcmp_mode = CCmode;
18771 code = GEU;
18772 }
18773 break;
18774 case LT:
18775 case UNLT:
18776 if (code == LT && TARGET_IEEE_FP)
18777 {
18778 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18779 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18780 intcmp_mode = CCmode;
18781 code = EQ;
18782 }
18783 else
18784 {
18785 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18786 code = NE;
18787 }
18788 break;
18789 case GE:
18790 case UNGE:
18791 if (code == GE || !TARGET_IEEE_FP)
18792 {
18793 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18794 code = EQ;
18795 }
18796 else
18797 {
18798 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18799 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18800 code = NE;
18801 }
18802 break;
18803 case LE:
18804 case UNLE:
18805 if (code == LE && TARGET_IEEE_FP)
18806 {
18807 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18808 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18809 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18810 intcmp_mode = CCmode;
18811 code = LTU;
18812 }
18813 else
18814 {
18815 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18816 code = NE;
18817 }
18818 break;
18819 case EQ:
18820 case UNEQ:
18821 if (code == EQ && TARGET_IEEE_FP)
18822 {
18823 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18824 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18825 intcmp_mode = CCmode;
18826 code = EQ;
18827 }
18828 else
18829 {
18830 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18831 code = NE;
18832 }
18833 break;
18834 case NE:
18835 case LTGT:
18836 if (code == NE && TARGET_IEEE_FP)
18837 {
18838 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18839 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18840 GEN_INT (0x40)));
18841 code = NE;
18842 }
18843 else
18844 {
18845 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18846 code = EQ;
18847 }
18848 break;
18849
18850 case UNORDERED:
18851 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18852 code = NE;
18853 break;
18854 case ORDERED:
18855 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18856 code = EQ;
18857 break;
18858
18859 default:
18860 gcc_unreachable ();
18861 }
18862 break;
18863
18864 default:
18865 gcc_unreachable();
18866 }
18867
18868 /* Return the test that should be put into the flags user, i.e.
18869 the bcc, scc, or cmov instruction. */
18870 return gen_rtx_fmt_ee (code, VOIDmode,
18871 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18872 const0_rtx);
18873 }
18874
18875 static rtx
18876 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18877 {
18878 rtx ret;
18879
18880 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18881 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18882
18883 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18884 {
18885 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18886 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18887 }
18888 else
18889 ret = ix86_expand_int_compare (code, op0, op1);
18890
18891 return ret;
18892 }
18893
18894 void
18895 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18896 {
18897 enum machine_mode mode = GET_MODE (op0);
18898 rtx tmp;
18899
18900 switch (mode)
18901 {
18902 case SFmode:
18903 case DFmode:
18904 case XFmode:
18905 case QImode:
18906 case HImode:
18907 case SImode:
18908 simple:
18909 tmp = ix86_expand_compare (code, op0, op1);
18910 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18911 gen_rtx_LABEL_REF (VOIDmode, label),
18912 pc_rtx);
18913 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18914 return;
18915
18916 case DImode:
18917 if (TARGET_64BIT)
18918 goto simple;
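/* FALLTHRU */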
18919 case TImode:
18920 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
18921 {
18922 rtx lo[2], hi[2], label2;
18923 enum rtx_code code1, code2, code3;
18924 enum machine_mode submode;
18925
18926 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18927 {
18928 tmp = op0, op0 = op1, op1 = tmp;
18929 code = swap_condition (code);
18930 }
18931
18932 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18933 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18934
18935 submode = mode == DImode ? SImode : DImode;
18936
18937 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18938 avoid two branches. This costs one extra insn, so disable when
18939 optimizing for size. */
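
/* For instance, a 64-bit equality test on a 32-bit target becomes roughly

     xorl  hi1, hi0
     xorl  lo1, lo0
     orl   hi0, lo0
     jz/jnz label

   i.e. one extra OR in place of a second compare-and-branch.  */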
18940
18941 if ((code == EQ || code == NE)
18942 && (!optimize_insn_for_size_p ()
18943 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18944 {
18945 rtx xor0, xor1;
18946
18947 xor1 = hi[0];
18948 if (hi[1] != const0_rtx)
18949 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18950 NULL_RTX, 0, OPTAB_WIDEN);
18951
18952 xor0 = lo[0];
18953 if (lo[1] != const0_rtx)
18954 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18955 NULL_RTX, 0, OPTAB_WIDEN);
18956
18957 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18958 NULL_RTX, 0, OPTAB_WIDEN);
18959
18960 ix86_expand_branch (code, tmp, const0_rtx, label);
18961 return;
18962 }
18963
18964 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18965 op1 is a constant, and the low word is zero, then we can just
18966 examine the high word. Similarly for a low word of -1 and
18967 less-or-equal-than or greater-than. */
18968
18969 if (CONST_INT_P (hi[1]))
18970 switch (code)
18971 {
18972 case LT: case LTU: case GE: case GEU:
18973 if (lo[1] == const0_rtx)
18974 {
18975 ix86_expand_branch (code, hi[0], hi[1], label);
18976 return;
18977 }
18978 break;
18979 case LE: case LEU: case GT: case GTU:
18980 if (lo[1] == constm1_rtx)
18981 {
18982 ix86_expand_branch (code, hi[0], hi[1], label);
18983 return;
18984 }
18985 break;
18986 default:
18987 break;
18988 }
18989
18990 /* Otherwise, we need two or three jumps. */
18991
18992 label2 = gen_label_rtx ();
18993
18994 code1 = code;
18995 code2 = swap_condition (code);
18996 code3 = unsigned_condition (code);
18997
18998 switch (code)
18999 {
19000 case LT: case GT: case LTU: case GTU:
19001 break;
19002
19003 case LE: code1 = LT; code2 = GT; break;
19004 case GE: code1 = GT; code2 = LT; break;
19005 case LEU: code1 = LTU; code2 = GTU; break;
19006 case GEU: code1 = GTU; code2 = LTU; break;
19007
19008 case EQ: code1 = UNKNOWN; code2 = NE; break;
19009 case NE: code2 = UNKNOWN; break;
19010
19011 default:
19012 gcc_unreachable ();
19013 }
19014
19015 /*
19016 * a < b =>
19017 * if (hi(a) < hi(b)) goto true;
19018 * if (hi(a) > hi(b)) goto false;
19019 * if (lo(a) < lo(b)) goto true;
19020 * false:
19021 */
19022
19023 if (code1 != UNKNOWN)
19024 ix86_expand_branch (code1, hi[0], hi[1], label);
19025 if (code2 != UNKNOWN)
19026 ix86_expand_branch (code2, hi[0], hi[1], label2);
19027
19028 ix86_expand_branch (code3, lo[0], lo[1], label);
19029
19030 if (code2 != UNKNOWN)
19031 emit_label (label2);
19032 return;
19033 }
19034
19035 default:
19036 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19037 goto simple;
19038 }
19039 }
19040
19041 /* Split branch based on floating point condition. */
19042 void
19043 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19044 rtx target1, rtx target2, rtx tmp, rtx pushed)
19045 {
19046 rtx condition;
19047 rtx i;
19048
19049 if (target2 != pc_rtx)
19050 {
19051 rtx tmp = target2;
19052 code = reverse_condition_maybe_unordered (code);
19053 target2 = target1;
19054 target1 = tmp;
19055 }
19056
19057 condition = ix86_expand_fp_compare (code, op1, op2,
19058 tmp);
19059
19060 /* Remove pushed operand from stack. */
19061 if (pushed)
19062 ix86_free_from_memory (GET_MODE (pushed));
19063
19064 i = emit_jump_insn (gen_rtx_SET
19065 (VOIDmode, pc_rtx,
19066 gen_rtx_IF_THEN_ELSE (VOIDmode,
19067 condition, target1, target2)));
19068 if (split_branch_probability >= 0)
19069 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19070 }
19071
19072 void
19073 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19074 {
19075 rtx ret;
19076
19077 gcc_assert (GET_MODE (dest) == QImode);
19078
19079 ret = ix86_expand_compare (code, op0, op1);
19080 PUT_MODE (ret, QImode);
19081 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19082 }
19083
19084 /* Expand a comparison setting or clearing the carry flag. Return true when
19085 successful and set *POP to the comparison operation. */
19086 static bool
19087 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19088 {
19089 enum machine_mode mode =
19090 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19091
19092 /* Do not handle double-word compares that go through the special path. */
19093 if (mode == (TARGET_64BIT ? TImode : DImode))
19094 return false;
19095
19096 if (SCALAR_FLOAT_MODE_P (mode))
19097 {
19098 rtx compare_op, compare_seq;
19099
19100 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19101
19102 /* Shortcut: the following common codes never translate
19103 into carry flag compares. */
19104 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19105 || code == ORDERED || code == UNORDERED)
19106 return false;
19107
19108 /* These comparisons require the zero flag; swap operands so they no longer do. */
19109 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19110 && !TARGET_IEEE_FP)
19111 {
19112 rtx tmp = op0;
19113 op0 = op1;
19114 op1 = tmp;
19115 code = swap_condition (code);
19116 }
19117
19118 /* Try to expand the comparison and verify that we end up with
19119 a carry flag based comparison. This fails only when we decide
19120 to expand the comparison using arithmetic, which is not a
19121 common scenario. */
19122 start_sequence ();
19123 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19124 compare_seq = get_insns ();
19125 end_sequence ();
19126
19127 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19128 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19129 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19130 else
19131 code = GET_CODE (compare_op);
19132
19133 if (code != LTU && code != GEU)
19134 return false;
19135
19136 emit_insn (compare_seq);
19137 *pop = compare_op;
19138 return true;
19139 }
19140
19141 if (!INTEGRAL_MODE_P (mode))
19142 return false;
19143
19144 switch (code)
19145 {
19146 case LTU:
19147 case GEU:
19148 break;
19149
19150 /* Convert a==0 into (unsigned)a<1. */
19151 case EQ:
19152 case NE:
19153 if (op1 != const0_rtx)
19154 return false;
19155 op1 = const1_rtx;
19156 code = (code == EQ ? LTU : GEU);
19157 break;
19158
19159 /* Convert a>b into b<a or a>=b+1. */
19160 case GTU:
19161 case LEU:
19162 if (CONST_INT_P (op1))
19163 {
19164 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19165 /* Bail out on overflow. We could still swap the operands, but that
19166 would force loading the constant into a register. */
19167 if (op1 == const0_rtx
19168 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19169 return false;
19170 code = (code == GTU ? GEU : LTU);
19171 }
19172 else
19173 {
19174 rtx tmp = op1;
19175 op1 = op0;
19176 op0 = tmp;
19177 code = (code == GTU ? LTU : GEU);
19178 }
19179 break;
19180
19181 /* Convert a>=0 into (unsigned)a<0x80000000. */
19182 case LT:
19183 case GE:
19184 if (mode == DImode || op1 != const0_rtx)
19185 return false;
19186 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19187 code = (code == LT ? GEU : LTU);
19188 break;
19189 case LE:
19190 case GT:
19191 if (mode == DImode || op1 != constm1_rtx)
19192 return false;
19193 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19194 code = (code == LE ? GEU : LTU);
19195 break;
19196
19197 default:
19198 return false;
19199 }
19200 /* Swapping operands may cause a constant to appear as the first operand. */
19201 if (!nonimmediate_operand (op0, VOIDmode))
19202 {
19203 if (!can_create_pseudo_p ())
19204 return false;
19205 op0 = force_reg (mode, op0);
19206 }
19207 *pop = ix86_expand_compare (code, op0, op1);
19208 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19209 return true;
19210 }
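
/* Example of the conversions above: GTU (a, 4) with a constant operand is
   rewritten as GEU (a, 5), and EQ (a, 0) as LTU (a, 1), so that the result
   depends only on the carry flag and the callers can use sbb-based
   sequences.  */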
19211
19212 bool
19213 ix86_expand_int_movcc (rtx operands[])
19214 {
19215 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19216 rtx compare_seq, compare_op;
19217 enum machine_mode mode = GET_MODE (operands[0]);
19218 bool sign_bit_compare_p = false;
19219 rtx op0 = XEXP (operands[1], 0);
19220 rtx op1 = XEXP (operands[1], 1);
19221
19222 if (GET_MODE (op0) == TImode
19223 || (GET_MODE (op0) == DImode
19224 && !TARGET_64BIT))
19225 return false;
19226
19227 start_sequence ();
19228 compare_op = ix86_expand_compare (code, op0, op1);
19229 compare_seq = get_insns ();
19230 end_sequence ();
19231
19232 compare_code = GET_CODE (compare_op);
19233
19234 if ((op1 == const0_rtx && (code == GE || code == LT))
19235 || (op1 == constm1_rtx && (code == GT || code == LE)))
19236 sign_bit_compare_p = true;
19237
19238 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19239 HImode insns, we'd be swallowed in word prefix ops. */
19240
19241 if ((mode != HImode || TARGET_FAST_PREFIX)
19242 && (mode != (TARGET_64BIT ? TImode : DImode))
19243 && CONST_INT_P (operands[2])
19244 && CONST_INT_P (operands[3]))
19245 {
19246 rtx out = operands[0];
19247 HOST_WIDE_INT ct = INTVAL (operands[2]);
19248 HOST_WIDE_INT cf = INTVAL (operands[3]);
19249 HOST_WIDE_INT diff;
19250
19251 diff = ct - cf;
19252 /* Sign bit compares are better done using shifts than by using
19253 sbb. */
19254 if (sign_bit_compare_p
19255 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19256 {
19257 /* Detect overlap between destination and compare sources. */
19258 rtx tmp = out;
19259
19260 if (!sign_bit_compare_p)
19261 {
19262 rtx flags;
19263 bool fpcmp = false;
19264
19265 compare_code = GET_CODE (compare_op);
19266
19267 flags = XEXP (compare_op, 0);
19268
19269 if (GET_MODE (flags) == CCFPmode
19270 || GET_MODE (flags) == CCFPUmode)
19271 {
19272 fpcmp = true;
19273 compare_code
19274 = ix86_fp_compare_code_to_integer (compare_code);
19275 }
19276
19277 /* To simplify rest of code, restrict to the GEU case. */
19278 if (compare_code == LTU)
19279 {
19280 HOST_WIDE_INT tmp = ct;
19281 ct = cf;
19282 cf = tmp;
19283 compare_code = reverse_condition (compare_code);
19284 code = reverse_condition (code);
19285 }
19286 else
19287 {
19288 if (fpcmp)
19289 PUT_CODE (compare_op,
19290 reverse_condition_maybe_unordered
19291 (GET_CODE (compare_op)));
19292 else
19293 PUT_CODE (compare_op,
19294 reverse_condition (GET_CODE (compare_op)));
19295 }
19296 diff = ct - cf;
19297
19298 if (reg_overlap_mentioned_p (out, op0)
19299 || reg_overlap_mentioned_p (out, op1))
19300 tmp = gen_reg_rtx (mode);
19301
19302 if (mode == DImode)
19303 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19304 else
19305 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19306 flags, compare_op));
19307 }
19308 else
19309 {
19310 if (code == GT || code == GE)
19311 code = reverse_condition (code);
19312 else
19313 {
19314 HOST_WIDE_INT tmp = ct;
19315 ct = cf;
19316 cf = tmp;
19317 diff = ct - cf;
19318 }
19319 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19320 }
19321
19322 if (diff == 1)
19323 {
19324 /*
19325 * cmpl op0,op1
19326 * sbbl dest,dest
19327 * [addl dest, ct]
19328 *
19329 * Size 5 - 8.
19330 */
19331 if (ct)
19332 tmp = expand_simple_binop (mode, PLUS,
19333 tmp, GEN_INT (ct),
19334 copy_rtx (tmp), 1, OPTAB_DIRECT);
19335 }
19336 else if (cf == -1)
19337 {
19338 /*
19339 * cmpl op0,op1
19340 * sbbl dest,dest
19341 * orl $ct, dest
19342 *
19343 * Size 8.
19344 */
19345 tmp = expand_simple_binop (mode, IOR,
19346 tmp, GEN_INT (ct),
19347 copy_rtx (tmp), 1, OPTAB_DIRECT);
19348 }
19349 else if (diff == -1 && ct)
19350 {
19351 /*
19352 * cmpl op0,op1
19353 * sbbl dest,dest
19354 * notl dest
19355 * [addl dest, cf]
19356 *
19357 * Size 8 - 11.
19358 */
19359 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19360 if (cf)
19361 tmp = expand_simple_binop (mode, PLUS,
19362 copy_rtx (tmp), GEN_INT (cf),
19363 copy_rtx (tmp), 1, OPTAB_DIRECT);
19364 }
19365 else
19366 {
19367 /*
19368 * cmpl op0,op1
19369 * sbbl dest,dest
19370 * [notl dest]
19371 * andl cf - ct, dest
19372 * [addl dest, ct]
19373 *
19374 * Size 8 - 11.
19375 */
19376
19377 if (cf == 0)
19378 {
19379 cf = ct;
19380 ct = 0;
19381 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19382 }
19383
19384 tmp = expand_simple_binop (mode, AND,
19385 copy_rtx (tmp),
19386 gen_int_mode (cf - ct, mode),
19387 copy_rtx (tmp), 1, OPTAB_DIRECT);
19388 if (ct)
19389 tmp = expand_simple_binop (mode, PLUS,
19390 copy_rtx (tmp), GEN_INT (ct),
19391 copy_rtx (tmp), 1, OPTAB_DIRECT);
19392 }
19393
19394 if (!rtx_equal_p (tmp, out))
19395 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19396
19397 return true;
19398 }
19399
19400 if (diff < 0)
19401 {
19402 enum machine_mode cmp_mode = GET_MODE (op0);
19403
19404 HOST_WIDE_INT tmp;
19405 tmp = ct, ct = cf, cf = tmp;
19406 diff = -diff;
19407
19408 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19409 {
19410 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19411
19412 /* We may be reversing an unordered compare to a normal compare, which
19413 is not valid in general (we may convert a non-trapping condition
19414 to a trapping one); however, on i386 we currently emit all
19415 comparisons unordered. */
19416 compare_code = reverse_condition_maybe_unordered (compare_code);
19417 code = reverse_condition_maybe_unordered (code);
19418 }
19419 else
19420 {
19421 compare_code = reverse_condition (compare_code);
19422 code = reverse_condition (code);
19423 }
19424 }
19425
19426 compare_code = UNKNOWN;
19427 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19428 && CONST_INT_P (op1))
19429 {
19430 if (op1 == const0_rtx
19431 && (code == LT || code == GE))
19432 compare_code = code;
19433 else if (op1 == constm1_rtx)
19434 {
19435 if (code == LE)
19436 compare_code = LT;
19437 else if (code == GT)
19438 compare_code = GE;
19439 }
19440 }
19441
19442 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19443 if (compare_code != UNKNOWN
19444 && GET_MODE (op0) == GET_MODE (out)
19445 && (cf == -1 || ct == -1))
19446 {
19447 /* If the lea code below could be used, only optimize this way
19448 if it results in a 2-insn sequence. */
19449
19450 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19451 || diff == 3 || diff == 5 || diff == 9)
19452 || (compare_code == LT && ct == -1)
19453 || (compare_code == GE && cf == -1))
19454 {
19455 /*
19456 * notl op1 (if necessary)
19457 * sarl $31, op1
19458 * orl cf, op1
19459 */
19460 if (ct != -1)
19461 {
19462 cf = ct;
19463 ct = -1;
19464 code = reverse_condition (code);
19465 }
19466
19467 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19468
19469 out = expand_simple_binop (mode, IOR,
19470 out, GEN_INT (cf),
19471 out, 1, OPTAB_DIRECT);
19472 if (out != operands[0])
19473 emit_move_insn (operands[0], out);
19474
19475 return true;
19476 }
19477 }
19478
19479
19480 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19481 || diff == 3 || diff == 5 || diff == 9)
19482 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19483 && (mode != DImode
19484 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19485 {
19486 /*
19487 * xorl dest,dest
19488 * cmpl op1,op2
19489 * setcc dest
19490 * lea cf(dest*(ct-cf)),dest
19491 *
19492 * Size 14.
19493 *
19494 * This also catches the degenerate setcc-only case.
19495 */
19496
19497 rtx tmp;
19498 int nops;
19499
19500 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19501
19502 nops = 0;
19503 /* On x86_64 the lea instruction operates on Pmode, so we need
19504 to get the arithmetic done in the proper mode to match. */
19505 if (diff == 1)
19506 tmp = copy_rtx (out);
19507 else
19508 {
19509 rtx out1;
19510 out1 = copy_rtx (out);
19511 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19512 nops++;
19513 if (diff & 1)
19514 {
19515 tmp = gen_rtx_PLUS (mode, tmp, out1);
19516 nops++;
19517 }
19518 }
19519 if (cf != 0)
19520 {
19521 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19522 nops++;
19523 }
19524 if (!rtx_equal_p (tmp, out))
19525 {
19526 if (nops == 1)
19527 out = force_operand (tmp, copy_rtx (out));
19528 else
19529 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19530 }
19531 if (!rtx_equal_p (out, operands[0]))
19532 emit_move_insn (operands[0], copy_rtx (out));
19533
19534 return true;
19535 }
19536
19537 /*
19538 * General case: Jumpful:
19539 * xorl dest,dest cmpl op1, op2
19540 * cmpl op1, op2 movl ct, dest
19541 * setcc dest jcc 1f
19542 * decl dest movl cf, dest
19543 * andl (cf-ct),dest 1:
19544 * addl ct,dest
19545 *
19546 * Size 20. Size 14.
19547 *
19548 * This is reasonably steep, but branch mispredict costs are
19549 * high on modern cpus, so consider failing only if optimizing
19550 * for space.
19551 */
19552
19553 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19554 && BRANCH_COST (optimize_insn_for_speed_p (),
19555 false) >= 2)
19556 {
19557 if (cf == 0)
19558 {
19559 enum machine_mode cmp_mode = GET_MODE (op0);
19560
19561 cf = ct;
19562 ct = 0;
19563
19564 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19565 {
19566 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19567
19568 /* We may be reversing an unordered compare to a normal compare,
19569 which is not valid in general (we may convert a non-trapping
19570 condition to a trapping one); however, on i386 we currently
19571 emit all comparisons unordered. */
19572 code = reverse_condition_maybe_unordered (code);
19573 }
19574 else
19575 {
19576 code = reverse_condition (code);
19577 if (compare_code != UNKNOWN)
19578 compare_code = reverse_condition (compare_code);
19579 }
19580 }
19581
19582 if (compare_code != UNKNOWN)
19583 {
19584 /* notl op1 (if needed)
19585 sarl $31, op1
19586 andl (cf-ct), op1
19587 addl ct, op1
19588
19589 For x < 0 (resp. x <= -1) there will be no notl,
19590 so if possible swap the constants to get rid of the
19591 complement.
19592 True/false will be -1/0 while code below (store flag
19593 followed by decrement) is 0/-1, so the constants need
19594 to be exchanged once more. */
19595
19596 if (compare_code == GE || !cf)
19597 {
19598 code = reverse_condition (code);
19599 compare_code = LT;
19600 }
19601 else
19602 {
19603 HOST_WIDE_INT tmp = cf;
19604 cf = ct;
19605 ct = tmp;
19606 }
19607
19608 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19609 }
19610 else
19611 {
19612 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19613
19614 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19615 constm1_rtx,
19616 copy_rtx (out), 1, OPTAB_DIRECT);
19617 }
19618
19619 out = expand_simple_binop (mode, AND, copy_rtx (out),
19620 gen_int_mode (cf - ct, mode),
19621 copy_rtx (out), 1, OPTAB_DIRECT);
19622 if (ct)
19623 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19624 copy_rtx (out), 1, OPTAB_DIRECT);
19625 if (!rtx_equal_p (out, operands[0]))
19626 emit_move_insn (operands[0], copy_rtx (out));
19627
19628 return true;
19629 }
19630 }
19631
19632 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19633 {
19634 /* Try a few things more with specific constants and a variable. */
19635
19636 optab op;
19637 rtx var, orig_out, out, tmp;
19638
19639 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19640 return false;
19641
19642 /* If one of the two operands is an interesting constant, load a
19643 constant with the code above and mask the variable in with a logical operation. */
19644
19645 if (CONST_INT_P (operands[2]))
19646 {
19647 var = operands[3];
19648 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19649 operands[3] = constm1_rtx, op = and_optab;
19650 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19651 operands[3] = const0_rtx, op = ior_optab;
19652 else
19653 return false;
19654 }
19655 else if (CONST_INT_P (operands[3]))
19656 {
19657 var = operands[2];
19658 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19659 operands[2] = constm1_rtx, op = and_optab;
19660 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19661 operands[2] = const0_rtx, op = ior_optab;
19662 else
19663 return false;
19664 }
19665 else
19666 return false;
19667
19668 orig_out = operands[0];
19669 tmp = gen_reg_rtx (mode);
19670 operands[0] = tmp;
19671
19672 /* Recurse to get the constant loaded. */
19673 if (ix86_expand_int_movcc (operands) == 0)
19674 return false;
19675
19676 /* Mask in the interesting variable. */
19677 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19678 OPTAB_WIDEN);
19679 if (!rtx_equal_p (out, orig_out))
19680 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19681
19682 return true;
19683 }
19684
19685 /*
19686 * For comparison with above,
19687 *
19688 * movl cf,dest
19689 * movl ct,tmp
19690 * cmpl op1,op2
19691 * cmovcc tmp,dest
19692 *
19693 * Size 15.
19694 */
19695
19696 if (! nonimmediate_operand (operands[2], mode))
19697 operands[2] = force_reg (mode, operands[2]);
19698 if (! nonimmediate_operand (operands[3], mode))
19699 operands[3] = force_reg (mode, operands[3]);
19700
19701 if (! register_operand (operands[2], VOIDmode)
19702 && (mode == QImode
19703 || ! register_operand (operands[3], VOIDmode)))
19704 operands[2] = force_reg (mode, operands[2]);
19705
19706 if (mode == QImode
19707 && ! register_operand (operands[3], VOIDmode))
19708 operands[3] = force_reg (mode, operands[3]);
19709
19710 emit_insn (compare_seq);
19711 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19712 gen_rtx_IF_THEN_ELSE (mode,
19713 compare_op, operands[2],
19714 operands[3])));
19715 return true;
19716 }
19717
19718 /* Swap, force into registers, or otherwise massage the two operands
19719 to an sse comparison with a mask result. Thus we differ a bit from
19720 ix86_prepare_fp_compare_args which expects to produce a flags result.
19721
19722 The DEST operand exists to help determine whether to commute commutative
19723 operators. The POP0/POP1 operands are updated in place. The new
19724 comparison code is returned, or UNKNOWN if not implementable. */
19725
19726 static enum rtx_code
19727 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19728 rtx *pop0, rtx *pop1)
19729 {
19730 rtx tmp;
19731
19732 switch (code)
19733 {
19734 case LTGT:
19735 case UNEQ:
19736 /* AVX supports all the needed comparisons. */
19737 if (TARGET_AVX)
19738 break;
19739 /* We have no LTGT as an operator. We could implement it with
19740 NE & ORDERED, but this requires an extra temporary. It's
19741 not clear that it's worth it. */
19742 return UNKNOWN;
19743
19744 case LT:
19745 case LE:
19746 case UNGT:
19747 case UNGE:
19748 /* These are supported directly. */
19749 break;
19750
19751 case EQ:
19752 case NE:
19753 case UNORDERED:
19754 case ORDERED:
19755 /* AVX has 3 operand comparisons, no need to swap anything. */
19756 if (TARGET_AVX)
19757 break;
19758 /* For commutative operators, try to canonicalize the destination
19759 operand to be first in the comparison - this helps reload to
19760 avoid extra moves. */
19761 if (!dest || !rtx_equal_p (dest, *pop1))
19762 break;
19763 /* FALLTHRU */
19764
19765 case GE:
19766 case GT:
19767 case UNLE:
19768 case UNLT:
19769 /* These are not supported directly before AVX, and furthermore
19770 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19771 comparison operands to transform into something that is
19772 supported. */
19773 tmp = *pop0;
19774 *pop0 = *pop1;
19775 *pop1 = tmp;
19776 code = swap_condition (code);
19777 break;
19778
19779 default:
19780 gcc_unreachable ();
19781 }
19782
19783 return code;
19784 }
19785
19786 /* Detect conditional moves that exactly match min/max operational
19787 semantics. Note that this is IEEE safe, as long as we don't
19788 interchange the operands.
19789
19790 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19791 and TRUE if the operation is successful and instructions are emitted. */
19792
19793 static bool
19794 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19795 rtx cmp_op1, rtx if_true, rtx if_false)
19796 {
19797 enum machine_mode mode;
19798 bool is_min;
19799 rtx tmp;
19800
19801 if (code == LT)
19802 ;
19803 else if (code == UNGE)
19804 {
19805 tmp = if_true;
19806 if_true = if_false;
19807 if_false = tmp;
19808 }
19809 else
19810 return false;
19811
19812 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19813 is_min = true;
19814 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19815 is_min = false;
19816 else
19817 return false;
19818
19819 mode = GET_MODE (dest);
19820
19821 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19822 but MODE may be a vector mode and thus not appropriate. */
19823 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19824 {
19825 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19826 rtvec v;
19827
19828 if_true = force_reg (mode, if_true);
19829 v = gen_rtvec (2, if_true, if_false);
19830 tmp = gen_rtx_UNSPEC (mode, v, u);
19831 }
19832 else
19833 {
19834 code = is_min ? SMIN : SMAX;
19835 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19836 }
19837
19838 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19839 return true;
19840 }
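
/* Illustrative example: dest = (a < b) ? a : b is emitted as a single SMIN
   (minss/minps/minsd/minpd) when -ffinite-math-only and
   -funsafe-math-optimizations are both in effect; otherwise the IEEE-safe
   UNSPEC_IEEE_MIN form is used, which keeps the operand order needed for
   the required NaN and signed-zero behaviour.  */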
19841
19842 /* Expand an sse vector comparison. Return the register with the result. */
19843
19844 static rtx
19845 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19846 rtx op_true, rtx op_false)
19847 {
19848 enum machine_mode mode = GET_MODE (dest);
19849 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19850 rtx x;
19851
19852 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19853 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19854 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19855
19856 if (optimize
19857 || reg_overlap_mentioned_p (dest, op_true)
19858 || reg_overlap_mentioned_p (dest, op_false))
19859 dest = gen_reg_rtx (mode);
19860
19861 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19862 if (cmp_mode != mode)
19863 {
19864 x = force_reg (cmp_mode, x);
19865 convert_move (dest, x, false);
19866 }
19867 else
19868 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19869
19870 return dest;
19871 }
19872
19873 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19874 operations. This is used for both scalar and vector conditional moves. */
19875
19876 static void
19877 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19878 {
19879 enum machine_mode mode = GET_MODE (dest);
19880 rtx t2, t3, x;
19881
19882 if (vector_all_ones_operand (op_true, mode)
19883 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19884 {
19885 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19886 }
19887 else if (op_false == CONST0_RTX (mode))
19888 {
19889 op_true = force_reg (mode, op_true);
19890 x = gen_rtx_AND (mode, cmp, op_true);
19891 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19892 }
19893 else if (op_true == CONST0_RTX (mode))
19894 {
19895 op_false = force_reg (mode, op_false);
19896 x = gen_rtx_NOT (mode, cmp);
19897 x = gen_rtx_AND (mode, x, op_false);
19898 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19899 }
19900 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19901 {
19902 op_false = force_reg (mode, op_false);
19903 x = gen_rtx_IOR (mode, cmp, op_false);
19904 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19905 }
19906 else if (TARGET_XOP)
19907 {
19908 op_true = force_reg (mode, op_true);
19909
19910 if (!nonimmediate_operand (op_false, mode))
19911 op_false = force_reg (mode, op_false);
19912
19913 emit_insn (gen_rtx_SET (mode, dest,
19914 gen_rtx_IF_THEN_ELSE (mode, cmp,
19915 op_true,
19916 op_false)));
19917 }
19918 else
19919 {
19920 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19921
19922 if (!nonimmediate_operand (op_true, mode))
19923 op_true = force_reg (mode, op_true);
19924
19925 op_false = force_reg (mode, op_false);
19926
19927 switch (mode)
19928 {
19929 case V4SFmode:
19930 if (TARGET_SSE4_1)
19931 gen = gen_sse4_1_blendvps;
19932 break;
19933 case V2DFmode:
19934 if (TARGET_SSE4_1)
19935 gen = gen_sse4_1_blendvpd;
19936 break;
19937 case V16QImode:
19938 case V8HImode:
19939 case V4SImode:
19940 case V2DImode:
19941 if (TARGET_SSE4_1)
19942 {
19943 gen = gen_sse4_1_pblendvb;
19944 dest = gen_lowpart (V16QImode, dest);
19945 op_false = gen_lowpart (V16QImode, op_false);
19946 op_true = gen_lowpart (V16QImode, op_true);
19947 cmp = gen_lowpart (V16QImode, cmp);
19948 }
19949 break;
19950 case V8SFmode:
19951 if (TARGET_AVX)
19952 gen = gen_avx_blendvps256;
19953 break;
19954 case V4DFmode:
19955 if (TARGET_AVX)
19956 gen = gen_avx_blendvpd256;
19957 break;
19958 case V32QImode:
19959 case V16HImode:
19960 case V8SImode:
19961 case V4DImode:
19962 if (TARGET_AVX2)
19963 {
19964 gen = gen_avx2_pblendvb;
19965 dest = gen_lowpart (V32QImode, dest);
19966 op_false = gen_lowpart (V32QImode, op_false);
19967 op_true = gen_lowpart (V32QImode, op_true);
19968 cmp = gen_lowpart (V32QImode, cmp);
19969 }
19970 break;
19971 default:
19972 break;
19973 }
19974
19975 if (gen != NULL)
19976 emit_insn (gen (dest, op_false, op_true, cmp));
19977 else
19978 {
19979 op_true = force_reg (mode, op_true);
19980
19981 t2 = gen_reg_rtx (mode);
19982 if (optimize)
19983 t3 = gen_reg_rtx (mode);
19984 else
19985 t3 = dest;
19986
19987 x = gen_rtx_AND (mode, op_true, cmp);
19988 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19989
19990 x = gen_rtx_NOT (mode, cmp);
19991 x = gen_rtx_AND (mode, x, op_false);
19992 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19993
19994 x = gen_rtx_IOR (mode, t3, t2);
19995 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19996 }
19997 }
19998 }
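
/* Summary of the fallback path above: when no blend instruction applies,
   the result is computed as

     dest = (op_true & cmp) | (op_false & ~cmp)

   i.e. the classic AND/ANDNOT/OR select sequence.  */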
19999
20000 /* Expand a floating-point conditional move. Return true if successful. */
20001
20002 bool
20003 ix86_expand_fp_movcc (rtx operands[])
20004 {
20005 enum machine_mode mode = GET_MODE (operands[0]);
20006 enum rtx_code code = GET_CODE (operands[1]);
20007 rtx tmp, compare_op;
20008 rtx op0 = XEXP (operands[1], 0);
20009 rtx op1 = XEXP (operands[1], 1);
20010
20011 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20012 {
20013 enum machine_mode cmode;
20014
20015 /* Since we have no cmove for sse registers, don't force bad register
20016 allocation just to gain access to it. Deny movcc when the
20017 comparison mode doesn't match the move mode. */
20018 cmode = GET_MODE (op0);
20019 if (cmode == VOIDmode)
20020 cmode = GET_MODE (op1);
20021 if (cmode != mode)
20022 return false;
20023
20024 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20025 if (code == UNKNOWN)
20026 return false;
20027
20028 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20029 operands[2], operands[3]))
20030 return true;
20031
20032 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20033 operands[2], operands[3]);
20034 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20035 return true;
20036 }
20037
20038 /* The floating point conditional move instructions don't directly
20039 support conditions resulting from a signed integer comparison. */
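  /* e.g. a signed GT from an integer compare is first materialized into a
     byte register with setcc; the fcmov then only has to test that register
     against zero with NE, a condition the fcmov instructions do handle.  */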
20040
20041 compare_op = ix86_expand_compare (code, op0, op1);
20042 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20043 {
20044 tmp = gen_reg_rtx (QImode);
20045 ix86_expand_setcc (tmp, code, op0, op1);
20046
20047 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20048 }
20049
20050 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20051 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20052 operands[2], operands[3])));
20053
20054 return true;
20055 }
20056
20057 /* Expand a floating-point vector conditional move; a vcond operation
20058 rather than a movcc operation. */
20059
20060 bool
20061 ix86_expand_fp_vcond (rtx operands[])
20062 {
20063 enum rtx_code code = GET_CODE (operands[3]);
20064 rtx cmp;
20065
20066 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20067 &operands[4], &operands[5]);
20068 if (code == UNKNOWN)
20069 {
20070 rtx temp;
20071 switch (GET_CODE (operands[3]))
20072 {
20073 case LTGT:
20074 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20075 operands[5], operands[0], operands[0]);
20076 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20077 operands[5], operands[1], operands[2]);
20078 code = AND;
20079 break;
20080 case UNEQ:
20081 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20082 operands[5], operands[0], operands[0]);
20083 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20084 operands[5], operands[1], operands[2]);
20085 code = IOR;
20086 break;
20087 default:
20088 gcc_unreachable ();
20089 }
20090 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20091 OPTAB_DIRECT);
20092 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20093 return true;
20094 }
20095
20096 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20097 operands[5], operands[1], operands[2]))
20098 return true;
20099
20100 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20101 operands[1], operands[2]);
20102 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20103 return true;
20104 }
20105
20106 /* Expand a signed/unsigned integral vector conditional move. */
20107
20108 bool
20109 ix86_expand_int_vcond (rtx operands[])
20110 {
20111 enum machine_mode data_mode = GET_MODE (operands[0]);
20112 enum machine_mode mode = GET_MODE (operands[4]);
20113 enum rtx_code code = GET_CODE (operands[3]);
20114 bool negate = false;
20115 rtx x, cop0, cop1;
20116
20117 cop0 = operands[4];
20118 cop1 = operands[5];
20119
20120 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20121 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
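     /* e.g. for V4SImode, SHIFT is 31: an element of -5 (sign bit set) yields
	-1 under an arithmetic shift and 1 under a logical shift, while any
	non-negative element yields 0 either way.  */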
20122 if ((code == LT || code == GE)
20123 && data_mode == mode
20124 && cop1 == CONST0_RTX (mode)
20125 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20126 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20127 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20128 && (GET_MODE_SIZE (data_mode) == 16
20129 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20130 {
20131 rtx negop = operands[2 - (code == LT)];
20132 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20133 if (negop == CONST1_RTX (data_mode))
20134 {
20135 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20136 operands[0], 1, OPTAB_DIRECT);
20137 if (res != operands[0])
20138 emit_move_insn (operands[0], res);
20139 return true;
20140 }
20141 else if (GET_MODE_INNER (data_mode) != DImode
20142 && vector_all_ones_operand (negop, data_mode))
20143 {
20144 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20145 operands[0], 0, OPTAB_DIRECT);
20146 if (res != operands[0])
20147 emit_move_insn (operands[0], res);
20148 return true;
20149 }
20150 }
20151
20152 if (!nonimmediate_operand (cop1, mode))
20153 cop1 = force_reg (mode, cop1);
20154 if (!general_operand (operands[1], data_mode))
20155 operands[1] = force_reg (data_mode, operands[1]);
20156 if (!general_operand (operands[2], data_mode))
20157 operands[2] = force_reg (data_mode, operands[2]);
20158
20159 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20160 if (TARGET_XOP
20161 && (mode == V16QImode || mode == V8HImode
20162 || mode == V4SImode || mode == V2DImode))
20163 ;
20164 else
20165 {
20166 /* Canonicalize the comparison to EQ, GT, GTU. */
20167 switch (code)
20168 {
20169 case EQ:
20170 case GT:
20171 case GTU:
20172 break;
20173
20174 case NE:
20175 case LE:
20176 case LEU:
20177 code = reverse_condition (code);
20178 negate = true;
20179 break;
20180
20181 case GE:
20182 case GEU:
20183 code = reverse_condition (code);
20184 negate = true;
20185 /* FALLTHRU */
20186
20187 case LT:
20188 case LTU:
20189 code = swap_condition (code);
20190 x = cop0, cop0 = cop1, cop1 = x;
20191 break;
20192
20193 default:
20194 gcc_unreachable ();
20195 }
20196
20197 /* Only SSE4.1/SSE4.2 support V2DImode.  */
20198 if (mode == V2DImode)
20199 {
20200 switch (code)
20201 {
20202 case EQ:
20203 /* SSE4.1 supports EQ. */
20204 if (!TARGET_SSE4_1)
20205 return false;
20206 break;
20207
20208 case GT:
20209 case GTU:
20210 /* SSE4.2 supports GT/GTU. */
20211 if (!TARGET_SSE4_2)
20212 return false;
20213 break;
20214
20215 default:
20216 gcc_unreachable ();
20217 }
20218 }
20219
20220 /* Unsigned parallel compare is not supported by the hardware.
20221 Play some tricks to turn this into a signed comparison
20222 against 0. */
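      /* For the wider element types this is done by biasing both operands by
	 the sign bit: x >u y is equivalent to (x - 0x80..0) >s (y - 0x80..0),
	 e.g. in 8 bits 200 >u 100 becomes 72 >s -28.  For the narrower
	 element types a saturating subtraction is used instead (see below).  */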
20223 if (code == GTU)
20224 {
20225 cop0 = force_reg (mode, cop0);
20226
20227 switch (mode)
20228 {
20229 case V8SImode:
20230 case V4DImode:
20231 case V4SImode:
20232 case V2DImode:
20233 {
20234 rtx t1, t2, mask;
20235 rtx (*gen_sub3) (rtx, rtx, rtx);
20236
20237 switch (mode)
20238 {
20239 case V8SImode: gen_sub3 = gen_subv8si3; break;
20240 case V4DImode: gen_sub3 = gen_subv4di3; break;
20241 case V4SImode: gen_sub3 = gen_subv4si3; break;
20242 case V2DImode: gen_sub3 = gen_subv2di3; break;
20243 default:
20244 gcc_unreachable ();
20245 }
20246 /* Subtract (-(INT MAX) - 1) from both operands to make
20247 them signed. */
20248 mask = ix86_build_signbit_mask (mode, true, false);
20249 t1 = gen_reg_rtx (mode);
20250 emit_insn (gen_sub3 (t1, cop0, mask));
20251
20252 t2 = gen_reg_rtx (mode);
20253 emit_insn (gen_sub3 (t2, cop1, mask));
20254
20255 cop0 = t1;
20256 cop1 = t2;
20257 code = GT;
20258 }
20259 break;
20260
20261 case V32QImode:
20262 case V16HImode:
20263 case V16QImode:
20264 case V8HImode:
20265 /* Perform a parallel unsigned saturating subtraction. */
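	  /* x >u y is equivalent to (x -us y) != 0, where -us is an unsigned
	     saturating subtraction: e.g. 5 -us 9 = 0 (not greater) while
	     9 -us 5 = 4 (greater).  The EQ against zero together with the
	     toggled NEGATE below yields exactly that inequality.  */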
20266 x = gen_reg_rtx (mode);
20267 emit_insn (gen_rtx_SET (VOIDmode, x,
20268 gen_rtx_US_MINUS (mode, cop0, cop1)));
20269
20270 cop0 = x;
20271 cop1 = CONST0_RTX (mode);
20272 code = EQ;
20273 negate = !negate;
20274 break;
20275
20276 default:
20277 gcc_unreachable ();
20278 }
20279 }
20280 }
20281
20282 /* Allow the comparison to be done in one mode, but the movcc to
20283 happen in another mode. */
20284 if (data_mode == mode)
20285 {
20286 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20287 operands[1+negate], operands[2-negate]);
20288 }
20289 else
20290 {
20291 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20292 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20293 code, cop0, cop1,
20294 operands[1+negate], operands[2-negate]);
20295 x = gen_lowpart (data_mode, x);
20296 }
20297
20298 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20299 operands[2-negate]);
20300 return true;
20301 }
20302
20303 /* Expand a variable vector permutation. */
20304
20305 void
20306 ix86_expand_vec_perm (rtx operands[])
20307 {
20308 rtx target = operands[0];
20309 rtx op0 = operands[1];
20310 rtx op1 = operands[2];
20311 rtx mask = operands[3];
20312 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20313 enum machine_mode mode = GET_MODE (op0);
20314 enum machine_mode maskmode = GET_MODE (mask);
20315 int w, e, i;
20316 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20317
20318 /* Number of elements in the vector. */
20319 w = GET_MODE_NUNITS (mode);
20320 e = GET_MODE_UNIT_SIZE (mode);
20321 gcc_assert (w <= 32);
20322
20323 if (TARGET_AVX2)
20324 {
20325 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20326 {
20327 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20328 a constant shuffle operand.  With a tiny bit of effort we can
20329 use VPERMD instead.  A re-interpretation stall for V4DFmode is
20330 unfortunate but there's no avoiding it.
20331 Similarly, V16HImode has no instructions for variable shuffling,
20332 while for V32QImode we can, after preparing suitable masks, use
20333 vpshufb; vpshufb; vpermq; vpor. */
20334
20335 if (mode == V16HImode)
20336 {
20337 maskmode = mode = V32QImode;
20338 w = 32;
20339 e = 1;
20340 }
20341 else
20342 {
20343 maskmode = mode = V8SImode;
20344 w = 8;
20345 e = 4;
20346 }
20347 t1 = gen_reg_rtx (maskmode);
20348
20349 /* Replicate the low bits of the V4DImode mask into V8SImode:
20350 mask = { A B C D }
20351 t1 = { A A B B C C D D }. */
20352 for (i = 0; i < w / 2; ++i)
20353 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20354 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20355 vt = force_reg (maskmode, vt);
20356 mask = gen_lowpart (maskmode, mask);
20357 if (maskmode == V8SImode)
20358 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20359 else
20360 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20361
20362 /* Multiply the shuffle indices by two.  */
20363 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20364 OPTAB_DIRECT);
20365
20366 /* Add one to the odd shuffle indices:
20367 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20368 for (i = 0; i < w / 2; ++i)
20369 {
20370 vec[i * 2] = const0_rtx;
20371 vec[i * 2 + 1] = const1_rtx;
20372 }
20373 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20374 vt = force_const_mem (maskmode, vt);
20375 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20376 OPTAB_DIRECT);
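	  /* e.g. a V4DImode mask { 3 0 2 1 } is replicated to
	     { 3 3 0 0 2 2 1 1 }, doubled to { 6 6 0 0 4 4 2 2 }, and then
	     becomes { 6 7 0 1 4 5 2 3 }, the equivalent V8SImode mask.  */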
20377
20378 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20379 operands[3] = mask = t1;
20380 target = gen_lowpart (mode, target);
20381 op0 = gen_lowpart (mode, op0);
20382 op1 = gen_lowpart (mode, op1);
20383 }
20384
20385 switch (mode)
20386 {
20387 case V8SImode:
20388 /* The VPERMD and VPERMPS instructions already properly ignore
20389 the high bits of the shuffle elements. No need for us to
20390 perform an AND ourselves. */
20391 if (one_operand_shuffle)
20392 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20393 else
20394 {
20395 t1 = gen_reg_rtx (V8SImode);
20396 t2 = gen_reg_rtx (V8SImode);
20397 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20398 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20399 goto merge_two;
20400 }
20401 return;
20402
20403 case V8SFmode:
20404 mask = gen_lowpart (V8SFmode, mask);
20405 if (one_operand_shuffle)
20406 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20407 else
20408 {
20409 t1 = gen_reg_rtx (V8SFmode);
20410 t2 = gen_reg_rtx (V8SFmode);
20411 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20412 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20413 goto merge_two;
20414 }
20415 return;
20416
20417 case V4SImode:
20418 /* By combining the two 128-bit input vectors into one 256-bit
20419 input vector, we can use VPERMD and VPERMPS for the full
20420 two-operand shuffle. */
20421 t1 = gen_reg_rtx (V8SImode);
20422 t2 = gen_reg_rtx (V8SImode);
20423 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20424 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20425 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20426 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20427 return;
20428
20429 case V4SFmode:
20430 t1 = gen_reg_rtx (V8SFmode);
20431 t2 = gen_reg_rtx (V8SImode);
20432 mask = gen_lowpart (V4SImode, mask);
20433 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20434 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20435 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20436 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20437 return;
20438
20439 case V32QImode:
20440 t1 = gen_reg_rtx (V32QImode);
20441 t2 = gen_reg_rtx (V32QImode);
20442 t3 = gen_reg_rtx (V32QImode);
20443 vt2 = GEN_INT (128);
20444 for (i = 0; i < 32; i++)
20445 vec[i] = vt2;
20446 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20447 vt = force_reg (V32QImode, vt);
20448 for (i = 0; i < 32; i++)
20449 vec[i] = i < 16 ? vt2 : const0_rtx;
20450 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20451 vt2 = force_reg (V32QImode, vt2);
20452 /* From mask create two adjusted masks, which contain the same
20453 bits as mask in the low 7 bits of each vector element.
20454 The first mask will have the most significant bit clear
20455 if it requests an element from the same 128-bit lane
20456 and the MSB set if it requests an element from the other 128-bit lane.
20457 The second mask will have the opposite values of the MSB,
20458 and additionally will have its 128-bit lanes swapped.
20459 E.g. the mask vector { 07 12 1e 09 ... | 17 19 05 1f ... } will have
20460 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20461 t3 { 97 99 05 9f ... | 87 12 1e 89 ... }, where each ...
20462 stands for the other 12 bytes. */
20463 /* The bit whether element is from the same lane or the other
20464 lane is bit 4, so shift it up by 3 to the MSB position. */
20465 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20466 gen_lowpart (V4DImode, mask),
20467 GEN_INT (3)));
20468 /* Clear MSB bits from the mask just in case it had them set. */
20469 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20470 /* After this t1 will have MSB set for elements from other lane. */
20471 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20472 /* Clear bits other than MSB. */
20473 emit_insn (gen_andv32qi3 (t1, t1, vt));
20474 /* Or in the lower bits from mask into t3. */
20475 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20476 /* And invert MSB bits in t1, so MSB is set for elements from the same
20477 lane. */
20478 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20479 /* Swap 128-bit lanes in t3. */
20480 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20481 gen_lowpart (V4DImode, t3),
20482 const2_rtx, GEN_INT (3),
20483 const0_rtx, const1_rtx));
20484 /* And or in the lower bits from mask into t1. */
20485 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20486 if (one_operand_shuffle)
20487 {
20488 /* Each of these shuffles will put 0s in places where
20489 element from the other 128-bit lane is needed, otherwise
20490 will shuffle in the requested value. */
20491 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20492 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20493 /* For t3 the 128-bit lanes are swapped again. */
20494 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20495 gen_lowpart (V4DImode, t3),
20496 const2_rtx, GEN_INT (3),
20497 const0_rtx, const1_rtx));
20498 /* And oring both together leads to the result. */
20499 emit_insn (gen_iorv32qi3 (target, t1, t3));
20500 return;
20501 }
20502
20503 t4 = gen_reg_rtx (V32QImode);
20504 /* Similar to the one_operand_shuffle code above, but
20505 repeated twice, once for each operand.  The merge_two:
20506 code below will merge the two results together. */
20507 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20508 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20509 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20510 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20511 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20512 gen_lowpart (V4DImode, t4),
20513 const2_rtx, GEN_INT (3),
20514 const0_rtx, const1_rtx));
20515 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20516 gen_lowpart (V4DImode, t3),
20517 const2_rtx, GEN_INT (3),
20518 const0_rtx, const1_rtx));
20519 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20520 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20521 t1 = t4;
20522 t2 = t3;
20523 goto merge_two;
20524
20525 default:
20526 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20527 break;
20528 }
20529 }
20530
20531 if (TARGET_XOP)
20532 {
20533 /* The XOP VPPERM insn supports three inputs. By ignoring the
20534 one_operand_shuffle special case, we avoid creating another
20535 set of constant vectors in memory. */
20536 one_operand_shuffle = false;
20537
20538 /* mask = mask & {2*w-1, ...} */
20539 vt = GEN_INT (2*w - 1);
20540 }
20541 else
20542 {
20543 /* mask = mask & {w-1, ...} */
20544 vt = GEN_INT (w - 1);
20545 }
20546
20547 for (i = 0; i < w; i++)
20548 vec[i] = vt;
20549 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20550 mask = expand_simple_binop (maskmode, AND, mask, vt,
20551 NULL_RTX, 0, OPTAB_DIRECT);
20552
20553 /* For non-QImode operations, convert the word permutation control
20554 into a byte permutation control. */
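  /* e.g. for a V4SImode shuffle the element size E is 4: a word index of 2
     is shifted left by 2 to give 8, that value is replicated into all four
     byte positions of its element, and adding { 0 1 2 3 ... } turns it into
     the byte indices { 8 9 10 11 }.  */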
20555 if (mode != V16QImode)
20556 {
20557 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20558 GEN_INT (exact_log2 (e)),
20559 NULL_RTX, 0, OPTAB_DIRECT);
20560
20561 /* Convert mask to vector of chars. */
20562 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20563
20564 /* Replicate each of the input bytes into byte positions:
20565 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20566 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20567 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20568 for (i = 0; i < 16; ++i)
20569 vec[i] = GEN_INT (i/e * e);
20570 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20571 vt = force_const_mem (V16QImode, vt);
20572 if (TARGET_XOP)
20573 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20574 else
20575 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20576
20577 /* Convert it into the byte positions by doing
20578 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20579 for (i = 0; i < 16; ++i)
20580 vec[i] = GEN_INT (i % e);
20581 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20582 vt = force_const_mem (V16QImode, vt);
20583 emit_insn (gen_addv16qi3 (mask, mask, vt));
20584 }
20585
20586 /* The actual shuffle operations all operate on V16QImode. */
20587 op0 = gen_lowpart (V16QImode, op0);
20588 op1 = gen_lowpart (V16QImode, op1);
20589 target = gen_lowpart (V16QImode, target);
20590
20591 if (TARGET_XOP)
20592 {
20593 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20594 }
20595 else if (one_operand_shuffle)
20596 {
20597 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20598 }
20599 else
20600 {
20601 rtx xops[6];
20602 bool ok;
20603
20604 /* Shuffle the two input vectors independently. */
20605 t1 = gen_reg_rtx (V16QImode);
20606 t2 = gen_reg_rtx (V16QImode);
20607 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20608 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20609
20610 merge_two:
20611 /* Then merge them together. The key is whether any given control
20612 element contained a bit set that indicates the second word. */
20613 mask = operands[3];
20614 vt = GEN_INT (w);
20615 if (maskmode == V2DImode && !TARGET_SSE4_1)
20616 {
20617 /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
20618 more shuffle to convert the V2DI input mask into a V4SI
20619 input mask, at which point the masking done by
20620 expand_int_vcond will work as desired. */
20621 rtx t3 = gen_reg_rtx (V4SImode);
20622 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20623 const0_rtx, const0_rtx,
20624 const2_rtx, const2_rtx));
20625 mask = t3;
20626 maskmode = V4SImode;
20627 e = w = 4;
20628 }
20629
20630 for (i = 0; i < w; i++)
20631 vec[i] = vt;
20632 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20633 vt = force_reg (maskmode, vt);
20634 mask = expand_simple_binop (maskmode, AND, mask, vt,
20635 NULL_RTX, 0, OPTAB_DIRECT);
20636
20637 xops[0] = gen_lowpart (mode, operands[0]);
20638 xops[1] = gen_lowpart (mode, t2);
20639 xops[2] = gen_lowpart (mode, t1);
20640 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20641 xops[4] = mask;
20642 xops[5] = vt;
20643 ok = ix86_expand_int_vcond (xops);
20644 gcc_assert (ok);
20645 }
20646 }
20647
20648 /* Unpack SRC into DEST as the next wider integer vector type.  UNSIGNED_P
20649 is true if we should do zero extension, else sign extension.  HIGH_P is
20650 true if we want the N/2 high elements, else the low elements. */
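/* e.g. unpacking the high half of a V8HImode vector { a b c d e f g h }
   produces the V4SImode vector { e f g h }, with each element zero- or
   sign-extended to 32 bits according to UNSIGNED_P.  */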
20651
20652 void
20653 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20654 {
20655 enum machine_mode imode = GET_MODE (src);
20656 rtx tmp;
20657
20658 if (TARGET_SSE4_1)
20659 {
20660 rtx (*unpack)(rtx, rtx);
20661 rtx (*extract)(rtx, rtx) = NULL;
20662 enum machine_mode halfmode = BLKmode;
20663
20664 switch (imode)
20665 {
20666 case V32QImode:
20667 if (unsigned_p)
20668 unpack = gen_avx2_zero_extendv16qiv16hi2;
20669 else
20670 unpack = gen_avx2_sign_extendv16qiv16hi2;
20671 halfmode = V16QImode;
20672 extract
20673 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20674 break;
20675 case V16HImode:
20676 if (unsigned_p)
20677 unpack = gen_avx2_zero_extendv8hiv8si2;
20678 else
20679 unpack = gen_avx2_sign_extendv8hiv8si2;
20680 halfmode = V8HImode;
20681 extract
20682 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20683 break;
20684 case V8SImode:
20685 if (unsigned_p)
20686 unpack = gen_avx2_zero_extendv4siv4di2;
20687 else
20688 unpack = gen_avx2_sign_extendv4siv4di2;
20689 halfmode = V4SImode;
20690 extract
20691 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20692 break;
20693 case V16QImode:
20694 if (unsigned_p)
20695 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20696 else
20697 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20698 break;
20699 case V8HImode:
20700 if (unsigned_p)
20701 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20702 else
20703 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20704 break;
20705 case V4SImode:
20706 if (unsigned_p)
20707 unpack = gen_sse4_1_zero_extendv2siv2di2;
20708 else
20709 unpack = gen_sse4_1_sign_extendv2siv2di2;
20710 break;
20711 default:
20712 gcc_unreachable ();
20713 }
20714
20715 if (GET_MODE_SIZE (imode) == 32)
20716 {
20717 tmp = gen_reg_rtx (halfmode);
20718 emit_insn (extract (tmp, src));
20719 }
20720 else if (high_p)
20721 {
20722 /* Shift the higher 8 bytes into the lower 8 bytes.  */
20723 tmp = gen_reg_rtx (imode);
20724 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20725 gen_lowpart (V1TImode, src),
20726 GEN_INT (64)));
20727 }
20728 else
20729 tmp = src;
20730
20731 emit_insn (unpack (dest, tmp));
20732 }
20733 else
20734 {
20735 rtx (*unpack)(rtx, rtx, rtx);
20736
20737 switch (imode)
20738 {
20739 case V16QImode:
20740 if (high_p)
20741 unpack = gen_vec_interleave_highv16qi;
20742 else
20743 unpack = gen_vec_interleave_lowv16qi;
20744 break;
20745 case V8HImode:
20746 if (high_p)
20747 unpack = gen_vec_interleave_highv8hi;
20748 else
20749 unpack = gen_vec_interleave_lowv8hi;
20750 break;
20751 case V4SImode:
20752 if (high_p)
20753 unpack = gen_vec_interleave_highv4si;
20754 else
20755 unpack = gen_vec_interleave_lowv4si;
20756 break;
20757 default:
20758 gcc_unreachable ();
20759 }
20760
20761 if (unsigned_p)
20762 tmp = force_reg (imode, CONST0_RTX (imode));
20763 else
20764 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20765 src, pc_rtx, pc_rtx);
20766
20767 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20768 }
20769 }
20770
20771 /* Expand a conditional increment or decrement using adc/sbb instructions.
20772 The default case, using setcc followed by a conditional move, can be
20773 done by generic code. */
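/* e.g. "x += (a < b)" for unsigned operands becomes "cmp a, b; adc x, 0",
   since the compare leaves the carry flag set exactly when a < b;
   "x -= (a < b)" likewise becomes "cmp a, b; sbb x, 0".  */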
20774 bool
20775 ix86_expand_int_addcc (rtx operands[])
20776 {
20777 enum rtx_code code = GET_CODE (operands[1]);
20778 rtx flags;
20779 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20780 rtx compare_op;
20781 rtx val = const0_rtx;
20782 bool fpcmp = false;
20783 enum machine_mode mode;
20784 rtx op0 = XEXP (operands[1], 0);
20785 rtx op1 = XEXP (operands[1], 1);
20786
20787 if (operands[3] != const1_rtx
20788 && operands[3] != constm1_rtx)
20789 return false;
20790 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20791 return false;
20792 code = GET_CODE (compare_op);
20793
20794 flags = XEXP (compare_op, 0);
20795
20796 if (GET_MODE (flags) == CCFPmode
20797 || GET_MODE (flags) == CCFPUmode)
20798 {
20799 fpcmp = true;
20800 code = ix86_fp_compare_code_to_integer (code);
20801 }
20802
20803 if (code != LTU)
20804 {
20805 val = constm1_rtx;
20806 if (fpcmp)
20807 PUT_CODE (compare_op,
20808 reverse_condition_maybe_unordered
20809 (GET_CODE (compare_op)));
20810 else
20811 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20812 }
20813
20814 mode = GET_MODE (operands[0]);
20815
20816 /* Construct either adc or sbb insn. */
20817 if ((code == LTU) == (operands[3] == constm1_rtx))
20818 {
20819 switch (mode)
20820 {
20821 case QImode:
20822 insn = gen_subqi3_carry;
20823 break;
20824 case HImode:
20825 insn = gen_subhi3_carry;
20826 break;
20827 case SImode:
20828 insn = gen_subsi3_carry;
20829 break;
20830 case DImode:
20831 insn = gen_subdi3_carry;
20832 break;
20833 default:
20834 gcc_unreachable ();
20835 }
20836 }
20837 else
20838 {
20839 switch (mode)
20840 {
20841 case QImode:
20842 insn = gen_addqi3_carry;
20843 break;
20844 case HImode:
20845 insn = gen_addhi3_carry;
20846 break;
20847 case SImode:
20848 insn = gen_addsi3_carry;
20849 break;
20850 case DImode:
20851 insn = gen_adddi3_carry;
20852 break;
20853 default:
20854 gcc_unreachable ();
20855 }
20856 }
20857 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20858
20859 return true;
20860 }
20861
20862
20863 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
20864 but works for floating point parameters and non-offsettable memories.
20865 For pushes, it returns just stack offsets; the values will be saved
20866 in the right order.  At most four parts are generated. */
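/* e.g. on a 32-bit target a DFmode operand splits into two SImode parts,
   an XFmode operand into three and a TFmode operand into four, while on a
   64-bit target a TFmode operand splits into two DImode parts.  */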
20867
20868 static int
20869 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20870 {
20871 int size;
20872
20873 if (!TARGET_64BIT)
20874 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20875 else
20876 size = (GET_MODE_SIZE (mode) + 4) / 8;
20877
20878 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20879 gcc_assert (size >= 2 && size <= 4);
20880
20881 /* Optimize constant pool references to immediates.  This is used by fp
20882 moves, which force all constants to memory to allow combining. */
20883 if (MEM_P (operand) && MEM_READONLY_P (operand))
20884 {
20885 rtx tmp = maybe_get_pool_constant (operand);
20886 if (tmp)
20887 operand = tmp;
20888 }
20889
20890 if (MEM_P (operand) && !offsettable_memref_p (operand))
20891 {
20892 /* The only non-offsettable memories we handle are pushes.  */
20893 int ok = push_operand (operand, VOIDmode);
20894
20895 gcc_assert (ok);
20896
20897 operand = copy_rtx (operand);
20898 PUT_MODE (operand, word_mode);
20899 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20900 return size;
20901 }
20902
20903 if (GET_CODE (operand) == CONST_VECTOR)
20904 {
20905 enum machine_mode imode = int_mode_for_mode (mode);
20906 /* Caution: if we looked through a constant pool memory above,
20907 the operand may actually have a different mode now. That's
20908 ok, since we want to pun this all the way back to an integer. */
20909 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20910 gcc_assert (operand != NULL);
20911 mode = imode;
20912 }
20913
20914 if (!TARGET_64BIT)
20915 {
20916 if (mode == DImode)
20917 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20918 else
20919 {
20920 int i;
20921
20922 if (REG_P (operand))
20923 {
20924 gcc_assert (reload_completed);
20925 for (i = 0; i < size; i++)
20926 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20927 }
20928 else if (offsettable_memref_p (operand))
20929 {
20930 operand = adjust_address (operand, SImode, 0);
20931 parts[0] = operand;
20932 for (i = 1; i < size; i++)
20933 parts[i] = adjust_address (operand, SImode, 4 * i);
20934 }
20935 else if (GET_CODE (operand) == CONST_DOUBLE)
20936 {
20937 REAL_VALUE_TYPE r;
20938 long l[4];
20939
20940 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20941 switch (mode)
20942 {
20943 case TFmode:
20944 real_to_target (l, &r, mode);
20945 parts[3] = gen_int_mode (l[3], SImode);
20946 parts[2] = gen_int_mode (l[2], SImode);
20947 break;
20948 case XFmode:
20949 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20950 long double may not be 80-bit. */
20951 real_to_target (l, &r, mode);
20952 parts[2] = gen_int_mode (l[2], SImode);
20953 break;
20954 case DFmode:
20955 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20956 break;
20957 default:
20958 gcc_unreachable ();
20959 }
20960 parts[1] = gen_int_mode (l[1], SImode);
20961 parts[0] = gen_int_mode (l[0], SImode);
20962 }
20963 else
20964 gcc_unreachable ();
20965 }
20966 }
20967 else
20968 {
20969 if (mode == TImode)
20970 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20971 if (mode == XFmode || mode == TFmode)
20972 {
20973 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20974 if (REG_P (operand))
20975 {
20976 gcc_assert (reload_completed);
20977 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20978 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20979 }
20980 else if (offsettable_memref_p (operand))
20981 {
20982 operand = adjust_address (operand, DImode, 0);
20983 parts[0] = operand;
20984 parts[1] = adjust_address (operand, upper_mode, 8);
20985 }
20986 else if (GET_CODE (operand) == CONST_DOUBLE)
20987 {
20988 REAL_VALUE_TYPE r;
20989 long l[4];
20990
20991 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20992 real_to_target (l, &r, mode);
20993
20994 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20995 if (HOST_BITS_PER_WIDE_INT >= 64)
20996 parts[0]
20997 = gen_int_mode
20998 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20999 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21000 DImode);
21001 else
21002 parts[0] = immed_double_const (l[0], l[1], DImode);
21003
21004 if (upper_mode == SImode)
21005 parts[1] = gen_int_mode (l[2], SImode);
21006 else if (HOST_BITS_PER_WIDE_INT >= 64)
21007 parts[1]
21008 = gen_int_mode
21009 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21010 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21011 DImode);
21012 else
21013 parts[1] = immed_double_const (l[2], l[3], DImode);
21014 }
21015 else
21016 gcc_unreachable ();
21017 }
21018 }
21019
21020 return size;
21021 }
21022
21023 /* Emit insns to perform a move or push of DI, DF, XF, and TF values;
21024 all required insns are emitted here.  Operands 2 onwards receive the
21025 destination parts in the correct order; operands 6 onwards receive
21026 the corresponding source parts. */
21027
21028 void
21029 ix86_split_long_move (rtx operands[])
21030 {
21031 rtx part[2][4];
21032 int nparts, i, j;
21033 int push = 0;
21034 int collisions = 0;
21035 enum machine_mode mode = GET_MODE (operands[0]);
21036 bool collisionparts[4];
21037
21038 /* The DFmode expanders may ask us to move a double.
21039 For a 64-bit target this is a single move.  By hiding that fact
21040 here we simplify the i386.md splitters. */
21041 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21042 {
21043 /* Optimize constant pool references to immediates.  This is used by
21044 fp moves, which force all constants to memory to allow combining. */
21045
21046 if (MEM_P (operands[1])
21047 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21048 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21049 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21050 if (push_operand (operands[0], VOIDmode))
21051 {
21052 operands[0] = copy_rtx (operands[0]);
21053 PUT_MODE (operands[0], word_mode);
21054 }
21055 else
21056 operands[0] = gen_lowpart (DImode, operands[0]);
21057 operands[1] = gen_lowpart (DImode, operands[1]);
21058 emit_move_insn (operands[0], operands[1]);
21059 return;
21060 }
21061
21062 /* The only non-offsettable memory we handle is push. */
21063 if (push_operand (operands[0], VOIDmode))
21064 push = 1;
21065 else
21066 gcc_assert (!MEM_P (operands[0])
21067 || offsettable_memref_p (operands[0]));
21068
21069 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21070 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21071
21072 /* When emitting a push, take care of source operands on the stack. */
21073 if (push && MEM_P (operands[1])
21074 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21075 {
21076 rtx src_base = XEXP (part[1][nparts - 1], 0);
21077
21078 /* Compensate for the stack decrement by 4. */
21079 if (!TARGET_64BIT && nparts == 3
21080 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21081 src_base = plus_constant (Pmode, src_base, 4);
21082
21083 /* src_base refers to the stack pointer and is
21084 automatically decreased by emitted push. */
21085 for (i = 0; i < nparts; i++)
21086 part[1][i] = change_address (part[1][i],
21087 GET_MODE (part[1][i]), src_base);
21088 }
21089
21090 /* We need to do the copy in the right order in case an address register
21091 of the source overlaps the destination. */
21092 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21093 {
21094 rtx tmp;
21095
21096 for (i = 0; i < nparts; i++)
21097 {
21098 collisionparts[i]
21099 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21100 if (collisionparts[i])
21101 collisions++;
21102 }
21103
21104 /* Collision in the middle part can be handled by reordering. */
21105 if (collisions == 1 && nparts == 3 && collisionparts [1])
21106 {
21107 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21108 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21109 }
21110 else if (collisions == 1
21111 && nparts == 4
21112 && (collisionparts [1] || collisionparts [2]))
21113 {
21114 if (collisionparts [1])
21115 {
21116 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21117 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21118 }
21119 else
21120 {
21121 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21122 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21123 }
21124 }
21125
21126 /* If there are more collisions, we can't handle them by reordering.
21127 Do an lea into the last part and use only one colliding move. */
21128 else if (collisions > 1)
21129 {
21130 rtx base;
21131
21132 collisions = 1;
21133
21134 base = part[0][nparts - 1];
21135
21136 /* Handle the case when the last part isn't valid for lea.
21137 Happens in 64-bit mode storing the 12-byte XFmode. */
21138 if (GET_MODE (base) != Pmode)
21139 base = gen_rtx_REG (Pmode, REGNO (base));
21140
21141 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21142 part[1][0] = replace_equiv_address (part[1][0], base);
21143 for (i = 1; i < nparts; i++)
21144 {
21145 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21146 part[1][i] = replace_equiv_address (part[1][i], tmp);
21147 }
21148 }
21149 }
21150
21151 if (push)
21152 {
21153 if (!TARGET_64BIT)
21154 {
21155 if (nparts == 3)
21156 {
21157 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21158 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21159 stack_pointer_rtx, GEN_INT (-4)));
21160 emit_move_insn (part[0][2], part[1][2]);
21161 }
21162 else if (nparts == 4)
21163 {
21164 emit_move_insn (part[0][3], part[1][3]);
21165 emit_move_insn (part[0][2], part[1][2]);
21166 }
21167 }
21168 else
21169 {
21170 /* In 64-bit mode a 32-bit push is not available.  If the operand is a
21171 register, that is fine - we just use the larger counterpart.  We also
21172 retype memory operands - these come from an attempt to avoid a REX
21173 prefix when moving the second half of a TFmode value. */
21174 if (GET_MODE (part[1][1]) == SImode)
21175 {
21176 switch (GET_CODE (part[1][1]))
21177 {
21178 case MEM:
21179 part[1][1] = adjust_address (part[1][1], DImode, 0);
21180 break;
21181
21182 case REG:
21183 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21184 break;
21185
21186 default:
21187 gcc_unreachable ();
21188 }
21189
21190 if (GET_MODE (part[1][0]) == SImode)
21191 part[1][0] = part[1][1];
21192 }
21193 }
21194 emit_move_insn (part[0][1], part[1][1]);
21195 emit_move_insn (part[0][0], part[1][0]);
21196 return;
21197 }
21198
21199 /* Choose correct order to not overwrite the source before it is copied. */
21200 if ((REG_P (part[0][0])
21201 && REG_P (part[1][1])
21202 && (REGNO (part[0][0]) == REGNO (part[1][1])
21203 || (nparts == 3
21204 && REGNO (part[0][0]) == REGNO (part[1][2]))
21205 || (nparts == 4
21206 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21207 || (collisions > 0
21208 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21209 {
21210 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21211 {
21212 operands[2 + i] = part[0][j];
21213 operands[6 + i] = part[1][j];
21214 }
21215 }
21216 else
21217 {
21218 for (i = 0; i < nparts; i++)
21219 {
21220 operands[2 + i] = part[0][i];
21221 operands[6 + i] = part[1][i];
21222 }
21223 }
21224
21225 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21226 if (optimize_insn_for_size_p ())
21227 {
21228 for (j = 0; j < nparts - 1; j++)
21229 if (CONST_INT_P (operands[6 + j])
21230 && operands[6 + j] != const0_rtx
21231 && REG_P (operands[2 + j]))
21232 for (i = j; i < nparts - 1; i++)
21233 if (CONST_INT_P (operands[7 + i])
21234 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21235 operands[7 + i] = operands[2 + j];
21236 }
21237
21238 for (i = 0; i < nparts; i++)
21239 emit_move_insn (operands[2 + i], operands[6 + i]);
21240
21241 return;
21242 }
21243
21244 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21245 left shift by a constant, using either a single shift or
21246 a sequence of add instructions. */
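/* e.g. a shift left by 1 is always emitted as a single add (x = x + x);
   a shift by 2 may become two adds when twice the add cost does not exceed
   the constant-shift cost and we are not optimizing for size; otherwise a
   single shift insn is emitted.  */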
21247
21248 static void
21249 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21250 {
21251 rtx (*insn)(rtx, rtx, rtx);
21252
21253 if (count == 1
21254 || (count * ix86_cost->add <= ix86_cost->shift_const
21255 && !optimize_insn_for_size_p ()))
21256 {
21257 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21258 while (count-- > 0)
21259 emit_insn (insn (operand, operand, operand));
21260 }
21261 else
21262 {
21263 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21264 emit_insn (insn (operand, operand, GEN_INT (count)));
21265 }
21266 }
21267
21268 void
21269 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21270 {
21271 rtx (*gen_ashl3)(rtx, rtx, rtx);
21272 rtx (*gen_shld)(rtx, rtx, rtx);
21273 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21274
21275 rtx low[2], high[2];
21276 int count;
21277
21278 if (CONST_INT_P (operands[2]))
21279 {
21280 split_double_mode (mode, operands, 2, low, high);
21281 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21282
21283 if (count >= half_width)
21284 {
21285 emit_move_insn (high[0], low[1]);
21286 emit_move_insn (low[0], const0_rtx);
21287
21288 if (count > half_width)
21289 ix86_expand_ashl_const (high[0], count - half_width, mode);
21290 }
21291 else
21292 {
21293 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21294
21295 if (!rtx_equal_p (operands[0], operands[1]))
21296 emit_move_insn (operands[0], operands[1]);
21297
21298 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21299 ix86_expand_ashl_const (low[0], count, mode);
21300 }
21301 return;
21302 }
21303
21304 split_double_mode (mode, operands, 1, low, high);
21305
21306 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21307
21308 if (operands[1] == const1_rtx)
21309 {
21310 /* Assuming we've chosen QImode-capable registers, 1 << N
21311 can be done with two 32/64-bit shifts, no branches, no cmoves. */
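      /* e.g. for a DImode shift on a 32-bit target: set low = ((N & 32) == 0)
	 and high = ((N & 32) != 0); shifting both halves left by N (the
	 hardware uses N mod 32) then produces the correct 64-bit 1 << N.  */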
21312 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21313 {
21314 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21315
21316 ix86_expand_clear (low[0]);
21317 ix86_expand_clear (high[0]);
21318 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21319
21320 d = gen_lowpart (QImode, low[0]);
21321 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21322 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21323 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21324
21325 d = gen_lowpart (QImode, high[0]);
21326 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21327 s = gen_rtx_NE (QImode, flags, const0_rtx);
21328 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21329 }
21330
21331 /* Otherwise, we can get the same results by manually performing
21332 a bit extract operation on bit 5/6, and then performing the two
21333 shifts. The two methods of getting 0/1 into low/high are exactly
21334 the same size. Avoiding the shift in the bit extract case helps
21335 pentium4 a bit; no one else seems to care much either way. */
21336 else
21337 {
21338 enum machine_mode half_mode;
21339 rtx (*gen_lshr3)(rtx, rtx, rtx);
21340 rtx (*gen_and3)(rtx, rtx, rtx);
21341 rtx (*gen_xor3)(rtx, rtx, rtx);
21342 HOST_WIDE_INT bits;
21343 rtx x;
21344
21345 if (mode == DImode)
21346 {
21347 half_mode = SImode;
21348 gen_lshr3 = gen_lshrsi3;
21349 gen_and3 = gen_andsi3;
21350 gen_xor3 = gen_xorsi3;
21351 bits = 5;
21352 }
21353 else
21354 {
21355 half_mode = DImode;
21356 gen_lshr3 = gen_lshrdi3;
21357 gen_and3 = gen_anddi3;
21358 gen_xor3 = gen_xordi3;
21359 bits = 6;
21360 }
21361
21362 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21363 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21364 else
21365 x = gen_lowpart (half_mode, operands[2]);
21366 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21367
21368 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21369 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21370 emit_move_insn (low[0], high[0]);
21371 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21372 }
21373
21374 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21375 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21376 return;
21377 }
21378
21379 if (operands[1] == constm1_rtx)
21380 {
21381 /* For -1 << N, we can avoid the shld instruction, because we
21382 know that we're shifting 0...31/63 ones into a -1. */
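      /* e.g. for DImode on a 32-bit target with N < 32 the high half simply
	 stays -1, because the bits shifted in from the low half are all ones;
	 only the low half needs a real shift.  N >= 32 is handled by the
	 adjustment sequence emitted below.  */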
21383 emit_move_insn (low[0], constm1_rtx);
21384 if (optimize_insn_for_size_p ())
21385 emit_move_insn (high[0], low[0]);
21386 else
21387 emit_move_insn (high[0], constm1_rtx);
21388 }
21389 else
21390 {
21391 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21392
21393 if (!rtx_equal_p (operands[0], operands[1]))
21394 emit_move_insn (operands[0], operands[1]);
21395
21396 split_double_mode (mode, operands, 1, low, high);
21397 emit_insn (gen_shld (high[0], low[0], operands[2]));
21398 }
21399
21400 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21401
21402 if (TARGET_CMOVE && scratch)
21403 {
21404 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21405 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21406
21407 ix86_expand_clear (scratch);
21408 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21409 }
21410 else
21411 {
21412 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21413 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21414
21415 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21416 }
21417 }
21418
21419 void
21420 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21421 {
21422 rtx (*gen_ashr3)(rtx, rtx, rtx)
21423 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21424 rtx (*gen_shrd)(rtx, rtx, rtx);
21425 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21426
21427 rtx low[2], high[2];
21428 int count;
21429
21430 if (CONST_INT_P (operands[2]))
21431 {
21432 split_double_mode (mode, operands, 2, low, high);
21433 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21434
21435 if (count == GET_MODE_BITSIZE (mode) - 1)
21436 {
21437 emit_move_insn (high[0], high[1]);
21438 emit_insn (gen_ashr3 (high[0], high[0],
21439 GEN_INT (half_width - 1)));
21440 emit_move_insn (low[0], high[0]);
21441
21442 }
21443 else if (count >= half_width)
21444 {
21445 emit_move_insn (low[0], high[1]);
21446 emit_move_insn (high[0], low[0]);
21447 emit_insn (gen_ashr3 (high[0], high[0],
21448 GEN_INT (half_width - 1)));
21449
21450 if (count > half_width)
21451 emit_insn (gen_ashr3 (low[0], low[0],
21452 GEN_INT (count - half_width)));
21453 }
21454 else
21455 {
21456 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21457
21458 if (!rtx_equal_p (operands[0], operands[1]))
21459 emit_move_insn (operands[0], operands[1]);
21460
21461 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21462 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21463 }
21464 }
21465 else
21466 {
21467 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21468
21469 if (!rtx_equal_p (operands[0], operands[1]))
21470 emit_move_insn (operands[0], operands[1]);
21471
21472 split_double_mode (mode, operands, 1, low, high);
21473
21474 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21475 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21476
21477 if (TARGET_CMOVE && scratch)
21478 {
21479 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21480 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21481
21482 emit_move_insn (scratch, high[0]);
21483 emit_insn (gen_ashr3 (scratch, scratch,
21484 GEN_INT (half_width - 1)));
21485 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21486 scratch));
21487 }
21488 else
21489 {
21490 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21491 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21492
21493 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21494 }
21495 }
21496 }
21497
21498 void
21499 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21500 {
21501 rtx (*gen_lshr3)(rtx, rtx, rtx)
21502 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21503 rtx (*gen_shrd)(rtx, rtx, rtx);
21504 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21505
21506 rtx low[2], high[2];
21507 int count;
21508
21509 if (CONST_INT_P (operands[2]))
21510 {
21511 split_double_mode (mode, operands, 2, low, high);
21512 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21513
21514 if (count >= half_width)
21515 {
21516 emit_move_insn (low[0], high[1]);
21517 ix86_expand_clear (high[0]);
21518
21519 if (count > half_width)
21520 emit_insn (gen_lshr3 (low[0], low[0],
21521 GEN_INT (count - half_width)));
21522 }
21523 else
21524 {
21525 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21526
21527 if (!rtx_equal_p (operands[0], operands[1]))
21528 emit_move_insn (operands[0], operands[1]);
21529
21530 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21531 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21532 }
21533 }
21534 else
21535 {
21536 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21537
21538 if (!rtx_equal_p (operands[0], operands[1]))
21539 emit_move_insn (operands[0], operands[1]);
21540
21541 split_double_mode (mode, operands, 1, low, high);
21542
21543 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21544 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21545
21546 if (TARGET_CMOVE && scratch)
21547 {
21548 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21549 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21550
21551 ix86_expand_clear (scratch);
21552 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21553 scratch));
21554 }
21555 else
21556 {
21557 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21558 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21559
21560 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21561 }
21562 }
21563 }
21564
21565 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21566 static void
21567 predict_jump (int prob)
21568 {
21569 rtx insn = get_last_insn ();
21570 gcc_assert (JUMP_P (insn));
21571 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21572 }
21573
21574 /* Helper function for the string operations below.  Test whether VARIABLE
21575 is aligned to VALUE bytes.  If so, jump to the returned label. */
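/* e.g. ix86_expand_aligntest (count, 4, true) emits, in effect,
   "test $4, count; je label", so the code placed before the returned
   label runs only when bit 2 of COUNT is set.  */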
21576 static rtx
21577 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21578 {
21579 rtx label = gen_label_rtx ();
21580 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21581 if (GET_MODE (variable) == DImode)
21582 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21583 else
21584 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21585 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21586 1, label);
21587 if (epilogue)
21588 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21589 else
21590 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21591 return label;
21592 }
21593
21594 /* Decrease COUNTREG by VALUE.  */
21595 static void
21596 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21597 {
21598 rtx (*gen_add)(rtx, rtx, rtx)
21599 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21600
21601 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21602 }
21603
21604 /* Zero-extend EXP (which may be SImode) to a Pmode register.  */
21605 rtx
21606 ix86_zero_extend_to_Pmode (rtx exp)
21607 {
21608 if (GET_MODE (exp) != Pmode)
21609 exp = convert_to_mode (Pmode, exp, 1);
21610 return force_reg (Pmode, exp);
21611 }
21612
21613 /* Divide COUNTREG by SCALE. */
21614 static rtx
21615 scale_counter (rtx countreg, int scale)
21616 {
21617 rtx sc;
21618
21619 if (scale == 1)
21620 return countreg;
21621 if (CONST_INT_P (countreg))
21622 return GEN_INT (INTVAL (countreg) / scale);
21623 gcc_assert (REG_P (countreg));
21624
21625 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21626 GEN_INT (exact_log2 (scale)),
21627 NULL, 1, OPTAB_DIRECT);
21628 return sc;
21629 }
21630
21631 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21632 DImode for constant loop counts. */
21633
21634 static enum machine_mode
21635 counter_mode (rtx count_exp)
21636 {
21637 if (GET_MODE (count_exp) != VOIDmode)
21638 return GET_MODE (count_exp);
21639 if (!CONST_INT_P (count_exp))
21640 return Pmode;
21641 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21642 return DImode;
21643 return SImode;
21644 }
21645
21646 /* When SRCPTR is non-NULL, output a simple loop that copies memory from
21647 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
21648 size is COUNT, specified in bytes.  When SRCPTR is NULL, output the
21649 equivalent loop that sets memory to VALUE (expected to be in MODE).
21650 
21651 The size is rounded down to a whole number of chunks moved at once.
21652 SRCMEM and DESTMEM provide the MEM rtxes used to supply proper aliasing info. */
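/* e.g. a copy with COUNT = 100, MODE = SImode and UNROLL = 4 moves 96 bytes
   in six iterations of 16 bytes each; the remaining 4 bytes are left for the
   epilogue code.  */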
21653
21654
21655 static void
21656 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21657 rtx destptr, rtx srcptr, rtx value,
21658 rtx count, enum machine_mode mode, int unroll,
21659 int expected_size)
21660 {
21661 rtx out_label, top_label, iter, tmp;
21662 enum machine_mode iter_mode = counter_mode (count);
21663 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21664 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21665 rtx size;
21666 rtx x_addr;
21667 rtx y_addr;
21668 int i;
21669
21670 top_label = gen_label_rtx ();
21671 out_label = gen_label_rtx ();
21672 iter = gen_reg_rtx (iter_mode);
21673
21674 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21675 NULL, 1, OPTAB_DIRECT);
21676 /* Those two should combine. */
21677 if (piece_size == const1_rtx)
21678 {
21679 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21680 true, out_label);
21681 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21682 }
21683 emit_move_insn (iter, const0_rtx);
21684
21685 emit_label (top_label);
21686
21687 tmp = convert_modes (Pmode, iter_mode, iter, true);
21688 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21689 destmem = change_address (destmem, mode, x_addr);
21690
21691 if (srcmem)
21692 {
21693 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21694 srcmem = change_address (srcmem, mode, y_addr);
21695
21696 /* When unrolling for chips that reorder memory reads and writes,
21697 we can save registers by using a single temporary.
21698 Using 4 temporaries is also overkill in 32-bit mode. */
21699 if (!TARGET_64BIT && 0)
21700 {
21701 for (i = 0; i < unroll; i++)
21702 {
21703 if (i)
21704 {
21705 destmem =
21706 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21707 srcmem =
21708 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21709 }
21710 emit_move_insn (destmem, srcmem);
21711 }
21712 }
21713 else
21714 {
21715 rtx tmpreg[4];
21716 gcc_assert (unroll <= 4);
21717 for (i = 0; i < unroll; i++)
21718 {
21719 tmpreg[i] = gen_reg_rtx (mode);
21720 if (i)
21721 {
21722 srcmem =
21723 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21724 }
21725 emit_move_insn (tmpreg[i], srcmem);
21726 }
21727 for (i = 0; i < unroll; i++)
21728 {
21729 if (i)
21730 {
21731 destmem =
21732 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21733 }
21734 emit_move_insn (destmem, tmpreg[i]);
21735 }
21736 }
21737 }
21738 else
21739 for (i = 0; i < unroll; i++)
21740 {
21741 if (i)
21742 destmem =
21743 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21744 emit_move_insn (destmem, value);
21745 }
21746
21747 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21748 true, OPTAB_LIB_WIDEN);
21749 if (tmp != iter)
21750 emit_move_insn (iter, tmp);
21751
21752 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21753 true, top_label);
21754 if (expected_size != -1)
21755 {
21756 expected_size /= GET_MODE_SIZE (mode) * unroll;
21757 if (expected_size == 0)
21758 predict_jump (0);
21759 else if (expected_size > REG_BR_PROB_BASE)
21760 predict_jump (REG_BR_PROB_BASE - 1);
21761 else
21762 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21763 }
21764 else
21765 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21766 iter = ix86_zero_extend_to_Pmode (iter);
21767 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21768 true, OPTAB_LIB_WIDEN);
21769 if (tmp != destptr)
21770 emit_move_insn (destptr, tmp);
21771 if (srcptr)
21772 {
21773 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21774 true, OPTAB_LIB_WIDEN);
21775 if (tmp != srcptr)
21776 emit_move_insn (srcptr, tmp);
21777 }
21778 emit_label (out_label);
21779 }
21780
21781 /* Output a "rep; mov" instruction.
21782 Arguments have the same meaning as for the previous function. */
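/* e.g. copying a known 259 bytes with MODE == SImode emits a "rep; mov" of
   64 SImode chunks (256 bytes); the trailing 3 bytes are left to the
   epilogue code.  */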
21783 static void
21784 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21785 rtx destptr, rtx srcptr,
21786 rtx count,
21787 enum machine_mode mode)
21788 {
21789 rtx destexp;
21790 rtx srcexp;
21791 rtx countreg;
21792 HOST_WIDE_INT rounded_count;
21793
21794 /* If the size is known and a multiple of 4, it is shorter to use a
21795 word-wide rep movs than a byte-wide one. */
21795 if (mode == QImode && CONST_INT_P (count)
21796 && !(INTVAL (count) & 3))
21797 mode = SImode;
21798
21799 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21800 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21801 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21802 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21803 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21804 if (mode != QImode)
21805 {
21806 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21807 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21808 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21809 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21810 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21811 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21812 }
21813 else
21814 {
21815 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21816 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21817 }
21818 if (CONST_INT_P (count))
21819 {
21820 rounded_count = (INTVAL (count)
21821 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21822 destmem = shallow_copy_rtx (destmem);
21823 srcmem = shallow_copy_rtx (srcmem);
21824 set_mem_size (destmem, rounded_count);
21825 set_mem_size (srcmem, rounded_count);
21826 }
21827 else
21828 {
21829 if (MEM_SIZE_KNOWN_P (destmem))
21830 clear_mem_size (destmem);
21831 if (MEM_SIZE_KNOWN_P (srcmem))
21832 clear_mem_size (srcmem);
21833 }
21834 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21835 destexp, srcexp));
21836 }
21837
21838 /* Output a "rep; stos" instruction.
21839 Arguments have the same meaning as for the previous function. */
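/* e.g. clearing a known 64 bytes with MODE == SImode emits a "rep; stos"
   of 16 SImode chunks.  */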
21840 static void
21841 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21842 rtx count, enum machine_mode mode,
21843 rtx orig_value)
21844 {
21845 rtx destexp;
21846 rtx countreg;
21847 HOST_WIDE_INT rounded_count;
21848
21849 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21850 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21851 value = force_reg (mode, gen_lowpart (mode, value));
21852 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21853 if (mode != QImode)
21854 {
21855 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21856 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21857 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21858 }
21859 else
21860 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21861 if (orig_value == const0_rtx && CONST_INT_P (count))
21862 {
21863 rounded_count = (INTVAL (count)
21864 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21865 destmem = shallow_copy_rtx (destmem);
21866 set_mem_size (destmem, rounded_count);
21867 }
21868 else if (MEM_SIZE_KNOWN_P (destmem))
21869 clear_mem_size (destmem);
21870 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21871 }
21872
21873 static void
21874 emit_strmov (rtx destmem, rtx srcmem,
21875 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21876 {
21877 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21878 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21879 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21880 }
21881
21882 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
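/* For instance (values purely illustrative): with a constant COUNT of 23
   (binary 10111) and MAX_SIZE of 32 on a 64-bit target, the code below
   emits one 16-byte block (two DImode strmovs at offsets 0 and 8), then a
   4-byte, a 2-byte and a 1-byte move at offsets 16, 20 and 22; bit 3 of
   COUNT is clear, so no 8-byte move is emitted.  */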
21883 static void
21884 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21885 rtx destptr, rtx srcptr, rtx count, int max_size)
21886 {
21887 rtx src, dest;
21888 if (CONST_INT_P (count))
21889 {
21890 HOST_WIDE_INT countval = INTVAL (count);
21891 int offset = 0;
21892
21893 if ((countval & 0x10) && max_size > 16)
21894 {
21895 if (TARGET_64BIT)
21896 {
21897 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21898 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21899 }
21900 else
21901 gcc_unreachable ();
21902 offset += 16;
21903 }
21904 if ((countval & 0x08) && max_size > 8)
21905 {
21906 if (TARGET_64BIT)
21907 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21908 else
21909 {
21910 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21911 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21912 }
21913 offset += 8;
21914 }
21915 if ((countval & 0x04) && max_size > 4)
21916 {
21917 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21918 offset += 4;
21919 }
21920 if ((countval & 0x02) && max_size > 2)
21921 {
21922 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21923 offset += 2;
21924 }
21925 if ((countval & 0x01) && max_size > 1)
21926 {
21927 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21928 offset += 1;
21929 }
21930 return;
21931 }
21932 if (max_size > 8)
21933 {
21934 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21935 count, 1, OPTAB_DIRECT);
21936 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21937 count, QImode, 1, 4);
21938 return;
21939 }
21940
21941 /* When single-instruction string ops are available, we can cheaply advance
21942 the dest and src pointers.  Otherwise we save code size by maintaining an
21943 offset register (zero is readily available from the preceding rep
21944 operation) and using x86 addressing modes.  */
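/* Roughly (illustrative shapes only), the optional 4-byte tail emitted
   below looks like "test $4, count; jz 1f; movsd; 1:" when
   TARGET_SINGLE_STRINGOP holds, so both pointers advance implicitly;
   without it, a plain load/store through (%src,%off) / (%dst,%off)
   addressing is used and only the OFFSET register is bumped.  */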
21945 if (TARGET_SINGLE_STRINGOP)
21946 {
21947 if (max_size > 4)
21948 {
21949 rtx label = ix86_expand_aligntest (count, 4, true);
21950 src = change_address (srcmem, SImode, srcptr);
21951 dest = change_address (destmem, SImode, destptr);
21952 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21953 emit_label (label);
21954 LABEL_NUSES (label) = 1;
21955 }
21956 if (max_size > 2)
21957 {
21958 rtx label = ix86_expand_aligntest (count, 2, true);
21959 src = change_address (srcmem, HImode, srcptr);
21960 dest = change_address (destmem, HImode, destptr);
21961 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21962 emit_label (label);
21963 LABEL_NUSES (label) = 1;
21964 }
21965 if (max_size > 1)
21966 {
21967 rtx label = ix86_expand_aligntest (count, 1, true);
21968 src = change_address (srcmem, QImode, srcptr);
21969 dest = change_address (destmem, QImode, destptr);
21970 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21971 emit_label (label);
21972 LABEL_NUSES (label) = 1;
21973 }
21974 }
21975 else
21976 {
21977 rtx offset = force_reg (Pmode, const0_rtx);
21978 rtx tmp;
21979
21980 if (max_size > 4)
21981 {
21982 rtx label = ix86_expand_aligntest (count, 4, true);
21983 src = change_address (srcmem, SImode, srcptr);
21984 dest = change_address (destmem, SImode, destptr);
21985 emit_move_insn (dest, src);
21986 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21987 true, OPTAB_LIB_WIDEN);
21988 if (tmp != offset)
21989 emit_move_insn (offset, tmp);
21990 emit_label (label);
21991 LABEL_NUSES (label) = 1;
21992 }
21993 if (max_size > 2)
21994 {
21995 rtx label = ix86_expand_aligntest (count, 2, true);
21996 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21997 src = change_address (srcmem, HImode, tmp);
21998 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21999 dest = change_address (destmem, HImode, tmp);
22000 emit_move_insn (dest, src);
22001 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22002 true, OPTAB_LIB_WIDEN);
22003 if (tmp != offset)
22004 emit_move_insn (offset, tmp);
22005 emit_label (label);
22006 LABEL_NUSES (label) = 1;
22007 }
22008 if (max_size > 1)
22009 {
22010 rtx label = ix86_expand_aligntest (count, 1, true);
22011 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22012 src = change_address (srcmem, QImode, tmp);
22013 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22014 dest = change_address (destmem, QImode, tmp);
22015 emit_move_insn (dest, src);
22016 emit_label (label);
22017 LABEL_NUSES (label) = 1;
22018 }
22019 }
22020 }
22021
22022 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
22023 static void
22024 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22025 rtx count, int max_size)
22026 {
22027 count =
22028 expand_simple_binop (counter_mode (count), AND, count,
22029 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22030 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22031 gen_lowpart (QImode, value), count, QImode,
22032 1, max_size / 2);
22033 }
22034
22035 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
22036 static void
22037 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22038 {
22039 rtx dest;
22040
22041 if (CONST_INT_P (count))
22042 {
22043 HOST_WIDE_INT countval = INTVAL (count);
22044 int offset = 0;
22045
22046 if ((countval & 0x10) && max_size > 16)
22047 {
22048 if (TARGET_64BIT)
22049 {
22050 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22051 emit_insn (gen_strset (destptr, dest, value));
22052 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22053 emit_insn (gen_strset (destptr, dest, value));
22054 }
22055 else
22056 gcc_unreachable ();
22057 offset += 16;
22058 }
22059 if ((countval & 0x08) && max_size > 8)
22060 {
22061 if (TARGET_64BIT)
22062 {
22063 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22064 emit_insn (gen_strset (destptr, dest, value));
22065 }
22066 else
22067 {
22068 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22069 emit_insn (gen_strset (destptr, dest, value));
22070 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22071 emit_insn (gen_strset (destptr, dest, value));
22072 }
22073 offset += 8;
22074 }
22075 if ((countval & 0x04) && max_size > 4)
22076 {
22077 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22078 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22079 offset += 4;
22080 }
22081 if ((countval & 0x02) && max_size > 2)
22082 {
22083 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22084 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22085 offset += 2;
22086 }
22087 if ((countval & 0x01) && max_size > 1)
22088 {
22089 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22090 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22091 offset += 1;
22092 }
22093 return;
22094 }
22095 if (max_size > 32)
22096 {
22097 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22098 return;
22099 }
22100 if (max_size > 16)
22101 {
22102 rtx label = ix86_expand_aligntest (count, 16, true);
22103 if (TARGET_64BIT)
22104 {
22105 dest = change_address (destmem, DImode, destptr);
22106 emit_insn (gen_strset (destptr, dest, value));
22107 emit_insn (gen_strset (destptr, dest, value));
22108 }
22109 else
22110 {
22111 dest = change_address (destmem, SImode, destptr);
22112 emit_insn (gen_strset (destptr, dest, value));
22113 emit_insn (gen_strset (destptr, dest, value));
22114 emit_insn (gen_strset (destptr, dest, value));
22115 emit_insn (gen_strset (destptr, dest, value));
22116 }
22117 emit_label (label);
22118 LABEL_NUSES (label) = 1;
22119 }
22120 if (max_size > 8)
22121 {
22122 rtx label = ix86_expand_aligntest (count, 8, true);
22123 if (TARGET_64BIT)
22124 {
22125 dest = change_address (destmem, DImode, destptr);
22126 emit_insn (gen_strset (destptr, dest, value));
22127 }
22128 else
22129 {
22130 dest = change_address (destmem, SImode, destptr);
22131 emit_insn (gen_strset (destptr, dest, value));
22132 emit_insn (gen_strset (destptr, dest, value));
22133 }
22134 emit_label (label);
22135 LABEL_NUSES (label) = 1;
22136 }
22137 if (max_size > 4)
22138 {
22139 rtx label = ix86_expand_aligntest (count, 4, true);
22140 dest = change_address (destmem, SImode, destptr);
22141 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22142 emit_label (label);
22143 LABEL_NUSES (label) = 1;
22144 }
22145 if (max_size > 2)
22146 {
22147 rtx label = ix86_expand_aligntest (count, 2, true);
22148 dest = change_address (destmem, HImode, destptr);
22149 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22150 emit_label (label);
22151 LABEL_NUSES (label) = 1;
22152 }
22153 if (max_size > 1)
22154 {
22155 rtx label = ix86_expand_aligntest (count, 1, true);
22156 dest = change_address (destmem, QImode, destptr);
22157 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22158 emit_label (label);
22159 LABEL_NUSES (label) = 1;
22160 }
22161 }
22162
22163 /* Copy enough from SRC to DEST to align DEST, which is known to be aligned
22164 by ALIGN, to DESIRED_ALIGNMENT.  */
22165 static void
22166 expand_movmem_prologue (rtx destmem, rtx srcmem,
22167 rtx destptr, rtx srcptr, rtx count,
22168 int align, int desired_alignment)
22169 {
22170 if (align <= 1 && desired_alignment > 1)
22171 {
22172 rtx label = ix86_expand_aligntest (destptr, 1, false);
22173 srcmem = change_address (srcmem, QImode, srcptr);
22174 destmem = change_address (destmem, QImode, destptr);
22175 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22176 ix86_adjust_counter (count, 1);
22177 emit_label (label);
22178 LABEL_NUSES (label) = 1;
22179 }
22180 if (align <= 2 && desired_alignment > 2)
22181 {
22182 rtx label = ix86_expand_aligntest (destptr, 2, false);
22183 srcmem = change_address (srcmem, HImode, srcptr);
22184 destmem = change_address (destmem, HImode, destptr);
22185 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22186 ix86_adjust_counter (count, 2);
22187 emit_label (label);
22188 LABEL_NUSES (label) = 1;
22189 }
22190 if (align <= 4 && desired_alignment > 4)
22191 {
22192 rtx label = ix86_expand_aligntest (destptr, 4, false);
22193 srcmem = change_address (srcmem, SImode, srcptr);
22194 destmem = change_address (destmem, SImode, destptr);
22195 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22196 ix86_adjust_counter (count, 4);
22197 emit_label (label);
22198 LABEL_NUSES (label) = 1;
22199 }
22200 gcc_assert (desired_alignment <= 8);
22201 }
22202
22203 /* Copy enough from *SRCP to DST to align DST to DESIRED_ALIGN.
22204 ALIGN_BYTES is how many bytes need to be copied.  */
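/* Illustration with hypothetical values: DESIRED_ALIGN == 4 and
   ALIGN_BYTES == 3 emit one QImode and one HImode strmov (offsets 0 and 1),
   after which DST and *SRCP are rewritten as BLKmode references at offset 3
   with their alignment and size information updated to account for the
   bytes already copied.  */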
22205 static rtx
22206 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22207 int desired_align, int align_bytes)
22208 {
22209 rtx src = *srcp;
22210 rtx orig_dst = dst;
22211 rtx orig_src = src;
22212 int off = 0;
22213 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22214 if (src_align_bytes >= 0)
22215 src_align_bytes = desired_align - src_align_bytes;
22216 if (align_bytes & 1)
22217 {
22218 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22219 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22220 off = 1;
22221 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22222 }
22223 if (align_bytes & 2)
22224 {
22225 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22226 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22227 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22228 set_mem_align (dst, 2 * BITS_PER_UNIT);
22229 if (src_align_bytes >= 0
22230 && (src_align_bytes & 1) == (align_bytes & 1)
22231 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22232 set_mem_align (src, 2 * BITS_PER_UNIT);
22233 off = 2;
22234 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22235 }
22236 if (align_bytes & 4)
22237 {
22238 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22239 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22240 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22241 set_mem_align (dst, 4 * BITS_PER_UNIT);
22242 if (src_align_bytes >= 0)
22243 {
22244 unsigned int src_align = 0;
22245 if ((src_align_bytes & 3) == (align_bytes & 3))
22246 src_align = 4;
22247 else if ((src_align_bytes & 1) == (align_bytes & 1))
22248 src_align = 2;
22249 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22250 set_mem_align (src, src_align * BITS_PER_UNIT);
22251 }
22252 off = 4;
22253 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22254 }
22255 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22256 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22257 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22258 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22259 if (src_align_bytes >= 0)
22260 {
22261 unsigned int src_align = 0;
22262 if ((src_align_bytes & 7) == (align_bytes & 7))
22263 src_align = 8;
22264 else if ((src_align_bytes & 3) == (align_bytes & 3))
22265 src_align = 4;
22266 else if ((src_align_bytes & 1) == (align_bytes & 1))
22267 src_align = 2;
22268 if (src_align > (unsigned int) desired_align)
22269 src_align = desired_align;
22270 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22271 set_mem_align (src, src_align * BITS_PER_UNIT);
22272 }
22273 if (MEM_SIZE_KNOWN_P (orig_dst))
22274 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22275 if (MEM_SIZE_KNOWN_P (orig_src))
22276 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22277 *srcp = src;
22278 return dst;
22279 }
22280
22281 /* Store enough of VALUE into DEST to align DEST, which is known to be
22282 aligned by ALIGN, to DESIRED_ALIGNMENT.  */
22283 static void
22284 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22285 int align, int desired_alignment)
22286 {
22287 if (align <= 1 && desired_alignment > 1)
22288 {
22289 rtx label = ix86_expand_aligntest (destptr, 1, false);
22290 destmem = change_address (destmem, QImode, destptr);
22291 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22292 ix86_adjust_counter (count, 1);
22293 emit_label (label);
22294 LABEL_NUSES (label) = 1;
22295 }
22296 if (align <= 2 && desired_alignment > 2)
22297 {
22298 rtx label = ix86_expand_aligntest (destptr, 2, false);
22299 destmem = change_address (destmem, HImode, destptr);
22300 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22301 ix86_adjust_counter (count, 2);
22302 emit_label (label);
22303 LABEL_NUSES (label) = 1;
22304 }
22305 if (align <= 4 && desired_alignment > 4)
22306 {
22307 rtx label = ix86_expand_aligntest (destptr, 4, false);
22308 destmem = change_address (destmem, SImode, destptr);
22309 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22310 ix86_adjust_counter (count, 4);
22311 emit_label (label);
22312 LABEL_NUSES (label) = 1;
22313 }
22314 gcc_assert (desired_alignment <= 8);
22315 }
22316
22317 /* Store enough of VALUE into DST to align DST to DESIRED_ALIGN.
22318 ALIGN_BYTES is how many bytes need to be stored.  */
22319 static rtx
22320 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22321 int desired_align, int align_bytes)
22322 {
22323 int off = 0;
22324 rtx orig_dst = dst;
22325 if (align_bytes & 1)
22326 {
22327 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22328 off = 1;
22329 emit_insn (gen_strset (destreg, dst,
22330 gen_lowpart (QImode, value)));
22331 }
22332 if (align_bytes & 2)
22333 {
22334 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22335 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22336 set_mem_align (dst, 2 * BITS_PER_UNIT);
22337 off = 2;
22338 emit_insn (gen_strset (destreg, dst,
22339 gen_lowpart (HImode, value)));
22340 }
22341 if (align_bytes & 4)
22342 {
22343 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22344 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22345 set_mem_align (dst, 4 * BITS_PER_UNIT);
22346 off = 4;
22347 emit_insn (gen_strset (destreg, dst,
22348 gen_lowpart (SImode, value)));
22349 }
22350 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22351 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22352 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22353 if (MEM_SIZE_KNOWN_P (orig_dst))
22354 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22355 return dst;
22356 }
22357
22358 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22359 static enum stringop_alg
22360 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22361 int *dynamic_check)
22362 {
22363 const struct stringop_algs * algs;
22364 bool optimize_for_speed;
22365 /* Algorithms using the rep prefix want at least edi and ecx;
22366 additionally, memset wants eax and memcpy wants esi. Don't
22367 consider such algorithms if the user has appropriated those
22368 registers for their own purposes. */
22369 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22370 || (memset
22371 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22372
22373 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22374 || (alg != rep_prefix_1_byte \
22375 && alg != rep_prefix_4_byte \
22376 && alg != rep_prefix_8_byte))
22377 const struct processor_costs *cost;
22378
22379 /* Even if the string operation call is cold, we still might spend a lot
22380 of time processing large blocks. */
22381 if (optimize_function_for_size_p (cfun)
22382 || (optimize_insn_for_size_p ()
22383 && expected_size != -1 && expected_size < 256))
22384 optimize_for_speed = false;
22385 else
22386 optimize_for_speed = true;
22387
22388 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22389
22390 *dynamic_check = -1;
22391 if (memset)
22392 algs = &cost->memset[TARGET_64BIT != 0];
22393 else
22394 algs = &cost->memcpy[TARGET_64BIT != 0];
22395 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22396 return ix86_stringop_alg;
22397 /* rep; movq or rep; movl is the smallest variant. */
22398 else if (!optimize_for_speed)
22399 {
22400 if (!count || (count & 3))
22401 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22402 else
22403 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22404 }
22405 /* Very tiny blocks are best handled via the loop; REP is expensive
22406 to set up.  */
22407 else if (expected_size != -1 && expected_size < 4)
22408 return loop_1_byte;
22409 else if (expected_size != -1)
22410 {
22411 unsigned int i;
22412 enum stringop_alg alg = libcall;
22413 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22414 {
22415 /* We get here if the algorithms that were not libcall-based
22416 were rep-prefix based and we are unable to use rep prefixes
22417 based on global register usage. Break out of the loop and
22418 use the heuristic below. */
22419 if (algs->size[i].max == 0)
22420 break;
22421 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22422 {
22423 enum stringop_alg candidate = algs->size[i].alg;
22424
22425 if (candidate != libcall && ALG_USABLE_P (candidate))
22426 alg = candidate;
22427 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22428 last non-libcall inline algorithm. */
22429 if (TARGET_INLINE_ALL_STRINGOPS)
22430 {
22431 /* When the current size is best copied by a libcall, but we are
22432 still forced to inline, run the heuristic below that will pick
22433 code for medium-sized blocks.  */
22434 if (alg != libcall)
22435 return alg;
22436 break;
22437 }
22438 else if (ALG_USABLE_P (candidate))
22439 return candidate;
22440 }
22441 }
22442 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22443 }
22444 /* When asked to inline the call anyway, try to pick a meaningful choice.
22445 We look for the maximal block size that is faster to copy by hand and
22446 take blocks of at most that size, guessing that the average size will
22447 be roughly half of the block.
22448
22449 If this turns out to be bad, we might simply specify the preferred
22450 choice in ix86_costs. */
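/* Worked example (table contents are hypothetical): if the cost table
   holds {{256, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}} and
   unknown_size is libcall, the scan below finds max == 8192, we recurse
   with an expected size of 4096 to pick an inline algorithm, and with
   -minline-stringops-dynamically *DYNAMIC_CHECK becomes 8192, so a
   runtime size test chooses between that algorithm and the libcall.  */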
22451 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22452 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22453 {
22454 int max = -1;
22455 enum stringop_alg alg;
22456 int i;
22457 bool any_alg_usable_p = true;
22458
22459 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22460 {
22461 enum stringop_alg candidate = algs->size[i].alg;
22462 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22463
22464 if (candidate != libcall && candidate
22465 && ALG_USABLE_P (candidate))
22466 max = algs->size[i].max;
22467 }
22468 /* If there aren't any usable algorithms, then recursing on
22469 smaller sizes isn't going to find anything. Just return the
22470 simple byte-at-a-time copy loop. */
22471 if (!any_alg_usable_p)
22472 {
22473 /* Pick something reasonable. */
22474 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22475 *dynamic_check = 128;
22476 return loop_1_byte;
22477 }
22478 if (max == -1)
22479 max = 4096;
22480 alg = decide_alg (count, max / 2, memset, dynamic_check);
22481 gcc_assert (*dynamic_check == -1);
22482 gcc_assert (alg != libcall);
22483 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22484 *dynamic_check = max;
22485 return alg;
22486 }
22487 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22488 #undef ALG_USABLE_P
22489 }
22490
22491 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22492 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22493 static int
22494 decide_alignment (int align,
22495 enum stringop_alg alg,
22496 int expected_size)
22497 {
22498 int desired_align = 0;
22499 switch (alg)
22500 {
22501 case no_stringop:
22502 gcc_unreachable ();
22503 case loop:
22504 case unrolled_loop:
22505 desired_align = GET_MODE_SIZE (Pmode);
22506 break;
22507 case rep_prefix_8_byte:
22508 desired_align = 8;
22509 break;
22510 case rep_prefix_4_byte:
22511 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22512 copying a whole cache line at once.  */
22513 if (TARGET_PENTIUMPRO)
22514 desired_align = 8;
22515 else
22516 desired_align = 4;
22517 break;
22518 case rep_prefix_1_byte:
22519 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22520 copying a whole cache line at once.  */
22521 if (TARGET_PENTIUMPRO)
22522 desired_align = 8;
22523 else
22524 desired_align = 1;
22525 break;
22526 case loop_1_byte:
22527 desired_align = 1;
22528 break;
22529 case libcall:
22530 return 0;
22531 }
22532
22533 if (optimize_size)
22534 desired_align = 1;
22535 if (desired_align < align)
22536 desired_align = align;
22537 if (expected_size != -1 && expected_size < 4)
22538 desired_align = align;
22539 return desired_align;
22540 }
22541
22542 /* Return the smallest power of 2 greater than VAL. */
22543 static int
22544 smallest_pow2_greater_than (int val)
22545 {
22546 int ret = 1;
22547 while (ret <= val)
22548 ret <<= 1;
22549 return ret;
22550 }
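/* Note that the result is strictly greater than VAL, e.g.
   smallest_pow2_greater_than (4) == 8 and smallest_pow2_greater_than (7) == 8;
   the callers below rely on this so that the count can be masked with
   (result - 1) in the epilogue.  */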
22551
22552 /* Expand string move (memcpy) operation. Use i386 string operations
22553 when profitable. expand_setmem contains similar code. The code
22554 depends upon architecture, block size and alignment, but always has
22555 the same overall structure:
22556
22557 1) Prologue guard: Conditional that jumps up to epilogues for small
22558 blocks that can be handled by the epilogue alone.  This is faster,
22559 but it is also needed for correctness, since the prologue assumes the
22560 block is larger than the desired alignment.
22561
22562 An optional dynamic size check and a libcall for large blocks are
22563 also emitted here with -minline-stringops-dynamically.
22564
22565 2) Prologue: copy first few bytes in order to get destination
22566 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22567 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22568 copied. We emit either a jump tree on power of two sized
22569 blocks, or a byte loop.
22570
22571 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22572 with specified algorithm.
22573
22574 4) Epilogue: code copying tail of the block that is too small to be
22575 handled by main body (or up to size guarded by prologue guard). */
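/* As a purely illustrative sketch, a copy of unknown size using the
   unrolled_loop algorithm on a 64-bit target (SIZE_NEEDED == 32,
   DESIRED_ALIGN == 8, runtime ALIGN == 1) expands roughly to:

     1) if (count < 32) goto epilogue;
     2) copy 1, 2 and 4 bytes as needed until dst is 8-byte aligned,
        decreasing count accordingly;
     3) copy 32 bytes per iteration while enough bytes remain;
     4) copy the remaining count & 31 bytes via a jump tree of
        fixed-size moves.  */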
22576
22577 bool
22578 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22579 rtx expected_align_exp, rtx expected_size_exp)
22580 {
22581 rtx destreg;
22582 rtx srcreg;
22583 rtx label = NULL;
22584 rtx tmp;
22585 rtx jump_around_label = NULL;
22586 HOST_WIDE_INT align = 1;
22587 unsigned HOST_WIDE_INT count = 0;
22588 HOST_WIDE_INT expected_size = -1;
22589 int size_needed = 0, epilogue_size_needed;
22590 int desired_align = 0, align_bytes = 0;
22591 enum stringop_alg alg;
22592 int dynamic_check;
22593 bool need_zero_guard = false;
22594
22595 if (CONST_INT_P (align_exp))
22596 align = INTVAL (align_exp);
22597 /* i386 can do misaligned accesses at a moderately increased cost.  */
22598 if (CONST_INT_P (expected_align_exp)
22599 && INTVAL (expected_align_exp) > align)
22600 align = INTVAL (expected_align_exp);
22601 /* ALIGN is the minimum of destination and source alignment, but we care here
22602 just about destination alignment. */
22603 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22604 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22605
22606 if (CONST_INT_P (count_exp))
22607 count = expected_size = INTVAL (count_exp);
22608 if (CONST_INT_P (expected_size_exp) && count == 0)
22609 expected_size = INTVAL (expected_size_exp);
22610
22611 /* Make sure we don't need to care about overflow later on. */
22612 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22613 return false;
22614
22615 /* Step 0: Decide on preferred algorithm, desired alignment and
22616 size of chunks to be copied by main loop. */
22617
22618 alg = decide_alg (count, expected_size, false, &dynamic_check);
22619 desired_align = decide_alignment (align, alg, expected_size);
22620
22621 if (!TARGET_ALIGN_STRINGOPS)
22622 align = desired_align;
22623
22624 if (alg == libcall)
22625 return false;
22626 gcc_assert (alg != no_stringop);
22627 if (!count)
22628 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22629 destreg = copy_addr_to_reg (XEXP (dst, 0));
22630 srcreg = copy_addr_to_reg (XEXP (src, 0));
22631 switch (alg)
22632 {
22633 case libcall:
22634 case no_stringop:
22635 gcc_unreachable ();
22636 case loop:
22637 need_zero_guard = true;
22638 size_needed = GET_MODE_SIZE (word_mode);
22639 break;
22640 case unrolled_loop:
22641 need_zero_guard = true;
22642 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22643 break;
22644 case rep_prefix_8_byte:
22645 size_needed = 8;
22646 break;
22647 case rep_prefix_4_byte:
22648 size_needed = 4;
22649 break;
22650 case rep_prefix_1_byte:
22651 size_needed = 1;
22652 break;
22653 case loop_1_byte:
22654 need_zero_guard = true;
22655 size_needed = 1;
22656 break;
22657 }
22658
22659 epilogue_size_needed = size_needed;
22660
22661 /* Step 1: Prologue guard. */
22662
22663 /* Alignment code needs count to be in register. */
22664 if (CONST_INT_P (count_exp) && desired_align > align)
22665 {
22666 if (INTVAL (count_exp) > desired_align
22667 && INTVAL (count_exp) > size_needed)
22668 {
22669 align_bytes
22670 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22671 if (align_bytes <= 0)
22672 align_bytes = 0;
22673 else
22674 align_bytes = desired_align - align_bytes;
22675 }
22676 if (align_bytes == 0)
22677 count_exp = force_reg (counter_mode (count_exp), count_exp);
22678 }
22679 gcc_assert (desired_align >= 1 && align >= 1);
22680
22681 /* Ensure that alignment prologue won't copy past end of block. */
22682 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22683 {
22684 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22685 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22686 Make sure it is a power of 2.  */
22687 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22688
22689 if (count)
22690 {
22691 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22692 {
22693 /* If the main algorithm works on QImode, no epilogue is needed.
22694 For small sizes, just don't align anything.  */
22695 if (size_needed == 1)
22696 desired_align = align;
22697 else
22698 goto epilogue;
22699 }
22700 }
22701 else
22702 {
22703 label = gen_label_rtx ();
22704 emit_cmp_and_jump_insns (count_exp,
22705 GEN_INT (epilogue_size_needed),
22706 LTU, 0, counter_mode (count_exp), 1, label);
22707 if (expected_size == -1 || expected_size < epilogue_size_needed)
22708 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22709 else
22710 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22711 }
22712 }
22713
22714 /* Emit code to decide at runtime whether a library call or inline code
22715 should be used.  */
22716 if (dynamic_check != -1)
22717 {
22718 if (CONST_INT_P (count_exp))
22719 {
22720 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22721 {
22722 emit_block_move_via_libcall (dst, src, count_exp, false);
22723 count_exp = const0_rtx;
22724 goto epilogue;
22725 }
22726 }
22727 else
22728 {
22729 rtx hot_label = gen_label_rtx ();
22730 jump_around_label = gen_label_rtx ();
22731 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22732 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22733 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22734 emit_block_move_via_libcall (dst, src, count_exp, false);
22735 emit_jump (jump_around_label);
22736 emit_label (hot_label);
22737 }
22738 }
22739
22740 /* Step 2: Alignment prologue. */
22741
22742 if (desired_align > align)
22743 {
22744 if (align_bytes == 0)
22745 {
22746 /* Except for the first move in the epilogue, we no longer know
22747 the constant offset in the aliasing info.  It does not seem worth
22748 the pain to maintain it for the first move, so throw away
22749 the info early.  */
22750 src = change_address (src, BLKmode, srcreg);
22751 dst = change_address (dst, BLKmode, destreg);
22752 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22753 desired_align);
22754 }
22755 else
22756 {
22757 /* If we know how many bytes need to be stored before dst is
22758 sufficiently aligned, maintain aliasing info accurately. */
22759 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22760 desired_align, align_bytes);
22761 count_exp = plus_constant (counter_mode (count_exp),
22762 count_exp, -align_bytes);
22763 count -= align_bytes;
22764 }
22765 if (need_zero_guard
22766 && (count < (unsigned HOST_WIDE_INT) size_needed
22767 || (align_bytes == 0
22768 && count < ((unsigned HOST_WIDE_INT) size_needed
22769 + desired_align - align))))
22770 {
22771 /* It is possible that we copied enough so the main loop will not
22772 execute. */
22773 gcc_assert (size_needed > 1);
22774 if (label == NULL_RTX)
22775 label = gen_label_rtx ();
22776 emit_cmp_and_jump_insns (count_exp,
22777 GEN_INT (size_needed),
22778 LTU, 0, counter_mode (count_exp), 1, label);
22779 if (expected_size == -1
22780 || expected_size < (desired_align - align) / 2 + size_needed)
22781 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22782 else
22783 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22784 }
22785 }
22786 if (label && size_needed == 1)
22787 {
22788 emit_label (label);
22789 LABEL_NUSES (label) = 1;
22790 label = NULL;
22791 epilogue_size_needed = 1;
22792 }
22793 else if (label == NULL_RTX)
22794 epilogue_size_needed = size_needed;
22795
22796 /* Step 3: Main loop. */
22797
22798 switch (alg)
22799 {
22800 case libcall:
22801 case no_stringop:
22802 gcc_unreachable ();
22803 case loop_1_byte:
22804 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22805 count_exp, QImode, 1, expected_size);
22806 break;
22807 case loop:
22808 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22809 count_exp, word_mode, 1, expected_size);
22810 break;
22811 case unrolled_loop:
22812 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22813 registers for 4 temporaries anyway.  */
22814 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22815 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22816 expected_size);
22817 break;
22818 case rep_prefix_8_byte:
22819 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22820 DImode);
22821 break;
22822 case rep_prefix_4_byte:
22823 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22824 SImode);
22825 break;
22826 case rep_prefix_1_byte:
22827 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22828 QImode);
22829 break;
22830 }
22831 /* Properly adjust the offsets of the src and dest memory for aliasing.  */
22832 if (CONST_INT_P (count_exp))
22833 {
22834 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22835 (count / size_needed) * size_needed);
22836 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22837 (count / size_needed) * size_needed);
22838 }
22839 else
22840 {
22841 src = change_address (src, BLKmode, srcreg);
22842 dst = change_address (dst, BLKmode, destreg);
22843 }
22844
22845 /* Step 4: Epilogue to copy the remaining bytes. */
22846 epilogue:
22847 if (label)
22848 {
22849 /* When the main loop is done, COUNT_EXP might hold the original count,
22850 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22851 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22852 bytes.  Compensate if needed.  */
22853
22854 if (size_needed < epilogue_size_needed)
22855 {
22856 tmp =
22857 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22858 GEN_INT (size_needed - 1), count_exp, 1,
22859 OPTAB_DIRECT);
22860 if (tmp != count_exp)
22861 emit_move_insn (count_exp, tmp);
22862 }
22863 emit_label (label);
22864 LABEL_NUSES (label) = 1;
22865 }
22866
22867 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22868 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22869 epilogue_size_needed);
22870 if (jump_around_label)
22871 emit_label (jump_around_label);
22872 return true;
22873 }
22874
22875 /* Helper function for memset.  For a QImode value 0xXY produce
22876 0xXYXYXYXY of the width specified by MODE.  This is essentially
22877 a multiplication by 0x01010101 (widened to MODE), but we can do
22878 slightly better than synth_mult by unwinding the sequence by hand
22879 on CPUs with slow multiply.  */
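/* A minimal sketch of the constant path below (illustrative only), for
   VAL == 0x2a and MODE == SImode:

       v  = 0x2a;
       v |= v << 8;        v is now 0x00002a2a
       v |= v << 16;       v is now 0x2a2a2a2a

   The non-constant path instead multiplies by a promoted 0x01010101, or
   uses the shift/IOR sequence when that is cheaper on the target.  */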
22880 static rtx
22881 promote_duplicated_reg (enum machine_mode mode, rtx val)
22882 {
22883 enum machine_mode valmode = GET_MODE (val);
22884 rtx tmp;
22885 int nops = mode == DImode ? 3 : 2;
22886
22887 gcc_assert (mode == SImode || mode == DImode);
22888 if (val == const0_rtx)
22889 return copy_to_mode_reg (mode, const0_rtx);
22890 if (CONST_INT_P (val))
22891 {
22892 HOST_WIDE_INT v = INTVAL (val) & 255;
22893
22894 v |= v << 8;
22895 v |= v << 16;
22896 if (mode == DImode)
22897 v |= (v << 16) << 16;
22898 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22899 }
22900
22901 if (valmode == VOIDmode)
22902 valmode = QImode;
22903 if (valmode != QImode)
22904 val = gen_lowpart (QImode, val);
22905 if (mode == QImode)
22906 return val;
22907 if (!TARGET_PARTIAL_REG_STALL)
22908 nops--;
22909 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22910 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22911 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22912 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22913 {
22914 rtx reg = convert_modes (mode, QImode, val, true);
22915 tmp = promote_duplicated_reg (mode, const1_rtx);
22916 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22917 OPTAB_DIRECT);
22918 }
22919 else
22920 {
22921 rtx reg = convert_modes (mode, QImode, val, true);
22922
22923 if (!TARGET_PARTIAL_REG_STALL)
22924 if (mode == SImode)
22925 emit_insn (gen_movsi_insv_1 (reg, reg));
22926 else
22927 emit_insn (gen_movdi_insv_1 (reg, reg));
22928 else
22929 {
22930 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22931 NULL, 1, OPTAB_DIRECT);
22932 reg =
22933 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22934 }
22935 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22936 NULL, 1, OPTAB_DIRECT);
22937 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22938 if (mode == SImode)
22939 return reg;
22940 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22941 NULL, 1, OPTAB_DIRECT);
22942 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22943 return reg;
22944 }
22945 }
22946
22947 /* Duplicate the value VAL, using promote_duplicated_reg, into the maximal
22948 size that will be needed by the main loop copying SIZE_NEEDED chunks and
22949 by the prologue getting the alignment from ALIGN to DESIRED_ALIGN.  */
22950 static rtx
22951 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22952 {
22953 rtx promoted_val;
22954
22955 if (TARGET_64BIT
22956 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22957 promoted_val = promote_duplicated_reg (DImode, val);
22958 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22959 promoted_val = promote_duplicated_reg (SImode, val);
22960 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22961 promoted_val = promote_duplicated_reg (HImode, val);
22962 else
22963 promoted_val = val;
22964
22965 return promoted_val;
22966 }
22967
22968 /* Expand string clear operation (bzero).  Use i386 string operations when
22969 profitable.  See the ix86_expand_movmem comment for an explanation of the
22970 individual steps performed.  */
22971 bool
22972 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22973 rtx expected_align_exp, rtx expected_size_exp)
22974 {
22975 rtx destreg;
22976 rtx label = NULL;
22977 rtx tmp;
22978 rtx jump_around_label = NULL;
22979 HOST_WIDE_INT align = 1;
22980 unsigned HOST_WIDE_INT count = 0;
22981 HOST_WIDE_INT expected_size = -1;
22982 int size_needed = 0, epilogue_size_needed;
22983 int desired_align = 0, align_bytes = 0;
22984 enum stringop_alg alg;
22985 rtx promoted_val = NULL;
22986 bool force_loopy_epilogue = false;
22987 int dynamic_check;
22988 bool need_zero_guard = false;
22989
22990 if (CONST_INT_P (align_exp))
22991 align = INTVAL (align_exp);
22992 /* i386 can do misaligned accesses at a moderately increased cost.  */
22993 if (CONST_INT_P (expected_align_exp)
22994 && INTVAL (expected_align_exp) > align)
22995 align = INTVAL (expected_align_exp);
22996 if (CONST_INT_P (count_exp))
22997 count = expected_size = INTVAL (count_exp);
22998 if (CONST_INT_P (expected_size_exp) && count == 0)
22999 expected_size = INTVAL (expected_size_exp);
23000
23001 /* Make sure we don't need to care about overflow later on. */
23002 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23003 return false;
23004
23005 /* Step 0: Decide on preferred algorithm, desired alignment and
23006 size of chunks to be copied by main loop. */
23007
23008 alg = decide_alg (count, expected_size, true, &dynamic_check);
23009 desired_align = decide_alignment (align, alg, expected_size);
23010
23011 if (!TARGET_ALIGN_STRINGOPS)
23012 align = desired_align;
23013
23014 if (alg == libcall)
23015 return false;
23016 gcc_assert (alg != no_stringop);
23017 if (!count)
23018 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23019 destreg = copy_addr_to_reg (XEXP (dst, 0));
23020 switch (alg)
23021 {
23022 case libcall:
23023 case no_stringop:
23024 gcc_unreachable ();
23025 case loop:
23026 need_zero_guard = true;
23027 size_needed = GET_MODE_SIZE (word_mode);
23028 break;
23029 case unrolled_loop:
23030 need_zero_guard = true;
23031 size_needed = GET_MODE_SIZE (word_mode) * 4;
23032 break;
23033 case rep_prefix_8_byte:
23034 size_needed = 8;
23035 break;
23036 case rep_prefix_4_byte:
23037 size_needed = 4;
23038 break;
23039 case rep_prefix_1_byte:
23040 size_needed = 1;
23041 break;
23042 case loop_1_byte:
23043 need_zero_guard = true;
23044 size_needed = 1;
23045 break;
23046 }
23047 epilogue_size_needed = size_needed;
23048
23049 /* Step 1: Prologue guard. */
23050
23051 /* Alignment code needs count to be in register. */
23052 if (CONST_INT_P (count_exp) && desired_align > align)
23053 {
23054 if (INTVAL (count_exp) > desired_align
23055 && INTVAL (count_exp) > size_needed)
23056 {
23057 align_bytes
23058 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23059 if (align_bytes <= 0)
23060 align_bytes = 0;
23061 else
23062 align_bytes = desired_align - align_bytes;
23063 }
23064 if (align_bytes == 0)
23065 {
23066 enum machine_mode mode = SImode;
23067 if (TARGET_64BIT && (count & ~0xffffffff))
23068 mode = DImode;
23069 count_exp = force_reg (mode, count_exp);
23070 }
23071 }
23072 /* Do the cheap promotion to allow better CSE across the
23073 main loop and epilogue (i.e. one load of the big constant in
23074 front of all the code).  */
23075 if (CONST_INT_P (val_exp))
23076 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23077 desired_align, align);
23078 /* Ensure that alignment prologue won't copy past end of block. */
23079 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23080 {
23081 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23082 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23083 Make sure it is a power of 2.  */
23084 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23085
23086 /* To improve the performance of small blocks, we jump around the VAL
23087 promoting code.  This means that if the promoted VAL is not a constant,
23088 we might not use it in the epilogue and have to fall back to the byte
23089 loop variant.  */
23090 if (epilogue_size_needed > 2 && !promoted_val)
23091 force_loopy_epilogue = true;
23092 if (count)
23093 {
23094 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23095 {
23096 /* If the main algorithm works on QImode, no epilogue is needed.
23097 For small sizes, just don't align anything.  */
23098 if (size_needed == 1)
23099 desired_align = align;
23100 else
23101 goto epilogue;
23102 }
23103 }
23104 else
23105 {
23106 label = gen_label_rtx ();
23107 emit_cmp_and_jump_insns (count_exp,
23108 GEN_INT (epilogue_size_needed),
23109 LTU, 0, counter_mode (count_exp), 1, label);
23110 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23111 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23112 else
23113 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23114 }
23115 }
23116 if (dynamic_check != -1)
23117 {
23118 rtx hot_label = gen_label_rtx ();
23119 jump_around_label = gen_label_rtx ();
23120 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23121 LEU, 0, counter_mode (count_exp), 1, hot_label);
23122 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23123 set_storage_via_libcall (dst, count_exp, val_exp, false);
23124 emit_jump (jump_around_label);
23125 emit_label (hot_label);
23126 }
23127
23128 /* Step 2: Alignment prologue. */
23129
23130 /* Do the expensive promotion once we branched off the small blocks. */
23131 if (!promoted_val)
23132 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23133 desired_align, align);
23134 gcc_assert (desired_align >= 1 && align >= 1);
23135
23136 if (desired_align > align)
23137 {
23138 if (align_bytes == 0)
23139 {
23140 /* Except for the first move in the epilogue, we no longer know
23141 the constant offset in the aliasing info.  It does not seem worth
23142 the pain to maintain it for the first move, so throw away
23143 the info early.  */
23144 dst = change_address (dst, BLKmode, destreg);
23145 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23146 desired_align);
23147 }
23148 else
23149 {
23150 /* If we know how many bytes need to be stored before dst is
23151 sufficiently aligned, maintain aliasing info accurately. */
23152 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23153 desired_align, align_bytes);
23154 count_exp = plus_constant (counter_mode (count_exp),
23155 count_exp, -align_bytes);
23156 count -= align_bytes;
23157 }
23158 if (need_zero_guard
23159 && (count < (unsigned HOST_WIDE_INT) size_needed
23160 || (align_bytes == 0
23161 && count < ((unsigned HOST_WIDE_INT) size_needed
23162 + desired_align - align))))
23163 {
23164 /* It is possible that we copied enough so the main loop will not
23165 execute. */
23166 gcc_assert (size_needed > 1);
23167 if (label == NULL_RTX)
23168 label = gen_label_rtx ();
23169 emit_cmp_and_jump_insns (count_exp,
23170 GEN_INT (size_needed),
23171 LTU, 0, counter_mode (count_exp), 1, label);
23172 if (expected_size == -1
23173 || expected_size < (desired_align - align) / 2 + size_needed)
23174 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23175 else
23176 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23177 }
23178 }
23179 if (label && size_needed == 1)
23180 {
23181 emit_label (label);
23182 LABEL_NUSES (label) = 1;
23183 label = NULL;
23184 promoted_val = val_exp;
23185 epilogue_size_needed = 1;
23186 }
23187 else if (label == NULL_RTX)
23188 epilogue_size_needed = size_needed;
23189
23190 /* Step 3: Main loop. */
23191
23192 switch (alg)
23193 {
23194 case libcall:
23195 case no_stringop:
23196 gcc_unreachable ();
23197 case loop_1_byte:
23198 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23199 count_exp, QImode, 1, expected_size);
23200 break;
23201 case loop:
23202 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23203 count_exp, word_mode, 1, expected_size);
23204 break;
23205 case unrolled_loop:
23206 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23207 count_exp, word_mode, 4, expected_size);
23208 break;
23209 case rep_prefix_8_byte:
23210 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23211 DImode, val_exp);
23212 break;
23213 case rep_prefix_4_byte:
23214 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23215 SImode, val_exp);
23216 break;
23217 case rep_prefix_1_byte:
23218 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23219 QImode, val_exp);
23220 break;
23221 }
23222 /* Properly adjust the offset of the dest memory for aliasing.  */
23223 if (CONST_INT_P (count_exp))
23224 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23225 (count / size_needed) * size_needed);
23226 else
23227 dst = change_address (dst, BLKmode, destreg);
23228
23229 /* Step 4: Epilogue to copy the remaining bytes. */
23230
23231 if (label)
23232 {
23233 /* When the main loop is done, COUNT_EXP might hold the original count,
23234 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23235 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23236 bytes.  Compensate if needed.  */
23237
23238 if (size_needed < epilogue_size_needed)
23239 {
23240 tmp =
23241 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23242 GEN_INT (size_needed - 1), count_exp, 1,
23243 OPTAB_DIRECT);
23244 if (tmp != count_exp)
23245 emit_move_insn (count_exp, tmp);
23246 }
23247 emit_label (label);
23248 LABEL_NUSES (label) = 1;
23249 }
23250 epilogue:
23251 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23252 {
23253 if (force_loopy_epilogue)
23254 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23255 epilogue_size_needed);
23256 else
23257 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23258 epilogue_size_needed);
23259 }
23260 if (jump_around_label)
23261 emit_label (jump_around_label);
23262 return true;
23263 }
23264
23265 /* Expand the appropriate insns for doing strlen if not just doing
23266 repnz; scasb
23267
23268 out = result, initialized with the start address
23269 align_rtx = alignment of the address.
23270 scratch = scratch register, initialized with the start address when
23271 not aligned, otherwise undefined
23272
23273 This is just the body. It needs the initializations mentioned above and
23274 some address computing at the end. These things are done in i386.md. */
23275
23276 static void
23277 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23278 {
23279 int align;
23280 rtx tmp;
23281 rtx align_2_label = NULL_RTX;
23282 rtx align_3_label = NULL_RTX;
23283 rtx align_4_label = gen_label_rtx ();
23284 rtx end_0_label = gen_label_rtx ();
23285 rtx mem;
23286 rtx tmpreg = gen_reg_rtx (SImode);
23287 rtx scratch = gen_reg_rtx (SImode);
23288 rtx cmp;
23289
23290 align = 0;
23291 if (CONST_INT_P (align_rtx))
23292 align = INTVAL (align_rtx);
23293
23294 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23295
23296 /* Is there a known alignment and is it less than 4? */
23297 if (align < 4)
23298 {
23299 rtx scratch1 = gen_reg_rtx (Pmode);
23300 emit_move_insn (scratch1, out);
23301 /* Is there a known alignment and is it not 2? */
23302 if (align != 2)
23303 {
23304 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23305 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23306
23307 /* Leave just the 3 lower bits. */
23308 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23309 NULL_RTX, 0, OPTAB_WIDEN);
23310
23311 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23312 Pmode, 1, align_4_label);
23313 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23314 Pmode, 1, align_2_label);
23315 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23316 Pmode, 1, align_3_label);
23317 }
23318 else
23319 {
23320 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23321 check whether the pointer is aligned to 4 bytes.  */
23322
23323 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23324 NULL_RTX, 0, OPTAB_WIDEN);
23325
23326 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23327 Pmode, 1, align_4_label);
23328 }
23329
23330 mem = change_address (src, QImode, out);
23331
23332 /* Now compare the bytes. */
23333
23334 /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
23335 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23336 QImode, 1, end_0_label);
23337
23338 /* Increment the address. */
23339 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23340
23341 /* Not needed with an alignment of 2 */
23342 if (align != 2)
23343 {
23344 emit_label (align_2_label);
23345
23346 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23347 end_0_label);
23348
23349 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23350
23351 emit_label (align_3_label);
23352 }
23353
23354 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23355 end_0_label);
23356
23357 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23358 }
23359
23360 /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
23361 align this loop: it only makes the program larger and does not help to
23362 speed it up.  */
23363 emit_label (align_4_label);
23364
23365 mem = change_address (src, SImode, out);
23366 emit_move_insn (scratch, mem);
23367 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23368
23369 /* This formula yields a nonzero result iff one of the bytes is zero.
23370 This saves three branches inside the loop and many cycles.  */
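/* Concretely, the test computed below is
   ((x - 0x01010101) & ~x & 0x80808080) != 0.  For example, with
   x == 0x41004242 (which contains a zero byte):

       x - 0x01010101   ==  0x3fff4141
       ~x               ==  0xbeffbdbd
       AND of the two   ==  0x3eff0101
       & 0x80808080     ==  0x00800000   nonzero, so a zero byte exists

   whereas for x == 0x41424344 the final AND is 0 and the loop continues.  */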
23371
23372 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23373 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23374 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23375 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23376 gen_int_mode (0x80808080, SImode)));
23377 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23378 align_4_label);
23379
23380 if (TARGET_CMOVE)
23381 {
23382 rtx reg = gen_reg_rtx (SImode);
23383 rtx reg2 = gen_reg_rtx (Pmode);
23384 emit_move_insn (reg, tmpreg);
23385 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23386
23387 /* If zero is not in the first two bytes, move two bytes forward. */
23388 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23389 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23390 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23391 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23392 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23393 reg,
23394 tmpreg)));
23395 /* Emit lea manually to avoid clobbering of flags. */
23396 emit_insn (gen_rtx_SET (SImode, reg2,
23397 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23398
23399 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23400 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23401 emit_insn (gen_rtx_SET (VOIDmode, out,
23402 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23403 reg2,
23404 out)));
23405 }
23406 else
23407 {
23408 rtx end_2_label = gen_label_rtx ();
23409 /* Is zero in the first two bytes? */
23410
23411 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23412 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23413 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23414 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23415 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23416 pc_rtx);
23417 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23418 JUMP_LABEL (tmp) = end_2_label;
23419
23420 /* Not in the first two. Move two bytes forward. */
23421 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23422 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23423
23424 emit_label (end_2_label);
23425
23426 }
23427
23428 /* Avoid a branch when fixing up the final byte position.  */
23429 tmpreg = gen_lowpart (QImode, tmpreg);
23430 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23431 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23432 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23433 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23434
23435 emit_label (end_0_label);
23436 }
23437
23438 /* Expand strlen. */
23439
23440 bool
23441 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23442 {
23443 rtx addr, scratch1, scratch2, scratch3, scratch4;
23444
23445 /* The generic case of the strlen expander is long.  Avoid expanding
23446 it unless TARGET_INLINE_ALL_STRINGOPS.  */
23447
23448 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23449 && !TARGET_INLINE_ALL_STRINGOPS
23450 && !optimize_insn_for_size_p ()
23451 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23452 return false;
23453
23454 addr = force_reg (Pmode, XEXP (src, 0));
23455 scratch1 = gen_reg_rtx (Pmode);
23456
23457 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23458 && !optimize_insn_for_size_p ())
23459 {
23460 /* Well, it seems that some optimizer does not combine a call like
23461 foo(strlen(bar), strlen(bar));
23462 when the move and the subtraction are done here.  It does calculate
23463 the length just once when these instructions are done inside
23464 output_strlen_unroll().  But I think that, since &bar[strlen(bar)] is
23465 often used and I use one fewer register for the lifetime of
23466 output_strlen_unroll(), this is better.  */
23467
23468 emit_move_insn (out, addr);
23469
23470 ix86_expand_strlensi_unroll_1 (out, src, align);
23471
23472 /* strlensi_unroll_1 returns the address of the zero at the end of
23473 the string, like memchr(), so compute the length by subtracting
23474 the start address. */
23475 emit_insn (ix86_gen_sub3 (out, out, addr));
23476 }
23477 else
23478 {
23479 rtx unspec;
23480
23481 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23482 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23483 return false;
23484
23485 scratch2 = gen_reg_rtx (Pmode);
23486 scratch3 = gen_reg_rtx (Pmode);
23487 scratch4 = force_reg (Pmode, constm1_rtx);
23488
23489 emit_move_insn (scratch3, addr);
23490 eoschar = force_reg (QImode, eoschar);
23491
23492 src = replace_equiv_address_nv (src, scratch3);
23493
23494 /* If .md starts supporting :P, this can be done in .md. */
23495 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23496 scratch4), UNSPEC_SCAS);
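/* A sketch of the arithmetic below (explanatory note added here): the count
   register starts at -1 and "repnz scasb" decrements it once per byte
   scanned, including the terminating zero, leaving -(strlen + 2).  Its
   one's complement is strlen + 1, and adding -1 then yields strlen.  */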
23497 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23498 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23499 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23500 }
23501 return true;
23502 }
23503
23504 /* For a given symbol (function), construct code to compute the address of
23505 its PLT entry in the large x86-64 PIC model. */
23506 static rtx
23507 construct_plt_address (rtx symbol)
23508 {
23509 rtx tmp, unspec;
23510
23511 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23512 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23513 gcc_assert (Pmode == DImode);
23514
23515 tmp = gen_reg_rtx (Pmode);
23516 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23517
23518 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23519 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23520 return tmp;
23521 }
23522
23523 rtx
23524 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23525 rtx callarg2,
23526 rtx pop, bool sibcall)
23527 {
23528 /* We need to represent that SI and DI registers are clobbered
23529 by SYSV calls. */
23530 static int clobbered_registers[] = {
23531 XMM6_REG, XMM7_REG, XMM8_REG,
23532 XMM9_REG, XMM10_REG, XMM11_REG,
23533 XMM12_REG, XMM13_REG, XMM14_REG,
23534 XMM15_REG, SI_REG, DI_REG
23535 };
23536 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23537 rtx use = NULL, call;
23538 unsigned int vec_len;
23539
23540 if (pop == const0_rtx)
23541 pop = NULL;
23542 gcc_assert (!TARGET_64BIT || !pop);
23543
23544 if (TARGET_MACHO && !TARGET_64BIT)
23545 {
23546 #if TARGET_MACHO
23547 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23548 fnaddr = machopic_indirect_call_target (fnaddr);
23549 #endif
23550 }
23551 else
23552 {
23553 /* Static functions and indirect calls don't need the pic register. */
23554 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23555 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23556 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23557 use_reg (&use, pic_offset_table_rtx);
23558 }
23559
23560 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23561 {
23562 rtx al = gen_rtx_REG (QImode, AX_REG);
23563 emit_move_insn (al, callarg2);
23564 use_reg (&use, al);
23565 }
23566
23567 if (ix86_cmodel == CM_LARGE_PIC
23568 && MEM_P (fnaddr)
23569 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23570 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23571 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23572 else if (sibcall
23573 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23574 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23575 {
23576 fnaddr = XEXP (fnaddr, 0);
23577 if (GET_MODE (fnaddr) != word_mode)
23578 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23579 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23580 }
23581
23582 vec_len = 0;
23583 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23584 if (retval)
23585 call = gen_rtx_SET (VOIDmode, retval, call);
23586 vec[vec_len++] = call;
23587
23588 if (pop)
23589 {
23590 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23591 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23592 vec[vec_len++] = pop;
23593 }
23594
23595 if (TARGET_64BIT_MS_ABI
23596 && (!callarg2 || INTVAL (callarg2) != -2))
23597 {
23598 unsigned i;
23599
23600 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23601 UNSPEC_MS_TO_SYSV_CALL);
23602
23603 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23604 vec[vec_len++]
23605 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23606 ? TImode : DImode,
23607 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23608 ? TImode : DImode,
23609 clobbered_registers[i]));
23610 }
23611
23612 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23613 if (TARGET_VZEROUPPER)
23614 {
23615 int avx256;
23616 if (cfun->machine->callee_pass_avx256_p)
23617 {
23618 if (cfun->machine->callee_return_avx256_p)
23619 avx256 = callee_return_pass_avx256;
23620 else
23621 avx256 = callee_pass_avx256;
23622 }
23623 else if (cfun->machine->callee_return_avx256_p)
23624 avx256 = callee_return_avx256;
23625 else
23626 avx256 = call_no_avx256;
23627
23628 if (reload_completed)
23629 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23630 else
23631 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23632 gen_rtvec (1, GEN_INT (avx256)),
23633 UNSPEC_CALL_NEEDS_VZEROUPPER);
23634 }
23635
23636 if (vec_len > 1)
23637 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23638 call = emit_call_insn (call);
23639 if (use)
23640 CALL_INSN_FUNCTION_USAGE (call) = use;
23641
23642 return call;
23643 }
23644
23645 void
23646 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23647 {
23648 rtx pat = PATTERN (insn);
23649 rtvec vec = XVEC (pat, 0);
23650 int len = GET_NUM_ELEM (vec) - 1;
23651
23652 /* Strip off the last entry of the parallel. */
23653 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23654 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23655 if (len == 1)
23656 pat = RTVEC_ELT (vec, 0);
23657 else
23658 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23659
23660 emit_insn (gen_avx_vzeroupper (vzeroupper));
23661 emit_call_insn (pat);
23662 }
23663
23664 /* Output the assembly for a call instruction. */
23665
23666 const char *
23667 ix86_output_call_insn (rtx insn, rtx call_op)
23668 {
23669 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23670 bool seh_nop_p = false;
23671 const char *xasm;
23672
23673 if (SIBLING_CALL_P (insn))
23674 {
23675 if (direct_p)
23676 xasm = "jmp\t%P0";
23677 /* SEH epilogue detection requires the indirect branch case
23678 to include REX.W. */
23679 else if (TARGET_SEH)
23680 xasm = "rex.W jmp %A0";
23681 else
23682 xasm = "jmp\t%A0";
23683
23684 output_asm_insn (xasm, &call_op);
23685 return "";
23686 }
23687
23688 /* SEH unwinding can require an extra nop to be emitted in several
23689 circumstances. Determine if we have one of those. */
23690 if (TARGET_SEH)
23691 {
23692 rtx i;
23693
23694 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23695 {
23696 /* If we get to another real insn, we don't need the nop. */
23697 if (INSN_P (i))
23698 break;
23699
23700 /* If we get to the epilogue note, prevent a catch region from
23701 being adjacent to the standard epilogue sequence. If non-
23702 call-exceptions, we'll have done this during epilogue emission. */
23703 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23704 && !flag_non_call_exceptions
23705 && !can_throw_internal (insn))
23706 {
23707 seh_nop_p = true;
23708 break;
23709 }
23710 }
23711
23712 /* If we didn't find a real insn following the call, prevent the
23713 unwinder from looking into the next function. */
23714 if (i == NULL)
23715 seh_nop_p = true;
23716 }
23717
23718 if (direct_p)
23719 xasm = "call\t%P0";
23720 else
23721 xasm = "call\t%A0";
23722
23723 output_asm_insn (xasm, &call_op);
23724
23725 if (seh_nop_p)
23726 return "nop";
23727
23728 return "";
23729 }
23730 \f
23731 /* Clear stack slot assignments remembered from previous functions.
23732 This is called from INIT_EXPANDERS once before RTL is emitted for each
23733 function. */
23734
23735 static struct machine_function *
23736 ix86_init_machine_status (void)
23737 {
23738 struct machine_function *f;
23739
23740 f = ggc_alloc_cleared_machine_function ();
23741 f->use_fast_prologue_epilogue_nregs = -1;
23742 f->tls_descriptor_call_expanded_p = 0;
23743 f->call_abi = ix86_abi;
23744
23745 return f;
23746 }
23747
23748 /* Return a MEM corresponding to a stack slot with mode MODE.
23749 Allocate a new slot if necessary.
23750
23751 The RTL for a function can have several slots available: N is
23752 which slot to use. */
23753
23754 rtx
23755 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23756 {
23757 struct stack_local_entry *s;
23758
23759 gcc_assert (n < MAX_386_STACK_LOCALS);
23760
23761 /* Virtual slot is valid only before vregs are instantiated. */
23762 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23763
23764 for (s = ix86_stack_locals; s; s = s->next)
23765 if (s->mode == mode && s->n == n)
23766 return validize_mem (copy_rtx (s->rtl));
23767
23768 s = ggc_alloc_stack_local_entry ();
23769 s->n = n;
23770 s->mode = mode;
23771 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23772
23773 s->next = ix86_stack_locals;
23774 ix86_stack_locals = s;
23775 return validize_mem (s->rtl);
23776 }
23777 \f
23778 /* Calculate the length of the memory address in the instruction encoding.
23779 Includes the addr32 prefix; does not include the one-byte modrm, opcode,
23780 or other prefixes. We never generate an addr32 prefix for the LEA insn. */
23781
23782 int
23783 memory_address_length (rtx addr, bool lea)
23784 {
23785 struct ix86_address parts;
23786 rtx base, index, disp;
23787 int len;
23788 int ok;
23789
23790 if (GET_CODE (addr) == PRE_DEC
23791 || GET_CODE (addr) == POST_INC
23792 || GET_CODE (addr) == PRE_MODIFY
23793 || GET_CODE (addr) == POST_MODIFY)
23794 return 0;
23795
23796 ok = ix86_decompose_address (addr, &parts);
23797 gcc_assert (ok);
23798
23799 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23800
23801 /* If this is not a LEA instruction, add the length of the addr32 prefix. */
23802 if (TARGET_64BIT && !lea
23803 && (SImode_address_operand (addr, VOIDmode)
23804 || (parts.base && GET_MODE (parts.base) == SImode)
23805 || (parts.index && GET_MODE (parts.index) == SImode)))
23806 len++;
23807
23808 base = parts.base;
23809 index = parts.index;
23810 disp = parts.disp;
23811
23812 if (base && GET_CODE (base) == SUBREG)
23813 base = SUBREG_REG (base);
23814 if (index && GET_CODE (index) == SUBREG)
23815 index = SUBREG_REG (index);
23816
23817 gcc_assert (base == NULL_RTX || REG_P (base));
23818 gcc_assert (index == NULL_RTX || REG_P (index));
23819
23820 /* Rule of thumb:
23821 - esp as the base always wants an index,
23822 - ebp as the base always wants a displacement,
23823 - r12 as the base always wants an index,
23824 - r13 as the base always wants a displacement. */
23825
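/* A few illustrative cases of what the code below counts: a plain (%eax)
   address adds nothing here, while (%esp) needs a SIB byte and (%ebp) a
   disp8, so each of those adds one byte; a base plus a small constant adds
   one byte of disp8, a base plus a large constant adds four bytes of
   disp32, and any index adds the SIB byte.  */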
23826 /* Register Indirect. */
23827 if (base && !index && !disp)
23828 {
23829 /* esp (for its index) and ebp (for its displacement) need
23830 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23831 code. */
23832 if (base == arg_pointer_rtx
23833 || base == frame_pointer_rtx
23834 || REGNO (base) == SP_REG
23835 || REGNO (base) == BP_REG
23836 || REGNO (base) == R12_REG
23837 || REGNO (base) == R13_REG)
23838 len++;
23839 }
23840
23841 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23842 is not disp32, but disp32(%rip), so for disp32
23843 SIB byte is needed, unless print_operand_address
23844 optimizes it into disp32(%rip) or (%rip) is implied
23845 by UNSPEC. */
23846 else if (disp && !base && !index)
23847 {
23848 len += 4;
23849 if (TARGET_64BIT)
23850 {
23851 rtx symbol = disp;
23852
23853 if (GET_CODE (disp) == CONST)
23854 symbol = XEXP (disp, 0);
23855 if (GET_CODE (symbol) == PLUS
23856 && CONST_INT_P (XEXP (symbol, 1)))
23857 symbol = XEXP (symbol, 0);
23858
23859 if (GET_CODE (symbol) != LABEL_REF
23860 && (GET_CODE (symbol) != SYMBOL_REF
23861 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23862 && (GET_CODE (symbol) != UNSPEC
23863 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23864 && XINT (symbol, 1) != UNSPEC_PCREL
23865 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23866 len++;
23867 }
23868 }
23869 else
23870 {
23871 /* Find the length of the displacement constant. */
23872 if (disp)
23873 {
23874 if (base && satisfies_constraint_K (disp))
23875 len += 1;
23876 else
23877 len += 4;
23878 }
23879 /* ebp always wants a displacement. Similarly r13. */
23880 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23881 len++;
23882
23883 /* An index requires the two-byte modrm form.... */
23884 if (index
23885 /* ...like esp (or r12), which always wants an index. */
23886 || base == arg_pointer_rtx
23887 || base == frame_pointer_rtx
23888 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23889 len++;
23890 }
23891
23892 return len;
23893 }
23894
23895 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23896 is set, expect that the insn has an 8-bit immediate alternative. */
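/* For example (illustrative values only): with SHORTFORM set, an insn like
   "addl $100, %eax" fits the one-byte sign-extended immediate, while
   "addl $1000, %eax" needs the full four-byte immediate.  */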
23897 int
23898 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23899 {
23900 int len = 0;
23901 int i;
23902 extract_insn_cached (insn);
23903 for (i = recog_data.n_operands - 1; i >= 0; --i)
23904 if (CONSTANT_P (recog_data.operand[i]))
23905 {
23906 enum attr_mode mode = get_attr_mode (insn);
23907
23908 gcc_assert (!len);
23909 if (shortform && CONST_INT_P (recog_data.operand[i]))
23910 {
23911 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23912 switch (mode)
23913 {
23914 case MODE_QI:
23915 len = 1;
23916 continue;
23917 case MODE_HI:
23918 ival = trunc_int_for_mode (ival, HImode);
23919 break;
23920 case MODE_SI:
23921 ival = trunc_int_for_mode (ival, SImode);
23922 break;
23923 default:
23924 break;
23925 }
23926 if (IN_RANGE (ival, -128, 127))
23927 {
23928 len = 1;
23929 continue;
23930 }
23931 }
23932 switch (mode)
23933 {
23934 case MODE_QI:
23935 len = 1;
23936 break;
23937 case MODE_HI:
23938 len = 2;
23939 break;
23940 case MODE_SI:
23941 len = 4;
23942 break;
23943 /* Immediates for DImode instructions are encoded
23944 as 32bit sign extended values. */
23945 case MODE_DI:
23946 len = 4;
23947 break;
23948 default:
23949 fatal_insn ("unknown insn mode", insn);
23950 }
23951 }
23952 return len;
23953 }
23954
23955 /* Compute default value for "length_address" attribute. */
23956 int
23957 ix86_attr_length_address_default (rtx insn)
23958 {
23959 int i;
23960
23961 if (get_attr_type (insn) == TYPE_LEA)
23962 {
23963 rtx set = PATTERN (insn), addr;
23964
23965 if (GET_CODE (set) == PARALLEL)
23966 set = XVECEXP (set, 0, 0);
23967
23968 gcc_assert (GET_CODE (set) == SET);
23969
23970 addr = SET_SRC (set);
23971
23972 return memory_address_length (addr, true);
23973 }
23974
23975 extract_insn_cached (insn);
23976 for (i = recog_data.n_operands - 1; i >= 0; --i)
23977 if (MEM_P (recog_data.operand[i]))
23978 {
23979 constrain_operands_cached (reload_completed);
23980 if (which_alternative != -1)
23981 {
23982 const char *constraints = recog_data.constraints[i];
23983 int alt = which_alternative;
23984
23985 while (*constraints == '=' || *constraints == '+')
23986 constraints++;
23987 while (alt-- > 0)
23988 while (*constraints++ != ',')
23989 ;
23990 /* Skip ignored operands. */
23991 if (*constraints == 'X')
23992 continue;
23993 }
23994 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
23995 }
23996 return 0;
23997 }
23998
23999 /* Compute default value for "length_vex" attribute. It includes
24000 the 2- or 3-byte VEX prefix and 1 opcode byte. */
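/* Rough illustration (assumed encodings): an 0f-map insn such as vaddps
   that needs neither VEX.W nor the REX.X/REX.B bits can use the two-byte
   C5 prefix, while a DImode general register operand or an extended
   register in the address forces the three-byte C4 form.  */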
24001
24002 int
24003 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24004 {
24005 int i;
24006
24007 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
24008 requires the 3-byte VEX prefix. */
24009 if (!has_0f_opcode || has_vex_w)
24010 return 3 + 1;
24011
24012 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
24013 if (!TARGET_64BIT)
24014 return 2 + 1;
24015
24016 extract_insn_cached (insn);
24017
24018 for (i = recog_data.n_operands - 1; i >= 0; --i)
24019 if (REG_P (recog_data.operand[i]))
24020 {
24021 /* REX.W bit uses 3 byte VEX prefix. */
24022 if (GET_MODE (recog_data.operand[i]) == DImode
24023 && GENERAL_REG_P (recog_data.operand[i]))
24024 return 3 + 1;
24025 }
24026 else
24027 {
24028 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24029 if (MEM_P (recog_data.operand[i])
24030 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24031 return 3 + 1;
24032 }
24033
24034 return 2 + 1;
24035 }
24036 \f
24037 /* Return the maximum number of instructions a cpu can issue. */
24038
24039 static int
24040 ix86_issue_rate (void)
24041 {
24042 switch (ix86_tune)
24043 {
24044 case PROCESSOR_PENTIUM:
24045 case PROCESSOR_ATOM:
24046 case PROCESSOR_K6:
24047 case PROCESSOR_BTVER2:
24048 return 2;
24049
24050 case PROCESSOR_PENTIUMPRO:
24051 case PROCESSOR_PENTIUM4:
24052 case PROCESSOR_CORE2_32:
24053 case PROCESSOR_CORE2_64:
24054 case PROCESSOR_COREI7_32:
24055 case PROCESSOR_COREI7_64:
24056 case PROCESSOR_ATHLON:
24057 case PROCESSOR_K8:
24058 case PROCESSOR_AMDFAM10:
24059 case PROCESSOR_NOCONA:
24060 case PROCESSOR_GENERIC32:
24061 case PROCESSOR_GENERIC64:
24062 case PROCESSOR_BDVER1:
24063 case PROCESSOR_BDVER2:
24064 case PROCESSOR_BTVER1:
24065 return 3;
24066
24067 default:
24068 return 1;
24069 }
24070 }
24071
24072 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24073 set by DEP_INSN and nothing else set by DEP_INSN. */
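/* For instance, "test %eax, %eax" followed by a dependent "je" or "sete"
   is the pattern of interest: the consumer reads only the flags, so the
   Pentium case in ix86_adjust_cost can treat the pair as free.  */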
24074
24075 static bool
24076 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24077 {
24078 rtx set, set2;
24079
24080 /* Simplify the test for uninteresting insns. */
24081 if (insn_type != TYPE_SETCC
24082 && insn_type != TYPE_ICMOV
24083 && insn_type != TYPE_FCMOV
24084 && insn_type != TYPE_IBR)
24085 return false;
24086
24087 if ((set = single_set (dep_insn)) != 0)
24088 {
24089 set = SET_DEST (set);
24090 set2 = NULL_RTX;
24091 }
24092 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24093 && XVECLEN (PATTERN (dep_insn), 0) == 2
24094 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24095 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24096 {
24097 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24098 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24099 }
24100 else
24101 return false;
24102
24103 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24104 return false;
24105
24106 /* This test is true if the dependent insn reads the flags but
24107 not any other potentially set register. */
24108 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24109 return false;
24110
24111 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24112 return false;
24113
24114 return true;
24115 }
24116
24117 /* Return true iff USE_INSN has a memory address with operands set by
24118 SET_INSN. */
24119
24120 bool
24121 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24122 {
24123 int i;
24124 extract_insn_cached (use_insn);
24125 for (i = recog_data.n_operands - 1; i >= 0; --i)
24126 if (MEM_P (recog_data.operand[i]))
24127 {
24128 rtx addr = XEXP (recog_data.operand[i], 0);
24129 return modified_in_p (addr, set_insn) != 0;
24130 }
24131 return false;
24132 }
24133
24134 static int
24135 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24136 {
24137 enum attr_type insn_type, dep_insn_type;
24138 enum attr_memory memory;
24139 rtx set, set2;
24140 int dep_insn_code_number;
24141
24142 /* Anti and output dependencies have zero cost on all CPUs. */
24143 if (REG_NOTE_KIND (link) != 0)
24144 return 0;
24145
24146 dep_insn_code_number = recog_memoized (dep_insn);
24147
24148 /* If we can't recognize the insns, we can't really do anything. */
24149 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24150 return cost;
24151
24152 insn_type = get_attr_type (insn);
24153 dep_insn_type = get_attr_type (dep_insn);
24154
24155 switch (ix86_tune)
24156 {
24157 case PROCESSOR_PENTIUM:
24158 /* Address Generation Interlock adds a cycle of latency. */
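/* E.g. "addl $4, %ebx" immediately followed by "movl (%ebx), %eax"
   pays one extra cycle, since the load address is produced by the
   preceding instruction.  */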
24159 if (insn_type == TYPE_LEA)
24160 {
24161 rtx addr = PATTERN (insn);
24162
24163 if (GET_CODE (addr) == PARALLEL)
24164 addr = XVECEXP (addr, 0, 0);
24165
24166 gcc_assert (GET_CODE (addr) == SET);
24167
24168 addr = SET_SRC (addr);
24169 if (modified_in_p (addr, dep_insn))
24170 cost += 1;
24171 }
24172 else if (ix86_agi_dependent (dep_insn, insn))
24173 cost += 1;
24174
24175 /* ??? Compares pair with jump/setcc. */
24176 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24177 cost = 0;
24178
24179 /* Floating point stores require value to be ready one cycle earlier. */
24180 if (insn_type == TYPE_FMOV
24181 && get_attr_memory (insn) == MEMORY_STORE
24182 && !ix86_agi_dependent (dep_insn, insn))
24183 cost += 1;
24184 break;
24185
24186 case PROCESSOR_PENTIUMPRO:
24187 memory = get_attr_memory (insn);
24188
24189 /* INT->FP conversion is expensive. */
24190 if (get_attr_fp_int_src (dep_insn))
24191 cost += 5;
24192
24193 /* There is one cycle extra latency between an FP op and a store. */
24194 if (insn_type == TYPE_FMOV
24195 && (set = single_set (dep_insn)) != NULL_RTX
24196 && (set2 = single_set (insn)) != NULL_RTX
24197 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24198 && MEM_P (SET_DEST (set2)))
24199 cost += 1;
24200
24201 /* Show ability of reorder buffer to hide latency of load by executing
24202 in parallel with previous instruction in case
24203 previous instruction is not needed to compute the address. */
24204 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24205 && !ix86_agi_dependent (dep_insn, insn))
24206 {
24207 /* Claim moves to take one cycle, as the core can issue one load
24208 at a time and the next load can start a cycle later. */
24209 if (dep_insn_type == TYPE_IMOV
24210 || dep_insn_type == TYPE_FMOV)
24211 cost = 1;
24212 else if (cost > 1)
24213 cost--;
24214 }
24215 break;
24216
24217 case PROCESSOR_K6:
24218 memory = get_attr_memory (insn);
24219
24220 /* The esp dependency is resolved before the instruction is really
24221 finished. */
24222 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24223 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24224 return 1;
24225
24226 /* INT->FP conversion is expensive. */
24227 if (get_attr_fp_int_src (dep_insn))
24228 cost += 5;
24229
24230 /* Show ability of reorder buffer to hide latency of load by executing
24231 in parallel with previous instruction in case
24232 previous instruction is not needed to compute the address. */
24233 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24234 && !ix86_agi_dependent (dep_insn, insn))
24235 {
24236 /* Claim moves to take one cycle, as the core can issue one load
24237 at a time and the next load can start a cycle later. */
24238 if (dep_insn_type == TYPE_IMOV
24239 || dep_insn_type == TYPE_FMOV)
24240 cost = 1;
24241 else if (cost > 2)
24242 cost -= 2;
24243 else
24244 cost = 1;
24245 }
24246 break;
24247
24248 case PROCESSOR_ATHLON:
24249 case PROCESSOR_K8:
24250 case PROCESSOR_AMDFAM10:
24251 case PROCESSOR_BDVER1:
24252 case PROCESSOR_BDVER2:
24253 case PROCESSOR_BTVER1:
24254 case PROCESSOR_BTVER2:
24255 case PROCESSOR_ATOM:
24256 case PROCESSOR_GENERIC32:
24257 case PROCESSOR_GENERIC64:
24258 memory = get_attr_memory (insn);
24259
24260 /* Show ability of reorder buffer to hide latency of load by executing
24261 in parallel with previous instruction in case
24262 previous instruction is not needed to compute the address. */
24263 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24264 && !ix86_agi_dependent (dep_insn, insn))
24265 {
24266 enum attr_unit unit = get_attr_unit (insn);
24267 int loadcost = 3;
24268
24269 /* Because of the difference between the length of integer and
24270 floating unit pipeline preparation stages, the memory operands
24271 for floating point are cheaper.
24272
24273 ??? For Athlon the difference is most probably 2. */
24274 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24275 loadcost = 3;
24276 else
24277 loadcost = TARGET_ATHLON ? 2 : 0;
24278
24279 if (cost >= loadcost)
24280 cost -= loadcost;
24281 else
24282 cost = 0;
24283 }
24284
24285 default:
24286 break;
24287 }
24288
24289 return cost;
24290 }
24291
24292 /* How many alternative schedules to try. This should be as wide as the
24293 scheduling freedom in the DFA, but no wider. Making this value too
24294 large results in extra work for the scheduler. */
24295
24296 static int
24297 ia32_multipass_dfa_lookahead (void)
24298 {
24299 switch (ix86_tune)
24300 {
24301 case PROCESSOR_PENTIUM:
24302 return 2;
24303
24304 case PROCESSOR_PENTIUMPRO:
24305 case PROCESSOR_K6:
24306 return 1;
24307
24308 case PROCESSOR_CORE2_32:
24309 case PROCESSOR_CORE2_64:
24310 case PROCESSOR_COREI7_32:
24311 case PROCESSOR_COREI7_64:
24312 case PROCESSOR_ATOM:
24313 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24314 as the number of instructions that can be executed in a cycle, i.e.,
24315 issue_rate. I wonder why tuning for many CPUs does not do this. */
24316 if (reload_completed)
24317 return ix86_issue_rate ();
24318 /* Don't use lookahead for pre-reload schedule to save compile time. */
24319 return 0;
24320
24321 default:
24322 return 0;
24323 }
24324 }
24325
24326 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24327 execution. It is applied if
24328 (1) an IMUL instruction is at the top of the list;
24329 (2) there is exactly one producer of an independent IMUL instruction
24330 in the ready list.
24331 The found producer is then put at the top of the ready list.
24332 Returns the issue rate. */
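/* For example (illustration only): if "imull %ebx, %eax" is at the top of
   the ready list and the only producer of an input of a second, independent
   imul sits further down, moving that producer to the top lets the second
   imul follow the first one into the pipelined IMUL unit.  */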
24333
24334 static int
24335 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24336 int clock_var ATTRIBUTE_UNUSED)
24337 {
24338 static int issue_rate = -1;
24339 int n_ready = *pn_ready;
24340 rtx insn, insn1, insn2;
24341 int i;
24342 sd_iterator_def sd_it;
24343 dep_t dep;
24344 int index = -1;
24345
24346 /* Set up issue rate. */
24347 issue_rate = ix86_issue_rate ();
24348
24349 /* Do reordering for Atom only. */
24350 if (ix86_tune != PROCESSOR_ATOM)
24351 return issue_rate;
24352 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24353 if (!reload_completed)
24354 return issue_rate;
24355 /* Nothing to do if ready list contains only 1 instruction. */
24356 if (n_ready <= 1)
24357 return issue_rate;
24358
24359 /* Check that IMUL instruction is on the top of ready list. */
24360 insn = ready[n_ready - 1];
24361 if (!NONDEBUG_INSN_P (insn))
24362 return issue_rate;
24363 insn = PATTERN (insn);
24364 if (GET_CODE (insn) == PARALLEL)
24365 insn = XVECEXP (insn, 0, 0);
24366 if (GET_CODE (insn) != SET)
24367 return issue_rate;
24368 if (!(GET_CODE (SET_SRC (insn)) == MULT
24369 && GET_MODE (SET_SRC (insn)) == SImode))
24370 return issue_rate;
24371
24372 /* Search for producer of independent IMUL instruction. */
24373 for (i = n_ready - 2; i >= 0; i--)
24374 {
24375 insn = ready[i];
24376 if (!NONDEBUG_INSN_P (insn))
24377 continue;
24378 /* Skip IMUL instruction. */
24379 insn2 = PATTERN (insn);
24380 if (GET_CODE (insn2) == PARALLEL)
24381 insn2 = XVECEXP (insn2, 0, 0);
24382 if (GET_CODE (insn2) == SET
24383 && GET_CODE (SET_SRC (insn2)) == MULT
24384 && GET_MODE (SET_SRC (insn2)) == SImode)
24385 continue;
24386
24387 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24388 {
24389 rtx con;
24390 con = DEP_CON (dep);
24391 if (!NONDEBUG_INSN_P (con))
24392 continue;
24393 insn1 = PATTERN (con);
24394 if (GET_CODE (insn1) == PARALLEL)
24395 insn1 = XVECEXP (insn1, 0, 0);
24396
24397 if (GET_CODE (insn1) == SET
24398 && GET_CODE (SET_SRC (insn1)) == MULT
24399 && GET_MODE (SET_SRC (insn1)) == SImode)
24400 {
24401 sd_iterator_def sd_it1;
24402 dep_t dep1;
24403 /* Check if there is no other dependee for IMUL. */
24404 index = i;
24405 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24406 {
24407 rtx pro;
24408 pro = DEP_PRO (dep1);
24409 if (!NONDEBUG_INSN_P (pro))
24410 continue;
24411 if (pro != insn)
24412 index = -1;
24413 }
24414 if (index >= 0)
24415 break;
24416 }
24417 }
24418 if (index >= 0)
24419 break;
24420 }
24421 if (index < 0)
24422 return issue_rate; /* Didn't find IMUL producer. */
24423
24424 if (sched_verbose > 1)
24425 fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24426 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24427
24428 /* Put IMUL producer (ready[index]) at the top of ready list. */
24429 insn1 = ready[index];
24430 for (i = index; i < n_ready - 1; i++)
24431 ready[i] = ready[i + 1];
24432 ready[n_ready - 1] = insn1;
24433
24434 return issue_rate;
24435 }
24436
24437 static bool
24438 ix86_class_likely_spilled_p (reg_class_t);
24439
24440 /* Return true if the lhs of INSN is a HW function argument register; set
24441 IS_SPILLED to true if it is a likely spilled HW register. */
24442 static bool
24443 insn_is_function_arg (rtx insn, bool* is_spilled)
24444 {
24445 rtx dst;
24446
24447 if (!NONDEBUG_INSN_P (insn))
24448 return false;
24449 /* Call instructions are not movable; ignore them. */
24450 if (CALL_P (insn))
24451 return false;
24452 insn = PATTERN (insn);
24453 if (GET_CODE (insn) == PARALLEL)
24454 insn = XVECEXP (insn, 0, 0);
24455 if (GET_CODE (insn) != SET)
24456 return false;
24457 dst = SET_DEST (insn);
24458 if (REG_P (dst) && HARD_REGISTER_P (dst)
24459 && ix86_function_arg_regno_p (REGNO (dst)))
24460 {
24461 /* Is it a likely spilled HW register? */
24462 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24463 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24464 *is_spilled = true;
24465 return true;
24466 }
24467 return false;
24468 }
24469
24470 /* Add output dependencies for a chain of adjacent function arguments, but
24471 only if there is a move to a likely spilled HW register. Return the first
24472 argument if at least one dependence was added, or NULL otherwise. */
24473 static rtx
24474 add_parameter_dependencies (rtx call, rtx head)
24475 {
24476 rtx insn;
24477 rtx last = call;
24478 rtx first_arg = NULL;
24479 bool is_spilled = false;
24480
24481 head = PREV_INSN (head);
24482
24483 /* Find the argument-passing instruction nearest to the call. */
24484 while (true)
24485 {
24486 last = PREV_INSN (last);
24487 if (last == head)
24488 return NULL;
24489 if (!NONDEBUG_INSN_P (last))
24490 continue;
24491 if (insn_is_function_arg (last, &is_spilled))
24492 break;
24493 return NULL;
24494 }
24495
24496 first_arg = last;
24497 while (true)
24498 {
24499 insn = PREV_INSN (last);
24500 if (!INSN_P (insn))
24501 break;
24502 if (insn == head)
24503 break;
24504 if (!NONDEBUG_INSN_P (insn))
24505 {
24506 last = insn;
24507 continue;
24508 }
24509 if (insn_is_function_arg (insn, &is_spilled))
24510 {
24511 /* Add an output dependence between two function arguments if the chain
24512 of output arguments contains likely spilled HW registers. */
24513 if (is_spilled)
24514 add_dependence (last, insn, REG_DEP_OUTPUT);
24515 first_arg = last = insn;
24516 }
24517 else
24518 break;
24519 }
24520 if (!is_spilled)
24521 return NULL;
24522 return first_arg;
24523 }
24524
24525 /* Add output or anti dependency from insn to first_arg to restrict its code
24526 motion. */
24527 static void
24528 avoid_func_arg_motion (rtx first_arg, rtx insn)
24529 {
24530 rtx set;
24531 rtx tmp;
24532
24533 set = single_set (insn);
24534 if (!set)
24535 return;
24536 tmp = SET_DEST (set);
24537 if (REG_P (tmp))
24538 {
24539 /* Add output dependency to the first function argument. */
24540 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24541 return;
24542 }
24543 /* Add anti dependency. */
24544 add_dependence (first_arg, insn, REG_DEP_ANTI);
24545 }
24546
24547 /* Avoid cross-block motion of a function argument by adding a dependency
24548 from the first non-jump instruction in bb. */
24549 static void
24550 add_dependee_for_func_arg (rtx arg, basic_block bb)
24551 {
24552 rtx insn = BB_END (bb);
24553
24554 while (insn)
24555 {
24556 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24557 {
24558 rtx set = single_set (insn);
24559 if (set)
24560 {
24561 avoid_func_arg_motion (arg, insn);
24562 return;
24563 }
24564 }
24565 if (insn == BB_HEAD (bb))
24566 return;
24567 insn = PREV_INSN (insn);
24568 }
24569 }
24570
24571 /* Hook for pre-reload schedule - avoid motion of function arguments
24572 passed in likely spilled HW registers. */
24573 static void
24574 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24575 {
24576 rtx insn;
24577 rtx first_arg = NULL;
24578 if (reload_completed)
24579 return;
24580 while (head != tail && DEBUG_INSN_P (head))
24581 head = NEXT_INSN (head);
24582 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24583 if (INSN_P (insn) && CALL_P (insn))
24584 {
24585 first_arg = add_parameter_dependencies (insn, head);
24586 if (first_arg)
24587 {
24588 /* Add a dependee for the first argument to predecessors, but only if the
24589 region contains more than one block. */
24590 basic_block bb = BLOCK_FOR_INSN (insn);
24591 int rgn = CONTAINING_RGN (bb->index);
24592 int nr_blks = RGN_NR_BLOCKS (rgn);
24593 /* Skip trivial regions and region head blocks that can have
24594 predecessors outside of region. */
24595 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24596 {
24597 edge e;
24598 edge_iterator ei;
24599 /* Assume that the region is an SCC, i.e. all immediate predecessors
24600 of a non-head block are in the same region. */
24601 FOR_EACH_EDGE (e, ei, bb->preds)
24602 {
24603 /* Avoid creating loop-carried dependencies by using the
24604 topological ordering in the region. */
24605 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24606 add_dependee_for_func_arg (first_arg, e->src);
24607 }
24608 }
24609 insn = first_arg;
24610 if (insn == head)
24611 break;
24612 }
24613 }
24614 else if (first_arg)
24615 avoid_func_arg_motion (first_arg, insn);
24616 }
24617
24618 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24619 HW registers to maximum, to schedule them as soon as possible. These are
24620 moves from function argument registers at the top of the function entry
24621 and moves from function return value registers after call. */
24622 static int
24623 ix86_adjust_priority (rtx insn, int priority)
24624 {
24625 rtx set;
24626
24627 if (reload_completed)
24628 return priority;
24629
24630 if (!NONDEBUG_INSN_P (insn))
24631 return priority;
24632
24633 set = single_set (insn);
24634 if (set)
24635 {
24636 rtx tmp = SET_SRC (set);
24637 if (REG_P (tmp)
24638 && HARD_REGISTER_P (tmp)
24639 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24640 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24641 return current_sched_info->sched_max_insns_priority;
24642 }
24643
24644 return priority;
24645 }
24646
24647 /* Model decoder of Core 2/i7.
24648 The hooks below, for multipass scheduling (see haifa-sched.c:max_issue),
24649 track the instruction fetch block boundaries and make sure that long
24650 (9+ byte) instructions are assigned to D0. */
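/* A rough picture of the model: each cycle the decoders see one ifetch
   block of core2i7_ifetch_block_size bytes and at most
   core2i7_ifetch_block_max_insns instructions from it; only the first
   decoder accepts long instructions, so once the block is full the
   remaining candidates are masked out of the ready list for this cycle.  */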
24651
24652 /* Maximum length of an insn that can be handled by
24653 a secondary decoder unit. '8' for Core 2/i7. */
24654 static int core2i7_secondary_decoder_max_insn_size;
24655
24656 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24657 '16' for Core 2/i7. */
24658 static int core2i7_ifetch_block_size;
24659
24660 /* Maximum number of instructions decoder can handle per cycle.
24661 '6' for Core 2/i7. */
24662 static int core2i7_ifetch_block_max_insns;
24663
24664 typedef struct ix86_first_cycle_multipass_data_ *
24665 ix86_first_cycle_multipass_data_t;
24666 typedef const struct ix86_first_cycle_multipass_data_ *
24667 const_ix86_first_cycle_multipass_data_t;
24668
24669 /* A variable to store target state across calls to max_issue within
24670 one cycle. */
24671 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24672 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24673
24674 /* Initialize DATA. */
24675 static void
24676 core2i7_first_cycle_multipass_init (void *_data)
24677 {
24678 ix86_first_cycle_multipass_data_t data
24679 = (ix86_first_cycle_multipass_data_t) _data;
24680
24681 data->ifetch_block_len = 0;
24682 data->ifetch_block_n_insns = 0;
24683 data->ready_try_change = NULL;
24684 data->ready_try_change_size = 0;
24685 }
24686
24687 /* Advancing the cycle; reset ifetch block counts. */
24688 static void
24689 core2i7_dfa_post_advance_cycle (void)
24690 {
24691 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24692
24693 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24694
24695 data->ifetch_block_len = 0;
24696 data->ifetch_block_n_insns = 0;
24697 }
24698
24699 static int min_insn_size (rtx);
24700
24701 /* Filter out insns from ready_try that the core will not be able to issue
24702 on current cycle due to decoder. */
24703 static void
24704 core2i7_first_cycle_multipass_filter_ready_try
24705 (const_ix86_first_cycle_multipass_data_t data,
24706 char *ready_try, int n_ready, bool first_cycle_insn_p)
24707 {
24708 while (n_ready--)
24709 {
24710 rtx insn;
24711 int insn_size;
24712
24713 if (ready_try[n_ready])
24714 continue;
24715
24716 insn = get_ready_element (n_ready);
24717 insn_size = min_insn_size (insn);
24718
24719 if (/* If this insn is too long for a secondary decoder ... */
24720 (!first_cycle_insn_p
24721 && insn_size > core2i7_secondary_decoder_max_insn_size)
24722 /* ... or it would not fit into the ifetch block ... */
24723 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24724 /* ... or the decoder is full already ... */
24725 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24726 /* ... mask the insn out. */
24727 {
24728 ready_try[n_ready] = 1;
24729
24730 if (data->ready_try_change)
24731 SET_BIT (data->ready_try_change, n_ready);
24732 }
24733 }
24734 }
24735
24736 /* Prepare for a new round of multipass lookahead scheduling. */
24737 static void
24738 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24739 bool first_cycle_insn_p)
24740 {
24741 ix86_first_cycle_multipass_data_t data
24742 = (ix86_first_cycle_multipass_data_t) _data;
24743 const_ix86_first_cycle_multipass_data_t prev_data
24744 = ix86_first_cycle_multipass_data;
24745
24746 /* Restore the state from the end of the previous round. */
24747 data->ifetch_block_len = prev_data->ifetch_block_len;
24748 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24749
24750 /* Filter instructions that cannot be issued on current cycle due to
24751 decoder restrictions. */
24752 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24753 first_cycle_insn_p);
24754 }
24755
24756 /* INSN is being issued in current solution. Account for its impact on
24757 the decoder model. */
24758 static void
24759 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24760 rtx insn, const void *_prev_data)
24761 {
24762 ix86_first_cycle_multipass_data_t data
24763 = (ix86_first_cycle_multipass_data_t) _data;
24764 const_ix86_first_cycle_multipass_data_t prev_data
24765 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24766
24767 int insn_size = min_insn_size (insn);
24768
24769 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24770 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24771 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24772 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24773
24774 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24775 if (!data->ready_try_change)
24776 {
24777 data->ready_try_change = sbitmap_alloc (n_ready);
24778 data->ready_try_change_size = n_ready;
24779 }
24780 else if (data->ready_try_change_size < n_ready)
24781 {
24782 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24783 n_ready, 0);
24784 data->ready_try_change_size = n_ready;
24785 }
24786 sbitmap_zero (data->ready_try_change);
24787
24788 /* Filter out insns from ready_try that the core will not be able to issue
24789 on current cycle due to decoder. */
24790 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24791 false);
24792 }
24793
24794 /* Revert the effect on ready_try. */
24795 static void
24796 core2i7_first_cycle_multipass_backtrack (const void *_data,
24797 char *ready_try,
24798 int n_ready ATTRIBUTE_UNUSED)
24799 {
24800 const_ix86_first_cycle_multipass_data_t data
24801 = (const_ix86_first_cycle_multipass_data_t) _data;
24802 unsigned int i = 0;
24803 sbitmap_iterator sbi;
24804
24805 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24806 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24807 {
24808 ready_try[i] = 0;
24809 }
24810 }
24811
24812 /* Save the result of multipass lookahead scheduling for the next round. */
24813 static void
24814 core2i7_first_cycle_multipass_end (const void *_data)
24815 {
24816 const_ix86_first_cycle_multipass_data_t data
24817 = (const_ix86_first_cycle_multipass_data_t) _data;
24818 ix86_first_cycle_multipass_data_t next_data
24819 = ix86_first_cycle_multipass_data;
24820
24821 if (data != NULL)
24822 {
24823 next_data->ifetch_block_len = data->ifetch_block_len;
24824 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24825 }
24826 }
24827
24828 /* Deallocate target data. */
24829 static void
24830 core2i7_first_cycle_multipass_fini (void *_data)
24831 {
24832 ix86_first_cycle_multipass_data_t data
24833 = (ix86_first_cycle_multipass_data_t) _data;
24834
24835 if (data->ready_try_change)
24836 {
24837 sbitmap_free (data->ready_try_change);
24838 data->ready_try_change = NULL;
24839 data->ready_try_change_size = 0;
24840 }
24841 }
24842
24843 /* Prepare for scheduling pass. */
24844 static void
24845 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24846 int verbose ATTRIBUTE_UNUSED,
24847 int max_uid ATTRIBUTE_UNUSED)
24848 {
24849 /* Install scheduling hooks for current CPU. Some of these hooks are used
24850 in time-critical parts of the scheduler, so we only set them up when
24851 they are actually used. */
24852 switch (ix86_tune)
24853 {
24854 case PROCESSOR_CORE2_32:
24855 case PROCESSOR_CORE2_64:
24856 case PROCESSOR_COREI7_32:
24857 case PROCESSOR_COREI7_64:
24858 /* Do not perform multipass scheduling for pre-reload schedule
24859 to save compile time. */
24860 if (reload_completed)
24861 {
24862 targetm.sched.dfa_post_advance_cycle
24863 = core2i7_dfa_post_advance_cycle;
24864 targetm.sched.first_cycle_multipass_init
24865 = core2i7_first_cycle_multipass_init;
24866 targetm.sched.first_cycle_multipass_begin
24867 = core2i7_first_cycle_multipass_begin;
24868 targetm.sched.first_cycle_multipass_issue
24869 = core2i7_first_cycle_multipass_issue;
24870 targetm.sched.first_cycle_multipass_backtrack
24871 = core2i7_first_cycle_multipass_backtrack;
24872 targetm.sched.first_cycle_multipass_end
24873 = core2i7_first_cycle_multipass_end;
24874 targetm.sched.first_cycle_multipass_fini
24875 = core2i7_first_cycle_multipass_fini;
24876
24877 /* Set decoder parameters. */
24878 core2i7_secondary_decoder_max_insn_size = 8;
24879 core2i7_ifetch_block_size = 16;
24880 core2i7_ifetch_block_max_insns = 6;
24881 break;
24882 }
24883 /* ... Fall through ... */
24884 default:
24885 targetm.sched.dfa_post_advance_cycle = NULL;
24886 targetm.sched.first_cycle_multipass_init = NULL;
24887 targetm.sched.first_cycle_multipass_begin = NULL;
24888 targetm.sched.first_cycle_multipass_issue = NULL;
24889 targetm.sched.first_cycle_multipass_backtrack = NULL;
24890 targetm.sched.first_cycle_multipass_end = NULL;
24891 targetm.sched.first_cycle_multipass_fini = NULL;
24892 break;
24893 }
24894 }
24895
24896 \f
24897 /* Compute the alignment given to a constant that is being placed in memory.
24898 EXP is the constant and ALIGN is the alignment that the object would
24899 ordinarily have.
24900 The value of this function is used instead of that alignment to align
24901 the object. */
24902
24903 int
24904 ix86_constant_alignment (tree exp, int align)
24905 {
24906 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24907 || TREE_CODE (exp) == INTEGER_CST)
24908 {
24909 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24910 return 64;
24911 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24912 return 128;
24913 }
24914 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24915 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24916 return BITS_PER_WORD;
24917
24918 return align;
24919 }
24920
24921 /* Compute the alignment for a static variable.
24922 TYPE is the data type, and ALIGN is the alignment that
24923 the object would ordinarily have. The value of this function is used
24924 instead of that alignment to align the object. */
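/* For example, when not optimizing for size a static object such as
   "static char buf[300];" is large enough to be given the 256-bit
   max_align computed below, which can let memory routines use wider
   aligned accesses on it.  (Illustrative case.)  */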
24925
24926 int
24927 ix86_data_alignment (tree type, int align)
24928 {
24929 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24930
24931 if (AGGREGATE_TYPE_P (type)
24932 && TYPE_SIZE (type)
24933 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24934 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24935 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24936 && align < max_align)
24937 align = max_align;
24938
24939 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
24940 to 16byte boundary. */
24941 if (TARGET_64BIT)
24942 {
24943 if (AGGREGATE_TYPE_P (type)
24944 && TYPE_SIZE (type)
24945 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24946 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24947 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24948 return 128;
24949 }
24950
24951 if (TREE_CODE (type) == ARRAY_TYPE)
24952 {
24953 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24954 return 64;
24955 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24956 return 128;
24957 }
24958 else if (TREE_CODE (type) == COMPLEX_TYPE)
24959 {
24960
24961 if (TYPE_MODE (type) == DCmode && align < 64)
24962 return 64;
24963 if ((TYPE_MODE (type) == XCmode
24964 || TYPE_MODE (type) == TCmode) && align < 128)
24965 return 128;
24966 }
24967 else if ((TREE_CODE (type) == RECORD_TYPE
24968 || TREE_CODE (type) == UNION_TYPE
24969 || TREE_CODE (type) == QUAL_UNION_TYPE)
24970 && TYPE_FIELDS (type))
24971 {
24972 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24973 return 64;
24974 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24975 return 128;
24976 }
24977 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24978 || TREE_CODE (type) == INTEGER_TYPE)
24979 {
24980 if (TYPE_MODE (type) == DFmode && align < 64)
24981 return 64;
24982 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24983 return 128;
24984 }
24985
24986 return align;
24987 }
24988
24989 /* Compute the alignment for a local variable or a stack slot. EXP is
24990 the data type or decl itself, MODE is the widest mode available and
24991 ALIGN is the alignment that the object would ordinarily have. The
24992 value of this macro is used instead of that alignment to align the
24993 object. */
24994
24995 unsigned int
24996 ix86_local_alignment (tree exp, enum machine_mode mode,
24997 unsigned int align)
24998 {
24999 tree type, decl;
25000
25001 if (exp && DECL_P (exp))
25002 {
25003 type = TREE_TYPE (exp);
25004 decl = exp;
25005 }
25006 else
25007 {
25008 type = exp;
25009 decl = NULL;
25010 }
25011
25012 /* Don't do dynamic stack realignment for long long objects with
25013 -mpreferred-stack-boundary=2. */
25014 if (!TARGET_64BIT
25015 && align == 64
25016 && ix86_preferred_stack_boundary < 64
25017 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25018 && (!type || !TYPE_USER_ALIGN (type))
25019 && (!decl || !DECL_USER_ALIGN (decl)))
25020 align = 32;
25021
25022 /* If TYPE is NULL, we are allocating a stack slot for caller-save
25023 register in MODE. We will return the largest alignment of XF
25024 and DF. */
25025 if (!type)
25026 {
25027 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25028 align = GET_MODE_ALIGNMENT (DFmode);
25029 return align;
25030 }
25031
25032 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
25033 to 16byte boundary. Exact wording is:
25034
25035 An array uses the same alignment as its elements, except that a local or
25036 global array variable of length at least 16 bytes or
25037 a C99 variable-length array variable always has alignment of at least 16 bytes.
25038
25039 This was added to allow use of aligned SSE instructions on arrays. The
25040 rule is meant for static storage (where the compiler cannot do the
25041 analysis by itself). We follow it for automatic variables only when it
25042 is convenient; we fully control everything in the function being
25043 compiled, and functions from other units cannot rely on the alignment.
25044
25045 Exclude the va_list type. It is the common case of a local array where
25046 we cannot benefit from the alignment. */
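/* For instance, under these rules a local "double buf[4]" (32 bytes) is
   bumped to 128-bit alignment on x86-64 when optimizing for speed with SSE
   enabled, while a plain "double" keeps its natural 64-bit alignment.
   (Illustrative case.)  */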
25047 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25048 && TARGET_SSE)
25049 {
25050 if (AGGREGATE_TYPE_P (type)
25051 && (va_list_type_node == NULL_TREE
25052 || (TYPE_MAIN_VARIANT (type)
25053 != TYPE_MAIN_VARIANT (va_list_type_node)))
25054 && TYPE_SIZE (type)
25055 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25056 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25057 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25058 return 128;
25059 }
25060 if (TREE_CODE (type) == ARRAY_TYPE)
25061 {
25062 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25063 return 64;
25064 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25065 return 128;
25066 }
25067 else if (TREE_CODE (type) == COMPLEX_TYPE)
25068 {
25069 if (TYPE_MODE (type) == DCmode && align < 64)
25070 return 64;
25071 if ((TYPE_MODE (type) == XCmode
25072 || TYPE_MODE (type) == TCmode) && align < 128)
25073 return 128;
25074 }
25075 else if ((TREE_CODE (type) == RECORD_TYPE
25076 || TREE_CODE (type) == UNION_TYPE
25077 || TREE_CODE (type) == QUAL_UNION_TYPE)
25078 && TYPE_FIELDS (type))
25079 {
25080 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25081 return 64;
25082 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25083 return 128;
25084 }
25085 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25086 || TREE_CODE (type) == INTEGER_TYPE)
25087 {
25088
25089 if (TYPE_MODE (type) == DFmode && align < 64)
25090 return 64;
25091 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25092 return 128;
25093 }
25094 return align;
25095 }
25096
25097 /* Compute the minimum required alignment for dynamic stack realignment
25098 purposes for a local variable, parameter or a stack slot. EXP is
25099 the data type or decl itself, MODE is its mode and ALIGN is the
25100 alignment that the object would ordinarily have. */
25101
25102 unsigned int
25103 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25104 unsigned int align)
25105 {
25106 tree type, decl;
25107
25108 if (exp && DECL_P (exp))
25109 {
25110 type = TREE_TYPE (exp);
25111 decl = exp;
25112 }
25113 else
25114 {
25115 type = exp;
25116 decl = NULL;
25117 }
25118
25119 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25120 return align;
25121
25122 /* Don't do dynamic stack realignment for long long objects with
25123 -mpreferred-stack-boundary=2. */
25124 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25125 && (!type || !TYPE_USER_ALIGN (type))
25126 && (!decl || !DECL_USER_ALIGN (decl)))
25127 return 32;
25128
25129 return align;
25130 }
25131 \f
25132 /* Find a location for the static chain incoming to a nested function.
25133 This is a register, unless all free registers are used by arguments. */
25134
25135 static rtx
25136 ix86_static_chain (const_tree fndecl, bool incoming_p)
25137 {
25138 unsigned regno;
25139
25140 if (!DECL_STATIC_CHAIN (fndecl))
25141 return NULL;
25142
25143 if (TARGET_64BIT)
25144 {
25145 /* We always use R10 in 64-bit mode. */
25146 regno = R10_REG;
25147 }
25148 else
25149 {
25150 tree fntype;
25151 unsigned int ccvt;
25152
25153 /* By default in 32-bit mode we use ECX to pass the static chain. */
25154 regno = CX_REG;
25155
25156 fntype = TREE_TYPE (fndecl);
25157 ccvt = ix86_get_callcvt (fntype);
25158 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
25159 {
25160 /* Fastcall functions use ecx/edx for arguments, which leaves
25161 us with EAX for the static chain.
25162 Thiscall functions use ecx for arguments, which also
25163 leaves us with EAX for the static chain. */
25164 regno = AX_REG;
25165 }
25166 else if (ix86_function_regparm (fntype, fndecl) == 3)
25167 {
25168 /* For regparm 3, we have no free call-clobbered registers in
25169 which to store the static chain. In order to implement this,
25170 we have the trampoline push the static chain to the stack.
25171 However, we can't push a value below the return address when
25172 we call the nested function directly, so we have to use an
25173 alternate entry point. For this we use ESI, and have the
25174 alternate entry point push ESI, so that things appear the
25175 same once we're executing the nested function. */
25176 if (incoming_p)
25177 {
25178 if (fndecl == current_function_decl)
25179 ix86_static_chain_on_stack = true;
25180 return gen_frame_mem (SImode,
25181 plus_constant (Pmode,
25182 arg_pointer_rtx, -8));
25183 }
25184 regno = SI_REG;
25185 }
25186 }
25187
25188 return gen_rtx_REG (Pmode, regno);
25189 }
25190
25191 /* Emit RTL insns to initialize the variable parts of a trampoline.
25192 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25193 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25194 to be passed to the target function. */
25195
25196 static void
25197 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25198 {
25199 rtx mem, fnaddr;
25200 int opcode;
25201 int offset = 0;
25202
25203 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25204
25205 if (TARGET_64BIT)
25206 {
25207 int size;
25208
25209 /* Load the function address into r11. Try to load the address using
25210 the shorter movl instead of movabs. We may want to support
25211 movq for kernel mode, but the kernel does not use trampolines at
25212 the moment. FNADDR is a 32-bit address and may not be in
25213 DImode when ptr_mode == SImode. Always use movl in this
25214 case. */
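/* The stores below amount to roughly this template (shown for the full
   64-bit address case; illustration only):
     49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
     49 ba <8-byte chain>    movabs $chain,  %r10
     49 ff e3                jmp    *%r11
     90                      nop (pads the final 32-bit store)  */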
25215 if (ptr_mode == SImode
25216 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25217 {
25218 fnaddr = copy_addr_to_reg (fnaddr);
25219
25220 mem = adjust_address (m_tramp, HImode, offset);
25221 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25222
25223 mem = adjust_address (m_tramp, SImode, offset + 2);
25224 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25225 offset += 6;
25226 }
25227 else
25228 {
25229 mem = adjust_address (m_tramp, HImode, offset);
25230 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25231
25232 mem = adjust_address (m_tramp, DImode, offset + 2);
25233 emit_move_insn (mem, fnaddr);
25234 offset += 10;
25235 }
25236
25237 /* Load static chain using movabs to r10. Use the shorter movl
25238 instead of movabs when ptr_mode == SImode. */
25239 if (ptr_mode == SImode)
25240 {
25241 opcode = 0xba41;
25242 size = 6;
25243 }
25244 else
25245 {
25246 opcode = 0xba49;
25247 size = 10;
25248 }
25249
25250 mem = adjust_address (m_tramp, HImode, offset);
25251 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25252
25253 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25254 emit_move_insn (mem, chain_value);
25255 offset += size;
25256
25257 /* Jump to r11; the last (unused) byte is a nop, only there to
25258 pad the write out to a single 32-bit store. */
25259 mem = adjust_address (m_tramp, SImode, offset);
25260 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25261 offset += 4;
25262 }
25263 else
25264 {
25265 rtx disp, chain;
25266
25267 /* Depending on the static chain location, either load a register
25268 with a constant, or push the constant to the stack. All of the
25269 instructions are the same size. */
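/* So the emitted 32-bit template is, e.g. with the static chain in %ecx
   (illustration only):
     b9 <4-byte chain>   movl $chain, %ecx   (b8 for %eax, 68 for a push)
     e9 <4-byte rel32>   jmp  fnaddr  */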
25270 chain = ix86_static_chain (fndecl, true);
25271 if (REG_P (chain))
25272 {
25273 switch (REGNO (chain))
25274 {
25275 case AX_REG:
25276 opcode = 0xb8; break;
25277 case CX_REG:
25278 opcode = 0xb9; break;
25279 default:
25280 gcc_unreachable ();
25281 }
25282 }
25283 else
25284 opcode = 0x68;
25285
25286 mem = adjust_address (m_tramp, QImode, offset);
25287 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25288
25289 mem = adjust_address (m_tramp, SImode, offset + 1);
25290 emit_move_insn (mem, chain_value);
25291 offset += 5;
25292
25293 mem = adjust_address (m_tramp, QImode, offset);
25294 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25295
25296 mem = adjust_address (m_tramp, SImode, offset + 1);
25297
25298 /* Compute offset from the end of the jmp to the target function.
25299 In the case in which the trampoline stores the static chain on
25300 the stack, we need to skip the first insn which pushes the
25301 (call-saved) register static chain; this push is 1 byte. */
25302 offset += 5;
25303 disp = expand_binop (SImode, sub_optab, fnaddr,
25304 plus_constant (Pmode, XEXP (m_tramp, 0),
25305 offset - (MEM_P (chain) ? 1 : 0)),
25306 NULL_RTX, 1, OPTAB_DIRECT);
25307 emit_move_insn (mem, disp);
25308 }
25309
25310 gcc_assert (offset <= TRAMPOLINE_SIZE);
25311
25312 #ifdef HAVE_ENABLE_EXECUTE_STACK
25313 #ifdef CHECK_EXECUTE_STACK_ENABLED
25314 if (CHECK_EXECUTE_STACK_ENABLED)
25315 #endif
25316 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25317 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25318 #endif
25319 }
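/* Illustrative sketch, not part of the original source: when both the
   function address and the static chain require movabs (64-bit ptr_mode
   and an address that is not a zero-extended 32-bit immediate), the
   stores above lay down this 24-byte sequence:

	49 bb <imm64>	movabs $FNADDR, %r11
	49 ba <imm64>	movabs $CHAIN_VALUE, %r10
	49 ff e3	jmp    *%r11
	90		nop		(pads the jmp to one 32-bit store)

   The HImode constants 0xbb49 and 0xba49 are just the little-endian
   encodings of the REX prefix followed by the mov opcode byte.  The
   32-bit trampoline instead emits a mov-to-%eax/%ecx or a push of
   CHAIN_VALUE, followed by a jmp rel32 to the target function.  */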
25320 \f
25321 /* The following file contains several enumerations and data structures
25322 built from the definitions in i386-builtin-types.def. */
25323
25324 #include "i386-builtin-types.inc"
25325
25326 /* Table for the ix86 builtin non-function types. */
25327 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25328
25329 /* Retrieve an element from the above table, building some of
25330 the types lazily. */
25331
25332 static tree
25333 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25334 {
25335 unsigned int index;
25336 tree type, itype;
25337
25338 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25339
25340 type = ix86_builtin_type_tab[(int) tcode];
25341 if (type != NULL)
25342 return type;
25343
25344 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25345 if (tcode <= IX86_BT_LAST_VECT)
25346 {
25347 enum machine_mode mode;
25348
25349 index = tcode - IX86_BT_LAST_PRIM - 1;
25350 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25351 mode = ix86_builtin_type_vect_mode[index];
25352
25353 type = build_vector_type_for_mode (itype, mode);
25354 }
25355 else
25356 {
25357 int quals;
25358
25359 index = tcode - IX86_BT_LAST_VECT - 1;
25360 if (tcode <= IX86_BT_LAST_PTR)
25361 quals = TYPE_UNQUALIFIED;
25362 else
25363 quals = TYPE_QUAL_CONST;
25364
25365 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25366 if (quals != TYPE_UNQUALIFIED)
25367 itype = build_qualified_type (itype, quals);
25368
25369 type = build_pointer_type (itype);
25370 }
25371
25372 ix86_builtin_type_tab[(int) tcode] = type;
25373 return type;
25374 }
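/* Worked example, not part of the original source: for a vector type code
   such as IX86_BT_V4SF, the element type recorded in
   ix86_builtin_type_vect_base (float) is paired with the mode from
   ix86_builtin_type_vect_mode (V4SFmode) and passed to
   build_vector_type_for_mode; the resulting tree is cached in
   ix86_builtin_type_tab so later lookups return it directly.  */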
25375
25376 /* Table for the ix86 builtin function types. */
25377 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25378
25379 /* Retrieve an element from the above table, building some of
25380 the types lazily. */
25381
25382 static tree
25383 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25384 {
25385 tree type;
25386
25387 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25388
25389 type = ix86_builtin_func_type_tab[(int) tcode];
25390 if (type != NULL)
25391 return type;
25392
25393 if (tcode <= IX86_BT_LAST_FUNC)
25394 {
25395 unsigned start = ix86_builtin_func_start[(int) tcode];
25396 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25397 tree rtype, atype, args = void_list_node;
25398 unsigned i;
25399
25400 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25401 for (i = after - 1; i > start; --i)
25402 {
25403 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25404 args = tree_cons (NULL, atype, args);
25405 }
25406
25407 type = build_function_type (rtype, args);
25408 }
25409 else
25410 {
25411 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25412 enum ix86_builtin_func_type icode;
25413
25414 icode = ix86_builtin_func_alias_base[index];
25415 type = ix86_get_builtin_func_type (icode);
25416 }
25417
25418 ix86_builtin_func_type_tab[(int) tcode] = type;
25419 return type;
25420 }
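/* Worked example, not part of the original source: a non-alias code such
   as V4SF_FTYPE_PCFLOAT is assembled by using the first entry of its
   ix86_builtin_func_args slice as the return type (V4SF) and chaining the
   remaining entries in reverse onto void_list_node, giving roughly
     build_function_type (V4SF,
			  tree_cons (NULL, pointer-to-const-float,
				     void_list_node));
   codes past IX86_BT_LAST_FUNC simply reuse the type built for the
   function code they alias.  */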
25421
25422
25423 /* Codes for all the SSE/MMX builtins. */
25424 enum ix86_builtins
25425 {
25426 IX86_BUILTIN_ADDPS,
25427 IX86_BUILTIN_ADDSS,
25428 IX86_BUILTIN_DIVPS,
25429 IX86_BUILTIN_DIVSS,
25430 IX86_BUILTIN_MULPS,
25431 IX86_BUILTIN_MULSS,
25432 IX86_BUILTIN_SUBPS,
25433 IX86_BUILTIN_SUBSS,
25434
25435 IX86_BUILTIN_CMPEQPS,
25436 IX86_BUILTIN_CMPLTPS,
25437 IX86_BUILTIN_CMPLEPS,
25438 IX86_BUILTIN_CMPGTPS,
25439 IX86_BUILTIN_CMPGEPS,
25440 IX86_BUILTIN_CMPNEQPS,
25441 IX86_BUILTIN_CMPNLTPS,
25442 IX86_BUILTIN_CMPNLEPS,
25443 IX86_BUILTIN_CMPNGTPS,
25444 IX86_BUILTIN_CMPNGEPS,
25445 IX86_BUILTIN_CMPORDPS,
25446 IX86_BUILTIN_CMPUNORDPS,
25447 IX86_BUILTIN_CMPEQSS,
25448 IX86_BUILTIN_CMPLTSS,
25449 IX86_BUILTIN_CMPLESS,
25450 IX86_BUILTIN_CMPNEQSS,
25451 IX86_BUILTIN_CMPNLTSS,
25452 IX86_BUILTIN_CMPNLESS,
25453 IX86_BUILTIN_CMPNGTSS,
25454 IX86_BUILTIN_CMPNGESS,
25455 IX86_BUILTIN_CMPORDSS,
25456 IX86_BUILTIN_CMPUNORDSS,
25457
25458 IX86_BUILTIN_COMIEQSS,
25459 IX86_BUILTIN_COMILTSS,
25460 IX86_BUILTIN_COMILESS,
25461 IX86_BUILTIN_COMIGTSS,
25462 IX86_BUILTIN_COMIGESS,
25463 IX86_BUILTIN_COMINEQSS,
25464 IX86_BUILTIN_UCOMIEQSS,
25465 IX86_BUILTIN_UCOMILTSS,
25466 IX86_BUILTIN_UCOMILESS,
25467 IX86_BUILTIN_UCOMIGTSS,
25468 IX86_BUILTIN_UCOMIGESS,
25469 IX86_BUILTIN_UCOMINEQSS,
25470
25471 IX86_BUILTIN_CVTPI2PS,
25472 IX86_BUILTIN_CVTPS2PI,
25473 IX86_BUILTIN_CVTSI2SS,
25474 IX86_BUILTIN_CVTSI642SS,
25475 IX86_BUILTIN_CVTSS2SI,
25476 IX86_BUILTIN_CVTSS2SI64,
25477 IX86_BUILTIN_CVTTPS2PI,
25478 IX86_BUILTIN_CVTTSS2SI,
25479 IX86_BUILTIN_CVTTSS2SI64,
25480
25481 IX86_BUILTIN_MAXPS,
25482 IX86_BUILTIN_MAXSS,
25483 IX86_BUILTIN_MINPS,
25484 IX86_BUILTIN_MINSS,
25485
25486 IX86_BUILTIN_LOADUPS,
25487 IX86_BUILTIN_STOREUPS,
25488 IX86_BUILTIN_MOVSS,
25489
25490 IX86_BUILTIN_MOVHLPS,
25491 IX86_BUILTIN_MOVLHPS,
25492 IX86_BUILTIN_LOADHPS,
25493 IX86_BUILTIN_LOADLPS,
25494 IX86_BUILTIN_STOREHPS,
25495 IX86_BUILTIN_STORELPS,
25496
25497 IX86_BUILTIN_MASKMOVQ,
25498 IX86_BUILTIN_MOVMSKPS,
25499 IX86_BUILTIN_PMOVMSKB,
25500
25501 IX86_BUILTIN_MOVNTPS,
25502 IX86_BUILTIN_MOVNTQ,
25503
25504 IX86_BUILTIN_LOADDQU,
25505 IX86_BUILTIN_STOREDQU,
25506
25507 IX86_BUILTIN_PACKSSWB,
25508 IX86_BUILTIN_PACKSSDW,
25509 IX86_BUILTIN_PACKUSWB,
25510
25511 IX86_BUILTIN_PADDB,
25512 IX86_BUILTIN_PADDW,
25513 IX86_BUILTIN_PADDD,
25514 IX86_BUILTIN_PADDQ,
25515 IX86_BUILTIN_PADDSB,
25516 IX86_BUILTIN_PADDSW,
25517 IX86_BUILTIN_PADDUSB,
25518 IX86_BUILTIN_PADDUSW,
25519 IX86_BUILTIN_PSUBB,
25520 IX86_BUILTIN_PSUBW,
25521 IX86_BUILTIN_PSUBD,
25522 IX86_BUILTIN_PSUBQ,
25523 IX86_BUILTIN_PSUBSB,
25524 IX86_BUILTIN_PSUBSW,
25525 IX86_BUILTIN_PSUBUSB,
25526 IX86_BUILTIN_PSUBUSW,
25527
25528 IX86_BUILTIN_PAND,
25529 IX86_BUILTIN_PANDN,
25530 IX86_BUILTIN_POR,
25531 IX86_BUILTIN_PXOR,
25532
25533 IX86_BUILTIN_PAVGB,
25534 IX86_BUILTIN_PAVGW,
25535
25536 IX86_BUILTIN_PCMPEQB,
25537 IX86_BUILTIN_PCMPEQW,
25538 IX86_BUILTIN_PCMPEQD,
25539 IX86_BUILTIN_PCMPGTB,
25540 IX86_BUILTIN_PCMPGTW,
25541 IX86_BUILTIN_PCMPGTD,
25542
25543 IX86_BUILTIN_PMADDWD,
25544
25545 IX86_BUILTIN_PMAXSW,
25546 IX86_BUILTIN_PMAXUB,
25547 IX86_BUILTIN_PMINSW,
25548 IX86_BUILTIN_PMINUB,
25549
25550 IX86_BUILTIN_PMULHUW,
25551 IX86_BUILTIN_PMULHW,
25552 IX86_BUILTIN_PMULLW,
25553
25554 IX86_BUILTIN_PSADBW,
25555 IX86_BUILTIN_PSHUFW,
25556
25557 IX86_BUILTIN_PSLLW,
25558 IX86_BUILTIN_PSLLD,
25559 IX86_BUILTIN_PSLLQ,
25560 IX86_BUILTIN_PSRAW,
25561 IX86_BUILTIN_PSRAD,
25562 IX86_BUILTIN_PSRLW,
25563 IX86_BUILTIN_PSRLD,
25564 IX86_BUILTIN_PSRLQ,
25565 IX86_BUILTIN_PSLLWI,
25566 IX86_BUILTIN_PSLLDI,
25567 IX86_BUILTIN_PSLLQI,
25568 IX86_BUILTIN_PSRAWI,
25569 IX86_BUILTIN_PSRADI,
25570 IX86_BUILTIN_PSRLWI,
25571 IX86_BUILTIN_PSRLDI,
25572 IX86_BUILTIN_PSRLQI,
25573
25574 IX86_BUILTIN_PUNPCKHBW,
25575 IX86_BUILTIN_PUNPCKHWD,
25576 IX86_BUILTIN_PUNPCKHDQ,
25577 IX86_BUILTIN_PUNPCKLBW,
25578 IX86_BUILTIN_PUNPCKLWD,
25579 IX86_BUILTIN_PUNPCKLDQ,
25580
25581 IX86_BUILTIN_SHUFPS,
25582
25583 IX86_BUILTIN_RCPPS,
25584 IX86_BUILTIN_RCPSS,
25585 IX86_BUILTIN_RSQRTPS,
25586 IX86_BUILTIN_RSQRTPS_NR,
25587 IX86_BUILTIN_RSQRTSS,
25588 IX86_BUILTIN_RSQRTF,
25589 IX86_BUILTIN_SQRTPS,
25590 IX86_BUILTIN_SQRTPS_NR,
25591 IX86_BUILTIN_SQRTSS,
25592
25593 IX86_BUILTIN_UNPCKHPS,
25594 IX86_BUILTIN_UNPCKLPS,
25595
25596 IX86_BUILTIN_ANDPS,
25597 IX86_BUILTIN_ANDNPS,
25598 IX86_BUILTIN_ORPS,
25599 IX86_BUILTIN_XORPS,
25600
25601 IX86_BUILTIN_EMMS,
25602 IX86_BUILTIN_LDMXCSR,
25603 IX86_BUILTIN_STMXCSR,
25604 IX86_BUILTIN_SFENCE,
25605
25606 IX86_BUILTIN_FXSAVE,
25607 IX86_BUILTIN_FXRSTOR,
25608 IX86_BUILTIN_FXSAVE64,
25609 IX86_BUILTIN_FXRSTOR64,
25610
25611 IX86_BUILTIN_XSAVE,
25612 IX86_BUILTIN_XRSTOR,
25613 IX86_BUILTIN_XSAVE64,
25614 IX86_BUILTIN_XRSTOR64,
25615
25616 IX86_BUILTIN_XSAVEOPT,
25617 IX86_BUILTIN_XSAVEOPT64,
25618
25619 /* 3DNow! Original */
25620 IX86_BUILTIN_FEMMS,
25621 IX86_BUILTIN_PAVGUSB,
25622 IX86_BUILTIN_PF2ID,
25623 IX86_BUILTIN_PFACC,
25624 IX86_BUILTIN_PFADD,
25625 IX86_BUILTIN_PFCMPEQ,
25626 IX86_BUILTIN_PFCMPGE,
25627 IX86_BUILTIN_PFCMPGT,
25628 IX86_BUILTIN_PFMAX,
25629 IX86_BUILTIN_PFMIN,
25630 IX86_BUILTIN_PFMUL,
25631 IX86_BUILTIN_PFRCP,
25632 IX86_BUILTIN_PFRCPIT1,
25633 IX86_BUILTIN_PFRCPIT2,
25634 IX86_BUILTIN_PFRSQIT1,
25635 IX86_BUILTIN_PFRSQRT,
25636 IX86_BUILTIN_PFSUB,
25637 IX86_BUILTIN_PFSUBR,
25638 IX86_BUILTIN_PI2FD,
25639 IX86_BUILTIN_PMULHRW,
25640
25641 /* 3DNow! Athlon Extensions */
25642 IX86_BUILTIN_PF2IW,
25643 IX86_BUILTIN_PFNACC,
25644 IX86_BUILTIN_PFPNACC,
25645 IX86_BUILTIN_PI2FW,
25646 IX86_BUILTIN_PSWAPDSI,
25647 IX86_BUILTIN_PSWAPDSF,
25648
25649 /* SSE2 */
25650 IX86_BUILTIN_ADDPD,
25651 IX86_BUILTIN_ADDSD,
25652 IX86_BUILTIN_DIVPD,
25653 IX86_BUILTIN_DIVSD,
25654 IX86_BUILTIN_MULPD,
25655 IX86_BUILTIN_MULSD,
25656 IX86_BUILTIN_SUBPD,
25657 IX86_BUILTIN_SUBSD,
25658
25659 IX86_BUILTIN_CMPEQPD,
25660 IX86_BUILTIN_CMPLTPD,
25661 IX86_BUILTIN_CMPLEPD,
25662 IX86_BUILTIN_CMPGTPD,
25663 IX86_BUILTIN_CMPGEPD,
25664 IX86_BUILTIN_CMPNEQPD,
25665 IX86_BUILTIN_CMPNLTPD,
25666 IX86_BUILTIN_CMPNLEPD,
25667 IX86_BUILTIN_CMPNGTPD,
25668 IX86_BUILTIN_CMPNGEPD,
25669 IX86_BUILTIN_CMPORDPD,
25670 IX86_BUILTIN_CMPUNORDPD,
25671 IX86_BUILTIN_CMPEQSD,
25672 IX86_BUILTIN_CMPLTSD,
25673 IX86_BUILTIN_CMPLESD,
25674 IX86_BUILTIN_CMPNEQSD,
25675 IX86_BUILTIN_CMPNLTSD,
25676 IX86_BUILTIN_CMPNLESD,
25677 IX86_BUILTIN_CMPORDSD,
25678 IX86_BUILTIN_CMPUNORDSD,
25679
25680 IX86_BUILTIN_COMIEQSD,
25681 IX86_BUILTIN_COMILTSD,
25682 IX86_BUILTIN_COMILESD,
25683 IX86_BUILTIN_COMIGTSD,
25684 IX86_BUILTIN_COMIGESD,
25685 IX86_BUILTIN_COMINEQSD,
25686 IX86_BUILTIN_UCOMIEQSD,
25687 IX86_BUILTIN_UCOMILTSD,
25688 IX86_BUILTIN_UCOMILESD,
25689 IX86_BUILTIN_UCOMIGTSD,
25690 IX86_BUILTIN_UCOMIGESD,
25691 IX86_BUILTIN_UCOMINEQSD,
25692
25693 IX86_BUILTIN_MAXPD,
25694 IX86_BUILTIN_MAXSD,
25695 IX86_BUILTIN_MINPD,
25696 IX86_BUILTIN_MINSD,
25697
25698 IX86_BUILTIN_ANDPD,
25699 IX86_BUILTIN_ANDNPD,
25700 IX86_BUILTIN_ORPD,
25701 IX86_BUILTIN_XORPD,
25702
25703 IX86_BUILTIN_SQRTPD,
25704 IX86_BUILTIN_SQRTSD,
25705
25706 IX86_BUILTIN_UNPCKHPD,
25707 IX86_BUILTIN_UNPCKLPD,
25708
25709 IX86_BUILTIN_SHUFPD,
25710
25711 IX86_BUILTIN_LOADUPD,
25712 IX86_BUILTIN_STOREUPD,
25713 IX86_BUILTIN_MOVSD,
25714
25715 IX86_BUILTIN_LOADHPD,
25716 IX86_BUILTIN_LOADLPD,
25717
25718 IX86_BUILTIN_CVTDQ2PD,
25719 IX86_BUILTIN_CVTDQ2PS,
25720
25721 IX86_BUILTIN_CVTPD2DQ,
25722 IX86_BUILTIN_CVTPD2PI,
25723 IX86_BUILTIN_CVTPD2PS,
25724 IX86_BUILTIN_CVTTPD2DQ,
25725 IX86_BUILTIN_CVTTPD2PI,
25726
25727 IX86_BUILTIN_CVTPI2PD,
25728 IX86_BUILTIN_CVTSI2SD,
25729 IX86_BUILTIN_CVTSI642SD,
25730
25731 IX86_BUILTIN_CVTSD2SI,
25732 IX86_BUILTIN_CVTSD2SI64,
25733 IX86_BUILTIN_CVTSD2SS,
25734 IX86_BUILTIN_CVTSS2SD,
25735 IX86_BUILTIN_CVTTSD2SI,
25736 IX86_BUILTIN_CVTTSD2SI64,
25737
25738 IX86_BUILTIN_CVTPS2DQ,
25739 IX86_BUILTIN_CVTPS2PD,
25740 IX86_BUILTIN_CVTTPS2DQ,
25741
25742 IX86_BUILTIN_MOVNTI,
25743 IX86_BUILTIN_MOVNTI64,
25744 IX86_BUILTIN_MOVNTPD,
25745 IX86_BUILTIN_MOVNTDQ,
25746
25747 IX86_BUILTIN_MOVQ128,
25748
25749 /* SSE2 MMX */
25750 IX86_BUILTIN_MASKMOVDQU,
25751 IX86_BUILTIN_MOVMSKPD,
25752 IX86_BUILTIN_PMOVMSKB128,
25753
25754 IX86_BUILTIN_PACKSSWB128,
25755 IX86_BUILTIN_PACKSSDW128,
25756 IX86_BUILTIN_PACKUSWB128,
25757
25758 IX86_BUILTIN_PADDB128,
25759 IX86_BUILTIN_PADDW128,
25760 IX86_BUILTIN_PADDD128,
25761 IX86_BUILTIN_PADDQ128,
25762 IX86_BUILTIN_PADDSB128,
25763 IX86_BUILTIN_PADDSW128,
25764 IX86_BUILTIN_PADDUSB128,
25765 IX86_BUILTIN_PADDUSW128,
25766 IX86_BUILTIN_PSUBB128,
25767 IX86_BUILTIN_PSUBW128,
25768 IX86_BUILTIN_PSUBD128,
25769 IX86_BUILTIN_PSUBQ128,
25770 IX86_BUILTIN_PSUBSB128,
25771 IX86_BUILTIN_PSUBSW128,
25772 IX86_BUILTIN_PSUBUSB128,
25773 IX86_BUILTIN_PSUBUSW128,
25774
25775 IX86_BUILTIN_PAND128,
25776 IX86_BUILTIN_PANDN128,
25777 IX86_BUILTIN_POR128,
25778 IX86_BUILTIN_PXOR128,
25779
25780 IX86_BUILTIN_PAVGB128,
25781 IX86_BUILTIN_PAVGW128,
25782
25783 IX86_BUILTIN_PCMPEQB128,
25784 IX86_BUILTIN_PCMPEQW128,
25785 IX86_BUILTIN_PCMPEQD128,
25786 IX86_BUILTIN_PCMPGTB128,
25787 IX86_BUILTIN_PCMPGTW128,
25788 IX86_BUILTIN_PCMPGTD128,
25789
25790 IX86_BUILTIN_PMADDWD128,
25791
25792 IX86_BUILTIN_PMAXSW128,
25793 IX86_BUILTIN_PMAXUB128,
25794 IX86_BUILTIN_PMINSW128,
25795 IX86_BUILTIN_PMINUB128,
25796
25797 IX86_BUILTIN_PMULUDQ,
25798 IX86_BUILTIN_PMULUDQ128,
25799 IX86_BUILTIN_PMULHUW128,
25800 IX86_BUILTIN_PMULHW128,
25801 IX86_BUILTIN_PMULLW128,
25802
25803 IX86_BUILTIN_PSADBW128,
25804 IX86_BUILTIN_PSHUFHW,
25805 IX86_BUILTIN_PSHUFLW,
25806 IX86_BUILTIN_PSHUFD,
25807
25808 IX86_BUILTIN_PSLLDQI128,
25809 IX86_BUILTIN_PSLLWI128,
25810 IX86_BUILTIN_PSLLDI128,
25811 IX86_BUILTIN_PSLLQI128,
25812 IX86_BUILTIN_PSRAWI128,
25813 IX86_BUILTIN_PSRADI128,
25814 IX86_BUILTIN_PSRLDQI128,
25815 IX86_BUILTIN_PSRLWI128,
25816 IX86_BUILTIN_PSRLDI128,
25817 IX86_BUILTIN_PSRLQI128,
25818
25819 IX86_BUILTIN_PSLLDQ128,
25820 IX86_BUILTIN_PSLLW128,
25821 IX86_BUILTIN_PSLLD128,
25822 IX86_BUILTIN_PSLLQ128,
25823 IX86_BUILTIN_PSRAW128,
25824 IX86_BUILTIN_PSRAD128,
25825 IX86_BUILTIN_PSRLW128,
25826 IX86_BUILTIN_PSRLD128,
25827 IX86_BUILTIN_PSRLQ128,
25828
25829 IX86_BUILTIN_PUNPCKHBW128,
25830 IX86_BUILTIN_PUNPCKHWD128,
25831 IX86_BUILTIN_PUNPCKHDQ128,
25832 IX86_BUILTIN_PUNPCKHQDQ128,
25833 IX86_BUILTIN_PUNPCKLBW128,
25834 IX86_BUILTIN_PUNPCKLWD128,
25835 IX86_BUILTIN_PUNPCKLDQ128,
25836 IX86_BUILTIN_PUNPCKLQDQ128,
25837
25838 IX86_BUILTIN_CLFLUSH,
25839 IX86_BUILTIN_MFENCE,
25840 IX86_BUILTIN_LFENCE,
25841 IX86_BUILTIN_PAUSE,
25842
25843 IX86_BUILTIN_BSRSI,
25844 IX86_BUILTIN_BSRDI,
25845 IX86_BUILTIN_RDPMC,
25846 IX86_BUILTIN_RDTSC,
25847 IX86_BUILTIN_RDTSCP,
25848 IX86_BUILTIN_ROLQI,
25849 IX86_BUILTIN_ROLHI,
25850 IX86_BUILTIN_RORQI,
25851 IX86_BUILTIN_RORHI,
25852
25853 /* SSE3. */
25854 IX86_BUILTIN_ADDSUBPS,
25855 IX86_BUILTIN_HADDPS,
25856 IX86_BUILTIN_HSUBPS,
25857 IX86_BUILTIN_MOVSHDUP,
25858 IX86_BUILTIN_MOVSLDUP,
25859 IX86_BUILTIN_ADDSUBPD,
25860 IX86_BUILTIN_HADDPD,
25861 IX86_BUILTIN_HSUBPD,
25862 IX86_BUILTIN_LDDQU,
25863
25864 IX86_BUILTIN_MONITOR,
25865 IX86_BUILTIN_MWAIT,
25866
25867 /* SSSE3. */
25868 IX86_BUILTIN_PHADDW,
25869 IX86_BUILTIN_PHADDD,
25870 IX86_BUILTIN_PHADDSW,
25871 IX86_BUILTIN_PHSUBW,
25872 IX86_BUILTIN_PHSUBD,
25873 IX86_BUILTIN_PHSUBSW,
25874 IX86_BUILTIN_PMADDUBSW,
25875 IX86_BUILTIN_PMULHRSW,
25876 IX86_BUILTIN_PSHUFB,
25877 IX86_BUILTIN_PSIGNB,
25878 IX86_BUILTIN_PSIGNW,
25879 IX86_BUILTIN_PSIGND,
25880 IX86_BUILTIN_PALIGNR,
25881 IX86_BUILTIN_PABSB,
25882 IX86_BUILTIN_PABSW,
25883 IX86_BUILTIN_PABSD,
25884
25885 IX86_BUILTIN_PHADDW128,
25886 IX86_BUILTIN_PHADDD128,
25887 IX86_BUILTIN_PHADDSW128,
25888 IX86_BUILTIN_PHSUBW128,
25889 IX86_BUILTIN_PHSUBD128,
25890 IX86_BUILTIN_PHSUBSW128,
25891 IX86_BUILTIN_PMADDUBSW128,
25892 IX86_BUILTIN_PMULHRSW128,
25893 IX86_BUILTIN_PSHUFB128,
25894 IX86_BUILTIN_PSIGNB128,
25895 IX86_BUILTIN_PSIGNW128,
25896 IX86_BUILTIN_PSIGND128,
25897 IX86_BUILTIN_PALIGNR128,
25898 IX86_BUILTIN_PABSB128,
25899 IX86_BUILTIN_PABSW128,
25900 IX86_BUILTIN_PABSD128,
25901
25902 /* AMDFAM10 - SSE4A New Instructions. */
25903 IX86_BUILTIN_MOVNTSD,
25904 IX86_BUILTIN_MOVNTSS,
25905 IX86_BUILTIN_EXTRQI,
25906 IX86_BUILTIN_EXTRQ,
25907 IX86_BUILTIN_INSERTQI,
25908 IX86_BUILTIN_INSERTQ,
25909
25910 /* SSE4.1. */
25911 IX86_BUILTIN_BLENDPD,
25912 IX86_BUILTIN_BLENDPS,
25913 IX86_BUILTIN_BLENDVPD,
25914 IX86_BUILTIN_BLENDVPS,
25915 IX86_BUILTIN_PBLENDVB128,
25916 IX86_BUILTIN_PBLENDW128,
25917
25918 IX86_BUILTIN_DPPD,
25919 IX86_BUILTIN_DPPS,
25920
25921 IX86_BUILTIN_INSERTPS128,
25922
25923 IX86_BUILTIN_MOVNTDQA,
25924 IX86_BUILTIN_MPSADBW128,
25925 IX86_BUILTIN_PACKUSDW128,
25926 IX86_BUILTIN_PCMPEQQ,
25927 IX86_BUILTIN_PHMINPOSUW128,
25928
25929 IX86_BUILTIN_PMAXSB128,
25930 IX86_BUILTIN_PMAXSD128,
25931 IX86_BUILTIN_PMAXUD128,
25932 IX86_BUILTIN_PMAXUW128,
25933
25934 IX86_BUILTIN_PMINSB128,
25935 IX86_BUILTIN_PMINSD128,
25936 IX86_BUILTIN_PMINUD128,
25937 IX86_BUILTIN_PMINUW128,
25938
25939 IX86_BUILTIN_PMOVSXBW128,
25940 IX86_BUILTIN_PMOVSXBD128,
25941 IX86_BUILTIN_PMOVSXBQ128,
25942 IX86_BUILTIN_PMOVSXWD128,
25943 IX86_BUILTIN_PMOVSXWQ128,
25944 IX86_BUILTIN_PMOVSXDQ128,
25945
25946 IX86_BUILTIN_PMOVZXBW128,
25947 IX86_BUILTIN_PMOVZXBD128,
25948 IX86_BUILTIN_PMOVZXBQ128,
25949 IX86_BUILTIN_PMOVZXWD128,
25950 IX86_BUILTIN_PMOVZXWQ128,
25951 IX86_BUILTIN_PMOVZXDQ128,
25952
25953 IX86_BUILTIN_PMULDQ128,
25954 IX86_BUILTIN_PMULLD128,
25955
25956 IX86_BUILTIN_ROUNDSD,
25957 IX86_BUILTIN_ROUNDSS,
25958
25959 IX86_BUILTIN_ROUNDPD,
25960 IX86_BUILTIN_ROUNDPS,
25961
25962 IX86_BUILTIN_FLOORPD,
25963 IX86_BUILTIN_CEILPD,
25964 IX86_BUILTIN_TRUNCPD,
25965 IX86_BUILTIN_RINTPD,
25966 IX86_BUILTIN_ROUNDPD_AZ,
25967
25968 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25969 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25970 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25971
25972 IX86_BUILTIN_FLOORPS,
25973 IX86_BUILTIN_CEILPS,
25974 IX86_BUILTIN_TRUNCPS,
25975 IX86_BUILTIN_RINTPS,
25976 IX86_BUILTIN_ROUNDPS_AZ,
25977
25978 IX86_BUILTIN_FLOORPS_SFIX,
25979 IX86_BUILTIN_CEILPS_SFIX,
25980 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25981
25982 IX86_BUILTIN_PTESTZ,
25983 IX86_BUILTIN_PTESTC,
25984 IX86_BUILTIN_PTESTNZC,
25985
25986 IX86_BUILTIN_VEC_INIT_V2SI,
25987 IX86_BUILTIN_VEC_INIT_V4HI,
25988 IX86_BUILTIN_VEC_INIT_V8QI,
25989 IX86_BUILTIN_VEC_EXT_V2DF,
25990 IX86_BUILTIN_VEC_EXT_V2DI,
25991 IX86_BUILTIN_VEC_EXT_V4SF,
25992 IX86_BUILTIN_VEC_EXT_V4SI,
25993 IX86_BUILTIN_VEC_EXT_V8HI,
25994 IX86_BUILTIN_VEC_EXT_V2SI,
25995 IX86_BUILTIN_VEC_EXT_V4HI,
25996 IX86_BUILTIN_VEC_EXT_V16QI,
25997 IX86_BUILTIN_VEC_SET_V2DI,
25998 IX86_BUILTIN_VEC_SET_V4SF,
25999 IX86_BUILTIN_VEC_SET_V4SI,
26000 IX86_BUILTIN_VEC_SET_V8HI,
26001 IX86_BUILTIN_VEC_SET_V4HI,
26002 IX86_BUILTIN_VEC_SET_V16QI,
26003
26004 IX86_BUILTIN_VEC_PACK_SFIX,
26005 IX86_BUILTIN_VEC_PACK_SFIX256,
26006
26007 /* SSE4.2. */
26008 IX86_BUILTIN_CRC32QI,
26009 IX86_BUILTIN_CRC32HI,
26010 IX86_BUILTIN_CRC32SI,
26011 IX86_BUILTIN_CRC32DI,
26012
26013 IX86_BUILTIN_PCMPESTRI128,
26014 IX86_BUILTIN_PCMPESTRM128,
26015 IX86_BUILTIN_PCMPESTRA128,
26016 IX86_BUILTIN_PCMPESTRC128,
26017 IX86_BUILTIN_PCMPESTRO128,
26018 IX86_BUILTIN_PCMPESTRS128,
26019 IX86_BUILTIN_PCMPESTRZ128,
26020 IX86_BUILTIN_PCMPISTRI128,
26021 IX86_BUILTIN_PCMPISTRM128,
26022 IX86_BUILTIN_PCMPISTRA128,
26023 IX86_BUILTIN_PCMPISTRC128,
26024 IX86_BUILTIN_PCMPISTRO128,
26025 IX86_BUILTIN_PCMPISTRS128,
26026 IX86_BUILTIN_PCMPISTRZ128,
26027
26028 IX86_BUILTIN_PCMPGTQ,
26029
26030 /* AES instructions */
26031 IX86_BUILTIN_AESENC128,
26032 IX86_BUILTIN_AESENCLAST128,
26033 IX86_BUILTIN_AESDEC128,
26034 IX86_BUILTIN_AESDECLAST128,
26035 IX86_BUILTIN_AESIMC128,
26036 IX86_BUILTIN_AESKEYGENASSIST128,
26037
26038 /* PCLMUL instruction */
26039 IX86_BUILTIN_PCLMULQDQ128,
26040
26041 /* AVX */
26042 IX86_BUILTIN_ADDPD256,
26043 IX86_BUILTIN_ADDPS256,
26044 IX86_BUILTIN_ADDSUBPD256,
26045 IX86_BUILTIN_ADDSUBPS256,
26046 IX86_BUILTIN_ANDPD256,
26047 IX86_BUILTIN_ANDPS256,
26048 IX86_BUILTIN_ANDNPD256,
26049 IX86_BUILTIN_ANDNPS256,
26050 IX86_BUILTIN_BLENDPD256,
26051 IX86_BUILTIN_BLENDPS256,
26052 IX86_BUILTIN_BLENDVPD256,
26053 IX86_BUILTIN_BLENDVPS256,
26054 IX86_BUILTIN_DIVPD256,
26055 IX86_BUILTIN_DIVPS256,
26056 IX86_BUILTIN_DPPS256,
26057 IX86_BUILTIN_HADDPD256,
26058 IX86_BUILTIN_HADDPS256,
26059 IX86_BUILTIN_HSUBPD256,
26060 IX86_BUILTIN_HSUBPS256,
26061 IX86_BUILTIN_MAXPD256,
26062 IX86_BUILTIN_MAXPS256,
26063 IX86_BUILTIN_MINPD256,
26064 IX86_BUILTIN_MINPS256,
26065 IX86_BUILTIN_MULPD256,
26066 IX86_BUILTIN_MULPS256,
26067 IX86_BUILTIN_ORPD256,
26068 IX86_BUILTIN_ORPS256,
26069 IX86_BUILTIN_SHUFPD256,
26070 IX86_BUILTIN_SHUFPS256,
26071 IX86_BUILTIN_SUBPD256,
26072 IX86_BUILTIN_SUBPS256,
26073 IX86_BUILTIN_XORPD256,
26074 IX86_BUILTIN_XORPS256,
26075 IX86_BUILTIN_CMPSD,
26076 IX86_BUILTIN_CMPSS,
26077 IX86_BUILTIN_CMPPD,
26078 IX86_BUILTIN_CMPPS,
26079 IX86_BUILTIN_CMPPD256,
26080 IX86_BUILTIN_CMPPS256,
26081 IX86_BUILTIN_CVTDQ2PD256,
26082 IX86_BUILTIN_CVTDQ2PS256,
26083 IX86_BUILTIN_CVTPD2PS256,
26084 IX86_BUILTIN_CVTPS2DQ256,
26085 IX86_BUILTIN_CVTPS2PD256,
26086 IX86_BUILTIN_CVTTPD2DQ256,
26087 IX86_BUILTIN_CVTPD2DQ256,
26088 IX86_BUILTIN_CVTTPS2DQ256,
26089 IX86_BUILTIN_EXTRACTF128PD256,
26090 IX86_BUILTIN_EXTRACTF128PS256,
26091 IX86_BUILTIN_EXTRACTF128SI256,
26092 IX86_BUILTIN_VZEROALL,
26093 IX86_BUILTIN_VZEROUPPER,
26094 IX86_BUILTIN_VPERMILVARPD,
26095 IX86_BUILTIN_VPERMILVARPS,
26096 IX86_BUILTIN_VPERMILVARPD256,
26097 IX86_BUILTIN_VPERMILVARPS256,
26098 IX86_BUILTIN_VPERMILPD,
26099 IX86_BUILTIN_VPERMILPS,
26100 IX86_BUILTIN_VPERMILPD256,
26101 IX86_BUILTIN_VPERMILPS256,
26102 IX86_BUILTIN_VPERMIL2PD,
26103 IX86_BUILTIN_VPERMIL2PS,
26104 IX86_BUILTIN_VPERMIL2PD256,
26105 IX86_BUILTIN_VPERMIL2PS256,
26106 IX86_BUILTIN_VPERM2F128PD256,
26107 IX86_BUILTIN_VPERM2F128PS256,
26108 IX86_BUILTIN_VPERM2F128SI256,
26109 IX86_BUILTIN_VBROADCASTSS,
26110 IX86_BUILTIN_VBROADCASTSD256,
26111 IX86_BUILTIN_VBROADCASTSS256,
26112 IX86_BUILTIN_VBROADCASTPD256,
26113 IX86_BUILTIN_VBROADCASTPS256,
26114 IX86_BUILTIN_VINSERTF128PD256,
26115 IX86_BUILTIN_VINSERTF128PS256,
26116 IX86_BUILTIN_VINSERTF128SI256,
26117 IX86_BUILTIN_LOADUPD256,
26118 IX86_BUILTIN_LOADUPS256,
26119 IX86_BUILTIN_STOREUPD256,
26120 IX86_BUILTIN_STOREUPS256,
26121 IX86_BUILTIN_LDDQU256,
26122 IX86_BUILTIN_MOVNTDQ256,
26123 IX86_BUILTIN_MOVNTPD256,
26124 IX86_BUILTIN_MOVNTPS256,
26125 IX86_BUILTIN_LOADDQU256,
26126 IX86_BUILTIN_STOREDQU256,
26127 IX86_BUILTIN_MASKLOADPD,
26128 IX86_BUILTIN_MASKLOADPS,
26129 IX86_BUILTIN_MASKSTOREPD,
26130 IX86_BUILTIN_MASKSTOREPS,
26131 IX86_BUILTIN_MASKLOADPD256,
26132 IX86_BUILTIN_MASKLOADPS256,
26133 IX86_BUILTIN_MASKSTOREPD256,
26134 IX86_BUILTIN_MASKSTOREPS256,
26135 IX86_BUILTIN_MOVSHDUP256,
26136 IX86_BUILTIN_MOVSLDUP256,
26137 IX86_BUILTIN_MOVDDUP256,
26138
26139 IX86_BUILTIN_SQRTPD256,
26140 IX86_BUILTIN_SQRTPS256,
26141 IX86_BUILTIN_SQRTPS_NR256,
26142 IX86_BUILTIN_RSQRTPS256,
26143 IX86_BUILTIN_RSQRTPS_NR256,
26144
26145 IX86_BUILTIN_RCPPS256,
26146
26147 IX86_BUILTIN_ROUNDPD256,
26148 IX86_BUILTIN_ROUNDPS256,
26149
26150 IX86_BUILTIN_FLOORPD256,
26151 IX86_BUILTIN_CEILPD256,
26152 IX86_BUILTIN_TRUNCPD256,
26153 IX86_BUILTIN_RINTPD256,
26154 IX86_BUILTIN_ROUNDPD_AZ256,
26155
26156 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26157 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26158 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26159
26160 IX86_BUILTIN_FLOORPS256,
26161 IX86_BUILTIN_CEILPS256,
26162 IX86_BUILTIN_TRUNCPS256,
26163 IX86_BUILTIN_RINTPS256,
26164 IX86_BUILTIN_ROUNDPS_AZ256,
26165
26166 IX86_BUILTIN_FLOORPS_SFIX256,
26167 IX86_BUILTIN_CEILPS_SFIX256,
26168 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26169
26170 IX86_BUILTIN_UNPCKHPD256,
26171 IX86_BUILTIN_UNPCKLPD256,
26172 IX86_BUILTIN_UNPCKHPS256,
26173 IX86_BUILTIN_UNPCKLPS256,
26174
26175 IX86_BUILTIN_SI256_SI,
26176 IX86_BUILTIN_PS256_PS,
26177 IX86_BUILTIN_PD256_PD,
26178 IX86_BUILTIN_SI_SI256,
26179 IX86_BUILTIN_PS_PS256,
26180 IX86_BUILTIN_PD_PD256,
26181
26182 IX86_BUILTIN_VTESTZPD,
26183 IX86_BUILTIN_VTESTCPD,
26184 IX86_BUILTIN_VTESTNZCPD,
26185 IX86_BUILTIN_VTESTZPS,
26186 IX86_BUILTIN_VTESTCPS,
26187 IX86_BUILTIN_VTESTNZCPS,
26188 IX86_BUILTIN_VTESTZPD256,
26189 IX86_BUILTIN_VTESTCPD256,
26190 IX86_BUILTIN_VTESTNZCPD256,
26191 IX86_BUILTIN_VTESTZPS256,
26192 IX86_BUILTIN_VTESTCPS256,
26193 IX86_BUILTIN_VTESTNZCPS256,
26194 IX86_BUILTIN_PTESTZ256,
26195 IX86_BUILTIN_PTESTC256,
26196 IX86_BUILTIN_PTESTNZC256,
26197
26198 IX86_BUILTIN_MOVMSKPD256,
26199 IX86_BUILTIN_MOVMSKPS256,
26200
26201 /* AVX2 */
26202 IX86_BUILTIN_MPSADBW256,
26203 IX86_BUILTIN_PABSB256,
26204 IX86_BUILTIN_PABSW256,
26205 IX86_BUILTIN_PABSD256,
26206 IX86_BUILTIN_PACKSSDW256,
26207 IX86_BUILTIN_PACKSSWB256,
26208 IX86_BUILTIN_PACKUSDW256,
26209 IX86_BUILTIN_PACKUSWB256,
26210 IX86_BUILTIN_PADDB256,
26211 IX86_BUILTIN_PADDW256,
26212 IX86_BUILTIN_PADDD256,
26213 IX86_BUILTIN_PADDQ256,
26214 IX86_BUILTIN_PADDSB256,
26215 IX86_BUILTIN_PADDSW256,
26216 IX86_BUILTIN_PADDUSB256,
26217 IX86_BUILTIN_PADDUSW256,
26218 IX86_BUILTIN_PALIGNR256,
26219 IX86_BUILTIN_AND256I,
26220 IX86_BUILTIN_ANDNOT256I,
26221 IX86_BUILTIN_PAVGB256,
26222 IX86_BUILTIN_PAVGW256,
26223 IX86_BUILTIN_PBLENDVB256,
26224 IX86_BUILTIN_PBLENDVW256,
26225 IX86_BUILTIN_PCMPEQB256,
26226 IX86_BUILTIN_PCMPEQW256,
26227 IX86_BUILTIN_PCMPEQD256,
26228 IX86_BUILTIN_PCMPEQQ256,
26229 IX86_BUILTIN_PCMPGTB256,
26230 IX86_BUILTIN_PCMPGTW256,
26231 IX86_BUILTIN_PCMPGTD256,
26232 IX86_BUILTIN_PCMPGTQ256,
26233 IX86_BUILTIN_PHADDW256,
26234 IX86_BUILTIN_PHADDD256,
26235 IX86_BUILTIN_PHADDSW256,
26236 IX86_BUILTIN_PHSUBW256,
26237 IX86_BUILTIN_PHSUBD256,
26238 IX86_BUILTIN_PHSUBSW256,
26239 IX86_BUILTIN_PMADDUBSW256,
26240 IX86_BUILTIN_PMADDWD256,
26241 IX86_BUILTIN_PMAXSB256,
26242 IX86_BUILTIN_PMAXSW256,
26243 IX86_BUILTIN_PMAXSD256,
26244 IX86_BUILTIN_PMAXUB256,
26245 IX86_BUILTIN_PMAXUW256,
26246 IX86_BUILTIN_PMAXUD256,
26247 IX86_BUILTIN_PMINSB256,
26248 IX86_BUILTIN_PMINSW256,
26249 IX86_BUILTIN_PMINSD256,
26250 IX86_BUILTIN_PMINUB256,
26251 IX86_BUILTIN_PMINUW256,
26252 IX86_BUILTIN_PMINUD256,
26253 IX86_BUILTIN_PMOVMSKB256,
26254 IX86_BUILTIN_PMOVSXBW256,
26255 IX86_BUILTIN_PMOVSXBD256,
26256 IX86_BUILTIN_PMOVSXBQ256,
26257 IX86_BUILTIN_PMOVSXWD256,
26258 IX86_BUILTIN_PMOVSXWQ256,
26259 IX86_BUILTIN_PMOVSXDQ256,
26260 IX86_BUILTIN_PMOVZXBW256,
26261 IX86_BUILTIN_PMOVZXBD256,
26262 IX86_BUILTIN_PMOVZXBQ256,
26263 IX86_BUILTIN_PMOVZXWD256,
26264 IX86_BUILTIN_PMOVZXWQ256,
26265 IX86_BUILTIN_PMOVZXDQ256,
26266 IX86_BUILTIN_PMULDQ256,
26267 IX86_BUILTIN_PMULHRSW256,
26268 IX86_BUILTIN_PMULHUW256,
26269 IX86_BUILTIN_PMULHW256,
26270 IX86_BUILTIN_PMULLW256,
26271 IX86_BUILTIN_PMULLD256,
26272 IX86_BUILTIN_PMULUDQ256,
26273 IX86_BUILTIN_POR256,
26274 IX86_BUILTIN_PSADBW256,
26275 IX86_BUILTIN_PSHUFB256,
26276 IX86_BUILTIN_PSHUFD256,
26277 IX86_BUILTIN_PSHUFHW256,
26278 IX86_BUILTIN_PSHUFLW256,
26279 IX86_BUILTIN_PSIGNB256,
26280 IX86_BUILTIN_PSIGNW256,
26281 IX86_BUILTIN_PSIGND256,
26282 IX86_BUILTIN_PSLLDQI256,
26283 IX86_BUILTIN_PSLLWI256,
26284 IX86_BUILTIN_PSLLW256,
26285 IX86_BUILTIN_PSLLDI256,
26286 IX86_BUILTIN_PSLLD256,
26287 IX86_BUILTIN_PSLLQI256,
26288 IX86_BUILTIN_PSLLQ256,
26289 IX86_BUILTIN_PSRAWI256,
26290 IX86_BUILTIN_PSRAW256,
26291 IX86_BUILTIN_PSRADI256,
26292 IX86_BUILTIN_PSRAD256,
26293 IX86_BUILTIN_PSRLDQI256,
26294 IX86_BUILTIN_PSRLWI256,
26295 IX86_BUILTIN_PSRLW256,
26296 IX86_BUILTIN_PSRLDI256,
26297 IX86_BUILTIN_PSRLD256,
26298 IX86_BUILTIN_PSRLQI256,
26299 IX86_BUILTIN_PSRLQ256,
26300 IX86_BUILTIN_PSUBB256,
26301 IX86_BUILTIN_PSUBW256,
26302 IX86_BUILTIN_PSUBD256,
26303 IX86_BUILTIN_PSUBQ256,
26304 IX86_BUILTIN_PSUBSB256,
26305 IX86_BUILTIN_PSUBSW256,
26306 IX86_BUILTIN_PSUBUSB256,
26307 IX86_BUILTIN_PSUBUSW256,
26308 IX86_BUILTIN_PUNPCKHBW256,
26309 IX86_BUILTIN_PUNPCKHWD256,
26310 IX86_BUILTIN_PUNPCKHDQ256,
26311 IX86_BUILTIN_PUNPCKHQDQ256,
26312 IX86_BUILTIN_PUNPCKLBW256,
26313 IX86_BUILTIN_PUNPCKLWD256,
26314 IX86_BUILTIN_PUNPCKLDQ256,
26315 IX86_BUILTIN_PUNPCKLQDQ256,
26316 IX86_BUILTIN_PXOR256,
26317 IX86_BUILTIN_MOVNTDQA256,
26318 IX86_BUILTIN_VBROADCASTSS_PS,
26319 IX86_BUILTIN_VBROADCASTSS_PS256,
26320 IX86_BUILTIN_VBROADCASTSD_PD256,
26321 IX86_BUILTIN_VBROADCASTSI256,
26322 IX86_BUILTIN_PBLENDD256,
26323 IX86_BUILTIN_PBLENDD128,
26324 IX86_BUILTIN_PBROADCASTB256,
26325 IX86_BUILTIN_PBROADCASTW256,
26326 IX86_BUILTIN_PBROADCASTD256,
26327 IX86_BUILTIN_PBROADCASTQ256,
26328 IX86_BUILTIN_PBROADCASTB128,
26329 IX86_BUILTIN_PBROADCASTW128,
26330 IX86_BUILTIN_PBROADCASTD128,
26331 IX86_BUILTIN_PBROADCASTQ128,
26332 IX86_BUILTIN_VPERMVARSI256,
26333 IX86_BUILTIN_VPERMDF256,
26334 IX86_BUILTIN_VPERMVARSF256,
26335 IX86_BUILTIN_VPERMDI256,
26336 IX86_BUILTIN_VPERMTI256,
26337 IX86_BUILTIN_VEXTRACT128I256,
26338 IX86_BUILTIN_VINSERT128I256,
26339 IX86_BUILTIN_MASKLOADD,
26340 IX86_BUILTIN_MASKLOADQ,
26341 IX86_BUILTIN_MASKLOADD256,
26342 IX86_BUILTIN_MASKLOADQ256,
26343 IX86_BUILTIN_MASKSTORED,
26344 IX86_BUILTIN_MASKSTOREQ,
26345 IX86_BUILTIN_MASKSTORED256,
26346 IX86_BUILTIN_MASKSTOREQ256,
26347 IX86_BUILTIN_PSLLVV4DI,
26348 IX86_BUILTIN_PSLLVV2DI,
26349 IX86_BUILTIN_PSLLVV8SI,
26350 IX86_BUILTIN_PSLLVV4SI,
26351 IX86_BUILTIN_PSRAVV8SI,
26352 IX86_BUILTIN_PSRAVV4SI,
26353 IX86_BUILTIN_PSRLVV4DI,
26354 IX86_BUILTIN_PSRLVV2DI,
26355 IX86_BUILTIN_PSRLVV8SI,
26356 IX86_BUILTIN_PSRLVV4SI,
26357
26358 IX86_BUILTIN_GATHERSIV2DF,
26359 IX86_BUILTIN_GATHERSIV4DF,
26360 IX86_BUILTIN_GATHERDIV2DF,
26361 IX86_BUILTIN_GATHERDIV4DF,
26362 IX86_BUILTIN_GATHERSIV4SF,
26363 IX86_BUILTIN_GATHERSIV8SF,
26364 IX86_BUILTIN_GATHERDIV4SF,
26365 IX86_BUILTIN_GATHERDIV8SF,
26366 IX86_BUILTIN_GATHERSIV2DI,
26367 IX86_BUILTIN_GATHERSIV4DI,
26368 IX86_BUILTIN_GATHERDIV2DI,
26369 IX86_BUILTIN_GATHERDIV4DI,
26370 IX86_BUILTIN_GATHERSIV4SI,
26371 IX86_BUILTIN_GATHERSIV8SI,
26372 IX86_BUILTIN_GATHERDIV4SI,
26373 IX86_BUILTIN_GATHERDIV8SI,
26374
26375 /* Alternate 4-element gather for the vectorizer where
26376 all operands are 32-byte wide. */
26377 IX86_BUILTIN_GATHERALTSIV4DF,
26378 IX86_BUILTIN_GATHERALTDIV8SF,
26379 IX86_BUILTIN_GATHERALTSIV4DI,
26380 IX86_BUILTIN_GATHERALTDIV8SI,
26381
26382 /* TFmode support builtins. */
26383 IX86_BUILTIN_INFQ,
26384 IX86_BUILTIN_HUGE_VALQ,
26385 IX86_BUILTIN_FABSQ,
26386 IX86_BUILTIN_COPYSIGNQ,
26387
26388 /* Vectorizer support builtins. */
26389 IX86_BUILTIN_CPYSGNPS,
26390 IX86_BUILTIN_CPYSGNPD,
26391 IX86_BUILTIN_CPYSGNPS256,
26392 IX86_BUILTIN_CPYSGNPD256,
26393
26394 /* FMA4 instructions. */
26395 IX86_BUILTIN_VFMADDSS,
26396 IX86_BUILTIN_VFMADDSD,
26397 IX86_BUILTIN_VFMADDPS,
26398 IX86_BUILTIN_VFMADDPD,
26399 IX86_BUILTIN_VFMADDPS256,
26400 IX86_BUILTIN_VFMADDPD256,
26401 IX86_BUILTIN_VFMADDSUBPS,
26402 IX86_BUILTIN_VFMADDSUBPD,
26403 IX86_BUILTIN_VFMADDSUBPS256,
26404 IX86_BUILTIN_VFMADDSUBPD256,
26405
26406 /* FMA3 instructions. */
26407 IX86_BUILTIN_VFMADDSS3,
26408 IX86_BUILTIN_VFMADDSD3,
26409
26410 /* XOP instructions. */
26411 IX86_BUILTIN_VPCMOV,
26412 IX86_BUILTIN_VPCMOV_V2DI,
26413 IX86_BUILTIN_VPCMOV_V4SI,
26414 IX86_BUILTIN_VPCMOV_V8HI,
26415 IX86_BUILTIN_VPCMOV_V16QI,
26416 IX86_BUILTIN_VPCMOV_V4SF,
26417 IX86_BUILTIN_VPCMOV_V2DF,
26418 IX86_BUILTIN_VPCMOV256,
26419 IX86_BUILTIN_VPCMOV_V4DI256,
26420 IX86_BUILTIN_VPCMOV_V8SI256,
26421 IX86_BUILTIN_VPCMOV_V16HI256,
26422 IX86_BUILTIN_VPCMOV_V32QI256,
26423 IX86_BUILTIN_VPCMOV_V8SF256,
26424 IX86_BUILTIN_VPCMOV_V4DF256,
26425
26426 IX86_BUILTIN_VPPERM,
26427
26428 IX86_BUILTIN_VPMACSSWW,
26429 IX86_BUILTIN_VPMACSWW,
26430 IX86_BUILTIN_VPMACSSWD,
26431 IX86_BUILTIN_VPMACSWD,
26432 IX86_BUILTIN_VPMACSSDD,
26433 IX86_BUILTIN_VPMACSDD,
26434 IX86_BUILTIN_VPMACSSDQL,
26435 IX86_BUILTIN_VPMACSSDQH,
26436 IX86_BUILTIN_VPMACSDQL,
26437 IX86_BUILTIN_VPMACSDQH,
26438 IX86_BUILTIN_VPMADCSSWD,
26439 IX86_BUILTIN_VPMADCSWD,
26440
26441 IX86_BUILTIN_VPHADDBW,
26442 IX86_BUILTIN_VPHADDBD,
26443 IX86_BUILTIN_VPHADDBQ,
26444 IX86_BUILTIN_VPHADDWD,
26445 IX86_BUILTIN_VPHADDWQ,
26446 IX86_BUILTIN_VPHADDDQ,
26447 IX86_BUILTIN_VPHADDUBW,
26448 IX86_BUILTIN_VPHADDUBD,
26449 IX86_BUILTIN_VPHADDUBQ,
26450 IX86_BUILTIN_VPHADDUWD,
26451 IX86_BUILTIN_VPHADDUWQ,
26452 IX86_BUILTIN_VPHADDUDQ,
26453 IX86_BUILTIN_VPHSUBBW,
26454 IX86_BUILTIN_VPHSUBWD,
26455 IX86_BUILTIN_VPHSUBDQ,
26456
26457 IX86_BUILTIN_VPROTB,
26458 IX86_BUILTIN_VPROTW,
26459 IX86_BUILTIN_VPROTD,
26460 IX86_BUILTIN_VPROTQ,
26461 IX86_BUILTIN_VPROTB_IMM,
26462 IX86_BUILTIN_VPROTW_IMM,
26463 IX86_BUILTIN_VPROTD_IMM,
26464 IX86_BUILTIN_VPROTQ_IMM,
26465
26466 IX86_BUILTIN_VPSHLB,
26467 IX86_BUILTIN_VPSHLW,
26468 IX86_BUILTIN_VPSHLD,
26469 IX86_BUILTIN_VPSHLQ,
26470 IX86_BUILTIN_VPSHAB,
26471 IX86_BUILTIN_VPSHAW,
26472 IX86_BUILTIN_VPSHAD,
26473 IX86_BUILTIN_VPSHAQ,
26474
26475 IX86_BUILTIN_VFRCZSS,
26476 IX86_BUILTIN_VFRCZSD,
26477 IX86_BUILTIN_VFRCZPS,
26478 IX86_BUILTIN_VFRCZPD,
26479 IX86_BUILTIN_VFRCZPS256,
26480 IX86_BUILTIN_VFRCZPD256,
26481
26482 IX86_BUILTIN_VPCOMEQUB,
26483 IX86_BUILTIN_VPCOMNEUB,
26484 IX86_BUILTIN_VPCOMLTUB,
26485 IX86_BUILTIN_VPCOMLEUB,
26486 IX86_BUILTIN_VPCOMGTUB,
26487 IX86_BUILTIN_VPCOMGEUB,
26488 IX86_BUILTIN_VPCOMFALSEUB,
26489 IX86_BUILTIN_VPCOMTRUEUB,
26490
26491 IX86_BUILTIN_VPCOMEQUW,
26492 IX86_BUILTIN_VPCOMNEUW,
26493 IX86_BUILTIN_VPCOMLTUW,
26494 IX86_BUILTIN_VPCOMLEUW,
26495 IX86_BUILTIN_VPCOMGTUW,
26496 IX86_BUILTIN_VPCOMGEUW,
26497 IX86_BUILTIN_VPCOMFALSEUW,
26498 IX86_BUILTIN_VPCOMTRUEUW,
26499
26500 IX86_BUILTIN_VPCOMEQUD,
26501 IX86_BUILTIN_VPCOMNEUD,
26502 IX86_BUILTIN_VPCOMLTUD,
26503 IX86_BUILTIN_VPCOMLEUD,
26504 IX86_BUILTIN_VPCOMGTUD,
26505 IX86_BUILTIN_VPCOMGEUD,
26506 IX86_BUILTIN_VPCOMFALSEUD,
26507 IX86_BUILTIN_VPCOMTRUEUD,
26508
26509 IX86_BUILTIN_VPCOMEQUQ,
26510 IX86_BUILTIN_VPCOMNEUQ,
26511 IX86_BUILTIN_VPCOMLTUQ,
26512 IX86_BUILTIN_VPCOMLEUQ,
26513 IX86_BUILTIN_VPCOMGTUQ,
26514 IX86_BUILTIN_VPCOMGEUQ,
26515 IX86_BUILTIN_VPCOMFALSEUQ,
26516 IX86_BUILTIN_VPCOMTRUEUQ,
26517
26518 IX86_BUILTIN_VPCOMEQB,
26519 IX86_BUILTIN_VPCOMNEB,
26520 IX86_BUILTIN_VPCOMLTB,
26521 IX86_BUILTIN_VPCOMLEB,
26522 IX86_BUILTIN_VPCOMGTB,
26523 IX86_BUILTIN_VPCOMGEB,
26524 IX86_BUILTIN_VPCOMFALSEB,
26525 IX86_BUILTIN_VPCOMTRUEB,
26526
26527 IX86_BUILTIN_VPCOMEQW,
26528 IX86_BUILTIN_VPCOMNEW,
26529 IX86_BUILTIN_VPCOMLTW,
26530 IX86_BUILTIN_VPCOMLEW,
26531 IX86_BUILTIN_VPCOMGTW,
26532 IX86_BUILTIN_VPCOMGEW,
26533 IX86_BUILTIN_VPCOMFALSEW,
26534 IX86_BUILTIN_VPCOMTRUEW,
26535
26536 IX86_BUILTIN_VPCOMEQD,
26537 IX86_BUILTIN_VPCOMNED,
26538 IX86_BUILTIN_VPCOMLTD,
26539 IX86_BUILTIN_VPCOMLED,
26540 IX86_BUILTIN_VPCOMGTD,
26541 IX86_BUILTIN_VPCOMGED,
26542 IX86_BUILTIN_VPCOMFALSED,
26543 IX86_BUILTIN_VPCOMTRUED,
26544
26545 IX86_BUILTIN_VPCOMEQQ,
26546 IX86_BUILTIN_VPCOMNEQ,
26547 IX86_BUILTIN_VPCOMLTQ,
26548 IX86_BUILTIN_VPCOMLEQ,
26549 IX86_BUILTIN_VPCOMGTQ,
26550 IX86_BUILTIN_VPCOMGEQ,
26551 IX86_BUILTIN_VPCOMFALSEQ,
26552 IX86_BUILTIN_VPCOMTRUEQ,
26553
26554 /* LWP instructions. */
26555 IX86_BUILTIN_LLWPCB,
26556 IX86_BUILTIN_SLWPCB,
26557 IX86_BUILTIN_LWPVAL32,
26558 IX86_BUILTIN_LWPVAL64,
26559 IX86_BUILTIN_LWPINS32,
26560 IX86_BUILTIN_LWPINS64,
26561
26562 IX86_BUILTIN_CLZS,
26563
26564 /* RTM */
26565 IX86_BUILTIN_XBEGIN,
26566 IX86_BUILTIN_XEND,
26567 IX86_BUILTIN_XABORT,
26568 IX86_BUILTIN_XTEST,
26569
26570 /* BMI instructions. */
26571 IX86_BUILTIN_BEXTR32,
26572 IX86_BUILTIN_BEXTR64,
26573 IX86_BUILTIN_CTZS,
26574
26575 /* TBM instructions. */
26576 IX86_BUILTIN_BEXTRI32,
26577 IX86_BUILTIN_BEXTRI64,
26578
26579 /* BMI2 instructions. */
26580 IX86_BUILTIN_BZHI32,
26581 IX86_BUILTIN_BZHI64,
26582 IX86_BUILTIN_PDEP32,
26583 IX86_BUILTIN_PDEP64,
26584 IX86_BUILTIN_PEXT32,
26585 IX86_BUILTIN_PEXT64,
26586
26587 /* ADX instructions. */
26588 IX86_BUILTIN_ADDCARRYX32,
26589 IX86_BUILTIN_ADDCARRYX64,
26590
26591 /* FSGSBASE instructions. */
26592 IX86_BUILTIN_RDFSBASE32,
26593 IX86_BUILTIN_RDFSBASE64,
26594 IX86_BUILTIN_RDGSBASE32,
26595 IX86_BUILTIN_RDGSBASE64,
26596 IX86_BUILTIN_WRFSBASE32,
26597 IX86_BUILTIN_WRFSBASE64,
26598 IX86_BUILTIN_WRGSBASE32,
26599 IX86_BUILTIN_WRGSBASE64,
26600
26601 /* RDRND instructions. */
26602 IX86_BUILTIN_RDRAND16_STEP,
26603 IX86_BUILTIN_RDRAND32_STEP,
26604 IX86_BUILTIN_RDRAND64_STEP,
26605
26606 /* RDSEED instructions. */
26607 IX86_BUILTIN_RDSEED16_STEP,
26608 IX86_BUILTIN_RDSEED32_STEP,
26609 IX86_BUILTIN_RDSEED64_STEP,
26610
26611 /* F16C instructions. */
26612 IX86_BUILTIN_CVTPH2PS,
26613 IX86_BUILTIN_CVTPH2PS256,
26614 IX86_BUILTIN_CVTPS2PH,
26615 IX86_BUILTIN_CVTPS2PH256,
26616
26617 /* CFString built-in for Darwin. */
26618 IX86_BUILTIN_CFSTRING,
26619
26620 /* Builtins to get CPU type and supported features. */
26621 IX86_BUILTIN_CPU_INIT,
26622 IX86_BUILTIN_CPU_IS,
26623 IX86_BUILTIN_CPU_SUPPORTS,
26624
26625 IX86_BUILTIN_MAX
26626 };
26627
26628 /* Table for the ix86 builtin decls. */
26629 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26630
26631 /* Table of all of the builtin functions that are possible with different ISAs,
26632 but are waiting to be built until a function is declared to use that
26633 ISA. */
26634 struct builtin_isa {
26635 const char *name; /* function name */
26636 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26637 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26638 bool const_p; /* true if the declaration is constant */
26639 bool set_and_not_built_p; /* true if the decl was deferred and not yet built */
26640 };
26641
26642 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26643
26644
26645 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
26646 of which isa_flags to use in the ix86_builtins_isa array.  Store the
26647 function decl in the ix86_builtins array.  Return the function decl, or
26648 NULL_TREE if the builtin was not added.
26649
26650 If the front end has a special hook for builtin functions, delay adding
26651 builtin functions that aren't in the current ISA until the ISA is changed
26652 with function-specific optimization.  Doing so can save about 300K for the
26653 default compiler. When the builtin is expanded, check at that time whether
26654 it is valid.
26655
26656 If the front end doesn't have a special hook, record all builtins, even
26657 those that aren't in the current ISA, in case the user uses
26658 function-specific options for a different ISA, so that we don't get scope
26659 errors if a builtin is added in the middle of a function scope. */
26660
26661 static inline tree
26662 def_builtin (HOST_WIDE_INT mask, const char *name,
26663 enum ix86_builtin_func_type tcode,
26664 enum ix86_builtins code)
26665 {
26666 tree decl = NULL_TREE;
26667
26668 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26669 {
26670 ix86_builtins_isa[(int) code].isa = mask;
26671
26672 mask &= ~OPTION_MASK_ISA_64BIT;
26673 if (mask == 0
26674 || (mask & ix86_isa_flags) != 0
26675 || (lang_hooks.builtin_function
26676 == lang_hooks.builtin_function_ext_scope))
26677
26678 {
26679 tree type = ix86_get_builtin_func_type (tcode);
26680 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26681 NULL, NULL_TREE);
26682 ix86_builtins[(int) code] = decl;
26683 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26684 }
26685 else
26686 {
26687 ix86_builtins[(int) code] = NULL_TREE;
26688 ix86_builtins_isa[(int) code].tcode = tcode;
26689 ix86_builtins_isa[(int) code].name = name;
26690 ix86_builtins_isa[(int) code].const_p = false;
26691 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26692 }
26693 }
26694
26695 return decl;
26696 }
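/* Illustrative only, not part of the original source: a single call might
   look like

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_lfence",
		  VOID_FTYPE_VOID, IX86_BUILTIN_LFENCE);

   which either builds the decl right away (the ISA is enabled, or the
   front end uses the ext_scope builtin hook) or records it in
   ix86_builtins_isa so ix86_add_new_builtins can build it later.  */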
26697
26698 /* Like def_builtin, but also marks the function decl "const". */
26699
26700 static inline tree
26701 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26702 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26703 {
26704 tree decl = def_builtin (mask, name, tcode, code);
26705 if (decl)
26706 TREE_READONLY (decl) = 1;
26707 else
26708 ix86_builtins_isa[(int) code].const_p = true;
26709
26710 return decl;
26711 }
26712
26713 /* Add any new builtin functions for a given ISA that may not have been
26714 declared. This saves a bit of space compared to adding all of the
26715 declarations to the tree up front, whether or not they are ever used. */
26716
26717 static void
26718 ix86_add_new_builtins (HOST_WIDE_INT isa)
26719 {
26720 int i;
26721
26722 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26723 {
26724 if ((ix86_builtins_isa[i].isa & isa) != 0
26725 && ix86_builtins_isa[i].set_and_not_built_p)
26726 {
26727 tree decl, type;
26728
26729 /* Don't define the builtin again. */
26730 ix86_builtins_isa[i].set_and_not_built_p = false;
26731
26732 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26733 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26734 type, i, BUILT_IN_MD, NULL,
26735 NULL_TREE);
26736
26737 ix86_builtins[i] = decl;
26738 if (ix86_builtins_isa[i].const_p)
26739 TREE_READONLY (decl) = 1;
26740 }
26741 }
26742 }
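/* Illustrative note, not part of the original source: when function-specific
   options later enable an instruction set that was off on the command line,
   any entry above whose isa field includes that ISA and which is still
   marked set_and_not_built_p finally gets its decl created here, with
   TREE_READONLY restored for builtins registered via def_builtin_const.  */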
26743
26744 /* Bits for builtin_description.flag. */
26745
26746 /* Set when we don't support the comparison natively, and should
26747 swap the comparison operands in order to support it. */
26748 #define BUILTIN_DESC_SWAP_OPERANDS 1
26749
26750 struct builtin_description
26751 {
26752 const HOST_WIDE_INT mask;
26753 const enum insn_code icode;
26754 const char *const name;
26755 const enum ix86_builtins code;
26756 const enum rtx_code comparison;
26757 const int flag;
26758 };
26759
26760 static const struct builtin_description bdesc_comi[] =
26761 {
26762 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26763 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26766 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26767 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26768 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26769 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26770 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26771 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26772 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26773 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26774 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26775 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26776 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26777 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26778 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26779 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26783 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26786 };
26787
26788 static const struct builtin_description bdesc_pcmpestr[] =
26789 {
26790 /* SSE4.2 */
26791 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26792 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26793 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26794 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26795 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26796 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26797 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26798 };
26799
26800 static const struct builtin_description bdesc_pcmpistr[] =
26801 {
26802 /* SSE4.2 */
26803 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26804 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26805 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26806 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26807 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26808 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26809 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26810 };
26811
26812 /* Special builtins with a variable number of arguments. */
26813 static const struct builtin_description bdesc_special_args[] =
26814 {
26815 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26816 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26817 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26818
26819 /* MMX */
26820 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26821
26822 /* 3DNow! */
26823 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26824
26825 /* FXSR, XSAVE and XSAVEOPT */
26826 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26827 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26828 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26829 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26830 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26831
26832 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26833 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26834 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26835 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26836 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26837
26838 /* SSE */
26839 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26840 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26841 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26842
26843 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26844 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26846 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26847
26848 /* SSE or 3DNow!A */
26849 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26850 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26851
26852 /* SSE2 */
26853 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26854 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26856 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26860 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26862 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26863
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26865 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26866
26867 /* SSE3 */
26868 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26869
26870 /* SSE4.1 */
26871 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26872
26873 /* SSE4A */
26874 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26875 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26876
26877 /* AVX */
26878 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26879 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26880
26881 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26882 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26883 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26884 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26886
26887 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26888 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26889 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26892 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26894
26895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26896 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26898
26899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26900 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26901 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26903 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26904 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26907
26908 /* AVX2 */
26909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26918
26919 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26920 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26921 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26922 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26923 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26924 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26925
26926 /* FSGSBASE */
26927 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26928 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26929 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26930 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26931 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26932 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26933 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26934 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26935
26936 /* RTM */
26937 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26938 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26939 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26940 };
26941
26942 /* Builtins with variable number of arguments. */
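/* Each entry in a table of this kind supplies, in order: the ISA
   option mask(s) that must be enabled before the builtin is
   registered, the insn pattern used to expand it, the
   "__builtin_ia32_*" name exposed to the front ends, its
   IX86_BUILTIN_* enumerator, an rtx comparison code (or another
   per-builtin flag such as a rounding mode) forwarded to the
   expander, and the prototype, an ix86_builtin_func_type stored as
   an int.  (A summary; the builtin_description structure itself is
   defined earlier in this file.)  For example, "__builtin_ia32_addps"
   below expands through CODE_FOR_addv4sf3 with the
   V4SF (V4SF, V4SF) prototype, roughly:

     __v4sf a, b;
     __v4sf c = __builtin_ia32_addps (a, b);   // what _mm_add_ps in
                                               // <xmmintrin.h> emits
*/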
26943 static const struct builtin_description bdesc_args[] =
26944 {
26945 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26946 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26947 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26948 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26949 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26950 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26951 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26952
26953 /* MMX */
26954 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26955 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26957 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26959 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26960
26961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26962 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26965 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26966 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26969
26970 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26972
26973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26974 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26975 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26977
26978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26979 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26984
26985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26991
26992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26995
26996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26997
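/* Shift builtins: the "_SI_COUNT" prototypes take the shift count as
   an ordinary integer operand, while the "_V*_COUNT" forms take it
   from a second vector operand (its low quadword for the 128-bit
   variants), matching the two operand forms of the
   psll/psrl/psra instructions.  */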
26998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27004
27005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27011
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27016
27017 /* 3DNow! */
27018 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27019 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27020 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27021 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27022
27023 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27024 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27025 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27026 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27027 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27028 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27029 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27030 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27031 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27032 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27033 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27034 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27035 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27036 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27037 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27038
27039 /* 3DNow!A */
27040 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27041 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27042 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27043 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27044 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27045 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27046
27047 /* SSE */
27048 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27049 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27050 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27052 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27053 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27054 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27055 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27056 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27057 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27058 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27059 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27060
27061 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27062
27063 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27064 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27065 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27071
27072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27077 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27078 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27079 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27091 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27094
27095 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27096 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27099
27100 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27102 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27104
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27106
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27111 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27112
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27114 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27115 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27116
27117 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27118
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27122
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27125
27126 /* SSE MMX or 3DNow!A */
27127 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27128 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27129 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27130
27131 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27132 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27133 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27134 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27135
27136 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27137 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27138
27139 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27140
27141 /* SSE2 */
27142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27143
27144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27148 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27149
27150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27151 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27152 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27155
27156 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27157
27158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27159 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27160 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27161 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27162
27163 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27164 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27165 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27166
27167 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27168 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27169 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27170 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27172 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27175
27176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27196
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27198 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27201
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27204 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27206
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27208
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27210 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27212
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27214
27215 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27216 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27217 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27221 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27223
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27232
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27235
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27240
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27243
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27250
27251 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27255
27256 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27261 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27263 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27264
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27268
27269 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27271
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27274
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27276
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27278 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27281
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27287 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27289
27290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27295 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27297
27298 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27302
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27306
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27308
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27310
27311 /* SSE2 MMX */
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27314
27315 /* SSE3 */
27316 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27317 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27318
27319 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27320 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27321 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27322 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27323 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27324 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27325
27326 /* SSSE3 */
27327 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27328 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27329 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27330 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27331 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27332 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27333
27334 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27335 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27337 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27338 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27339 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27340 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27341 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27348 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27349 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27350 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27353 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27354 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27355 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27356 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27357 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27358
27359 /* SSSE3 palignr. */
27360 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27361 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27362
27363 /* SSE4.1 */
27364 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27365 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27366 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27367 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27368 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27369 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27371 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27374
27375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27376 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27377 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27378 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27379 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27380 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27385 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27388
27389 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27390 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27392 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27393 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27394 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27395 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27398 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27399 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27400 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27401
27402 /* SSE4.1 */
27403 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27404 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27405 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27406 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27407
27408 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27409 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27410 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27411 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27412
27413 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27414 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27415
27416 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27417 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27418
27419 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27420 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27421 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27422 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27423
27424 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27425 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27426
27427 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27428 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27429
27430 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27431 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27432 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27433
27434 /* SSE4.2 */
27435 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27436 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27437 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27438 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27439 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27440
27441 /* SSE4A */
27442 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27443 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27444 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27445 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27446
27447 /* AES */
27448 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27449 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27450
27451 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27452 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27453 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27454 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27455
27456 /* PCLMUL */
27457 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27458
27459 /* AVX */
27460 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27461 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27462 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27464 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27465 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27466 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27468 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27474 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27475 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27476 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27477 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27478 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27479 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27480 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27481 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27482 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27483 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27484 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27485 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27486
27487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27488 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27489 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27490 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27491
27492 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27506 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27508 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27509 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27510 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27513 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27526
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27530
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27536
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27538
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27541
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27546
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27549
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27552
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27557
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27560
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27563
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27568
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27575
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27591
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27594
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27596 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27597
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27599
27600 /* AVX2 */
27601 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27602 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27603 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27604 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27606 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27609 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27610 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27611 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27612 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27615 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27618 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27640 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27641 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27642 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27643 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27644 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27645 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27646 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27647 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27648 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27649 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27650 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27651 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27747
27748 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27749
27750 /* BMI */
27751 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27752 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27753 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27754
27755 /* TBM */
27756 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27757 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27758
27759 /* F16C */
27760 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27761 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27762 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27763 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27764
27765 /* BMI2 */
27766 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27767 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27768 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27769 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27770 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27771 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27772 };
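/* Editorial sketch, not part of the original table above: each row in these
   builtin-description tables pairs an ISA option mask, an insn pattern, a
   builtin name, a builtin enum value, an optional comparison code, and a
   function-type tag.  A hypothetical initialization loop could register only
   the builtins whose ISA is enabled, roughly as below; the table name
   (bdesc_example), the field names (mask, name, flag, code) and the
   def_builtin_const helper are assumptions for illustration, not a quotation
   of this file:

     const struct builtin_description *d;
     size_t i;

     for (i = 0, d = bdesc_example; i < ARRAY_SIZE (bdesc_example); i++, d++)
       if (d->name != 0)
         def_builtin_const (d->mask, d->name,
                            (enum ix86_builtin_func_type) d->flag, d->code);
*/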
27773
27774 /* FMA4 and XOP. */
27775 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27776 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27777 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27778 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27779 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27780 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27781 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27782 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27783 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27784 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27785 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27786 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27787 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27788 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27789 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27790 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27791 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27792 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27793 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27794 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27795 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27796 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27797 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27798 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27799 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27800 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27801 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27802 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27803 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27804 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27805 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27806 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27807 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27808 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27809 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27810 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27811 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27812 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27813 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27814 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27815 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27816 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27817 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27818 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27819 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27820 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27821 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27822 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27823 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27824 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27825 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27826 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27827
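/* Editorial note, a sketch rather than part of the original file: the
   MULTI_ARG_* macros above are shorthand aliases for the V*_FTYPE_* function
   type tags, so the entries in bdesc_multi_arg below can state their
   signatures compactly.  For example, MULTI_ARG_3_SF aliases
   V4SF_FTYPE_V4SF_V4SF_V4SF, so the __builtin_ia32_vfmaddss entry below is
   registered with a prototype roughly equivalent to

     __m128 __builtin_ia32_vfmaddss (__m128 a, __m128 b, __m128 c);

   where __m128 is the user-visible vector type corresponding to the V4SF
   mode.  */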
27828 static const struct builtin_description bdesc_multi_arg[] =
27829 {
27830 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27831 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27832 UNKNOWN, (int)MULTI_ARG_3_SF },
27833 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27834 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27835 UNKNOWN, (int)MULTI_ARG_3_DF },
27836
27837 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27838 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27839 UNKNOWN, (int)MULTI_ARG_3_SF },
27840 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27841 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27842 UNKNOWN, (int)MULTI_ARG_3_DF },
27843
27844 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27845 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27846 UNKNOWN, (int)MULTI_ARG_3_SF },
27847 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27848 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27849 UNKNOWN, (int)MULTI_ARG_3_DF },
27850 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27851 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27852 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27853 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27854 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27855 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27856
27857 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27858 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27859 UNKNOWN, (int)MULTI_ARG_3_SF },
27860 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27861 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27862 UNKNOWN, (int)MULTI_ARG_3_DF },
27863 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27864 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27865 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27866 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27867 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27868 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27869
27870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27877
27878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27885
27886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27887
27888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27900
27901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27917
27918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27924
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27940
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27948
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27956
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27964
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27972
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27980
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27988
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27996
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28004
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28013
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28022
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28027
28028 };
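/* Added note (illustrative, not in the original source): every row above is
   registered as a const builtin by the FMA4/XOP loop at the end of
   ix86_init_mmx_sse_builtins, using the MULTI_ARG_* value as the function
   type.  With -mxop the bit-select entry can then be used directly from C;
   the variable names here are hypothetical and the semantics assumed are
   roughly (A & SEL) | (B & ~SEL):

     __v2di a, b, sel;
     __v2di r = __builtin_ia32_vpcmov (a, b, sel);
*/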
28029 \f
28030 /* TM vector builtins. */
28031
28032 /* Reuse the existing x86-specific `struct builtin_description' because
28033    it is convenient here.  Add casts to make the TM codes fit.  */
28034 static const struct builtin_description bdesc_tm[] =
28035 {
28036 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28037 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28038 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28039 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28040 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28041 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28042 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28043
28044 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28045 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28046 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28047 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28048 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28049 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28050 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28051
28052 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28053 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28054 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28055 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28056 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28057 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28058 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28059
28060 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28061 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28062 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28063 };
28064
28065 /* TM callbacks. */
28066
28067 /* Return the builtin decl needed to load a vector of TYPE. */
28068
28069 static tree
28070 ix86_builtin_tm_load (tree type)
28071 {
28072 if (TREE_CODE (type) == VECTOR_TYPE)
28073 {
28074 switch (tree_low_cst (TYPE_SIZE (type), 1))
28075 {
28076 case 64:
28077 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28078 case 128:
28079 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28080 case 256:
28081 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28082 }
28083 }
28084 return NULL_TREE;
28085 }
28086
28087 /* Return the builtin decl needed to store a vector of TYPE. */
28088
28089 static tree
28090 ix86_builtin_tm_store (tree type)
28091 {
28092 if (TREE_CODE (type) == VECTOR_TYPE)
28093 {
28094 switch (tree_low_cst (TYPE_SIZE (type), 1))
28095 {
28096 case 64:
28097 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28098 case 128:
28099 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28100 case 256:
28101 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28102 }
28103 }
28104 return NULL_TREE;
28105 }
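/* Added sketch (an assumption about how these hooks are consumed, not part
   of the original source): the trans-mem lowering pass asks the target for
   the vector barrier matching TYPE_SIZE, so a 64-bit vector maps to the M64
   builtin, a 128-bit vector to M128 and a 256-bit vector to M256.  Roughly,
   with VECTYPE, ADDR and CALL standing in for the caller's trees:

     tree fndecl = targetm.vectorize.builtin_tm_load (vectype);
     if (fndecl)
       call = build_call_expr (fndecl, 1, addr);
*/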
28106 \f
28107 /* Initialize the transactional memory vector load/store builtins. */
28108
28109 static void
28110 ix86_init_tm_builtins (void)
28111 {
28112 enum ix86_builtin_func_type ftype;
28113 const struct builtin_description *d;
28114 size_t i;
28115 tree decl;
28116 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28117 tree attrs_log, attrs_type_log;
28118
28119 if (!flag_tm)
28120 return;
28121
28122 /* If there are no builtins defined, we must be compiling in a
28123 language without trans-mem support. */
28124 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28125 return;
28126
28127 /* Use whatever attributes a normal TM load has. */
28128 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28129 attrs_load = DECL_ATTRIBUTES (decl);
28130 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28131 /* Use whatever attributes a normal TM store has. */
28132 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28133 attrs_store = DECL_ATTRIBUTES (decl);
28134 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28135 /* Use whatever attributes a normal TM log has. */
28136 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28137 attrs_log = DECL_ATTRIBUTES (decl);
28138 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28139
28140 for (i = 0, d = bdesc_tm;
28141 i < ARRAY_SIZE (bdesc_tm);
28142 i++, d++)
28143 {
28144 if ((d->mask & ix86_isa_flags) != 0
28145 || (lang_hooks.builtin_function
28146 == lang_hooks.builtin_function_ext_scope))
28147 {
28148 tree type, attrs, attrs_type;
28149 enum built_in_function code = (enum built_in_function) d->code;
28150
28151 ftype = (enum ix86_builtin_func_type) d->flag;
28152 type = ix86_get_builtin_func_type (ftype);
28153
28154 if (BUILTIN_TM_LOAD_P (code))
28155 {
28156 attrs = attrs_load;
28157 attrs_type = attrs_type_load;
28158 }
28159 else if (BUILTIN_TM_STORE_P (code))
28160 {
28161 attrs = attrs_store;
28162 attrs_type = attrs_type_store;
28163 }
28164 else
28165 {
28166 attrs = attrs_log;
28167 attrs_type = attrs_type_log;
28168 }
28169 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28170 /* The builtin without the prefix for
28171 calling it directly. */
28172 d->name + strlen ("__builtin_"),
28173 attrs);
28174 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28175 set the TYPE_ATTRIBUTES. */
28176 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28177
28178 set_builtin_decl (code, decl, false);
28179 }
28180 }
28181 }
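/* Added note (based on the add_builtin_function call above): the fifth
   argument strips the "__builtin_" prefix from d->name, so each of these
   builtins, e.g. __builtin__ITM_WM128, expands to a call to the plain
   libitm entry point (_ITM_WM128) rather than to inline code, since the
   table uses CODE_FOR_nothing.  */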
28182
28183 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28184    not in the current target ISA, to allow the user to compile particular
28185    modules with target-specific options that differ from the command-line
28186    options.  */
28187 static void
28188 ix86_init_mmx_sse_builtins (void)
28189 {
28190 const struct builtin_description * d;
28191 enum ix86_builtin_func_type ftype;
28192 size_t i;
28193
28194 /* Add all special builtins with variable number of operands. */
28195 for (i = 0, d = bdesc_special_args;
28196 i < ARRAY_SIZE (bdesc_special_args);
28197 i++, d++)
28198 {
28199 if (d->name == 0)
28200 continue;
28201
28202 ftype = (enum ix86_builtin_func_type) d->flag;
28203 def_builtin (d->mask, d->name, ftype, d->code);
28204 }
28205
28206 /* Add all builtins with variable number of operands. */
28207 for (i = 0, d = bdesc_args;
28208 i < ARRAY_SIZE (bdesc_args);
28209 i++, d++)
28210 {
28211 if (d->name == 0)
28212 continue;
28213
28214 ftype = (enum ix86_builtin_func_type) d->flag;
28215 def_builtin_const (d->mask, d->name, ftype, d->code);
28216 }
28217
28218 /* pcmpestr[im] insns. */
28219 for (i = 0, d = bdesc_pcmpestr;
28220 i < ARRAY_SIZE (bdesc_pcmpestr);
28221 i++, d++)
28222 {
28223 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28224 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28225 else
28226 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28227 def_builtin_const (d->mask, d->name, ftype, d->code);
28228 }
28229
28230 /* pcmpistr[im] insns. */
28231 for (i = 0, d = bdesc_pcmpistr;
28232 i < ARRAY_SIZE (bdesc_pcmpistr);
28233 i++, d++)
28234 {
28235 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28236 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28237 else
28238 ftype = INT_FTYPE_V16QI_V16QI_INT;
28239 def_builtin_const (d->mask, d->name, ftype, d->code);
28240 }
28241
28242 /* comi/ucomi insns. */
28243 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28244 {
28245 if (d->mask == OPTION_MASK_ISA_SSE2)
28246 ftype = INT_FTYPE_V2DF_V2DF;
28247 else
28248 ftype = INT_FTYPE_V4SF_V4SF;
28249 def_builtin_const (d->mask, d->name, ftype, d->code);
28250 }
28251
28252 /* SSE */
28253 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28254 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28255 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28256 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28257
28258 /* SSE or 3DNow!A */
28259 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28260 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28261 IX86_BUILTIN_MASKMOVQ);
28262
28263 /* SSE2 */
28264 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28265 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28266
28267 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28268 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28269 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28270 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28271
28272 /* SSE3. */
28273 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28274 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28275 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28276 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28277
28278 /* AES */
28279 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28280 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28281 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28282 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28283 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28284 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28285 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28286 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28287 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28288 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28289 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28290 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28291
28292 /* PCLMUL */
28293 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28294 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28295
28296 /* RDRND */
28297 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28298 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28299 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28300 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28301 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28302 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28303 IX86_BUILTIN_RDRAND64_STEP);
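/* Added usage sketch (illustrative; V and OK are hypothetical variables):
   the *_step builtins mirror the RDRAND carry-flag protocol, so OK is
   nonzero iff the hardware delivered a random value in V:

     unsigned int v;
     int ok = __builtin_ia32_rdrand32_step (&v);
*/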
28304
28305 /* AVX2 */
28306 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28307 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28308 IX86_BUILTIN_GATHERSIV2DF);
28309
28310 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28311 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28312 IX86_BUILTIN_GATHERSIV4DF);
28313
28314 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28315 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28316 IX86_BUILTIN_GATHERDIV2DF);
28317
28318 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28319 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28320 IX86_BUILTIN_GATHERDIV4DF);
28321
28322 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28323 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28324 IX86_BUILTIN_GATHERSIV4SF);
28325
28326 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28327 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28328 IX86_BUILTIN_GATHERSIV8SF);
28329
28330 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28331 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28332 IX86_BUILTIN_GATHERDIV4SF);
28333
28334 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28335 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28336 IX86_BUILTIN_GATHERDIV8SF);
28337
28338 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28339 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28340 IX86_BUILTIN_GATHERSIV2DI);
28341
28342 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28343 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28344 IX86_BUILTIN_GATHERSIV4DI);
28345
28346 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28347 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28348 IX86_BUILTIN_GATHERDIV2DI);
28349
28350 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28351 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28352 IX86_BUILTIN_GATHERDIV4DI);
28353
28354 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28355 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28356 IX86_BUILTIN_GATHERSIV4SI);
28357
28358 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28359 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28360 IX86_BUILTIN_GATHERSIV8SI);
28361
28362 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28363 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28364 IX86_BUILTIN_GATHERDIV4SI);
28365
28366 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28367 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28368 IX86_BUILTIN_GATHERDIV8SI);
28369
28370 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28371 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28372 IX86_BUILTIN_GATHERALTSIV4DF);
28373
28374 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28375 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28376 IX86_BUILTIN_GATHERALTDIV8SF);
28377
28378 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28379 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28380 IX86_BUILTIN_GATHERALTSIV4DI);
28381
28382 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28383 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28384 IX86_BUILTIN_GATHERALTDIV8SI);
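/* Added usage sketch (an assumption that the argument order follows the
   V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT signature above; SRC, BASE, IDX
   and MASK are hypothetical variables): SRC supplies the pass-through
   elements, the sign bit of each MASK element gates the corresponding
   load, and the scale must be a literal 1, 2, 4 or 8:

     __v4df r = __builtin_ia32_gathersiv4df (src, base, idx, mask, 8);
*/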
28385
28386 /* RTM. */
28387 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28388 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28389
28390 /* MMX access to the vec_init patterns. */
28391 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28392 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28393
28394 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28395 V4HI_FTYPE_HI_HI_HI_HI,
28396 IX86_BUILTIN_VEC_INIT_V4HI);
28397
28398 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28399 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28400 IX86_BUILTIN_VEC_INIT_V8QI);
28401
28402 /* Access to the vec_extract patterns. */
28403 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28404 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28405 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28406 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28407 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28408 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28409 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28410 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28411 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28412 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28413
28414 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28415 "__builtin_ia32_vec_ext_v4hi",
28416 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28417
28418 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28419 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28420
28421 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28422 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28423
28424 /* Access to the vec_set patterns. */
28425 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28426 "__builtin_ia32_vec_set_v2di",
28427 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28428
28429 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28430 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28431
28432 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28433 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28434
28435 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28436 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28437
28438 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28439 "__builtin_ia32_vec_set_v4hi",
28440 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28441
28442 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28443 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28444
28445 /* RDSEED */
28446 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28447 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28448 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28449 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28450 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28451 "__builtin_ia32_rdseed_di_step",
28452 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28453
28454 /* ADCX */
28455 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28456 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28457 def_builtin (OPTION_MASK_ISA_64BIT,
28458 "__builtin_ia32_addcarryx_u64",
28459 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28460 IX86_BUILTIN_ADDCARRYX64);
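/* Added usage sketch (illustrative; A, B and SUM are hypothetical): the
   addcarryx builtins thread a carry byte through wide additions and return
   the carry-out:

     unsigned int sum;
     unsigned char c = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);
*/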
28461
28462 /* Add FMA4 multi-arg argument instructions */
28463 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28464 {
28465 if (d->name == 0)
28466 continue;
28467
28468 ftype = (enum ix86_builtin_func_type) d->flag;
28469 def_builtin_const (d->mask, d->name, ftype, d->code);
28470 }
28471 }
28472
28473 /* This builds the processor_model struct type defined in
28474    libgcc/config/i386/cpuinfo.c.  */
28475
28476 static tree
28477 build_processor_model_struct (void)
28478 {
28479 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
28480 "__cpu_features"};
28481 tree field = NULL_TREE, field_chain = NULL_TREE;
28482 int i;
28483 tree type = make_node (RECORD_TYPE);
28484
28485 /* The first 3 fields are unsigned int. */
28486 for (i = 0; i < 3; ++i)
28487 {
28488 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28489 get_identifier (field_name[i]), unsigned_type_node);
28490 if (field_chain != NULL_TREE)
28491 DECL_CHAIN (field) = field_chain;
28492 field_chain = field;
28493 }
28494
28495 /* The last field is an array of unsigned integers of size one. */
28496 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
28497 get_identifier (field_name[3]),
28498 build_array_type (unsigned_type_node,
28499 build_index_type (size_one_node)));
28500 if (field_chain != NULL_TREE)
28501 DECL_CHAIN (field) = field_chain;
28502 field_chain = field;
28503
28504 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
28505 return type;
28506 }
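/* Added for reference (an assumption restating the layout built above as it
   is declared in libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/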
28507
28508 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
28509
28510 static tree
28511 make_var_decl (tree type, const char *name)
28512 {
28513 tree new_decl;
28514
28515 new_decl = build_decl (UNKNOWN_LOCATION,
28516 VAR_DECL,
28517 get_identifier(name),
28518 type);
28519
28520 DECL_EXTERNAL (new_decl) = 1;
28521 TREE_STATIC (new_decl) = 1;
28522 TREE_PUBLIC (new_decl) = 1;
28523 DECL_INITIAL (new_decl) = 0;
28524 DECL_ARTIFICIAL (new_decl) = 0;
28525 DECL_PRESERVE_P (new_decl) = 1;
28526
28527 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
28528 assemble_variable (new_decl, 0, 0, 0);
28529
28530 return new_decl;
28531 }
28532
28533 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
28534    into a check against the __cpu_model data in libgcc/config/i386/cpuinfo.c.  */
28535
28536 static tree
28537 fold_builtin_cpu (tree fndecl, tree *args)
28538 {
28539 unsigned int i;
28540 enum ix86_builtins fn_code = (enum ix86_builtins)
28541 DECL_FUNCTION_CODE (fndecl);
28542 tree param_string_cst = NULL;
28543
28544 /* This is the order of the feature bits stored in __cpu_features in cpuinfo.c.  */
28545 enum processor_features
28546 {
28547 F_CMOV = 0,
28548 F_MMX,
28549 F_POPCNT,
28550 F_SSE,
28551 F_SSE2,
28552 F_SSE3,
28553 F_SSSE3,
28554 F_SSE4_1,
28555 F_SSE4_2,
28556 F_AVX,
28557 F_AVX2,
28558 F_MAX
28559 };
28560
28561 /* These are the values for vendor types, CPU types and CPU subtypes
28562    in cpuinfo.c.  CPU types and subtypes must have the corresponding
28563    start value subtracted before being compared.  */
28564 enum processor_model
28565 {
28566 M_INTEL = 1,
28567 M_AMD,
28568 M_CPU_TYPE_START,
28569 M_INTEL_ATOM,
28570 M_INTEL_CORE2,
28571 M_INTEL_COREI7,
28572 M_AMDFAM10H,
28573 M_AMDFAM15H,
28574 M_CPU_SUBTYPE_START,
28575 M_INTEL_COREI7_NEHALEM,
28576 M_INTEL_COREI7_WESTMERE,
28577 M_INTEL_COREI7_SANDYBRIDGE,
28578 M_AMDFAM10H_BARCELONA,
28579 M_AMDFAM10H_SHANGHAI,
28580 M_AMDFAM10H_ISTANBUL,
28581 M_AMDFAM15H_BDVER1,
28582 M_AMDFAM15H_BDVER2
28583 };
28584
28585 static struct _arch_names_table
28586 {
28587 const char *const name;
28588 const enum processor_model model;
28589 }
28590 const arch_names_table[] =
28591 {
28592 {"amd", M_AMD},
28593 {"intel", M_INTEL},
28594 {"atom", M_INTEL_ATOM},
28595 {"core2", M_INTEL_CORE2},
28596 {"corei7", M_INTEL_COREI7},
28597 {"nehalem", M_INTEL_COREI7_NEHALEM},
28598 {"westmere", M_INTEL_COREI7_WESTMERE},
28599 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
28600 {"amdfam10h", M_AMDFAM10H},
28601 {"barcelona", M_AMDFAM10H_BARCELONA},
28602 {"shanghai", M_AMDFAM10H_SHANGHAI},
28603 {"istanbul", M_AMDFAM10H_ISTANBUL},
28604 {"amdfam15h", M_AMDFAM15H},
28605 {"bdver1", M_AMDFAM15H_BDVER1},
28606 {"bdver2", M_AMDFAM15H_BDVER2},
28607 };
28608
28609 static struct _isa_names_table
28610 {
28611 const char *const name;
28612 const enum processor_features feature;
28613 }
28614 const isa_names_table[] =
28615 {
28616 {"cmov", F_CMOV},
28617 {"mmx", F_MMX},
28618 {"popcnt", F_POPCNT},
28619 {"sse", F_SSE},
28620 {"sse2", F_SSE2},
28621 {"sse3", F_SSE3},
28622 {"ssse3", F_SSSE3},
28623 {"sse4.1", F_SSE4_1},
28624 {"sse4.2", F_SSE4_2},
28625 {"avx", F_AVX},
28626 {"avx2", F_AVX2}
28627 };
28628
28629 static tree __processor_model_type = NULL_TREE;
28630 static tree __cpu_model_var = NULL_TREE;
28631
28632 if (__processor_model_type == NULL_TREE)
28633 __processor_model_type = build_processor_model_struct ();
28634
28635 if (__cpu_model_var == NULL_TREE)
28636 __cpu_model_var = make_var_decl (__processor_model_type,
28637 "__cpu_model");
28638
28639 gcc_assert ((args != NULL) && (*args != NULL));
28640
28641 param_string_cst = *args;
28642 while (param_string_cst
28643 && TREE_CODE (param_string_cst) != STRING_CST)
28644 {
28645 /* *args must be an expression that can contain other EXPRs leading to a
28646    STRING_CST.  */
28647 if (!EXPR_P (param_string_cst))
28648 {
28649 error ("Parameter to builtin must be a string constant or literal");
28650 return integer_zero_node;
28651 }
28652 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
28653 }
28654
28655 gcc_assert (param_string_cst);
28656
28657 if (fn_code == IX86_BUILTIN_CPU_IS)
28658 {
28659 tree ref;
28660 tree field;
28661 unsigned int field_val = 0;
28662 unsigned int NUM_ARCH_NAMES
28663 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
28664
28665 for (i = 0; i < NUM_ARCH_NAMES; i++)
28666 if (strcmp (arch_names_table[i].name,
28667 TREE_STRING_POINTER (param_string_cst)) == 0)
28668 break;
28669
28670 if (i == NUM_ARCH_NAMES)
28671 {
28672 error ("Parameter to builtin not valid: %s",
28673 TREE_STRING_POINTER (param_string_cst));
28674 return integer_zero_node;
28675 }
28676
28677 field = TYPE_FIELDS (__processor_model_type);
28678 field_val = arch_names_table[i].model;
28679
28680 /* CPU types are stored in the next field. */
28681 if (field_val > M_CPU_TYPE_START
28682 && field_val < M_CPU_SUBTYPE_START)
28683 {
28684 field = DECL_CHAIN (field);
28685 field_val -= M_CPU_TYPE_START;
28686 }
28687
28688 /* CPU subtypes are stored in the next field. */
28689 if (field_val > M_CPU_SUBTYPE_START)
28690 {
28691 field = DECL_CHAIN (DECL_CHAIN (field));
28692 field_val -= M_CPU_SUBTYPE_START;
28693 }
28694
28695 /* Get the appropriate field in __cpu_model. */
28696 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28697 field, NULL_TREE);
28698
28699 /* Check the value. */
28700 return build2 (EQ_EXPR, unsigned_type_node, ref,
28701 build_int_cstu (unsigned_type_node, field_val));
28702 }
28703 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28704 {
28705 tree ref;
28706 tree array_elt;
28707 tree field;
28708 unsigned int field_val = 0;
28709 unsigned int NUM_ISA_NAMES
28710 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
28711
28712 for (i = 0; i < NUM_ISA_NAMES; i++)
28713 if (strcmp (isa_names_table[i].name,
28714 TREE_STRING_POINTER (param_string_cst)) == 0)
28715 break;
28716
28717 if (i == NUM_ISA_NAMES)
28718 {
28719 error ("Parameter to builtin not valid: %s",
28720 TREE_STRING_POINTER (param_string_cst));
28721 return integer_zero_node;
28722 }
28723
28724 field = TYPE_FIELDS (__processor_model_type);
28725 /* Get the last field, which is __cpu_features. */
28726 while (DECL_CHAIN (field))
28727 field = DECL_CHAIN (field);
28728
28729 /* Get the appropriate field: __cpu_model.__cpu_features */
28730 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
28731 field, NULL_TREE);
28732
28733 /* Access the 0th element of __cpu_features array. */
28734 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
28735 integer_zero_node, NULL_TREE, NULL_TREE);
28736
28737 field_val = (1 << isa_names_table[i].feature);
28738 /* Return __cpu_model.__cpu_features[0] & field_val */
28739 return build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
28740 build_int_cstu (unsigned_type_node, field_val));
28741 }
28742 gcc_unreachable ();
28743 }
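/* Added worked example (illustrative): with the tables above,
   __builtin_cpu_is ("atom") folds to the tree equivalent of

     __cpu_model.__cpu_type == (M_INTEL_ATOM - M_CPU_TYPE_START)

   and __builtin_cpu_supports ("avx2") folds to

     __cpu_model.__cpu_features[0] & (1 << F_AVX2)

   so each check becomes a load plus a compare or a mask at run time.  */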
28744
28745 static tree
28746 ix86_fold_builtin (tree fndecl, int n_args,
28747 tree *args, bool ignore ATTRIBUTE_UNUSED)
28748 {
28749 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
28750 {
28751 enum ix86_builtins fn_code = (enum ix86_builtins)
28752 DECL_FUNCTION_CODE (fndecl);
28753 if (fn_code == IX86_BUILTIN_CPU_IS
28754 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
28755 {
28756 gcc_assert (n_args == 1);
28757 return fold_builtin_cpu (fndecl, args);
28758 }
28759 }
28760
28761 #ifdef SUBTARGET_FOLD_BUILTIN
28762 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
28763 #endif
28764
28765 return NULL_TREE;
28766 }
28767
28768 /* Make a builtin used to detect the CPU type or its features.  NAME is
28769    the builtin name, CODE is the builtin code, FTYPE is the function type
28770    of the builtin, and IS_CONST says whether the builtin is TREE_READONLY.  */
28771
28772 static void
28773 make_cpu_type_builtin (const char* name, int code,
28774 enum ix86_builtin_func_type ftype, bool is_const)
28775 {
28776 tree decl;
28777 tree type;
28778
28779 type = ix86_get_builtin_func_type (ftype);
28780 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28781 NULL, NULL_TREE);
28782 gcc_assert (decl != NULL_TREE);
28783 ix86_builtins[(int) code] = decl;
28784 TREE_READONLY (decl) = is_const;
28785 }
28786
28787 /* Make builtins to get CPU type and features supported. The created
28788    builtins are:
28789
28790 __builtin_cpu_init (), to detect cpu type and features,
28791 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
28792 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
28793 */
28794
28795 static void
28796 ix86_init_platform_type_builtins (void)
28797 {
28798 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
28799 INT_FTYPE_VOID, false);
28800 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
28801 INT_FTYPE_PCCHAR, true);
28802 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
28803 INT_FTYPE_PCCHAR, true);
28804 }
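/* Added usage sketch (illustrative user code; pick_impl is a hypothetical
   name):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7") && __builtin_cpu_supports ("avx"))
         return 1;
       return 0;
     }
*/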
28805
28806 /* Internal method for ix86_init_builtins. */
28807
28808 static void
28809 ix86_init_builtins_va_builtins_abi (void)
28810 {
28811 tree ms_va_ref, sysv_va_ref;
28812 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28813 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28814 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28815 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28816
28817 if (!TARGET_64BIT)
28818 return;
28819 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28820 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28821 ms_va_ref = build_reference_type (ms_va_list_type_node);
28822 sysv_va_ref =
28823 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28824
28825 fnvoid_va_end_ms =
28826 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28827 fnvoid_va_start_ms =
28828 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28829 fnvoid_va_end_sysv =
28830 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28831 fnvoid_va_start_sysv =
28832 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28833 NULL_TREE);
28834 fnvoid_va_copy_ms =
28835 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28836 NULL_TREE);
28837 fnvoid_va_copy_sysv =
28838 build_function_type_list (void_type_node, sysv_va_ref,
28839 sysv_va_ref, NULL_TREE);
28840
28841 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28842 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28843 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28844 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28845 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28846 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28847 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28848 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28849 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28850 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28851 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28852 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28853 }
28854
28855 static void
28856 ix86_init_builtin_types (void)
28857 {
28858 tree float128_type_node, float80_type_node;
28859
28860 /* The __float80 type. */
28861 float80_type_node = long_double_type_node;
28862 if (TYPE_MODE (float80_type_node) != XFmode)
28863 {
28864 /* long double is not XFmode here; build a separate 80-bit type.  */
28865 float80_type_node = make_node (REAL_TYPE);
28866
28867 TYPE_PRECISION (float80_type_node) = 80;
28868 layout_type (float80_type_node);
28869 }
28870 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28871
28872 /* The __float128 type. */
28873 float128_type_node = make_node (REAL_TYPE);
28874 TYPE_PRECISION (float128_type_node) = 128;
28875 layout_type (float128_type_node);
28876 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28877
28878 /* This macro is built by i386-builtin-types.awk. */
28879 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28880 }
28881
28882 static void
28883 ix86_init_builtins (void)
28884 {
28885 tree t;
28886
28887 ix86_init_builtin_types ();
28888
28889 /* Builtins to get CPU type and features. */
28890 ix86_init_platform_type_builtins ();
28891
28892 /* TFmode support builtins. */
28893 def_builtin_const (0, "__builtin_infq",
28894 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28895 def_builtin_const (0, "__builtin_huge_valq",
28896 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28897
28898 /* We will expand them to normal calls if SSE isn't available, since
28899    they are used by libgcc.  */
28900 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28901 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28902 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28903 TREE_READONLY (t) = 1;
28904 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28905
28906 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28907 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28908 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28909 TREE_READONLY (t) = 1;
28910 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28911
28912 ix86_init_tm_builtins ();
28913 ix86_init_mmx_sse_builtins ();
28914
28915 if (TARGET_LP64)
28916 ix86_init_builtins_va_builtins_abi ();
28917
28918 #ifdef SUBTARGET_INIT_BUILTINS
28919 SUBTARGET_INIT_BUILTINS;
28920 #endif
28921 }
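/* Added usage sketch (illustrative; Y is a hypothetical variable): the
   TFmode builtins registered above behave as ordinary __float128 operations
   at the source level, e.g.

     __float128 x = __builtin_fabsq (y);
     __float128 inf = __builtin_infq ();

   and fall back to the libgcc routines __fabstf2 and __copysigntf3 when
   they cannot be expanded inline.  */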
28922
28923 /* Return the ix86 builtin for CODE. */
28924
28925 static tree
28926 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28927 {
28928 if (code >= IX86_BUILTIN_MAX)
28929 return error_mark_node;
28930
28931 return ix86_builtins[code];
28932 }
28933
28934 /* Errors in the source file can cause expand_expr to return const0_rtx
28935 where we expect a vector. To avoid crashing, use one of the vector
28936 clear instructions. */
28937 static rtx
28938 safe_vector_operand (rtx x, enum machine_mode mode)
28939 {
28940 if (x == const0_rtx)
28941 x = CONST0_RTX (mode);
28942 return x;
28943 }
28944
28945 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28946
28947 static rtx
28948 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28949 {
28950 rtx pat;
28951 tree arg0 = CALL_EXPR_ARG (exp, 0);
28952 tree arg1 = CALL_EXPR_ARG (exp, 1);
28953 rtx op0 = expand_normal (arg0);
28954 rtx op1 = expand_normal (arg1);
28955 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28956 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28957 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28958
28959 if (VECTOR_MODE_P (mode0))
28960 op0 = safe_vector_operand (op0, mode0);
28961 if (VECTOR_MODE_P (mode1))
28962 op1 = safe_vector_operand (op1, mode1);
28963
28964 if (optimize || !target
28965 || GET_MODE (target) != tmode
28966 || !insn_data[icode].operand[0].predicate (target, tmode))
28967 target = gen_reg_rtx (tmode);
28968
28969 if (GET_MODE (op1) == SImode && mode1 == TImode)
28970 {
28971 rtx x = gen_reg_rtx (V4SImode);
28972 emit_insn (gen_sse2_loadd (x, op1));
28973 op1 = gen_lowpart (TImode, x);
28974 }
28975
28976 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28977 op0 = copy_to_mode_reg (mode0, op0);
28978 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28979 op1 = copy_to_mode_reg (mode1, op1);
28980
28981 pat = GEN_FCN (icode) (target, op0, op1);
28982 if (! pat)
28983 return 0;
28984
28985 emit_insn (pat);
28986
28987 return target;
28988 }
28989
28990 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28991
28992 static rtx
28993 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28994 enum ix86_builtin_func_type m_type,
28995 enum rtx_code sub_code)
28996 {
28997 rtx pat;
28998 int i;
28999 int nargs;
29000 bool comparison_p = false;
29001 bool tf_p = false;
29002 bool last_arg_constant = false;
29003 int num_memory = 0;
29004 struct {
29005 rtx op;
29006 enum machine_mode mode;
29007 } args[4];
29008
29009 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29010
29011 switch (m_type)
29012 {
29013 case MULTI_ARG_4_DF2_DI_I:
29014 case MULTI_ARG_4_DF2_DI_I1:
29015 case MULTI_ARG_4_SF2_SI_I:
29016 case MULTI_ARG_4_SF2_SI_I1:
29017 nargs = 4;
29018 last_arg_constant = true;
29019 break;
29020
29021 case MULTI_ARG_3_SF:
29022 case MULTI_ARG_3_DF:
29023 case MULTI_ARG_3_SF2:
29024 case MULTI_ARG_3_DF2:
29025 case MULTI_ARG_3_DI:
29026 case MULTI_ARG_3_SI:
29027 case MULTI_ARG_3_SI_DI:
29028 case MULTI_ARG_3_HI:
29029 case MULTI_ARG_3_HI_SI:
29030 case MULTI_ARG_3_QI:
29031 case MULTI_ARG_3_DI2:
29032 case MULTI_ARG_3_SI2:
29033 case MULTI_ARG_3_HI2:
29034 case MULTI_ARG_3_QI2:
29035 nargs = 3;
29036 break;
29037
29038 case MULTI_ARG_2_SF:
29039 case MULTI_ARG_2_DF:
29040 case MULTI_ARG_2_DI:
29041 case MULTI_ARG_2_SI:
29042 case MULTI_ARG_2_HI:
29043 case MULTI_ARG_2_QI:
29044 nargs = 2;
29045 break;
29046
29047 case MULTI_ARG_2_DI_IMM:
29048 case MULTI_ARG_2_SI_IMM:
29049 case MULTI_ARG_2_HI_IMM:
29050 case MULTI_ARG_2_QI_IMM:
29051 nargs = 2;
29052 last_arg_constant = true;
29053 break;
29054
29055 case MULTI_ARG_1_SF:
29056 case MULTI_ARG_1_DF:
29057 case MULTI_ARG_1_SF2:
29058 case MULTI_ARG_1_DF2:
29059 case MULTI_ARG_1_DI:
29060 case MULTI_ARG_1_SI:
29061 case MULTI_ARG_1_HI:
29062 case MULTI_ARG_1_QI:
29063 case MULTI_ARG_1_SI_DI:
29064 case MULTI_ARG_1_HI_DI:
29065 case MULTI_ARG_1_HI_SI:
29066 case MULTI_ARG_1_QI_DI:
29067 case MULTI_ARG_1_QI_SI:
29068 case MULTI_ARG_1_QI_HI:
29069 nargs = 1;
29070 break;
29071
29072 case MULTI_ARG_2_DI_CMP:
29073 case MULTI_ARG_2_SI_CMP:
29074 case MULTI_ARG_2_HI_CMP:
29075 case MULTI_ARG_2_QI_CMP:
29076 nargs = 2;
29077 comparison_p = true;
29078 break;
29079
29080 case MULTI_ARG_2_SF_TF:
29081 case MULTI_ARG_2_DF_TF:
29082 case MULTI_ARG_2_DI_TF:
29083 case MULTI_ARG_2_SI_TF:
29084 case MULTI_ARG_2_HI_TF:
29085 case MULTI_ARG_2_QI_TF:
29086 nargs = 2;
29087 tf_p = true;
29088 break;
29089
29090 default:
29091 gcc_unreachable ();
29092 }
29093
29094 if (optimize || !target
29095 || GET_MODE (target) != tmode
29096 || !insn_data[icode].operand[0].predicate (target, tmode))
29097 target = gen_reg_rtx (tmode);
29098
29099 gcc_assert (nargs <= 4);
29100
29101 for (i = 0; i < nargs; i++)
29102 {
29103 tree arg = CALL_EXPR_ARG (exp, i);
29104 rtx op = expand_normal (arg);
29105 int adjust = (comparison_p) ? 1 : 0;
29106 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
29107
29108 if (last_arg_constant && i == nargs - 1)
29109 {
29110 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
29111 {
29112 enum insn_code new_icode = icode;
29113 switch (icode)
29114 {
29115 case CODE_FOR_xop_vpermil2v2df3:
29116 case CODE_FOR_xop_vpermil2v4sf3:
29117 case CODE_FOR_xop_vpermil2v4df3:
29118 case CODE_FOR_xop_vpermil2v8sf3:
29119 error ("the last argument must be a 2-bit immediate");
29120 return gen_reg_rtx (tmode);
29121 case CODE_FOR_xop_rotlv2di3:
29122 new_icode = CODE_FOR_rotlv2di3;
29123 goto xop_rotl;
29124 case CODE_FOR_xop_rotlv4si3:
29125 new_icode = CODE_FOR_rotlv4si3;
29126 goto xop_rotl;
29127 case CODE_FOR_xop_rotlv8hi3:
29128 new_icode = CODE_FOR_rotlv8hi3;
29129 goto xop_rotl;
29130 case CODE_FOR_xop_rotlv16qi3:
29131 new_icode = CODE_FOR_rotlv16qi3;
29132 xop_rotl:
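/* A constant rotate count is masked to the element width so that it
   satisfies the insn predicate; a variable count falls back to the
   equivalent generic rotate pattern, which accepts a register count.  */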
29133 if (CONST_INT_P (op))
29134 {
29135 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
29136 op = GEN_INT (INTVAL (op) & mask);
29137 gcc_checking_assert
29138 (insn_data[icode].operand[i + 1].predicate (op, mode));
29139 }
29140 else
29141 {
29142 gcc_checking_assert
29143 (nargs == 2
29144 && insn_data[new_icode].operand[0].mode == tmode
29145 && insn_data[new_icode].operand[1].mode == tmode
29146 && insn_data[new_icode].operand[2].mode == mode
29147 && insn_data[new_icode].operand[0].predicate
29148 == insn_data[icode].operand[0].predicate
29149 && insn_data[new_icode].operand[1].predicate
29150 == insn_data[icode].operand[1].predicate);
29151 icode = new_icode;
29152 goto non_constant;
29153 }
29154 break;
29155 default:
29156 gcc_unreachable ();
29157 }
29158 }
29159 }
29160 else
29161 {
29162 non_constant:
29163 if (VECTOR_MODE_P (mode))
29164 op = safe_vector_operand (op, mode);
29165
29166 /* If we aren't optimizing, only allow one memory operand to be
29167 generated. */
29168 if (memory_operand (op, mode))
29169 num_memory++;
29170
29171 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
29172
29173 if (optimize
29174 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
29175 || num_memory > 1)
29176 op = force_reg (mode, op);
29177 }
29178
29179 args[i].op = op;
29180 args[i].mode = mode;
29181 }
29182
29183 switch (nargs)
29184 {
29185 case 1:
29186 pat = GEN_FCN (icode) (target, args[0].op);
29187 break;
29188
29189 case 2:
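/* For the *_TF types the sub-comparison code is passed to the insn as an
   immediate operand; for the *_CMP types it becomes an explicit
   comparison rtx.  */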
29190 if (tf_p)
29191 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
29192 GEN_INT ((int)sub_code));
29193 else if (! comparison_p)
29194 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29195 else
29196 {
29197 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
29198 args[0].op,
29199 args[1].op);
29200
29201 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
29202 }
29203 break;
29204
29205 case 3:
29206 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29207 break;
29208
29209 case 4:
29210 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
29211 break;
29212
29213 default:
29214 gcc_unreachable ();
29215 }
29216
29217 if (! pat)
29218 return 0;
29219
29220 emit_insn (pat);
29221 return target;
29222 }
29223
29224 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
29225 insns with vec_merge. */
29226
29227 static rtx
29228 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
29229 rtx target)
29230 {
29231 rtx pat;
29232 tree arg0 = CALL_EXPR_ARG (exp, 0);
29233 rtx op1, op0 = expand_normal (arg0);
29234 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29235 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29236
29237 if (optimize || !target
29238 || GET_MODE (target) != tmode
29239 || !insn_data[icode].operand[0].predicate (target, tmode))
29240 target = gen_reg_rtx (tmode);
29241
29242 if (VECTOR_MODE_P (mode0))
29243 op0 = safe_vector_operand (op0, mode0);
29244
29245 if ((optimize && !register_operand (op0, mode0))
29246 || !insn_data[icode].operand[1].predicate (op0, mode0))
29247 op0 = copy_to_mode_reg (mode0, op0);
29248
29249 op1 = op0;
29250 if (!insn_data[icode].operand[2].predicate (op1, mode0))
29251 op1 = copy_to_mode_reg (mode0, op1);
29252
29253 pat = GEN_FCN (icode) (target, op0, op1);
29254 if (! pat)
29255 return 0;
29256 emit_insn (pat);
29257 return target;
29258 }
29259
29260 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
29261
29262 static rtx
29263 ix86_expand_sse_compare (const struct builtin_description *d,
29264 tree exp, rtx target, bool swap)
29265 {
29266 rtx pat;
29267 tree arg0 = CALL_EXPR_ARG (exp, 0);
29268 tree arg1 = CALL_EXPR_ARG (exp, 1);
29269 rtx op0 = expand_normal (arg0);
29270 rtx op1 = expand_normal (arg1);
29271 rtx op2;
29272 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29273 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29274 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29275 enum rtx_code comparison = d->comparison;
29276
29277 if (VECTOR_MODE_P (mode0))
29278 op0 = safe_vector_operand (op0, mode0);
29279 if (VECTOR_MODE_P (mode1))
29280 op1 = safe_vector_operand (op1, mode1);
29281
29282 /* Swap operands if we have a comparison that isn't available in
29283 hardware. */
29284 if (swap)
29285 {
29286 rtx tmp = gen_reg_rtx (mode1);
29287 emit_move_insn (tmp, op1);
29288 op1 = op0;
29289 op0 = tmp;
29290 }
29291
29292 if (optimize || !target
29293 || GET_MODE (target) != tmode
29294 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29295 target = gen_reg_rtx (tmode);
29296
29297 if ((optimize && !register_operand (op0, mode0))
29298 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
29299 op0 = copy_to_mode_reg (mode0, op0);
29300 if ((optimize && !register_operand (op1, mode1))
29301 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
29302 op1 = copy_to_mode_reg (mode1, op1);
29303
29304 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
29305 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29306 if (! pat)
29307 return 0;
29308 emit_insn (pat);
29309 return target;
29310 }
29311
29312 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
29313
29314 static rtx
29315 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
29316 rtx target)
29317 {
29318 rtx pat;
29319 tree arg0 = CALL_EXPR_ARG (exp, 0);
29320 tree arg1 = CALL_EXPR_ARG (exp, 1);
29321 rtx op0 = expand_normal (arg0);
29322 rtx op1 = expand_normal (arg1);
29323 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29324 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29325 enum rtx_code comparison = d->comparison;
29326
29327 if (VECTOR_MODE_P (mode0))
29328 op0 = safe_vector_operand (op0, mode0);
29329 if (VECTOR_MODE_P (mode1))
29330 op1 = safe_vector_operand (op1, mode1);
29331
29332 /* Swap operands if we have a comparison that isn't available in
29333 hardware. */
29334 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
29335 {
29336 rtx tmp = op1;
29337 op1 = op0;
29338 op0 = tmp;
29339 }
29340
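/* Build the result by clearing an SImode pseudo and then writing only its
   low byte (via a QImode subreg and STRICT_LOW_PART) from the flags
   comparison; SUBREG_REG below then yields the zero-extended value.  */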
29341 target = gen_reg_rtx (SImode);
29342 emit_move_insn (target, const0_rtx);
29343 target = gen_rtx_SUBREG (QImode, target, 0);
29344
29345 if ((optimize && !register_operand (op0, mode0))
29346 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29347 op0 = copy_to_mode_reg (mode0, op0);
29348 if ((optimize && !register_operand (op1, mode1))
29349 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29350 op1 = copy_to_mode_reg (mode1, op1);
29351
29352 pat = GEN_FCN (d->icode) (op0, op1);
29353 if (! pat)
29354 return 0;
29355 emit_insn (pat);
29356 emit_insn (gen_rtx_SET (VOIDmode,
29357 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29358 gen_rtx_fmt_ee (comparison, QImode,
29359 SET_DEST (pat),
29360 const0_rtx)));
29361
29362 return SUBREG_REG (target);
29363 }
29364
29365 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
29366
29367 static rtx
29368 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
29369 rtx target)
29370 {
29371 rtx pat;
29372 tree arg0 = CALL_EXPR_ARG (exp, 0);
29373 rtx op1, op0 = expand_normal (arg0);
29374 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29375 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29376
29377 if (optimize || target == 0
29378 || GET_MODE (target) != tmode
29379 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29380 target = gen_reg_rtx (tmode);
29381
29382 if (VECTOR_MODE_P (mode0))
29383 op0 = safe_vector_operand (op0, mode0);
29384
29385 if ((optimize && !register_operand (op0, mode0))
29386 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29387 op0 = copy_to_mode_reg (mode0, op0);
29388
29389 op1 = GEN_INT (d->comparison);
29390
29391 pat = GEN_FCN (d->icode) (target, op0, op1);
29392 if (! pat)
29393 return 0;
29394 emit_insn (pat);
29395 return target;
29396 }
29397
29398 static rtx
29399 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
29400 tree exp, rtx target)
29401 {
29402 rtx pat;
29403 tree arg0 = CALL_EXPR_ARG (exp, 0);
29404 tree arg1 = CALL_EXPR_ARG (exp, 1);
29405 rtx op0 = expand_normal (arg0);
29406 rtx op1 = expand_normal (arg1);
29407 rtx op2;
29408 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
29409 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
29410 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
29411
29412 if (optimize || target == 0
29413 || GET_MODE (target) != tmode
29414 || !insn_data[d->icode].operand[0].predicate (target, tmode))
29415 target = gen_reg_rtx (tmode);
29416
29417 op0 = safe_vector_operand (op0, mode0);
29418 op1 = safe_vector_operand (op1, mode1);
29419
29420 if ((optimize && !register_operand (op0, mode0))
29421 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29422 op0 = copy_to_mode_reg (mode0, op0);
29423 if ((optimize && !register_operand (op1, mode1))
29424 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29425 op1 = copy_to_mode_reg (mode1, op1);
29426
29427 op2 = GEN_INT (d->comparison);
29428
29429 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
29430 if (! pat)
29431 return 0;
29432 emit_insn (pat);
29433 return target;
29434 }
29435
29436 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
29437
29438 static rtx
29439 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
29440 rtx target)
29441 {
29442 rtx pat;
29443 tree arg0 = CALL_EXPR_ARG (exp, 0);
29444 tree arg1 = CALL_EXPR_ARG (exp, 1);
29445 rtx op0 = expand_normal (arg0);
29446 rtx op1 = expand_normal (arg1);
29447 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
29448 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
29449 enum rtx_code comparison = d->comparison;
29450
29451 if (VECTOR_MODE_P (mode0))
29452 op0 = safe_vector_operand (op0, mode0);
29453 if (VECTOR_MODE_P (mode1))
29454 op1 = safe_vector_operand (op1, mode1);
29455
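/* Same technique as in ix86_expand_sse_comi: set only the low byte of a
   cleared SImode pseudo from the flags comparison.  */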
29456 target = gen_reg_rtx (SImode);
29457 emit_move_insn (target, const0_rtx);
29458 target = gen_rtx_SUBREG (QImode, target, 0);
29459
29460 if ((optimize && !register_operand (op0, mode0))
29461 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
29462 op0 = copy_to_mode_reg (mode0, op0);
29463 if ((optimize && !register_operand (op1, mode1))
29464 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
29465 op1 = copy_to_mode_reg (mode1, op1);
29466
29467 pat = GEN_FCN (d->icode) (op0, op1);
29468 if (! pat)
29469 return 0;
29470 emit_insn (pat);
29471 emit_insn (gen_rtx_SET (VOIDmode,
29472 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29473 gen_rtx_fmt_ee (comparison, QImode,
29474 SET_DEST (pat),
29475 const0_rtx)));
29476
29477 return SUBREG_REG (target);
29478 }
29479
29480 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
29481
29482 static rtx
29483 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
29484 tree exp, rtx target)
29485 {
29486 rtx pat;
29487 tree arg0 = CALL_EXPR_ARG (exp, 0);
29488 tree arg1 = CALL_EXPR_ARG (exp, 1);
29489 tree arg2 = CALL_EXPR_ARG (exp, 2);
29490 tree arg3 = CALL_EXPR_ARG (exp, 3);
29491 tree arg4 = CALL_EXPR_ARG (exp, 4);
29492 rtx scratch0, scratch1;
29493 rtx op0 = expand_normal (arg0);
29494 rtx op1 = expand_normal (arg1);
29495 rtx op2 = expand_normal (arg2);
29496 rtx op3 = expand_normal (arg3);
29497 rtx op4 = expand_normal (arg4);
29498 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
29499
29500 tmode0 = insn_data[d->icode].operand[0].mode;
29501 tmode1 = insn_data[d->icode].operand[1].mode;
29502 modev2 = insn_data[d->icode].operand[2].mode;
29503 modei3 = insn_data[d->icode].operand[3].mode;
29504 modev4 = insn_data[d->icode].operand[4].mode;
29505 modei5 = insn_data[d->icode].operand[5].mode;
29506 modeimm = insn_data[d->icode].operand[6].mode;
29507
29508 if (VECTOR_MODE_P (modev2))
29509 op0 = safe_vector_operand (op0, modev2);
29510 if (VECTOR_MODE_P (modev4))
29511 op2 = safe_vector_operand (op2, modev4);
29512
29513 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29514 op0 = copy_to_mode_reg (modev2, op0);
29515 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
29516 op1 = copy_to_mode_reg (modei3, op1);
29517 if ((optimize && !register_operand (op2, modev4))
29518 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
29519 op2 = copy_to_mode_reg (modev4, op2);
29520 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
29521 op3 = copy_to_mode_reg (modei5, op3);
29522
29523 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
29524 {
29525 error ("the fifth argument must be an 8-bit immediate");
29526 return const0_rtx;
29527 }
29528
29529 if (d->code == IX86_BUILTIN_PCMPESTRI128)
29530 {
29531 if (optimize || !target
29532 || GET_MODE (target) != tmode0
29533 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29534 target = gen_reg_rtx (tmode0);
29535
29536 scratch1 = gen_reg_rtx (tmode1);
29537
29538 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
29539 }
29540 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
29541 {
29542 if (optimize || !target
29543 || GET_MODE (target) != tmode1
29544 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29545 target = gen_reg_rtx (tmode1);
29546
29547 scratch0 = gen_reg_rtx (tmode0);
29548
29549 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
29550 }
29551 else
29552 {
29553 gcc_assert (d->flag);
29554
29555 scratch0 = gen_reg_rtx (tmode0);
29556 scratch1 = gen_reg_rtx (tmode1);
29557
29558 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
29559 }
29560
29561 if (! pat)
29562 return 0;
29563
29564 emit_insn (pat);
29565
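/* A nonzero D->FLAG names the CC mode of the EFLAGS condition this builtin
   returns; materialize that flag as a 0/1 value.  Otherwise the
   pcmpestri/pcmpestrm result computed above is returned directly.  */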
29566 if (d->flag)
29567 {
29568 target = gen_reg_rtx (SImode);
29569 emit_move_insn (target, const0_rtx);
29570 target = gen_rtx_SUBREG (QImode, target, 0);
29571
29572 emit_insn
29573 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29574 gen_rtx_fmt_ee (EQ, QImode,
29575 gen_rtx_REG ((enum machine_mode) d->flag,
29576 FLAGS_REG),
29577 const0_rtx)));
29578 return SUBREG_REG (target);
29579 }
29580 else
29581 return target;
29582 }
29583
29584
29585 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
29586
29587 static rtx
29588 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
29589 tree exp, rtx target)
29590 {
29591 rtx pat;
29592 tree arg0 = CALL_EXPR_ARG (exp, 0);
29593 tree arg1 = CALL_EXPR_ARG (exp, 1);
29594 tree arg2 = CALL_EXPR_ARG (exp, 2);
29595 rtx scratch0, scratch1;
29596 rtx op0 = expand_normal (arg0);
29597 rtx op1 = expand_normal (arg1);
29598 rtx op2 = expand_normal (arg2);
29599 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
29600
29601 tmode0 = insn_data[d->icode].operand[0].mode;
29602 tmode1 = insn_data[d->icode].operand[1].mode;
29603 modev2 = insn_data[d->icode].operand[2].mode;
29604 modev3 = insn_data[d->icode].operand[3].mode;
29605 modeimm = insn_data[d->icode].operand[4].mode;
29606
29607 if (VECTOR_MODE_P (modev2))
29608 op0 = safe_vector_operand (op0, modev2);
29609 if (VECTOR_MODE_P (modev3))
29610 op1 = safe_vector_operand (op1, modev3);
29611
29612 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
29613 op0 = copy_to_mode_reg (modev2, op0);
29614 if ((optimize && !register_operand (op1, modev3))
29615 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
29616 op1 = copy_to_mode_reg (modev3, op1);
29617
29618 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
29619 {
29620 error ("the third argument must be an 8-bit immediate");
29621 return const0_rtx;
29622 }
29623
29624 if (d->code == IX86_BUILTIN_PCMPISTRI128)
29625 {
29626 if (optimize || !target
29627 || GET_MODE (target) != tmode0
29628 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
29629 target = gen_reg_rtx (tmode0);
29630
29631 scratch1 = gen_reg_rtx (tmode1);
29632
29633 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
29634 }
29635 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
29636 {
29637 if (optimize || !target
29638 || GET_MODE (target) != tmode1
29639 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
29640 target = gen_reg_rtx (tmode1);
29641
29642 scratch0 = gen_reg_rtx (tmode0);
29643
29644 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
29645 }
29646 else
29647 {
29648 gcc_assert (d->flag);
29649
29650 scratch0 = gen_reg_rtx (tmode0);
29651 scratch1 = gen_reg_rtx (tmode1);
29652
29653 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
29654 }
29655
29656 if (! pat)
29657 return 0;
29658
29659 emit_insn (pat);
29660
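/* As for pcmpestr above: a nonzero D->FLAG selects the EFLAGS-based
   variants, which return a 0/1 flag instead of the index or mask.  */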
29661 if (d->flag)
29662 {
29663 target = gen_reg_rtx (SImode);
29664 emit_move_insn (target, const0_rtx);
29665 target = gen_rtx_SUBREG (QImode, target, 0);
29666
29667 emit_insn
29668 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
29669 gen_rtx_fmt_ee (EQ, QImode,
29670 gen_rtx_REG ((enum machine_mode) d->flag,
29671 FLAGS_REG),
29672 const0_rtx)));
29673 return SUBREG_REG (target);
29674 }
29675 else
29676 return target;
29677 }
29678
29679 /* Subroutine of ix86_expand_builtin to take care of insns with
29680 variable number of operands. */
29681
29682 static rtx
29683 ix86_expand_args_builtin (const struct builtin_description *d,
29684 tree exp, rtx target)
29685 {
29686 rtx pat, real_target;
29687 unsigned int i, nargs;
29688 unsigned int nargs_constant = 0;
29689 int num_memory = 0;
29690 struct
29691 {
29692 rtx op;
29693 enum machine_mode mode;
29694 } args[4];
29695 bool last_arg_count = false;
29696 enum insn_code icode = d->icode;
29697 const struct insn_data_d *insn_p = &insn_data[icode];
29698 enum machine_mode tmode = insn_p->operand[0].mode;
29699 enum machine_mode rmode = VOIDmode;
29700 bool swap = false;
29701 enum rtx_code comparison = d->comparison;
29702
29703 switch ((enum ix86_builtin_func_type) d->flag)
29704 {
29705 case V2DF_FTYPE_V2DF_ROUND:
29706 case V4DF_FTYPE_V4DF_ROUND:
29707 case V4SF_FTYPE_V4SF_ROUND:
29708 case V8SF_FTYPE_V8SF_ROUND:
29709 case V4SI_FTYPE_V4SF_ROUND:
29710 case V8SI_FTYPE_V8SF_ROUND:
29711 return ix86_expand_sse_round (d, exp, target);
29712 case V4SI_FTYPE_V2DF_V2DF_ROUND:
29713 case V8SI_FTYPE_V4DF_V4DF_ROUND:
29714 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
29715 case INT_FTYPE_V8SF_V8SF_PTEST:
29716 case INT_FTYPE_V4DI_V4DI_PTEST:
29717 case INT_FTYPE_V4DF_V4DF_PTEST:
29718 case INT_FTYPE_V4SF_V4SF_PTEST:
29719 case INT_FTYPE_V2DI_V2DI_PTEST:
29720 case INT_FTYPE_V2DF_V2DF_PTEST:
29721 return ix86_expand_sse_ptest (d, exp, target);
29722 case FLOAT128_FTYPE_FLOAT128:
29723 case FLOAT_FTYPE_FLOAT:
29724 case INT_FTYPE_INT:
29725 case UINT64_FTYPE_INT:
29726 case UINT16_FTYPE_UINT16:
29727 case INT64_FTYPE_INT64:
29728 case INT64_FTYPE_V4SF:
29729 case INT64_FTYPE_V2DF:
29730 case INT_FTYPE_V16QI:
29731 case INT_FTYPE_V8QI:
29732 case INT_FTYPE_V8SF:
29733 case INT_FTYPE_V4DF:
29734 case INT_FTYPE_V4SF:
29735 case INT_FTYPE_V2DF:
29736 case INT_FTYPE_V32QI:
29737 case V16QI_FTYPE_V16QI:
29738 case V8SI_FTYPE_V8SF:
29739 case V8SI_FTYPE_V4SI:
29740 case V8HI_FTYPE_V8HI:
29741 case V8HI_FTYPE_V16QI:
29742 case V8QI_FTYPE_V8QI:
29743 case V8SF_FTYPE_V8SF:
29744 case V8SF_FTYPE_V8SI:
29745 case V8SF_FTYPE_V4SF:
29746 case V8SF_FTYPE_V8HI:
29747 case V4SI_FTYPE_V4SI:
29748 case V4SI_FTYPE_V16QI:
29749 case V4SI_FTYPE_V4SF:
29750 case V4SI_FTYPE_V8SI:
29751 case V4SI_FTYPE_V8HI:
29752 case V4SI_FTYPE_V4DF:
29753 case V4SI_FTYPE_V2DF:
29754 case V4HI_FTYPE_V4HI:
29755 case V4DF_FTYPE_V4DF:
29756 case V4DF_FTYPE_V4SI:
29757 case V4DF_FTYPE_V4SF:
29758 case V4DF_FTYPE_V2DF:
29759 case V4SF_FTYPE_V4SF:
29760 case V4SF_FTYPE_V4SI:
29761 case V4SF_FTYPE_V8SF:
29762 case V4SF_FTYPE_V4DF:
29763 case V4SF_FTYPE_V8HI:
29764 case V4SF_FTYPE_V2DF:
29765 case V2DI_FTYPE_V2DI:
29766 case V2DI_FTYPE_V16QI:
29767 case V2DI_FTYPE_V8HI:
29768 case V2DI_FTYPE_V4SI:
29769 case V2DF_FTYPE_V2DF:
29770 case V2DF_FTYPE_V4SI:
29771 case V2DF_FTYPE_V4DF:
29772 case V2DF_FTYPE_V4SF:
29773 case V2DF_FTYPE_V2SI:
29774 case V2SI_FTYPE_V2SI:
29775 case V2SI_FTYPE_V4SF:
29776 case V2SI_FTYPE_V2SF:
29777 case V2SI_FTYPE_V2DF:
29778 case V2SF_FTYPE_V2SF:
29779 case V2SF_FTYPE_V2SI:
29780 case V32QI_FTYPE_V32QI:
29781 case V32QI_FTYPE_V16QI:
29782 case V16HI_FTYPE_V16HI:
29783 case V16HI_FTYPE_V8HI:
29784 case V8SI_FTYPE_V8SI:
29785 case V16HI_FTYPE_V16QI:
29786 case V8SI_FTYPE_V16QI:
29787 case V4DI_FTYPE_V16QI:
29788 case V8SI_FTYPE_V8HI:
29789 case V4DI_FTYPE_V8HI:
29790 case V4DI_FTYPE_V4SI:
29791 case V4DI_FTYPE_V2DI:
29792 nargs = 1;
29793 break;
29794 case V4SF_FTYPE_V4SF_VEC_MERGE:
29795 case V2DF_FTYPE_V2DF_VEC_MERGE:
29796 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
29797 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
29798 case V16QI_FTYPE_V16QI_V16QI:
29799 case V16QI_FTYPE_V8HI_V8HI:
29800 case V8QI_FTYPE_V8QI_V8QI:
29801 case V8QI_FTYPE_V4HI_V4HI:
29802 case V8HI_FTYPE_V8HI_V8HI:
29803 case V8HI_FTYPE_V16QI_V16QI:
29804 case V8HI_FTYPE_V4SI_V4SI:
29805 case V8SF_FTYPE_V8SF_V8SF:
29806 case V8SF_FTYPE_V8SF_V8SI:
29807 case V4SI_FTYPE_V4SI_V4SI:
29808 case V4SI_FTYPE_V8HI_V8HI:
29809 case V4SI_FTYPE_V4SF_V4SF:
29810 case V4SI_FTYPE_V2DF_V2DF:
29811 case V4HI_FTYPE_V4HI_V4HI:
29812 case V4HI_FTYPE_V8QI_V8QI:
29813 case V4HI_FTYPE_V2SI_V2SI:
29814 case V4DF_FTYPE_V4DF_V4DF:
29815 case V4DF_FTYPE_V4DF_V4DI:
29816 case V4SF_FTYPE_V4SF_V4SF:
29817 case V4SF_FTYPE_V4SF_V4SI:
29818 case V4SF_FTYPE_V4SF_V2SI:
29819 case V4SF_FTYPE_V4SF_V2DF:
29820 case V4SF_FTYPE_V4SF_DI:
29821 case V4SF_FTYPE_V4SF_SI:
29822 case V2DI_FTYPE_V2DI_V2DI:
29823 case V2DI_FTYPE_V16QI_V16QI:
29824 case V2DI_FTYPE_V4SI_V4SI:
29825 case V2UDI_FTYPE_V4USI_V4USI:
29826 case V2DI_FTYPE_V2DI_V16QI:
29827 case V2DI_FTYPE_V2DF_V2DF:
29828 case V2SI_FTYPE_V2SI_V2SI:
29829 case V2SI_FTYPE_V4HI_V4HI:
29830 case V2SI_FTYPE_V2SF_V2SF:
29831 case V2DF_FTYPE_V2DF_V2DF:
29832 case V2DF_FTYPE_V2DF_V4SF:
29833 case V2DF_FTYPE_V2DF_V2DI:
29834 case V2DF_FTYPE_V2DF_DI:
29835 case V2DF_FTYPE_V2DF_SI:
29836 case V2SF_FTYPE_V2SF_V2SF:
29837 case V1DI_FTYPE_V1DI_V1DI:
29838 case V1DI_FTYPE_V8QI_V8QI:
29839 case V1DI_FTYPE_V2SI_V2SI:
29840 case V32QI_FTYPE_V16HI_V16HI:
29841 case V16HI_FTYPE_V8SI_V8SI:
29842 case V32QI_FTYPE_V32QI_V32QI:
29843 case V16HI_FTYPE_V32QI_V32QI:
29844 case V16HI_FTYPE_V16HI_V16HI:
29845 case V8SI_FTYPE_V4DF_V4DF:
29846 case V8SI_FTYPE_V8SI_V8SI:
29847 case V8SI_FTYPE_V16HI_V16HI:
29848 case V4DI_FTYPE_V4DI_V4DI:
29849 case V4DI_FTYPE_V8SI_V8SI:
29850 case V4UDI_FTYPE_V8USI_V8USI:
29851 if (comparison == UNKNOWN)
29852 return ix86_expand_binop_builtin (icode, exp, target);
29853 nargs = 2;
29854 break;
29855 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29856 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29857 gcc_assert (comparison != UNKNOWN);
29858 nargs = 2;
29859 swap = true;
29860 break;
29861 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29862 case V16HI_FTYPE_V16HI_SI_COUNT:
29863 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29864 case V8SI_FTYPE_V8SI_SI_COUNT:
29865 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29866 case V4DI_FTYPE_V4DI_INT_COUNT:
29867 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29868 case V8HI_FTYPE_V8HI_SI_COUNT:
29869 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29870 case V4SI_FTYPE_V4SI_SI_COUNT:
29871 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29872 case V4HI_FTYPE_V4HI_SI_COUNT:
29873 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29874 case V2DI_FTYPE_V2DI_SI_COUNT:
29875 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29876 case V2SI_FTYPE_V2SI_SI_COUNT:
29877 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29878 case V1DI_FTYPE_V1DI_SI_COUNT:
29879 nargs = 2;
29880 last_arg_count = true;
29881 break;
29882 case UINT64_FTYPE_UINT64_UINT64:
29883 case UINT_FTYPE_UINT_UINT:
29884 case UINT_FTYPE_UINT_USHORT:
29885 case UINT_FTYPE_UINT_UCHAR:
29886 case UINT16_FTYPE_UINT16_INT:
29887 case UINT8_FTYPE_UINT8_INT:
29888 nargs = 2;
29889 break;
29890 case V2DI_FTYPE_V2DI_INT_CONVERT:
29891 nargs = 2;
29892 rmode = V1TImode;
29893 nargs_constant = 1;
29894 break;
29895 case V4DI_FTYPE_V4DI_INT_CONVERT:
29896 nargs = 2;
29897 rmode = V2TImode;
29898 nargs_constant = 1;
29899 break;
29900 case V8HI_FTYPE_V8HI_INT:
29901 case V8HI_FTYPE_V8SF_INT:
29902 case V8HI_FTYPE_V4SF_INT:
29903 case V8SF_FTYPE_V8SF_INT:
29904 case V4SI_FTYPE_V4SI_INT:
29905 case V4SI_FTYPE_V8SI_INT:
29906 case V4HI_FTYPE_V4HI_INT:
29907 case V4DF_FTYPE_V4DF_INT:
29908 case V4SF_FTYPE_V4SF_INT:
29909 case V4SF_FTYPE_V8SF_INT:
29910 case V2DI_FTYPE_V2DI_INT:
29911 case V2DF_FTYPE_V2DF_INT:
29912 case V2DF_FTYPE_V4DF_INT:
29913 case V16HI_FTYPE_V16HI_INT:
29914 case V8SI_FTYPE_V8SI_INT:
29915 case V4DI_FTYPE_V4DI_INT:
29916 case V2DI_FTYPE_V4DI_INT:
29917 nargs = 2;
29918 nargs_constant = 1;
29919 break;
29920 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29921 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29922 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29923 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29924 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29925 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29926 nargs = 3;
29927 break;
29928 case V32QI_FTYPE_V32QI_V32QI_INT:
29929 case V16HI_FTYPE_V16HI_V16HI_INT:
29930 case V16QI_FTYPE_V16QI_V16QI_INT:
29931 case V4DI_FTYPE_V4DI_V4DI_INT:
29932 case V8HI_FTYPE_V8HI_V8HI_INT:
29933 case V8SI_FTYPE_V8SI_V8SI_INT:
29934 case V8SI_FTYPE_V8SI_V4SI_INT:
29935 case V8SF_FTYPE_V8SF_V8SF_INT:
29936 case V8SF_FTYPE_V8SF_V4SF_INT:
29937 case V4SI_FTYPE_V4SI_V4SI_INT:
29938 case V4DF_FTYPE_V4DF_V4DF_INT:
29939 case V4DF_FTYPE_V4DF_V2DF_INT:
29940 case V4SF_FTYPE_V4SF_V4SF_INT:
29941 case V2DI_FTYPE_V2DI_V2DI_INT:
29942 case V4DI_FTYPE_V4DI_V2DI_INT:
29943 case V2DF_FTYPE_V2DF_V2DF_INT:
29944 nargs = 3;
29945 nargs_constant = 1;
29946 break;
29947 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29948 nargs = 3;
29949 rmode = V4DImode;
29950 nargs_constant = 1;
29951 break;
29952 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29953 nargs = 3;
29954 rmode = V2DImode;
29955 nargs_constant = 1;
29956 break;
29957 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29958 nargs = 3;
29959 rmode = DImode;
29960 nargs_constant = 1;
29961 break;
29962 case V2DI_FTYPE_V2DI_UINT_UINT:
29963 nargs = 3;
29964 nargs_constant = 2;
29965 break;
29966 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29967 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29968 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29969 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29970 nargs = 4;
29971 nargs_constant = 1;
29972 break;
29973 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29974 nargs = 4;
29975 nargs_constant = 2;
29976 break;
29977 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
29978 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
29979 nargs = 4;
29980 break;
29981 default:
29982 gcc_unreachable ();
29983 }
29984
29985 gcc_assert (nargs <= ARRAY_SIZE (args));
29986
29987 if (comparison != UNKNOWN)
29988 {
29989 gcc_assert (nargs == 2);
29990 return ix86_expand_sse_compare (d, exp, target, swap);
29991 }
29992
29993 if (rmode == VOIDmode || rmode == tmode)
29994 {
29995 if (optimize
29996 || target == 0
29997 || GET_MODE (target) != tmode
29998 || !insn_p->operand[0].predicate (target, tmode))
29999 target = gen_reg_rtx (tmode);
30000 real_target = target;
30001 }
30002 else
30003 {
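/* For the *_CONVERT cases the insn computes in TMODE while the builtin's
   result type has RMODE; give the insn a TMODE subreg of an RMODE pseudo
   and return the RMODE value.  */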
30004 target = gen_reg_rtx (rmode);
30005 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
30006 }
30007
30008 for (i = 0; i < nargs; i++)
30009 {
30010 tree arg = CALL_EXPR_ARG (exp, i);
30011 rtx op = expand_normal (arg);
30012 enum machine_mode mode = insn_p->operand[i + 1].mode;
30013 bool match = insn_p->operand[i + 1].predicate (op, mode);
30014
30015 if (last_arg_count && (i + 1) == nargs)
30016 {
30017 /* SIMD shift insns take either an 8-bit immediate or
30018 register as count. But builtin functions take int as
30019 count. If count doesn't match, we put it in register. */
30020 if (!match)
30021 {
30022 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
30023 if (!insn_p->operand[i + 1].predicate (op, mode))
30024 op = copy_to_reg (op);
30025 }
30026 }
30027 else if ((nargs - i) <= nargs_constant)
30028 {
30029 if (!match)
30030 switch (icode)
30031 {
30032 case CODE_FOR_avx2_inserti128:
30033 case CODE_FOR_avx2_extracti128:
30034 error ("the last argument must be a 1-bit immediate");
30035 return const0_rtx;
30036
30037 case CODE_FOR_sse4_1_roundsd:
30038 case CODE_FOR_sse4_1_roundss:
30039
30040 case CODE_FOR_sse4_1_roundpd:
30041 case CODE_FOR_sse4_1_roundps:
30042 case CODE_FOR_avx_roundpd256:
30043 case CODE_FOR_avx_roundps256:
30044
30045 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
30046 case CODE_FOR_sse4_1_roundps_sfix:
30047 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
30048 case CODE_FOR_avx_roundps_sfix256:
30049
30050 case CODE_FOR_sse4_1_blendps:
30051 case CODE_FOR_avx_blendpd256:
30052 case CODE_FOR_avx_vpermilv4df:
30053 error ("the last argument must be a 4-bit immediate");
30054 return const0_rtx;
30055
30056 case CODE_FOR_sse4_1_blendpd:
30057 case CODE_FOR_avx_vpermilv2df:
30058 case CODE_FOR_xop_vpermil2v2df3:
30059 case CODE_FOR_xop_vpermil2v4sf3:
30060 case CODE_FOR_xop_vpermil2v4df3:
30061 case CODE_FOR_xop_vpermil2v8sf3:
30062 error ("the last argument must be a 2-bit immediate");
30063 return const0_rtx;
30064
30065 case CODE_FOR_avx_vextractf128v4df:
30066 case CODE_FOR_avx_vextractf128v8sf:
30067 case CODE_FOR_avx_vextractf128v8si:
30068 case CODE_FOR_avx_vinsertf128v4df:
30069 case CODE_FOR_avx_vinsertf128v8sf:
30070 case CODE_FOR_avx_vinsertf128v8si:
30071 error ("the last argument must be a 1-bit immediate");
30072 return const0_rtx;
30073
30074 case CODE_FOR_avx_vmcmpv2df3:
30075 case CODE_FOR_avx_vmcmpv4sf3:
30076 case CODE_FOR_avx_cmpv2df3:
30077 case CODE_FOR_avx_cmpv4sf3:
30078 case CODE_FOR_avx_cmpv4df3:
30079 case CODE_FOR_avx_cmpv8sf3:
30080 error ("the last argument must be a 5-bit immediate");
30081 return const0_rtx;
30082
30083 default:
30084 switch (nargs_constant)
30085 {
30086 case 2:
30087 if ((nargs - i) == nargs_constant)
30088 {
30089 error ("the next to last argument must be an 8-bit immediate");
30090 break;
30091 }
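/* FALLTHRU */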
30092 case 1:
30093 error ("the last argument must be an 8-bit immediate");
30094 break;
30095 default:
30096 gcc_unreachable ();
30097 }
30098 return const0_rtx;
30099 }
30100 }
30101 else
30102 {
30103 if (VECTOR_MODE_P (mode))
30104 op = safe_vector_operand (op, mode);
30105
30106 /* If we aren't optimizing, only allow one memory operand to
30107 be generated. */
30108 if (memory_operand (op, mode))
30109 num_memory++;
30110
30111 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
30112 {
30113 if (optimize || !match || num_memory > 1)
30114 op = copy_to_mode_reg (mode, op);
30115 }
30116 else
30117 {
30118 op = copy_to_reg (op);
30119 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
30120 }
30121 }
30122
30123 args[i].op = op;
30124 args[i].mode = mode;
30125 }
30126
30127 switch (nargs)
30128 {
30129 case 1:
30130 pat = GEN_FCN (icode) (real_target, args[0].op);
30131 break;
30132 case 2:
30133 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
30134 break;
30135 case 3:
30136 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30137 args[2].op);
30138 break;
30139 case 4:
30140 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
30141 args[2].op, args[3].op);
30142 break;
30143 default:
30144 gcc_unreachable ();
30145 }
30146
30147 if (! pat)
30148 return 0;
30149
30150 emit_insn (pat);
30151 return target;
30152 }
30153
30154 /* Subroutine of ix86_expand_builtin to take care of special insns
30155 with variable number of operands. */
30156
30157 static rtx
30158 ix86_expand_special_args_builtin (const struct builtin_description *d,
30159 tree exp, rtx target)
30160 {
30161 tree arg;
30162 rtx pat, op;
30163 unsigned int i, nargs, arg_adjust, memory;
30164 struct
30165 {
30166 rtx op;
30167 enum machine_mode mode;
30168 } args[3];
30169 enum insn_code icode = d->icode;
30170 bool last_arg_constant = false;
30171 const struct insn_data_d *insn_p = &insn_data[icode];
30172 enum machine_mode tmode = insn_p->operand[0].mode;
30173 enum { load, store } klass;
30174
30175 switch ((enum ix86_builtin_func_type) d->flag)
30176 {
30177 case VOID_FTYPE_VOID:
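/* The avx_vzeroupper pattern carries an operand recording how it was
   generated; GEN_INT (vzeroupper_intrinsic) marks this one as coming
   from the intrinsic.  */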
30178 if (icode == CODE_FOR_avx_vzeroupper)
30179 target = GEN_INT (vzeroupper_intrinsic);
30180 emit_insn (GEN_FCN (icode) (target));
30181 return 0;
30182 case VOID_FTYPE_UINT64:
30183 case VOID_FTYPE_UNSIGNED:
30184 nargs = 0;
30185 klass = store;
30186 memory = 0;
30187 break;
30188
30189 case INT_FTYPE_VOID:
30190 case UINT64_FTYPE_VOID:
30191 case UNSIGNED_FTYPE_VOID:
30192 nargs = 0;
30193 klass = load;
30194 memory = 0;
30195 break;
30196 case UINT64_FTYPE_PUNSIGNED:
30197 case V2DI_FTYPE_PV2DI:
30198 case V4DI_FTYPE_PV4DI:
30199 case V32QI_FTYPE_PCCHAR:
30200 case V16QI_FTYPE_PCCHAR:
30201 case V8SF_FTYPE_PCV4SF:
30202 case V8SF_FTYPE_PCFLOAT:
30203 case V4SF_FTYPE_PCFLOAT:
30204 case V4DF_FTYPE_PCV2DF:
30205 case V4DF_FTYPE_PCDOUBLE:
30206 case V2DF_FTYPE_PCDOUBLE:
30207 case VOID_FTYPE_PVOID:
30208 nargs = 1;
30209 klass = load;
30210 memory = 0;
30211 break;
30212 case VOID_FTYPE_PV2SF_V4SF:
30213 case VOID_FTYPE_PV4DI_V4DI:
30214 case VOID_FTYPE_PV2DI_V2DI:
30215 case VOID_FTYPE_PCHAR_V32QI:
30216 case VOID_FTYPE_PCHAR_V16QI:
30217 case VOID_FTYPE_PFLOAT_V8SF:
30218 case VOID_FTYPE_PFLOAT_V4SF:
30219 case VOID_FTYPE_PDOUBLE_V4DF:
30220 case VOID_FTYPE_PDOUBLE_V2DF:
30221 case VOID_FTYPE_PLONGLONG_LONGLONG:
30222 case VOID_FTYPE_PULONGLONG_ULONGLONG:
30223 case VOID_FTYPE_PINT_INT:
30224 nargs = 1;
30225 klass = store;
30226 /* Reserve memory operand for target. */
30227 memory = ARRAY_SIZE (args);
30228 break;
30229 case V4SF_FTYPE_V4SF_PCV2SF:
30230 case V2DF_FTYPE_V2DF_PCDOUBLE:
30231 nargs = 2;
30232 klass = load;
30233 memory = 1;
30234 break;
30235 case V8SF_FTYPE_PCV8SF_V8SI:
30236 case V4DF_FTYPE_PCV4DF_V4DI:
30237 case V4SF_FTYPE_PCV4SF_V4SI:
30238 case V2DF_FTYPE_PCV2DF_V2DI:
30239 case V8SI_FTYPE_PCV8SI_V8SI:
30240 case V4DI_FTYPE_PCV4DI_V4DI:
30241 case V4SI_FTYPE_PCV4SI_V4SI:
30242 case V2DI_FTYPE_PCV2DI_V2DI:
30243 nargs = 2;
30244 klass = load;
30245 memory = 0;
30246 break;
30247 case VOID_FTYPE_PV8SF_V8SI_V8SF:
30248 case VOID_FTYPE_PV4DF_V4DI_V4DF:
30249 case VOID_FTYPE_PV4SF_V4SI_V4SF:
30250 case VOID_FTYPE_PV2DF_V2DI_V2DF:
30251 case VOID_FTYPE_PV8SI_V8SI_V8SI:
30252 case VOID_FTYPE_PV4DI_V4DI_V4DI:
30253 case VOID_FTYPE_PV4SI_V4SI_V4SI:
30254 case VOID_FTYPE_PV2DI_V2DI_V2DI:
30255 nargs = 2;
30256 klass = store;
30257 /* Reserve memory operand for target. */
30258 memory = ARRAY_SIZE (args);
30259 break;
30260 case VOID_FTYPE_UINT_UINT_UINT:
30261 case VOID_FTYPE_UINT64_UINT_UINT:
30262 case UCHAR_FTYPE_UINT_UINT_UINT:
30263 case UCHAR_FTYPE_UINT64_UINT_UINT:
30264 nargs = 3;
30265 klass = load;
30266 memory = ARRAY_SIZE (args);
30267 last_arg_constant = true;
30268 break;
30269 default:
30270 gcc_unreachable ();
30271 }
30272
30273 gcc_assert (nargs <= ARRAY_SIZE (args));
30274
30275 if (klass == store)
30276 {
30277 arg = CALL_EXPR_ARG (exp, 0);
30278 op = expand_normal (arg);
30279 gcc_assert (target == 0);
30280 if (memory)
30281 {
30282 if (GET_MODE (op) != Pmode)
30283 op = convert_to_mode (Pmode, op, 1);
30284 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
30285 }
30286 else
30287 target = force_reg (tmode, op);
30288 arg_adjust = 1;
30289 }
30290 else
30291 {
30292 arg_adjust = 0;
30293 if (optimize
30294 || target == 0
30295 || !register_operand (target, tmode)
30296 || GET_MODE (target) != tmode)
30297 target = gen_reg_rtx (tmode);
30298 }
30299
30300 for (i = 0; i < nargs; i++)
30301 {
30302 enum machine_mode mode = insn_p->operand[i + 1].mode;
30303 bool match;
30304
30305 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
30306 op = expand_normal (arg);
30307 match = insn_p->operand[i + 1].predicate (op, mode);
30308
30309 if (last_arg_constant && (i + 1) == nargs)
30310 {
30311 if (!match)
30312 {
30313 if (icode == CODE_FOR_lwp_lwpvalsi3
30314 || icode == CODE_FOR_lwp_lwpinssi3
30315 || icode == CODE_FOR_lwp_lwpvaldi3
30316 || icode == CODE_FOR_lwp_lwpinsdi3)
30317 error ("the last argument must be a 32-bit immediate");
30318 else
30319 error ("the last argument must be an 8-bit immediate");
30320 return const0_rtx;
30321 }
30322 }
30323 else
30324 {
30325 if (i == memory)
30326 {
30327 /* This must be the memory operand. */
30328 if (GET_MODE (op) != Pmode)
30329 op = convert_to_mode (Pmode, op, 1);
30330 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
30331 gcc_assert (GET_MODE (op) == mode
30332 || GET_MODE (op) == VOIDmode);
30333 }
30334 else
30335 {
30336 /* This must be a register.  */
30337 if (VECTOR_MODE_P (mode))
30338 op = safe_vector_operand (op, mode);
30339
30340 gcc_assert (GET_MODE (op) == mode
30341 || GET_MODE (op) == VOIDmode);
30342 op = copy_to_mode_reg (mode, op);
30343 }
30344 }
30345
30346 args[i].op = op;
30347 args[i].mode = mode;
30348 }
30349
30350 switch (nargs)
30351 {
30352 case 0:
30353 pat = GEN_FCN (icode) (target);
30354 break;
30355 case 1:
30356 pat = GEN_FCN (icode) (target, args[0].op);
30357 break;
30358 case 2:
30359 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30360 break;
30361 case 3:
30362 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30363 break;
30364 default:
30365 gcc_unreachable ();
30366 }
30367
30368 if (! pat)
30369 return 0;
30370 emit_insn (pat);
30371 return klass == store ? 0 : target;
30372 }
30373
30374 /* Return the integer constant in ARG. Constrain it to be in the range
30375 of the subparts of VEC_TYPE; issue an error if not. */
30376
30377 static int
30378 get_element_number (tree vec_type, tree arg)
30379 {
30380 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
30381
30382 if (!host_integerp (arg, 1)
30383 || (elt = tree_low_cst (arg, 1), elt > max))
30384 {
30385 error ("selector must be an integer constant in the range 0..%wi", max);
30386 return 0;
30387 }
30388
30389 return elt;
30390 }
30391
30392 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30393 ix86_expand_vector_init. We DO have language-level syntax for this, in
30394 the form of (type){ init-list }. Except that since we can't place emms
30395 instructions from inside the compiler, we can't allow the use of MMX
30396 registers unless the user explicitly asks for it. So we do *not* define
30397 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
30398 we have builtins invoked by mmintrin.h that give us license to emit
30399 these sorts of instructions. */
30400
30401 static rtx
30402 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
30403 {
30404 enum machine_mode tmode = TYPE_MODE (type);
30405 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
30406 int i, n_elt = GET_MODE_NUNITS (tmode);
30407 rtvec v = rtvec_alloc (n_elt);
30408
30409 gcc_assert (VECTOR_MODE_P (tmode));
30410 gcc_assert (call_expr_nargs (exp) == n_elt);
30411
30412 for (i = 0; i < n_elt; ++i)
30413 {
30414 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
30415 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
30416 }
30417
30418 if (!target || !register_operand (target, tmode))
30419 target = gen_reg_rtx (tmode);
30420
30421 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
30422 return target;
30423 }
30424
30425 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30426 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
30427 had a language-level syntax for referencing vector elements. */
30428
30429 static rtx
30430 ix86_expand_vec_ext_builtin (tree exp, rtx target)
30431 {
30432 enum machine_mode tmode, mode0;
30433 tree arg0, arg1;
30434 int elt;
30435 rtx op0;
30436
30437 arg0 = CALL_EXPR_ARG (exp, 0);
30438 arg1 = CALL_EXPR_ARG (exp, 1);
30439
30440 op0 = expand_normal (arg0);
30441 elt = get_element_number (TREE_TYPE (arg0), arg1);
30442
30443 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30444 mode0 = TYPE_MODE (TREE_TYPE (arg0));
30445 gcc_assert (VECTOR_MODE_P (mode0));
30446
30447 op0 = force_reg (mode0, op0);
30448
30449 if (optimize || !target || !register_operand (target, tmode))
30450 target = gen_reg_rtx (tmode);
30451
30452 ix86_expand_vector_extract (true, target, op0, elt);
30453
30454 return target;
30455 }
30456
30457 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
30458 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
30459 a language-level syntax for referencing vector elements. */
30460
30461 static rtx
30462 ix86_expand_vec_set_builtin (tree exp)
30463 {
30464 enum machine_mode tmode, mode1;
30465 tree arg0, arg1, arg2;
30466 int elt;
30467 rtx op0, op1, target;
30468
30469 arg0 = CALL_EXPR_ARG (exp, 0);
30470 arg1 = CALL_EXPR_ARG (exp, 1);
30471 arg2 = CALL_EXPR_ARG (exp, 2);
30472
30473 tmode = TYPE_MODE (TREE_TYPE (arg0));
30474 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
30475 gcc_assert (VECTOR_MODE_P (tmode));
30476
30477 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
30478 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
30479 elt = get_element_number (TREE_TYPE (arg0), arg2);
30480
30481 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
30482 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
30483
30484 op0 = force_reg (tmode, op0);
30485 op1 = force_reg (mode1, op1);
30486
30487 /* OP0 is the source of these builtin functions and shouldn't be
30488 modified. Create a copy, use it and return it as target. */
30489 target = gen_reg_rtx (tmode);
30490 emit_move_insn (target, op0);
30491 ix86_expand_vector_set (true, target, op1, elt);
30492
30493 return target;
30494 }
30495
30496 /* Expand an expression EXP that calls a built-in function,
30497 with result going to TARGET if that's convenient
30498 (and in mode MODE if that's convenient).
30499 SUBTARGET may be used as the target for computing one of EXP's operands.
30500 IGNORE is nonzero if the value is to be ignored. */
30501
30502 static rtx
30503 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
30504 enum machine_mode mode ATTRIBUTE_UNUSED,
30505 int ignore ATTRIBUTE_UNUSED)
30506 {
30507 const struct builtin_description *d;
30508 size_t i;
30509 enum insn_code icode;
30510 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
30511 tree arg0, arg1, arg2, arg3, arg4;
30512 rtx op0, op1, op2, op3, op4, pat, insn;
30513 enum machine_mode mode0, mode1, mode2, mode3, mode4;
30514 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
30515
30516 /* For CPU builtins that can be folded, fold first and expand the fold. */
30517 switch (fcode)
30518 {
30519 case IX86_BUILTIN_CPU_INIT:
30520 {
30521 /* Make it call __cpu_indicator_init in libgcc. */
30522 tree call_expr, fndecl, type;
30523 type = build_function_type_list (integer_type_node, NULL_TREE);
30524 fndecl = build_fn_decl ("__cpu_indicator_init", type);
30525 call_expr = build_call_expr (fndecl, 0);
30526 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
30527 }
30528 case IX86_BUILTIN_CPU_IS:
30529 case IX86_BUILTIN_CPU_SUPPORTS:
30530 {
30531 tree arg0 = CALL_EXPR_ARG (exp, 0);
30532 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
30533 gcc_assert (fold_expr != NULL_TREE);
30534 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
30535 }
30536 }
30537
30538 /* Determine whether the builtin function is available under the current ISA.
30539 Originally the builtin was not created if it wasn't applicable to the
30540 current ISA based on the command line switches. With function specific
30541 options, we need to check in the context of the function making the call
30542 whether it is supported. */
30543 if (ix86_builtins_isa[fcode].isa
30544 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
30545 {
30546 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
30547 NULL, (enum fpmath_unit) 0, false);
30548
30549 if (!opts)
30550 error ("%qE needs unknown isa option", fndecl);
30551 else
30552 {
30553 gcc_assert (opts != NULL);
30554 error ("%qE needs isa option %s", fndecl, opts);
30555 free (opts);
30556 }
30557 return const0_rtx;
30558 }
30559
30560 switch (fcode)
30561 {
30562 case IX86_BUILTIN_MASKMOVQ:
30563 case IX86_BUILTIN_MASKMOVDQU:
30564 icode = (fcode == IX86_BUILTIN_MASKMOVQ
30565 ? CODE_FOR_mmx_maskmovq
30566 : CODE_FOR_sse2_maskmovdqu);
30567 /* Note the arg order is different from the operand order. */
30568 arg1 = CALL_EXPR_ARG (exp, 0);
30569 arg2 = CALL_EXPR_ARG (exp, 1);
30570 arg0 = CALL_EXPR_ARG (exp, 2);
30571 op0 = expand_normal (arg0);
30572 op1 = expand_normal (arg1);
30573 op2 = expand_normal (arg2);
30574 mode0 = insn_data[icode].operand[0].mode;
30575 mode1 = insn_data[icode].operand[1].mode;
30576 mode2 = insn_data[icode].operand[2].mode;
30577
30578 if (GET_MODE (op0) != Pmode)
30579 op0 = convert_to_mode (Pmode, op0, 1);
30580 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
30581
30582 if (!insn_data[icode].operand[0].predicate (op0, mode0))
30583 op0 = copy_to_mode_reg (mode0, op0);
30584 if (!insn_data[icode].operand[1].predicate (op1, mode1))
30585 op1 = copy_to_mode_reg (mode1, op1);
30586 if (!insn_data[icode].operand[2].predicate (op2, mode2))
30587 op2 = copy_to_mode_reg (mode2, op2);
30588 pat = GEN_FCN (icode) (op0, op1, op2);
30589 if (! pat)
30590 return 0;
30591 emit_insn (pat);
30592 return 0;
30593
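/* ldmxcsr and stmxcsr operate on a 32-bit memory operand, so go through a
   stack temporary.  */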
30594 case IX86_BUILTIN_LDMXCSR:
30595 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
30596 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30597 emit_move_insn (target, op0);
30598 emit_insn (gen_sse_ldmxcsr (target));
30599 return 0;
30600
30601 case IX86_BUILTIN_STMXCSR:
30602 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
30603 emit_insn (gen_sse_stmxcsr (target));
30604 return copy_to_mode_reg (SImode, target);
30605
30606 case IX86_BUILTIN_CLFLUSH:
30607 arg0 = CALL_EXPR_ARG (exp, 0);
30608 op0 = expand_normal (arg0);
30609 icode = CODE_FOR_sse2_clflush;
30610 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30611 {
30612 if (GET_MODE (op0) != Pmode)
30613 op0 = convert_to_mode (Pmode, op0, 1);
30614 op0 = force_reg (Pmode, op0);
30615 }
30616
30617 emit_insn (gen_sse2_clflush (op0));
30618 return 0;
30619
30620 case IX86_BUILTIN_MONITOR:
30621 arg0 = CALL_EXPR_ARG (exp, 0);
30622 arg1 = CALL_EXPR_ARG (exp, 1);
30623 arg2 = CALL_EXPR_ARG (exp, 2);
30624 op0 = expand_normal (arg0);
30625 op1 = expand_normal (arg1);
30626 op2 = expand_normal (arg2);
30627 if (!REG_P (op0))
30628 {
30629 if (GET_MODE (op0) != Pmode)
30630 op0 = convert_to_mode (Pmode, op0, 1);
30631 op0 = force_reg (Pmode, op0);
30632 }
30633 if (!REG_P (op1))
30634 op1 = copy_to_mode_reg (SImode, op1);
30635 if (!REG_P (op2))
30636 op2 = copy_to_mode_reg (SImode, op2);
30637 emit_insn (ix86_gen_monitor (op0, op1, op2));
30638 return 0;
30639
30640 case IX86_BUILTIN_MWAIT:
30641 arg0 = CALL_EXPR_ARG (exp, 0);
30642 arg1 = CALL_EXPR_ARG (exp, 1);
30643 op0 = expand_normal (arg0);
30644 op1 = expand_normal (arg1);
30645 if (!REG_P (op0))
30646 op0 = copy_to_mode_reg (SImode, op0);
30647 if (!REG_P (op1))
30648 op1 = copy_to_mode_reg (SImode, op1);
30649 emit_insn (gen_sse3_mwait (op0, op1));
30650 return 0;
30651
30652 case IX86_BUILTIN_VEC_INIT_V2SI:
30653 case IX86_BUILTIN_VEC_INIT_V4HI:
30654 case IX86_BUILTIN_VEC_INIT_V8QI:
30655 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
30656
30657 case IX86_BUILTIN_VEC_EXT_V2DF:
30658 case IX86_BUILTIN_VEC_EXT_V2DI:
30659 case IX86_BUILTIN_VEC_EXT_V4SF:
30660 case IX86_BUILTIN_VEC_EXT_V4SI:
30661 case IX86_BUILTIN_VEC_EXT_V8HI:
30662 case IX86_BUILTIN_VEC_EXT_V2SI:
30663 case IX86_BUILTIN_VEC_EXT_V4HI:
30664 case IX86_BUILTIN_VEC_EXT_V16QI:
30665 return ix86_expand_vec_ext_builtin (exp, target);
30666
30667 case IX86_BUILTIN_VEC_SET_V2DI:
30668 case IX86_BUILTIN_VEC_SET_V4SF:
30669 case IX86_BUILTIN_VEC_SET_V4SI:
30670 case IX86_BUILTIN_VEC_SET_V8HI:
30671 case IX86_BUILTIN_VEC_SET_V4HI:
30672 case IX86_BUILTIN_VEC_SET_V16QI:
30673 return ix86_expand_vec_set_builtin (exp);
30674
30675 case IX86_BUILTIN_INFQ:
30676 case IX86_BUILTIN_HUGE_VALQ:
30677 {
30678 REAL_VALUE_TYPE inf;
30679 rtx tmp;
30680
30681 real_inf (&inf);
30682 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
30683
30684 tmp = validize_mem (force_const_mem (mode, tmp));
30685
30686 if (target == 0)
30687 target = gen_reg_rtx (mode);
30688
30689 emit_move_insn (target, tmp);
30690 return target;
30691 }
30692
30693 case IX86_BUILTIN_RDPMC:
30694 case IX86_BUILTIN_RDTSC:
30695 case IX86_BUILTIN_RDTSCP:
30696
30697 op0 = gen_reg_rtx (DImode);
30698 op1 = gen_reg_rtx (DImode);
30699
30700 if (fcode == IX86_BUILTIN_RDPMC)
30701 {
30702 arg0 = CALL_EXPR_ARG (exp, 0);
30703 op2 = expand_normal (arg0);
30704 if (!register_operand (op2, SImode))
30705 op2 = copy_to_mode_reg (SImode, op2);
30706
30707 insn = (TARGET_64BIT
30708 ? gen_rdpmc_rex64 (op0, op1, op2)
30709 : gen_rdpmc (op0, op2));
30710 emit_insn (insn);
30711 }
30712 else if (fcode == IX86_BUILTIN_RDTSC)
30713 {
30714 insn = (TARGET_64BIT
30715 ? gen_rdtsc_rex64 (op0, op1)
30716 : gen_rdtsc (op0));
30717 emit_insn (insn);
30718 }
30719 else
30720 {
30721 op2 = gen_reg_rtx (SImode);
30722
30723 insn = (TARGET_64BIT
30724 ? gen_rdtscp_rex64 (op0, op1, op2)
30725 : gen_rdtscp (op0, op2));
30726 emit_insn (insn);
30727
30728 arg0 = CALL_EXPR_ARG (exp, 0);
30729 op4 = expand_normal (arg0);
30730 if (!address_operand (op4, VOIDmode))
30731 {
30732 op4 = convert_memory_address (Pmode, op4);
30733 op4 = copy_addr_to_reg (op4);
30734 }
30735 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
30736 }
30737
30738 if (target == 0)
30739 target = gen_reg_rtx (mode);
30740
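/* On 64-bit targets the value comes back split across two DImode registers
   (low and high 32 bits); combine them into a single value.  */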
30741 if (TARGET_64BIT)
30742 {
30743 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
30744 op1, 1, OPTAB_DIRECT);
30745 op0 = expand_simple_binop (DImode, IOR, op0, op1,
30746 op0, 1, OPTAB_DIRECT);
30747 }
30748
30749 emit_move_insn (target, op0);
30750 return target;
30751
30752 case IX86_BUILTIN_FXSAVE:
30753 case IX86_BUILTIN_FXRSTOR:
30754 case IX86_BUILTIN_FXSAVE64:
30755 case IX86_BUILTIN_FXRSTOR64:
30756 switch (fcode)
30757 {
30758 case IX86_BUILTIN_FXSAVE:
30759 icode = CODE_FOR_fxsave;
30760 break;
30761 case IX86_BUILTIN_FXRSTOR:
30762 icode = CODE_FOR_fxrstor;
30763 break;
30764 case IX86_BUILTIN_FXSAVE64:
30765 icode = CODE_FOR_fxsave64;
30766 break;
30767 case IX86_BUILTIN_FXRSTOR64:
30768 icode = CODE_FOR_fxrstor64;
30769 break;
30770 default:
30771 gcc_unreachable ();
30772 }
30773
30774 arg0 = CALL_EXPR_ARG (exp, 0);
30775 op0 = expand_normal (arg0);
30776
30777 if (!address_operand (op0, VOIDmode))
30778 {
30779 op0 = convert_memory_address (Pmode, op0);
30780 op0 = copy_addr_to_reg (op0);
30781 }
30782 op0 = gen_rtx_MEM (BLKmode, op0);
30783
30784 pat = GEN_FCN (icode) (op0);
30785 if (pat)
30786 emit_insn (pat);
30787 return 0;
30788
30789 case IX86_BUILTIN_XSAVE:
30790 case IX86_BUILTIN_XRSTOR:
30791 case IX86_BUILTIN_XSAVE64:
30792 case IX86_BUILTIN_XRSTOR64:
30793 case IX86_BUILTIN_XSAVEOPT:
30794 case IX86_BUILTIN_XSAVEOPT64:
30795 arg0 = CALL_EXPR_ARG (exp, 0);
30796 arg1 = CALL_EXPR_ARG (exp, 1);
30797 op0 = expand_normal (arg0);
30798 op1 = expand_normal (arg1);
30799
30800 if (!address_operand (op0, VOIDmode))
30801 {
30802 op0 = convert_memory_address (Pmode, op0);
30803 op0 = copy_addr_to_reg (op0);
30804 }
30805 op0 = gen_rtx_MEM (BLKmode, op0);
30806
30807 op1 = force_reg (DImode, op1);
30808
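/* xsave/xrstor take the feature mask in EDX:EAX, so on 64-bit targets the
   64-bit mask is split into two 32-bit halves below.  */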
30809 if (TARGET_64BIT)
30810 {
30811 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
30812 NULL, 1, OPTAB_DIRECT);
30813 switch (fcode)
30814 {
30815 case IX86_BUILTIN_XSAVE:
30816 icode = CODE_FOR_xsave_rex64;
30817 break;
30818 case IX86_BUILTIN_XRSTOR:
30819 icode = CODE_FOR_xrstor_rex64;
30820 break;
30821 case IX86_BUILTIN_XSAVE64:
30822 icode = CODE_FOR_xsave64;
30823 break;
30824 case IX86_BUILTIN_XRSTOR64:
30825 icode = CODE_FOR_xrstor64;
30826 break;
30827 case IX86_BUILTIN_XSAVEOPT:
30828 icode = CODE_FOR_xsaveopt_rex64;
30829 break;
30830 case IX86_BUILTIN_XSAVEOPT64:
30831 icode = CODE_FOR_xsaveopt64;
30832 break;
30833 default:
30834 gcc_unreachable ();
30835 }
30836
30837 op2 = gen_lowpart (SImode, op2);
30838 op1 = gen_lowpart (SImode, op1);
30839 pat = GEN_FCN (icode) (op0, op1, op2);
30840 }
30841 else
30842 {
30843 switch (fcode)
30844 {
30845 case IX86_BUILTIN_XSAVE:
30846 icode = CODE_FOR_xsave;
30847 break;
30848 case IX86_BUILTIN_XRSTOR:
30849 icode = CODE_FOR_xrstor;
30850 break;
30851 case IX86_BUILTIN_XSAVEOPT:
30852 icode = CODE_FOR_xsaveopt;
30853 break;
30854 default:
30855 gcc_unreachable ();
30856 }
30857 pat = GEN_FCN (icode) (op0, op1);
30858 }
30859
30860 if (pat)
30861 emit_insn (pat);
30862 return 0;
30863
30864 case IX86_BUILTIN_LLWPCB:
30865 arg0 = CALL_EXPR_ARG (exp, 0);
30866 op0 = expand_normal (arg0);
30867 icode = CODE_FOR_lwp_llwpcb;
30868 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
30869 {
30870 if (GET_MODE (op0) != Pmode)
30871 op0 = convert_to_mode (Pmode, op0, 1);
30872 op0 = force_reg (Pmode, op0);
30873 }
30874 emit_insn (gen_lwp_llwpcb (op0));
30875 return 0;
30876
30877 case IX86_BUILTIN_SLWPCB:
30878 icode = CODE_FOR_lwp_slwpcb;
30879 if (!target
30880 || !insn_data[icode].operand[0].predicate (target, Pmode))
30881 target = gen_reg_rtx (Pmode);
30882 emit_insn (gen_lwp_slwpcb (target));
30883 return target;
30884
30885 case IX86_BUILTIN_BEXTRI32:
30886 case IX86_BUILTIN_BEXTRI64:
30887 arg0 = CALL_EXPR_ARG (exp, 0);
30888 arg1 = CALL_EXPR_ARG (exp, 1);
30889 op0 = expand_normal (arg0);
30890 op1 = expand_normal (arg1);
30891 icode = (fcode == IX86_BUILTIN_BEXTRI32
30892 ? CODE_FOR_tbm_bextri_si
30893 : CODE_FOR_tbm_bextri_di);
30894 if (!CONST_INT_P (op1))
30895 {
30896 error ("last argument must be an immediate");
30897 return const0_rtx;
30898 }
30899 else
30900 {
30901 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
30902 unsigned char lsb_index = INTVAL (op1) & 0xFF;
30903 op1 = GEN_INT (length);
30904 op2 = GEN_INT (lsb_index);
30905 pat = GEN_FCN (icode) (target, op0, op1, op2);
30906 if (pat)
30907 emit_insn (pat);
30908 return target;
30909 }
30910
30911 case IX86_BUILTIN_RDRAND16_STEP:
30912 icode = CODE_FOR_rdrandhi_1;
30913 mode0 = HImode;
30914 goto rdrand_step;
30915
30916 case IX86_BUILTIN_RDRAND32_STEP:
30917 icode = CODE_FOR_rdrandsi_1;
30918 mode0 = SImode;
30919 goto rdrand_step;
30920
30921 case IX86_BUILTIN_RDRAND64_STEP:
30922 icode = CODE_FOR_rdranddi_1;
30923 mode0 = DImode;
30924
30925 rdrand_step:
30926 op0 = gen_reg_rtx (mode0);
30927 emit_insn (GEN_FCN (icode) (op0));
30928
30929 arg0 = CALL_EXPR_ARG (exp, 0);
30930 op1 = expand_normal (arg0);
30931 if (!address_operand (op1, VOIDmode))
30932 {
30933 op1 = convert_memory_address (Pmode, op1);
30934 op1 = copy_addr_to_reg (op1);
30935 }
30936 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30937
30938 op1 = gen_reg_rtx (SImode);
30939 emit_move_insn (op1, CONST1_RTX (SImode));
30940
30941 /* Emit SImode conditional move. */
30942 if (mode0 == HImode)
30943 {
30944 op2 = gen_reg_rtx (SImode);
30945 emit_insn (gen_zero_extendhisi2 (op2, op0));
30946 }
30947 else if (mode0 == SImode)
30948 op2 = op0;
30949 else
30950 op2 = gen_rtx_SUBREG (SImode, op0, 0);
30951
30952 if (target == 0)
30953 target = gen_reg_rtx (SImode);
30954
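/* GEU on CCCmode is true when the carry flag is clear, i.e. rdrand did not
   deliver a value; select the rdrand destination (which is left at zero on
   failure) in that case, and the constant 1 on success.  */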
30955 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
30956 const0_rtx);
30957 emit_insn (gen_rtx_SET (VOIDmode, target,
30958 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
30959 return target;
30960
30961 case IX86_BUILTIN_RDSEED16_STEP:
30962 icode = CODE_FOR_rdseedhi_1;
30963 mode0 = HImode;
30964 goto rdseed_step;
30965
30966 case IX86_BUILTIN_RDSEED32_STEP:
30967 icode = CODE_FOR_rdseedsi_1;
30968 mode0 = SImode;
30969 goto rdseed_step;
30970
30971 case IX86_BUILTIN_RDSEED64_STEP:
30972 icode = CODE_FOR_rdseeddi_1;
30973 mode0 = DImode;
30974
30975 rdseed_step:
30976 op0 = gen_reg_rtx (mode0);
30977 emit_insn (GEN_FCN (icode) (op0));
30978
30979 arg0 = CALL_EXPR_ARG (exp, 0);
30980 op1 = expand_normal (arg0);
30981 if (!address_operand (op1, VOIDmode))
30982 {
30983 op1 = convert_memory_address (Pmode, op1);
30984 op1 = copy_addr_to_reg (op1);
30985 }
30986 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
30987
30988 op2 = gen_reg_rtx (QImode);
30989
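/* LTU on CCCmode reads the carry flag, which rdseed sets exactly when a
   value was delivered; the zero-extended flag is the return value.  */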
30990 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
30991 const0_rtx);
30992 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
30993
30994 if (target == 0)
30995 target = gen_reg_rtx (SImode);
30996
30997 emit_insn (gen_zero_extendqisi2 (target, op2));
30998 return target;
30999
31000 case IX86_BUILTIN_ADDCARRYX32:
31001 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31002 mode0 = SImode;
31003 goto addcarryx;
31004
31005 case IX86_BUILTIN_ADDCARRYX64:
31006 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31007 mode0 = DImode;
31008
31009 addcarryx:
31010 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31011 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31012 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31013 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31014
31015 op0 = gen_reg_rtx (QImode);
31016
31017 /* Generate CF from input operand. */
31018 op1 = expand_normal (arg0);
31019 if (GET_MODE (op1) != QImode)
31020 op1 = convert_to_mode (QImode, op1, 1);
31021 op1 = copy_to_mode_reg (QImode, op1);
31022 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
31023
31024 /* Generate the ADCX (or plain add-with-carry) instruction to compute X + Y + CF. */
31025 op2 = expand_normal (arg1);
31026 op3 = expand_normal (arg2);
31027
31028 if (!REG_P (op2))
31029 op2 = copy_to_mode_reg (mode0, op2);
31030 if (!REG_P (op3))
31031 op3 = copy_to_mode_reg (mode0, op3);
31032
31033 op0 = gen_reg_rtx (mode0);
31034
31035 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
31036 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
31037 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
31038
31039 /* Store the result. */
31040 op4 = expand_normal (arg3);
31041 if (!address_operand (op4, VOIDmode))
31042 {
31043 op4 = convert_memory_address (Pmode, op4);
31044 op4 = copy_addr_to_reg (op4);
31045 }
31046 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
31047
31048 /* Return current CF value. */
31049 if (target == 0)
31050 target = gen_reg_rtx (QImode);
31051
31052 PUT_MODE (pat, QImode);
31053 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
31054 return target;
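/* A usage sketch (assuming the <immintrin.h>/<adxintrin.h> wrapper):

     unsigned int sum;
     unsigned char c_out = _addcarryx_u32 (c_in, a, b, &sum);

   The expansion materializes CF from C_IN, emits ADCX (or a plain
   add-with-carry when ADX is unavailable), stores the sum through the
   pointer argument and returns the resulting carry.  */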
31055
31056 case IX86_BUILTIN_GATHERSIV2DF:
31057 icode = CODE_FOR_avx2_gathersiv2df;
31058 goto gather_gen;
31059 case IX86_BUILTIN_GATHERSIV4DF:
31060 icode = CODE_FOR_avx2_gathersiv4df;
31061 goto gather_gen;
31062 case IX86_BUILTIN_GATHERDIV2DF:
31063 icode = CODE_FOR_avx2_gatherdiv2df;
31064 goto gather_gen;
31065 case IX86_BUILTIN_GATHERDIV4DF:
31066 icode = CODE_FOR_avx2_gatherdiv4df;
31067 goto gather_gen;
31068 case IX86_BUILTIN_GATHERSIV4SF:
31069 icode = CODE_FOR_avx2_gathersiv4sf;
31070 goto gather_gen;
31071 case IX86_BUILTIN_GATHERSIV8SF:
31072 icode = CODE_FOR_avx2_gathersiv8sf;
31073 goto gather_gen;
31074 case IX86_BUILTIN_GATHERDIV4SF:
31075 icode = CODE_FOR_avx2_gatherdiv4sf;
31076 goto gather_gen;
31077 case IX86_BUILTIN_GATHERDIV8SF:
31078 icode = CODE_FOR_avx2_gatherdiv8sf;
31079 goto gather_gen;
31080 case IX86_BUILTIN_GATHERSIV2DI:
31081 icode = CODE_FOR_avx2_gathersiv2di;
31082 goto gather_gen;
31083 case IX86_BUILTIN_GATHERSIV4DI:
31084 icode = CODE_FOR_avx2_gathersiv4di;
31085 goto gather_gen;
31086 case IX86_BUILTIN_GATHERDIV2DI:
31087 icode = CODE_FOR_avx2_gatherdiv2di;
31088 goto gather_gen;
31089 case IX86_BUILTIN_GATHERDIV4DI:
31090 icode = CODE_FOR_avx2_gatherdiv4di;
31091 goto gather_gen;
31092 case IX86_BUILTIN_GATHERSIV4SI:
31093 icode = CODE_FOR_avx2_gathersiv4si;
31094 goto gather_gen;
31095 case IX86_BUILTIN_GATHERSIV8SI:
31096 icode = CODE_FOR_avx2_gathersiv8si;
31097 goto gather_gen;
31098 case IX86_BUILTIN_GATHERDIV4SI:
31099 icode = CODE_FOR_avx2_gatherdiv4si;
31100 goto gather_gen;
31101 case IX86_BUILTIN_GATHERDIV8SI:
31102 icode = CODE_FOR_avx2_gatherdiv8si;
31103 goto gather_gen;
31104 case IX86_BUILTIN_GATHERALTSIV4DF:
31105 icode = CODE_FOR_avx2_gathersiv4df;
31106 goto gather_gen;
31107 case IX86_BUILTIN_GATHERALTDIV8SF:
31108 icode = CODE_FOR_avx2_gatherdiv8sf;
31109 goto gather_gen;
31110 case IX86_BUILTIN_GATHERALTSIV4DI:
31111 icode = CODE_FOR_avx2_gathersiv4di;
31112 goto gather_gen;
31113 case IX86_BUILTIN_GATHERALTDIV8SI:
31114 icode = CODE_FOR_avx2_gatherdiv8si;
31115 goto gather_gen;
31116
31117 gather_gen:
31118 arg0 = CALL_EXPR_ARG (exp, 0);
31119 arg1 = CALL_EXPR_ARG (exp, 1);
31120 arg2 = CALL_EXPR_ARG (exp, 2);
31121 arg3 = CALL_EXPR_ARG (exp, 3);
31122 arg4 = CALL_EXPR_ARG (exp, 4);
31123 op0 = expand_normal (arg0);
31124 op1 = expand_normal (arg1);
31125 op2 = expand_normal (arg2);
31126 op3 = expand_normal (arg3);
31127 op4 = expand_normal (arg4);
31128 /* Note the arg order is different from the operand order. */
31129 mode0 = insn_data[icode].operand[1].mode;
31130 mode2 = insn_data[icode].operand[3].mode;
31131 mode3 = insn_data[icode].operand[4].mode;
31132 mode4 = insn_data[icode].operand[5].mode;
31133
31134 if (target == NULL_RTX
31135 || GET_MODE (target) != insn_data[icode].operand[0].mode)
31136 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
31137 else
31138 subtarget = target;
31139
31140 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
31141 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
31142 {
31143 rtx half = gen_reg_rtx (V4SImode);
31144 if (!nonimmediate_operand (op2, V8SImode))
31145 op2 = copy_to_mode_reg (V8SImode, op2);
31146 emit_insn (gen_vec_extract_lo_v8si (half, op2));
31147 op2 = half;
31148 }
31149 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
31150 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
31151 {
31152 rtx (*gen) (rtx, rtx);
31153 rtx half = gen_reg_rtx (mode0);
31154 if (mode0 == V4SFmode)
31155 gen = gen_vec_extract_lo_v8sf;
31156 else
31157 gen = gen_vec_extract_lo_v8si;
31158 if (!nonimmediate_operand (op0, GET_MODE (op0)))
31159 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
31160 emit_insn (gen (half, op0));
31161 op0 = half;
31162 if (!nonimmediate_operand (op3, GET_MODE (op3)))
31163 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
31164 emit_insn (gen (half, op3));
31165 op3 = half;
31166 }
31167
31168 /* Force the memory operand to use only a base register here. We
31169 do not want to do this for the memory operands of other builtin
31170 functions. */
31171 if (GET_MODE (op1) != Pmode)
31172 op1 = convert_to_mode (Pmode, op1, 1);
31173 op1 = force_reg (Pmode, op1);
31174
31175 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31176 op0 = copy_to_mode_reg (mode0, op0);
31177 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
31178 op1 = copy_to_mode_reg (Pmode, op1);
31179 if (!insn_data[icode].operand[3].predicate (op2, mode2))
31180 op2 = copy_to_mode_reg (mode2, op2);
31181 if (!insn_data[icode].operand[4].predicate (op3, mode3))
31182 op3 = copy_to_mode_reg (mode3, op3);
31183 if (!insn_data[icode].operand[5].predicate (op4, mode4))
31184 {
31185 error ("last argument must be scale 1, 2, 4, 8");
31186 return const0_rtx;
31187 }
31188
31189 /* Optimize. If mask is known to have all high bits set,
31190 replace op0 with pc_rtx to signal that the instruction
31191 overwrites the whole destination and doesn't use its
31192 previous contents. */
31193 if (optimize)
31194 {
31195 if (TREE_CODE (arg3) == VECTOR_CST)
31196 {
31197 unsigned int negative = 0;
31198 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
31199 {
31200 tree cst = VECTOR_CST_ELT (arg3, i);
31201 if (TREE_CODE (cst) == INTEGER_CST
31202 && tree_int_cst_sign_bit (cst))
31203 negative++;
31204 else if (TREE_CODE (cst) == REAL_CST
31205 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
31206 negative++;
31207 }
31208 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
31209 op0 = pc_rtx;
31210 }
31211 else if (TREE_CODE (arg3) == SSA_NAME)
31212 {
31213 /* Recognize also when mask is like:
31214 __v2df src = _mm_setzero_pd ();
31215 __v2df mask = _mm_cmpeq_pd (src, src);
31216 or
31217 __v8sf src = _mm256_setzero_ps ();
31218 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
31219 as that is a cheaper way to load all ones into
31220 a register than having to load a constant from
31221 memory. */
31222 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
31223 if (is_gimple_call (def_stmt))
31224 {
31225 tree fndecl = gimple_call_fndecl (def_stmt);
31226 if (fndecl
31227 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31228 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
31229 {
31230 case IX86_BUILTIN_CMPPD:
31231 case IX86_BUILTIN_CMPPS:
31232 case IX86_BUILTIN_CMPPD256:
31233 case IX86_BUILTIN_CMPPS256:
31234 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
31235 break;
31236 /* FALLTHRU */
31237 case IX86_BUILTIN_CMPEQPD:
31238 case IX86_BUILTIN_CMPEQPS:
31239 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
31240 && initializer_zerop (gimple_call_arg (def_stmt,
31241 1)))
31242 op0 = pc_rtx;
31243 break;
31244 default:
31245 break;
31246 }
31247 }
31248 }
31249 }
31250
31251 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
31252 if (! pat)
31253 return const0_rtx;
31254 emit_insn (pat);
31255
31256 if (fcode == IX86_BUILTIN_GATHERDIV8SF
31257 || fcode == IX86_BUILTIN_GATHERDIV8SI)
31258 {
31259 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
31260 ? V4SFmode : V4SImode;
31261 if (target == NULL_RTX)
31262 target = gen_reg_rtx (tmode);
31263 if (tmode == V4SFmode)
31264 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
31265 else
31266 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
31267 }
31268 else
31269 target = subtarget;
31270
31271 return target;
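/* A usage sketch (assuming the AVX2 <immintrin.h> wrappers): a call
   such as

     __m256d v = _mm256_i32gather_pd (base, vindex, 8);

   reaches this code as IX86_BUILTIN_GATHERSIV4DF; when the mask is
   known to be all ones, the optimization above passes pc_rtx so the
   insn is treated as overwriting its whole destination.  */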
31272
31273 case IX86_BUILTIN_XABORT:
31274 icode = CODE_FOR_xabort;
31275 arg0 = CALL_EXPR_ARG (exp, 0);
31276 op0 = expand_normal (arg0);
31277 mode0 = insn_data[icode].operand[0].mode;
31278 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31279 {
31280 error ("the xabort's argument must be an 8-bit immediate");
31281 return const0_rtx;
31282 }
31283 emit_insn (gen_xabort (op0));
31284 return 0;
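/* A usage sketch (assuming the RTM <immintrin.h> wrapper):

     _xabort (0x42);

   aborts the active transaction with an arbitrary example status code;
   the 8-bit code must be a compile-time constant, hence the immediate
   check above.  */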
31285
31286 default:
31287 break;
31288 }
31289
31290 for (i = 0, d = bdesc_special_args;
31291 i < ARRAY_SIZE (bdesc_special_args);
31292 i++, d++)
31293 if (d->code == fcode)
31294 return ix86_expand_special_args_builtin (d, exp, target);
31295
31296 for (i = 0, d = bdesc_args;
31297 i < ARRAY_SIZE (bdesc_args);
31298 i++, d++)
31299 if (d->code == fcode)
31300 switch (fcode)
31301 {
31302 case IX86_BUILTIN_FABSQ:
31303 case IX86_BUILTIN_COPYSIGNQ:
31304 if (!TARGET_SSE)
31305 /* Emit a normal call if SSE isn't available. */
31306 return expand_call (exp, target, ignore);
31307 default:
31308 return ix86_expand_args_builtin (d, exp, target);
31309 }
31310
31311 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31312 if (d->code == fcode)
31313 return ix86_expand_sse_comi (d, exp, target);
31314
31315 for (i = 0, d = bdesc_pcmpestr;
31316 i < ARRAY_SIZE (bdesc_pcmpestr);
31317 i++, d++)
31318 if (d->code == fcode)
31319 return ix86_expand_sse_pcmpestr (d, exp, target);
31320
31321 for (i = 0, d = bdesc_pcmpistr;
31322 i < ARRAY_SIZE (bdesc_pcmpistr);
31323 i++, d++)
31324 if (d->code == fcode)
31325 return ix86_expand_sse_pcmpistr (d, exp, target);
31326
31327 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31328 if (d->code == fcode)
31329 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
31330 (enum ix86_builtin_func_type)
31331 d->flag, d->comparison);
31332
31333 gcc_unreachable ();
31334 }
31335
31336 /* Returns a function decl for a vectorized version of the builtin function
31337 FNDECL with result vector type TYPE_OUT and argument vector type TYPE_IN,
31338 or NULL_TREE if it is not available. */
31339
31340 static tree
31341 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
31342 tree type_in)
31343 {
31344 enum machine_mode in_mode, out_mode;
31345 int in_n, out_n;
31346 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
31347
31348 if (TREE_CODE (type_out) != VECTOR_TYPE
31349 || TREE_CODE (type_in) != VECTOR_TYPE
31350 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
31351 return NULL_TREE;
31352
31353 out_mode = TYPE_MODE (TREE_TYPE (type_out));
31354 out_n = TYPE_VECTOR_SUBPARTS (type_out);
31355 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31356 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31357
31358 switch (fn)
31359 {
31360 case BUILT_IN_SQRT:
31361 if (out_mode == DFmode && in_mode == DFmode)
31362 {
31363 if (out_n == 2 && in_n == 2)
31364 return ix86_builtins[IX86_BUILTIN_SQRTPD];
31365 else if (out_n == 4 && in_n == 4)
31366 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
31367 }
31368 break;
31369
31370 case BUILT_IN_SQRTF:
31371 if (out_mode == SFmode && in_mode == SFmode)
31372 {
31373 if (out_n == 4 && in_n == 4)
31374 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
31375 else if (out_n == 8 && in_n == 8)
31376 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
31377 }
31378 break;
31379
31380 case BUILT_IN_IFLOOR:
31381 case BUILT_IN_LFLOOR:
31382 case BUILT_IN_LLFLOOR:
31383 /* The round insn does not trap on denormals. */
31384 if (flag_trapping_math || !TARGET_ROUND)
31385 break;
31386
31387 if (out_mode == SImode && in_mode == DFmode)
31388 {
31389 if (out_n == 4 && in_n == 2)
31390 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
31391 else if (out_n == 8 && in_n == 4)
31392 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
31393 }
31394 break;
31395
31396 case BUILT_IN_IFLOORF:
31397 case BUILT_IN_LFLOORF:
31398 case BUILT_IN_LLFLOORF:
31399 /* The round insn does not trap on denormals. */
31400 if (flag_trapping_math || !TARGET_ROUND)
31401 break;
31402
31403 if (out_mode == SImode && in_mode == SFmode)
31404 {
31405 if (out_n == 4 && in_n == 4)
31406 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
31407 else if (out_n == 8 && in_n == 8)
31408 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
31409 }
31410 break;
31411
31412 case BUILT_IN_ICEIL:
31413 case BUILT_IN_LCEIL:
31414 case BUILT_IN_LLCEIL:
31415 /* The round insn does not trap on denormals. */
31416 if (flag_trapping_math || !TARGET_ROUND)
31417 break;
31418
31419 if (out_mode == SImode && in_mode == DFmode)
31420 {
31421 if (out_n == 4 && in_n == 2)
31422 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
31423 else if (out_n == 8 && in_n == 4)
31424 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
31425 }
31426 break;
31427
31428 case BUILT_IN_ICEILF:
31429 case BUILT_IN_LCEILF:
31430 case BUILT_IN_LLCEILF:
31431 /* The round insn does not trap on denormals. */
31432 if (flag_trapping_math || !TARGET_ROUND)
31433 break;
31434
31435 if (out_mode == SImode && in_mode == SFmode)
31436 {
31437 if (out_n == 4 && in_n == 4)
31438 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
31439 else if (out_n == 8 && in_n == 8)
31440 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
31441 }
31442 break;
31443
31444 case BUILT_IN_IRINT:
31445 case BUILT_IN_LRINT:
31446 case BUILT_IN_LLRINT:
31447 if (out_mode == SImode && in_mode == DFmode)
31448 {
31449 if (out_n == 4 && in_n == 2)
31450 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
31451 else if (out_n == 8 && in_n == 4)
31452 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
31453 }
31454 break;
31455
31456 case BUILT_IN_IRINTF:
31457 case BUILT_IN_LRINTF:
31458 case BUILT_IN_LLRINTF:
31459 if (out_mode == SImode && in_mode == SFmode)
31460 {
31461 if (out_n == 4 && in_n == 4)
31462 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
31463 else if (out_n == 8 && in_n == 8)
31464 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
31465 }
31466 break;
31467
31468 case BUILT_IN_IROUND:
31469 case BUILT_IN_LROUND:
31470 case BUILT_IN_LLROUND:
31471 /* The round insn does not trap on denormals. */
31472 if (flag_trapping_math || !TARGET_ROUND)
31473 break;
31474
31475 if (out_mode == SImode && in_mode == DFmode)
31476 {
31477 if (out_n == 4 && in_n == 2)
31478 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
31479 else if (out_n == 8 && in_n == 4)
31480 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
31481 }
31482 break;
31483
31484 case BUILT_IN_IROUNDF:
31485 case BUILT_IN_LROUNDF:
31486 case BUILT_IN_LLROUNDF:
31487 /* The round insn does not trap on denormals. */
31488 if (flag_trapping_math || !TARGET_ROUND)
31489 break;
31490
31491 if (out_mode == SImode && in_mode == SFmode)
31492 {
31493 if (out_n == 4 && in_n == 4)
31494 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
31495 else if (out_n == 8 && in_n == 8)
31496 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
31497 }
31498 break;
31499
31500 case BUILT_IN_COPYSIGN:
31501 if (out_mode == DFmode && in_mode == DFmode)
31502 {
31503 if (out_n == 2 && in_n == 2)
31504 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
31505 else if (out_n == 4 && in_n == 4)
31506 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
31507 }
31508 break;
31509
31510 case BUILT_IN_COPYSIGNF:
31511 if (out_mode == SFmode && in_mode == SFmode)
31512 {
31513 if (out_n == 4 && in_n == 4)
31514 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
31515 else if (out_n == 8 && in_n == 8)
31516 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
31517 }
31518 break;
31519
31520 case BUILT_IN_FLOOR:
31521 /* The round insn does not trap on denormals. */
31522 if (flag_trapping_math || !TARGET_ROUND)
31523 break;
31524
31525 if (out_mode == DFmode && in_mode == DFmode)
31526 {
31527 if (out_n == 2 && in_n == 2)
31528 return ix86_builtins[IX86_BUILTIN_FLOORPD];
31529 else if (out_n == 4 && in_n == 4)
31530 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
31531 }
31532 break;
31533
31534 case BUILT_IN_FLOORF:
31535 /* The round insn does not trap on denormals. */
31536 if (flag_trapping_math || !TARGET_ROUND)
31537 break;
31538
31539 if (out_mode == SFmode && in_mode == SFmode)
31540 {
31541 if (out_n == 4 && in_n == 4)
31542 return ix86_builtins[IX86_BUILTIN_FLOORPS];
31543 else if (out_n == 8 && in_n == 8)
31544 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
31545 }
31546 break;
31547
31548 case BUILT_IN_CEIL:
31549 /* The round insn does not trap on denormals. */
31550 if (flag_trapping_math || !TARGET_ROUND)
31551 break;
31552
31553 if (out_mode == DFmode && in_mode == DFmode)
31554 {
31555 if (out_n == 2 && in_n == 2)
31556 return ix86_builtins[IX86_BUILTIN_CEILPD];
31557 else if (out_n == 4 && in_n == 4)
31558 return ix86_builtins[IX86_BUILTIN_CEILPD256];
31559 }
31560 break;
31561
31562 case BUILT_IN_CEILF:
31563 /* The round insn does not trap on denormals. */
31564 if (flag_trapping_math || !TARGET_ROUND)
31565 break;
31566
31567 if (out_mode == SFmode && in_mode == SFmode)
31568 {
31569 if (out_n == 4 && in_n == 4)
31570 return ix86_builtins[IX86_BUILTIN_CEILPS];
31571 else if (out_n == 8 && in_n == 8)
31572 return ix86_builtins[IX86_BUILTIN_CEILPS256];
31573 }
31574 break;
31575
31576 case BUILT_IN_TRUNC:
31577 /* The round insn does not trap on denormals. */
31578 if (flag_trapping_math || !TARGET_ROUND)
31579 break;
31580
31581 if (out_mode == DFmode && in_mode == DFmode)
31582 {
31583 if (out_n == 2 && in_n == 2)
31584 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
31585 else if (out_n == 4 && in_n == 4)
31586 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
31587 }
31588 break;
31589
31590 case BUILT_IN_TRUNCF:
31591 /* The round insn does not trap on denormals. */
31592 if (flag_trapping_math || !TARGET_ROUND)
31593 break;
31594
31595 if (out_mode == SFmode && in_mode == SFmode)
31596 {
31597 if (out_n == 4 && in_n == 4)
31598 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
31599 else if (out_n == 8 && in_n == 8)
31600 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
31601 }
31602 break;
31603
31604 case BUILT_IN_RINT:
31605 /* The round insn does not trap on denormals. */
31606 if (flag_trapping_math || !TARGET_ROUND)
31607 break;
31608
31609 if (out_mode == DFmode && in_mode == DFmode)
31610 {
31611 if (out_n == 2 && in_n == 2)
31612 return ix86_builtins[IX86_BUILTIN_RINTPD];
31613 else if (out_n == 4 && in_n == 4)
31614 return ix86_builtins[IX86_BUILTIN_RINTPD256];
31615 }
31616 break;
31617
31618 case BUILT_IN_RINTF:
31619 /* The round insn does not trap on denormals. */
31620 if (flag_trapping_math || !TARGET_ROUND)
31621 break;
31622
31623 if (out_mode == SFmode && in_mode == SFmode)
31624 {
31625 if (out_n == 4 && in_n == 4)
31626 return ix86_builtins[IX86_BUILTIN_RINTPS];
31627 else if (out_n == 8 && in_n == 8)
31628 return ix86_builtins[IX86_BUILTIN_RINTPS256];
31629 }
31630 break;
31631
31632 case BUILT_IN_ROUND:
31633 /* The round insn does not trap on denormals. */
31634 if (flag_trapping_math || !TARGET_ROUND)
31635 break;
31636
31637 if (out_mode == DFmode && in_mode == DFmode)
31638 {
31639 if (out_n == 2 && in_n == 2)
31640 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
31641 else if (out_n == 4 && in_n == 4)
31642 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
31643 }
31644 break;
31645
31646 case BUILT_IN_ROUNDF:
31647 /* The round insn does not trap on denormals. */
31648 if (flag_trapping_math || !TARGET_ROUND)
31649 break;
31650
31651 if (out_mode == SFmode && in_mode == SFmode)
31652 {
31653 if (out_n == 4 && in_n == 4)
31654 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
31655 else if (out_n == 8 && in_n == 8)
31656 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
31657 }
31658 break;
31659
31660 case BUILT_IN_FMA:
31661 if (out_mode == DFmode && in_mode == DFmode)
31662 {
31663 if (out_n == 2 && in_n == 2)
31664 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
31665 if (out_n == 4 && in_n == 4)
31666 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
31667 }
31668 break;
31669
31670 case BUILT_IN_FMAF:
31671 if (out_mode == SFmode && in_mode == SFmode)
31672 {
31673 if (out_n == 4 && in_n == 4)
31674 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
31675 if (out_n == 8 && in_n == 8)
31676 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
31677 }
31678 break;
31679
31680 default:
31681 break;
31682 }
31683
31684 /* Dispatch to a handler for a vectorization library. */
31685 if (ix86_veclib_handler)
31686 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
31687 type_in);
31688
31689 return NULL_TREE;
31690 }
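/* A worked example (illustrative): with -mavx and -ffast-math, a loop
   such as

     for (int i = 0; i < n; i++)
       out[i] = __builtin_sqrt (in[i]);

   is vectorized with V4DF vector types for both TYPE_OUT and TYPE_IN,
   so the hook above returns the IX86_BUILTIN_SQRTPD256 decl and the
   loop body becomes a vsqrtpd.  */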
31691
31692 /* Handler for an SVML-style interface to
31693 a library with vectorized intrinsics. */
31694
31695 static tree
31696 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
31697 {
31698 char name[20];
31699 tree fntype, new_fndecl, args;
31700 unsigned arity;
31701 const char *bname;
31702 enum machine_mode el_mode, in_mode;
31703 int n, in_n;
31704
31705 /* The SVML library is suitable for unsafe math only. */
31706 if (!flag_unsafe_math_optimizations)
31707 return NULL_TREE;
31708
31709 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31710 n = TYPE_VECTOR_SUBPARTS (type_out);
31711 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31712 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31713 if (el_mode != in_mode
31714 || n != in_n)
31715 return NULL_TREE;
31716
31717 switch (fn)
31718 {
31719 case BUILT_IN_EXP:
31720 case BUILT_IN_LOG:
31721 case BUILT_IN_LOG10:
31722 case BUILT_IN_POW:
31723 case BUILT_IN_TANH:
31724 case BUILT_IN_TAN:
31725 case BUILT_IN_ATAN:
31726 case BUILT_IN_ATAN2:
31727 case BUILT_IN_ATANH:
31728 case BUILT_IN_CBRT:
31729 case BUILT_IN_SINH:
31730 case BUILT_IN_SIN:
31731 case BUILT_IN_ASINH:
31732 case BUILT_IN_ASIN:
31733 case BUILT_IN_COSH:
31734 case BUILT_IN_COS:
31735 case BUILT_IN_ACOSH:
31736 case BUILT_IN_ACOS:
31737 if (el_mode != DFmode || n != 2)
31738 return NULL_TREE;
31739 break;
31740
31741 case BUILT_IN_EXPF:
31742 case BUILT_IN_LOGF:
31743 case BUILT_IN_LOG10F:
31744 case BUILT_IN_POWF:
31745 case BUILT_IN_TANHF:
31746 case BUILT_IN_TANF:
31747 case BUILT_IN_ATANF:
31748 case BUILT_IN_ATAN2F:
31749 case BUILT_IN_ATANHF:
31750 case BUILT_IN_CBRTF:
31751 case BUILT_IN_SINHF:
31752 case BUILT_IN_SINF:
31753 case BUILT_IN_ASINHF:
31754 case BUILT_IN_ASINF:
31755 case BUILT_IN_COSHF:
31756 case BUILT_IN_COSF:
31757 case BUILT_IN_ACOSHF:
31758 case BUILT_IN_ACOSF:
31759 if (el_mode != SFmode || n != 4)
31760 return NULL_TREE;
31761 break;
31762
31763 default:
31764 return NULL_TREE;
31765 }
31766
31767 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31768
31769 if (fn == BUILT_IN_LOGF)
31770 strcpy (name, "vmlsLn4");
31771 else if (fn == BUILT_IN_LOG)
31772 strcpy (name, "vmldLn2");
31773 else if (n == 4)
31774 {
31775 sprintf (name, "vmls%s", bname+10);
31776 name[strlen (name)-1] = '4';
31777 }
31778 else
31779 sprintf (name, "vmld%s2", bname+10);
31780
31781 /* Convert the first letter of the math function name to uppercase. */
31782 name[4] &= ~0x20;
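/* For example (derived from the mangling above): BUILT_IN_SINF yields
   "vmlsSin4" and BUILT_IN_SIN yields "vmldSin2", while the log cases
   are special-cased to "vmlsLn4" and "vmldLn2".  */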
31783
31784 arity = 0;
31785 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31786 args;
31787 args = TREE_CHAIN (args))
31788 arity++;
31789
31790 if (arity == 1)
31791 fntype = build_function_type_list (type_out, type_in, NULL);
31792 else
31793 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31794
31795 /* Build a function declaration for the vectorized function. */
31796 new_fndecl = build_decl (BUILTINS_LOCATION,
31797 FUNCTION_DECL, get_identifier (name), fntype);
31798 TREE_PUBLIC (new_fndecl) = 1;
31799 DECL_EXTERNAL (new_fndecl) = 1;
31800 DECL_IS_NOVOPS (new_fndecl) = 1;
31801 TREE_READONLY (new_fndecl) = 1;
31802
31803 return new_fndecl;
31804 }
31805
31806 /* Handler for an ACML-style interface to
31807 a library with vectorized intrinsics. */
31808
31809 static tree
31810 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
31811 {
31812 char name[20] = "__vr.._";
31813 tree fntype, new_fndecl, args;
31814 unsigned arity;
31815 const char *bname;
31816 enum machine_mode el_mode, in_mode;
31817 int n, in_n;
31818
31819 /* The ACML library is 64-bit only and suitable for unsafe math only,
31820 as it does not correctly support parts of IEEE (such as denormals)
31821 with the required precision. */
31822 if (!TARGET_64BIT
31823 || !flag_unsafe_math_optimizations)
31824 return NULL_TREE;
31825
31826 el_mode = TYPE_MODE (TREE_TYPE (type_out));
31827 n = TYPE_VECTOR_SUBPARTS (type_out);
31828 in_mode = TYPE_MODE (TREE_TYPE (type_in));
31829 in_n = TYPE_VECTOR_SUBPARTS (type_in);
31830 if (el_mode != in_mode
31831 || n != in_n)
31832 return NULL_TREE;
31833
31834 switch (fn)
31835 {
31836 case BUILT_IN_SIN:
31837 case BUILT_IN_COS:
31838 case BUILT_IN_EXP:
31839 case BUILT_IN_LOG:
31840 case BUILT_IN_LOG2:
31841 case BUILT_IN_LOG10:
31842 name[4] = 'd';
31843 name[5] = '2';
31844 if (el_mode != DFmode
31845 || n != 2)
31846 return NULL_TREE;
31847 break;
31848
31849 case BUILT_IN_SINF:
31850 case BUILT_IN_COSF:
31851 case BUILT_IN_EXPF:
31852 case BUILT_IN_POWF:
31853 case BUILT_IN_LOGF:
31854 case BUILT_IN_LOG2F:
31855 case BUILT_IN_LOG10F:
31856 name[4] = 's';
31857 name[5] = '4';
31858 if (el_mode != SFmode
31859 || n != 4)
31860 return NULL_TREE;
31861 break;
31862
31863 default:
31864 return NULL_TREE;
31865 }
31866
31867 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
31868 sprintf (name + 7, "%s", bname+10);
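/* For example (derived from the mangling above): BUILT_IN_SIN yields
   "__vrd2_sin" and BUILT_IN_SINF yields "__vrs4_sinf".  */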
31869
31870 arity = 0;
31871 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
31872 args;
31873 args = TREE_CHAIN (args))
31874 arity++;
31875
31876 if (arity == 1)
31877 fntype = build_function_type_list (type_out, type_in, NULL);
31878 else
31879 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
31880
31881 /* Build a function declaration for the vectorized function. */
31882 new_fndecl = build_decl (BUILTINS_LOCATION,
31883 FUNCTION_DECL, get_identifier (name), fntype);
31884 TREE_PUBLIC (new_fndecl) = 1;
31885 DECL_EXTERNAL (new_fndecl) = 1;
31886 DECL_IS_NOVOPS (new_fndecl) = 1;
31887 TREE_READONLY (new_fndecl) = 1;
31888
31889 return new_fndecl;
31890 }
31891
31892 /* Returns a decl of a function that implements a gather load with
31893 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
31894 Return NULL_TREE if it is not available. */
31895
31896 static tree
31897 ix86_vectorize_builtin_gather (const_tree mem_vectype,
31898 const_tree index_type, int scale)
31899 {
31900 bool si;
31901 enum ix86_builtins code;
31902
31903 if (! TARGET_AVX2)
31904 return NULL_TREE;
31905
31906 if ((TREE_CODE (index_type) != INTEGER_TYPE
31907 && !POINTER_TYPE_P (index_type))
31908 || (TYPE_MODE (index_type) != SImode
31909 && TYPE_MODE (index_type) != DImode))
31910 return NULL_TREE;
31911
31912 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
31913 return NULL_TREE;
31914
31915 /* v*gather* insn sign extends index to pointer mode. */
31916 if (TYPE_PRECISION (index_type) < POINTER_SIZE
31917 && TYPE_UNSIGNED (index_type))
31918 return NULL_TREE;
31919
31920 if (scale <= 0
31921 || scale > 8
31922 || (scale & (scale - 1)) != 0)
31923 return NULL_TREE;
31924
31925 si = TYPE_MODE (index_type) == SImode;
31926 switch (TYPE_MODE (mem_vectype))
31927 {
31928 case V2DFmode:
31929 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
31930 break;
31931 case V4DFmode:
31932 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
31933 break;
31934 case V2DImode:
31935 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
31936 break;
31937 case V4DImode:
31938 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
31939 break;
31940 case V4SFmode:
31941 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
31942 break;
31943 case V8SFmode:
31944 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
31945 break;
31946 case V4SImode:
31947 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
31948 break;
31949 case V8SImode:
31950 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
31951 break;
31952 default:
31953 return NULL_TREE;
31954 }
31955
31956 return ix86_builtins[code];
31957 }
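/* For example (derived from the switch above): a V4DFmode gather with
   an SImode index type maps to IX86_BUILTIN_GATHERALTSIV4DF, whose
   expansion in ix86_expand_builtin extracts the low V4SI half of the
   V8SI index operand before emitting the gather pattern.  */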
31958
31959 /* Returns a decl of a target-specific builtin that implements the
31960 reciprocal of the function FN, or NULL_TREE if not available. */
31961
31962 static tree
31963 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
31964 bool sqrt ATTRIBUTE_UNUSED)
31965 {
31966 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
31967 && flag_finite_math_only && !flag_trapping_math
31968 && flag_unsafe_math_optimizations))
31969 return NULL_TREE;
31970
31971 if (md_fn)
31972 /* Machine dependent builtins. */
31973 switch (fn)
31974 {
31975 /* Vectorized version of sqrt to rsqrt conversion. */
31976 case IX86_BUILTIN_SQRTPS_NR:
31977 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
31978
31979 case IX86_BUILTIN_SQRTPS_NR256:
31980 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
31981
31982 default:
31983 return NULL_TREE;
31984 }
31985 else
31986 /* Normal builtins. */
31987 switch (fn)
31988 {
31989 /* Sqrt to rsqrt conversion. */
31990 case BUILT_IN_SQRTF:
31991 return ix86_builtins[IX86_BUILTIN_RSQRTF];
31992
31993 default:
31994 return NULL_TREE;
31995 }
31996 }
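/* For example (illustrative): when the conditions above hold, a
   vectorized sqrtf whose result only feeds a division can be replaced
   by IX86_BUILTIN_RSQRTPS_NR, i.e. rsqrtps plus a Newton-Raphson
   refinement step, instead of a full sqrtps followed by divps.  */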
31997 \f
31998 /* Helper for avx_vpermilps256_operand et al. This is also used by
31999 the expansion functions to turn the parallel back into a mask.
32000 The return value is 0 for no match and the imm8+1 for a match. */
32001
32002 int
32003 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32004 {
32005 unsigned i, nelt = GET_MODE_NUNITS (mode);
32006 unsigned mask = 0;
32007 unsigned char ipar[8];
32008
32009 if (XVECLEN (par, 0) != (int) nelt)
32010 return 0;
32011
32012 /* Validate that all of the elements are constants, and not totally
32013 out of range. Copy the data into an integral array to make the
32014 subsequent checks easier. */
32015 for (i = 0; i < nelt; ++i)
32016 {
32017 rtx er = XVECEXP (par, 0, i);
32018 unsigned HOST_WIDE_INT ei;
32019
32020 if (!CONST_INT_P (er))
32021 return 0;
32022 ei = INTVAL (er);
32023 if (ei >= nelt)
32024 return 0;
32025 ipar[i] = ei;
32026 }
32027
32028 switch (mode)
32029 {
32030 case V4DFmode:
32031 /* In the 256-bit DFmode case, we can only move elements within
32032 a 128-bit lane. */
32033 for (i = 0; i < 2; ++i)
32034 {
32035 if (ipar[i] >= 2)
32036 return 0;
32037 mask |= ipar[i] << i;
32038 }
32039 for (i = 2; i < 4; ++i)
32040 {
32041 if (ipar[i] < 2)
32042 return 0;
32043 mask |= (ipar[i] - 2) << i;
32044 }
32045 break;
32046
32047 case V8SFmode:
32048 /* In the 256-bit SFmode case, we have full freedom of movement
32049 within the low 128-bit lane, but the high 128-bit lane must
32050 mirror the exact same pattern. */
32051 for (i = 0; i < 4; ++i)
32052 if (ipar[i] + 4 != ipar[i + 4])
32053 return 0;
32054 nelt = 4;
32055 /* FALLTHRU */
32056
32057 case V2DFmode:
32058 case V4SFmode:
32059 /* In the 128-bit case, we've full freedom in the placement of
32060 the elements from the source operand. */
32061 for (i = 0; i < nelt; ++i)
32062 mask |= ipar[i] << (i * (nelt / 2));
32063 break;
32064
32065 default:
32066 gcc_unreachable ();
32067 }
32068
32069 /* Make sure success has a non-zero value by adding one. */
32070 return mask + 1;
32071 }
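/* A worked example (derived from the code above): for V4SFmode the
   parallel [2, 1, 0, 3] gives mask = 2 | (1 << 2) | (0 << 4) | (3 << 6)
   = 0xc6, so the function returns 0xc7 and the vpermilps immediate is
   0xc6.  */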
32072
32073 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
32074 the expansion functions to turn the parallel back into a mask.
32075 The return value is 0 for no match and the imm8+1 for a match. */
32076
32077 int
32078 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
32079 {
32080 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
32081 unsigned mask = 0;
32082 unsigned char ipar[8];
32083
32084 if (XVECLEN (par, 0) != (int) nelt)
32085 return 0;
32086
32087 /* Validate that all of the elements are constants, and not totally
32088 out of range. Copy the data into an integral array to make the
32089 subsequent checks easier. */
32090 for (i = 0; i < nelt; ++i)
32091 {
32092 rtx er = XVECEXP (par, 0, i);
32093 unsigned HOST_WIDE_INT ei;
32094
32095 if (!CONST_INT_P (er))
32096 return 0;
32097 ei = INTVAL (er);
32098 if (ei >= 2 * nelt)
32099 return 0;
32100 ipar[i] = ei;
32101 }
32102
32103 /* Validate that each half of the permute selects a contiguous half. */
32104 for (i = 0; i < nelt2 - 1; ++i)
32105 if (ipar[i] + 1 != ipar[i + 1])
32106 return 0;
32107 for (i = nelt2; i < nelt - 1; ++i)
32108 if (ipar[i] + 1 != ipar[i + 1])
32109 return 0;
32110
32111 /* Reconstruct the mask. */
32112 for (i = 0; i < 2; ++i)
32113 {
32114 unsigned e = ipar[i * nelt2];
32115 if (e % nelt2)
32116 return 0;
32117 e /= nelt2;
32118 mask |= e << (i * 4);
32119 }
32120
32121 /* Make sure success has a non-zero value by adding one. */
32122 return mask + 1;
32123 }
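/* A worked example (derived from the code above): for V4DFmode the
   parallel [4, 5, 2, 3] selects the low half of the second operand and
   the high half of the first, giving mask = 2 | (1 << 4) = 0x12, so the
   function returns 0x13.  */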
32124 \f
32125 /* Store OPERAND to memory after reload has completed. This means
32126 that we cannot easily use assign_stack_local. */
32127 rtx
32128 ix86_force_to_memory (enum machine_mode mode, rtx operand)
32129 {
32130 rtx result;
32131
32132 gcc_assert (reload_completed);
32133 if (ix86_using_red_zone ())
32134 {
32135 result = gen_rtx_MEM (mode,
32136 gen_rtx_PLUS (Pmode,
32137 stack_pointer_rtx,
32138 GEN_INT (-RED_ZONE_SIZE)));
32139 emit_move_insn (result, operand);
32140 }
32141 else if (TARGET_64BIT)
32142 {
32143 switch (mode)
32144 {
32145 case HImode:
32146 case SImode:
32147 operand = gen_lowpart (DImode, operand);
32148 /* FALLTHRU */
32149 case DImode:
32150 emit_insn (
32151 gen_rtx_SET (VOIDmode,
32152 gen_rtx_MEM (DImode,
32153 gen_rtx_PRE_DEC (DImode,
32154 stack_pointer_rtx)),
32155 operand));
32156 break;
32157 default:
32158 gcc_unreachable ();
32159 }
32160 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32161 }
32162 else
32163 {
32164 switch (mode)
32165 {
32166 case DImode:
32167 {
32168 rtx operands[2];
32169 split_double_mode (mode, &operand, 1, operands, operands + 1);
32170 emit_insn (
32171 gen_rtx_SET (VOIDmode,
32172 gen_rtx_MEM (SImode,
32173 gen_rtx_PRE_DEC (Pmode,
32174 stack_pointer_rtx)),
32175 operands[1]));
32176 emit_insn (
32177 gen_rtx_SET (VOIDmode,
32178 gen_rtx_MEM (SImode,
32179 gen_rtx_PRE_DEC (Pmode,
32180 stack_pointer_rtx)),
32181 operands[0]));
32182 }
32183 break;
32184 case HImode:
32185 /* Store HImodes as SImodes. */
32186 operand = gen_lowpart (SImode, operand);
32187 /* FALLTHRU */
32188 case SImode:
32189 emit_insn (
32190 gen_rtx_SET (VOIDmode,
32191 gen_rtx_MEM (GET_MODE (operand),
32192 gen_rtx_PRE_DEC (SImode,
32193 stack_pointer_rtx)),
32194 operand));
32195 break;
32196 default:
32197 gcc_unreachable ();
32198 }
32199 result = gen_rtx_MEM (mode, stack_pointer_rtx);
32200 }
32201 return result;
32202 }
32203
32204 /* Free the stack slot used by ix86_force_to_memory. */
32205 void
32206 ix86_free_from_memory (enum machine_mode mode)
32207 {
32208 if (!ix86_using_red_zone ())
32209 {
32210 int size;
32211
32212 if (mode == DImode || TARGET_64BIT)
32213 size = 8;
32214 else
32215 size = 4;
32216 /* Use LEA to deallocate stack space. In peephole2 it will be converted
32217 to a pop or add instruction if registers are available. */
32218 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
32219 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
32220 GEN_INT (size))));
32221 }
32222 }
32223
32224 /* Return true if we use LRA instead of the reload pass. */
32225 static bool
32226 ix86_lra_p (void)
32227 {
32228 return true;
32229 }
32230
32231 /* Return a register priority for hard reg REGNO. */
32232 static int
32233 ix86_register_priority (int hard_regno)
32234 {
32235 /* ebp and r13 as a base always want a displacement, and r12 as a
32236 base always wants an index. So discourage their use in an
32237 address. */
32238 if (hard_regno == R12_REG || hard_regno == R13_REG)
32239 return 0;
32240 if (hard_regno == BP_REG)
32241 return 1;
32242 /* New x86-64 int registers result in bigger code size. Discourage
32243 them. */
32244 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
32245 return 2;
32246 /* New x86-64 SSE registers result in bigger code size. Discourage
32247 them. */
32248 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
32249 return 2;
32250 /* Usage of AX register results in smaller code. Prefer it. */
32251 if (hard_regno == 0)
32252 return 4;
32253 return 3;
32254 }
32255
32256 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
32257
32258 Put float CONST_DOUBLE in the constant pool instead of fp regs.
32259 QImode must go into class Q_REGS.
32260 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
32261 movdf to do mem-to-mem moves through integer regs. */
32262
32263 static reg_class_t
32264 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
32265 {
32266 enum machine_mode mode = GET_MODE (x);
32267
32268 /* We're only allowed to return a subclass of CLASS. Many of the
32269 following checks fail for NO_REGS, so eliminate that early. */
32270 if (regclass == NO_REGS)
32271 return NO_REGS;
32272
32273 /* All classes can load zeros. */
32274 if (x == CONST0_RTX (mode))
32275 return regclass;
32276
32277 /* Force constants into memory if we are loading a (nonzero) constant into
32278 an MMX or SSE register. This is because there are no MMX/SSE instructions
32279 to load from a constant. */
32280 if (CONSTANT_P (x)
32281 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
32282 return NO_REGS;
32283
32284 /* Prefer SSE regs only, if we can use them for math. */
32285 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
32286 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
32287
32288 /* Floating-point constants need more complex checks. */
32289 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
32290 {
32291 /* General regs can load everything. */
32292 if (reg_class_subset_p (regclass, GENERAL_REGS))
32293 return regclass;
32294
32295 /* Floats can load 0 and 1 plus some others. Note that we eliminated
32296 zero above. We only want to wind up preferring 80387 registers if
32297 we plan on doing computation with them. */
32298 if (TARGET_80387
32299 && standard_80387_constant_p (x) > 0)
32300 {
32301 /* Limit class to non-sse. */
32302 if (regclass == FLOAT_SSE_REGS)
32303 return FLOAT_REGS;
32304 if (regclass == FP_TOP_SSE_REGS)
32305 return FP_TOP_REG;
32306 if (regclass == FP_SECOND_SSE_REGS)
32307 return FP_SECOND_REG;
32308 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
32309 return regclass;
32310 }
32311
32312 return NO_REGS;
32313 }
32314
32315 /* Generally when we see PLUS here, it's the function invariant
32316 (plus soft-fp const_int), which can only be computed into general
32317 regs. */
32318 if (GET_CODE (x) == PLUS)
32319 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
32320
32321 /* QImode constants are easy to load, but non-constant QImode data
32322 must go into Q_REGS. */
32323 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
32324 {
32325 if (reg_class_subset_p (regclass, Q_REGS))
32326 return regclass;
32327 if (reg_class_subset_p (Q_REGS, regclass))
32328 return Q_REGS;
32329 return NO_REGS;
32330 }
32331
32332 return regclass;
32333 }
32334
32335 /* Discourage putting floating-point values in SSE registers unless
32336 SSE math is being used, and likewise for the 387 registers. */
32337 static reg_class_t
32338 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
32339 {
32340 enum machine_mode mode = GET_MODE (x);
32341
32342 /* Restrict the output reload class to the register bank that we are doing
32343 math on. If we would like not to return a subset of CLASS, reject this
32344 alternative: if reload cannot do this, it will still use its choice. */
32345 mode = GET_MODE (x);
32346 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
32347 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
32348
32349 if (X87_FLOAT_MODE_P (mode))
32350 {
32351 if (regclass == FP_TOP_SSE_REGS)
32352 return FP_TOP_REG;
32353 else if (regclass == FP_SECOND_SSE_REGS)
32354 return FP_SECOND_REG;
32355 else
32356 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
32357 }
32358
32359 return regclass;
32360 }
32361
32362 static reg_class_t
32363 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
32364 enum machine_mode mode, secondary_reload_info *sri)
32365 {
32366 /* Double-word spills from general registers to non-offsettable memory
32367 references (zero-extended addresses) require special handling. */
32368 if (TARGET_64BIT
32369 && MEM_P (x)
32370 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
32371 && rclass == GENERAL_REGS
32372 && !offsettable_memref_p (x))
32373 {
32374 sri->icode = (in_p
32375 ? CODE_FOR_reload_noff_load
32376 : CODE_FOR_reload_noff_store);
32377 /* Add the cost of moving address to a temporary. */
32378 sri->extra_cost = 1;
32379
32380 return NO_REGS;
32381 }
32382
32383 /* QImode spills from non-QI registers require an
32384 intermediate register on 32-bit targets. */
32385 if (!TARGET_64BIT
32386 && !in_p && mode == QImode
32387 && (rclass == GENERAL_REGS
32388 || rclass == LEGACY_REGS
32389 || rclass == NON_Q_REGS
32390 || rclass == SIREG
32391 || rclass == DIREG
32392 || rclass == INDEX_REGS))
32393 {
32394 int regno;
32395
32396 if (REG_P (x))
32397 regno = REGNO (x);
32398 else
32399 regno = -1;
32400
32401 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
32402 regno = true_regnum (x);
32403
32404 /* Return Q_REGS if the operand is in memory. */
32405 if (regno == -1)
32406 return Q_REGS;
32407 }
32408
32409 /* This condition handles the corner case where an expression involving
32410 pointers gets vectorized. We're trying to use the address of a
32411 stack slot as a vector initializer.
32412
32413 (set (reg:V2DI 74 [ vect_cst_.2 ])
32414 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
32415
32416 Eventually frame gets turned into sp+offset like this:
32417
32418 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32419 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
32420 (const_int 392 [0x188]))))
32421
32422 That later gets turned into:
32423
32424 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32425 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
32426 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
32427
32428 We'll have the following reload recorded:
32429
32430 Reload 0: reload_in (DI) =
32431 (plus:DI (reg/f:DI 7 sp)
32432 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
32433 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32434 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
32435 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
32436 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
32437 reload_reg_rtx: (reg:V2DI 22 xmm1)
32438
32439 This isn't going to work since SSE instructions can't handle scalar
32440 additions. Returning GENERAL_REGS forces the addition into an integer
32441 register, and reload can handle subsequent reloads without problems. */
32442
32443 if (in_p && GET_CODE (x) == PLUS
32444 && SSE_CLASS_P (rclass)
32445 && SCALAR_INT_MODE_P (mode))
32446 return GENERAL_REGS;
32447
32448 return NO_REGS;
32449 }
32450
32451 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
32452
32453 static bool
32454 ix86_class_likely_spilled_p (reg_class_t rclass)
32455 {
32456 switch (rclass)
32457 {
32458 case AREG:
32459 case DREG:
32460 case CREG:
32461 case BREG:
32462 case AD_REGS:
32463 case SIREG:
32464 case DIREG:
32465 case SSE_FIRST_REG:
32466 case FP_TOP_REG:
32467 case FP_SECOND_REG:
32468 return true;
32469
32470 default:
32471 break;
32472 }
32473
32474 return false;
32475 }
32476
32477 /* If we are copying between general and FP registers, we need a memory
32478 location. The same is true for SSE and MMX registers.
32479
32480 To optimize register_move_cost performance, allow inline variant.
32481
32482 The macro can't work reliably when one of the CLASSES is a class containing
32483 registers from multiple units (SSE, MMX, integer). We avoid this by never
32484 combining those units in a single alternative in the machine description.
32485 Ensure that this constraint holds to avoid unexpected surprises.
32486
32487 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
32488 enforce these sanity checks. */
32489
32490 static inline bool
32491 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
32492 enum machine_mode mode, int strict)
32493 {
32494 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
32495 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
32496 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
32497 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
32498 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
32499 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
32500 {
32501 gcc_assert (!strict || lra_in_progress);
32502 return true;
32503 }
32504
32505 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
32506 return true;
32507
32508 /* ??? This is a lie. We do have moves between mmx/general, and between
32509 mmx/sse2. But by saying we need secondary memory we discourage the
32510 register allocator from using the mmx registers unless needed. */
32511 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
32512 return true;
32513
32514 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32515 {
32516 /* SSE1 doesn't have any direct moves from other classes. */
32517 if (!TARGET_SSE2)
32518 return true;
32519
32520 /* If the target says that inter-unit moves are more expensive
32521 than moving through memory, then don't generate them. */
32522 if (!TARGET_INTER_UNIT_MOVES)
32523 return true;
32524
32525 /* Between SSE and general, we have moves no larger than word size. */
32526 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
32527 return true;
32528 }
32529
32530 return false;
32531 }
32532
32533 bool
32534 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
32535 enum machine_mode mode, int strict)
32536 {
32537 return inline_secondary_memory_needed (class1, class2, mode, strict);
32538 }
32539
32540 /* Implement the TARGET_CLASS_MAX_NREGS hook.
32541
32542 On the 80386, this is the size of MODE in words,
32543 except in the FP regs, where a single reg is always enough. */
32544
32545 static unsigned char
32546 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
32547 {
32548 if (MAYBE_INTEGER_CLASS_P (rclass))
32549 {
32550 if (mode == XFmode)
32551 return (TARGET_64BIT ? 2 : 3);
32552 else if (mode == XCmode)
32553 return (TARGET_64BIT ? 4 : 6);
32554 else
32555 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
32556 }
32557 else
32558 {
32559 if (COMPLEX_MODE_P (mode))
32560 return 2;
32561 else
32562 return 1;
32563 }
32564 }
32565
32566 /* Return true if the registers in CLASS cannot represent the change from
32567 modes FROM to TO. */
32568
32569 bool
32570 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
32571 enum reg_class regclass)
32572 {
32573 if (from == to)
32574 return false;
32575
32576 /* x87 registers can't do subreg at all, as all values are reformatted
32577 to extended precision. */
32578 if (MAYBE_FLOAT_CLASS_P (regclass))
32579 return true;
32580
32581 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
32582 {
32583 /* Vector registers do not support QI or HImode loads. If we don't
32584 disallow a change to these modes, reload will assume it's ok to
32585 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
32586 the vec_dupv4hi pattern. */
32587 if (GET_MODE_SIZE (from) < 4)
32588 return true;
32589
32590 /* Vector registers do not support subreg with nonzero offsets, which
32591 are otherwise valid for integer registers. Since we can't see
32592 whether we have a nonzero offset from here, prohibit all
32593 nonparadoxical subregs changing size. */
32594 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
32595 return true;
32596 }
32597
32598 return false;
32599 }
32600
32601 /* Return the cost of moving data of mode M between a
32602 register and memory. A value of 2 is the default; this cost is
32603 relative to those in `REGISTER_MOVE_COST'.
32604
32605 This function is used extensively by register_move_cost, which is used to
32606 build tables at startup. Make it inline in this case.
32607 When IN is 2, return maximum of in and out move cost.
32608
32609 If moving between registers and memory is more expensive than
32610 between two registers, you should define this macro to express the
32611 relative cost.
32612
32613 Also model the increased cost of moving QImode registers in non
32614 Q_REGS classes.
32615 */
32616 static inline int
32617 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
32618 int in)
32619 {
32620 int cost;
32621 if (FLOAT_CLASS_P (regclass))
32622 {
32623 int index;
32624 switch (mode)
32625 {
32626 case SFmode:
32627 index = 0;
32628 break;
32629 case DFmode:
32630 index = 1;
32631 break;
32632 case XFmode:
32633 index = 2;
32634 break;
32635 default:
32636 return 100;
32637 }
32638 if (in == 2)
32639 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
32640 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
32641 }
32642 if (SSE_CLASS_P (regclass))
32643 {
32644 int index;
32645 switch (GET_MODE_SIZE (mode))
32646 {
32647 case 4:
32648 index = 0;
32649 break;
32650 case 8:
32651 index = 1;
32652 break;
32653 case 16:
32654 index = 2;
32655 break;
32656 default:
32657 return 100;
32658 }
32659 if (in == 2)
32660 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
32661 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
32662 }
32663 if (MMX_CLASS_P (regclass))
32664 {
32665 int index;
32666 switch (GET_MODE_SIZE (mode))
32667 {
32668 case 4:
32669 index = 0;
32670 break;
32671 case 8:
32672 index = 1;
32673 break;
32674 default:
32675 return 100;
32676 }
32677 if (in == 2)
32678 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
32679 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
32680 }
32681 switch (GET_MODE_SIZE (mode))
32682 {
32683 case 1:
32684 if (Q_CLASS_P (regclass) || TARGET_64BIT)
32685 {
32686 if (!in)
32687 return ix86_cost->int_store[0];
32688 if (TARGET_PARTIAL_REG_DEPENDENCY
32689 && optimize_function_for_speed_p (cfun))
32690 cost = ix86_cost->movzbl_load;
32691 else
32692 cost = ix86_cost->int_load[0];
32693 if (in == 2)
32694 return MAX (cost, ix86_cost->int_store[0]);
32695 return cost;
32696 }
32697 else
32698 {
32699 if (in == 2)
32700 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
32701 if (in)
32702 return ix86_cost->movzbl_load;
32703 else
32704 return ix86_cost->int_store[0] + 4;
32705 }
32706 break;
32707 case 2:
32708 if (in == 2)
32709 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
32710 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
32711 default:
32712 /* Compute the number of word-sized moves needed. TFmode is moved as XFmode. */
32713 if (mode == TFmode)
32714 mode = XFmode;
32715 if (in == 2)
32716 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
32717 else if (in)
32718 cost = ix86_cost->int_load[2];
32719 else
32720 cost = ix86_cost->int_store[2];
32721 return (cost * (((int) GET_MODE_SIZE (mode)
32722 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
32723 }
32724 }
32725
32726 static int
32727 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
32728 bool in)
32729 {
32730 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
32731 }
32732
32733
32734 /* Return the cost of moving data from a register in class CLASS1 to
32735 one in class CLASS2.
32736
32737 It is not required that the cost always equal 2 when CLASS1 is the same as CLASS2;
32738 on some machines it is expensive to move between registers if they are not
32739 general registers. */
32740
32741 static int
32742 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
32743 reg_class_t class2_i)
32744 {
32745 enum reg_class class1 = (enum reg_class) class1_i;
32746 enum reg_class class2 = (enum reg_class) class2_i;
32747
32748 /* In case we require secondary memory, compute the cost of the store followed
32749 by the load. In order to avoid bad register allocation choices, we need
32750 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
32751
32752 if (inline_secondary_memory_needed (class1, class2, mode, 0))
32753 {
32754 int cost = 1;
32755
32756 cost += inline_memory_move_cost (mode, class1, 2);
32757 cost += inline_memory_move_cost (mode, class2, 2);
32758
32759 /* When copying from a general purpose register we may emit multiple
32760 stores followed by a single load, causing a memory size mismatch stall.
32761 Count this as an arbitrarily high cost of 20. */
32762 if (targetm.class_max_nregs (class1, mode)
32763 > targetm.class_max_nregs (class2, mode))
32764 cost += 20;
32765
32766 /* In the case of FP/MMX moves, the registers actually overlap, and we
32767 have to switch modes in order to treat them differently. */
32768 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
32769 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
32770 cost += 20;
32771
32772 return cost;
32773 }
32774
32775 /* Moves between SSE/MMX and integer unit are expensive. */
32776 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
32777 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
32778
32779 /* ??? By keeping the returned value relatively high, we limit the number
32780 of moves between integer and MMX/SSE registers for all targets.
32781 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
32782 where integer modes in MMX/SSE registers are not tieable
32783 because of missing QImode and HImode moves to, from or between
32784 MMX/SSE registers. */
32785 return MAX (8, ix86_cost->mmxsse_to_integer);
32786
32787 if (MAYBE_FLOAT_CLASS_P (class1))
32788 return ix86_cost->fp_move;
32789 if (MAYBE_SSE_CLASS_P (class1))
32790 return ix86_cost->sse_move;
32791 if (MAYBE_MMX_CLASS_P (class1))
32792 return ix86_cost->mmx_move;
32793 return 2;
32794 }
32795
32796 /* Return TRUE if hard register REGNO can hold a value of machine-mode
32797 MODE. */
32798
32799 bool
32800 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
32801 {
32802 /* Flags, and only flags, can hold CCmode values. */
32803 if (CC_REGNO_P (regno))
32804 return GET_MODE_CLASS (mode) == MODE_CC;
32805 if (GET_MODE_CLASS (mode) == MODE_CC
32806 || GET_MODE_CLASS (mode) == MODE_RANDOM
32807 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
32808 return false;
32809 if (STACK_REGNO_P (regno))
32810 return VALID_FP_MODE_P (mode);
32811 if (SSE_REGNO_P (regno))
32812 {
32813 /* We implement the move patterns for all vector modes into and
32814 out of SSE registers, even when no operation instructions
32815 are available. OImode move is available only when AVX is
32816 enabled. */
32817 return ((TARGET_AVX && mode == OImode)
32818 || VALID_AVX256_REG_MODE (mode)
32819 || VALID_SSE_REG_MODE (mode)
32820 || VALID_SSE2_REG_MODE (mode)
32821 || VALID_MMX_REG_MODE (mode)
32822 || VALID_MMX_REG_MODE_3DNOW (mode));
32823 }
32824 if (MMX_REGNO_P (regno))
32825 {
32826 /* We implement the move patterns for 3DNOW modes even in MMX mode,
32827 so if the register is available at all, then we can move data of
32828 the given mode into or out of it. */
32829 return (VALID_MMX_REG_MODE (mode)
32830 || VALID_MMX_REG_MODE_3DNOW (mode));
32831 }
32832
32833 if (mode == QImode)
32834 {
32835 /* Take care with QImode values: they can be in non-QI regs,
32836 but then they do cause partial register stalls. */
32837 if (TARGET_64BIT || QI_REGNO_P (regno))
32838 return true;
32839 if (!TARGET_PARTIAL_REG_STALL)
32840 return true;
32841 return !can_create_pseudo_p ();
32842 }
32843 /* We handle both integer and floats in the general purpose registers. */
32844 else if (VALID_INT_MODE_P (mode))
32845 return true;
32846 else if (VALID_FP_MODE_P (mode))
32847 return true;
32848 else if (VALID_DFP_MODE_P (mode))
32849 return true;
32850 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
32851 on to use that value in smaller contexts, this can easily force a
32852 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
32853 supporting DImode, allow it. */
32854 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
32855 return true;
32856
32857 return false;
32858 }
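/* For instance, under the rules above the flags register accepts only CCmode
   values, an SSE register accepts the SSE and 256-bit AVX vector modes (and
   OImode once AVX is enabled), and QImode is fine in any general register in
   64-bit mode but is kept to the QI registers on 32-bit targets where partial
   register stalls matter and new pseudos can still be created.  */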
32859
32860 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
32861 tieable integer mode. */
32862
32863 static bool
32864 ix86_tieable_integer_mode_p (enum machine_mode mode)
32865 {
32866 switch (mode)
32867 {
32868 case HImode:
32869 case SImode:
32870 return true;
32871
32872 case QImode:
32873 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
32874
32875 case DImode:
32876 return TARGET_64BIT;
32877
32878 default:
32879 return false;
32880 }
32881 }
32882
32883 /* Return true if MODE1 is accessible in a register that can hold MODE2
32884 without copying. That is, all register classes that can hold MODE2
32885 can also hold MODE1. */
32886
32887 bool
32888 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
32889 {
32890 if (mode1 == mode2)
32891 return true;
32892
32893 if (ix86_tieable_integer_mode_p (mode1)
32894 && ix86_tieable_integer_mode_p (mode2))
32895 return true;
32896
32897 /* MODE2 being XFmode implies fp stack or general regs, which means we
32898 can tie any smaller floating point modes to it. Note that we do not
32899 tie this with TFmode. */
32900 if (mode2 == XFmode)
32901 return mode1 == SFmode || mode1 == DFmode;
32902
32903 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
32904 that we can tie it with SFmode. */
32905 if (mode2 == DFmode)
32906 return mode1 == SFmode;
32907
32908 /* If MODE2 is only appropriate for an SSE register, then tie with
32909 any other mode acceptable to SSE registers. */
32910 if (GET_MODE_SIZE (mode2) == 32
32911 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32912 return (GET_MODE_SIZE (mode1) == 32
32913 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32914 if (GET_MODE_SIZE (mode2) == 16
32915 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
32916 return (GET_MODE_SIZE (mode1) == 16
32917 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
32918
32919 /* If MODE2 is appropriate for an MMX register, then tie
32920 with any other mode acceptable to MMX registers. */
32921 if (GET_MODE_SIZE (mode2) == 8
32922 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
32923 return (GET_MODE_SIZE (mode1) == 8
32924 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
32925
32926 return false;
32927 }
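/* Example of the tieability rules above: HImode and SImode always tie; QImode
   joins them except on 32-bit targets with partial register stalls; SFmode
   ties with DFmode and XFmode, but XFmode is deliberately not tied with
   TFmode; and two 16-byte (or two 32-byte) modes tie with each other whenever
   SSE registers can hold both of them.  */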
32928
32929 /* Return the cost of moving between two registers of mode MODE. */
32930
32931 static int
32932 ix86_set_reg_reg_cost (enum machine_mode mode)
32933 {
32934 unsigned int units = UNITS_PER_WORD;
32935
32936 switch (GET_MODE_CLASS (mode))
32937 {
32938 default:
32939 break;
32940
32941 case MODE_CC:
32942 units = GET_MODE_SIZE (CCmode);
32943 break;
32944
32945 case MODE_FLOAT:
32946 if ((TARGET_SSE && mode == TFmode)
32947 || (TARGET_80387 && mode == XFmode)
32948 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
32949 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
32950 units = GET_MODE_SIZE (mode);
32951 break;
32952
32953 case MODE_COMPLEX_FLOAT:
32954 if ((TARGET_SSE && mode == TCmode)
32955 || (TARGET_80387 && mode == XCmode)
32956 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
32957 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
32958 units = GET_MODE_SIZE (mode);
32959 break;
32960
32961 case MODE_VECTOR_INT:
32962 case MODE_VECTOR_FLOAT:
32963 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
32964 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
32965 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
32966 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
32967 units = GET_MODE_SIZE (mode);
32968 }
32969
32970 /* Return the cost of moving between two registers of mode MODE,
32971 assuming that the move will be in pieces of at most UNITS bytes. */
32972 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
32973 }
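/* For example, with UNITS_PER_WORD == 8 (64-bit) a DImode register copy costs
   COSTS_N_INSNS (1).  A 32-byte V8SFmode copy costs COSTS_N_INSNS (1) when
   AVX is enabled (units == 32), but COSTS_N_INSNS (4) otherwise, since the
   move would then have to be done in word-sized pieces.  */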
32974
32975 /* Compute a (partial) cost for rtx X. Return true if the complete
32976 cost has been computed, and false if subexpressions should be
32977 scanned. In either case, *TOTAL contains the cost result. */
32978
32979 static bool
32980 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
32981 bool speed)
32982 {
32983 enum rtx_code code = (enum rtx_code) code_i;
32984 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
32985 enum machine_mode mode = GET_MODE (x);
32986 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
32987
32988 switch (code)
32989 {
32990 case SET:
32991 if (register_operand (SET_DEST (x), VOIDmode)
32992 && reg_or_0_operand (SET_SRC (x), VOIDmode))
32993 {
32994 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
32995 return true;
32996 }
32997 return false;
32998
32999 case CONST_INT:
33000 case CONST:
33001 case LABEL_REF:
33002 case SYMBOL_REF:
33003 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33004 *total = 3;
33005 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33006 *total = 2;
33007 else if (flag_pic && SYMBOLIC_CONST (x)
33008 && (!TARGET_64BIT
33009 || (GET_CODE (x) != LABEL_REF
33010 && (GET_CODE (x) != SYMBOL_REF
33011 || !SYMBOL_REF_LOCAL_P (x)))))
33012 *total = 1;
33013 else
33014 *total = 0;
33015 return true;
33016
33017 case CONST_DOUBLE:
33018 if (mode == VOIDmode)
33019 {
33020 *total = 0;
33021 return true;
33022 }
33023 switch (standard_80387_constant_p (x))
33024 {
33025 case 1: /* 0.0 */
33026 *total = 1;
33027 return true;
33028 default: /* Other constants */
33029 *total = 2;
33030 return true;
33031 case 0:
33032 case -1:
33033 break;
33034 }
33035 if (SSE_FLOAT_MODE_P (mode))
33036 {
33037 case CONST_VECTOR:
33038 switch (standard_sse_constant_p (x))
33039 {
33040 case 0:
33041 break;
33042 case 1: /* 0: xor eliminates false dependency */
33043 *total = 0;
33044 return true;
33045 default: /* -1: cmp contains false dependency */
33046 *total = 1;
33047 return true;
33048 }
33049 }
33050 /* Fall back to (MEM (SYMBOL_REF)), since that's where
33051 it'll probably end up. Add a penalty for size. */
33052 *total = (COSTS_N_INSNS (1)
33053 + (flag_pic != 0 && !TARGET_64BIT)
33054 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
33055 return true;
33056
33057 case ZERO_EXTEND:
33058 /* The zero extension is often completely free on x86_64, so make
33059 it as cheap as possible. */
33060 if (TARGET_64BIT && mode == DImode
33061 && GET_MODE (XEXP (x, 0)) == SImode)
33062 *total = 1;
33063 else if (TARGET_ZERO_EXTEND_WITH_AND)
33064 *total = cost->add;
33065 else
33066 *total = cost->movzx;
33067 return false;
33068
33069 case SIGN_EXTEND:
33070 *total = cost->movsx;
33071 return false;
33072
33073 case ASHIFT:
33074 if (SCALAR_INT_MODE_P (mode)
33075 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
33076 && CONST_INT_P (XEXP (x, 1)))
33077 {
33078 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33079 if (value == 1)
33080 {
33081 *total = cost->add;
33082 return false;
33083 }
33084 if ((value == 2 || value == 3)
33085 && cost->lea <= cost->shift_const)
33086 {
33087 *total = cost->lea;
33088 return false;
33089 }
33090 }
33091 /* FALLTHRU */
33092
33093 case ROTATE:
33094 case ASHIFTRT:
33095 case LSHIFTRT:
33096 case ROTATERT:
33097 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33098 {
33099 /* ??? Should be SSE vector operation cost. */
33100 /* At least for published AMD latencies, this really is the same
33101 as the latency for a simple fpu operation like fabs. */
33102 /* V*QImode is emulated with 1-11 insns. */
33103 if (mode == V16QImode || mode == V32QImode)
33104 {
33105 int count = 11;
33106 if (TARGET_XOP && mode == V16QImode)
33107 {
33108 /* For XOP we use vpshab, which requires a broadcast of the
33109 value to the variable shift insn. For constants this
33110 means a V16Q const in mem; even when we can perform the
33111 shift with one insn, set the cost to prefer paddb. */
33112 if (CONSTANT_P (XEXP (x, 1)))
33113 {
33114 *total = (cost->fabs
33115 + rtx_cost (XEXP (x, 0), code, 0, speed)
33116 + (speed ? 2 : COSTS_N_BYTES (16)));
33117 return true;
33118 }
33119 count = 3;
33120 }
33121 else if (TARGET_SSSE3)
33122 count = 7;
33123 *total = cost->fabs * count;
33124 }
33125 else
33126 *total = cost->fabs;
33127 }
33128 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33129 {
33130 if (CONST_INT_P (XEXP (x, 1)))
33131 {
33132 if (INTVAL (XEXP (x, 1)) > 32)
33133 *total = cost->shift_const + COSTS_N_INSNS (2);
33134 else
33135 *total = cost->shift_const * 2;
33136 }
33137 else
33138 {
33139 if (GET_CODE (XEXP (x, 1)) == AND)
33140 *total = cost->shift_var * 2;
33141 else
33142 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
33143 }
33144 }
33145 else
33146 {
33147 if (CONST_INT_P (XEXP (x, 1)))
33148 *total = cost->shift_const;
33149 else
33150 *total = cost->shift_var;
33151 }
33152 return false;
33153
33154 case FMA:
33155 {
33156 rtx sub;
33157
33158 gcc_assert (FLOAT_MODE_P (mode));
33159 gcc_assert (TARGET_FMA || TARGET_FMA4);
33160
33161 /* ??? SSE scalar/vector cost should be used here. */
33162 /* ??? Bald assumption that fma has the same cost as fmul. */
33163 *total = cost->fmul;
33164 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
33165
33166 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
33167 sub = XEXP (x, 0);
33168 if (GET_CODE (sub) == NEG)
33169 sub = XEXP (sub, 0);
33170 *total += rtx_cost (sub, FMA, 0, speed);
33171
33172 sub = XEXP (x, 2);
33173 if (GET_CODE (sub) == NEG)
33174 sub = XEXP (sub, 0);
33175 *total += rtx_cost (sub, FMA, 2, speed);
33176 return true;
33177 }
33178
33179 case MULT:
33180 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33181 {
33182 /* ??? SSE scalar cost should be used here. */
33183 *total = cost->fmul;
33184 return false;
33185 }
33186 else if (X87_FLOAT_MODE_P (mode))
33187 {
33188 *total = cost->fmul;
33189 return false;
33190 }
33191 else if (FLOAT_MODE_P (mode))
33192 {
33193 /* ??? SSE vector cost should be used here. */
33194 *total = cost->fmul;
33195 return false;
33196 }
33197 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33198 {
33199 /* V*QImode is emulated with 7-13 insns. */
33200 if (mode == V16QImode || mode == V32QImode)
33201 {
33202 int extra = 11;
33203 if (TARGET_XOP && mode == V16QImode)
33204 extra = 5;
33205 else if (TARGET_SSSE3)
33206 extra = 6;
33207 *total = cost->fmul * 2 + cost->fabs * extra;
33208 }
33209 /* V*DImode is emulated with 5-8 insns. */
33210 else if (mode == V2DImode || mode == V4DImode)
33211 {
33212 if (TARGET_XOP && mode == V2DImode)
33213 *total = cost->fmul * 2 + cost->fabs * 3;
33214 else
33215 *total = cost->fmul * 3 + cost->fabs * 5;
33216 }
33217 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
33218 insns, including two PMULUDQ. */
33219 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
33220 *total = cost->fmul * 2 + cost->fabs * 5;
33221 else
33222 *total = cost->fmul;
33223 return false;
33224 }
33225 else
33226 {
33227 rtx op0 = XEXP (x, 0);
33228 rtx op1 = XEXP (x, 1);
33229 int nbits;
33230 if (CONST_INT_P (XEXP (x, 1)))
33231 {
33232 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33233 for (nbits = 0; value != 0; value &= value - 1)
33234 nbits++;
33235 }
33236 else
33237 /* This is arbitrary. */
33238 nbits = 7;
33239
33240 /* Compute costs correctly for widening multiplication. */
33241 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
33242 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
33243 == GET_MODE_SIZE (mode))
33244 {
33245 int is_mulwiden = 0;
33246 enum machine_mode inner_mode = GET_MODE (op0);
33247
33248 if (GET_CODE (op0) == GET_CODE (op1))
33249 is_mulwiden = 1, op1 = XEXP (op1, 0);
33250 else if (CONST_INT_P (op1))
33251 {
33252 if (GET_CODE (op0) == SIGN_EXTEND)
33253 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
33254 == INTVAL (op1);
33255 else
33256 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
33257 }
33258
33259 if (is_mulwiden)
33260 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
33261 }
33262
33263 *total = (cost->mult_init[MODE_INDEX (mode)]
33264 + nbits * cost->mult_bit
33265 + rtx_cost (op0, outer_code, opno, speed)
33266 + rtx_cost (op1, outer_code, opno, speed));
33267
33268 return true;
33269 }
33270
33271 case DIV:
33272 case UDIV:
33273 case MOD:
33274 case UMOD:
33275 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33276 /* ??? SSE cost should be used here. */
33277 *total = cost->fdiv;
33278 else if (X87_FLOAT_MODE_P (mode))
33279 *total = cost->fdiv;
33280 else if (FLOAT_MODE_P (mode))
33281 /* ??? SSE vector cost should be used here. */
33282 *total = cost->fdiv;
33283 else
33284 *total = cost->divide[MODE_INDEX (mode)];
33285 return false;
33286
33287 case PLUS:
33288 if (GET_MODE_CLASS (mode) == MODE_INT
33289 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
33290 {
33291 if (GET_CODE (XEXP (x, 0)) == PLUS
33292 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
33293 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
33294 && CONSTANT_P (XEXP (x, 1)))
33295 {
33296 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
33297 if (val == 2 || val == 4 || val == 8)
33298 {
33299 *total = cost->lea;
33300 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33301 outer_code, opno, speed);
33302 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
33303 outer_code, opno, speed);
33304 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33305 return true;
33306 }
33307 }
33308 else if (GET_CODE (XEXP (x, 0)) == MULT
33309 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
33310 {
33311 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
33312 if (val == 2 || val == 4 || val == 8)
33313 {
33314 *total = cost->lea;
33315 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33316 outer_code, opno, speed);
33317 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33318 return true;
33319 }
33320 }
33321 else if (GET_CODE (XEXP (x, 0)) == PLUS)
33322 {
33323 *total = cost->lea;
33324 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
33325 outer_code, opno, speed);
33326 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
33327 outer_code, opno, speed);
33328 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
33329 return true;
33330 }
33331 }
33332 /* FALLTHRU */
33333
33334 case MINUS:
33335 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33336 {
33337 /* ??? SSE cost should be used here. */
33338 *total = cost->fadd;
33339 return false;
33340 }
33341 else if (X87_FLOAT_MODE_P (mode))
33342 {
33343 *total = cost->fadd;
33344 return false;
33345 }
33346 else if (FLOAT_MODE_P (mode))
33347 {
33348 /* ??? SSE vector cost should be used here. */
33349 *total = cost->fadd;
33350 return false;
33351 }
33352 /* FALLTHRU */
33353
33354 case AND:
33355 case IOR:
33356 case XOR:
33357 if (GET_MODE_CLASS (mode) == MODE_INT
33358 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33359 {
33360 *total = (cost->add * 2
33361 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
33362 << (GET_MODE (XEXP (x, 0)) != DImode))
33363 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
33364 << (GET_MODE (XEXP (x, 1)) != DImode)));
33365 return true;
33366 }
33367 /* FALLTHRU */
33368
33369 case NEG:
33370 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33371 {
33372 /* ??? SSE cost should be used here. */
33373 *total = cost->fchs;
33374 return false;
33375 }
33376 else if (X87_FLOAT_MODE_P (mode))
33377 {
33378 *total = cost->fchs;
33379 return false;
33380 }
33381 else if (FLOAT_MODE_P (mode))
33382 {
33383 /* ??? SSE vector cost should be used here. */
33384 *total = cost->fchs;
33385 return false;
33386 }
33387 /* FALLTHRU */
33388
33389 case NOT:
33390 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33391 {
33392 /* ??? Should be SSE vector operation cost. */
33393 /* At least for published AMD latencies, this really is the same
33394 as the latency for a simple fpu operation like fabs. */
33395 *total = cost->fabs;
33396 }
33397 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33398 *total = cost->add * 2;
33399 else
33400 *total = cost->add;
33401 return false;
33402
33403 case COMPARE:
33404 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
33405 && XEXP (XEXP (x, 0), 1) == const1_rtx
33406 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
33407 && XEXP (x, 1) == const0_rtx)
33408 {
33409 /* This kind of construct is implemented using test[bwl].
33410 Treat it as if we had an AND. */
33411 *total = (cost->add
33412 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
33413 + rtx_cost (const1_rtx, outer_code, opno, speed));
33414 return true;
33415 }
33416 return false;
33417
33418 case FLOAT_EXTEND:
33419 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
33420 *total = 0;
33421 return false;
33422
33423 case ABS:
33424 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33425 /* ??? SSE cost should be used here. */
33426 *total = cost->fabs;
33427 else if (X87_FLOAT_MODE_P (mode))
33428 *total = cost->fabs;
33429 else if (FLOAT_MODE_P (mode))
33430 /* ??? SSE vector cost should be used here. */
33431 *total = cost->fabs;
33432 return false;
33433
33434 case SQRT:
33435 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
33436 /* ??? SSE cost should be used here. */
33437 *total = cost->fsqrt;
33438 else if (X87_FLOAT_MODE_P (mode))
33439 *total = cost->fsqrt;
33440 else if (FLOAT_MODE_P (mode))
33441 /* ??? SSE vector cost should be used here. */
33442 *total = cost->fsqrt;
33443 return false;
33444
33445 case UNSPEC:
33446 if (XINT (x, 1) == UNSPEC_TP)
33447 *total = 0;
33448 return false;
33449
33450 case VEC_SELECT:
33451 case VEC_CONCAT:
33452 case VEC_MERGE:
33453 case VEC_DUPLICATE:
33454 /* ??? Assume all of these vector manipulation patterns are
33455 recognizable, in which case they all have pretty much the
33456 same cost. */
33457 *total = cost->fabs;
33458 return true;
33459
33460 default:
33461 return false;
33462 }
33463 }
33464
33465 #if TARGET_MACHO
33466
33467 static int current_machopic_label_num;
33468
33469 /* Given a symbol name and its associated stub, write out the
33470 definition of the stub. */
33471
33472 void
33473 machopic_output_stub (FILE *file, const char *symb, const char *stub)
33474 {
33475 unsigned int length;
33476 char *binder_name, *symbol_name, lazy_ptr_name[32];
33477 int label = ++current_machopic_label_num;
33478
33479 /* For 64-bit we shouldn't get here. */
33480 gcc_assert (!TARGET_64BIT);
33481
33482 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
33483 symb = targetm.strip_name_encoding (symb);
33484
33485 length = strlen (stub);
33486 binder_name = XALLOCAVEC (char, length + 32);
33487 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
33488
33489 length = strlen (symb);
33490 symbol_name = XALLOCAVEC (char, length + 32);
33491 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
33492
33493 sprintf (lazy_ptr_name, "L%d$lz", label);
33494
33495 if (MACHOPIC_ATT_STUB)
33496 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
33497 else if (MACHOPIC_PURE)
33498 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
33499 else
33500 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
33501
33502 fprintf (file, "%s:\n", stub);
33503 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
33504
33505 if (MACHOPIC_ATT_STUB)
33506 {
33507 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
33508 }
33509 else if (MACHOPIC_PURE)
33510 {
33511 /* PIC stub. */
33512 /* 25-byte PIC stub using "CALL get_pc_thunk". */
33513 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
33514 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
33515 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
33516 label, lazy_ptr_name, label);
33517 fprintf (file, "\tjmp\t*%%ecx\n");
33518 }
33519 else
33520 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
33521
33522 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
33523 it needs no stub-binding-helper. */
33524 if (MACHOPIC_ATT_STUB)
33525 return;
33526
33527 fprintf (file, "%s:\n", binder_name);
33528
33529 if (MACHOPIC_PURE)
33530 {
33531 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
33532 fprintf (file, "\tpushl\t%%ecx\n");
33533 }
33534 else
33535 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
33536
33537 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
33538
33539 /* N.B. Keep the correspondence of these
33540 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
33541 old-pic/new-pic/non-pic stubs; altering this will break
33542 compatibility with existing dylibs. */
33543 if (MACHOPIC_PURE)
33544 {
33545 /* 25-byte PIC stub using "CALL get_pc_thunk". */
33546 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
33547 }
33548 else
33549 /* 16-byte -mdynamic-no-pic stub. */
33550 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
33551
33552 fprintf (file, "%s:\n", lazy_ptr_name);
33553 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
33554 fprintf (file, ASM_LONG "%s\n", binder_name);
33555 }
33556 #endif /* TARGET_MACHO */
33557
33558 /* Order the registers for register allocator. */
33559
33560 void
33561 x86_order_regs_for_local_alloc (void)
33562 {
33563 int pos = 0;
33564 int i;
33565
33566 /* First allocate the local general purpose registers. */
33567 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
33568 if (GENERAL_REGNO_P (i) && call_used_regs[i])
33569 reg_alloc_order [pos++] = i;
33570
33571 /* Global general purpose registers. */
33572 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
33573 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
33574 reg_alloc_order [pos++] = i;
33575
33576 /* x87 registers come first in case we are doing FP math
33577 using them. */
33578 if (!TARGET_SSE_MATH)
33579 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
33580 reg_alloc_order [pos++] = i;
33581
33582 /* SSE registers. */
33583 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
33584 reg_alloc_order [pos++] = i;
33585 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
33586 reg_alloc_order [pos++] = i;
33587
33588 /* x87 registers. */
33589 if (TARGET_SSE_MATH)
33590 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
33591 reg_alloc_order [pos++] = i;
33592
33593 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
33594 reg_alloc_order [pos++] = i;
33595
33596 /* Initialize the rest of the array, as we do not allocate some
33597 registers at all. */
33598 while (pos < FIRST_PSEUDO_REGISTER)
33599 reg_alloc_order [pos++] = 0;
33600 }
33601
33602 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
33603 in struct attribute_spec.handler. */
33604 static tree
33605 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
33606 tree args,
33607 int flags ATTRIBUTE_UNUSED,
33608 bool *no_add_attrs)
33609 {
33610 if (TREE_CODE (*node) != FUNCTION_TYPE
33611 && TREE_CODE (*node) != METHOD_TYPE
33612 && TREE_CODE (*node) != FIELD_DECL
33613 && TREE_CODE (*node) != TYPE_DECL)
33614 {
33615 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33616 name);
33617 *no_add_attrs = true;
33618 return NULL_TREE;
33619 }
33620 if (TARGET_64BIT)
33621 {
33622 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
33623 name);
33624 *no_add_attrs = true;
33625 return NULL_TREE;
33626 }
33627 if (is_attribute_p ("callee_pop_aggregate_return", name))
33628 {
33629 tree cst;
33630
33631 cst = TREE_VALUE (args);
33632 if (TREE_CODE (cst) != INTEGER_CST)
33633 {
33634 warning (OPT_Wattributes,
33635 "%qE attribute requires an integer constant argument",
33636 name);
33637 *no_add_attrs = true;
33638 }
33639 else if (compare_tree_int (cst, 0) != 0
33640 && compare_tree_int (cst, 1) != 0)
33641 {
33642 warning (OPT_Wattributes,
33643 "argument to %qE attribute is neither zero, nor one",
33644 name);
33645 *no_add_attrs = true;
33646 }
33647
33648 return NULL_TREE;
33649 }
33650
33651 return NULL_TREE;
33652 }
33653
33654 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
33655 struct attribute_spec.handler. */
33656 static tree
33657 ix86_handle_abi_attribute (tree *node, tree name,
33658 tree args ATTRIBUTE_UNUSED,
33659 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33660 {
33661 if (TREE_CODE (*node) != FUNCTION_TYPE
33662 && TREE_CODE (*node) != METHOD_TYPE
33663 && TREE_CODE (*node) != FIELD_DECL
33664 && TREE_CODE (*node) != TYPE_DECL)
33665 {
33666 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33667 name);
33668 *no_add_attrs = true;
33669 return NULL_TREE;
33670 }
33671
33672 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
33673 if (is_attribute_p ("ms_abi", name))
33674 {
33675 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
33676 {
33677 error ("ms_abi and sysv_abi attributes are not compatible");
33678 }
33679
33680 return NULL_TREE;
33681 }
33682 else if (is_attribute_p ("sysv_abi", name))
33683 {
33684 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
33685 {
33686 error ("ms_abi and sysv_abi attributes are not compatible");
33687 }
33688
33689 return NULL_TREE;
33690 }
33691
33692 return NULL_TREE;
33693 }
33694
33695 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
33696 struct attribute_spec.handler. */
33697 static tree
33698 ix86_handle_struct_attribute (tree *node, tree name,
33699 tree args ATTRIBUTE_UNUSED,
33700 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33701 {
33702 tree *type = NULL;
33703 if (DECL_P (*node))
33704 {
33705 if (TREE_CODE (*node) == TYPE_DECL)
33706 type = &TREE_TYPE (*node);
33707 }
33708 else
33709 type = node;
33710
33711 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
33712 {
33713 warning (OPT_Wattributes, "%qE attribute ignored",
33714 name);
33715 *no_add_attrs = true;
33716 }
33717
33718 else if ((is_attribute_p ("ms_struct", name)
33719 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
33720 || ((is_attribute_p ("gcc_struct", name)
33721 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
33722 {
33723 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
33724 name);
33725 *no_add_attrs = true;
33726 }
33727
33728 return NULL_TREE;
33729 }
33730
33731 static tree
33732 ix86_handle_fndecl_attribute (tree *node, tree name,
33733 tree args ATTRIBUTE_UNUSED,
33734 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
33735 {
33736 if (TREE_CODE (*node) != FUNCTION_DECL)
33737 {
33738 warning (OPT_Wattributes, "%qE attribute only applies to functions",
33739 name);
33740 *no_add_attrs = true;
33741 }
33742 return NULL_TREE;
33743 }
33744
33745 static bool
33746 ix86_ms_bitfield_layout_p (const_tree record_type)
33747 {
33748 return ((TARGET_MS_BITFIELD_LAYOUT
33749 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
33750 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
33751 }
33752
33753 /* Returns an expression indicating where the this parameter is
33754 located on entry to the FUNCTION. */
33755
33756 static rtx
33757 x86_this_parameter (tree function)
33758 {
33759 tree type = TREE_TYPE (function);
33760 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
33761 int nregs;
33762
33763 if (TARGET_64BIT)
33764 {
33765 const int *parm_regs;
33766
33767 if (ix86_function_type_abi (type) == MS_ABI)
33768 parm_regs = x86_64_ms_abi_int_parameter_registers;
33769 else
33770 parm_regs = x86_64_int_parameter_registers;
33771 return gen_rtx_REG (Pmode, parm_regs[aggr]);
33772 }
33773
33774 nregs = ix86_function_regparm (type, function);
33775
33776 if (nregs > 0 && !stdarg_p (type))
33777 {
33778 int regno;
33779 unsigned int ccvt = ix86_get_callcvt (type);
33780
33781 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
33782 regno = aggr ? DX_REG : CX_REG;
33783 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
33784 {
33785 regno = CX_REG;
33786 if (aggr)
33787 return gen_rtx_MEM (SImode,
33788 plus_constant (Pmode, stack_pointer_rtx, 4));
33789 }
33790 else
33791 {
33792 regno = AX_REG;
33793 if (aggr)
33794 {
33795 regno = DX_REG;
33796 if (nregs == 1)
33797 return gen_rtx_MEM (SImode,
33798 plus_constant (Pmode,
33799 stack_pointer_rtx, 4));
33800 }
33801 }
33802 return gen_rtx_REG (SImode, regno);
33803 }
33804
33805 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
33806 aggr ? 8 : 4));
33807 }
33808
33809 /* Determine whether x86_output_mi_thunk can succeed. */
33810
33811 static bool
33812 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
33813 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
33814 HOST_WIDE_INT vcall_offset, const_tree function)
33815 {
33816 /* 64-bit can handle anything. */
33817 if (TARGET_64BIT)
33818 return true;
33819
33820 /* For 32-bit, everything's fine if we have one free register. */
33821 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
33822 return true;
33823
33824 /* Need a free register for vcall_offset. */
33825 if (vcall_offset)
33826 return false;
33827
33828 /* Need a free register for GOT references. */
33829 if (flag_pic && !targetm.binds_local_p (function))
33830 return false;
33831
33832 /* Otherwise ok. */
33833 return true;
33834 }
33835
33836 /* Output the assembler code for a thunk function. THUNK_DECL is the
33837 declaration for the thunk function itself, FUNCTION is the decl for
33838 the target function. DELTA is an immediate constant offset to be
33839 added to THIS. If VCALL_OFFSET is nonzero, the word at
33840 *(*this + vcall_offset) should be added to THIS. */
33841
33842 static void
33843 x86_output_mi_thunk (FILE *file,
33844 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
33845 HOST_WIDE_INT vcall_offset, tree function)
33846 {
33847 rtx this_param = x86_this_parameter (function);
33848 rtx this_reg, tmp, fnaddr;
33849 unsigned int tmp_regno;
33850
33851 if (TARGET_64BIT)
33852 tmp_regno = R10_REG;
33853 else
33854 {
33855 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
33856 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
33857 tmp_regno = AX_REG;
33858 else
33859 tmp_regno = CX_REG;
33860 }
33861
33862 emit_note (NOTE_INSN_PROLOGUE_END);
33863
33864 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
33865 pull it in now and let DELTA benefit. */
33866 if (REG_P (this_param))
33867 this_reg = this_param;
33868 else if (vcall_offset)
33869 {
33870 /* Put the this parameter into %eax. */
33871 this_reg = gen_rtx_REG (Pmode, AX_REG);
33872 emit_move_insn (this_reg, this_param);
33873 }
33874 else
33875 this_reg = NULL_RTX;
33876
33877 /* Adjust the this parameter by a fixed constant. */
33878 if (delta)
33879 {
33880 rtx delta_rtx = GEN_INT (delta);
33881 rtx delta_dst = this_reg ? this_reg : this_param;
33882
33883 if (TARGET_64BIT)
33884 {
33885 if (!x86_64_general_operand (delta_rtx, Pmode))
33886 {
33887 tmp = gen_rtx_REG (Pmode, tmp_regno);
33888 emit_move_insn (tmp, delta_rtx);
33889 delta_rtx = tmp;
33890 }
33891 }
33892
33893 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
33894 }
33895
33896 /* Adjust the this parameter by a value stored in the vtable. */
33897 if (vcall_offset)
33898 {
33899 rtx vcall_addr, vcall_mem, this_mem;
33900
33901 tmp = gen_rtx_REG (Pmode, tmp_regno);
33902
33903 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
33904 if (Pmode != ptr_mode)
33905 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
33906 emit_move_insn (tmp, this_mem);
33907
33908 /* Adjust the this parameter. */
33909 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
33910 if (TARGET_64BIT
33911 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
33912 {
33913 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
33914 emit_move_insn (tmp2, GEN_INT (vcall_offset));
33915 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
33916 }
33917
33918 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
33919 if (Pmode != ptr_mode)
33920 emit_insn (gen_addsi_1_zext (this_reg,
33921 gen_rtx_REG (ptr_mode,
33922 REGNO (this_reg)),
33923 vcall_mem));
33924 else
33925 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
33926 }
33927
33928 /* If necessary, drop THIS back to its stack slot. */
33929 if (this_reg && this_reg != this_param)
33930 emit_move_insn (this_param, this_reg);
33931
33932 fnaddr = XEXP (DECL_RTL (function), 0);
33933 if (TARGET_64BIT)
33934 {
33935 if (!flag_pic || targetm.binds_local_p (function)
33936 || cfun->machine->call_abi == MS_ABI)
33937 ;
33938 else
33939 {
33940 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
33941 tmp = gen_rtx_CONST (Pmode, tmp);
33942 fnaddr = gen_rtx_MEM (Pmode, tmp);
33943 }
33944 }
33945 else
33946 {
33947 if (!flag_pic || targetm.binds_local_p (function))
33948 ;
33949 #if TARGET_MACHO
33950 else if (TARGET_MACHO)
33951 {
33952 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
33953 fnaddr = XEXP (fnaddr, 0);
33954 }
33955 #endif /* TARGET_MACHO */
33956 else
33957 {
33958 tmp = gen_rtx_REG (Pmode, CX_REG);
33959 output_set_got (tmp, NULL_RTX);
33960
33961 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
33962 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
33963 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
33964 }
33965 }
33966
33967 /* Our sibling call patterns do not allow memories, because we have no
33968 predicate that can distinguish between frame and non-frame memory.
33969 For our purposes here, we can get away with (ab)using a jump pattern,
33970 because we're going to do no optimization. */
33971 if (MEM_P (fnaddr))
33972 emit_jump_insn (gen_indirect_jump (fnaddr));
33973 else
33974 {
33975 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
33976 fnaddr = legitimize_pic_address (fnaddr,
33977 gen_rtx_REG (Pmode, tmp_regno));
33978
33979 if (!sibcall_insn_operand (fnaddr, word_mode))
33980 {
33981 tmp = gen_rtx_REG (word_mode, tmp_regno);
33982 if (GET_MODE (fnaddr) != word_mode)
33983 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
33984 emit_move_insn (tmp, fnaddr);
33985 fnaddr = tmp;
33986 }
33987
33988 tmp = gen_rtx_MEM (QImode, fnaddr);
33989 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
33990 tmp = emit_call_insn (tmp);
33991 SIBLING_CALL_P (tmp) = 1;
33992 }
33993 emit_barrier ();
33994
33995 /* Emit just enough of rest_of_compilation to get the insns emitted.
33996 Note that use_thunk calls assemble_start_function et al. */
33997 tmp = get_insns ();
33998 shorten_branches (tmp);
33999 final_start_function (tmp, file, 1);
34000 final (tmp, file, 1);
34001 final_end_function ();
34002 }
34003
34004 static void
34005 x86_file_start (void)
34006 {
34007 default_file_start ();
34008 #if TARGET_MACHO
34009 darwin_file_start ();
34010 #endif
34011 if (X86_FILE_START_VERSION_DIRECTIVE)
34012 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34013 if (X86_FILE_START_FLTUSED)
34014 fputs ("\t.global\t__fltused\n", asm_out_file);
34015 if (ix86_asm_dialect == ASM_INTEL)
34016 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34017 }
34018
34019 int
34020 x86_field_alignment (tree field, int computed)
34021 {
34022 enum machine_mode mode;
34023 tree type = TREE_TYPE (field);
34024
34025 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34026 return computed;
34027 mode = TYPE_MODE (strip_array_types (type));
34028 if (mode == DFmode || mode == DCmode
34029 || GET_MODE_CLASS (mode) == MODE_INT
34030 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
34031 return MIN (32, computed);
34032 return computed;
34033 }
34034
34035 /* Output assembler code to FILE to increment profiler label # LABELNO
34036 for profiling a function entry. */
34037 void
34038 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
34039 {
34040 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
34041 : MCOUNT_NAME);
34042
34043 if (TARGET_64BIT)
34044 {
34045 #ifndef NO_PROFILE_COUNTERS
34046 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
34047 #endif
34048
34049 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
34050 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
34051 else
34052 fprintf (file, "\tcall\t%s\n", mcount_name);
34053 }
34054 else if (flag_pic)
34055 {
34056 #ifndef NO_PROFILE_COUNTERS
34057 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
34058 LPREFIX, labelno);
34059 #endif
34060 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
34061 }
34062 else
34063 {
34064 #ifndef NO_PROFILE_COUNTERS
34065 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
34066 LPREFIX, labelno);
34067 #endif
34068 fprintf (file, "\tcall\t%s\n", mcount_name);
34069 }
34070 }
34071
34072 /* We don't have exact information about the insn sizes, but we may assume
34073 quite safely that we are informed about all 1 byte insns and memory
34074 address sizes. This is enough to eliminate unnecessary padding in
34075 99% of cases. */
34076
34077 static int
34078 min_insn_size (rtx insn)
34079 {
34080 int l = 0, len;
34081
34082 if (!INSN_P (insn) || !active_insn_p (insn))
34083 return 0;
34084
34085 /* Discard the alignments we've emitted and jump table data. */
34086 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
34087 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
34088 return 0;
34089 if (JUMP_TABLE_DATA_P (insn))
34090 return 0;
34091
34092 /* Important case - calls are always 5 bytes.
34093 It is common to have many calls in a row. */
34094 if (CALL_P (insn)
34095 && symbolic_reference_mentioned_p (PATTERN (insn))
34096 && !SIBLING_CALL_P (insn))
34097 return 5;
34098 len = get_attr_length (insn);
34099 if (len <= 1)
34100 return 1;
34101
34102 /* For normal instructions we rely on get_attr_length being exact,
34103 with a few exceptions. */
34104 if (!JUMP_P (insn))
34105 {
34106 enum attr_type type = get_attr_type (insn);
34107
34108 switch (type)
34109 {
34110 case TYPE_MULTI:
34111 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
34112 || asm_noperands (PATTERN (insn)) >= 0)
34113 return 0;
34114 break;
34115 case TYPE_OTHER:
34116 case TYPE_FCMP:
34117 break;
34118 default:
34119 /* Otherwise trust get_attr_length. */
34120 return len;
34121 }
34122
34123 l = get_attr_length_address (insn);
34124 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
34125 l = 4;
34126 }
34127 if (l)
34128 return 1+l;
34129 else
34130 return 2;
34131 }
34132
34133 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34134
34135 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
34136 window. */
34137
34138 static void
34139 ix86_avoid_jump_mispredicts (void)
34140 {
34141 rtx insn, start = get_insns ();
34142 int nbytes = 0, njumps = 0;
34143 int isjump = 0;
34144
34145 /* Look for all minimal intervals of instructions containing 4 jumps.
34146 The intervals are bounded by START and INSN. NBYTES is the total
34147 size of instructions in the interval including INSN and not including
34148 START.  When NBYTES is smaller than 16 bytes, it is possible
34149 that the ends of START and INSN fall into the same 16-byte window.
34150
34151 The smallest offset at which INSN can start in that window corresponds to
34152 START ending at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
34153 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
34154 */
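/* For example, with NBYTES == 12 the loop below requests padding of up to
   15 - 12 + sizeof (INSN) bytes in front of INSN, enough to guarantee that
   the jumps in this interval can no longer all fall into one 16-byte
   fetch window.  */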
34155 for (insn = start; insn; insn = NEXT_INSN (insn))
34156 {
34157 int min_size;
34158
34159 if (LABEL_P (insn))
34160 {
34161 int align = label_to_alignment (insn);
34162 int max_skip = label_to_max_skip (insn);
34163
34164 if (max_skip > 15)
34165 max_skip = 15;
34166 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
34167 already in the current 16 byte page, because otherwise
34168 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
34169 bytes to reach 16 byte boundary. */
34170 if (align <= 0
34171 || (align <= 3 && max_skip != (1 << align) - 1))
34172 max_skip = 0;
34173 if (dump_file)
34174 fprintf (dump_file, "Label %i with max_skip %i\n",
34175 INSN_UID (insn), max_skip);
34176 if (max_skip)
34177 {
34178 while (nbytes + max_skip >= 16)
34179 {
34180 start = NEXT_INSN (start);
34181 if ((JUMP_P (start)
34182 && GET_CODE (PATTERN (start)) != ADDR_VEC
34183 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34184 || CALL_P (start))
34185 njumps--, isjump = 1;
34186 else
34187 isjump = 0;
34188 nbytes -= min_insn_size (start);
34189 }
34190 }
34191 continue;
34192 }
34193
34194 min_size = min_insn_size (insn);
34195 nbytes += min_size;
34196 if (dump_file)
34197 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
34198 INSN_UID (insn), min_size);
34199 if ((JUMP_P (insn)
34200 && GET_CODE (PATTERN (insn)) != ADDR_VEC
34201 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
34202 || CALL_P (insn))
34203 njumps++;
34204 else
34205 continue;
34206
34207 while (njumps > 3)
34208 {
34209 start = NEXT_INSN (start);
34210 if ((JUMP_P (start)
34211 && GET_CODE (PATTERN (start)) != ADDR_VEC
34212 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
34213 || CALL_P (start))
34214 njumps--, isjump = 1;
34215 else
34216 isjump = 0;
34217 nbytes -= min_insn_size (start);
34218 }
34219 gcc_assert (njumps >= 0);
34220 if (dump_file)
34221 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
34222 INSN_UID (start), INSN_UID (insn), nbytes);
34223
34224 if (njumps == 3 && isjump && nbytes < 16)
34225 {
34226 int padsize = 15 - nbytes + min_insn_size (insn);
34227
34228 if (dump_file)
34229 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
34230 INSN_UID (insn), padsize);
34231 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
34232 }
34233 }
34234 }
34235 #endif
34236
34237 /* AMD Athlon works faster
34238 when RET is not the destination of a conditional jump or directly preceded
34239 by another jump instruction. We avoid the penalty by inserting a NOP just
34240 before such RET instructions. */
34241 static void
34242 ix86_pad_returns (void)
34243 {
34244 edge e;
34245 edge_iterator ei;
34246
34247 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34248 {
34249 basic_block bb = e->src;
34250 rtx ret = BB_END (bb);
34251 rtx prev;
34252 bool replace = false;
34253
34254 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
34255 || optimize_bb_for_size_p (bb))
34256 continue;
34257 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
34258 if (active_insn_p (prev) || LABEL_P (prev))
34259 break;
34260 if (prev && LABEL_P (prev))
34261 {
34262 edge e;
34263 edge_iterator ei;
34264
34265 FOR_EACH_EDGE (e, ei, bb->preds)
34266 if (EDGE_FREQUENCY (e) && e->src->index >= 0
34267 && !(e->flags & EDGE_FALLTHRU))
34268 replace = true;
34269 }
34270 if (!replace)
34271 {
34272 prev = prev_active_insn (ret);
34273 if (prev
34274 && ((JUMP_P (prev) && any_condjump_p (prev))
34275 || CALL_P (prev)))
34276 replace = true;
34277 /* Empty functions get a branch mispredict even when
34278 the jump destination is not visible to us. */
34279 if (!prev && !optimize_function_for_size_p (cfun))
34280 replace = true;
34281 }
34282 if (replace)
34283 {
34284 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
34285 delete_insn (ret);
34286 }
34287 }
34288 }
34289
34290 /* Count the minimum number of instructions in BB. Return 4 if the
34291 number of instructions >= 4. */
34292
34293 static int
34294 ix86_count_insn_bb (basic_block bb)
34295 {
34296 rtx insn;
34297 int insn_count = 0;
34298
34299 /* Count number of instructions in this block. Return 4 if the number
34300 of instructions >= 4. */
34301 FOR_BB_INSNS (bb, insn)
34302 {
34303 /* This only happens in exit blocks. */
34304 if (JUMP_P (insn)
34305 && ANY_RETURN_P (PATTERN (insn)))
34306 break;
34307
34308 if (NONDEBUG_INSN_P (insn)
34309 && GET_CODE (PATTERN (insn)) != USE
34310 && GET_CODE (PATTERN (insn)) != CLOBBER)
34311 {
34312 insn_count++;
34313 if (insn_count >= 4)
34314 return insn_count;
34315 }
34316 }
34317
34318 return insn_count;
34319 }
34320
34321
34322 /* Count the minimum number of instructions in code path in BB.
34323 Return 4 if the number of instructions >= 4. */
34324
34325 static int
34326 ix86_count_insn (basic_block bb)
34327 {
34328 edge e;
34329 edge_iterator ei;
34330 int min_prev_count;
34331
34332 /* Only bother counting instructions along paths with no
34333 more than 2 basic blocks between entry and exit. Given
34334 that BB has an edge to exit, determine if a predecessor
34335 of BB has an edge from entry. If so, compute the number
34336 of instructions in the predecessor block. If there
34337 happen to be multiple such blocks, compute the minimum. */
34338 min_prev_count = 4;
34339 FOR_EACH_EDGE (e, ei, bb->preds)
34340 {
34341 edge prev_e;
34342 edge_iterator prev_ei;
34343
34344 if (e->src == ENTRY_BLOCK_PTR)
34345 {
34346 min_prev_count = 0;
34347 break;
34348 }
34349 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
34350 {
34351 if (prev_e->src == ENTRY_BLOCK_PTR)
34352 {
34353 int count = ix86_count_insn_bb (e->src);
34354 if (count < min_prev_count)
34355 min_prev_count = count;
34356 break;
34357 }
34358 }
34359 }
34360
34361 if (min_prev_count < 4)
34362 min_prev_count += ix86_count_insn_bb (bb);
34363
34364 return min_prev_count;
34365 }
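/* In other words, a return block fed directly from the entry block is counted
   by itself, a return block whose predecessor follows the entry block has that
   predecessor's instructions added in, and any longer path is simply treated
   as containing at least four instructions.  */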
34366
34367 /* Pad short function to 4 instructions. */
34368
34369 static void
34370 ix86_pad_short_function (void)
34371 {
34372 edge e;
34373 edge_iterator ei;
34374
34375 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
34376 {
34377 rtx ret = BB_END (e->src);
34378 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
34379 {
34380 int insn_count = ix86_count_insn (e->src);
34381
34382 /* Pad short function. */
34383 if (insn_count < 4)
34384 {
34385 rtx insn = ret;
34386
34387 /* Find epilogue. */
34388 while (insn
34389 && (!NOTE_P (insn)
34390 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
34391 insn = PREV_INSN (insn);
34392
34393 if (!insn)
34394 insn = ret;
34395
34396 /* Two NOPs count as one instruction. */
34397 insn_count = 2 * (4 - insn_count);
34398 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
34399 }
34400 }
34401 }
34402 }
34403
34404 /* Implement machine specific optimizations. We implement padding of returns
34405 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
34406 static void
34407 ix86_reorg (void)
34408 {
34409 /* We are freeing block_for_insn in the toplev to keep compatibility
34410 with old MDEP_REORGS that are not CFG based. Recompute it now. */
34411 compute_bb_for_insn ();
34412
34413 /* Run the vzeroupper optimization if needed. */
34414 if (TARGET_VZEROUPPER)
34415 move_or_delete_vzeroupper ();
34416
34417 if (optimize && optimize_function_for_speed_p (cfun))
34418 {
34419 if (TARGET_PAD_SHORT_FUNCTION)
34420 ix86_pad_short_function ();
34421 else if (TARGET_PAD_RETURNS)
34422 ix86_pad_returns ();
34423 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34424 if (TARGET_FOUR_JUMP_LIMIT)
34425 ix86_avoid_jump_mispredicts ();
34426 #endif
34427 }
34428 }
34429
34430 /* Return nonzero when QImode register that must be represented via REX prefix
34431 is used. */
34432 bool
34433 x86_extended_QIreg_mentioned_p (rtx insn)
34434 {
34435 int i;
34436 extract_insn_cached (insn);
34437 for (i = 0; i < recog_data.n_operands; i++)
34438 if (GENERAL_REG_P (recog_data.operand[i])
34439 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
34440 return true;
34441 return false;
34442 }
34443
34444 /* Return nonzero when P points to register encoded via REX prefix.
34445 Called via for_each_rtx. */
34446 static int
34447 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
34448 {
34449 unsigned int regno;
34450 if (!REG_P (*p))
34451 return 0;
34452 regno = REGNO (*p);
34453 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
34454 }
34455
34456 /* Return true when INSN mentions register that must be encoded using REX
34457 prefix. */
34458 bool
34459 x86_extended_reg_mentioned_p (rtx insn)
34460 {
34461 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
34462 extended_reg_mentioned_1, NULL);
34463 }
34464
34465 /* If profitable, negate (without causing overflow) integer constant
34466 of mode MODE at location LOC. Return true in this case. */
34467 bool
34468 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
34469 {
34470 HOST_WIDE_INT val;
34471
34472 if (!CONST_INT_P (*loc))
34473 return false;
34474
34475 switch (mode)
34476 {
34477 case DImode:
34478 /* DImode x86_64 constants must fit in 32 bits. */
34479 gcc_assert (x86_64_immediate_operand (*loc, mode));
34480
34481 mode = SImode;
34482 break;
34483
34484 case SImode:
34485 case HImode:
34486 case QImode:
34487 break;
34488
34489 default:
34490 gcc_unreachable ();
34491 }
34492
34493 /* Avoid overflows. */
34494 if (mode_signbit_p (mode, *loc))
34495 return false;
34496
34497 val = INTVAL (*loc);
34498
34499 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
34500 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
34501 if ((val < 0 && val != -128)
34502 || val == 128)
34503 {
34504 *loc = GEN_INT (-val);
34505 return true;
34506 }
34507
34508 return false;
34509 }
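/* The -128 / 128 exception above is an encoding detail: -128 still fits in a
   sign-extended 8-bit immediate while +128 does not, so "addl $-128, %eax"
   encodes shorter than "subl $128, %eax" and the sign/op swap goes the other
   way for that value.  */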
34510
34511 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
34512 optabs would emit if we didn't have TFmode patterns. */
34513
34514 void
34515 x86_emit_floatuns (rtx operands[2])
34516 {
34517 rtx neglab, donelab, i0, i1, f0, in, out;
34518 enum machine_mode mode, inmode;
34519
34520 inmode = GET_MODE (operands[1]);
34521 gcc_assert (inmode == SImode || inmode == DImode);
34522
34523 out = operands[0];
34524 in = force_reg (inmode, operands[1]);
34525 mode = GET_MODE (out);
34526 neglab = gen_label_rtx ();
34527 donelab = gen_label_rtx ();
34528 f0 = gen_reg_rtx (mode);
34529
34530 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
34531
34532 expand_float (out, in, 0);
34533
34534 emit_jump_insn (gen_jump (donelab));
34535 emit_barrier ();
34536
34537 emit_label (neglab);
34538
34539 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
34540 1, OPTAB_DIRECT);
34541 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
34542 1, OPTAB_DIRECT);
34543 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
34544
34545 expand_float (f0, i0, 0);
34546
34547 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
34548
34549 emit_label (donelab);
34550 }
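/* The code above handles inputs with the sign bit set by logically shifting
   the value right by one, OR-ing the discarded low bit back in so that
   rounding the halved value behaves like correctly rounding the original,
   converting the halved value as a signed number, and finally doubling the
   result with f0 + f0.  */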
34551 \f
34552 /* AVX2 does support 32-byte integer vector operations,
34553 thus the longest vector we are faced with is V32QImode. */
34554 #define MAX_VECT_LEN 32
34555
34556 struct expand_vec_perm_d
34557 {
34558 rtx target, op0, op1;
34559 unsigned char perm[MAX_VECT_LEN];
34560 enum machine_mode vmode;
34561 unsigned char nelt;
34562 bool one_operand_p;
34563 bool testing_p;
34564 };
34565
34566 static bool canonicalize_perm (struct expand_vec_perm_d *d);
34567 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
34568 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
34569
34570 /* Get a vector mode of the same size as the original but with elements
34571 twice as wide. This is only guaranteed to apply to integral vectors. */
34572
34573 static inline enum machine_mode
34574 get_mode_wider_vector (enum machine_mode o)
34575 {
34576 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
34577 enum machine_mode n = GET_MODE_WIDER_MODE (o);
34578 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
34579 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
34580 return n;
34581 }
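/* E.g. V16QImode yields V8HImode and V8HImode yields V4SImode: the same
   vector size, half as many elements, each element twice as wide.  */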
34582
34583 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34584 with all elements equal to VAR. Return true if successful. */
34585
34586 static bool
34587 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
34588 rtx target, rtx val)
34589 {
34590 bool ok;
34591
34592 switch (mode)
34593 {
34594 case V2SImode:
34595 case V2SFmode:
34596 if (!mmx_ok)
34597 return false;
34598 /* FALLTHRU */
34599
34600 case V4DFmode:
34601 case V4DImode:
34602 case V8SFmode:
34603 case V8SImode:
34604 case V2DFmode:
34605 case V2DImode:
34606 case V4SFmode:
34607 case V4SImode:
34608 {
34609 rtx insn, dup;
34610
34611 /* First attempt to recognize VAL as-is. */
34612 dup = gen_rtx_VEC_DUPLICATE (mode, val);
34613 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
34614 if (recog_memoized (insn) < 0)
34615 {
34616 rtx seq;
34617 /* If that fails, force VAL into a register. */
34618
34619 start_sequence ();
34620 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
34621 seq = get_insns ();
34622 end_sequence ();
34623 if (seq)
34624 emit_insn_before (seq, insn);
34625
34626 ok = recog_memoized (insn) >= 0;
34627 gcc_assert (ok);
34628 }
34629 }
34630 return true;
34631
34632 case V4HImode:
34633 if (!mmx_ok)
34634 return false;
34635 if (TARGET_SSE || TARGET_3DNOW_A)
34636 {
34637 rtx x;
34638
34639 val = gen_lowpart (SImode, val);
34640 x = gen_rtx_TRUNCATE (HImode, val);
34641 x = gen_rtx_VEC_DUPLICATE (mode, x);
34642 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34643 return true;
34644 }
34645 goto widen;
34646
34647 case V8QImode:
34648 if (!mmx_ok)
34649 return false;
34650 goto widen;
34651
34652 case V8HImode:
34653 if (TARGET_SSE2)
34654 {
34655 struct expand_vec_perm_d dperm;
34656 rtx tmp1, tmp2;
34657
34658 permute:
34659 memset (&dperm, 0, sizeof (dperm));
34660 dperm.target = target;
34661 dperm.vmode = mode;
34662 dperm.nelt = GET_MODE_NUNITS (mode);
34663 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
34664 dperm.one_operand_p = true;
34665
34666 /* Extend to SImode using a paradoxical SUBREG. */
34667 tmp1 = gen_reg_rtx (SImode);
34668 emit_move_insn (tmp1, gen_lowpart (SImode, val));
34669
34670 /* Insert the SImode value as low element of a V4SImode vector. */
34671 tmp2 = gen_lowpart (V4SImode, dperm.op0);
34672 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
34673
34674 ok = (expand_vec_perm_1 (&dperm)
34675 || expand_vec_perm_broadcast_1 (&dperm));
34676 gcc_assert (ok);
34677 return ok;
34678 }
34679 goto widen;
34680
34681 case V16QImode:
34682 if (TARGET_SSE2)
34683 goto permute;
34684 goto widen;
34685
34686 widen:
34687 /* Replicate the value once into the next wider mode and recurse. */
34688 {
34689 enum machine_mode smode, wsmode, wvmode;
34690 rtx x;
34691
34692 smode = GET_MODE_INNER (mode);
34693 wvmode = get_mode_wider_vector (mode);
34694 wsmode = GET_MODE_INNER (wvmode);
34695
34696 val = convert_modes (wsmode, smode, val, true);
34697 x = expand_simple_binop (wsmode, ASHIFT, val,
34698 GEN_INT (GET_MODE_BITSIZE (smode)),
34699 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34700 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
34701
34702 x = gen_lowpart (wvmode, target);
34703 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
34704 gcc_assert (ok);
34705 return ok;
34706 }
34707
34708 case V16HImode:
34709 case V32QImode:
34710 {
34711 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
34712 rtx x = gen_reg_rtx (hvmode);
34713
34714 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
34715 gcc_assert (ok);
34716
34717 x = gen_rtx_VEC_CONCAT (mode, x, x);
34718 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34719 }
34720 return true;
34721
34722 default:
34723 return false;
34724 }
34725 }
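/* As an illustration of the "widen" strategy above, broadcasting byte B into
   V8QImode first forms the HImode value (B << 8) | B and recurses to
   broadcast that into V4HImode; the 256-bit V16HImode and V32QImode cases
   instead broadcast into a 128-bit register and concatenate it with itself. */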
34726
34727 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34728 whose ONE_VAR element is VAR, and other elements are zero. Return true
34729 if successful. */
34730
34731 static bool
34732 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
34733 rtx target, rtx var, int one_var)
34734 {
34735 enum machine_mode vsimode;
34736 rtx new_target;
34737 rtx x, tmp;
34738 bool use_vector_set = false;
34739
34740 switch (mode)
34741 {
34742 case V2DImode:
34743 /* For SSE4.1, we normally use vector set. But if the second
34744 element is zero and inter-unit moves are OK, we use movq
34745 instead. */
34746 use_vector_set = (TARGET_64BIT
34747 && TARGET_SSE4_1
34748 && !(TARGET_INTER_UNIT_MOVES
34749 && one_var == 0));
34750 break;
34751 case V16QImode:
34752 case V4SImode:
34753 case V4SFmode:
34754 use_vector_set = TARGET_SSE4_1;
34755 break;
34756 case V8HImode:
34757 use_vector_set = TARGET_SSE2;
34758 break;
34759 case V4HImode:
34760 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
34761 break;
34762 case V32QImode:
34763 case V16HImode:
34764 case V8SImode:
34765 case V8SFmode:
34766 case V4DFmode:
34767 use_vector_set = TARGET_AVX;
34768 break;
34769 case V4DImode:
34770 /* Use ix86_expand_vector_set in 64bit mode only. */
34771 use_vector_set = TARGET_AVX && TARGET_64BIT;
34772 break;
34773 default:
34774 break;
34775 }
34776
34777 if (use_vector_set)
34778 {
34779 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
34780 var = force_reg (GET_MODE_INNER (mode), var);
34781 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34782 return true;
34783 }
34784
34785 switch (mode)
34786 {
34787 case V2SFmode:
34788 case V2SImode:
34789 if (!mmx_ok)
34790 return false;
34791 /* FALLTHRU */
34792
34793 case V2DFmode:
34794 case V2DImode:
34795 if (one_var != 0)
34796 return false;
34797 var = force_reg (GET_MODE_INNER (mode), var);
34798 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
34799 emit_insn (gen_rtx_SET (VOIDmode, target, x));
34800 return true;
34801
34802 case V4SFmode:
34803 case V4SImode:
34804 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
34805 new_target = gen_reg_rtx (mode);
34806 else
34807 new_target = target;
34808 var = force_reg (GET_MODE_INNER (mode), var);
34809 x = gen_rtx_VEC_DUPLICATE (mode, var);
34810 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
34811 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
34812 if (one_var != 0)
34813 {
34814 /* We need to shuffle the value to the correct position, so
34815 create a new pseudo to store the intermediate result. */
34816
34817 /* With SSE2, we can use the integer shuffle insns. */
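/* new_target holds (var, 0, 0, 0) at this point; e.g. one_var == 2
   yields the selector (1, 1, 0, 1) below, producing (0, 0, var, 0).  */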
34818 if (mode != V4SFmode && TARGET_SSE2)
34819 {
34820 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
34821 const1_rtx,
34822 GEN_INT (one_var == 1 ? 0 : 1),
34823 GEN_INT (one_var == 2 ? 0 : 1),
34824 GEN_INT (one_var == 3 ? 0 : 1)));
34825 if (target != new_target)
34826 emit_move_insn (target, new_target);
34827 return true;
34828 }
34829
34830 /* Otherwise convert the intermediate result to V4SFmode and
34831 use the SSE1 shuffle instructions. */
34832 if (mode != V4SFmode)
34833 {
34834 tmp = gen_reg_rtx (V4SFmode);
34835 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
34836 }
34837 else
34838 tmp = new_target;
34839
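/* In gen_sse_shufps_v4sf, selectors 0..3 pick from the first source
   and 4..7 from the second, hence the +4 on the last two lanes here
   and in the other shufps uses in this file.  */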
34840 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
34841 const1_rtx,
34842 GEN_INT (one_var == 1 ? 0 : 1),
34843 GEN_INT (one_var == 2 ? 0+4 : 1+4),
34844 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
34845
34846 if (mode != V4SFmode)
34847 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
34848 else if (tmp != target)
34849 emit_move_insn (target, tmp);
34850 }
34851 else if (target != new_target)
34852 emit_move_insn (target, new_target);
34853 return true;
34854
34855 case V8HImode:
34856 case V16QImode:
34857 vsimode = V4SImode;
34858 goto widen;
34859 case V4HImode:
34860 case V8QImode:
34861 if (!mmx_ok)
34862 return false;
34863 vsimode = V2SImode;
34864 goto widen;
34865 widen:
34866 if (one_var != 0)
34867 return false;
34868
34869 /* Zero extend the variable element to SImode and recurse. */
34870 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
34871
34872 x = gen_reg_rtx (vsimode);
34873 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
34874 var, one_var))
34875 gcc_unreachable ();
34876
34877 emit_move_insn (target, gen_lowpart (mode, x));
34878 return true;
34879
34880 default:
34881 return false;
34882 }
34883 }
34884
34885 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
34886 consisting of the values in VALS. It is known that all elements
34887 except ONE_VAR are constants. Return true if successful. */
34888
34889 static bool
34890 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
34891 rtx target, rtx vals, int one_var)
34892 {
34893 rtx var = XVECEXP (vals, 0, one_var);
34894 enum machine_mode wmode;
34895 rtx const_vec, x;
34896
34897 const_vec = copy_rtx (vals);
34898 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
34899 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
34900
34901 switch (mode)
34902 {
34903 case V2DFmode:
34904 case V2DImode:
34905 case V2SFmode:
34906 case V2SImode:
34907 /* For the two element vectors, it's just as easy to use
34908 the general case. */
34909 return false;
34910
34911 case V4DImode:
34912 /* Use ix86_expand_vector_set in 64bit mode only. */
34913 if (!TARGET_64BIT)
34914 return false;
34915 case V4DFmode:
34916 case V8SFmode:
34917 case V8SImode:
34918 case V16HImode:
34919 case V32QImode:
34920 case V4SFmode:
34921 case V4SImode:
34922 case V8HImode:
34923 case V4HImode:
34924 break;
34925
34926 case V16QImode:
34927 if (TARGET_SSE4_1)
34928 break;
34929 wmode = V8HImode;
34930 goto widen;
34931 case V8QImode:
34932 wmode = V4HImode;
34933 goto widen;
34934 widen:
34935 /* There's no way to set one QImode entry easily. Combine
34936 the variable value with its adjacent constant value, and
34937 promote to an HImode set. */
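/* e.g. one_var == 5: the variable byte goes into the high half of the
   HImode value, constant element 4 into the low half, and the pair is
   inserted at HImode index 2 (one_var >> 1).  */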
34938 x = XVECEXP (vals, 0, one_var ^ 1);
34939 if (one_var & 1)
34940 {
34941 var = convert_modes (HImode, QImode, var, true);
34942 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
34943 NULL_RTX, 1, OPTAB_LIB_WIDEN);
34944 x = GEN_INT (INTVAL (x) & 0xff);
34945 }
34946 else
34947 {
34948 var = convert_modes (HImode, QImode, var, true);
34949 x = gen_int_mode (INTVAL (x) << 8, HImode);
34950 }
34951 if (x != const0_rtx)
34952 var = expand_simple_binop (HImode, IOR, var, x, var,
34953 1, OPTAB_LIB_WIDEN);
34954
34955 x = gen_reg_rtx (wmode);
34956 emit_move_insn (x, gen_lowpart (wmode, const_vec));
34957 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
34958
34959 emit_move_insn (target, gen_lowpart (mode, x));
34960 return true;
34961
34962 default:
34963 return false;
34964 }
34965
34966 emit_move_insn (target, const_vec);
34967 ix86_expand_vector_set (mmx_ok, target, var, one_var);
34968 return true;
34969 }
34970
34971 /* A subroutine of ix86_expand_vector_init_general. Use vector
34972 concatenate to handle the most general case: all values variable,
34973 and none identical. */
34974
34975 static void
34976 ix86_expand_vector_init_concat (enum machine_mode mode,
34977 rtx target, rtx *ops, int n)
34978 {
34979 enum machine_mode cmode, hmode = VOIDmode;
34980 rtx first[8], second[4];
34981 rtvec v;
34982 int i, j;
34983
34984 switch (n)
34985 {
34986 case 2:
34987 switch (mode)
34988 {
34989 case V8SImode:
34990 cmode = V4SImode;
34991 break;
34992 case V8SFmode:
34993 cmode = V4SFmode;
34994 break;
34995 case V4DImode:
34996 cmode = V2DImode;
34997 break;
34998 case V4DFmode:
34999 cmode = V2DFmode;
35000 break;
35001 case V4SImode:
35002 cmode = V2SImode;
35003 break;
35004 case V4SFmode:
35005 cmode = V2SFmode;
35006 break;
35007 case V2DImode:
35008 cmode = DImode;
35009 break;
35010 case V2SImode:
35011 cmode = SImode;
35012 break;
35013 case V2DFmode:
35014 cmode = DFmode;
35015 break;
35016 case V2SFmode:
35017 cmode = SFmode;
35018 break;
35019 default:
35020 gcc_unreachable ();
35021 }
35022
35023 if (!register_operand (ops[1], cmode))
35024 ops[1] = force_reg (cmode, ops[1]);
35025 if (!register_operand (ops[0], cmode))
35026 ops[0] = force_reg (cmode, ops[0]);
35027 emit_insn (gen_rtx_SET (VOIDmode, target,
35028 gen_rtx_VEC_CONCAT (mode, ops[0],
35029 ops[1])));
35030 break;
35031
35032 case 4:
35033 switch (mode)
35034 {
35035 case V4DImode:
35036 cmode = V2DImode;
35037 break;
35038 case V4DFmode:
35039 cmode = V2DFmode;
35040 break;
35041 case V4SImode:
35042 cmode = V2SImode;
35043 break;
35044 case V4SFmode:
35045 cmode = V2SFmode;
35046 break;
35047 default:
35048 gcc_unreachable ();
35049 }
35050 goto half;
35051
35052 case 8:
35053 switch (mode)
35054 {
35055 case V8SImode:
35056 cmode = V2SImode;
35057 hmode = V4SImode;
35058 break;
35059 case V8SFmode:
35060 cmode = V2SFmode;
35061 hmode = V4SFmode;
35062 break;
35063 default:
35064 gcc_unreachable ();
35065 }
35066 goto half;
35067
35068 half:
35069 /* FIXME: We process inputs backward to help RA. PR 36222. */
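/* e.g. n == 8 in V8SImode: the loop builds four V2SImode pairs, the
   recursive calls below concatenate those into two V4SImode halves,
   and a final call concatenates the halves into the V8SImode target.  */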
35070 i = n - 1;
35071 j = (n >> 1) - 1;
35072 for (; i > 0; i -= 2, j--)
35073 {
35074 first[j] = gen_reg_rtx (cmode);
35075 v = gen_rtvec (2, ops[i - 1], ops[i]);
35076 ix86_expand_vector_init (false, first[j],
35077 gen_rtx_PARALLEL (cmode, v));
35078 }
35079
35080 n >>= 1;
35081 if (n > 2)
35082 {
35083 gcc_assert (hmode != VOIDmode);
35084 for (i = j = 0; i < n; i += 2, j++)
35085 {
35086 second[j] = gen_reg_rtx (hmode);
35087 ix86_expand_vector_init_concat (hmode, second [j],
35088 &first [i], 2);
35089 }
35090 n >>= 1;
35091 ix86_expand_vector_init_concat (mode, target, second, n);
35092 }
35093 else
35094 ix86_expand_vector_init_concat (mode, target, first, n);
35095 break;
35096
35097 default:
35098 gcc_unreachable ();
35099 }
35100 }
35101
35102 /* A subroutine of ix86_expand_vector_init_general. Use vector
35103 interleave to handle the most general case: all values variable,
35104 and none identical. */
35105
35106 static void
35107 ix86_expand_vector_init_interleave (enum machine_mode mode,
35108 rtx target, rtx *ops, int n)
35109 {
35110 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
35111 int i, j;
35112 rtx op0, op1;
35113 rtx (*gen_load_even) (rtx, rtx, rtx);
35114 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
35115 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
35116
35117 switch (mode)
35118 {
35119 case V8HImode:
35120 gen_load_even = gen_vec_setv8hi;
35121 gen_interleave_first_low = gen_vec_interleave_lowv4si;
35122 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35123 inner_mode = HImode;
35124 first_imode = V4SImode;
35125 second_imode = V2DImode;
35126 third_imode = VOIDmode;
35127 break;
35128 case V16QImode:
35129 gen_load_even = gen_vec_setv16qi;
35130 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
35131 gen_interleave_second_low = gen_vec_interleave_lowv4si;
35132 inner_mode = QImode;
35133 first_imode = V8HImode;
35134 second_imode = V4SImode;
35135 third_imode = V2DImode;
35136 break;
35137 default:
35138 gcc_unreachable ();
35139 }
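/* e.g. for V16QImode (called with n == 8 pairs): each iteration below
   packs two adjacent QImode elements into bytes 0 and 1 of a fresh
   vector, and the interleave-low steps then merge those vectors
   8 -> 4 -> 2 -> 1 through V8HImode, V4SImode and V2DImode.  */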
35140
35141 for (i = 0; i < n; i++)
35142 {
35143 /* Extend the odd element to SImode using a paradoxical SUBREG. */
35144 op0 = gen_reg_rtx (SImode);
35145 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
35146
35147 /* Insert the SImode value as low element of V4SImode vector. */
35148 op1 = gen_reg_rtx (V4SImode);
35149 op0 = gen_rtx_VEC_MERGE (V4SImode,
35150 gen_rtx_VEC_DUPLICATE (V4SImode,
35151 op0),
35152 CONST0_RTX (V4SImode),
35153 const1_rtx);
35154 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
35155
35156 /* Cast the V4SImode vector back to a vector in original mode. */
35157 op0 = gen_reg_rtx (mode);
35158 emit_move_insn (op0, gen_lowpart (mode, op1));
35159
35160 /* Load even elements into the second position. */
35161 emit_insn (gen_load_even (op0,
35162 force_reg (inner_mode,
35163 ops [i + i + 1]),
35164 const1_rtx));
35165
35166 /* Cast vector to FIRST_IMODE vector. */
35167 ops[i] = gen_reg_rtx (first_imode);
35168 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
35169 }
35170
35171 /* Interleave low FIRST_IMODE vectors. */
35172 for (i = j = 0; i < n; i += 2, j++)
35173 {
35174 op0 = gen_reg_rtx (first_imode);
35175 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
35176
35177 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
35178 ops[j] = gen_reg_rtx (second_imode);
35179 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
35180 }
35181
35182 /* Interleave low SECOND_IMODE vectors. */
35183 switch (second_imode)
35184 {
35185 case V4SImode:
35186 for (i = j = 0; i < n / 2; i += 2, j++)
35187 {
35188 op0 = gen_reg_rtx (second_imode);
35189 emit_insn (gen_interleave_second_low (op0, ops[i],
35190 ops[i + 1]));
35191
35192 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
35193 vector. */
35194 ops[j] = gen_reg_rtx (third_imode);
35195 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
35196 }
35197 second_imode = V2DImode;
35198 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35199 /* FALLTHRU */
35200
35201 case V2DImode:
35202 op0 = gen_reg_rtx (second_imode);
35203 emit_insn (gen_interleave_second_low (op0, ops[0],
35204 ops[1]));
35205
35206 /* Cast the SECOND_IMODE vector back to a vector in the original
35207 mode. */
35208 emit_insn (gen_rtx_SET (VOIDmode, target,
35209 gen_lowpart (mode, op0)));
35210 break;
35211
35212 default:
35213 gcc_unreachable ();
35214 }
35215 }
35216
35217 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
35218 all values variable, and none identical. */
35219
35220 static void
35221 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
35222 rtx target, rtx vals)
35223 {
35224 rtx ops[32], op0, op1;
35225 enum machine_mode half_mode = VOIDmode;
35226 int n, i;
35227
35228 switch (mode)
35229 {
35230 case V2SFmode:
35231 case V2SImode:
35232 if (!mmx_ok && !TARGET_SSE)
35233 break;
35234 /* FALLTHRU */
35235
35236 case V8SFmode:
35237 case V8SImode:
35238 case V4DFmode:
35239 case V4DImode:
35240 case V4SFmode:
35241 case V4SImode:
35242 case V2DFmode:
35243 case V2DImode:
35244 n = GET_MODE_NUNITS (mode);
35245 for (i = 0; i < n; i++)
35246 ops[i] = XVECEXP (vals, 0, i);
35247 ix86_expand_vector_init_concat (mode, target, ops, n);
35248 return;
35249
35250 case V32QImode:
35251 half_mode = V16QImode;
35252 goto half;
35253
35254 case V16HImode:
35255 half_mode = V8HImode;
35256 goto half;
35257
35258 half:
35259 n = GET_MODE_NUNITS (mode);
35260 for (i = 0; i < n; i++)
35261 ops[i] = XVECEXP (vals, 0, i);
35262 op0 = gen_reg_rtx (half_mode);
35263 op1 = gen_reg_rtx (half_mode);
35264 ix86_expand_vector_init_interleave (half_mode, op0, ops,
35265 n >> 2);
35266 ix86_expand_vector_init_interleave (half_mode, op1,
35267 &ops [n >> 1], n >> 2);
35268 emit_insn (gen_rtx_SET (VOIDmode, target,
35269 gen_rtx_VEC_CONCAT (mode, op0, op1)));
35270 return;
35271
35272 case V16QImode:
35273 if (!TARGET_SSE4_1)
35274 break;
35275 /* FALLTHRU */
35276
35277 case V8HImode:
35278 if (!TARGET_SSE2)
35279 break;
35280
35281 /* Don't use ix86_expand_vector_init_interleave if we can't
35282 move from GPR to SSE register directly. */
35283 if (!TARGET_INTER_UNIT_MOVES)
35284 break;
35285
35286 n = GET_MODE_NUNITS (mode);
35287 for (i = 0; i < n; i++)
35288 ops[i] = XVECEXP (vals, 0, i);
35289 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
35290 return;
35291
35292 case V4HImode:
35293 case V8QImode:
35294 break;
35295
35296 default:
35297 gcc_unreachable ();
35298 }
35299
35300 {
35301 int i, j, n_elts, n_words, n_elt_per_word;
35302 enum machine_mode inner_mode;
35303 rtx words[4], shift;
35304
35305 inner_mode = GET_MODE_INNER (mode);
35306 n_elts = GET_MODE_NUNITS (mode);
35307 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
35308 n_elt_per_word = n_elts / n_words;
35309 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
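/* e.g. V8HImode on a 32-bit target: n_words == 4, n_elt_per_word == 2,
   and word i below becomes (elt[2*i+1] << 16) | elt[2*i], keeping the
   lower-numbered element in the low half; the four words then go
   through the V4SImode path at the end.  */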
35310
35311 for (i = 0; i < n_words; ++i)
35312 {
35313 rtx word = NULL_RTX;
35314
35315 for (j = 0; j < n_elt_per_word; ++j)
35316 {
35317 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
35318 elt = convert_modes (word_mode, inner_mode, elt, true);
35319
35320 if (j == 0)
35321 word = elt;
35322 else
35323 {
35324 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
35325 word, 1, OPTAB_LIB_WIDEN);
35326 word = expand_simple_binop (word_mode, IOR, word, elt,
35327 word, 1, OPTAB_LIB_WIDEN);
35328 }
35329 }
35330
35331 words[i] = word;
35332 }
35333
35334 if (n_words == 1)
35335 emit_move_insn (target, gen_lowpart (mode, words[0]));
35336 else if (n_words == 2)
35337 {
35338 rtx tmp = gen_reg_rtx (mode);
35339 emit_clobber (tmp);
35340 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
35341 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
35342 emit_move_insn (target, tmp);
35343 }
35344 else if (n_words == 4)
35345 {
35346 rtx tmp = gen_reg_rtx (V4SImode);
35347 gcc_assert (word_mode == SImode);
35348 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
35349 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
35350 emit_move_insn (target, gen_lowpart (mode, tmp));
35351 }
35352 else
35353 gcc_unreachable ();
35354 }
35355 }
35356
35357 /* Initialize vector TARGET via VALS. Suppress the use of MMX
35358 instructions unless MMX_OK is true. */
35359
35360 void
35361 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
35362 {
35363 enum machine_mode mode = GET_MODE (target);
35364 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35365 int n_elts = GET_MODE_NUNITS (mode);
35366 int n_var = 0, one_var = -1;
35367 bool all_same = true, all_const_zero = true;
35368 int i;
35369 rtx x;
35370
35371 for (i = 0; i < n_elts; ++i)
35372 {
35373 x = XVECEXP (vals, 0, i);
35374 if (!(CONST_INT_P (x)
35375 || GET_CODE (x) == CONST_DOUBLE
35376 || GET_CODE (x) == CONST_FIXED))
35377 n_var++, one_var = i;
35378 else if (x != CONST0_RTX (inner_mode))
35379 all_const_zero = false;
35380 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
35381 all_same = false;
35382 }
35383
35384 /* Constants are best loaded from the constant pool. */
35385 if (n_var == 0)
35386 {
35387 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
35388 return;
35389 }
35390
35391 /* If all values are identical, broadcast the value. */
35392 if (all_same
35393 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
35394 XVECEXP (vals, 0, 0)))
35395 return;
35396
35397 /* Values where only one field is non-constant are best loaded from
35398 the pool and overwritten via move later. */
35399 if (n_var == 1)
35400 {
35401 if (all_const_zero
35402 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
35403 XVECEXP (vals, 0, one_var),
35404 one_var))
35405 return;
35406
35407 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
35408 return;
35409 }
35410
35411 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
35412 }
35413
35414 void
35415 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
35416 {
35417 enum machine_mode mode = GET_MODE (target);
35418 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35419 enum machine_mode half_mode;
35420 bool use_vec_merge = false;
35421 rtx tmp;
35422 static rtx (*gen_extract[6][2]) (rtx, rtx)
35423 = {
35424 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
35425 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
35426 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
35427 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
35428 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
35429 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
35430 };
35431 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
35432 = {
35433 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
35434 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
35435 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
35436 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
35437 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
35438 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
35439 };
35440 int i, j, n;
35441
35442 switch (mode)
35443 {
35444 case V2SFmode:
35445 case V2SImode:
35446 if (mmx_ok)
35447 {
35448 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
35449 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
35450 if (elt == 0)
35451 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
35452 else
35453 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
35454 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35455 return;
35456 }
35457 break;
35458
35459 case V2DImode:
35460 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
35461 if (use_vec_merge)
35462 break;
35463
35464 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
35465 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
35466 if (elt == 0)
35467 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
35468 else
35469 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
35470 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35471 return;
35472
35473 case V2DFmode:
35474 {
35475 rtx op0, op1;
35476
35477 /* For the two element vectors, we implement a VEC_CONCAT with
35478 the extraction of the other element. */
35479
35480 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
35481 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
35482
35483 if (elt == 0)
35484 op0 = val, op1 = tmp;
35485 else
35486 op0 = tmp, op1 = val;
35487
35488 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
35489 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35490 }
35491 return;
35492
35493 case V4SFmode:
35494 use_vec_merge = TARGET_SSE4_1;
35495 if (use_vec_merge)
35496 break;
35497
35498 switch (elt)
35499 {
35500 case 0:
35501 use_vec_merge = true;
35502 break;
35503
35504 case 1:
35505 /* tmp = target = A B C D */
35506 tmp = copy_to_reg (target);
35507 /* target = A A B B */
35508 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
35509 /* target = X A B B */
35510 ix86_expand_vector_set (false, target, val, 0);
35511 /* target = A X C D */
35512 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35513 const1_rtx, const0_rtx,
35514 GEN_INT (2+4), GEN_INT (3+4)));
35515 return;
35516
35517 case 2:
35518 /* tmp = target = A B C D */
35519 tmp = copy_to_reg (target);
35520 /* tmp = X B C D */
35521 ix86_expand_vector_set (false, tmp, val, 0);
35522 /* target = A B X D */
35523 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35524 const0_rtx, const1_rtx,
35525 GEN_INT (0+4), GEN_INT (3+4)));
35526 return;
35527
35528 case 3:
35529 /* tmp = target = A B C D */
35530 tmp = copy_to_reg (target);
35531 /* tmp = X B C D */
35532 ix86_expand_vector_set (false, tmp, val, 0);
35533 /* target = A B C X */
35534 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
35535 const0_rtx, const1_rtx,
35536 GEN_INT (2+4), GEN_INT (0+4)));
35537 return;
35538
35539 default:
35540 gcc_unreachable ();
35541 }
35542 break;
35543
35544 case V4SImode:
35545 use_vec_merge = TARGET_SSE4_1;
35546 if (use_vec_merge)
35547 break;
35548
35549 /* Element 0 handled by vec_merge below. */
35550 if (elt == 0)
35551 {
35552 use_vec_merge = true;
35553 break;
35554 }
35555
35556 if (TARGET_SSE2)
35557 {
35558 /* With SSE2, use integer shuffles to swap element 0 and ELT,
35559 store into element 0, then shuffle them back. */
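/* e.g. elt == 2 gives the order (2, 1, 0, 3); swapping lanes 0 and 2
   is its own inverse, so the same pshufd applied twice restores the
   original layout.  */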
35560
35561 rtx order[4];
35562
35563 order[0] = GEN_INT (elt);
35564 order[1] = const1_rtx;
35565 order[2] = const2_rtx;
35566 order[3] = GEN_INT (3);
35567 order[elt] = const0_rtx;
35568
35569 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
35570 order[1], order[2], order[3]));
35571
35572 ix86_expand_vector_set (false, target, val, 0);
35573
35574 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
35575 order[1], order[2], order[3]));
35576 }
35577 else
35578 {
35579 /* For SSE1, we have to reuse the V4SF code. */
35580 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
35581 gen_lowpart (SFmode, val), elt);
35582 }
35583 return;
35584
35585 case V8HImode:
35586 use_vec_merge = TARGET_SSE2;
35587 break;
35588 case V4HImode:
35589 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35590 break;
35591
35592 case V16QImode:
35593 use_vec_merge = TARGET_SSE4_1;
35594 break;
35595
35596 case V8QImode:
35597 break;
35598
35599 case V32QImode:
35600 half_mode = V16QImode;
35601 j = 0;
35602 n = 16;
35603 goto half;
35604
35605 case V16HImode:
35606 half_mode = V8HImode;
35607 j = 1;
35608 n = 8;
35609 goto half;
35610
35611 case V8SImode:
35612 half_mode = V4SImode;
35613 j = 2;
35614 n = 4;
35615 goto half;
35616
35617 case V4DImode:
35618 half_mode = V2DImode;
35619 j = 3;
35620 n = 2;
35621 goto half;
35622
35623 case V8SFmode:
35624 half_mode = V4SFmode;
35625 j = 4;
35626 n = 4;
35627 goto half;
35628
35629 case V4DFmode:
35630 half_mode = V2DFmode;
35631 j = 5;
35632 n = 2;
35633 goto half;
35634
35635 half:
35636 /* Compute offset. */
35637 i = elt / n;
35638 elt %= n;
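/* e.g. V8SFmode with elt == 6: i == 1 selects the high 128-bit lane
   and elt becomes 2 within that lane.  */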
35639
35640 gcc_assert (i <= 1);
35641
35642 /* Extract the half. */
35643 tmp = gen_reg_rtx (half_mode);
35644 emit_insn (gen_extract[j][i] (tmp, target));
35645
35646 /* Put val in tmp at elt. */
35647 ix86_expand_vector_set (false, tmp, val, elt);
35648
35649 /* Put it back. */
35650 emit_insn (gen_insert[j][i] (target, target, tmp));
35651 return;
35652
35653 default:
35654 break;
35655 }
35656
35657 if (use_vec_merge)
35658 {
35659 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
35660 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
35661 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35662 }
35663 else
35664 {
35665 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35666
35667 emit_move_insn (mem, target);
35668
35669 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35670 emit_move_insn (tmp, val);
35671
35672 emit_move_insn (target, mem);
35673 }
35674 }
35675
35676 void
35677 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
35678 {
35679 enum machine_mode mode = GET_MODE (vec);
35680 enum machine_mode inner_mode = GET_MODE_INNER (mode);
35681 bool use_vec_extr = false;
35682 rtx tmp;
35683
35684 switch (mode)
35685 {
35686 case V2SImode:
35687 case V2SFmode:
35688 if (!mmx_ok)
35689 break;
35690 /* FALLTHRU */
35691
35692 case V2DFmode:
35693 case V2DImode:
35694 use_vec_extr = true;
35695 break;
35696
35697 case V4SFmode:
35698 use_vec_extr = TARGET_SSE4_1;
35699 if (use_vec_extr)
35700 break;
35701
35702 switch (elt)
35703 {
35704 case 0:
35705 tmp = vec;
35706 break;
35707
35708 case 1:
35709 case 3:
35710 tmp = gen_reg_rtx (mode);
35711 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
35712 GEN_INT (elt), GEN_INT (elt),
35713 GEN_INT (elt+4), GEN_INT (elt+4)));
35714 break;
35715
35716 case 2:
35717 tmp = gen_reg_rtx (mode);
35718 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
35719 break;
35720
35721 default:
35722 gcc_unreachable ();
35723 }
35724 vec = tmp;
35725 use_vec_extr = true;
35726 elt = 0;
35727 break;
35728
35729 case V4SImode:
35730 use_vec_extr = TARGET_SSE4_1;
35731 if (use_vec_extr)
35732 break;
35733
35734 if (TARGET_SSE2)
35735 {
35736 switch (elt)
35737 {
35738 case 0:
35739 tmp = vec;
35740 break;
35741
35742 case 1:
35743 case 3:
35744 tmp = gen_reg_rtx (mode);
35745 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
35746 GEN_INT (elt), GEN_INT (elt),
35747 GEN_INT (elt), GEN_INT (elt)));
35748 break;
35749
35750 case 2:
35751 tmp = gen_reg_rtx (mode);
35752 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
35753 break;
35754
35755 default:
35756 gcc_unreachable ();
35757 }
35758 vec = tmp;
35759 use_vec_extr = true;
35760 elt = 0;
35761 }
35762 else
35763 {
35764 /* For SSE1, we have to reuse the V4SF code. */
35765 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
35766 gen_lowpart (V4SFmode, vec), elt);
35767 return;
35768 }
35769 break;
35770
35771 case V8HImode:
35772 use_vec_extr = TARGET_SSE2;
35773 break;
35774 case V4HImode:
35775 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
35776 break;
35777
35778 case V16QImode:
35779 use_vec_extr = TARGET_SSE4_1;
35780 break;
35781
35782 case V8SFmode:
35783 if (TARGET_AVX)
35784 {
35785 tmp = gen_reg_rtx (V4SFmode);
35786 if (elt < 4)
35787 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
35788 else
35789 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
35790 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35791 return;
35792 }
35793 break;
35794
35795 case V4DFmode:
35796 if (TARGET_AVX)
35797 {
35798 tmp = gen_reg_rtx (V2DFmode);
35799 if (elt < 2)
35800 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
35801 else
35802 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
35803 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35804 return;
35805 }
35806 break;
35807
35808 case V32QImode:
35809 if (TARGET_AVX)
35810 {
35811 tmp = gen_reg_rtx (V16QImode);
35812 if (elt < 16)
35813 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
35814 else
35815 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
35816 ix86_expand_vector_extract (false, target, tmp, elt & 15);
35817 return;
35818 }
35819 break;
35820
35821 case V16HImode:
35822 if (TARGET_AVX)
35823 {
35824 tmp = gen_reg_rtx (V8HImode);
35825 if (elt < 8)
35826 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
35827 else
35828 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
35829 ix86_expand_vector_extract (false, target, tmp, elt & 7);
35830 return;
35831 }
35832 break;
35833
35834 case V8SImode:
35835 if (TARGET_AVX)
35836 {
35837 tmp = gen_reg_rtx (V4SImode);
35838 if (elt < 4)
35839 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
35840 else
35841 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
35842 ix86_expand_vector_extract (false, target, tmp, elt & 3);
35843 return;
35844 }
35845 break;
35846
35847 case V4DImode:
35848 if (TARGET_AVX)
35849 {
35850 tmp = gen_reg_rtx (V2DImode);
35851 if (elt < 2)
35852 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
35853 else
35854 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
35855 ix86_expand_vector_extract (false, target, tmp, elt & 1);
35856 return;
35857 }
35858 break;
35859
35860 case V8QImode:
35861 /* ??? Could extract the appropriate HImode element and shift. */
35862 default:
35863 break;
35864 }
35865
35866 if (use_vec_extr)
35867 {
35868 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
35869 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
35870
35871 /* Let the rtl optimizers know about the zero extension performed. */
35872 if (inner_mode == QImode || inner_mode == HImode)
35873 {
35874 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
35875 target = gen_lowpart (SImode, target);
35876 }
35877
35878 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
35879 }
35880 else
35881 {
35882 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
35883
35884 emit_move_insn (mem, vec);
35885
35886 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
35887 emit_move_insn (target, tmp);
35888 }
35889 }
35890
35891 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
35892 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
35893 The upper bits of DEST are undefined, though they shouldn't cause
35894 exceptions (some bits from src or all zeros are ok). */
35895
35896 static void
35897 emit_reduc_half (rtx dest, rtx src, int i)
35898 {
35899 rtx tem;
35900 switch (GET_MODE (src))
35901 {
35902 case V4SFmode:
35903 if (i == 128)
35904 tem = gen_sse_movhlps (dest, src, src);
35905 else
35906 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
35907 GEN_INT (1 + 4), GEN_INT (1 + 4));
35908 break;
35909 case V2DFmode:
35910 tem = gen_vec_interleave_highv2df (dest, src, src);
35911 break;
35912 case V16QImode:
35913 case V8HImode:
35914 case V4SImode:
35915 case V2DImode:
35916 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
35917 gen_lowpart (V1TImode, src),
35918 GEN_INT (i / 2));
35919 break;
35920 case V8SFmode:
35921 if (i == 256)
35922 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
35923 else
35924 tem = gen_avx_shufps256 (dest, src, src,
35925 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
35926 break;
35927 case V4DFmode:
35928 if (i == 256)
35929 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
35930 else
35931 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
35932 break;
35933 case V32QImode:
35934 case V16HImode:
35935 case V8SImode:
35936 case V4DImode:
35937 if (i == 256)
35938 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
35939 gen_lowpart (V4DImode, src),
35940 gen_lowpart (V4DImode, src),
35941 const1_rtx);
35942 else
35943 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
35944 gen_lowpart (V2TImode, src),
35945 GEN_INT (i / 2));
35946 break;
35947 default:
35948 gcc_unreachable ();
35949 }
35950 emit_insn (tem);
35951 }
35952
35953 /* Expand a vector reduction. FN is the binary pattern to reduce;
35954 DEST is the destination; IN is the input vector. */
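/* Roughly, the loop below folds the upper half of the vector onto the
   lower half log2(nelts) times:
     for (i = width; i > element width; i >>= 1)
       { half = high part of vec;  vec = fn (half, vec); }
   e.g. a V4SImode reduction takes two steps, 128 -> 64 -> 32 bits.  */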
35955
35956 void
35957 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
35958 {
35959 rtx half, dst, vec = in;
35960 enum machine_mode mode = GET_MODE (in);
35961 int i;
35962
35963 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
35964 if (TARGET_SSE4_1
35965 && mode == V8HImode
35966 && fn == gen_uminv8hi3)
35967 {
35968 emit_insn (gen_sse4_1_phminposuw (dest, in));
35969 return;
35970 }
35971
35972 for (i = GET_MODE_BITSIZE (mode);
35973 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
35974 i >>= 1)
35975 {
35976 half = gen_reg_rtx (mode);
35977 emit_reduc_half (half, vec, i);
35978 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
35979 dst = dest;
35980 else
35981 dst = gen_reg_rtx (mode);
35982 emit_insn (fn (dst, half, vec));
35983 vec = dst;
35984 }
35985 }
35986 \f
35987 /* Target hook for scalar_mode_supported_p. */
35988 static bool
35989 ix86_scalar_mode_supported_p (enum machine_mode mode)
35990 {
35991 if (DECIMAL_FLOAT_MODE_P (mode))
35992 return default_decimal_float_supported_p ();
35993 else if (mode == TFmode)
35994 return true;
35995 else
35996 return default_scalar_mode_supported_p (mode);
35997 }
35998
35999 /* Implements target hook vector_mode_supported_p. */
36000 static bool
36001 ix86_vector_mode_supported_p (enum machine_mode mode)
36002 {
36003 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36004 return true;
36005 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36006 return true;
36007 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36008 return true;
36009 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36010 return true;
36011 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36012 return true;
36013 return false;
36014 }
36015
36016 /* Target hook for c_mode_for_suffix. */
36017 static enum machine_mode
36018 ix86_c_mode_for_suffix (char suffix)
36019 {
36020 if (suffix == 'q')
36021 return TFmode;
36022 if (suffix == 'w')
36023 return XFmode;
36024
36025 return VOIDmode;
36026 }
36027
36028 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36029
36030 We do this in the new i386 backend to maintain source compatibility
36031 with the old cc0-based compiler. */
36032
36033 static tree
36034 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
36035 tree inputs ATTRIBUTE_UNUSED,
36036 tree clobbers)
36037 {
36038 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
36039 clobbers);
36040 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
36041 clobbers);
36042 return clobbers;
36043 }
36044
36045 /* Implements target vector targetm.asm.encode_section_info. */
36046
36047 static void ATTRIBUTE_UNUSED
36048 ix86_encode_section_info (tree decl, rtx rtl, int first)
36049 {
36050 default_encode_section_info (decl, rtl, first);
36051
36052 if (TREE_CODE (decl) == VAR_DECL
36053 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
36054 && ix86_in_large_data_p (decl))
36055 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
36056 }
36057
36058 /* Worker function for REVERSE_CONDITION. */
36059
36060 enum rtx_code
36061 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
36062 {
36063 return (mode != CCFPmode && mode != CCFPUmode
36064 ? reverse_condition (code)
36065 : reverse_condition_maybe_unordered (code));
36066 }
36067
36068 /* Output code to perform an x87 FP register move, from OPERANDS[1]
36069 to OPERANDS[0]. */
36070
36071 const char *
36072 output_387_reg_move (rtx insn, rtx *operands)
36073 {
36074 if (REG_P (operands[0]))
36075 {
36076 if (REG_P (operands[1])
36077 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36078 {
36079 if (REGNO (operands[0]) == FIRST_STACK_REG)
36080 return output_387_ffreep (operands, 0);
36081 return "fstp\t%y0";
36082 }
36083 if (STACK_TOP_P (operands[0]))
36084 return "fld%Z1\t%y1";
36085 return "fst\t%y0";
36086 }
36087 else if (MEM_P (operands[0]))
36088 {
36089 gcc_assert (REG_P (operands[1]));
36090 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36091 return "fstp%Z0\t%y0";
36092 else
36093 {
36094 /* There is no non-popping store to memory for XFmode.
36095 So if we need one, follow the store with a load. */
36096 if (GET_MODE (operands[0]) == XFmode)
36097 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
36098 else
36099 return "fst%Z0\t%y0";
36100 }
36101 }
36102 else
36103 gcc_unreachable ();
36104 }
36105
36106 /* Output code to perform a conditional jump to LABEL, if C2 flag in
36107 FP status register is set. */
36108
36109 void
36110 ix86_emit_fp_unordered_jump (rtx label)
36111 {
36112 rtx reg = gen_reg_rtx (HImode);
36113 rtx temp;
36114
36115 emit_insn (gen_x86_fnstsw_1 (reg));
36116
36117 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
36118 {
36119 emit_insn (gen_x86_sahf_1 (reg));
36120
36121 temp = gen_rtx_REG (CCmode, FLAGS_REG);
36122 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
36123 }
36124 else
36125 {
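/* 0x04 tests bit 2 of the high byte of the FP status word, i.e. the
   C2 flag (bit 10 of the full word); the SAHF path above reads the
   same bit through PF.  */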
36126 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
36127
36128 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
36129 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
36130 }
36131
36132 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
36133 gen_rtx_LABEL_REF (VOIDmode, label),
36134 pc_rtx);
36135 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
36136
36137 emit_jump_insn (temp);
36138 predict_jump (REG_BR_PROB_BASE * 10 / 100);
36139 }
36140
36141 /* Output code to perform a log1p XFmode calculation. */
36142
36143 void ix86_emit_i387_log1p (rtx op0, rtx op1)
36144 {
36145 rtx label1 = gen_label_rtx ();
36146 rtx label2 = gen_label_rtx ();
36147
36148 rtx tmp = gen_reg_rtx (XFmode);
36149 rtx tmp2 = gen_reg_rtx (XFmode);
36150 rtx test;
36151
36152 emit_insn (gen_absxf2 (tmp, op1));
36153 test = gen_rtx_GE (VOIDmode, tmp,
36154 CONST_DOUBLE_FROM_REAL_VALUE (
36155 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
36156 XFmode));
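/* 0.29289... is 1 - sqrt(2)/2; fyl2xp1 is only specified for arguments
   smaller than that in magnitude, so larger inputs take the explicit
   1 + x / fyl2x path below.  */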
36157 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
36158
36159 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36160 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
36161 emit_jump (label2);
36162
36163 emit_label (label1);
36164 emit_move_insn (tmp, CONST1_RTX (XFmode));
36165 emit_insn (gen_addxf3 (tmp, op1, tmp));
36166 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
36167 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
36168
36169 emit_label (label2);
36170 }
36171
36172 /* Emit code for round calculation. */
36173 void ix86_emit_i387_round (rtx op0, rtx op1)
36174 {
36175 enum machine_mode inmode = GET_MODE (op1);
36176 enum machine_mode outmode = GET_MODE (op0);
36177 rtx e1, e2, res, tmp, tmp1, half;
36178 rtx scratch = gen_reg_rtx (HImode);
36179 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
36180 rtx jump_label = gen_label_rtx ();
36181 rtx insn;
36182 rtx (*gen_abs) (rtx, rtx);
36183 rtx (*gen_neg) (rtx, rtx);
36184
36185 switch (inmode)
36186 {
36187 case SFmode:
36188 gen_abs = gen_abssf2;
36189 break;
36190 case DFmode:
36191 gen_abs = gen_absdf2;
36192 break;
36193 case XFmode:
36194 gen_abs = gen_absxf2;
36195 break;
36196 default:
36197 gcc_unreachable ();
36198 }
36199
36200 switch (outmode)
36201 {
36202 case SFmode:
36203 gen_neg = gen_negsf2;
36204 break;
36205 case DFmode:
36206 gen_neg = gen_negdf2;
36207 break;
36208 case XFmode:
36209 gen_neg = gen_negxf2;
36210 break;
36211 case HImode:
36212 gen_neg = gen_neghi2;
36213 break;
36214 case SImode:
36215 gen_neg = gen_negsi2;
36216 break;
36217 case DImode:
36218 gen_neg = gen_negdi2;
36219 break;
36220 default:
36221 gcc_unreachable ();
36222 }
36223
36224 e1 = gen_reg_rtx (inmode);
36225 e2 = gen_reg_rtx (inmode);
36226 res = gen_reg_rtx (outmode);
36227
36228 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
36229
36230 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
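/* e.g. round (-2.5) = -1 * floor (2.5 + 0.5) = -3: halfway cases move
   away from zero, unlike a plain frndint in the default rounding mode.  */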
36231
36232 /* scratch = fxam(op1) */
36233 emit_insn (gen_rtx_SET (VOIDmode, scratch,
36234 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
36235 UNSPEC_FXAM)));
36236 /* e1 = fabs(op1) */
36237 emit_insn (gen_abs (e1, op1));
36238
36239 /* e2 = e1 + 0.5 */
36240 half = force_reg (inmode, half);
36241 emit_insn (gen_rtx_SET (VOIDmode, e2,
36242 gen_rtx_PLUS (inmode, e1, half)));
36243
36244 /* res = floor(e2) */
36245 if (inmode != XFmode)
36246 {
36247 tmp1 = gen_reg_rtx (XFmode);
36248
36249 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
36250 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
36251 }
36252 else
36253 tmp1 = e2;
36254
36255 switch (outmode)
36256 {
36257 case SFmode:
36258 case DFmode:
36259 {
36260 rtx tmp0 = gen_reg_rtx (XFmode);
36261
36262 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
36263
36264 emit_insn (gen_rtx_SET (VOIDmode, res,
36265 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
36266 UNSPEC_TRUNC_NOOP)));
36267 }
36268 break;
36269 case XFmode:
36270 emit_insn (gen_frndintxf2_floor (res, tmp1));
36271 break;
36272 case HImode:
36273 emit_insn (gen_lfloorxfhi2 (res, tmp1));
36274 break;
36275 case SImode:
36276 emit_insn (gen_lfloorxfsi2 (res, tmp1));
36277 break;
36278 case DImode:
36279 emit_insn (gen_lfloorxfdi2 (res, tmp1));
36280 break;
36281 default:
36282 gcc_unreachable ();
36283 }
36284
36285 /* flags = signbit(a) */
36286 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
36287
36288 /* if (flags) then res = -res */
36289 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
36290 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
36291 gen_rtx_LABEL_REF (VOIDmode, jump_label),
36292 pc_rtx);
36293 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36294 predict_jump (REG_BR_PROB_BASE * 50 / 100);
36295 JUMP_LABEL (insn) = jump_label;
36296
36297 emit_insn (gen_neg (res, res));
36298
36299 emit_label (jump_label);
36300 LABEL_NUSES (jump_label) = 1;
36301
36302 emit_move_insn (op0, res);
36303 }
36304
36305 /* Output code to perform a Newton-Raphson approximation of a single precision
36306 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
36307
36308 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
36309 {
36310 rtx x0, x1, e0, e1;
36311
36312 x0 = gen_reg_rtx (mode);
36313 e0 = gen_reg_rtx (mode);
36314 e1 = gen_reg_rtx (mode);
36315 x1 = gen_reg_rtx (mode);
36316
36317 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
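/* This is one Newton-Raphson refinement of x0 = rcp(b) towards 1/b:
   x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0) * x0, computed via the
   e0/e1 temporaries below; one step roughly doubles the ~12 bits of
   accuracy of the hardware estimate.  */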
36318
36319 b = force_reg (mode, b);
36320
36321 /* x0 = rcp(b) estimate */
36322 emit_insn (gen_rtx_SET (VOIDmode, x0,
36323 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
36324 UNSPEC_RCP)));
36325 /* e0 = x0 * b */
36326 emit_insn (gen_rtx_SET (VOIDmode, e0,
36327 gen_rtx_MULT (mode, x0, b)));
36328
36329 /* e0 = x0 * e0 */
36330 emit_insn (gen_rtx_SET (VOIDmode, e0,
36331 gen_rtx_MULT (mode, x0, e0)));
36332
36333 /* e1 = x0 + x0 */
36334 emit_insn (gen_rtx_SET (VOIDmode, e1,
36335 gen_rtx_PLUS (mode, x0, x0)));
36336
36337 /* x1 = e1 - e0 */
36338 emit_insn (gen_rtx_SET (VOIDmode, x1,
36339 gen_rtx_MINUS (mode, e1, e0)));
36340
36341 /* res = a * x1 */
36342 emit_insn (gen_rtx_SET (VOIDmode, res,
36343 gen_rtx_MULT (mode, a, x1)));
36344 }
36345
36346 /* Output code to perform a Newton-Raphson approximation of a
36347 single precision floating point [reciprocal] square root. */
36348
36349 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
36350 bool recip)
36351 {
36352 rtx x0, e0, e1, e2, e3, mthree, mhalf;
36353 REAL_VALUE_TYPE r;
36354
36355 x0 = gen_reg_rtx (mode);
36356 e0 = gen_reg_rtx (mode);
36357 e1 = gen_reg_rtx (mode);
36358 e2 = gen_reg_rtx (mode);
36359 e3 = gen_reg_rtx (mode);
36360
36361 real_from_integer (&r, VOIDmode, -3, -1, 0);
36362 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
36363
36364 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
36365 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
36366
36367 if (VECTOR_MODE_P (mode))
36368 {
36369 mthree = ix86_build_const_vector (mode, true, mthree);
36370 mhalf = ix86_build_const_vector (mode, true, mhalf);
36371 }
36372
36373 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
36374 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
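/* Both follow from one Newton-Raphson step for 1/sqrt(a):
   x1 = x0 * (3 - a*x0*x0) / 2 = -0.5 * x0 * (a*x0*x0 - 3);
   multiplying through by a turns the rsqrt refinement into the sqrt form.  */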
36375
36376 a = force_reg (mode, a);
36377
36378 /* x0 = rsqrt(a) estimate */
36379 emit_insn (gen_rtx_SET (VOIDmode, x0,
36380 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
36381 UNSPEC_RSQRT)));
36382
36383 /* If a == 0.0, mask out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
36384 if (!recip)
36385 {
36386 rtx zero, mask;
36387
36388 zero = gen_reg_rtx (mode);
36389 mask = gen_reg_rtx (mode);
36390
36391 zero = force_reg (mode, CONST0_RTX(mode));
36392 emit_insn (gen_rtx_SET (VOIDmode, mask,
36393 gen_rtx_NE (mode, zero, a)));
36394
36395 emit_insn (gen_rtx_SET (VOIDmode, x0,
36396 gen_rtx_AND (mode, x0, mask)));
36397 }
36398
36399 /* e0 = x0 * a */
36400 emit_insn (gen_rtx_SET (VOIDmode, e0,
36401 gen_rtx_MULT (mode, x0, a)));
36402 /* e1 = e0 * x0 */
36403 emit_insn (gen_rtx_SET (VOIDmode, e1,
36404 gen_rtx_MULT (mode, e0, x0)));
36405
36406 /* e2 = e1 - 3. */
36407 mthree = force_reg (mode, mthree);
36408 emit_insn (gen_rtx_SET (VOIDmode, e2,
36409 gen_rtx_PLUS (mode, e1, mthree)));
36410
36411 mhalf = force_reg (mode, mhalf);
36412 if (recip)
36413 /* e3 = -.5 * x0 */
36414 emit_insn (gen_rtx_SET (VOIDmode, e3,
36415 gen_rtx_MULT (mode, x0, mhalf)));
36416 else
36417 /* e3 = -.5 * e0 */
36418 emit_insn (gen_rtx_SET (VOIDmode, e3,
36419 gen_rtx_MULT (mode, e0, mhalf)));
36420 /* ret = e2 * e3 */
36421 emit_insn (gen_rtx_SET (VOIDmode, res,
36422 gen_rtx_MULT (mode, e2, e3)));
36423 }
36424
36425 #ifdef TARGET_SOLARIS
36426 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
36427
36428 static void
36429 i386_solaris_elf_named_section (const char *name, unsigned int flags,
36430 tree decl)
36431 {
36432 /* With Binutils 2.15, the "@unwind" marker must be specified on
36433 every occurrence of the ".eh_frame" section, not just the first
36434 one. */
36435 if (TARGET_64BIT
36436 && strcmp (name, ".eh_frame") == 0)
36437 {
36438 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
36439 flags & SECTION_WRITE ? "aw" : "a");
36440 return;
36441 }
36442
36443 #ifndef USE_GAS
36444 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
36445 {
36446 solaris_elf_asm_comdat_section (name, flags, decl);
36447 return;
36448 }
36449 #endif
36450
36451 default_elf_asm_named_section (name, flags, decl);
36452 }
36453 #endif /* TARGET_SOLARIS */
36454
36455 /* Return the mangling of TYPE if it is an extended fundamental type. */
36456
36457 static const char *
36458 ix86_mangle_type (const_tree type)
36459 {
36460 type = TYPE_MAIN_VARIANT (type);
36461
36462 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
36463 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
36464 return NULL;
36465
36466 switch (TYPE_MODE (type))
36467 {
36468 case TFmode:
36469 /* __float128 is "g". */
36470 return "g";
36471 case XFmode:
36472 /* "long double" or __float80 is "e". */
36473 return "e";
36474 default:
36475 return NULL;
36476 }
36477 }
36478
36479 /* For 32-bit code we can save PIC register setup by using the
36480 __stack_chk_fail_local hidden function instead of calling
36481 __stack_chk_fail directly.  64-bit code doesn't need to set up any PIC
36482 register, so it is better to call __stack_chk_fail directly. */
36483
36484 static tree ATTRIBUTE_UNUSED
36485 ix86_stack_protect_fail (void)
36486 {
36487 return TARGET_64BIT
36488 ? default_external_stack_protect_fail ()
36489 : default_hidden_stack_protect_fail ();
36490 }
36491
36492 /* Select a format to encode pointers in exception handling data. CODE
36493 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
36494 true if the symbol may be affected by dynamic relocations.
36495
36496 ??? All x86 object file formats are capable of representing this.
36497 After all, the relocation needed is the same as for the call insn.
36498 Whether or not a particular assembler allows us to enter such, I
36499 guess we'll have to see. */
36500 int
36501 asm_preferred_eh_data_format (int code, int global)
36502 {
36503 if (flag_pic)
36504 {
36505 int type = DW_EH_PE_sdata8;
36506 if (!TARGET_64BIT
36507 || ix86_cmodel == CM_SMALL_PIC
36508 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
36509 type = DW_EH_PE_sdata4;
36510 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
36511 }
36512 if (ix86_cmodel == CM_SMALL
36513 || (ix86_cmodel == CM_MEDIUM && code))
36514 return DW_EH_PE_udata4;
36515 return DW_EH_PE_absptr;
36516 }
36517 \f
36518 /* Expand copysign from SIGN to the positive value ABS_VALUE
36519 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
36520 the sign-bit. */
36521 static void
36522 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
36523 {
36524 enum machine_mode mode = GET_MODE (sign);
36525 rtx sgn = gen_reg_rtx (mode);
36526 if (mask == NULL_RTX)
36527 {
36528 enum machine_mode vmode;
36529
36530 if (mode == SFmode)
36531 vmode = V4SFmode;
36532 else if (mode == DFmode)
36533 vmode = V2DFmode;
36534 else
36535 vmode = mode;
36536
36537 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
36538 if (!VECTOR_MODE_P (mode))
36539 {
36540 /* We need to generate a scalar mode mask in this case. */
36541 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
36542 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
36543 mask = gen_reg_rtx (mode);
36544 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
36545 }
36546 }
36547 else
36548 mask = gen_rtx_NOT (mode, mask);
36549 emit_insn (gen_rtx_SET (VOIDmode, sgn,
36550 gen_rtx_AND (mode, mask, sign)));
36551 emit_insn (gen_rtx_SET (VOIDmode, result,
36552 gen_rtx_IOR (mode, abs_value, sgn)));
36553 }
36554
36555 /* Expand fabs (OP0) and return a new rtx that holds the result. The
36556 mask for masking out the sign-bit is stored in *SMASK, if that is
36557 non-null. */
36558 static rtx
36559 ix86_expand_sse_fabs (rtx op0, rtx *smask)
36560 {
36561 enum machine_mode vmode, mode = GET_MODE (op0);
36562 rtx xa, mask;
36563
36564 xa = gen_reg_rtx (mode);
36565 if (mode == SFmode)
36566 vmode = V4SFmode;
36567 else if (mode == DFmode)
36568 vmode = V2DFmode;
36569 else
36570 vmode = mode;
36571 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
36572 if (!VECTOR_MODE_P (mode))
36573 {
36574 /* We need to generate a scalar mode mask in this case. */
36575 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
36576 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
36577 mask = gen_reg_rtx (mode);
36578 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
36579 }
36580 emit_insn (gen_rtx_SET (VOIDmode, xa,
36581 gen_rtx_AND (mode, op0, mask)));
36582
36583 if (smask)
36584 *smask = mask;
36585
36586 return xa;
36587 }
36588
36589 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
36590 swapping the operands if SWAP_OPERANDS is true. The expanded
36591 code is a forward jump to a newly created label in case the
36592 comparison is true. The generated label rtx is returned. */
36593 static rtx
36594 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
36595 bool swap_operands)
36596 {
36597 rtx label, tmp;
36598
36599 if (swap_operands)
36600 {
36601 tmp = op0;
36602 op0 = op1;
36603 op1 = tmp;
36604 }
36605
36606 label = gen_label_rtx ();
36607 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
36608 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36609 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
36610 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
36611 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
36612 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
36613 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
36614 JUMP_LABEL (tmp) = label;
36615
36616 return label;
36617 }
36618
36619 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
36620 using comparison code CODE. Operands are swapped for the comparison if
36621 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
36622 static rtx
36623 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
36624 bool swap_operands)
36625 {
36626 rtx (*insn)(rtx, rtx, rtx, rtx);
36627 enum machine_mode mode = GET_MODE (op0);
36628 rtx mask = gen_reg_rtx (mode);
36629
36630 if (swap_operands)
36631 {
36632 rtx tmp = op0;
36633 op0 = op1;
36634 op1 = tmp;
36635 }
36636
36637 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
36638
36639 emit_insn (insn (mask, op0, op1,
36640 gen_rtx_fmt_ee (code, mode, op0, op1)));
36641 return mask;
36642 }
36643
36644 /* Generate and return a rtx of mode MODE for 2**n where n is the number
36645 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
36646 static rtx
36647 ix86_gen_TWO52 (enum machine_mode mode)
36648 {
36649 REAL_VALUE_TYPE TWO52r;
36650 rtx TWO52;
36651
36652 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
36653 TWO52 = const_double_from_real_value (TWO52r, mode);
36654 TWO52 = force_reg (mode, TWO52);
36655
36656 return TWO52;
36657 }
36658
36659 /* Expand SSE sequence for computing lround from OP1 storing
36660 into OP0. */
36661 void
36662 ix86_expand_lround (rtx op0, rtx op1)
36663 {
36664 /* C code for the stuff we're doing below:
36665 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
36666 return (long)tmp;
36667 */
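/* The adjustment constant is the double just below 0.5: for an input
   equal to the largest double smaller than 0.5, adding plain 0.5 would
   round up to 1.0 and truncate to 1, while adding pred(0.5) keeps the
   sum below 1.0 and truncates to the correct 0.  */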
36668 enum machine_mode mode = GET_MODE (op1);
36669 const struct real_format *fmt;
36670 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
36671 rtx adj;
36672
36673 /* load nextafter (0.5, 0.0) */
36674 fmt = REAL_MODE_FORMAT (mode);
36675 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
36676 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
36677
36678 /* adj = copysign (0.5, op1) */
36679 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
36680 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
36681
36682 /* adj = op1 + adj */
36683 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
36684
36685 /* op0 = (imode)adj */
36686 expand_fix (op0, adj, 0);
36687 }
36688
36689 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
36690 into OPERAND0. */
36691 void
36692 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
36693 {
36694 /* C code for the stuff we're doing below (for do_floor):
36695 xi = (long)op1;
36696 xi -= (double)xi > op1 ? 1 : 0;
36697 return xi;
36698 */
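/* For the ceil case (do_floor == false) the comparison operands are
   swapped and PLUS is used instead, i.e. xi += (double)xi < op1 ? 1 : 0.  */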
36699 enum machine_mode fmode = GET_MODE (op1);
36700 enum machine_mode imode = GET_MODE (op0);
36701 rtx ireg, freg, label, tmp;
36702
36703 /* reg = (long)op1 */
36704 ireg = gen_reg_rtx (imode);
36705 expand_fix (ireg, op1, 0);
36706
36707 /* freg = (double)reg */
36708 freg = gen_reg_rtx (fmode);
36709 expand_float (freg, ireg, 0);
36710
36711 /* ireg = (freg > op1) ? ireg - 1 : ireg */
36712 label = ix86_expand_sse_compare_and_jump (UNLE,
36713 freg, op1, !do_floor);
36714 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
36715 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
36716 emit_move_insn (ireg, tmp);
36717
36718 emit_label (label);
36719 LABEL_NUSES (label) = 1;
36720
36721 emit_move_insn (op0, ireg);
36722 }
36723
36724 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
36725 result in OPERAND0. */
36726 void
36727 ix86_expand_rint (rtx operand0, rtx operand1)
36728 {
36729 /* C code for the stuff we're doing below:
36730 xa = fabs (operand1);
36731 if (!isless (xa, 2**52))
36732 return operand1;
36733 xa = xa + 2**52 - 2**52;
36734 return copysign (xa, operand1);
36735 */
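/* The 2**52 add/subtract trick works because any double of magnitude
   at least 2**52 has no fraction bits, so the addition rounds xa to an
   integer in the current rounding mode and the subtraction then
   recovers that integer exactly (2**23 plays the same role for SFmode).  */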
36736 enum machine_mode mode = GET_MODE (operand0);
36737 rtx res, xa, label, TWO52, mask;
36738
36739 res = gen_reg_rtx (mode);
36740 emit_move_insn (res, operand1);
36741
36742 /* xa = abs (operand1) */
36743 xa = ix86_expand_sse_fabs (res, &mask);
36744
36745 /* if (!isless (xa, TWO52)) goto label; */
36746 TWO52 = ix86_gen_TWO52 (mode);
36747 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36748
36749 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36750 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36751
36752 ix86_sse_copysign_to_positive (res, xa, res, mask);
36753
36754 emit_label (label);
36755 LABEL_NUSES (label) = 1;
36756
36757 emit_move_insn (operand0, res);
36758 }
36759
36760 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36761 into OPERAND0. */
36762 void
36763 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
36764 {
36765 /* C code for the stuff we expand below.
36766 double xa = fabs (x), x2;
36767 if (!isless (xa, TWO52))
36768 return x;
36769 xa = xa + TWO52 - TWO52;
36770 x2 = copysign (xa, x);
36771 Compensate. Floor:
36772 if (x2 > x)
36773 x2 -= 1;
36774 Compensate. Ceil:
36775 if (x2 < x)
36776 x2 -= -1;
36777 return x2;
36778 */
36779 enum machine_mode mode = GET_MODE (operand0);
36780 rtx xa, TWO52, tmp, label, one, res, mask;
36781
36782 TWO52 = ix86_gen_TWO52 (mode);
36783
36784 /* Temporary for holding the result, initialized to the input
36785 operand to ease control flow. */
36786 res = gen_reg_rtx (mode);
36787 emit_move_insn (res, operand1);
36788
36789 /* xa = abs (operand1) */
36790 xa = ix86_expand_sse_fabs (res, &mask);
36791
36792 /* if (!isless (xa, TWO52)) goto label; */
36793 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36794
36795 /* xa = xa + TWO52 - TWO52; */
36796 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36797 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
36798
36799 /* xa = copysign (xa, operand1) */
36800 ix86_sse_copysign_to_positive (xa, xa, res, mask);
36801
36802 /* generate 1.0 or -1.0 */
36803 one = force_reg (mode,
36804 const_double_from_real_value (do_floor
36805 ? dconst1 : dconstm1, mode));
36806
36807 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36808 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
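  /* TMP is an all-ones bit mask where the compensation is needed; ANDing
     it with the +-1.0 constant yields the amount to subtract, or 0.0
     otherwise.  */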
36809 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36810 gen_rtx_AND (mode, one, tmp)));
36811 /* We always need to subtract here to preserve signed zero. */
36812 tmp = expand_simple_binop (mode, MINUS,
36813 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36814 emit_move_insn (res, tmp);
36815
36816 emit_label (label);
36817 LABEL_NUSES (label) = 1;
36818
36819 emit_move_insn (operand0, res);
36820 }
36821
36822 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
36823 into OPERAND0. */
36824 void
36825 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
36826 {
36827 /* C code for the stuff we expand below.
36828 double xa = fabs (x), x2;
36829 if (!isless (xa, TWO52))
36830 return x;
36831 x2 = (double)(long)x;
36832 Compensate. Floor:
36833 if (x2 > x)
36834 x2 -= 1;
36835 Compensate. Ceil:
36836 if (x2 < x)
36837 x2 += 1;
36838 if (HONOR_SIGNED_ZEROS (mode))
36839 return copysign (x2, x);
36840 return x2;
36841 */
36842 enum machine_mode mode = GET_MODE (operand0);
36843 rtx xa, xi, TWO52, tmp, label, one, res, mask;
36844
36845 TWO52 = ix86_gen_TWO52 (mode);
36846
36847 /* Temporary for holding the result, initialized to the input
36848 operand to ease control flow. */
36849 res = gen_reg_rtx (mode);
36850 emit_move_insn (res, operand1);
36851
36852 /* xa = abs (operand1) */
36853 xa = ix86_expand_sse_fabs (res, &mask);
36854
36855 /* if (!isless (xa, TWO52)) goto label; */
36856 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36857
36858 /* xa = (double)(long)x */
36859 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36860 expand_fix (xi, res, 0);
36861 expand_float (xa, xi, 0);
36862
36863 /* generate 1.0 */
36864 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
36865
36866 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
36867 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
36868 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36869 gen_rtx_AND (mode, one, tmp)));
36870 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
36871 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36872 emit_move_insn (res, tmp);
36873
36874 if (HONOR_SIGNED_ZEROS (mode))
36875 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36876
36877 emit_label (label);
36878 LABEL_NUSES (label) = 1;
36879
36880 emit_move_insn (operand0, res);
36881 }
36882
36883 /* Expand SSE sequence for computing round from OPERAND1 storing
36884    into OPERAND0.  The sequence works without relying on DImode truncation
36885    via cvttsd2siq, which is only available on 64-bit targets.  */
36886 void
36887 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
36888 {
36889 /* C code for the stuff we expand below.
36890 double xa = fabs (x), xa2, x2;
36891 if (!isless (xa, TWO52))
36892 return x;
36893 Using the absolute value and copying back sign makes
36894 -0.0 -> -0.0 correct.
36895 xa2 = xa + TWO52 - TWO52;
36896 Compensate.
36897 dxa = xa2 - xa;
36898 if (dxa <= -0.5)
36899 xa2 += 1;
36900 else if (dxa > 0.5)
36901 xa2 -= 1;
36902 x2 = copysign (xa2, x);
36903 return x2;
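        An illustrative example: for x = 2.5, xa2 becomes 2.0 under
        round-to-nearest-even, dxa is -0.5, so the first compensation bumps
        xa2 to 3.0 and round (2.5) yields 3.0 as required.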
36904 */
36905 enum machine_mode mode = GET_MODE (operand0);
36906 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
36907
36908 TWO52 = ix86_gen_TWO52 (mode);
36909
36910 /* Temporary for holding the result, initialized to the input
36911 operand to ease control flow. */
36912 res = gen_reg_rtx (mode);
36913 emit_move_insn (res, operand1);
36914
36915 /* xa = abs (operand1) */
36916 xa = ix86_expand_sse_fabs (res, &mask);
36917
36918 /* if (!isless (xa, TWO52)) goto label; */
36919 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36920
36921 /* xa2 = xa + TWO52 - TWO52; */
36922 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
36923 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
36924
36925 /* dxa = xa2 - xa; */
36926 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
36927
36928 /* generate 0.5, 1.0 and -0.5 */
36929 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
36930 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
36931 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
36932 0, OPTAB_DIRECT);
36933
36934 /* Compensate. */
36935 tmp = gen_reg_rtx (mode);
36936 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
36937 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
36938 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36939 gen_rtx_AND (mode, one, tmp)));
36940 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36941 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
36942 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
36943 emit_insn (gen_rtx_SET (VOIDmode, tmp,
36944 gen_rtx_AND (mode, one, tmp)));
36945 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
36946
36947 /* res = copysign (xa2, operand1) */
36948 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
36949
36950 emit_label (label);
36951 LABEL_NUSES (label) = 1;
36952
36953 emit_move_insn (operand0, res);
36954 }
36955
36956 /* Expand SSE sequence for computing trunc from OPERAND1 storing
36957 into OPERAND0. */
36958 void
36959 ix86_expand_trunc (rtx operand0, rtx operand1)
36960 {
36961 /* C code for SSE variant we expand below.
36962 double xa = fabs (x), x2;
36963 if (!isless (xa, TWO52))
36964 return x;
36965 x2 = (double)(long)x;
36966 if (HONOR_SIGNED_ZEROS (mode))
36967 return copysign (x2, x);
36968 return x2;
36969 */
36970 enum machine_mode mode = GET_MODE (operand0);
36971 rtx xa, xi, TWO52, label, res, mask;
36972
36973 TWO52 = ix86_gen_TWO52 (mode);
36974
36975 /* Temporary for holding the result, initialized to the input
36976 operand to ease control flow. */
36977 res = gen_reg_rtx (mode);
36978 emit_move_insn (res, operand1);
36979
36980 /* xa = abs (operand1) */
36981 xa = ix86_expand_sse_fabs (res, &mask);
36982
36983 /* if (!isless (xa, TWO52)) goto label; */
36984 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
36985
36986 /* x = (double)(long)x */
36987 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
36988 expand_fix (xi, res, 0);
36989 expand_float (res, xi, 0);
36990
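  /* The integer round trip turns -0.0 into +0.0, so restore the original
     sign when signed zeros must be honored.  */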
36991 if (HONOR_SIGNED_ZEROS (mode))
36992 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
36993
36994 emit_label (label);
36995 LABEL_NUSES (label) = 1;
36996
36997 emit_move_insn (operand0, res);
36998 }
36999
37000 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37001 into OPERAND0. */
37002 void
37003 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37004 {
37005 enum machine_mode mode = GET_MODE (operand0);
37006 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37007
37008 /* C code for SSE variant we expand below.
37009         double xa = fabs (x), xa2, x2;
37010 if (!isless (xa, TWO52))
37011 return x;
37012 xa2 = xa + TWO52 - TWO52;
37013 Compensate:
37014 if (xa2 > xa)
37015 xa2 -= 1.0;
37016 x2 = copysign (xa2, x);
37017 return x2;
37018 */
37019
37020 TWO52 = ix86_gen_TWO52 (mode);
37021
37022 /* Temporary for holding the result, initialized to the input
37023 operand to ease control flow. */
37024 res = gen_reg_rtx (mode);
37025 emit_move_insn (res, operand1);
37026
37027 /* xa = abs (operand1) */
37028 xa = ix86_expand_sse_fabs (res, &smask);
37029
37030 /* if (!isless (xa, TWO52)) goto label; */
37031 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37032
37033 /* res = xa + TWO52 - TWO52; */
37034 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37035 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
37036 emit_move_insn (res, tmp);
37037
37038 /* generate 1.0 */
37039 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37040
37041 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
37042 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
37043 emit_insn (gen_rtx_SET (VOIDmode, mask,
37044 gen_rtx_AND (mode, mask, one)));
37045 tmp = expand_simple_binop (mode, MINUS,
37046 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
37047 emit_move_insn (res, tmp);
37048
37049 /* res = copysign (res, operand1) */
37050 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
37051
37052 emit_label (label);
37053 LABEL_NUSES (label) = 1;
37054
37055 emit_move_insn (operand0, res);
37056 }
37057
37058 /* Expand SSE sequence for computing round from OPERAND1 storing
37059 into OPERAND0. */
37060 void
37061 ix86_expand_round (rtx operand0, rtx operand1)
37062 {
37063 /* C code for the stuff we're doing below:
37064 double xa = fabs (x);
37065 if (!isless (xa, TWO52))
37066 return x;
37067 xa = (double)(long)(xa + nextafter (0.5, 0.0));
37068 return copysign (xa, x);
37069 */
37070 enum machine_mode mode = GET_MODE (operand0);
37071 rtx res, TWO52, xa, label, xi, half, mask;
37072 const struct real_format *fmt;
37073 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37074
37075 /* Temporary for holding the result, initialized to the input
37076 operand to ease control flow. */
37077 res = gen_reg_rtx (mode);
37078 emit_move_insn (res, operand1);
37079
37080 TWO52 = ix86_gen_TWO52 (mode);
37081 xa = ix86_expand_sse_fabs (res, &mask);
37082 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37083
37084 /* load nextafter (0.5, 0.0) */
37085 fmt = REAL_MODE_FORMAT (mode);
37086 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37087 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
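  /* Using the largest representable value below 0.5, rather than 0.5
     itself, keeps inputs just under 0.5 from being rounded up to 1.0 by
     the addition below.  */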
37088
37089 /* xa = xa + 0.5 */
37090 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
37091 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
37092
37093 /* xa = (double)(int64_t)xa */
37094 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37095 expand_fix (xi, xa, 0);
37096 expand_float (xa, xi, 0);
37097
37098 /* res = copysign (xa, operand1) */
37099 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
37100
37101 emit_label (label);
37102 LABEL_NUSES (label) = 1;
37103
37104 emit_move_insn (operand0, res);
37105 }
37106
37107 /* Expand SSE sequence for computing round
37108 from OP1 storing into OP0 using sse4 round insn. */
37109 void
37110 ix86_expand_round_sse4 (rtx op0, rtx op1)
37111 {
37112 enum machine_mode mode = GET_MODE (op0);
37113 rtx e1, e2, res, half;
37114 const struct real_format *fmt;
37115 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37116 rtx (*gen_copysign) (rtx, rtx, rtx);
37117 rtx (*gen_round) (rtx, rtx, rtx);
37118
37119 switch (mode)
37120 {
37121 case SFmode:
37122 gen_copysign = gen_copysignsf3;
37123 gen_round = gen_sse4_1_roundsf2;
37124 break;
37125 case DFmode:
37126 gen_copysign = gen_copysigndf3;
37127 gen_round = gen_sse4_1_rounddf2;
37128 break;
37129 default:
37130 gcc_unreachable ();
37131 }
37132
37133 /* round (a) = trunc (a + copysign (0.5, a)) */
37134
37135 /* load nextafter (0.5, 0.0) */
37136 fmt = REAL_MODE_FORMAT (mode);
37137 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37138 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37139 half = const_double_from_real_value (pred_half, mode);
37140
37141 /* e1 = copysign (0.5, op1) */
37142 e1 = gen_reg_rtx (mode);
37143 emit_insn (gen_copysign (e1, half, op1));
37144
37145 /* e2 = op1 + e1 */
37146 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
37147
37148 /* res = trunc (e2) */
37149 res = gen_reg_rtx (mode);
37150 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
37151
37152 emit_move_insn (op0, res);
37153 }
37154 \f
37155
37156 /* Table of valid machine attributes. */
37157 static const struct attribute_spec ix86_attribute_table[] =
37158 {
37159 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
37160 affects_type_identity } */
37161 /* Stdcall attribute says callee is responsible for popping arguments
37162 if they are not variable. */
37163 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37164 true },
37165 /* Fastcall attribute says callee is responsible for popping arguments
37166 if they are not variable. */
37167 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37168 true },
37169 /* Thiscall attribute says callee is responsible for popping arguments
37170 if they are not variable. */
37171 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37172 true },
37173 /* Cdecl attribute says the callee is a normal C declaration */
37174 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37175 true },
37176 /* Regparm attribute specifies how many integer arguments are to be
37177 passed in registers. */
37178 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
37179 true },
37180 /* Sseregparm attribute says we are using x86_64 calling conventions
37181 for FP arguments. */
37182 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
37183 true },
37184 /* The transactional memory builtins are implicitly regparm or fastcall
37185 depending on the ABI. Override the generic do-nothing attribute that
37186 these builtins were declared with. */
37187 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
37188 true },
37189 /* force_align_arg_pointer says this function realigns the stack at entry. */
37190 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
37191 false, true, true, ix86_handle_cconv_attribute, false },
37192 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
37193 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
37194 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
37195 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
37196 false },
37197 #endif
37198 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37199 false },
37200 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
37201 false },
37202 #ifdef SUBTARGET_ATTRIBUTE_TABLE
37203 SUBTARGET_ATTRIBUTE_TABLE,
37204 #endif
37205 /* ms_abi and sysv_abi calling convention function attributes. */
37206 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37207 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
37208 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
37209 false },
37210 { "callee_pop_aggregate_return", 1, 1, false, true, true,
37211 ix86_handle_callee_pop_aggregate_return, true },
37212 /* End element. */
37213 { NULL, 0, 0, false, false, false, NULL, false }
37214 };
37215
37216 /* Implement targetm.vectorize.builtin_vectorization_cost. */
37217 static int
37218 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
37219 tree vectype,
37220 int misalign ATTRIBUTE_UNUSED)
37221 {
37222 unsigned elements;
37223
37224 switch (type_of_cost)
37225 {
37226 case scalar_stmt:
37227 return ix86_cost->scalar_stmt_cost;
37228
37229 case scalar_load:
37230 return ix86_cost->scalar_load_cost;
37231
37232 case scalar_store:
37233 return ix86_cost->scalar_store_cost;
37234
37235 case vector_stmt:
37236 return ix86_cost->vec_stmt_cost;
37237
37238 case vector_load:
37239 return ix86_cost->vec_align_load_cost;
37240
37241 case vector_store:
37242 return ix86_cost->vec_store_cost;
37243
37244 case vec_to_scalar:
37245 return ix86_cost->vec_to_scalar_cost;
37246
37247 case scalar_to_vec:
37248 return ix86_cost->scalar_to_vec_cost;
37249
37250 case unaligned_load:
37251 case unaligned_store:
37252 return ix86_cost->vec_unalign_load_cost;
37253
37254 case cond_branch_taken:
37255 return ix86_cost->cond_taken_branch_cost;
37256
37257 case cond_branch_not_taken:
37258 return ix86_cost->cond_not_taken_branch_cost;
37259
37260 case vec_perm:
37261 case vec_promote_demote:
37262 return ix86_cost->vec_stmt_cost;
37263
37264 case vec_construct:
37265 elements = TYPE_VECTOR_SUBPARTS (vectype);
37266 return elements / 2 + 1;
37267
37268 default:
37269 gcc_unreachable ();
37270 }
37271 }
37272
37273 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
37274 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
37275 insn every time. */
37276
37277 static GTY(()) rtx vselect_insn;
37278
37279 /* Initialize vselect_insn. */
37280
37281 static void
37282 init_vselect_insn (void)
37283 {
37284 unsigned i;
37285 rtx x;
37286
37287 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
37288 for (i = 0; i < MAX_VECT_LEN; ++i)
37289 XVECEXP (x, 0, i) = const0_rtx;
37290 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
37291 const0_rtx), x);
37292 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
37293 start_sequence ();
37294 vselect_insn = emit_insn (x);
37295 end_sequence ();
37296 }
37297
37298 /* Construct (set target (vec_select op0 (parallel perm))) and
37299 return true if that's a valid instruction in the active ISA. */
37300
37301 static bool
37302 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
37303 unsigned nelt, bool testing_p)
37304 {
37305 unsigned int i;
37306 rtx x, save_vconcat;
37307 int icode;
37308
37309 if (vselect_insn == NULL_RTX)
37310 init_vselect_insn ();
37311
37312 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
37313 PUT_NUM_ELEM (XVEC (x, 0), nelt);
37314 for (i = 0; i < nelt; ++i)
37315 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
37316 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37317 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
37318 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
37319 SET_DEST (PATTERN (vselect_insn)) = target;
37320 icode = recog_memoized (vselect_insn);
37321
37322 if (icode >= 0 && !testing_p)
37323 emit_insn (copy_rtx (PATTERN (vselect_insn)));
37324
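  /* Restore the cached insn to its neutral state so that it can be reused
     by the next caller.  */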
37325 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
37326 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
37327 INSN_CODE (vselect_insn) = -1;
37328
37329 return icode >= 0;
37330 }
37331
37332 /* Similar, but generate a vec_concat from op0 and op1 as well. */
37333
37334 static bool
37335 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
37336 const unsigned char *perm, unsigned nelt,
37337 bool testing_p)
37338 {
37339 enum machine_mode v2mode;
37340 rtx x;
37341 bool ok;
37342
37343 if (vselect_insn == NULL_RTX)
37344 init_vselect_insn ();
37345
37346 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
37347 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
37348 PUT_MODE (x, v2mode);
37349 XEXP (x, 0) = op0;
37350 XEXP (x, 1) = op1;
37351 ok = expand_vselect (target, x, perm, nelt, testing_p);
37352 XEXP (x, 0) = const0_rtx;
37353 XEXP (x, 1) = const0_rtx;
37354 return ok;
37355 }
37356
37357 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37358 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
37359
37360 static bool
37361 expand_vec_perm_blend (struct expand_vec_perm_d *d)
37362 {
37363 enum machine_mode vmode = d->vmode;
37364 unsigned i, mask, nelt = d->nelt;
37365 rtx target, op0, op1, x;
37366 rtx rperm[32], vperm;
37367
37368 if (d->one_operand_p)
37369 return false;
37370 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
37371 ;
37372 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
37373 ;
37374 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
37375 ;
37376 else
37377 return false;
37378
37379 /* This is a blend, not a permute. Elements must stay in their
37380 respective lanes. */
37381 for (i = 0; i < nelt; ++i)
37382 {
37383 unsigned e = d->perm[i];
37384 if (!(e == i || e == i + nelt))
37385 return false;
37386 }
37387
37388 if (d->testing_p)
37389 return true;
37390
37391 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
37392 decision should be extracted elsewhere, so that we only try that
37393 sequence once all budget==3 options have been tried. */
37394 target = d->target;
37395 op0 = d->op0;
37396 op1 = d->op1;
37397 mask = 0;
37398
37399 switch (vmode)
37400 {
37401 case V4DFmode:
37402 case V8SFmode:
37403 case V2DFmode:
37404 case V4SFmode:
37405 case V8HImode:
37406 case V8SImode:
37407 for (i = 0; i < nelt; ++i)
37408 mask |= (d->perm[i] >= nelt) << i;
37409 break;
37410
37411 case V2DImode:
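      /* Each DImode element covers four HImode words, so set four pblendw
         mask bits per element.  */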
37412 for (i = 0; i < 2; ++i)
37413 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
37414 vmode = V8HImode;
37415 goto do_subreg;
37416
37417 case V4SImode:
37418 for (i = 0; i < 4; ++i)
37419 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
37420 vmode = V8HImode;
37421 goto do_subreg;
37422
37423 case V16QImode:
37424 /* See if bytes move in pairs so we can use pblendw with
37425 an immediate argument, rather than pblendvb with a vector
37426 argument. */
37427 for (i = 0; i < 16; i += 2)
37428 if (d->perm[i] + 1 != d->perm[i + 1])
37429 {
37430 use_pblendvb:
37431 for (i = 0; i < nelt; ++i)
37432 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
37433
37434 finish_pblendvb:
37435 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
37436 vperm = force_reg (vmode, vperm);
37437
37438 if (GET_MODE_SIZE (vmode) == 16)
37439 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
37440 else
37441 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
37442 return true;
37443 }
37444
37445 for (i = 0; i < 8; ++i)
37446 mask |= (d->perm[i * 2] >= 16) << i;
37447 vmode = V8HImode;
37448 /* FALLTHRU */
37449
37450 do_subreg:
37451 target = gen_lowpart (vmode, target);
37452 op0 = gen_lowpart (vmode, op0);
37453 op1 = gen_lowpart (vmode, op1);
37454 break;
37455
37456 case V32QImode:
37457 /* See if bytes move in pairs. If not, vpblendvb must be used. */
37458 for (i = 0; i < 32; i += 2)
37459 if (d->perm[i] + 1 != d->perm[i + 1])
37460 goto use_pblendvb;
37461 /* See if bytes move in quadruplets. If yes, vpblendd
37462 with immediate can be used. */
37463 for (i = 0; i < 32; i += 4)
37464 if (d->perm[i] + 2 != d->perm[i + 2])
37465 break;
37466 if (i < 32)
37467 {
37468 /* See if bytes move the same in both lanes. If yes,
37469 vpblendw with immediate can be used. */
37470 for (i = 0; i < 16; i += 2)
37471 if (d->perm[i] + 16 != d->perm[i + 16])
37472 goto use_pblendvb;
37473
37474 /* Use vpblendw. */
37475 for (i = 0; i < 16; ++i)
37476 mask |= (d->perm[i * 2] >= 32) << i;
37477 vmode = V16HImode;
37478 goto do_subreg;
37479 }
37480
37481 /* Use vpblendd. */
37482 for (i = 0; i < 8; ++i)
37483 mask |= (d->perm[i * 4] >= 32) << i;
37484 vmode = V8SImode;
37485 goto do_subreg;
37486
37487 case V16HImode:
37488 /* See if words move in pairs. If yes, vpblendd can be used. */
37489 for (i = 0; i < 16; i += 2)
37490 if (d->perm[i] + 1 != d->perm[i + 1])
37491 break;
37492 if (i < 16)
37493 {
37494 /* See if words move the same in both lanes. If not,
37495 vpblendvb must be used. */
37496 for (i = 0; i < 8; i++)
37497 if (d->perm[i] + 8 != d->perm[i + 8])
37498 {
37499 /* Use vpblendvb. */
37500 for (i = 0; i < 32; ++i)
37501 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
37502
37503 vmode = V32QImode;
37504 nelt = 32;
37505 target = gen_lowpart (vmode, target);
37506 op0 = gen_lowpart (vmode, op0);
37507 op1 = gen_lowpart (vmode, op1);
37508 goto finish_pblendvb;
37509 }
37510
37511 /* Use vpblendw. */
37512 for (i = 0; i < 16; ++i)
37513 mask |= (d->perm[i] >= 16) << i;
37514 break;
37515 }
37516
37517 /* Use vpblendd. */
37518 for (i = 0; i < 8; ++i)
37519 mask |= (d->perm[i * 2] >= 16) << i;
37520 vmode = V8SImode;
37521 goto do_subreg;
37522
37523 case V4DImode:
37524 /* Use vpblendd. */
37525 for (i = 0; i < 4; ++i)
37526 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
37527 vmode = V8SImode;
37528 goto do_subreg;
37529
37530 default:
37531 gcc_unreachable ();
37532 }
37533
37534 /* This matches five different patterns with the different modes. */
37535 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
37536 x = gen_rtx_SET (VOIDmode, target, x);
37537 emit_insn (x);
37538
37539 return true;
37540 }
37541
37542 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37543 in terms of the variable form of vpermilps.
37544
37545 Note that we will have already failed the immediate input vpermilps,
37546 which requires that the high and low part shuffle be identical; the
37547 variable form doesn't require that. */
37548
37549 static bool
37550 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
37551 {
37552 rtx rperm[8], vperm;
37553 unsigned i;
37554
37555 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
37556 return false;
37557
37558 /* We can only permute within the 128-bit lane. */
37559 for (i = 0; i < 8; ++i)
37560 {
37561 unsigned e = d->perm[i];
37562 if (i < 4 ? e >= 4 : e < 4)
37563 return false;
37564 }
37565
37566 if (d->testing_p)
37567 return true;
37568
37569 for (i = 0; i < 8; ++i)
37570 {
37571 unsigned e = d->perm[i];
37572
37573 /* Within each 128-bit lane, the elements of op0 are numbered
37574 from 0 and the elements of op1 are numbered from 4. */
37575 if (e >= 8 + 4)
37576 e -= 8;
37577 else if (e >= 4)
37578 e -= 4;
37579
37580 rperm[i] = GEN_INT (e);
37581 }
37582
37583 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
37584 vperm = force_reg (V8SImode, vperm);
37585 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
37586
37587 return true;
37588 }
37589
37590 /* Return true if permutation D can be performed as VMODE permutation
37591 instead. */
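/* For instance, a V16QImode permutation whose bytes move in aligned groups
   of four consecutive bytes can instead be performed as a V4SImode
   permutation.  */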
37592
37593 static bool
37594 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
37595 {
37596 unsigned int i, j, chunk;
37597
37598 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
37599 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
37600 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
37601 return false;
37602
37603 if (GET_MODE_NUNITS (vmode) >= d->nelt)
37604 return true;
37605
37606 chunk = d->nelt / GET_MODE_NUNITS (vmode);
37607 for (i = 0; i < d->nelt; i += chunk)
37608 if (d->perm[i] & (chunk - 1))
37609 return false;
37610 else
37611 for (j = 1; j < chunk; ++j)
37612 if (d->perm[i] + j != d->perm[i + j])
37613 return false;
37614
37615 return true;
37616 }
37617
37618 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37619 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
37620
37621 static bool
37622 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
37623 {
37624 unsigned i, nelt, eltsz, mask;
37625 unsigned char perm[32];
37626 enum machine_mode vmode = V16QImode;
37627 rtx rperm[32], vperm, target, op0, op1;
37628
37629 nelt = d->nelt;
37630
37631 if (!d->one_operand_p)
37632 {
37633 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
37634 {
37635 if (TARGET_AVX2
37636 && valid_perm_using_mode_p (V2TImode, d))
37637 {
37638 if (d->testing_p)
37639 return true;
37640
37641 /* Use vperm2i128 insn. The pattern uses
37642 V4DImode instead of V2TImode. */
37643 target = gen_lowpart (V4DImode, d->target);
37644 op0 = gen_lowpart (V4DImode, d->op0);
37645 op1 = gen_lowpart (V4DImode, d->op1);
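              /* Bits 0-1 of the vperm2i128 immediate select the source
                 128-bit lane for the low half of the result; bits 4-5
                 select it for the high half.  */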
37646               rperm[0]
37647                 = GEN_INT ((d->perm[0] / (nelt / 2))
37648                            | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
37649 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
37650 return true;
37651 }
37652 return false;
37653 }
37654 }
37655 else
37656 {
37657 if (GET_MODE_SIZE (d->vmode) == 16)
37658 {
37659 if (!TARGET_SSSE3)
37660 return false;
37661 }
37662 else if (GET_MODE_SIZE (d->vmode) == 32)
37663 {
37664 if (!TARGET_AVX2)
37665 return false;
37666
37667 /* V4DImode should be already handled through
37668 expand_vselect by vpermq instruction. */
37669 gcc_assert (d->vmode != V4DImode);
37670
37671 vmode = V32QImode;
37672 if (d->vmode == V8SImode
37673 || d->vmode == V16HImode
37674 || d->vmode == V32QImode)
37675 {
37676 /* First see if vpermq can be used for
37677 V8SImode/V16HImode/V32QImode. */
37678 if (valid_perm_using_mode_p (V4DImode, d))
37679 {
37680 for (i = 0; i < 4; i++)
37681 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
37682 if (d->testing_p)
37683 return true;
37684 return expand_vselect (gen_lowpart (V4DImode, d->target),
37685 gen_lowpart (V4DImode, d->op0),
37686 perm, 4, false);
37687 }
37688
37689 /* Next see if vpermd can be used. */
37690 if (valid_perm_using_mode_p (V8SImode, d))
37691 vmode = V8SImode;
37692 }
37693 /* Or if vpermps can be used. */
37694 else if (d->vmode == V8SFmode)
37695 vmode = V8SImode;
37696
37697 if (vmode == V32QImode)
37698 {
37699               /* vpshufb only works intra lanes; it is not
37700                  possible to shuffle bytes between the lanes.  */
37701 for (i = 0; i < nelt; ++i)
37702 if ((d->perm[i] ^ i) & (nelt / 2))
37703 return false;
37704 }
37705 }
37706 else
37707 return false;
37708 }
37709
37710 if (d->testing_p)
37711 return true;
37712
37713 if (vmode == V8SImode)
37714 for (i = 0; i < 8; ++i)
37715 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
37716 else
37717 {
37718 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37719 if (!d->one_operand_p)
37720 mask = 2 * nelt - 1;
37721 else if (vmode == V16QImode)
37722 mask = nelt - 1;
37723 else
37724 mask = nelt / 2 - 1;
37725
37726 for (i = 0; i < nelt; ++i)
37727 {
37728 unsigned j, e = d->perm[i] & mask;
37729 for (j = 0; j < eltsz; ++j)
37730 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
37731 }
37732 }
37733
37734 vperm = gen_rtx_CONST_VECTOR (vmode,
37735 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
37736 vperm = force_reg (vmode, vperm);
37737
37738 target = gen_lowpart (vmode, d->target);
37739 op0 = gen_lowpart (vmode, d->op0);
37740 if (d->one_operand_p)
37741 {
37742 if (vmode == V16QImode)
37743 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
37744 else if (vmode == V32QImode)
37745 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
37746 else if (vmode == V8SFmode)
37747 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
37748 else
37749 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
37750 }
37751 else
37752 {
37753 op1 = gen_lowpart (vmode, d->op1);
37754 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
37755 }
37756
37757 return true;
37758 }
37759
37760 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
37761 in a single instruction. */
37762
37763 static bool
37764 expand_vec_perm_1 (struct expand_vec_perm_d *d)
37765 {
37766 unsigned i, nelt = d->nelt;
37767 unsigned char perm2[MAX_VECT_LEN];
37768
37769 /* Check plain VEC_SELECT first, because AVX has instructions that could
37770 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
37771 input where SEL+CONCAT may not. */
37772 if (d->one_operand_p)
37773 {
37774 int mask = nelt - 1;
37775 bool identity_perm = true;
37776 bool broadcast_perm = true;
37777
37778 for (i = 0; i < nelt; i++)
37779 {
37780 perm2[i] = d->perm[i] & mask;
37781 if (perm2[i] != i)
37782 identity_perm = false;
37783 if (perm2[i])
37784 broadcast_perm = false;
37785 }
37786
37787 if (identity_perm)
37788 {
37789 if (!d->testing_p)
37790 emit_move_insn (d->target, d->op0);
37791 return true;
37792 }
37793 else if (broadcast_perm && TARGET_AVX2)
37794 {
37795 /* Use vpbroadcast{b,w,d}. */
37796 rtx (*gen) (rtx, rtx) = NULL;
37797 switch (d->vmode)
37798 {
37799 case V32QImode:
37800 gen = gen_avx2_pbroadcastv32qi_1;
37801 break;
37802 case V16HImode:
37803 gen = gen_avx2_pbroadcastv16hi_1;
37804 break;
37805 case V8SImode:
37806 gen = gen_avx2_pbroadcastv8si_1;
37807 break;
37808 case V16QImode:
37809 gen = gen_avx2_pbroadcastv16qi;
37810 break;
37811 case V8HImode:
37812 gen = gen_avx2_pbroadcastv8hi;
37813 break;
37814 case V8SFmode:
37815 gen = gen_avx2_vec_dupv8sf_1;
37816 break;
37817 /* For other modes prefer other shuffles this function creates. */
37818 default: break;
37819 }
37820 if (gen != NULL)
37821 {
37822 if (!d->testing_p)
37823 emit_insn (gen (d->target, d->op0));
37824 return true;
37825 }
37826 }
37827
37828 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
37829 return true;
37830
37831 /* There are plenty of patterns in sse.md that are written for
37832 SEL+CONCAT and are not replicated for a single op. Perhaps
37833 that should be changed, to avoid the nastiness here. */
37834
37835 /* Recognize interleave style patterns, which means incrementing
37836 every other permutation operand. */
37837 for (i = 0; i < nelt; i += 2)
37838 {
37839 perm2[i] = d->perm[i] & mask;
37840 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
37841 }
37842 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37843 d->testing_p))
37844 return true;
37845
37846 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
37847 if (nelt >= 4)
37848 {
37849 for (i = 0; i < nelt; i += 4)
37850 {
37851 perm2[i + 0] = d->perm[i + 0] & mask;
37852 perm2[i + 1] = d->perm[i + 1] & mask;
37853 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
37854 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
37855 }
37856
37857 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
37858 d->testing_p))
37859 return true;
37860 }
37861 }
37862
37863 /* Finally, try the fully general two operand permute. */
37864 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
37865 d->testing_p))
37866 return true;
37867
37868 /* Recognize interleave style patterns with reversed operands. */
37869 if (!d->one_operand_p)
37870 {
37871 for (i = 0; i < nelt; ++i)
37872 {
37873 unsigned e = d->perm[i];
37874 if (e >= nelt)
37875 e -= nelt;
37876 else
37877 e += nelt;
37878 perm2[i] = e;
37879 }
37880
37881 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
37882 d->testing_p))
37883 return true;
37884 }
37885
37886 /* Try the SSE4.1 blend variable merge instructions. */
37887 if (expand_vec_perm_blend (d))
37888 return true;
37889
37890 /* Try one of the AVX vpermil variable permutations. */
37891 if (expand_vec_perm_vpermil (d))
37892 return true;
37893
37894 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
37895 vpshufb, vpermd, vpermps or vpermq variable permutation. */
37896 if (expand_vec_perm_pshufb (d))
37897 return true;
37898
37899 return false;
37900 }
37901
37902 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
37903 in terms of a pair of pshuflw + pshufhw instructions. */
37904
37905 static bool
37906 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
37907 {
37908 unsigned char perm2[MAX_VECT_LEN];
37909 unsigned i;
37910 bool ok;
37911
37912 if (d->vmode != V8HImode || !d->one_operand_p)
37913 return false;
37914
37915 /* The two permutations only operate in 64-bit lanes. */
37916 for (i = 0; i < 4; ++i)
37917 if (d->perm[i] >= 4)
37918 return false;
37919 for (i = 4; i < 8; ++i)
37920 if (d->perm[i] < 4)
37921 return false;
37922
37923 if (d->testing_p)
37924 return true;
37925
37926 /* Emit the pshuflw. */
37927 memcpy (perm2, d->perm, 4);
37928 for (i = 4; i < 8; ++i)
37929 perm2[i] = i;
37930 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
37931 gcc_assert (ok);
37932
37933 /* Emit the pshufhw. */
37934 memcpy (perm2 + 4, d->perm + 4, 4);
37935 for (i = 0; i < 4; ++i)
37936 perm2[i] = i;
37937 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
37938 gcc_assert (ok);
37939
37940 return true;
37941 }
37942
37943 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37944 the permutation using the SSSE3 palignr instruction. This succeeds
37945 when all of the elements in PERM fit within one vector and we merely
37946 need to shift them down so that a single vector permutation has a
37947 chance to succeed. */
37948
37949 static bool
37950 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
37951 {
37952 unsigned i, nelt = d->nelt;
37953 unsigned min, max;
37954 bool in_order, ok;
37955 rtx shift;
37956
37957 /* Even with AVX, palignr only operates on 128-bit vectors. */
37958 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37959 return false;
37960
37961 min = nelt, max = 0;
37962 for (i = 0; i < nelt; ++i)
37963 {
37964 unsigned e = d->perm[i];
37965 if (e < min)
37966 min = e;
37967 if (e > max)
37968 max = e;
37969 }
37970 if (min == 0 || max - min >= nelt)
37971 return false;
37972
37973 /* Given that we have SSSE3, we know we'll be able to implement the
37974 single operand permutation after the palignr with pshufb. */
37975 if (d->testing_p)
37976 return true;
37977
37978 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
37979 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
37980 gen_lowpart (TImode, d->op1),
37981 gen_lowpart (TImode, d->op0), shift));
37982
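  /* The shifted concatenation now lives in d->target; rewrite the
     permutation as a single-operand one with indices rebased by MIN.  */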
37983 d->op0 = d->op1 = d->target;
37984 d->one_operand_p = true;
37985
37986 in_order = true;
37987 for (i = 0; i < nelt; ++i)
37988 {
37989 unsigned e = d->perm[i] - min;
37990 if (e != i)
37991 in_order = false;
37992 d->perm[i] = e;
37993 }
37994
37995 /* Test for the degenerate case where the alignment by itself
37996 produces the desired permutation. */
37997 if (in_order)
37998 return true;
37999
38000 ok = expand_vec_perm_1 (d);
38001 gcc_assert (ok);
38002
38003 return ok;
38004 }
38005
38006 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38007
38008 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38009 a two vector permutation into a single vector permutation by using
38010 an interleave operation to merge the vectors. */
38011
38012 static bool
38013 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38014 {
38015 struct expand_vec_perm_d dremap, dfinal;
38016 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38017 unsigned HOST_WIDE_INT contents;
38018 unsigned char remap[2 * MAX_VECT_LEN];
38019 rtx seq;
38020 bool ok, same_halves = false;
38021
38022 if (GET_MODE_SIZE (d->vmode) == 16)
38023 {
38024 if (d->one_operand_p)
38025 return false;
38026 }
38027 else if (GET_MODE_SIZE (d->vmode) == 32)
38028 {
38029 if (!TARGET_AVX)
38030 return false;
38031 /* For 32-byte modes allow even d->one_operand_p.
38032 The lack of cross-lane shuffling in some instructions
38033 might prevent a single insn shuffle. */
38034 dfinal = *d;
38035 dfinal.testing_p = true;
38036       /* If expand_vec_perm_interleave3 can expand this into
38037          a 3 insn sequence, give up and let it be expanded as
38038          a 3 insn sequence instead.  While that is one insn longer,
38039          it doesn't need a memory operand, and in the common
38040          case where both the interleave low and interleave high
38041          permutations with the same operands are adjacent, only
38042          4 insns are needed for both after CSE.  */
38043 if (expand_vec_perm_interleave3 (&dfinal))
38044 return false;
38045 }
38046 else
38047 return false;
38048
38049 /* Examine from whence the elements come. */
38050 contents = 0;
38051 for (i = 0; i < nelt; ++i)
38052 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
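  /* Bit N of CONTENTS is now set iff element N of the concatenated
     <op0, op1> pair is referenced by the permutation.  */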
38053
38054 memset (remap, 0xff, sizeof (remap));
38055 dremap = *d;
38056
38057 if (GET_MODE_SIZE (d->vmode) == 16)
38058 {
38059 unsigned HOST_WIDE_INT h1, h2, h3, h4;
38060
38061 /* Split the two input vectors into 4 halves. */
38062 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
38063 h2 = h1 << nelt2;
38064 h3 = h2 << nelt2;
38065 h4 = h3 << nelt2;
38066
38067       /* If the elements are all from the low halves, use interleave low; similarly
38068          use interleave high for the high halves.  If the elements are from
38069          mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
38070 if ((contents & (h1 | h3)) == contents)
38071 {
38072 /* punpckl* */
38073 for (i = 0; i < nelt2; ++i)
38074 {
38075 remap[i] = i * 2;
38076 remap[i + nelt] = i * 2 + 1;
38077 dremap.perm[i * 2] = i;
38078 dremap.perm[i * 2 + 1] = i + nelt;
38079 }
38080 if (!TARGET_SSE2 && d->vmode == V4SImode)
38081 dremap.vmode = V4SFmode;
38082 }
38083 else if ((contents & (h2 | h4)) == contents)
38084 {
38085 /* punpckh* */
38086 for (i = 0; i < nelt2; ++i)
38087 {
38088 remap[i + nelt2] = i * 2;
38089 remap[i + nelt + nelt2] = i * 2 + 1;
38090 dremap.perm[i * 2] = i + nelt2;
38091 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
38092 }
38093 if (!TARGET_SSE2 && d->vmode == V4SImode)
38094 dremap.vmode = V4SFmode;
38095 }
38096 else if ((contents & (h1 | h4)) == contents)
38097 {
38098 /* shufps */
38099 for (i = 0; i < nelt2; ++i)
38100 {
38101 remap[i] = i;
38102 remap[i + nelt + nelt2] = i + nelt2;
38103 dremap.perm[i] = i;
38104 dremap.perm[i + nelt2] = i + nelt + nelt2;
38105 }
38106 if (nelt != 4)
38107 {
38108 /* shufpd */
38109 dremap.vmode = V2DImode;
38110 dremap.nelt = 2;
38111 dremap.perm[0] = 0;
38112 dremap.perm[1] = 3;
38113 }
38114 }
38115 else if ((contents & (h2 | h3)) == contents)
38116 {
38117 /* shufps */
38118 for (i = 0; i < nelt2; ++i)
38119 {
38120 remap[i + nelt2] = i;
38121 remap[i + nelt] = i + nelt2;
38122 dremap.perm[i] = i + nelt2;
38123 dremap.perm[i + nelt2] = i + nelt;
38124 }
38125 if (nelt != 4)
38126 {
38127 /* shufpd */
38128 dremap.vmode = V2DImode;
38129 dremap.nelt = 2;
38130 dremap.perm[0] = 1;
38131 dremap.perm[1] = 2;
38132 }
38133 }
38134 else
38135 return false;
38136 }
38137 else
38138 {
38139 unsigned int nelt4 = nelt / 4, nzcnt = 0;
38140 unsigned HOST_WIDE_INT q[8];
38141 unsigned int nonzero_halves[4];
38142
38143 /* Split the two input vectors into 8 quarters. */
38144 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
38145 for (i = 1; i < 8; ++i)
38146 q[i] = q[0] << (nelt4 * i);
38147 for (i = 0; i < 4; ++i)
38148 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
38149 {
38150 nonzero_halves[nzcnt] = i;
38151 ++nzcnt;
38152 }
38153
38154 if (nzcnt == 1)
38155 {
38156 gcc_assert (d->one_operand_p);
38157 nonzero_halves[1] = nonzero_halves[0];
38158 same_halves = true;
38159 }
38160 else if (d->one_operand_p)
38161 {
38162 gcc_assert (nonzero_halves[0] == 0);
38163 gcc_assert (nonzero_halves[1] == 1);
38164 }
38165
38166 if (nzcnt <= 2)
38167 {
38168 if (d->perm[0] / nelt2 == nonzero_halves[1])
38169 {
38170 /* Attempt to increase the likelihood that dfinal
38171 shuffle will be intra-lane. */
38172 char tmph = nonzero_halves[0];
38173 nonzero_halves[0] = nonzero_halves[1];
38174 nonzero_halves[1] = tmph;
38175 }
38176
38177 /* vperm2f128 or vperm2i128. */
38178 for (i = 0; i < nelt2; ++i)
38179 {
38180 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
38181 remap[i + nonzero_halves[0] * nelt2] = i;
38182 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
38183 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
38184 }
38185
38186 if (d->vmode != V8SFmode
38187 && d->vmode != V4DFmode
38188 && d->vmode != V8SImode)
38189 {
38190 dremap.vmode = V8SImode;
38191 dremap.nelt = 8;
38192 for (i = 0; i < 4; ++i)
38193 {
38194 dremap.perm[i] = i + nonzero_halves[0] * 4;
38195 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
38196 }
38197 }
38198 }
38199 else if (d->one_operand_p)
38200 return false;
38201 else if (TARGET_AVX2
38202 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
38203 {
38204 /* vpunpckl* */
38205 for (i = 0; i < nelt4; ++i)
38206 {
38207 remap[i] = i * 2;
38208 remap[i + nelt] = i * 2 + 1;
38209 remap[i + nelt2] = i * 2 + nelt2;
38210 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
38211 dremap.perm[i * 2] = i;
38212 dremap.perm[i * 2 + 1] = i + nelt;
38213 dremap.perm[i * 2 + nelt2] = i + nelt2;
38214 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
38215 }
38216 }
38217 else if (TARGET_AVX2
38218 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
38219 {
38220 /* vpunpckh* */
38221 for (i = 0; i < nelt4; ++i)
38222 {
38223 remap[i + nelt4] = i * 2;
38224 remap[i + nelt + nelt4] = i * 2 + 1;
38225 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
38226 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
38227 dremap.perm[i * 2] = i + nelt4;
38228 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
38229 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
38230 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
38231 }
38232 }
38233 else
38234 return false;
38235 }
38236
38237 /* Use the remapping array set up above to move the elements from their
38238 swizzled locations into their final destinations. */
38239 dfinal = *d;
38240 for (i = 0; i < nelt; ++i)
38241 {
38242 unsigned e = remap[d->perm[i]];
38243 gcc_assert (e < nelt);
38244 /* If same_halves is true, both halves of the remapped vector are the
38245 same. Avoid cross-lane accesses if possible. */
38246 if (same_halves && i >= nelt2)
38247 {
38248 gcc_assert (e < nelt2);
38249 dfinal.perm[i] = e + nelt2;
38250 }
38251 else
38252 dfinal.perm[i] = e;
38253 }
38254 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
38255 dfinal.op1 = dfinal.op0;
38256 dfinal.one_operand_p = true;
38257 dremap.target = dfinal.op0;
38258
38259 /* Test if the final remap can be done with a single insn. For V4SFmode or
38260 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
38261 start_sequence ();
38262 ok = expand_vec_perm_1 (&dfinal);
38263 seq = get_insns ();
38264 end_sequence ();
38265
38266 if (!ok)
38267 return false;
38268
38269 if (d->testing_p)
38270 return true;
38271
38272 if (dremap.vmode != dfinal.vmode)
38273 {
38274 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
38275 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
38276 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
38277 }
38278
38279 ok = expand_vec_perm_1 (&dremap);
38280 gcc_assert (ok);
38281
38282 emit_insn (seq);
38283 return true;
38284 }
38285
38286 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38287 a single vector cross-lane permutation into vpermq followed
38288 by any of the single insn permutations. */
38289
38290 static bool
38291 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
38292 {
38293 struct expand_vec_perm_d dremap, dfinal;
38294 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
38295 unsigned contents[2];
38296 bool ok;
38297
38298 if (!(TARGET_AVX2
38299 && (d->vmode == V32QImode || d->vmode == V16HImode)
38300 && d->one_operand_p))
38301 return false;
38302
38303 contents[0] = 0;
38304 contents[1] = 0;
38305 for (i = 0; i < nelt2; ++i)
38306 {
38307 contents[0] |= 1u << (d->perm[i] / nelt4);
38308 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
38309 }
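  /* contents[I] records which 64-bit quarters of the input feed half I of
     the result; vpermq can supply at most two distinct quarters per half,
     which the loop below verifies.  */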
38310
38311 for (i = 0; i < 2; ++i)
38312 {
38313 unsigned int cnt = 0;
38314 for (j = 0; j < 4; ++j)
38315 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
38316 return false;
38317 }
38318
38319 if (d->testing_p)
38320 return true;
38321
38322 dremap = *d;
38323 dremap.vmode = V4DImode;
38324 dremap.nelt = 4;
38325 dremap.target = gen_reg_rtx (V4DImode);
38326 dremap.op0 = gen_lowpart (V4DImode, d->op0);
38327 dremap.op1 = dremap.op0;
38328 dremap.one_operand_p = true;
38329 for (i = 0; i < 2; ++i)
38330 {
38331 unsigned int cnt = 0;
38332 for (j = 0; j < 4; ++j)
38333 if ((contents[i] & (1u << j)) != 0)
38334 dremap.perm[2 * i + cnt++] = j;
38335 for (; cnt < 2; ++cnt)
38336 dremap.perm[2 * i + cnt] = 0;
38337 }
38338
38339 dfinal = *d;
38340 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
38341 dfinal.op1 = dfinal.op0;
38342 dfinal.one_operand_p = true;
38343 for (i = 0, j = 0; i < nelt; ++i)
38344 {
38345 if (i == nelt2)
38346 j = 2;
38347 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
38348 if ((d->perm[i] / nelt4) == dremap.perm[j])
38349 ;
38350 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
38351 dfinal.perm[i] |= nelt4;
38352 else
38353 gcc_unreachable ();
38354 }
38355
38356 ok = expand_vec_perm_1 (&dremap);
38357 gcc_assert (ok);
38358
38359 ok = expand_vec_perm_1 (&dfinal);
38360 gcc_assert (ok);
38361
38362 return true;
38363 }
38364
38365 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
38366    a vector permutation using two instructions: vperm2f128 or
38367    vperm2i128, followed by any single in-lane permutation.  */
38368
38369 static bool
38370 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
38371 {
38372 struct expand_vec_perm_d dfirst, dsecond;
38373 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
38374 bool ok;
38375
38376 if (!TARGET_AVX
38377 || GET_MODE_SIZE (d->vmode) != 32
38378 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
38379 return false;
38380
38381 dsecond = *d;
38382 dsecond.one_operand_p = false;
38383 dsecond.testing_p = true;
38384
38385 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
38386 immediate. For perm < 16 the second permutation uses
38387 d->op0 as first operand, for perm >= 16 it uses d->op1
38388 as first operand. The second operand is the result of
38389 vperm2[fi]128. */
38390 for (perm = 0; perm < 32; perm++)
38391 {
38392 /* Ignore permutations which do not move anything cross-lane. */
38393 if (perm < 16)
38394 {
38395 /* The second shuffle for e.g. V4DFmode has
38396 0123 and ABCD operands.
38397 Ignore AB23, as 23 is already in the second lane
38398 of the first operand. */
38399 if ((perm & 0xc) == (1 << 2)) continue;
38400 /* And 01CD, as 01 is in the first lane of the first
38401 operand. */
38402 if ((perm & 3) == 0) continue;
38403 /* And 4567, as then the vperm2[fi]128 doesn't change
38404 anything on the original 4567 second operand. */
38405 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
38406 }
38407 else
38408 {
38409 /* The second shuffle for e.g. V4DFmode has
38410 4567 and ABCD operands.
38411 Ignore AB67, as 67 is already in the second lane
38412 of the first operand. */
38413 if ((perm & 0xc) == (3 << 2)) continue;
38414 /* And 45CD, as 45 is in the first lane of the first
38415 operand. */
38416 if ((perm & 3) == 2) continue;
38417 /* And 0123, as then the vperm2[fi]128 doesn't change
38418 anything on the original 0123 first operand. */
38419 if ((perm & 0xf) == (1 << 2)) continue;
38420 }
38421
38422 for (i = 0; i < nelt; i++)
38423 {
38424 j = d->perm[i] / nelt2;
38425 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
38426 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
38427 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
38428 dsecond.perm[i] = d->perm[i] & (nelt - 1);
38429 else
38430 break;
38431 }
38432
38433 if (i == nelt)
38434 {
38435 start_sequence ();
38436 ok = expand_vec_perm_1 (&dsecond);
38437 end_sequence ();
38438 }
38439 else
38440 ok = false;
38441
38442 if (ok)
38443 {
38444 if (d->testing_p)
38445 return true;
38446
38447 /* Found a usable second shuffle. dfirst will be
38448 vperm2f128 on d->op0 and d->op1. */
38449 dsecond.testing_p = false;
38450 dfirst = *d;
38451 dfirst.target = gen_reg_rtx (d->vmode);
38452 for (i = 0; i < nelt; i++)
38453 dfirst.perm[i] = (i & (nelt2 - 1))
38454 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
38455
38456 ok = expand_vec_perm_1 (&dfirst);
38457 gcc_assert (ok);
38458
38459 /* And dsecond is some single insn shuffle, taking
38460 d->op0 and result of vperm2f128 (if perm < 16) or
38461 d->op1 and result of vperm2f128 (otherwise). */
38462 dsecond.op1 = dfirst.target;
38463 if (perm >= 16)
38464 dsecond.op0 = dfirst.op1;
38465
38466 ok = expand_vec_perm_1 (&dsecond);
38467 gcc_assert (ok);
38468
38469 return true;
38470 }
38471
38472 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
38473 if (d->one_operand_p)
38474 return false;
38475 }
38476
38477 return false;
38478 }
38479
38480 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38481 a two vector permutation using 2 intra-lane interleave insns
38482 and cross-lane shuffle for 32-byte vectors. */
38483
38484 static bool
38485 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
38486 {
38487 unsigned i, nelt;
38488 rtx (*gen) (rtx, rtx, rtx);
38489
38490 if (d->one_operand_p)
38491 return false;
38492 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
38493 ;
38494 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
38495 ;
38496 else
38497 return false;
38498
38499 nelt = d->nelt;
38500 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
38501 return false;
38502 for (i = 0; i < nelt; i += 2)
38503 if (d->perm[i] != d->perm[0] + i / 2
38504 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
38505 return false;
38506
38507 if (d->testing_p)
38508 return true;
38509
38510 switch (d->vmode)
38511 {
38512 case V32QImode:
38513 if (d->perm[0])
38514 gen = gen_vec_interleave_highv32qi;
38515 else
38516 gen = gen_vec_interleave_lowv32qi;
38517 break;
38518 case V16HImode:
38519 if (d->perm[0])
38520 gen = gen_vec_interleave_highv16hi;
38521 else
38522 gen = gen_vec_interleave_lowv16hi;
38523 break;
38524 case V8SImode:
38525 if (d->perm[0])
38526 gen = gen_vec_interleave_highv8si;
38527 else
38528 gen = gen_vec_interleave_lowv8si;
38529 break;
38530 case V4DImode:
38531 if (d->perm[0])
38532 gen = gen_vec_interleave_highv4di;
38533 else
38534 gen = gen_vec_interleave_lowv4di;
38535 break;
38536 case V8SFmode:
38537 if (d->perm[0])
38538 gen = gen_vec_interleave_highv8sf;
38539 else
38540 gen = gen_vec_interleave_lowv8sf;
38541 break;
38542 case V4DFmode:
38543 if (d->perm[0])
38544 gen = gen_vec_interleave_highv4df;
38545 else
38546 gen = gen_vec_interleave_lowv4df;
38547 break;
38548 default:
38549 gcc_unreachable ();
38550 }
38551
38552 emit_insn (gen (d->target, d->op0, d->op1));
38553 return true;
38554 }
38555
38556 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
38557 a single vector permutation using a single intra-lane vector
38558 permutation, vperm2f128 swapping the lanes and vblend* insn blending
38559 the non-swapped and swapped vectors together. */
38560
38561 static bool
38562 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
38563 {
38564 struct expand_vec_perm_d dfirst, dsecond;
38565 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
38566 rtx seq;
38567 bool ok;
38568 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
38569
38570 if (!TARGET_AVX
38571 || TARGET_AVX2
38572 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
38573 || !d->one_operand_p)
38574 return false;
38575
38576 dfirst = *d;
38577 for (i = 0; i < nelt; i++)
38578 dfirst.perm[i] = 0xff;
38579 for (i = 0, msk = 0; i < nelt; i++)
38580 {
38581 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
38582 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
38583 return false;
38584 dfirst.perm[j] = d->perm[i];
38585 if (j != i)
38586 msk |= (1 << i);
38587 }
38588 for (i = 0; i < nelt; i++)
38589 if (dfirst.perm[i] == 0xff)
38590 dfirst.perm[i] = i;
38591
38592 if (!d->testing_p)
38593 dfirst.target = gen_reg_rtx (dfirst.vmode);
38594
38595 start_sequence ();
38596 ok = expand_vec_perm_1 (&dfirst);
38597 seq = get_insns ();
38598 end_sequence ();
38599
38600 if (!ok)
38601 return false;
38602
38603 if (d->testing_p)
38604 return true;
38605
38606 emit_insn (seq);
38607
38608 dsecond = *d;
38609 dsecond.op0 = dfirst.target;
38610 dsecond.op1 = dfirst.target;
38611 dsecond.one_operand_p = true;
38612 dsecond.target = gen_reg_rtx (dsecond.vmode);
38613 for (i = 0; i < nelt; i++)
38614 dsecond.perm[i] = i ^ nelt2;
38615
38616 ok = expand_vec_perm_1 (&dsecond);
38617 gcc_assert (ok);
38618
38619 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
38620 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
38621 return true;
38622 }
38623
38624 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
38625 permutation using two vperm2f128, followed by a vshufpd insn blending
38626 the two vectors together. */
38627
38628 static bool
38629 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
38630 {
38631 struct expand_vec_perm_d dfirst, dsecond, dthird;
38632 bool ok;
38633
38634 if (!TARGET_AVX || (d->vmode != V4DFmode))
38635 return false;
38636
38637 if (d->testing_p)
38638 return true;
38639
38640 dfirst = *d;
38641 dsecond = *d;
38642 dthird = *d;
38643
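  /* dfirst collects the 128-bit lanes holding the elements destined for the
     even result positions and dsecond those for the odd positions; dthird
     then picks the required element from each lane with a vshufpd-style
     blend.  */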
38644 dfirst.perm[0] = (d->perm[0] & ~1);
38645 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
38646 dfirst.perm[2] = (d->perm[2] & ~1);
38647 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
38648 dsecond.perm[0] = (d->perm[1] & ~1);
38649 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
38650 dsecond.perm[2] = (d->perm[3] & ~1);
38651 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
38652 dthird.perm[0] = (d->perm[0] % 2);
38653 dthird.perm[1] = (d->perm[1] % 2) + 4;
38654 dthird.perm[2] = (d->perm[2] % 2) + 2;
38655 dthird.perm[3] = (d->perm[3] % 2) + 6;
38656
38657 dfirst.target = gen_reg_rtx (dfirst.vmode);
38658 dsecond.target = gen_reg_rtx (dsecond.vmode);
38659 dthird.op0 = dfirst.target;
38660 dthird.op1 = dsecond.target;
38661 dthird.one_operand_p = false;
38662
38663 canonicalize_perm (&dfirst);
38664 canonicalize_perm (&dsecond);
38665
38666 ok = expand_vec_perm_1 (&dfirst)
38667 && expand_vec_perm_1 (&dsecond)
38668 && expand_vec_perm_1 (&dthird);
38669
38670 gcc_assert (ok);
38671
38672 return true;
38673 }
38674
38675 /* A subroutine of expand_vec_perm_even_odd_1. Implement a full
38676 two-operand permutation with two pshufb insns and an ior. We should
38677 have already failed all two-instruction sequences. */
38678
38679 static bool
38680 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
38681 {
38682 rtx rperm[2][16], vperm, l, h, op, m128;
38683 unsigned int i, nelt, eltsz;
38684
38685 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38686 return false;
38687 gcc_assert (!d->one_operand_p);
38688
38689 nelt = d->nelt;
38690 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38691
38692 /* Generate two permutation masks. If the required element is within
38693 the given vector it is shuffled into the proper lane. If the required
38694 element is in the other vector, force a zero into the lane by setting
38695 bit 7 in the permutation mask. */
38696 m128 = GEN_INT (-128);
38697 for (i = 0; i < nelt; ++i)
38698 {
38699 unsigned j, e = d->perm[i];
38700 unsigned which = (e >= nelt);
38701 if (e >= nelt)
38702 e -= nelt;
38703
38704 for (j = 0; j < eltsz; ++j)
38705 {
38706 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
38707 rperm[1-which][i*eltsz + j] = m128;
38708 }
38709 }
38710
38711 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
38712 vperm = force_reg (V16QImode, vperm);
38713
38714 l = gen_reg_rtx (V16QImode);
38715 op = gen_lowpart (V16QImode, d->op0);
38716 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
38717
38718 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
38719 vperm = force_reg (V16QImode, vperm);
38720
38721 h = gen_reg_rtx (V16QImode);
38722 op = gen_lowpart (V16QImode, d->op1);
38723 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
38724
38725 op = gen_lowpart (V16QImode, d->target);
38726 emit_insn (gen_iorv16qi3 (op, l, h));
38727
38728 return true;
38729 }
38730
38731 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
38732 with two vpshufb insns, vpermq and vpor. We should have already failed
38733 all two- or three-instruction sequences. */
38734
38735 static bool
38736 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
38737 {
38738 rtx rperm[2][32], vperm, l, h, hp, op, m128;
38739 unsigned int i, nelt, eltsz;
38740
38741 if (!TARGET_AVX2
38742 || !d->one_operand_p
38743 || (d->vmode != V32QImode && d->vmode != V16HImode))
38744 return false;
38745
38746 if (d->testing_p)
38747 return true;
38748
38749 nelt = d->nelt;
38750 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38751
38752 /* Generate two permutation masks. If the required element is within
38753 the same lane, it is shuffled in. If the required element is from the
38754 other lane, force a zero by setting bit 7 in the permutation mask.
38755 The other mask has non-negative entries for the elements requested
38756 from the other lane; those entries are also placed in the other lane,
38757 so that the result of vpshufb can have its two V2TImode halves
38758 swapped. */
38759 m128 = GEN_INT (-128);
38760 for (i = 0; i < nelt; ++i)
38761 {
38762 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38763 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
38764
38765 for (j = 0; j < eltsz; ++j)
38766 {
38767 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
38768 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
38769 }
38770 }
38771
38772 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38773 vperm = force_reg (V32QImode, vperm);
38774
38775 h = gen_reg_rtx (V32QImode);
38776 op = gen_lowpart (V32QImode, d->op0);
38777 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38778
38779 /* Swap the 128-bit lanes of h into hp. */
38780 hp = gen_reg_rtx (V4DImode);
38781 op = gen_lowpart (V4DImode, h);
38782 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
38783 const1_rtx));
38784
38785 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38786 vperm = force_reg (V32QImode, vperm);
38787
38788 l = gen_reg_rtx (V32QImode);
38789 op = gen_lowpart (V32QImode, d->op0);
38790 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38791
38792 op = gen_lowpart (V32QImode, d->target);
38793 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
38794
38795 return true;
38796 }
38797
38798 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
38799 and extract-odd permutations of two V32QImode or V16HImode operands
38800 with two vpshufb insns, vpor and vpermq. We should have already
38801 failed all two- or three-instruction sequences. */
38802
38803 static bool
38804 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
38805 {
38806 rtx rperm[2][32], vperm, l, h, ior, op, m128;
38807 unsigned int i, nelt, eltsz;
38808
38809 if (!TARGET_AVX2
38810 || d->one_operand_p
38811 || (d->vmode != V32QImode && d->vmode != V16HImode))
38812 return false;
38813
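/* Accept only selectors where each element comes from the operand and
   the 128-bit lane that an even/odd extraction would use at that
   position, i.e. the operand bit and the lane bit of PERM[I] must
   match those of 2*I.  */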
38814 for (i = 0; i < d->nelt; ++i)
38815 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
38816 return false;
38817
38818 if (d->testing_p)
38819 return true;
38820
38821 nelt = d->nelt;
38822 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38823
38824 /* Generate two permutation masks. In the first permutation mask
38825 the first quarter contains indexes for the first half of op0,
38826 the second quarter contains bit 7 set, the third quarter contains
38827 indexes for the second half of op0, and the last quarter has
38828 bit 7 set. In the second permutation mask the first quarter
38829 contains bit 7 set, the second quarter indexes for the first half
38830 of op1, the third quarter bit 7 set, and the last quarter indexes
38831 for the second half of op1.
38832 I.e. the first mask for a V32QImode extract-even will be:
38833 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
38834 (all values masked with 0xf except for -128) and the second mask
38835 for extract-even will be
38836 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
38837 m128 = GEN_INT (-128);
38838 for (i = 0; i < nelt; ++i)
38839 {
38840 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
38841 unsigned which = d->perm[i] >= nelt;
38842 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
38843
38844 for (j = 0; j < eltsz; ++j)
38845 {
38846 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
38847 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
38848 }
38849 }
38850
38851 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
38852 vperm = force_reg (V32QImode, vperm);
38853
38854 l = gen_reg_rtx (V32QImode);
38855 op = gen_lowpart (V32QImode, d->op0);
38856 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
38857
38858 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
38859 vperm = force_reg (V32QImode, vperm);
38860
38861 h = gen_reg_rtx (V32QImode);
38862 op = gen_lowpart (V32QImode, d->op1);
38863 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
38864
38865 ior = gen_reg_rtx (V32QImode);
38866 emit_insn (gen_iorv32qi3 (ior, l, h));
38867
38868 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
38869 op = gen_lowpart (V4DImode, d->target);
38870 ior = gen_lowpart (V4DImode, ior);
38871 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
38872 const1_rtx, GEN_INT (3)));
38873
38874 return true;
38875 }
38876
38877 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
38878 and extract-odd permutations. */
38879
38880 static bool
38881 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
38882 {
38883 rtx t1, t2, t3;
38884
38885 switch (d->vmode)
38886 {
38887 case V4DFmode:
38888 t1 = gen_reg_rtx (V4DFmode);
38889 t2 = gen_reg_rtx (V4DFmode);
38890
38891 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
38892 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
38893 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
38894
38895 /* Now an unpck[lh]pd will produce the result required. */
38896 if (odd)
38897 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
38898 else
38899 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
38900 emit_insn (t3);
38901 break;
38902
38903 case V8SFmode:
38904 {
38905 int mask = odd ? 0xdd : 0x88;
38906
38907 t1 = gen_reg_rtx (V8SFmode);
38908 t2 = gen_reg_rtx (V8SFmode);
38909 t3 = gen_reg_rtx (V8SFmode);
38910
38911 /* Shuffle within the 128-bit lanes to produce:
38912 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
38913 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
38914 GEN_INT (mask)));
38915
38916 /* Shuffle the lanes around to produce:
38917 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
38918 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
38919 GEN_INT (0x3)));
38920
38921 /* Shuffle within the 128-bit lanes to produce:
38922 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
38923 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
38924
38925 /* Shuffle within the 128-bit lanes to produce:
38926 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
38927 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
38928
38929 /* Shuffle the lanes around to produce:
38930 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
38931 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
38932 GEN_INT (0x20)));
38933 }
38934 break;
38935
38936 case V2DFmode:
38937 case V4SFmode:
38938 case V2DImode:
38939 case V4SImode:
38940 /* These are always directly implementable by expand_vec_perm_1. */
38941 gcc_unreachable ();
38942
38943 case V8HImode:
38944 if (TARGET_SSSE3)
38945 return expand_vec_perm_pshufb2 (d);
38946 else
38947 {
38948 /* We need 2*log2(N)-1 operations to achieve odd/even
38949 with interleave. */
38950 t1 = gen_reg_rtx (V8HImode);
38951 t2 = gen_reg_rtx (V8HImode);
38952 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
38953 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
38954 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
38955 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
38956 if (odd)
38957 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
38958 else
38959 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
38960 emit_insn (t3);
38961 }
38962 break;
38963
38964 case V16QImode:
38965 if (TARGET_SSSE3)
38966 return expand_vec_perm_pshufb2 (d);
38967 else
38968 {
38969 t1 = gen_reg_rtx (V16QImode);
38970 t2 = gen_reg_rtx (V16QImode);
38971 t3 = gen_reg_rtx (V16QImode);
38972 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
38973 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
38974 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
38975 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
38976 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
38977 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
38978 if (odd)
38979 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
38980 else
38981 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
38982 emit_insn (t3);
38983 }
38984 break;
38985
38986 case V16HImode:
38987 case V32QImode:
38988 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
38989
38990 case V4DImode:
38991 if (!TARGET_AVX2)
38992 {
38993 struct expand_vec_perm_d d_copy = *d;
38994 d_copy.vmode = V4DFmode;
38995 d_copy.target = gen_lowpart (V4DFmode, d->target);
38996 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
38997 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
38998 return expand_vec_perm_even_odd_1 (&d_copy, odd);
38999 }
39000
39001 t1 = gen_reg_rtx (V4DImode);
39002 t2 = gen_reg_rtx (V4DImode);
39003
39004 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39005 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39006 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39007
39008 /* Now a vpunpck[lh]qdq will produce the result required. */
39009 if (odd)
39010 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39011 else
39012 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39013 emit_insn (t3);
39014 break;
39015
39016 case V8SImode:
39017 if (!TARGET_AVX2)
39018 {
39019 struct expand_vec_perm_d d_copy = *d;
39020 d_copy.vmode = V8SFmode;
39021 d_copy.target = gen_lowpart (V8SFmode, d->target);
39022 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39023 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39024 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39025 }
39026
39027 t1 = gen_reg_rtx (V8SImode);
39028 t2 = gen_reg_rtx (V8SImode);
39029
39030 /* Shuffle the lanes around into
39031 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
39032 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
39033 gen_lowpart (V4DImode, d->op0),
39034 gen_lowpart (V4DImode, d->op1),
39035 GEN_INT (0x20)));
39036 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
39037 gen_lowpart (V4DImode, d->op0),
39038 gen_lowpart (V4DImode, d->op1),
39039 GEN_INT (0x31)));
39040
39041 /* Swap the 2nd and 3rd position in each lane into
39042 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
39043 emit_insn (gen_avx2_pshufdv3 (t1, t1,
39044 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39045 emit_insn (gen_avx2_pshufdv3 (t2, t2,
39046 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39047
39048 /* Now a vpunpck[lh]qdq will produce
39049 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
39050 if (odd)
39051 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
39052 gen_lowpart (V4DImode, t1),
39053 gen_lowpart (V4DImode, t2));
39054 else
39055 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
39056 gen_lowpart (V4DImode, t1),
39057 gen_lowpart (V4DImode, t2));
39058 emit_insn (t3);
39059 break;
39060
39061 default:
39062 gcc_unreachable ();
39063 }
39064
39065 return true;
39066 }
39067
39068 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39069 extract-even and extract-odd permutations. */
39070
39071 static bool
39072 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
39073 {
39074 unsigned i, odd, nelt = d->nelt;
39075
39076 odd = d->perm[0];
39077 if (odd != 0 && odd != 1)
39078 return false;
39079
39080 for (i = 1; i < nelt; ++i)
39081 if (d->perm[i] != 2 * i + odd)
39082 return false;
39083
39084 return expand_vec_perm_even_odd_1 (d, odd);
39085 }
39086
39087 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
39088 permutations. We assume that expand_vec_perm_1 has already failed. */
39089
39090 static bool
39091 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
39092 {
39093 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
39094 enum machine_mode vmode = d->vmode;
39095 unsigned char perm2[4];
39096 rtx op0 = d->op0;
39097 bool ok;
39098
39099 switch (vmode)
39100 {
39101 case V4DFmode:
39102 case V8SFmode:
39103 /* These are special-cased in sse.md so that we can optionally
39104 use the vbroadcast instruction. They expand to two insns
39105 if the input happens to be in a register. */
39106 gcc_unreachable ();
39107
39108 case V2DFmode:
39109 case V2DImode:
39110 case V4SFmode:
39111 case V4SImode:
39112 /* These are always implementable using standard shuffle patterns. */
39113 gcc_unreachable ();
39114
39115 case V8HImode:
39116 case V16QImode:
39117 /* These can be implemented via interleave. We save one insn by
39118 stopping once we have promoted to V4SImode and then using pshufd. */
39119 do
39120 {
39121 rtx dest;
39122 rtx (*gen) (rtx, rtx, rtx)
39123 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
39124 : gen_vec_interleave_lowv8hi;
39125
39126 if (elt >= nelt2)
39127 {
39128 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
39129 : gen_vec_interleave_highv8hi;
39130 elt -= nelt2;
39131 }
39132 nelt2 /= 2;
39133
39134 dest = gen_reg_rtx (vmode);
39135 emit_insn (gen (dest, op0, op0));
39136 vmode = get_mode_wider_vector (vmode);
39137 op0 = gen_lowpart (vmode, dest);
39138 }
39139 while (vmode != V4SImode);
39140
39141 memset (perm2, elt, 4);
39142 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
39143 d->testing_p);
39144 gcc_assert (ok);
39145 return true;
39146
39147 case V32QImode:
39148 case V16HImode:
39149 case V8SImode:
39150 case V4DImode:
39151 /* For AVX2, broadcasts of the first element should already have been
39152 handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
39153 gcc_assert (!TARGET_AVX2 || d->perm[0]);
39154 return false;
39155
39156 default:
39157 gcc_unreachable ();
39158 }
39159 }
39160
39161 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39162 broadcast permutations. */
39163
39164 static bool
39165 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
39166 {
39167 unsigned i, elt, nelt = d->nelt;
39168
39169 if (!d->one_operand_p)
39170 return false;
39171
39172 elt = d->perm[0];
39173 for (i = 1; i < nelt; ++i)
39174 if (d->perm[i] != elt)
39175 return false;
39176
39177 return expand_vec_perm_broadcast_1 (d);
39178 }
39179
39180 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
39181 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
39182 all the shorter instruction sequences. */
39183
39184 static bool
39185 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
39186 {
39187 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
39188 unsigned int i, nelt, eltsz;
39189 bool used[4];
39190
39191 if (!TARGET_AVX2
39192 || d->one_operand_p
39193 || (d->vmode != V32QImode && d->vmode != V16HImode))
39194 return false;
39195
39196 if (d->testing_p)
39197 return true;
39198
39199 nelt = d->nelt;
39200 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39201
39202 /* Generate 4 permutation masks. If the required element is within
39203 the same lane, it is shuffled in. If the required element is from the
39204 other lane, force a zero by setting bit 7 in the permutation mask.
39205 The lane-crossing masks have non-negative entries for the elements
39206 requested from the other lane; those entries are also placed in the
39207 other lane, so that the result of vpshufb can have its two V2TImode
39208 halves swapped. */
39209 m128 = GEN_INT (-128);
39210 for (i = 0; i < 32; ++i)
39211 {
39212 rperm[0][i] = m128;
39213 rperm[1][i] = m128;
39214 rperm[2][i] = m128;
39215 rperm[3][i] = m128;
39216 }
39217 used[0] = false;
39218 used[1] = false;
39219 used[2] = false;
39220 used[3] = false;
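/* Each element selects one of the four masks: masks 0 and 1 read from
   op0, masks 2 and 3 from op1; the odd mask of each pair is used when
   the element has to cross 128-bit lanes and is therefore taken from
   the lane-swapped pshufb result (h[]) rather than the in-lane one
   (l[]).  */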
39221 for (i = 0; i < nelt; ++i)
39222 {
39223 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39224 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39225 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
39226
39227 for (j = 0; j < eltsz; ++j)
39228 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
39229 used[which] = true;
39230 }
39231
39232 for (i = 0; i < 2; ++i)
39233 {
39234 if (!used[2 * i + 1])
39235 {
39236 h[i] = NULL_RTX;
39237 continue;
39238 }
39239 vperm = gen_rtx_CONST_VECTOR (V32QImode,
39240 gen_rtvec_v (32, rperm[2 * i + 1]));
39241 vperm = force_reg (V32QImode, vperm);
39242 h[i] = gen_reg_rtx (V32QImode);
39243 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39244 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
39245 }
39246
39247 /* Swap the 128-bit lanes of h[X]. */
39248 for (i = 0; i < 2; ++i)
39249 {
39250 if (h[i] == NULL_RTX)
39251 continue;
39252 op = gen_reg_rtx (V4DImode);
39253 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
39254 const2_rtx, GEN_INT (3), const0_rtx,
39255 const1_rtx));
39256 h[i] = gen_lowpart (V32QImode, op);
39257 }
39258
39259 for (i = 0; i < 2; ++i)
39260 {
39261 if (!used[2 * i])
39262 {
39263 l[i] = NULL_RTX;
39264 continue;
39265 }
39266 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
39267 vperm = force_reg (V32QImode, vperm);
39268 l[i] = gen_reg_rtx (V32QImode);
39269 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
39270 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
39271 }
39272
39273 for (i = 0; i < 2; ++i)
39274 {
39275 if (h[i] && l[i])
39276 {
39277 op = gen_reg_rtx (V32QImode);
39278 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
39279 l[i] = op;
39280 }
39281 else if (h[i])
39282 l[i] = h[i];
39283 }
39284
39285 gcc_assert (l[0] && l[1]);
39286 op = gen_lowpart (V32QImode, d->target);
39287 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
39288 return true;
39289 }
39290
39291 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
39292 With all of the interface bits taken care of, perform the expansion
39293 in D and return true on success. */
39294
39295 static bool
39296 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
39297 {
39298 /* Try a single instruction expansion. */
39299 if (expand_vec_perm_1 (d))
39300 return true;
39301
39302 /* Try sequences of two instructions. */
39303
39304 if (expand_vec_perm_pshuflw_pshufhw (d))
39305 return true;
39306
39307 if (expand_vec_perm_palignr (d))
39308 return true;
39309
39310 if (expand_vec_perm_interleave2 (d))
39311 return true;
39312
39313 if (expand_vec_perm_broadcast (d))
39314 return true;
39315
39316 if (expand_vec_perm_vpermq_perm_1 (d))
39317 return true;
39318
39319 if (expand_vec_perm_vperm2f128 (d))
39320 return true;
39321
39322 /* Try sequences of three instructions. */
39323
39324 if (expand_vec_perm_2vperm2f128_vshuf (d))
39325 return true;
39326
39327 if (expand_vec_perm_pshufb2 (d))
39328 return true;
39329
39330 if (expand_vec_perm_interleave3 (d))
39331 return true;
39332
39333 if (expand_vec_perm_vperm2f128_vblend (d))
39334 return true;
39335
39336 /* Try sequences of four instructions. */
39337
39338 if (expand_vec_perm_vpshufb2_vpermq (d))
39339 return true;
39340
39341 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
39342 return true;
39343
39344 /* ??? Look for narrow permutations whose element orderings would
39345 allow the promotion to a wider mode. */
39346
39347 /* ??? Look for sequences of interleave or a wider permute that place
39348 the data into the correct lanes for a half-vector shuffle like
39349 pshuf[lh]w or vpermilps. */
39350
39351 /* ??? Look for sequences of interleave that produce the desired results.
39352 The combinatorics of punpck[lh] get pretty ugly... */
39353
39354 if (expand_vec_perm_even_odd (d))
39355 return true;
39356
39357 /* Even longer sequences. */
39358 if (expand_vec_perm_vpshufb4_vpermq2 (d))
39359 return true;
39360
39361 return false;
39362 }
39363
39364 /* If a permutation only uses one operand, make that explicit in D. Returns
39365 true if the permutation references both operands. */
39366
39367 static bool
39368 canonicalize_perm (struct expand_vec_perm_d *d)
39369 {
39370 int i, which, nelt = d->nelt;
39371
39372 for (i = which = 0; i < nelt; ++i)
39373 which |= (d->perm[i] < nelt ? 1 : 2);
39374
39375 d->one_operand_p = true;
39376 switch (which)
39377 {
39378 default:
39379 gcc_unreachable();
39380
39381 case 3:
39382 if (!rtx_equal_p (d->op0, d->op1))
39383 {
39384 d->one_operand_p = false;
39385 break;
39386 }
39387 /* The elements of PERM do not suggest that only the first operand
39388 is used, but both operands are identical. Allow easier matching
39389 of the permutation by folding the permutation into the single
39390 input vector. */
39391 /* FALLTHRU */
39392
39393 case 2:
39394 for (i = 0; i < nelt; ++i)
39395 d->perm[i] &= nelt - 1;
39396 d->op0 = d->op1;
39397 break;
39398
39399 case 1:
39400 d->op1 = d->op0;
39401 break;
39402 }
39403
39404 return (which == 3);
39405 }
39406
39407 bool
39408 ix86_expand_vec_perm_const (rtx operands[4])
39409 {
39410 struct expand_vec_perm_d d;
39411 unsigned char perm[MAX_VECT_LEN];
39412 int i, nelt;
39413 bool two_args;
39414 rtx sel;
39415
39416 d.target = operands[0];
39417 d.op0 = operands[1];
39418 d.op1 = operands[2];
39419 sel = operands[3];
39420
39421 d.vmode = GET_MODE (d.target);
39422 gcc_assert (VECTOR_MODE_P (d.vmode));
39423 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39424 d.testing_p = false;
39425
39426 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
39427 gcc_assert (XVECLEN (sel, 0) == nelt);
39428 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
39429
39430 for (i = 0; i < nelt; ++i)
39431 {
39432 rtx e = XVECEXP (sel, 0, i);
39433 int ei = INTVAL (e) & (2 * nelt - 1);
39434 d.perm[i] = ei;
39435 perm[i] = ei;
39436 }
39437
39438 two_args = canonicalize_perm (&d);
39439
39440 if (ix86_expand_vec_perm_const_1 (&d))
39441 return true;
39442
39443 /* If the selector says both arguments are needed, but the operands are the
39444 same, the above tried to expand with one_operand_p set and a flattened
39445 selector. If that didn't work, retry without one_operand_p; we succeeded
39446 with that form during testing. */
39447 if (two_args && d.one_operand_p)
39448 {
39449 d.one_operand_p = false;
39450 memcpy (d.perm, perm, sizeof (perm));
39451 return ix86_expand_vec_perm_const_1 (&d);
39452 }
39453
39454 return false;
39455 }
39456
39457 /* Implement targetm.vectorize.vec_perm_const_ok. */
39458
39459 static bool
39460 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
39461 const unsigned char *sel)
39462 {
39463 struct expand_vec_perm_d d;
39464 unsigned int i, nelt, which;
39465 bool ret;
39466
39467 d.vmode = vmode;
39468 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39469 d.testing_p = true;
39470
39471 /* Given sufficient ISA support we can just return true here
39472 for selected vector modes. */
39473 if (GET_MODE_SIZE (d.vmode) == 16)
39474 {
39475 /* All implementable with a single vpperm insn. */
39476 if (TARGET_XOP)
39477 return true;
39478 /* All implementable with 2 pshufb + 1 ior. */
39479 if (TARGET_SSSE3)
39480 return true;
39481 /* All implementable with shufpd or unpck[lh]pd. */
39482 if (d.nelt == 2)
39483 return true;
39484 }
39485
39486 /* Extract the values from the vector CST into the permutation
39487 array in D. */
39488 memcpy (d.perm, sel, nelt);
39489 for (i = which = 0; i < nelt; ++i)
39490 {
39491 unsigned char e = d.perm[i];
39492 gcc_assert (e < 2 * nelt);
39493 which |= (e < nelt ? 1 : 2);
39494 }
39495
39496 /* If all elements are from the second vector, fold them to the first vector. */
39497 if (which == 2)
39498 for (i = 0; i < nelt; ++i)
39499 d.perm[i] -= nelt;
39500
39501 /* Check whether the mask can be applied to the vector type. */
39502 d.one_operand_p = (which != 3);
39503
39504 /* Implementable with shufps or pshufd. */
39505 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
39506 return true;
39507
39508 /* Otherwise we have to go through the motions and see if we can
39509 figure out how to generate the requested permutation. */
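/* The trial expansion runs inside a throwaway sequence, so raw REGs
   just above the virtual register range serve as placeholder operands
   and nothing is emitted into the current function.  */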
39510 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
39511 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
39512 if (!d.one_operand_p)
39513 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
39514
39515 start_sequence ();
39516 ret = ix86_expand_vec_perm_const_1 (&d);
39517 end_sequence ();
39518
39519 return ret;
39520 }
39521
39522 void
39523 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
39524 {
39525 struct expand_vec_perm_d d;
39526 unsigned i, nelt;
39527
39528 d.target = targ;
39529 d.op0 = op0;
39530 d.op1 = op1;
39531 d.vmode = GET_MODE (targ);
39532 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39533 d.one_operand_p = false;
39534 d.testing_p = false;
39535
39536 for (i = 0; i < nelt; ++i)
39537 d.perm[i] = i * 2 + odd;
39538
39539 /* We'll either be able to implement the permutation directly... */
39540 if (expand_vec_perm_1 (&d))
39541 return;
39542
39543 /* ... or we use the special-case patterns. */
39544 expand_vec_perm_even_odd_1 (&d, odd);
39545 }
39546
39547 static void
39548 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
39549 {
39550 struct expand_vec_perm_d d;
39551 unsigned i, nelt, base;
39552 bool ok;
39553
39554 d.target = targ;
39555 d.op0 = op0;
39556 d.op1 = op1;
39557 d.vmode = GET_MODE (targ);
39558 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
39559 d.one_operand_p = false;
39560 d.testing_p = false;
39561
39562 base = high_p ? nelt / 2 : 0;
39563 for (i = 0; i < nelt / 2; ++i)
39564 {
39565 d.perm[i * 2] = i + base;
39566 d.perm[i * 2 + 1] = i + base + nelt;
39567 }
39568
39569 /* Note that for AVX this isn't one instruction. */
39570 ok = ix86_expand_vec_perm_const_1 (&d);
39571 gcc_assert (ok);
39572 }
39573
39574
39575 /* Expand a vector operation CODE for a V*QImode in terms of the
39576 same operation on V*HImode. */
39577
39578 void
39579 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
39580 {
39581 enum machine_mode qimode = GET_MODE (dest);
39582 enum machine_mode himode;
39583 rtx (*gen_il) (rtx, rtx, rtx);
39584 rtx (*gen_ih) (rtx, rtx, rtx);
39585 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
39586 struct expand_vec_perm_d d;
39587 bool ok, full_interleave;
39588 bool uns_p = false;
39589 int i;
39590
39591 switch (qimode)
39592 {
39593 case V16QImode:
39594 himode = V8HImode;
39595 gen_il = gen_vec_interleave_lowv16qi;
39596 gen_ih = gen_vec_interleave_highv16qi;
39597 break;
39598 case V32QImode:
39599 himode = V16HImode;
39600 gen_il = gen_avx2_interleave_lowv32qi;
39601 gen_ih = gen_avx2_interleave_highv32qi;
39602 break;
39603 default:
39604 gcc_unreachable ();
39605 }
39606
39607 op2_l = op2_h = op2;
39608 switch (code)
39609 {
39610 case MULT:
39611 /* Unpack data such that we've got a source byte in each low byte of
39612 each word. We don't care what goes into the high byte of each word.
39613 Rather than trying to get zero in there, most convenient is to let
39614 it be a copy of the low byte. */
39615 op2_l = gen_reg_rtx (qimode);
39616 op2_h = gen_reg_rtx (qimode);
39617 emit_insn (gen_il (op2_l, op2, op2));
39618 emit_insn (gen_ih (op2_h, op2, op2));
39619 /* FALLTHRU */
39620
39621 op1_l = gen_reg_rtx (qimode);
39622 op1_h = gen_reg_rtx (qimode);
39623 emit_insn (gen_il (op1_l, op1, op1));
39624 emit_insn (gen_ih (op1_h, op1, op1));
39625 full_interleave = qimode == V16QImode;
39626 break;
39627
39628 case ASHIFT:
39629 case LSHIFTRT:
39630 uns_p = true;
39631 /* FALLTHRU */
39632 case ASHIFTRT:
39633 op1_l = gen_reg_rtx (himode);
39634 op1_h = gen_reg_rtx (himode);
39635 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
39636 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
39637 full_interleave = true;
39638 break;
39639 default:
39640 gcc_unreachable ();
39641 }
39642
39643 /* Perform the operation. */
39644 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
39645 1, OPTAB_DIRECT);
39646 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
39647 1, OPTAB_DIRECT);
39648 gcc_assert (res_l && res_h);
39649
39650 /* Merge the data back into the right place. */
39651 d.target = dest;
39652 d.op0 = gen_lowpart (qimode, res_l);
39653 d.op1 = gen_lowpart (qimode, res_h);
39654 d.vmode = qimode;
39655 d.nelt = GET_MODE_NUNITS (qimode);
39656 d.one_operand_p = false;
39657 d.testing_p = false;
39658
39659 if (full_interleave)
39660 {
39661 /* For SSE2, we used a full interleave, so the desired
39662 results are in the even elements. */
39663 for (i = 0; i < 32; ++i)
39664 d.perm[i] = i * 2;
39665 }
39666 else
39667 {
39668 /* For AVX, the interleave used above was not cross-lane. So the
39669 extraction is of the even elements, but with the second and third quarters
39670 swapped. Happily, that is even one insn shorter than even extraction. */
39671 for (i = 0; i < 32; ++i)
39672 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
39673 }
39674
39675 ok = ix86_expand_vec_perm_const_1 (&d);
39676 gcc_assert (ok);
39677
39678 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39679 gen_rtx_fmt_ee (code, qimode, op1, op2));
39680 }
39681
39682 void
39683 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
39684 bool uns_p, bool odd_p)
39685 {
39686 enum machine_mode mode = GET_MODE (op1);
39687 enum machine_mode wmode = GET_MODE (dest);
39688 rtx x;
39689
39690 /* We only play even/odd games with vectors of SImode. */
39691 gcc_assert (mode == V4SImode || mode == V8SImode);
39692
39693 /* If we're looking for the odd results, shift those members down to
39694 the even slots. For some cpus this is faster than a PSHUFD. */
39695 if (odd_p)
39696 {
39697 if (TARGET_XOP && mode == V4SImode)
39698 {
39699 x = force_reg (wmode, CONST0_RTX (wmode));
39700 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
39701 return;
39702 }
39703
39704 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
39705 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
39706 x, NULL, 1, OPTAB_DIRECT);
39707 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
39708 x, NULL, 1, OPTAB_DIRECT);
39709 op1 = gen_lowpart (mode, op1);
39710 op2 = gen_lowpart (mode, op2);
39711 }
39712
39713 if (mode == V8SImode)
39714 {
39715 if (uns_p)
39716 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
39717 else
39718 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
39719 }
39720 else if (uns_p)
39721 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
39722 else if (TARGET_SSE4_1)
39723 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
39724 else
39725 {
39726 rtx s1, s2, t0, t1, t2;
39727
39728 /* The easiest way to implement this without PMULDQ is to go through
39729 the motions as if we are performing a full 64-bit multiply, with
39730 the exception that we need to do less shuffling of the elements. */
39731
39732 /* Compute the sign-extension, aka highparts, of the two operands. */
39733 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39734 op1, pc_rtx, pc_rtx);
39735 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
39736 op2, pc_rtx, pc_rtx);
39737
39738 /* Multiply LO(A) * HI(B), and vice-versa. */
39739 t1 = gen_reg_rtx (wmode);
39740 t2 = gen_reg_rtx (wmode);
39741 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
39742 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
39743
39744 /* Multiply LO(A) * LO(B). */
39745 t0 = gen_reg_rtx (wmode);
39746 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
39747
39748 /* Combine and shift the highparts into place. */
39749 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
39750 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
39751 1, OPTAB_DIRECT);
39752
39753 /* Combine high and low parts. */
39754 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
39755 return;
39756 }
39757 emit_insn (x);
39758 }
39759
39760 void
39761 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
39762 bool uns_p, bool high_p)
39763 {
39764 enum machine_mode wmode = GET_MODE (dest);
39765 enum machine_mode mode = GET_MODE (op1);
39766 rtx t1, t2, t3, t4, mask;
39767
39768 switch (mode)
39769 {
39770 case V4SImode:
39771 t1 = gen_reg_rtx (mode);
39772 t2 = gen_reg_rtx (mode);
39773 if (TARGET_XOP && !uns_p)
39774 {
39775 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
39776 shuffle the elements once so that all elements are in the right
39777 place for immediate use: { A C B D }. */
39778 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
39779 const1_rtx, GEN_INT (3)));
39780 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
39781 const1_rtx, GEN_INT (3)));
39782 }
39783 else
39784 {
39785 /* Put the elements into place for the multiply. */
39786 ix86_expand_vec_interleave (t1, op1, op1, high_p);
39787 ix86_expand_vec_interleave (t2, op2, op2, high_p);
39788 high_p = false;
39789 }
39790 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
39791 break;
39792
39793 case V8SImode:
39794 /* Shuffle the elements between the lanes. After this we
39795 have { A B E F | C D G H } for each operand. */
39796 t1 = gen_reg_rtx (V4DImode);
39797 t2 = gen_reg_rtx (V4DImode);
39798 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
39799 const0_rtx, const2_rtx,
39800 const1_rtx, GEN_INT (3)));
39801 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
39802 const0_rtx, const2_rtx,
39803 const1_rtx, GEN_INT (3)));
39804
39805 /* Shuffle the elements within the lanes. After this we
39806 have { A A B B | C C D D } or { E E F F | G G H H }. */
39807 t3 = gen_reg_rtx (V8SImode);
39808 t4 = gen_reg_rtx (V8SImode);
39809 mask = GEN_INT (high_p
39810 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
39811 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
39812 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
39813 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
39814
39815 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
39816 break;
39817
39818 case V8HImode:
39819 case V16HImode:
39820 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
39821 uns_p, OPTAB_DIRECT);
39822 t2 = expand_binop (mode,
39823 uns_p ? umul_highpart_optab : smul_highpart_optab,
39824 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
39825 gcc_assert (t1 && t2);
39826
39827 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
39828 break;
39829
39830 case V16QImode:
39831 case V32QImode:
39832 t1 = gen_reg_rtx (wmode);
39833 t2 = gen_reg_rtx (wmode);
39834 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
39835 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
39836
39837 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
39838 break;
39839
39840 default:
39841 gcc_unreachable ();
39842 }
39843 }
39844
39845 void
39846 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
39847 {
39848 rtx res_1, res_2;
39849
39850 res_1 = gen_reg_rtx (V4SImode);
39851 res_2 = gen_reg_rtx (V4SImode);
39852 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
39853 op1, op2, true, false);
39854 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
39855 op1, op2, true, true);
39856
39857 /* Move the results in element 2 down to element 1; we don't care
39858 what goes in elements 2 and 3. Then we can merge the parts
39859 back together with an interleave.
39860
39861 Note that two other sequences were tried:
39862 (1) Use interleaves at the start instead of psrldq, which allows
39863 us to use a single shufps to merge things back at the end.
39864 (2) Use shufps here to combine the two vectors, then pshufd to
39865 put the elements in the correct order.
39866 In both cases the cost of the reformatting stall was too high
39867 and the overall sequence slower. */
39868
39869 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
39870 const0_rtx, const0_rtx));
39871 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
39872 const0_rtx, const0_rtx));
39873 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
39874
39875 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
39876 }
39877
39878 void
39879 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
39880 {
39881 enum machine_mode mode = GET_MODE (op0);
39882 rtx t1, t2, t3, t4, t5, t6;
39883
39884 if (TARGET_XOP && mode == V2DImode)
39885 {
39886 /* op1: A,B,C,D, op2: E,F,G,H */
39887 op1 = gen_lowpart (V4SImode, op1);
39888 op2 = gen_lowpart (V4SImode, op2);
39889
39890 t1 = gen_reg_rtx (V4SImode);
39891 t2 = gen_reg_rtx (V4SImode);
39892 t3 = gen_reg_rtx (V2DImode);
39893 t4 = gen_reg_rtx (V2DImode);
39894
39895 /* t1: B,A,D,C */
39896 emit_insn (gen_sse2_pshufd_1 (t1, op1,
39897 GEN_INT (1),
39898 GEN_INT (0),
39899 GEN_INT (3),
39900 GEN_INT (2)));
39901
39902 /* t2: (B*E),(A*F),(D*G),(C*H) */
39903 emit_insn (gen_mulv4si3 (t2, t1, op2));
39904
39905 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
39906 emit_insn (gen_xop_phadddq (t3, t2));
39907
39908 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
39909 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
39910
39911 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
39912 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
39913 }
39914 else
39915 {
39916 enum machine_mode nmode;
39917 rtx (*umul) (rtx, rtx, rtx);
39918
39919 if (mode == V2DImode)
39920 {
39921 umul = gen_vec_widen_umult_even_v4si;
39922 nmode = V4SImode;
39923 }
39924 else if (mode == V4DImode)
39925 {
39926 umul = gen_vec_widen_umult_even_v8si;
39927 nmode = V8SImode;
39928 }
39929 else
39930 gcc_unreachable ();
39931
39932
39933 /* Multiply low parts. */
39934 t1 = gen_reg_rtx (mode);
39935 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
39936
39937 /* Shift input vectors right 32 bits so we can multiply high parts. */
39938 t6 = GEN_INT (32);
39939 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
39940 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
39941
39942 /* Multiply high parts by low parts. */
39943 t4 = gen_reg_rtx (mode);
39944 t5 = gen_reg_rtx (mode);
39945 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
39946 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
39947
39948 /* Combine and shift the highparts back. */
39949 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
39950 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
39951
39952 /* Combine high and low parts. */
39953 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
39954 }
39955
39956 set_unique_reg_note (get_last_insn (), REG_EQUAL,
39957 gen_rtx_MULT (mode, op1, op2));
39958 }
39959
39960 /* Expand an insert into a vector register through pinsr insn.
39961 Return true if successful. */
39962
39963 bool
39964 ix86_expand_pinsr (rtx *operands)
39965 {
39966 rtx dst = operands[0];
39967 rtx src = operands[3];
39968
39969 unsigned int size = INTVAL (operands[1]);
39970 unsigned int pos = INTVAL (operands[2]);
39971
39972 if (GET_CODE (dst) == SUBREG)
39973 {
39974 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
39975 dst = SUBREG_REG (dst);
39976 }
39977
39978 if (GET_CODE (src) == SUBREG)
39979 src = SUBREG_REG (src);
39980
39981 switch (GET_MODE (dst))
39982 {
39983 case V16QImode:
39984 case V8HImode:
39985 case V4SImode:
39986 case V2DImode:
39987 {
39988 enum machine_mode srcmode, dstmode;
39989 rtx (*pinsr)(rtx, rtx, rtx, rtx);
39990
39991 srcmode = mode_for_size (size, MODE_INT, 0);
39992
39993 switch (srcmode)
39994 {
39995 case QImode:
39996 if (!TARGET_SSE4_1)
39997 return false;
39998 dstmode = V16QImode;
39999 pinsr = gen_sse4_1_pinsrb;
40000 break;
40001
40002 case HImode:
40003 if (!TARGET_SSE2)
40004 return false;
40005 dstmode = V8HImode;
40006 pinsr = gen_sse2_pinsrw;
40007 break;
40008
40009 case SImode:
40010 if (!TARGET_SSE4_1)
40011 return false;
40012 dstmode = V4SImode;
40013 pinsr = gen_sse4_1_pinsrd;
40014 break;
40015
40016 case DImode:
40017 gcc_assert (TARGET_64BIT);
40018 if (!TARGET_SSE4_1)
40019 return false;
40020 dstmode = V2DImode;
40021 pinsr = gen_sse4_1_pinsrq;
40022 break;
40023
40024 default:
40025 return false;
40026 }
40027
40028 dst = gen_lowpart (dstmode, dst);
40029 src = gen_lowpart (srcmode, src);
40030
40031 pos /= size;
40032
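/* POS is now an element index rather than a bit offset; the pinsr
   expanders used here take the selected element as a single set bit
   in the immediate, hence 1 << POS.  */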
40033 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
40034 return true;
40035 }
40036
40037 default:
40038 return false;
40039 }
40040 }
40041 \f
40042 /* This function returns the calling-ABI-specific va_list type node,
40043 i.e. the va_list type specific to FNDECL. */
40044
40045 static tree
40046 ix86_fn_abi_va_list (tree fndecl)
40047 {
40048 if (!TARGET_64BIT)
40049 return va_list_type_node;
40050 gcc_assert (fndecl != NULL_TREE);
40051
40052 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
40053 return ms_va_list_type_node;
40054 else
40055 return sysv_va_list_type_node;
40056 }
40057
40058 /* Returns the canonical va_list type specified by TYPE. If there
40059 is no valid TYPE provided, it returns NULL_TREE. */
40060
40061 static tree
40062 ix86_canonical_va_list_type (tree type)
40063 {
40064 tree wtype, htype;
40065
40066 /* Resolve references and pointers to va_list type. */
40067 if (TREE_CODE (type) == MEM_REF)
40068 type = TREE_TYPE (type);
40069 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
40070 type = TREE_TYPE (type);
40071 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
40072 type = TREE_TYPE (type);
40073
40074 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
40075 {
40076 wtype = va_list_type_node;
40077 gcc_assert (wtype != NULL_TREE);
40078 htype = type;
40079 if (TREE_CODE (wtype) == ARRAY_TYPE)
40080 {
40081 /* If va_list is an array type, the argument may have decayed
40082 to a pointer type, e.g. by being passed to another function.
40083 In that case, unwrap both types so that we can compare the
40084 underlying records. */
40085 if (TREE_CODE (htype) == ARRAY_TYPE
40086 || POINTER_TYPE_P (htype))
40087 {
40088 wtype = TREE_TYPE (wtype);
40089 htype = TREE_TYPE (htype);
40090 }
40091 }
40092 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40093 return va_list_type_node;
40094 wtype = sysv_va_list_type_node;
40095 gcc_assert (wtype != NULL_TREE);
40096 htype = type;
40097 if (TREE_CODE (wtype) == ARRAY_TYPE)
40098 {
40099 /* If va_list is an array type, the argument may have decayed
40100 to a pointer type, e.g. by being passed to another function.
40101 In that case, unwrap both types so that we can compare the
40102 underlying records. */
40103 if (TREE_CODE (htype) == ARRAY_TYPE
40104 || POINTER_TYPE_P (htype))
40105 {
40106 wtype = TREE_TYPE (wtype);
40107 htype = TREE_TYPE (htype);
40108 }
40109 }
40110 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40111 return sysv_va_list_type_node;
40112 wtype = ms_va_list_type_node;
40113 gcc_assert (wtype != NULL_TREE);
40114 htype = type;
40115 if (TREE_CODE (wtype) == ARRAY_TYPE)
40116 {
40117 /* If va_list is an array type, the argument may have decayed
40118 to a pointer type, e.g. by being passed to another function.
40119 In that case, unwrap both types so that we can compare the
40120 underlying records. */
40121 if (TREE_CODE (htype) == ARRAY_TYPE
40122 || POINTER_TYPE_P (htype))
40123 {
40124 wtype = TREE_TYPE (wtype);
40125 htype = TREE_TYPE (htype);
40126 }
40127 }
40128 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40129 return ms_va_list_type_node;
40130 return NULL_TREE;
40131 }
40132 return std_canonical_va_list_type (type);
40133 }
40134
40135 /* Iterate through the target-specific builtin types for va_list.
40136 IDX denotes the iterator, *PTREE is set to the result type of
40137 the va_list builtin, and *PNAME to its internal type.
40138 Returns zero if there is no element for this index, otherwise
40139 IDX should be increased upon the next call.
40140 Note, do not iterate a base builtin's name like __builtin_va_list.
40141 Used from c_common_nodes_and_builtins. */
40142
40143 static int
40144 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
40145 {
40146 if (TARGET_64BIT)
40147 {
40148 switch (idx)
40149 {
40150 default:
40151 break;
40152
40153 case 0:
40154 *ptree = ms_va_list_type_node;
40155 *pname = "__builtin_ms_va_list";
40156 return 1;
40157
40158 case 1:
40159 *ptree = sysv_va_list_type_node;
40160 *pname = "__builtin_sysv_va_list";
40161 return 1;
40162 }
40163 }
40164
40165 return 0;
40166 }
40167
40168 #undef TARGET_SCHED_DISPATCH
40169 #define TARGET_SCHED_DISPATCH has_dispatch
40170 #undef TARGET_SCHED_DISPATCH_DO
40171 #define TARGET_SCHED_DISPATCH_DO do_dispatch
40172 #undef TARGET_SCHED_REASSOCIATION_WIDTH
40173 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
40174 #undef TARGET_SCHED_REORDER
40175 #define TARGET_SCHED_REORDER ix86_sched_reorder
40176 #undef TARGET_SCHED_ADJUST_PRIORITY
40177 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
40178 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
40179 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
40180
40181 /* The size of the dispatch window is the total number of bytes of
40182 object code allowed in a window. */
40183 #define DISPATCH_WINDOW_SIZE 16
40184
40185 /* Number of dispatch windows considered for scheduling. */
40186 #define MAX_DISPATCH_WINDOWS 3
40187
40188 /* Maximum number of instructions in a window. */
40189 #define MAX_INSN 4
40190
40191 /* Maximum number of immediate operands in a window. */
40192 #define MAX_IMM 4
40193
40194 /* Maximum number of immediate bits allowed in a window. */
40195 #define MAX_IMM_SIZE 128
40196
40197 /* Maximum number of 32 bit immediates allowed in a window. */
40198 #define MAX_IMM_32 4
40199
40200 /* Maximum number of 64 bit immediates allowed in a window. */
40201 #define MAX_IMM_64 2
40202
40203 /* Maximum total of loads or prefetches allowed in a window. */
40204 #define MAX_LOAD 2
40205
40206 /* Maximum total of stores allowed in a window. */
40207 #define MAX_STORE 1
40208
40209 #undef BIG
40210 #define BIG 100
40211
40212
40213 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
40214 enum dispatch_group {
40215 disp_no_group = 0,
40216 disp_load,
40217 disp_store,
40218 disp_load_store,
40219 disp_prefetch,
40220 disp_imm,
40221 disp_imm_32,
40222 disp_imm_64,
40223 disp_branch,
40224 disp_cmp,
40225 disp_jcc,
40226 disp_last
40227 };
40228
40229 /* Number of allowable groups in a dispatch window. It is an array
40230 indexed by the dispatch_group enum. 100 is used as a big number,
40231 because the number of these kinds of operations does not have any
40232 effect in the dispatch window, but we need entries for them in the
40233 table for other reasons. */
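/* Index order follows enum dispatch_group: no_group, load, store,
   load_store, prefetch, imm, imm_32, imm_64, branch, cmp, jcc.  */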
40234 static unsigned int num_allowable_groups[disp_last] = {
40235 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
40236 };
40237
40238 char group_name[disp_last + 1][16] = {
40239 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
40240 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
40241 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
40242 };
40243
40244 /* Instruction path. */
40245 enum insn_path {
40246 no_path = 0,
40247 path_single, /* Single micro op. */
40248 path_double, /* Double micro op. */
40249 path_multi, /* Instructions with more than 2 micro ops. */
40250 last_path
40251 };
40252
40253 /* sched_insn_info describes one entry of a dispatch window: the
40254 instruction scheduled there together with its dispatch group, path,
40255 and size information.
40256
40257 Windows are allocated for each basic block and are linked
40258 together. */
40259 typedef struct sched_insn_info_s {
40260 rtx insn;
40261 enum dispatch_group group;
40262 enum insn_path path;
40263 int byte_len;
40264 int imm_bytes;
40265 } sched_insn_info;
40266
40267 /* Linked list of dispatch windows. This is a doubly linked list of
40268 dispatch windows of a basic block. It contains information about
40269 the number of uops in the window and the total number of
40270 instructions and of bytes of object code in this dispatch
40271 window. */
40272 typedef struct dispatch_windows_s {
40273 int num_insn; /* Number of insn in the window. */
40274 int num_uops; /* Number of uops in the window. */
40275 int window_size; /* Number of bytes in the window. */
40276 int window_num; /* Window number, 0 or 1. */
40277 int num_imm; /* Number of immediates in an insn. */
40278 int num_imm_32; /* Number of 32 bit immediates in an insn. */
40279 int num_imm_64; /* Number of 64 bit immediates in an insn. */
40280 int imm_size; /* Total immediates in the window. */
40281 int num_loads; /* Total memory loads in the window. */
40282 int num_stores; /* Total memory stores in the window. */
40283 int violation; /* Violation exists in window. */
40284 sched_insn_info *window; /* Pointer to the window. */
40285 struct dispatch_windows_s *next;
40286 struct dispatch_windows_s *prev;
40287 } dispatch_windows;
40288
40289 /* Immediate values used in an insn. */
40290 typedef struct imm_info_s
40291 {
40292 int imm;
40293 int imm32;
40294 int imm64;
40295 } imm_info;
40296
40297 static dispatch_windows *dispatch_window_list;
40298 static dispatch_windows *dispatch_window_list1;
40299
40300 /* Get dispatch group of insn. */
40301
40302 static enum dispatch_group
40303 get_mem_group (rtx insn)
40304 {
40305 enum attr_memory memory;
40306
40307 if (INSN_CODE (insn) < 0)
40308 return disp_no_group;
40309 memory = get_attr_memory (insn);
40310 if (memory == MEMORY_STORE)
40311 return disp_store;
40312
40313 if (memory == MEMORY_LOAD)
40314 return disp_load;
40315
40316 if (memory == MEMORY_BOTH)
40317 return disp_load_store;
40318
40319 return disp_no_group;
40320 }
40321
40322 /* Return true if insn is a compare instruction. */
40323
40324 static bool
40325 is_cmp (rtx insn)
40326 {
40327 enum attr_type type;
40328
40329 type = get_attr_type (insn);
40330 return (type == TYPE_TEST
40331 || type == TYPE_ICMP
40332 || type == TYPE_FCMP
40333 || GET_CODE (PATTERN (insn)) == COMPARE);
40334 }
40335
40336 /* Return true if a dispatch violation was encountered. */
40337
40338 static bool
40339 dispatch_violation (void)
40340 {
40341 if (dispatch_window_list->next)
40342 return dispatch_window_list->next->violation;
40343 return dispatch_window_list->violation;
40344 }
40345
40346 /* Return true if insn is a branch instruction. */
40347
40348 static bool
40349 is_branch (rtx insn)
40350 {
40351 return (CALL_P (insn) || JUMP_P (insn));
40352 }
40353
40354 /* Return true if insn is a prefetch instruction. */
40355
40356 static bool
40357 is_prefetch (rtx insn)
40358 {
40359 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
40360 }
40361
40362 /* This function initializes a dispatch window and the list container holding a
40363 pointer to the window. */
40364
40365 static void
40366 init_window (int window_num)
40367 {
40368 int i;
40369 dispatch_windows *new_list;
40370
40371 if (window_num == 0)
40372 new_list = dispatch_window_list;
40373 else
40374 new_list = dispatch_window_list1;
40375
40376 new_list->num_insn = 0;
40377 new_list->num_uops = 0;
40378 new_list->window_size = 0;
40379 new_list->next = NULL;
40380 new_list->prev = NULL;
40381 new_list->window_num = window_num;
40382 new_list->num_imm = 0;
40383 new_list->num_imm_32 = 0;
40384 new_list->num_imm_64 = 0;
40385 new_list->imm_size = 0;
40386 new_list->num_loads = 0;
40387 new_list->num_stores = 0;
40388 new_list->violation = false;
40389
40390 for (i = 0; i < MAX_INSN; i++)
40391 {
40392 new_list->window[i].insn = NULL;
40393 new_list->window[i].group = disp_no_group;
40394 new_list->window[i].path = no_path;
40395 new_list->window[i].byte_len = 0;
40396 new_list->window[i].imm_bytes = 0;
40397 }
40398 return;
40399 }
40400
40401 /* This function allocates and initializes a dispatch window and the
40402 list container holding a pointer to the window. */
40403
40404 static dispatch_windows *
40405 allocate_window (void)
40406 {
40407 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
40408 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
40409
40410 return new_list;
40411 }
40412
40413 /* This routine initializes the dispatch scheduling information. It
40414 initiates building dispatch scheduler tables and constructs the
40415 first dispatch window. */
40416
40417 static void
40418 init_dispatch_sched (void)
40419 {
40420 /* Allocate a dispatch list and a window. */
40421 dispatch_window_list = allocate_window ();
40422 dispatch_window_list1 = allocate_window ();
40423 init_window (0);
40424 init_window (1);
40425 }
40426
40427 /* This function returns true if a branch is detected. End of a basic block
40428 does not have to be a branch, but here we assume only branches end a
40429 window. */
40430
40431 static bool
40432 is_end_basic_block (enum dispatch_group group)
40433 {
40434 return group == disp_branch;
40435 }
40436
40437 /* This function is called when the end of window processing is reached. */
40438
40439 static void
40440 process_end_window (void)
40441 {
40442 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
40443 if (dispatch_window_list->next)
40444 {
40445 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
40446 gcc_assert (dispatch_window_list->window_size
40447 + dispatch_window_list1->window_size <= 48);
40448 init_window (1);
40449 }
40450 init_window (0);
40451 }
40452
40453 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
40454 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
40455 for 48 bytes of instructions. Note that these are not dispatch
40456 windows whose size is DISPATCH_WINDOW_SIZE. */
40457
40458 static dispatch_windows *
40459 allocate_next_window (int window_num)
40460 {
40461 if (window_num == 0)
40462 {
40463 if (dispatch_window_list->next)
40464 init_window (1);
40465 init_window (0);
40466 return dispatch_window_list;
40467 }
40468
40469 dispatch_window_list->next = dispatch_window_list1;
40470 dispatch_window_list1->prev = dispatch_window_list;
40471
40472 return dispatch_window_list1;
40473 }
40474
40475 /* A for_each_rtx callback that counts and classifies immediate operands.  */
40476
40477 static int
40478 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
40479 {
40480 if (*in_rtx == 0)
40481 return 0;
40482
40483 switch (GET_CODE (*in_rtx))
40484 {
40485 case CONST:
40486 case SYMBOL_REF:
40487 case CONST_INT:
40488 (imm_values->imm)++;
40489 if (x86_64_immediate_operand (*in_rtx, SImode))
40490 (imm_values->imm32)++;
40491 else
40492 (imm_values->imm64)++;
40493 break;
40494
40495 case CONST_DOUBLE:
40496 (imm_values->imm)++;
40497 (imm_values->imm64)++;
40498 break;
40499
40500 case CODE_LABEL:
40501 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
40502 {
40503 (imm_values->imm)++;
40504 (imm_values->imm32)++;
40505 }
40506 break;
40507
40508 default:
40509 break;
40510 }
40511
40512 return 0;
40513 }
40514
40515 /* Compute number of immediate operands of an instruction. */
40516
40517 static void
40518 find_constant (rtx in_rtx, imm_info *imm_values)
40519 {
40520 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
40521 (rtx_function) find_constant_1, (void *) imm_values);
40522 }
40523
40524 /* Return the total size in bytes of the immediate operands of an
40525 instruction, along with the number of such operands.  The counters
40526 are initialized to zero before calling FIND_CONSTANT.
40527 INSN is the input instruction.  IMM is the total number of immediates.
40528 IMM32 is the number of 32 bit immediates.  IMM64 is the number of 64
40529 bit immediates. */
40530
40531 static int
40532 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
40533 {
40534 imm_info imm_values = {0, 0, 0};
40535
40536 find_constant (insn, &imm_values);
40537 *imm = imm_values.imm;
40538 *imm32 = imm_values.imm32;
40539 *imm64 = imm_values.imm64;
40540 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
40541 }
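
/* Worked example (added commentary, not part of the original sources):
   for a hypothetical insn such as

       addl $100, %eax

   find_constant finds one CONST_INT that satisfies
   x86_64_immediate_operand in SImode, so get_num_immediates sets
   *IMM = 1, *IMM32 = 1, *IMM64 = 0 and returns 1 * 4 + 0 * 8 = 4 bytes.
   A movabsq of a full 64-bit constant would instead be counted as a
   64-bit immediate and contribute 8 bytes.  */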
40542
40543 /* This function indicates whether any operand of an instruction is an
40544 immediate. */
40545
40546 static bool
40547 has_immediate (rtx insn)
40548 {
40549 int num_imm_operand;
40550 int num_imm32_operand;
40551 int num_imm64_operand;
40552
40553 if (insn)
40554 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40555 &num_imm64_operand);
40556 return false;
40557 }
40558
40559 /* Return single, double, or multi decode path for instruction INSN.  */
40560
40561 static enum insn_path
40562 get_insn_path (rtx insn)
40563 {
40564 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
40565
40566 if ((int)path == 0)
40567 return path_single;
40568
40569 if ((int)path == 1)
40570 return path_double;
40571
40572 return path_multi;
40573 }
40574
40575 /* Return insn dispatch group. */
40576
40577 static enum dispatch_group
40578 get_insn_group (rtx insn)
40579 {
40580 enum dispatch_group group = get_mem_group (insn);
40581 if (group)
40582 return group;
40583
40584 if (is_branch (insn))
40585 return disp_branch;
40586
40587 if (is_cmp (insn))
40588 return disp_cmp;
40589
40590 if (has_immediate (insn))
40591 return disp_imm;
40592
40593 if (is_prefetch (insn))
40594 return disp_prefetch;
40595
40596 return disp_no_group;
40597 }
40598
40599 /* Count number of GROUP restricted instructions in a dispatch
40600 window WINDOW_LIST. */
40601
40602 static int
40603 count_num_restricted (rtx insn, dispatch_windows *window_list)
40604 {
40605 enum dispatch_group group = get_insn_group (insn);
40606 int imm_size;
40607 int num_imm_operand;
40608 int num_imm32_operand;
40609 int num_imm64_operand;
40610
40611 if (group == disp_no_group)
40612 return 0;
40613
40614 if (group == disp_imm)
40615 {
40616 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40617 &num_imm64_operand);
40618 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
40619 || num_imm_operand + window_list->num_imm > MAX_IMM
40620 || (num_imm32_operand > 0
40621 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
40622 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
40623 || (num_imm64_operand > 0
40624 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
40625 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
40626 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
40627 && num_imm64_operand > 0
40628 && ((window_list->num_imm_64 > 0
40629 && window_list->num_insn >= 2)
40630 || window_list->num_insn >= 3)))
40631 return BIG;
40632
40633 return 1;
40634 }
40635
40636 if ((group == disp_load_store
40637 && (window_list->num_loads >= MAX_LOAD
40638 || window_list->num_stores >= MAX_STORE))
40639 || ((group == disp_load
40640 || group == disp_prefetch)
40641 && window_list->num_loads >= MAX_LOAD)
40642 || (group == disp_store
40643 && window_list->num_stores >= MAX_STORE))
40644 return BIG;
40645
40646 return 1;
40647 }
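
/* Added clarifying note (not part of the original sources): in the
   immediate bookkeeping above a 64-bit immediate is charged as two
   32-bit slots against MAX_IMM_32 (hence the "* 2" terms), so mixing
   32-bit and 64-bit immediates exhausts the window's immediate budget
   faster than the raw operand counts alone would suggest.  */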
40648
40649 /* This function returns true if insn satisfies dispatch rules on the
40650 last window scheduled. */
40651
40652 static bool
40653 fits_dispatch_window (rtx insn)
40654 {
40655 dispatch_windows *window_list = dispatch_window_list;
40656 dispatch_windows *window_list_next = dispatch_window_list->next;
40657 unsigned int num_restrict;
40658 enum dispatch_group group = get_insn_group (insn);
40659 enum insn_path path = get_insn_path (insn);
40660 int sum;
40661
40662 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
40663 instructions should be given the lowest priority in the
40664 scheduling process in the Haifa scheduler to make sure they will be
40665 scheduled in the same dispatch window as the reference to them. */
40666 if (group == disp_jcc || group == disp_cmp)
40667 return false;
40668
40669 /* Check nonrestricted. */
40670 if (group == disp_no_group || group == disp_branch)
40671 return true;
40672
40673 /* Get last dispatch window. */
40674 if (window_list_next)
40675 window_list = window_list_next;
40676
40677 if (window_list->window_num == 1)
40678 {
40679 sum = window_list->prev->window_size + window_list->window_size;
40680
40681 if (sum == 32
40682 || (min_insn_size (insn) + sum) >= 48)
40683 /* Window 1 is full. Go for next window. */
40684 return true;
40685 }
40686
40687 num_restrict = count_num_restricted (insn, window_list);
40688
40689 if (num_restrict > num_allowable_groups[group])
40690 return false;
40691
40692 /* See if it fits in the first window. */
40693 if (window_list->window_num == 0)
40694 {
40695 /* The first window should have only single and double path
40696 uops.  */
40697 if (path == path_double
40698 && (window_list->num_uops + 2) > MAX_INSN)
40699 return false;
40700 else if (path != path_single)
40701 return false;
40702 }
40703 return true;
40704 }
40705
40706 /* Add an instruction INSN with NUM_UOPS micro-operations to the
40707 dispatch window WINDOW_LIST. */
40708
40709 static void
40710 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
40711 {
40712 int byte_len = min_insn_size (insn);
40713 int num_insn = window_list->num_insn;
40714 int imm_size;
40715 sched_insn_info *window = window_list->window;
40716 enum dispatch_group group = get_insn_group (insn);
40717 enum insn_path path = get_insn_path (insn);
40718 int num_imm_operand;
40719 int num_imm32_operand;
40720 int num_imm64_operand;
40721
40722 if (!window_list->violation && group != disp_cmp
40723 && !fits_dispatch_window (insn))
40724 window_list->violation = true;
40725
40726 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40727 &num_imm64_operand);
40728
40729 /* Initialize window with new instruction. */
40730 window[num_insn].insn = insn;
40731 window[num_insn].byte_len = byte_len;
40732 window[num_insn].group = group;
40733 window[num_insn].path = path;
40734 window[num_insn].imm_bytes = imm_size;
40735
40736 window_list->window_size += byte_len;
40737 window_list->num_insn = num_insn + 1;
40738 window_list->num_uops = window_list->num_uops + num_uops;
40739 window_list->imm_size += imm_size;
40740 window_list->num_imm += num_imm_operand;
40741 window_list->num_imm_32 += num_imm32_operand;
40742 window_list->num_imm_64 += num_imm64_operand;
40743
40744 if (group == disp_store)
40745 window_list->num_stores += 1;
40746 else if (group == disp_load
40747 || group == disp_prefetch)
40748 window_list->num_loads += 1;
40749 else if (group == disp_load_store)
40750 {
40751 window_list->num_stores += 1;
40752 window_list->num_loads += 1;
40753 }
40754 }
40755
40756 /* Adds a scheduled instruction, INSN, to the current dispatch window.
40757 If the total bytes of instructions or the number of instructions in
40758 the window exceed the allowable limits, it allocates a new window.  */
40759
40760 static void
40761 add_to_dispatch_window (rtx insn)
40762 {
40763 int byte_len;
40764 dispatch_windows *window_list;
40765 dispatch_windows *next_list;
40766 dispatch_windows *window0_list;
40767 enum insn_path path;
40768 enum dispatch_group insn_group;
40769 bool insn_fits;
40770 int num_insn;
40771 int num_uops;
40772 int window_num;
40773 int insn_num_uops;
40774 int sum;
40775
40776 if (INSN_CODE (insn) < 0)
40777 return;
40778
40779 byte_len = min_insn_size (insn);
40780 window_list = dispatch_window_list;
40781 next_list = window_list->next;
40782 path = get_insn_path (insn);
40783 insn_group = get_insn_group (insn);
40784
40785 /* Get the last dispatch window. */
40786 if (next_list)
40787 window_list = dispatch_window_list->next;
40788
40789 if (path == path_single)
40790 insn_num_uops = 1;
40791 else if (path == path_double)
40792 insn_num_uops = 2;
40793 else
40794 insn_num_uops = (int) path;
40795
40796 /* If the current window is full, get a new window.
40797 Window number zero is full if MAX_INSN uops are scheduled in it.
40798 Window number one is full if the bytes in window zero plus window
40799 one equal 32, or if adding the new instruction would bring the
40800 combined total to 48 bytes or more, or if it already holds MAX_INSN
40801 instructions.  (See the illustrative sketch after this function.)  */
40802 num_insn = window_list->num_insn;
40803 num_uops = window_list->num_uops;
40804 window_num = window_list->window_num;
40805 insn_fits = fits_dispatch_window (insn);
40806
40807 if (num_insn >= MAX_INSN
40808 || num_uops + insn_num_uops > MAX_INSN
40809 || !(insn_fits))
40810 {
40811 window_num = ~window_num & 1;
40812 window_list = allocate_next_window (window_num);
40813 }
40814
40815 if (window_num == 0)
40816 {
40817 add_insn_window (insn, window_list, insn_num_uops);
40818 if (window_list->num_insn >= MAX_INSN
40819 && insn_group == disp_branch)
40820 {
40821 process_end_window ();
40822 return;
40823 }
40824 }
40825 else if (window_num == 1)
40826 {
40827 window0_list = window_list->prev;
40828 sum = window0_list->window_size + window_list->window_size;
40829 if (sum == 32
40830 || (byte_len + sum) >= 48)
40831 {
40832 process_end_window ();
40833 window_list = dispatch_window_list;
40834 }
40835
40836 add_insn_window (insn, window_list, insn_num_uops);
40837 }
40838 else
40839 gcc_unreachable ();
40840
40841 if (is_end_basic_block (insn_group))
40842 {
40843 /* End of basic block is reached; do end-basic-block processing.  */
40844 process_end_window ();
40845 return;
40846 }
40847 }
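
/* Illustrative sketch referenced above (added commentary, not part of
   the original sources; the byte counts are hypothetical):

     - window 0 accepts insns until it holds MAX_INSN uops; the next
       insn then goes to window 1;
     - if window 0 holds 24 bytes and window 1 holds 8 bytes, the sum
       equals 32, so the next insn first triggers process_end_window ()
       and scheduling restarts in a fresh window 0;
     - likewise, if adding the next insn would push the combined size to
       48 bytes or more, both windows are retired first.  */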
40848
40849 /* Print the dispatch window, WINDOW_NUM, to FILE. */
40850
40851 DEBUG_FUNCTION static void
40852 debug_dispatch_window_file (FILE *file, int window_num)
40853 {
40854 dispatch_windows *list;
40855 int i;
40856
40857 if (window_num == 0)
40858 list = dispatch_window_list;
40859 else
40860 list = dispatch_window_list1;
40861
40862 fprintf (file, "Window #%d:\n", list->window_num);
40863 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
40864 list->num_insn, list->num_uops, list->window_size);
40865 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40866 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
40867
40868 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
40869 list->num_stores);
40870 fprintf (file, " insn info:\n");
40871
40872 for (i = 0; i < MAX_INSN; i++)
40873 {
40874 if (!list->window[i].insn)
40875 break;
40876 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
40877 i, group_name[list->window[i].group],
40878 i, (void *)list->window[i].insn,
40879 i, list->window[i].path,
40880 i, list->window[i].byte_len,
40881 i, list->window[i].imm_bytes);
40882 }
40883 }
40884
40885 /* Print to stdout a dispatch window. */
40886
40887 DEBUG_FUNCTION void
40888 debug_dispatch_window (int window_num)
40889 {
40890 debug_dispatch_window_file (stdout, window_num);
40891 }
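
/* Usage note (added, not part of the original sources): as
   DEBUG_FUNCTIONs these are intended to be called by hand from the
   debugger, e.g.

       (gdb) call debug_dispatch_window (0)
       (gdb) call debug_ready_dispatch ()

   to print the current window or the ready list to stdout while
   stepping through the dispatch scheduler.  */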
40892
40893 /* Print INSN dispatch information to FILE. */
40894
40895 DEBUG_FUNCTION static void
40896 debug_insn_dispatch_info_file (FILE *file, rtx insn)
40897 {
40898 int byte_len;
40899 enum insn_path path;
40900 enum dispatch_group group;
40901 int imm_size;
40902 int num_imm_operand;
40903 int num_imm32_operand;
40904 int num_imm64_operand;
40905
40906 if (INSN_CODE (insn) < 0)
40907 return;
40908
40909 byte_len = min_insn_size (insn);
40910 path = get_insn_path (insn);
40911 group = get_insn_group (insn);
40912 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
40913 &num_imm64_operand);
40914
40915 fprintf (file, " insn info:\n");
40916 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
40917 group_name[group], path, byte_len);
40918 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
40919 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
40920 }
40921
40922 /* Print to stdout the status of the ready list with respect to
40923 dispatch windows. */
40924
40925 DEBUG_FUNCTION void
40926 debug_ready_dispatch (void)
40927 {
40928 int i;
40929 int no_ready = number_in_ready ();
40930
40931 fprintf (stdout, "Number of ready: %d\n", no_ready);
40932
40933 for (i = 0; i < no_ready; i++)
40934 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
40935 }
40936
40937 /* This routine is the driver of the dispatch scheduler. */
40938
40939 static void
40940 do_dispatch (rtx insn, int mode)
40941 {
40942 if (mode == DISPATCH_INIT)
40943 init_dispatch_sched ();
40944 else if (mode == ADD_TO_DISPATCH_WINDOW)
40945 add_to_dispatch_window (insn);
40946 }
40947
40948 /* Answer the dispatch-scheduling query ACTION for INSN; return FALSE if dispatch scheduling is not enabled. */
40949
40950 static bool
40951 has_dispatch (rtx insn, int action)
40952 {
40953 if ((TARGET_BDVER1 || TARGET_BDVER2)
40954 && flag_dispatch_scheduler)
40955 switch (action)
40956 {
40957 default:
40958 return false;
40959
40960 case IS_DISPATCH_ON:
40961 return true;
40962 break;
40963
40964 case IS_CMP:
40965 return is_cmp (insn);
40966
40967 case DISPATCH_VIOLATION:
40968 return dispatch_violation ();
40969
40970 case FITS_DISPATCH_WINDOW:
40971 return fits_dispatch_window (insn);
40972 }
40973
40974 return false;
40975 }
40976
40977 /* Implementation of reassociation_width target hook used by
40978 reassoc phase to identify parallelism level in reassociated
40979 tree. Statements tree_code is passed in OPC. Arguments type
40980 is passed in MODE.
40981
40982 Currently parallel reassociation is enabled for Atom
40983 processors only and we set reassociation width to be 2
40984 because Atom may issue up to 2 instructions per cycle.
40985
40986 Return value should be fixed if parallel reassociation is
40987 enabled for other processors. */
40988
40989 static int
40990 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
40991 enum machine_mode mode)
40992 {
40993 int res = 1;
40994
40995 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
40996 res = 2;
40997 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
40998 res = 2;
40999
41000 return res;
41001 }
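
/* Hedged illustration (added, not part of the original sources): with a
   reassociation width of 2 the reassoc pass may rebalance a serial
   chain of additions into two independent subchains.  */
#if 0
static int
reassoc_width_example (int a, int b, int c, int d)
{
  /* Parsed as ((a + b) + c) + d this is a serial dependence chain;
     with width 2 it can be evaluated as (a + b) + (c + d), letting an
     Atom-class core issue the two inner additions in parallel.  */
  return a + b + c + d;
}
#endif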
41002
41003 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41004 place emms and femms instructions. */
41005
41006 static enum machine_mode
41007 ix86_preferred_simd_mode (enum machine_mode mode)
41008 {
41009 if (!TARGET_SSE)
41010 return word_mode;
41011
41012 switch (mode)
41013 {
41014 case QImode:
41015 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41016 case HImode:
41017 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41018 case SImode:
41019 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41020 case DImode:
41021 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41022
41023 case SFmode:
41024 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41025 return V8SFmode;
41026 else
41027 return V4SFmode;
41028
41029 case DFmode:
41030 if (!TARGET_VECTORIZE_DOUBLE)
41031 return word_mode;
41032 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
41033 return V4DFmode;
41034 else if (TARGET_SSE2)
41035 return V2DFmode;
41036 /* FALLTHRU */
41037
41038 default:
41039 return word_mode;
41040 }
41041 }
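
/* Added example (not part of the original sources): a loop over 32-bit
   ints vectorizes with V4SImode (4 lanes in an xmm register) under
   plain SSE, and with V8SImode (8 lanes in a ymm register) under -mavx
   unless -mprefer-avx128 is given; DFmode additionally requires
   TARGET_VECTORIZE_DOUBLE, otherwise word_mode is returned and no
   double-precision vectorization is attempted.  */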
41042
41043 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
41044 vectors. */
41045
41046 static unsigned int
41047 ix86_autovectorize_vector_sizes (void)
41048 {
41049 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
41050 }
41051
41052 \f
41053
41054 /* Return class of registers which could be used for pseudo of MODE
41055 and of class RCLASS for spilling instead of memory. Return NO_REGS
41056 if it is not possible or not profitable.  */
41057 static reg_class_t
41058 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
41059 {
41060 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
41061 && hard_reg_set_subset_p (reg_class_contents[rclass],
41062 reg_class_contents[GENERAL_REGS])
41063 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
41064 return SSE_REGS;
41065 return NO_REGS;
41066 }
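
/* Added note (not part of the original sources): when the active tuning
   enables TARGET_GENERAL_REGS_SSE_SPILL and MMX is disabled, a SImode
   (or, for 64-bit targets, DImode) pseudo of a general-register class
   may be spilled into an SSE register instead of a stack slot, trading
   a GPR<->XMM move for a memory round trip on cores where that is
   cheaper.  */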
41067
41068 /* Implement targetm.vectorize.init_cost. */
41069
41070 static void *
41071 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
41072 {
41073 unsigned *cost = XNEWVEC (unsigned, 3);
41074 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
41075 return cost;
41076 }
41077
41078 /* Implement targetm.vectorize.add_stmt_cost. */
41079
41080 static unsigned
41081 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
41082 struct _stmt_vec_info *stmt_info, int misalign,
41083 enum vect_cost_model_location where)
41084 {
41085 unsigned *cost = (unsigned *) data;
41086 unsigned retval = 0;
41087
41088 if (flag_vect_cost_model)
41089 {
41090 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
41091 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
41092
41093 /* Statements in an inner loop relative to the loop being
41094 vectorized are weighted more heavily. The value here is
41095 arbitrary and could potentially be improved with analysis. */
41096 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
41097 count *= 50; /* FIXME. */
41098
41099 retval = (unsigned) (count * stmt_cost);
41100 cost[where] += retval;
41101 }
41102
41103 return retval;
41104 }
41105
41106 /* Implement targetm.vectorize.finish_cost. */
41107
41108 static void
41109 ix86_finish_cost (void *data, unsigned *prologue_cost,
41110 unsigned *body_cost, unsigned *epilogue_cost)
41111 {
41112 unsigned *cost = (unsigned *) data;
41113 *prologue_cost = cost[vect_prologue];
41114 *body_cost = cost[vect_body];
41115 *epilogue_cost = cost[vect_epilogue];
41116 }
41117
41118 /* Implement targetm.vectorize.destroy_cost_data. */
41119
41120 static void
41121 ix86_destroy_cost_data (void *data)
41122 {
41123 free (data);
41124 }
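
/* Hedged outline (added, not part of the original sources) of how the
   vectorizer is expected to drive the four cost hooks above; the real
   caller lives in the tree vectorizer, so LOOP and STMT_INFO are only
   placeholders here.  */
#if 0
{
  void *data = targetm.vectorize.init_cost (loop);
  unsigned prologue_cost, body_cost, epilogue_cost;

  /* One call per (count, kind, stmt, where) tuple the vectorizer
     considers.  */
  targetm.vectorize.add_stmt_cost (data, 1, vector_stmt, stmt_info, 0,
                                   vect_body);

  targetm.vectorize.finish_cost (data, &prologue_cost, &body_cost,
                                 &epilogue_cost);
  targetm.vectorize.destroy_cost_data (data);
}
#endif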
41125
41126 /* Validate target specific memory model bits in VAL. */
41127
41128 static unsigned HOST_WIDE_INT
41129 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
41130 {
41131 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
41132 unsigned HOST_WIDE_INT strong;
41133
41134 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
41135 |MEMMODEL_MASK)
41136 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
41137 {
41138 warning (OPT_Winvalid_memory_model,
41139 "Unknown architecture specific memory model");
41140 return MEMMODEL_SEQ_CST;
41141 }
41142 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
41143 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
41144 {
41145 warning (OPT_Winvalid_memory_model,
41146 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
41147 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
41148 }
41149 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
41150 {
41151 warning (OPT_Winvalid_memory_model,
41152 "HLE_RELEASE not used with RELEASE or stronger memory model");
41153 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
41154 }
41155 return val;
41156 }
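
/* Hedged illustration (added, not part of the original sources) of the
   kind of user code this check validates, assuming the x86
   __ATOMIC_HLE_ACQUIRE/__ATOMIC_HLE_RELEASE macros map onto
   IX86_HLE_ACQUIRE/IX86_HLE_RELEASE:  */
#if 0
  /* Valid: HLE_ACQUIRE combined with ACQUIRE (or a stronger model).  */
  while (__atomic_exchange_n (&lock, 1,
                              __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    ;
  /* ... critical section ...  */
  __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

  /* Combining __ATOMIC_HLE_ACQUIRE with __ATOMIC_RELAXED instead would
     trigger the warning above and fall back to SEQ_CST | HLE_ACQUIRE.  */
#endif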
41157
41158 /* Initialize the GCC target structure. */
41159 #undef TARGET_RETURN_IN_MEMORY
41160 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
41161
41162 #undef TARGET_LEGITIMIZE_ADDRESS
41163 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
41164
41165 #undef TARGET_ATTRIBUTE_TABLE
41166 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
41167 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41168 # undef TARGET_MERGE_DECL_ATTRIBUTES
41169 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
41170 #endif
41171
41172 #undef TARGET_COMP_TYPE_ATTRIBUTES
41173 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
41174
41175 #undef TARGET_INIT_BUILTINS
41176 #define TARGET_INIT_BUILTINS ix86_init_builtins
41177 #undef TARGET_BUILTIN_DECL
41178 #define TARGET_BUILTIN_DECL ix86_builtin_decl
41179 #undef TARGET_EXPAND_BUILTIN
41180 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
41181
41182 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
41183 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
41184 ix86_builtin_vectorized_function
41185
41186 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
41187 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
41188
41189 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
41190 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
41191
41192 #undef TARGET_VECTORIZE_BUILTIN_GATHER
41193 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
41194
41195 #undef TARGET_BUILTIN_RECIPROCAL
41196 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
41197
41198 #undef TARGET_ASM_FUNCTION_EPILOGUE
41199 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
41200
41201 #undef TARGET_ENCODE_SECTION_INFO
41202 #ifndef SUBTARGET_ENCODE_SECTION_INFO
41203 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
41204 #else
41205 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
41206 #endif
41207
41208 #undef TARGET_ASM_OPEN_PAREN
41209 #define TARGET_ASM_OPEN_PAREN ""
41210 #undef TARGET_ASM_CLOSE_PAREN
41211 #define TARGET_ASM_CLOSE_PAREN ""
41212
41213 #undef TARGET_ASM_BYTE_OP
41214 #define TARGET_ASM_BYTE_OP ASM_BYTE
41215
41216 #undef TARGET_ASM_ALIGNED_HI_OP
41217 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
41218 #undef TARGET_ASM_ALIGNED_SI_OP
41219 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
41220 #ifdef ASM_QUAD
41221 #undef TARGET_ASM_ALIGNED_DI_OP
41222 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
41223 #endif
41224
41225 #undef TARGET_PROFILE_BEFORE_PROLOGUE
41226 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
41227
41228 #undef TARGET_ASM_UNALIGNED_HI_OP
41229 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
41230 #undef TARGET_ASM_UNALIGNED_SI_OP
41231 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
41232 #undef TARGET_ASM_UNALIGNED_DI_OP
41233 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
41234
41235 #undef TARGET_PRINT_OPERAND
41236 #define TARGET_PRINT_OPERAND ix86_print_operand
41237 #undef TARGET_PRINT_OPERAND_ADDRESS
41238 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
41239 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
41240 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
41241 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
41242 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
41243
41244 #undef TARGET_SCHED_INIT_GLOBAL
41245 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
41246 #undef TARGET_SCHED_ADJUST_COST
41247 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
41248 #undef TARGET_SCHED_ISSUE_RATE
41249 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
41250 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
41251 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
41252 ia32_multipass_dfa_lookahead
41253
41254 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
41255 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
41256
41257 #undef TARGET_MEMMODEL_CHECK
41258 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
41259
41260 #ifdef HAVE_AS_TLS
41261 #undef TARGET_HAVE_TLS
41262 #define TARGET_HAVE_TLS true
41263 #endif
41264 #undef TARGET_CANNOT_FORCE_CONST_MEM
41265 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
41266 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
41267 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
41268
41269 #undef TARGET_DELEGITIMIZE_ADDRESS
41270 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
41271
41272 #undef TARGET_MS_BITFIELD_LAYOUT_P
41273 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
41274
41275 #if TARGET_MACHO
41276 #undef TARGET_BINDS_LOCAL_P
41277 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
41278 #endif
41279 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
41280 #undef TARGET_BINDS_LOCAL_P
41281 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
41282 #endif
41283
41284 #undef TARGET_ASM_OUTPUT_MI_THUNK
41285 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
41286 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
41287 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
41288
41289 #undef TARGET_ASM_FILE_START
41290 #define TARGET_ASM_FILE_START x86_file_start
41291
41292 #undef TARGET_OPTION_OVERRIDE
41293 #define TARGET_OPTION_OVERRIDE ix86_option_override
41294
41295 #undef TARGET_REGISTER_MOVE_COST
41296 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
41297 #undef TARGET_MEMORY_MOVE_COST
41298 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
41299 #undef TARGET_RTX_COSTS
41300 #define TARGET_RTX_COSTS ix86_rtx_costs
41301 #undef TARGET_ADDRESS_COST
41302 #define TARGET_ADDRESS_COST ix86_address_cost
41303
41304 #undef TARGET_FIXED_CONDITION_CODE_REGS
41305 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
41306 #undef TARGET_CC_MODES_COMPATIBLE
41307 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
41308
41309 #undef TARGET_MACHINE_DEPENDENT_REORG
41310 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
41311
41312 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
41313 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
41314
41315 #undef TARGET_BUILD_BUILTIN_VA_LIST
41316 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
41317
41318 #undef TARGET_FOLD_BUILTIN
41319 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
41320
41321 #undef TARGET_ENUM_VA_LIST_P
41322 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
41323
41324 #undef TARGET_FN_ABI_VA_LIST
41325 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
41326
41327 #undef TARGET_CANONICAL_VA_LIST_TYPE
41328 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
41329
41330 #undef TARGET_EXPAND_BUILTIN_VA_START
41331 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
41332
41333 #undef TARGET_MD_ASM_CLOBBERS
41334 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
41335
41336 #undef TARGET_PROMOTE_PROTOTYPES
41337 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
41338 #undef TARGET_STRUCT_VALUE_RTX
41339 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
41340 #undef TARGET_SETUP_INCOMING_VARARGS
41341 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
41342 #undef TARGET_MUST_PASS_IN_STACK
41343 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
41344 #undef TARGET_FUNCTION_ARG_ADVANCE
41345 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
41346 #undef TARGET_FUNCTION_ARG
41347 #define TARGET_FUNCTION_ARG ix86_function_arg
41348 #undef TARGET_FUNCTION_ARG_BOUNDARY
41349 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
41350 #undef TARGET_PASS_BY_REFERENCE
41351 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
41352 #undef TARGET_INTERNAL_ARG_POINTER
41353 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
41354 #undef TARGET_UPDATE_STACK_BOUNDARY
41355 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
41356 #undef TARGET_GET_DRAP_RTX
41357 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
41358 #undef TARGET_STRICT_ARGUMENT_NAMING
41359 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
41360 #undef TARGET_STATIC_CHAIN
41361 #define TARGET_STATIC_CHAIN ix86_static_chain
41362 #undef TARGET_TRAMPOLINE_INIT
41363 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
41364 #undef TARGET_RETURN_POPS_ARGS
41365 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
41366
41367 #undef TARGET_LEGITIMATE_COMBINED_INSN
41368 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
41369
41370 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
41371 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
41372
41373 #undef TARGET_SCALAR_MODE_SUPPORTED_P
41374 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
41375
41376 #undef TARGET_VECTOR_MODE_SUPPORTED_P
41377 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
41378
41379 #undef TARGET_C_MODE_FOR_SUFFIX
41380 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
41381
41382 #ifdef HAVE_AS_TLS
41383 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
41384 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
41385 #endif
41386
41387 #ifdef SUBTARGET_INSERT_ATTRIBUTES
41388 #undef TARGET_INSERT_ATTRIBUTES
41389 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
41390 #endif
41391
41392 #undef TARGET_MANGLE_TYPE
41393 #define TARGET_MANGLE_TYPE ix86_mangle_type
41394
41395 #if !TARGET_MACHO
41396 #undef TARGET_STACK_PROTECT_FAIL
41397 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
41398 #endif
41399
41400 #undef TARGET_FUNCTION_VALUE
41401 #define TARGET_FUNCTION_VALUE ix86_function_value
41402
41403 #undef TARGET_FUNCTION_VALUE_REGNO_P
41404 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
41405
41406 #undef TARGET_PROMOTE_FUNCTION_MODE
41407 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
41408
41409 #undef TARGET_MEMBER_TYPE_FORCES_BLK
41410 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
41411
41412 #undef TARGET_SECONDARY_RELOAD
41413 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
41414
41415 #undef TARGET_CLASS_MAX_NREGS
41416 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
41417
41418 #undef TARGET_PREFERRED_RELOAD_CLASS
41419 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
41420 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
41421 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
41422 #undef TARGET_CLASS_LIKELY_SPILLED_P
41423 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
41424
41425 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
41426 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
41427 ix86_builtin_vectorization_cost
41428 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
41429 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
41430 ix86_vectorize_vec_perm_const_ok
41431 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
41432 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
41433 ix86_preferred_simd_mode
41434 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
41435 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
41436 ix86_autovectorize_vector_sizes
41437 #undef TARGET_VECTORIZE_INIT_COST
41438 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
41439 #undef TARGET_VECTORIZE_ADD_STMT_COST
41440 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
41441 #undef TARGET_VECTORIZE_FINISH_COST
41442 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
41443 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
41444 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
41445
41446 #undef TARGET_SET_CURRENT_FUNCTION
41447 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
41448
41449 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
41450 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
41451
41452 #undef TARGET_OPTION_SAVE
41453 #define TARGET_OPTION_SAVE ix86_function_specific_save
41454
41455 #undef TARGET_OPTION_RESTORE
41456 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
41457
41458 #undef TARGET_OPTION_PRINT
41459 #define TARGET_OPTION_PRINT ix86_function_specific_print
41460
41461 #undef TARGET_CAN_INLINE_P
41462 #define TARGET_CAN_INLINE_P ix86_can_inline_p
41463
41464 #undef TARGET_EXPAND_TO_RTL_HOOK
41465 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
41466
41467 #undef TARGET_LEGITIMATE_ADDRESS_P
41468 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
41469
41470 #undef TARGET_LRA_P
41471 #define TARGET_LRA_P ix86_lra_p
41472
41473 #undef TARGET_REGISTER_PRIORITY
41474 #define TARGET_REGISTER_PRIORITY ix86_register_priority
41475
41476 #undef TARGET_LEGITIMATE_CONSTANT_P
41477 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
41478
41479 #undef TARGET_FRAME_POINTER_REQUIRED
41480 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
41481
41482 #undef TARGET_CAN_ELIMINATE
41483 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
41484
41485 #undef TARGET_EXTRA_LIVE_ON_ENTRY
41486 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
41487
41488 #undef TARGET_ASM_CODE_END
41489 #define TARGET_ASM_CODE_END ix86_code_end
41490
41491 #undef TARGET_CONDITIONAL_REGISTER_USAGE
41492 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
41493
41494 #if TARGET_MACHO
41495 #undef TARGET_INIT_LIBFUNCS
41496 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
41497 #endif
41498
41499 #undef TARGET_SPILL_CLASS
41500 #define TARGET_SPILL_CLASS ix86_spill_class
41501
41502 struct gcc_target targetm = TARGET_INITIALIZER;
41503 \f
41504 #include "gt-i386.h"