1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
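/* The aux field read through BLOCK_INFO is allocated per basic block by
   alloc_aux_for_blocks (sizeof (struct block_info_def)) in
   move_or_delete_vzeroupper below and released by free_aux_for_blocks at
   the end of that pass.  */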
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
 96   /* Callee neither returns nor passes a 256bit AVX register, or there is
 97      no 256bit AVX register in the function return. */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 295    as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
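	  /* Fall through: when UNKNOWN_IS_UNUSED is set, an unknown
	     predecessor is simply treated like an unused one.  */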
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
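/* The pass below is organized as a forward data-flow problem: blocks are
   visited in reverse completion order through two fibheap worklists, edges
   whose destination changes state are re-queued, and the pending set is
   scanned again as long as some block's exit state became USED
   (rescan_vzeroupper_p).  */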
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
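/* For example, MODE_INDEX (SImode) evaluates to 2 and selects the SImode
   entry of the five-element {QI, HI, SI, DI, other} multiply and divide cost
   arrays in the processor_costs tables below.  */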
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
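/* Under that assumption COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so the
   two-byte add in the size table below carries the same cost as a single add
   in the speed-tuned tables.  */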
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
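/* In the stringop descriptors below, each {maximum size, algorithm} entry
   applies up to the given byte count, with the {-1, ...} entry covering all
   larger sizes; DUMMY_STRINGOP_ALGS is a placeholder for a slot that is not
   tuned separately.  */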
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 848      (we ensure the alignment). For small blocks an inline loop is still a
 849      noticeable win; for bigger blocks either rep movsl or rep movsb is the
 850      way to go. Rep movsb apparently has a more expensive startup time in the CPU,
 851      but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1068   /* For some reason, Athlon deals better with the REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
 1134      immediately, they are queued. We set the number of simultaneous prefetches
 1135      to a large constant to reflect this (it is probably not a good idea not
 1136      to limit the number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1146   /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 1147      small blocks it is better to use a loop. For large blocks, a libcall can
 1148      do nontemporal accesses and beat inline code considerably. */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
 1220      immediately, they are queued. We set the number of simultaneous prefetches
 1221      to a large constant to reflect this (it is probably not a good idea not
 1222      to limit the number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
 1233   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 1234      very small blocks it is better to use a loop. For large blocks, a libcall can
 1235      do nontemporal accesses and beat inline code considerably. */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
 1307      immediately, they are queued. We set the number of simultaneous prefetches
 1308      to a large constant to reflect this (it is probably not a good idea not
 1309      to limit the number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
 1320   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
 1321      very small blocks it is better to use a loop. For large blocks, a libcall
 1322      can do nontemporal accesses and beat inline code considerably. */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
 1394      immediately, they are queued. We set the number of simultaneous prefetches
 1395      to a large constant to reflect this (it is probably not a good idea not
 1396      to limit the number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
 1407   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
 1408      very small blocks it is better to use a loop. For large blocks, a libcall
 1409      can do nontemporal accesses and beat inline code considerably. */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
 1489   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
 1490      very small blocks it is better to use a loop. For large blocks, a libcall can
 1491      do nontemporal accesses and beat inline code considerably. */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
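/* The COSTS_N_INSNS entries above express latencies in the RTL cost
   framework's unit, where COSTS_N_INSNS (1) is the cost of one average
   instruction (the macro in rtl.h multiplies by 4).  So, for example,
   atom_cost's COSTS_N_INSNS (20) for FDIV stands for roughly twenty
   instructions' worth of latency.  A tiny illustrative helper for
   converting back (not part of GCC):  */

static int
example_cost_in_insns (int rtx_cost)
{
  return rtx_cost / 4;          /* inverse of COSTS_N_INSNS (N) = N * 4  */
}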
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration lea is 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff. We can't enable it for 32bit generic because it does not
1925 work well with PPro based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results. But after P4 was made, no performance benefit
1939 was observed with branch hints; they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls in the Generic32 compilation setting as well. However,
1955 in the current implementation the partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro based chips and is in conflict with the partial reg
1959 dependencies used by Athlon/P4 based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls was more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4 based chips that treat 128bit
2039 SSE registers as single units and K8 based chips that divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra microop on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over 20% SPECfp regression,
2044 while enabling it on K8 brings roughly 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiply, but the 386 and 486 do HImode multiply faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
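/* Each entry in the two tables above (initial_ix86_tune_features and
   initial_ix86_arch_features) is a bitmask over the m_* processor bits;
   at option-override time a per-feature boolean array is derived by
   testing every entry against the mask of the selected -mtune/-march
   processor.  A simplified, self-contained sketch of that derivation
   (example_* names are invented; this is not the code in
   ix86_option_override_internal):  */

static void
example_init_feature_flags (unsigned char *features,
                            const unsigned int *feature_masks,
                            int feature_count, int processor)
{
  unsigned int processor_mask = 1u << processor;
  int i;

  for (i = 0; i < feature_count; i++)
    features[i] = (feature_masks[i] & processor_mask) != 0;
}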
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* If the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
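/* The three maps above translate GCC's internal register numbers into
   the numbering a given debug format expects; an entry of -1 means the
   register has no representation there.  For example, per the comment
   above, %esi is gcc regno 4 and svr4_dbx_register_map[4] is 6, the
   SVR4 DWARF number for %esi.  A tiny illustrative lookup (invented
   example_* name, not GCC code):  */

static int
example_debugger_regno (const int *reg_map, int gcc_regno)
{
  /* Return the debug-format register number, or -1 if there is none.  */
  return reg_map[gcc_regno];
}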
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
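/* The tables above give the registers used for integer argument passing
   and integer return values in the two 64-bit calling conventions
   (SysV uses six argument registers, the MS ABI four).  Ignoring the
   full psABI classification, the n-th integer argument that fits in a
   register simply takes the n-th entry; a rough illustrative sketch
   (example_* name invented, not GCC's argument-passing code):  */

static int
example_int_arg_regno (const int *parm_regs, int n_parm_regs, int arg_index)
{
  /* Arguments beyond the register set are passed on the stack (-1).  */
  return arg_index < n_parm_regs ? parm_regs[arg_index] : -1;
}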
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register class used for passing a given 64bit part of the argument.
2473 These represent classes as documented by the psABI, except that SSESF
2474 and SSEDF are basically the SSE class; gcc just uses an SFmode or
2475 DFmode move instead of DImode to avoid reformatting penalties.
2476 
2477 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half does contain padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
2493
2494 #define MAX_CLASSES 4
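/* As a concrete illustration of the classes above: a 16-byte
   struct { double d; int i; } occupies two eightbytes; under the psABI
   the first (the double) is SSE - represented here as SSEDF - and the
   second is INTEGER - here INTEGERSI, since only its low half carries
   data and the upper half is padding, as noted above.  Illustrative
   only; this array is not used by GCC:  */

static const enum x86_64_reg_class example_struct_classes[MAX_CLASSES] =
{
  X86_64_SSEDF_CLASS, X86_64_INTEGERSI_CLASS,
  X86_64_NO_CLASS, X86_64_NO_CLASS
};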
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
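/* processor_target_table is indexed directly by the processor_type
   enumeration: the selected -mtune processor picks both the cost table
   and the default loop/jump/function alignments.  A minimal
   illustrative accessor (not a function GCC defines):  */

static const struct ptt *
example_target_entry (enum processor_type tune)
{
  return &processor_target_table[tune];
}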
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2 that imply
2654 other options are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the options in isa options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the
2859 prologue, and false otherwise.
2860 Note: for x86 with "hotfix" (hot-patch) prologues this is problematic. */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
2934
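  /* Later in this function, once the -march= entry is found in
     processor_alias_table below, each PTA_* bit set in its flags field
     turns on the corresponding OPTION_MASK_ISA_* bit in ix86_isa_flags,
     unless the user already set that ISA option explicitly.  Roughly
     (illustrative excerpt, not the literal code):

       if (processor_alias_table[i].flags & PTA_SSE2)
         ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
  */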
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX |PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
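 /* For example, a bad -mtune value given on the command line is reported as
 "bad value (foo) for -mtune= switch", while the same mistake inside
 attribute((target("tune=foo"))) is reported against the option("tune=")
 attribute.  ("foo" is just a placeholder here.)  */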
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* -fPIC is the default for 64-bit Mach-O. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
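 /* Net effect when no -mcmodel= is given: 64-bit MS-ABI targets get the
 small PIC model with rip-relative addressing, other 64-bit targets get
 CM_SMALL or CM_SMALL_PIC depending on -fPIC, and 32-bit targets always
 use CM_32.  */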
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
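 /* At this point every PTA_* capability bit of the matched -march entry has
 turned on the corresponding OPTION_MASK_ISA_* flag, except where the user
 set that ISA explicitly (tracked in ix86_isa_flags_explicit).  */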
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
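 /* Each entry of initial_ix86_arch_features is a bit vector indexed by
 processor (m_386, m_K8, ...), so masking with 1u << ix86_arch picks out
 this architecture's column of the feature table.  */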
3374
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize > 1 && !global_options_set.x_flag_zee)
3453 flag_zee = 1;
3454 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3455 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3456 if (flag_asynchronous_unwind_tables == 2)
3457 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3458 if (flag_pcc_struct_return == 2)
3459 flag_pcc_struct_return = 0;
3460 }
3461 else
3462 {
3463 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3464 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3465 if (flag_asynchronous_unwind_tables == 2)
3466 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3467 if (flag_pcc_struct_return == 2)
3468 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3469 }
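 /* For instance, a plain 64-bit -O2 compile ends up with
 -fomit-frame-pointer and asynchronous unwind tables enabled, whereas
 32-bit code keeps the frame pointer whenever USE_IX86_FRAME_POINTER or
 optimize_size is set.  */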
3470
3471 if (optimize_size)
3472 ix86_cost = &ix86_size_cost;
3473 else
3474 ix86_cost = processor_target_table[ix86_tune].cost;
3475
3476 /* Arrange to set up i386_stack_locals for all functions. */
3477 init_machine_status = ix86_init_machine_status;
3478
3479 /* Validate -mregparm= value. */
3480 if (global_options_set.x_ix86_regparm)
3481 {
3482 if (TARGET_64BIT)
3483 warning (0, "-mregparm is ignored in 64-bit mode");
3484 if (ix86_regparm > REGPARM_MAX)
3485 {
3486 error ("-mregparm=%d is not between 0 and %d",
3487 ix86_regparm, REGPARM_MAX);
3488 ix86_regparm = 0;
3489 }
3490 }
3491 if (TARGET_64BIT)
3492 ix86_regparm = REGPARM_MAX;
3493
3494 /* Default align_* from the processor table. */
3495 if (align_loops == 0)
3496 {
3497 align_loops = processor_target_table[ix86_tune].align_loop;
3498 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3499 }
3500 if (align_jumps == 0)
3501 {
3502 align_jumps = processor_target_table[ix86_tune].align_jump;
3503 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3504 }
3505 if (align_functions == 0)
3506 {
3507 align_functions = processor_target_table[ix86_tune].align_func;
3508 }
3509
3510 /* Provide default for -mbranch-cost= value. */
3511 if (!global_options_set.x_ix86_branch_cost)
3512 ix86_branch_cost = ix86_cost->branch_cost;
3513
3514 if (TARGET_64BIT)
3515 {
3516 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3517
3518 /* Enable by default the SSE and MMX builtins. Do allow the user to
3519 explicitly disable any of these. In particular, disabling SSE and
3520 MMX for kernel code is extremely useful. */
3521 if (!ix86_arch_specified)
3522 ix86_isa_flags
3523 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3524 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3525
3526 if (TARGET_RTD)
3527 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3528 }
3529 else
3530 {
3531 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3532
3533 if (!ix86_arch_specified)
3534 ix86_isa_flags
3535 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3536
3537 /* The i386 ABI does not specify a red zone. It still makes sense to use
3538 it when the programmer takes care to keep the stack from being destroyed. */
3539 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3540 target_flags |= MASK_NO_RED_ZONE;
3541 }
3542
3543 /* Keep nonleaf frame pointers. */
3544 if (flag_omit_frame_pointer)
3545 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3546 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3547 flag_omit_frame_pointer = 1;
3548
3549 /* If we're doing fast math, we don't care about comparison order
3550 wrt NaNs. This lets us use a shorter comparison sequence. */
3551 if (flag_finite_math_only)
3552 target_flags &= ~MASK_IEEE_FP;
3553
3554 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3555 since the insns won't need emulation. */
3556 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3557 target_flags &= ~MASK_NO_FANCY_MATH_387;
3558
3559 /* Likewise, if the target doesn't have a 387, or we've specified
3560 software floating point, don't use 387 inline intrinsics. */
3561 if (!TARGET_80387)
3562 target_flags |= MASK_NO_FANCY_MATH_387;
3563
3564 /* Turn on MMX builtins for -msse. */
3565 if (TARGET_SSE)
3566 {
3567 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3568 x86_prefetch_sse = true;
3569 }
3570
3571 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3572 if (TARGET_SSE4_2 || TARGET_ABM)
3573 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3574
3575 /* Turn on lzcnt instruction for -mabm. */
3576 if (TARGET_ABM)
3577 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3578
3579 /* Validate -mpreferred-stack-boundary= value or default it to
3580 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3581 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3582 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3583 {
3584 int min = (TARGET_64BIT ? 4 : 2);
3585 int max = (TARGET_SEH ? 4 : 12);
3586
3587 if (ix86_preferred_stack_boundary_arg < min
3588 || ix86_preferred_stack_boundary_arg > max)
3589 {
3590 if (min == max)
3591 error ("-mpreferred-stack-boundary is not supported "
3592 "for this target");
3593 else
3594 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3595 ix86_preferred_stack_boundary_arg, min, max);
3596 }
3597 else
3598 ix86_preferred_stack_boundary
3599 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3600 }
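 /* The argument is log2 of the boundary in bytes; e.g.
 -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT = 128 bits,
 i.e. 16-byte stack alignment.  */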
3601
3602 /* Set the default value for -mstackrealign. */
3603 if (ix86_force_align_arg_pointer == -1)
3604 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3605
3606 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3607
3608 /* Validate -mincoming-stack-boundary= value or default it to
3609 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3610 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3611 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3612 {
3613 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3614 || ix86_incoming_stack_boundary_arg > 12)
3615 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3616 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3617 else
3618 {
3619 ix86_user_incoming_stack_boundary
3620 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3621 ix86_incoming_stack_boundary
3622 = ix86_user_incoming_stack_boundary;
3623 }
3624 }
3625
3626 /* Accept -msseregparm only if at least SSE support is enabled. */
3627 if (TARGET_SSEREGPARM
3628 && ! TARGET_SSE)
3629 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3630
3631 if (global_options_set.x_ix86_fpmath)
3632 {
3633 if (ix86_fpmath & FPMATH_SSE)
3634 {
3635 if (!TARGET_SSE)
3636 {
3637 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3638 ix86_fpmath = FPMATH_387;
3639 }
3640 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3641 {
3642 warning (0, "387 instruction set disabled, using SSE arithmetics");
3643 ix86_fpmath = FPMATH_SSE;
3644 }
3645 }
3646 }
3647 else
3648 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3649
3650 /* If the i387 is disabled, then do not return values in it. */
3651 if (!TARGET_80387)
3652 target_flags &= ~MASK_FLOAT_RETURNS;
3653
3654 /* Use external vectorized library in vectorizing intrinsics. */
3655 if (global_options_set.x_ix86_veclibabi_type)
3656 switch (ix86_veclibabi_type)
3657 {
3658 case ix86_veclibabi_type_svml:
3659 ix86_veclib_handler = ix86_veclibabi_svml;
3660 break;
3661
3662 case ix86_veclibabi_type_acml:
3663 ix86_veclib_handler = ix86_veclibabi_acml;
3664 break;
3665
3666 default:
3667 gcc_unreachable ();
3668 }
3669
3670 if ((!USE_IX86_FRAME_POINTER
3671 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3672 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3673 && !optimize_size)
3674 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3675
3676 /* ??? Unwind info is not correct around the CFG unless either a frame
3677 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3678 unwind info generation to be aware of the CFG and propagating states
3679 around edges. */
3680 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3681 || flag_exceptions || flag_non_call_exceptions)
3682 && flag_omit_frame_pointer
3683 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 {
3685 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3686 warning (0, "unwind tables currently require either a frame pointer "
3687 "or %saccumulate-outgoing-args%s for correctness",
3688 prefix, suffix);
3689 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 }
3691
3692 /* If stack probes are required, the space used for large function
3693 arguments on the stack must also be probed, so enable
3694 -maccumulate-outgoing-args so this happens in the prologue. */
3695 if (TARGET_STACK_PROBE
3696 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3697 {
3698 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3699 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3700 "for correctness", prefix, suffix);
3701 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3702 }
3703
3704 /* For sane SSE instruction set generation we need the fcomi instruction.
3705 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3706 expands to a sequence that includes a conditional move. */
3707 if (TARGET_SSE || TARGET_RDRND)
3708 TARGET_CMOVE = 1;
3709
3710 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3711 {
3712 char *p;
3713 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714 p = strchr (internal_label_prefix, 'X');
3715 internal_label_prefix_len = p - internal_label_prefix;
3716 *p = '\0';
3717 }
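 /* On a typical ELF target ASM_GENERATE_INTERNAL_LABEL produces something
 like "*.LX0" here, so after truncating at the 'X' the saved prefix is
 "*.L" (the exact string is target-dependent).  */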
3718
3719 /* When the scheduling description is not available, disable the scheduler
3720 pass so it won't slow down compilation and make x87 code slower. */
3721 if (!TARGET_SCHEDULE)
3722 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723
3724 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 ix86_cost->simultaneous_prefetches,
3726 global_options.x_param_values,
3727 global_options_set.x_param_values);
3728 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 global_options.x_param_values,
3730 global_options_set.x_param_values);
3731 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 global_options.x_param_values,
3736 global_options_set.x_param_values);
3737
3738 /* Enable software prefetching at -O3 for CPUs where prefetching is beneficial. */
3739 if (flag_prefetch_loop_arrays < 0
3740 && HAVE_prefetch
3741 && optimize >= 3
3742 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743 flag_prefetch_loop_arrays = 1;
3744
3745 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746 can be optimized to ap = __builtin_next_arg (0). */
3747 if (!TARGET_64BIT && !flag_split_stack)
3748 targetm.expand_builtin_va_start = NULL;
3749
3750 if (TARGET_64BIT)
3751 {
3752 ix86_gen_leave = gen_leave_rex64;
3753 ix86_gen_add3 = gen_adddi3;
3754 ix86_gen_sub3 = gen_subdi3;
3755 ix86_gen_sub3_carry = gen_subdi3_carry;
3756 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757 ix86_gen_monitor = gen_sse3_monitor64;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 }
3763 else
3764 {
3765 ix86_gen_leave = gen_leave;
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_monitor = gen_sse3_monitor;
3771 ix86_gen_andsp = gen_andsi3;
3772 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775 }
3776
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782
3783 if (!TARGET_64BIT && flag_pic)
3784 {
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3789 }
3790 else if (TARGET_SEH)
3791 {
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3795 }
3796 else if (flag_fentry < 0)
3797 {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3803 }
3804
3805 if (TARGET_AVX)
3806 {
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3809 AVX unaligned loads/stores. */
3810 if (!optimize_size)
3811 {
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3822 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 target_flags |= MASK_PREFER_AVX128;
3824 }
3825 }
3826 else
3827 {
3828 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3829 target_flags &= ~MASK_VZEROUPPER;
3830 }
3831
3832 if (ix86_recip_name)
3833 {
3834 char *p = ASTRDUP (ix86_recip_name);
3835 char *q;
3836 unsigned int mask, i;
3837 bool invert;
3838
3839 while ((q = strtok (p, ",")) != NULL)
3840 {
3841 p = NULL;
3842 if (*q == '!')
3843 {
3844 invert = true;
3845 q++;
3846 }
3847 else
3848 invert = false;
3849
3850 if (!strcmp (q, "default"))
3851 mask = RECIP_MASK_ALL;
3852 else
3853 {
3854 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 if (!strcmp (q, recip_options[i].string))
3856 {
3857 mask = recip_options[i].mask;
3858 break;
3859 }
3860
3861 if (i == ARRAY_SIZE (recip_options))
3862 {
3863 error ("unknown option for -mrecip=%s", q);
3864 invert = false;
3865 mask = RECIP_MASK_NONE;
3866 }
3867 }
3868
3869 recip_mask_explicit |= mask;
3870 if (invert)
3871 recip_mask &= ~mask;
3872 else
3873 recip_mask |= mask;
3874 }
3875 }
3876
3877 if (TARGET_RECIP)
3878 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879 else if (target_flags_explicit & MASK_RECIP)
3880 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
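 /* For example, -mrecip=all,!sqrt leaves recip_mask equal to
 RECIP_MASK_ALL & ~RECIP_MASK_SQRT, while a bare -mrecip (TARGET_RECIP)
 enables every approximation that was not masked out explicitly.  */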
3881
3882 /* Save the initial options in case the user does function specific
3883 options. */
3884 if (main_args_p)
3885 target_option_default_node = target_option_current_node
3886 = build_target_option_node ();
3887 }
3888
3889 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode. */
3890
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894 if (!val)
3895 return false;
3896
3897 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898 return true;
3899
3900 if (GET_CODE (val) == PARALLEL)
3901 {
3902 int i;
3903 rtx r;
3904
3905 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 {
3907 r = XVECEXP (val, 0, i);
3908 if (GET_CODE (r) == EXPR_LIST
3909 && XEXP (r, 0)
3910 && REG_P (XEXP (r, 0))
3911 && (GET_MODE (XEXP (r, 0)) == OImode
3912 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 return true;
3914 }
3915 }
3916
3917 return false;
3918 }
3919
3920 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3921
3922 static void
3923 ix86_option_override (void)
3924 {
3925 ix86_option_override_internal (true);
3926 }
3927
3928 /* Update register usage after having seen the compiler flags. */
3929
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933 int i;
3934 unsigned int j;
3935
3936 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937 {
3938 if (fixed_regs[i] > 1)
3939 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 if (call_used_regs[i] > 1)
3941 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942 }
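 /* In the fixed/call-used register tables a value of 2 marks a register that
 is conditionally fixed (or call-used) only in 32-bit mode and 3 only in
 64-bit mode; the comparison above collapses those to plain 0/1 for the
 current target.  */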
3943
3944 /* The PIC register, if it exists, is fixed. */
3945 j = PIC_OFFSET_TABLE_REGNUM;
3946 if (j != INVALID_REGNUM)
3947 fixed_regs[j] = call_used_regs[j] = 1;
3948
3949 /* The 64-bit MS_ABI changes the set of call-used registers. */
3950 if (TARGET_64BIT_MS_ABI)
3951 {
3952 call_used_regs[SI_REG] = 0;
3953 call_used_regs[DI_REG] = 0;
3954 call_used_regs[XMM6_REG] = 0;
3955 call_used_regs[XMM7_REG] = 0;
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 call_used_regs[i] = 0;
3958 }
3959
3960 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961 other call-clobbered regs for 64-bit. */
3962 if (TARGET_64BIT)
3963 {
3964 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965
3966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 && call_used_regs[i])
3969 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970 }
3971
3972 /* If MMX is disabled, squash the registers. */
3973 if (! TARGET_MMX)
3974 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977
3978 /* If SSE is disabled, squash the registers. */
3979 if (! TARGET_SSE)
3980 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983
3984 /* If the FPU is disabled, squash the registers. */
3985 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989
3990 /* If 32-bit, squash the 64-bit registers. */
3991 if (! TARGET_64BIT)
3992 {
3993 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 reg_names[i] = "";
3995 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 reg_names[i] = "";
3997 }
3998 }
3999
4000 \f
4001 /* Save the current options */
4002
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4015
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023
4024 /* Restore the current options */
4025
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4033
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4043
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4046 {
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051 }
4052
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4055 {
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060 }
4061 }
4062
4063 /* Print the current options */
4064
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4068 {
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4072
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4079
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4086
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088
4089 if (target_string)
4090 {
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4093 }
4094 }
4095
4096 \f
4097 /* Inner function to process the attribute((target(...))); take an argument
4098 and set the current options from it. If we have a list, recursively go
4099 over the list. */
4100
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4104 {
4105 char *next_optstr;
4106 bool ret = true;
4107
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4113
4114 enum ix86_opt_type
4115 {
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4122 };
4123
4124 static const struct
4125 {
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160
4161 /* enum options */
4162 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4163
4164 /* string options */
4165 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4166 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4167
4168 /* flag options */
4169 IX86_ATTR_YES ("cld",
4170 OPT_mcld,
4171 MASK_CLD),
4172
4173 IX86_ATTR_NO ("fancy-math-387",
4174 OPT_mfancy_math_387,
4175 MASK_NO_FANCY_MATH_387),
4176
4177 IX86_ATTR_YES ("ieee-fp",
4178 OPT_mieee_fp,
4179 MASK_IEEE_FP),
4180
4181 IX86_ATTR_YES ("inline-all-stringops",
4182 OPT_minline_all_stringops,
4183 MASK_INLINE_ALL_STRINGOPS),
4184
4185 IX86_ATTR_YES ("inline-stringops-dynamically",
4186 OPT_minline_stringops_dynamically,
4187 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188
4189 IX86_ATTR_NO ("align-stringops",
4190 OPT_mno_align_stringops,
4191 MASK_NO_ALIGN_STRINGOPS),
4192
4193 IX86_ATTR_YES ("recip",
4194 OPT_mrecip,
4195 MASK_RECIP),
4196
4197 };
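 /* So, for instance, __attribute__((target("sse4.2,arch=core2"))) toggles the
 SSE4.2 ISA flag and records "core2" as the per-function arch string; a
 "no-" prefix (recognized below) turns an option off instead.  */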
4198
4199 /* If this is a list, recurse to get the options. */
4200 if (TREE_CODE (args) == TREE_LIST)
4201 {
4202 bool ret = true;
4203
4204 for (; args; args = TREE_CHAIN (args))
4205 if (TREE_VALUE (args)
4206 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 p_strings, enum_opts_set))
4208 ret = false;
4209
4210 return ret;
4211 }
4212
4213 else if (TREE_CODE (args) != STRING_CST)
4214 gcc_unreachable ();
4215
4216 /* Handle multiple arguments separated by commas. */
4217 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218
4219 while (next_optstr && *next_optstr != '\0')
4220 {
4221 char *p = next_optstr;
4222 char *orig_p = p;
4223 char *comma = strchr (next_optstr, ',');
4224 const char *opt_string;
4225 size_t len, opt_len;
4226 int opt;
4227 bool opt_set_p;
4228 char ch;
4229 unsigned i;
4230 enum ix86_opt_type type = ix86_opt_unknown;
4231 int mask = 0;
4232
4233 if (comma)
4234 {
4235 *comma = '\0';
4236 len = comma - next_optstr;
4237 next_optstr = comma + 1;
4238 }
4239 else
4240 {
4241 len = strlen (p);
4242 next_optstr = NULL;
4243 }
4244
4245 /* Recognize no-xxx. */
4246 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 {
4248 opt_set_p = false;
4249 p += 3;
4250 len -= 3;
4251 }
4252 else
4253 opt_set_p = true;
4254
4255 /* Find the option. */
4256 ch = *p;
4257 opt = N_OPTS;
4258 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 {
4260 type = attrs[i].type;
4261 opt_len = attrs[i].len;
4262 if (ch == attrs[i].string[0]
4263 && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 ? len == opt_len
4265 : len > opt_len)
4266 && memcmp (p, attrs[i].string, opt_len) == 0)
4267 {
4268 opt = attrs[i].opt;
4269 mask = attrs[i].mask;
4270 opt_string = attrs[i].string;
4271 break;
4272 }
4273 }
4274
4275 /* Process the option. */
4276 if (opt == N_OPTS)
4277 {
4278 error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 ret = false;
4280 }
4281
4282 else if (type == ix86_opt_isa)
4283 {
4284 struct cl_decoded_option decoded;
4285
4286 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 ix86_handle_option (&global_options, &global_options_set,
4288 &decoded, input_location);
4289 }
4290
4291 else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 {
4293 if (type == ix86_opt_no)
4294 opt_set_p = !opt_set_p;
4295
4296 if (opt_set_p)
4297 target_flags |= mask;
4298 else
4299 target_flags &= ~mask;
4300 }
4301
4302 else if (type == ix86_opt_str)
4303 {
4304 if (p_strings[opt])
4305 {
4306 error ("option(\"%s\") was already specified", opt_string);
4307 ret = false;
4308 }
4309 else
4310 p_strings[opt] = xstrdup (p + opt_len);
4311 }
4312
4313 else if (type == ix86_opt_enum)
4314 {
4315 bool arg_ok;
4316 int value;
4317
4318 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 if (arg_ok)
4320 set_option (&global_options, enum_opts_set, opt, value,
4321 p + opt_len, DK_UNSPECIFIED, input_location,
4322 global_dc);
4323 else
4324 {
4325 error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 ret = false;
4327 }
4328 }
4329
4330 else
4331 gcc_unreachable ();
4332 }
4333
4334 return ret;
4335 }
4336
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4338
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342 const char *orig_arch_string = ix86_arch_string;
4343 const char *orig_tune_string = ix86_tune_string;
4344 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345 int orig_tune_defaulted = ix86_tune_defaulted;
4346 int orig_arch_specified = ix86_arch_specified;
4347 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348 tree t = NULL_TREE;
4349 int i;
4350 struct cl_target_option *def
4351 = TREE_TARGET_OPTION (target_option_default_node);
4352 struct gcc_options enum_opts_set;
4353
4354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355
4356 /* Process each of the options on the chain. */
4357 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 &enum_opts_set))
4359 return NULL_TREE;
4360
4361 /* If the changed options are different from the default, rerun
4362 ix86_option_override_internal, and then save the options away.
4363 The string options are attribute options, and will be undone
4364 when we copy the save structure. */
4365 if (ix86_isa_flags != def->x_ix86_isa_flags
4366 || target_flags != def->x_target_flags
4367 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369 || enum_opts_set.x_ix86_fpmath)
4370 {
4371 /* If we are using the default tune= or arch=, undo the string assigned,
4372 and use the default. */
4373 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375 else if (!orig_arch_specified)
4376 ix86_arch_string = NULL;
4377
4378 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380 else if (orig_tune_defaulted)
4381 ix86_tune_string = NULL;
4382
4383 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4384 if (enum_opts_set.x_ix86_fpmath)
4385 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386 else if (!TARGET_64BIT && TARGET_SSE)
4387 {
4388 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 }
4391
4392 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4393 ix86_option_override_internal (false);
4394
4395 /* Add any builtin functions with the new isa if any. */
4396 ix86_add_new_builtins (ix86_isa_flags);
4397
4398 /* Save the current options unless we are validating options for
4399 #pragma. */
4400 t = build_target_option_node ();
4401
4402 ix86_arch_string = orig_arch_string;
4403 ix86_tune_string = orig_tune_string;
4404 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405
4406 /* Free up memory allocated to hold the strings */
4407 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 free (option_strings[i]);
4409 }
4410
4411 return t;
4412 }
4413
4414 /* Hook to validate attribute((target("string"))). */
4415
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 tree ARG_UNUSED (name),
4419 tree args,
4420 int ARG_UNUSED (flags))
4421 {
4422 struct cl_target_option cur_target;
4423 bool ret = true;
4424 tree old_optimize = build_optimization_node ();
4425 tree new_target, new_optimize;
4426 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427
4428 /* If the function changed the optimization levels as well as setting target
4429 options, start with the optimizations specified. */
4430 if (func_optimize && func_optimize != old_optimize)
4431 cl_optimization_restore (&global_options,
4432 TREE_OPTIMIZATION (func_optimize));
4433
4434 /* The target attributes may also change some optimization flags, so update
4435 the optimization options if necessary. */
4436 cl_target_option_save (&cur_target, &global_options);
4437 new_target = ix86_valid_target_attribute_tree (args);
4438 new_optimize = build_optimization_node ();
4439
4440 if (!new_target)
4441 ret = false;
4442
4443 else if (fndecl)
4444 {
4445 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446
4447 if (old_optimize != new_optimize)
4448 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449 }
4450
4451 cl_target_option_restore (&global_options, &cur_target);
4452
4453 if (old_optimize != new_optimize)
4454 cl_optimization_restore (&global_options,
4455 TREE_OPTIMIZATION (old_optimize));
4456
4457 return ret;
4458 }
4459
4460 \f
4461 /* Hook to determine if one function can safely inline another. */
4462
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466 bool ret = false;
4467 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469
4470 /* If callee has no option attributes, then it is ok to inline. */
4471 if (!callee_tree)
4472 ret = true;
4473
4474 /* If caller has no option attributes, but callee does then it is not ok to
4475 inline. */
4476 else if (!caller_tree)
4477 ret = false;
4478
4479 else
4480 {
4481 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483
4484 /* The callee's isa options should be a subset of the caller's, i.e. an SSE4
4485 function can inline an SSE2 function but an SSE2 function can't inline
4486 an SSE4 function. */
4487 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 != callee_opts->x_ix86_isa_flags)
4489 ret = false;
4490
4491 /* See if we have the same non-isa options. */
4492 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 ret = false;
4494
4495 /* See if arch, tune, etc. are the same. */
4496 else if (caller_opts->arch != callee_opts->arch)
4497 ret = false;
4498
4499 else if (caller_opts->tune != callee_opts->tune)
4500 ret = false;
4501
4502 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 ret = false;
4504
4505 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 ret = false;
4507
4508 else
4509 ret = true;
4510 }
4511
4512 return ret;
4513 }
4514
4515 \f
4516 /* Remember the last target of ix86_set_current_function. */
4517 static GTY(()) tree ix86_previous_fndecl;
4518
4519 /* Establish appropriate back-end context for processing the function
4520 FNDECL. The argument might be NULL to indicate processing at top
4521 level, outside of any function scope. */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525 /* Only change the context if the function changes. This hook is called
4526 several times in the course of compiling a function, and we don't want to
4527 slow things down too much or call target_reinit when it isn't safe. */
4528 if (fndecl && fndecl != ix86_previous_fndecl)
4529 {
4530 tree old_tree = (ix86_previous_fndecl
4531 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 : NULL_TREE);
4533
4534 tree new_tree = (fndecl
4535 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 : NULL_TREE);
4537
4538 ix86_previous_fndecl = fndecl;
4539 if (old_tree == new_tree)
4540 ;
4541
4542 else if (new_tree)
4543 {
4544 cl_target_option_restore (&global_options,
4545 TREE_TARGET_OPTION (new_tree));
4546 target_reinit ();
4547 }
4548
4549 else if (old_tree)
4550 {
4551 struct cl_target_option *def
4552 = TREE_TARGET_OPTION (target_option_current_node);
4553
4554 cl_target_option_restore (&global_options, def);
4555 target_reinit ();
4556 }
4557 }
4558 }
4559
4560 \f
4561 /* Return true if this goes in large data/bss. */
4562
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567 return false;
4568
4569 /* Functions are never large data. */
4570 if (TREE_CODE (exp) == FUNCTION_DECL)
4571 return false;
4572
4573 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574 {
4575 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576 if (strcmp (section, ".ldata") == 0
4577 || strcmp (section, ".lbss") == 0)
4578 return true;
4579 return false;
4580 }
4581 else
4582 {
4583 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584
4585 /* If this is an incomplete type with size 0, then we can't put it
4586 in data because it might be too big when completed. */
4587 if (!size || size > ix86_section_threshold)
4588 return true;
4589 }
4590
4591 return false;
4592 }
4593
4594 /* Switch to the appropriate section for output of DECL.
4595 DECL is either a `VAR_DECL' node or a constant of some sort.
4596 RELOC indicates whether forming the initial value of DECL requires
4597 link-time relocations. */
4598
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 ATTRIBUTE_UNUSED;
4601
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 unsigned HOST_WIDE_INT align)
4605 {
4606 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607 && ix86_in_large_data_p (decl))
4608 {
4609 const char *sname = NULL;
4610 unsigned int flags = SECTION_WRITE;
4611 switch (categorize_decl_for_section (decl, reloc))
4612 {
4613 case SECCAT_DATA:
4614 sname = ".ldata";
4615 break;
4616 case SECCAT_DATA_REL:
4617 sname = ".ldata.rel";
4618 break;
4619 case SECCAT_DATA_REL_LOCAL:
4620 sname = ".ldata.rel.local";
4621 break;
4622 case SECCAT_DATA_REL_RO:
4623 sname = ".ldata.rel.ro";
4624 break;
4625 case SECCAT_DATA_REL_RO_LOCAL:
4626 sname = ".ldata.rel.ro.local";
4627 break;
4628 case SECCAT_BSS:
4629 sname = ".lbss";
4630 flags |= SECTION_BSS;
4631 break;
4632 case SECCAT_RODATA:
4633 case SECCAT_RODATA_MERGE_STR:
4634 case SECCAT_RODATA_MERGE_STR_INIT:
4635 case SECCAT_RODATA_MERGE_CONST:
4636 sname = ".lrodata";
4637 flags = 0;
4638 break;
4639 case SECCAT_SRODATA:
4640 case SECCAT_SDATA:
4641 case SECCAT_SBSS:
4642 gcc_unreachable ();
4643 case SECCAT_TEXT:
4644 case SECCAT_TDATA:
4645 case SECCAT_TBSS:
4646 /* We don't split these for the medium model. Place them into
4647 default sections and hope for the best. */
4648 break;
4649 }
4650 if (sname)
4651 {
4652 /* We might get called with string constants, but get_named_section
4653 doesn't like them as they are not DECLs. Also, we need to set
4654 flags in that case. */
4655 if (!DECL_P (decl))
4656 return get_section (sname, flags, NULL);
4657 return get_named_section (decl, sname, reloc);
4658 }
4659 }
4660 return default_elf_select_section (decl, reloc, align);
4661 }
4662
4663 /* Build up a unique section name, expressed as a
4664 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665 RELOC indicates whether the initial value of EXP requires
4666 link-time relocations. */
4667
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672 && ix86_in_large_data_p (decl))
4673 {
4674 const char *prefix = NULL;
4675 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4676 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677
4678 switch (categorize_decl_for_section (decl, reloc))
4679 {
4680 case SECCAT_DATA:
4681 case SECCAT_DATA_REL:
4682 case SECCAT_DATA_REL_LOCAL:
4683 case SECCAT_DATA_REL_RO:
4684 case SECCAT_DATA_REL_RO_LOCAL:
4685 prefix = one_only ? ".ld" : ".ldata";
4686 break;
4687 case SECCAT_BSS:
4688 prefix = one_only ? ".lb" : ".lbss";
4689 break;
4690 case SECCAT_RODATA:
4691 case SECCAT_RODATA_MERGE_STR:
4692 case SECCAT_RODATA_MERGE_STR_INIT:
4693 case SECCAT_RODATA_MERGE_CONST:
4694 prefix = one_only ? ".lr" : ".lrodata";
4695 break;
4696 case SECCAT_SRODATA:
4697 case SECCAT_SDATA:
4698 case SECCAT_SBSS:
4699 gcc_unreachable ();
4700 case SECCAT_TEXT:
4701 case SECCAT_TDATA:
4702 case SECCAT_TBSS:
4703 /* We don't split these for the medium model. Place them into
4704 default sections and hope for the best. */
4705 break;
4706 }
4707 if (prefix)
4708 {
4709 const char *name, *linkonce;
4710 char *string;
4711
4712 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 name = targetm.strip_name_encoding (name);
4714
4715 /* If we're using one_only, then there needs to be a .gnu.linkonce
4716 prefix to the section name. */
4717 linkonce = one_only ? ".gnu.linkonce" : "";
4718
4719 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720
4721 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 return;
4723 }
4724 }
4725 default_unique_section (decl, reloc);
4726 }
4727
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730 uninitialized external linkage data object.
4731
4732 For medium model x86-64 we need to use .largecomm opcode for
4733 large objects. */
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 const char *name, unsigned HOST_WIDE_INT size,
4737 int align)
4738 {
4739 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740 && size > (unsigned int)ix86_section_threshold)
4741 fputs (".largecomm\t", file);
4742 else
4743 fputs (COMMON_ASM_OP, file);
4744 assemble_name (file, name);
4745 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 size, align / BITS_PER_UNIT);
4747 }
4748 #endif
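/* For instance, with -mcmodel=medium and the default 65536-byte
 -mlarge-data-threshold, a 100000-byte common symbol "foo" would be emitted
 as ".largecomm foo,100000,32" (for 256-bit alignment); smaller objects use
 the ordinary COMMON_ASM_OP form.  ("foo" and the sizes are illustrative.)  */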
4749
4750 /* Utility function for targets to use in implementing
4751 ASM_OUTPUT_ALIGNED_BSS. */
4752
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 const char *name, unsigned HOST_WIDE_INT size,
4756 int align)
4757 {
4758 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759 && size > (unsigned int)ix86_section_threshold)
4760 switch_to_section (get_named_section (decl, ".lbss", 0));
4761 else
4762 switch_to_section (bss_section);
4763 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765 last_assemble_variable_decl = decl;
4766 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768 /* The standard thing is to just output a label for the object. */
4769 ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771 ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 \f
4774 /* Decide whether we must probe the stack before any space allocation
4775 on this target. It's essentially TARGET_STACK_PROBE except when
4776 -fstack-check causes the stack to be already probed differently. */
4777
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781 /* Do not probe the stack twice if static stack checking is enabled. */
4782 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783 return false;
4784
4785 return TARGET_STACK_PROBE;
4786 }
4787 \f
4788 /* Decide whether we can make a sibling call to a function. DECL is the
4789 declaration of the function being targeted by the call and EXP is the
4790 CALL_EXPR representing the call. */
4791
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795 tree type, decl_or_type;
4796 rtx a, b;
4797
4798 /* If we are generating position-independent code, we cannot sibcall
4799 optimize any indirect call, or a direct call to a global function,
4800 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4801 if (!TARGET_MACHO
4802 && !TARGET_64BIT
4803 && flag_pic
4804 && (!decl || !targetm.binds_local_p (decl)))
4805 return false;
4806
4807 /* If we need to align the outgoing stack, then sibcalling would
4808 unalign the stack, which may break the called function. */
4809 if (ix86_minimum_incoming_stack_boundary (true)
4810 < PREFERRED_STACK_BOUNDARY)
4811 return false;
4812
4813 if (decl)
4814 {
4815 decl_or_type = decl;
4816 type = TREE_TYPE (decl);
4817 }
4818 else
4819 {
4820 /* We're looking at the CALL_EXPR, we need the type of the function. */
4821 type = CALL_EXPR_FN (exp); /* pointer expression */
4822 type = TREE_TYPE (type); /* pointer type */
4823 type = TREE_TYPE (type); /* function type */
4824 decl_or_type = type;
4825 }
4826
4827 /* Check that the return value locations are the same. Like
4828 if we are returning floats on the 80387 register stack, we cannot
4829 make a sibcall from a function that doesn't return a float to a
4830 function that does or, conversely, from a function that does return
4831 a float to a function that doesn't; the necessary stack adjustment
4832 would not be executed. This is also the place we notice
4833 differences in the return value ABI. Note that it is ok for one
4834 of the functions to have void return type as long as the return
4835 value of the other is passed in a register. */
4836 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 cfun->decl, false);
4839 if (STACK_REG_P (a) || STACK_REG_P (b))
4840 {
4841 if (!rtx_equal_p (a, b))
4842 return false;
4843 }
4844 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845 {
4846 /* Disable sibcall if we need to generate vzeroupper after
4847 callee returns. */
4848 if (TARGET_VZEROUPPER
4849 && cfun->machine->callee_return_avx256_p
4850 && !cfun->machine->caller_return_avx256_p)
4851 return false;
4852 }
4853 else if (!rtx_equal_p (a, b))
4854 return false;
4855
4856 if (TARGET_64BIT)
4857 {
4858 /* The SYSV ABI has more call-clobbered registers;
4859 disallow sibcalls from MS to SYSV. */
4860 if (cfun->machine->call_abi == MS_ABI
4861 && ix86_function_type_abi (type) == SYSV_ABI)
4862 return false;
4863 }
4864 else
4865 {
4866 /* If this call is indirect, we'll need to be able to use a
4867 call-clobbered register for the address of the target function.
4868 Make sure that all such registers are not used for passing
4869 parameters. Note that DLLIMPORT functions are indirect. */
4870 if (!decl
4871 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 {
4873 if (ix86_function_regparm (type, NULL) >= 3)
4874 {
4875 /* ??? Need to count the actual number of registers to be used,
4876 not the possible number of registers. Fix later. */
4877 return false;
4878 }
4879 }
4880 }
4881
4882 /* Otherwise okay. That also includes certain types of indirect calls. */
4883 return true;
4884 }
4885
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887 and "sseregparm" calling convention attributes;
4888 arguments as in struct attribute_spec.handler. */
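/* For illustration (user-side declarations, not code from this file), the
   attributes handled here are spelled e.g.

       int __attribute__((regparm(3))) f (int, int, int);
       int __attribute__((fastcall))   g (int, int);

   and the handler below diagnoses incompatible combinations such as
   fastcall together with regparm.  */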
4889
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 tree args,
4893 int flags ATTRIBUTE_UNUSED,
4894 bool *no_add_attrs)
4895 {
4896 if (TREE_CODE (*node) != FUNCTION_TYPE
4897 && TREE_CODE (*node) != METHOD_TYPE
4898 && TREE_CODE (*node) != FIELD_DECL
4899 && TREE_CODE (*node) != TYPE_DECL)
4900 {
4901 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 name);
4903 *no_add_attrs = true;
4904 return NULL_TREE;
4905 }
4906
4907    /* Can combine regparm with all attributes but fastcall and thiscall. */
4908 if (is_attribute_p ("regparm", name))
4909 {
4910 tree cst;
4911
4912 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913 {
4914 error ("fastcall and regparm attributes are not compatible");
4915 }
4916
4917 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 {
4919 error ("regparam and thiscall attributes are not compatible");
4920 }
4921
4922 cst = TREE_VALUE (args);
4923 if (TREE_CODE (cst) != INTEGER_CST)
4924 {
4925 warning (OPT_Wattributes,
4926 "%qE attribute requires an integer constant argument",
4927 name);
4928 *no_add_attrs = true;
4929 }
4930 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 {
4932 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 name, REGPARM_MAX);
4934 *no_add_attrs = true;
4935 }
4936
4937 return NULL_TREE;
4938 }
4939
4940 if (TARGET_64BIT)
4941 {
4942 /* Do not warn when emulating the MS ABI. */
4943 if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 && TREE_CODE (*node) != METHOD_TYPE)
4945 || ix86_function_type_abi (*node) != MS_ABI)
4946 warning (OPT_Wattributes, "%qE attribute ignored",
4947 name);
4948 *no_add_attrs = true;
4949 return NULL_TREE;
4950 }
4951
4952 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4953 if (is_attribute_p ("fastcall", name))
4954 {
4955 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956 {
4957 error ("fastcall and cdecl attributes are not compatible");
4958 }
4959 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960 {
4961 error ("fastcall and stdcall attributes are not compatible");
4962 }
4963 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964 {
4965 error ("fastcall and regparm attributes are not compatible");
4966 }
4967 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 {
4969 error ("fastcall and thiscall attributes are not compatible");
4970 }
4971 }
4972
4973 /* Can combine stdcall with fastcall (redundant), regparm and
4974 sseregparm. */
4975 else if (is_attribute_p ("stdcall", name))
4976 {
4977 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978 {
4979 error ("stdcall and cdecl attributes are not compatible");
4980 }
4981 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982 {
4983 error ("stdcall and fastcall attributes are not compatible");
4984 }
4985 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 {
4987 error ("stdcall and thiscall attributes are not compatible");
4988 }
4989 }
4990
4991 /* Can combine cdecl with regparm and sseregparm. */
4992 else if (is_attribute_p ("cdecl", name))
4993 {
4994 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995 {
4996 error ("stdcall and cdecl attributes are not compatible");
4997 }
4998 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999 {
5000 error ("fastcall and cdecl attributes are not compatible");
5001 }
5002 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 {
5004 error ("cdecl and thiscall attributes are not compatible");
5005 }
5006 }
5007 else if (is_attribute_p ("thiscall", name))
5008 {
5009 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010    warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011    name);
5012 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 {
5014 error ("stdcall and thiscall attributes are not compatible");
5015 }
5016 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 {
5018 error ("fastcall and thiscall attributes are not compatible");
5019 }
5020 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 {
5022 error ("cdecl and thiscall attributes are not compatible");
5023 }
5024 }
5025
5026 /* Can combine sseregparm with all attributes. */
5027
5028 return NULL_TREE;
5029 }
5030
5031 /* The transactional memory builtins are implicitly regparm or fastcall
5032 depending on the ABI. Override the generic do-nothing attribute that
5033 these builtins were declared with, and replace it with one of the two
5034 attributes that we expect elsewhere. */
5035
5036 static tree
5037 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5038 tree args ATTRIBUTE_UNUSED,
5039 int flags ATTRIBUTE_UNUSED,
5040 bool *no_add_attrs)
5041 {
5042 tree alt;
5043
5044 /* In no case do we want to add the placeholder attribute. */
5045 *no_add_attrs = true;
5046
5047 /* The 64-bit ABI is unchanged for transactional memory. */
5048 if (TARGET_64BIT)
5049 return NULL_TREE;
5050
5051 /* ??? Is there a better way to validate 32-bit windows? We have
5052 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5053 if (CHECK_STACK_LIMIT > 0)
5054 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5055 else
5056 {
5057 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5058 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5059 }
5060 decl_attributes (node, alt, flags);
5061
5062 return NULL_TREE;
5063 }
5064
5065 /* Determine the calling convention from TYPE. */
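/* Worked examples of the mapping below: a 32-bit function type carrying the
   "fastcall" attribute yields IX86_CALLCVT_FASTCALL; a non-stdarg prototype
   compiled with -mrtd yields IX86_CALLCVT_STDCALL; and on 64-bit targets
   everything collapses to IX86_CALLCVT_CDECL.  */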
5066
5067 unsigned int
5068 ix86_get_callcvt (const_tree type)
5069 {
5070 unsigned int ret = 0;
5071 bool is_stdarg;
5072 tree attrs;
5073
5074 if (TARGET_64BIT)
5075 return IX86_CALLCVT_CDECL;
5076
5077 attrs = TYPE_ATTRIBUTES (type);
5078 if (attrs != NULL_TREE)
5079 {
5080 if (lookup_attribute ("cdecl", attrs))
5081 ret |= IX86_CALLCVT_CDECL;
5082 else if (lookup_attribute ("stdcall", attrs))
5083 ret |= IX86_CALLCVT_STDCALL;
5084 else if (lookup_attribute ("fastcall", attrs))
5085 ret |= IX86_CALLCVT_FASTCALL;
5086 else if (lookup_attribute ("thiscall", attrs))
5087 ret |= IX86_CALLCVT_THISCALL;
5088
5089    /* Regparm isn't allowed for thiscall and fastcall. */
5090 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5091 {
5092 if (lookup_attribute ("regparm", attrs))
5093 ret |= IX86_CALLCVT_REGPARM;
5094 if (lookup_attribute ("sseregparm", attrs))
5095 ret |= IX86_CALLCVT_SSEREGPARM;
5096 }
5097
5098 if (IX86_BASE_CALLCVT(ret) != 0)
5099 return ret;
5100 }
5101
5102 is_stdarg = stdarg_p (type);
5103 if (TARGET_RTD && !is_stdarg)
5104 return IX86_CALLCVT_STDCALL | ret;
5105
5106 if (ret != 0
5107 || is_stdarg
5108 || TREE_CODE (type) != METHOD_TYPE
5109 || ix86_function_type_abi (type) != MS_ABI)
5110 return IX86_CALLCVT_CDECL | ret;
5111
5112 return IX86_CALLCVT_THISCALL;
5113 }
5114
5115 /* Return 0 if the attributes for two types are incompatible, 1 if they
5116 are compatible, and 2 if they are nearly compatible (which causes a
5117 warning to be generated). */
5118
5119 static int
5120 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5121 {
5122 unsigned int ccvt1, ccvt2;
5123
5124 if (TREE_CODE (type1) != FUNCTION_TYPE
5125 && TREE_CODE (type1) != METHOD_TYPE)
5126 return 1;
5127
5128 ccvt1 = ix86_get_callcvt (type1);
5129 ccvt2 = ix86_get_callcvt (type2);
5130 if (ccvt1 != ccvt2)
5131 return 0;
5132 if (ix86_function_regparm (type1, NULL)
5133 != ix86_function_regparm (type2, NULL))
5134 return 0;
5135
5136 return 1;
5137 }
5138 \f
5139 /* Return the regparm value for a function with the indicated TYPE and DECL.
5140    DECL may be NULL when calling the function indirectly
5141    or considering a libcall. */
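/* A hedged illustration of the regparm case handled below: for a 32-bit
   function declared with __attribute__((regparm(3))), the first three
   integer-class arguments travel in %eax, %edx and %ecx instead of on the
   stack.  */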
5142
5143 static int
5144 ix86_function_regparm (const_tree type, const_tree decl)
5145 {
5146 tree attr;
5147 int regparm;
5148 unsigned int ccvt;
5149
5150 if (TARGET_64BIT)
5151 return (ix86_function_type_abi (type) == SYSV_ABI
5152 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5153 ccvt = ix86_get_callcvt (type);
5154 regparm = ix86_regparm;
5155
5156 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5157 {
5158 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5159 if (attr)
5160 {
5161 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5162 return regparm;
5163 }
5164 }
5165 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5166 return 2;
5167 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5168 return 1;
5169
5170 /* Use register calling convention for local functions when possible. */
5171 if (decl
5172 && TREE_CODE (decl) == FUNCTION_DECL
5173 && optimize
5174 && !(profile_flag && !flag_fentry))
5175 {
5176 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5177 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5178 if (i && i->local && i->can_change_signature)
5179 {
5180 int local_regparm, globals = 0, regno;
5181
5182 /* Make sure no regparm register is taken by a
5183 fixed register variable. */
5184 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5185 if (fixed_regs[local_regparm])
5186 break;
5187
5188 /* We don't want to use regparm(3) for nested functions as
5189 these use a static chain pointer in the third argument. */
5190 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5191 local_regparm = 2;
5192
5193 /* In 32-bit mode save a register for the split stack. */
5194 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5195 local_regparm = 2;
5196
5197 /* Each fixed register usage increases register pressure,
5198    so fewer registers should be used for argument passing.
5199    This functionality can be overridden by an explicit
5200 regparm value. */
5201 for (regno = 0; regno <= DI_REG; regno++)
5202 if (fixed_regs[regno])
5203 globals++;
5204
5205 local_regparm
5206 = globals < local_regparm ? local_regparm - globals : 0;
5207
5208 if (local_regparm > regparm)
5209 regparm = local_regparm;
5210 }
5211 }
5212
5213 return regparm;
5214 }
5215
5216 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) or both
5217    SFmode and DFmode (2) arguments in SSE registers for a function with the
5218    indicated TYPE and DECL.  DECL may be NULL when calling a function
5219    indirectly or considering a libcall.  Otherwise return 0. */
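/* For example (user-side declaration with a hypothetical name), a 32-bit
   function declared as

       double __attribute__((sseregparm)) hypot2 (double x, double y);

   asks for its SFmode/DFmode arguments to be passed in SSE registers, which
   requires SSE/SSE2 to be enabled as checked below.  */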
5220
5221 static int
5222 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5223 {
5224 gcc_assert (!TARGET_64BIT);
5225
5226 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5227 by the sseregparm attribute. */
5228 if (TARGET_SSEREGPARM
5229 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5230 {
5231 if (!TARGET_SSE)
5232 {
5233 if (warn)
5234 {
5235 if (decl)
5236 error ("calling %qD with attribute sseregparm without "
5237 "SSE/SSE2 enabled", decl);
5238 else
5239 error ("calling %qT with attribute sseregparm without "
5240 "SSE/SSE2 enabled", type);
5241 }
5242 return 0;
5243 }
5244
5245 return 2;
5246 }
5247
5248 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5249 (and DFmode for SSE2) arguments in SSE registers. */
5250 if (decl && TARGET_SSE_MATH && optimize
5251 && !(profile_flag && !flag_fentry))
5252 {
5253 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5254 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5255 if (i && i->local && i->can_change_signature)
5256 return TARGET_SSE2 ? 2 : 1;
5257 }
5258
5259 return 0;
5260 }
5261
5262 /* Return true if EAX is live at the start of the function. Used by
5263 ix86_expand_prologue to determine if we need special help before
5264 calling allocate_stack_worker. */
5265
5266 static bool
5267 ix86_eax_live_at_start_p (void)
5268 {
5269 /* Cheat. Don't bother working forward from ix86_function_regparm
5270 to the function type to whether an actual argument is located in
5271 eax. Instead just look at cfg info, which is still close enough
5272 to correct at this point. This gives false positives for broken
5273 functions that might use uninitialized data that happens to be
5274 allocated in eax, but who cares? */
5275 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5276 }
5277
5278 static bool
5279 ix86_keep_aggregate_return_pointer (tree fntype)
5280 {
5281 tree attr;
5282
5283 if (!TARGET_64BIT)
5284 {
5285 attr = lookup_attribute ("callee_pop_aggregate_return",
5286 TYPE_ATTRIBUTES (fntype));
5287 if (attr)
5288 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5289
5290 /* For 32-bit MS-ABI the default is to keep aggregate
5291 return pointer. */
5292 if (ix86_function_type_abi (fntype) == MS_ABI)
5293 return true;
5294 }
5295 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5296 }
5297
5298 /* Value is the number of bytes of arguments automatically
5299 popped when returning from a subroutine call.
5300 FUNDECL is the declaration node of the function (as a tree),
5301 FUNTYPE is the data type of the function (as a tree),
5302 or for a library call it is an identifier node for the subroutine name.
5303 SIZE is the number of bytes of arguments passed on the stack.
5304
5305 On the 80386, the RTD insn may be used to pop them if the number
5306 of args is fixed, but if the number is variable then the caller
5307 must pop them all. RTD can't be used for library calls now
5308 because the library is compiled with the Unix compiler.
5309 Use of RTD is a selectable option, since it is incompatible with
5310 standard Unix calling sequences. If the option is not selected,
5311 the caller must always pop the args.
5312
5313    The attribute stdcall is equivalent to RTD on a per-module basis. */
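/* Worked example (a sketch): for
       void __attribute__((stdcall)) f (int a, int b);
   SIZE is 8 and f is not stdarg, so this hook returns 8 and the callee pops
   its own arguments with "ret $8".  */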
5314
5315 static int
5316 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5317 {
5318 unsigned int ccvt;
5319
5320 /* None of the 64-bit ABIs pop arguments. */
5321 if (TARGET_64BIT)
5322 return 0;
5323
5324 ccvt = ix86_get_callcvt (funtype);
5325
5326 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5327 | IX86_CALLCVT_THISCALL)) != 0
5328 && ! stdarg_p (funtype))
5329 return size;
5330
5331 /* Lose any fake structure return argument if it is passed on the stack. */
5332 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5333 && !ix86_keep_aggregate_return_pointer (funtype))
5334 {
5335 int nregs = ix86_function_regparm (funtype, fundecl);
5336 if (nregs == 0)
5337 return GET_MODE_SIZE (Pmode);
5338 }
5339
5340 return 0;
5341 }
5342 \f
5343 /* Argument support functions. */
5344
5345 /* Return true when register REGNO may be used to pass function parameters. */
5346 bool
5347 ix86_function_arg_regno_p (int regno)
5348 {
5349 int i;
5350 const int *parm_regs;
5351
5352 if (!TARGET_64BIT)
5353 {
5354 if (TARGET_MACHO)
5355 return (regno < REGPARM_MAX
5356 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5357 else
5358 return (regno < REGPARM_MAX
5359 || (TARGET_MMX && MMX_REGNO_P (regno)
5360 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5361 || (TARGET_SSE && SSE_REGNO_P (regno)
5362 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5363 }
5364
5365 if (TARGET_MACHO)
5366 {
5367 if (SSE_REGNO_P (regno) && TARGET_SSE)
5368 return true;
5369 }
5370 else
5371 {
5372 if (TARGET_SSE && SSE_REGNO_P (regno)
5373 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5374 return true;
5375 }
5376
5377 /* TODO: The function should depend on current function ABI but
5378 builtins.c would need updating then. Therefore we use the
5379 default ABI. */
5380
5381 /* RAX is used as hidden argument to va_arg functions. */
5382 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5383 return true;
5384
5385 if (ix86_abi == MS_ABI)
5386 parm_regs = x86_64_ms_abi_int_parameter_registers;
5387 else
5388 parm_regs = x86_64_int_parameter_registers;
5389 for (i = 0; i < (ix86_abi == MS_ABI
5390 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5391 if (regno == parm_regs[i])
5392 return true;
5393 return false;
5394 }
5395
5396 /* Return true if we do not know how to pass TYPE solely in registers. */
5397
5398 static bool
5399 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5400 {
5401 if (must_pass_in_stack_var_size_or_pad (mode, type))
5402 return true;
5403
5404 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5405 The layout_type routine is crafty and tries to trick us into passing
5406 currently unsupported vector types on the stack by using TImode. */
5407 return (!TARGET_64BIT && mode == TImode
5408 && type && TREE_CODE (type) != VECTOR_TYPE);
5409 }
5410
5411 /* Return the size, in bytes, of the area reserved for arguments passed
5412    in registers for the function represented by FNDECL, which depends on
5413    the ABI in use. */
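/* For instance, under the 64-bit Microsoft ABI this is the 32-byte "shadow
   space" the caller must reserve for the four register-passed arguments,
   which is exactly the value returned below.  */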
5414 int
5415 ix86_reg_parm_stack_space (const_tree fndecl)
5416 {
5417 enum calling_abi call_abi = SYSV_ABI;
5418 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5419 call_abi = ix86_function_abi (fndecl);
5420 else
5421 call_abi = ix86_function_type_abi (fndecl);
5422 if (TARGET_64BIT && call_abi == MS_ABI)
5423 return 32;
5424 return 0;
5425 }
5426
5427 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5428    calling ABI used. */
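/* For example (user-side code), a declaration such as
       int __attribute__((ms_abi)) wfunc (int);
   makes this return MS_ABI even when the default ABI is SYSV_ABI, and
   __attribute__((sysv_abi)) works the other way round.  */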
5429 enum calling_abi
5430 ix86_function_type_abi (const_tree fntype)
5431 {
5432 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5433 {
5434 enum calling_abi abi = ix86_abi;
5435 if (abi == SYSV_ABI)
5436 {
5437 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5438 abi = MS_ABI;
5439 }
5440 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5441 abi = SYSV_ABI;
5442 return abi;
5443 }
5444 return ix86_abi;
5445 }
5446
5447 static bool
5448 ix86_function_ms_hook_prologue (const_tree fn)
5449 {
5450 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5451 {
5452 if (decl_function_context (fn) != NULL_TREE)
5453 error_at (DECL_SOURCE_LOCATION (fn),
5454 "ms_hook_prologue is not compatible with nested function");
5455 else
5456 return true;
5457 }
5458 return false;
5459 }
5460
5461 static enum calling_abi
5462 ix86_function_abi (const_tree fndecl)
5463 {
5464 if (! fndecl)
5465 return ix86_abi;
5466 return ix86_function_type_abi (TREE_TYPE (fndecl));
5467 }
5468
5469 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5470    calling ABI used. */
5471 enum calling_abi
5472 ix86_cfun_abi (void)
5473 {
5474 if (! cfun)
5475 return ix86_abi;
5476 return cfun->machine->call_abi;
5477 }
5478
5479 /* Write the extra assembler code needed to declare a function properly. */
5480
5481 void
5482 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5483 tree decl)
5484 {
5485 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5486
5487 if (is_ms_hook)
5488 {
5489 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5490 unsigned int filler_cc = 0xcccccccc;
5491
5492 for (i = 0; i < filler_count; i += 4)
5493 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5494 }
5495
5496 #ifdef SUBTARGET_ASM_UNWIND_INIT
5497 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5498 #endif
5499
5500 ASM_OUTPUT_LABEL (asm_out_file, fname);
5501
5502 /* Output magic byte marker, if hot-patch attribute is set. */
5503 if (is_ms_hook)
5504 {
5505 if (TARGET_64BIT)
5506 {
5507 /* leaq [%rsp + 0], %rsp */
5508 asm_fprintf (asm_out_file, ASM_BYTE
5509 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5510 }
5511 else
5512 {
5513 /* movl.s %edi, %edi
5514 push %ebp
5515 movl.s %esp, %ebp */
5516 asm_fprintf (asm_out_file, ASM_BYTE
5517 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5518 }
5519 }
5520 }
5521
5522 /* regclass.c */
5523 extern void init_regs (void);
5524
5525 /* Implementation of the call ABI switching target hook.  Select the
5526    call-register sets specific to FNDECL.  See also
5527    ix86_conditional_register_usage for more details. */
5528 void
5529 ix86_call_abi_override (const_tree fndecl)
5530 {
5531 if (fndecl == NULL_TREE)
5532 cfun->machine->call_abi = ix86_abi;
5533 else
5534 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5535 }
5536
5537 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5538    Avoid the expensive re-initialization of init_regs each time we switch
5539    function context, since this is needed only during RTL expansion. */
5540 static void
5541 ix86_maybe_switch_abi (void)
5542 {
5543 if (TARGET_64BIT &&
5544 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5545 reinit_regs ();
5546 }
5547
5548 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5549 for a call to a function whose data type is FNTYPE.
5550 For a library call, FNTYPE is 0. */
5551
5552 void
5553 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5554 tree fntype, /* tree ptr for function decl */
5555 rtx libname, /* SYMBOL_REF of library name or 0 */
5556 tree fndecl,
5557 int caller)
5558 {
5559 struct cgraph_local_info *i;
5560 tree fnret_type;
5561
5562 memset (cum, 0, sizeof (*cum));
5563
5564 /* Initialize for the current callee. */
5565 if (caller)
5566 {
5567 cfun->machine->callee_pass_avx256_p = false;
5568 cfun->machine->callee_return_avx256_p = false;
5569 }
5570
5571 if (fndecl)
5572 {
5573 i = cgraph_local_info (fndecl);
5574 cum->call_abi = ix86_function_abi (fndecl);
5575 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5576 }
5577 else
5578 {
5579 i = NULL;
5580 cum->call_abi = ix86_function_type_abi (fntype);
5581 if (fntype)
5582 fnret_type = TREE_TYPE (fntype);
5583 else
5584 fnret_type = NULL;
5585 }
5586
5587 if (TARGET_VZEROUPPER && fnret_type)
5588 {
5589 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5590 false);
5591 if (function_pass_avx256_p (fnret_value))
5592 {
5593 /* The return value of this function uses 256bit AVX modes. */
5594 if (caller)
5595 cfun->machine->callee_return_avx256_p = true;
5596 else
5597 cfun->machine->caller_return_avx256_p = true;
5598 }
5599 }
5600
5601 cum->caller = caller;
5602
5603 /* Set up the number of registers to use for passing arguments. */
5604
5605 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5606 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5607 "or subtarget optimization implying it");
5608 cum->nregs = ix86_regparm;
5609 if (TARGET_64BIT)
5610 {
5611 cum->nregs = (cum->call_abi == SYSV_ABI
5612 ? X86_64_REGPARM_MAX
5613 : X86_64_MS_REGPARM_MAX);
5614 }
5615 if (TARGET_SSE)
5616 {
5617 cum->sse_nregs = SSE_REGPARM_MAX;
5618 if (TARGET_64BIT)
5619 {
5620 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5621 ? X86_64_SSE_REGPARM_MAX
5622 : X86_64_MS_SSE_REGPARM_MAX);
5623 }
5624 }
5625 if (TARGET_MMX)
5626 cum->mmx_nregs = MMX_REGPARM_MAX;
5627 cum->warn_avx = true;
5628 cum->warn_sse = true;
5629 cum->warn_mmx = true;
5630
5631    /* Because the type might mismatch between caller and callee, we need to
5632    use the actual type of the function for local calls.
5633    FIXME: cgraph_analyze can be told to actually record whether a function
5634    uses va_start, so for local functions maybe_vaarg can be made more
5635    aggressive, helping K&R code.
5636    FIXME: once the type system is fixed, we won't need this code anymore. */
5637 if (i && i->local && i->can_change_signature)
5638 fntype = TREE_TYPE (fndecl);
5639 cum->maybe_vaarg = (fntype
5640 ? (!prototype_p (fntype) || stdarg_p (fntype))
5641 : !libname);
5642
5643 if (!TARGET_64BIT)
5644 {
5645 /* If there are variable arguments, then we won't pass anything
5646 in registers in 32-bit mode. */
5647 if (stdarg_p (fntype))
5648 {
5649 cum->nregs = 0;
5650 cum->sse_nregs = 0;
5651 cum->mmx_nregs = 0;
5652 cum->warn_avx = 0;
5653 cum->warn_sse = 0;
5654 cum->warn_mmx = 0;
5655 return;
5656 }
5657
5658 /* Use ecx and edx registers if function has fastcall attribute,
5659 else look for regparm information. */
5660 if (fntype)
5661 {
5662 unsigned int ccvt = ix86_get_callcvt (fntype);
5663 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5664 {
5665 cum->nregs = 1;
5666 cum->fastcall = 1; /* Same first register as in fastcall. */
5667 }
5668 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5669 {
5670 cum->nregs = 2;
5671 cum->fastcall = 1;
5672 }
5673 else
5674 cum->nregs = ix86_function_regparm (fntype, fndecl);
5675 }
5676
5677 /* Set up the number of SSE registers used for passing SFmode
5678 and DFmode arguments. Warn for mismatching ABI. */
5679 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5680 }
5681 }
5682
5683 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5684 But in the case of vector types, it is some vector mode.
5685
5686 When we have only some of our vector isa extensions enabled, then there
5687 are some modes for which vector_mode_supported_p is false. For these
5688 modes, the generic vector support in gcc will choose some non-vector mode
5689 in order to implement the type. By computing the natural mode, we'll
5690 select the proper ABI location for the operand and not depend on whatever
5691 the middle-end decides to do with these vector types.
5692
5693    The middle-end can't deal with vector types larger than 16 bytes.  In
5694    this case, we return the original mode and warn about the ABI change if
5695    CUM isn't NULL. */
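/* For example (informally), a user type declared as
       typedef int v4si __attribute__((vector_size (16)));
   is given V4SImode here even when SSE is disabled, so the ABI location of
   such an argument does not depend on which ISA extensions happen to be
   enabled in the current compilation.  */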
5696
5697 static enum machine_mode
5698 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5699 {
5700 enum machine_mode mode = TYPE_MODE (type);
5701
5702 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5703 {
5704 HOST_WIDE_INT size = int_size_in_bytes (type);
5705 if ((size == 8 || size == 16 || size == 32)
5706 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5707 && TYPE_VECTOR_SUBPARTS (type) > 1)
5708 {
5709 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5710
5711 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5712 mode = MIN_MODE_VECTOR_FLOAT;
5713 else
5714 mode = MIN_MODE_VECTOR_INT;
5715
5716 /* Get the mode which has this inner mode and number of units. */
5717 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5718 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5719 && GET_MODE_INNER (mode) == innermode)
5720 {
5721 if (size == 32 && !TARGET_AVX)
5722 {
5723 static bool warnedavx;
5724
5725 if (cum
5726 && !warnedavx
5727 && cum->warn_avx)
5728 {
5729 warnedavx = true;
5730 warning (0, "AVX vector argument without AVX "
5731 "enabled changes the ABI");
5732 }
5733 return TYPE_MODE (type);
5734 }
5735 else
5736 return mode;
5737 }
5738
5739 gcc_unreachable ();
5740 }
5741 }
5742
5743 return mode;
5744 }
5745
5746 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5747 this may not agree with the mode that the type system has chosen for the
5748 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5749 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5750
5751 static rtx
5752 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5753 unsigned int regno)
5754 {
5755 rtx tmp;
5756
5757 if (orig_mode != BLKmode)
5758 tmp = gen_rtx_REG (orig_mode, regno);
5759 else
5760 {
5761 tmp = gen_rtx_REG (mode, regno);
5762 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5763 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5764 }
5765
5766 return tmp;
5767 }
5768
5769 /* x86-64 register passing implementation.  See the x86-64 psABI for details.
5770    The goal of this code is to classify each eightbyte of an incoming argument
5771    by register class and assign registers accordingly. */
5772
5773 /* Return the union class of CLASS1 and CLASS2.
5774 See the x86-64 PS ABI for details. */
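/* For instance, applying the rules below: merging INTEGERSI with SSESF gives
   INTEGERSI (rule #4), merging X87 with SSE gives MEMORY (rule #5), and
   merging NO_CLASS with anything returns the other class (rule #2).  */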
5775
5776 static enum x86_64_reg_class
5777 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5778 {
5779 /* Rule #1: If both classes are equal, this is the resulting class. */
5780 if (class1 == class2)
5781 return class1;
5782
5783 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5784 the other class. */
5785 if (class1 == X86_64_NO_CLASS)
5786 return class2;
5787 if (class2 == X86_64_NO_CLASS)
5788 return class1;
5789
5790 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5791 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5792 return X86_64_MEMORY_CLASS;
5793
5794 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5795 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5796 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5797 return X86_64_INTEGERSI_CLASS;
5798 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5799 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5800 return X86_64_INTEGER_CLASS;
5801
5802 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5803 MEMORY is used. */
5804 if (class1 == X86_64_X87_CLASS
5805 || class1 == X86_64_X87UP_CLASS
5806 || class1 == X86_64_COMPLEX_X87_CLASS
5807 || class2 == X86_64_X87_CLASS
5808 || class2 == X86_64_X87UP_CLASS
5809 || class2 == X86_64_COMPLEX_X87_CLASS)
5810 return X86_64_MEMORY_CLASS;
5811
5812 /* Rule #6: Otherwise class SSE is used. */
5813 return X86_64_SSE_CLASS;
5814 }
5815
5816 /* Classify the argument of type TYPE and mode MODE.
5817 CLASSES will be filled by the register class used to pass each word
5818 of the operand. The number of words is returned. In case the parameter
5819 should be passed in memory, 0 is returned. As a special case for zero
5820 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5821
5822    BIT_OFFSET is used internally for handling records; it specifies the
5823    offset in bits, modulo 256, to avoid overflow cases.
5824
5825 See the x86-64 PS ABI for details.
5826 */
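/* Worked example (informal): for
       struct s { double d; int i; };
   the first eightbyte classifies as X86_64_SSEDF_CLASS and the second as
   X86_64_INTEGERSI_CLASS, so the struct travels in one SSE and one integer
   register.  An aggregate larger than 32 bytes is passed in memory, as is
   one such as a struct combining a long double with other members, per the
   rules below.  */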
5827
5828 static int
5829 classify_argument (enum machine_mode mode, const_tree type,
5830 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5831 {
5832 HOST_WIDE_INT bytes =
5833 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5834 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5835
5836 /* Variable sized entities are always passed/returned in memory. */
5837 if (bytes < 0)
5838 return 0;
5839
5840 if (mode != VOIDmode
5841 && targetm.calls.must_pass_in_stack (mode, type))
5842 return 0;
5843
5844 if (type && AGGREGATE_TYPE_P (type))
5845 {
5846 int i;
5847 tree field;
5848 enum x86_64_reg_class subclasses[MAX_CLASSES];
5849
5850 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5851 if (bytes > 32)
5852 return 0;
5853
5854 for (i = 0; i < words; i++)
5855 classes[i] = X86_64_NO_CLASS;
5856
5857    /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
5858    signal the memory class, so handle this as a special case. */
5859 if (!words)
5860 {
5861 classes[0] = X86_64_NO_CLASS;
5862 return 1;
5863 }
5864
5865 /* Classify each field of record and merge classes. */
5866 switch (TREE_CODE (type))
5867 {
5868 case RECORD_TYPE:
5869 /* And now merge the fields of structure. */
5870 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5871 {
5872 if (TREE_CODE (field) == FIELD_DECL)
5873 {
5874 int num;
5875
5876 if (TREE_TYPE (field) == error_mark_node)
5877 continue;
5878
5879 /* Bitfields are always classified as integer. Handle them
5880 early, since later code would consider them to be
5881 misaligned integers. */
5882 if (DECL_BIT_FIELD (field))
5883 {
5884 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5885 i < ((int_bit_position (field) + (bit_offset % 64))
5886 + tree_low_cst (DECL_SIZE (field), 0)
5887 + 63) / 8 / 8; i++)
5888 classes[i] =
5889 merge_classes (X86_64_INTEGER_CLASS,
5890 classes[i]);
5891 }
5892 else
5893 {
5894 int pos;
5895
5896 type = TREE_TYPE (field);
5897
5898 /* Flexible array member is ignored. */
5899 if (TYPE_MODE (type) == BLKmode
5900 && TREE_CODE (type) == ARRAY_TYPE
5901 && TYPE_SIZE (type) == NULL_TREE
5902 && TYPE_DOMAIN (type) != NULL_TREE
5903 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5904 == NULL_TREE))
5905 {
5906 static bool warned;
5907
5908 if (!warned && warn_psabi)
5909 {
5910 warned = true;
5911 inform (input_location,
5912 "the ABI of passing struct with"
5913 " a flexible array member has"
5914 " changed in GCC 4.4");
5915 }
5916 continue;
5917 }
5918 num = classify_argument (TYPE_MODE (type), type,
5919 subclasses,
5920 (int_bit_position (field)
5921 + bit_offset) % 256);
5922 if (!num)
5923 return 0;
5924 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5925 for (i = 0; i < num && (i + pos) < words; i++)
5926 classes[i + pos] =
5927 merge_classes (subclasses[i], classes[i + pos]);
5928 }
5929 }
5930 }
5931 break;
5932
5933 case ARRAY_TYPE:
5934 /* Arrays are handled as small records. */
5935 {
5936 int num;
5937 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5938 TREE_TYPE (type), subclasses, bit_offset);
5939 if (!num)
5940 return 0;
5941
5942 /* The partial classes are now full classes. */
5943 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5944 subclasses[0] = X86_64_SSE_CLASS;
5945 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5946 && !((bit_offset % 64) == 0 && bytes == 4))
5947 subclasses[0] = X86_64_INTEGER_CLASS;
5948
5949 for (i = 0; i < words; i++)
5950 classes[i] = subclasses[i % num];
5951
5952 break;
5953 }
5954 case UNION_TYPE:
5955 case QUAL_UNION_TYPE:
5956    /* Unions are similar to RECORD_TYPE but the offset is always 0. */
5958 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5959 {
5960 if (TREE_CODE (field) == FIELD_DECL)
5961 {
5962 int num;
5963
5964 if (TREE_TYPE (field) == error_mark_node)
5965 continue;
5966
5967 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5968 TREE_TYPE (field), subclasses,
5969 bit_offset);
5970 if (!num)
5971 return 0;
5972 for (i = 0; i < num; i++)
5973 classes[i] = merge_classes (subclasses[i], classes[i]);
5974 }
5975 }
5976 break;
5977
5978 default:
5979 gcc_unreachable ();
5980 }
5981
5982 if (words > 2)
5983 {
5984    /* When the size is larger than 16 bytes, if the first eightbyte isn't
5985    X86_64_SSE_CLASS or any of the remaining ones isn't
5986    X86_64_SSEUP_CLASS, everything should be passed in
5987    memory. */
5988 if (classes[0] != X86_64_SSE_CLASS)
5989 return 0;
5990
5991 for (i = 1; i < words; i++)
5992 if (classes[i] != X86_64_SSEUP_CLASS)
5993 return 0;
5994 }
5995
5996 /* Final merger cleanup. */
5997 for (i = 0; i < words; i++)
5998 {
5999 /* If one class is MEMORY, everything should be passed in
6000 memory. */
6001 if (classes[i] == X86_64_MEMORY_CLASS)
6002 return 0;
6003
6004 /* The X86_64_SSEUP_CLASS should be always preceded by
6005 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6006 if (classes[i] == X86_64_SSEUP_CLASS
6007 && classes[i - 1] != X86_64_SSE_CLASS
6008 && classes[i - 1] != X86_64_SSEUP_CLASS)
6009 {
6010 /* The first one should never be X86_64_SSEUP_CLASS. */
6011 gcc_assert (i != 0);
6012 classes[i] = X86_64_SSE_CLASS;
6013 }
6014
6015 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6016 everything should be passed in memory. */
6017 if (classes[i] == X86_64_X87UP_CLASS
6018 && (classes[i - 1] != X86_64_X87_CLASS))
6019 {
6020 static bool warned;
6021
6022 /* The first one should never be X86_64_X87UP_CLASS. */
6023 gcc_assert (i != 0);
6024 if (!warned && warn_psabi)
6025 {
6026 warned = true;
6027 inform (input_location,
6028 "the ABI of passing union with long double"
6029 " has changed in GCC 4.4");
6030 }
6031 return 0;
6032 }
6033 }
6034 return words;
6035 }
6036
6037    /* Compute the alignment needed.  We align all types to their natural
6038    boundaries, with the exception of XFmode, which is aligned to 64 bits. */
6039 if (mode != VOIDmode && mode != BLKmode)
6040 {
6041 int mode_alignment = GET_MODE_BITSIZE (mode);
6042
6043 if (mode == XFmode)
6044 mode_alignment = 128;
6045 else if (mode == XCmode)
6046 mode_alignment = 256;
6047 if (COMPLEX_MODE_P (mode))
6048 mode_alignment /= 2;
6049 /* Misaligned fields are always returned in memory. */
6050 if (bit_offset % mode_alignment)
6051 return 0;
6052 }
6053
6054    /* For V1xx modes, just use the base mode.  */
6055 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6056 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6057 mode = GET_MODE_INNER (mode);
6058
6059 /* Classification of atomic types. */
6060 switch (mode)
6061 {
6062 case SDmode:
6063 case DDmode:
6064 classes[0] = X86_64_SSE_CLASS;
6065 return 1;
6066 case TDmode:
6067 classes[0] = X86_64_SSE_CLASS;
6068 classes[1] = X86_64_SSEUP_CLASS;
6069 return 2;
6070 case DImode:
6071 case SImode:
6072 case HImode:
6073 case QImode:
6074 case CSImode:
6075 case CHImode:
6076 case CQImode:
6077 {
6078 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6079
6080 if (size <= 32)
6081 {
6082 classes[0] = X86_64_INTEGERSI_CLASS;
6083 return 1;
6084 }
6085 else if (size <= 64)
6086 {
6087 classes[0] = X86_64_INTEGER_CLASS;
6088 return 1;
6089 }
6090 else if (size <= 64+32)
6091 {
6092 classes[0] = X86_64_INTEGER_CLASS;
6093 classes[1] = X86_64_INTEGERSI_CLASS;
6094 return 2;
6095 }
6096 else if (size <= 64+64)
6097 {
6098 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6099 return 2;
6100 }
6101 else
6102 gcc_unreachable ();
6103 }
6104 case CDImode:
6105 case TImode:
6106 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6107 return 2;
6108 case COImode:
6109 case OImode:
6110 /* OImode shouldn't be used directly. */
6111 gcc_unreachable ();
6112 case CTImode:
6113 return 0;
6114 case SFmode:
6115 if (!(bit_offset % 64))
6116 classes[0] = X86_64_SSESF_CLASS;
6117 else
6118 classes[0] = X86_64_SSE_CLASS;
6119 return 1;
6120 case DFmode:
6121 classes[0] = X86_64_SSEDF_CLASS;
6122 return 1;
6123 case XFmode:
6124 classes[0] = X86_64_X87_CLASS;
6125 classes[1] = X86_64_X87UP_CLASS;
6126 return 2;
6127 case TFmode:
6128 classes[0] = X86_64_SSE_CLASS;
6129 classes[1] = X86_64_SSEUP_CLASS;
6130 return 2;
6131 case SCmode:
6132 classes[0] = X86_64_SSE_CLASS;
6133 if (!(bit_offset % 64))
6134 return 1;
6135 else
6136 {
6137 static bool warned;
6138
6139 if (!warned && warn_psabi)
6140 {
6141 warned = true;
6142 inform (input_location,
6143 "the ABI of passing structure with complex float"
6144 " member has changed in GCC 4.4");
6145 }
6146 classes[1] = X86_64_SSESF_CLASS;
6147 return 2;
6148 }
6149 case DCmode:
6150 classes[0] = X86_64_SSEDF_CLASS;
6151 classes[1] = X86_64_SSEDF_CLASS;
6152 return 2;
6153 case XCmode:
6154 classes[0] = X86_64_COMPLEX_X87_CLASS;
6155 return 1;
6156 case TCmode:
6157    /* This mode is larger than 16 bytes. */
6158 return 0;
6159 case V8SFmode:
6160 case V8SImode:
6161 case V32QImode:
6162 case V16HImode:
6163 case V4DFmode:
6164 case V4DImode:
6165 classes[0] = X86_64_SSE_CLASS;
6166 classes[1] = X86_64_SSEUP_CLASS;
6167 classes[2] = X86_64_SSEUP_CLASS;
6168 classes[3] = X86_64_SSEUP_CLASS;
6169 return 4;
6170 case V4SFmode:
6171 case V4SImode:
6172 case V16QImode:
6173 case V8HImode:
6174 case V2DFmode:
6175 case V2DImode:
6176 classes[0] = X86_64_SSE_CLASS;
6177 classes[1] = X86_64_SSEUP_CLASS;
6178 return 2;
6179 case V1TImode:
6180 case V1DImode:
6181 case V2SFmode:
6182 case V2SImode:
6183 case V4HImode:
6184 case V8QImode:
6185 classes[0] = X86_64_SSE_CLASS;
6186 return 1;
6187 case BLKmode:
6188 case VOIDmode:
6189 return 0;
6190 default:
6191 gcc_assert (VECTOR_MODE_P (mode));
6192
6193 if (bytes > 16)
6194 return 0;
6195
6196 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6197
6198 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6199 classes[0] = X86_64_INTEGERSI_CLASS;
6200 else
6201 classes[0] = X86_64_INTEGER_CLASS;
6202 classes[1] = X86_64_INTEGER_CLASS;
6203 return 1 + (bytes > 8);
6204 }
6205 }
6206
6207 /* Examine the argument and set the number of registers required in each
6208    class.  Return 0 iff the parameter should be passed in memory. */
6209 static int
6210 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6211 int *int_nregs, int *sse_nregs)
6212 {
6213 enum x86_64_reg_class regclass[MAX_CLASSES];
6214 int n = classify_argument (mode, type, regclass, 0);
6215
6216 *int_nregs = 0;
6217 *sse_nregs = 0;
6218 if (!n)
6219 return 0;
6220 for (n--; n >= 0; n--)
6221 switch (regclass[n])
6222 {
6223 case X86_64_INTEGER_CLASS:
6224 case X86_64_INTEGERSI_CLASS:
6225 (*int_nregs)++;
6226 break;
6227 case X86_64_SSE_CLASS:
6228 case X86_64_SSESF_CLASS:
6229 case X86_64_SSEDF_CLASS:
6230 (*sse_nregs)++;
6231 break;
6232 case X86_64_NO_CLASS:
6233 case X86_64_SSEUP_CLASS:
6234 break;
6235 case X86_64_X87_CLASS:
6236 case X86_64_X87UP_CLASS:
6237 if (!in_return)
6238 return 0;
6239 break;
6240 case X86_64_COMPLEX_X87_CLASS:
6241 return in_return ? 2 : 0;
6242 case X86_64_MEMORY_CLASS:
6243 gcc_unreachable ();
6244 }
6245 return 1;
6246 }
6247
6248 /* Construct container for the argument used by GCC interface. See
6249 FUNCTION_ARG for the detailed description. */
6250
6251 static rtx
6252 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6253 const_tree type, int in_return, int nintregs, int nsseregs,
6254 const int *intreg, int sse_regno)
6255 {
6256 /* The following variables hold the static issued_error state. */
6257 static bool issued_sse_arg_error;
6258 static bool issued_sse_ret_error;
6259 static bool issued_x87_ret_error;
6260
6261 enum machine_mode tmpmode;
6262 int bytes =
6263 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6264 enum x86_64_reg_class regclass[MAX_CLASSES];
6265 int n;
6266 int i;
6267 int nexps = 0;
6268 int needed_sseregs, needed_intregs;
6269 rtx exp[MAX_CLASSES];
6270 rtx ret;
6271
6272 n = classify_argument (mode, type, regclass, 0);
6273 if (!n)
6274 return NULL;
6275 if (!examine_argument (mode, type, in_return, &needed_intregs,
6276 &needed_sseregs))
6277 return NULL;
6278 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6279 return NULL;
6280
6281 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6282 some less clueful developer tries to use floating-point anyway. */
6283 if (needed_sseregs && !TARGET_SSE)
6284 {
6285 if (in_return)
6286 {
6287 if (!issued_sse_ret_error)
6288 {
6289 error ("SSE register return with SSE disabled");
6290 issued_sse_ret_error = true;
6291 }
6292 }
6293 else if (!issued_sse_arg_error)
6294 {
6295 error ("SSE register argument with SSE disabled");
6296 issued_sse_arg_error = true;
6297 }
6298 return NULL;
6299 }
6300
6301 /* Likewise, error if the ABI requires us to return values in the
6302 x87 registers and the user specified -mno-80387. */
6303 if (!TARGET_80387 && in_return)
6304 for (i = 0; i < n; i++)
6305 if (regclass[i] == X86_64_X87_CLASS
6306 || regclass[i] == X86_64_X87UP_CLASS
6307 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6308 {
6309 if (!issued_x87_ret_error)
6310 {
6311 error ("x87 register return with x87 disabled");
6312 issued_x87_ret_error = true;
6313 }
6314 return NULL;
6315 }
6316
6317 /* First construct simple cases. Avoid SCmode, since we want to use
6318    a single register to pass this type. */
6319 if (n == 1 && mode != SCmode)
6320 switch (regclass[0])
6321 {
6322 case X86_64_INTEGER_CLASS:
6323 case X86_64_INTEGERSI_CLASS:
6324 return gen_rtx_REG (mode, intreg[0]);
6325 case X86_64_SSE_CLASS:
6326 case X86_64_SSESF_CLASS:
6327 case X86_64_SSEDF_CLASS:
6328 if (mode != BLKmode)
6329 return gen_reg_or_parallel (mode, orig_mode,
6330 SSE_REGNO (sse_regno));
6331 break;
6332 case X86_64_X87_CLASS:
6333 case X86_64_COMPLEX_X87_CLASS:
6334 return gen_rtx_REG (mode, FIRST_STACK_REG);
6335 case X86_64_NO_CLASS:
6336 /* Zero sized array, struct or class. */
6337 return NULL;
6338 default:
6339 gcc_unreachable ();
6340 }
6341 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6342 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6343 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6344 if (n == 4
6345 && regclass[0] == X86_64_SSE_CLASS
6346 && regclass[1] == X86_64_SSEUP_CLASS
6347 && regclass[2] == X86_64_SSEUP_CLASS
6348 && regclass[3] == X86_64_SSEUP_CLASS
6349 && mode != BLKmode)
6350 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6351
6352 if (n == 2
6353 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6354 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6355 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6356 && regclass[1] == X86_64_INTEGER_CLASS
6357 && (mode == CDImode || mode == TImode || mode == TFmode)
6358 && intreg[0] + 1 == intreg[1])
6359 return gen_rtx_REG (mode, intreg[0]);
6360
6361 /* Otherwise figure out the entries of the PARALLEL. */
6362 for (i = 0; i < n; i++)
6363 {
6364 int pos;
6365
6366 switch (regclass[i])
6367 {
6368 case X86_64_NO_CLASS:
6369 break;
6370 case X86_64_INTEGER_CLASS:
6371 case X86_64_INTEGERSI_CLASS:
6372 /* Merge TImodes on aligned occasions here too. */
6373 if (i * 8 + 8 > bytes)
6374 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6375 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6376 tmpmode = SImode;
6377 else
6378 tmpmode = DImode;
6379    /* We've requested 24 bytes we don't have a mode for.  Use DImode. */
6380 if (tmpmode == BLKmode)
6381 tmpmode = DImode;
6382 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6383 gen_rtx_REG (tmpmode, *intreg),
6384 GEN_INT (i*8));
6385 intreg++;
6386 break;
6387 case X86_64_SSESF_CLASS:
6388 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6389 gen_rtx_REG (SFmode,
6390 SSE_REGNO (sse_regno)),
6391 GEN_INT (i*8));
6392 sse_regno++;
6393 break;
6394 case X86_64_SSEDF_CLASS:
6395 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6396 gen_rtx_REG (DFmode,
6397 SSE_REGNO (sse_regno)),
6398 GEN_INT (i*8));
6399 sse_regno++;
6400 break;
6401 case X86_64_SSE_CLASS:
6402 pos = i;
6403 switch (n)
6404 {
6405 case 1:
6406 tmpmode = DImode;
6407 break;
6408 case 2:
6409 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6410 {
6411 tmpmode = TImode;
6412 i++;
6413 }
6414 else
6415 tmpmode = DImode;
6416 break;
6417 case 4:
6418 gcc_assert (i == 0
6419 && regclass[1] == X86_64_SSEUP_CLASS
6420 && regclass[2] == X86_64_SSEUP_CLASS
6421 && regclass[3] == X86_64_SSEUP_CLASS);
6422 tmpmode = OImode;
6423 i += 3;
6424 break;
6425 default:
6426 gcc_unreachable ();
6427 }
6428 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6429 gen_rtx_REG (tmpmode,
6430 SSE_REGNO (sse_regno)),
6431 GEN_INT (pos*8));
6432 sse_regno++;
6433 break;
6434 default:
6435 gcc_unreachable ();
6436 }
6437 }
6438
6439 /* Empty aligned struct, union or class. */
6440 if (nexps == 0)
6441 return NULL;
6442
6443 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6444 for (i = 0; i < nexps; i++)
6445 XVECEXP (ret, 0, i) = exp [i];
6446 return ret;
6447 }
6448
6449 /* Update the data in CUM to advance over an argument of mode MODE
6450 and data type TYPE. (TYPE is null for libcalls where that information
6451 may not be available.) */
6452
6453 static void
6454 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6455 const_tree type, HOST_WIDE_INT bytes,
6456 HOST_WIDE_INT words)
6457 {
6458 switch (mode)
6459 {
6460 default:
6461 break;
6462
6463 case BLKmode:
6464 if (bytes < 0)
6465 break;
6466 /* FALLTHRU */
6467
6468 case DImode:
6469 case SImode:
6470 case HImode:
6471 case QImode:
6472 cum->words += words;
6473 cum->nregs -= words;
6474 cum->regno += words;
6475
6476 if (cum->nregs <= 0)
6477 {
6478 cum->nregs = 0;
6479 cum->regno = 0;
6480 }
6481 break;
6482
6483 case OImode:
6484 /* OImode shouldn't be used directly. */
6485 gcc_unreachable ();
6486
6487 case DFmode:
6488 if (cum->float_in_sse < 2)
6489 break;
6490 case SFmode:
6491 if (cum->float_in_sse < 1)
6492 break;
6493 /* FALLTHRU */
6494
6495 case V8SFmode:
6496 case V8SImode:
6497 case V32QImode:
6498 case V16HImode:
6499 case V4DFmode:
6500 case V4DImode:
6501 case TImode:
6502 case V16QImode:
6503 case V8HImode:
6504 case V4SImode:
6505 case V2DImode:
6506 case V4SFmode:
6507 case V2DFmode:
6508 if (!type || !AGGREGATE_TYPE_P (type))
6509 {
6510 cum->sse_words += words;
6511 cum->sse_nregs -= 1;
6512 cum->sse_regno += 1;
6513 if (cum->sse_nregs <= 0)
6514 {
6515 cum->sse_nregs = 0;
6516 cum->sse_regno = 0;
6517 }
6518 }
6519 break;
6520
6521 case V8QImode:
6522 case V4HImode:
6523 case V2SImode:
6524 case V2SFmode:
6525 case V1TImode:
6526 case V1DImode:
6527 if (!type || !AGGREGATE_TYPE_P (type))
6528 {
6529 cum->mmx_words += words;
6530 cum->mmx_nregs -= 1;
6531 cum->mmx_regno += 1;
6532 if (cum->mmx_nregs <= 0)
6533 {
6534 cum->mmx_nregs = 0;
6535 cum->mmx_regno = 0;
6536 }
6537 }
6538 break;
6539 }
6540 }
6541
6542 static void
6543 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6544 const_tree type, HOST_WIDE_INT words, bool named)
6545 {
6546 int int_nregs, sse_nregs;
6547
6548 /* Unnamed 256bit vector mode parameters are passed on stack. */
6549 if (!named && VALID_AVX256_REG_MODE (mode))
6550 return;
6551
6552 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6553 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6554 {
6555 cum->nregs -= int_nregs;
6556 cum->sse_nregs -= sse_nregs;
6557 cum->regno += int_nregs;
6558 cum->sse_regno += sse_nregs;
6559 }
6560 else
6561 {
6562 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6563 cum->words = (cum->words + align - 1) & ~(align - 1);
6564 cum->words += words;
6565 }
6566 }
6567
6568 static void
6569 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6570 HOST_WIDE_INT words)
6571 {
6572    /* Otherwise, this should be passed indirectly. */
6573 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6574
6575 cum->words += words;
6576 if (cum->nregs > 0)
6577 {
6578 cum->nregs -= 1;
6579 cum->regno += 1;
6580 }
6581 }
6582
6583 /* Update the data in CUM to advance over an argument of mode MODE and
6584 data type TYPE. (TYPE is null for libcalls where that information
6585 may not be available.) */
6586
6587 static void
6588 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6589 const_tree type, bool named)
6590 {
6591 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6592 HOST_WIDE_INT bytes, words;
6593
6594 if (mode == BLKmode)
6595 bytes = int_size_in_bytes (type);
6596 else
6597 bytes = GET_MODE_SIZE (mode);
6598 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6599
6600 if (type)
6601 mode = type_natural_mode (type, NULL);
6602
6603 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6604 function_arg_advance_ms_64 (cum, bytes, words);
6605 else if (TARGET_64BIT)
6606 function_arg_advance_64 (cum, mode, type, words, named);
6607 else
6608 function_arg_advance_32 (cum, mode, type, bytes, words);
6609 }
6610
6611 /* Define where to put the arguments to a function.
6612 Value is zero to push the argument on the stack,
6613 or a hard register in which to store the argument.
6614
6615 MODE is the argument's machine mode.
6616 TYPE is the data type of the argument (as a tree).
6617 This is null for libcalls where that information may
6618 not be available.
6619 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6620 the preceding args and about the function being called.
6621 NAMED is nonzero if this argument is a named parameter
6622 (otherwise it is an extra parameter matching an ellipsis). */
6623
6624 static rtx
6625 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6626 enum machine_mode orig_mode, const_tree type,
6627 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6628 {
6629 static bool warnedsse, warnedmmx;
6630
6631 /* Avoid the AL settings for the Unix64 ABI. */
6632 if (mode == VOIDmode)
6633 return constm1_rtx;
6634
6635 switch (mode)
6636 {
6637 default:
6638 break;
6639
6640 case BLKmode:
6641 if (bytes < 0)
6642 break;
6643 /* FALLTHRU */
6644 case DImode:
6645 case SImode:
6646 case HImode:
6647 case QImode:
6648 if (words <= cum->nregs)
6649 {
6650 int regno = cum->regno;
6651
6652    /* Fastcall allocates the first two DWORD (SImode) or
6653    smaller arguments to ECX and EDX if they aren't
6654    aggregate types. */
6655 if (cum->fastcall)
6656 {
6657 if (mode == BLKmode
6658 || mode == DImode
6659 || (type && AGGREGATE_TYPE_P (type)))
6660 break;
6661
6662    /* ECX, not EAX, is the first allocated register. */
6663 if (regno == AX_REG)
6664 regno = CX_REG;
6665 }
6666 return gen_rtx_REG (mode, regno);
6667 }
6668 break;
6669
6670 case DFmode:
6671 if (cum->float_in_sse < 2)
6672 break;
6673 case SFmode:
6674 if (cum->float_in_sse < 1)
6675 break;
6676 /* FALLTHRU */
6677 case TImode:
6678    /* In 32-bit mode, we pass TImode in xmm registers. */
6679 case V16QImode:
6680 case V8HImode:
6681 case V4SImode:
6682 case V2DImode:
6683 case V4SFmode:
6684 case V2DFmode:
6685 if (!type || !AGGREGATE_TYPE_P (type))
6686 {
6687 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6688 {
6689 warnedsse = true;
6690 warning (0, "SSE vector argument without SSE enabled "
6691 "changes the ABI");
6692 }
6693 if (cum->sse_nregs)
6694 return gen_reg_or_parallel (mode, orig_mode,
6695 cum->sse_regno + FIRST_SSE_REG);
6696 }
6697 break;
6698
6699 case OImode:
6700 /* OImode shouldn't be used directly. */
6701 gcc_unreachable ();
6702
6703 case V8SFmode:
6704 case V8SImode:
6705 case V32QImode:
6706 case V16HImode:
6707 case V4DFmode:
6708 case V4DImode:
6709 if (!type || !AGGREGATE_TYPE_P (type))
6710 {
6711 if (cum->sse_nregs)
6712 return gen_reg_or_parallel (mode, orig_mode,
6713 cum->sse_regno + FIRST_SSE_REG);
6714 }
6715 break;
6716
6717 case V8QImode:
6718 case V4HImode:
6719 case V2SImode:
6720 case V2SFmode:
6721 case V1TImode:
6722 case V1DImode:
6723 if (!type || !AGGREGATE_TYPE_P (type))
6724 {
6725 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6726 {
6727 warnedmmx = true;
6728 warning (0, "MMX vector argument without MMX enabled "
6729 "changes the ABI");
6730 }
6731 if (cum->mmx_nregs)
6732 return gen_reg_or_parallel (mode, orig_mode,
6733 cum->mmx_regno + FIRST_MMX_REG);
6734 }
6735 break;
6736 }
6737
6738 return NULL_RTX;
6739 }
6740
6741 static rtx
6742 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6743 enum machine_mode orig_mode, const_tree type, bool named)
6744 {
6745    /* Handle a hidden AL argument containing the number of SSE registers used
6746    for varargs x86-64 functions. */
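   /* Informally: before a call to a varargs function such as printf, the
      caller loads %al with (an upper bound on) the number of vector registers
      holding arguments, so the callee's va_start prologue knows how many XMM
      registers it must spill.  The value returned below becomes that count.  */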
6747 if (mode == VOIDmode)
6748 return GEN_INT (cum->maybe_vaarg
6749 ? (cum->sse_nregs < 0
6750 ? X86_64_SSE_REGPARM_MAX
6751 : cum->sse_regno)
6752 : -1);
6753
6754 switch (mode)
6755 {
6756 default:
6757 break;
6758
6759 case V8SFmode:
6760 case V8SImode:
6761 case V32QImode:
6762 case V16HImode:
6763 case V4DFmode:
6764 case V4DImode:
6765 /* Unnamed 256bit vector mode parameters are passed on stack. */
6766 if (!named)
6767 return NULL;
6768 break;
6769 }
6770
6771 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6772 cum->sse_nregs,
6773 &x86_64_int_parameter_registers [cum->regno],
6774 cum->sse_regno);
6775 }
6776
6777 static rtx
6778 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6779 enum machine_mode orig_mode, bool named,
6780 HOST_WIDE_INT bytes)
6781 {
6782 unsigned int regno;
6783
6784 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6785 We use a value of -2 to specify that the current function call is MS ABI. */
6786 if (mode == VOIDmode)
6787 return GEN_INT (-2);
6788
6789 /* If we've run out of registers, it goes on the stack. */
6790 if (cum->nregs == 0)
6791 return NULL_RTX;
6792
6793 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6794
6795 /* Only floating point modes are passed in anything but integer regs. */
6796 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6797 {
6798 if (named)
6799 regno = cum->regno + FIRST_SSE_REG;
6800 else
6801 {
6802 rtx t1, t2;
6803
6804 /* Unnamed floating parameters are passed in both the
6805 SSE and integer registers. */
6806 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6807 t2 = gen_rtx_REG (mode, regno);
6808 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6809 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6810 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6811 }
6812 }
6813 /* Handle aggregate types passed in a register. */
6814 if (orig_mode == BLKmode)
6815 {
6816 if (bytes > 0 && bytes <= 8)
6817 mode = (bytes > 4 ? DImode : SImode);
6818 if (mode == BLKmode)
6819 mode = DImode;
6820 }
6821
6822 return gen_reg_or_parallel (mode, orig_mode, regno);
6823 }
6824
6825 /* Return where to put the arguments to a function.
6826 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6827
6828 MODE is the argument's machine mode. TYPE is the data type of the
6829 argument. It is null for libcalls where that information may not be
6830 available. CUM gives information about the preceding args and about
6831 the function being called. NAMED is nonzero if this argument is a
6832 named parameter (otherwise it is an extra parameter matching an
6833 ellipsis). */
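/* The result is NULL_RTX when the argument is passed on the stack, a single
   hard register (e.g. (reg:DI di) for the first integer argument in the
   64-bit SysV ABI), or a PARALLEL when the value is split across several
   registers.  */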
6834
6835 static rtx
6836 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6837 const_tree type, bool named)
6838 {
6839 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6840 enum machine_mode mode = omode;
6841 HOST_WIDE_INT bytes, words;
6842 rtx arg;
6843
6844 if (mode == BLKmode)
6845 bytes = int_size_in_bytes (type);
6846 else
6847 bytes = GET_MODE_SIZE (mode);
6848 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6849
6850 /* To simplify the code below, represent vector types with a vector mode
6851 even if MMX/SSE are not active. */
6852 if (type && TREE_CODE (type) == VECTOR_TYPE)
6853 mode = type_natural_mode (type, cum);
6854
6855 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6856 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6857 else if (TARGET_64BIT)
6858 arg = function_arg_64 (cum, mode, omode, type, named);
6859 else
6860 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6861
6862 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6863 {
6864 /* This argument uses 256bit AVX modes. */
6865 if (cum->caller)
6866 cfun->machine->callee_pass_avx256_p = true;
6867 else
6868 cfun->machine->caller_pass_avx256_p = true;
6869 }
6870
6871 return arg;
6872 }
6873
6874 /* A C expression that indicates when an argument must be passed by
6875 reference. If nonzero for an argument, a copy of that argument is
6876 made in memory and a pointer to the argument is passed instead of
6877 the argument itself. The pointer is passed in whatever way is
6878 appropriate for passing a pointer to that type. */
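/* For example, under the Windows x64 ABI an aggregate of 3 or 24 bytes is
   passed by reference, while an aggregate of exactly 1, 2, 4 or 8 bytes is
   passed by value in a register; arrays and __m128 are always passed by
   reference.  */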
6879
6880 static bool
6881 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6882 enum machine_mode mode ATTRIBUTE_UNUSED,
6883 const_tree type, bool named ATTRIBUTE_UNUSED)
6884 {
6885 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6886
6887 /* See Windows x64 Software Convention. */
6888 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6889 {
6890 int msize = (int) GET_MODE_SIZE (mode);
6891 if (type)
6892 {
6893 /* Arrays are passed by reference. */
6894 if (TREE_CODE (type) == ARRAY_TYPE)
6895 return true;
6896
6897 if (AGGREGATE_TYPE_P (type))
6898 {
6899 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6900 are passed by reference. */
6901 msize = int_size_in_bytes (type);
6902 }
6903 }
6904
6905 /* __m128 is passed by reference. */
6906 switch (msize) {
6907 case 1: case 2: case 4: case 8:
6908 break;
6909 default:
6910 return true;
6911 }
6912 }
6913 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6914 return 1;
6915
6916 return 0;
6917 }
6918
6919 /* Return true when TYPE should be 128bit aligned for 32bit argument
6920 passing ABI. XXX: This function is obsolete and is only used for
6921 checking psABI compatibility with previous versions of GCC. */
6922
6923 static bool
6924 ix86_compat_aligned_value_p (const_tree type)
6925 {
6926 enum machine_mode mode = TYPE_MODE (type);
6927 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6928 || mode == TDmode
6929 || mode == TFmode
6930 || mode == TCmode)
6931 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6932 return true;
6933 if (TYPE_ALIGN (type) < 128)
6934 return false;
6935
6936 if (AGGREGATE_TYPE_P (type))
6937 {
6938 /* Walk the aggregates recursively. */
6939 switch (TREE_CODE (type))
6940 {
6941 case RECORD_TYPE:
6942 case UNION_TYPE:
6943 case QUAL_UNION_TYPE:
6944 {
6945 tree field;
6946
6947 /* Walk all the structure fields. */
6948 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6949 {
6950 if (TREE_CODE (field) == FIELD_DECL
6951 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6952 return true;
6953 }
6954 break;
6955 }
6956
6957 case ARRAY_TYPE:
6958 /* Just for use if some languages pass arrays by value. */
6959 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6960 return true;
6961 break;
6962
6963 default:
6964 gcc_unreachable ();
6965 }
6966 }
6967 return false;
6968 }
6969
6970 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6971 XXX: This function is obsolete and is only used for checking psABI
6972 compatibility with previous versions of GCC. */
6973
6974 static unsigned int
6975 ix86_compat_function_arg_boundary (enum machine_mode mode,
6976 const_tree type, unsigned int align)
6977 {
6978 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6979 natural boundaries. */
6980 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6981 {
6982 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6983 make an exception for SSE modes since these require 128bit
6984 alignment.
6985
6986 The handling here differs from field_alignment. ICC aligns MMX
6987 arguments to 4 byte boundaries, while structure fields are aligned
6988 to 8 byte boundaries. */
6989 if (!type)
6990 {
6991 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6992 align = PARM_BOUNDARY;
6993 }
6994 else
6995 {
6996 if (!ix86_compat_aligned_value_p (type))
6997 align = PARM_BOUNDARY;
6998 }
6999 }
7000 if (align > BIGGEST_ALIGNMENT)
7001 align = BIGGEST_ALIGNMENT;
7002 return align;
7003 }
7004
7005 /* Return true when TYPE should be 128bit aligned for 32bit argument
7006 passing ABI. */
7007
7008 static bool
7009 ix86_contains_aligned_value_p (const_tree type)
7010 {
7011 enum machine_mode mode = TYPE_MODE (type);
7012
7013 if (mode == XFmode || mode == XCmode)
7014 return false;
7015
7016 if (TYPE_ALIGN (type) < 128)
7017 return false;
7018
7019 if (AGGREGATE_TYPE_P (type))
7020 {
7021 /* Walk the aggregates recursively. */
7022 switch (TREE_CODE (type))
7023 {
7024 case RECORD_TYPE:
7025 case UNION_TYPE:
7026 case QUAL_UNION_TYPE:
7027 {
7028 tree field;
7029
7030 /* Walk all the structure fields. */
7031 for (field = TYPE_FIELDS (type);
7032 field;
7033 field = DECL_CHAIN (field))
7034 {
7035 if (TREE_CODE (field) == FIELD_DECL
7036 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7037 return true;
7038 }
7039 break;
7040 }
7041
7042 case ARRAY_TYPE:
7043 /* Just for use if some languages pass arrays by value. */
7044 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7045 return true;
7046 break;
7047
7048 default:
7049 gcc_unreachable ();
7050 }
7051 }
7052 else
7053 return TYPE_ALIGN (type) >= 128;
7054
7055 return false;
7056 }
7057
7058 /* Gives the alignment boundary, in bits, of an argument with the
7059 specified mode and type. */
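/* For instance, on ia32 a plain int argument lands on a 32-bit
   (PARM_BOUNDARY) boundary, while an __m128 argument with SSE enabled
   requires a 128-bit boundary.  */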
7060
7061 static unsigned int
7062 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7063 {
7064 unsigned int align;
7065 if (type)
7066 {
7067 /* Since the main variant type is used for the call, convert the
7068 argument type to its main variant. */
7069 type = TYPE_MAIN_VARIANT (type);
7070 align = TYPE_ALIGN (type);
7071 }
7072 else
7073 align = GET_MODE_ALIGNMENT (mode);
7074 if (align < PARM_BOUNDARY)
7075 align = PARM_BOUNDARY;
7076 else
7077 {
7078 static bool warned;
7079 unsigned int saved_align = align;
7080
7081 if (!TARGET_64BIT)
7082 {
7083 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7084 if (!type)
7085 {
7086 if (mode == XFmode || mode == XCmode)
7087 align = PARM_BOUNDARY;
7088 }
7089 else if (!ix86_contains_aligned_value_p (type))
7090 align = PARM_BOUNDARY;
7091
7092 if (align < 128)
7093 align = PARM_BOUNDARY;
7094 }
7095
7096 if (warn_psabi
7097 && !warned
7098 && align != ix86_compat_function_arg_boundary (mode, type,
7099 saved_align))
7100 {
7101 warned = true;
7102 inform (input_location,
7103 "The ABI for passing parameters with %d-byte"
7104 " alignment has changed in GCC 4.6",
7105 align / BITS_PER_UNIT);
7106 }
7107 }
7108
7109 return align;
7110 }
7111
7112 /* Return true if N is a possible register number of function value. */
7113
7114 static bool
7115 ix86_function_value_regno_p (const unsigned int regno)
7116 {
7117 switch (regno)
7118 {
7119 case AX_REG:
7120 return true;
7121
7122 case FIRST_FLOAT_REG:
7123 /* TODO: The function should depend on current function ABI but
7124 builtins.c would need updating then. Therefore we use the
7125 default ABI. */
7126 if (TARGET_64BIT && ix86_abi == MS_ABI)
7127 return false;
7128 return TARGET_FLOAT_RETURNS_IN_80387;
7129
7130 case FIRST_SSE_REG:
7131 return TARGET_SSE;
7132
7133 case FIRST_MMX_REG:
7134 if (TARGET_MACHO || TARGET_64BIT)
7135 return false;
7136 return TARGET_MMX;
7137 }
7138
7139 return false;
7140 }
7141
7142 /* Define how to find the value returned by a function.
7143 VALTYPE is the data type of the value (as a tree).
7144 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7145 otherwise, FUNC is 0. */
7146
7147 static rtx
7148 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7149 const_tree fntype, const_tree fn)
7150 {
7151 unsigned int regno;
7152
7153 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7154 we normally prevent this case when mmx is not available. However
7155 some ABIs may require the result to be returned like DImode. */
7156 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7157 regno = FIRST_MMX_REG;
7158
7159 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7160 we prevent this case when sse is not available. However some ABIs
7161 may require the result to be returned like integer TImode. */
7162 else if (mode == TImode
7163 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7164 regno = FIRST_SSE_REG;
7165
7166 /* 32-byte vector modes in %ymm0. */
7167 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7168 regno = FIRST_SSE_REG;
7169
7170 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7171 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7172 regno = FIRST_FLOAT_REG;
7173 else
7174 /* Most things go in %eax. */
7175 regno = AX_REG;
7176
7177 /* Override FP return register with %xmm0 for local functions when
7178 SSE math is enabled or for functions with sseregparm attribute. */
7179 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7180 {
7181 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7182 if ((sse_level >= 1 && mode == SFmode)
7183 || (sse_level == 2 && mode == DFmode))
7184 regno = FIRST_SSE_REG;
7185 }
7186
7187 /* OImode shouldn't be used directly. */
7188 gcc_assert (mode != OImode);
7189
7190 return gen_rtx_REG (orig_mode, regno);
7191 }
7192
7193 static rtx
7194 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7195 const_tree valtype)
7196 {
7197 rtx ret;
7198
7199 /* Handle libcalls, which don't provide a type node. */
7200 if (valtype == NULL)
7201 {
7202 unsigned int regno;
7203
7204 switch (mode)
7205 {
7206 case SFmode:
7207 case SCmode:
7208 case DFmode:
7209 case DCmode:
7210 case TFmode:
7211 case SDmode:
7212 case DDmode:
7213 case TDmode:
7214 regno = FIRST_SSE_REG;
7215 break;
7216 case XFmode:
7217 case XCmode:
7218 regno = FIRST_FLOAT_REG;
7219 break;
7220 case TCmode:
7221 return NULL;
7222 default:
7223 regno = AX_REG;
7224 }
7225
7226 return gen_rtx_REG (mode, regno);
7227 }
7228 else if (POINTER_TYPE_P (valtype))
7229 {
7230 /* Pointers are always returned in Pmode. */
7231 mode = Pmode;
7232 }
7233
7234 ret = construct_container (mode, orig_mode, valtype, 1,
7235 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7236 x86_64_int_return_registers, 0);
7237
7238 /* For zero-sized structures, construct_container returns NULL, but we
7239 need to keep the rest of the compiler happy by returning a meaningful value. */
7240 if (!ret)
7241 ret = gen_rtx_REG (orig_mode, AX_REG);
7242
7243 return ret;
7244 }
7245
7246 static rtx
7247 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7248 {
7249 unsigned int regno = AX_REG;
7250
7251 if (TARGET_SSE)
7252 {
7253 switch (GET_MODE_SIZE (mode))
7254 {
7255 case 16:
7256 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7257 && !COMPLEX_MODE_P (mode))
7258 regno = FIRST_SSE_REG;
7259 break;
7260 case 8:
7261 case 4:
7262 if (mode == SFmode || mode == DFmode)
7263 regno = FIRST_SSE_REG;
7264 break;
7265 default:
7266 break;
7267 }
7268 }
7269 return gen_rtx_REG (orig_mode, regno);
7270 }
7271
7272 static rtx
7273 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7274 enum machine_mode orig_mode, enum machine_mode mode)
7275 {
7276 const_tree fn, fntype;
7277
7278 fn = NULL_TREE;
7279 if (fntype_or_decl && DECL_P (fntype_or_decl))
7280 fn = fntype_or_decl;
7281 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7282
7283 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7284 return function_value_ms_64 (orig_mode, mode);
7285 else if (TARGET_64BIT)
7286 return function_value_64 (orig_mode, mode, valtype);
7287 else
7288 return function_value_32 (orig_mode, mode, fntype, fn);
7289 }
7290
7291 static rtx
7292 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7293 bool outgoing ATTRIBUTE_UNUSED)
7294 {
7295 enum machine_mode mode, orig_mode;
7296
7297 orig_mode = TYPE_MODE (valtype);
7298 mode = type_natural_mode (valtype, NULL);
7299 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7300 }
7301
7302 /* Pointer function arguments and return values are promoted to Pmode. */
7303
7304 static enum machine_mode
7305 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7306 int *punsignedp, const_tree fntype,
7307 int for_return)
7308 {
7309 if (type != NULL_TREE && POINTER_TYPE_P (type))
7310 {
7311 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7312 return Pmode;
7313 }
7314 return default_promote_function_mode (type, mode, punsignedp, fntype,
7315 for_return);
7316 }
7317
7318 rtx
7319 ix86_libcall_value (enum machine_mode mode)
7320 {
7321 return ix86_function_value_1 (NULL, NULL, mode, mode);
7322 }
7323
7324 /* Return true iff type is returned in memory. */
7325
7326 static bool ATTRIBUTE_UNUSED
7327 return_in_memory_32 (const_tree type, enum machine_mode mode)
7328 {
7329 HOST_WIDE_INT size;
7330
7331 if (mode == BLKmode)
7332 return true;
7333
7334 size = int_size_in_bytes (type);
7335
7336 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7337 return false;
7338
7339 if (VECTOR_MODE_P (mode) || mode == TImode)
7340 {
7341 /* User-created vectors small enough to fit in EAX. */
7342 if (size < 8)
7343 return false;
7344
7345 /* MMX/3dNow values are returned in MM0,
7346 except when it doesn't exist or the ABI prescribes otherwise. */
7347 if (size == 8)
7348 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7349
7350 /* SSE values are returned in XMM0, except when it doesn't exist. */
7351 if (size == 16)
7352 return !TARGET_SSE;
7353
7354 /* AVX values are returned in YMM0, except when it doesn't exist. */
7355 if (size == 32)
7356 return !TARGET_AVX;
7357 }
7358
7359 if (mode == XFmode)
7360 return false;
7361
7362 if (size > 12)
7363 return true;
7364
7365 /* OImode shouldn't be used directly. */
7366 gcc_assert (mode != OImode);
7367
7368 return false;
7369 }
7370
7371 static bool ATTRIBUTE_UNUSED
7372 return_in_memory_64 (const_tree type, enum machine_mode mode)
7373 {
7374 int needed_intregs, needed_sseregs;
7375 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7376 }
7377
7378 static bool ATTRIBUTE_UNUSED
7379 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7380 {
7381 HOST_WIDE_INT size = int_size_in_bytes (type);
7382
7383 /* __m128 is returned in xmm0. */
7384 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7385 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7386 return false;
7387
7388 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7389 return size != 1 && size != 2 && size != 4 && size != 8;
7390 }
7391
7392 static bool
7393 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7394 {
7395 #ifdef SUBTARGET_RETURN_IN_MEMORY
7396 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7397 #else
7398 const enum machine_mode mode = type_natural_mode (type, NULL);
7399
7400 if (TARGET_64BIT)
7401 {
7402 if (ix86_function_type_abi (fntype) == MS_ABI)
7403 return return_in_memory_ms_64 (type, mode);
7404 else
7405 return return_in_memory_64 (type, mode);
7406 }
7407 else
7408 return return_in_memory_32 (type, mode);
7409 #endif
7410 }
7411
7412 /* When returning SSE vector types, we have a choice of either
7413 (1) being abi incompatible with a -march switch, or
7414 (2) generating an error.
7415 Given no good solution, I think the safest thing is one warning.
7416 The user won't be able to use -Werror, but....
7417
7418 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7419 called in response to actually generating a caller or callee that
7420 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7421 via aggregate_value_p for general type probing from tree-ssa. */
7422
7423 static rtx
7424 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7425 {
7426 static bool warnedsse, warnedmmx;
7427
7428 if (!TARGET_64BIT && type)
7429 {
7430 /* Look at the return type of the function, not the function type. */
7431 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7432
7433 if (!TARGET_SSE && !warnedsse)
7434 {
7435 if (mode == TImode
7436 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7437 {
7438 warnedsse = true;
7439 warning (0, "SSE vector return without SSE enabled "
7440 "changes the ABI");
7441 }
7442 }
7443
7444 if (!TARGET_MMX && !warnedmmx)
7445 {
7446 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7447 {
7448 warnedmmx = true;
7449 warning (0, "MMX vector return without MMX enabled "
7450 "changes the ABI");
7451 }
7452 }
7453 }
7454
7455 return NULL;
7456 }
7457
7458 \f
7459 /* Create the va_list data type. */
7460
7461 /* Returns the calling convention specific va_list data type.
7462 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
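/* For the 64-bit SysV ABI the record built below corresponds roughly to

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];  */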
7463
7464 static tree
7465 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7466 {
7467 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7468
7469 /* For i386 we use a plain pointer to the argument area. */
7470 if (!TARGET_64BIT || abi == MS_ABI)
7471 return build_pointer_type (char_type_node);
7472
7473 record = lang_hooks.types.make_type (RECORD_TYPE);
7474 type_decl = build_decl (BUILTINS_LOCATION,
7475 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7476
7477 f_gpr = build_decl (BUILTINS_LOCATION,
7478 FIELD_DECL, get_identifier ("gp_offset"),
7479 unsigned_type_node);
7480 f_fpr = build_decl (BUILTINS_LOCATION,
7481 FIELD_DECL, get_identifier ("fp_offset"),
7482 unsigned_type_node);
7483 f_ovf = build_decl (BUILTINS_LOCATION,
7484 FIELD_DECL, get_identifier ("overflow_arg_area"),
7485 ptr_type_node);
7486 f_sav = build_decl (BUILTINS_LOCATION,
7487 FIELD_DECL, get_identifier ("reg_save_area"),
7488 ptr_type_node);
7489
7490 va_list_gpr_counter_field = f_gpr;
7491 va_list_fpr_counter_field = f_fpr;
7492
7493 DECL_FIELD_CONTEXT (f_gpr) = record;
7494 DECL_FIELD_CONTEXT (f_fpr) = record;
7495 DECL_FIELD_CONTEXT (f_ovf) = record;
7496 DECL_FIELD_CONTEXT (f_sav) = record;
7497
7498 TYPE_STUB_DECL (record) = type_decl;
7499 TYPE_NAME (record) = type_decl;
7500 TYPE_FIELDS (record) = f_gpr;
7501 DECL_CHAIN (f_gpr) = f_fpr;
7502 DECL_CHAIN (f_fpr) = f_ovf;
7503 DECL_CHAIN (f_ovf) = f_sav;
7504
7505 layout_type (record);
7506
7507 /* The correct type is an array type of one element. */
7508 return build_array_type (record, build_index_type (size_zero_node));
7509 }
7510
7511 /* Setup the builtin va_list data type and for 64-bit the additional
7512 calling convention specific va_list data types. */
7513
7514 static tree
7515 ix86_build_builtin_va_list (void)
7516 {
7517 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7518
7519 /* Initialize abi specific va_list builtin types. */
7520 if (TARGET_64BIT)
7521 {
7522 tree t;
7523 if (ix86_abi == MS_ABI)
7524 {
7525 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7526 if (TREE_CODE (t) != RECORD_TYPE)
7527 t = build_variant_type_copy (t);
7528 sysv_va_list_type_node = t;
7529 }
7530 else
7531 {
7532 t = ret;
7533 if (TREE_CODE (t) != RECORD_TYPE)
7534 t = build_variant_type_copy (t);
7535 sysv_va_list_type_node = t;
7536 }
7537 if (ix86_abi != MS_ABI)
7538 {
7539 t = ix86_build_builtin_va_list_abi (MS_ABI);
7540 if (TREE_CODE (t) != RECORD_TYPE)
7541 t = build_variant_type_copy (t);
7542 ms_va_list_type_node = t;
7543 }
7544 else
7545 {
7546 t = ret;
7547 if (TREE_CODE (t) != RECORD_TYPE)
7548 t = build_variant_type_copy (t);
7549 ms_va_list_type_node = t;
7550 }
7551 }
7552
7553 return ret;
7554 }
7555
7556 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
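/* The 64-bit SysV register save area filled in below holds the integer
   argument registers first (X86_64_REGPARM_MAX slots of UNITS_PER_WORD
   bytes) followed by the SSE argument registers (X86_64_SSE_REGPARM_MAX
   slots of 16 bytes each); with the usual values of 6 and 8 that is
   48 + 128 = 176 bytes.  */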
7557
7558 static void
7559 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7560 {
7561 rtx save_area, mem;
7562 alias_set_type set;
7563 int i, max;
7564
7565 /* GPR size of varargs save area. */
7566 if (cfun->va_list_gpr_size)
7567 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7568 else
7569 ix86_varargs_gpr_size = 0;
7570
7571 /* FPR size of varargs save area. We don't need it if we don't pass
7572 anything in SSE registers. */
7573 if (TARGET_SSE && cfun->va_list_fpr_size)
7574 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7575 else
7576 ix86_varargs_fpr_size = 0;
7577
7578 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7579 return;
7580
7581 save_area = frame_pointer_rtx;
7582 set = get_varargs_alias_set ();
7583
7584 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7585 if (max > X86_64_REGPARM_MAX)
7586 max = X86_64_REGPARM_MAX;
7587
7588 for (i = cum->regno; i < max; i++)
7589 {
7590 mem = gen_rtx_MEM (Pmode,
7591 plus_constant (save_area, i * UNITS_PER_WORD));
7592 MEM_NOTRAP_P (mem) = 1;
7593 set_mem_alias_set (mem, set);
7594 emit_move_insn (mem, gen_rtx_REG (Pmode,
7595 x86_64_int_parameter_registers[i]));
7596 }
7597
7598 if (ix86_varargs_fpr_size)
7599 {
7600 enum machine_mode smode;
7601 rtx label, test;
7602
7603 /* Now emit code to save SSE registers. The AX parameter contains the number
7604 of SSE parameter registers used to call this function, though all we
7605 actually check here is the zero/non-zero status. */
7606
7607 label = gen_label_rtx ();
7608 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7609 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7610 label));
7611
7612 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7613 we used movdqa (i.e. TImode) instead? Perhaps even better would
7614 be if we could determine the real mode of the data, via a hook
7615 into pass_stdarg. Ignore all that for now. */
7616 smode = V4SFmode;
7617 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7618 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7619
7620 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7621 if (max > X86_64_SSE_REGPARM_MAX)
7622 max = X86_64_SSE_REGPARM_MAX;
7623
7624 for (i = cum->sse_regno; i < max; ++i)
7625 {
7626 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7627 mem = gen_rtx_MEM (smode, mem);
7628 MEM_NOTRAP_P (mem) = 1;
7629 set_mem_alias_set (mem, set);
7630 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7631
7632 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7633 }
7634
7635 emit_label (label);
7636 }
7637 }
7638
7639 static void
7640 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7641 {
7642 alias_set_type set = get_varargs_alias_set ();
7643 int i;
7644
7645 /* Reset to zero, as there might have been a SysV va_arg used
7646 before. */
7647 ix86_varargs_gpr_size = 0;
7648 ix86_varargs_fpr_size = 0;
7649
7650 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7651 {
7652 rtx reg, mem;
7653
7654 mem = gen_rtx_MEM (Pmode,
7655 plus_constant (virtual_incoming_args_rtx,
7656 i * UNITS_PER_WORD));
7657 MEM_NOTRAP_P (mem) = 1;
7658 set_mem_alias_set (mem, set);
7659
7660 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7661 emit_move_insn (mem, reg);
7662 }
7663 }
7664
7665 static void
7666 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7667 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7668 int no_rtl)
7669 {
7670 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7671 CUMULATIVE_ARGS next_cum;
7672 tree fntype;
7673
7674 /* This argument doesn't appear to be used anymore, which is good,
7675 because the old code here didn't suppress rtl generation. */
7676 gcc_assert (!no_rtl);
7677
7678 if (!TARGET_64BIT)
7679 return;
7680
7681 fntype = TREE_TYPE (current_function_decl);
7682
7683 /* For varargs, we do not want to skip the dummy va_dcl argument.
7684 For stdargs, we do want to skip the last named argument. */
7685 next_cum = *cum;
7686 if (stdarg_p (fntype))
7687 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7688 true);
7689
7690 if (cum->call_abi == MS_ABI)
7691 setup_incoming_varargs_ms_64 (&next_cum);
7692 else
7693 setup_incoming_varargs_64 (&next_cum);
7694 }
7695
7696 /* Checks whether TYPE is a va_list of the plain char * kind. */
7697
7698 static bool
7699 is_va_list_char_pointer (tree type)
7700 {
7701 tree canonic;
7702
7703 /* For 32-bit it is always true. */
7704 if (!TARGET_64BIT)
7705 return true;
7706 canonic = ix86_canonical_va_list_type (type);
7707 return (canonic == ms_va_list_type_node
7708 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7709 }
7710
7711 /* Implement va_start. */
7712
7713 static void
7714 ix86_va_start (tree valist, rtx nextarg)
7715 {
7716 HOST_WIDE_INT words, n_gpr, n_fpr;
7717 tree f_gpr, f_fpr, f_ovf, f_sav;
7718 tree gpr, fpr, ovf, sav, t;
7719 tree type;
7720 rtx ovf_rtx;
7721
7722 if (flag_split_stack
7723 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7724 {
7725 unsigned int scratch_regno;
7726
7727 /* When we are splitting the stack, we can't refer to the stack
7728 arguments using internal_arg_pointer, because they may be on
7729 the old stack. The split stack prologue will arrange to
7730 leave a pointer to the old stack arguments in a scratch
7731 register, which we here copy to a pseudo-register. The split
7732 stack prologue can't set the pseudo-register directly because
7733 it (the prologue) runs before any registers have been saved. */
7734
7735 scratch_regno = split_stack_prologue_scratch_regno ();
7736 if (scratch_regno != INVALID_REGNUM)
7737 {
7738 rtx reg, seq;
7739
7740 reg = gen_reg_rtx (Pmode);
7741 cfun->machine->split_stack_varargs_pointer = reg;
7742
7743 start_sequence ();
7744 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7745 seq = get_insns ();
7746 end_sequence ();
7747
7748 push_topmost_sequence ();
7749 emit_insn_after (seq, entry_of_function ());
7750 pop_topmost_sequence ();
7751 }
7752 }
7753
7754 /* Only 64bit target needs something special. */
7755 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7756 {
7757 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7758 std_expand_builtin_va_start (valist, nextarg);
7759 else
7760 {
7761 rtx va_r, next;
7762
7763 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7764 next = expand_binop (ptr_mode, add_optab,
7765 cfun->machine->split_stack_varargs_pointer,
7766 crtl->args.arg_offset_rtx,
7767 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7768 convert_move (va_r, next, 0);
7769 }
7770 return;
7771 }
7772
7773 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7774 f_fpr = DECL_CHAIN (f_gpr);
7775 f_ovf = DECL_CHAIN (f_fpr);
7776 f_sav = DECL_CHAIN (f_ovf);
7777
7778 valist = build_simple_mem_ref (valist);
7779 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7780 /* The following should be folded into the MEM_REF offset. */
7781 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7782 f_gpr, NULL_TREE);
7783 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7784 f_fpr, NULL_TREE);
7785 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7786 f_ovf, NULL_TREE);
7787 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7788 f_sav, NULL_TREE);
7789
7790 /* Count number of gp and fp argument registers used. */
7791 words = crtl->args.info.words;
7792 n_gpr = crtl->args.info.regno;
7793 n_fpr = crtl->args.info.sse_regno;
7794
7795 if (cfun->va_list_gpr_size)
7796 {
7797 type = TREE_TYPE (gpr);
7798 t = build2 (MODIFY_EXPR, type,
7799 gpr, build_int_cst (type, n_gpr * 8));
7800 TREE_SIDE_EFFECTS (t) = 1;
7801 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7802 }
7803
7804 if (TARGET_SSE && cfun->va_list_fpr_size)
7805 {
7806 type = TREE_TYPE (fpr);
7807 t = build2 (MODIFY_EXPR, type, fpr,
7808 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7809 TREE_SIDE_EFFECTS (t) = 1;
7810 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7811 }
7812
7813 /* Find the overflow area. */
7814 type = TREE_TYPE (ovf);
7815 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7816 ovf_rtx = crtl->args.internal_arg_pointer;
7817 else
7818 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7819 t = make_tree (type, ovf_rtx);
7820 if (words != 0)
7821 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7822 t = build2 (MODIFY_EXPR, type, ovf, t);
7823 TREE_SIDE_EFFECTS (t) = 1;
7824 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7825
7826 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7827 {
7828 /* Find the register save area.
7829 The function prologue saves it right above the stack frame. */
7830 type = TREE_TYPE (sav);
7831 t = make_tree (type, frame_pointer_rtx);
7832 if (!ix86_varargs_gpr_size)
7833 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7834 t = build2 (MODIFY_EXPR, type, sav, t);
7835 TREE_SIDE_EFFECTS (t) = 1;
7836 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7837 }
7838 }
7839
7840 /* Implement va_arg. */
7841
7842 static tree
7843 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7844 gimple_seq *post_p)
7845 {
7846 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7847 tree f_gpr, f_fpr, f_ovf, f_sav;
7848 tree gpr, fpr, ovf, sav, t;
7849 int size, rsize;
7850 tree lab_false, lab_over = NULL_TREE;
7851 tree addr, t2;
7852 rtx container;
7853 int indirect_p = 0;
7854 tree ptrtype;
7855 enum machine_mode nat_mode;
7856 unsigned int arg_boundary;
7857
7858 /* Only 64bit target needs something special. */
7859 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7860 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7861
7862 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7863 f_fpr = DECL_CHAIN (f_gpr);
7864 f_ovf = DECL_CHAIN (f_fpr);
7865 f_sav = DECL_CHAIN (f_ovf);
7866
7867 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7868 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7869 valist = build_va_arg_indirect_ref (valist);
7870 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7871 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7872 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7873
7874 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7875 if (indirect_p)
7876 type = build_pointer_type (type);
7877 size = int_size_in_bytes (type);
7878 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7879
7880 nat_mode = type_natural_mode (type, NULL);
7881 switch (nat_mode)
7882 {
7883 case V8SFmode:
7884 case V8SImode:
7885 case V32QImode:
7886 case V16HImode:
7887 case V4DFmode:
7888 case V4DImode:
7889 /* Unnamed 256bit vector mode parameters are passed on stack. */
7890 if (!TARGET_64BIT_MS_ABI)
7891 {
7892 container = NULL;
7893 break;
7894 }
7895
7896 default:
7897 container = construct_container (nat_mode, TYPE_MODE (type),
7898 type, 0, X86_64_REGPARM_MAX,
7899 X86_64_SSE_REGPARM_MAX, intreg,
7900 0);
7901 break;
7902 }
7903
7904 /* Pull the value out of the saved registers. */
7905
7906 addr = create_tmp_var (ptr_type_node, "addr");
7907
7908 if (container)
7909 {
7910 int needed_intregs, needed_sseregs;
7911 bool need_temp;
7912 tree int_addr, sse_addr;
7913
7914 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7915 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7916
7917 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7918
7919 need_temp = (!REG_P (container)
7920 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7921 || TYPE_ALIGN (type) > 128));
7922
7923 /* In case we are passing a structure, verify that it is a consecutive block
7924 on the register save area. If not, we need to do moves. */
7925 if (!need_temp && !REG_P (container))
7926 {
7927 /* Verify that all registers are strictly consecutive. */
7928 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7929 {
7930 int i;
7931
7932 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7933 {
7934 rtx slot = XVECEXP (container, 0, i);
7935 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7936 || INTVAL (XEXP (slot, 1)) != i * 16)
7937 need_temp = 1;
7938 }
7939 }
7940 else
7941 {
7942 int i;
7943
7944 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7945 {
7946 rtx slot = XVECEXP (container, 0, i);
7947 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7948 || INTVAL (XEXP (slot, 1)) != i * 8)
7949 need_temp = 1;
7950 }
7951 }
7952 }
7953 if (!need_temp)
7954 {
7955 int_addr = addr;
7956 sse_addr = addr;
7957 }
7958 else
7959 {
7960 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7961 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7962 }
7963
7964 /* First ensure that we fit completely in registers. */
7965 if (needed_intregs)
7966 {
7967 t = build_int_cst (TREE_TYPE (gpr),
7968 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7969 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7970 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7971 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7972 gimplify_and_add (t, pre_p);
7973 }
7974 if (needed_sseregs)
7975 {
7976 t = build_int_cst (TREE_TYPE (fpr),
7977 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7978 + X86_64_REGPARM_MAX * 8);
7979 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7980 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7981 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7982 gimplify_and_add (t, pre_p);
7983 }
7984
7985 /* Compute index to start of area used for integer regs. */
7986 if (needed_intregs)
7987 {
7988 /* int_addr = gpr + sav; */
7989 t = fold_build_pointer_plus (sav, gpr);
7990 gimplify_assign (int_addr, t, pre_p);
7991 }
7992 if (needed_sseregs)
7993 {
7994 /* sse_addr = fpr + sav; */
7995 t = fold_build_pointer_plus (sav, fpr);
7996 gimplify_assign (sse_addr, t, pre_p);
7997 }
7998 if (need_temp)
7999 {
8000 int i, prev_size = 0;
8001 tree temp = create_tmp_var (type, "va_arg_tmp");
8002
8003 /* addr = &temp; */
8004 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8005 gimplify_assign (addr, t, pre_p);
8006
8007 for (i = 0; i < XVECLEN (container, 0); i++)
8008 {
8009 rtx slot = XVECEXP (container, 0, i);
8010 rtx reg = XEXP (slot, 0);
8011 enum machine_mode mode = GET_MODE (reg);
8012 tree piece_type;
8013 tree addr_type;
8014 tree daddr_type;
8015 tree src_addr, src;
8016 int src_offset;
8017 tree dest_addr, dest;
8018 int cur_size = GET_MODE_SIZE (mode);
8019
8020 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8021 prev_size = INTVAL (XEXP (slot, 1));
8022 if (prev_size + cur_size > size)
8023 {
8024 cur_size = size - prev_size;
8025 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8026 if (mode == BLKmode)
8027 mode = QImode;
8028 }
8029 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8030 if (mode == GET_MODE (reg))
8031 addr_type = build_pointer_type (piece_type);
8032 else
8033 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8034 true);
8035 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8036 true);
8037
8038 if (SSE_REGNO_P (REGNO (reg)))
8039 {
8040 src_addr = sse_addr;
8041 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8042 }
8043 else
8044 {
8045 src_addr = int_addr;
8046 src_offset = REGNO (reg) * 8;
8047 }
8048 src_addr = fold_convert (addr_type, src_addr);
8049 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8050
8051 dest_addr = fold_convert (daddr_type, addr);
8052 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8053 if (cur_size == GET_MODE_SIZE (mode))
8054 {
8055 src = build_va_arg_indirect_ref (src_addr);
8056 dest = build_va_arg_indirect_ref (dest_addr);
8057
8058 gimplify_assign (dest, src, pre_p);
8059 }
8060 else
8061 {
8062 tree copy
8063 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8064 3, dest_addr, src_addr,
8065 size_int (cur_size));
8066 gimplify_and_add (copy, pre_p);
8067 }
8068 prev_size += cur_size;
8069 }
8070 }
8071
8072 if (needed_intregs)
8073 {
8074 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8075 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8076 gimplify_assign (gpr, t, pre_p);
8077 }
8078
8079 if (needed_sseregs)
8080 {
8081 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8082 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8083 gimplify_assign (fpr, t, pre_p);
8084 }
8085
8086 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8087
8088 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8089 }
8090
8091 /* ... otherwise out of the overflow area. */
8092
8093 /* When the caller aligns a parameter on the stack, a parameter whose
8094 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT will only be
8095 aligned to MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
8096 with the caller. */
8097 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8098 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8099 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8100
8101 /* Care for on-stack alignment if needed. */
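/* The else branch below rounds the overflow area pointer up, i.e.
   t = (ovf + align - 1) & -align.  */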
8102 if (arg_boundary <= 64 || size == 0)
8103 t = ovf;
8104 else
8105 {
8106 HOST_WIDE_INT align = arg_boundary / 8;
8107 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8108 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8109 build_int_cst (TREE_TYPE (t), -align));
8110 }
8111
8112 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8113 gimplify_assign (addr, t, pre_p);
8114
8115 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8116 gimplify_assign (unshare_expr (ovf), t, pre_p);
8117
8118 if (container)
8119 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8120
8121 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8122 addr = fold_convert (ptrtype, addr);
8123
8124 if (indirect_p)
8125 addr = build_va_arg_indirect_ref (addr);
8126 return build_va_arg_indirect_ref (addr);
8127 }
8128 \f
8129 /* Return true if OPNUM's MEM should be matched
8130 in movabs* patterns. */
8131
8132 bool
8133 ix86_check_movabs (rtx insn, int opnum)
8134 {
8135 rtx set, mem;
8136
8137 set = PATTERN (insn);
8138 if (GET_CODE (set) == PARALLEL)
8139 set = XVECEXP (set, 0, 0);
8140 gcc_assert (GET_CODE (set) == SET);
8141 mem = XEXP (set, opnum);
8142 while (GET_CODE (mem) == SUBREG)
8143 mem = SUBREG_REG (mem);
8144 gcc_assert (MEM_P (mem));
8145 return volatile_ok || !MEM_VOLATILE_P (mem);
8146 }
8147 \f
8148 /* Initialize the table of extra 80387 mathematical constants. */
8149
8150 static void
8151 init_ext_80387_constants (void)
8152 {
8153 static const char * cst[5] =
8154 {
8155 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8156 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8157 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8158 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8159 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8160 };
8161 int i;
8162
8163 for (i = 0; i < 5; i++)
8164 {
8165 real_from_string (&ext_80387_constants_table[i], cst[i]);
8166 /* Ensure each constant is rounded to XFmode precision. */
8167 real_convert (&ext_80387_constants_table[i],
8168 XFmode, &ext_80387_constants_table[i]);
8169 }
8170
8171 ext_80387_constants_init = 1;
8172 }
8173
8174 /* Return non-zero if the constant is something that
8175 can be loaded with a special instruction. */
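/* The return value mirrors standard_80387_constant_opcode below: 1 is fldz,
   2 is fld1, 3..7 are fldlg2/fldln2/fldl2e/fldl2t/fldpi, 8 and 9 are -0.0
   and -1.0 (split into fldz/fld1 followed by fchs), 0 means no special
   instruction applies, and -1 means X is not an X87 CONST_DOUBLE at all.  */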
8176
8177 int
8178 standard_80387_constant_p (rtx x)
8179 {
8180 enum machine_mode mode = GET_MODE (x);
8181
8182 REAL_VALUE_TYPE r;
8183
8184 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8185 return -1;
8186
8187 if (x == CONST0_RTX (mode))
8188 return 1;
8189 if (x == CONST1_RTX (mode))
8190 return 2;
8191
8192 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8193
8194 /* For XFmode constants, try to find a special 80387 instruction when
8195 optimizing for size or on those CPUs that benefit from them. */
8196 if (mode == XFmode
8197 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8198 {
8199 int i;
8200
8201 if (! ext_80387_constants_init)
8202 init_ext_80387_constants ();
8203
8204 for (i = 0; i < 5; i++)
8205 if (real_identical (&r, &ext_80387_constants_table[i]))
8206 return i + 3;
8207 }
8208
8209 /* Load of the constant -0.0 or -1.0 will be split as
8210 fldz;fchs or fld1;fchs sequence. */
8211 if (real_isnegzero (&r))
8212 return 8;
8213 if (real_identical (&r, &dconstm1))
8214 return 9;
8215
8216 return 0;
8217 }
8218
8219 /* Return the opcode of the special instruction to be used to load
8220 the constant X. */
8221
8222 const char *
8223 standard_80387_constant_opcode (rtx x)
8224 {
8225 switch (standard_80387_constant_p (x))
8226 {
8227 case 1:
8228 return "fldz";
8229 case 2:
8230 return "fld1";
8231 case 3:
8232 return "fldlg2";
8233 case 4:
8234 return "fldln2";
8235 case 5:
8236 return "fldl2e";
8237 case 6:
8238 return "fldl2t";
8239 case 7:
8240 return "fldpi";
8241 case 8:
8242 case 9:
8243 return "#";
8244 default:
8245 gcc_unreachable ();
8246 }
8247 }
8248
8249 /* Return the CONST_DOUBLE representing the 80387 constant that is
8250 loaded by the specified special instruction. The argument IDX
8251 matches the return value from standard_80387_constant_p. */
8252
8253 rtx
8254 standard_80387_constant_rtx (int idx)
8255 {
8256 int i;
8257
8258 if (! ext_80387_constants_init)
8259 init_ext_80387_constants ();
8260
8261 switch (idx)
8262 {
8263 case 3:
8264 case 4:
8265 case 5:
8266 case 6:
8267 case 7:
8268 i = idx - 3;
8269 break;
8270
8271 default:
8272 gcc_unreachable ();
8273 }
8274
8275 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8276 XFmode);
8277 }
8278
8279 /* Return 1 if X is all 0s and 2 if X is all 1s
8280 in a supported SSE/AVX vector mode. */
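/* All-zero vectors can be loaded with a single xorps/pxor and all-ones
   vectors with pcmpeqd; see standard_sse_constant_opcode below.  */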
8281
8282 int
8283 standard_sse_constant_p (rtx x)
8284 {
8285 enum machine_mode mode = GET_MODE (x);
8286
8287 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8288 return 1;
8289 if (vector_all_ones_operand (x, mode))
8290 switch (mode)
8291 {
8292 case V16QImode:
8293 case V8HImode:
8294 case V4SImode:
8295 case V2DImode:
8296 if (TARGET_SSE2)
8297 return 2;
8298 case V32QImode:
8299 case V16HImode:
8300 case V8SImode:
8301 case V4DImode:
8302 if (TARGET_AVX2)
8303 return 2;
8304 default:
8305 break;
8306 }
8307
8308 return 0;
8309 }
8310
8311 /* Return the opcode of the special instruction to be used to load
8312 the constant X. */
8313
8314 const char *
8315 standard_sse_constant_opcode (rtx insn, rtx x)
8316 {
8317 switch (standard_sse_constant_p (x))
8318 {
8319 case 1:
8320 switch (get_attr_mode (insn))
8321 {
8322 case MODE_TI:
8323 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8324 return "%vpxor\t%0, %d0";
8325 case MODE_V2DF:
8326 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8327 return "%vxorpd\t%0, %d0";
8328 case MODE_V4SF:
8329 return "%vxorps\t%0, %d0";
8330
8331 case MODE_OI:
8332 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8333 return "vpxor\t%x0, %x0, %x0";
8334 case MODE_V4DF:
8335 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8336 return "vxorpd\t%x0, %x0, %x0";
8337 case MODE_V8SF:
8338 return "vxorps\t%x0, %x0, %x0";
8339
8340 default:
8341 break;
8342 }
8343
8344 case 2:
8345 if (TARGET_AVX)
8346 return "vpcmpeqd\t%0, %0, %0";
8347 else
8348 return "pcmpeqd\t%0, %0";
8349
8350 default:
8351 break;
8352 }
8353 gcc_unreachable ();
8354 }
8355
8356 /* Returns true if OP contains a symbol reference. */
8357
8358 bool
8359 symbolic_reference_mentioned_p (rtx op)
8360 {
8361 const char *fmt;
8362 int i;
8363
8364 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8365 return true;
8366
8367 fmt = GET_RTX_FORMAT (GET_CODE (op));
8368 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8369 {
8370 if (fmt[i] == 'E')
8371 {
8372 int j;
8373
8374 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8375 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8376 return true;
8377 }
8378
8379 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8380 return true;
8381 }
8382
8383 return false;
8384 }
8385
8386 /* Return true if it is appropriate to emit `ret' instructions in the
8387 body of a function. Do this only if the epilogue is simple, needing a
8388 couple of insns. Prior to reloading, we can't tell how many registers
8389 must be saved, so return false then. Return false if there is no frame
8390 marker to de-allocate. */
8391
8392 bool
8393 ix86_can_use_return_insn_p (void)
8394 {
8395 struct ix86_frame frame;
8396
8397 if (! reload_completed || frame_pointer_needed)
8398 return 0;
8399
8400 /* Don't allow more than 32k pop, since that's all we can do
8401 with one instruction. */
8402 if (crtl->args.pops_args && crtl->args.size >= 32768)
8403 return 0;
8404
8405 ix86_compute_frame_layout (&frame);
8406 return (frame.stack_pointer_offset == UNITS_PER_WORD
8407 && (frame.nregs + frame.nsseregs) == 0);
8408 }
8409 \f
8410 /* Value should be nonzero if functions must have frame pointers.
8411 Zero means the frame pointer need not be set up (and parms may
8412 be accessed via the stack pointer) in functions that seem suitable. */
8413
8414 static bool
8415 ix86_frame_pointer_required (void)
8416 {
8417 /* If we accessed previous frames, then the generated code expects
8418 to be able to access the saved ebp value in our frame. */
8419 if (cfun->machine->accesses_prev_frame)
8420 return true;
8421
8422 /* Several x86 OSes need a frame pointer for other reasons,
8423 usually pertaining to setjmp. */
8424 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8425 return true;
8426
8427 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8428 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8429 return true;
8430
8431 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8432 turns off the frame pointer by default. Turn it back on now if
8433 we've not got a leaf function. */
8434 if (TARGET_OMIT_LEAF_FRAME_POINTER
8435 && (!current_function_is_leaf
8436 || ix86_current_function_calls_tls_descriptor))
8437 return true;
8438
8439 if (crtl->profile && !flag_fentry)
8440 return true;
8441
8442 return false;
8443 }
8444
8445 /* Record that the current function accesses previous call frames. */
8446
8447 void
8448 ix86_setup_frame_addresses (void)
8449 {
8450 cfun->machine->accesses_prev_frame = 1;
8451 }
8452 \f
8453 #ifndef USE_HIDDEN_LINKONCE
8454 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8455 # define USE_HIDDEN_LINKONCE 1
8456 # else
8457 # define USE_HIDDEN_LINKONCE 0
8458 # endif
8459 #endif
8460
8461 static int pic_labels_used;
8462
8463 /* Fills in the label name that should be used for a pc thunk for
8464 the given register. */
8465
8466 static void
8467 get_pc_thunk_name (char name[32], unsigned int regno)
8468 {
8469 gcc_assert (!TARGET_64BIT);
8470
8471 if (USE_HIDDEN_LINKONCE)
8472 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8473 else
8474 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8475 }
8476
8477
8478 /* This function generates code for -fpic that loads %ebx with
8479 the return address of the caller and then returns. */
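/* With USE_HIDDEN_LINKONCE the thunk emitted for %ebx looks like

     __x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
   */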
8480
8481 static void
8482 ix86_code_end (void)
8483 {
8484 rtx xops[2];
8485 int regno;
8486
8487 for (regno = AX_REG; regno <= SP_REG; regno++)
8488 {
8489 char name[32];
8490 tree decl;
8491
8492 if (!(pic_labels_used & (1 << regno)))
8493 continue;
8494
8495 get_pc_thunk_name (name, regno);
8496
8497 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8498 get_identifier (name),
8499 build_function_type_list (void_type_node, NULL_TREE));
8500 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8501 NULL_TREE, void_type_node);
8502 TREE_PUBLIC (decl) = 1;
8503 TREE_STATIC (decl) = 1;
8504
8505 #if TARGET_MACHO
8506 if (TARGET_MACHO)
8507 {
8508 switch_to_section (darwin_sections[text_coal_section]);
8509 fputs ("\t.weak_definition\t", asm_out_file);
8510 assemble_name (asm_out_file, name);
8511 fputs ("\n\t.private_extern\t", asm_out_file);
8512 assemble_name (asm_out_file, name);
8513 putc ('\n', asm_out_file);
8514 ASM_OUTPUT_LABEL (asm_out_file, name);
8515 DECL_WEAK (decl) = 1;
8516 }
8517 else
8518 #endif
8519 if (USE_HIDDEN_LINKONCE)
8520 {
8521 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8522
8523 targetm.asm_out.unique_section (decl, 0);
8524 switch_to_section (get_named_section (decl, NULL, 0));
8525
8526 targetm.asm_out.globalize_label (asm_out_file, name);
8527 fputs ("\t.hidden\t", asm_out_file);
8528 assemble_name (asm_out_file, name);
8529 putc ('\n', asm_out_file);
8530 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8531 }
8532 else
8533 {
8534 switch_to_section (text_section);
8535 ASM_OUTPUT_LABEL (asm_out_file, name);
8536 }
8537
8538 DECL_INITIAL (decl) = make_node (BLOCK);
8539 current_function_decl = decl;
8540 init_function_start (decl);
8541 first_function_block_is_cold = false;
8542 /* Make sure unwind info is emitted for the thunk if needed. */
8543 final_start_function (emit_barrier (), asm_out_file, 1);
8544
8545 /* Pad stack IP move with 4 instructions (two NOPs count
8546 as one instruction). */
8547 if (TARGET_PAD_SHORT_FUNCTION)
8548 {
8549 int i = 8;
8550
8551 while (i--)
8552 fputs ("\tnop\n", asm_out_file);
8553 }
8554
8555 xops[0] = gen_rtx_REG (Pmode, regno);
8556 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8557 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8558 fputs ("\tret\n", asm_out_file);
8559 final_end_function ();
8560 init_insn_lengths ();
8561 free_after_compilation (cfun);
8562 set_cfun (NULL);
8563 current_function_decl = NULL;
8564 }
8565
8566 if (flag_split_stack)
8567 file_end_indicate_split_stack ();
8568 }
8569
8570 /* Emit code for the SET_GOT patterns. */
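/* For the common ia32 -fpic case the sequence emitted below is roughly

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
   */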
8571
8572 const char *
8573 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8574 {
8575 rtx xops[3];
8576
8577 xops[0] = dest;
8578
8579 if (TARGET_VXWORKS_RTP && flag_pic)
8580 {
8581 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8582 xops[2] = gen_rtx_MEM (Pmode,
8583 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8584 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8585
8586 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8587 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8588 an unadorned address. */
8589 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8590 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8591 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8592 return "";
8593 }
8594
8595 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8596
8597 if (!flag_pic)
8598 {
8599 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8600
8601 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8602
8603 #if TARGET_MACHO
8604 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8605 is what will be referenced by the Mach-O PIC subsystem. */
8606 if (!label)
8607 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8608 #endif
8609
8610 targetm.asm_out.internal_label (asm_out_file, "L",
8611 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8612 }
8613 else
8614 {
8615 char name[32];
8616 get_pc_thunk_name (name, REGNO (dest));
8617 pic_labels_used |= 1 << REGNO (dest);
8618
8619 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8620 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8621 output_asm_insn ("call\t%X2", xops);
8622 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8623 is what will be referenced by the Mach-O PIC subsystem. */
8624 #if TARGET_MACHO
8625 if (!label)
8626 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8627 else
8628 targetm.asm_out.internal_label (asm_out_file, "L",
8629 CODE_LABEL_NUMBER (label));
8630 #endif
8631 }
8632
8633 if (!TARGET_MACHO)
8634 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8635
8636 return "";
8637 }
8638
8639 /* Generate a "push" pattern for input ARG. */
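/* The pattern built below is (set (mem (pre_dec (reg sp))) arg) in Pmode,
   and the tracked CFA and stack pointer offsets are updated accordingly.  */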
8640
8641 static rtx
8642 gen_push (rtx arg)
8643 {
8644 struct machine_function *m = cfun->machine;
8645
8646 if (m->fs.cfa_reg == stack_pointer_rtx)
8647 m->fs.cfa_offset += UNITS_PER_WORD;
8648 m->fs.sp_offset += UNITS_PER_WORD;
8649
8650 return gen_rtx_SET (VOIDmode,
8651 gen_rtx_MEM (Pmode,
8652 gen_rtx_PRE_DEC (Pmode,
8653 stack_pointer_rtx)),
8654 arg);
8655 }
8656
8657 /* Generate a "pop" pattern for input ARG. */
8658
8659 static rtx
8660 gen_pop (rtx arg)
8661 {
8662 return gen_rtx_SET (VOIDmode,
8663 arg,
8664 gen_rtx_MEM (Pmode,
8665 gen_rtx_POST_INC (Pmode,
8666 stack_pointer_rtx)));
8667 }
8668
8669 /* Return >= 0 if there is an unused call-clobbered register available
8670 for the entire function. */
8671
8672 static unsigned int
8673 ix86_select_alt_pic_regnum (void)
8674 {
8675 if (current_function_is_leaf
8676 && !crtl->profile
8677 && !ix86_current_function_calls_tls_descriptor)
8678 {
8679 int i, drap;
8680 /* Can't use the same register for both PIC and DRAP. */
8681 if (crtl->drap_reg)
8682 drap = REGNO (crtl->drap_reg);
8683 else
8684 drap = -1;
8685 for (i = 2; i >= 0; --i)
8686 if (i != drap && !df_regs_ever_live_p (i))
8687 return i;
8688 }
8689
8690 return INVALID_REGNUM;
8691 }
8692
8693 /* Return TRUE if we need to save REGNO. */
8694
8695 static bool
8696 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8697 {
8698 if (pic_offset_table_rtx
8699 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8700 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8701 || crtl->profile
8702 || crtl->calls_eh_return
8703 || crtl->uses_const_pool))
8704 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8705
8706 if (crtl->calls_eh_return && maybe_eh_return)
8707 {
8708 unsigned i;
8709 for (i = 0; ; i++)
8710 {
8711 unsigned test = EH_RETURN_DATA_REGNO (i);
8712 if (test == INVALID_REGNUM)
8713 break;
8714 if (test == regno)
8715 return true;
8716 }
8717 }
8718
8719 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8720 return true;
8721
8722 return (df_regs_ever_live_p (regno)
8723 && !call_used_regs[regno]
8724 && !fixed_regs[regno]
8725 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8726 }
8727
8728 /* Return number of saved general purpose registers. */
8729
8730 static int
8731 ix86_nsaved_regs (void)
8732 {
8733 int nregs = 0;
8734 int regno;
8735
8736 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8737 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8738 nregs ++;
8739 return nregs;
8740 }
8741
8742 /* Return number of saved SSE registers. */
8743
8744 static int
8745 ix86_nsaved_sseregs (void)
8746 {
8747 int nregs = 0;
8748 int regno;
8749
8750 if (!TARGET_64BIT_MS_ABI)
8751 return 0;
8752 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8753 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8754 nregs ++;
8755 return nregs;
8756 }
8757
8758 /* Given FROM and TO register numbers, say whether this elimination is
8759 allowed. If stack alignment is needed, we can only replace argument
8760 pointer with hard frame pointer, or replace frame pointer with stack
8761 pointer. Otherwise, frame pointer elimination is automatically
8762 handled and all other eliminations are valid. */
8763
8764 static bool
8765 ix86_can_eliminate (const int from, const int to)
8766 {
8767 if (stack_realign_fp)
8768 return ((from == ARG_POINTER_REGNUM
8769 && to == HARD_FRAME_POINTER_REGNUM)
8770 || (from == FRAME_POINTER_REGNUM
8771 && to == STACK_POINTER_REGNUM));
8772 else
8773 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8774 }
8775
8776 /* Return the offset between two registers, one to be eliminated, and the other
8777 its replacement, at the start of a routine. */
8778
8779 HOST_WIDE_INT
8780 ix86_initial_elimination_offset (int from, int to)
8781 {
8782 struct ix86_frame frame;
8783 ix86_compute_frame_layout (&frame);
8784
8785 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8786 return frame.hard_frame_pointer_offset;
8787 else if (from == FRAME_POINTER_REGNUM
8788 && to == HARD_FRAME_POINTER_REGNUM)
8789 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8790 else
8791 {
8792 gcc_assert (to == STACK_POINTER_REGNUM);
8793
8794 if (from == ARG_POINTER_REGNUM)
8795 return frame.stack_pointer_offset;
8796
8797 gcc_assert (from == FRAME_POINTER_REGNUM);
8798 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8799 }
8800 }
8801
8802 /* In a dynamically-aligned function, we can't know the offset from
8803 stack pointer to frame pointer, so we must ensure that setjmp
8804 eliminates fp against the hard fp (%ebp) rather than trying to
8805 index from %esp up to the top of the frame across a gap that is
8806 of unknown (at compile-time) size. */
8807 static rtx
8808 ix86_builtin_setjmp_frame_value (void)
8809 {
8810 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8811 }
8812
8813 /* When using -fsplit-stack, the allocation routines set a field in
8814 the TCB to the bottom of the stack plus this much space, measured
8815 in bytes. */
8816
8817 #define SPLIT_STACK_AVAILABLE 256
8818
8819 /* Fill structure ix86_frame describing the frame of the current function. */
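/* Roughly, the offsets are accumulated downwards from the CFA in this
   order: return address, optional pushed static chain, saved frame
   pointer, GP register save area, (16-byte aligned) SSE register save
   area, va_arg save area, local variables, and finally the outgoing
   argument area.  */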
8820
8821 static void
8822 ix86_compute_frame_layout (struct ix86_frame *frame)
8823 {
8824 unsigned int stack_alignment_needed;
8825 HOST_WIDE_INT offset;
8826 unsigned int preferred_alignment;
8827 HOST_WIDE_INT size = get_frame_size ();
8828 HOST_WIDE_INT to_allocate;
8829
8830 frame->nregs = ix86_nsaved_regs ();
8831 frame->nsseregs = ix86_nsaved_sseregs ();
8832
8833 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8834 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8835
8836 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8837 except for function prologues and leaf functions. */
8838 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8839 && (!current_function_is_leaf || cfun->calls_alloca != 0
8840 || ix86_current_function_calls_tls_descriptor))
8841 {
8842 preferred_alignment = 16;
8843 stack_alignment_needed = 16;
8844 crtl->preferred_stack_boundary = 128;
8845 crtl->stack_alignment_needed = 128;
8846 }
8847
8848 gcc_assert (!size || stack_alignment_needed);
8849 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8850 gcc_assert (preferred_alignment <= stack_alignment_needed);
8851
8852 /* For SEH we have to limit the amount of code movement into the prologue.
8853 At present we do this via a BLOCKAGE, at which point there's very little
8854 scheduling that can be done, which means that there's very little point
8855 in doing anything except PUSHs. */
8856 if (TARGET_SEH)
8857 cfun->machine->use_fast_prologue_epilogue = false;
8858
8859 /* During the reload iteration the number of saved registers can change.
8860 Recompute the value as needed. Do not recompute when the number of registers
8861 didn't change, as reload makes multiple calls to this function and does not
8862 expect the decision to change within a single iteration. */
8863 else if (!optimize_function_for_size_p (cfun)
8864 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8865 {
8866 int count = frame->nregs;
8867 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8868
8869 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8870
8871 /* The fast prologue uses move instead of push to save registers. This
8872 is significantly longer, but also executes faster, as modern hardware
8873 can execute the moves in parallel but can't do that for push/pop.
8874 
8875 Be careful about choosing which prologue to emit: when the function takes
8876 many instructions to execute we may use the slow version, as well as when
8877 the function is known to be outside a hot spot (this is known with
8878 feedback only). Weight the size of the function by the number of registers
8879 to save, as it is cheap to use one or two push instructions but very
8880 slow to use many of them. */
8881 if (count)
8882 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8883 if (node->frequency < NODE_FREQUENCY_NORMAL
8884 || (flag_branch_probabilities
8885 && node->frequency < NODE_FREQUENCY_HOT))
8886 cfun->machine->use_fast_prologue_epilogue = false;
8887 else
8888 cfun->machine->use_fast_prologue_epilogue
8889 = !expensive_function_p (count);
8890 }
8891
8892 frame->save_regs_using_mov
8893 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8894 /* If static stack checking is enabled and done with probes,
8895 the registers need to be saved before allocating the frame. */
8896 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8897
8898 /* Skip return address. */
8899 offset = UNITS_PER_WORD;
8900
8901 /* Skip pushed static chain. */
8902 if (ix86_static_chain_on_stack)
8903 offset += UNITS_PER_WORD;
8904
8905 /* Skip saved base pointer. */
8906 if (frame_pointer_needed)
8907 offset += UNITS_PER_WORD;
8908 frame->hfp_save_offset = offset;
8909
8910 /* The traditional frame pointer location is at the top of the frame. */
8911 frame->hard_frame_pointer_offset = offset;
8912
8913 /* Register save area */
8914 offset += frame->nregs * UNITS_PER_WORD;
8915 frame->reg_save_offset = offset;
8916
8917 /* Align and set SSE register save area. */
8918 if (frame->nsseregs)
8919 {
8920 /* The only ABI that has saved SSE registers (Win64) also has a
8921 16-byte aligned default stack, and thus we don't need to be
8922 within the re-aligned local stack frame to save them. */
8923 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
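      /* Round up to the next multiple of 16; e.g. an offset of 40 becomes
	 48, while 48 stays 48.  */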
8924 offset = (offset + 16 - 1) & -16;
8925 offset += frame->nsseregs * 16;
8926 }
8927 frame->sse_reg_save_offset = offset;
8928
8929 /* The re-aligned stack starts here. Values before this point are not
8930 directly comparable with values below this point. In order to make
8931 sure that no value happens to be the same before and after, force
8932 the alignment computation below to add a non-zero value. */
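  /* E.g. with a 16-byte requirement an offset of 48 becomes 64 here,
     rather than staying 48 as a plain round-up would leave it.  */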
8933 if (stack_realign_fp)
8934 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8935
8936 /* Va-arg area */
8937 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8938 offset += frame->va_arg_size;
8939
8940 /* Align start of frame for local function. */
8941 if (stack_realign_fp
8942 || offset != frame->sse_reg_save_offset
8943 || size != 0
8944 || !current_function_is_leaf
8945 || cfun->calls_alloca
8946 || ix86_current_function_calls_tls_descriptor)
8947 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8948
8949 /* Frame pointer points here. */
8950 frame->frame_pointer_offset = offset;
8951
8952 offset += size;
8953
8954 /* Add outgoing arguments area. Can be skipped if we eliminated
8955 all the function calls as dead code.
8956 Skipping is however impossible when function calls alloca. Alloca
8957 expander assumes that last crtl->outgoing_args_size
8958 of stack frame are unused. */
8959 if (ACCUMULATE_OUTGOING_ARGS
8960 && (!current_function_is_leaf || cfun->calls_alloca
8961 || ix86_current_function_calls_tls_descriptor))
8962 {
8963 offset += crtl->outgoing_args_size;
8964 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8965 }
8966 else
8967 frame->outgoing_arguments_size = 0;
8968
8969 /* Align stack boundary. Only needed if we're calling another function
8970 or using alloca. */
8971 if (!current_function_is_leaf || cfun->calls_alloca
8972 || ix86_current_function_calls_tls_descriptor)
8973 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8974
8975 /* We've reached end of stack frame. */
8976 frame->stack_pointer_offset = offset;
8977
8978 /* Size prologue needs to allocate. */
8979 to_allocate = offset - frame->sse_reg_save_offset;
8980
8981 if ((!to_allocate && frame->nregs <= 1)
8982 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8983 frame->save_regs_using_mov = false;
8984
8985 if (ix86_using_red_zone ()
8986 && current_function_sp_is_unchanging
8987 && current_function_is_leaf
8988 && !ix86_current_function_calls_tls_descriptor)
8989 {
8990 frame->red_zone_size = to_allocate;
8991 if (frame->save_regs_using_mov)
8992 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8993 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8994 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8995 }
8996 else
8997 frame->red_zone_size = 0;
8998 frame->stack_pointer_offset -= frame->red_zone_size;
8999
9000 /* The SEH frame pointer location is near the bottom of the frame.
9001 This is enforced by the fact that the difference between the
9002 stack pointer and the frame pointer is limited to 240 bytes in
9003 the unwind data structure. */
9004 if (TARGET_SEH)
9005 {
9006 HOST_WIDE_INT diff;
9007
9008 /* If we can leave the frame pointer where it is, do so. */
9009 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9010 if (diff > 240 || (diff & 15) != 0)
9011 {
9012 /* Ideally we'd determine what portion of the local stack frame
9013 (within the constraint of the lowest 240) is most heavily used.
9014 But without that complication, simply bias the frame pointer
9015 by 128 bytes so as to maximize the amount of the local stack
9016 frame that is addressable with 8-bit offsets. */
9017 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9018 }
9019 }
9020 }
9021
9022 /* This is semi-inlined memory_address_length, but simplified
9023 since we know that we're always dealing with reg+offset, and
9024 to avoid having to create and discard all that rtl. */
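/* For example: (%eax) needs 0 extra bytes, 8(%eax) needs 1, 128(%eax)
   needs 4, (%ebp) needs a 1-byte displacement even for offset 0, and
   anything based on %esp or %r12 needs an additional SIB byte.  */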
9025
9026 static inline int
9027 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9028 {
9029 int len = 4;
9030
9031 if (offset == 0)
9032 {
9033 /* EBP and R13 cannot be encoded without an offset. */
9034 len = (regno == BP_REG || regno == R13_REG);
9035 }
9036 else if (IN_RANGE (offset, -128, 127))
9037 len = 1;
9038
9039 /* ESP and R12 must be encoded with a SIB byte. */
9040 if (regno == SP_REG || regno == R12_REG)
9041 len++;
9042
9043 return len;
9044 }
9045
9046 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9047 The valid base registers are taken from CFUN->MACHINE->FS. */
9048
9049 static rtx
9050 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9051 {
9052 const struct machine_function *m = cfun->machine;
9053 rtx base_reg = NULL;
9054 HOST_WIDE_INT base_offset = 0;
9055
9056 if (m->use_fast_prologue_epilogue)
9057 {
9058 /* Choose the base register most likely to allow the most scheduling
9059 opportunities. Generally FP is valid throughout the function,
9060 while DRAP must be reloaded within the epilogue. But choose either
9061 over the SP due to its increased encoding size. */
9062
9063 if (m->fs.fp_valid)
9064 {
9065 base_reg = hard_frame_pointer_rtx;
9066 base_offset = m->fs.fp_offset - cfa_offset;
9067 }
9068 else if (m->fs.drap_valid)
9069 {
9070 base_reg = crtl->drap_reg;
9071 base_offset = 0 - cfa_offset;
9072 }
9073 else if (m->fs.sp_valid)
9074 {
9075 base_reg = stack_pointer_rtx;
9076 base_offset = m->fs.sp_offset - cfa_offset;
9077 }
9078 }
9079 else
9080 {
9081 HOST_WIDE_INT toffset;
9082 int len = 16, tlen;
9083
9084 /* Choose the base register with the smallest address encoding.
9085 With a tie, choose FP > DRAP > SP. */
9086 if (m->fs.sp_valid)
9087 {
9088 base_reg = stack_pointer_rtx;
9089 base_offset = m->fs.sp_offset - cfa_offset;
9090 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9091 }
9092 if (m->fs.drap_valid)
9093 {
9094 toffset = 0 - cfa_offset;
9095 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9096 if (tlen <= len)
9097 {
9098 base_reg = crtl->drap_reg;
9099 base_offset = toffset;
9100 len = tlen;
9101 }
9102 }
9103 if (m->fs.fp_valid)
9104 {
9105 toffset = m->fs.fp_offset - cfa_offset;
9106 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9107 if (tlen <= len)
9108 {
9109 base_reg = hard_frame_pointer_rtx;
9110 base_offset = toffset;
9111 len = tlen;
9112 }
9113 }
9114 }
9115 gcc_assert (base_reg != NULL);
9116
9117 return plus_constant (base_reg, base_offset);
9118 }
9119
9120 /* Emit code to save registers in the prologue. */
9121
9122 static void
9123 ix86_emit_save_regs (void)
9124 {
9125 unsigned int regno;
9126 rtx insn;
9127
9128 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9129 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9130 {
9131 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9132 RTX_FRAME_RELATED_P (insn) = 1;
9133 }
9134 }
9135
9136 /* Emit a single register save at CFA - CFA_OFFSET. */
9137
9138 static void
9139 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9140 HOST_WIDE_INT cfa_offset)
9141 {
9142 struct machine_function *m = cfun->machine;
9143 rtx reg = gen_rtx_REG (mode, regno);
9144 rtx mem, addr, base, insn;
9145
9146 addr = choose_baseaddr (cfa_offset);
9147 mem = gen_frame_mem (mode, addr);
9148
9149 /* For SSE saves, we need to indicate the 128-bit alignment. */
9150 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9151
9152 insn = emit_move_insn (mem, reg);
9153 RTX_FRAME_RELATED_P (insn) = 1;
9154
9155 base = addr;
9156 if (GET_CODE (base) == PLUS)
9157 base = XEXP (base, 0);
9158 gcc_checking_assert (REG_P (base));
9159
9160 /* When saving registers into a re-aligned local stack frame, avoid
9161 any tricky guessing by dwarf2out. */
9162 if (m->fs.realigned)
9163 {
9164 gcc_checking_assert (stack_realign_drap);
9165
9166 if (regno == REGNO (crtl->drap_reg))
9167 {
9168 /* A bit of a hack. We force the DRAP register to be saved in
9169 the re-aligned stack frame, which provides us with a copy
9170 of the CFA that will last past the prologue. Install it. */
9171 gcc_checking_assert (cfun->machine->fs.fp_valid);
9172 addr = plus_constant (hard_frame_pointer_rtx,
9173 cfun->machine->fs.fp_offset - cfa_offset);
9174 mem = gen_rtx_MEM (mode, addr);
9175 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9176 }
9177 else
9178 {
9179 /* The frame pointer is a stable reference within the
9180 aligned frame. Use it. */
9181 gcc_checking_assert (cfun->machine->fs.fp_valid);
9182 addr = plus_constant (hard_frame_pointer_rtx,
9183 cfun->machine->fs.fp_offset - cfa_offset);
9184 mem = gen_rtx_MEM (mode, addr);
9185 add_reg_note (insn, REG_CFA_EXPRESSION,
9186 gen_rtx_SET (VOIDmode, mem, reg));
9187 }
9188 }
9189
9190 /* The memory may not be relative to the current CFA register,
9191 which means that we may need to generate a new pattern for
9192 use by the unwind info. */
9193 else if (base != m->fs.cfa_reg)
9194 {
9195 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9196 mem = gen_rtx_MEM (mode, addr);
9197 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9198 }
9199 }
9200
9201 /* Emit code to save registers using MOV insns.
9202 First register is stored at CFA - CFA_OFFSET. */
9203 static void
9204 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9205 {
9206 unsigned int regno;
9207
9208 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9209 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9210 {
9211 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9212 cfa_offset -= UNITS_PER_WORD;
9213 }
9214 }
9215
9216 /* Emit code to save SSE registers using MOV insns.
9217 First register is stored at CFA - CFA_OFFSET. */
9218 static void
9219 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9220 {
9221 unsigned int regno;
9222
9223 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9224 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9225 {
9226 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9227 cfa_offset -= 16;
9228 }
9229 }
9230
9231 static GTY(()) rtx queued_cfa_restores;
9232
9233 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9234 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9235 Don't add the note if the previously saved value will be left untouched
9236 within the stack red zone till return, as unwinders can find the same value
9237 in the register and on the stack. */
9238
9239 static void
9240 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9241 {
9242 if (!crtl->shrink_wrapped
9243 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9244 return;
9245
9246 if (insn)
9247 {
9248 add_reg_note (insn, REG_CFA_RESTORE, reg);
9249 RTX_FRAME_RELATED_P (insn) = 1;
9250 }
9251 else
9252 queued_cfa_restores
9253 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9254 }
9255
9256 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9257
9258 static void
9259 ix86_add_queued_cfa_restore_notes (rtx insn)
9260 {
9261 rtx last;
9262 if (!queued_cfa_restores)
9263 return;
9264 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9265 ;
9266 XEXP (last, 1) = REG_NOTES (insn);
9267 REG_NOTES (insn) = queued_cfa_restores;
9268 queued_cfa_restores = NULL_RTX;
9269 RTX_FRAME_RELATED_P (insn) = 1;
9270 }
9271
9272 /* Expand prologue or epilogue stack adjustment.
9273 The pattern exists to put a dependency on all ebp-based memory accesses.
9274 STYLE should be negative if instructions should be marked as frame related,
9275 zero if the %r11 register is live and cannot be freely used, and positive
9276 otherwise. */
9277
9278 static void
9279 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9280 int style, bool set_cfa)
9281 {
9282 struct machine_function *m = cfun->machine;
9283 rtx insn;
9284 bool add_frame_related_expr = false;
9285
9286 if (! TARGET_64BIT)
9287 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9288 else if (x86_64_immediate_operand (offset, DImode))
9289 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9290 else
9291 {
9292 rtx tmp;
9293 /* r11 is used by indirect sibcall return as well, set before the
9294 epilogue and used after the epilogue. */
9295 if (style)
9296 tmp = gen_rtx_REG (DImode, R11_REG);
9297 else
9298 {
9299 gcc_assert (src != hard_frame_pointer_rtx
9300 && dest != hard_frame_pointer_rtx);
9301 tmp = hard_frame_pointer_rtx;
9302 }
9303 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9304 if (style < 0)
9305 add_frame_related_expr = true;
9306
9307 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9308 }
9309
9310 insn = emit_insn (insn);
9311 if (style >= 0)
9312 ix86_add_queued_cfa_restore_notes (insn);
9313
9314 if (set_cfa)
9315 {
9316 rtx r;
9317
9318 gcc_assert (m->fs.cfa_reg == src);
9319 m->fs.cfa_offset += INTVAL (offset);
9320 m->fs.cfa_reg = dest;
9321
9322 r = gen_rtx_PLUS (Pmode, src, offset);
9323 r = gen_rtx_SET (VOIDmode, dest, r);
9324 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9325 RTX_FRAME_RELATED_P (insn) = 1;
9326 }
9327 else if (style < 0)
9328 {
9329 RTX_FRAME_RELATED_P (insn) = 1;
9330 if (add_frame_related_expr)
9331 {
9332 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9333 r = gen_rtx_SET (VOIDmode, dest, r);
9334 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9335 }
9336 }
9337
9338 if (dest == stack_pointer_rtx)
9339 {
9340 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9341 bool valid = m->fs.sp_valid;
9342
9343 if (src == hard_frame_pointer_rtx)
9344 {
9345 valid = m->fs.fp_valid;
9346 ooffset = m->fs.fp_offset;
9347 }
9348 else if (src == crtl->drap_reg)
9349 {
9350 valid = m->fs.drap_valid;
9351 ooffset = 0;
9352 }
9353 else
9354 {
9355 /* Else there are two possibilities: SP itself, which we set
9356 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9357 taken care of by hand along the eh_return path. */
9358 gcc_checking_assert (src == stack_pointer_rtx
9359 || offset == const0_rtx);
9360 }
9361
9362 m->fs.sp_offset = ooffset - INTVAL (offset);
9363 m->fs.sp_valid = valid;
9364 }
9365 }
9366
9367 /* Find an available register to be used as the dynamic realign argument
9368 pointer register. Such a register will be written in the prologue and
9369 used at the beginning of the body, so it must not be
9370 1. parameter passing register.
9371 2. GOT pointer.
9372 We reuse the static-chain register if it is available. Otherwise, we
9373 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9374 shorter encoding.
9375
9376 Return: the regno of the chosen register. */
9377
9378 static unsigned int
9379 find_drap_reg (void)
9380 {
9381 tree decl = cfun->decl;
9382
9383 if (TARGET_64BIT)
9384 {
9385 /* Use R13 for a nested function or a function that needs a static chain.
9386 Since a function with a tail call may use any caller-saved
9387 registers in the epilogue, DRAP must not use a caller-saved
9388 register in that case. */
9389 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9390 return R13_REG;
9391
9392 return R10_REG;
9393 }
9394 else
9395 {
9396 /* Use DI for a nested function or a function that needs a static chain.
9397 Since a function with a tail call may use any caller-saved
9398 registers in the epilogue, DRAP must not use a caller-saved
9399 register in that case. */
9400 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9401 return DI_REG;
9402
9403 /* Reuse static chain register if it isn't used for parameter
9404 passing. */
9405 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9406 {
9407 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9408 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9409 return CX_REG;
9410 }
9411 return DI_REG;
9412 }
9413 }
9414
9415 /* Return minimum incoming stack alignment. */
9416
9417 static unsigned int
9418 ix86_minimum_incoming_stack_boundary (bool sibcall)
9419 {
9420 unsigned int incoming_stack_boundary;
9421
9422 /* Prefer the one specified on the command line. */
9423 if (ix86_user_incoming_stack_boundary)
9424 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9425 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9426 if -mstackrealign is used, this isn't a sibcall check, and the
9427 estimated stack alignment is 128 bits. */
9428 else if (!sibcall
9429 && !TARGET_64BIT
9430 && ix86_force_align_arg_pointer
9431 && crtl->stack_alignment_estimated == 128)
9432 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9433 else
9434 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9435
9436 /* Incoming stack alignment can be changed on individual functions
9437 via force_align_arg_pointer attribute. We use the smallest
9438 incoming stack boundary. */
9439 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9440 && lookup_attribute (ix86_force_align_arg_pointer_string,
9441 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9442 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9443
9444 /* The incoming stack frame has to be aligned at least at
9445 parm_stack_boundary. */
9446 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9447 incoming_stack_boundary = crtl->parm_stack_boundary;
9448
9449 /* The stack at the entry of main is aligned by the runtime. We use the
9450 smallest incoming stack boundary. */
9451 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9452 && DECL_NAME (current_function_decl)
9453 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9454 && DECL_FILE_SCOPE_P (current_function_decl))
9455 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9456
9457 return incoming_stack_boundary;
9458 }
9459
9460 /* Update incoming stack boundary and estimated stack alignment. */
9461
9462 static void
9463 ix86_update_stack_boundary (void)
9464 {
9465 ix86_incoming_stack_boundary
9466 = ix86_minimum_incoming_stack_boundary (false);
9467
9468 /* x86_64 varargs need 16-byte stack alignment for the register save
9469 area. */
9470 if (TARGET_64BIT
9471 && cfun->stdarg
9472 && crtl->stack_alignment_estimated < 128)
9473 crtl->stack_alignment_estimated = 128;
9474 }
9475
9476 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9477 needed or an rtx for DRAP otherwise. */
9478
9479 static rtx
9480 ix86_get_drap_rtx (void)
9481 {
9482 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9483 crtl->need_drap = true;
9484
9485 if (stack_realign_drap)
9486 {
9487 /* Assign DRAP to vDRAP and return vDRAP. */
9488 unsigned int regno = find_drap_reg ();
9489 rtx drap_vreg;
9490 rtx arg_ptr;
9491 rtx seq, insn;
9492
9493 arg_ptr = gen_rtx_REG (Pmode, regno);
9494 crtl->drap_reg = arg_ptr;
9495
9496 start_sequence ();
9497 drap_vreg = copy_to_reg (arg_ptr);
9498 seq = get_insns ();
9499 end_sequence ();
9500
9501 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9502 if (!optimize)
9503 {
9504 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9505 RTX_FRAME_RELATED_P (insn) = 1;
9506 }
9507 return drap_vreg;
9508 }
9509 else
9510 return NULL;
9511 }
9512
9513 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9514
9515 static rtx
9516 ix86_internal_arg_pointer (void)
9517 {
9518 return virtual_incoming_args_rtx;
9519 }
9520
9521 struct scratch_reg {
9522 rtx reg;
9523 bool saved;
9524 };
9525
9526 /* Return a short-lived scratch register for use on function entry.
9527 In 32-bit mode, it is valid only after the registers are saved
9528 in the prologue. This register must be released by means of
9529 release_scratch_register_on_entry once it is dead. */
9530
9531 static void
9532 get_scratch_register_on_entry (struct scratch_reg *sr)
9533 {
9534 int regno;
9535
9536 sr->saved = false;
9537
9538 if (TARGET_64BIT)
9539 {
9540 /* We always use R11 in 64-bit mode. */
9541 regno = R11_REG;
9542 }
9543 else
9544 {
9545 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9546 bool fastcall_p
9547 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9548 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9549 int regparm = ix86_function_regparm (fntype, decl);
9550 int drap_regno
9551 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9552
9553 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9554 for the static chain register. */
9555 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9556 && drap_regno != AX_REG)
9557 regno = AX_REG;
9558 else if (regparm < 2 && drap_regno != DX_REG)
9559 regno = DX_REG;
9560 /* ecx is the static chain register. */
9561 else if (regparm < 3 && !fastcall_p && !static_chain_p
9562 && drap_regno != CX_REG)
9563 regno = CX_REG;
9564 else if (ix86_save_reg (BX_REG, true))
9565 regno = BX_REG;
9566 /* esi is the static chain register. */
9567 else if (!(regparm == 3 && static_chain_p)
9568 && ix86_save_reg (SI_REG, true))
9569 regno = SI_REG;
9570 else if (ix86_save_reg (DI_REG, true))
9571 regno = DI_REG;
9572 else
9573 {
9574 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9575 sr->saved = true;
9576 }
9577 }
9578
9579 sr->reg = gen_rtx_REG (Pmode, regno);
9580 if (sr->saved)
9581 {
9582 rtx insn = emit_insn (gen_push (sr->reg));
9583 RTX_FRAME_RELATED_P (insn) = 1;
9584 }
9585 }
9586
9587 /* Release a scratch register obtained from the preceding function. */
9588
9589 static void
9590 release_scratch_register_on_entry (struct scratch_reg *sr)
9591 {
9592 if (sr->saved)
9593 {
9594 rtx x, insn = emit_insn (gen_pop (sr->reg));
9595
9596 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9597 RTX_FRAME_RELATED_P (insn) = 1;
9598 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9599 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9600 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9601 }
9602 }
9603
9604 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9605
9606 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9607
9608 static void
9609 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9610 {
9611 /* We skip the probe for the first interval + a small dope of 4 words and
9612 probe that many bytes past the specified size to maintain a protection
9613 area at the bottom of the stack. */
9614 const int dope = 4 * UNITS_PER_WORD;
9615 rtx size_rtx = GEN_INT (size), last;
9616
9617 /* See if we have a constant small number of probes to generate. If so,
9618 that's the easy case. The run-time loop is made up of 11 insns in the
9619 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9620 for n # of intervals. */
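      /* A rough sketch, assuming a 4 KB probe interval and SIZE = 10000:
	 the code below emits sub-and-probe steps (the first one covering
	 two intervals plus the dope), a final sub-and-probe for the
	 remainder, and then an add that gives back the extra interval and
	 dope, so the net adjustment equals SIZE.  */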
9621 if (size <= 5 * PROBE_INTERVAL)
9622 {
9623 HOST_WIDE_INT i, adjust;
9624 bool first_probe = true;
9625
9626 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9627 values of N from 1 until it exceeds SIZE. If only one probe is
9628 needed, this will not generate any code. Then adjust and probe
9629 to PROBE_INTERVAL + SIZE. */
9630 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9631 {
9632 if (first_probe)
9633 {
9634 adjust = 2 * PROBE_INTERVAL + dope;
9635 first_probe = false;
9636 }
9637 else
9638 adjust = PROBE_INTERVAL;
9639
9640 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9641 plus_constant (stack_pointer_rtx, -adjust)));
9642 emit_stack_probe (stack_pointer_rtx);
9643 }
9644
9645 if (first_probe)
9646 adjust = size + PROBE_INTERVAL + dope;
9647 else
9648 adjust = size + PROBE_INTERVAL - i;
9649
9650 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9651 plus_constant (stack_pointer_rtx, -adjust)));
9652 emit_stack_probe (stack_pointer_rtx);
9653
9654 /* Adjust back to account for the additional first interval. */
9655 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9656 plus_constant (stack_pointer_rtx,
9657 PROBE_INTERVAL + dope)));
9658 }
9659
9660 /* Otherwise, do the same as above, but in a loop. Note that we must be
9661 extra careful with variables wrapping around because we might be at
9662 the very top (or the very bottom) of the address space and we have
9663 to be able to handle this case properly; in particular, we use an
9664 equality test for the loop condition. */
9665 else
9666 {
9667 HOST_WIDE_INT rounded_size;
9668 struct scratch_reg sr;
9669
9670 get_scratch_register_on_entry (&sr);
9671
9672
9673 /* Step 1: round SIZE to the previous multiple of the interval. */
9674
9675 rounded_size = size & -PROBE_INTERVAL;
9676
9677
9678 /* Step 2: compute initial and final value of the loop counter. */
9679
9680 /* SP = SP_0 + PROBE_INTERVAL. */
9681 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9682 plus_constant (stack_pointer_rtx,
9683 - (PROBE_INTERVAL + dope))));
9684
9685 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9686 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9687 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9688 gen_rtx_PLUS (Pmode, sr.reg,
9689 stack_pointer_rtx)));
9690
9691
9692 /* Step 3: the loop
9693
9694 while (SP != LAST_ADDR)
9695 {
9696 SP = SP + PROBE_INTERVAL
9697 probe at SP
9698 }
9699
9700 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9701 values of N from 1 until it is equal to ROUNDED_SIZE. */
9702
9703 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9704
9705
9706 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9707 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9708
9709 if (size != rounded_size)
9710 {
9711 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9712 plus_constant (stack_pointer_rtx,
9713 rounded_size - size)));
9714 emit_stack_probe (stack_pointer_rtx);
9715 }
9716
9717 /* Adjust back to account for the additional first interval. */
9718 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9719 plus_constant (stack_pointer_rtx,
9720 PROBE_INTERVAL + dope)));
9721
9722 release_scratch_register_on_entry (&sr);
9723 }
9724
9725 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9726
9727 /* Even if the stack pointer isn't the CFA register, we need to correctly
9728 describe the adjustments made to it, in particular differentiate the
9729 frame-related ones from the frame-unrelated ones. */
9730 if (size > 0)
9731 {
9732 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9733 XVECEXP (expr, 0, 0)
9734 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9735 plus_constant (stack_pointer_rtx, -size));
9736 XVECEXP (expr, 0, 1)
9737 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9738 plus_constant (stack_pointer_rtx,
9739 PROBE_INTERVAL + dope + size));
9740 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9741 RTX_FRAME_RELATED_P (last) = 1;
9742
9743 cfun->machine->fs.sp_offset += size;
9744 }
9745
9746 /* Make sure nothing is scheduled before we are done. */
9747 emit_insn (gen_blockage ());
9748 }
9749
9750 /* Adjust the stack pointer up to REG while probing it. */
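/* A sketch of the assembly emitted below (AT&T syntax, 32-bit register
   names; labels follow the internal LPSRL/LPSRE naming):
	.LPSRL0:	cmpl	%reg, %esp
			je	.LPSRE0
			subl	$PROBE_INTERVAL, %esp
			orl	$0, (%esp)
			jmp	.LPSRL0
	.LPSRE0:							*/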
9751
9752 const char *
9753 output_adjust_stack_and_probe (rtx reg)
9754 {
9755 static int labelno = 0;
9756 char loop_lab[32], end_lab[32];
9757 rtx xops[2];
9758
9759 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9760 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9761
9762 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9763
9764 /* Jump to END_LAB if SP == LAST_ADDR. */
9765 xops[0] = stack_pointer_rtx;
9766 xops[1] = reg;
9767 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9768 fputs ("\tje\t", asm_out_file);
9769 assemble_name_raw (asm_out_file, end_lab);
9770 fputc ('\n', asm_out_file);
9771
9772 /* SP = SP + PROBE_INTERVAL. */
9773 xops[1] = GEN_INT (PROBE_INTERVAL);
9774 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9775
9776 /* Probe at SP. */
9777 xops[1] = const0_rtx;
9778 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9779
9780 fprintf (asm_out_file, "\tjmp\t");
9781 assemble_name_raw (asm_out_file, loop_lab);
9782 fputc ('\n', asm_out_file);
9783
9784 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9785
9786 return "";
9787 }
9788
9789 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9790 inclusive. These are offsets from the current stack pointer. */
9791
9792 static void
9793 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9794 {
9795 /* See if we have a constant small number of probes to generate. If so,
9796 that's the easy case. The run-time loop is made up of 7 insns in the
9797 generic case while the compile-time loop is made up of n insns for n #
9798 of intervals. */
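  /* E.g. with a 4 KB interval, FIRST = 16 and SIZE = 10000, this emits
     probes at sp - (16 + 4096), sp - (16 + 8192) and sp - (16 + 10000),
     without moving the stack pointer.  */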
9799 if (size <= 7 * PROBE_INTERVAL)
9800 {
9801 HOST_WIDE_INT i;
9802
9803 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9804 it exceeds SIZE. If only one probe is needed, this will not
9805 generate any code. Then probe at FIRST + SIZE. */
9806 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9807 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9808
9809 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9810 }
9811
9812 /* Otherwise, do the same as above, but in a loop. Note that we must be
9813 extra careful with variables wrapping around because we might be at
9814 the very top (or the very bottom) of the address space and we have
9815 to be able to handle this case properly; in particular, we use an
9816 equality test for the loop condition. */
9817 else
9818 {
9819 HOST_WIDE_INT rounded_size, last;
9820 struct scratch_reg sr;
9821
9822 get_scratch_register_on_entry (&sr);
9823
9824
9825 /* Step 1: round SIZE to the previous multiple of the interval. */
9826
9827 rounded_size = size & -PROBE_INTERVAL;
9828
9829
9830 /* Step 2: compute initial and final value of the loop counter. */
9831
9832 /* TEST_OFFSET = FIRST. */
9833 emit_move_insn (sr.reg, GEN_INT (-first));
9834
9835 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9836 last = first + rounded_size;
9837
9838
9839 /* Step 3: the loop
9840
9841 while (TEST_ADDR != LAST_ADDR)
9842 {
9843 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9844 probe at TEST_ADDR
9845 }
9846
9847 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9848 until it is equal to ROUNDED_SIZE. */
9849
9850 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9851
9852
9853 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9854 that SIZE is equal to ROUNDED_SIZE. */
9855
9856 if (size != rounded_size)
9857 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9858 stack_pointer_rtx,
9859 sr.reg),
9860 rounded_size - size));
9861
9862 release_scratch_register_on_entry (&sr);
9863 }
9864
9865 /* Make sure nothing is scheduled before we are done. */
9866 emit_insn (gen_blockage ());
9867 }
9868
9869 /* Probe a range of stack addresses from REG to END, inclusive. These are
9870 offsets from the current stack pointer. */
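/* The emitted loop mirrors output_adjust_stack_and_probe above, except
   that the scratch register (not the stack pointer) is advanced each
   iteration and the probe store is indexed off it, roughly
   "orl $0, (%esp,%reg)".  */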
9871
9872 const char *
9873 output_probe_stack_range (rtx reg, rtx end)
9874 {
9875 static int labelno = 0;
9876 char loop_lab[32], end_lab[32];
9877 rtx xops[3];
9878
9879 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9880 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9881
9882 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9883
9884 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9885 xops[0] = reg;
9886 xops[1] = end;
9887 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9888 fputs ("\tje\t", asm_out_file);
9889 assemble_name_raw (asm_out_file, end_lab);
9890 fputc ('\n', asm_out_file);
9891
9892 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9893 xops[1] = GEN_INT (PROBE_INTERVAL);
9894 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9895
9896 /* Probe at TEST_ADDR. */
9897 xops[0] = stack_pointer_rtx;
9898 xops[1] = reg;
9899 xops[2] = const0_rtx;
9900 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9901
9902 fprintf (asm_out_file, "\tjmp\t");
9903 assemble_name_raw (asm_out_file, loop_lab);
9904 fputc ('\n', asm_out_file);
9905
9906 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9907
9908 return "";
9909 }
9910
9911 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9912 to be generated in correct form. */
9913 static void
9914 ix86_finalize_stack_realign_flags (void)
9915 {
9916 /* Check whether stack realignment is really needed after reload, and
9917 store the result in cfun. */
9918 unsigned int incoming_stack_boundary
9919 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9920 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9921 unsigned int stack_realign = (incoming_stack_boundary
9922 < (current_function_is_leaf
9923 ? crtl->max_used_stack_slot_alignment
9924 : crtl->stack_alignment_needed));
9925
9926 if (crtl->stack_realign_finalized)
9927 {
9928 /* After stack_realign_needed is finalized, we can no longer
9929 change it. */
9930 gcc_assert (crtl->stack_realign_needed == stack_realign);
9931 return;
9932 }
9933
9934 /* If the only reason for frame_pointer_needed is that we conservatively
9935 assumed stack realignment might be needed, but in the end nothing that
9936 needed the stack alignment had been spilled, clear frame_pointer_needed
9937 and say we don't need stack realignment. */
9938 if (stack_realign
9939 && !crtl->need_drap
9940 && frame_pointer_needed
9941 && current_function_is_leaf
9942 && flag_omit_frame_pointer
9943 && current_function_sp_is_unchanging
9944 && !ix86_current_function_calls_tls_descriptor
9945 && !crtl->accesses_prior_frames
9946 && !cfun->calls_alloca
9947 && !crtl->calls_eh_return
9948 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9949 && !ix86_frame_pointer_required ()
9950 && get_frame_size () == 0
9951 && ix86_nsaved_sseregs () == 0
9952 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9953 {
9954 HARD_REG_SET set_up_by_prologue, prologue_used;
9955 basic_block bb;
9956
9957 CLEAR_HARD_REG_SET (prologue_used);
9958 CLEAR_HARD_REG_SET (set_up_by_prologue);
9959 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9960 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9961 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9962 HARD_FRAME_POINTER_REGNUM);
9963 FOR_EACH_BB (bb)
9964 {
9965 rtx insn;
9966 FOR_BB_INSNS (bb, insn)
9967 if (NONDEBUG_INSN_P (insn)
9968 && requires_stack_frame_p (insn, prologue_used,
9969 set_up_by_prologue))
9970 {
9971 crtl->stack_realign_needed = stack_realign;
9972 crtl->stack_realign_finalized = true;
9973 return;
9974 }
9975 }
9976
9977 frame_pointer_needed = false;
9978 stack_realign = false;
9979 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
9980 crtl->stack_alignment_needed = incoming_stack_boundary;
9981 crtl->stack_alignment_estimated = incoming_stack_boundary;
9982 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
9983 crtl->preferred_stack_boundary = incoming_stack_boundary;
9984 df_finish_pass (true);
9985 df_scan_alloc (NULL);
9986 df_scan_blocks ();
9987 df_compute_regs_ever_live (true);
9988 df_analyze ();
9989 }
9990
9991 crtl->stack_realign_needed = stack_realign;
9992 crtl->stack_realign_finalized = true;
9993 }
9994
9995 /* Expand the prologue into a bunch of separate insns. */
9996
9997 void
9998 ix86_expand_prologue (void)
9999 {
10000 struct machine_function *m = cfun->machine;
10001 rtx insn, t;
10002 bool pic_reg_used;
10003 struct ix86_frame frame;
10004 HOST_WIDE_INT allocate;
10005 bool int_registers_saved;
10006
10007 ix86_finalize_stack_realign_flags ();
10008
10009 /* DRAP should not coexist with stack_realign_fp */
10010 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10011
10012 memset (&m->fs, 0, sizeof (m->fs));
10013
10014 /* Initialize CFA state for before the prologue. */
10015 m->fs.cfa_reg = stack_pointer_rtx;
10016 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10017
10018 /* Track SP offset to the CFA. We continue tracking this after we've
10019 swapped the CFA register away from SP. In the case of re-alignment
10020 this is fudged; we're interested in offsets within the local frame. */
10021 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10022 m->fs.sp_valid = true;
10023
10024 ix86_compute_frame_layout (&frame);
10025
10026 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10027 {
10028 /* We should have already generated an error for any use of
10029 ms_hook on a nested function. */
10030 gcc_checking_assert (!ix86_static_chain_on_stack);
10031
10032 /* Check if profiling is active and whether we should use the profiling-
10033 before-prologue variant. If so, sorry. */
10034 if (crtl->profile && flag_fentry != 0)
10035 sorry ("ms_hook_prologue attribute isn%'t compatible "
10036 "with -mfentry for 32-bit");
10037
10038 /* In ix86_asm_output_function_label we emitted:
10039 8b ff movl.s %edi,%edi
10040 55 push %ebp
10041 8b ec movl.s %esp,%ebp
10042
10043 This matches the hookable function prologue in Win32 API
10044 functions in Microsoft Windows XP Service Pack 2 and newer.
10045 Wine uses this to enable Windows apps to hook the Win32 API
10046 functions provided by Wine.
10047
10048 What that means is that we've already set up the frame pointer. */
10049
10050 if (frame_pointer_needed
10051 && !(crtl->drap_reg && crtl->stack_realign_needed))
10052 {
10053 rtx push, mov;
10054
10055 /* We've decided to use the frame pointer already set up.
10056 Describe this to the unwinder by pretending that both
10057 push and mov insns happen right here.
10058
10059 Putting the unwind info here at the end of the ms_hook
10060 is done so that we can make absolutely certain we get
10061 the required byte sequence at the start of the function,
10062 rather than relying on an assembler that can produce
10063 the exact encoding required.
10064
10065 However it does mean (in the unpatched case) that we have
10066 a 1 insn window where the asynchronous unwind info is
10067 incorrect. However, if we placed the unwind info at
10068 its correct location we would have incorrect unwind info
10069 in the patched case. Which is probably all moot since
10070 I don't expect Wine generates dwarf2 unwind info for the
10071 system libraries that use this feature. */
10072
10073 insn = emit_insn (gen_blockage ());
10074
10075 push = gen_push (hard_frame_pointer_rtx);
10076 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10077 stack_pointer_rtx);
10078 RTX_FRAME_RELATED_P (push) = 1;
10079 RTX_FRAME_RELATED_P (mov) = 1;
10080
10081 RTX_FRAME_RELATED_P (insn) = 1;
10082 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10083 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10084
10085 /* Note that gen_push incremented m->fs.cfa_offset, even
10086 though we didn't emit the push insn here. */
10087 m->fs.cfa_reg = hard_frame_pointer_rtx;
10088 m->fs.fp_offset = m->fs.cfa_offset;
10089 m->fs.fp_valid = true;
10090 }
10091 else
10092 {
10093 /* The frame pointer is not needed so pop %ebp again.
10094 This leaves us with a pristine state. */
10095 emit_insn (gen_pop (hard_frame_pointer_rtx));
10096 }
10097 }
10098
10099 /* The first insn of a function that accepts its static chain on the
10100 stack is to push the register that would be filled in by a direct
10101 call. This insn will be skipped by the trampoline. */
10102 else if (ix86_static_chain_on_stack)
10103 {
10104 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10105 emit_insn (gen_blockage ());
10106
10107 /* We don't want to interpret this push insn as a register save,
10108 only as a stack adjustment. The real copy of the register as
10109 a save will be done later, if needed. */
10110 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10111 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10112 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10113 RTX_FRAME_RELATED_P (insn) = 1;
10114 }
10115
10116 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10117 DRAP is needed and stack realignment is really needed after reload. */
10118 if (stack_realign_drap)
10119 {
10120 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10121
10122 /* Only need to push parameter pointer reg if it is caller saved. */
10123 if (!call_used_regs[REGNO (crtl->drap_reg)])
10124 {
10125 /* Push arg pointer reg */
10126 insn = emit_insn (gen_push (crtl->drap_reg));
10127 RTX_FRAME_RELATED_P (insn) = 1;
10128 }
10129
10130 /* Grab the argument pointer. */
10131 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10132 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10133 RTX_FRAME_RELATED_P (insn) = 1;
10134 m->fs.cfa_reg = crtl->drap_reg;
10135 m->fs.cfa_offset = 0;
10136
10137 /* Align the stack. */
10138 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10139 stack_pointer_rtx,
10140 GEN_INT (-align_bytes)));
10141 RTX_FRAME_RELATED_P (insn) = 1;
10142
10143 /* Replicate the return address on the stack so that the return
10144 address can be reached via the (argp - 1) slot. This is needed
10145 to implement macro RETURN_ADDR_RTX and intrinsic function
10146 expand_builtin_return_addr etc. */
10147 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10148 t = gen_frame_mem (Pmode, t);
10149 insn = emit_insn (gen_push (t));
10150 RTX_FRAME_RELATED_P (insn) = 1;
10151
10152 /* For the purposes of frame and register save area addressing,
10153 we've started over with a new frame. */
10154 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10155 m->fs.realigned = true;
10156 }
10157
10158 if (frame_pointer_needed && !m->fs.fp_valid)
10159 {
10160 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10161 slower on all targets. Also sdb doesn't like it. */
10162 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10163 RTX_FRAME_RELATED_P (insn) = 1;
10164
10165 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10166 {
10167 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10168 RTX_FRAME_RELATED_P (insn) = 1;
10169
10170 if (m->fs.cfa_reg == stack_pointer_rtx)
10171 m->fs.cfa_reg = hard_frame_pointer_rtx;
10172 m->fs.fp_offset = m->fs.sp_offset;
10173 m->fs.fp_valid = true;
10174 }
10175 }
10176
10177 int_registers_saved = (frame.nregs == 0);
10178
10179 if (!int_registers_saved)
10180 {
10181 /* If saving registers via PUSH, do so now. */
10182 if (!frame.save_regs_using_mov)
10183 {
10184 ix86_emit_save_regs ();
10185 int_registers_saved = true;
10186 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10187 }
10188
10189 /* When using the red zone we may start register saving before allocating
10190 the stack frame, saving one cycle of the prologue. However, avoid
10191 doing this if we have to probe the stack; at least on x86_64 the
10192 stack probe can turn into a call that clobbers a red zone location. */
10193 else if (ix86_using_red_zone ()
10194 && (! TARGET_STACK_PROBE
10195 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10196 {
10197 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10198 int_registers_saved = true;
10199 }
10200 }
10201
10202 if (stack_realign_fp)
10203 {
10204 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10205 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10206
10207 /* The computation of the size of the re-aligned stack frame means
10208 that we must allocate the size of the register save area before
10209 performing the actual alignment. Otherwise we cannot guarantee
10210 that there's enough storage above the realignment point. */
10211 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10212 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10213 GEN_INT (m->fs.sp_offset
10214 - frame.sse_reg_save_offset),
10215 -1, false);
10216
10217 /* Align the stack. */
10218 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10219 stack_pointer_rtx,
10220 GEN_INT (-align_bytes)));
10221
10222 /* For the purposes of register save area addressing, the stack
10223 pointer is no longer valid. As for the value of sp_offset,
10224 see ix86_compute_frame_layout, which we need to match in order
10225 to pass verification of stack_pointer_offset at the end. */
10226 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10227 m->fs.sp_valid = false;
10228 }
10229
10230 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10231
10232 if (flag_stack_usage_info)
10233 {
10234 /* We start to count from ARG_POINTER. */
10235 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10236
10237 /* If it was realigned, take into account the fake frame. */
10238 if (stack_realign_drap)
10239 {
10240 if (ix86_static_chain_on_stack)
10241 stack_size += UNITS_PER_WORD;
10242
10243 if (!call_used_regs[REGNO (crtl->drap_reg)])
10244 stack_size += UNITS_PER_WORD;
10245
10246 /* This over-estimates by 1 minimal-stack-alignment-unit but
10247 mitigates that by counting in the new return address slot. */
10248 current_function_dynamic_stack_size
10249 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10250 }
10251
10252 current_function_static_stack_size = stack_size;
10253 }
10254
10255 /* The stack has already been decremented by the instruction calling us,
10256 so probe if the size is non-negative to preserve the protection area. */
10257 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10258 {
10259 /* We expect the registers to be saved when probes are used. */
10260 gcc_assert (int_registers_saved);
10261
10262 if (STACK_CHECK_MOVING_SP)
10263 {
10264 ix86_adjust_stack_and_probe (allocate);
10265 allocate = 0;
10266 }
10267 else
10268 {
10269 HOST_WIDE_INT size = allocate;
10270
10271 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10272 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10273
10274 if (TARGET_STACK_PROBE)
10275 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10276 else
10277 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10278 }
10279 }
10280
10281 if (allocate == 0)
10282 ;
10283 else if (!ix86_target_stack_probe ()
10284 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10285 {
10286 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10287 GEN_INT (-allocate), -1,
10288 m->fs.cfa_reg == stack_pointer_rtx);
10289 }
10290 else
10291 {
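      /* A large allocation that needs probing: the amount is handed to the
	 stack-allocation worker pattern in %eax, after first pushing %eax
	 (and %r10 on 64-bit, where it may hold the static chain) if they are
	 live, and reloading them from the new frame afterwards.  */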
10292 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10293 rtx r10 = NULL;
10294 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10295
10296 bool eax_live = false;
10297 bool r10_live = false;
10298
10299 if (TARGET_64BIT)
10300 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10301 if (!TARGET_64BIT_MS_ABI)
10302 eax_live = ix86_eax_live_at_start_p ();
10303
10304 if (eax_live)
10305 {
10306 emit_insn (gen_push (eax));
10307 allocate -= UNITS_PER_WORD;
10308 }
10309 if (r10_live)
10310 {
10311 r10 = gen_rtx_REG (Pmode, R10_REG);
10312 emit_insn (gen_push (r10));
10313 allocate -= UNITS_PER_WORD;
10314 }
10315
10316 emit_move_insn (eax, GEN_INT (allocate));
10317 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10318
10319 /* Use the fact that AX still contains ALLOCATE. */
10320 adjust_stack_insn = (TARGET_64BIT
10321 ? gen_pro_epilogue_adjust_stack_di_sub
10322 : gen_pro_epilogue_adjust_stack_si_sub);
10323
10324 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10325 stack_pointer_rtx, eax));
10326
10327 /* Note that SEH directives need to continue tracking the stack
10328 pointer even after the frame pointer has been set up. */
10329 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10330 {
10331 if (m->fs.cfa_reg == stack_pointer_rtx)
10332 m->fs.cfa_offset += allocate;
10333
10334 RTX_FRAME_RELATED_P (insn) = 1;
10335 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10336 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10337 plus_constant (stack_pointer_rtx,
10338 -allocate)));
10339 }
10340 m->fs.sp_offset += allocate;
10341
10342 if (r10_live && eax_live)
10343 {
10344 t = choose_baseaddr (m->fs.sp_offset - allocate);
10345 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10346 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10347 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10348 }
10349 else if (eax_live || r10_live)
10350 {
10351 t = choose_baseaddr (m->fs.sp_offset - allocate);
10352 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10353 }
10354 }
10355 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10356
10357 /* If we haven't already set up the frame pointer, do so now. */
10358 if (frame_pointer_needed && !m->fs.fp_valid)
10359 {
10360 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10361 GEN_INT (frame.stack_pointer_offset
10362 - frame.hard_frame_pointer_offset));
10363 insn = emit_insn (insn);
10364 RTX_FRAME_RELATED_P (insn) = 1;
10365 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10366
10367 if (m->fs.cfa_reg == stack_pointer_rtx)
10368 m->fs.cfa_reg = hard_frame_pointer_rtx;
10369 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10370 m->fs.fp_valid = true;
10371 }
10372
10373 if (!int_registers_saved)
10374 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10375 if (frame.nsseregs)
10376 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10377
10378 pic_reg_used = false;
10379 if (pic_offset_table_rtx
10380 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10381 || crtl->profile))
10382 {
10383 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10384
10385 if (alt_pic_reg_used != INVALID_REGNUM)
10386 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10387
10388 pic_reg_used = true;
10389 }
10390
10391 if (pic_reg_used)
10392 {
10393 if (TARGET_64BIT)
10394 {
10395 if (ix86_cmodel == CM_LARGE_PIC)
10396 {
10397 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10398 rtx label = gen_label_rtx ();
10399 emit_label (label);
10400 LABEL_PRESERVE_P (label) = 1;
10401 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10402 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10403 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10404 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10405 pic_offset_table_rtx, tmp_reg));
10406 }
10407 else
10408 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10409 }
10410 else
10411 {
10412 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10413 RTX_FRAME_RELATED_P (insn) = 1;
10414 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10415 }
10416 }
10417
10418 /* In the pic_reg_used case, make sure that the got load isn't deleted
10419 when mcount needs it. Blockage to avoid call movement across mcount
10420 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10421 note. */
10422 if (crtl->profile && !flag_fentry && pic_reg_used)
10423 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10424
10425 if (crtl->drap_reg && !crtl->stack_realign_needed)
10426 {
10427 /* vDRAP is set up, but after reload it turns out that stack realignment
10428 isn't necessary; here we emit the prologue to set up DRAP without the
10429 stack realignment adjustment. */
10430 t = choose_baseaddr (0);
10431 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10432 }
10433
10434 /* Prevent instructions from being scheduled into the register save push
10435 sequence when access to the red-zone area is done through the frame pointer.
10436 The offset between the frame pointer and the stack pointer is calculated
10437 relative to the value of the stack pointer at the end of the function
10438 prologue, and moving instructions that access the red-zone area via the
10439 frame pointer inside the push sequence violates this assumption. */
10440 if (frame_pointer_needed && frame.red_zone_size)
10441 emit_insn (gen_memory_blockage ());
10442
10443 /* Emit cld instruction if stringops are used in the function. */
10444 if (TARGET_CLD && ix86_current_function_needs_cld)
10445 emit_insn (gen_cld ());
10446
10447 /* SEH requires that the prologue end within 256 bytes of the start of
10448 the function. Prevent instruction schedules that would extend that.
10449 Further, prevent alloca modifications to the stack pointer from being
10450 combined with prologue modifications. */
10451 if (TARGET_SEH)
10452 emit_insn (gen_prologue_use (stack_pointer_rtx));
10453 }
10454
10455 /* Emit code to restore REG using a POP insn. */
10456
10457 static void
10458 ix86_emit_restore_reg_using_pop (rtx reg)
10459 {
10460 struct machine_function *m = cfun->machine;
10461 rtx insn = emit_insn (gen_pop (reg));
10462
10463 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10464 m->fs.sp_offset -= UNITS_PER_WORD;
10465
10466 if (m->fs.cfa_reg == crtl->drap_reg
10467 && REGNO (reg) == REGNO (crtl->drap_reg))
10468 {
10469 /* Previously we'd represented the CFA as an expression
10470 like *(%ebp - 8). We've just popped that value from
10471 the stack, which means we need to reset the CFA to
10472 the drap register. This will remain until we restore
10473 the stack pointer. */
10474 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10475 RTX_FRAME_RELATED_P (insn) = 1;
10476
10477 /* This means that the DRAP register is valid for addressing too. */
10478 m->fs.drap_valid = true;
10479 return;
10480 }
10481
10482 if (m->fs.cfa_reg == stack_pointer_rtx)
10483 {
10484 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10485 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10486 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10487 RTX_FRAME_RELATED_P (insn) = 1;
10488
10489 m->fs.cfa_offset -= UNITS_PER_WORD;
10490 }
10491
10492 /* When the frame pointer is the CFA, and we pop it, we are
10493 swapping back to the stack pointer as the CFA. This happens
10494 for stack frames that don't allocate other data, so we assume
10495 the stack pointer is now pointing at the return address, i.e.
10496 the function entry state, which makes the offset one word. */
10497 if (reg == hard_frame_pointer_rtx)
10498 {
10499 m->fs.fp_valid = false;
10500 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10501 {
10502 m->fs.cfa_reg = stack_pointer_rtx;
10503 m->fs.cfa_offset -= UNITS_PER_WORD;
10504
10505 add_reg_note (insn, REG_CFA_DEF_CFA,
10506 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10507 GEN_INT (m->fs.cfa_offset)));
10508 RTX_FRAME_RELATED_P (insn) = 1;
10509 }
10510 }
10511 }
10512
10513 /* Emit code to restore saved registers using POP insns. */
10514
10515 static void
10516 ix86_emit_restore_regs_using_pop (void)
10517 {
10518 unsigned int regno;
10519
10520 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10521 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10522 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10523 }
10524
10525 /* Emit code and notes for the LEAVE instruction. */
10526
10527 static void
10528 ix86_emit_leave (void)
10529 {
10530 struct machine_function *m = cfun->machine;
10531 rtx insn = emit_insn (ix86_gen_leave ());
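/* LEAVE is equivalent to "mov %ebp, %esp; pop %ebp": it both restores
   the frame pointer and deallocates the frame down to its save slot,
   which is why the bookkeeping below revalidates SP one word past the
   FP save slot and invalidates FP.  */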
10532
10533 ix86_add_queued_cfa_restore_notes (insn);
10534
10535 gcc_assert (m->fs.fp_valid);
10536 m->fs.sp_valid = true;
10537 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10538 m->fs.fp_valid = false;
10539
10540 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10541 {
10542 m->fs.cfa_reg = stack_pointer_rtx;
10543 m->fs.cfa_offset = m->fs.sp_offset;
10544
10545 add_reg_note (insn, REG_CFA_DEF_CFA,
10546 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10547 RTX_FRAME_RELATED_P (insn) = 1;
10548 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10549 m->fs.fp_offset);
10550 }
10551 }
10552
10553 /* Emit code to restore saved registers using MOV insns.
10554 First register is restored from CFA - CFA_OFFSET. */
10555 static void
10556 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10557 bool maybe_eh_return)
10558 {
10559 struct machine_function *m = cfun->machine;
10560 unsigned int regno;
10561
10562 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10563 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10564 {
10565 rtx reg = gen_rtx_REG (Pmode, regno);
10566 rtx insn, mem;
10567
10568 mem = choose_baseaddr (cfa_offset);
10569 mem = gen_frame_mem (Pmode, mem);
10570 insn = emit_move_insn (reg, mem);
10571
10572 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10573 {
10574 /* Previously we'd represented the CFA as an expression
10575 like *(%ebp - 8). We've just popped that value from
10576 the stack, which means we need to reset the CFA to
10577 the drap register. This will remain until we restore
10578 the stack pointer. */
10579 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10580 RTX_FRAME_RELATED_P (insn) = 1;
10581
10582 /* This means that the DRAP register is valid for addressing. */
10583 m->fs.drap_valid = true;
10584 }
10585 else
10586 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10587
10588 cfa_offset -= UNITS_PER_WORD;
10589 }
10590 }
10591
10592 /* Emit code to restore saved registers using MOV insns.
10593 First register is restored from CFA - CFA_OFFSET. */
10594 static void
10595 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10596 bool maybe_eh_return)
10597 {
10598 unsigned int regno;
10599
10600 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10601 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10602 {
10603 rtx reg = gen_rtx_REG (V4SFmode, regno);
10604 rtx mem;
10605
10606 mem = choose_baseaddr (cfa_offset);
10607 mem = gen_rtx_MEM (V4SFmode, mem);
10608 set_mem_align (mem, 128);
10609 emit_move_insn (reg, mem);
10610
10611 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10612
10613 cfa_offset -= 16;
10614 }
10615 }
10616
10617 /* Restore function stack, frame, and registers. */
10618
10619 void
10620 ix86_expand_epilogue (int style)
10621 {
10622 struct machine_function *m = cfun->machine;
10623 struct machine_frame_state frame_state_save = m->fs;
10624 struct ix86_frame frame;
10625 bool restore_regs_via_mov;
10626 bool using_drap;
10627
10628 ix86_finalize_stack_realign_flags ();
10629 ix86_compute_frame_layout (&frame);
10630
10631 m->fs.sp_valid = (!frame_pointer_needed
10632 || (current_function_sp_is_unchanging
10633 && !stack_realign_fp));
10634 gcc_assert (!m->fs.sp_valid
10635 || m->fs.sp_offset == frame.stack_pointer_offset);
10636
10637 /* The FP must be valid if the frame pointer is present. */
10638 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10639 gcc_assert (!m->fs.fp_valid
10640 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10641
10642 /* We must have *some* valid pointer to the stack frame. */
10643 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10644
10645 /* The DRAP is never valid at this point. */
10646 gcc_assert (!m->fs.drap_valid);
10647
10648 /* See the comment about red zone and frame
10649 pointer usage in ix86_expand_prologue. */
10650 if (frame_pointer_needed && frame.red_zone_size)
10651 emit_insn (gen_memory_blockage ());
10652
10653 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10654 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10655
10656 /* Determine the CFA offset of the end of the red-zone. */
10657 m->fs.red_zone_offset = 0;
10658 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10659 {
10660 /* The red-zone begins below the return address. */
10661 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10662
10663 /* When the register save area is in the aligned portion of
10664 the stack, determine the maximum runtime displacement that
10665 matches up with the aligned frame. */
10666 if (stack_realign_drap)
10667 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10668 + UNITS_PER_WORD);
10669 }
10670
10671 /* Special care must be taken for the normal return case of a function
10672 using eh_return: the eax and edx registers are marked as saved, but
10673 not restored along this path. Adjust the save location to match. */
10674 if (crtl->calls_eh_return && style != 2)
10675 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10676
10677 /* EH_RETURN requires the use of moves to function properly. */
10678 if (crtl->calls_eh_return)
10679 restore_regs_via_mov = true;
10680 /* SEH requires the use of pops to identify the epilogue. */
10681 else if (TARGET_SEH)
10682 restore_regs_via_mov = false;
10683 /* If we're only restoring one register and sp is not valid, then
10684 use a move instruction to restore the register, since it's
10685 less work than reloading sp and popping the register. */
10686 else if (!m->fs.sp_valid && frame.nregs <= 1)
10687 restore_regs_via_mov = true;
10688 else if (TARGET_EPILOGUE_USING_MOVE
10689 && cfun->machine->use_fast_prologue_epilogue
10690 && (frame.nregs > 1
10691 || m->fs.sp_offset != frame.reg_save_offset))
10692 restore_regs_via_mov = true;
10693 else if (frame_pointer_needed
10694 && !frame.nregs
10695 && m->fs.sp_offset != frame.reg_save_offset)
10696 restore_regs_via_mov = true;
10697 else if (frame_pointer_needed
10698 && TARGET_USE_LEAVE
10699 && cfun->machine->use_fast_prologue_epilogue
10700 && frame.nregs == 1)
10701 restore_regs_via_mov = true;
10702 else
10703 restore_regs_via_mov = false;
10704
10705 if (restore_regs_via_mov || frame.nsseregs)
10706 {
10707 /* Ensure that the entire register save area is addressable via
10708 the stack pointer, if we will restore via sp. */
10709 if (TARGET_64BIT
10710 && m->fs.sp_offset > 0x7fffffff
10711 && !(m->fs.fp_valid || m->fs.drap_valid)
10712 && (frame.nsseregs + frame.nregs) != 0)
10713 {
10714 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10715 GEN_INT (m->fs.sp_offset
10716 - frame.sse_reg_save_offset),
10717 style,
10718 m->fs.cfa_reg == stack_pointer_rtx);
10719 }
10720 }
10721
10722 /* If there are any SSE registers to restore, then we have to do it
10723 via moves, since there's obviously no pop for SSE regs. */
10724 if (frame.nsseregs)
10725 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10726 style == 2);
10727
10728 if (restore_regs_via_mov)
10729 {
10730 rtx t;
10731
10732 if (frame.nregs)
10733 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10734
10735 /* eh_return epilogues need %ecx added to the stack pointer. */
10736 if (style == 2)
10737 {
10738 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10739
10740 /* Stack align doesn't work with eh_return. */
10741 gcc_assert (!stack_realign_drap);
10742 /* Neither do regparm nested functions. */
10743 gcc_assert (!ix86_static_chain_on_stack);
10744
10745 if (frame_pointer_needed)
10746 {
10747 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10748 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10749 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10750
10751 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10752 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10753
10754 /* Note that we use SA as a temporary CFA, as the return
10755 address is at the proper place relative to it. We
10756 pretend this happens at the FP restore insn because
10757 prior to this insn the FP would be stored at the wrong
10758 offset relative to SA, and after this insn we have no
10759 other reasonable register to use for the CFA. We don't
10760 bother resetting the CFA to the SP for the duration of
10761 the return insn. */
10762 add_reg_note (insn, REG_CFA_DEF_CFA,
10763 plus_constant (sa, UNITS_PER_WORD));
10764 ix86_add_queued_cfa_restore_notes (insn);
10765 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10766 RTX_FRAME_RELATED_P (insn) = 1;
10767
10768 m->fs.cfa_reg = sa;
10769 m->fs.cfa_offset = UNITS_PER_WORD;
10770 m->fs.fp_valid = false;
10771
10772 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10773 const0_rtx, style, false);
10774 }
10775 else
10776 {
10777 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10778 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10779 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10780 ix86_add_queued_cfa_restore_notes (insn);
10781
10782 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10783 if (m->fs.cfa_offset != UNITS_PER_WORD)
10784 {
10785 m->fs.cfa_offset = UNITS_PER_WORD;
10786 add_reg_note (insn, REG_CFA_DEF_CFA,
10787 plus_constant (stack_pointer_rtx,
10788 UNITS_PER_WORD));
10789 RTX_FRAME_RELATED_P (insn) = 1;
10790 }
10791 }
10792 m->fs.sp_offset = UNITS_PER_WORD;
10793 m->fs.sp_valid = true;
10794 }
10795 }
10796 else
10797 {
10798 /* SEH requires that the function end with (1) a stack adjustment
10799 if necessary, (2) a sequence of pops, and (3) a return or
10800 jump instruction. Prevent insns from the function body from
10801 being scheduled into this sequence. */
10802 if (TARGET_SEH)
10803 {
10804 /* Prevent a catch region from being adjacent to the standard
10805 epilogue sequence. Unfortunately crtl->uses_eh_lsda and
10806 several other flags that would be interesting to test are
10807 not yet set up. */
10808 if (flag_non_call_exceptions)
10809 emit_insn (gen_nops (const1_rtx));
10810 else
10811 emit_insn (gen_blockage ());
10812 }
10813
10814 /* First step is to deallocate the stack frame so that we can
10815 pop the registers. */
10816 if (!m->fs.sp_valid)
10817 {
10818 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10819 GEN_INT (m->fs.fp_offset
10820 - frame.reg_save_offset),
10821 style, false);
10822 }
10823 else if (m->fs.sp_offset != frame.reg_save_offset)
10824 {
10825 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10826 GEN_INT (m->fs.sp_offset
10827 - frame.reg_save_offset),
10828 style,
10829 m->fs.cfa_reg == stack_pointer_rtx);
10830 }
10831
10832 ix86_emit_restore_regs_using_pop ();
10833 }
10834
10835 /* If we used a frame pointer and haven't already got rid of it,
10836 then do so now. */
10837 if (m->fs.fp_valid)
10838 {
10839 /* If the stack pointer is valid and pointing at the frame
10840 pointer store address, then we only need a pop. */
10841 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10842 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10843 /* Leave results in shorter dependency chains on CPUs that are
10844 able to grok it fast. */
10845 else if (TARGET_USE_LEAVE
10846 || optimize_function_for_size_p (cfun)
10847 || !cfun->machine->use_fast_prologue_epilogue)
10848 ix86_emit_leave ();
10849 else
10850 {
10851 pro_epilogue_adjust_stack (stack_pointer_rtx,
10852 hard_frame_pointer_rtx,
10853 const0_rtx, style, !using_drap);
10854 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10855 }
10856 }
10857
10858 if (using_drap)
10859 {
10860 int param_ptr_offset = UNITS_PER_WORD;
10861 rtx insn;
10862
10863 gcc_assert (stack_realign_drap);
10864
10865 if (ix86_static_chain_on_stack)
10866 param_ptr_offset += UNITS_PER_WORD;
10867 if (!call_used_regs[REGNO (crtl->drap_reg)])
10868 param_ptr_offset += UNITS_PER_WORD;
10869
10870 insn = emit_insn (gen_rtx_SET
10871 (VOIDmode, stack_pointer_rtx,
10872 gen_rtx_PLUS (Pmode,
10873 crtl->drap_reg,
10874 GEN_INT (-param_ptr_offset))));
10875 m->fs.cfa_reg = stack_pointer_rtx;
10876 m->fs.cfa_offset = param_ptr_offset;
10877 m->fs.sp_offset = param_ptr_offset;
10878 m->fs.realigned = false;
10879
10880 add_reg_note (insn, REG_CFA_DEF_CFA,
10881 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10882 GEN_INT (param_ptr_offset)));
10883 RTX_FRAME_RELATED_P (insn) = 1;
10884
10885 if (!call_used_regs[REGNO (crtl->drap_reg)])
10886 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10887 }
10888
10889 /* At this point the stack pointer must be valid, and we must have
10890 restored all of the registers. We may not have deallocated the
10891 entire stack frame. We've delayed this until now because it may
10892 be possible to merge the local stack deallocation with the
10893 deallocation forced by ix86_static_chain_on_stack. */
10894 gcc_assert (m->fs.sp_valid);
10895 gcc_assert (!m->fs.fp_valid);
10896 gcc_assert (!m->fs.realigned);
10897 if (m->fs.sp_offset != UNITS_PER_WORD)
10898 {
10899 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10900 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10901 style, true);
10902 }
10903 else
10904 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10905
10906 /* Sibcall epilogues don't want a return instruction. */
10907 if (style == 0)
10908 {
10909 m->fs = frame_state_save;
10910 return;
10911 }
10912
10913 /* Emit vzeroupper if needed. */
10914 if (TARGET_VZEROUPPER
10915 && !TREE_THIS_VOLATILE (cfun->decl)
10916 && !cfun->machine->caller_return_avx256_p)
10917 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10918
10919 if (crtl->args.pops_args && crtl->args.size)
10920 {
10921 rtx popc = GEN_INT (crtl->args.pops_args);
10922
10923 /* i386 can only pop 64K bytes. If asked to pop more, pop return
10924 address, do explicit add, and jump indirectly to the caller. */
10925
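/* The sequence emitted for this case is roughly:
       pop  %ecx          (fetch the return address)
       add  $popc, %esp   (pop the arguments explicitly)
       jmp  *%ecx         (return to the caller)
   32-bit only; see the gcc_assert below.  */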
10926 if (crtl->args.pops_args >= 65536)
10927 {
10928 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10929 rtx insn;
10930
10931 /* There is no "pascal" calling convention in any 64bit ABI. */
10932 gcc_assert (!TARGET_64BIT);
10933
10934 insn = emit_insn (gen_pop (ecx));
10935 m->fs.cfa_offset -= UNITS_PER_WORD;
10936 m->fs.sp_offset -= UNITS_PER_WORD;
10937
10938 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10939 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10940 add_reg_note (insn, REG_CFA_REGISTER,
10941 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10942 RTX_FRAME_RELATED_P (insn) = 1;
10943
10944 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10945 popc, -1, true);
10946 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10947 }
10948 else
10949 emit_jump_insn (gen_simple_return_pop_internal (popc));
10950 }
10951 else
10952 emit_jump_insn (gen_simple_return_internal ());
10953
10954 /* Restore the state back to the state from the prologue,
10955 so that it's correct for the next epilogue. */
10956 m->fs = frame_state_save;
10957 }
10958
10959 /* Reset from the function's potential modifications. */
10960
10961 static void
10962 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10963 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10964 {
10965 if (pic_offset_table_rtx)
10966 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10967 #if TARGET_MACHO
10968 /* Mach-O doesn't support labels at the end of objects, so if
10969 it looks like we might want one, insert a NOP. */
10970 {
10971 rtx insn = get_last_insn ();
10972 rtx deleted_debug_label = NULL_RTX;
10973 while (insn
10974 && NOTE_P (insn)
10975 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10976 {
10977 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
10978 notes only, instead set their CODE_LABEL_NUMBER to -1,
10979 otherwise there would be code generation differences
10980 in between -g and -g0. */
10981 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10982 deleted_debug_label = insn;
10983 insn = PREV_INSN (insn);
10984 }
10985 if (insn
10986 && (LABEL_P (insn)
10987 || (NOTE_P (insn)
10988 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10989 fputs ("\tnop\n", file);
10990 else if (deleted_debug_label)
10991 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
10992 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10993 CODE_LABEL_NUMBER (insn) = -1;
10994 }
10995 #endif
10996
10997 }
10998
10999 /* Return a scratch register to use in the split stack prologue. The
11000 split stack prologue is used for -fsplit-stack. It consists of the first
11001 instructions in the function, emitted even before the regular prologue.
11002 The scratch register can be any caller-saved register which is not
11003 used for parameters or for the static chain. */
11004
11005 static unsigned int
11006 split_stack_prologue_scratch_regno (void)
11007 {
11008 if (TARGET_64BIT)
11009 return R11_REG;
11010 else
11011 {
11012 bool is_fastcall;
11013 int regparm;
11014
11015 is_fastcall = (lookup_attribute ("fastcall",
11016 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11017 != NULL);
11018 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11019
11020 if (is_fastcall)
11021 {
11022 if (DECL_STATIC_CHAIN (cfun->decl))
11023 {
11024 sorry ("-fsplit-stack does not support fastcall with "
11025 "nested function");
11026 return INVALID_REGNUM;
11027 }
11028 return AX_REG;
11029 }
11030 else if (regparm < 3)
11031 {
11032 if (!DECL_STATIC_CHAIN (cfun->decl))
11033 return CX_REG;
11034 else
11035 {
11036 if (regparm >= 2)
11037 {
11038 sorry ("-fsplit-stack does not support 2 register "
11039 " parameters for a nested function");
11040 return INVALID_REGNUM;
11041 }
11042 return DX_REG;
11043 }
11044 }
11045 else
11046 {
11047 /* FIXME: We could make this work by pushing a register
11048 around the addition and comparison. */
11049 sorry ("-fsplit-stack does not support 3 register parameters");
11050 return INVALID_REGNUM;
11051 }
11052 }
11053 }
11054
11055 /* A SYMBOL_REF for the function which allocates new stack space for
11056 -fsplit-stack. */
11057
11058 static GTY(()) rtx split_stack_fn;
11059
11060 /* A SYMBOL_REF for the more stack function when using the large
11061 model. */
11062
11063 static GTY(()) rtx split_stack_fn_large;
11064
11065 /* Handle -fsplit-stack. These are the first instructions in the
11066 function, even before the regular prologue. */
11067
11068 void
11069 ix86_expand_split_stack_prologue (void)
11070 {
11071 struct ix86_frame frame;
11072 HOST_WIDE_INT allocate;
11073 unsigned HOST_WIDE_INT args_size;
11074 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11075 rtx scratch_reg = NULL_RTX;
11076 rtx varargs_label = NULL_RTX;
11077 rtx fn;
11078
11079 gcc_assert (flag_split_stack && reload_completed);
11080
11081 ix86_finalize_stack_realign_flags ();
11082 ix86_compute_frame_layout (&frame);
11083 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11084
11085 /* This is the label we will branch to if we have enough stack
11086 space. We expect the basic block reordering pass to reverse this
11087 branch if optimizing, so that we branch in the unlikely case. */
11088 label = gen_label_rtx ();
11089
11090 /* We need to compare the stack pointer minus the frame size with
11091 the stack boundary in the TCB. The stack boundary always gives
11092 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11093 can compare directly. Otherwise we need to do an addition. */
11094
11095 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11096 UNSPEC_STACK_CHECK);
11097 limit = gen_rtx_CONST (Pmode, limit);
11098 limit = gen_rtx_MEM (Pmode, limit);
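/* The UNSPEC_STACK_CHECK memory reference built above is presumably
   rendered as a thread-pointer-relative operand, i.e. a %fs- or
   %gs-relative load of the stack boundary that the split-stack runtime
   keeps in the TCB.  */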
11099 if (allocate < SPLIT_STACK_AVAILABLE)
11100 current = stack_pointer_rtx;
11101 else
11102 {
11103 unsigned int scratch_regno;
11104 rtx offset;
11105
11106 /* We need a scratch register to hold the stack pointer minus
11107 the required frame size. Since this is the very start of the
11108 function, the scratch register can be any caller-saved
11109 register which is not used for parameters. */
11110 offset = GEN_INT (- allocate);
11111 scratch_regno = split_stack_prologue_scratch_regno ();
11112 if (scratch_regno == INVALID_REGNUM)
11113 return;
11114 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11115 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11116 {
11117 /* We don't use ix86_gen_add3 in this case because it will
11118 want to split to lea, but when not optimizing the insn
11119 will not be split after this point. */
11120 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11121 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11122 offset)));
11123 }
11124 else
11125 {
11126 emit_move_insn (scratch_reg, offset);
11127 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11128 stack_pointer_rtx));
11129 }
11130 current = scratch_reg;
11131 }
11132
11133 ix86_expand_branch (GEU, current, limit, label);
11134 jump_insn = get_last_insn ();
11135 JUMP_LABEL (jump_insn) = label;
11136
11137 /* Mark the jump as very likely to be taken. */
11138 add_reg_note (jump_insn, REG_BR_PROB,
11139 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
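/* The probability recorded above is REG_BR_PROB_BASE * 99 / 100,
   i.e. the branch to LABEL (the enough-stack case) is predicted taken
   99% of the time.  */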
11140
11141 if (split_stack_fn == NULL_RTX)
11142 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11143 fn = split_stack_fn;
11144
11145 /* Get more stack space. We pass in the desired stack space and the
11146 size of the arguments to copy to the new stack. In 32-bit mode
11147 we push the parameters; __morestack will return on a new stack
11148 anyhow. In 64-bit mode we pass the parameters in r10 and
11149 r11. */
11150 allocate_rtx = GEN_INT (allocate);
11151 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11152 call_fusage = NULL_RTX;
11153 if (TARGET_64BIT)
11154 {
11155 rtx reg10, reg11;
11156
11157 reg10 = gen_rtx_REG (Pmode, R10_REG);
11158 reg11 = gen_rtx_REG (Pmode, R11_REG);
11159
11160 /* If this function uses a static chain, it will be in %r10.
11161 Preserve it across the call to __morestack. */
11162 if (DECL_STATIC_CHAIN (cfun->decl))
11163 {
11164 rtx rax;
11165
11166 rax = gen_rtx_REG (Pmode, AX_REG);
11167 emit_move_insn (rax, reg10);
11168 use_reg (&call_fusage, rax);
11169 }
11170
11171 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11172 {
11173 HOST_WIDE_INT argval;
11174
11175 /* When using the large model we need to load the address
11176 into a register, and we've run out of registers. So we
11177 switch to a different calling convention, and we call a
11178 different function: __morestack_large_model. We pass the
11179 argument size in the upper 32 bits of r10 and pass the
11180 frame size in the lower 32 bits. */
11181 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11182 gcc_assert ((args_size & 0xffffffff) == args_size);
11183
11184 if (split_stack_fn_large == NULL_RTX)
11185 split_stack_fn_large =
11186 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11187
11188 if (ix86_cmodel == CM_LARGE_PIC)
11189 {
11190 rtx label, x;
11191
11192 label = gen_label_rtx ();
11193 emit_label (label);
11194 LABEL_PRESERVE_P (label) = 1;
11195 emit_insn (gen_set_rip_rex64 (reg10, label));
11196 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11197 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11198 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11199 UNSPEC_GOT);
11200 x = gen_rtx_CONST (Pmode, x);
11201 emit_move_insn (reg11, x);
11202 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11203 x = gen_const_mem (Pmode, x);
11204 emit_move_insn (reg11, x);
11205 }
11206 else
11207 emit_move_insn (reg11, split_stack_fn_large);
11208
11209 fn = reg11;
11210
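/* ARGVAL packs args_size into the upper 32 bits and the frame size
   into the lower 32 bits, as described above.  The shift is written as
   two 16-bit shifts, presumably so the expression stays valid on hosts
   where HOST_WIDE_INT is narrower than 64 bits.  */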
11211 argval = ((args_size << 16) << 16) + allocate;
11212 emit_move_insn (reg10, GEN_INT (argval));
11213 }
11214 else
11215 {
11216 emit_move_insn (reg10, allocate_rtx);
11217 emit_move_insn (reg11, GEN_INT (args_size));
11218 use_reg (&call_fusage, reg11);
11219 }
11220
11221 use_reg (&call_fusage, reg10);
11222 }
11223 else
11224 {
11225 emit_insn (gen_push (GEN_INT (args_size)));
11226 emit_insn (gen_push (allocate_rtx));
11227 }
11228 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11229 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11230 NULL_RTX, false);
11231 add_function_usage_to (call_insn, call_fusage);
11232
11233 /* In order to make call/return prediction work right, we now need
11234 to execute a return instruction. See
11235 libgcc/config/i386/morestack.S for the details on how this works.
11236
11237 For flow purposes gcc must not see this as a return
11238 instruction--we need control flow to continue at the subsequent
11239 label. Therefore, we use an unspec. */
11240 gcc_assert (crtl->args.pops_args < 65536);
11241 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11242
11243 /* If we are in 64-bit mode and this function uses a static chain,
11244 we saved %r10 in %rax before calling __morestack. */
11245 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11246 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11247 gen_rtx_REG (Pmode, AX_REG));
11248
11249 /* If this function calls va_start, we need to store a pointer to
11250 the arguments on the old stack, because they may not have been
11251 all copied to the new stack. At this point the old stack can be
11252 found at the frame pointer value used by __morestack, because
11253 __morestack has set that up before calling back to us. Here we
11254 store that pointer in a scratch register, and in
11255 ix86_expand_prologue we store the scratch register in a stack
11256 slot. */
11257 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11258 {
11259 unsigned int scratch_regno;
11260 rtx frame_reg;
11261 int words;
11262
11263 scratch_regno = split_stack_prologue_scratch_regno ();
11264 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11265 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11266
11267 /* 64-bit:
11268 fp -> old fp value
11269 return address within this function
11270 return address of caller of this function
11271 stack arguments
11272 So we add three words to get to the stack arguments.
11273
11274 32-bit:
11275 fp -> old fp value
11276 return address within this function
11277 first argument to __morestack
11278 second argument to __morestack
11279 return address of caller of this function
11280 stack arguments
11281 So we add five words to get to the stack arguments.
11282 */
11283 words = TARGET_64BIT ? 3 : 5;
11284 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11285 gen_rtx_PLUS (Pmode, frame_reg,
11286 GEN_INT (words * UNITS_PER_WORD))));
11287
11288 varargs_label = gen_label_rtx ();
11289 emit_jump_insn (gen_jump (varargs_label));
11290 JUMP_LABEL (get_last_insn ()) = varargs_label;
11291
11292 emit_barrier ();
11293 }
11294
11295 emit_label (label);
11296 LABEL_NUSES (label) = 1;
11297
11298 /* If this function calls va_start, we now have to set the scratch
11299 register for the case where we do not call __morestack. In this
11300 case we need to set it based on the stack pointer. */
11301 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11302 {
11303 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11304 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11305 GEN_INT (UNITS_PER_WORD))));
11306
11307 emit_label (varargs_label);
11308 LABEL_NUSES (varargs_label) = 1;
11309 }
11310 }
11311
11312 /* We may have to tell the dataflow pass that the split stack prologue
11313 is initializing a scratch register. */
11314
11315 static void
11316 ix86_live_on_entry (bitmap regs)
11317 {
11318 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11319 {
11320 gcc_assert (flag_split_stack);
11321 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11322 }
11323 }
11324 \f
11325 /* Determine if OP is a suitable SUBREG RTX for an address. */
11326
11327 static bool
11328 ix86_address_subreg_operand (rtx op)
11329 {
11330 enum machine_mode mode;
11331
11332 if (!REG_P (op))
11333 return false;
11334
11335 mode = GET_MODE (op);
11336
11337 if (GET_MODE_CLASS (mode) != MODE_INT)
11338 return false;
11339
11340 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11341 failures when the register is one word out of a two word structure. */
11342 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11343 return false;
11344
11345 /* Allow only SUBREGs of non-eliminable hard registers. */
11346 return register_no_elim_operand (op, mode);
11347 }
11348
11349 /* Extract the parts of an RTL expression that is a valid memory address
11350 for an instruction. Return 0 if the structure of the address is
11351 grossly off. Return -1 if the address contains ASHIFT, so it is not
11352 strictly valid, but can still be used for computing the length of an lea instruction. */
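/* For example, the address

       (plus:SI (plus:SI (mult:SI (reg:SI B) (const_int 4))
                         (reg:SI A))
                (const_int 12))

   decomposes into base = A, index = B, scale = 4, disp = 12,
   i.e. the operand 12(%A,%B,4).  */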
11353
11354 int
11355 ix86_decompose_address (rtx addr, struct ix86_address *out)
11356 {
11357 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11358 rtx base_reg, index_reg;
11359 HOST_WIDE_INT scale = 1;
11360 rtx scale_rtx = NULL_RTX;
11361 rtx tmp;
11362 int retval = 1;
11363 enum ix86_address_seg seg = SEG_DEFAULT;
11364
11365 /* Allow zero-extended SImode addresses;
11366 they will be emitted with the addr32 prefix. */
11367 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11368 {
11369 if (GET_CODE (addr) == ZERO_EXTEND
11370 && GET_MODE (XEXP (addr, 0)) == SImode)
11371 addr = XEXP (addr, 0);
11372 else if (GET_CODE (addr) == AND
11373 && const_32bit_mask (XEXP (addr, 1), DImode))
11374 {
11375 addr = XEXP (addr, 0);
11376
11377 /* Strip subreg. */
11378 if (GET_CODE (addr) == SUBREG
11379 && GET_MODE (SUBREG_REG (addr)) == SImode)
11380 addr = SUBREG_REG (addr);
11381 }
11382 }
11383
11384 if (REG_P (addr))
11385 base = addr;
11386 else if (GET_CODE (addr) == SUBREG)
11387 {
11388 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11389 base = addr;
11390 else
11391 return 0;
11392 }
11393 else if (GET_CODE (addr) == PLUS)
11394 {
11395 rtx addends[4], op;
11396 int n = 0, i;
11397
11398 op = addr;
11399 do
11400 {
11401 if (n >= 4)
11402 return 0;
11403 addends[n++] = XEXP (op, 1);
11404 op = XEXP (op, 0);
11405 }
11406 while (GET_CODE (op) == PLUS);
11407 if (n >= 4)
11408 return 0;
11409 addends[n] = op;
11410
11411 for (i = n; i >= 0; --i)
11412 {
11413 op = addends[i];
11414 switch (GET_CODE (op))
11415 {
11416 case MULT:
11417 if (index)
11418 return 0;
11419 index = XEXP (op, 0);
11420 scale_rtx = XEXP (op, 1);
11421 break;
11422
11423 case ASHIFT:
11424 if (index)
11425 return 0;
11426 index = XEXP (op, 0);
11427 tmp = XEXP (op, 1);
11428 if (!CONST_INT_P (tmp))
11429 return 0;
11430 scale = INTVAL (tmp);
11431 if ((unsigned HOST_WIDE_INT) scale > 3)
11432 return 0;
11433 scale = 1 << scale;
11434 break;
11435
11436 case UNSPEC:
11437 if (XINT (op, 1) == UNSPEC_TP
11438 && TARGET_TLS_DIRECT_SEG_REFS
11439 && seg == SEG_DEFAULT)
11440 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11441 else
11442 return 0;
11443 break;
11444
11445 case SUBREG:
11446 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11447 return 0;
11448 /* FALLTHRU */
11449
11450 case REG:
11451 if (!base)
11452 base = op;
11453 else if (!index)
11454 index = op;
11455 else
11456 return 0;
11457 break;
11458
11459 case CONST:
11460 case CONST_INT:
11461 case SYMBOL_REF:
11462 case LABEL_REF:
11463 if (disp)
11464 return 0;
11465 disp = op;
11466 break;
11467
11468 default:
11469 return 0;
11470 }
11471 }
11472 }
11473 else if (GET_CODE (addr) == MULT)
11474 {
11475 index = XEXP (addr, 0); /* index*scale */
11476 scale_rtx = XEXP (addr, 1);
11477 }
11478 else if (GET_CODE (addr) == ASHIFT)
11479 {
11480 /* We're called for lea too, which implements ashift on occasion. */
11481 index = XEXP (addr, 0);
11482 tmp = XEXP (addr, 1);
11483 if (!CONST_INT_P (tmp))
11484 return 0;
11485 scale = INTVAL (tmp);
11486 if ((unsigned HOST_WIDE_INT) scale > 3)
11487 return 0;
11488 scale = 1 << scale;
11489 retval = -1;
11490 }
11491 else
11492 disp = addr; /* displacement */
11493
11494 if (index)
11495 {
11496 if (REG_P (index))
11497 ;
11498 else if (GET_CODE (index) == SUBREG
11499 && ix86_address_subreg_operand (SUBREG_REG (index)))
11500 ;
11501 else
11502 return 0;
11503 }
11504
11505 /* Extract the integral value of scale. */
11506 if (scale_rtx)
11507 {
11508 if (!CONST_INT_P (scale_rtx))
11509 return 0;
11510 scale = INTVAL (scale_rtx);
11511 }
11512
11513 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11514 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11515
11516 /* Avoid useless 0 displacement. */
11517 if (disp == const0_rtx && (base || index))
11518 disp = NULL_RTX;
11519
11520 /* Allow the arg pointer and stack pointer as an index if there is no scaling; swap them into the base position, since the stack pointer cannot be encoded as an index register. */
11521 if (base_reg && index_reg && scale == 1
11522 && (index_reg == arg_pointer_rtx
11523 || index_reg == frame_pointer_rtx
11524 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11525 {
11526 rtx tmp;
11527 tmp = base, base = index, index = tmp;
11528 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11529 }
11530
11531 /* Special case: %ebp cannot be encoded as a base without a displacement.
11532 Similarly %r13. */
11533 if (!disp
11534 && base_reg
11535 && (base_reg == hard_frame_pointer_rtx
11536 || base_reg == frame_pointer_rtx
11537 || base_reg == arg_pointer_rtx
11538 || (REG_P (base_reg)
11539 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11540 || REGNO (base_reg) == R13_REG))))
11541 disp = const0_rtx;
11542
11543 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11544 Avoid this by transforming to [%esi+0].
11545 Reload calls address legitimization without cfun defined, so we need
11546 to test cfun for being non-NULL. */
11547 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11548 && base_reg && !index_reg && !disp
11549 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11550 disp = const0_rtx;
11551
11552 /* Special case: encode reg+reg instead of reg*2. */
11553 if (!base && index && scale == 2)
11554 base = index, base_reg = index_reg, scale = 1;
11555
11556 /* Special case: scaling cannot be encoded without base or displacement. */
11557 if (!base && !disp && index && scale != 1)
11558 disp = const0_rtx;
11559
11560 out->base = base;
11561 out->index = index;
11562 out->disp = disp;
11563 out->scale = scale;
11564 out->seg = seg;
11565
11566 return retval;
11567 }
11568 \f
11569 /* Return cost of the memory address x.
11570 For i386, it is better to use a complex address than let gcc copy
11571 the address into a reg and make a new pseudo. But not if the address
11572 requires two regs - that would mean more pseudos with longer
11573 lifetimes. */
11574 static int
11575 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11576 {
11577 struct ix86_address parts;
11578 int cost = 1;
11579 int ok = ix86_decompose_address (x, &parts);
11580
11581 gcc_assert (ok);
11582
11583 if (parts.base && GET_CODE (parts.base) == SUBREG)
11584 parts.base = SUBREG_REG (parts.base);
11585 if (parts.index && GET_CODE (parts.index) == SUBREG)
11586 parts.index = SUBREG_REG (parts.index);
11587
11588 /* Attempt to minimize number of registers in the address. */
11589 if ((parts.base
11590 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11591 || (parts.index
11592 && (!REG_P (parts.index)
11593 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11594 cost++;
11595
11596 if (parts.base
11597 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11598 && parts.index
11599 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11600 && parts.base != parts.index)
11601 cost++;
11602
11603 /* The AMD K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11604 since its predecode logic can't detect the length of such instructions
11605 and they degenerate to vector decoding. Increase the cost of such
11606 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11607 to split such addresses or even to refuse them entirely.
11608
11609 Following addressing modes are affected:
11610 [base+scale*index]
11611 [scale*index+disp]
11612 [base+index]
11613
11614 The first and last case may be avoidable by explicitly coding the zero in
11615 the memory address, but I don't have an AMD-K6 machine handy to check this
11616 theory. */
11617
11618 if (TARGET_K6
11619 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11620 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11621 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11622 cost += 10;
11623
11624 return cost;
11625 }
11626 \f
11627 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11628 this is used to form addresses to local data when -fPIC is in
11629 use. */
11630
11631 static bool
11632 darwin_local_data_pic (rtx disp)
11633 {
11634 return (GET_CODE (disp) == UNSPEC
11635 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11636 }
11637
11638 /* Determine if a given RTX is a valid constant. We already know this
11639 satisfies CONSTANT_P. */
11640
11641 static bool
11642 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11643 {
11644 switch (GET_CODE (x))
11645 {
11646 case CONST:
11647 x = XEXP (x, 0);
11648
11649 if (GET_CODE (x) == PLUS)
11650 {
11651 if (!CONST_INT_P (XEXP (x, 1)))
11652 return false;
11653 x = XEXP (x, 0);
11654 }
11655
11656 if (TARGET_MACHO && darwin_local_data_pic (x))
11657 return true;
11658
11659 /* Only some unspecs are valid as "constants". */
11660 if (GET_CODE (x) == UNSPEC)
11661 switch (XINT (x, 1))
11662 {
11663 case UNSPEC_GOT:
11664 case UNSPEC_GOTOFF:
11665 case UNSPEC_PLTOFF:
11666 return TARGET_64BIT;
11667 case UNSPEC_TPOFF:
11668 case UNSPEC_NTPOFF:
11669 x = XVECEXP (x, 0, 0);
11670 return (GET_CODE (x) == SYMBOL_REF
11671 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11672 case UNSPEC_DTPOFF:
11673 x = XVECEXP (x, 0, 0);
11674 return (GET_CODE (x) == SYMBOL_REF
11675 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11676 default:
11677 return false;
11678 }
11679
11680 /* We must have drilled down to a symbol. */
11681 if (GET_CODE (x) == LABEL_REF)
11682 return true;
11683 if (GET_CODE (x) != SYMBOL_REF)
11684 return false;
11685 /* FALLTHRU */
11686
11687 case SYMBOL_REF:
11688 /* TLS symbols are never valid. */
11689 if (SYMBOL_REF_TLS_MODEL (x))
11690 return false;
11691
11692 /* DLLIMPORT symbols are never valid. */
11693 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11694 && SYMBOL_REF_DLLIMPORT_P (x))
11695 return false;
11696
11697 #if TARGET_MACHO
11698 /* mdynamic-no-pic */
11699 if (MACHO_DYNAMIC_NO_PIC_P)
11700 return machopic_symbol_defined_p (x);
11701 #endif
11702 break;
11703
11704 case CONST_DOUBLE:
11705 if (GET_MODE (x) == TImode
11706 && x != CONST0_RTX (TImode)
11707 && !TARGET_64BIT)
11708 return false;
11709 break;
11710
11711 case CONST_VECTOR:
11712 if (!standard_sse_constant_p (x))
11713 return false;
11714
11715 default:
11716 break;
11717 }
11718
11719 /* Otherwise we handle everything else in the move patterns. */
11720 return true;
11721 }
11722
11723 /* Determine if it's legal to put X into the constant pool. This
11724 is not possible for the address of thread-local symbols, which
11725 is checked above. */
11726
11727 static bool
11728 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11729 {
11730 /* We can always put integral constants and vectors in memory. */
11731 switch (GET_CODE (x))
11732 {
11733 case CONST_INT:
11734 case CONST_DOUBLE:
11735 case CONST_VECTOR:
11736 return false;
11737
11738 default:
11739 break;
11740 }
11741 return !ix86_legitimate_constant_p (mode, x);
11742 }
11743
11744
11745 /* Nonzero if the constant value X is a legitimate general operand
11746 when generating PIC code. It is given that flag_pic is on and
11747 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11748
11749 bool
11750 legitimate_pic_operand_p (rtx x)
11751 {
11752 rtx inner;
11753
11754 switch (GET_CODE (x))
11755 {
11756 case CONST:
11757 inner = XEXP (x, 0);
11758 if (GET_CODE (inner) == PLUS
11759 && CONST_INT_P (XEXP (inner, 1)))
11760 inner = XEXP (inner, 0);
11761
11762 /* Only some unspecs are valid as "constants". */
11763 if (GET_CODE (inner) == UNSPEC)
11764 switch (XINT (inner, 1))
11765 {
11766 case UNSPEC_GOT:
11767 case UNSPEC_GOTOFF:
11768 case UNSPEC_PLTOFF:
11769 return TARGET_64BIT;
11770 case UNSPEC_TPOFF:
11771 x = XVECEXP (inner, 0, 0);
11772 return (GET_CODE (x) == SYMBOL_REF
11773 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11774 case UNSPEC_MACHOPIC_OFFSET:
11775 return legitimate_pic_address_disp_p (x);
11776 default:
11777 return false;
11778 }
11779 /* FALLTHRU */
11780
11781 case SYMBOL_REF:
11782 case LABEL_REF:
11783 return legitimate_pic_address_disp_p (x);
11784
11785 default:
11786 return true;
11787 }
11788 }
11789
11790 /* Determine if a given CONST RTX is a valid memory displacement
11791 in PIC mode. */
11792
11793 bool
11794 legitimate_pic_address_disp_p (rtx disp)
11795 {
11796 bool saw_plus;
11797
11798 /* In 64bit mode we can allow direct addresses of symbols and labels
11799 when they are not dynamic symbols. */
11800 if (TARGET_64BIT)
11801 {
11802 rtx op0 = disp, op1;
11803
11804 switch (GET_CODE (disp))
11805 {
11806 case LABEL_REF:
11807 return true;
11808
11809 case CONST:
11810 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11811 break;
11812 op0 = XEXP (XEXP (disp, 0), 0);
11813 op1 = XEXP (XEXP (disp, 0), 1);
11814 if (!CONST_INT_P (op1)
11815 || INTVAL (op1) >= 16*1024*1024
11816 || INTVAL (op1) < -16*1024*1024)
11817 break;
11818 if (GET_CODE (op0) == LABEL_REF)
11819 return true;
11820 if (GET_CODE (op0) != SYMBOL_REF)
11821 break;
11822 /* FALLTHRU */
11823
11824 case SYMBOL_REF:
11825 /* TLS references should always be enclosed in UNSPEC. */
11826 if (SYMBOL_REF_TLS_MODEL (op0))
11827 return false;
11828 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11829 && ix86_cmodel != CM_LARGE_PIC)
11830 return true;
11831 break;
11832
11833 default:
11834 break;
11835 }
11836 }
11837 if (GET_CODE (disp) != CONST)
11838 return false;
11839 disp = XEXP (disp, 0);
11840
11841 if (TARGET_64BIT)
11842 {
11843 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
11844 of GOT table references. We should not need these anyway. */
11845 if (GET_CODE (disp) != UNSPEC
11846 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11847 && XINT (disp, 1) != UNSPEC_GOTOFF
11848 && XINT (disp, 1) != UNSPEC_PCREL
11849 && XINT (disp, 1) != UNSPEC_PLTOFF))
11850 return false;
11851
11852 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11853 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11854 return false;
11855 return true;
11856 }
11857
11858 saw_plus = false;
11859 if (GET_CODE (disp) == PLUS)
11860 {
11861 if (!CONST_INT_P (XEXP (disp, 1)))
11862 return false;
11863 disp = XEXP (disp, 0);
11864 saw_plus = true;
11865 }
11866
11867 if (TARGET_MACHO && darwin_local_data_pic (disp))
11868 return true;
11869
11870 if (GET_CODE (disp) != UNSPEC)
11871 return false;
11872
11873 switch (XINT (disp, 1))
11874 {
11875 case UNSPEC_GOT:
11876 if (saw_plus)
11877 return false;
11878 /* We need to check for both symbols and labels because VxWorks loads
11879 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11880 details. */
11881 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11882 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11883 case UNSPEC_GOTOFF:
11884 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11885 While the ABI also specifies a 32bit relocation, we don't produce it in
11886 the small PIC model at all. */
11887 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11888 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11889 && !TARGET_64BIT)
11890 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11891 return false;
11892 case UNSPEC_GOTTPOFF:
11893 case UNSPEC_GOTNTPOFF:
11894 case UNSPEC_INDNTPOFF:
11895 if (saw_plus)
11896 return false;
11897 disp = XVECEXP (disp, 0, 0);
11898 return (GET_CODE (disp) == SYMBOL_REF
11899 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11900 case UNSPEC_NTPOFF:
11901 disp = XVECEXP (disp, 0, 0);
11902 return (GET_CODE (disp) == SYMBOL_REF
11903 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11904 case UNSPEC_DTPOFF:
11905 disp = XVECEXP (disp, 0, 0);
11906 return (GET_CODE (disp) == SYMBOL_REF
11907 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11908 }
11909
11910 return false;
11911 }
11912
11913 /* Recognizes RTL expressions that are valid memory addresses for an
11914 instruction. The MODE argument is the machine mode for the MEM
11915 expression that wants to use this address.
11916
11917 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11918 convert common non-canonical forms to canonical form so that they will
11919 be recognized. */
11920
11921 static bool
11922 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11923 rtx addr, bool strict)
11924 {
11925 struct ix86_address parts;
11926 rtx base, index, disp;
11927 HOST_WIDE_INT scale;
11928
11929 if (ix86_decompose_address (addr, &parts) <= 0)
11930 /* Decomposition failed. */
11931 return false;
11932
11933 base = parts.base;
11934 index = parts.index;
11935 disp = parts.disp;
11936 scale = parts.scale;
11937
11938 /* Validate base register. */
11939 if (base)
11940 {
11941 rtx reg;
11942
11943 if (REG_P (base))
11944 reg = base;
11945 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11946 reg = SUBREG_REG (base);
11947 else
11948 /* Base is not a register. */
11949 return false;
11950
11951 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11952 return false;
11953
11954 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11955 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11956 /* Base is not valid. */
11957 return false;
11958 }
11959
11960 /* Validate index register. */
11961 if (index)
11962 {
11963 rtx reg;
11964
11965 if (REG_P (index))
11966 reg = index;
11967 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
11968 reg = SUBREG_REG (index);
11969 else
11970 /* Index is not a register. */
11971 return false;
11972
11973 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
11974 return false;
11975
11976 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11977 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11978 /* Index is not valid. */
11979 return false;
11980 }
11981
11982 /* Index and base should have the same mode. */
11983 if (base && index
11984 && GET_MODE (base) != GET_MODE (index))
11985 return false;
11986
11987 /* Validate scale factor. */
11988 if (scale != 1)
11989 {
11990 if (!index)
11991 /* Scale without index. */
11992 return false;
11993
11994 if (scale != 2 && scale != 4 && scale != 8)
11995 /* Scale is not a valid multiplier. */
11996 return false;
11997 }
11998
11999 /* Validate displacement. */
12000 if (disp)
12001 {
12002 if (GET_CODE (disp) == CONST
12003 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12004 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12005 switch (XINT (XEXP (disp, 0), 1))
12006 {
12007 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12008 used. While the ABI also specifies 32bit relocations, we don't produce
12009 them at all and use IP-relative addressing instead. */
12010 case UNSPEC_GOT:
12011 case UNSPEC_GOTOFF:
12012 gcc_assert (flag_pic);
12013 if (!TARGET_64BIT)
12014 goto is_legitimate_pic;
12015
12016 /* 64bit address unspec. */
12017 return false;
12018
12019 case UNSPEC_GOTPCREL:
12020 case UNSPEC_PCREL:
12021 gcc_assert (flag_pic);
12022 goto is_legitimate_pic;
12023
12024 case UNSPEC_GOTTPOFF:
12025 case UNSPEC_GOTNTPOFF:
12026 case UNSPEC_INDNTPOFF:
12027 case UNSPEC_NTPOFF:
12028 case UNSPEC_DTPOFF:
12029 break;
12030
12031 case UNSPEC_STACK_CHECK:
12032 gcc_assert (flag_split_stack);
12033 break;
12034
12035 default:
12036 /* Invalid address unspec. */
12037 return false;
12038 }
12039
12040 else if (SYMBOLIC_CONST (disp)
12041 && (flag_pic
12042 || (TARGET_MACHO
12043 #if TARGET_MACHO
12044 && MACHOPIC_INDIRECT
12045 && !machopic_operand_p (disp)
12046 #endif
12047 )))
12048 {
12049
12050 is_legitimate_pic:
12051 if (TARGET_64BIT && (index || base))
12052 {
12053 /* foo@dtpoff(%rX) is ok. */
12054 if (GET_CODE (disp) != CONST
12055 || GET_CODE (XEXP (disp, 0)) != PLUS
12056 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12057 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12058 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12059 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12060 /* Non-constant pic memory reference. */
12061 return false;
12062 }
12063 else if ((!TARGET_MACHO || flag_pic)
12064 && ! legitimate_pic_address_disp_p (disp))
12065 /* Displacement is an invalid pic construct. */
12066 return false;
12067 #if TARGET_MACHO
12068 else if (MACHO_DYNAMIC_NO_PIC_P
12069 && !ix86_legitimate_constant_p (Pmode, disp))
12070 /* displacement must be referenced via non_lazy_pointer */
12071 return false;
12072 #endif
12073
12074 /* This code used to verify that a symbolic pic displacement
12075 includes the pic_offset_table_rtx register.
12076
12077 While this is a good idea, unfortunately these constructs may
12078 be created by "adds using lea" optimization for incorrect
12079 code like:
12080
12081 int a;
12082 int foo(int i)
12083 {
12084 return *(&a+i);
12085 }
12086
12087 This code is nonsensical, but results in addressing the
12088 GOT table with a pic_offset_table_rtx base. We can't
12089 just refuse it easily, since it gets matched by the
12090 "addsi3" pattern, which later gets split to lea when the
12091 output register differs from the input. While this
12092 could be handled by a separate addsi pattern for this case
12093 that never results in lea, disabling this test seems to be
12094 the easier and more correct fix for the crash. */
12095 }
12096 else if (GET_CODE (disp) != LABEL_REF
12097 && !CONST_INT_P (disp)
12098 && (GET_CODE (disp) != CONST
12099 || !ix86_legitimate_constant_p (Pmode, disp))
12100 && (GET_CODE (disp) != SYMBOL_REF
12101 || !ix86_legitimate_constant_p (Pmode, disp)))
12102 /* Displacement is not constant. */
12103 return false;
12104 else if (TARGET_64BIT
12105 && !x86_64_immediate_operand (disp, VOIDmode))
12106 /* Displacement is out of range. */
12107 return false;
12108 }
12109
12110 /* Everything looks valid. */
12111 return true;
12112 }
12113
12114 /* Determine if a given RTX is a valid constant address. */
12115
12116 bool
12117 constant_address_p (rtx x)
12118 {
12119 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12120 }
12121 \f
12122 /* Return a unique alias set for the GOT. */
12123
12124 static alias_set_type
12125 ix86_GOT_alias_set (void)
12126 {
12127 static alias_set_type set = -1;
12128 if (set == -1)
12129 set = new_alias_set ();
12130 return set;
12131 }
12132
12133 /* Return a legitimate reference for ORIG (an address) using the
12134 register REG. If REG is 0, a new pseudo is generated.
12135
12136 There are two types of references that must be handled:
12137
12138 1. Global data references must load the address from the GOT, via
12139 the PIC reg. An insn is emitted to do this load, and the reg is
12140 returned.
12141
12142 2. Static data references, constant pool addresses, and code labels
12143 compute the address as an offset from the GOT, whose base is in
12144 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12145 differentiate them from global data objects. The returned
12146 address is the PIC reg + an unspec constant.
12147
12148 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12149 reg also appears in the address. */
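/* For instance, with 32-bit PIC a global symbol FOO is typically
   loaded as "movl foo@GOT(%ebx), %reg" (type 1 above), while a local
   symbol is formed as "%ebx + foo@GOTOFF" (type 2 above); the 64-bit
   small PIC model uses a RIP-relative foo@GOTPCREL(%rip) load
   instead.  */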
12150
12151 static rtx
12152 legitimize_pic_address (rtx orig, rtx reg)
12153 {
12154 rtx addr = orig;
12155 rtx new_rtx = orig;
12156 rtx base;
12157
12158 #if TARGET_MACHO
12159 if (TARGET_MACHO && !TARGET_64BIT)
12160 {
12161 if (reg == 0)
12162 reg = gen_reg_rtx (Pmode);
12163 /* Use the generic Mach-O PIC machinery. */
12164 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12165 }
12166 #endif
12167
12168 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12169 new_rtx = addr;
12170 else if (TARGET_64BIT
12171 && ix86_cmodel != CM_SMALL_PIC
12172 && gotoff_operand (addr, Pmode))
12173 {
12174 rtx tmpreg;
12175 /* This symbol may be referenced via a displacement from the PIC
12176 base address (@GOTOFF). */
12177
12178 if (reload_in_progress)
12179 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12180 if (GET_CODE (addr) == CONST)
12181 addr = XEXP (addr, 0);
12182 if (GET_CODE (addr) == PLUS)
12183 {
12184 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12185 UNSPEC_GOTOFF);
12186 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12187 }
12188 else
12189 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12190 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12191 if (!reg)
12192 tmpreg = gen_reg_rtx (Pmode);
12193 else
12194 tmpreg = reg;
12195 emit_move_insn (tmpreg, new_rtx);
12196
12197 if (reg != 0)
12198 {
12199 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12200 tmpreg, 1, OPTAB_DIRECT);
12201 new_rtx = reg;
12202 }
12203 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12204 }
12205 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12206 {
12207 /* This symbol may be referenced via a displacement from the PIC
12208 base address (@GOTOFF). */
12209
12210 if (reload_in_progress)
12211 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12212 if (GET_CODE (addr) == CONST)
12213 addr = XEXP (addr, 0);
12214 if (GET_CODE (addr) == PLUS)
12215 {
12216 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12217 UNSPEC_GOTOFF);
12218 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12219 }
12220 else
12221 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12222 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12223 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12224
12225 if (reg != 0)
12226 {
12227 emit_move_insn (reg, new_rtx);
12228 new_rtx = reg;
12229 }
12230 }
12231 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12232 /* We can't use @GOTOFF for text labels on VxWorks;
12233 see gotoff_operand. */
12234 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12235 {
12236 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12237 {
12238 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12239 return legitimize_dllimport_symbol (addr, true);
12240 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12241 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12242 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12243 {
12244 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12245 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12246 }
12247 }
12248
12249 /* For x64 PE-COFF there is no GOT table, so we use the address
12250 directly. */
12251 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12252 {
12253 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12254 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12255
12256 if (reg == 0)
12257 reg = gen_reg_rtx (Pmode);
12258 emit_move_insn (reg, new_rtx);
12259 new_rtx = reg;
12260 }
12261 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12262 {
12263 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12264 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12265 new_rtx = gen_const_mem (Pmode, new_rtx);
12266 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12267
12268 if (reg == 0)
12269 reg = gen_reg_rtx (Pmode);
12270 /* Use gen_movsi directly; otherwise the address is loaded
12271 into a register for CSE. We don't want to CSE these addresses;
12272 instead we CSE addresses loaded from the GOT table, so skip this. */
12273 emit_insn (gen_movsi (reg, new_rtx));
12274 new_rtx = reg;
12275 }
12276 else
12277 {
12278 /* This symbol must be referenced via a load from the
12279 Global Offset Table (@GOT). */
12280
12281 if (reload_in_progress)
12282 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12283 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12284 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12285 if (TARGET_64BIT)
12286 new_rtx = force_reg (Pmode, new_rtx);
12287 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12288 new_rtx = gen_const_mem (Pmode, new_rtx);
12289 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12290
12291 if (reg == 0)
12292 reg = gen_reg_rtx (Pmode);
12293 emit_move_insn (reg, new_rtx);
12294 new_rtx = reg;
12295 }
12296 }
12297 else
12298 {
12299 if (CONST_INT_P (addr)
12300 && !x86_64_immediate_operand (addr, VOIDmode))
12301 {
12302 if (reg)
12303 {
12304 emit_move_insn (reg, addr);
12305 new_rtx = reg;
12306 }
12307 else
12308 new_rtx = force_reg (Pmode, addr);
12309 }
12310 else if (GET_CODE (addr) == CONST)
12311 {
12312 addr = XEXP (addr, 0);
12313
12314 /* We must match the constructs we generated above. Assume the only
12315 unspecs that can get here are ours; not that we could do
12316 anything with them anyway....  */
12317 if (GET_CODE (addr) == UNSPEC
12318 || (GET_CODE (addr) == PLUS
12319 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12320 return orig;
12321 gcc_assert (GET_CODE (addr) == PLUS);
12322 }
12323 if (GET_CODE (addr) == PLUS)
12324 {
12325 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12326
12327 /* Check first to see if this is a constant offset from a @GOTOFF
12328 symbol reference. */
12329 if (gotoff_operand (op0, Pmode)
12330 && CONST_INT_P (op1))
12331 {
12332 if (!TARGET_64BIT)
12333 {
12334 if (reload_in_progress)
12335 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12336 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12337 UNSPEC_GOTOFF);
12338 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12339 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12340 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12341
12342 if (reg != 0)
12343 {
12344 emit_move_insn (reg, new_rtx);
12345 new_rtx = reg;
12346 }
12347 }
12348 else
12349 {
12350 if (INTVAL (op1) < -16*1024*1024
12351 || INTVAL (op1) >= 16*1024*1024)
12352 {
12353 if (!x86_64_immediate_operand (op1, Pmode))
12354 op1 = force_reg (Pmode, op1);
12355 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12356 }
12357 }
12358 }
12359 else
12360 {
12361 base = legitimize_pic_address (XEXP (addr, 0), reg);
12362 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12363 base == reg ? NULL_RTX : reg);
12364
12365 if (CONST_INT_P (new_rtx))
12366 new_rtx = plus_constant (base, INTVAL (new_rtx));
12367 else
12368 {
12369 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12370 {
12371 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12372 new_rtx = XEXP (new_rtx, 1);
12373 }
12374 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12375 }
12376 }
12377 }
12378 }
12379 return new_rtx;
12380 }
12381 \f
12382 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12383
12384 static rtx
12385 get_thread_pointer (bool to_reg)
12386 {
12387 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12388
12389 if (GET_MODE (tp) != Pmode)
12390 tp = convert_to_mode (Pmode, tp, 1);
12391
12392 if (to_reg)
12393 tp = copy_addr_to_reg (tp);
12394
12395 return tp;
12396 }
12397
12398 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12399
12400 static GTY(()) rtx ix86_tls_symbol;
12401
12402 static rtx
12403 ix86_tls_get_addr (void)
12404 {
12405 if (!ix86_tls_symbol)
12406 {
12407 const char *sym
12408 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12409 ? "___tls_get_addr" : "__tls_get_addr");
12410
12411 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12412 }
12413
12414 return ix86_tls_symbol;
12415 }
12416
12417 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12418
12419 static GTY(()) rtx ix86_tls_module_base_symbol;
12420
12421 rtx
12422 ix86_tls_module_base (void)
12423 {
12424 if (!ix86_tls_module_base_symbol)
12425 {
12426 ix86_tls_module_base_symbol
12427 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12428
12429 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12430 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12431 }
12432
12433 return ix86_tls_module_base_symbol;
12434 }
12435
12436 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12437 false if we expect this to be used for a memory address and true if
12438 we expect to load the address into a register. */
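/* Rough sketch of what the cases below emit (illustration only; the exact
   sequences depend on -m32/-m64, -fpic and TARGET_GNU2_TLS):

     GLOBAL_DYNAMIC: call __tls_get_addr to compute the address at runtime.
     LOCAL_DYNAMIC:  one __tls_get_addr call for the module base, then a
                     constant @dtpoff offset per variable.
     INITIAL_EXEC:   load the variable's offset from the GOT (@gottpoff /
                     @gotntpoff) and combine it with the thread pointer.
     LOCAL_EXEC:     apply a link-time constant @tpoff/@ntpoff offset to
                     the thread pointer directly.  */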
12439
12440 static rtx
12441 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12442 {
12443 rtx dest, base, off;
12444 rtx pic = NULL_RTX, tp = NULL_RTX;
12445 int type;
12446
12447 switch (model)
12448 {
12449 case TLS_MODEL_GLOBAL_DYNAMIC:
12450 dest = gen_reg_rtx (Pmode);
12451
12452 if (!TARGET_64BIT)
12453 {
12454 if (flag_pic)
12455 pic = pic_offset_table_rtx;
12456 else
12457 {
12458 pic = gen_reg_rtx (Pmode);
12459 emit_insn (gen_set_got (pic));
12460 }
12461 }
12462
12463 if (TARGET_GNU2_TLS)
12464 {
12465 if (TARGET_64BIT)
12466 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12467 else
12468 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12469
12470 tp = get_thread_pointer (true);
12471 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12472
12473 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12474 }
12475 else
12476 {
12477 rtx caddr = ix86_tls_get_addr ();
12478
12479 if (TARGET_64BIT)
12480 {
12481 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12482
12483 start_sequence ();
12484 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12485 insns = get_insns ();
12486 end_sequence ();
12487
12488 RTL_CONST_CALL_P (insns) = 1;
12489 emit_libcall_block (insns, dest, rax, x);
12490 }
12491 else
12492 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12493 }
12494 break;
12495
12496 case TLS_MODEL_LOCAL_DYNAMIC:
12497 base = gen_reg_rtx (Pmode);
12498
12499 if (!TARGET_64BIT)
12500 {
12501 if (flag_pic)
12502 pic = pic_offset_table_rtx;
12503 else
12504 {
12505 pic = gen_reg_rtx (Pmode);
12506 emit_insn (gen_set_got (pic));
12507 }
12508 }
12509
12510 if (TARGET_GNU2_TLS)
12511 {
12512 rtx tmp = ix86_tls_module_base ();
12513
12514 if (TARGET_64BIT)
12515 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12516 else
12517 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12518
12519 tp = get_thread_pointer (true);
12520 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12521 gen_rtx_MINUS (Pmode, tmp, tp));
12522 }
12523 else
12524 {
12525 rtx caddr = ix86_tls_get_addr ();
12526
12527 if (TARGET_64BIT)
12528 {
12529 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12530
12531 start_sequence ();
12532 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12533 insns = get_insns ();
12534 end_sequence ();
12535
12536 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12537 share the LD_BASE result with other LD model accesses. */
12538 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12539 UNSPEC_TLS_LD_BASE);
12540
12541 RTL_CONST_CALL_P (insns) = 1;
12542 emit_libcall_block (insns, base, rax, eqv);
12543 }
12544 else
12545 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12546 }
12547
12548 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12549 off = gen_rtx_CONST (Pmode, off);
12550
12551 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12552
12553 if (TARGET_GNU2_TLS)
12554 {
12555 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12556
12557 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12558 }
12559 break;
12560
12561 case TLS_MODEL_INITIAL_EXEC:
12562 if (TARGET_64BIT)
12563 {
12564 if (TARGET_SUN_TLS)
12565 {
12566 /* The Sun linker took the AMD64 TLS spec literally
12567 and can only handle %rax as the destination of the
12568 initial-exec code sequence. */
12569
12570 dest = gen_reg_rtx (Pmode);
12571 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12572 return dest;
12573 }
12574
12575 pic = NULL;
12576 type = UNSPEC_GOTNTPOFF;
12577 }
12578 else if (flag_pic)
12579 {
12580 if (reload_in_progress)
12581 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12582 pic = pic_offset_table_rtx;
12583 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12584 }
12585 else if (!TARGET_ANY_GNU_TLS)
12586 {
12587 pic = gen_reg_rtx (Pmode);
12588 emit_insn (gen_set_got (pic));
12589 type = UNSPEC_GOTTPOFF;
12590 }
12591 else
12592 {
12593 pic = NULL;
12594 type = UNSPEC_INDNTPOFF;
12595 }
12596
12597 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12598 off = gen_rtx_CONST (Pmode, off);
12599 if (pic)
12600 off = gen_rtx_PLUS (Pmode, pic, off);
12601 off = gen_const_mem (Pmode, off);
12602 set_mem_alias_set (off, ix86_GOT_alias_set ());
12603
12604 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12605 {
12606 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12607 off = force_reg (Pmode, off);
12608 return gen_rtx_PLUS (Pmode, base, off);
12609 }
12610 else
12611 {
12612 base = get_thread_pointer (true);
12613 dest = gen_reg_rtx (Pmode);
12614 emit_insn (gen_subsi3 (dest, base, off));
12615 }
12616 break;
12617
12618 case TLS_MODEL_LOCAL_EXEC:
12619 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12620 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12621 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12622 off = gen_rtx_CONST (Pmode, off);
12623
12624 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12625 {
12626 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12627 return gen_rtx_PLUS (Pmode, base, off);
12628 }
12629 else
12630 {
12631 base = get_thread_pointer (true);
12632 dest = gen_reg_rtx (Pmode);
12633 emit_insn (gen_subsi3 (dest, base, off));
12634 }
12635 break;
12636
12637 default:
12638 gcc_unreachable ();
12639 }
12640
12641 return dest;
12642 }
12643
12644 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12645 to symbol DECL. */
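/* For example (illustration only): for an imported symbol "foo" this
   builds an artificial read-only external VAR_DECL whose DECL_RTL is a
   load from the import pointer "__imp__foo" (or "__imp_foo" when there is
   no user label prefix or the name carries the fastcall prefix), i.e. the
   slot that the dynamic loader fills in with foo's real address.  */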
12646
12647 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12648 htab_t dllimport_map;
12649
12650 static tree
12651 get_dllimport_decl (tree decl)
12652 {
12653 struct tree_map *h, in;
12654 void **loc;
12655 const char *name;
12656 const char *prefix;
12657 size_t namelen, prefixlen;
12658 char *imp_name;
12659 tree to;
12660 rtx rtl;
12661
12662 if (!dllimport_map)
12663 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12664
12665 in.hash = htab_hash_pointer (decl);
12666 in.base.from = decl;
12667 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12668 h = (struct tree_map *) *loc;
12669 if (h)
12670 return h->to;
12671
12672 *loc = h = ggc_alloc_tree_map ();
12673 h->hash = in.hash;
12674 h->base.from = decl;
12675 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12676 VAR_DECL, NULL, ptr_type_node);
12677 DECL_ARTIFICIAL (to) = 1;
12678 DECL_IGNORED_P (to) = 1;
12679 DECL_EXTERNAL (to) = 1;
12680 TREE_READONLY (to) = 1;
12681
12682 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12683 name = targetm.strip_name_encoding (name);
12684 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12685 ? "*__imp_" : "*__imp__";
12686 namelen = strlen (name);
12687 prefixlen = strlen (prefix);
12688 imp_name = (char *) alloca (namelen + prefixlen + 1);
12689 memcpy (imp_name, prefix, prefixlen);
12690 memcpy (imp_name + prefixlen, name, namelen + 1);
12691
12692 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12693 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12694 SET_SYMBOL_REF_DECL (rtl, to);
12695 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12696
12697 rtl = gen_const_mem (Pmode, rtl);
12698 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12699
12700 SET_DECL_RTL (to, rtl);
12701 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12702
12703 return to;
12704 }
12705
12706 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12707 true if we require the result be a register. */
12708
12709 static rtx
12710 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12711 {
12712 tree imp_decl;
12713 rtx x;
12714
12715 gcc_assert (SYMBOL_REF_DECL (symbol));
12716 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12717
12718 x = DECL_RTL (imp_decl);
12719 if (want_reg)
12720 x = force_reg (Pmode, x);
12721 return x;
12722 }
12723
12724 /* Try machine-dependent ways of modifying an illegitimate address
12725 to be legitimate. If we find one, return the new, valid address.
12726 This macro is used in only one place: `memory_address' in explow.c.
12727
12728 OLDX is the address as it was before break_out_memory_refs was called.
12729 In some cases it is useful to look at this to decide what needs to be done.
12730
12731 It is always safe for this macro to do nothing. It exists to recognize
12732 opportunities to optimize the output.
12733
12734 For the 80386, we handle X+REG by loading X into a register R and
12735 using R+REG. R will go in a general reg and indexing will be used.
12736 However, if REG is a broken-out memory address or multiplication,
12737 nothing needs to be done because REG can certainly go in a general reg.
12738
12739 When -fpic is used, special handling is needed for symbolic references.
12740 See comments by legitimize_pic_address in i386.c for details. */
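/* Illustration (not from the original sources): a shift used as an index,
   e.g. (plus (ashift reg 2) reg2), is canonicalized below into
   (plus (mult reg 4) reg2) so that it can be matched as a scaled-index
   address such as (%reg2,%reg,4).  */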
12741
12742 static rtx
12743 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12744 enum machine_mode mode)
12745 {
12746 int changed = 0;
12747 unsigned log;
12748
12749 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12750 if (log)
12751 return legitimize_tls_address (x, (enum tls_model) log, false);
12752 if (GET_CODE (x) == CONST
12753 && GET_CODE (XEXP (x, 0)) == PLUS
12754 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12755 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12756 {
12757 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12758 (enum tls_model) log, false);
12759 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12760 }
12761
12762 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12763 {
12764 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12765 return legitimize_dllimport_symbol (x, true);
12766 if (GET_CODE (x) == CONST
12767 && GET_CODE (XEXP (x, 0)) == PLUS
12768 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12769 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12770 {
12771 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12772 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12773 }
12774 }
12775
12776 if (flag_pic && SYMBOLIC_CONST (x))
12777 return legitimize_pic_address (x, 0);
12778
12779 #if TARGET_MACHO
12780 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12781 return machopic_indirect_data_reference (x, 0);
12782 #endif
12783
12784 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12785 if (GET_CODE (x) == ASHIFT
12786 && CONST_INT_P (XEXP (x, 1))
12787 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12788 {
12789 changed = 1;
12790 log = INTVAL (XEXP (x, 1));
12791 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12792 GEN_INT (1 << log));
12793 }
12794
12795 if (GET_CODE (x) == PLUS)
12796 {
12797 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12798
12799 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12800 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12801 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12802 {
12803 changed = 1;
12804 log = INTVAL (XEXP (XEXP (x, 0), 1));
12805 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12806 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12807 GEN_INT (1 << log));
12808 }
12809
12810 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12811 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12812 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12813 {
12814 changed = 1;
12815 log = INTVAL (XEXP (XEXP (x, 1), 1));
12816 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12817 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12818 GEN_INT (1 << log));
12819 }
12820
12821 /* Put multiply first if it isn't already. */
12822 if (GET_CODE (XEXP (x, 1)) == MULT)
12823 {
12824 rtx tmp = XEXP (x, 0);
12825 XEXP (x, 0) = XEXP (x, 1);
12826 XEXP (x, 1) = tmp;
12827 changed = 1;
12828 }
12829
12830 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12831 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12832 created by virtual register instantiation, register elimination, and
12833 similar optimizations. */
12834 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12835 {
12836 changed = 1;
12837 x = gen_rtx_PLUS (Pmode,
12838 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12839 XEXP (XEXP (x, 1), 0)),
12840 XEXP (XEXP (x, 1), 1));
12841 }
12842
12843 /* Canonicalize
12844 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12845 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12846 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12847 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12848 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12849 && CONSTANT_P (XEXP (x, 1)))
12850 {
12851 rtx constant;
12852 rtx other = NULL_RTX;
12853
12854 if (CONST_INT_P (XEXP (x, 1)))
12855 {
12856 constant = XEXP (x, 1);
12857 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12858 }
12859 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12860 {
12861 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12862 other = XEXP (x, 1);
12863 }
12864 else
12865 constant = 0;
12866
12867 if (constant)
12868 {
12869 changed = 1;
12870 x = gen_rtx_PLUS (Pmode,
12871 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12872 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12873 plus_constant (other, INTVAL (constant)));
12874 }
12875 }
12876
12877 if (changed && ix86_legitimate_address_p (mode, x, false))
12878 return x;
12879
12880 if (GET_CODE (XEXP (x, 0)) == MULT)
12881 {
12882 changed = 1;
12883 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12884 }
12885
12886 if (GET_CODE (XEXP (x, 1)) == MULT)
12887 {
12888 changed = 1;
12889 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12890 }
12891
12892 if (changed
12893 && REG_P (XEXP (x, 1))
12894 && REG_P (XEXP (x, 0)))
12895 return x;
12896
12897 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12898 {
12899 changed = 1;
12900 x = legitimize_pic_address (x, 0);
12901 }
12902
12903 if (changed && ix86_legitimate_address_p (mode, x, false))
12904 return x;
12905
12906 if (REG_P (XEXP (x, 0)))
12907 {
12908 rtx temp = gen_reg_rtx (Pmode);
12909 rtx val = force_operand (XEXP (x, 1), temp);
12910 if (val != temp)
12911 {
12912 if (GET_MODE (val) != Pmode)
12913 val = convert_to_mode (Pmode, val, 1);
12914 emit_move_insn (temp, val);
12915 }
12916
12917 XEXP (x, 1) = temp;
12918 return x;
12919 }
12920
12921 else if (REG_P (XEXP (x, 1)))
12922 {
12923 rtx temp = gen_reg_rtx (Pmode);
12924 rtx val = force_operand (XEXP (x, 0), temp);
12925 if (val != temp)
12926 {
12927 if (GET_MODE (val) != Pmode)
12928 val = convert_to_mode (Pmode, val, 1);
12929 emit_move_insn (temp, val);
12930 }
12931
12932 XEXP (x, 0) = temp;
12933 return x;
12934 }
12935 }
12936
12937 return x;
12938 }
12939 \f
12940 /* Print an integer constant expression in assembler syntax. Addition
12941 and subtraction are the only arithmetic that may appear in these
12942 expressions. FILE is the stdio stream to write to, X is the rtx, and
12943 CODE is the operand print code from the output string. */
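/* For example (illustration only), a 32-bit PIC reference to "foo" is
   printed as "foo@GOT" or "foo@GOTOFF" depending on the UNSPEC wrapped
   around it, and a 64-bit GOT-relative reference as "foo@GOTPCREL(%rip)"
   in AT&T syntax.  */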
12944
12945 static void
12946 output_pic_addr_const (FILE *file, rtx x, int code)
12947 {
12948 char buf[256];
12949
12950 switch (GET_CODE (x))
12951 {
12952 case PC:
12953 gcc_assert (flag_pic);
12954 putc ('.', file);
12955 break;
12956
12957 case SYMBOL_REF:
12958 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12959 output_addr_const (file, x);
12960 else
12961 {
12962 const char *name = XSTR (x, 0);
12963
12964 /* Mark the decl as referenced so that cgraph will
12965 output the function. */
12966 if (SYMBOL_REF_DECL (x))
12967 mark_decl_referenced (SYMBOL_REF_DECL (x));
12968
12969 #if TARGET_MACHO
12970 if (MACHOPIC_INDIRECT
12971 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12972 name = machopic_indirection_name (x, /*stub_p=*/true);
12973 #endif
12974 assemble_name (file, name);
12975 }
12976 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12977 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12978 fputs ("@PLT", file);
12979 break;
12980
12981 case LABEL_REF:
12982 x = XEXP (x, 0);
12983 /* FALLTHRU */
12984 case CODE_LABEL:
12985 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
12986 assemble_name (asm_out_file, buf);
12987 break;
12988
12989 case CONST_INT:
12990 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12991 break;
12992
12993 case CONST:
12994 /* This used to output parentheses around the expression,
12995 but that does not work on the 386 (either ATT or BSD assembler). */
12996 output_pic_addr_const (file, XEXP (x, 0), code);
12997 break;
12998
12999 case CONST_DOUBLE:
13000 if (GET_MODE (x) == VOIDmode)
13001 {
13002 /* We can use %d if the number is <32 bits and positive. */
13003 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13004 fprintf (file, "0x%lx%08lx",
13005 (unsigned long) CONST_DOUBLE_HIGH (x),
13006 (unsigned long) CONST_DOUBLE_LOW (x));
13007 else
13008 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13009 }
13010 else
13011 /* We can't handle floating point constants;
13012 TARGET_PRINT_OPERAND must handle them. */
13013 output_operand_lossage ("floating constant misused");
13014 break;
13015
13016 case PLUS:
13017 /* Some assemblers need integer constants to appear first. */
13018 if (CONST_INT_P (XEXP (x, 0)))
13019 {
13020 output_pic_addr_const (file, XEXP (x, 0), code);
13021 putc ('+', file);
13022 output_pic_addr_const (file, XEXP (x, 1), code);
13023 }
13024 else
13025 {
13026 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13027 output_pic_addr_const (file, XEXP (x, 1), code);
13028 putc ('+', file);
13029 output_pic_addr_const (file, XEXP (x, 0), code);
13030 }
13031 break;
13032
13033 case MINUS:
13034 if (!TARGET_MACHO)
13035 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13036 output_pic_addr_const (file, XEXP (x, 0), code);
13037 putc ('-', file);
13038 output_pic_addr_const (file, XEXP (x, 1), code);
13039 if (!TARGET_MACHO)
13040 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13041 break;
13042
13043 case UNSPEC:
13044 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13045 {
13046 bool f = i386_asm_output_addr_const_extra (file, x);
13047 gcc_assert (f);
13048 break;
13049 }
13050
13051 gcc_assert (XVECLEN (x, 0) == 1);
13052 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13053 switch (XINT (x, 1))
13054 {
13055 case UNSPEC_GOT:
13056 fputs ("@GOT", file);
13057 break;
13058 case UNSPEC_GOTOFF:
13059 fputs ("@GOTOFF", file);
13060 break;
13061 case UNSPEC_PLTOFF:
13062 fputs ("@PLTOFF", file);
13063 break;
13064 case UNSPEC_PCREL:
13065 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13066 "(%rip)" : "[rip]", file);
13067 break;
13068 case UNSPEC_GOTPCREL:
13069 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13070 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13071 break;
13072 case UNSPEC_GOTTPOFF:
13073 /* FIXME: This might be @TPOFF in Sun ld too. */
13074 fputs ("@gottpoff", file);
13075 break;
13076 case UNSPEC_TPOFF:
13077 fputs ("@tpoff", file);
13078 break;
13079 case UNSPEC_NTPOFF:
13080 if (TARGET_64BIT)
13081 fputs ("@tpoff", file);
13082 else
13083 fputs ("@ntpoff", file);
13084 break;
13085 case UNSPEC_DTPOFF:
13086 fputs ("@dtpoff", file);
13087 break;
13088 case UNSPEC_GOTNTPOFF:
13089 if (TARGET_64BIT)
13090 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13091 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13092 else
13093 fputs ("@gotntpoff", file);
13094 break;
13095 case UNSPEC_INDNTPOFF:
13096 fputs ("@indntpoff", file);
13097 break;
13098 #if TARGET_MACHO
13099 case UNSPEC_MACHOPIC_OFFSET:
13100 putc ('-', file);
13101 machopic_output_function_base_name (file);
13102 break;
13103 #endif
13104 default:
13105 output_operand_lossage ("invalid UNSPEC as operand");
13106 break;
13107 }
13108 break;
13109
13110 default:
13111 output_operand_lossage ("invalid expression as operand");
13112 }
13113 }
13114
13115 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13116 We need to emit DTP-relative relocations. */
13117
13118 static void ATTRIBUTE_UNUSED
13119 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13120 {
13121 fputs (ASM_LONG, file);
13122 output_addr_const (file, x);
13123 fputs ("@dtpoff", file);
13124 switch (size)
13125 {
13126 case 4:
13127 break;
13128 case 8:
13129 fputs (", 0", file);
13130 break;
13131 default:
13132 gcc_unreachable ();
13133 }
13134 }
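/* For example (illustration only; ASM_LONG supplies the actual directive
   spelling), a 4-byte request above emits something like
   ".long foo@dtpoff", and an 8-byte request ".long foo@dtpoff, 0".  */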
13135
13136 /* Return true if X is a representation of the PIC register. This copes
13137 with calls from ix86_find_base_term, where the register might have
13138 been replaced by a cselib value. */
13139
13140 static bool
13141 ix86_pic_register_p (rtx x)
13142 {
13143 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13144 return (pic_offset_table_rtx
13145 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13146 else
13147 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13148 }
13149
13150 /* Helper function for ix86_delegitimize_address.
13151 Attempt to delegitimize TLS local-exec accesses. */
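/* E.g. (illustration only) an address of the form %gs:x@ntpoff(...) built
   for a local-exec TLS access is folded back to the SYMBOL_REF "x", with
   any base/index that surrounded it re-applied.  */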
13152
13153 static rtx
13154 ix86_delegitimize_tls_address (rtx orig_x)
13155 {
13156 rtx x = orig_x, unspec;
13157 struct ix86_address addr;
13158
13159 if (!TARGET_TLS_DIRECT_SEG_REFS)
13160 return orig_x;
13161 if (MEM_P (x))
13162 x = XEXP (x, 0);
13163 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13164 return orig_x;
13165 if (ix86_decompose_address (x, &addr) == 0
13166 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13167 || addr.disp == NULL_RTX
13168 || GET_CODE (addr.disp) != CONST)
13169 return orig_x;
13170 unspec = XEXP (addr.disp, 0);
13171 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13172 unspec = XEXP (unspec, 0);
13173 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13174 return orig_x;
13175 x = XVECEXP (unspec, 0, 0);
13176 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13177 if (unspec != XEXP (addr.disp, 0))
13178 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13179 if (addr.index)
13180 {
13181 rtx idx = addr.index;
13182 if (addr.scale != 1)
13183 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13184 x = gen_rtx_PLUS (Pmode, idx, x);
13185 }
13186 if (addr.base)
13187 x = gen_rtx_PLUS (Pmode, addr.base, x);
13188 if (MEM_P (orig_x))
13189 x = replace_equiv_address_nv (orig_x, x);
13190 return x;
13191 }
13192
13193 /* In the name of slightly smaller debug output, and to cater to
13194 general assembler lossage, recognize PIC+GOTOFF and turn it back
13195 into a direct symbol reference.
13196
13197 On Darwin, this is necessary to avoid a crash, because Darwin
13198 has a different PIC label for each routine but the DWARF debugging
13199 information is not associated with any particular routine, so it's
13200 necessary to remove references to the PIC label from RTL stored by
13201 the DWARF output code. */
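/* Sketch of the common 32-bit case handled below (illustration only):

     (plus pic_reg (const (unspec [foo] UNSPEC_GOTOFF)))  -->  foo

   possibly with a register addend and/or a constant offset re-applied
   around the recovered symbol.  */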
13202
13203 static rtx
13204 ix86_delegitimize_address (rtx x)
13205 {
13206 rtx orig_x = delegitimize_mem_from_attrs (x);
13207 /* addend is NULL or some rtx if x is something+GOTOFF where
13208 something doesn't include the PIC register. */
13209 rtx addend = NULL_RTX;
13210 /* reg_addend is NULL or a multiple of some register. */
13211 rtx reg_addend = NULL_RTX;
13212 /* const_addend is NULL or a const_int. */
13213 rtx const_addend = NULL_RTX;
13214 /* This is the result, or NULL. */
13215 rtx result = NULL_RTX;
13216
13217 x = orig_x;
13218
13219 if (MEM_P (x))
13220 x = XEXP (x, 0);
13221
13222 if (TARGET_64BIT)
13223 {
13224 if (GET_CODE (x) != CONST
13225 || GET_CODE (XEXP (x, 0)) != UNSPEC
13226 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13227 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13228 || !MEM_P (orig_x))
13229 return ix86_delegitimize_tls_address (orig_x);
13230 x = XVECEXP (XEXP (x, 0), 0, 0);
13231 if (GET_MODE (orig_x) != GET_MODE (x))
13232 {
13233 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13234 GET_MODE (x), 0);
13235 if (x == NULL_RTX)
13236 return orig_x;
13237 }
13238 return x;
13239 }
13240
13241 if (GET_CODE (x) != PLUS
13242 || GET_CODE (XEXP (x, 1)) != CONST)
13243 return ix86_delegitimize_tls_address (orig_x);
13244
13245 if (ix86_pic_register_p (XEXP (x, 0)))
13246 /* %ebx + GOT/GOTOFF */
13247 ;
13248 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13249 {
13250 /* %ebx + %reg * scale + GOT/GOTOFF */
13251 reg_addend = XEXP (x, 0);
13252 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13253 reg_addend = XEXP (reg_addend, 1);
13254 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13255 reg_addend = XEXP (reg_addend, 0);
13256 else
13257 {
13258 reg_addend = NULL_RTX;
13259 addend = XEXP (x, 0);
13260 }
13261 }
13262 else
13263 addend = XEXP (x, 0);
13264
13265 x = XEXP (XEXP (x, 1), 0);
13266 if (GET_CODE (x) == PLUS
13267 && CONST_INT_P (XEXP (x, 1)))
13268 {
13269 const_addend = XEXP (x, 1);
13270 x = XEXP (x, 0);
13271 }
13272
13273 if (GET_CODE (x) == UNSPEC
13274 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13275 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13276 result = XVECEXP (x, 0, 0);
13277
13278 if (TARGET_MACHO && darwin_local_data_pic (x)
13279 && !MEM_P (orig_x))
13280 result = XVECEXP (x, 0, 0);
13281
13282 if (! result)
13283 return ix86_delegitimize_tls_address (orig_x);
13284
13285 if (const_addend)
13286 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13287 if (reg_addend)
13288 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13289 if (addend)
13290 {
13291 /* If the rest of original X doesn't involve the PIC register, add
13292 addend and subtract pic_offset_table_rtx. This can happen e.g.
13293 for code like:
13294 leal (%ebx, %ecx, 4), %ecx
13295 ...
13296 movl foo@GOTOFF(%ecx), %edx
13297 in which case we return (%ecx - %ebx) + foo. */
13298 if (pic_offset_table_rtx)
13299 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13300 pic_offset_table_rtx),
13301 result);
13302 else
13303 return orig_x;
13304 }
13305 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13306 {
13307 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13308 if (result == NULL_RTX)
13309 return orig_x;
13310 }
13311 return result;
13312 }
13313
13314 /* If X is a machine specific address (i.e. a symbol or label being
13315 referenced as a displacement from the GOT implemented using an
13316 UNSPEC), then return the base term. Otherwise return X. */
13317
13318 rtx
13319 ix86_find_base_term (rtx x)
13320 {
13321 rtx term;
13322
13323 if (TARGET_64BIT)
13324 {
13325 if (GET_CODE (x) != CONST)
13326 return x;
13327 term = XEXP (x, 0);
13328 if (GET_CODE (term) == PLUS
13329 && (CONST_INT_P (XEXP (term, 1))
13330 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13331 term = XEXP (term, 0);
13332 if (GET_CODE (term) != UNSPEC
13333 || (XINT (term, 1) != UNSPEC_GOTPCREL
13334 && XINT (term, 1) != UNSPEC_PCREL))
13335 return x;
13336
13337 return XVECEXP (term, 0, 0);
13338 }
13339
13340 return ix86_delegitimize_address (x);
13341 }
13342 \f
13343 static void
13344 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13345 int fp, FILE *file)
13346 {
13347 const char *suffix;
13348
13349 if (mode == CCFPmode || mode == CCFPUmode)
13350 {
13351 code = ix86_fp_compare_code_to_integer (code);
13352 mode = CCmode;
13353 }
13354 if (reverse)
13355 code = reverse_condition (code);
13356
13357 switch (code)
13358 {
13359 case EQ:
13360 switch (mode)
13361 {
13362 case CCAmode:
13363 suffix = "a";
13364 break;
13365
13366 case CCCmode:
13367 suffix = "c";
13368 break;
13369
13370 case CCOmode:
13371 suffix = "o";
13372 break;
13373
13374 case CCSmode:
13375 suffix = "s";
13376 break;
13377
13378 default:
13379 suffix = "e";
13380 }
13381 break;
13382 case NE:
13383 switch (mode)
13384 {
13385 case CCAmode:
13386 suffix = "na";
13387 break;
13388
13389 case CCCmode:
13390 suffix = "nc";
13391 break;
13392
13393 case CCOmode:
13394 suffix = "no";
13395 break;
13396
13397 case CCSmode:
13398 suffix = "ns";
13399 break;
13400
13401 default:
13402 suffix = "ne";
13403 }
13404 break;
13405 case GT:
13406 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13407 suffix = "g";
13408 break;
13409 case GTU:
13410 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13411 Those same assemblers have the same but opposite lossage on cmov. */
13412 if (mode == CCmode)
13413 suffix = fp ? "nbe" : "a";
13414 else if (mode == CCCmode)
13415 suffix = "b";
13416 else
13417 gcc_unreachable ();
13418 break;
13419 case LT:
13420 switch (mode)
13421 {
13422 case CCNOmode:
13423 case CCGOCmode:
13424 suffix = "s";
13425 break;
13426
13427 case CCmode:
13428 case CCGCmode:
13429 suffix = "l";
13430 break;
13431
13432 default:
13433 gcc_unreachable ();
13434 }
13435 break;
13436 case LTU:
13437 gcc_assert (mode == CCmode || mode == CCCmode);
13438 suffix = "b";
13439 break;
13440 case GE:
13441 switch (mode)
13442 {
13443 case CCNOmode:
13444 case CCGOCmode:
13445 suffix = "ns";
13446 break;
13447
13448 case CCmode:
13449 case CCGCmode:
13450 suffix = "ge";
13451 break;
13452
13453 default:
13454 gcc_unreachable ();
13455 }
13456 break;
13457 case GEU:
13458 /* ??? As above. */
13459 gcc_assert (mode == CCmode || mode == CCCmode);
13460 suffix = fp ? "nb" : "ae";
13461 break;
13462 case LE:
13463 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13464 suffix = "le";
13465 break;
13466 case LEU:
13467 /* ??? As above. */
13468 if (mode == CCmode)
13469 suffix = "be";
13470 else if (mode == CCCmode)
13471 suffix = fp ? "nb" : "ae";
13472 else
13473 gcc_unreachable ();
13474 break;
13475 case UNORDERED:
13476 suffix = fp ? "u" : "p";
13477 break;
13478 case ORDERED:
13479 suffix = fp ? "nu" : "np";
13480 break;
13481 default:
13482 gcc_unreachable ();
13483 }
13484 fputs (suffix, file);
13485 }
13486
13487 /* Print the name of register X to FILE based on its machine mode and number.
13488 If CODE is 'w', pretend the mode is HImode.
13489 If CODE is 'b', pretend the mode is QImode.
13490 If CODE is 'k', pretend the mode is SImode.
13491 If CODE is 'q', pretend the mode is DImode.
13492 If CODE is 'x', pretend the mode is V4SFmode.
13493 If CODE is 't', pretend the mode is V8SFmode.
13494 If CODE is 'h', pretend the reg is the 'high' byte register.
13495 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13496 If CODE is 'd', duplicate the operand for an AVX instruction.
13497 */
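/* For instance (illustrative), for the AX register this prints "rax" for
   code 'q', "eax" for 'k', "ax" for 'w', "al" for 'b' and "ah" for 'h',
   preceded by '%' in AT&T syntax.  */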
13498
13499 void
13500 print_reg (rtx x, int code, FILE *file)
13501 {
13502 const char *reg;
13503 bool duplicated = code == 'd' && TARGET_AVX;
13504
13505 gcc_assert (x == pc_rtx
13506 || (REGNO (x) != ARG_POINTER_REGNUM
13507 && REGNO (x) != FRAME_POINTER_REGNUM
13508 && REGNO (x) != FLAGS_REG
13509 && REGNO (x) != FPSR_REG
13510 && REGNO (x) != FPCR_REG));
13511
13512 if (ASSEMBLER_DIALECT == ASM_ATT)
13513 putc ('%', file);
13514
13515 if (x == pc_rtx)
13516 {
13517 gcc_assert (TARGET_64BIT);
13518 fputs ("rip", file);
13519 return;
13520 }
13521
13522 if (code == 'w' || MMX_REG_P (x))
13523 code = 2;
13524 else if (code == 'b')
13525 code = 1;
13526 else if (code == 'k')
13527 code = 4;
13528 else if (code == 'q')
13529 code = 8;
13530 else if (code == 'y')
13531 code = 3;
13532 else if (code == 'h')
13533 code = 0;
13534 else if (code == 'x')
13535 code = 16;
13536 else if (code == 't')
13537 code = 32;
13538 else
13539 code = GET_MODE_SIZE (GET_MODE (x));
13540
13541 /* Irritatingly, AMD extended registers use a different naming convention
13542 from the normal registers: "r%d[bwd]". */
13543 if (REX_INT_REG_P (x))
13544 {
13545 gcc_assert (TARGET_64BIT);
13546 putc ('r', file);
13547 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13548 switch (code)
13549 {
13550 case 0:
13551 error ("extended registers have no high halves");
13552 break;
13553 case 1:
13554 putc ('b', file);
13555 break;
13556 case 2:
13557 putc ('w', file);
13558 break;
13559 case 4:
13560 putc ('d', file);
13561 break;
13562 case 8:
13563 /* no suffix */
13564 break;
13565 default:
13566 error ("unsupported operand size for extended register");
13567 break;
13568 }
13569 return;
13570 }
13571
13572 reg = NULL;
13573 switch (code)
13574 {
13575 case 3:
13576 if (STACK_TOP_P (x))
13577 {
13578 reg = "st(0)";
13579 break;
13580 }
13581 /* FALLTHRU */
13582 case 8:
13583 case 4:
13584 case 12:
13585 if (! ANY_FP_REG_P (x))
13586 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13587 /* FALLTHRU */
13588 case 16:
13589 case 2:
13590 normal:
13591 reg = hi_reg_name[REGNO (x)];
13592 break;
13593 case 1:
13594 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13595 goto normal;
13596 reg = qi_reg_name[REGNO (x)];
13597 break;
13598 case 0:
13599 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13600 goto normal;
13601 reg = qi_high_reg_name[REGNO (x)];
13602 break;
13603 case 32:
13604 if (SSE_REG_P (x))
13605 {
13606 gcc_assert (!duplicated);
13607 putc ('y', file);
13608 fputs (hi_reg_name[REGNO (x)] + 1, file);
13609 return;
13610 }
13611 break;
13612 default:
13613 gcc_unreachable ();
13614 }
13615
13616 fputs (reg, file);
13617 if (duplicated)
13618 {
13619 if (ASSEMBLER_DIALECT == ASM_ATT)
13620 fprintf (file, ", %%%s", reg);
13621 else
13622 fprintf (file, ", %s", reg);
13623 }
13624 }
13625
13626 /* Locate some local-dynamic symbol still in use by this function
13627 so that we can print its name in some tls_local_dynamic_base
13628 pattern. */
13629
13630 static int
13631 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13632 {
13633 rtx x = *px;
13634
13635 if (GET_CODE (x) == SYMBOL_REF
13636 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13637 {
13638 cfun->machine->some_ld_name = XSTR (x, 0);
13639 return 1;
13640 }
13641
13642 return 0;
13643 }
13644
13645 static const char *
13646 get_some_local_dynamic_name (void)
13647 {
13648 rtx insn;
13649
13650 if (cfun->machine->some_ld_name)
13651 return cfun->machine->some_ld_name;
13652
13653 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13654 if (NONDEBUG_INSN_P (insn)
13655 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13656 return cfun->machine->some_ld_name;
13657
13658 return NULL;
13659 }
13660
13661 /* Meaning of CODE:
13662 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13663 C -- print opcode suffix for set/cmov insn.
13664 c -- like C, but print reversed condition
13665 F,f -- likewise, but for floating-point.
13666 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13667 otherwise nothing
13668 R -- print the prefix for register names.
13669 z -- print the opcode suffix for the size of the current operand.
13670 Z -- likewise, with special suffixes for x87 instructions.
13671 * -- print a star (in certain assembler syntax)
13672 A -- print an absolute memory reference.
13673 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13674 s -- print a shift double count, followed by the assembler's argument
13675 delimiter.
13676 b -- print the QImode name of the register for the indicated operand.
13677 %b0 would print %al if operands[0] is reg 0.
13678 w -- likewise, print the HImode name of the register.
13679 k -- likewise, print the SImode name of the register.
13680 q -- likewise, print the DImode name of the register.
13681 x -- likewise, print the V4SFmode name of the register.
13682 t -- likewise, print the V8SFmode name of the register.
13683 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13684 y -- print "st(0)" instead of "st" as a register.
13685 d -- print duplicated register operand for AVX instruction.
13686 D -- print condition for SSE cmp instruction.
13687 P -- if PIC, print an @PLT suffix.
13688 p -- print raw symbol name.
13689 X -- don't print any sort of PIC '@' suffix for a symbol.
13690 & -- print some in-use local-dynamic symbol name.
13691 H -- print a memory address offset by 8; used for sse high-parts
13692 Y -- print condition for XOP pcom* instruction.
13693 + -- print a branch hint as 'cs' or 'ds' prefix
13694 ; -- print a semicolon (after prefixes due to bug in older gas).
13695 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13696 @ -- print a segment register of thread base pointer load
13697 */
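/* For example (illustrative, not a quote from the machine description):
   an output template such as "mov%z0\t{%1, %0|%0, %1}" uses 'z' to pick
   the b/w/l/q suffix from operand 0's mode, while "%k1" forces the SImode
   register name and "%h1" the high QImode register (e.g. %ah).  */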
13698
13699 void
13700 ix86_print_operand (FILE *file, rtx x, int code)
13701 {
13702 if (code)
13703 {
13704 switch (code)
13705 {
13706 case '*':
13707 if (ASSEMBLER_DIALECT == ASM_ATT)
13708 putc ('*', file);
13709 return;
13710
13711 case '&':
13712 {
13713 const char *name = get_some_local_dynamic_name ();
13714 if (name == NULL)
13715 output_operand_lossage ("'%%&' used without any "
13716 "local dynamic TLS references");
13717 else
13718 assemble_name (file, name);
13719 return;
13720 }
13721
13722 case 'A':
13723 switch (ASSEMBLER_DIALECT)
13724 {
13725 case ASM_ATT:
13726 putc ('*', file);
13727 break;
13728
13729 case ASM_INTEL:
13730 /* Intel syntax. For absolute addresses, registers should not
13731 be surrounded by brackets. */
13732 if (!REG_P (x))
13733 {
13734 putc ('[', file);
13735 ix86_print_operand (file, x, 0);
13736 putc (']', file);
13737 return;
13738 }
13739 break;
13740
13741 default:
13742 gcc_unreachable ();
13743 }
13744
13745 ix86_print_operand (file, x, 0);
13746 return;
13747
13748
13749 case 'L':
13750 if (ASSEMBLER_DIALECT == ASM_ATT)
13751 putc ('l', file);
13752 return;
13753
13754 case 'W':
13755 if (ASSEMBLER_DIALECT == ASM_ATT)
13756 putc ('w', file);
13757 return;
13758
13759 case 'B':
13760 if (ASSEMBLER_DIALECT == ASM_ATT)
13761 putc ('b', file);
13762 return;
13763
13764 case 'Q':
13765 if (ASSEMBLER_DIALECT == ASM_ATT)
13766 putc ('l', file);
13767 return;
13768
13769 case 'S':
13770 if (ASSEMBLER_DIALECT == ASM_ATT)
13771 putc ('s', file);
13772 return;
13773
13774 case 'T':
13775 if (ASSEMBLER_DIALECT == ASM_ATT)
13776 putc ('t', file);
13777 return;
13778
13779 case 'z':
13780 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13781 {
13782 /* Opcodes don't get size suffixes if using Intel opcodes. */
13783 if (ASSEMBLER_DIALECT == ASM_INTEL)
13784 return;
13785
13786 switch (GET_MODE_SIZE (GET_MODE (x)))
13787 {
13788 case 1:
13789 putc ('b', file);
13790 return;
13791
13792 case 2:
13793 putc ('w', file);
13794 return;
13795
13796 case 4:
13797 putc ('l', file);
13798 return;
13799
13800 case 8:
13801 putc ('q', file);
13802 return;
13803
13804 default:
13805 output_operand_lossage
13806 ("invalid operand size for operand code '%c'", code);
13807 return;
13808 }
13809 }
13810
13811 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13812 warning
13813 (0, "non-integer operand used with operand code '%c'", code);
13814 /* FALLTHRU */
13815
13816 case 'Z':
13817 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
13818 if (ASSEMBLER_DIALECT == ASM_INTEL)
13819 return;
13820
13821 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13822 {
13823 switch (GET_MODE_SIZE (GET_MODE (x)))
13824 {
13825 case 2:
13826 #ifdef HAVE_AS_IX86_FILDS
13827 putc ('s', file);
13828 #endif
13829 return;
13830
13831 case 4:
13832 putc ('l', file);
13833 return;
13834
13835 case 8:
13836 #ifdef HAVE_AS_IX86_FILDQ
13837 putc ('q', file);
13838 #else
13839 fputs ("ll", file);
13840 #endif
13841 return;
13842
13843 default:
13844 break;
13845 }
13846 }
13847 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13848 {
13849 /* 387 opcodes don't get size suffixes
13850 if the operands are registers. */
13851 if (STACK_REG_P (x))
13852 return;
13853
13854 switch (GET_MODE_SIZE (GET_MODE (x)))
13855 {
13856 case 4:
13857 putc ('s', file);
13858 return;
13859
13860 case 8:
13861 putc ('l', file);
13862 return;
13863
13864 case 12:
13865 case 16:
13866 putc ('t', file);
13867 return;
13868
13869 default:
13870 break;
13871 }
13872 }
13873 else
13874 {
13875 output_operand_lossage
13876 ("invalid operand type used with operand code '%c'", code);
13877 return;
13878 }
13879
13880 output_operand_lossage
13881 ("invalid operand size for operand code '%c'", code);
13882 return;
13883
13884 case 'd':
13885 case 'b':
13886 case 'w':
13887 case 'k':
13888 case 'q':
13889 case 'h':
13890 case 't':
13891 case 'y':
13892 case 'x':
13893 case 'X':
13894 case 'P':
13895 case 'p':
13896 break;
13897
13898 case 's':
13899 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13900 {
13901 ix86_print_operand (file, x, 0);
13902 fputs (", ", file);
13903 }
13904 return;
13905
13906 case 'D':
13907 /* A little bit of brain damage here. The SSE compare instructions
13908 use completely different names for the comparisons than the
13909 fp conditional moves do. */
13910 if (TARGET_AVX)
13911 {
13912 switch (GET_CODE (x))
13913 {
13914 case EQ:
13915 fputs ("eq", file);
13916 break;
13917 case UNEQ:
13918 fputs ("eq_us", file);
13919 break;
13920 case LT:
13921 fputs ("lt", file);
13922 break;
13923 case UNLT:
13924 fputs ("nge", file);
13925 break;
13926 case LE:
13927 fputs ("le", file);
13928 break;
13929 case UNLE:
13930 fputs ("ngt", file);
13931 break;
13932 case UNORDERED:
13933 fputs ("unord", file);
13934 break;
13935 case NE:
13936 fputs ("neq", file);
13937 break;
13938 case LTGT:
13939 fputs ("neq_oq", file);
13940 break;
13941 case GE:
13942 fputs ("ge", file);
13943 break;
13944 case UNGE:
13945 fputs ("nlt", file);
13946 break;
13947 case GT:
13948 fputs ("gt", file);
13949 break;
13950 case UNGT:
13951 fputs ("nle", file);
13952 break;
13953 case ORDERED:
13954 fputs ("ord", file);
13955 break;
13956 default:
13957 output_operand_lossage ("operand is not a condition code, "
13958 "invalid operand code 'D'");
13959 return;
13960 }
13961 }
13962 else
13963 {
13964 switch (GET_CODE (x))
13965 {
13966 case EQ:
13967 case UNEQ:
13968 fputs ("eq", file);
13969 break;
13970 case LT:
13971 case UNLT:
13972 fputs ("lt", file);
13973 break;
13974 case LE:
13975 case UNLE:
13976 fputs ("le", file);
13977 break;
13978 case UNORDERED:
13979 fputs ("unord", file);
13980 break;
13981 case NE:
13982 case LTGT:
13983 fputs ("neq", file);
13984 break;
13985 case UNGE:
13986 case GE:
13987 fputs ("nlt", file);
13988 break;
13989 case UNGT:
13990 case GT:
13991 fputs ("nle", file);
13992 break;
13993 case ORDERED:
13994 fputs ("ord", file);
13995 break;
13996 default:
13997 output_operand_lossage ("operand is not a condition code, "
13998 "invalid operand code 'D'");
13999 return;
14000 }
14001 }
14002 return;
14003 case 'O':
14004 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14005 if (ASSEMBLER_DIALECT == ASM_ATT)
14006 {
14007 switch (GET_MODE (x))
14008 {
14009 case HImode: putc ('w', file); break;
14010 case SImode:
14011 case SFmode: putc ('l', file); break;
14012 case DImode:
14013 case DFmode: putc ('q', file); break;
14014 default: gcc_unreachable ();
14015 }
14016 putc ('.', file);
14017 }
14018 #endif
14019 return;
14020 case 'C':
14021 if (!COMPARISON_P (x))
14022 {
14023 output_operand_lossage ("operand is neither a constant nor a "
14024 "condition code, invalid operand code "
14025 "'C'");
14026 return;
14027 }
14028 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14029 return;
14030 case 'F':
14031 if (!COMPARISON_P (x))
14032 {
14033 output_operand_lossage ("operand is neither a constant nor a "
14034 "condition code, invalid operand code "
14035 "'F'");
14036 return;
14037 }
14038 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14039 if (ASSEMBLER_DIALECT == ASM_ATT)
14040 putc ('.', file);
14041 #endif
14042 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14043 return;
14044
14045 /* Like above, but reverse condition */
14046 case 'c':
14047 /* Check to see if argument to %c is really a constant
14048 and not a condition code which needs to be reversed. */
14049 if (!COMPARISON_P (x))
14050 {
14051 output_operand_lossage ("operand is neither a constant nor a "
14052 "condition code, invalid operand "
14053 "code 'c'");
14054 return;
14055 }
14056 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14057 return;
14058 case 'f':
14059 if (!COMPARISON_P (x))
14060 {
14061 output_operand_lossage ("operand is neither a constant nor a "
14062 "condition code, invalid operand "
14063 "code 'f'");
14064 return;
14065 }
14066 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14067 if (ASSEMBLER_DIALECT == ASM_ATT)
14068 putc ('.', file);
14069 #endif
14070 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14071 return;
14072
14073 case 'H':
14074 /* It doesn't actually matter what mode we use here, as we're
14075 only going to use this for printing. */
14076 x = adjust_address_nv (x, DImode, 8);
14077 break;
14078
14079 case '+':
14080 {
14081 rtx x;
14082
14083 if (!optimize
14084 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14085 return;
14086
14087 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14088 if (x)
14089 {
14090 int pred_val = INTVAL (XEXP (x, 0));
14091
14092 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14093 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14094 {
14095 int taken = pred_val > REG_BR_PROB_BASE / 2;
14096 int cputaken = final_forward_branch_p (current_output_insn) == 0;
14097
14098 /* Emit hints only when the default branch prediction
14099 heuristics would fail. */
14100 if (taken != cputaken)
14101 {
14102 /* We use 3e (DS) prefix for taken branches and
14103 2e (CS) prefix for not taken branches. */
14104 if (taken)
14105 fputs ("ds ; ", file);
14106 else
14107 fputs ("cs ; ", file);
14108 }
14109 }
14110 }
14111 return;
14112 }
14113
14114 case 'Y':
14115 switch (GET_CODE (x))
14116 {
14117 case NE:
14118 fputs ("neq", file);
14119 break;
14120 case EQ:
14121 fputs ("eq", file);
14122 break;
14123 case GE:
14124 case GEU:
14125 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14126 break;
14127 case GT:
14128 case GTU:
14129 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14130 break;
14131 case LE:
14132 case LEU:
14133 fputs ("le", file);
14134 break;
14135 case LT:
14136 case LTU:
14137 fputs ("lt", file);
14138 break;
14139 case UNORDERED:
14140 fputs ("unord", file);
14141 break;
14142 case ORDERED:
14143 fputs ("ord", file);
14144 break;
14145 case UNEQ:
14146 fputs ("ueq", file);
14147 break;
14148 case UNGE:
14149 fputs ("nlt", file);
14150 break;
14151 case UNGT:
14152 fputs ("nle", file);
14153 break;
14154 case UNLE:
14155 fputs ("ule", file);
14156 break;
14157 case UNLT:
14158 fputs ("ult", file);
14159 break;
14160 case LTGT:
14161 fputs ("une", file);
14162 break;
14163 default:
14164 output_operand_lossage ("operand is not a condition code, "
14165 "invalid operand code 'Y'");
14166 return;
14167 }
14168 return;
14169
14170 case ';':
14171 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14172 putc (';', file);
14173 #endif
14174 return;
14175
14176 case '@':
14177 if (ASSEMBLER_DIALECT == ASM_ATT)
14178 putc ('%', file);
14179
14180 /* The kernel uses a different segment register for performance
14181 reasons; a system call would not have to trash the userspace
14182 segment register, which would be expensive. */
14183 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14184 fputs ("fs", file);
14185 else
14186 fputs ("gs", file);
14187 return;
14188
14189 case '~':
14190 putc (TARGET_AVX2 ? 'i' : 'f', file);
14191 return;
14192
14193 default:
14194 output_operand_lossage ("invalid operand code '%c'", code);
14195 }
14196 }
14197
14198 if (REG_P (x))
14199 print_reg (x, code, file);
14200
14201 else if (MEM_P (x))
14202 {
14203 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14204 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14205 && GET_MODE (x) != BLKmode)
14206 {
14207 const char * size;
14208 switch (GET_MODE_SIZE (GET_MODE (x)))
14209 {
14210 case 1: size = "BYTE"; break;
14211 case 2: size = "WORD"; break;
14212 case 4: size = "DWORD"; break;
14213 case 8: size = "QWORD"; break;
14214 case 12: size = "TBYTE"; break;
14215 case 16:
14216 if (GET_MODE (x) == XFmode)
14217 size = "TBYTE";
14218 else
14219 size = "XMMWORD";
14220 break;
14221 case 32: size = "YMMWORD"; break;
14222 default:
14223 gcc_unreachable ();
14224 }
14225
14226 /* Check for explicit size override (codes 'b', 'w', 'k',
14227 'q' and 'x') */
14228 if (code == 'b')
14229 size = "BYTE";
14230 else if (code == 'w')
14231 size = "WORD";
14232 else if (code == 'k')
14233 size = "DWORD";
14234 else if (code == 'q')
14235 size = "QWORD";
14236 else if (code == 'x')
14237 size = "XMMWORD";
14238
14239 fputs (size, file);
14240 fputs (" PTR ", file);
14241 }
14242
14243 x = XEXP (x, 0);
14244 /* Avoid (%rip) for call operands. */
14245 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14246 && !CONST_INT_P (x))
14247 output_addr_const (file, x);
14248 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14249 output_operand_lossage ("invalid constraints for operand");
14250 else
14251 output_address (x);
14252 }
14253
14254 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14255 {
14256 REAL_VALUE_TYPE r;
14257 long l;
14258
14259 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14260 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14261
14262 if (ASSEMBLER_DIALECT == ASM_ATT)
14263 putc ('$', file);
14264 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14265 if (code == 'q')
14266 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14267 else
14268 fprintf (file, "0x%08x", (unsigned int) l);
14269 }
14270
14271 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14272 {
14273 REAL_VALUE_TYPE r;
14274 long l[2];
14275
14276 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14277 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14278
14279 if (ASSEMBLER_DIALECT == ASM_ATT)
14280 putc ('$', file);
14281 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14282 }
14283
14284 /* These float cases don't actually occur as immediate operands. */
14285 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14286 {
14287 char dstr[30];
14288
14289 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14290 fputs (dstr, file);
14291 }
14292
14293 else
14294 {
14295 /* We have patterns that allow zero sets of memory, for instance.
14296 In 64-bit mode, we should probably support all 8-byte vectors,
14297 since we can in fact encode that into an immediate. */
14298 if (GET_CODE (x) == CONST_VECTOR)
14299 {
14300 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14301 x = const0_rtx;
14302 }
14303
14304 if (code != 'P' && code != 'p')
14305 {
14306 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14307 {
14308 if (ASSEMBLER_DIALECT == ASM_ATT)
14309 putc ('$', file);
14310 }
14311 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14312 || GET_CODE (x) == LABEL_REF)
14313 {
14314 if (ASSEMBLER_DIALECT == ASM_ATT)
14315 putc ('$', file);
14316 else
14317 fputs ("OFFSET FLAT:", file);
14318 }
14319 }
14320 if (CONST_INT_P (x))
14321 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14322 else if (flag_pic || MACHOPIC_INDIRECT)
14323 output_pic_addr_const (file, x, code);
14324 else
14325 output_addr_const (file, x);
14326 }
14327 }
14328
14329 static bool
14330 ix86_print_operand_punct_valid_p (unsigned char code)
14331 {
14332 return (code == '@' || code == '*' || code == '+'
14333 || code == '&' || code == ';' || code == '~');
14334 }
14335 \f
14336 /* Print a memory operand whose address is ADDR. */
14337
14338 static void
14339 ix86_print_operand_address (FILE *file, rtx addr)
14340 {
14341 struct ix86_address parts;
14342 rtx base, index, disp;
14343 int scale;
14344 int ok;
14345 bool vsib = false;
14346
14347 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14348 {
14349 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14350 gcc_assert (parts.index == NULL_RTX);
14351 parts.index = XVECEXP (addr, 0, 1);
14352 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14353 addr = XVECEXP (addr, 0, 0);
14354 vsib = true;
14355 }
14356 else
14357 ok = ix86_decompose_address (addr, &parts);
14358
14359 gcc_assert (ok);
14360
14361 if (parts.base && GET_CODE (parts.base) == SUBREG)
14362 {
14363 rtx tmp = SUBREG_REG (parts.base);
14364 parts.base = simplify_subreg (GET_MODE (parts.base),
14365 tmp, GET_MODE (tmp), 0);
14366 }
14367
14368 if (parts.index && GET_CODE (parts.index) == SUBREG)
14369 {
14370 rtx tmp = SUBREG_REG (parts.index);
14371 parts.index = simplify_subreg (GET_MODE (parts.index),
14372 tmp, GET_MODE (tmp), 0);
14373 }
14374
14375 base = parts.base;
14376 index = parts.index;
14377 disp = parts.disp;
14378 scale = parts.scale;
14379
14380 switch (parts.seg)
14381 {
14382 case SEG_DEFAULT:
14383 break;
14384 case SEG_FS:
14385 case SEG_GS:
14386 if (ASSEMBLER_DIALECT == ASM_ATT)
14387 putc ('%', file);
14388 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14389 break;
14390 default:
14391 gcc_unreachable ();
14392 }
14393
14394 /* Use one-byte-shorter RIP-relative addressing in 64bit mode. */
14395 if (TARGET_64BIT && !base && !index)
14396 {
14397 rtx symbol = disp;
14398
14399 if (GET_CODE (disp) == CONST
14400 && GET_CODE (XEXP (disp, 0)) == PLUS
14401 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14402 symbol = XEXP (XEXP (disp, 0), 0);
14403
14404 if (GET_CODE (symbol) == LABEL_REF
14405 || (GET_CODE (symbol) == SYMBOL_REF
14406 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14407 base = pc_rtx;
14408 }
14409 if (!base && !index)
14410 {
14411 /* A displacement-only address requires special attention. */
14412
14413 if (CONST_INT_P (disp))
14414 {
14415 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14416 fputs ("ds:", file);
14417 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14418 }
14419 else if (flag_pic)
14420 output_pic_addr_const (file, disp, 0);
14421 else
14422 output_addr_const (file, disp);
14423 }
14424 else
14425 {
14426 int code = 0;
14427
14428 /* Print SImode registers for zero-extended addresses to force
14429 addr32 prefix. Otherwise print DImode registers to avoid it. */
14430 if (TARGET_64BIT)
14431 code = ((GET_CODE (addr) == ZERO_EXTEND
14432 || GET_CODE (addr) == AND)
14433 ? 'l'
14434 : 'q');
14435
14436 if (ASSEMBLER_DIALECT == ASM_ATT)
14437 {
14438 if (disp)
14439 {
14440 if (flag_pic)
14441 output_pic_addr_const (file, disp, 0);
14442 else if (GET_CODE (disp) == LABEL_REF)
14443 output_asm_label (disp);
14444 else
14445 output_addr_const (file, disp);
14446 }
14447
14448 putc ('(', file);
14449 if (base)
14450 print_reg (base, code, file);
14451 if (index)
14452 {
14453 putc (',', file);
14454 print_reg (index, vsib ? 0 : code, file);
14455 if (scale != 1 || vsib)
14456 fprintf (file, ",%d", scale);
14457 }
14458 putc (')', file);
14459 }
14460 else
14461 {
14462 rtx offset = NULL_RTX;
14463
14464 if (disp)
14465 {
14466 /* Pull out the offset of a symbol; print any symbol itself. */
14467 if (GET_CODE (disp) == CONST
14468 && GET_CODE (XEXP (disp, 0)) == PLUS
14469 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14470 {
14471 offset = XEXP (XEXP (disp, 0), 1);
14472 disp = gen_rtx_CONST (VOIDmode,
14473 XEXP (XEXP (disp, 0), 0));
14474 }
14475
14476 if (flag_pic)
14477 output_pic_addr_const (file, disp, 0);
14478 else if (GET_CODE (disp) == LABEL_REF)
14479 output_asm_label (disp);
14480 else if (CONST_INT_P (disp))
14481 offset = disp;
14482 else
14483 output_addr_const (file, disp);
14484 }
14485
14486 putc ('[', file);
14487 if (base)
14488 {
14489 print_reg (base, code, file);
14490 if (offset)
14491 {
14492 if (INTVAL (offset) >= 0)
14493 putc ('+', file);
14494 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14495 }
14496 }
14497 else if (offset)
14498 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14499 else
14500 putc ('0', file);
14501
14502 if (index)
14503 {
14504 putc ('+', file);
14505 print_reg (index, vsib ? 0 : code, file);
14506 if (scale != 1 || vsib)
14507 fprintf (file, "*%d", scale);
14508 }
14509 putc (']', file);
14510 }
14511 }
14512 }
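
/* For illustration (hypothetical registers): given base %rax, index %rbx,
   scale 4 and displacement 16, the AT&T branch above prints

     16(%rax,%rbx,4)

   while the Intel branch prints the same address as

     [rax+16+rbx*4]

   with the segment override printed just above and the "<size> PTR"
   keyword emitted by ix86_print_operand. */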
14513
14514 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14515
14516 static bool
14517 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14518 {
14519 rtx op;
14520
14521 if (GET_CODE (x) != UNSPEC)
14522 return false;
14523
14524 op = XVECEXP (x, 0, 0);
14525 switch (XINT (x, 1))
14526 {
14527 case UNSPEC_GOTTPOFF:
14528 output_addr_const (file, op);
14529 /* FIXME: This might be @TPOFF in Sun ld. */
14530 fputs ("@gottpoff", file);
14531 break;
14532 case UNSPEC_TPOFF:
14533 output_addr_const (file, op);
14534 fputs ("@tpoff", file);
14535 break;
14536 case UNSPEC_NTPOFF:
14537 output_addr_const (file, op);
14538 if (TARGET_64BIT)
14539 fputs ("@tpoff", file);
14540 else
14541 fputs ("@ntpoff", file);
14542 break;
14543 case UNSPEC_DTPOFF:
14544 output_addr_const (file, op);
14545 fputs ("@dtpoff", file);
14546 break;
14547 case UNSPEC_GOTNTPOFF:
14548 output_addr_const (file, op);
14549 if (TARGET_64BIT)
14550 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14551 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14552 else
14553 fputs ("@gotntpoff", file);
14554 break;
14555 case UNSPEC_INDNTPOFF:
14556 output_addr_const (file, op);
14557 fputs ("@indntpoff", file);
14558 break;
14559 #if TARGET_MACHO
14560 case UNSPEC_MACHOPIC_OFFSET:
14561 output_addr_const (file, op);
14562 putc ('-', file);
14563 machopic_output_function_base_name (file);
14564 break;
14565 #endif
14566
14567 case UNSPEC_STACK_CHECK:
14568 {
14569 int offset;
14570
14571 gcc_assert (flag_split_stack);
14572
14573 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14574 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14575 #else
14576 gcc_unreachable ();
14577 #endif
14578
14579 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14580 }
14581 break;
14582
14583 default:
14584 return false;
14585 }
14586
14587 return true;
14588 }
14589 \f
14590 /* Split one or more double-mode RTL references into pairs of half-mode
14591 references. The RTL can be REG, offsettable MEM, integer constant, or
14592 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14593 split and "num" is its length. lo_half and hi_half are output arrays
14594 that parallel "operands". */
14595
14596 void
14597 split_double_mode (enum machine_mode mode, rtx operands[],
14598 int num, rtx lo_half[], rtx hi_half[])
14599 {
14600 enum machine_mode half_mode;
14601 unsigned int byte;
14602
14603 switch (mode)
14604 {
14605 case TImode:
14606 half_mode = DImode;
14607 break;
14608 case DImode:
14609 half_mode = SImode;
14610 break;
14611 default:
14612 gcc_unreachable ();
14613 }
14614
14615 byte = GET_MODE_SIZE (half_mode);
14616
14617 while (num--)
14618 {
14619 rtx op = operands[num];
14620
14621 /* simplify_subreg refuses to split volatile memory addresses,
14622 but we still have to handle them. */
14623 if (MEM_P (op))
14624 {
14625 lo_half[num] = adjust_address (op, half_mode, 0);
14626 hi_half[num] = adjust_address (op, half_mode, byte);
14627 }
14628 else
14629 {
14630 lo_half[num] = simplify_gen_subreg (half_mode, op,
14631 GET_MODE (op) == VOIDmode
14632 ? mode : GET_MODE (op), 0);
14633 hi_half[num] = simplify_gen_subreg (half_mode, op,
14634 GET_MODE (op) == VOIDmode
14635 ? mode : GET_MODE (op), byte);
14636 }
14637 }
14638 }
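
/* A minimal usage sketch (hypothetical operand array, illustration only):

     rtx lo[1], hi[1];
     split_double_mode (DImode, &operands[0], 1, lo, hi);

   after which lo[0] and hi[0] are the SImode low and high halves of
   operands[0], ready for two word-sized moves. */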
14639 \f
14640 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14641 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14642 is the expression of the binary operation. The output may either be
14643 emitted here, or returned to the caller, like all output_* functions.
14644
14645 There is no guarantee that the operands are the same mode, as they
14646 might be within FLOAT or FLOAT_EXTEND expressions. */
14647
14648 #ifndef SYSV386_COMPAT
14649 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14650 wants to fix the assemblers because that causes incompatibility
14651 with gcc. No-one wants to fix gcc because that causes
14652 incompatibility with assemblers... You can use the option of
14653 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14654 #define SYSV386_COMPAT 1
14655 #endif
14656
14657 const char *
14658 output_387_binary_op (rtx insn, rtx *operands)
14659 {
14660 static char buf[40];
14661 const char *p;
14662 const char *ssep;
14663 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14664
14665 #ifdef ENABLE_CHECKING
14666 /* Even if we do not want to check the inputs, this documents the input
14667 constraints, which helps in understanding the following code. */
14668 if (STACK_REG_P (operands[0])
14669 && ((REG_P (operands[1])
14670 && REGNO (operands[0]) == REGNO (operands[1])
14671 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14672 || (REG_P (operands[2])
14673 && REGNO (operands[0]) == REGNO (operands[2])
14674 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14675 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14676 ; /* ok */
14677 else
14678 gcc_assert (is_sse);
14679 #endif
14680
14681 switch (GET_CODE (operands[3]))
14682 {
14683 case PLUS:
14684 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14685 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14686 p = "fiadd";
14687 else
14688 p = "fadd";
14689 ssep = "vadd";
14690 break;
14691
14692 case MINUS:
14693 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14694 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14695 p = "fisub";
14696 else
14697 p = "fsub";
14698 ssep = "vsub";
14699 break;
14700
14701 case MULT:
14702 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14703 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14704 p = "fimul";
14705 else
14706 p = "fmul";
14707 ssep = "vmul";
14708 break;
14709
14710 case DIV:
14711 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14712 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14713 p = "fidiv";
14714 else
14715 p = "fdiv";
14716 ssep = "vdiv";
14717 break;
14718
14719 default:
14720 gcc_unreachable ();
14721 }
14722
14723 if (is_sse)
14724 {
14725 if (TARGET_AVX)
14726 {
14727 strcpy (buf, ssep);
14728 if (GET_MODE (operands[0]) == SFmode)
14729 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14730 else
14731 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14732 }
14733 else
14734 {
14735 strcpy (buf, ssep + 1);
14736 if (GET_MODE (operands[0]) == SFmode)
14737 strcat (buf, "ss\t{%2, %0|%0, %2}");
14738 else
14739 strcat (buf, "sd\t{%2, %0|%0, %2}");
14740 }
14741 return buf;
14742 }
14743 strcpy (buf, p);
14744
14745 switch (GET_CODE (operands[3]))
14746 {
14747 case MULT:
14748 case PLUS:
14749 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14750 {
14751 rtx temp = operands[2];
14752 operands[2] = operands[1];
14753 operands[1] = temp;
14754 }
14755
14756 /* We know operands[0] == operands[1]. */
14757
14758 if (MEM_P (operands[2]))
14759 {
14760 p = "%Z2\t%2";
14761 break;
14762 }
14763
14764 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14765 {
14766 if (STACK_TOP_P (operands[0]))
14767 /* How is it that we are storing to a dead operand[2]?
14768 Well, presumably operands[1] is dead too. We can't
14769 store the result to st(0) as st(0) gets popped on this
14770 instruction. Instead store to operands[2] (which I
14771 think has to be st(1)). st(1) will be popped later.
14772 gcc <= 2.8.1 didn't have this check and generated
14773 assembly code that the Unixware assembler rejected. */
14774 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14775 else
14776 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14777 break;
14778 }
14779
14780 if (STACK_TOP_P (operands[0]))
14781 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14782 else
14783 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14784 break;
14785
14786 case MINUS:
14787 case DIV:
14788 if (MEM_P (operands[1]))
14789 {
14790 p = "r%Z1\t%1";
14791 break;
14792 }
14793
14794 if (MEM_P (operands[2]))
14795 {
14796 p = "%Z2\t%2";
14797 break;
14798 }
14799
14800 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14801 {
14802 #if SYSV386_COMPAT
14803 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14804 derived assemblers, confusingly reverse the direction of
14805 the operation for fsub{r} and fdiv{r} when the
14806 destination register is not st(0). The Intel assembler
14807 doesn't have this brain damage. Read !SYSV386_COMPAT to
14808 figure out what the hardware really does. */
14809 if (STACK_TOP_P (operands[0]))
14810 p = "{p\t%0, %2|rp\t%2, %0}";
14811 else
14812 p = "{rp\t%2, %0|p\t%0, %2}";
14813 #else
14814 if (STACK_TOP_P (operands[0]))
14815 /* As above for fmul/fadd, we can't store to st(0). */
14816 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14817 else
14818 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14819 #endif
14820 break;
14821 }
14822
14823 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14824 {
14825 #if SYSV386_COMPAT
14826 if (STACK_TOP_P (operands[0]))
14827 p = "{rp\t%0, %1|p\t%1, %0}";
14828 else
14829 p = "{p\t%1, %0|rp\t%0, %1}";
14830 #else
14831 if (STACK_TOP_P (operands[0]))
14832 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14833 else
14834 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14835 #endif
14836 break;
14837 }
14838
14839 if (STACK_TOP_P (operands[0]))
14840 {
14841 if (STACK_TOP_P (operands[1]))
14842 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14843 else
14844 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14845 break;
14846 }
14847 else if (STACK_TOP_P (operands[1]))
14848 {
14849 #if SYSV386_COMPAT
14850 p = "{\t%1, %0|r\t%0, %1}";
14851 #else
14852 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14853 #endif
14854 }
14855 else
14856 {
14857 #if SYSV386_COMPAT
14858 p = "{r\t%2, %0|\t%0, %2}";
14859 #else
14860 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14861 #endif
14862 }
14863 break;
14864
14865 default:
14866 gcc_unreachable ();
14867 }
14868
14869 strcat (buf, p);
14870 return buf;
14871 }
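
/* For example, for a scalar SFmode SSE add the template returned above is
   "vaddss\t{%2, %1, %0|%0, %1, %2}" when AVX is enabled, and
   "addss\t{%2, %0|%0, %2}" otherwise (ssep + 1 skips the 'v'). */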
14872
14873 /* Return needed mode for entity in optimize_mode_switching pass. */
14874
14875 int
14876 ix86_mode_needed (int entity, rtx insn)
14877 {
14878 enum attr_i387_cw mode;
14879
14880 /* The mode UNINITIALIZED is used to store the control word after a
14881 function call or ASM pattern. The mode ANY specifies that the
14882 function has no requirements on the control word and makes no changes
14883 in the bits we are interested in. */
14884
14885 if (CALL_P (insn)
14886 || (NONJUMP_INSN_P (insn)
14887 && (asm_noperands (PATTERN (insn)) >= 0
14888 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14889 return I387_CW_UNINITIALIZED;
14890
14891 if (recog_memoized (insn) < 0)
14892 return I387_CW_ANY;
14893
14894 mode = get_attr_i387_cw (insn);
14895
14896 switch (entity)
14897 {
14898 case I387_TRUNC:
14899 if (mode == I387_CW_TRUNC)
14900 return mode;
14901 break;
14902
14903 case I387_FLOOR:
14904 if (mode == I387_CW_FLOOR)
14905 return mode;
14906 break;
14907
14908 case I387_CEIL:
14909 if (mode == I387_CW_CEIL)
14910 return mode;
14911 break;
14912
14913 case I387_MASK_PM:
14914 if (mode == I387_CW_MASK_PM)
14915 return mode;
14916 break;
14917
14918 default:
14919 gcc_unreachable ();
14920 }
14921
14922 return I387_CW_ANY;
14923 }
14924
14925 /* Output code to initialize control word copies used by trunc?f?i and
14926 rounding patterns. CURRENT_MODE is set to the current control word,
14927 while NEW_MODE is set to the new control word. */
14928
14929 void
14930 emit_i387_cw_initialization (int mode)
14931 {
14932 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14933 rtx new_mode;
14934
14935 enum ix86_stack_slot slot;
14936
14937 rtx reg = gen_reg_rtx (HImode);
14938
14939 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14940 emit_move_insn (reg, copy_rtx (stored_mode));
14941
14942 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14943 || optimize_function_for_size_p (cfun))
14944 {
14945 switch (mode)
14946 {
14947 case I387_CW_TRUNC:
14948 /* round toward zero (truncate) */
14949 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14950 slot = SLOT_CW_TRUNC;
14951 break;
14952
14953 case I387_CW_FLOOR:
14954 /* round down toward -oo */
14955 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14956 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14957 slot = SLOT_CW_FLOOR;
14958 break;
14959
14960 case I387_CW_CEIL:
14961 /* round up toward +oo */
14962 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14963 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
14964 slot = SLOT_CW_CEIL;
14965 break;
14966
14967 case I387_CW_MASK_PM:
14968 /* mask precision exception for nearbyint() */
14969 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14970 slot = SLOT_CW_MASK_PM;
14971 break;
14972
14973 default:
14974 gcc_unreachable ();
14975 }
14976 }
14977 else
14978 {
14979 switch (mode)
14980 {
14981 case I387_CW_TRUNC:
14982 /* round toward zero (truncate) */
14983 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
14984 slot = SLOT_CW_TRUNC;
14985 break;
14986
14987 case I387_CW_FLOOR:
14988 /* round down toward -oo */
14989 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
14990 slot = SLOT_CW_FLOOR;
14991 break;
14992
14993 case I387_CW_CEIL:
14994 /* round up toward +oo */
14995 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
14996 slot = SLOT_CW_CEIL;
14997 break;
14998
14999 case I387_CW_MASK_PM:
15000 /* mask precision exception for nearbyint() */
15001 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15002 slot = SLOT_CW_MASK_PM;
15003 break;
15004
15005 default:
15006 gcc_unreachable ();
15007 }
15008 }
15009
15010 gcc_assert (slot < MAX_386_STACK_LOCALS);
15011
15012 new_mode = assign_386_stack_local (HImode, slot);
15013 emit_move_insn (new_mode, reg);
15014 }
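
/* For reference, the constants above follow the standard x87 control word
   layout: bits 10-11 are the rounding control field (0x0400 round down,
   0x0800 round up, 0x0c00 truncate; 0 rounds to nearest), and bit 5
   (0x0020) is the precision exception mask used for nearbyint. The insv
   variants write the same rounding bits through the control word's high
   byte. */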
15015
15016 /* Output code for INSN to convert a float to a signed int. OPERANDS
15017 are the insn operands. The output may be [HSD]Imode and the input
15018 operand may be [SDX]Fmode. */
15019
15020 const char *
15021 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15022 {
15023 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15024 int dimode_p = GET_MODE (operands[0]) == DImode;
15025 int round_mode = get_attr_i387_cw (insn);
15026
15027 /* Jump through a hoop or two for DImode, since the hardware has no
15028 non-popping instruction. We used to do this a different way, but
15029 that was somewhat fragile and broke with post-reload splitters. */
15030 if ((dimode_p || fisttp) && !stack_top_dies)
15031 output_asm_insn ("fld\t%y1", operands);
15032
15033 gcc_assert (STACK_TOP_P (operands[1]));
15034 gcc_assert (MEM_P (operands[0]));
15035 gcc_assert (GET_MODE (operands[1]) != TFmode);
15036
15037 if (fisttp)
15038 output_asm_insn ("fisttp%Z0\t%0", operands);
15039 else
15040 {
15041 if (round_mode != I387_CW_ANY)
15042 output_asm_insn ("fldcw\t%3", operands);
15043 if (stack_top_dies || dimode_p)
15044 output_asm_insn ("fistp%Z0\t%0", operands);
15045 else
15046 output_asm_insn ("fist%Z0\t%0", operands);
15047 if (round_mode != I387_CW_ANY)
15048 output_asm_insn ("fldcw\t%2", operands);
15049 }
15050
15051 return "";
15052 }
15053
15054 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15055 have the values zero or one, indicates the ffreep insn's operand
15056 from the OPERANDS array. */
15057
15058 static const char *
15059 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15060 {
15061 if (TARGET_USE_FFREEP)
15062 #ifdef HAVE_AS_IX86_FFREEP
15063 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15064 #else
15065 {
15066 static char retval[32];
15067 int regno = REGNO (operands[opno]);
15068
15069 gcc_assert (FP_REGNO_P (regno));
15070
15071 regno -= FIRST_STACK_REG;
15072
15073 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15074 return retval;
15075 }
15076 #endif
15077
15078 return opno ? "fstp\t%y1" : "fstp\t%y0";
15079 }
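
/* Illustrative example (assuming an ELF-style ASM_SHORT of ".value"):
   without assembler support the branch above emits the raw opcode bytes,
   e.g. ".value 0xc0df" for %st(0), which the little-endian directive lays
   out as 0xdf 0xc0, i.e. "ffreep %st(0)". */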
15080
15081
15082 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15083 should be used. UNORDERED_P is true when fucom should be used. */
15084
15085 const char *
15086 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15087 {
15088 int stack_top_dies;
15089 rtx cmp_op0, cmp_op1;
15090 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15091
15092 if (eflags_p)
15093 {
15094 cmp_op0 = operands[0];
15095 cmp_op1 = operands[1];
15096 }
15097 else
15098 {
15099 cmp_op0 = operands[1];
15100 cmp_op1 = operands[2];
15101 }
15102
15103 if (is_sse)
15104 {
15105 if (GET_MODE (operands[0]) == SFmode)
15106 if (unordered_p)
15107 return "%vucomiss\t{%1, %0|%0, %1}";
15108 else
15109 return "%vcomiss\t{%1, %0|%0, %1}";
15110 else
15111 if (unordered_p)
15112 return "%vucomisd\t{%1, %0|%0, %1}";
15113 else
15114 return "%vcomisd\t{%1, %0|%0, %1}";
15115 }
15116
15117 gcc_assert (STACK_TOP_P (cmp_op0));
15118
15119 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15120
15121 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15122 {
15123 if (stack_top_dies)
15124 {
15125 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15126 return output_387_ffreep (operands, 1);
15127 }
15128 else
15129 return "ftst\n\tfnstsw\t%0";
15130 }
15131
15132 if (STACK_REG_P (cmp_op1)
15133 && stack_top_dies
15134 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15135 && REGNO (cmp_op1) != FIRST_STACK_REG)
15136 {
15137 /* If both the top of the 387 stack and the other operand (which is
15138 also a stack register) die, then this must be a
15139 `fcompp' float compare. */
15140
15141 if (eflags_p)
15142 {
15143 /* There is no double popping fcomi variant. Fortunately,
15144 eflags is immune from the fstp's cc clobbering. */
15145 if (unordered_p)
15146 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15147 else
15148 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15149 return output_387_ffreep (operands, 0);
15150 }
15151 else
15152 {
15153 if (unordered_p)
15154 return "fucompp\n\tfnstsw\t%0";
15155 else
15156 return "fcompp\n\tfnstsw\t%0";
15157 }
15158 }
15159 else
15160 {
15161 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15162
15163 static const char * const alt[16] =
15164 {
15165 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15166 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15167 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15168 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15169
15170 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15171 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15172 NULL,
15173 NULL,
15174
15175 "fcomi\t{%y1, %0|%0, %y1}",
15176 "fcomip\t{%y1, %0|%0, %y1}",
15177 "fucomi\t{%y1, %0|%0, %y1}",
15178 "fucomip\t{%y1, %0|%0, %y1}",
15179
15180 NULL,
15181 NULL,
15182 NULL,
15183 NULL
15184 };
15185
15186 int mask;
15187 const char *ret;
15188
15189 mask = eflags_p << 3;
15190 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15191 mask |= unordered_p << 1;
15192 mask |= stack_top_dies;
15193
15194 gcc_assert (mask < 16);
15195 ret = alt[mask];
15196 gcc_assert (ret);
15197
15198 return ret;
15199 }
15200 }
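
/* For example, eflags_p = 1, a floating-point cmp_op1, unordered_p = 1
   and a dying stack top give mask 8 + 0 + 2 + 1 = 11, selecting the
   popping "fucomip\t{%y1, %0|%0, %y1}" template from the table above. */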
15201
15202 void
15203 ix86_output_addr_vec_elt (FILE *file, int value)
15204 {
15205 const char *directive = ASM_LONG;
15206
15207 #ifdef ASM_QUAD
15208 if (TARGET_LP64)
15209 directive = ASM_QUAD;
15210 #else
15211 gcc_assert (!TARGET_64BIT);
15212 #endif
15213
15214 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15215 }
15216
15217 void
15218 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15219 {
15220 const char *directive = ASM_LONG;
15221
15222 #ifdef ASM_QUAD
15223 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15224 directive = ASM_QUAD;
15225 #else
15226 gcc_assert (!TARGET_64BIT);
15227 #endif
15228 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15229 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15230 fprintf (file, "%s%s%d-%s%d\n",
15231 directive, LPREFIX, value, LPREFIX, rel);
15232 else if (HAVE_AS_GOTOFF_IN_DATA)
15233 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15234 #if TARGET_MACHO
15235 else if (TARGET_MACHO)
15236 {
15237 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15238 machopic_output_function_base_name (file);
15239 putc ('\n', file);
15240 }
15241 #endif
15242 else
15243 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15244 GOT_SYMBOL_NAME, LPREFIX, value);
15245 }
15246 \f
15247 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15248 for the target. */
15249
15250 void
15251 ix86_expand_clear (rtx dest)
15252 {
15253 rtx tmp;
15254
15255 /* We play register width games, which are only valid after reload. */
15256 gcc_assert (reload_completed);
15257
15258 /* Avoid HImode and its attendant prefix byte. */
15259 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15260 dest = gen_rtx_REG (SImode, REGNO (dest));
15261 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15262
15263 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15264 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15265 {
15266 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15267 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15268 }
15269
15270 emit_insn (tmp);
15271 }
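
/* For example (hypothetical register), clearing an HImode %ax is widened
   to SImode here, so the usual output is "xorl %eax, %eax" rather than an
   operand-size-prefixed "xorw %ax, %ax"; only when TARGET_USE_MOV0 is set
   and the insn is optimized for size does a plain "movl $0, %eax" remain. */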
15272
15273 /* X is an unchanging MEM. If it is a constant pool reference, return
15274 the constant pool rtx, else NULL. */
15275
15276 rtx
15277 maybe_get_pool_constant (rtx x)
15278 {
15279 x = ix86_delegitimize_address (XEXP (x, 0));
15280
15281 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15282 return get_pool_constant (x);
15283
15284 return NULL_RTX;
15285 }
15286
15287 void
15288 ix86_expand_move (enum machine_mode mode, rtx operands[])
15289 {
15290 rtx op0, op1;
15291 enum tls_model model;
15292
15293 op0 = operands[0];
15294 op1 = operands[1];
15295
15296 if (GET_CODE (op1) == SYMBOL_REF)
15297 {
15298 model = SYMBOL_REF_TLS_MODEL (op1);
15299 if (model)
15300 {
15301 op1 = legitimize_tls_address (op1, model, true);
15302 op1 = force_operand (op1, op0);
15303 if (op1 == op0)
15304 return;
15305 if (GET_MODE (op1) != mode)
15306 op1 = convert_to_mode (mode, op1, 1);
15307 }
15308 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15309 && SYMBOL_REF_DLLIMPORT_P (op1))
15310 op1 = legitimize_dllimport_symbol (op1, false);
15311 }
15312 else if (GET_CODE (op1) == CONST
15313 && GET_CODE (XEXP (op1, 0)) == PLUS
15314 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15315 {
15316 rtx addend = XEXP (XEXP (op1, 0), 1);
15317 rtx symbol = XEXP (XEXP (op1, 0), 0);
15318 rtx tmp = NULL;
15319
15320 model = SYMBOL_REF_TLS_MODEL (symbol);
15321 if (model)
15322 tmp = legitimize_tls_address (symbol, model, true);
15323 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15324 && SYMBOL_REF_DLLIMPORT_P (symbol))
15325 tmp = legitimize_dllimport_symbol (symbol, true);
15326
15327 if (tmp)
15328 {
15329 tmp = force_operand (tmp, NULL);
15330 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15331 op0, 1, OPTAB_DIRECT);
15332 if (tmp == op0)
15333 return;
15334 if (GET_MODE (tmp) != mode)
15335 op1 = convert_to_mode (mode, tmp, 1);
15336 }
15337 }
15338
15339 if ((flag_pic || MACHOPIC_INDIRECT)
15340 && symbolic_operand (op1, mode))
15341 {
15342 if (TARGET_MACHO && !TARGET_64BIT)
15343 {
15344 #if TARGET_MACHO
15345 /* dynamic-no-pic */
15346 if (MACHOPIC_INDIRECT)
15347 {
15348 rtx temp = ((reload_in_progress
15349 || ((op0 && REG_P (op0))
15350 && mode == Pmode))
15351 ? op0 : gen_reg_rtx (Pmode));
15352 op1 = machopic_indirect_data_reference (op1, temp);
15353 if (MACHOPIC_PURE)
15354 op1 = machopic_legitimize_pic_address (op1, mode,
15355 temp == op1 ? 0 : temp);
15356 }
15357 if (op0 != op1 && GET_CODE (op0) != MEM)
15358 {
15359 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15360 emit_insn (insn);
15361 return;
15362 }
15363 if (GET_CODE (op0) == MEM)
15364 op1 = force_reg (Pmode, op1);
15365 else
15366 {
15367 rtx temp = op0;
15368 if (GET_CODE (temp) != REG)
15369 temp = gen_reg_rtx (Pmode);
15370 temp = legitimize_pic_address (op1, temp);
15371 if (temp == op0)
15372 return;
15373 op1 = temp;
15374 }
15375 /* dynamic-no-pic */
15376 #endif
15377 }
15378 else
15379 {
15380 if (MEM_P (op0))
15381 op1 = force_reg (mode, op1);
15382 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15383 {
15384 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15385 op1 = legitimize_pic_address (op1, reg);
15386 if (op0 == op1)
15387 return;
15388 if (GET_MODE (op1) != mode)
15389 op1 = convert_to_mode (mode, op1, 1);
15390 }
15391 }
15392 }
15393 else
15394 {
15395 if (MEM_P (op0)
15396 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15397 || !push_operand (op0, mode))
15398 && MEM_P (op1))
15399 op1 = force_reg (mode, op1);
15400
15401 if (push_operand (op0, mode)
15402 && ! general_no_elim_operand (op1, mode))
15403 op1 = copy_to_mode_reg (mode, op1);
15404
15405 /* Force large constants in 64bit compilation into a register
15406 so that they get CSEd. */
15407 if (can_create_pseudo_p ()
15408 && (mode == DImode) && TARGET_64BIT
15409 && immediate_operand (op1, mode)
15410 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15411 && !register_operand (op0, mode)
15412 && optimize)
15413 op1 = copy_to_mode_reg (mode, op1);
15414
15415 if (can_create_pseudo_p ()
15416 && FLOAT_MODE_P (mode)
15417 && GET_CODE (op1) == CONST_DOUBLE)
15418 {
15419 /* If we are loading a floating point constant to a register,
15420 force the value to memory now, since we'll get better code
15421 out of the back end. */
15422
15423 op1 = validize_mem (force_const_mem (mode, op1));
15424 if (!register_operand (op0, mode))
15425 {
15426 rtx temp = gen_reg_rtx (mode);
15427 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15428 emit_move_insn (op0, temp);
15429 return;
15430 }
15431 }
15432 }
15433
15434 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15435 }
15436
15437 void
15438 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15439 {
15440 rtx op0 = operands[0], op1 = operands[1];
15441 unsigned int align = GET_MODE_ALIGNMENT (mode);
15442
15443 /* Force constants other than zero into memory. We do not know how
15444 the instructions used to build constants modify the upper 64 bits
15445 of the register; once we have that information we may be able
15446 to handle some of them more efficiently. */
15447 if (can_create_pseudo_p ()
15448 && register_operand (op0, mode)
15449 && (CONSTANT_P (op1)
15450 || (GET_CODE (op1) == SUBREG
15451 && CONSTANT_P (SUBREG_REG (op1))))
15452 && !standard_sse_constant_p (op1))
15453 op1 = validize_mem (force_const_mem (mode, op1));
15454
15455 /* We need to check memory alignment for SSE modes since an attribute
15456 can make operands unaligned. */
15457 if (can_create_pseudo_p ()
15458 && SSE_REG_MODE_P (mode)
15459 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15460 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15461 {
15462 rtx tmp[2];
15463
15464 /* ix86_expand_vector_move_misalign() does not like constants ... */
15465 if (CONSTANT_P (op1)
15466 || (GET_CODE (op1) == SUBREG
15467 && CONSTANT_P (SUBREG_REG (op1))))
15468 op1 = validize_mem (force_const_mem (mode, op1));
15469
15470 /* ... nor both arguments in memory. */
15471 if (!register_operand (op0, mode)
15472 && !register_operand (op1, mode))
15473 op1 = force_reg (mode, op1);
15474
15475 tmp[0] = op0; tmp[1] = op1;
15476 ix86_expand_vector_move_misalign (mode, tmp);
15477 return;
15478 }
15479
15480 /* Make operand1 a register if it isn't already. */
15481 if (can_create_pseudo_p ()
15482 && !register_operand (op0, mode)
15483 && !register_operand (op1, mode))
15484 {
15485 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15486 return;
15487 }
15488
15489 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15490 }
15491
15492 /* Split 32-byte AVX unaligned load and store if needed. */
15493
15494 static void
15495 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15496 {
15497 rtx m;
15498 rtx (*extract) (rtx, rtx, rtx);
15499 rtx (*move_unaligned) (rtx, rtx);
15500 enum machine_mode mode;
15501
15502 switch (GET_MODE (op0))
15503 {
15504 default:
15505 gcc_unreachable ();
15506 case V32QImode:
15507 extract = gen_avx_vextractf128v32qi;
15508 move_unaligned = gen_avx_movdqu256;
15509 mode = V16QImode;
15510 break;
15511 case V8SFmode:
15512 extract = gen_avx_vextractf128v8sf;
15513 move_unaligned = gen_avx_movups256;
15514 mode = V4SFmode;
15515 break;
15516 case V4DFmode:
15517 extract = gen_avx_vextractf128v4df;
15518 move_unaligned = gen_avx_movupd256;
15519 mode = V2DFmode;
15520 break;
15521 }
15522
15523 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15524 {
15525 rtx r = gen_reg_rtx (mode);
15526 m = adjust_address (op1, mode, 0);
15527 emit_move_insn (r, m);
15528 m = adjust_address (op1, mode, 16);
15529 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15530 emit_move_insn (op0, r);
15531 }
15532 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15533 {
15534 m = adjust_address (op0, mode, 0);
15535 emit_insn (extract (m, op1, const0_rtx));
15536 m = adjust_address (op0, mode, 16);
15537 emit_insn (extract (m, op1, const1_rtx));
15538 }
15539 else
15540 emit_insn (move_unaligned (op0, op1));
15541 }
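
/* Rough shape of the split (hypothetical registers, illustration only):
   under the TARGET_AVX256_SPLIT_UNALIGNED_LOAD tuning a V4DFmode
   unaligned load becomes approximately

     vmovupd     (%rax), %xmm0
     vinsertf128 $1, 16(%rax), %ymm0, %ymm0

   the store split uses vextractf128 for the two 16-byte halves, and
   otherwise a single 32-byte vmovupd/vmovups/vmovdqu is emitted. */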
15542
15543 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15544 straight to ix86_expand_vector_move. */
15545 /* Code generation for scalar reg-reg moves of single and double precision data:
15546 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15547 movaps reg, reg
15548 else
15549 movss reg, reg
15550 if (x86_sse_partial_reg_dependency == true)
15551 movapd reg, reg
15552 else
15553 movsd reg, reg
15554
15555 Code generation for scalar loads of double precision data:
15556 if (x86_sse_split_regs == true)
15557 movlpd mem, reg (gas syntax)
15558 else
15559 movsd mem, reg
15560
15561 Code generation for unaligned packed loads of single precision data
15562 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15563 if (x86_sse_unaligned_move_optimal)
15564 movups mem, reg
15565
15566 if (x86_sse_partial_reg_dependency == true)
15567 {
15568 xorps reg, reg
15569 movlps mem, reg
15570 movhps mem+8, reg
15571 }
15572 else
15573 {
15574 movlps mem, reg
15575 movhps mem+8, reg
15576 }
15577
15578 Code generation for unaligned packed loads of double precision data
15579 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15580 if (x86_sse_unaligned_move_optimal)
15581 movupd mem, reg
15582
15583 if (x86_sse_split_regs == true)
15584 {
15585 movlpd mem, reg
15586 movhpd mem+8, reg
15587 }
15588 else
15589 {
15590 movsd mem, reg
15591 movhpd mem+8, reg
15592 }
15593 */
15594
15595 void
15596 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15597 {
15598 rtx op0, op1, m;
15599
15600 op0 = operands[0];
15601 op1 = operands[1];
15602
15603 if (TARGET_AVX)
15604 {
15605 switch (GET_MODE_CLASS (mode))
15606 {
15607 case MODE_VECTOR_INT:
15608 case MODE_INT:
15609 switch (GET_MODE_SIZE (mode))
15610 {
15611 case 16:
15612 /* If we're optimizing for size, movups is the smallest. */
15613 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15614 {
15615 op0 = gen_lowpart (V4SFmode, op0);
15616 op1 = gen_lowpart (V4SFmode, op1);
15617 emit_insn (gen_sse_movups (op0, op1));
15618 return;
15619 }
15620 op0 = gen_lowpart (V16QImode, op0);
15621 op1 = gen_lowpart (V16QImode, op1);
15622 emit_insn (gen_sse2_movdqu (op0, op1));
15623 break;
15624 case 32:
15625 op0 = gen_lowpart (V32QImode, op0);
15626 op1 = gen_lowpart (V32QImode, op1);
15627 ix86_avx256_split_vector_move_misalign (op0, op1);
15628 break;
15629 default:
15630 gcc_unreachable ();
15631 }
15632 break;
15633 case MODE_VECTOR_FLOAT:
15634 op0 = gen_lowpart (mode, op0);
15635 op1 = gen_lowpart (mode, op1);
15636
15637 switch (mode)
15638 {
15639 case V4SFmode:
15640 emit_insn (gen_sse_movups (op0, op1));
15641 break;
15642 case V8SFmode:
15643 ix86_avx256_split_vector_move_misalign (op0, op1);
15644 break;
15645 case V2DFmode:
15646 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15647 {
15648 op0 = gen_lowpart (V4SFmode, op0);
15649 op1 = gen_lowpart (V4SFmode, op1);
15650 emit_insn (gen_sse_movups (op0, op1));
15651 return;
15652 }
15653 emit_insn (gen_sse2_movupd (op0, op1));
15654 break;
15655 case V4DFmode:
15656 ix86_avx256_split_vector_move_misalign (op0, op1);
15657 break;
15658 default:
15659 gcc_unreachable ();
15660 }
15661 break;
15662
15663 default:
15664 gcc_unreachable ();
15665 }
15666
15667 return;
15668 }
15669
15670 if (MEM_P (op1))
15671 {
15672 /* If we're optimizing for size, movups is the smallest. */
15673 if (optimize_insn_for_size_p ()
15674 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15675 {
15676 op0 = gen_lowpart (V4SFmode, op0);
15677 op1 = gen_lowpart (V4SFmode, op1);
15678 emit_insn (gen_sse_movups (op0, op1));
15679 return;
15680 }
15681
15682 /* ??? If we have typed data, then it would appear that using
15683 movdqu is the only way to get unaligned data loaded with
15684 integer type. */
15685 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15686 {
15687 op0 = gen_lowpart (V16QImode, op0);
15688 op1 = gen_lowpart (V16QImode, op1);
15689 emit_insn (gen_sse2_movdqu (op0, op1));
15690 return;
15691 }
15692
15693 if (TARGET_SSE2 && mode == V2DFmode)
15694 {
15695 rtx zero;
15696
15697 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15698 {
15699 op0 = gen_lowpart (V2DFmode, op0);
15700 op1 = gen_lowpart (V2DFmode, op1);
15701 emit_insn (gen_sse2_movupd (op0, op1));
15702 return;
15703 }
15704
15705 /* When SSE registers are split into halves, we can avoid
15706 writing to the top half twice. */
15707 if (TARGET_SSE_SPLIT_REGS)
15708 {
15709 emit_clobber (op0);
15710 zero = op0;
15711 }
15712 else
15713 {
15714 /* ??? Not sure about the best option for the Intel chips.
15715 The following would seem to satisfy; the register is
15716 entirely cleared, breaking the dependency chain. We
15717 then store to the upper half, with a dependency depth
15718 of one. A rumor has it that Intel recommends two movsd
15719 followed by an unpacklpd, but this is unconfirmed. And
15720 given that the dependency depth of the unpacklpd would
15721 still be one, I'm not sure why this would be better. */
15722 zero = CONST0_RTX (V2DFmode);
15723 }
15724
15725 m = adjust_address (op1, DFmode, 0);
15726 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15727 m = adjust_address (op1, DFmode, 8);
15728 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15729 }
15730 else
15731 {
15732 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15733 {
15734 op0 = gen_lowpart (V4SFmode, op0);
15735 op1 = gen_lowpart (V4SFmode, op1);
15736 emit_insn (gen_sse_movups (op0, op1));
15737 return;
15738 }
15739
15740 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15741 emit_move_insn (op0, CONST0_RTX (mode));
15742 else
15743 emit_clobber (op0);
15744
15745 if (mode != V4SFmode)
15746 op0 = gen_lowpart (V4SFmode, op0);
15747 m = adjust_address (op1, V2SFmode, 0);
15748 emit_insn (gen_sse_loadlps (op0, op0, m));
15749 m = adjust_address (op1, V2SFmode, 8);
15750 emit_insn (gen_sse_loadhps (op0, op0, m));
15751 }
15752 }
15753 else if (MEM_P (op0))
15754 {
15755 /* If we're optimizing for size, movups is the smallest. */
15756 if (optimize_insn_for_size_p ()
15757 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15758 {
15759 op0 = gen_lowpart (V4SFmode, op0);
15760 op1 = gen_lowpart (V4SFmode, op1);
15761 emit_insn (gen_sse_movups (op0, op1));
15762 return;
15763 }
15764
15765 /* ??? Similar to above, only less clear because of quote
15766 typeless stores unquote. */
15767 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15768 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15769 {
15770 op0 = gen_lowpart (V16QImode, op0);
15771 op1 = gen_lowpart (V16QImode, op1);
15772 emit_insn (gen_sse2_movdqu (op0, op1));
15773 return;
15774 }
15775
15776 if (TARGET_SSE2 && mode == V2DFmode)
15777 {
15778 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15779 {
15780 op0 = gen_lowpart (V2DFmode, op0);
15781 op1 = gen_lowpart (V2DFmode, op1);
15782 emit_insn (gen_sse2_movupd (op0, op1));
15783 }
15784 else
15785 {
15786 m = adjust_address (op0, DFmode, 0);
15787 emit_insn (gen_sse2_storelpd (m, op1));
15788 m = adjust_address (op0, DFmode, 8);
15789 emit_insn (gen_sse2_storehpd (m, op1));
15790 }
15791 }
15792 else
15793 {
15794 if (mode != V4SFmode)
15795 op1 = gen_lowpart (V4SFmode, op1);
15796
15797 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15798 {
15799 op0 = gen_lowpart (V4SFmode, op0);
15800 emit_insn (gen_sse_movups (op0, op1));
15801 }
15802 else
15803 {
15804 m = adjust_address (op0, V2SFmode, 0);
15805 emit_insn (gen_sse_storelps (m, op1));
15806 m = adjust_address (op0, V2SFmode, 8);
15807 emit_insn (gen_sse_storehps (m, op1));
15808 }
15809 }
15810 }
15811 else
15812 gcc_unreachable ();
15813 }
15814
15815 /* Expand a push in MODE. This is some mode for which we do not support
15816 proper push instructions, at least from the registers that we expect
15817 the value to live in. */
15818
15819 void
15820 ix86_expand_push (enum machine_mode mode, rtx x)
15821 {
15822 rtx tmp;
15823
15824 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15825 GEN_INT (-GET_MODE_SIZE (mode)),
15826 stack_pointer_rtx, 1, OPTAB_DIRECT);
15827 if (tmp != stack_pointer_rtx)
15828 emit_move_insn (stack_pointer_rtx, tmp);
15829
15830 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15831
15832 /* When we push an operand onto the stack, it has to be aligned at least
15833 at the function argument boundary. However, since we don't have
15834 the argument type, we can't determine the actual argument
15835 boundary. */
15836 emit_move_insn (tmp, x);
15837 }
15838
15839 /* Helper function of ix86_fixup_binary_operands to canonicalize
15840 operand order. Returns true if the operands should be swapped. */
15841
15842 static bool
15843 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15844 rtx operands[])
15845 {
15846 rtx dst = operands[0];
15847 rtx src1 = operands[1];
15848 rtx src2 = operands[2];
15849
15850 /* If the operation is not commutative, we can't do anything. */
15851 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15852 return false;
15853
15854 /* Highest priority is that src1 should match dst. */
15855 if (rtx_equal_p (dst, src1))
15856 return false;
15857 if (rtx_equal_p (dst, src2))
15858 return true;
15859
15860 /* Next highest priority is that immediate constants come second. */
15861 if (immediate_operand (src2, mode))
15862 return false;
15863 if (immediate_operand (src1, mode))
15864 return true;
15865
15866 /* Lowest priority is that memory references should come second. */
15867 if (MEM_P (src2))
15868 return false;
15869 if (MEM_P (src1))
15870 return true;
15871
15872 return false;
15873 }
15874
15875
15876 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15877 destination to use for the operation. If different from the true
15878 destination in operands[0], a copy operation will be required. */
15879
15880 rtx
15881 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15882 rtx operands[])
15883 {
15884 rtx dst = operands[0];
15885 rtx src1 = operands[1];
15886 rtx src2 = operands[2];
15887
15888 /* Canonicalize operand order. */
15889 if (ix86_swap_binary_operands_p (code, mode, operands))
15890 {
15891 rtx temp;
15892
15893 /* It is invalid to swap operands of different modes. */
15894 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15895
15896 temp = src1;
15897 src1 = src2;
15898 src2 = temp;
15899 }
15900
15901 /* Both source operands cannot be in memory. */
15902 if (MEM_P (src1) && MEM_P (src2))
15903 {
15904 /* Optimization: Only read from memory once. */
15905 if (rtx_equal_p (src1, src2))
15906 {
15907 src2 = force_reg (mode, src2);
15908 src1 = src2;
15909 }
15910 else
15911 src2 = force_reg (mode, src2);
15912 }
15913
15914 /* If the destination is memory, and we do not have matching source
15915 operands, do things in registers. */
15916 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15917 dst = gen_reg_rtx (mode);
15918
15919 /* Source 1 cannot be a constant. */
15920 if (CONSTANT_P (src1))
15921 src1 = force_reg (mode, src1);
15922
15923 /* Source 1 cannot be a non-matching memory. */
15924 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15925 src1 = force_reg (mode, src1);
15926
15927 /* Improve address combine. */
15928 if (code == PLUS
15929 && GET_MODE_CLASS (mode) == MODE_INT
15930 && MEM_P (src2))
15931 src2 = force_reg (mode, src2);
15932
15933 operands[1] = src1;
15934 operands[2] = src2;
15935 return dst;
15936 }
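
/* For illustration (A and B are hypothetical memory operands): expanding
   (plus:SI (mem A) (mem B)) forces the second memory operand into a
   register so the result fits x86's single-memory-operand ALU forms,
   while (plus:SI (mem A) (mem A)) is loaded once and the register reused
   for both sources; a memory destination that does not match src1 is
   computed in a fresh register and copied back by the caller. */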
15937
15938 /* Similarly, but assume that the destination has already been
15939 set up properly. */
15940
15941 void
15942 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15943 enum machine_mode mode, rtx operands[])
15944 {
15945 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15946 gcc_assert (dst == operands[0]);
15947 }
15948
15949 /* Attempt to expand a binary operator. Make the expansion closer to the
15950 actual machine than just general_operand, which would allow 3 separate
15951 memory references (one output, two inputs) in a single insn. */
15952
15953 void
15954 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15955 rtx operands[])
15956 {
15957 rtx src1, src2, dst, op, clob;
15958
15959 dst = ix86_fixup_binary_operands (code, mode, operands);
15960 src1 = operands[1];
15961 src2 = operands[2];
15962
15963 /* Emit the instruction. */
15964
15965 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
15966 if (reload_in_progress)
15967 {
15968 /* Reload doesn't know about the flags register, and doesn't know that
15969 it doesn't want to clobber it. We can only do this with PLUS. */
15970 gcc_assert (code == PLUS);
15971 emit_insn (op);
15972 }
15973 else if (reload_completed
15974 && code == PLUS
15975 && !rtx_equal_p (dst, src1))
15976 {
15977 /* This is going to be an LEA; avoid splitting it later. */
15978 emit_insn (op);
15979 }
15980 else
15981 {
15982 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15983 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15984 }
15985
15986 /* Fix up the destination if needed. */
15987 if (dst != operands[0])
15988 emit_move_insn (operands[0], dst);
15989 }
15990
15991 /* Return TRUE or FALSE depending on whether the binary operator meets the
15992 appropriate constraints. */
15993
15994 bool
15995 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
15996 rtx operands[3])
15997 {
15998 rtx dst = operands[0];
15999 rtx src1 = operands[1];
16000 rtx src2 = operands[2];
16001
16002 /* Both source operands cannot be in memory. */
16003 if (MEM_P (src1) && MEM_P (src2))
16004 return false;
16005
16006 /* Canonicalize operand order for commutative operators. */
16007 if (ix86_swap_binary_operands_p (code, mode, operands))
16008 {
16009 rtx temp = src1;
16010 src1 = src2;
16011 src2 = temp;
16012 }
16013
16014 /* If the destination is memory, we must have a matching source operand. */
16015 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16016 return false;
16017
16018 /* Source 1 cannot be a constant. */
16019 if (CONSTANT_P (src1))
16020 return false;
16021
16022 /* Source 1 cannot be a non-matching memory. */
16023 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16024 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16025 return (code == AND
16026 && (mode == HImode
16027 || mode == SImode
16028 || (TARGET_64BIT && mode == DImode))
16029 && satisfies_constraint_L (src2));
16030
16031 return true;
16032 }
16033
16034 /* Attempt to expand a unary operator. Make the expansion closer to the
16035 actual machine than just general_operand, which would allow 2 separate
16036 memory references (one output, one input) in a single insn. */
16037
16038 void
16039 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16040 rtx operands[])
16041 {
16042 int matching_memory;
16043 rtx src, dst, op, clob;
16044
16045 dst = operands[0];
16046 src = operands[1];
16047
16048 /* If the destination is memory, and we do not have matching source
16049 operands, do things in registers. */
16050 matching_memory = 0;
16051 if (MEM_P (dst))
16052 {
16053 if (rtx_equal_p (dst, src))
16054 matching_memory = 1;
16055 else
16056 dst = gen_reg_rtx (mode);
16057 }
16058
16059 /* When source operand is memory, destination must match. */
16060 if (MEM_P (src) && !matching_memory)
16061 src = force_reg (mode, src);
16062
16063 /* Emit the instruction. */
16064
16065 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16066 if (reload_in_progress || code == NOT)
16067 {
16068 /* Reload doesn't know about the flags register, and doesn't know that
16069 it doesn't want to clobber it. */
16070 gcc_assert (code == NOT);
16071 emit_insn (op);
16072 }
16073 else
16074 {
16075 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16076 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16077 }
16078
16079 /* Fix up the destination if needed. */
16080 if (dst != operands[0])
16081 emit_move_insn (operands[0], dst);
16082 }
16083
16084 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16085 divisor are within the range [0-255]. */
16086
16087 void
16088 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16089 bool signed_p)
16090 {
16091 rtx end_label, qimode_label;
16092 rtx insn, div, mod;
16093 rtx scratch, tmp0, tmp1, tmp2;
16094 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16095 rtx (*gen_zero_extend) (rtx, rtx);
16096 rtx (*gen_test_ccno_1) (rtx, rtx);
16097
16098 switch (mode)
16099 {
16100 case SImode:
16101 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16102 gen_test_ccno_1 = gen_testsi_ccno_1;
16103 gen_zero_extend = gen_zero_extendqisi2;
16104 break;
16105 case DImode:
16106 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16107 gen_test_ccno_1 = gen_testdi_ccno_1;
16108 gen_zero_extend = gen_zero_extendqidi2;
16109 break;
16110 default:
16111 gcc_unreachable ();
16112 }
16113
16114 end_label = gen_label_rtx ();
16115 qimode_label = gen_label_rtx ();
16116
16117 scratch = gen_reg_rtx (mode);
16118
16119 /* Use 8bit unsigned divmod if the dividend and divisor are within
16120 the range [0-255]. */
16121 emit_move_insn (scratch, operands[2]);
16122 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16123 scratch, 1, OPTAB_DIRECT);
16124 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16125 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16126 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16127 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16128 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16129 pc_rtx);
16130 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16131 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16132 JUMP_LABEL (insn) = qimode_label;
16133
16134 /* Generate the original signed/unsigned divmod. */
16135 div = gen_divmod4_1 (operands[0], operands[1],
16136 operands[2], operands[3]);
16137 emit_insn (div);
16138
16139 /* Branch to the end. */
16140 emit_jump_insn (gen_jump (end_label));
16141 emit_barrier ();
16142
16143 /* Generate 8bit unsigned divide. */
16144 emit_label (qimode_label);
16145 /* Don't use operands[0] for result of 8bit divide since not all
16146 registers support QImode ZERO_EXTRACT. */
16147 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16148 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16149 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16150 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16151
16152 if (signed_p)
16153 {
16154 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16155 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16156 }
16157 else
16158 {
16159 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16160 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16161 }
16162
16163 /* Extract remainder from AH. */
16164 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16165 if (REG_P (operands[1]))
16166 insn = emit_move_insn (operands[1], tmp1);
16167 else
16168 {
16169 /* Need a new scratch register since the old one has result
16170 of 8bit divide. */
16171 scratch = gen_reg_rtx (mode);
16172 emit_move_insn (scratch, tmp1);
16173 insn = emit_move_insn (operands[1], scratch);
16174 }
16175 set_unique_reg_note (insn, REG_EQUAL, mod);
16176
16177 /* Zero extend quotient from AL. */
16178 tmp1 = gen_lowpart (QImode, tmp0);
16179 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16180 set_unique_reg_note (insn, REG_EQUAL, div);
16181
16182 emit_label (end_label);
16183 }
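
/* Rough shape of the sequence emitted above (schematic AT&T output,
   register names omitted):

     mov     dividend, scratch
     or      divisor, scratch
     test    $-0x100, scratch
     je      .Lqimode
     <full-width div/idiv>
     jmp     .Ldone
   .Lqimode:
     divb    divisor          # quotient in %al, remainder in %ah
   .Ldone:

   i.e. one extra test-and-branch buys the much cheaper 8-bit divide
   whenever both operands fit in [0-255]. */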
16184
16185 #define LEA_MAX_STALL (3)
16186 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16187
16188 /* Increase the given DISTANCE in half-cycles according to
16189 dependencies between PREV and NEXT instructions.
16190 Add 1 half-cycle if there is no dependency and
16191 go to the next cycle if there is some dependency. */
16192
16193 static unsigned int
16194 increase_distance (rtx prev, rtx next, unsigned int distance)
16195 {
16196 df_ref *use_rec;
16197 df_ref *def_rec;
16198
16199 if (!prev || !next)
16200 return distance + (distance & 1) + 2;
16201
16202 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16203 return distance + 1;
16204
16205 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16206 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16207 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16208 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16209 return distance + (distance & 1) + 2;
16210
16211 return distance + 1;
16212 }
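/* Worked example of the accounting above (distances are in half-cycles):
   with DISTANCE == 3 and no register dependency between PREV and NEXT the
   result is 4; with a dependency the distance is first rounded up to a full
   cycle and a whole extra cycle is added, giving 3 + 1 + 2 == 6.  */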
16213
16214 /* Check whether instruction INSN defines register number
16215 REGNO1 or REGNO2. */
16216
16217 static bool
16218 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16219 rtx insn)
16220 {
16221 df_ref *def_rec;
16222
16223 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16224 if (DF_REF_REG_DEF_P (*def_rec)
16225 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16226 && (regno1 == DF_REF_REGNO (*def_rec)
16227 || regno2 == DF_REF_REGNO (*def_rec)))
16228 {
16229 return true;
16230 }
16231
16232 return false;
16233 }
16234
16235 /* Check whether instruction INSN uses register number
16236 REGNO as part of an address expression. */
16237
16238 static bool
16239 insn_uses_reg_mem (unsigned int regno, rtx insn)
16240 {
16241 df_ref *use_rec;
16242
16243 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16244 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16245 return true;
16246
16247 return false;
16248 }
16249
16250 /* Search backward for a non-AGU definition of register number REGNO1
16251 or register number REGNO2 in the basic block, starting from instruction
16252 START up to the head of the basic block or instruction INSN.
16253
16254 Set *FOUND to true if a definition was found and to false
16255 otherwise.
16256
16257 The distance in half-cycles between START and the found instruction or
16258 the head of the BB is added to DISTANCE and returned. */
16259
16260 static int
16261 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16262 rtx insn, int distance,
16263 rtx start, bool *found)
16264 {
16265 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16266 rtx prev = start;
16267 rtx next = NULL;
16268 enum attr_type insn_type;
16269
16270 *found = false;
16271
16272 while (prev
16273 && prev != insn
16274 && distance < LEA_SEARCH_THRESHOLD)
16275 {
16276 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16277 {
16278 distance = increase_distance (prev, next, distance);
16279 if (insn_defines_reg (regno1, regno2, prev))
16280 {
16281 insn_type = get_attr_type (prev);
16282 if (insn_type != TYPE_LEA)
16283 {
16284 *found = true;
16285 return distance;
16286 }
16287 }
16288
16289 next = prev;
16290 }
16291 if (prev == BB_HEAD (bb))
16292 break;
16293
16294 prev = PREV_INSN (prev);
16295 }
16296
16297 return distance;
16298 }
16299
16300 /* Search backward for a non-AGU definition of register number REGNO1
16301 or register number REGNO2 in INSN's basic block until we
16302 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16303 2. Reach a neighbouring BB's boundary, or
16304 3. Reach an AGU definition.
16305 Return the distance between the non-AGU definition point and INSN.
16306 If there is no definition point, return -1. */
16307
16308 static int
16309 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16310 rtx insn)
16311 {
16312 basic_block bb = BLOCK_FOR_INSN (insn);
16313 int distance = 0;
16314 bool found = false;
16315
16316 if (insn != BB_HEAD (bb))
16317 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16318 distance, PREV_INSN (insn),
16319 &found);
16320
16321 if (!found && distance < LEA_SEARCH_THRESHOLD)
16322 {
16323 edge e;
16324 edge_iterator ei;
16325 bool simple_loop = false;
16326
16327 FOR_EACH_EDGE (e, ei, bb->preds)
16328 if (e->src == bb)
16329 {
16330 simple_loop = true;
16331 break;
16332 }
16333
16334 if (simple_loop)
16335 distance = distance_non_agu_define_in_bb (regno1, regno2,
16336 insn, distance,
16337 BB_END (bb), &found);
16338 else
16339 {
16340 int shortest_dist = -1;
16341 bool found_in_bb = false;
16342
16343 FOR_EACH_EDGE (e, ei, bb->preds)
16344 {
16345 int bb_dist
16346 = distance_non_agu_define_in_bb (regno1, regno2,
16347 insn, distance,
16348 BB_END (e->src),
16349 &found_in_bb);
16350 if (found_in_bb)
16351 {
16352 if (shortest_dist < 0)
16353 shortest_dist = bb_dist;
16354 else if (bb_dist > 0)
16355 shortest_dist = MIN (bb_dist, shortest_dist);
16356
16357 found = true;
16358 }
16359 }
16360
16361 distance = shortest_dist;
16362 }
16363 }
16364
16365 /* get_attr_type may modify recog data. Make sure that recog data
16366 is valid for instruction INSN, the one distance_non_agu_define
16367 was called on. INSN itself is unchanged here. */
16368 extract_insn_cached (insn);
16369
16370 if (!found)
16371 return -1;
16372
16373 return distance >> 1;
16374 }
16375
16376 /* Return the distance in half-cycles, added to DISTANCE, between INSN
16377 and the next insn that uses register number REGNO in a memory
16378 address. Return -1 if REGNO is set.
16379
16380 Set *FOUND to true if a register use was found and to false
16381 otherwise.
16382 Set *REDEFINED to true if a register redefinition was found and to
16383 false otherwise. */
16384
16385 static int
16386 distance_agu_use_in_bb (unsigned int regno,
16387 rtx insn, int distance, rtx start,
16388 bool *found, bool *redefined)
16389 {
16390 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16391 rtx next = start;
16392 rtx prev = NULL;
16393
16394 *found = false;
16395 *redefined = false;
16396
16397 while (next
16398 && next != insn
16399 && distance < LEA_SEARCH_THRESHOLD)
16400 {
16401 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16402 {
16403 distance = increase_distance(prev, next, distance);
16404 if (insn_uses_reg_mem (regno, next))
16405 {
16406 /* Return DISTANCE if REGNO is used in a memory
16407 address in NEXT. */
16408 *found = true;
16409 return distance;
16410 }
16411
16412 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16413 {
16414 /* Return -1 if REGNO is set in NEXT. */
16415 *redefined = true;
16416 return -1;
16417 }
16418
16419 prev = next;
16420 }
16421
16422 if (next == BB_END (bb))
16423 break;
16424
16425 next = NEXT_INSN (next);
16426 }
16427
16428 return distance;
16429 }
16430
16431 /* Return the distance between INSN and the next insn that uses
16432 register number REGNO0 in a memory address. Return -1 if no such
16433 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
16434
16435 static int
16436 distance_agu_use (unsigned int regno0, rtx insn)
16437 {
16438 basic_block bb = BLOCK_FOR_INSN (insn);
16439 int distance = 0;
16440 bool found = false;
16441 bool redefined = false;
16442
16443 if (insn != BB_END (bb))
16444 distance = distance_agu_use_in_bb (regno0, insn, distance,
16445 NEXT_INSN (insn),
16446 &found, &redefined);
16447
16448 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16449 {
16450 edge e;
16451 edge_iterator ei;
16452 bool simple_loop = false;
16453
16454 FOR_EACH_EDGE (e, ei, bb->succs)
16455 if (e->dest == bb)
16456 {
16457 simple_loop = true;
16458 break;
16459 }
16460
16461 if (simple_loop)
16462 distance = distance_agu_use_in_bb (regno0, insn,
16463 distance, BB_HEAD (bb),
16464 &found, &redefined);
16465 else
16466 {
16467 int shortest_dist = -1;
16468 bool found_in_bb = false;
16469 bool redefined_in_bb = false;
16470
16471 FOR_EACH_EDGE (e, ei, bb->succs)
16472 {
16473 int bb_dist
16474 = distance_agu_use_in_bb (regno0, insn,
16475 distance, BB_HEAD (e->dest),
16476 &found_in_bb, &redefined_in_bb);
16477 if (found_in_bb)
16478 {
16479 if (shortest_dist < 0)
16480 shortest_dist = bb_dist;
16481 else if (bb_dist > 0)
16482 shortest_dist = MIN (bb_dist, shortest_dist);
16483
16484 found = true;
16485 }
16486 }
16487
16488 distance = shortest_dist;
16489 }
16490 }
16491
16492 if (!found || redefined)
16493 return -1;
16494
16495 return distance >> 1;
16496 }
16497
16498 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16499 there is a dilemma of choosing LEA or ADD.
16500 Negative value: ADD is preferred over LEA
16501 Zero: Neutral
16502 Positive value: LEA is preferred over ADD. */
16503 #define IX86_LEA_PRIORITY 0
16504
16505 /* Return true if using the lea INSN has a performance advantage
16506 over an equivalent sequence of instructions. The instruction
16507 sequence has SPLIT_COST cycles higher latency than the lea. */
16508
16509 bool
16510 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16511 unsigned int regno2, unsigned int split_cost)
16512 {
16513 int dist_define, dist_use;
16514
16515 dist_define = distance_non_agu_define (regno1, regno2, insn);
16516 dist_use = distance_agu_use (regno0, insn);
16517
16518 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16519 {
16520 /* If there is no non-AGU operand definition, no AGU
16521 operand use, and the split cost is 0, then both the lea
16522 and the non-lea variants have the same priority. Currently
16523 we prefer lea for 64-bit code and non-lea for 32-bit
16524 code. */
16525 if (dist_use < 0 && split_cost == 0)
16526 return TARGET_64BIT || IX86_LEA_PRIORITY;
16527 else
16528 return true;
16529 }
16530
16531 /* With a longer definition distance, lea is preferable.
16532 Adjust the distance to take the splitting cost and
16533 lea priority into account. */
16534 dist_define += split_cost + IX86_LEA_PRIORITY;
16535
16536 /* If there is no use in a memory address then we just check
16537 that the split cost does not exceed the AGU stall. */
16538 if (dist_use < 0)
16539 return dist_define >= LEA_MAX_STALL;
16540
16541 /* If this insn has both a backward non-AGU dependence and a forward
16542 AGU dependence, the one with the shorter distance takes effect. */
16543 return dist_define >= dist_use;
16544 }
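/* Worked example of the heuristic above (values are whole cycles after the
   >> 1 in the distance helpers): with dist_define == 1, split_cost == 1 and
   IX86_LEA_PRIORITY == 0 the adjusted define distance is 2; if the lea
   result feeds an address within dist_use == 3 cycles, 2 < 3 and the
   function returns false, i.e. splitting the lea into ALU instructions is
   considered the better choice.  */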
16545
16546 /* Return true if it is legal to clobber flags by INSN and
16547 false otherwise. */
16548
16549 static bool
16550 ix86_ok_to_clobber_flags (rtx insn)
16551 {
16552 basic_block bb = BLOCK_FOR_INSN (insn);
16553 df_ref *use;
16554 bitmap live;
16555
16556 while (insn)
16557 {
16558 if (NONDEBUG_INSN_P (insn))
16559 {
16560 for (use = DF_INSN_USES (insn); *use; use++)
16561 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16562 return false;
16563
16564 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16565 return true;
16566 }
16567
16568 if (insn == BB_END (bb))
16569 break;
16570
16571 insn = NEXT_INSN (insn);
16572 }
16573
16574 live = df_get_live_out(bb);
16575 return !REGNO_REG_SET_P (live, FLAGS_REG);
16576 }
16577
16578 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16579 move and add to avoid AGU stalls. */
16580
16581 bool
16582 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16583 {
16584 unsigned int regno0 = true_regnum (operands[0]);
16585 unsigned int regno1 = true_regnum (operands[1]);
16586 unsigned int regno2 = true_regnum (operands[2]);
16587
16588 /* Check if we need to optimize. */
16589 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16590 return false;
16591
16592 /* Check it is correct to split here. */
16593 if (!ix86_ok_to_clobber_flags(insn))
16594 return false;
16595
16596 /* We only need to split adds with a non-destructive
16597 destination operand. */
16598 if (regno0 == regno1 || regno0 == regno2)
16599 return false;
16600 else
16601 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16602 }
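/* Example of the transformation this enables: on AGU-stall-sensitive
   tunings (TARGET_OPT_AGU, e.g. Atom) an add written as

       lea    (%rbx,%rcx), %rax

   is split into

       mov    %rbx, %rax
       add    %rcx, %rax

   whenever ix86_lea_outperforms decides the lea would stall; the extra
   move is the split_cost of 1 passed above.  */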
16603
16604 /* Return true if we should emit a lea instruction instead of a mov
16605 instruction. */
16606
16607 bool
16608 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16609 {
16610 unsigned int regno0;
16611 unsigned int regno1;
16612
16613 /* Check if we need to optimize. */
16614 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16615 return false;
16616
16617 /* Use lea for reg to reg moves only. */
16618 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16619 return false;
16620
16621 regno0 = true_regnum (operands[0]);
16622 regno1 = true_regnum (operands[1]);
16623
16624 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16625 }
16626
16627 /* Return true if we need to split lea into a sequence of
16628 instructions to avoid AGU stalls. */
16629
16630 bool
16631 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16632 {
16633 unsigned int regno0 = true_regnum (operands[0]) ;
16634 unsigned int regno1 = -1;
16635 unsigned int regno2 = -1;
16636 unsigned int split_cost = 0;
16637 struct ix86_address parts;
16638 int ok;
16639
16640 /* Check we need to optimize. */
16641 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16642 return false;
16643
16644 /* Check it is correct to split here. */
16645 if (!ix86_ok_to_clobber_flags(insn))
16646 return false;
16647
16648 ok = ix86_decompose_address (operands[1], &parts);
16649 gcc_assert (ok);
16650
16651 /* We should not split into an add if a non-legitimate PIC
16652 operand is used as the displacement. */
16653 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16654 return false;
16655
16656 if (parts.base)
16657 regno1 = true_regnum (parts.base);
16658 if (parts.index)
16659 regno2 = true_regnum (parts.index);
16660
16661 /* Compute how many cycles we will add to the execution time
16662 if we split the lea into a sequence of instructions. */
16663 if (parts.base || parts.index)
16664 {
16665 /* Have to use a mov instruction if the non-destructive
16666 destination form is used. */
16667 if (regno1 != regno0 && regno2 != regno0)
16668 split_cost += 1;
16669
16670 /* Have to add index to base if both exist. */
16671 if (parts.base && parts.index)
16672 split_cost += 1;
16673
16674 /* Have to use shift and adds if scale is 2 or greater. */
16675 if (parts.scale > 1)
16676 {
16677 if (regno0 != regno1)
16678 split_cost += 1;
16679 else if (regno2 == regno0)
16680 split_cost += 4;
16681 else
16682 split_cost += parts.scale;
16683 }
16684
16685 /* Have to use an add instruction with an immediate if
16686 disp is non-zero. */
16687 if (parts.disp && parts.disp != const0_rtx)
16688 split_cost += 1;
16689
16690 /* Subtract the price of lea. */
16691 split_cost -= 1;
16692 }
16693
16694 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16695 }
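/* Example of the split-cost estimate above for

       lea    0x8(%rbx,%rcx,2), %rax

   with the destination distinct from both base and index: one mov for the
   non-destructive destination, one add of the base, one more instruction
   for the scale, one add of the displacement, minus one for the lea itself;
   a split_cost of roughly 3 that is then weighed by ix86_lea_outperforms.  */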
16696
16697 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
16698 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16699
16700 static void
16701 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16702 rtx dst, rtx src)
16703 {
16704 rtx op, clob;
16705
16706 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16707 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16708
16709 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16710 }
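/* The helper above emits a single insn of the shape

       (parallel [(set dst (code:MODE dst src))
                  (clobber (reg:CC FLAGS_REG))])

   which matches the regular two-address ALU patterns, e.g. PLUS becomes an
   ordinary add that clobbers the flags.  */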
16711
16712 /* Split a lea instruction into a sequence of instructions
16713 which are executed on the ALU to avoid AGU stalls.
16714 It is assumed that the flags register may be clobbered
16715 at the lea position. */
16716
16717 extern void
16718 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16719 {
16720 unsigned int regno0 = true_regnum (operands[0]) ;
16721 unsigned int regno1 = INVALID_REGNUM;
16722 unsigned int regno2 = INVALID_REGNUM;
16723 struct ix86_address parts;
16724 rtx tmp;
16725 int ok, adds;
16726
16727 ok = ix86_decompose_address (operands[1], &parts);
16728 gcc_assert (ok);
16729
16730 if (parts.base)
16731 {
16732 if (GET_MODE (parts.base) != mode)
16733 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16734 regno1 = true_regnum (parts.base);
16735 }
16736
16737 if (parts.index)
16738 {
16739 if (GET_MODE (parts.index) != mode)
16740 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16741 regno2 = true_regnum (parts.index);
16742 }
16743
16744 if (parts.scale > 1)
16745 {
16746 /* Case r1 = r1 + ... */
16747 if (regno1 == regno0)
16748 {
16749 /* The case r1 = r1 + C * r1 would require a
16750 multiplication, which is very expensive.
16751 Assume the cost model is wrong if such a case
16752 reaches this point. */
16753 gcc_assert (regno2 != regno0);
16754
16755 for (adds = parts.scale; adds > 0; adds--)
16756 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16757 }
16758 else
16759 {
16760 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16761 if (regno0 != regno2)
16762 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16763
16764 /* Use shift for scaling. */
16765 ix86_emit_binop (ASHIFT, mode, operands[0],
16766 GEN_INT (exact_log2 (parts.scale)));
16767
16768 if (parts.base)
16769 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16770
16771 if (parts.disp && parts.disp != const0_rtx)
16772 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16773 }
16774 }
16775 else if (!parts.base && !parts.index)
16776 {
16777 gcc_assert(parts.disp);
16778 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16779 }
16780 else
16781 {
16782 if (!parts.base)
16783 {
16784 if (regno0 != regno2)
16785 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16786 }
16787 else if (!parts.index)
16788 {
16789 if (regno0 != regno1)
16790 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16791 }
16792 else
16793 {
16794 if (regno0 == regno1)
16795 tmp = parts.index;
16796 else if (regno0 == regno2)
16797 tmp = parts.base;
16798 else
16799 {
16800 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16801 tmp = parts.index;
16802 }
16803
16804 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16805 }
16806
16807 if (parts.disp && parts.disp != const0_rtx)
16808 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16809 }
16810 }
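/* Sketch of the sequence produced above for a full address with the
   destination distinct from base and index, e.g.

       lea    0x4(%rbx,%rcx,4), %rax

   becomes roughly

       mov    %rcx, %rax          # copy index
       shl    $2, %rax            # scale by shift
       add    %rbx, %rax          # add base
       add    $0x4, %rax          # add displacement

   while the destructive form r1 = r1 + r3 * C is instead expanded into C
   repeated adds of the index.  */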
16811
16812 /* Return true if it is ok to optimize an ADD operation to a LEA
16813 operation to avoid flag register consumption. For most processors,
16814 ADD is faster than LEA. For processors like Atom, if the
16815 destination register of the LEA holds an actual address which will be
16816 used soon, LEA is better; otherwise ADD is better. */
16817
16818 bool
16819 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16820 {
16821 unsigned int regno0 = true_regnum (operands[0]);
16822 unsigned int regno1 = true_regnum (operands[1]);
16823 unsigned int regno2 = true_regnum (operands[2]);
16824
16825 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16826 if (regno0 != regno1 && regno0 != regno2)
16827 return true;
16828
16829 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16830 return false;
16831
16832 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16833 }
16834
16835 /* Return true if the destination reg of SET_BODY is the shift count of
16836 USE_BODY. */
16837
16838 static bool
16839 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16840 {
16841 rtx set_dest;
16842 rtx shift_rtx;
16843 int i;
16844
16845 /* Retrieve destination of SET_BODY. */
16846 switch (GET_CODE (set_body))
16847 {
16848 case SET:
16849 set_dest = SET_DEST (set_body);
16850 if (!set_dest || !REG_P (set_dest))
16851 return false;
16852 break;
16853 case PARALLEL:
16854 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16855 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16856 use_body))
16857 return true;
16858 default:
16859 return false;
16860 break;
16861 }
16862
16863 /* Retrieve shift count of USE_BODY. */
16864 switch (GET_CODE (use_body))
16865 {
16866 case SET:
16867 shift_rtx = XEXP (use_body, 1);
16868 break;
16869 case PARALLEL:
16870 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16871 if (ix86_dep_by_shift_count_body (set_body,
16872 XVECEXP (use_body, 0, i)))
16873 return true;
16874 default:
16875 return false;
16876 break;
16877 }
16878
16879 if (shift_rtx
16880 && (GET_CODE (shift_rtx) == ASHIFT
16881 || GET_CODE (shift_rtx) == LSHIFTRT
16882 || GET_CODE (shift_rtx) == ASHIFTRT
16883 || GET_CODE (shift_rtx) == ROTATE
16884 || GET_CODE (shift_rtx) == ROTATERT))
16885 {
16886 rtx shift_count = XEXP (shift_rtx, 1);
16887
16888 /* Return true if shift count is dest of SET_BODY. */
16889 if (REG_P (shift_count)
16890 && true_regnum (set_dest) == true_regnum (shift_count))
16891 return true;
16892 }
16893
16894 return false;
16895 }
16896
16897 /* Return true if the destination reg of SET_INSN is the shift count of
16898 USE_INSN. */
16899
16900 bool
16901 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16902 {
16903 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16904 PATTERN (use_insn));
16905 }
16906
16907 /* Return TRUE or FALSE depending on whether the unary operator meets the
16908 appropriate constraints. */
16909
16910 bool
16911 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16912 enum machine_mode mode ATTRIBUTE_UNUSED,
16913 rtx operands[2] ATTRIBUTE_UNUSED)
16914 {
16915 /* If one of operands is memory, source and destination must match. */
16916 if ((MEM_P (operands[0])
16917 || MEM_P (operands[1]))
16918 && ! rtx_equal_p (operands[0], operands[1]))
16919 return false;
16920 return true;
16921 }
16922
16923 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16924 are ok, keeping in mind the possible movddup alternative. */
16925
16926 bool
16927 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16928 {
16929 if (MEM_P (operands[0]))
16930 return rtx_equal_p (operands[0], operands[1 + high]);
16931 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16932 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16933 return true;
16934 }
16935
16936 /* Post-reload splitter for converting an SFmode or DFmode value in an
16937 SSE register into an unsigned SImode value. */
16938
16939 void
16940 ix86_split_convert_uns_si_sse (rtx operands[])
16941 {
16942 enum machine_mode vecmode;
16943 rtx value, large, zero_or_two31, input, two31, x;
16944
16945 large = operands[1];
16946 zero_or_two31 = operands[2];
16947 input = operands[3];
16948 two31 = operands[4];
16949 vecmode = GET_MODE (large);
16950 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16951
16952 /* Load up the value into the low element. We must ensure that the other
16953 elements are valid floats -- zero is the easiest such value. */
16954 if (MEM_P (input))
16955 {
16956 if (vecmode == V4SFmode)
16957 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16958 else
16959 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16960 }
16961 else
16962 {
16963 input = gen_rtx_REG (vecmode, REGNO (input));
16964 emit_move_insn (value, CONST0_RTX (vecmode));
16965 if (vecmode == V4SFmode)
16966 emit_insn (gen_sse_movss (value, value, input));
16967 else
16968 emit_insn (gen_sse2_movsd (value, value, input));
16969 }
16970
16971 emit_move_insn (large, two31);
16972 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
16973
16974 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
16975 emit_insn (gen_rtx_SET (VOIDmode, large, x));
16976
16977 x = gen_rtx_AND (vecmode, zero_or_two31, large);
16978 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
16979
16980 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
16981 emit_insn (gen_rtx_SET (VOIDmode, value, x));
16982
16983 large = gen_rtx_REG (V4SImode, REGNO (large));
16984 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
16985
16986 x = gen_rtx_REG (V4SImode, REGNO (value));
16987 if (vecmode == V4SFmode)
16988 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
16989 else
16990 emit_insn (gen_sse2_cvttpd2dq (x, value));
16991 value = x;
16992
16993 emit_insn (gen_xorv4si3 (value, value, large));
16994 }
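/* The idea behind the split above, as a scalar sketch: an unsigned 32-bit
   result cannot be produced directly by the signed cvttss2si/cvttsd2si, so

       if (value < 2^31)   result = (int) value;
       else                result = (int) (value - 2^31) ^ 0x80000000;

   The LE compare produces an all-ones mask for the large case, which
   selects the 2^31 subtrahend and the 0x80000000 xor constant without any
   branches.  */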
16995
16996 /* Convert an unsigned DImode value into a DFmode, using only SSE.
16997 Expects the 64-bit DImode to be supplied in a pair of integral
16998 registers. Requires SSE2; will use SSE3 if available. For x86_32,
16999 -mfpmath=sse, !optimize_size only. */
17000
17001 void
17002 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17003 {
17004 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17005 rtx int_xmm, fp_xmm;
17006 rtx biases, exponents;
17007 rtx x;
17008
17009 int_xmm = gen_reg_rtx (V4SImode);
17010 if (TARGET_INTER_UNIT_MOVES)
17011 emit_insn (gen_movdi_to_sse (int_xmm, input));
17012 else if (TARGET_SSE_SPLIT_REGS)
17013 {
17014 emit_clobber (int_xmm);
17015 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17016 }
17017 else
17018 {
17019 x = gen_reg_rtx (V2DImode);
17020 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17021 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17022 }
17023
17024 x = gen_rtx_CONST_VECTOR (V4SImode,
17025 gen_rtvec (4, GEN_INT (0x43300000UL),
17026 GEN_INT (0x45300000UL),
17027 const0_rtx, const0_rtx));
17028 exponents = validize_mem (force_const_mem (V4SImode, x));
17029
17030 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17031 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17032
17033 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17034 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17035 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17036 (0x1.0p84 + double(fp_value_hi_xmm)).
17037 Note these exponents differ by 32. */
17038
17039 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17040
17041 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17042 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17043 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17044 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17045 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17046 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17047 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17048 biases = validize_mem (force_const_mem (V2DFmode, biases));
17049 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17050
17051 /* Add the upper and lower DFmode values together. */
17052 if (TARGET_SSE3)
17053 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17054 else
17055 {
17056 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17057 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17058 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17059 }
17060
17061 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17062 }
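/* Worked example of the exponent trick above: for the 64-bit input
   0x0000000200000003 the low word 3 is glued under the exponent word
   0x43300000, giving the double 0x1.0p52 + 3, and the high word 2 under
   0x45300000, giving 0x1.0p84 + 2 * 2^32.  After subtracting the 0x1.0p52
   and 0x1.0p84 biases and adding the two halves, the result is
   2 * 2^32 + 3, i.e. the original unsigned value, computed without any
   64-bit integer conversion instruction.  */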
17063
17064 /* Not used, but eases macroization of patterns. */
17065 void
17066 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17067 rtx input ATTRIBUTE_UNUSED)
17068 {
17069 gcc_unreachable ();
17070 }
17071
17072 /* Convert an unsigned SImode value into a DFmode. Only currently used
17073 for SSE, but applicable anywhere. */
17074
17075 void
17076 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17077 {
17078 REAL_VALUE_TYPE TWO31r;
17079 rtx x, fp;
17080
17081 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17082 NULL, 1, OPTAB_DIRECT);
17083
17084 fp = gen_reg_rtx (DFmode);
17085 emit_insn (gen_floatsidf2 (fp, x));
17086
17087 real_ldexp (&TWO31r, &dconst1, 31);
17088 x = const_double_from_real_value (TWO31r, DFmode);
17089
17090 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17091 if (x != target)
17092 emit_move_insn (target, x);
17093 }
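/* Worked example of the bias trick above: for the input 3000000000 the
   unsigned add of -2^31 yields the signed value 852516352, floatsidf
   converts it exactly, and adding 2^31 as a double gives 3000000000.0
   again.  Small inputs wrap to negative values after the add and the same
   +2^31 restores them.  */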
17094
17095 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17096 32-bit mode; otherwise we have a direct convert instruction. */
17097
17098 void
17099 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17100 {
17101 REAL_VALUE_TYPE TWO32r;
17102 rtx fp_lo, fp_hi, x;
17103
17104 fp_lo = gen_reg_rtx (DFmode);
17105 fp_hi = gen_reg_rtx (DFmode);
17106
17107 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17108
17109 real_ldexp (&TWO32r, &dconst1, 32);
17110 x = const_double_from_real_value (TWO32r, DFmode);
17111 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17112
17113 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17114
17115 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17116 0, OPTAB_DIRECT);
17117 if (x != target)
17118 emit_move_insn (target, x);
17119 }
17120
17121 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17122 For x86_32, -mfpmath=sse, !optimize_size only. */
17123 void
17124 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17125 {
17126 REAL_VALUE_TYPE ONE16r;
17127 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17128
17129 real_ldexp (&ONE16r, &dconst1, 16);
17130 x = const_double_from_real_value (ONE16r, SFmode);
17131 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17132 NULL, 0, OPTAB_DIRECT);
17133 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17134 NULL, 0, OPTAB_DIRECT);
17135 fp_hi = gen_reg_rtx (SFmode);
17136 fp_lo = gen_reg_rtx (SFmode);
17137 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17138 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17139 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17140 0, OPTAB_DIRECT);
17141 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17142 0, OPTAB_DIRECT);
17143 if (!rtx_equal_p (target, fp_hi))
17144 emit_move_insn (target, fp_hi);
17145 }
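/* Worked example of the 16-bit split above: for the input 0xFFFFFFFF both
   halves are 0xFFFF and convert exactly to SFmode, and
   65535.0 * 65536.0 + 65535.0 == 4294967295.0 up to the usual SFmode
   rounding.  Splitting avoids the signedness problem of feeding a value
   with the top bit set to the signed floatsisf conversion directly.  */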
17146
17147 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17148 a vector of unsigned ints VAL to vector of floats TARGET. */
17149
17150 void
17151 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17152 {
17153 rtx tmp[8];
17154 REAL_VALUE_TYPE TWO16r;
17155 enum machine_mode intmode = GET_MODE (val);
17156 enum machine_mode fltmode = GET_MODE (target);
17157 rtx (*cvt) (rtx, rtx);
17158
17159 if (intmode == V4SImode)
17160 cvt = gen_floatv4siv4sf2;
17161 else
17162 cvt = gen_floatv8siv8sf2;
17163 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17164 tmp[0] = force_reg (intmode, tmp[0]);
17165 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17166 OPTAB_DIRECT);
17167 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17168 NULL_RTX, 1, OPTAB_DIRECT);
17169 tmp[3] = gen_reg_rtx (fltmode);
17170 emit_insn (cvt (tmp[3], tmp[1]));
17171 tmp[4] = gen_reg_rtx (fltmode);
17172 emit_insn (cvt (tmp[4], tmp[2]));
17173 real_ldexp (&TWO16r, &dconst1, 16);
17174 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17175 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17176 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17177 OPTAB_DIRECT);
17178 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17179 OPTAB_DIRECT);
17180 if (tmp[7] != target)
17181 emit_move_insn (target, tmp[7]);
17182 }
17183
17184 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17185 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17186 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17187 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17188
17189 rtx
17190 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17191 {
17192 REAL_VALUE_TYPE TWO31r;
17193 rtx two31r, tmp[4];
17194 enum machine_mode mode = GET_MODE (val);
17195 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17196 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17197 rtx (*cmp) (rtx, rtx, rtx, rtx);
17198 int i;
17199
17200 for (i = 0; i < 3; i++)
17201 tmp[i] = gen_reg_rtx (mode);
17202 real_ldexp (&TWO31r, &dconst1, 31);
17203 two31r = const_double_from_real_value (TWO31r, scalarmode);
17204 two31r = ix86_build_const_vector (mode, 1, two31r);
17205 two31r = force_reg (mode, two31r);
17206 switch (mode)
17207 {
17208 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17209 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17210 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17211 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17212 default: gcc_unreachable ();
17213 }
17214 tmp[3] = gen_rtx_LE (mode, two31r, val);
17215 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17216 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17217 0, OPTAB_DIRECT);
17218 if (intmode == V4SImode || TARGET_AVX2)
17219 *xorp = expand_simple_binop (intmode, ASHIFT,
17220 gen_lowpart (intmode, tmp[0]),
17221 GEN_INT (31), NULL_RTX, 0,
17222 OPTAB_DIRECT);
17223 else
17224 {
17225 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17226 two31 = ix86_build_const_vector (intmode, 1, two31);
17227 *xorp = expand_simple_binop (intmode, AND,
17228 gen_lowpart (intmode, tmp[0]),
17229 two31, NULL_RTX, 0,
17230 OPTAB_DIRECT);
17231 }
17232 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17233 0, OPTAB_DIRECT);
17234 }
17235
17236 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17237 then replicate the value for all elements of the vector
17238 register. */
17239
17240 rtx
17241 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17242 {
17243 int i, n_elt;
17244 rtvec v;
17245 enum machine_mode scalar_mode;
17246
17247 switch (mode)
17248 {
17249 case V32QImode:
17250 case V16QImode:
17251 case V16HImode:
17252 case V8HImode:
17253 case V8SImode:
17254 case V4SImode:
17255 case V4DImode:
17256 case V2DImode:
17257 gcc_assert (vect);
17258 case V8SFmode:
17259 case V4SFmode:
17260 case V4DFmode:
17261 case V2DFmode:
17262 n_elt = GET_MODE_NUNITS (mode);
17263 v = rtvec_alloc (n_elt);
17264 scalar_mode = GET_MODE_INNER (mode);
17265
17266 RTVEC_ELT (v, 0) = value;
17267
17268 for (i = 1; i < n_elt; ++i)
17269 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17270
17271 return gen_rtx_CONST_VECTOR (mode, v);
17272
17273 default:
17274 gcc_unreachable ();
17275 }
17276 }
17277
17278 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17279 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17280 for an SSE register. If VECT is true, then replicate the mask for
17281 all elements of the vector register. If INVERT is true, then create
17282 a mask excluding the sign bit. */
17283
17284 rtx
17285 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17286 {
17287 enum machine_mode vec_mode, imode;
17288 HOST_WIDE_INT hi, lo;
17289 int shift = 63;
17290 rtx v;
17291 rtx mask;
17292
17293 /* Find the sign bit, sign extended to 2*HWI. */
17294 switch (mode)
17295 {
17296 case V8SImode:
17297 case V4SImode:
17298 case V8SFmode:
17299 case V4SFmode:
17300 vec_mode = mode;
17301 mode = GET_MODE_INNER (mode);
17302 imode = SImode;
17303 lo = 0x80000000, hi = lo < 0;
17304 break;
17305
17306 case V4DImode:
17307 case V2DImode:
17308 case V4DFmode:
17309 case V2DFmode:
17310 vec_mode = mode;
17311 mode = GET_MODE_INNER (mode);
17312 imode = DImode;
17313 if (HOST_BITS_PER_WIDE_INT >= 64)
17314 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17315 else
17316 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17317 break;
17318
17319 case TImode:
17320 case TFmode:
17321 vec_mode = VOIDmode;
17322 if (HOST_BITS_PER_WIDE_INT >= 64)
17323 {
17324 imode = TImode;
17325 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17326 }
17327 else
17328 {
17329 rtvec vec;
17330
17331 imode = DImode;
17332 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17333
17334 if (invert)
17335 {
17336 lo = ~lo, hi = ~hi;
17337 v = constm1_rtx;
17338 }
17339 else
17340 v = const0_rtx;
17341
17342 mask = immed_double_const (lo, hi, imode);
17343
17344 vec = gen_rtvec (2, v, mask);
17345 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17346 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17347
17348 return v;
17349 }
17350 break;
17351
17352 default:
17353 gcc_unreachable ();
17354 }
17355
17356 if (invert)
17357 lo = ~lo, hi = ~hi;
17358
17359 /* Force this value into the low part of a fp vector constant. */
17360 mask = immed_double_const (lo, hi, imode);
17361 mask = gen_lowpart (mode, mask);
17362
17363 if (vec_mode == VOIDmode)
17364 return force_reg (mode, mask);
17365
17366 v = ix86_build_const_vector (vec_mode, vect, mask);
17367 return force_reg (vec_mode, v);
17368 }
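/* Example: ix86_build_signbit_mask (V4SFmode, true, false) yields a
   register holding four copies of the bit pattern 0x80000000 (-0.0f),
   while passing invert == true yields four copies of 0x7fffffff, the mask
   that clears the sign bit.  */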
17369
17370 /* Generate code for floating point ABS or NEG. */
17371
17372 void
17373 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17374 rtx operands[])
17375 {
17376 rtx mask, set, dst, src;
17377 bool use_sse = false;
17378 bool vector_mode = VECTOR_MODE_P (mode);
17379 enum machine_mode vmode = mode;
17380
17381 if (vector_mode)
17382 use_sse = true;
17383 else if (mode == TFmode)
17384 use_sse = true;
17385 else if (TARGET_SSE_MATH)
17386 {
17387 use_sse = SSE_FLOAT_MODE_P (mode);
17388 if (mode == SFmode)
17389 vmode = V4SFmode;
17390 else if (mode == DFmode)
17391 vmode = V2DFmode;
17392 }
17393
17394 /* NEG and ABS performed with SSE use bitwise mask operations.
17395 Create the appropriate mask now. */
17396 if (use_sse)
17397 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17398 else
17399 mask = NULL_RTX;
17400
17401 dst = operands[0];
17402 src = operands[1];
17403
17404 set = gen_rtx_fmt_e (code, mode, src);
17405 set = gen_rtx_SET (VOIDmode, dst, set);
17406
17407 if (mask)
17408 {
17409 rtx use, clob;
17410 rtvec par;
17411
17412 use = gen_rtx_USE (VOIDmode, mask);
17413 if (vector_mode)
17414 par = gen_rtvec (2, set, use);
17415 else
17416 {
17417 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17418 par = gen_rtvec (3, set, use, clob);
17419 }
17420 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17421 }
17422 else
17423 emit_insn (set);
17424 }
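/* The masks make ABS and NEG single bitwise operations on SSE registers:
   NEG is an XOR with the sign-bit mask and ABS is an AND with the inverted
   mask, e.g. for DFmode roughly

       xorpd  .LC_signbit(%rip), %xmm0       # negation
       andpd  .LC_no_signbit(%rip), %xmm0    # absolute value

   (the constant-pool labels here are illustrative only).  */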
17425
17426 /* Expand a copysign operation. Special case operand 0 being a constant. */
17427
17428 void
17429 ix86_expand_copysign (rtx operands[])
17430 {
17431 enum machine_mode mode, vmode;
17432 rtx dest, op0, op1, mask, nmask;
17433
17434 dest = operands[0];
17435 op0 = operands[1];
17436 op1 = operands[2];
17437
17438 mode = GET_MODE (dest);
17439
17440 if (mode == SFmode)
17441 vmode = V4SFmode;
17442 else if (mode == DFmode)
17443 vmode = V2DFmode;
17444 else
17445 vmode = mode;
17446
17447 if (GET_CODE (op0) == CONST_DOUBLE)
17448 {
17449 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17450
17451 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17452 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17453
17454 if (mode == SFmode || mode == DFmode)
17455 {
17456 if (op0 == CONST0_RTX (mode))
17457 op0 = CONST0_RTX (vmode);
17458 else
17459 {
17460 rtx v = ix86_build_const_vector (vmode, false, op0);
17461
17462 op0 = force_reg (vmode, v);
17463 }
17464 }
17465 else if (op0 != CONST0_RTX (mode))
17466 op0 = force_reg (mode, op0);
17467
17468 mask = ix86_build_signbit_mask (vmode, 0, 0);
17469
17470 if (mode == SFmode)
17471 copysign_insn = gen_copysignsf3_const;
17472 else if (mode == DFmode)
17473 copysign_insn = gen_copysigndf3_const;
17474 else
17475 copysign_insn = gen_copysigntf3_const;
17476
17477 emit_insn (copysign_insn (dest, op0, op1, mask));
17478 }
17479 else
17480 {
17481 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17482
17483 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17484 mask = ix86_build_signbit_mask (vmode, 0, 0);
17485
17486 if (mode == SFmode)
17487 copysign_insn = gen_copysignsf3_var;
17488 else if (mode == DFmode)
17489 copysign_insn = gen_copysigndf3_var;
17490 else
17491 copysign_insn = gen_copysigntf3_var;
17492
17493 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17494 }
17495 }
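/* In both branches above the final computation is the classic mask form

       result = (op0 & ~signmask) | (op1 & signmask)

   i.e. the magnitude bits of op0 combined with the sign bit of op1; the
   constant-op0 path simply precomputes op0 with its sign bit already
   cleared.  */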
17496
17497 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17498 be a constant, and so has already been expanded into a vector constant. */
17499
17500 void
17501 ix86_split_copysign_const (rtx operands[])
17502 {
17503 enum machine_mode mode, vmode;
17504 rtx dest, op0, mask, x;
17505
17506 dest = operands[0];
17507 op0 = operands[1];
17508 mask = operands[3];
17509
17510 mode = GET_MODE (dest);
17511 vmode = GET_MODE (mask);
17512
17513 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17514 x = gen_rtx_AND (vmode, dest, mask);
17515 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17516
17517 if (op0 != CONST0_RTX (vmode))
17518 {
17519 x = gen_rtx_IOR (vmode, dest, op0);
17520 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17521 }
17522 }
17523
17524 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17525 so we have to do two masks. */
17526
17527 void
17528 ix86_split_copysign_var (rtx operands[])
17529 {
17530 enum machine_mode mode, vmode;
17531 rtx dest, scratch, op0, op1, mask, nmask, x;
17532
17533 dest = operands[0];
17534 scratch = operands[1];
17535 op0 = operands[2];
17536 op1 = operands[3];
17537 nmask = operands[4];
17538 mask = operands[5];
17539
17540 mode = GET_MODE (dest);
17541 vmode = GET_MODE (mask);
17542
17543 if (rtx_equal_p (op0, op1))
17544 {
17545 /* Shouldn't happen often (it's useless, obviously), but when it does
17546 we'd generate incorrect code if we continue below. */
17547 emit_move_insn (dest, op0);
17548 return;
17549 }
17550
17551 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17552 {
17553 gcc_assert (REGNO (op1) == REGNO (scratch));
17554
17555 x = gen_rtx_AND (vmode, scratch, mask);
17556 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17557
17558 dest = mask;
17559 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17560 x = gen_rtx_NOT (vmode, dest);
17561 x = gen_rtx_AND (vmode, x, op0);
17562 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17563 }
17564 else
17565 {
17566 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17567 {
17568 x = gen_rtx_AND (vmode, scratch, mask);
17569 }
17570 else /* alternative 2,4 */
17571 {
17572 gcc_assert (REGNO (mask) == REGNO (scratch));
17573 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17574 x = gen_rtx_AND (vmode, scratch, op1);
17575 }
17576 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17577
17578 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17579 {
17580 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17581 x = gen_rtx_AND (vmode, dest, nmask);
17582 }
17583 else /* alternative 3,4 */
17584 {
17585 gcc_assert (REGNO (nmask) == REGNO (dest));
17586 dest = nmask;
17587 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17588 x = gen_rtx_AND (vmode, dest, op0);
17589 }
17590 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17591 }
17592
17593 x = gen_rtx_IOR (vmode, dest, scratch);
17594 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17595 }
17596
17597 /* Return TRUE or FALSE depending on whether the first SET in INSN
17598 has source and destination with matching CC modes, and that the
17599 CC mode is at least as constrained as REQ_MODE. */
17600
17601 bool
17602 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17603 {
17604 rtx set;
17605 enum machine_mode set_mode;
17606
17607 set = PATTERN (insn);
17608 if (GET_CODE (set) == PARALLEL)
17609 set = XVECEXP (set, 0, 0);
17610 gcc_assert (GET_CODE (set) == SET);
17611 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17612
17613 set_mode = GET_MODE (SET_DEST (set));
17614 switch (set_mode)
17615 {
17616 case CCNOmode:
17617 if (req_mode != CCNOmode
17618 && (req_mode != CCmode
17619 || XEXP (SET_SRC (set), 1) != const0_rtx))
17620 return false;
17621 break;
17622 case CCmode:
17623 if (req_mode == CCGCmode)
17624 return false;
17625 /* FALLTHRU */
17626 case CCGCmode:
17627 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17628 return false;
17629 /* FALLTHRU */
17630 case CCGOCmode:
17631 if (req_mode == CCZmode)
17632 return false;
17633 /* FALLTHRU */
17634 case CCZmode:
17635 break;
17636
17637 case CCAmode:
17638 case CCCmode:
17639 case CCOmode:
17640 case CCSmode:
17641 if (set_mode != req_mode)
17642 return false;
17643 break;
17644
17645 default:
17646 gcc_unreachable ();
17647 }
17648
17649 return GET_MODE (SET_SRC (set)) == set_mode;
17650 }
17651
17652 /* Generate insn patterns to do an integer compare of OPERANDS. */
17653
17654 static rtx
17655 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17656 {
17657 enum machine_mode cmpmode;
17658 rtx tmp, flags;
17659
17660 cmpmode = SELECT_CC_MODE (code, op0, op1);
17661 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17662
17663 /* This is very simple, but making the interface the same as in the
17664 FP case makes the rest of the code easier. */
17665 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17666 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17667
17668 /* Return the test that should be put into the flags user, i.e.
17669 the bcc, scc, or cmov instruction. */
17670 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17671 }
17672
17673 /* Figure out whether to use ordered or unordered fp comparisons.
17674 Return the appropriate mode to use. */
17675
17676 enum machine_mode
17677 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17678 {
17679 /* ??? In order to make all comparisons reversible, we do all comparisons
17680 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17681 all forms of trapping and nontrapping comparisons, we can make inequality
17682 comparisons trapping again, since this results in better code when using
17683 FCOM based compares. */
17684 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17685 }
17686
17687 enum machine_mode
17688 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17689 {
17690 enum machine_mode mode = GET_MODE (op0);
17691
17692 if (SCALAR_FLOAT_MODE_P (mode))
17693 {
17694 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17695 return ix86_fp_compare_mode (code);
17696 }
17697
17698 switch (code)
17699 {
17700 /* Only zero flag is needed. */
17701 case EQ: /* ZF=0 */
17702 case NE: /* ZF!=0 */
17703 return CCZmode;
17704 /* Codes needing carry flag. */
17705 case GEU: /* CF=0 */
17706 case LTU: /* CF=1 */
17707 /* Detect overflow checks. They need just the carry flag. */
17708 if (GET_CODE (op0) == PLUS
17709 && rtx_equal_p (op1, XEXP (op0, 0)))
17710 return CCCmode;
17711 else
17712 return CCmode;
17713 case GTU: /* CF=0 & ZF=0 */
17714 case LEU: /* CF=1 | ZF=1 */
17715 /* Detect overflow checks. They need just the carry flag. */
17716 if (GET_CODE (op0) == MINUS
17717 && rtx_equal_p (op1, XEXP (op0, 0)))
17718 return CCCmode;
17719 else
17720 return CCmode;
17721 /* Codes possibly doable only with sign flag when
17722 comparing against zero. */
17723 case GE: /* SF=OF or SF=0 */
17724 case LT: /* SF<>OF or SF=1 */
17725 if (op1 == const0_rtx)
17726 return CCGOCmode;
17727 else
17728 /* For other cases Carry flag is not required. */
17729 return CCGCmode;
17730 /* Codes doable only with sign flag when comparing
17731 against zero, but we miss jump instruction for it
17732 so we need to use relational tests against overflow
17733 that thus needs to be zero. */
17734 case GT: /* ZF=0 & SF=OF */
17735 case LE: /* ZF=1 | SF<>OF */
17736 if (op1 == const0_rtx)
17737 return CCNOmode;
17738 else
17739 return CCGCmode;
17740 /* The strcmp pattern does (use flags) and combine may ask us for the
17741 proper mode. */
17742 case USE:
17743 return CCmode;
17744 default:
17745 gcc_unreachable ();
17746 }
17747 }
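/* Example of the overflow-check detection above: for source like

       unsigned int sum = a + b;
       if (sum < a)
         ...

   op0 is the PLUS and op1 is one of its operands, so CCCmode is chosen and
   only the carry flag set by the add needs to be tested, with no separate
   compare instruction.  */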
17748
17749 /* Return the fixed registers used for condition codes. */
17750
17751 static bool
17752 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17753 {
17754 *p1 = FLAGS_REG;
17755 *p2 = FPSR_REG;
17756 return true;
17757 }
17758
17759 /* If two condition code modes are compatible, return a condition code
17760 mode which is compatible with both. Otherwise, return
17761 VOIDmode. */
17762
17763 static enum machine_mode
17764 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17765 {
17766 if (m1 == m2)
17767 return m1;
17768
17769 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17770 return VOIDmode;
17771
17772 if ((m1 == CCGCmode && m2 == CCGOCmode)
17773 || (m1 == CCGOCmode && m2 == CCGCmode))
17774 return CCGCmode;
17775
17776 switch (m1)
17777 {
17778 default:
17779 gcc_unreachable ();
17780
17781 case CCmode:
17782 case CCGCmode:
17783 case CCGOCmode:
17784 case CCNOmode:
17785 case CCAmode:
17786 case CCCmode:
17787 case CCOmode:
17788 case CCSmode:
17789 case CCZmode:
17790 switch (m2)
17791 {
17792 default:
17793 return VOIDmode;
17794
17795 case CCmode:
17796 case CCGCmode:
17797 case CCGOCmode:
17798 case CCNOmode:
17799 case CCAmode:
17800 case CCCmode:
17801 case CCOmode:
17802 case CCSmode:
17803 case CCZmode:
17804 return CCmode;
17805 }
17806
17807 case CCFPmode:
17808 case CCFPUmode:
17809 /* These are only compatible with themselves, which we already
17810 checked above. */
17811 return VOIDmode;
17812 }
17813 }
17814
17815
17816 /* Return a comparison we can do that is equivalent to
17817 swap_condition (code), apart possibly from orderedness.
17818 Never change orderedness if TARGET_IEEE_FP; return
17819 UNKNOWN in that case if necessary. */
17820
17821 static enum rtx_code
17822 ix86_fp_swap_condition (enum rtx_code code)
17823 {
17824 switch (code)
17825 {
17826 case GT: /* GTU - CF=0 & ZF=0 */
17827 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17828 case GE: /* GEU - CF=0 */
17829 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17830 case UNLT: /* LTU - CF=1 */
17831 return TARGET_IEEE_FP ? UNKNOWN : GT;
17832 case UNLE: /* LEU - CF=1 | ZF=1 */
17833 return TARGET_IEEE_FP ? UNKNOWN : GE;
17834 default:
17835 return swap_condition (code);
17836 }
17837 }
17838
17839 /* Return the cost of comparison CODE using the best strategy for performance.
17840 All of the following functions use the number of instructions as the cost metric.
17841 In the future this should be tweaked to compute bytes for optimize_size and
17842 take into account the performance of various instructions on various CPUs. */
17843
17844 static int
17845 ix86_fp_comparison_cost (enum rtx_code code)
17846 {
17847 int arith_cost;
17848
17849 /* The cost of code using bit-twiddling on %ah. */
17850 switch (code)
17851 {
17852 case UNLE:
17853 case UNLT:
17854 case LTGT:
17855 case GT:
17856 case GE:
17857 case UNORDERED:
17858 case ORDERED:
17859 case UNEQ:
17860 arith_cost = 4;
17861 break;
17862 case LT:
17863 case NE:
17864 case EQ:
17865 case UNGE:
17866 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17867 break;
17868 case LE:
17869 case UNGT:
17870 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17871 break;
17872 default:
17873 gcc_unreachable ();
17874 }
17875
17876 switch (ix86_fp_comparison_strategy (code))
17877 {
17878 case IX86_FPCMP_COMI:
17879 return arith_cost > 4 ? 3 : 2;
17880 case IX86_FPCMP_SAHF:
17881 return arith_cost > 4 ? 4 : 3;
17882 default:
17883 return arith_cost;
17884 }
17885 }
17886
17887 /* Return the strategy to use for floating-point comparisons. We assume that
17888 fcomi is always preferable where available, since that is also true when
17889 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17890
17891 enum ix86_fpcmp_strategy
17892 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17893 {
17894 /* Do fcomi/sahf based test when profitable. */
17895
17896 if (TARGET_CMOVE)
17897 return IX86_FPCMP_COMI;
17898
17899 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17900 return IX86_FPCMP_SAHF;
17901
17902 return IX86_FPCMP_ARITH;
17903 }
17904
17905 /* Swap, force into registers, or otherwise massage the two operands
17906 to a fp comparison. The operands are updated in place; the new
17907 comparison code is returned. */
17908
17909 static enum rtx_code
17910 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17911 {
17912 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17913 rtx op0 = *pop0, op1 = *pop1;
17914 enum machine_mode op_mode = GET_MODE (op0);
17915 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17916
17917 /* All of the unordered compare instructions only work on registers.
17918 The same is true of the fcomi compare instructions. The XFmode
17919 compare instructions require registers except when comparing
17920 against zero or when converting operand 1 from fixed point to
17921 floating point. */
17922
17923 if (!is_sse
17924 && (fpcmp_mode == CCFPUmode
17925 || (op_mode == XFmode
17926 && ! (standard_80387_constant_p (op0) == 1
17927 || standard_80387_constant_p (op1) == 1)
17928 && GET_CODE (op1) != FLOAT)
17929 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17930 {
17931 op0 = force_reg (op_mode, op0);
17932 op1 = force_reg (op_mode, op1);
17933 }
17934 else
17935 {
17936 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17937 things around if they appear profitable, otherwise force op0
17938 into a register. */
17939
17940 if (standard_80387_constant_p (op0) == 0
17941 || (MEM_P (op0)
17942 && ! (standard_80387_constant_p (op1) == 0
17943 || MEM_P (op1))))
17944 {
17945 enum rtx_code new_code = ix86_fp_swap_condition (code);
17946 if (new_code != UNKNOWN)
17947 {
17948 rtx tmp;
17949 tmp = op0, op0 = op1, op1 = tmp;
17950 code = new_code;
17951 }
17952 }
17953
17954 if (!REG_P (op0))
17955 op0 = force_reg (op_mode, op0);
17956
17957 if (CONSTANT_P (op1))
17958 {
17959 int tmp = standard_80387_constant_p (op1);
17960 if (tmp == 0)
17961 op1 = validize_mem (force_const_mem (op_mode, op1));
17962 else if (tmp == 1)
17963 {
17964 if (TARGET_CMOVE)
17965 op1 = force_reg (op_mode, op1);
17966 }
17967 else
17968 op1 = force_reg (op_mode, op1);
17969 }
17970 }
17971
17972 /* Try to rearrange the comparison to make it cheaper. */
17973 if (ix86_fp_comparison_cost (code)
17974 > ix86_fp_comparison_cost (swap_condition (code))
17975 && (REG_P (op1) || can_create_pseudo_p ()))
17976 {
17977 rtx tmp;
17978 tmp = op0, op0 = op1, op1 = tmp;
17979 code = swap_condition (code);
17980 if (!REG_P (op0))
17981 op0 = force_reg (op_mode, op0);
17982 }
17983
17984 *pop0 = op0;
17985 *pop1 = op1;
17986 return code;
17987 }
17988
17989 /* Convert comparison codes we use to represent FP comparison to integer
17990 code that will result in proper branch. Return UNKNOWN if no such code
17991 is available. */
17992
17993 enum rtx_code
17994 ix86_fp_compare_code_to_integer (enum rtx_code code)
17995 {
17996 switch (code)
17997 {
17998 case GT:
17999 return GTU;
18000 case GE:
18001 return GEU;
18002 case ORDERED:
18003 case UNORDERED:
18004 return code;
18005 break;
18006 case UNEQ:
18007 return EQ;
18008 break;
18009 case UNLT:
18010 return LTU;
18011 break;
18012 case UNLE:
18013 return LEU;
18014 break;
18015 case LTGT:
18016 return NE;
18017 break;
18018 default:
18019 return UNKNOWN;
18020 }
18021 }
18022
18023 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18024
18025 static rtx
18026 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18027 {
18028 enum machine_mode fpcmp_mode, intcmp_mode;
18029 rtx tmp, tmp2;
18030
18031 fpcmp_mode = ix86_fp_compare_mode (code);
18032 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18033
18034 /* Do fcomi/sahf based test when profitable. */
18035 switch (ix86_fp_comparison_strategy (code))
18036 {
18037 case IX86_FPCMP_COMI:
18038 intcmp_mode = fpcmp_mode;
18039 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18040 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18041 tmp);
18042 emit_insn (tmp);
18043 break;
18044
18045 case IX86_FPCMP_SAHF:
18046 intcmp_mode = fpcmp_mode;
18047 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18048 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18049 tmp);
18050
18051 if (!scratch)
18052 scratch = gen_reg_rtx (HImode);
18053 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18054 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18055 break;
18056
18057 case IX86_FPCMP_ARITH:
18058 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18059 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18060 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18061 if (!scratch)
18062 scratch = gen_reg_rtx (HImode);
18063 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18064
18065 /* In the unordered case, we have to check C2 for NaNs, which
18066 doesn't happen to work out to anything nice combination-wise.
18067 So do some bit twiddling on the value we've got in AH to come
18068 up with an appropriate set of condition codes. */
18069
18070 intcmp_mode = CCNOmode;
18071 switch (code)
18072 {
18073 case GT:
18074 case UNGT:
18075 if (code == GT || !TARGET_IEEE_FP)
18076 {
18077 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18078 code = EQ;
18079 }
18080 else
18081 {
18082 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18083 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18084 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18085 intcmp_mode = CCmode;
18086 code = GEU;
18087 }
18088 break;
18089 case LT:
18090 case UNLT:
18091 if (code == LT && TARGET_IEEE_FP)
18092 {
18093 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18094 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18095 intcmp_mode = CCmode;
18096 code = EQ;
18097 }
18098 else
18099 {
18100 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18101 code = NE;
18102 }
18103 break;
18104 case GE:
18105 case UNGE:
18106 if (code == GE || !TARGET_IEEE_FP)
18107 {
18108 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18109 code = EQ;
18110 }
18111 else
18112 {
18113 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18114 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18115 code = NE;
18116 }
18117 break;
18118 case LE:
18119 case UNLE:
18120 if (code == LE && TARGET_IEEE_FP)
18121 {
18122 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18123 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18124 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18125 intcmp_mode = CCmode;
18126 code = LTU;
18127 }
18128 else
18129 {
18130 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18131 code = NE;
18132 }
18133 break;
18134 case EQ:
18135 case UNEQ:
18136 if (code == EQ && TARGET_IEEE_FP)
18137 {
18138 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18139 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18140 intcmp_mode = CCmode;
18141 code = EQ;
18142 }
18143 else
18144 {
18145 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18146 code = NE;
18147 }
18148 break;
18149 case NE:
18150 case LTGT:
18151 if (code == NE && TARGET_IEEE_FP)
18152 {
18153 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18154 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18155 GEN_INT (0x40)));
18156 code = NE;
18157 }
18158 else
18159 {
18160 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18161 code = EQ;
18162 }
18163 break;
18164
18165 case UNORDERED:
18166 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18167 code = NE;
18168 break;
18169 case ORDERED:
18170 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18171 code = EQ;
18172 break;
18173
18174 default:
18175 gcc_unreachable ();
18176 }
18177 break;
18178
18179 default:
18180 gcc_unreachable ();
18181 }
18182
18183 /* Return the test that should be put into the flags user, i.e.
18184 the bcc, scc, or cmov instruction. */
18185 return gen_rtx_fmt_ee (code, VOIDmode,
18186 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18187 const0_rtx);
18188 }
18189
18190 static rtx
18191 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18192 {
18193 rtx ret;
18194
18195 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18196 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18197
18198 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18199 {
18200 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18201 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18202 }
18203 else
18204 ret = ix86_expand_int_compare (code, op0, op1);
18205
18206 return ret;
18207 }
18208
18209 void
18210 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18211 {
18212 enum machine_mode mode = GET_MODE (op0);
18213 rtx tmp;
18214
18215 switch (mode)
18216 {
18217 case SFmode:
18218 case DFmode:
18219 case XFmode:
18220 case QImode:
18221 case HImode:
18222 case SImode:
18223 simple:
18224 tmp = ix86_expand_compare (code, op0, op1);
18225 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18226 gen_rtx_LABEL_REF (VOIDmode, label),
18227 pc_rtx);
18228 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18229 return;
18230
18231 case DImode:
18232 if (TARGET_64BIT)
18233 goto simple;
18234 case TImode:
18235 /* Expand DImode branch into multiple compare+branch. */
18236 {
18237 rtx lo[2], hi[2], label2;
18238 enum rtx_code code1, code2, code3;
18239 enum machine_mode submode;
18240
18241 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18242 {
18243 tmp = op0, op0 = op1, op1 = tmp;
18244 code = swap_condition (code);
18245 }
18246
18247 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18248 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18249
18250 submode = mode == DImode ? SImode : DImode;
18251
18252 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18253 avoid two branches. This costs one extra insn, so disable when
18254 optimizing for size. */
18255
18256 if ((code == EQ || code == NE)
18257 && (!optimize_insn_for_size_p ()
18258 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18259 {
18260 rtx xor0, xor1;
18261
18262 xor1 = hi[0];
18263 if (hi[1] != const0_rtx)
18264 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18265 NULL_RTX, 0, OPTAB_WIDEN);
18266
18267 xor0 = lo[0];
18268 if (lo[1] != const0_rtx)
18269 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18270 NULL_RTX, 0, OPTAB_WIDEN);
18271
18272 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18273 NULL_RTX, 0, OPTAB_WIDEN);
18274
18275 ix86_expand_branch (code, tmp, const0_rtx, label);
18276 return;
18277 }
18278
18279 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18280 op1 is a constant, and its low word is zero, then we can just
18281 examine the high word. Similarly for a low word of -1 and
18282 less-or-equal-than or greater-than. */
18283
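/* E.g. for a <u b where the low word of b is zero, equal high words
   would require lo (a) <u 0, which is impossible, so only
   hi (a) <u hi (b) needs to be tested.  */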
18284 if (CONST_INT_P (hi[1]))
18285 switch (code)
18286 {
18287 case LT: case LTU: case GE: case GEU:
18288 if (lo[1] == const0_rtx)
18289 {
18290 ix86_expand_branch (code, hi[0], hi[1], label);
18291 return;
18292 }
18293 break;
18294 case LE: case LEU: case GT: case GTU:
18295 if (lo[1] == constm1_rtx)
18296 {
18297 ix86_expand_branch (code, hi[0], hi[1], label);
18298 return;
18299 }
18300 break;
18301 default:
18302 break;
18303 }
18304
18305 /* Otherwise, we need two or three jumps. */
18306
18307 label2 = gen_label_rtx ();
18308
18309 code1 = code;
18310 code2 = swap_condition (code);
18311 code3 = unsigned_condition (code);
18312
18313 switch (code)
18314 {
18315 case LT: case GT: case LTU: case GTU:
18316 break;
18317
18318 case LE: code1 = LT; code2 = GT; break;
18319 case GE: code1 = GT; code2 = LT; break;
18320 case LEU: code1 = LTU; code2 = GTU; break;
18321 case GEU: code1 = GTU; code2 = LTU; break;
18322
18323 case EQ: code1 = UNKNOWN; code2 = NE; break;
18324 case NE: code2 = UNKNOWN; break;
18325
18326 default:
18327 gcc_unreachable ();
18328 }
18329
18330 /*
18331 * a < b =>
18332 * if (hi(a) < hi(b)) goto true;
18333 * if (hi(a) > hi(b)) goto false;
18334 * if (lo(a) < lo(b)) goto true;
18335 * false:
18336 */
18337
18338 if (code1 != UNKNOWN)
18339 ix86_expand_branch (code1, hi[0], hi[1], label);
18340 if (code2 != UNKNOWN)
18341 ix86_expand_branch (code2, hi[0], hi[1], label2);
18342
18343 ix86_expand_branch (code3, lo[0], lo[1], label);
18344
18345 if (code2 != UNKNOWN)
18346 emit_label (label2);
18347 return;
18348 }
18349
18350 default:
18351 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18352 goto simple;
18353 }
18354 }
18355
18356 /* Split branch based on floating point condition. */
18357 void
18358 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18359 rtx target1, rtx target2, rtx tmp, rtx pushed)
18360 {
18361 rtx condition;
18362 rtx i;
18363
18364 if (target2 != pc_rtx)
18365 {
18366 rtx tmp = target2;
18367 code = reverse_condition_maybe_unordered (code);
18368 target2 = target1;
18369 target1 = tmp;
18370 }
18371
18372 condition = ix86_expand_fp_compare (code, op1, op2,
18373 tmp);
18374
18375 /* Remove pushed operand from stack. */
18376 if (pushed)
18377 ix86_free_from_memory (GET_MODE (pushed));
18378
18379 i = emit_jump_insn (gen_rtx_SET
18380 (VOIDmode, pc_rtx,
18381 gen_rtx_IF_THEN_ELSE (VOIDmode,
18382 condition, target1, target2)));
18383 if (split_branch_probability >= 0)
18384 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18385 }
18386
18387 void
18388 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18389 {
18390 rtx ret;
18391
18392 gcc_assert (GET_MODE (dest) == QImode);
18393
18394 ret = ix86_expand_compare (code, op0, op1);
18395 PUT_MODE (ret, QImode);
18396 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18397 }
18398
18399 /* Expand a comparison setting or clearing the carry flag. Return true
18400 when successful and set *POP for the operation. */
18401 static bool
18402 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18403 {
18404 enum machine_mode mode =
18405 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18406
18407 /* Do not handle double-mode compares that go through a special path. */
18408 if (mode == (TARGET_64BIT ? TImode : DImode))
18409 return false;
18410
18411 if (SCALAR_FLOAT_MODE_P (mode))
18412 {
18413 rtx compare_op, compare_seq;
18414
18415 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18416
18417 /* Shortcut: the following common codes never translate
18418 into carry flag compares. */
18419 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18420 || code == ORDERED || code == UNORDERED)
18421 return false;
18422
18423 /* These comparisons require the zero flag; swap operands so that they no longer do. */
18424 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18425 && !TARGET_IEEE_FP)
18426 {
18427 rtx tmp = op0;
18428 op0 = op1;
18429 op1 = tmp;
18430 code = swap_condition (code);
18431 }
18432
18433 /* Try to expand the comparison and verify that we end up with a
18434 carry flag based comparison. This fails to be true only when
18435 we decide to expand the comparison using arithmetic, which is
18436 not a very common scenario. */
18437 start_sequence ();
18438 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18439 compare_seq = get_insns ();
18440 end_sequence ();
18441
18442 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18443 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18444 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18445 else
18446 code = GET_CODE (compare_op);
18447
18448 if (code != LTU && code != GEU)
18449 return false;
18450
18451 emit_insn (compare_seq);
18452 *pop = compare_op;
18453 return true;
18454 }
18455
18456 if (!INTEGRAL_MODE_P (mode))
18457 return false;
18458
18459 switch (code)
18460 {
18461 case LTU:
18462 case GEU:
18463 break;
18464
18465 /* Convert a==0 into (unsigned)a<1. */
18466 case EQ:
18467 case NE:
18468 if (op1 != const0_rtx)
18469 return false;
18470 op1 = const1_rtx;
18471 code = (code == EQ ? LTU : GEU);
18472 break;
18473
18474 /* Convert a>b into b<a or a>=b+1. */
18475 case GTU:
18476 case LEU:
18477 if (CONST_INT_P (op1))
18478 {
18479 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18480 /* Bail out on overflow. We could still swap the operands, but that
18481 would force loading the constant into a register. */
18482 if (op1 == const0_rtx
18483 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18484 return false;
18485 code = (code == GTU ? GEU : LTU);
18486 }
18487 else
18488 {
18489 rtx tmp = op1;
18490 op1 = op0;
18491 op0 = tmp;
18492 code = (code == GTU ? LTU : GEU);
18493 }
18494 break;
18495
18496 /* Convert a>=0 into (unsigned)a<0x80000000. */
18497 case LT:
18498 case GE:
18499 if (mode == DImode || op1 != const0_rtx)
18500 return false;
18501 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18502 code = (code == LT ? GEU : LTU);
18503 break;
18504 case LE:
18505 case GT:
18506 if (mode == DImode || op1 != constm1_rtx)
18507 return false;
18508 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18509 code = (code == LE ? GEU : LTU);
18510 break;
18511
18512 default:
18513 return false;
18514 }
18515 /* Swapping operands may cause a constant to appear as the first operand. */
18516 if (!nonimmediate_operand (op0, VOIDmode))
18517 {
18518 if (!can_create_pseudo_p ())
18519 return false;
18520 op0 = force_reg (mode, op0);
18521 }
18522 *pop = ix86_expand_compare (code, op0, op1);
18523 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18524 return true;
18525 }
18526
18527 bool
18528 ix86_expand_int_movcc (rtx operands[])
18529 {
18530 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18531 rtx compare_seq, compare_op;
18532 enum machine_mode mode = GET_MODE (operands[0]);
18533 bool sign_bit_compare_p = false;
18534 rtx op0 = XEXP (operands[1], 0);
18535 rtx op1 = XEXP (operands[1], 1);
18536
18537 start_sequence ();
18538 compare_op = ix86_expand_compare (code, op0, op1);
18539 compare_seq = get_insns ();
18540 end_sequence ();
18541
18542 compare_code = GET_CODE (compare_op);
18543
18544 if ((op1 == const0_rtx && (code == GE || code == LT))
18545 || (op1 == constm1_rtx && (code == GT || code == LE)))
18546 sign_bit_compare_p = true;
18547
18548 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18549 HImode insns, we'd be swallowed in word prefix ops. */
18550
18551 if ((mode != HImode || TARGET_FAST_PREFIX)
18552 && (mode != (TARGET_64BIT ? TImode : DImode))
18553 && CONST_INT_P (operands[2])
18554 && CONST_INT_P (operands[3]))
18555 {
18556 rtx out = operands[0];
18557 HOST_WIDE_INT ct = INTVAL (operands[2]);
18558 HOST_WIDE_INT cf = INTVAL (operands[3]);
18559 HOST_WIDE_INT diff;
18560
18561 diff = ct - cf;
18562 /* Sign bit compares are better done using shifts than by using
18563 sbb. */
18564 if (sign_bit_compare_p
18565 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18566 {
18567 /* Detect overlap between destination and compare sources. */
18568 rtx tmp = out;
18569
18570 if (!sign_bit_compare_p)
18571 {
18572 rtx flags;
18573 bool fpcmp = false;
18574
18575 compare_code = GET_CODE (compare_op);
18576
18577 flags = XEXP (compare_op, 0);
18578
18579 if (GET_MODE (flags) == CCFPmode
18580 || GET_MODE (flags) == CCFPUmode)
18581 {
18582 fpcmp = true;
18583 compare_code
18584 = ix86_fp_compare_code_to_integer (compare_code);
18585 }
18586
18587 /* To simplify the rest of the code, restrict to the GEU case. */
18588 if (compare_code == LTU)
18589 {
18590 HOST_WIDE_INT tmp = ct;
18591 ct = cf;
18592 cf = tmp;
18593 compare_code = reverse_condition (compare_code);
18594 code = reverse_condition (code);
18595 }
18596 else
18597 {
18598 if (fpcmp)
18599 PUT_CODE (compare_op,
18600 reverse_condition_maybe_unordered
18601 (GET_CODE (compare_op)));
18602 else
18603 PUT_CODE (compare_op,
18604 reverse_condition (GET_CODE (compare_op)));
18605 }
18606 diff = ct - cf;
18607
18608 if (reg_overlap_mentioned_p (out, op0)
18609 || reg_overlap_mentioned_p (out, op1))
18610 tmp = gen_reg_rtx (mode);
18611
18612 if (mode == DImode)
18613 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18614 else
18615 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18616 flags, compare_op));
18617 }
18618 else
18619 {
18620 if (code == GT || code == GE)
18621 code = reverse_condition (code);
18622 else
18623 {
18624 HOST_WIDE_INT tmp = ct;
18625 ct = cf;
18626 cf = tmp;
18627 diff = ct - cf;
18628 }
18629 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18630 }
18631
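/* At this point TMP holds 0 when the (canonicalized) condition selects
   CT and -1 when it selects CF; the arithmetic below maps { 0, -1 }
   onto { ct, cf } without a branch.  */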
18632 if (diff == 1)
18633 {
18634 /*
18635 * cmpl op0,op1
18636 * sbbl dest,dest
18637 * [addl dest, ct]
18638 *
18639 * Size 5 - 8.
18640 */
18641 if (ct)
18642 tmp = expand_simple_binop (mode, PLUS,
18643 tmp, GEN_INT (ct),
18644 copy_rtx (tmp), 1, OPTAB_DIRECT);
18645 }
18646 else if (cf == -1)
18647 {
18648 /*
18649 * cmpl op0,op1
18650 * sbbl dest,dest
18651 * orl $ct, dest
18652 *
18653 * Size 8.
18654 */
18655 tmp = expand_simple_binop (mode, IOR,
18656 tmp, GEN_INT (ct),
18657 copy_rtx (tmp), 1, OPTAB_DIRECT);
18658 }
18659 else if (diff == -1 && ct)
18660 {
18661 /*
18662 * cmpl op0,op1
18663 * sbbl dest,dest
18664 * notl dest
18665 * [addl dest, cf]
18666 *
18667 * Size 8 - 11.
18668 */
18669 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18670 if (cf)
18671 tmp = expand_simple_binop (mode, PLUS,
18672 copy_rtx (tmp), GEN_INT (cf),
18673 copy_rtx (tmp), 1, OPTAB_DIRECT);
18674 }
18675 else
18676 {
18677 /*
18678 * cmpl op0,op1
18679 * sbbl dest,dest
18680 * [notl dest]
18681 * andl cf - ct, dest
18682 * [addl dest, ct]
18683 *
18684 * Size 8 - 11.
18685 */
18686
18687 if (cf == 0)
18688 {
18689 cf = ct;
18690 ct = 0;
18691 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18692 }
18693
18694 tmp = expand_simple_binop (mode, AND,
18695 copy_rtx (tmp),
18696 gen_int_mode (cf - ct, mode),
18697 copy_rtx (tmp), 1, OPTAB_DIRECT);
18698 if (ct)
18699 tmp = expand_simple_binop (mode, PLUS,
18700 copy_rtx (tmp), GEN_INT (ct),
18701 copy_rtx (tmp), 1, OPTAB_DIRECT);
18702 }
18703
18704 if (!rtx_equal_p (tmp, out))
18705 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18706
18707 return true;
18708 }
18709
18710 if (diff < 0)
18711 {
18712 enum machine_mode cmp_mode = GET_MODE (op0);
18713
18714 HOST_WIDE_INT tmp;
18715 tmp = ct, ct = cf, cf = tmp;
18716 diff = -diff;
18717
18718 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18719 {
18720 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18721
18722 /* We may be reversing an unordered compare to a normal compare, which
18723 is not valid in general (we may convert a non-trapping condition
18724 to a trapping one); however, on i386 we currently emit all
18725 comparisons unordered. */
18726 compare_code = reverse_condition_maybe_unordered (compare_code);
18727 code = reverse_condition_maybe_unordered (code);
18728 }
18729 else
18730 {
18731 compare_code = reverse_condition (compare_code);
18732 code = reverse_condition (code);
18733 }
18734 }
18735
18736 compare_code = UNKNOWN;
18737 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18738 && CONST_INT_P (op1))
18739 {
18740 if (op1 == const0_rtx
18741 && (code == LT || code == GE))
18742 compare_code = code;
18743 else if (op1 == constm1_rtx)
18744 {
18745 if (code == LE)
18746 compare_code = LT;
18747 else if (code == GT)
18748 compare_code = GE;
18749 }
18750 }
18751
18752 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18753 if (compare_code != UNKNOWN
18754 && GET_MODE (op0) == GET_MODE (out)
18755 && (cf == -1 || ct == -1))
18756 {
18757 /* If the lea code below could be used, only optimize
18758 if it results in a 2-insn sequence. */
18759
18760 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18761 || diff == 3 || diff == 5 || diff == 9)
18762 || (compare_code == LT && ct == -1)
18763 || (compare_code == GE && cf == -1))
18764 {
18765 /*
18766 * notl op1 (if necessary)
18767 * sarl $31, op1
18768 * orl cf, op1
18769 */
18770 if (ct != -1)
18771 {
18772 cf = ct;
18773 ct = -1;
18774 code = reverse_condition (code);
18775 }
18776
18777 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18778
18779 out = expand_simple_binop (mode, IOR,
18780 out, GEN_INT (cf),
18781 out, 1, OPTAB_DIRECT);
18782 if (out != operands[0])
18783 emit_move_insn (operands[0], out);
18784
18785 return true;
18786 }
18787 }
18788
18789
18790 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18791 || diff == 3 || diff == 5 || diff == 9)
18792 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18793 && (mode != DImode
18794 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18795 {
18796 /*
18797 * xorl dest,dest
18798 * cmpl op1,op2
18799 * setcc dest
18800 * lea cf(dest*(ct-cf)),dest
18801 *
18802 * Size 14.
18803 *
18804 * This also catches the degenerate setcc-only case.
18805 */
18806
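/* setcc leaves 0 or 1 in the destination, so the lea computes
   cf + dest * (ct - cf), i.e. CF when the condition is false and CT
   when it is true; the check above restricts diff = ct - cf to the
   factors a single lea can realize (1, 2, 3, 4, 5, 8 or 9).  */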
18807 rtx tmp;
18808 int nops;
18809
18810 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18811
18812 nops = 0;
18813 /* On x86_64 the lea instruction operates on Pmode, so we need
18814 to get the arithmetic done in the proper mode to match. */
18815 if (diff == 1)
18816 tmp = copy_rtx (out);
18817 else
18818 {
18819 rtx out1;
18820 out1 = copy_rtx (out);
18821 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18822 nops++;
18823 if (diff & 1)
18824 {
18825 tmp = gen_rtx_PLUS (mode, tmp, out1);
18826 nops++;
18827 }
18828 }
18829 if (cf != 0)
18830 {
18831 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18832 nops++;
18833 }
18834 if (!rtx_equal_p (tmp, out))
18835 {
18836 if (nops == 1)
18837 out = force_operand (tmp, copy_rtx (out));
18838 else
18839 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18840 }
18841 if (!rtx_equal_p (out, operands[0]))
18842 emit_move_insn (operands[0], copy_rtx (out));
18843
18844 return true;
18845 }
18846
18847 /*
18848 * General case: Jumpful:
18849 * xorl dest,dest cmpl op1, op2
18850 * cmpl op1, op2 movl ct, dest
18851 * setcc dest jcc 1f
18852 * decl dest movl cf, dest
18853 * andl (cf-ct),dest 1:
18854 * addl ct,dest
18855 *
18856 * Size 20. Size 14.
18857 *
18858 * This is reasonably steep, but branch mispredict costs are
18859 * high on modern cpus, so consider failing only if optimizing
18860 * for space.
18861 */
18862
18863 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18864 && BRANCH_COST (optimize_insn_for_speed_p (),
18865 false) >= 2)
18866 {
18867 if (cf == 0)
18868 {
18869 enum machine_mode cmp_mode = GET_MODE (op0);
18870
18871 cf = ct;
18872 ct = 0;
18873
18874 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18875 {
18876 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18877
18878 /* We may be reversing an unordered compare to a normal compare,
18879 which is not valid in general (we may convert a non-trapping
18880 condition to a trapping one); however, on i386 we currently
18881 emit all comparisons unordered. */
18882 code = reverse_condition_maybe_unordered (code);
18883 }
18884 else
18885 {
18886 code = reverse_condition (code);
18887 if (compare_code != UNKNOWN)
18888 compare_code = reverse_condition (compare_code);
18889 }
18890 }
18891
18892 if (compare_code != UNKNOWN)
18893 {
18894 /* notl op1 (if needed)
18895 sarl $31, op1
18896 andl (cf-ct), op1
18897 addl ct, op1
18898
18899 For x < 0 (resp. x <= -1) there will be no notl,
18900 so if possible swap the constants to get rid of the
18901 complement.
18902 True/false will be -1/0 while code below (store flag
18903 followed by decrement) is 0/-1, so the constants need
18904 to be exchanged once more. */
18905
18906 if (compare_code == GE || !cf)
18907 {
18908 code = reverse_condition (code);
18909 compare_code = LT;
18910 }
18911 else
18912 {
18913 HOST_WIDE_INT tmp = cf;
18914 cf = ct;
18915 ct = tmp;
18916 }
18917
18918 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18919 }
18920 else
18921 {
18922 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18923
18924 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18925 constm1_rtx,
18926 copy_rtx (out), 1, OPTAB_DIRECT);
18927 }
18928
18929 out = expand_simple_binop (mode, AND, copy_rtx (out),
18930 gen_int_mode (cf - ct, mode),
18931 copy_rtx (out), 1, OPTAB_DIRECT);
18932 if (ct)
18933 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18934 copy_rtx (out), 1, OPTAB_DIRECT);
18935 if (!rtx_equal_p (out, operands[0]))
18936 emit_move_insn (operands[0], copy_rtx (out));
18937
18938 return true;
18939 }
18940 }
18941
18942 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18943 {
18944 /* Try a few things more with specific constants and a variable. */
18945
18946 optab op;
18947 rtx var, orig_out, out, tmp;
18948
18949 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18950 return false;
18951
18952 /* If one of the two operands is an interesting constant, load a
18953 constant with the above and mask it in with a logical operation. */
18954
18955 if (CONST_INT_P (operands[2]))
18956 {
18957 var = operands[3];
18958 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18959 operands[3] = constm1_rtx, op = and_optab;
18960 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
18961 operands[3] = const0_rtx, op = ior_optab;
18962 else
18963 return false;
18964 }
18965 else if (CONST_INT_P (operands[3]))
18966 {
18967 var = operands[2];
18968 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
18969 operands[2] = constm1_rtx, op = and_optab;
18970 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
18971 operands[2] = const0_rtx, op = ior_optab;
18972 else
18973 return false;
18974 }
18975 else
18976 return false;
18977
18978 orig_out = operands[0];
18979 tmp = gen_reg_rtx (mode);
18980 operands[0] = tmp;
18981
18982 /* Recurse to get the constant loaded. */
18983 if (ix86_expand_int_movcc (operands) == 0)
18984 return false;
18985
18986 /* Mask in the interesting variable. */
18987 out = expand_binop (mode, op, var, tmp, orig_out, 0,
18988 OPTAB_WIDEN);
18989 if (!rtx_equal_p (out, orig_out))
18990 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
18991
18992 return true;
18993 }
18994
18995 /*
18996 * For comparison with above,
18997 *
18998 * movl cf,dest
18999 * movl ct,tmp
19000 * cmpl op1,op2
19001 * cmovcc tmp,dest
19002 *
19003 * Size 15.
19004 */
19005
19006 if (! nonimmediate_operand (operands[2], mode))
19007 operands[2] = force_reg (mode, operands[2]);
19008 if (! nonimmediate_operand (operands[3], mode))
19009 operands[3] = force_reg (mode, operands[3]);
19010
19011 if (! register_operand (operands[2], VOIDmode)
19012 && (mode == QImode
19013 || ! register_operand (operands[3], VOIDmode)))
19014 operands[2] = force_reg (mode, operands[2]);
19015
19016 if (mode == QImode
19017 && ! register_operand (operands[3], VOIDmode))
19018 operands[3] = force_reg (mode, operands[3]);
19019
19020 emit_insn (compare_seq);
19021 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19022 gen_rtx_IF_THEN_ELSE (mode,
19023 compare_op, operands[2],
19024 operands[3])));
19025 return true;
19026 }
19027
19028 /* Swap, force into registers, or otherwise massage the two operands
19029 to an sse comparison with a mask result. Thus we differ a bit from
19030 ix86_prepare_fp_compare_args which expects to produce a flags result.
19031
19032 The DEST operand exists to help determine whether to commute commutative
19033 operators. The POP0/POP1 operands are updated in place. The new
19034 comparison code is returned, or UNKNOWN if not implementable. */
19035
19036 static enum rtx_code
19037 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19038 rtx *pop0, rtx *pop1)
19039 {
19040 rtx tmp;
19041
19042 switch (code)
19043 {
19044 case LTGT:
19045 case UNEQ:
19046 /* AVX supports all the needed comparisons. */
19047 if (TARGET_AVX)
19048 break;
19049 /* We have no LTGT as an operator. We could implement it with
19050 NE & ORDERED, but this requires an extra temporary. It's
19051 not clear that it's worth it. */
19052 return UNKNOWN;
19053
19054 case LT:
19055 case LE:
19056 case UNGT:
19057 case UNGE:
19058 /* These are supported directly. */
19059 break;
19060
19061 case EQ:
19062 case NE:
19063 case UNORDERED:
19064 case ORDERED:
19065 /* AVX has 3-operand comparisons; no need to swap anything. */
19066 if (TARGET_AVX)
19067 break;
19068 /* For commutative operators, try to canonicalize the destination
19069 operand to be first in the comparison - this helps reload to
19070 avoid extra moves. */
19071 if (!dest || !rtx_equal_p (dest, *pop1))
19072 break;
19073 /* FALLTHRU */
19074
19075 case GE:
19076 case GT:
19077 case UNLE:
19078 case UNLT:
19079 /* These are not supported directly before AVX, and furthermore
19080 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19081 comparison operands to transform into something that is
19082 supported. */
19083 tmp = *pop0;
19084 *pop0 = *pop1;
19085 *pop1 = tmp;
19086 code = swap_condition (code);
19087 break;
19088
19089 default:
19090 gcc_unreachable ();
19091 }
19092
19093 return code;
19094 }
19095
19096 /* Detect conditional moves that exactly match min/max operational
19097 semantics. Note that this is IEEE safe, as long as we don't
19098 interchange the operands.
19099
19100 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19101 and TRUE if the operation is successful and instructions are emitted. */
19102
19103 static bool
19104 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19105 rtx cmp_op1, rtx if_true, rtx if_false)
19106 {
19107 enum machine_mode mode;
19108 bool is_min;
19109 rtx tmp;
19110
19111 if (code == LT)
19112 ;
19113 else if (code == UNGE)
19114 {
19115 tmp = if_true;
19116 if_true = if_false;
19117 if_false = tmp;
19118 }
19119 else
19120 return false;
19121
19122 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19123 is_min = true;
19124 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19125 is_min = false;
19126 else
19127 return false;
19128
19129 mode = GET_MODE (dest);
19130
19131 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19132 but MODE may be a vector mode and thus not appropriate. */
19133 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19134 {
19135 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19136 rtvec v;
19137
19138 if_true = force_reg (mode, if_true);
19139 v = gen_rtvec (2, if_true, if_false);
19140 tmp = gen_rtx_UNSPEC (mode, v, u);
19141 }
19142 else
19143 {
19144 code = is_min ? SMIN : SMAX;
19145 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19146 }
19147
19148 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19149 return true;
19150 }
19151
19152 /* Expand an sse vector comparison. Return the register with the result. */
19153
19154 static rtx
19155 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19156 rtx op_true, rtx op_false)
19157 {
19158 enum machine_mode mode = GET_MODE (dest);
19159 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19160 rtx x;
19161
19162 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19163 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19164 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19165
19166 if (optimize
19167 || reg_overlap_mentioned_p (dest, op_true)
19168 || reg_overlap_mentioned_p (dest, op_false))
19169 dest = gen_reg_rtx (mode);
19170
19171 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19172 if (cmp_mode != mode)
19173 {
19174 x = force_reg (cmp_mode, x);
19175 convert_move (dest, x, false);
19176 }
19177 else
19178 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19179
19180 return dest;
19181 }
19182
19183 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19184 operations. This is used for both scalar and vector conditional moves. */
19185
19186 static void
19187 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19188 {
19189 enum machine_mode mode = GET_MODE (dest);
19190 rtx t2, t3, x;
19191
19192 if (vector_all_ones_operand (op_true, mode)
19193 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19194 {
19195 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19196 }
19197 else if (op_false == CONST0_RTX (mode))
19198 {
19199 op_true = force_reg (mode, op_true);
19200 x = gen_rtx_AND (mode, cmp, op_true);
19201 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19202 }
19203 else if (op_true == CONST0_RTX (mode))
19204 {
19205 op_false = force_reg (mode, op_false);
19206 x = gen_rtx_NOT (mode, cmp);
19207 x = gen_rtx_AND (mode, x, op_false);
19208 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19209 }
19210 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19211 {
19212 op_false = force_reg (mode, op_false);
19213 x = gen_rtx_IOR (mode, cmp, op_false);
19214 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19215 }
19216 else if (TARGET_XOP)
19217 {
19218 op_true = force_reg (mode, op_true);
19219
19220 if (!nonimmediate_operand (op_false, mode))
19221 op_false = force_reg (mode, op_false);
19222
19223 emit_insn (gen_rtx_SET (mode, dest,
19224 gen_rtx_IF_THEN_ELSE (mode, cmp,
19225 op_true,
19226 op_false)));
19227 }
19228 else
19229 {
19230 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19231
19232 if (!nonimmediate_operand (op_true, mode))
19233 op_true = force_reg (mode, op_true);
19234
19235 op_false = force_reg (mode, op_false);
19236
19237 switch (mode)
19238 {
19239 case V4SFmode:
19240 if (TARGET_SSE4_1)
19241 gen = gen_sse4_1_blendvps;
19242 break;
19243 case V2DFmode:
19244 if (TARGET_SSE4_1)
19245 gen = gen_sse4_1_blendvpd;
19246 break;
19247 case V16QImode:
19248 case V8HImode:
19249 case V4SImode:
19250 case V2DImode:
19251 if (TARGET_SSE4_1)
19252 {
19253 gen = gen_sse4_1_pblendvb;
19254 dest = gen_lowpart (V16QImode, dest);
19255 op_false = gen_lowpart (V16QImode, op_false);
19256 op_true = gen_lowpart (V16QImode, op_true);
19257 cmp = gen_lowpart (V16QImode, cmp);
19258 }
19259 break;
19260 case V8SFmode:
19261 if (TARGET_AVX)
19262 gen = gen_avx_blendvps256;
19263 break;
19264 case V4DFmode:
19265 if (TARGET_AVX)
19266 gen = gen_avx_blendvpd256;
19267 break;
19268 case V32QImode:
19269 case V16HImode:
19270 case V8SImode:
19271 case V4DImode:
19272 if (TARGET_AVX2)
19273 {
19274 gen = gen_avx2_pblendvb;
19275 dest = gen_lowpart (V32QImode, dest);
19276 op_false = gen_lowpart (V32QImode, op_false);
19277 op_true = gen_lowpart (V32QImode, op_true);
19278 cmp = gen_lowpart (V32QImode, cmp);
19279 }
19280 break;
19281 default:
19282 break;
19283 }
19284
19285 if (gen != NULL)
19286 emit_insn (gen (dest, op_false, op_true, cmp));
19287 else
19288 {
19289 op_true = force_reg (mode, op_true);
19290
19291 t2 = gen_reg_rtx (mode);
19292 if (optimize)
19293 t3 = gen_reg_rtx (mode);
19294 else
19295 t3 = dest;
19296
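/* No blend instruction is available here, so compute
   dest = (op_true & cmp) | (op_false & ~cmp), relying on CMP being an
   all-ones / all-zeros mask within each element.  */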
19297 x = gen_rtx_AND (mode, op_true, cmp);
19298 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19299
19300 x = gen_rtx_NOT (mode, cmp);
19301 x = gen_rtx_AND (mode, x, op_false);
19302 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19303
19304 x = gen_rtx_IOR (mode, t3, t2);
19305 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19306 }
19307 }
19308 }
19309
19310 /* Expand a floating-point conditional move. Return true if successful. */
19311
19312 bool
19313 ix86_expand_fp_movcc (rtx operands[])
19314 {
19315 enum machine_mode mode = GET_MODE (operands[0]);
19316 enum rtx_code code = GET_CODE (operands[1]);
19317 rtx tmp, compare_op;
19318 rtx op0 = XEXP (operands[1], 0);
19319 rtx op1 = XEXP (operands[1], 1);
19320
19321 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19322 {
19323 enum machine_mode cmode;
19324
19325 /* Since we've no cmove for sse registers, don't force bad register
19326 allocation just to gain access to it. Deny movcc when the
19327 comparison mode doesn't match the move mode. */
19328 cmode = GET_MODE (op0);
19329 if (cmode == VOIDmode)
19330 cmode = GET_MODE (op1);
19331 if (cmode != mode)
19332 return false;
19333
19334 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19335 if (code == UNKNOWN)
19336 return false;
19337
19338 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19339 operands[2], operands[3]))
19340 return true;
19341
19342 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19343 operands[2], operands[3]);
19344 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19345 return true;
19346 }
19347
19348 /* The floating point conditional move instructions don't directly
19349 support conditions resulting from a signed integer comparison. */
19350
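/* fcmov can only test conditions based on CF, ZF and PF (b, e, be, u
   and their negations); anything else is materialized below as a 0/1
   byte with setcc and re-tested with NE against zero.  */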
19351 compare_op = ix86_expand_compare (code, op0, op1);
19352 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19353 {
19354 tmp = gen_reg_rtx (QImode);
19355 ix86_expand_setcc (tmp, code, op0, op1);
19356
19357 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19358 }
19359
19360 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19361 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19362 operands[2], operands[3])));
19363
19364 return true;
19365 }
19366
19367 /* Expand a floating-point vector conditional move; a vcond operation
19368 rather than a movcc operation. */
19369
19370 bool
19371 ix86_expand_fp_vcond (rtx operands[])
19372 {
19373 enum rtx_code code = GET_CODE (operands[3]);
19374 rtx cmp;
19375
19376 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19377 &operands[4], &operands[5]);
19378 if (code == UNKNOWN)
19379 {
19380 rtx temp;
19381 switch (GET_CODE (operands[3]))
19382 {
19383 case LTGT:
19384 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19385 operands[5], operands[0], operands[0]);
19386 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19387 operands[5], operands[1], operands[2]);
19388 code = AND;
19389 break;
19390 case UNEQ:
19391 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19392 operands[5], operands[0], operands[0]);
19393 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19394 operands[5], operands[1], operands[2]);
19395 code = IOR;
19396 break;
19397 default:
19398 gcc_unreachable ();
19399 }
19400 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19401 OPTAB_DIRECT);
19402 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19403 return true;
19404 }
19405
19406 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19407 operands[5], operands[1], operands[2]))
19408 return true;
19409
19410 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19411 operands[1], operands[2]);
19412 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19413 return true;
19414 }
19415
19416 /* Expand a signed/unsigned integral vector conditional move. */
19417
19418 bool
19419 ix86_expand_int_vcond (rtx operands[])
19420 {
19421 enum machine_mode data_mode = GET_MODE (operands[0]);
19422 enum machine_mode mode = GET_MODE (operands[4]);
19423 enum rtx_code code = GET_CODE (operands[3]);
19424 bool negate = false;
19425 rtx x, cop0, cop1;
19426
19427 cop0 = operands[4];
19428 cop1 = operands[5];
19429
19430 /* XOP supports all of the comparisons on all vector int types. */
19431 if (!TARGET_XOP)
19432 {
19433 /* Canonicalize the comparison to EQ, GT, GTU. */
19434 switch (code)
19435 {
19436 case EQ:
19437 case GT:
19438 case GTU:
19439 break;
19440
19441 case NE:
19442 case LE:
19443 case LEU:
19444 code = reverse_condition (code);
19445 negate = true;
19446 break;
19447
19448 case GE:
19449 case GEU:
19450 code = reverse_condition (code);
19451 negate = true;
19452 /* FALLTHRU */
19453
19454 case LT:
19455 case LTU:
19456 code = swap_condition (code);
19457 x = cop0, cop0 = cop1, cop1 = x;
19458 break;
19459
19460 default:
19461 gcc_unreachable ();
19462 }
19463
19464 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19465 if (mode == V2DImode)
19466 {
19467 switch (code)
19468 {
19469 case EQ:
19470 /* SSE4.1 supports EQ. */
19471 if (!TARGET_SSE4_1)
19472 return false;
19473 break;
19474
19475 case GT:
19476 case GTU:
19477 /* SSE4.2 supports GT/GTU. */
19478 if (!TARGET_SSE4_2)
19479 return false;
19480 break;
19481
19482 default:
19483 gcc_unreachable ();
19484 }
19485 }
19486
19487 /* Unsigned parallel compare is not supported by the hardware.
19488 Play some tricks to turn this into a signed comparison
19489 against 0. */
19490 if (code == GTU)
19491 {
19492 cop0 = force_reg (mode, cop0);
19493
19494 switch (mode)
19495 {
19496 case V8SImode:
19497 case V4DImode:
19498 case V4SImode:
19499 case V2DImode:
19500 {
19501 rtx t1, t2, mask;
19502 rtx (*gen_sub3) (rtx, rtx, rtx);
19503
19504 switch (mode)
19505 {
19506 case V8SImode: gen_sub3 = gen_subv8si3; break;
19507 case V4DImode: gen_sub3 = gen_subv4di3; break;
19508 case V4SImode: gen_sub3 = gen_subv4si3; break;
19509 case V2DImode: gen_sub3 = gen_subv2di3; break;
19510 default:
19511 gcc_unreachable ();
19512 }
19513 /* Subtract (-(INT MAX) - 1) from both operands to make
19514 them signed. */
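/* Since the subtraction wraps around, subtracting the sign-bit mask is
   the same as flipping the sign bit, i.e. x >u y iff
   (x ^ signbit) >s (y ^ signbit).  */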
19515 mask = ix86_build_signbit_mask (mode, true, false);
19516 t1 = gen_reg_rtx (mode);
19517 emit_insn (gen_sub3 (t1, cop0, mask));
19518
19519 t2 = gen_reg_rtx (mode);
19520 emit_insn (gen_sub3 (t2, cop1, mask));
19521
19522 cop0 = t1;
19523 cop1 = t2;
19524 code = GT;
19525 }
19526 break;
19527
19528 case V32QImode:
19529 case V16HImode:
19530 case V16QImode:
19531 case V8HImode:
19532 /* Perform a parallel unsigned saturating subtraction. */
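/* a >u b iff the saturating difference a -us b is non-zero, so build
   an equal-to-zero mask below and let NEGATE invert the result.  */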
19533 x = gen_reg_rtx (mode);
19534 emit_insn (gen_rtx_SET (VOIDmode, x,
19535 gen_rtx_US_MINUS (mode, cop0, cop1)));
19536
19537 cop0 = x;
19538 cop1 = CONST0_RTX (mode);
19539 code = EQ;
19540 negate = !negate;
19541 break;
19542
19543 default:
19544 gcc_unreachable ();
19545 }
19546 }
19547 }
19548
19549 /* Allow the comparison to be done in one mode, but the movcc to
19550 happen in another mode. */
19551 if (data_mode == mode)
19552 {
19553 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19554 operands[1+negate], operands[2-negate]);
19555 }
19556 else
19557 {
19558 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19559 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19560 code, cop0, cop1,
19561 operands[1+negate], operands[2-negate]);
19562 x = gen_lowpart (data_mode, x);
19563 }
19564
19565 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19566 operands[2-negate]);
19567 return true;
19568 }
19569
19570 /* Expand a variable vector permutation. */
19571
19572 void
19573 ix86_expand_vec_perm (rtx operands[])
19574 {
19575 rtx target = operands[0];
19576 rtx op0 = operands[1];
19577 rtx op1 = operands[2];
19578 rtx mask = operands[3];
19579 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19580 enum machine_mode mode = GET_MODE (op0);
19581 enum machine_mode maskmode = GET_MODE (mask);
19582 int w, e, i;
19583 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19584
19585 /* Number of elements in the vector. */
19586 w = GET_MODE_NUNITS (mode);
19587 e = GET_MODE_UNIT_SIZE (mode);
19588 gcc_assert (w <= 32);
19589
19590 if (TARGET_AVX2)
19591 {
19592 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19593 {
19594 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19595 a constant shuffle operand. With a tiny bit of effort we can
19596 use VPERMD instead. A re-interpretation stall for V4DFmode is
19597 unfortunate but there's no avoiding it.
19598 Similarly for V16HImode we don't have instructions for variable
19599 shuffling, while for V32QImode we can, after preparing suitable
19600 masks, use vpshufb; vpshufb; vpermq; vpor. */
19601
19602 if (mode == V16HImode)
19603 {
19604 maskmode = mode = V32QImode;
19605 w = 32;
19606 e = 1;
19607 }
19608 else
19609 {
19610 maskmode = mode = V8SImode;
19611 w = 8;
19612 e = 4;
19613 }
19614 t1 = gen_reg_rtx (maskmode);
19615
19616 /* Replicate the low bits of the V4DImode mask into V8SImode:
19617 mask = { A B C D }
19618 t1 = { A A B B C C D D }. */
19619 for (i = 0; i < w / 2; ++i)
19620 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19621 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19622 vt = force_reg (maskmode, vt);
19623 mask = gen_lowpart (maskmode, mask);
19624 if (maskmode == V8SImode)
19625 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19626 else
19627 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19628
19629 /* Multiply the shuffle indices by two. */
19630 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19631 OPTAB_DIRECT);
19632
19633 /* Add one to the odd shuffle indices:
19634 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19635 for (i = 0; i < w / 2; ++i)
19636 {
19637 vec[i * 2] = const0_rtx;
19638 vec[i * 2 + 1] = const1_rtx;
19639 }
19640 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19641 vt = force_const_mem (maskmode, vt);
19642 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19643 OPTAB_DIRECT);
19644
19645 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19646 operands[3] = mask = t1;
19647 target = gen_lowpart (mode, target);
19648 op0 = gen_lowpart (mode, op0);
19649 op1 = gen_lowpart (mode, op1);
19650 }
19651
19652 switch (mode)
19653 {
19654 case V8SImode:
19655 /* The VPERMD and VPERMPS instructions already properly ignore
19656 the high bits of the shuffle elements. No need for us to
19657 perform an AND ourselves. */
19658 if (one_operand_shuffle)
19659 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19660 else
19661 {
19662 t1 = gen_reg_rtx (V8SImode);
19663 t2 = gen_reg_rtx (V8SImode);
19664 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19665 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19666 goto merge_two;
19667 }
19668 return;
19669
19670 case V8SFmode:
19671 mask = gen_lowpart (V8SFmode, mask);
19672 if (one_operand_shuffle)
19673 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19674 else
19675 {
19676 t1 = gen_reg_rtx (V8SFmode);
19677 t2 = gen_reg_rtx (V8SFmode);
19678 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19679 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19680 goto merge_two;
19681 }
19682 return;
19683
19684 case V4SImode:
19685 /* By combining the two 128-bit input vectors into one 256-bit
19686 input vector, we can use VPERMD and VPERMPS for the full
19687 two-operand shuffle. */
19688 t1 = gen_reg_rtx (V8SImode);
19689 t2 = gen_reg_rtx (V8SImode);
19690 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19691 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19692 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19693 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19694 return;
19695
19696 case V4SFmode:
19697 t1 = gen_reg_rtx (V8SFmode);
19698 t2 = gen_reg_rtx (V8SFmode);
19699 mask = gen_lowpart (V4SFmode, mask);
19700 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19701 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19702 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19703 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19704 return;
19705
19706 case V32QImode:
19707 t1 = gen_reg_rtx (V32QImode);
19708 t2 = gen_reg_rtx (V32QImode);
19709 t3 = gen_reg_rtx (V32QImode);
19710 vt2 = GEN_INT (128);
19711 for (i = 0; i < 32; i++)
19712 vec[i] = vt2;
19713 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19714 vt = force_reg (V32QImode, vt);
19715 for (i = 0; i < 32; i++)
19716 vec[i] = i < 16 ? vt2 : const0_rtx;
19717 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19718 vt2 = force_reg (V32QImode, vt2);
19719 /* From mask create two adjusted masks, which contain the same
19720 bits as mask in the low 7 bits of each vector element.
19721 The first mask will have the most significant bit clear
19722 if it requests an element from the same 128-bit lane
19723 and the MSB set if it requests an element from the other 128-bit lane.
19724 The second mask will have the opposite values of the MSB,
19725 and additionally will have its 128-bit lanes swapped.
19726 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19727 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19728 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19729 stands for the other 12 bytes. */
19730 /* The bit that says whether an element is from the same lane or the
19731 other lane is bit 4, so shift it up by 3 to the MSB position. */
19732 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19733 gen_lowpart (V4DImode, mask),
19734 GEN_INT (3)));
19735 /* Clear MSB bits from the mask just in case it had them set. */
19736 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19737 /* After this t1 will have MSB set for elements from other lane. */
19738 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19739 /* Clear bits other than MSB. */
19740 emit_insn (gen_andv32qi3 (t1, t1, vt));
19741 /* Or in the lower bits from mask into t3. */
19742 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19743 /* And invert MSB bits in t1, so MSB is set for elements from the same
19744 lane. */
19745 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19746 /* Swap 128-bit lanes in t3. */
19747 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19748 gen_lowpart (V4DImode, t3),
19749 const2_rtx, GEN_INT (3),
19750 const0_rtx, const1_rtx));
19751 /* And or in the lower bits from mask into t1. */
19752 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19753 if (one_operand_shuffle)
19754 {
19755 /* Each of these shuffles will put 0s in places where
19756 element from the other 128-bit lane is needed, otherwise
19757 will shuffle in the requested value. */
19758 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19759 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19760 /* For t3 the 128-bit lanes are swapped again. */
19761 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19762 gen_lowpart (V4DImode, t3),
19763 const2_rtx, GEN_INT (3),
19764 const0_rtx, const1_rtx));
19765 /* And oring both together leads to the result. */
19766 emit_insn (gen_iorv32qi3 (target, t1, t3));
19767 return;
19768 }
19769
19770 t4 = gen_reg_rtx (V32QImode);
19771 /* Similarly to the one_operand_shuffle code above,
19772 just repeated twice, once for each operand. The code at the
19773 merge_two: label will merge the two results together. */
19774 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19775 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19776 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19777 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19778 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19779 gen_lowpart (V4DImode, t4),
19780 const2_rtx, GEN_INT (3),
19781 const0_rtx, const1_rtx));
19782 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19783 gen_lowpart (V4DImode, t3),
19784 const2_rtx, GEN_INT (3),
19785 const0_rtx, const1_rtx));
19786 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19787 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19788 t1 = t4;
19789 t2 = t3;
19790 goto merge_two;
19791
19792 default:
19793 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19794 break;
19795 }
19796 }
19797
19798 if (TARGET_XOP)
19799 {
19800 /* The XOP VPPERM insn supports three inputs. By ignoring the
19801 one_operand_shuffle special case, we avoid creating another
19802 set of constant vectors in memory. */
19803 one_operand_shuffle = false;
19804
19805 /* mask = mask & {2*w-1, ...} */
19806 vt = GEN_INT (2*w - 1);
19807 }
19808 else
19809 {
19810 /* mask = mask & {w-1, ...} */
19811 vt = GEN_INT (w - 1);
19812 }
19813
19814 for (i = 0; i < w; i++)
19815 vec[i] = vt;
19816 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19817 mask = expand_simple_binop (maskmode, AND, mask, vt,
19818 NULL_RTX, 0, OPTAB_DIRECT);
19819
19820 /* For non-QImode operations, convert the word permutation control
19821 into a byte permutation control. */
19822 if (mode != V16QImode)
19823 {
19824 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19825 GEN_INT (exact_log2 (e)),
19826 NULL_RTX, 0, OPTAB_DIRECT);
19827
19828 /* Convert mask to vector of chars. */
19829 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19830
19831 /* Replicate each of the input bytes into byte positions:
19832 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19833 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19834 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
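/* E.g. a V4SImode control word of 2 becomes the byte controls
   { 8, 9, 10, 11 } after the shift above and the shuffle and add
   below.  */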
19835 for (i = 0; i < 16; ++i)
19836 vec[i] = GEN_INT (i/e * e);
19837 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19838 vt = force_const_mem (V16QImode, vt);
19839 if (TARGET_XOP)
19840 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19841 else
19842 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19843
19844 /* Convert it into the byte positions by doing
19845 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19846 for (i = 0; i < 16; ++i)
19847 vec[i] = GEN_INT (i % e);
19848 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19849 vt = force_const_mem (V16QImode, vt);
19850 emit_insn (gen_addv16qi3 (mask, mask, vt));
19851 }
19852
19853 /* The actual shuffle operations all operate on V16QImode. */
19854 op0 = gen_lowpart (V16QImode, op0);
19855 op1 = gen_lowpart (V16QImode, op1);
19856 target = gen_lowpart (V16QImode, target);
19857
19858 if (TARGET_XOP)
19859 {
19860 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19861 }
19862 else if (one_operand_shuffle)
19863 {
19864 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19865 }
19866 else
19867 {
19868 rtx xops[6];
19869 bool ok;
19870
19871 /* Shuffle the two input vectors independently. */
19872 t1 = gen_reg_rtx (V16QImode);
19873 t2 = gen_reg_rtx (V16QImode);
19874 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19875 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19876
19877 merge_two:
19878 /* Then merge them together. The key is whether any given control
19879 element contained a bit set that indicates the second word. */
19880 mask = operands[3];
19881 vt = GEN_INT (w);
19882 if (maskmode == V2DImode && !TARGET_SSE4_1)
19883 {
19884 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
19885 more shuffle to convert the V2DI input mask into a V4SI
19886 input mask, at which point the masking that expand_int_vcond
19887 does will work as desired. */
19888 rtx t3 = gen_reg_rtx (V4SImode);
19889 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19890 const0_rtx, const0_rtx,
19891 const2_rtx, const2_rtx));
19892 mask = t3;
19893 maskmode = V4SImode;
19894 e = w = 4;
19895 }
19896
19897 for (i = 0; i < w; i++)
19898 vec[i] = vt;
19899 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19900 vt = force_reg (maskmode, vt);
19901 mask = expand_simple_binop (maskmode, AND, mask, vt,
19902 NULL_RTX, 0, OPTAB_DIRECT);
19903
19904 xops[0] = gen_lowpart (mode, operands[0]);
19905 xops[1] = gen_lowpart (mode, t2);
19906 xops[2] = gen_lowpart (mode, t1);
19907 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
19908 xops[4] = mask;
19909 xops[5] = vt;
19910 ok = ix86_expand_int_vcond (xops);
19911 gcc_assert (ok);
19912 }
19913 }
19914
19915 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
19916 true if we should do zero extension, else sign extension. HIGH_P is
19917 true if we want the N/2 high elements, else the low elements. */
19918
19919 void
19920 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
19921 {
19922 enum machine_mode imode = GET_MODE (operands[1]);
19923 rtx tmp, dest;
19924
19925 if (TARGET_SSE4_1)
19926 {
19927 rtx (*unpack)(rtx, rtx);
19928 rtx (*extract)(rtx, rtx) = NULL;
19929 enum machine_mode halfmode = BLKmode;
19930
19931 switch (imode)
19932 {
19933 case V32QImode:
19934 if (unsigned_p)
19935 unpack = gen_avx2_zero_extendv16qiv16hi2;
19936 else
19937 unpack = gen_avx2_sign_extendv16qiv16hi2;
19938 halfmode = V16QImode;
19939 extract
19940 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
19941 break;
19942 case V16HImode:
19943 if (unsigned_p)
19944 unpack = gen_avx2_zero_extendv8hiv8si2;
19945 else
19946 unpack = gen_avx2_sign_extendv8hiv8si2;
19947 halfmode = V8HImode;
19948 extract
19949 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
19950 break;
19951 case V8SImode:
19952 if (unsigned_p)
19953 unpack = gen_avx2_zero_extendv4siv4di2;
19954 else
19955 unpack = gen_avx2_sign_extendv4siv4di2;
19956 halfmode = V4SImode;
19957 extract
19958 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
19959 break;
19960 case V16QImode:
19961 if (unsigned_p)
19962 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
19963 else
19964 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
19965 break;
19966 case V8HImode:
19967 if (unsigned_p)
19968 unpack = gen_sse4_1_zero_extendv4hiv4si2;
19969 else
19970 unpack = gen_sse4_1_sign_extendv4hiv4si2;
19971 break;
19972 case V4SImode:
19973 if (unsigned_p)
19974 unpack = gen_sse4_1_zero_extendv2siv2di2;
19975 else
19976 unpack = gen_sse4_1_sign_extendv2siv2di2;
19977 break;
19978 default:
19979 gcc_unreachable ();
19980 }
19981
19982 if (GET_MODE_SIZE (imode) == 32)
19983 {
19984 tmp = gen_reg_rtx (halfmode);
19985 emit_insn (extract (tmp, operands[1]));
19986 }
19987 else if (high_p)
19988 {
19989 /* Shift higher 8 bytes to lower 8 bytes. */
19990 tmp = gen_reg_rtx (imode);
19991 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
19992 gen_lowpart (V1TImode, operands[1]),
19993 GEN_INT (64)));
19994 }
19995 else
19996 tmp = operands[1];
19997
19998 emit_insn (unpack (operands[0], tmp));
19999 }
20000 else
20001 {
20002 rtx (*unpack)(rtx, rtx, rtx);
20003
20004 switch (imode)
20005 {
20006 case V16QImode:
20007 if (high_p)
20008 unpack = gen_vec_interleave_highv16qi;
20009 else
20010 unpack = gen_vec_interleave_lowv16qi;
20011 break;
20012 case V8HImode:
20013 if (high_p)
20014 unpack = gen_vec_interleave_highv8hi;
20015 else
20016 unpack = gen_vec_interleave_lowv8hi;
20017 break;
20018 case V4SImode:
20019 if (high_p)
20020 unpack = gen_vec_interleave_highv4si;
20021 else
20022 unpack = gen_vec_interleave_lowv4si;
20023 break;
20024 default:
20025 gcc_unreachable ();
20026 }
20027
20028 dest = gen_lowpart (imode, operands[0]);
20029
20030 if (unsigned_p)
20031 tmp = force_reg (imode, CONST0_RTX (imode));
20032 else
20033 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20034 operands[1], pc_rtx, pc_rtx);
20035
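/* TMP is either zero or a per-element sign mask (0 > x), so the
   interleave below performs zero extension or sign extension
   respectively.  */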
20036 emit_insn (unpack (dest, operands[1], tmp));
20037 }
20038 }
20039
20040 /* Expand conditional increment or decrement using adc/sbb instructions.
20041 The default case using setcc followed by the conditional move can be
20042 done by generic code. */
20043 bool
20044 ix86_expand_int_addcc (rtx operands[])
20045 {
20046 enum rtx_code code = GET_CODE (operands[1]);
20047 rtx flags;
20048 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20049 rtx compare_op;
20050 rtx val = const0_rtx;
20051 bool fpcmp = false;
20052 enum machine_mode mode;
20053 rtx op0 = XEXP (operands[1], 0);
20054 rtx op1 = XEXP (operands[1], 1);
20055
20056 if (operands[3] != const1_rtx
20057 && operands[3] != constm1_rtx)
20058 return false;
20059 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20060 return false;
20061 code = GET_CODE (compare_op);
20062
20063 flags = XEXP (compare_op, 0);
20064
20065 if (GET_MODE (flags) == CCFPmode
20066 || GET_MODE (flags) == CCFPUmode)
20067 {
20068 fpcmp = true;
20069 code = ix86_fp_compare_code_to_integer (code);
20070 }
20071
20072 if (code != LTU)
20073 {
20074 val = constm1_rtx;
20075 if (fpcmp)
20076 PUT_CODE (compare_op,
20077 reverse_condition_maybe_unordered
20078 (GET_CODE (compare_op)));
20079 else
20080 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20081 }
20082
20083 mode = GET_MODE (operands[0]);
20084
20085 /* Construct either adc or sbb insn. */
20086 if ((code == LTU) == (operands[3] == constm1_rtx))
20087 {
20088 switch (mode)
20089 {
20090 case QImode:
20091 insn = gen_subqi3_carry;
20092 break;
20093 case HImode:
20094 insn = gen_subhi3_carry;
20095 break;
20096 case SImode:
20097 insn = gen_subsi3_carry;
20098 break;
20099 case DImode:
20100 insn = gen_subdi3_carry;
20101 break;
20102 default:
20103 gcc_unreachable ();
20104 }
20105 }
20106 else
20107 {
20108 switch (mode)
20109 {
20110 case QImode:
20111 insn = gen_addqi3_carry;
20112 break;
20113 case HImode:
20114 insn = gen_addhi3_carry;
20115 break;
20116 case SImode:
20117 insn = gen_addsi3_carry;
20118 break;
20119 case DImode:
20120 insn = gen_adddi3_carry;
20121 break;
20122 default:
20123 gcc_unreachable ();
20124 }
20125 }
20126 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20127
20128 return true;
20129 }
20130
20131
20132 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20133 but works for floating point parameters and non-offsettable memories.
20134 For pushes, it returns just stack offsets; the values will be saved
20135 in the right order. At most four parts are generated. */
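/* For example (illustration only, derived from the code below): on a
   32-bit target a DFmode value splits into two SImode parts, XFmode into
   three and TFmode into four; on a 64-bit target XFmode/TFmode split into
   a DImode part plus an SImode or DImode upper part. */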
20136
20137 static int
20138 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20139 {
20140 int size;
20141
20142 if (!TARGET_64BIT)
20143 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20144 else
20145 size = (GET_MODE_SIZE (mode) + 4) / 8;
20146
20147 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20148 gcc_assert (size >= 2 && size <= 4);
20149
20150 /* Optimize constant pool references to immediates. This is used by fp
20151 moves, which force all constants to memory to allow combining. */
20152 if (MEM_P (operand) && MEM_READONLY_P (operand))
20153 {
20154 rtx tmp = maybe_get_pool_constant (operand);
20155 if (tmp)
20156 operand = tmp;
20157 }
20158
20159 if (MEM_P (operand) && !offsettable_memref_p (operand))
20160 {
20161 /* The only non-offsettable memories we handle are pushes. */
20162 int ok = push_operand (operand, VOIDmode);
20163
20164 gcc_assert (ok);
20165
20166 operand = copy_rtx (operand);
20167 PUT_MODE (operand, Pmode);
20168 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20169 return size;
20170 }
20171
20172 if (GET_CODE (operand) == CONST_VECTOR)
20173 {
20174 enum machine_mode imode = int_mode_for_mode (mode);
20175 /* Caution: if we looked through a constant pool memory above,
20176 the operand may actually have a different mode now. That's
20177 ok, since we want to pun this all the way back to an integer. */
20178 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20179 gcc_assert (operand != NULL);
20180 mode = imode;
20181 }
20182
20183 if (!TARGET_64BIT)
20184 {
20185 if (mode == DImode)
20186 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20187 else
20188 {
20189 int i;
20190
20191 if (REG_P (operand))
20192 {
20193 gcc_assert (reload_completed);
20194 for (i = 0; i < size; i++)
20195 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20196 }
20197 else if (offsettable_memref_p (operand))
20198 {
20199 operand = adjust_address (operand, SImode, 0);
20200 parts[0] = operand;
20201 for (i = 1; i < size; i++)
20202 parts[i] = adjust_address (operand, SImode, 4 * i);
20203 }
20204 else if (GET_CODE (operand) == CONST_DOUBLE)
20205 {
20206 REAL_VALUE_TYPE r;
20207 long l[4];
20208
20209 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20210 switch (mode)
20211 {
20212 case TFmode:
20213 real_to_target (l, &r, mode);
20214 parts[3] = gen_int_mode (l[3], SImode);
20215 parts[2] = gen_int_mode (l[2], SImode);
20216 break;
20217 case XFmode:
20218 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20219 parts[2] = gen_int_mode (l[2], SImode);
20220 break;
20221 case DFmode:
20222 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20223 break;
20224 default:
20225 gcc_unreachable ();
20226 }
20227 parts[1] = gen_int_mode (l[1], SImode);
20228 parts[0] = gen_int_mode (l[0], SImode);
20229 }
20230 else
20231 gcc_unreachable ();
20232 }
20233 }
20234 else
20235 {
20236 if (mode == TImode)
20237 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20238 if (mode == XFmode || mode == TFmode)
20239 {
20240 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20241 if (REG_P (operand))
20242 {
20243 gcc_assert (reload_completed);
20244 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20245 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20246 }
20247 else if (offsettable_memref_p (operand))
20248 {
20249 operand = adjust_address (operand, DImode, 0);
20250 parts[0] = operand;
20251 parts[1] = adjust_address (operand, upper_mode, 8);
20252 }
20253 else if (GET_CODE (operand) == CONST_DOUBLE)
20254 {
20255 REAL_VALUE_TYPE r;
20256 long l[4];
20257
20258 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20259 real_to_target (l, &r, mode);
20260
20261 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20262 if (HOST_BITS_PER_WIDE_INT >= 64)
20263 parts[0]
20264 = gen_int_mode
20265 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20266 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20267 DImode);
20268 else
20269 parts[0] = immed_double_const (l[0], l[1], DImode);
20270
20271 if (upper_mode == SImode)
20272 parts[1] = gen_int_mode (l[2], SImode);
20273 else if (HOST_BITS_PER_WIDE_INT >= 64)
20274 parts[1]
20275 = gen_int_mode
20276 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20277 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20278 DImode);
20279 else
20280 parts[1] = immed_double_const (l[2], l[3], DImode);
20281 }
20282 else
20283 gcc_unreachable ();
20284 }
20285 }
20286
20287 return size;
20288 }
20289
20290 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20291 The operands are split into parts internally: up to four destination
20292 parts are placed in operands 2-5 and the matching source parts in
20293 operands 6-9, in the correct order, before the moves are emitted. */
20294
20295 void
20296 ix86_split_long_move (rtx operands[])
20297 {
20298 rtx part[2][4];
20299 int nparts, i, j;
20300 int push = 0;
20301 int collisions = 0;
20302 enum machine_mode mode = GET_MODE (operands[0]);
20303 bool collisionparts[4];
20304
20305 /* The DFmode expanders may ask us to move a double.
20306 For a 64bit target this is a single move. By hiding that fact
20307 here we simplify the i386.md splitters. */
20308 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20309 {
20310 /* Optimize constant pool references to immediates. This is used by
20311 fp moves, which force all constants to memory to allow combining. */
20312
20313 if (MEM_P (operands[1])
20314 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20315 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20316 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20317 if (push_operand (operands[0], VOIDmode))
20318 {
20319 operands[0] = copy_rtx (operands[0]);
20320 PUT_MODE (operands[0], Pmode);
20321 }
20322 else
20323 operands[0] = gen_lowpart (DImode, operands[0]);
20324 operands[1] = gen_lowpart (DImode, operands[1]);
20325 emit_move_insn (operands[0], operands[1]);
20326 return;
20327 }
20328
20329 /* The only non-offsettable memory we handle is push. */
20330 if (push_operand (operands[0], VOIDmode))
20331 push = 1;
20332 else
20333 gcc_assert (!MEM_P (operands[0])
20334 || offsettable_memref_p (operands[0]));
20335
20336 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20337 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20338
20339 /* When emitting push, take care for source operands on the stack. */
20340 if (push && MEM_P (operands[1])
20341 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20342 {
20343 rtx src_base = XEXP (part[1][nparts - 1], 0);
20344
20345 /* Compensate for the stack decrement by 4. */
20346 if (!TARGET_64BIT && nparts == 3
20347 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20348 src_base = plus_constant (src_base, 4);
20349
20350 /* src_base refers to the stack pointer and is
20351 automatically decreased by emitted push. */
20352 for (i = 0; i < nparts; i++)
20353 part[1][i] = change_address (part[1][i],
20354 GET_MODE (part[1][i]), src_base);
20355 }
20356
20357 /* We need to do copy in the right order in case an address register
20358 of the source overlaps the destination. */
20359 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20360 {
20361 rtx tmp;
20362
20363 for (i = 0; i < nparts; i++)
20364 {
20365 collisionparts[i]
20366 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20367 if (collisionparts[i])
20368 collisions++;
20369 }
20370
20371 /* Collision in the middle part can be handled by reordering. */
20372 if (collisions == 1 && nparts == 3 && collisionparts [1])
20373 {
20374 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20375 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20376 }
20377 else if (collisions == 1
20378 && nparts == 4
20379 && (collisionparts [1] || collisionparts [2]))
20380 {
20381 if (collisionparts [1])
20382 {
20383 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20384 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20385 }
20386 else
20387 {
20388 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20389 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20390 }
20391 }
20392
20393 /* If there are more collisions, we can't handle it by reordering.
20394 Do an lea to the last part and use only one colliding move. */
20395 else if (collisions > 1)
20396 {
20397 rtx base;
20398
20399 collisions = 1;
20400
20401 base = part[0][nparts - 1];
20402
20403 /* Handle the case when the last part isn't valid for lea.
20404 Happens in 64-bit mode storing the 12-byte XFmode. */
20405 if (GET_MODE (base) != Pmode)
20406 base = gen_rtx_REG (Pmode, REGNO (base));
20407
20408 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20409 part[1][0] = replace_equiv_address (part[1][0], base);
20410 for (i = 1; i < nparts; i++)
20411 {
20412 tmp = plus_constant (base, UNITS_PER_WORD * i);
20413 part[1][i] = replace_equiv_address (part[1][i], tmp);
20414 }
20415 }
20416 }
20417
20418 if (push)
20419 {
20420 if (!TARGET_64BIT)
20421 {
20422 if (nparts == 3)
20423 {
20424 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20425 emit_insn (gen_addsi3 (stack_pointer_rtx,
20426 stack_pointer_rtx, GEN_INT (-4)));
20427 emit_move_insn (part[0][2], part[1][2]);
20428 }
20429 else if (nparts == 4)
20430 {
20431 emit_move_insn (part[0][3], part[1][3]);
20432 emit_move_insn (part[0][2], part[1][2]);
20433 }
20434 }
20435 else
20436 {
20437 /* In 64bit mode we don't have a 32bit push available. In case this is a
20438 register, it is OK - we will just use the larger counterpart. We also
20439 retype the memory - this comes from an attempt to avoid a REX prefix on
20440 moving the second half of a TFmode value. */
20441 if (GET_MODE (part[1][1]) == SImode)
20442 {
20443 switch (GET_CODE (part[1][1]))
20444 {
20445 case MEM:
20446 part[1][1] = adjust_address (part[1][1], DImode, 0);
20447 break;
20448
20449 case REG:
20450 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20451 break;
20452
20453 default:
20454 gcc_unreachable ();
20455 }
20456
20457 if (GET_MODE (part[1][0]) == SImode)
20458 part[1][0] = part[1][1];
20459 }
20460 }
20461 emit_move_insn (part[0][1], part[1][1]);
20462 emit_move_insn (part[0][0], part[1][0]);
20463 return;
20464 }
20465
20466 /* Choose correct order to not overwrite the source before it is copied. */
20467 if ((REG_P (part[0][0])
20468 && REG_P (part[1][1])
20469 && (REGNO (part[0][0]) == REGNO (part[1][1])
20470 || (nparts == 3
20471 && REGNO (part[0][0]) == REGNO (part[1][2]))
20472 || (nparts == 4
20473 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20474 || (collisions > 0
20475 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20476 {
20477 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20478 {
20479 operands[2 + i] = part[0][j];
20480 operands[6 + i] = part[1][j];
20481 }
20482 }
20483 else
20484 {
20485 for (i = 0; i < nparts; i++)
20486 {
20487 operands[2 + i] = part[0][i];
20488 operands[6 + i] = part[1][i];
20489 }
20490 }
20491
20492 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20493 if (optimize_insn_for_size_p ())
20494 {
20495 for (j = 0; j < nparts - 1; j++)
20496 if (CONST_INT_P (operands[6 + j])
20497 && operands[6 + j] != const0_rtx
20498 && REG_P (operands[2 + j]))
20499 for (i = j; i < nparts - 1; i++)
20500 if (CONST_INT_P (operands[7 + i])
20501 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20502 operands[7 + i] = operands[2 + j];
20503 }
20504
20505 for (i = 0; i < nparts; i++)
20506 emit_move_insn (operands[2 + i], operands[6 + i]);
20507
20508 return;
20509 }
20510
20511 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20512 left shift by a constant, either using a single shift or
20513 a sequence of add instructions. */
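/* Rough illustration (assuming a typical cost table): a shift left by 1
   becomes a single "add reg, reg"; a shift by a small constant N becomes
   N such adds when N * add cost does not exceed the constant-shift cost
   and we are not optimizing for size; otherwise a single shl by N is
   emitted. */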
20514
20515 static void
20516 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20517 {
20518 rtx (*insn)(rtx, rtx, rtx);
20519
20520 if (count == 1
20521 || (count * ix86_cost->add <= ix86_cost->shift_const
20522 && !optimize_insn_for_size_p ()))
20523 {
20524 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20525 while (count-- > 0)
20526 emit_insn (insn (operand, operand, operand));
20527 }
20528 else
20529 {
20530 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20531 emit_insn (insn (operand, operand, GEN_INT (count)));
20532 }
20533 }
20534
20535 void
20536 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20537 {
20538 rtx (*gen_ashl3)(rtx, rtx, rtx);
20539 rtx (*gen_shld)(rtx, rtx, rtx);
20540 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20541
20542 rtx low[2], high[2];
20543 int count;
20544
20545 if (CONST_INT_P (operands[2]))
20546 {
20547 split_double_mode (mode, operands, 2, low, high);
20548 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20549
20550 if (count >= half_width)
20551 {
20552 emit_move_insn (high[0], low[1]);
20553 emit_move_insn (low[0], const0_rtx);
20554
20555 if (count > half_width)
20556 ix86_expand_ashl_const (high[0], count - half_width, mode);
20557 }
20558 else
20559 {
20560 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20561
20562 if (!rtx_equal_p (operands[0], operands[1]))
20563 emit_move_insn (operands[0], operands[1]);
20564
20565 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20566 ix86_expand_ashl_const (low[0], count, mode);
20567 }
20568 return;
20569 }
20570
20571 split_double_mode (mode, operands, 1, low, high);
20572
20573 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20574
20575 if (operands[1] == const1_rtx)
20576 {
20577 /* Assuming we've chosen QImode-capable registers, then 1 << N
20578 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20579 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20580 {
20581 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20582
20583 ix86_expand_clear (low[0]);
20584 ix86_expand_clear (high[0]);
20585 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20586
20587 d = gen_lowpart (QImode, low[0]);
20588 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20589 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20590 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20591
20592 d = gen_lowpart (QImode, high[0]);
20593 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20594 s = gen_rtx_NE (QImode, flags, const0_rtx);
20595 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20596 }
20597
20598 /* Otherwise, we can get the same results by manually performing
20599 a bit extract operation on bit 5/6, and then performing the two
20600 shifts. The two methods of getting 0/1 into low/high are exactly
20601 the same size. Avoiding the shift in the bit extract case helps
20602 pentium4 a bit; no one else seems to care much either way. */
20603 else
20604 {
20605 enum machine_mode half_mode;
20606 rtx (*gen_lshr3)(rtx, rtx, rtx);
20607 rtx (*gen_and3)(rtx, rtx, rtx);
20608 rtx (*gen_xor3)(rtx, rtx, rtx);
20609 HOST_WIDE_INT bits;
20610 rtx x;
20611
20612 if (mode == DImode)
20613 {
20614 half_mode = SImode;
20615 gen_lshr3 = gen_lshrsi3;
20616 gen_and3 = gen_andsi3;
20617 gen_xor3 = gen_xorsi3;
20618 bits = 5;
20619 }
20620 else
20621 {
20622 half_mode = DImode;
20623 gen_lshr3 = gen_lshrdi3;
20624 gen_and3 = gen_anddi3;
20625 gen_xor3 = gen_xordi3;
20626 bits = 6;
20627 }
20628
20629 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20630 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20631 else
20632 x = gen_lowpart (half_mode, operands[2]);
20633 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20634
20635 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20636 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20637 emit_move_insn (low[0], high[0]);
20638 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20639 }
20640
20641 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20642 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20643 return;
20644 }
20645
20646 if (operands[1] == constm1_rtx)
20647 {
20648 /* For -1 << N, we can avoid the shld instruction, because we
20649 know that we're shifting 0...31/63 ones into a -1. */
20650 emit_move_insn (low[0], constm1_rtx);
20651 if (optimize_insn_for_size_p ())
20652 emit_move_insn (high[0], low[0]);
20653 else
20654 emit_move_insn (high[0], constm1_rtx);
20655 }
20656 else
20657 {
20658 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20659
20660 if (!rtx_equal_p (operands[0], operands[1]))
20661 emit_move_insn (operands[0], operands[1]);
20662
20663 split_double_mode (mode, operands, 1, low, high);
20664 emit_insn (gen_shld (high[0], low[0], operands[2]));
20665 }
20666
20667 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20668
20669 if (TARGET_CMOVE && scratch)
20670 {
20671 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20672 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20673
20674 ix86_expand_clear (scratch);
20675 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20676 }
20677 else
20678 {
20679 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20680 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20681
20682 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20683 }
20684 }
20685
20686 void
20687 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20688 {
20689 rtx (*gen_ashr3)(rtx, rtx, rtx)
20690 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20691 rtx (*gen_shrd)(rtx, rtx, rtx);
20692 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20693
20694 rtx low[2], high[2];
20695 int count;
20696
20697 if (CONST_INT_P (operands[2]))
20698 {
20699 split_double_mode (mode, operands, 2, low, high);
20700 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20701
20702 if (count == GET_MODE_BITSIZE (mode) - 1)
20703 {
20704 emit_move_insn (high[0], high[1]);
20705 emit_insn (gen_ashr3 (high[0], high[0],
20706 GEN_INT (half_width - 1)));
20707 emit_move_insn (low[0], high[0]);
20708
20709 }
20710 else if (count >= half_width)
20711 {
20712 emit_move_insn (low[0], high[1]);
20713 emit_move_insn (high[0], low[0]);
20714 emit_insn (gen_ashr3 (high[0], high[0],
20715 GEN_INT (half_width - 1)));
20716
20717 if (count > half_width)
20718 emit_insn (gen_ashr3 (low[0], low[0],
20719 GEN_INT (count - half_width)));
20720 }
20721 else
20722 {
20723 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20724
20725 if (!rtx_equal_p (operands[0], operands[1]))
20726 emit_move_insn (operands[0], operands[1]);
20727
20728 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20729 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20730 }
20731 }
20732 else
20733 {
20734 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20735
20736 if (!rtx_equal_p (operands[0], operands[1]))
20737 emit_move_insn (operands[0], operands[1]);
20738
20739 split_double_mode (mode, operands, 1, low, high);
20740
20741 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20742 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20743
20744 if (TARGET_CMOVE && scratch)
20745 {
20746 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20747 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20748
20749 emit_move_insn (scratch, high[0]);
20750 emit_insn (gen_ashr3 (scratch, scratch,
20751 GEN_INT (half_width - 1)));
20752 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20753 scratch));
20754 }
20755 else
20756 {
20757 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20758 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20759
20760 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20761 }
20762 }
20763 }
20764
20765 void
20766 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20767 {
20768 rtx (*gen_lshr3)(rtx, rtx, rtx)
20769 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20770 rtx (*gen_shrd)(rtx, rtx, rtx);
20771 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20772
20773 rtx low[2], high[2];
20774 int count;
20775
20776 if (CONST_INT_P (operands[2]))
20777 {
20778 split_double_mode (mode, operands, 2, low, high);
20779 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20780
20781 if (count >= half_width)
20782 {
20783 emit_move_insn (low[0], high[1]);
20784 ix86_expand_clear (high[0]);
20785
20786 if (count > half_width)
20787 emit_insn (gen_lshr3 (low[0], low[0],
20788 GEN_INT (count - half_width)));
20789 }
20790 else
20791 {
20792 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20793
20794 if (!rtx_equal_p (operands[0], operands[1]))
20795 emit_move_insn (operands[0], operands[1]);
20796
20797 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20798 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20799 }
20800 }
20801 else
20802 {
20803 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20804
20805 if (!rtx_equal_p (operands[0], operands[1]))
20806 emit_move_insn (operands[0], operands[1]);
20807
20808 split_double_mode (mode, operands, 1, low, high);
20809
20810 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20811 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20812
20813 if (TARGET_CMOVE && scratch)
20814 {
20815 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20816 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20817
20818 ix86_expand_clear (scratch);
20819 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20820 scratch));
20821 }
20822 else
20823 {
20824 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20825 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20826
20827 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20828 }
20829 }
20830 }
20831
20832 /* Predict just emitted jump instruction to be taken with probability PROB. */
20833 static void
20834 predict_jump (int prob)
20835 {
20836 rtx insn = get_last_insn ();
20837 gcc_assert (JUMP_P (insn));
20838 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20839 }
20840
20841 /* Helper function for the string operations below. Test whether VARIABLE
20842 is aligned to VALUE bytes. If it is, jump to the label. */
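/* Usage sketch (illustrative): the emitted test is (VARIABLE & VALUE) == 0;
   when the bit is clear the branch skips the fix-up code that the caller
   emits next, and the caller later places the returned label after that
   fix-up via emit_label. */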
20843 static rtx
20844 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20845 {
20846 rtx label = gen_label_rtx ();
20847 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20848 if (GET_MODE (variable) == DImode)
20849 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20850 else
20851 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20852 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20853 1, label);
20854 if (epilogue)
20855 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20856 else
20857 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20858 return label;
20859 }
20860
20861 /* Decrease COUNTREG by VALUE. */
20862 static void
20863 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20864 {
20865 rtx (*gen_add)(rtx, rtx, rtx)
20866 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20867
20868 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20869 }
20870
20871 /* Zero-extend EXP, which may be in SImode, into a Pmode register. */
20872 rtx
20873 ix86_zero_extend_to_Pmode (rtx exp)
20874 {
20875 rtx r;
20876 if (GET_MODE (exp) == VOIDmode)
20877 return force_reg (Pmode, exp);
20878 if (GET_MODE (exp) == Pmode)
20879 return copy_to_mode_reg (Pmode, exp);
20880 r = gen_reg_rtx (Pmode);
20881 emit_insn (gen_zero_extendsidi2 (r, exp));
20882 return r;
20883 }
20884
20885 /* Divide COUNTREG by SCALE. */
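/* Illustration only: SCALE is the chunk size and thus a power of two, so
   a register count is divided by a logical shift right by log2(SCALE),
   e.g. a byte count becomes a dword count via "shrl $2, %ecx". */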
20886 static rtx
20887 scale_counter (rtx countreg, int scale)
20888 {
20889 rtx sc;
20890
20891 if (scale == 1)
20892 return countreg;
20893 if (CONST_INT_P (countreg))
20894 return GEN_INT (INTVAL (countreg) / scale);
20895 gcc_assert (REG_P (countreg));
20896
20897 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
20898 GEN_INT (exact_log2 (scale)),
20899 NULL, 1, OPTAB_DIRECT);
20900 return sc;
20901 }
20902
20903 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
20904 DImode for constant loop counts. */
20905
20906 static enum machine_mode
20907 counter_mode (rtx count_exp)
20908 {
20909 if (GET_MODE (count_exp) != VOIDmode)
20910 return GET_MODE (count_exp);
20911 if (!CONST_INT_P (count_exp))
20912 return Pmode;
20913 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
20914 return DImode;
20915 return SImode;
20916 }
20917
20918 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
20919 to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
20920 overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
20921 the equivalent loop to set memory to VALUE (expected to be in MODE).
20922
20923 The size is rounded down to a whole number of chunks moved at once.
20924 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
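/* Emitted control flow, roughly (illustration only):

       size = count & ~(chunk_size * unroll - 1);
       if (size == 0) goto out;     // guard emitted only for 1-byte chunks
       iter = 0;
     top:
       copy or set UNROLL chunks at dest + iter (and src + iter);
       iter += chunk_size * unroll;
       if (iter < size) goto top;
       dest += iter; src += iter;
     out: */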
20925
20926
20927 static void
20928 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
20929 rtx destptr, rtx srcptr, rtx value,
20930 rtx count, enum machine_mode mode, int unroll,
20931 int expected_size)
20932 {
20933 rtx out_label, top_label, iter, tmp;
20934 enum machine_mode iter_mode = counter_mode (count);
20935 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
20936 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
20937 rtx size;
20938 rtx x_addr;
20939 rtx y_addr;
20940 int i;
20941
20942 top_label = gen_label_rtx ();
20943 out_label = gen_label_rtx ();
20944 iter = gen_reg_rtx (iter_mode);
20945
20946 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
20947 NULL, 1, OPTAB_DIRECT);
20948 /* Those two should combine. */
20949 if (piece_size == const1_rtx)
20950 {
20951 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
20952 true, out_label);
20953 predict_jump (REG_BR_PROB_BASE * 10 / 100);
20954 }
20955 emit_move_insn (iter, const0_rtx);
20956
20957 emit_label (top_label);
20958
20959 tmp = convert_modes (Pmode, iter_mode, iter, true);
20960 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
20961 destmem = change_address (destmem, mode, x_addr);
20962
20963 if (srcmem)
20964 {
20965 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
20966 srcmem = change_address (srcmem, mode, y_addr);
20967
20968 /* When unrolling for chips that reorder memory reads and writes,
20969 we can save registers by using a single temporary.
20970 Also, using 4 temporaries is overkill in 32bit mode. */
20971 if (!TARGET_64BIT && 0)
20972 {
20973 for (i = 0; i < unroll; i++)
20974 {
20975 if (i)
20976 {
20977 destmem =
20978 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
20979 srcmem =
20980 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20981 }
20982 emit_move_insn (destmem, srcmem);
20983 }
20984 }
20985 else
20986 {
20987 rtx tmpreg[4];
20988 gcc_assert (unroll <= 4);
20989 for (i = 0; i < unroll; i++)
20990 {
20991 tmpreg[i] = gen_reg_rtx (mode);
20992 if (i)
20993 {
20994 srcmem =
20995 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
20996 }
20997 emit_move_insn (tmpreg[i], srcmem);
20998 }
20999 for (i = 0; i < unroll; i++)
21000 {
21001 if (i)
21002 {
21003 destmem =
21004 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21005 }
21006 emit_move_insn (destmem, tmpreg[i]);
21007 }
21008 }
21009 }
21010 else
21011 for (i = 0; i < unroll; i++)
21012 {
21013 if (i)
21014 destmem =
21015 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21016 emit_move_insn (destmem, value);
21017 }
21018
21019 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21020 true, OPTAB_LIB_WIDEN);
21021 if (tmp != iter)
21022 emit_move_insn (iter, tmp);
21023
21024 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21025 true, top_label);
21026 if (expected_size != -1)
21027 {
21028 expected_size /= GET_MODE_SIZE (mode) * unroll;
21029 if (expected_size == 0)
21030 predict_jump (0);
21031 else if (expected_size > REG_BR_PROB_BASE)
21032 predict_jump (REG_BR_PROB_BASE - 1);
21033 else
21034 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21035 }
21036 else
21037 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21038 iter = ix86_zero_extend_to_Pmode (iter);
21039 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21040 true, OPTAB_LIB_WIDEN);
21041 if (tmp != destptr)
21042 emit_move_insn (destptr, tmp);
21043 if (srcptr)
21044 {
21045 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21046 true, OPTAB_LIB_WIDEN);
21047 if (tmp != srcptr)
21048 emit_move_insn (srcptr, tmp);
21049 }
21050 emit_label (out_label);
21051 }
21052
21053 /* Output "rep; mov" instruction.
21054 Arguments have the same meaning as for the previous function. */
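/* Illustration only: for a known count of 32 bytes with SImode chunks the
   emitted pattern corresponds to "movl $8, %ecx; rep movsl"; DESTEXP and
   SRCEXP describe the final pointer values, ptr + (countreg << log2(chunk)),
   for the rep_mov pattern. */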
21055 static void
21056 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21057 rtx destptr, rtx srcptr,
21058 rtx count,
21059 enum machine_mode mode)
21060 {
21061 rtx destexp;
21062 rtx srcexp;
21063 rtx countreg;
21064 HOST_WIDE_INT rounded_count;
21065
21066 /* If the size is known and a multiple of 4, it is shorter to use SImode rep movs. */
21067 if (mode == QImode && CONST_INT_P (count)
21068 && !(INTVAL (count) & 3))
21069 mode = SImode;
21070
21071 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21072 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21073 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21074 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21075 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21076 if (mode != QImode)
21077 {
21078 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21079 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21080 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21081 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21082 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21083 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21084 }
21085 else
21086 {
21087 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21088 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21089 }
21090 if (CONST_INT_P (count))
21091 {
21092 rounded_count = (INTVAL (count)
21093 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21094 destmem = shallow_copy_rtx (destmem);
21095 srcmem = shallow_copy_rtx (srcmem);
21096 set_mem_size (destmem, rounded_count);
21097 set_mem_size (srcmem, rounded_count);
21098 }
21099 else
21100 {
21101 if (MEM_SIZE_KNOWN_P (destmem))
21102 clear_mem_size (destmem);
21103 if (MEM_SIZE_KNOWN_P (srcmem))
21104 clear_mem_size (srcmem);
21105 }
21106 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21107 destexp, srcexp));
21108 }
21109
21110 /* Output "rep; stos" instruction.
21111 Arguments have the same meaning as for the previous function. */
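/* Illustration only: clearing 64 bytes with SImode chunks corresponds to
   "movl $16, %ecx; rep stosl" with the (already broadcast) value in %eax;
   DESTEXP again describes the final destination pointer. */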
21112 static void
21113 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21114 rtx count, enum machine_mode mode,
21115 rtx orig_value)
21116 {
21117 rtx destexp;
21118 rtx countreg;
21119 HOST_WIDE_INT rounded_count;
21120
21121 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21122 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21123 value = force_reg (mode, gen_lowpart (mode, value));
21124 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21125 if (mode != QImode)
21126 {
21127 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21128 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21129 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21130 }
21131 else
21132 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21133 if (orig_value == const0_rtx && CONST_INT_P (count))
21134 {
21135 rounded_count = (INTVAL (count)
21136 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21137 destmem = shallow_copy_rtx (destmem);
21138 set_mem_size (destmem, rounded_count);
21139 }
21140 else if (MEM_SIZE_KNOWN_P (destmem))
21141 clear_mem_size (destmem);
21142 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21143 }
21144
21145 static void
21146 emit_strmov (rtx destmem, rtx srcmem,
21147 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21148 {
21149 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21150 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21151 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21152 }
21153
21154 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21155 static void
21156 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21157 rtx destptr, rtx srcptr, rtx count, int max_size)
21158 {
21159 rtx src, dest;
21160 if (CONST_INT_P (count))
21161 {
21162 HOST_WIDE_INT countval = INTVAL (count);
21163 int offset = 0;
21164
21165 if ((countval & 0x10) && max_size > 16)
21166 {
21167 if (TARGET_64BIT)
21168 {
21169 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21170 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21171 }
21172 else
21173 gcc_unreachable ();
21174 offset += 16;
21175 }
21176 if ((countval & 0x08) && max_size > 8)
21177 {
21178 if (TARGET_64BIT)
21179 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21180 else
21181 {
21182 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21183 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21184 }
21185 offset += 8;
21186 }
21187 if ((countval & 0x04) && max_size > 4)
21188 {
21189 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21190 offset += 4;
21191 }
21192 if ((countval & 0x02) && max_size > 2)
21193 {
21194 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21195 offset += 2;
21196 }
21197 if ((countval & 0x01) && max_size > 1)
21198 {
21199 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21200 offset += 1;
21201 }
21202 return;
21203 }
21204 if (max_size > 8)
21205 {
21206 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21207 count, 1, OPTAB_DIRECT);
21208 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21209 count, QImode, 1, 4);
21210 return;
21211 }
21212
21213 /* When single-instruction stringops are available, we can cheaply advance
21214 the dest and src pointers. Otherwise we save code size by maintaining an
21215 offset (zero is readily available from the preceding rep operation) and
21216 using x86 addressing modes. */
21217 if (TARGET_SINGLE_STRINGOP)
21218 {
21219 if (max_size > 4)
21220 {
21221 rtx label = ix86_expand_aligntest (count, 4, true);
21222 src = change_address (srcmem, SImode, srcptr);
21223 dest = change_address (destmem, SImode, destptr);
21224 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21225 emit_label (label);
21226 LABEL_NUSES (label) = 1;
21227 }
21228 if (max_size > 2)
21229 {
21230 rtx label = ix86_expand_aligntest (count, 2, true);
21231 src = change_address (srcmem, HImode, srcptr);
21232 dest = change_address (destmem, HImode, destptr);
21233 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21234 emit_label (label);
21235 LABEL_NUSES (label) = 1;
21236 }
21237 if (max_size > 1)
21238 {
21239 rtx label = ix86_expand_aligntest (count, 1, true);
21240 src = change_address (srcmem, QImode, srcptr);
21241 dest = change_address (destmem, QImode, destptr);
21242 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21243 emit_label (label);
21244 LABEL_NUSES (label) = 1;
21245 }
21246 }
21247 else
21248 {
21249 rtx offset = force_reg (Pmode, const0_rtx);
21250 rtx tmp;
21251
21252 if (max_size > 4)
21253 {
21254 rtx label = ix86_expand_aligntest (count, 4, true);
21255 src = change_address (srcmem, SImode, srcptr);
21256 dest = change_address (destmem, SImode, destptr);
21257 emit_move_insn (dest, src);
21258 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21259 true, OPTAB_LIB_WIDEN);
21260 if (tmp != offset)
21261 emit_move_insn (offset, tmp);
21262 emit_label (label);
21263 LABEL_NUSES (label) = 1;
21264 }
21265 if (max_size > 2)
21266 {
21267 rtx label = ix86_expand_aligntest (count, 2, true);
21268 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21269 src = change_address (srcmem, HImode, tmp);
21270 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21271 dest = change_address (destmem, HImode, tmp);
21272 emit_move_insn (dest, src);
21273 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21274 true, OPTAB_LIB_WIDEN);
21275 if (tmp != offset)
21276 emit_move_insn (offset, tmp);
21277 emit_label (label);
21278 LABEL_NUSES (label) = 1;
21279 }
21280 if (max_size > 1)
21281 {
21282 rtx label = ix86_expand_aligntest (count, 1, true);
21283 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21284 src = change_address (srcmem, QImode, tmp);
21285 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21286 dest = change_address (destmem, QImode, tmp);
21287 emit_move_insn (dest, src);
21288 emit_label (label);
21289 LABEL_NUSES (label) = 1;
21290 }
21291 }
21292 }
21293
21294 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21295 static void
21296 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21297 rtx count, int max_size)
21298 {
21299 count =
21300 expand_simple_binop (counter_mode (count), AND, count,
21301 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21302 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21303 gen_lowpart (QImode, value), count, QImode,
21304 1, max_size / 2);
21305 }
21306
21307 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21308 static void
21309 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21310 {
21311 rtx dest;
21312
21313 if (CONST_INT_P (count))
21314 {
21315 HOST_WIDE_INT countval = INTVAL (count);
21316 int offset = 0;
21317
21318 if ((countval & 0x10) && max_size > 16)
21319 {
21320 if (TARGET_64BIT)
21321 {
21322 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21323 emit_insn (gen_strset (destptr, dest, value));
21324 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21325 emit_insn (gen_strset (destptr, dest, value));
21326 }
21327 else
21328 gcc_unreachable ();
21329 offset += 16;
21330 }
21331 if ((countval & 0x08) && max_size > 8)
21332 {
21333 if (TARGET_64BIT)
21334 {
21335 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21336 emit_insn (gen_strset (destptr, dest, value));
21337 }
21338 else
21339 {
21340 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21341 emit_insn (gen_strset (destptr, dest, value));
21342 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21343 emit_insn (gen_strset (destptr, dest, value));
21344 }
21345 offset += 8;
21346 }
21347 if ((countval & 0x04) && max_size > 4)
21348 {
21349 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21350 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21351 offset += 4;
21352 }
21353 if ((countval & 0x02) && max_size > 2)
21354 {
21355 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21356 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21357 offset += 2;
21358 }
21359 if ((countval & 0x01) && max_size > 1)
21360 {
21361 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21362 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21363 offset += 1;
21364 }
21365 return;
21366 }
21367 if (max_size > 32)
21368 {
21369 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21370 return;
21371 }
21372 if (max_size > 16)
21373 {
21374 rtx label = ix86_expand_aligntest (count, 16, true);
21375 if (TARGET_64BIT)
21376 {
21377 dest = change_address (destmem, DImode, destptr);
21378 emit_insn (gen_strset (destptr, dest, value));
21379 emit_insn (gen_strset (destptr, dest, value));
21380 }
21381 else
21382 {
21383 dest = change_address (destmem, SImode, destptr);
21384 emit_insn (gen_strset (destptr, dest, value));
21385 emit_insn (gen_strset (destptr, dest, value));
21386 emit_insn (gen_strset (destptr, dest, value));
21387 emit_insn (gen_strset (destptr, dest, value));
21388 }
21389 emit_label (label);
21390 LABEL_NUSES (label) = 1;
21391 }
21392 if (max_size > 8)
21393 {
21394 rtx label = ix86_expand_aligntest (count, 8, true);
21395 if (TARGET_64BIT)
21396 {
21397 dest = change_address (destmem, DImode, destptr);
21398 emit_insn (gen_strset (destptr, dest, value));
21399 }
21400 else
21401 {
21402 dest = change_address (destmem, SImode, destptr);
21403 emit_insn (gen_strset (destptr, dest, value));
21404 emit_insn (gen_strset (destptr, dest, value));
21405 }
21406 emit_label (label);
21407 LABEL_NUSES (label) = 1;
21408 }
21409 if (max_size > 4)
21410 {
21411 rtx label = ix86_expand_aligntest (count, 4, true);
21412 dest = change_address (destmem, SImode, destptr);
21413 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21414 emit_label (label);
21415 LABEL_NUSES (label) = 1;
21416 }
21417 if (max_size > 2)
21418 {
21419 rtx label = ix86_expand_aligntest (count, 2, true);
21420 dest = change_address (destmem, HImode, destptr);
21421 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21422 emit_label (label);
21423 LABEL_NUSES (label) = 1;
21424 }
21425 if (max_size > 1)
21426 {
21427 rtx label = ix86_expand_aligntest (count, 1, true);
21428 dest = change_address (destmem, QImode, destptr);
21429 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21430 emit_label (label);
21431 LABEL_NUSES (label) = 1;
21432 }
21433 }
21434
21435 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21436 to DESIRED_ALIGNMENT. */
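/* Illustration only: with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this can
   emit up to three guarded single-element copies (1, 2 and then 4 bytes),
   each preceded by an alignment test of the destination pointer, and
   COUNT is decremented accordingly. */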
21437 static void
21438 expand_movmem_prologue (rtx destmem, rtx srcmem,
21439 rtx destptr, rtx srcptr, rtx count,
21440 int align, int desired_alignment)
21441 {
21442 if (align <= 1 && desired_alignment > 1)
21443 {
21444 rtx label = ix86_expand_aligntest (destptr, 1, false);
21445 srcmem = change_address (srcmem, QImode, srcptr);
21446 destmem = change_address (destmem, QImode, destptr);
21447 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21448 ix86_adjust_counter (count, 1);
21449 emit_label (label);
21450 LABEL_NUSES (label) = 1;
21451 }
21452 if (align <= 2 && desired_alignment > 2)
21453 {
21454 rtx label = ix86_expand_aligntest (destptr, 2, false);
21455 srcmem = change_address (srcmem, HImode, srcptr);
21456 destmem = change_address (destmem, HImode, destptr);
21457 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21458 ix86_adjust_counter (count, 2);
21459 emit_label (label);
21460 LABEL_NUSES (label) = 1;
21461 }
21462 if (align <= 4 && desired_alignment > 4)
21463 {
21464 rtx label = ix86_expand_aligntest (destptr, 4, false);
21465 srcmem = change_address (srcmem, SImode, srcptr);
21466 destmem = change_address (destmem, SImode, destptr);
21467 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21468 ix86_adjust_counter (count, 4);
21469 emit_label (label);
21470 LABEL_NUSES (label) = 1;
21471 }
21472 gcc_assert (desired_alignment <= 8);
21473 }
21474
21475 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21476 ALIGN_BYTES is how many bytes need to be copied. */
21477 static rtx
21478 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21479 int desired_align, int align_bytes)
21480 {
21481 rtx src = *srcp;
21482 rtx orig_dst = dst;
21483 rtx orig_src = src;
21484 int off = 0;
21485 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21486 if (src_align_bytes >= 0)
21487 src_align_bytes = desired_align - src_align_bytes;
21488 if (align_bytes & 1)
21489 {
21490 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21491 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21492 off = 1;
21493 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21494 }
21495 if (align_bytes & 2)
21496 {
21497 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21498 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21499 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21500 set_mem_align (dst, 2 * BITS_PER_UNIT);
21501 if (src_align_bytes >= 0
21502 && (src_align_bytes & 1) == (align_bytes & 1)
21503 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21504 set_mem_align (src, 2 * BITS_PER_UNIT);
21505 off = 2;
21506 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21507 }
21508 if (align_bytes & 4)
21509 {
21510 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21511 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21512 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21513 set_mem_align (dst, 4 * BITS_PER_UNIT);
21514 if (src_align_bytes >= 0)
21515 {
21516 unsigned int src_align = 0;
21517 if ((src_align_bytes & 3) == (align_bytes & 3))
21518 src_align = 4;
21519 else if ((src_align_bytes & 1) == (align_bytes & 1))
21520 src_align = 2;
21521 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21522 set_mem_align (src, src_align * BITS_PER_UNIT);
21523 }
21524 off = 4;
21525 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21526 }
21527 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21528 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21529 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21530 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21531 if (src_align_bytes >= 0)
21532 {
21533 unsigned int src_align = 0;
21534 if ((src_align_bytes & 7) == (align_bytes & 7))
21535 src_align = 8;
21536 else if ((src_align_bytes & 3) == (align_bytes & 3))
21537 src_align = 4;
21538 else if ((src_align_bytes & 1) == (align_bytes & 1))
21539 src_align = 2;
21540 if (src_align > (unsigned int) desired_align)
21541 src_align = desired_align;
21542 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21543 set_mem_align (src, src_align * BITS_PER_UNIT);
21544 }
21545 if (MEM_SIZE_KNOWN_P (orig_dst))
21546 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21547 if (MEM_SIZE_KNOWN_P (orig_src))
21548 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21549 *srcp = src;
21550 return dst;
21551 }
21552
21553 /* Store enough into DEST to align DEST, known to be aligned by ALIGN,
21554 to DESIRED_ALIGNMENT. */
21555 static void
21556 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21557 int align, int desired_alignment)
21558 {
21559 if (align <= 1 && desired_alignment > 1)
21560 {
21561 rtx label = ix86_expand_aligntest (destptr, 1, false);
21562 destmem = change_address (destmem, QImode, destptr);
21563 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21564 ix86_adjust_counter (count, 1);
21565 emit_label (label);
21566 LABEL_NUSES (label) = 1;
21567 }
21568 if (align <= 2 && desired_alignment > 2)
21569 {
21570 rtx label = ix86_expand_aligntest (destptr, 2, false);
21571 destmem = change_address (destmem, HImode, destptr);
21572 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21573 ix86_adjust_counter (count, 2);
21574 emit_label (label);
21575 LABEL_NUSES (label) = 1;
21576 }
21577 if (align <= 4 && desired_alignment > 4)
21578 {
21579 rtx label = ix86_expand_aligntest (destptr, 4, false);
21580 destmem = change_address (destmem, SImode, destptr);
21581 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21582 ix86_adjust_counter (count, 4);
21583 emit_label (label);
21584 LABEL_NUSES (label) = 1;
21585 }
21586 gcc_assert (desired_alignment <= 8);
21587 }
21588
21589 /* Store enough into DST to align DST to DESIRED_ALIGN.
21590 ALIGN_BYTES is how many bytes need to be stored. */
21591 static rtx
21592 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21593 int desired_align, int align_bytes)
21594 {
21595 int off = 0;
21596 rtx orig_dst = dst;
21597 if (align_bytes & 1)
21598 {
21599 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21600 off = 1;
21601 emit_insn (gen_strset (destreg, dst,
21602 gen_lowpart (QImode, value)));
21603 }
21604 if (align_bytes & 2)
21605 {
21606 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21607 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21608 set_mem_align (dst, 2 * BITS_PER_UNIT);
21609 off = 2;
21610 emit_insn (gen_strset (destreg, dst,
21611 gen_lowpart (HImode, value)));
21612 }
21613 if (align_bytes & 4)
21614 {
21615 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21616 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21617 set_mem_align (dst, 4 * BITS_PER_UNIT);
21618 off = 4;
21619 emit_insn (gen_strset (destreg, dst,
21620 gen_lowpart (SImode, value)));
21621 }
21622 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21623 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21624 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21625 if (MEM_SIZE_KNOWN_P (orig_dst))
21626 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21627 return dst;
21628 }
21629
21630 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
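/* Decision sketch (illustrative, not the authoritative policy): an
   explicit -mstringop-strategy wins when usable; when optimizing for
   size a rep-prefixed variant (or a byte loop) is chosen; otherwise the
   per-size table of the active cost model picks the algorithm, falling
   back to its unknown-size entry or to a libcall. */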
21631 static enum stringop_alg
21632 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21633 int *dynamic_check)
21634 {
21635 const struct stringop_algs * algs;
21636 bool optimize_for_speed;
21637 /* Algorithms using the rep prefix want at least edi and ecx;
21638 additionally, memset wants eax and memcpy wants esi. Don't
21639 consider such algorithms if the user has appropriated those
21640 registers for their own purposes. */
21641 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21642 || (memset
21643 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21644
21645 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21646 || (alg != rep_prefix_1_byte \
21647 && alg != rep_prefix_4_byte \
21648 && alg != rep_prefix_8_byte))
21649 const struct processor_costs *cost;
21650
21651 /* Even if the string operation call is cold, we still might spend a lot
21652 of time processing large blocks. */
21653 if (optimize_function_for_size_p (cfun)
21654 || (optimize_insn_for_size_p ()
21655 && expected_size != -1 && expected_size < 256))
21656 optimize_for_speed = false;
21657 else
21658 optimize_for_speed = true;
21659
21660 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21661
21662 *dynamic_check = -1;
21663 if (memset)
21664 algs = &cost->memset[TARGET_64BIT != 0];
21665 else
21666 algs = &cost->memcpy[TARGET_64BIT != 0];
21667 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21668 return ix86_stringop_alg;
21669 /* rep; movq or rep; movl is the smallest variant. */
21670 else if (!optimize_for_speed)
21671 {
21672 if (!count || (count & 3))
21673 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21674 else
21675 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21676 }
21677 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
21678 */
21679 else if (expected_size != -1 && expected_size < 4)
21680 return loop_1_byte;
21681 else if (expected_size != -1)
21682 {
21683 unsigned int i;
21684 enum stringop_alg alg = libcall;
21685 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21686 {
21687 /* We get here if the algorithms that were not libcall-based
21688 were rep-prefix based and we are unable to use rep prefixes
21689 based on global register usage. Break out of the loop and
21690 use the heuristic below. */
21691 if (algs->size[i].max == 0)
21692 break;
21693 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21694 {
21695 enum stringop_alg candidate = algs->size[i].alg;
21696
21697 if (candidate != libcall && ALG_USABLE_P (candidate))
21698 alg = candidate;
21699 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21700 last non-libcall inline algorithm. */
21701 if (TARGET_INLINE_ALL_STRINGOPS)
21702 {
21703 /* When the current size is best to be copied by a libcall,
21704 but we are still forced to inline, run the heuristic below
21705 that will pick code for medium sized blocks. */
21706 if (alg != libcall)
21707 return alg;
21708 break;
21709 }
21710 else if (ALG_USABLE_P (candidate))
21711 return candidate;
21712 }
21713 }
21714 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21715 }
21716 /* When asked to inline the call anyway, try to pick a meaningful choice.
21717 We look for the maximal size of block that is faster to copy by hand and
21718 take blocks of at most that size, guessing that the average size will
21719 be roughly half of the block.
21720
21721 If this turns out to be bad, we might simply specify the preferred
21722 choice in ix86_costs. */
21723 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21724 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21725 {
21726 int max = -1;
21727 enum stringop_alg alg;
21728 int i;
21729 bool any_alg_usable_p = true;
21730
21731 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21732 {
21733 enum stringop_alg candidate = algs->size[i].alg;
21734 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21735
21736 if (candidate != libcall && candidate
21737 && ALG_USABLE_P (candidate))
21738 max = algs->size[i].max;
21739 }
21740 /* If there aren't any usable algorithms, then recursing on
21741 smaller sizes isn't going to find anything. Just return the
21742 simple byte-at-a-time copy loop. */
21743 if (!any_alg_usable_p)
21744 {
21745 /* Pick something reasonable. */
21746 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21747 *dynamic_check = 128;
21748 return loop_1_byte;
21749 }
21750 if (max == -1)
21751 max = 4096;
21752 alg = decide_alg (count, max / 2, memset, dynamic_check);
21753 gcc_assert (*dynamic_check == -1);
21754 gcc_assert (alg != libcall);
21755 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21756 *dynamic_check = max;
21757 return alg;
21758 }
21759 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21760 #undef ALG_USABLE_P
21761 }
21762
21763 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21764 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21765 static int
21766 decide_alignment (int align,
21767 enum stringop_alg alg,
21768 int expected_size)
21769 {
21770 int desired_align = 0;
21771 switch (alg)
21772 {
21773 case no_stringop:
21774 gcc_unreachable ();
21775 case loop:
21776 case unrolled_loop:
21777 desired_align = GET_MODE_SIZE (Pmode);
21778 break;
21779 case rep_prefix_8_byte:
21780 desired_align = 8;
21781 break;
21782 case rep_prefix_4_byte:
21783 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21784 copying a whole cache line at once. */
21785 if (TARGET_PENTIUMPRO)
21786 desired_align = 8;
21787 else
21788 desired_align = 4;
21789 break;
21790 case rep_prefix_1_byte:
21791 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21792 copying a whole cache line at once. */
21793 if (TARGET_PENTIUMPRO)
21794 desired_align = 8;
21795 else
21796 desired_align = 1;
21797 break;
21798 case loop_1_byte:
21799 desired_align = 1;
21800 break;
21801 case libcall:
21802 return 0;
21803 }
21804
21805 if (optimize_size)
21806 desired_align = 1;
21807 if (desired_align < align)
21808 desired_align = align;
21809 if (expected_size != -1 && expected_size < 4)
21810 desired_align = align;
21811 return desired_align;
21812 }
21813
21814 /* Return the smallest power of 2 greater than VAL. */
21815 static int
21816 smallest_pow2_greater_than (int val)
21817 {
21818 int ret = 1;
21819 while (ret <= val)
21820 ret <<= 1;
21821 return ret;
21822 }
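
/* Illustrative examples for the helper above; the function below is a
   hypothetical self-check, not part of the compiler.  The result is strictly
   greater than VAL, so a mask of (result - 1) can represent any remainder up
   to VAL that the epilogue may have to handle.  */
#if 0
static int
smallest_pow2_greater_than_examples (void)
{
  return (smallest_pow2_greater_than (0) == 1
	  && smallest_pow2_greater_than (7) == 8
	  && smallest_pow2_greater_than (8) == 16);
}
#endif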
21823
21824 /* Expand string move (memcpy) operation. Use i386 string operations
21825 when profitable. expand_setmem contains similar code. The code
21826 depends upon architecture, block size and alignment, but always has
21827 the same overall structure:
21828
21829 1) Prologue guard: A conditional that jumps to the epilogue for small
21830 blocks that can be handled by the epilogue alone. This is faster,
21831 but it is also needed for correctness, since the prologue assumes the
21832 block is larger than the desired alignment.
21833
21834 Optional dynamic check for size and libcall for large
21835 blocks is emitted here too, with -minline-stringops-dynamically.
21836
21837 2) Prologue: copy first few bytes in order to get destination
21838 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21839 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21840 copied. We emit either a jump tree on power of two sized
21841 blocks, or a byte loop.
21842
21843 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21844 with the specified algorithm.
21845
21846 4) Epilogue: code copying the tail of the block that is too small to be
21847 handled by the main body (or up to the size guarded by the prologue guard). */
21848
21849 bool
21850 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21851 rtx expected_align_exp, rtx expected_size_exp)
21852 {
21853 rtx destreg;
21854 rtx srcreg;
21855 rtx label = NULL;
21856 rtx tmp;
21857 rtx jump_around_label = NULL;
21858 HOST_WIDE_INT align = 1;
21859 unsigned HOST_WIDE_INT count = 0;
21860 HOST_WIDE_INT expected_size = -1;
21861 int size_needed = 0, epilogue_size_needed;
21862 int desired_align = 0, align_bytes = 0;
21863 enum stringop_alg alg;
21864 int dynamic_check;
21865 bool need_zero_guard = false;
21866
21867 if (CONST_INT_P (align_exp))
21868 align = INTVAL (align_exp);
21869 /* i386 can do misaligned accesses at a reasonably increased cost. */
21870 if (CONST_INT_P (expected_align_exp)
21871 && INTVAL (expected_align_exp) > align)
21872 align = INTVAL (expected_align_exp);
21873 /* ALIGN is the minimum of destination and source alignment, but we care here
21874 just about destination alignment. */
21875 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21876 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21877
21878 if (CONST_INT_P (count_exp))
21879 count = expected_size = INTVAL (count_exp);
21880 if (CONST_INT_P (expected_size_exp) && count == 0)
21881 expected_size = INTVAL (expected_size_exp);
21882
21883 /* Make sure we don't need to care about overflow later on. */
21884 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21885 return false;
21886
21887 /* Step 0: Decide on preferred algorithm, desired alignment and
21888 size of chunks to be copied by main loop. */
21889
21890 alg = decide_alg (count, expected_size, false, &dynamic_check);
21891 desired_align = decide_alignment (align, alg, expected_size);
21892
21893 if (!TARGET_ALIGN_STRINGOPS)
21894 align = desired_align;
21895
21896 if (alg == libcall)
21897 return false;
21898 gcc_assert (alg != no_stringop);
21899 if (!count)
21900 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
21901 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21902 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
21903 switch (alg)
21904 {
21905 case libcall:
21906 case no_stringop:
21907 gcc_unreachable ();
21908 case loop:
21909 need_zero_guard = true;
21910 size_needed = GET_MODE_SIZE (Pmode);
21911 break;
21912 case unrolled_loop:
21913 need_zero_guard = true;
21914 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
21915 break;
21916 case rep_prefix_8_byte:
21917 size_needed = 8;
21918 break;
21919 case rep_prefix_4_byte:
21920 size_needed = 4;
21921 break;
21922 case rep_prefix_1_byte:
21923 size_needed = 1;
21924 break;
21925 case loop_1_byte:
21926 need_zero_guard = true;
21927 size_needed = 1;
21928 break;
21929 }
21930
21931 epilogue_size_needed = size_needed;
21932
21933 /* Step 1: Prologue guard. */
21934
21935 /* Alignment code needs count to be in register. */
21936 if (CONST_INT_P (count_exp) && desired_align > align)
21937 {
21938 if (INTVAL (count_exp) > desired_align
21939 && INTVAL (count_exp) > size_needed)
21940 {
21941 align_bytes
21942 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
21943 if (align_bytes <= 0)
21944 align_bytes = 0;
21945 else
21946 align_bytes = desired_align - align_bytes;
21947 }
21948 if (align_bytes == 0)
21949 count_exp = force_reg (counter_mode (count_exp), count_exp);
21950 }
21951 gcc_assert (desired_align >= 1 && align >= 1);
21952
21953 /* Ensure that alignment prologue won't copy past end of block. */
21954 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
21955 {
21956 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
21957 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
21958 Make sure it is a power of 2. */
21959 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
21960
21961 if (count)
21962 {
21963 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
21964 {
21965 /* If main algorithm works on QImode, no epilogue is needed.
21966 For small sizes just don't align anything. */
21967 if (size_needed == 1)
21968 desired_align = align;
21969 else
21970 goto epilogue;
21971 }
21972 }
21973 else
21974 {
21975 label = gen_label_rtx ();
21976 emit_cmp_and_jump_insns (count_exp,
21977 GEN_INT (epilogue_size_needed),
21978 LTU, 0, counter_mode (count_exp), 1, label);
21979 if (expected_size == -1 || expected_size < epilogue_size_needed)
21980 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21981 else
21982 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21983 }
21984 }
21985
21986 /* Emit code to decide at runtime whether a library call or inline code
21987 should be used. */
21988 if (dynamic_check != -1)
21989 {
21990 if (CONST_INT_P (count_exp))
21991 {
21992 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
21993 {
21994 emit_block_move_via_libcall (dst, src, count_exp, false);
21995 count_exp = const0_rtx;
21996 goto epilogue;
21997 }
21998 }
21999 else
22000 {
22001 rtx hot_label = gen_label_rtx ();
22002 jump_around_label = gen_label_rtx ();
22003 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22004 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22005 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22006 emit_block_move_via_libcall (dst, src, count_exp, false);
22007 emit_jump (jump_around_label);
22008 emit_label (hot_label);
22009 }
22010 }
22011
22012 /* Step 2: Alignment prologue. */
22013
22014 if (desired_align > align)
22015 {
22016 if (align_bytes == 0)
22017 {
22018 /* Except for the first move in epilogue, we no longer know
22019 constant offset in aliasing info. It doesn't seem to be worth
22020 the pain to maintain it for the first move, so throw away
22021 the info early. */
22022 src = change_address (src, BLKmode, srcreg);
22023 dst = change_address (dst, BLKmode, destreg);
22024 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22025 desired_align);
22026 }
22027 else
22028 {
22029 /* If we know how many bytes need to be stored before dst is
22030 sufficiently aligned, maintain aliasing info accurately. */
22031 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22032 desired_align, align_bytes);
22033 count_exp = plus_constant (count_exp, -align_bytes);
22034 count -= align_bytes;
22035 }
22036 if (need_zero_guard
22037 && (count < (unsigned HOST_WIDE_INT) size_needed
22038 || (align_bytes == 0
22039 && count < ((unsigned HOST_WIDE_INT) size_needed
22040 + desired_align - align))))
22041 {
22042 /* It is possible that we copied enough so the main loop will not
22043 execute. */
22044 gcc_assert (size_needed > 1);
22045 if (label == NULL_RTX)
22046 label = gen_label_rtx ();
22047 emit_cmp_and_jump_insns (count_exp,
22048 GEN_INT (size_needed),
22049 LTU, 0, counter_mode (count_exp), 1, label);
22050 if (expected_size == -1
22051 || expected_size < (desired_align - align) / 2 + size_needed)
22052 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22053 else
22054 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22055 }
22056 }
22057 if (label && size_needed == 1)
22058 {
22059 emit_label (label);
22060 LABEL_NUSES (label) = 1;
22061 label = NULL;
22062 epilogue_size_needed = 1;
22063 }
22064 else if (label == NULL_RTX)
22065 epilogue_size_needed = size_needed;
22066
22067 /* Step 3: Main loop. */
22068
22069 switch (alg)
22070 {
22071 case libcall:
22072 case no_stringop:
22073 gcc_unreachable ();
22074 case loop_1_byte:
22075 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22076 count_exp, QImode, 1, expected_size);
22077 break;
22078 case loop:
22079 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22080 count_exp, Pmode, 1, expected_size);
22081 break;
22082 case unrolled_loop:
22083 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22084 registers for 4 temporaries anyway. */
22085 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22086 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22087 expected_size);
22088 break;
22089 case rep_prefix_8_byte:
22090 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22091 DImode);
22092 break;
22093 case rep_prefix_4_byte:
22094 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22095 SImode);
22096 break;
22097 case rep_prefix_1_byte:
22098 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22099 QImode);
22100 break;
22101 }
22102 /* Properly adjust the offset of src and dest memory for aliasing. */
22103 if (CONST_INT_P (count_exp))
22104 {
22105 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22106 (count / size_needed) * size_needed);
22107 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22108 (count / size_needed) * size_needed);
22109 }
22110 else
22111 {
22112 src = change_address (src, BLKmode, srcreg);
22113 dst = change_address (dst, BLKmode, destreg);
22114 }
22115
22116 /* Step 4: Epilogue to copy the remaining bytes. */
22117 epilogue:
22118 if (label)
22119 {
22120 /* When the main loop is done, COUNT_EXP might hold the original count,
22121 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22122 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22123 bytes. Compensate if needed. */
22124
22125 if (size_needed < epilogue_size_needed)
22126 {
22127 tmp =
22128 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22129 GEN_INT (size_needed - 1), count_exp, 1,
22130 OPTAB_DIRECT);
22131 if (tmp != count_exp)
22132 emit_move_insn (count_exp, tmp);
22133 }
22134 emit_label (label);
22135 LABEL_NUSES (label) = 1;
22136 }
22137
22138 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22139 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22140 epilogue_size_needed);
22141 if (jump_around_label)
22142 emit_label (jump_around_label);
22143 return true;
22144 }
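
/* A C-level sketch (illustrative only) of the code shape produced by
   ix86_expand_movmem above for a copy of unknown size.  The names
   size_needed and desired_align merely mirror its local variables, the byte
   loops stand in for whatever stringop algorithm was selected, and the
   pointer-to-integer cast assumes unsigned long is pointer sized; the real
   expander emits RTL, not C.  */
#if 0
static void
movmem_shape_sketch (unsigned char *dst, const unsigned char *src,
		     unsigned long count, unsigned long size_needed,
		     unsigned long desired_align)
{
  unsigned long i;

  /* 1) Prologue guard: small blocks are handled by the epilogue alone.  */
  if (count < size_needed)
    goto epilogue;

  /* 2) Alignment prologue: copy bytes until DST reaches DESIRED_ALIGN.  */
  while (count && ((unsigned long) dst & (desired_align - 1)) != 0)
    {
      *dst++ = *src++;
      count--;
    }

  /* 3) Main loop: copy whole SIZE_NEEDED chunks.  */
  while (count >= size_needed)
    {
      for (i = 0; i < size_needed; i++)
	dst[i] = src[i];
      dst += size_needed;
      src += size_needed;
      count -= size_needed;
    }

 epilogue:
  /* 4) Epilogue: at most a few bytes remain (or the whole small block
     skipped over by the prologue guard).  */
  while (count--)
    *dst++ = *src++;
}
#endif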
22145
22146 /* Helper function for memset. For the QImode value 0xXY produce
22147 0xXYXYXYXY of the width specified by MODE. This is essentially
22148 VAL * 0x01010101, but we can do slightly better than
22149 synth_mult by unwinding the sequence by hand on CPUs with
22150 slow multiply. */
22151 static rtx
22152 promote_duplicated_reg (enum machine_mode mode, rtx val)
22153 {
22154 enum machine_mode valmode = GET_MODE (val);
22155 rtx tmp;
22156 int nops = mode == DImode ? 3 : 2;
22157
22158 gcc_assert (mode == SImode || mode == DImode);
22159 if (val == const0_rtx)
22160 return copy_to_mode_reg (mode, const0_rtx);
22161 if (CONST_INT_P (val))
22162 {
22163 HOST_WIDE_INT v = INTVAL (val) & 255;
22164
22165 v |= v << 8;
22166 v |= v << 16;
22167 if (mode == DImode)
22168 v |= (v << 16) << 16;
22169 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22170 }
22171
22172 if (valmode == VOIDmode)
22173 valmode = QImode;
22174 if (valmode != QImode)
22175 val = gen_lowpart (QImode, val);
22176 if (mode == QImode)
22177 return val;
22178 if (!TARGET_PARTIAL_REG_STALL)
22179 nops--;
22180 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22181 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22182 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22183 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22184 {
22185 rtx reg = convert_modes (mode, QImode, val, true);
22186 tmp = promote_duplicated_reg (mode, const1_rtx);
22187 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22188 OPTAB_DIRECT);
22189 }
22190 else
22191 {
22192 rtx reg = convert_modes (mode, QImode, val, true);
22193
22194 if (!TARGET_PARTIAL_REG_STALL)
22195 if (mode == SImode)
22196 emit_insn (gen_movsi_insv_1 (reg, reg));
22197 else
22198 emit_insn (gen_movdi_insv_1 (reg, reg));
22199 else
22200 {
22201 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22202 NULL, 1, OPTAB_DIRECT);
22203 reg =
22204 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22205 }
22206 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22207 NULL, 1, OPTAB_DIRECT);
22208 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22209 if (mode == SImode)
22210 return reg;
22211 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22212 NULL, 1, OPTAB_DIRECT);
22213 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22214 return reg;
22215 }
22216 }
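
/* Worked example of the replication done above (illustrative only; the
   helper below is hypothetical and assumes an unsigned long long of at
   least 64 bits): for VAL = 0xAB,
     v             = 0x000000AB
     v |= v << 8   -> 0x0000ABAB
     v |= v << 16  -> 0xABABABAB
     v |= v << 32  -> 0xABABABABABABABAB   (DImode only)
   which is the same value synth_mult would compute as
   0xAB * 0x0101010101010101.  */
#if 0
static unsigned long long
duplicate_byte_sketch (unsigned char byte, int sixty_four_bit_p)
{
  unsigned long long v = byte;
  v |= v << 8;
  v |= v << 16;
  if (sixty_four_bit_p)
    v |= v << 32;
  return v;
}
#endif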
22217
22218 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22219 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22220 alignment from ALIGN to DESIRED_ALIGN. */
22221 static rtx
22222 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22223 {
22224 rtx promoted_val;
22225
22226 if (TARGET_64BIT
22227 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22228 promoted_val = promote_duplicated_reg (DImode, val);
22229 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22230 promoted_val = promote_duplicated_reg (SImode, val);
22231 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22232 promoted_val = promote_duplicated_reg (HImode, val);
22233 else
22234 promoted_val = val;
22235
22236 return promoted_val;
22237 }
22238
22239 /* Expand string clear operation (bzero). Use i386 string operations when
22240 profitable. See expand_movmem comment for explanation of individual
22241 steps performed. */
22242 bool
22243 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22244 rtx expected_align_exp, rtx expected_size_exp)
22245 {
22246 rtx destreg;
22247 rtx label = NULL;
22248 rtx tmp;
22249 rtx jump_around_label = NULL;
22250 HOST_WIDE_INT align = 1;
22251 unsigned HOST_WIDE_INT count = 0;
22252 HOST_WIDE_INT expected_size = -1;
22253 int size_needed = 0, epilogue_size_needed;
22254 int desired_align = 0, align_bytes = 0;
22255 enum stringop_alg alg;
22256 rtx promoted_val = NULL;
22257 bool force_loopy_epilogue = false;
22258 int dynamic_check;
22259 bool need_zero_guard = false;
22260
22261 if (CONST_INT_P (align_exp))
22262 align = INTVAL (align_exp);
22263 /* i386 can do misaligned access on reasonably increased cost. */
22264 if (CONST_INT_P (expected_align_exp)
22265 && INTVAL (expected_align_exp) > align)
22266 align = INTVAL (expected_align_exp);
22267 if (CONST_INT_P (count_exp))
22268 count = expected_size = INTVAL (count_exp);
22269 if (CONST_INT_P (expected_size_exp) && count == 0)
22270 expected_size = INTVAL (expected_size_exp);
22271
22272 /* Make sure we don't need to care about overflow later on. */
22273 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22274 return false;
22275
22276 /* Step 0: Decide on preferred algorithm, desired alignment and
22277 size of chunks to be copied by main loop. */
22278
22279 alg = decide_alg (count, expected_size, true, &dynamic_check);
22280 desired_align = decide_alignment (align, alg, expected_size);
22281
22282 if (!TARGET_ALIGN_STRINGOPS)
22283 align = desired_align;
22284
22285 if (alg == libcall)
22286 return false;
22287 gcc_assert (alg != no_stringop);
22288 if (!count)
22289 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22290 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22291 switch (alg)
22292 {
22293 case libcall:
22294 case no_stringop:
22295 gcc_unreachable ();
22296 case loop:
22297 need_zero_guard = true;
22298 size_needed = GET_MODE_SIZE (Pmode);
22299 break;
22300 case unrolled_loop:
22301 need_zero_guard = true;
22302 size_needed = GET_MODE_SIZE (Pmode) * 4;
22303 break;
22304 case rep_prefix_8_byte:
22305 size_needed = 8;
22306 break;
22307 case rep_prefix_4_byte:
22308 size_needed = 4;
22309 break;
22310 case rep_prefix_1_byte:
22311 size_needed = 1;
22312 break;
22313 case loop_1_byte:
22314 need_zero_guard = true;
22315 size_needed = 1;
22316 break;
22317 }
22318 epilogue_size_needed = size_needed;
22319
22320 /* Step 1: Prologue guard. */
22321
22322 /* Alignment code needs count to be in register. */
22323 if (CONST_INT_P (count_exp) && desired_align > align)
22324 {
22325 if (INTVAL (count_exp) > desired_align
22326 && INTVAL (count_exp) > size_needed)
22327 {
22328 align_bytes
22329 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22330 if (align_bytes <= 0)
22331 align_bytes = 0;
22332 else
22333 align_bytes = desired_align - align_bytes;
22334 }
22335 if (align_bytes == 0)
22336 {
22337 enum machine_mode mode = SImode;
22338 if (TARGET_64BIT && (count & ~0xffffffff))
22339 mode = DImode;
22340 count_exp = force_reg (mode, count_exp);
22341 }
22342 }
22343 /* Do the cheap promotion to allow better CSE across the
22344 main loop and epilogue (i.e. one load of the big constant at the
22345 front of all the code). */
22346 if (CONST_INT_P (val_exp))
22347 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22348 desired_align, align);
22349 /* Ensure that alignment prologue won't copy past end of block. */
22350 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22351 {
22352 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22353 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22354 Make sure it is power of 2. */
22355 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22356
22357 /* To improve performance of small blocks, we jump around the VAL
22358 promoting code. This means that if the promoted VAL is not constant,
22359 we might not use it in the epilogue and have to use the byte
22360 loop variant. */
22361 if (epilogue_size_needed > 2 && !promoted_val)
22362 force_loopy_epilogue = true;
22363 if (count)
22364 {
22365 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22366 {
22367 /* If main algorithm works on QImode, no epilogue is needed.
22368 For small sizes just don't align anything. */
22369 if (size_needed == 1)
22370 desired_align = align;
22371 else
22372 goto epilogue;
22373 }
22374 }
22375 else
22376 {
22377 label = gen_label_rtx ();
22378 emit_cmp_and_jump_insns (count_exp,
22379 GEN_INT (epilogue_size_needed),
22380 LTU, 0, counter_mode (count_exp), 1, label);
22381 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22382 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22383 else
22384 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22385 }
22386 }
22387 if (dynamic_check != -1)
22388 {
22389 rtx hot_label = gen_label_rtx ();
22390 jump_around_label = gen_label_rtx ();
22391 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22392 LEU, 0, counter_mode (count_exp), 1, hot_label);
22393 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22394 set_storage_via_libcall (dst, count_exp, val_exp, false);
22395 emit_jump (jump_around_label);
22396 emit_label (hot_label);
22397 }
22398
22399 /* Step 2: Alignment prologue. */
22400
22401 /* Do the expensive promotion once we have branched off the small blocks. */
22402 if (!promoted_val)
22403 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22404 desired_align, align);
22405 gcc_assert (desired_align >= 1 && align >= 1);
22406
22407 if (desired_align > align)
22408 {
22409 if (align_bytes == 0)
22410 {
22411 /* Except for the first move in epilogue, we no longer know
22412 constant offset in aliasing info. It doesn't seem to be worth
22413 the pain to maintain it for the first move, so throw away
22414 the info early. */
22415 dst = change_address (dst, BLKmode, destreg);
22416 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22417 desired_align);
22418 }
22419 else
22420 {
22421 /* If we know how many bytes need to be stored before dst is
22422 sufficiently aligned, maintain aliasing info accurately. */
22423 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22424 desired_align, align_bytes);
22425 count_exp = plus_constant (count_exp, -align_bytes);
22426 count -= align_bytes;
22427 }
22428 if (need_zero_guard
22429 && (count < (unsigned HOST_WIDE_INT) size_needed
22430 || (align_bytes == 0
22431 && count < ((unsigned HOST_WIDE_INT) size_needed
22432 + desired_align - align))))
22433 {
22434 /* It is possible that we copied enough so the main loop will not
22435 execute. */
22436 gcc_assert (size_needed > 1);
22437 if (label == NULL_RTX)
22438 label = gen_label_rtx ();
22439 emit_cmp_and_jump_insns (count_exp,
22440 GEN_INT (size_needed),
22441 LTU, 0, counter_mode (count_exp), 1, label);
22442 if (expected_size == -1
22443 || expected_size < (desired_align - align) / 2 + size_needed)
22444 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22445 else
22446 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22447 }
22448 }
22449 if (label && size_needed == 1)
22450 {
22451 emit_label (label);
22452 LABEL_NUSES (label) = 1;
22453 label = NULL;
22454 promoted_val = val_exp;
22455 epilogue_size_needed = 1;
22456 }
22457 else if (label == NULL_RTX)
22458 epilogue_size_needed = size_needed;
22459
22460 /* Step 3: Main loop. */
22461
22462 switch (alg)
22463 {
22464 case libcall:
22465 case no_stringop:
22466 gcc_unreachable ();
22467 case loop_1_byte:
22468 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22469 count_exp, QImode, 1, expected_size);
22470 break;
22471 case loop:
22472 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22473 count_exp, Pmode, 1, expected_size);
22474 break;
22475 case unrolled_loop:
22476 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22477 count_exp, Pmode, 4, expected_size);
22478 break;
22479 case rep_prefix_8_byte:
22480 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22481 DImode, val_exp);
22482 break;
22483 case rep_prefix_4_byte:
22484 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22485 SImode, val_exp);
22486 break;
22487 case rep_prefix_1_byte:
22488 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22489 QImode, val_exp);
22490 break;
22491 }
22492 /* Properly adjust the offset of dst memory for aliasing. */
22493 if (CONST_INT_P (count_exp))
22494 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22495 (count / size_needed) * size_needed);
22496 else
22497 dst = change_address (dst, BLKmode, destreg);
22498
22499 /* Step 4: Epilogue to copy the remaining bytes. */
22500
22501 if (label)
22502 {
22503 /* When the main loop is done, COUNT_EXP might hold the original count,
22504 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22505 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22506 bytes. Compensate if needed. */
22507
22508 if (size_needed < epilogue_size_needed)
22509 {
22510 tmp =
22511 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22512 GEN_INT (size_needed - 1), count_exp, 1,
22513 OPTAB_DIRECT);
22514 if (tmp != count_exp)
22515 emit_move_insn (count_exp, tmp);
22516 }
22517 emit_label (label);
22518 LABEL_NUSES (label) = 1;
22519 }
22520 epilogue:
22521 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22522 {
22523 if (force_loopy_epilogue)
22524 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22525 epilogue_size_needed);
22526 else
22527 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22528 epilogue_size_needed);
22529 }
22530 if (jump_around_label)
22531 emit_label (jump_around_label);
22532 return true;
22533 }
22534
22535 /* Expand the appropriate insns for doing strlen if not just doing
22536 repnz; scasb
22537
22538 out = result, initialized with the start address
22539 align_rtx = alignment of the address.
22540 scratch = scratch register, initialized with the start address when
22541 not aligned, otherwise undefined
22542
22543 This is just the body. It needs the initializations mentioned above and
22544 some address computing at the end. These things are done in i386.md. */
22545
22546 static void
22547 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22548 {
22549 int align;
22550 rtx tmp;
22551 rtx align_2_label = NULL_RTX;
22552 rtx align_3_label = NULL_RTX;
22553 rtx align_4_label = gen_label_rtx ();
22554 rtx end_0_label = gen_label_rtx ();
22555 rtx mem;
22556 rtx tmpreg = gen_reg_rtx (SImode);
22557 rtx scratch = gen_reg_rtx (SImode);
22558 rtx cmp;
22559
22560 align = 0;
22561 if (CONST_INT_P (align_rtx))
22562 align = INTVAL (align_rtx);
22563
22564 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22565
22566 /* Is there a known alignment and is it less than 4? */
22567 if (align < 4)
22568 {
22569 rtx scratch1 = gen_reg_rtx (Pmode);
22570 emit_move_insn (scratch1, out);
22571 /* Is there a known alignment and is it not 2? */
22572 if (align != 2)
22573 {
22574 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22575 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22576
22577 /* Leave just the 2 lower bits. */
22578 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22579 NULL_RTX, 0, OPTAB_WIDEN);
22580
22581 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22582 Pmode, 1, align_4_label);
22583 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22584 Pmode, 1, align_2_label);
22585 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22586 Pmode, 1, align_3_label);
22587 }
22588 else
22589 {
22590 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22591 check if it is aligned to 4 bytes. */
22592
22593 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22594 NULL_RTX, 0, OPTAB_WIDEN);
22595
22596 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22597 Pmode, 1, align_4_label);
22598 }
22599
22600 mem = change_address (src, QImode, out);
22601
22602 /* Now compare the bytes. */
22603
22604 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22605 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22606 QImode, 1, end_0_label);
22607
22608 /* Increment the address. */
22609 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22610
22611 /* Not needed with an alignment of 2 */
22612 if (align != 2)
22613 {
22614 emit_label (align_2_label);
22615
22616 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22617 end_0_label);
22618
22619 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22620
22621 emit_label (align_3_label);
22622 }
22623
22624 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22625 end_0_label);
22626
22627 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22628 }
22629
22630 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22631 align this loop; it only makes the program larger and does not help
22632 to speed it up. */
22633 emit_label (align_4_label);
22634
22635 mem = change_address (src, SImode, out);
22636 emit_move_insn (scratch, mem);
22637 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22638
22639 /* This formula yields a nonzero result iff one of the bytes is zero.
22640 This saves three branches inside the loop and many cycles. */
22641
22642 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22643 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22644 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22645 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22646 gen_int_mode (0x80808080, SImode)));
22647 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22648 align_4_label);
22649
22650 if (TARGET_CMOVE)
22651 {
22652 rtx reg = gen_reg_rtx (SImode);
22653 rtx reg2 = gen_reg_rtx (Pmode);
22654 emit_move_insn (reg, tmpreg);
22655 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22656
22657 /* If zero is not in the first two bytes, move two bytes forward. */
22658 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22659 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22660 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22661 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22662 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22663 reg,
22664 tmpreg)));
22665 /* Emit lea manually to avoid clobbering of flags. */
22666 emit_insn (gen_rtx_SET (SImode, reg2,
22667 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22668
22669 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22670 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22671 emit_insn (gen_rtx_SET (VOIDmode, out,
22672 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22673 reg2,
22674 out)));
22675 }
22676 else
22677 {
22678 rtx end_2_label = gen_label_rtx ();
22679 /* Is zero in the first two bytes? */
22680
22681 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22682 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22683 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22684 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22685 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22686 pc_rtx);
22687 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22688 JUMP_LABEL (tmp) = end_2_label;
22689
22690 /* Not in the first two. Move two bytes forward. */
22691 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22692 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22693
22694 emit_label (end_2_label);
22695
22696 }
22697
22698 /* Avoid branch in fixing the byte. */
22699 tmpreg = gen_lowpart (QImode, tmpreg);
22700 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22701 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22702 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22703 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22704
22705 emit_label (end_0_label);
22706 }
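
/* The zero-byte test used by the 4-byte loop above, written as plain C
   (an illustrative sketch only; assumes a 32-bit unsigned int):
   (x - 0x01010101) & ~x & 0x80808080 is nonzero exactly when some byte of
   X is zero -- subtracting 1 from a zero byte borrows into its bit 7, and
   the "& ~x" term rejects bytes that merely had bit 7 set already.  */
#if 0
static int
has_zero_byte_sketch (unsigned int x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}
#endif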
22707
22708 /* Expand strlen. */
22709
22710 bool
22711 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22712 {
22713 rtx addr, scratch1, scratch2, scratch3, scratch4;
22714
22715 /* The generic case of the strlen expander is long. Avoid
22716 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
22717
22718 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22719 && !TARGET_INLINE_ALL_STRINGOPS
22720 && !optimize_insn_for_size_p ()
22721 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22722 return false;
22723
22724 addr = force_reg (Pmode, XEXP (src, 0));
22725 scratch1 = gen_reg_rtx (Pmode);
22726
22727 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22728 && !optimize_insn_for_size_p ())
22729 {
22730 /* Well, it seems that some optimizer does not combine a call like
22731 foo(strlen(bar), strlen(bar));
22732 when the move and the subtraction are done here. It does calculate
22733 the length just once when these instructions are done inside of
22734 output_strlen_unroll(). But I think, since &bar[strlen(bar)] is
22735 often used and I use one fewer register for the lifetime of
22736 output_strlen_unroll(), this is better. */
22737
22738 emit_move_insn (out, addr);
22739
22740 ix86_expand_strlensi_unroll_1 (out, src, align);
22741
22742 /* strlensi_unroll_1 returns the address of the zero at the end of
22743 the string, like memchr(), so compute the length by subtracting
22744 the start address. */
22745 emit_insn (ix86_gen_sub3 (out, out, addr));
22746 }
22747 else
22748 {
22749 rtx unspec;
22750
22751 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22752 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22753 return false;
22754
22755 scratch2 = gen_reg_rtx (Pmode);
22756 scratch3 = gen_reg_rtx (Pmode);
22757 scratch4 = force_reg (Pmode, constm1_rtx);
22758
22759 emit_move_insn (scratch3, addr);
22760 eoschar = force_reg (QImode, eoschar);
22761
22762 src = replace_equiv_address_nv (src, scratch3);
22763
22764 /* If .md starts supporting :P, this can be done in .md. */
22765 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22766 scratch4), UNSPEC_SCAS);
22767 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22768 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22769 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22770 }
22771 return true;
22772 }
22773
22774 /* For a given symbol (function) construct code to compute the address of its
22775 PLT entry in the large x86-64 PIC model. */
22776 rtx
22777 construct_plt_address (rtx symbol)
22778 {
22779 rtx tmp = gen_reg_rtx (Pmode);
22780 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22781
22782 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22783 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22784
22785 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22786 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22787 return tmp;
22788 }
22789
22790 rtx
22791 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22792 rtx callarg2,
22793 rtx pop, bool sibcall)
22794 {
22795 /* We need to represent that the SI, DI and XMM6-XMM15 registers are
22796 clobbered by SYSV calls. */
22797 static int clobbered_registers[] = {
22798 XMM6_REG, XMM7_REG, XMM8_REG,
22799 XMM9_REG, XMM10_REG, XMM11_REG,
22800 XMM12_REG, XMM13_REG, XMM14_REG,
22801 XMM15_REG, SI_REG, DI_REG
22802 };
22803 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22804 rtx use = NULL, call;
22805 unsigned int vec_len;
22806
22807 if (pop == const0_rtx)
22808 pop = NULL;
22809 gcc_assert (!TARGET_64BIT || !pop);
22810
22811 if (TARGET_MACHO && !TARGET_64BIT)
22812 {
22813 #if TARGET_MACHO
22814 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22815 fnaddr = machopic_indirect_call_target (fnaddr);
22816 #endif
22817 }
22818 else
22819 {
22820 /* Static functions and indirect calls don't need the pic register. */
22821 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22822 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22823 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22824 use_reg (&use, pic_offset_table_rtx);
22825 }
22826
22827 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22828 {
22829 rtx al = gen_rtx_REG (QImode, AX_REG);
22830 emit_move_insn (al, callarg2);
22831 use_reg (&use, al);
22832 }
22833
22834 if (ix86_cmodel == CM_LARGE_PIC
22835 && MEM_P (fnaddr)
22836 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22837 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22838 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22839 else if (sibcall
22840 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22841 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22842 {
22843 fnaddr = XEXP (fnaddr, 0);
22844 if (GET_MODE (fnaddr) != Pmode)
22845 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22846 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22847 }
22848
22849 vec_len = 0;
22850 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22851 if (retval)
22852 call = gen_rtx_SET (VOIDmode, retval, call);
22853 vec[vec_len++] = call;
22854
22855 if (pop)
22856 {
22857 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22858 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22859 vec[vec_len++] = pop;
22860 }
22861
22862 if (TARGET_64BIT_MS_ABI
22863 && (!callarg2 || INTVAL (callarg2) != -2))
22864 {
22865 unsigned i;
22866
22867 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22868 UNSPEC_MS_TO_SYSV_CALL);
22869
22870 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22871 vec[vec_len++]
22872 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22873 ? TImode : DImode,
22874 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22875 ? TImode : DImode,
22876 clobbered_registers[i]));
22877 }
22878
22879 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22880 if (TARGET_VZEROUPPER)
22881 {
22882 int avx256;
22883 if (cfun->machine->callee_pass_avx256_p)
22884 {
22885 if (cfun->machine->callee_return_avx256_p)
22886 avx256 = callee_return_pass_avx256;
22887 else
22888 avx256 = callee_pass_avx256;
22889 }
22890 else if (cfun->machine->callee_return_avx256_p)
22891 avx256 = callee_return_avx256;
22892 else
22893 avx256 = call_no_avx256;
22894
22895 if (reload_completed)
22896 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
22897 else
22898 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
22899 gen_rtvec (1, GEN_INT (avx256)),
22900 UNSPEC_CALL_NEEDS_VZEROUPPER);
22901 }
22902
22903 if (vec_len > 1)
22904 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
22905 call = emit_call_insn (call);
22906 if (use)
22907 CALL_INSN_FUNCTION_USAGE (call) = use;
22908
22909 return call;
22910 }
22911
22912 void
22913 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
22914 {
22915 rtx pat = PATTERN (insn);
22916 rtvec vec = XVEC (pat, 0);
22917 int len = GET_NUM_ELEM (vec) - 1;
22918
22919 /* Strip off the last entry of the parallel. */
22920 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
22921 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
22922 if (len == 1)
22923 pat = RTVEC_ELT (vec, 0);
22924 else
22925 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
22926
22927 emit_insn (gen_avx_vzeroupper (vzeroupper));
22928 emit_call_insn (pat);
22929 }
22930
22931 /* Output the assembly for a call instruction. */
22932
22933 const char *
22934 ix86_output_call_insn (rtx insn, rtx call_op)
22935 {
22936 bool direct_p = constant_call_address_operand (call_op, Pmode);
22937 bool seh_nop_p = false;
22938 const char *xasm;
22939
22940 if (SIBLING_CALL_P (insn))
22941 {
22942 if (direct_p)
22943 xasm = "jmp\t%P0";
22944 /* SEH epilogue detection requires the indirect branch case
22945 to include REX.W. */
22946 else if (TARGET_SEH)
22947 xasm = "rex.W jmp %A0";
22948 else
22949 xasm = "jmp\t%A0";
22950
22951 output_asm_insn (xasm, &call_op);
22952 return "";
22953 }
22954
22955 /* SEH unwinding can require an extra nop to be emitted in several
22956 circumstances. Determine if we have one of those. */
22957 if (TARGET_SEH)
22958 {
22959 rtx i;
22960
22961 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
22962 {
22963 /* If we get to another real insn, we don't need the nop. */
22964 if (INSN_P (i))
22965 break;
22966
22967 /* If we get to the epilogue note, prevent a catch region from
22968 being adjacent to the standard epilogue sequence. If non-
22969 call-exceptions, we'll have done this during epilogue emission. */
22970 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
22971 && !flag_non_call_exceptions
22972 && !can_throw_internal (insn))
22973 {
22974 seh_nop_p = true;
22975 break;
22976 }
22977 }
22978
22979 /* If we didn't find a real insn following the call, prevent the
22980 unwinder from looking into the next function. */
22981 if (i == NULL)
22982 seh_nop_p = true;
22983 }
22984
22985 if (direct_p)
22986 xasm = "call\t%P0";
22987 else
22988 xasm = "call\t%A0";
22989
22990 output_asm_insn (xasm, &call_op);
22991
22992 if (seh_nop_p)
22993 return "nop";
22994
22995 return "";
22996 }
22997 \f
22998 /* Clear stack slot assignments remembered from previous functions.
22999 This is called from INIT_EXPANDERS once before RTL is emitted for each
23000 function. */
23001
23002 static struct machine_function *
23003 ix86_init_machine_status (void)
23004 {
23005 struct machine_function *f;
23006
23007 f = ggc_alloc_cleared_machine_function ();
23008 f->use_fast_prologue_epilogue_nregs = -1;
23009 f->tls_descriptor_call_expanded_p = 0;
23010 f->call_abi = ix86_abi;
23011
23012 return f;
23013 }
23014
23015 /* Return a MEM corresponding to a stack slot with mode MODE.
23016 Allocate a new slot if necessary.
23017
23018 The RTL for a function can have several slots available: N is
23019 which slot to use. */
23020
23021 rtx
23022 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23023 {
23024 struct stack_local_entry *s;
23025
23026 gcc_assert (n < MAX_386_STACK_LOCALS);
23027
23028 /* Virtual slot is valid only before vregs are instantiated. */
23029 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23030
23031 for (s = ix86_stack_locals; s; s = s->next)
23032 if (s->mode == mode && s->n == n)
23033 return validize_mem (copy_rtx (s->rtl));
23034
23035 s = ggc_alloc_stack_local_entry ();
23036 s->n = n;
23037 s->mode = mode;
23038 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23039
23040 s->next = ix86_stack_locals;
23041 ix86_stack_locals = s;
23042 return validize_mem (s->rtl);
23043 }
23044 \f
23045 /* Calculate the length of the memory address in the instruction encoding.
23046 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23047 or other prefixes. */
23048
23049 int
23050 memory_address_length (rtx addr)
23051 {
23052 struct ix86_address parts;
23053 rtx base, index, disp;
23054 int len;
23055 int ok;
23056
23057 if (GET_CODE (addr) == PRE_DEC
23058 || GET_CODE (addr) == POST_INC
23059 || GET_CODE (addr) == PRE_MODIFY
23060 || GET_CODE (addr) == POST_MODIFY)
23061 return 0;
23062
23063 ok = ix86_decompose_address (addr, &parts);
23064 gcc_assert (ok);
23065
23066 if (parts.base && GET_CODE (parts.base) == SUBREG)
23067 parts.base = SUBREG_REG (parts.base);
23068 if (parts.index && GET_CODE (parts.index) == SUBREG)
23069 parts.index = SUBREG_REG (parts.index);
23070
23071 base = parts.base;
23072 index = parts.index;
23073 disp = parts.disp;
23074
23075 /* Add length of addr32 prefix. */
23076 len = (GET_CODE (addr) == ZERO_EXTEND
23077 || GET_CODE (addr) == AND);
23078
23079 /* Rule of thumb:
23080 - esp as the base always wants an index,
23081 - ebp as the base always wants a displacement,
23082 - r12 as the base always wants an index,
23083 - r13 as the base always wants a displacement. */
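
/* Some illustrative examples (hypothetical operands, 32-bit code) of the
   extra address bytes counted by this function beyond the modrm byte:
   (%eax) -> 0, (%esp) -> 1 (SIB byte), (%ebp) -> 1 (mandatory disp8 of 0),
   16(%eax) -> 1, 0x1000(%eax) -> 4, (%eax,%ebx,4) -> 1 (SIB byte).  */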
23084
23085 /* Register Indirect. */
23086 if (base && !index && !disp)
23087 {
23088 /* esp (for its index) and ebp (for its displacement) need
23089 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23090 code. */
23091 if (REG_P (addr)
23092 && (addr == arg_pointer_rtx
23093 || addr == frame_pointer_rtx
23094 || REGNO (addr) == SP_REG
23095 || REGNO (addr) == BP_REG
23096 || REGNO (addr) == R12_REG
23097 || REGNO (addr) == R13_REG))
23098 len = 1;
23099 }
23100
23101 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23102 is not disp32, but disp32(%rip), so for disp32 a
23103 SIB byte is needed, unless print_operand_address
23104 optimizes it into disp32(%rip) or (%rip) is implied
23105 by UNSPEC. */
23106 else if (disp && !base && !index)
23107 {
23108 len = 4;
23109 if (TARGET_64BIT)
23110 {
23111 rtx symbol = disp;
23112
23113 if (GET_CODE (disp) == CONST)
23114 symbol = XEXP (disp, 0);
23115 if (GET_CODE (symbol) == PLUS
23116 && CONST_INT_P (XEXP (symbol, 1)))
23117 symbol = XEXP (symbol, 0);
23118
23119 if (GET_CODE (symbol) != LABEL_REF
23120 && (GET_CODE (symbol) != SYMBOL_REF
23121 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23122 && (GET_CODE (symbol) != UNSPEC
23123 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23124 && XINT (symbol, 1) != UNSPEC_PCREL
23125 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23126 len += 1;
23127 }
23128 }
23129
23130 else
23131 {
23132 /* Find the length of the displacement constant. */
23133 if (disp)
23134 {
23135 if (base && satisfies_constraint_K (disp))
23136 len = 1;
23137 else
23138 len = 4;
23139 }
23140 /* ebp always wants a displacement. Similarly r13. */
23141 else if (base && REG_P (base)
23142 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23143 len = 1;
23144
23145 /* An index requires the two-byte modrm form.... */
23146 if (index
23147 /* ...like esp (or r12), which always wants an index. */
23148 || base == arg_pointer_rtx
23149 || base == frame_pointer_rtx
23150 || (base && REG_P (base)
23151 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23152 len += 1;
23153 }
23154
23155 switch (parts.seg)
23156 {
23157 case SEG_FS:
23158 case SEG_GS:
23159 len += 1;
23160 break;
23161 default:
23162 break;
23163 }
23164
23165 return len;
23166 }
23167
23168 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23169 is set, expect that the insn has an 8bit immediate alternative. */
23170 int
23171 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23172 {
23173 int len = 0;
23174 int i;
23175 extract_insn_cached (insn);
23176 for (i = recog_data.n_operands - 1; i >= 0; --i)
23177 if (CONSTANT_P (recog_data.operand[i]))
23178 {
23179 enum attr_mode mode = get_attr_mode (insn);
23180
23181 gcc_assert (!len);
23182 if (shortform && CONST_INT_P (recog_data.operand[i]))
23183 {
23184 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23185 switch (mode)
23186 {
23187 case MODE_QI:
23188 len = 1;
23189 continue;
23190 case MODE_HI:
23191 ival = trunc_int_for_mode (ival, HImode);
23192 break;
23193 case MODE_SI:
23194 ival = trunc_int_for_mode (ival, SImode);
23195 break;
23196 default:
23197 break;
23198 }
23199 if (IN_RANGE (ival, -128, 127))
23200 {
23201 len = 1;
23202 continue;
23203 }
23204 }
23205 switch (mode)
23206 {
23207 case MODE_QI:
23208 len = 1;
23209 break;
23210 case MODE_HI:
23211 len = 2;
23212 break;
23213 case MODE_SI:
23214 len = 4;
23215 break;
23216 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23217 case MODE_DI:
23218 len = 4;
23219 break;
23220 default:
23221 fatal_insn ("unknown insn mode", insn);
23222 }
23223 }
23224 return len;
23225 }
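
/* A hypothetical sketch of the rule implemented above (illustrative only):
   with a short form available an immediate in [-128, 127] takes 1 byte,
   otherwise the immediate takes the full operand width, except that DImode
   immediates are sign-extended 32-bit values.  */
#if 0
static int
imm_length_sketch (long ival, int op_bytes, int shortform)
{
  if (shortform && ival >= -128 && ival <= 127)
    return 1;
  return op_bytes > 4 ? 4 : op_bytes;
}
#endif
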
23226 /* Compute default value for "length_address" attribute. */
23227 int
23228 ix86_attr_length_address_default (rtx insn)
23229 {
23230 int i;
23231
23232 if (get_attr_type (insn) == TYPE_LEA)
23233 {
23234 rtx set = PATTERN (insn), addr;
23235
23236 if (GET_CODE (set) == PARALLEL)
23237 set = XVECEXP (set, 0, 0);
23238
23239 gcc_assert (GET_CODE (set) == SET);
23240
23241 addr = SET_SRC (set);
23242 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23243 {
23244 if (GET_CODE (addr) == ZERO_EXTEND)
23245 addr = XEXP (addr, 0);
23246 if (GET_CODE (addr) == SUBREG)
23247 addr = SUBREG_REG (addr);
23248 }
23249
23250 return memory_address_length (addr);
23251 }
23252
23253 extract_insn_cached (insn);
23254 for (i = recog_data.n_operands - 1; i >= 0; --i)
23255 if (MEM_P (recog_data.operand[i]))
23256 {
23257 constrain_operands_cached (reload_completed);
23258 if (which_alternative != -1)
23259 {
23260 const char *constraints = recog_data.constraints[i];
23261 int alt = which_alternative;
23262
23263 while (*constraints == '=' || *constraints == '+')
23264 constraints++;
23265 while (alt-- > 0)
23266 while (*constraints++ != ',')
23267 ;
23268 /* Skip ignored operands. */
23269 if (*constraints == 'X')
23270 continue;
23271 }
23272 return memory_address_length (XEXP (recog_data.operand[i], 0));
23273 }
23274 return 0;
23275 }
23276
23277 /* Compute default value for "length_vex" attribute. It includes
23278 2 or 3 byte VEX prefix and 1 opcode byte. */
23279
23280 int
23281 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23282 {
23283 int i;
23284
23285 /* Only the 0f opcode map can use the 2 byte VEX prefix, and the VEX W
23286 bit requires the 3 byte VEX prefix. */
23287 if (!has_0f_opcode || has_vex_w)
23288 return 3 + 1;
23289
23290 /* We can always use the 2 byte VEX prefix in 32bit mode. */
23291 if (!TARGET_64BIT)
23292 return 2 + 1;
23293
23294 extract_insn_cached (insn);
23295
23296 for (i = recog_data.n_operands - 1; i >= 0; --i)
23297 if (REG_P (recog_data.operand[i]))
23298 {
23299 /* REX.W bit uses 3 byte VEX prefix. */
23300 if (GET_MODE (recog_data.operand[i]) == DImode
23301 && GENERAL_REG_P (recog_data.operand[i]))
23302 return 3 + 1;
23303 }
23304 else
23305 {
23306 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23307 if (MEM_P (recog_data.operand[i])
23308 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23309 return 3 + 1;
23310 }
23311
23312 return 2 + 1;
23313 }
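
/* Length examples for the attribute above (illustrative): an insn in the
   0f opcode map needing none of VEX.W, REX.X or REX.B, e.g.
   vaddps %xmm2, %xmm1, %xmm0, fits the 2 byte (C5) VEX prefix and gets
   2 + 1 = 3, while a DImode general register operand or a memory operand
   mentioning %r8-%r15 forces the 3 byte (C4) form and gets 3 + 1 = 4.  */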
23314 \f
23315 /* Return the maximum number of instructions a cpu can issue. */
23316
23317 static int
23318 ix86_issue_rate (void)
23319 {
23320 switch (ix86_tune)
23321 {
23322 case PROCESSOR_PENTIUM:
23323 case PROCESSOR_ATOM:
23324 case PROCESSOR_K6:
23325 return 2;
23326
23327 case PROCESSOR_PENTIUMPRO:
23328 case PROCESSOR_PENTIUM4:
23329 case PROCESSOR_CORE2_32:
23330 case PROCESSOR_CORE2_64:
23331 case PROCESSOR_COREI7_32:
23332 case PROCESSOR_COREI7_64:
23333 case PROCESSOR_ATHLON:
23334 case PROCESSOR_K8:
23335 case PROCESSOR_AMDFAM10:
23336 case PROCESSOR_NOCONA:
23337 case PROCESSOR_GENERIC32:
23338 case PROCESSOR_GENERIC64:
23339 case PROCESSOR_BDVER1:
23340 case PROCESSOR_BDVER2:
23341 case PROCESSOR_BTVER1:
23342 return 3;
23343
23344 default:
23345 return 1;
23346 }
23347 }
23348
23349 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23350 by DEP_INSN and nothing else set by DEP_INSN. */
23351
23352 static bool
23353 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23354 {
23355 rtx set, set2;
23356
23357 /* Simplify the test for uninteresting insns. */
23358 if (insn_type != TYPE_SETCC
23359 && insn_type != TYPE_ICMOV
23360 && insn_type != TYPE_FCMOV
23361 && insn_type != TYPE_IBR)
23362 return false;
23363
23364 if ((set = single_set (dep_insn)) != 0)
23365 {
23366 set = SET_DEST (set);
23367 set2 = NULL_RTX;
23368 }
23369 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23370 && XVECLEN (PATTERN (dep_insn), 0) == 2
23371 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23372 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23373 {
23374 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23375 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23376 }
23377 else
23378 return false;
23379
23380 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23381 return false;
23382
23383 /* This test is true if the dependent insn reads the flags but
23384 not any other potentially set register. */
23385 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23386 return false;
23387
23388 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23389 return false;
23390
23391 return true;
23392 }
23393
23394 /* Return true iff USE_INSN has a memory address with operands set by
23395 SET_INSN. */
23396
23397 bool
23398 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23399 {
23400 int i;
23401 extract_insn_cached (use_insn);
23402 for (i = recog_data.n_operands - 1; i >= 0; --i)
23403 if (MEM_P (recog_data.operand[i]))
23404 {
23405 rtx addr = XEXP (recog_data.operand[i], 0);
23406 return modified_in_p (addr, set_insn) != 0;
23407 }
23408 return false;
23409 }
23410
23411 static int
23412 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23413 {
23414 enum attr_type insn_type, dep_insn_type;
23415 enum attr_memory memory;
23416 rtx set, set2;
23417 int dep_insn_code_number;
23418
23419 /* Anti and output dependencies have zero cost on all CPUs. */
23420 if (REG_NOTE_KIND (link) != 0)
23421 return 0;
23422
23423 dep_insn_code_number = recog_memoized (dep_insn);
23424
23425 /* If we can't recognize the insns, we can't really do anything. */
23426 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23427 return cost;
23428
23429 insn_type = get_attr_type (insn);
23430 dep_insn_type = get_attr_type (dep_insn);
23431
23432 switch (ix86_tune)
23433 {
23434 case PROCESSOR_PENTIUM:
23435 /* Address Generation Interlock adds a cycle of latency. */
23436 if (insn_type == TYPE_LEA)
23437 {
23438 rtx addr = PATTERN (insn);
23439
23440 if (GET_CODE (addr) == PARALLEL)
23441 addr = XVECEXP (addr, 0, 0);
23442
23443 gcc_assert (GET_CODE (addr) == SET);
23444
23445 addr = SET_SRC (addr);
23446 if (modified_in_p (addr, dep_insn))
23447 cost += 1;
23448 }
23449 else if (ix86_agi_dependent (dep_insn, insn))
23450 cost += 1;
23451
23452 /* ??? Compares pair with jump/setcc. */
23453 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23454 cost = 0;
23455
23456 /* Floating point stores require value to be ready one cycle earlier. */
23457 if (insn_type == TYPE_FMOV
23458 && get_attr_memory (insn) == MEMORY_STORE
23459 && !ix86_agi_dependent (dep_insn, insn))
23460 cost += 1;
23461 break;
23462
23463 case PROCESSOR_PENTIUMPRO:
23464 memory = get_attr_memory (insn);
23465
23466 /* INT->FP conversion is expensive. */
23467 if (get_attr_fp_int_src (dep_insn))
23468 cost += 5;
23469
23470 /* There is one cycle extra latency between an FP op and a store. */
23471 if (insn_type == TYPE_FMOV
23472 && (set = single_set (dep_insn)) != NULL_RTX
23473 && (set2 = single_set (insn)) != NULL_RTX
23474 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23475 && MEM_P (SET_DEST (set2)))
23476 cost += 1;
23477
23478 /* Show ability of reorder buffer to hide latency of load by executing
23479 in parallel with previous instruction in case
23480 previous instruction is not needed to compute the address. */
23481 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23482 && !ix86_agi_dependent (dep_insn, insn))
23483 {
23484 /* Claim moves to take one cycle, as the core can issue one load
23485 at a time and the next load can start a cycle later. */
23486 if (dep_insn_type == TYPE_IMOV
23487 || dep_insn_type == TYPE_FMOV)
23488 cost = 1;
23489 else if (cost > 1)
23490 cost--;
23491 }
23492 break;
23493
23494 case PROCESSOR_K6:
23495 memory = get_attr_memory (insn);
23496
23497 /* The esp dependency is resolved before the instruction is really
23498 finished. */
23499 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23500 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23501 return 1;
23502
23503 /* INT->FP conversion is expensive. */
23504 if (get_attr_fp_int_src (dep_insn))
23505 cost += 5;
23506
23507 /* Show ability of reorder buffer to hide latency of load by executing
23508 in parallel with previous instruction in case
23509 previous instruction is not needed to compute the address. */
23510 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23511 && !ix86_agi_dependent (dep_insn, insn))
23512 {
23513 /* Claim moves to take one cycle, as the core can issue one load
23514 at a time and the next load can start a cycle later. */
23515 if (dep_insn_type == TYPE_IMOV
23516 || dep_insn_type == TYPE_FMOV)
23517 cost = 1;
23518 else if (cost > 2)
23519 cost -= 2;
23520 else
23521 cost = 1;
23522 }
23523 break;
23524
23525 case PROCESSOR_ATHLON:
23526 case PROCESSOR_K8:
23527 case PROCESSOR_AMDFAM10:
23528 case PROCESSOR_BDVER1:
23529 case PROCESSOR_BDVER2:
23530 case PROCESSOR_BTVER1:
23531 case PROCESSOR_ATOM:
23532 case PROCESSOR_GENERIC32:
23533 case PROCESSOR_GENERIC64:
23534 memory = get_attr_memory (insn);
23535
23536 /* Show ability of reorder buffer to hide latency of load by executing
23537 in parallel with previous instruction in case
23538 previous instruction is not needed to compute the address. */
23539 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23540 && !ix86_agi_dependent (dep_insn, insn))
23541 {
23542 enum attr_unit unit = get_attr_unit (insn);
23543 int loadcost = 3;
23544
23545 /* Because of the difference between the length of integer and
23546 floating unit pipeline preparation stages, the memory operands
23547 for floating point are cheaper.
23548
23549 ??? For Athlon the difference is most probably 2. */
23550 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23551 loadcost = 3;
23552 else
23553 loadcost = TARGET_ATHLON ? 2 : 0;
23554
23555 if (cost >= loadcost)
23556 cost -= loadcost;
23557 else
23558 cost = 0;
23559 }
23560
23561 default:
23562 break;
23563 }
23564
23565 return cost;
23566 }
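
/* Example of the Pentium AGI adjustment above (illustrative):
     addl %edx, %ebx ; movl (%ebx), %eax
   The load's address depends on the %ebx value produced by the previous
   insn, so ix86_agi_dependent is true and one cycle is added to the
   dependence cost.  */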
23567
23568 /* How many alternative schedules to try. This should be as wide as the
23569 scheduling freedom in the DFA, but no wider. Making this value too
23570 large results in extra work for the scheduler. */
23571
23572 static int
23573 ia32_multipass_dfa_lookahead (void)
23574 {
23575 switch (ix86_tune)
23576 {
23577 case PROCESSOR_PENTIUM:
23578 return 2;
23579
23580 case PROCESSOR_PENTIUMPRO:
23581 case PROCESSOR_K6:
23582 return 1;
23583
23584 case PROCESSOR_CORE2_32:
23585 case PROCESSOR_CORE2_64:
23586 case PROCESSOR_COREI7_32:
23587 case PROCESSOR_COREI7_64:
23588 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23589 as the number of instructions that can be executed in a cycle, i.e.,
23590 issue_rate. I wonder why tuning for many CPUs does not do this. */
23591 return ix86_issue_rate ();
23592
23593 default:
23594 return 0;
23595 }
23596 }
23597
23598 \f
23599
23600 /* Model decoder of Core 2/i7.
23601 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23602 track the instruction fetch block boundaries and make sure that long
23603 (9+ bytes) instructions are assigned to D0. */
23604
23605 /* Maximum length of an insn that can be handled by
23606 a secondary decoder unit. '8' for Core 2/i7. */
23607 static int core2i7_secondary_decoder_max_insn_size;
23608
23609 /* Ifetch block size, i.e., the number of bytes the decoder reads per cycle.
23610 '16' for Core 2/i7. */
23611 static int core2i7_ifetch_block_size;
23612
23613 /* Maximum number of instructions the decoder can handle per cycle.
23614 '6' for Core 2/i7. */
23615 static int core2i7_ifetch_block_max_insns;
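
/* For orientation, a minimal self-contained sketch (not used by the port)
   of the bookkeeping the hooks below maintain.  The constants mirror the
   Core 2/i7 values set in ix86_sched_init_global; the structure and
   function names here are illustrative only.  */
#if 0
struct ifetch_block_sketch
{
  int len;      /* Bytes already consumed from the current fetch block.  */
  int n_insns;  /* Instructions already decoded in the current cycle.  */
};

/* Return true if an INSN_SIZE-byte insn can still be decoded this cycle.
   FIRST_P is true for the first insn of the cycle, which may go to the
   complex decoder D0 and is therefore not limited to 8 bytes.  */
static bool
sketch_can_issue_p (const struct ifetch_block_sketch *s,
		    int insn_size, bool first_p)
{
  if (!first_p && insn_size > 8)   /* Too long for a secondary decoder.  */
    return false;
  if (s->len + insn_size > 16)     /* Would not fit in the ifetch block.  */
    return false;
  if (s->n_insns + 1 > 6)          /* The decoder is already full.  */
    return false;
  return true;
}
#endif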
23616
23617 typedef struct ix86_first_cycle_multipass_data_ *
23618 ix86_first_cycle_multipass_data_t;
23619 typedef const struct ix86_first_cycle_multipass_data_ *
23620 const_ix86_first_cycle_multipass_data_t;
23621
23622 /* A variable to store target state across calls to max_issue within
23623 one cycle. */
23624 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23625 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23626
23627 /* Initialize DATA. */
23628 static void
23629 core2i7_first_cycle_multipass_init (void *_data)
23630 {
23631 ix86_first_cycle_multipass_data_t data
23632 = (ix86_first_cycle_multipass_data_t) _data;
23633
23634 data->ifetch_block_len = 0;
23635 data->ifetch_block_n_insns = 0;
23636 data->ready_try_change = NULL;
23637 data->ready_try_change_size = 0;
23638 }
23639
23640 /* Advancing the cycle; reset ifetch block counts. */
23641 static void
23642 core2i7_dfa_post_advance_cycle (void)
23643 {
23644 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23645
23646 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23647
23648 data->ifetch_block_len = 0;
23649 data->ifetch_block_n_insns = 0;
23650 }
23651
23652 static int min_insn_size (rtx);
23653
23654 /* Filter out insns from ready_try that the core will not be able to issue
23655 on the current cycle due to decoder restrictions. */
23656 static void
23657 core2i7_first_cycle_multipass_filter_ready_try
23658 (const_ix86_first_cycle_multipass_data_t data,
23659 char *ready_try, int n_ready, bool first_cycle_insn_p)
23660 {
23661 while (n_ready--)
23662 {
23663 rtx insn;
23664 int insn_size;
23665
23666 if (ready_try[n_ready])
23667 continue;
23668
23669 insn = get_ready_element (n_ready);
23670 insn_size = min_insn_size (insn);
23671
23672 if (/* If this is too long an insn for a secondary decoder ... */
23673 (!first_cycle_insn_p
23674 && insn_size > core2i7_secondary_decoder_max_insn_size)
23675 /* ... or it would not fit into the ifetch block ... */
23676 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23677 /* ... or the decoder is full already ... */
23678 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23679 /* ... mask the insn out. */
23680 {
23681 ready_try[n_ready] = 1;
23682
23683 if (data->ready_try_change)
23684 SET_BIT (data->ready_try_change, n_ready);
23685 }
23686 }
23687 }
23688
23689 /* Prepare for a new round of multipass lookahead scheduling. */
23690 static void
23691 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23692 bool first_cycle_insn_p)
23693 {
23694 ix86_first_cycle_multipass_data_t data
23695 = (ix86_first_cycle_multipass_data_t) _data;
23696 const_ix86_first_cycle_multipass_data_t prev_data
23697 = ix86_first_cycle_multipass_data;
23698
23699 /* Restore the state from the end of the previous round. */
23700 data->ifetch_block_len = prev_data->ifetch_block_len;
23701 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23702
23703 /* Filter instructions that cannot be issued on the current cycle due to
23704 decoder restrictions. */
23705 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23706 first_cycle_insn_p);
23707 }
23708
23709 /* INSN is being issued in the current solution. Account for its impact on
23710 the decoder model. */
23711 static void
23712 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23713 rtx insn, const void *_prev_data)
23714 {
23715 ix86_first_cycle_multipass_data_t data
23716 = (ix86_first_cycle_multipass_data_t) _data;
23717 const_ix86_first_cycle_multipass_data_t prev_data
23718 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23719
23720 int insn_size = min_insn_size (insn);
23721
23722 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23723 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23724 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23725 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23726
23727 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23728 if (!data->ready_try_change)
23729 {
23730 data->ready_try_change = sbitmap_alloc (n_ready);
23731 data->ready_try_change_size = n_ready;
23732 }
23733 else if (data->ready_try_change_size < n_ready)
23734 {
23735 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23736 n_ready, 0);
23737 data->ready_try_change_size = n_ready;
23738 }
23739 sbitmap_zero (data->ready_try_change);
23740
23741 /* Filter out insns from ready_try that the core will not be able to issue
23742 on the current cycle due to decoder restrictions. */
23743 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23744 false);
23745 }
23746
23747 /* Revert the effect on ready_try. */
23748 static void
23749 core2i7_first_cycle_multipass_backtrack (const void *_data,
23750 char *ready_try,
23751 int n_ready ATTRIBUTE_UNUSED)
23752 {
23753 const_ix86_first_cycle_multipass_data_t data
23754 = (const_ix86_first_cycle_multipass_data_t) _data;
23755 unsigned int i = 0;
23756 sbitmap_iterator sbi;
23757
23758 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23759 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23760 {
23761 ready_try[i] = 0;
23762 }
23763 }
23764
23765 /* Save the result of multipass lookahead scheduling for the next round. */
23766 static void
23767 core2i7_first_cycle_multipass_end (const void *_data)
23768 {
23769 const_ix86_first_cycle_multipass_data_t data
23770 = (const_ix86_first_cycle_multipass_data_t) _data;
23771 ix86_first_cycle_multipass_data_t next_data
23772 = ix86_first_cycle_multipass_data;
23773
23774 if (data != NULL)
23775 {
23776 next_data->ifetch_block_len = data->ifetch_block_len;
23777 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23778 }
23779 }
23780
23781 /* Deallocate target data. */
23782 static void
23783 core2i7_first_cycle_multipass_fini (void *_data)
23784 {
23785 ix86_first_cycle_multipass_data_t data
23786 = (ix86_first_cycle_multipass_data_t) _data;
23787
23788 if (data->ready_try_change)
23789 {
23790 sbitmap_free (data->ready_try_change);
23791 data->ready_try_change = NULL;
23792 data->ready_try_change_size = 0;
23793 }
23794 }
23795
23796 /* Prepare for scheduling pass. */
23797 static void
23798 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23799 int verbose ATTRIBUTE_UNUSED,
23800 int max_uid ATTRIBUTE_UNUSED)
23801 {
23802 /* Install scheduling hooks for the current CPU. Some of these hooks are used
23803 in time-critical parts of the scheduler, so we only set them up when
23804 they are actually used. */
23805 switch (ix86_tune)
23806 {
23807 case PROCESSOR_CORE2_32:
23808 case PROCESSOR_CORE2_64:
23809 case PROCESSOR_COREI7_32:
23810 case PROCESSOR_COREI7_64:
23811 targetm.sched.dfa_post_advance_cycle
23812 = core2i7_dfa_post_advance_cycle;
23813 targetm.sched.first_cycle_multipass_init
23814 = core2i7_first_cycle_multipass_init;
23815 targetm.sched.first_cycle_multipass_begin
23816 = core2i7_first_cycle_multipass_begin;
23817 targetm.sched.first_cycle_multipass_issue
23818 = core2i7_first_cycle_multipass_issue;
23819 targetm.sched.first_cycle_multipass_backtrack
23820 = core2i7_first_cycle_multipass_backtrack;
23821 targetm.sched.first_cycle_multipass_end
23822 = core2i7_first_cycle_multipass_end;
23823 targetm.sched.first_cycle_multipass_fini
23824 = core2i7_first_cycle_multipass_fini;
23825
23826 /* Set decoder parameters. */
23827 core2i7_secondary_decoder_max_insn_size = 8;
23828 core2i7_ifetch_block_size = 16;
23829 core2i7_ifetch_block_max_insns = 6;
23830 break;
23831
23832 default:
23833 targetm.sched.dfa_post_advance_cycle = NULL;
23834 targetm.sched.first_cycle_multipass_init = NULL;
23835 targetm.sched.first_cycle_multipass_begin = NULL;
23836 targetm.sched.first_cycle_multipass_issue = NULL;
23837 targetm.sched.first_cycle_multipass_backtrack = NULL;
23838 targetm.sched.first_cycle_multipass_end = NULL;
23839 targetm.sched.first_cycle_multipass_fini = NULL;
23840 break;
23841 }
23842 }
23843
23844 \f
23845 /* Compute the alignment given to a constant that is being placed in memory.
23846 EXP is the constant and ALIGN is the alignment that the object would
23847 ordinarily have.
23848 The value of this function is used instead of that alignment to align
23849 the object. */
23850
23851 int
23852 ix86_constant_alignment (tree exp, int align)
23853 {
23854 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23855 || TREE_CODE (exp) == INTEGER_CST)
23856 {
23857 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23858 return 64;
23859 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23860 return 128;
23861 }
23862 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23863 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23864 return BITS_PER_WORD;
23865
23866 return align;
23867 }
23868
23869 /* Compute the alignment for a static variable.
23870 TYPE is the data type, and ALIGN is the alignment that
23871 the object would ordinarily have. The value of this function is used
23872 instead of that alignment to align the object. */
23873
23874 int
23875 ix86_data_alignment (tree type, int align)
23876 {
23877 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23878
23879 if (AGGREGATE_TYPE_P (type)
23880 && TYPE_SIZE (type)
23881 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23882 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23883 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23884 && align < max_align)
23885 align = max_align;
23886
23887 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
23888 to a 16-byte boundary. */
23889 if (TARGET_64BIT)
23890 {
23891 if (AGGREGATE_TYPE_P (type)
23892 && TYPE_SIZE (type)
23893 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23894 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23895 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23896 return 128;
23897 }
23898
23899 if (TREE_CODE (type) == ARRAY_TYPE)
23900 {
23901 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23902 return 64;
23903 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23904 return 128;
23905 }
23906 else if (TREE_CODE (type) == COMPLEX_TYPE)
23907 {
23908
23909 if (TYPE_MODE (type) == DCmode && align < 64)
23910 return 64;
23911 if ((TYPE_MODE (type) == XCmode
23912 || TYPE_MODE (type) == TCmode) && align < 128)
23913 return 128;
23914 }
23915 else if ((TREE_CODE (type) == RECORD_TYPE
23916 || TREE_CODE (type) == UNION_TYPE
23917 || TREE_CODE (type) == QUAL_UNION_TYPE)
23918 && TYPE_FIELDS (type))
23919 {
23920 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23921 return 64;
23922 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23923 return 128;
23924 }
23925 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23926 || TREE_CODE (type) == INTEGER_TYPE)
23927 {
23928 if (TYPE_MODE (type) == DFmode && align < 64)
23929 return 64;
23930 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23931 return 128;
23932 }
23933
23934 return align;
23935 }
23936
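/* Worked examples of the rules above, assuming x86-64, not optimizing for
   size, and MAX_OFILE_ALIGNMENT >= 256; alignments are in bits and the
   array type names are illustrative only.  */
#if 0
  /* An 800-bit char array is a large aggregate and is bumped to the
     256-bit cap.  */
  gcc_assert (ix86_data_alignment (char_array_100_type, 8) == 256);
  /* A 128-bit double array hits the x86-64 16-byte array rule.  */
  gcc_assert (ix86_data_alignment (double_array_2_type, 64) == 128);
  /* A scalar double keeps its natural 64-bit alignment.  */
  gcc_assert (ix86_data_alignment (double_type_node, 64) == 64);
#endif
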
23937 /* Compute the alignment for a local variable or a stack slot. EXP is
23938 the data type or decl itself, MODE is the widest mode available and
23939 ALIGN is the alignment that the object would ordinarily have. The
23940 value of this macro is used instead of that alignment to align the
23941 object. */
23942
23943 unsigned int
23944 ix86_local_alignment (tree exp, enum machine_mode mode,
23945 unsigned int align)
23946 {
23947 tree type, decl;
23948
23949 if (exp && DECL_P (exp))
23950 {
23951 type = TREE_TYPE (exp);
23952 decl = exp;
23953 }
23954 else
23955 {
23956 type = exp;
23957 decl = NULL;
23958 }
23959
23960 /* Don't do dynamic stack realignment for long long objects with
23961 -mpreferred-stack-boundary=2. */
23962 if (!TARGET_64BIT
23963 && align == 64
23964 && ix86_preferred_stack_boundary < 64
23965 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
23966 && (!type || !TYPE_USER_ALIGN (type))
23967 && (!decl || !DECL_USER_ALIGN (decl)))
23968 align = 32;
23969
23970 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
23971 register in MODE. Return the larger of the XFmode and DFmode
23972 alignments. */
23973 if (!type)
23974 {
23975 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
23976 align = GET_MODE_ALIGNMENT (DFmode);
23977 return align;
23978 }
23979
23980 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
23981 to a 16-byte boundary. The exact wording is:
23982
23983 An array uses the same alignment as its elements, except that a local or
23984 global array variable of length at least 16 bytes or
23985 a C99 variable-length array variable always has alignment of at least 16 bytes.
23986
23987 This was added to allow the use of aligned SSE instructions on arrays. The
23988 rule is meant for static storage (where the compiler cannot do the analysis
23989 by itself). We follow it for automatic variables only when convenient:
23990 we fully control everything in the function being compiled, and functions
23991 from other units cannot rely on the alignment.
23992 
23993 Exclude the va_list type. It is the common case of a local array where
23994 we cannot benefit from the alignment. */
23995 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
23996 && TARGET_SSE)
23997 {
23998 if (AGGREGATE_TYPE_P (type)
23999 && (va_list_type_node == NULL_TREE
24000 || (TYPE_MAIN_VARIANT (type)
24001 != TYPE_MAIN_VARIANT (va_list_type_node)))
24002 && TYPE_SIZE (type)
24003 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24004 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24005 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24006 return 128;
24007 }
24008 if (TREE_CODE (type) == ARRAY_TYPE)
24009 {
24010 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24011 return 64;
24012 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24013 return 128;
24014 }
24015 else if (TREE_CODE (type) == COMPLEX_TYPE)
24016 {
24017 if (TYPE_MODE (type) == DCmode && align < 64)
24018 return 64;
24019 if ((TYPE_MODE (type) == XCmode
24020 || TYPE_MODE (type) == TCmode) && align < 128)
24021 return 128;
24022 }
24023 else if ((TREE_CODE (type) == RECORD_TYPE
24024 || TREE_CODE (type) == UNION_TYPE
24025 || TREE_CODE (type) == QUAL_UNION_TYPE)
24026 && TYPE_FIELDS (type))
24027 {
24028 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24029 return 64;
24030 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24031 return 128;
24032 }
24033 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24034 || TREE_CODE (type) == INTEGER_TYPE)
24035 {
24036
24037 if (TYPE_MODE (type) == DFmode && align < 64)
24038 return 64;
24039 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24040 return 128;
24041 }
24042 return align;
24043 }
24044
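/* Worked examples of the rules above; alignments are in bits and the
   array type name is illustrative only.  */
#if 0
  /* x86-64, SSE enabled, optimizing for speed: a 32-byte local char
     array qualifies for the ABI rule above and gets 16-byte (128-bit)
     alignment so aligned SSE accesses can be used.  */
  gcc_assert (ix86_local_alignment (char_array_32_type, VOIDmode, 8) == 128);
  /* 32-bit mode with -mpreferred-stack-boundary=2: a long long local is
     not dynamically realigned and stays at 32-bit alignment.  */
  gcc_assert (ix86_local_alignment (long_long_integer_type_node,
				    DImode, 64) == 32);
#endif
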
24045 /* Compute the minimum required alignment for dynamic stack realignment
24046 purposes for a local variable, parameter or a stack slot. EXP is
24047 the data type or decl itself, MODE is its mode and ALIGN is the
24048 alignment that the object would ordinarily have. */
24049
24050 unsigned int
24051 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24052 unsigned int align)
24053 {
24054 tree type, decl;
24055
24056 if (exp && DECL_P (exp))
24057 {
24058 type = TREE_TYPE (exp);
24059 decl = exp;
24060 }
24061 else
24062 {
24063 type = exp;
24064 decl = NULL;
24065 }
24066
24067 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24068 return align;
24069
24070 /* Don't do dynamic stack realignment for long long objects with
24071 -mpreferred-stack-boundary=2. */
24072 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24073 && (!type || !TYPE_USER_ALIGN (type))
24074 && (!decl || !DECL_USER_ALIGN (decl)))
24075 return 32;
24076
24077 return align;
24078 }
24079 \f
24080 /* Find a location for the static chain incoming to a nested function.
24081 This is a register, unless all free registers are used by arguments. */
24082
24083 static rtx
24084 ix86_static_chain (const_tree fndecl, bool incoming_p)
24085 {
24086 unsigned regno;
24087
24088 if (!DECL_STATIC_CHAIN (fndecl))
24089 return NULL;
24090
24091 if (TARGET_64BIT)
24092 {
24093 /* We always use R10 in 64-bit mode. */
24094 regno = R10_REG;
24095 }
24096 else
24097 {
24098 tree fntype;
24099 unsigned int ccvt;
24100
24101 /* By default in 32-bit mode we use ECX to pass the static chain. */
24102 regno = CX_REG;
24103
24104 fntype = TREE_TYPE (fndecl);
24105 ccvt = ix86_get_callcvt (fntype);
24106 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24107 {
24108 /* Fastcall functions use ecx/edx for arguments, which leaves
24109 us with EAX for the static chain.
24110 Thiscall functions use ecx for arguments, which also
24111 leaves us with EAX for the static chain. */
24112 regno = AX_REG;
24113 }
24114 else if (ix86_function_regparm (fntype, fndecl) == 3)
24115 {
24116 /* For regparm 3, we have no free call-clobbered registers in
24117 which to store the static chain. In order to implement this,
24118 we have the trampoline push the static chain to the stack.
24119 However, we can't push a value below the return address when
24120 we call the nested function directly, so we have to use an
24121 alternate entry point. For this we use ESI, and have the
24122 alternate entry point push ESI, so that things appear the
24123 same once we're executing the nested function. */
24124 if (incoming_p)
24125 {
24126 if (fndecl == current_function_decl)
24127 ix86_static_chain_on_stack = true;
24128 return gen_frame_mem (SImode,
24129 plus_constant (arg_pointer_rtx, -8));
24130 }
24131 regno = SI_REG;
24132 }
24133 }
24134
24135 return gen_rtx_REG (Pmode, regno);
24136 }
24137
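/* A usage sketch, in GNU C, of the machinery above: taking the address of
   a nested function forces a trampoline, and the register chosen here is
   what ix86_trampoline_init below materializes.  The function names are
   illustrative only.  */
#if 0
static int
outer (int x)
{
  int nested (int y) { return x + y; }  /* Needs the static chain.  */
  int (*fp) (int) = nested;             /* Forces a trampoline.  */
  return fp (1);                        /* 64-bit: chain in %r10;
					   32-bit default: %ecx.  */
}
#endif
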
24138 /* Emit RTL insns to initialize the variable parts of a trampoline.
24139 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24140 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24141 to be passed to the target function. */
24142
24143 static void
24144 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24145 {
24146 rtx mem, fnaddr;
24147 int opcode;
24148 int offset = 0;
24149
24150 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24151
24152 if (TARGET_64BIT)
24153 {
24154 int size;
24155
24156 /* Load the function address into r11. Try to load the address using
24157 the shorter movl instead of movabs. We may want to support
24158 movq for kernel mode, but the kernel does not use trampolines at
24159 the moment. */
24160 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24161 {
24162 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24163
24164 mem = adjust_address (m_tramp, HImode, offset);
24165 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24166
24167 mem = adjust_address (m_tramp, SImode, offset + 2);
24168 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24169 offset += 6;
24170 }
24171 else
24172 {
24173 mem = adjust_address (m_tramp, HImode, offset);
24174 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24175
24176 mem = adjust_address (m_tramp, DImode, offset + 2);
24177 emit_move_insn (mem, fnaddr);
24178 offset += 10;
24179 }
24180
24181 /* Load the static chain into r10 using movabs. Use the
24182 shorter movl instead of movabs for x32. */
24183 if (TARGET_X32)
24184 {
24185 opcode = 0xba41;
24186 size = 6;
24187 }
24188 else
24189 {
24190 opcode = 0xba49;
24191 size = 10;
24192 }
24193
24194 mem = adjust_address (m_tramp, HImode, offset);
24195 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24196
24197 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24198 emit_move_insn (mem, chain_value);
24199 offset += size;
24200
24201 /* Jump to r11; the last (unused) byte is a nop, only there to
24202 pad the write out to a single 32-bit store. */
24203 mem = adjust_address (m_tramp, SImode, offset);
24204 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24205 offset += 4;
24206 }
24207 else
24208 {
24209 rtx disp, chain;
24210
24211 /* Depending on the static chain location, either load a register
24212 with a constant, or push the constant to the stack. All of the
24213 instructions are the same size. */
24214 chain = ix86_static_chain (fndecl, true);
24215 if (REG_P (chain))
24216 {
24217 switch (REGNO (chain))
24218 {
24219 case AX_REG:
24220 opcode = 0xb8; break;
24221 case CX_REG:
24222 opcode = 0xb9; break;
24223 default:
24224 gcc_unreachable ();
24225 }
24226 }
24227 else
24228 opcode = 0x68;
24229
24230 mem = adjust_address (m_tramp, QImode, offset);
24231 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24232
24233 mem = adjust_address (m_tramp, SImode, offset + 1);
24234 emit_move_insn (mem, chain_value);
24235 offset += 5;
24236
24237 mem = adjust_address (m_tramp, QImode, offset);
24238 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24239
24240 mem = adjust_address (m_tramp, SImode, offset + 1);
24241
24242 /* Compute the offset from the end of the jmp to the target function.
24243 In the case where the trampoline stores the static chain on
24244 the stack, we need to skip the first insn, which pushes the
24245 (call-saved) static chain register; this push is 1 byte. */
24246 offset += 5;
24247 disp = expand_binop (SImode, sub_optab, fnaddr,
24248 plus_constant (XEXP (m_tramp, 0),
24249 offset - (MEM_P (chain) ? 1 : 0)),
24250 NULL_RTX, 1, OPTAB_DIRECT);
24251 emit_move_insn (mem, disp);
24252 }
24253
24254 gcc_assert (offset <= TRAMPOLINE_SIZE);
24255
24256 #ifdef HAVE_ENABLE_EXECUTE_STACK
24257 #ifdef CHECK_EXECUTE_STACK_ENABLED
24258 if (CHECK_EXECUTE_STACK_ENABLED)
24259 #endif
24260 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24261 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24262 #endif
24263 }
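
/* For reference, the byte sequences the code above emits (immediates are
   little-endian; derived from the opcodes used):

   32-bit, static chain in %ecx:
     b9 <chain:4>		movl  $chain, %ecx
     e9 <rel32:4>		jmp   target
   (0xb8 targets %eax for fastcall/thiscall; 0x68 pushes the chain for
    the regparm-3 case.)

   64-bit, full-width addresses:
     49 bb <fnaddr:8>		movabs $fnaddr, %r11
     49 ba <chain:8>		movabs $chain,  %r10
     49 ff e3 90		jmp *%r11; nop (pad)
   (The 41 bb / 41 ba forms are the shorter movl variants used when the
    value fits in 32 bits or for x32.)  */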
24264 \f
24265 /* The following file contains several enumerations and data structures
24266 built from the definitions in i386-builtin-types.def. */
24267
24268 #include "i386-builtin-types.inc"
24269
24270 /* Table for the ix86 builtin non-function types. */
24271 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24272
24273 /* Retrieve an element from the above table, building some of
24274 the types lazily. */
24275
24276 static tree
24277 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24278 {
24279 unsigned int index;
24280 tree type, itype;
24281
24282 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24283
24284 type = ix86_builtin_type_tab[(int) tcode];
24285 if (type != NULL)
24286 return type;
24287
24288 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24289 if (tcode <= IX86_BT_LAST_VECT)
24290 {
24291 enum machine_mode mode;
24292
24293 index = tcode - IX86_BT_LAST_PRIM - 1;
24294 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24295 mode = ix86_builtin_type_vect_mode[index];
24296
24297 type = build_vector_type_for_mode (itype, mode);
24298 }
24299 else
24300 {
24301 int quals;
24302
24303 index = tcode - IX86_BT_LAST_VECT - 1;
24304 if (tcode <= IX86_BT_LAST_PTR)
24305 quals = TYPE_UNQUALIFIED;
24306 else
24307 quals = TYPE_QUAL_CONST;
24308
24309 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24310 if (quals != TYPE_UNQUALIFIED)
24311 itype = build_qualified_type (itype, quals);
24312
24313 type = build_pointer_type (itype);
24314 }
24315
24316 ix86_builtin_type_tab[(int) tcode] = type;
24317 return type;
24318 }
24319
24320 /* Table for the ix86 builtin function types. */
24321 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24322
24323 /* Retrieve an element from the above table, building some of
24324 the types lazily. */
24325
24326 static tree
24327 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24328 {
24329 tree type;
24330
24331 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24332
24333 type = ix86_builtin_func_type_tab[(int) tcode];
24334 if (type != NULL)
24335 return type;
24336
24337 if (tcode <= IX86_BT_LAST_FUNC)
24338 {
24339 unsigned start = ix86_builtin_func_start[(int) tcode];
24340 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24341 tree rtype, atype, args = void_list_node;
24342 unsigned i;
24343
24344 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24345 for (i = after - 1; i > start; --i)
24346 {
24347 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24348 args = tree_cons (NULL, atype, args);
24349 }
24350
24351 type = build_function_type (rtype, args);
24352 }
24353 else
24354 {
24355 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24356 enum ix86_builtin_func_type icode;
24357
24358 icode = ix86_builtin_func_alias_base[index];
24359 type = ix86_get_builtin_func_type (icode);
24360 }
24361
24362 ix86_builtin_func_type_tab[(int) tcode] = type;
24363 return type;
24364 }
24365
24366
24367 /* Codes for all the SSE/MMX builtins. */
24368 enum ix86_builtins
24369 {
24370 IX86_BUILTIN_ADDPS,
24371 IX86_BUILTIN_ADDSS,
24372 IX86_BUILTIN_DIVPS,
24373 IX86_BUILTIN_DIVSS,
24374 IX86_BUILTIN_MULPS,
24375 IX86_BUILTIN_MULSS,
24376 IX86_BUILTIN_SUBPS,
24377 IX86_BUILTIN_SUBSS,
24378
24379 IX86_BUILTIN_CMPEQPS,
24380 IX86_BUILTIN_CMPLTPS,
24381 IX86_BUILTIN_CMPLEPS,
24382 IX86_BUILTIN_CMPGTPS,
24383 IX86_BUILTIN_CMPGEPS,
24384 IX86_BUILTIN_CMPNEQPS,
24385 IX86_BUILTIN_CMPNLTPS,
24386 IX86_BUILTIN_CMPNLEPS,
24387 IX86_BUILTIN_CMPNGTPS,
24388 IX86_BUILTIN_CMPNGEPS,
24389 IX86_BUILTIN_CMPORDPS,
24390 IX86_BUILTIN_CMPUNORDPS,
24391 IX86_BUILTIN_CMPEQSS,
24392 IX86_BUILTIN_CMPLTSS,
24393 IX86_BUILTIN_CMPLESS,
24394 IX86_BUILTIN_CMPNEQSS,
24395 IX86_BUILTIN_CMPNLTSS,
24396 IX86_BUILTIN_CMPNLESS,
24397 IX86_BUILTIN_CMPNGTSS,
24398 IX86_BUILTIN_CMPNGESS,
24399 IX86_BUILTIN_CMPORDSS,
24400 IX86_BUILTIN_CMPUNORDSS,
24401
24402 IX86_BUILTIN_COMIEQSS,
24403 IX86_BUILTIN_COMILTSS,
24404 IX86_BUILTIN_COMILESS,
24405 IX86_BUILTIN_COMIGTSS,
24406 IX86_BUILTIN_COMIGESS,
24407 IX86_BUILTIN_COMINEQSS,
24408 IX86_BUILTIN_UCOMIEQSS,
24409 IX86_BUILTIN_UCOMILTSS,
24410 IX86_BUILTIN_UCOMILESS,
24411 IX86_BUILTIN_UCOMIGTSS,
24412 IX86_BUILTIN_UCOMIGESS,
24413 IX86_BUILTIN_UCOMINEQSS,
24414
24415 IX86_BUILTIN_CVTPI2PS,
24416 IX86_BUILTIN_CVTPS2PI,
24417 IX86_BUILTIN_CVTSI2SS,
24418 IX86_BUILTIN_CVTSI642SS,
24419 IX86_BUILTIN_CVTSS2SI,
24420 IX86_BUILTIN_CVTSS2SI64,
24421 IX86_BUILTIN_CVTTPS2PI,
24422 IX86_BUILTIN_CVTTSS2SI,
24423 IX86_BUILTIN_CVTTSS2SI64,
24424
24425 IX86_BUILTIN_MAXPS,
24426 IX86_BUILTIN_MAXSS,
24427 IX86_BUILTIN_MINPS,
24428 IX86_BUILTIN_MINSS,
24429
24430 IX86_BUILTIN_LOADUPS,
24431 IX86_BUILTIN_STOREUPS,
24432 IX86_BUILTIN_MOVSS,
24433
24434 IX86_BUILTIN_MOVHLPS,
24435 IX86_BUILTIN_MOVLHPS,
24436 IX86_BUILTIN_LOADHPS,
24437 IX86_BUILTIN_LOADLPS,
24438 IX86_BUILTIN_STOREHPS,
24439 IX86_BUILTIN_STORELPS,
24440
24441 IX86_BUILTIN_MASKMOVQ,
24442 IX86_BUILTIN_MOVMSKPS,
24443 IX86_BUILTIN_PMOVMSKB,
24444
24445 IX86_BUILTIN_MOVNTPS,
24446 IX86_BUILTIN_MOVNTQ,
24447
24448 IX86_BUILTIN_LOADDQU,
24449 IX86_BUILTIN_STOREDQU,
24450
24451 IX86_BUILTIN_PACKSSWB,
24452 IX86_BUILTIN_PACKSSDW,
24453 IX86_BUILTIN_PACKUSWB,
24454
24455 IX86_BUILTIN_PADDB,
24456 IX86_BUILTIN_PADDW,
24457 IX86_BUILTIN_PADDD,
24458 IX86_BUILTIN_PADDQ,
24459 IX86_BUILTIN_PADDSB,
24460 IX86_BUILTIN_PADDSW,
24461 IX86_BUILTIN_PADDUSB,
24462 IX86_BUILTIN_PADDUSW,
24463 IX86_BUILTIN_PSUBB,
24464 IX86_BUILTIN_PSUBW,
24465 IX86_BUILTIN_PSUBD,
24466 IX86_BUILTIN_PSUBQ,
24467 IX86_BUILTIN_PSUBSB,
24468 IX86_BUILTIN_PSUBSW,
24469 IX86_BUILTIN_PSUBUSB,
24470 IX86_BUILTIN_PSUBUSW,
24471
24472 IX86_BUILTIN_PAND,
24473 IX86_BUILTIN_PANDN,
24474 IX86_BUILTIN_POR,
24475 IX86_BUILTIN_PXOR,
24476
24477 IX86_BUILTIN_PAVGB,
24478 IX86_BUILTIN_PAVGW,
24479
24480 IX86_BUILTIN_PCMPEQB,
24481 IX86_BUILTIN_PCMPEQW,
24482 IX86_BUILTIN_PCMPEQD,
24483 IX86_BUILTIN_PCMPGTB,
24484 IX86_BUILTIN_PCMPGTW,
24485 IX86_BUILTIN_PCMPGTD,
24486
24487 IX86_BUILTIN_PMADDWD,
24488
24489 IX86_BUILTIN_PMAXSW,
24490 IX86_BUILTIN_PMAXUB,
24491 IX86_BUILTIN_PMINSW,
24492 IX86_BUILTIN_PMINUB,
24493
24494 IX86_BUILTIN_PMULHUW,
24495 IX86_BUILTIN_PMULHW,
24496 IX86_BUILTIN_PMULLW,
24497
24498 IX86_BUILTIN_PSADBW,
24499 IX86_BUILTIN_PSHUFW,
24500
24501 IX86_BUILTIN_PSLLW,
24502 IX86_BUILTIN_PSLLD,
24503 IX86_BUILTIN_PSLLQ,
24504 IX86_BUILTIN_PSRAW,
24505 IX86_BUILTIN_PSRAD,
24506 IX86_BUILTIN_PSRLW,
24507 IX86_BUILTIN_PSRLD,
24508 IX86_BUILTIN_PSRLQ,
24509 IX86_BUILTIN_PSLLWI,
24510 IX86_BUILTIN_PSLLDI,
24511 IX86_BUILTIN_PSLLQI,
24512 IX86_BUILTIN_PSRAWI,
24513 IX86_BUILTIN_PSRADI,
24514 IX86_BUILTIN_PSRLWI,
24515 IX86_BUILTIN_PSRLDI,
24516 IX86_BUILTIN_PSRLQI,
24517
24518 IX86_BUILTIN_PUNPCKHBW,
24519 IX86_BUILTIN_PUNPCKHWD,
24520 IX86_BUILTIN_PUNPCKHDQ,
24521 IX86_BUILTIN_PUNPCKLBW,
24522 IX86_BUILTIN_PUNPCKLWD,
24523 IX86_BUILTIN_PUNPCKLDQ,
24524
24525 IX86_BUILTIN_SHUFPS,
24526
24527 IX86_BUILTIN_RCPPS,
24528 IX86_BUILTIN_RCPSS,
24529 IX86_BUILTIN_RSQRTPS,
24530 IX86_BUILTIN_RSQRTPS_NR,
24531 IX86_BUILTIN_RSQRTSS,
24532 IX86_BUILTIN_RSQRTF,
24533 IX86_BUILTIN_SQRTPS,
24534 IX86_BUILTIN_SQRTPS_NR,
24535 IX86_BUILTIN_SQRTSS,
24536
24537 IX86_BUILTIN_UNPCKHPS,
24538 IX86_BUILTIN_UNPCKLPS,
24539
24540 IX86_BUILTIN_ANDPS,
24541 IX86_BUILTIN_ANDNPS,
24542 IX86_BUILTIN_ORPS,
24543 IX86_BUILTIN_XORPS,
24544
24545 IX86_BUILTIN_EMMS,
24546 IX86_BUILTIN_LDMXCSR,
24547 IX86_BUILTIN_STMXCSR,
24548 IX86_BUILTIN_SFENCE,
24549
24550 /* 3DNow! Original */
24551 IX86_BUILTIN_FEMMS,
24552 IX86_BUILTIN_PAVGUSB,
24553 IX86_BUILTIN_PF2ID,
24554 IX86_BUILTIN_PFACC,
24555 IX86_BUILTIN_PFADD,
24556 IX86_BUILTIN_PFCMPEQ,
24557 IX86_BUILTIN_PFCMPGE,
24558 IX86_BUILTIN_PFCMPGT,
24559 IX86_BUILTIN_PFMAX,
24560 IX86_BUILTIN_PFMIN,
24561 IX86_BUILTIN_PFMUL,
24562 IX86_BUILTIN_PFRCP,
24563 IX86_BUILTIN_PFRCPIT1,
24564 IX86_BUILTIN_PFRCPIT2,
24565 IX86_BUILTIN_PFRSQIT1,
24566 IX86_BUILTIN_PFRSQRT,
24567 IX86_BUILTIN_PFSUB,
24568 IX86_BUILTIN_PFSUBR,
24569 IX86_BUILTIN_PI2FD,
24570 IX86_BUILTIN_PMULHRW,
24571
24572 /* 3DNow! Athlon Extensions */
24573 IX86_BUILTIN_PF2IW,
24574 IX86_BUILTIN_PFNACC,
24575 IX86_BUILTIN_PFPNACC,
24576 IX86_BUILTIN_PI2FW,
24577 IX86_BUILTIN_PSWAPDSI,
24578 IX86_BUILTIN_PSWAPDSF,
24579
24580 /* SSE2 */
24581 IX86_BUILTIN_ADDPD,
24582 IX86_BUILTIN_ADDSD,
24583 IX86_BUILTIN_DIVPD,
24584 IX86_BUILTIN_DIVSD,
24585 IX86_BUILTIN_MULPD,
24586 IX86_BUILTIN_MULSD,
24587 IX86_BUILTIN_SUBPD,
24588 IX86_BUILTIN_SUBSD,
24589
24590 IX86_BUILTIN_CMPEQPD,
24591 IX86_BUILTIN_CMPLTPD,
24592 IX86_BUILTIN_CMPLEPD,
24593 IX86_BUILTIN_CMPGTPD,
24594 IX86_BUILTIN_CMPGEPD,
24595 IX86_BUILTIN_CMPNEQPD,
24596 IX86_BUILTIN_CMPNLTPD,
24597 IX86_BUILTIN_CMPNLEPD,
24598 IX86_BUILTIN_CMPNGTPD,
24599 IX86_BUILTIN_CMPNGEPD,
24600 IX86_BUILTIN_CMPORDPD,
24601 IX86_BUILTIN_CMPUNORDPD,
24602 IX86_BUILTIN_CMPEQSD,
24603 IX86_BUILTIN_CMPLTSD,
24604 IX86_BUILTIN_CMPLESD,
24605 IX86_BUILTIN_CMPNEQSD,
24606 IX86_BUILTIN_CMPNLTSD,
24607 IX86_BUILTIN_CMPNLESD,
24608 IX86_BUILTIN_CMPORDSD,
24609 IX86_BUILTIN_CMPUNORDSD,
24610
24611 IX86_BUILTIN_COMIEQSD,
24612 IX86_BUILTIN_COMILTSD,
24613 IX86_BUILTIN_COMILESD,
24614 IX86_BUILTIN_COMIGTSD,
24615 IX86_BUILTIN_COMIGESD,
24616 IX86_BUILTIN_COMINEQSD,
24617 IX86_BUILTIN_UCOMIEQSD,
24618 IX86_BUILTIN_UCOMILTSD,
24619 IX86_BUILTIN_UCOMILESD,
24620 IX86_BUILTIN_UCOMIGTSD,
24621 IX86_BUILTIN_UCOMIGESD,
24622 IX86_BUILTIN_UCOMINEQSD,
24623
24624 IX86_BUILTIN_MAXPD,
24625 IX86_BUILTIN_MAXSD,
24626 IX86_BUILTIN_MINPD,
24627 IX86_BUILTIN_MINSD,
24628
24629 IX86_BUILTIN_ANDPD,
24630 IX86_BUILTIN_ANDNPD,
24631 IX86_BUILTIN_ORPD,
24632 IX86_BUILTIN_XORPD,
24633
24634 IX86_BUILTIN_SQRTPD,
24635 IX86_BUILTIN_SQRTSD,
24636
24637 IX86_BUILTIN_UNPCKHPD,
24638 IX86_BUILTIN_UNPCKLPD,
24639
24640 IX86_BUILTIN_SHUFPD,
24641
24642 IX86_BUILTIN_LOADUPD,
24643 IX86_BUILTIN_STOREUPD,
24644 IX86_BUILTIN_MOVSD,
24645
24646 IX86_BUILTIN_LOADHPD,
24647 IX86_BUILTIN_LOADLPD,
24648
24649 IX86_BUILTIN_CVTDQ2PD,
24650 IX86_BUILTIN_CVTDQ2PS,
24651
24652 IX86_BUILTIN_CVTPD2DQ,
24653 IX86_BUILTIN_CVTPD2PI,
24654 IX86_BUILTIN_CVTPD2PS,
24655 IX86_BUILTIN_CVTTPD2DQ,
24656 IX86_BUILTIN_CVTTPD2PI,
24657
24658 IX86_BUILTIN_CVTPI2PD,
24659 IX86_BUILTIN_CVTSI2SD,
24660 IX86_BUILTIN_CVTSI642SD,
24661
24662 IX86_BUILTIN_CVTSD2SI,
24663 IX86_BUILTIN_CVTSD2SI64,
24664 IX86_BUILTIN_CVTSD2SS,
24665 IX86_BUILTIN_CVTSS2SD,
24666 IX86_BUILTIN_CVTTSD2SI,
24667 IX86_BUILTIN_CVTTSD2SI64,
24668
24669 IX86_BUILTIN_CVTPS2DQ,
24670 IX86_BUILTIN_CVTPS2PD,
24671 IX86_BUILTIN_CVTTPS2DQ,
24672
24673 IX86_BUILTIN_MOVNTI,
24674 IX86_BUILTIN_MOVNTPD,
24675 IX86_BUILTIN_MOVNTDQ,
24676
24677 IX86_BUILTIN_MOVQ128,
24678
24679 /* SSE2 MMX */
24680 IX86_BUILTIN_MASKMOVDQU,
24681 IX86_BUILTIN_MOVMSKPD,
24682 IX86_BUILTIN_PMOVMSKB128,
24683
24684 IX86_BUILTIN_PACKSSWB128,
24685 IX86_BUILTIN_PACKSSDW128,
24686 IX86_BUILTIN_PACKUSWB128,
24687
24688 IX86_BUILTIN_PADDB128,
24689 IX86_BUILTIN_PADDW128,
24690 IX86_BUILTIN_PADDD128,
24691 IX86_BUILTIN_PADDQ128,
24692 IX86_BUILTIN_PADDSB128,
24693 IX86_BUILTIN_PADDSW128,
24694 IX86_BUILTIN_PADDUSB128,
24695 IX86_BUILTIN_PADDUSW128,
24696 IX86_BUILTIN_PSUBB128,
24697 IX86_BUILTIN_PSUBW128,
24698 IX86_BUILTIN_PSUBD128,
24699 IX86_BUILTIN_PSUBQ128,
24700 IX86_BUILTIN_PSUBSB128,
24701 IX86_BUILTIN_PSUBSW128,
24702 IX86_BUILTIN_PSUBUSB128,
24703 IX86_BUILTIN_PSUBUSW128,
24704
24705 IX86_BUILTIN_PAND128,
24706 IX86_BUILTIN_PANDN128,
24707 IX86_BUILTIN_POR128,
24708 IX86_BUILTIN_PXOR128,
24709
24710 IX86_BUILTIN_PAVGB128,
24711 IX86_BUILTIN_PAVGW128,
24712
24713 IX86_BUILTIN_PCMPEQB128,
24714 IX86_BUILTIN_PCMPEQW128,
24715 IX86_BUILTIN_PCMPEQD128,
24716 IX86_BUILTIN_PCMPGTB128,
24717 IX86_BUILTIN_PCMPGTW128,
24718 IX86_BUILTIN_PCMPGTD128,
24719
24720 IX86_BUILTIN_PMADDWD128,
24721
24722 IX86_BUILTIN_PMAXSW128,
24723 IX86_BUILTIN_PMAXUB128,
24724 IX86_BUILTIN_PMINSW128,
24725 IX86_BUILTIN_PMINUB128,
24726
24727 IX86_BUILTIN_PMULUDQ,
24728 IX86_BUILTIN_PMULUDQ128,
24729 IX86_BUILTIN_PMULHUW128,
24730 IX86_BUILTIN_PMULHW128,
24731 IX86_BUILTIN_PMULLW128,
24732
24733 IX86_BUILTIN_PSADBW128,
24734 IX86_BUILTIN_PSHUFHW,
24735 IX86_BUILTIN_PSHUFLW,
24736 IX86_BUILTIN_PSHUFD,
24737
24738 IX86_BUILTIN_PSLLDQI128,
24739 IX86_BUILTIN_PSLLWI128,
24740 IX86_BUILTIN_PSLLDI128,
24741 IX86_BUILTIN_PSLLQI128,
24742 IX86_BUILTIN_PSRAWI128,
24743 IX86_BUILTIN_PSRADI128,
24744 IX86_BUILTIN_PSRLDQI128,
24745 IX86_BUILTIN_PSRLWI128,
24746 IX86_BUILTIN_PSRLDI128,
24747 IX86_BUILTIN_PSRLQI128,
24748
24749 IX86_BUILTIN_PSLLDQ128,
24750 IX86_BUILTIN_PSLLW128,
24751 IX86_BUILTIN_PSLLD128,
24752 IX86_BUILTIN_PSLLQ128,
24753 IX86_BUILTIN_PSRAW128,
24754 IX86_BUILTIN_PSRAD128,
24755 IX86_BUILTIN_PSRLW128,
24756 IX86_BUILTIN_PSRLD128,
24757 IX86_BUILTIN_PSRLQ128,
24758
24759 IX86_BUILTIN_PUNPCKHBW128,
24760 IX86_BUILTIN_PUNPCKHWD128,
24761 IX86_BUILTIN_PUNPCKHDQ128,
24762 IX86_BUILTIN_PUNPCKHQDQ128,
24763 IX86_BUILTIN_PUNPCKLBW128,
24764 IX86_BUILTIN_PUNPCKLWD128,
24765 IX86_BUILTIN_PUNPCKLDQ128,
24766 IX86_BUILTIN_PUNPCKLQDQ128,
24767
24768 IX86_BUILTIN_CLFLUSH,
24769 IX86_BUILTIN_MFENCE,
24770 IX86_BUILTIN_LFENCE,
24771 IX86_BUILTIN_PAUSE,
24772
24773 IX86_BUILTIN_BSRSI,
24774 IX86_BUILTIN_BSRDI,
24775 IX86_BUILTIN_RDPMC,
24776 IX86_BUILTIN_RDTSC,
24777 IX86_BUILTIN_RDTSCP,
24778 IX86_BUILTIN_ROLQI,
24779 IX86_BUILTIN_ROLHI,
24780 IX86_BUILTIN_RORQI,
24781 IX86_BUILTIN_RORHI,
24782
24783 /* SSE3. */
24784 IX86_BUILTIN_ADDSUBPS,
24785 IX86_BUILTIN_HADDPS,
24786 IX86_BUILTIN_HSUBPS,
24787 IX86_BUILTIN_MOVSHDUP,
24788 IX86_BUILTIN_MOVSLDUP,
24789 IX86_BUILTIN_ADDSUBPD,
24790 IX86_BUILTIN_HADDPD,
24791 IX86_BUILTIN_HSUBPD,
24792 IX86_BUILTIN_LDDQU,
24793
24794 IX86_BUILTIN_MONITOR,
24795 IX86_BUILTIN_MWAIT,
24796
24797 /* SSSE3. */
24798 IX86_BUILTIN_PHADDW,
24799 IX86_BUILTIN_PHADDD,
24800 IX86_BUILTIN_PHADDSW,
24801 IX86_BUILTIN_PHSUBW,
24802 IX86_BUILTIN_PHSUBD,
24803 IX86_BUILTIN_PHSUBSW,
24804 IX86_BUILTIN_PMADDUBSW,
24805 IX86_BUILTIN_PMULHRSW,
24806 IX86_BUILTIN_PSHUFB,
24807 IX86_BUILTIN_PSIGNB,
24808 IX86_BUILTIN_PSIGNW,
24809 IX86_BUILTIN_PSIGND,
24810 IX86_BUILTIN_PALIGNR,
24811 IX86_BUILTIN_PABSB,
24812 IX86_BUILTIN_PABSW,
24813 IX86_BUILTIN_PABSD,
24814
24815 IX86_BUILTIN_PHADDW128,
24816 IX86_BUILTIN_PHADDD128,
24817 IX86_BUILTIN_PHADDSW128,
24818 IX86_BUILTIN_PHSUBW128,
24819 IX86_BUILTIN_PHSUBD128,
24820 IX86_BUILTIN_PHSUBSW128,
24821 IX86_BUILTIN_PMADDUBSW128,
24822 IX86_BUILTIN_PMULHRSW128,
24823 IX86_BUILTIN_PSHUFB128,
24824 IX86_BUILTIN_PSIGNB128,
24825 IX86_BUILTIN_PSIGNW128,
24826 IX86_BUILTIN_PSIGND128,
24827 IX86_BUILTIN_PALIGNR128,
24828 IX86_BUILTIN_PABSB128,
24829 IX86_BUILTIN_PABSW128,
24830 IX86_BUILTIN_PABSD128,
24831
24832 /* AMDFAM10 - SSE4A New Instructions. */
24833 IX86_BUILTIN_MOVNTSD,
24834 IX86_BUILTIN_MOVNTSS,
24835 IX86_BUILTIN_EXTRQI,
24836 IX86_BUILTIN_EXTRQ,
24837 IX86_BUILTIN_INSERTQI,
24838 IX86_BUILTIN_INSERTQ,
24839
24840 /* SSE4.1. */
24841 IX86_BUILTIN_BLENDPD,
24842 IX86_BUILTIN_BLENDPS,
24843 IX86_BUILTIN_BLENDVPD,
24844 IX86_BUILTIN_BLENDVPS,
24845 IX86_BUILTIN_PBLENDVB128,
24846 IX86_BUILTIN_PBLENDW128,
24847
24848 IX86_BUILTIN_DPPD,
24849 IX86_BUILTIN_DPPS,
24850
24851 IX86_BUILTIN_INSERTPS128,
24852
24853 IX86_BUILTIN_MOVNTDQA,
24854 IX86_BUILTIN_MPSADBW128,
24855 IX86_BUILTIN_PACKUSDW128,
24856 IX86_BUILTIN_PCMPEQQ,
24857 IX86_BUILTIN_PHMINPOSUW128,
24858
24859 IX86_BUILTIN_PMAXSB128,
24860 IX86_BUILTIN_PMAXSD128,
24861 IX86_BUILTIN_PMAXUD128,
24862 IX86_BUILTIN_PMAXUW128,
24863
24864 IX86_BUILTIN_PMINSB128,
24865 IX86_BUILTIN_PMINSD128,
24866 IX86_BUILTIN_PMINUD128,
24867 IX86_BUILTIN_PMINUW128,
24868
24869 IX86_BUILTIN_PMOVSXBW128,
24870 IX86_BUILTIN_PMOVSXBD128,
24871 IX86_BUILTIN_PMOVSXBQ128,
24872 IX86_BUILTIN_PMOVSXWD128,
24873 IX86_BUILTIN_PMOVSXWQ128,
24874 IX86_BUILTIN_PMOVSXDQ128,
24875
24876 IX86_BUILTIN_PMOVZXBW128,
24877 IX86_BUILTIN_PMOVZXBD128,
24878 IX86_BUILTIN_PMOVZXBQ128,
24879 IX86_BUILTIN_PMOVZXWD128,
24880 IX86_BUILTIN_PMOVZXWQ128,
24881 IX86_BUILTIN_PMOVZXDQ128,
24882
24883 IX86_BUILTIN_PMULDQ128,
24884 IX86_BUILTIN_PMULLD128,
24885
24886 IX86_BUILTIN_ROUNDPD,
24887 IX86_BUILTIN_ROUNDPS,
24888 IX86_BUILTIN_ROUNDSD,
24889 IX86_BUILTIN_ROUNDSS,
24890
24891 IX86_BUILTIN_FLOORPD,
24892 IX86_BUILTIN_CEILPD,
24893 IX86_BUILTIN_TRUNCPD,
24894 IX86_BUILTIN_RINTPD,
24895 IX86_BUILTIN_ROUNDPD_AZ,
24896 IX86_BUILTIN_FLOORPS,
24897 IX86_BUILTIN_CEILPS,
24898 IX86_BUILTIN_TRUNCPS,
24899 IX86_BUILTIN_RINTPS,
24900 IX86_BUILTIN_ROUNDPS_AZ,
24901
24902 IX86_BUILTIN_PTESTZ,
24903 IX86_BUILTIN_PTESTC,
24904 IX86_BUILTIN_PTESTNZC,
24905
24906 IX86_BUILTIN_VEC_INIT_V2SI,
24907 IX86_BUILTIN_VEC_INIT_V4HI,
24908 IX86_BUILTIN_VEC_INIT_V8QI,
24909 IX86_BUILTIN_VEC_EXT_V2DF,
24910 IX86_BUILTIN_VEC_EXT_V2DI,
24911 IX86_BUILTIN_VEC_EXT_V4SF,
24912 IX86_BUILTIN_VEC_EXT_V4SI,
24913 IX86_BUILTIN_VEC_EXT_V8HI,
24914 IX86_BUILTIN_VEC_EXT_V2SI,
24915 IX86_BUILTIN_VEC_EXT_V4HI,
24916 IX86_BUILTIN_VEC_EXT_V16QI,
24917 IX86_BUILTIN_VEC_SET_V2DI,
24918 IX86_BUILTIN_VEC_SET_V4SF,
24919 IX86_BUILTIN_VEC_SET_V4SI,
24920 IX86_BUILTIN_VEC_SET_V8HI,
24921 IX86_BUILTIN_VEC_SET_V4HI,
24922 IX86_BUILTIN_VEC_SET_V16QI,
24923
24924 IX86_BUILTIN_VEC_PACK_SFIX,
24925 IX86_BUILTIN_VEC_PACK_SFIX256,
24926
24927 /* SSE4.2. */
24928 IX86_BUILTIN_CRC32QI,
24929 IX86_BUILTIN_CRC32HI,
24930 IX86_BUILTIN_CRC32SI,
24931 IX86_BUILTIN_CRC32DI,
24932
24933 IX86_BUILTIN_PCMPESTRI128,
24934 IX86_BUILTIN_PCMPESTRM128,
24935 IX86_BUILTIN_PCMPESTRA128,
24936 IX86_BUILTIN_PCMPESTRC128,
24937 IX86_BUILTIN_PCMPESTRO128,
24938 IX86_BUILTIN_PCMPESTRS128,
24939 IX86_BUILTIN_PCMPESTRZ128,
24940 IX86_BUILTIN_PCMPISTRI128,
24941 IX86_BUILTIN_PCMPISTRM128,
24942 IX86_BUILTIN_PCMPISTRA128,
24943 IX86_BUILTIN_PCMPISTRC128,
24944 IX86_BUILTIN_PCMPISTRO128,
24945 IX86_BUILTIN_PCMPISTRS128,
24946 IX86_BUILTIN_PCMPISTRZ128,
24947
24948 IX86_BUILTIN_PCMPGTQ,
24949
24950 /* AES instructions */
24951 IX86_BUILTIN_AESENC128,
24952 IX86_BUILTIN_AESENCLAST128,
24953 IX86_BUILTIN_AESDEC128,
24954 IX86_BUILTIN_AESDECLAST128,
24955 IX86_BUILTIN_AESIMC128,
24956 IX86_BUILTIN_AESKEYGENASSIST128,
24957
24958 /* PCLMUL instruction */
24959 IX86_BUILTIN_PCLMULQDQ128,
24960
24961 /* AVX */
24962 IX86_BUILTIN_ADDPD256,
24963 IX86_BUILTIN_ADDPS256,
24964 IX86_BUILTIN_ADDSUBPD256,
24965 IX86_BUILTIN_ADDSUBPS256,
24966 IX86_BUILTIN_ANDPD256,
24967 IX86_BUILTIN_ANDPS256,
24968 IX86_BUILTIN_ANDNPD256,
24969 IX86_BUILTIN_ANDNPS256,
24970 IX86_BUILTIN_BLENDPD256,
24971 IX86_BUILTIN_BLENDPS256,
24972 IX86_BUILTIN_BLENDVPD256,
24973 IX86_BUILTIN_BLENDVPS256,
24974 IX86_BUILTIN_DIVPD256,
24975 IX86_BUILTIN_DIVPS256,
24976 IX86_BUILTIN_DPPS256,
24977 IX86_BUILTIN_HADDPD256,
24978 IX86_BUILTIN_HADDPS256,
24979 IX86_BUILTIN_HSUBPD256,
24980 IX86_BUILTIN_HSUBPS256,
24981 IX86_BUILTIN_MAXPD256,
24982 IX86_BUILTIN_MAXPS256,
24983 IX86_BUILTIN_MINPD256,
24984 IX86_BUILTIN_MINPS256,
24985 IX86_BUILTIN_MULPD256,
24986 IX86_BUILTIN_MULPS256,
24987 IX86_BUILTIN_ORPD256,
24988 IX86_BUILTIN_ORPS256,
24989 IX86_BUILTIN_SHUFPD256,
24990 IX86_BUILTIN_SHUFPS256,
24991 IX86_BUILTIN_SUBPD256,
24992 IX86_BUILTIN_SUBPS256,
24993 IX86_BUILTIN_XORPD256,
24994 IX86_BUILTIN_XORPS256,
24995 IX86_BUILTIN_CMPSD,
24996 IX86_BUILTIN_CMPSS,
24997 IX86_BUILTIN_CMPPD,
24998 IX86_BUILTIN_CMPPS,
24999 IX86_BUILTIN_CMPPD256,
25000 IX86_BUILTIN_CMPPS256,
25001 IX86_BUILTIN_CVTDQ2PD256,
25002 IX86_BUILTIN_CVTDQ2PS256,
25003 IX86_BUILTIN_CVTPD2PS256,
25004 IX86_BUILTIN_CVTPS2DQ256,
25005 IX86_BUILTIN_CVTPS2PD256,
25006 IX86_BUILTIN_CVTTPD2DQ256,
25007 IX86_BUILTIN_CVTPD2DQ256,
25008 IX86_BUILTIN_CVTTPS2DQ256,
25009 IX86_BUILTIN_EXTRACTF128PD256,
25010 IX86_BUILTIN_EXTRACTF128PS256,
25011 IX86_BUILTIN_EXTRACTF128SI256,
25012 IX86_BUILTIN_VZEROALL,
25013 IX86_BUILTIN_VZEROUPPER,
25014 IX86_BUILTIN_VPERMILVARPD,
25015 IX86_BUILTIN_VPERMILVARPS,
25016 IX86_BUILTIN_VPERMILVARPD256,
25017 IX86_BUILTIN_VPERMILVARPS256,
25018 IX86_BUILTIN_VPERMILPD,
25019 IX86_BUILTIN_VPERMILPS,
25020 IX86_BUILTIN_VPERMILPD256,
25021 IX86_BUILTIN_VPERMILPS256,
25022 IX86_BUILTIN_VPERMIL2PD,
25023 IX86_BUILTIN_VPERMIL2PS,
25024 IX86_BUILTIN_VPERMIL2PD256,
25025 IX86_BUILTIN_VPERMIL2PS256,
25026 IX86_BUILTIN_VPERM2F128PD256,
25027 IX86_BUILTIN_VPERM2F128PS256,
25028 IX86_BUILTIN_VPERM2F128SI256,
25029 IX86_BUILTIN_VBROADCASTSS,
25030 IX86_BUILTIN_VBROADCASTSD256,
25031 IX86_BUILTIN_VBROADCASTSS256,
25032 IX86_BUILTIN_VBROADCASTPD256,
25033 IX86_BUILTIN_VBROADCASTPS256,
25034 IX86_BUILTIN_VINSERTF128PD256,
25035 IX86_BUILTIN_VINSERTF128PS256,
25036 IX86_BUILTIN_VINSERTF128SI256,
25037 IX86_BUILTIN_LOADUPD256,
25038 IX86_BUILTIN_LOADUPS256,
25039 IX86_BUILTIN_STOREUPD256,
25040 IX86_BUILTIN_STOREUPS256,
25041 IX86_BUILTIN_LDDQU256,
25042 IX86_BUILTIN_MOVNTDQ256,
25043 IX86_BUILTIN_MOVNTPD256,
25044 IX86_BUILTIN_MOVNTPS256,
25045 IX86_BUILTIN_LOADDQU256,
25046 IX86_BUILTIN_STOREDQU256,
25047 IX86_BUILTIN_MASKLOADPD,
25048 IX86_BUILTIN_MASKLOADPS,
25049 IX86_BUILTIN_MASKSTOREPD,
25050 IX86_BUILTIN_MASKSTOREPS,
25051 IX86_BUILTIN_MASKLOADPD256,
25052 IX86_BUILTIN_MASKLOADPS256,
25053 IX86_BUILTIN_MASKSTOREPD256,
25054 IX86_BUILTIN_MASKSTOREPS256,
25055 IX86_BUILTIN_MOVSHDUP256,
25056 IX86_BUILTIN_MOVSLDUP256,
25057 IX86_BUILTIN_MOVDDUP256,
25058
25059 IX86_BUILTIN_SQRTPD256,
25060 IX86_BUILTIN_SQRTPS256,
25061 IX86_BUILTIN_SQRTPS_NR256,
25062 IX86_BUILTIN_RSQRTPS256,
25063 IX86_BUILTIN_RSQRTPS_NR256,
25064
25065 IX86_BUILTIN_RCPPS256,
25066
25067 IX86_BUILTIN_ROUNDPD256,
25068 IX86_BUILTIN_ROUNDPS256,
25069
25070 IX86_BUILTIN_FLOORPD256,
25071 IX86_BUILTIN_CEILPD256,
25072 IX86_BUILTIN_TRUNCPD256,
25073 IX86_BUILTIN_RINTPD256,
25074 IX86_BUILTIN_ROUNDPD_AZ256,
25075 IX86_BUILTIN_FLOORPS256,
25076 IX86_BUILTIN_CEILPS256,
25077 IX86_BUILTIN_TRUNCPS256,
25078 IX86_BUILTIN_RINTPS256,
25079 IX86_BUILTIN_ROUNDPS_AZ256,
25080
25081 IX86_BUILTIN_UNPCKHPD256,
25082 IX86_BUILTIN_UNPCKLPD256,
25083 IX86_BUILTIN_UNPCKHPS256,
25084 IX86_BUILTIN_UNPCKLPS256,
25085
25086 IX86_BUILTIN_SI256_SI,
25087 IX86_BUILTIN_PS256_PS,
25088 IX86_BUILTIN_PD256_PD,
25089 IX86_BUILTIN_SI_SI256,
25090 IX86_BUILTIN_PS_PS256,
25091 IX86_BUILTIN_PD_PD256,
25092
25093 IX86_BUILTIN_VTESTZPD,
25094 IX86_BUILTIN_VTESTCPD,
25095 IX86_BUILTIN_VTESTNZCPD,
25096 IX86_BUILTIN_VTESTZPS,
25097 IX86_BUILTIN_VTESTCPS,
25098 IX86_BUILTIN_VTESTNZCPS,
25099 IX86_BUILTIN_VTESTZPD256,
25100 IX86_BUILTIN_VTESTCPD256,
25101 IX86_BUILTIN_VTESTNZCPD256,
25102 IX86_BUILTIN_VTESTZPS256,
25103 IX86_BUILTIN_VTESTCPS256,
25104 IX86_BUILTIN_VTESTNZCPS256,
25105 IX86_BUILTIN_PTESTZ256,
25106 IX86_BUILTIN_PTESTC256,
25107 IX86_BUILTIN_PTESTNZC256,
25108
25109 IX86_BUILTIN_MOVMSKPD256,
25110 IX86_BUILTIN_MOVMSKPS256,
25111
25112 /* AVX2 */
25113 IX86_BUILTIN_MPSADBW256,
25114 IX86_BUILTIN_PABSB256,
25115 IX86_BUILTIN_PABSW256,
25116 IX86_BUILTIN_PABSD256,
25117 IX86_BUILTIN_PACKSSDW256,
25118 IX86_BUILTIN_PACKSSWB256,
25119 IX86_BUILTIN_PACKUSDW256,
25120 IX86_BUILTIN_PACKUSWB256,
25121 IX86_BUILTIN_PADDB256,
25122 IX86_BUILTIN_PADDW256,
25123 IX86_BUILTIN_PADDD256,
25124 IX86_BUILTIN_PADDQ256,
25125 IX86_BUILTIN_PADDSB256,
25126 IX86_BUILTIN_PADDSW256,
25127 IX86_BUILTIN_PADDUSB256,
25128 IX86_BUILTIN_PADDUSW256,
25129 IX86_BUILTIN_PALIGNR256,
25130 IX86_BUILTIN_AND256I,
25131 IX86_BUILTIN_ANDNOT256I,
25132 IX86_BUILTIN_PAVGB256,
25133 IX86_BUILTIN_PAVGW256,
25134 IX86_BUILTIN_PBLENDVB256,
25135 IX86_BUILTIN_PBLENDVW256,
25136 IX86_BUILTIN_PCMPEQB256,
25137 IX86_BUILTIN_PCMPEQW256,
25138 IX86_BUILTIN_PCMPEQD256,
25139 IX86_BUILTIN_PCMPEQQ256,
25140 IX86_BUILTIN_PCMPGTB256,
25141 IX86_BUILTIN_PCMPGTW256,
25142 IX86_BUILTIN_PCMPGTD256,
25143 IX86_BUILTIN_PCMPGTQ256,
25144 IX86_BUILTIN_PHADDW256,
25145 IX86_BUILTIN_PHADDD256,
25146 IX86_BUILTIN_PHADDSW256,
25147 IX86_BUILTIN_PHSUBW256,
25148 IX86_BUILTIN_PHSUBD256,
25149 IX86_BUILTIN_PHSUBSW256,
25150 IX86_BUILTIN_PMADDUBSW256,
25151 IX86_BUILTIN_PMADDWD256,
25152 IX86_BUILTIN_PMAXSB256,
25153 IX86_BUILTIN_PMAXSW256,
25154 IX86_BUILTIN_PMAXSD256,
25155 IX86_BUILTIN_PMAXUB256,
25156 IX86_BUILTIN_PMAXUW256,
25157 IX86_BUILTIN_PMAXUD256,
25158 IX86_BUILTIN_PMINSB256,
25159 IX86_BUILTIN_PMINSW256,
25160 IX86_BUILTIN_PMINSD256,
25161 IX86_BUILTIN_PMINUB256,
25162 IX86_BUILTIN_PMINUW256,
25163 IX86_BUILTIN_PMINUD256,
25164 IX86_BUILTIN_PMOVMSKB256,
25165 IX86_BUILTIN_PMOVSXBW256,
25166 IX86_BUILTIN_PMOVSXBD256,
25167 IX86_BUILTIN_PMOVSXBQ256,
25168 IX86_BUILTIN_PMOVSXWD256,
25169 IX86_BUILTIN_PMOVSXWQ256,
25170 IX86_BUILTIN_PMOVSXDQ256,
25171 IX86_BUILTIN_PMOVZXBW256,
25172 IX86_BUILTIN_PMOVZXBD256,
25173 IX86_BUILTIN_PMOVZXBQ256,
25174 IX86_BUILTIN_PMOVZXWD256,
25175 IX86_BUILTIN_PMOVZXWQ256,
25176 IX86_BUILTIN_PMOVZXDQ256,
25177 IX86_BUILTIN_PMULDQ256,
25178 IX86_BUILTIN_PMULHRSW256,
25179 IX86_BUILTIN_PMULHUW256,
25180 IX86_BUILTIN_PMULHW256,
25181 IX86_BUILTIN_PMULLW256,
25182 IX86_BUILTIN_PMULLD256,
25183 IX86_BUILTIN_PMULUDQ256,
25184 IX86_BUILTIN_POR256,
25185 IX86_BUILTIN_PSADBW256,
25186 IX86_BUILTIN_PSHUFB256,
25187 IX86_BUILTIN_PSHUFD256,
25188 IX86_BUILTIN_PSHUFHW256,
25189 IX86_BUILTIN_PSHUFLW256,
25190 IX86_BUILTIN_PSIGNB256,
25191 IX86_BUILTIN_PSIGNW256,
25192 IX86_BUILTIN_PSIGND256,
25193 IX86_BUILTIN_PSLLDQI256,
25194 IX86_BUILTIN_PSLLWI256,
25195 IX86_BUILTIN_PSLLW256,
25196 IX86_BUILTIN_PSLLDI256,
25197 IX86_BUILTIN_PSLLD256,
25198 IX86_BUILTIN_PSLLQI256,
25199 IX86_BUILTIN_PSLLQ256,
25200 IX86_BUILTIN_PSRAWI256,
25201 IX86_BUILTIN_PSRAW256,
25202 IX86_BUILTIN_PSRADI256,
25203 IX86_BUILTIN_PSRAD256,
25204 IX86_BUILTIN_PSRLDQI256,
25205 IX86_BUILTIN_PSRLWI256,
25206 IX86_BUILTIN_PSRLW256,
25207 IX86_BUILTIN_PSRLDI256,
25208 IX86_BUILTIN_PSRLD256,
25209 IX86_BUILTIN_PSRLQI256,
25210 IX86_BUILTIN_PSRLQ256,
25211 IX86_BUILTIN_PSUBB256,
25212 IX86_BUILTIN_PSUBW256,
25213 IX86_BUILTIN_PSUBD256,
25214 IX86_BUILTIN_PSUBQ256,
25215 IX86_BUILTIN_PSUBSB256,
25216 IX86_BUILTIN_PSUBSW256,
25217 IX86_BUILTIN_PSUBUSB256,
25218 IX86_BUILTIN_PSUBUSW256,
25219 IX86_BUILTIN_PUNPCKHBW256,
25220 IX86_BUILTIN_PUNPCKHWD256,
25221 IX86_BUILTIN_PUNPCKHDQ256,
25222 IX86_BUILTIN_PUNPCKHQDQ256,
25223 IX86_BUILTIN_PUNPCKLBW256,
25224 IX86_BUILTIN_PUNPCKLWD256,
25225 IX86_BUILTIN_PUNPCKLDQ256,
25226 IX86_BUILTIN_PUNPCKLQDQ256,
25227 IX86_BUILTIN_PXOR256,
25228 IX86_BUILTIN_MOVNTDQA256,
25229 IX86_BUILTIN_VBROADCASTSS_PS,
25230 IX86_BUILTIN_VBROADCASTSS_PS256,
25231 IX86_BUILTIN_VBROADCASTSD_PD256,
25232 IX86_BUILTIN_VBROADCASTSI256,
25233 IX86_BUILTIN_PBLENDD256,
25234 IX86_BUILTIN_PBLENDD128,
25235 IX86_BUILTIN_PBROADCASTB256,
25236 IX86_BUILTIN_PBROADCASTW256,
25237 IX86_BUILTIN_PBROADCASTD256,
25238 IX86_BUILTIN_PBROADCASTQ256,
25239 IX86_BUILTIN_PBROADCASTB128,
25240 IX86_BUILTIN_PBROADCASTW128,
25241 IX86_BUILTIN_PBROADCASTD128,
25242 IX86_BUILTIN_PBROADCASTQ128,
25243 IX86_BUILTIN_VPERMVARSI256,
25244 IX86_BUILTIN_VPERMDF256,
25245 IX86_BUILTIN_VPERMVARSF256,
25246 IX86_BUILTIN_VPERMDI256,
25247 IX86_BUILTIN_VPERMTI256,
25248 IX86_BUILTIN_VEXTRACT128I256,
25249 IX86_BUILTIN_VINSERT128I256,
25250 IX86_BUILTIN_MASKLOADD,
25251 IX86_BUILTIN_MASKLOADQ,
25252 IX86_BUILTIN_MASKLOADD256,
25253 IX86_BUILTIN_MASKLOADQ256,
25254 IX86_BUILTIN_MASKSTORED,
25255 IX86_BUILTIN_MASKSTOREQ,
25256 IX86_BUILTIN_MASKSTORED256,
25257 IX86_BUILTIN_MASKSTOREQ256,
25258 IX86_BUILTIN_PSLLVV4DI,
25259 IX86_BUILTIN_PSLLVV2DI,
25260 IX86_BUILTIN_PSLLVV8SI,
25261 IX86_BUILTIN_PSLLVV4SI,
25262 IX86_BUILTIN_PSRAVV8SI,
25263 IX86_BUILTIN_PSRAVV4SI,
25264 IX86_BUILTIN_PSRLVV4DI,
25265 IX86_BUILTIN_PSRLVV2DI,
25266 IX86_BUILTIN_PSRLVV8SI,
25267 IX86_BUILTIN_PSRLVV4SI,
25268
25269 IX86_BUILTIN_GATHERSIV2DF,
25270 IX86_BUILTIN_GATHERSIV4DF,
25271 IX86_BUILTIN_GATHERDIV2DF,
25272 IX86_BUILTIN_GATHERDIV4DF,
25273 IX86_BUILTIN_GATHERSIV4SF,
25274 IX86_BUILTIN_GATHERSIV8SF,
25275 IX86_BUILTIN_GATHERDIV4SF,
25276 IX86_BUILTIN_GATHERDIV8SF,
25277 IX86_BUILTIN_GATHERSIV2DI,
25278 IX86_BUILTIN_GATHERSIV4DI,
25279 IX86_BUILTIN_GATHERDIV2DI,
25280 IX86_BUILTIN_GATHERDIV4DI,
25281 IX86_BUILTIN_GATHERSIV4SI,
25282 IX86_BUILTIN_GATHERSIV8SI,
25283 IX86_BUILTIN_GATHERDIV4SI,
25284 IX86_BUILTIN_GATHERDIV8SI,
25285
25286 /* Alternate 4-element gather for the vectorizer where
25287 all operands are 32 bytes wide. */
25288 IX86_BUILTIN_GATHERALTSIV4DF,
25289 IX86_BUILTIN_GATHERALTDIV8SF,
25290 IX86_BUILTIN_GATHERALTSIV4DI,
25291 IX86_BUILTIN_GATHERALTDIV8SI,
25292
25293 /* TFmode support builtins. */
25294 IX86_BUILTIN_INFQ,
25295 IX86_BUILTIN_HUGE_VALQ,
25296 IX86_BUILTIN_FABSQ,
25297 IX86_BUILTIN_COPYSIGNQ,
25298
25299 /* Vectorizer support builtins. */
25300 IX86_BUILTIN_CPYSGNPS,
25301 IX86_BUILTIN_CPYSGNPD,
25302 IX86_BUILTIN_CPYSGNPS256,
25303 IX86_BUILTIN_CPYSGNPD256,
25304
25305 /* FMA4 instructions. */
25306 IX86_BUILTIN_VFMADDSS,
25307 IX86_BUILTIN_VFMADDSD,
25308 IX86_BUILTIN_VFMADDPS,
25309 IX86_BUILTIN_VFMADDPD,
25310 IX86_BUILTIN_VFMADDPS256,
25311 IX86_BUILTIN_VFMADDPD256,
25312 IX86_BUILTIN_VFMADDSUBPS,
25313 IX86_BUILTIN_VFMADDSUBPD,
25314 IX86_BUILTIN_VFMADDSUBPS256,
25315 IX86_BUILTIN_VFMADDSUBPD256,
25316
25317 /* FMA3 instructions. */
25318 IX86_BUILTIN_VFMADDSS3,
25319 IX86_BUILTIN_VFMADDSD3,
25320
25321 /* XOP instructions. */
25322 IX86_BUILTIN_VPCMOV,
25323 IX86_BUILTIN_VPCMOV_V2DI,
25324 IX86_BUILTIN_VPCMOV_V4SI,
25325 IX86_BUILTIN_VPCMOV_V8HI,
25326 IX86_BUILTIN_VPCMOV_V16QI,
25327 IX86_BUILTIN_VPCMOV_V4SF,
25328 IX86_BUILTIN_VPCMOV_V2DF,
25329 IX86_BUILTIN_VPCMOV256,
25330 IX86_BUILTIN_VPCMOV_V4DI256,
25331 IX86_BUILTIN_VPCMOV_V8SI256,
25332 IX86_BUILTIN_VPCMOV_V16HI256,
25333 IX86_BUILTIN_VPCMOV_V32QI256,
25334 IX86_BUILTIN_VPCMOV_V8SF256,
25335 IX86_BUILTIN_VPCMOV_V4DF256,
25336
25337 IX86_BUILTIN_VPPERM,
25338
25339 IX86_BUILTIN_VPMACSSWW,
25340 IX86_BUILTIN_VPMACSWW,
25341 IX86_BUILTIN_VPMACSSWD,
25342 IX86_BUILTIN_VPMACSWD,
25343 IX86_BUILTIN_VPMACSSDD,
25344 IX86_BUILTIN_VPMACSDD,
25345 IX86_BUILTIN_VPMACSSDQL,
25346 IX86_BUILTIN_VPMACSSDQH,
25347 IX86_BUILTIN_VPMACSDQL,
25348 IX86_BUILTIN_VPMACSDQH,
25349 IX86_BUILTIN_VPMADCSSWD,
25350 IX86_BUILTIN_VPMADCSWD,
25351
25352 IX86_BUILTIN_VPHADDBW,
25353 IX86_BUILTIN_VPHADDBD,
25354 IX86_BUILTIN_VPHADDBQ,
25355 IX86_BUILTIN_VPHADDWD,
25356 IX86_BUILTIN_VPHADDWQ,
25357 IX86_BUILTIN_VPHADDDQ,
25358 IX86_BUILTIN_VPHADDUBW,
25359 IX86_BUILTIN_VPHADDUBD,
25360 IX86_BUILTIN_VPHADDUBQ,
25361 IX86_BUILTIN_VPHADDUWD,
25362 IX86_BUILTIN_VPHADDUWQ,
25363 IX86_BUILTIN_VPHADDUDQ,
25364 IX86_BUILTIN_VPHSUBBW,
25365 IX86_BUILTIN_VPHSUBWD,
25366 IX86_BUILTIN_VPHSUBDQ,
25367
25368 IX86_BUILTIN_VPROTB,
25369 IX86_BUILTIN_VPROTW,
25370 IX86_BUILTIN_VPROTD,
25371 IX86_BUILTIN_VPROTQ,
25372 IX86_BUILTIN_VPROTB_IMM,
25373 IX86_BUILTIN_VPROTW_IMM,
25374 IX86_BUILTIN_VPROTD_IMM,
25375 IX86_BUILTIN_VPROTQ_IMM,
25376
25377 IX86_BUILTIN_VPSHLB,
25378 IX86_BUILTIN_VPSHLW,
25379 IX86_BUILTIN_VPSHLD,
25380 IX86_BUILTIN_VPSHLQ,
25381 IX86_BUILTIN_VPSHAB,
25382 IX86_BUILTIN_VPSHAW,
25383 IX86_BUILTIN_VPSHAD,
25384 IX86_BUILTIN_VPSHAQ,
25385
25386 IX86_BUILTIN_VFRCZSS,
25387 IX86_BUILTIN_VFRCZSD,
25388 IX86_BUILTIN_VFRCZPS,
25389 IX86_BUILTIN_VFRCZPD,
25390 IX86_BUILTIN_VFRCZPS256,
25391 IX86_BUILTIN_VFRCZPD256,
25392
25393 IX86_BUILTIN_VPCOMEQUB,
25394 IX86_BUILTIN_VPCOMNEUB,
25395 IX86_BUILTIN_VPCOMLTUB,
25396 IX86_BUILTIN_VPCOMLEUB,
25397 IX86_BUILTIN_VPCOMGTUB,
25398 IX86_BUILTIN_VPCOMGEUB,
25399 IX86_BUILTIN_VPCOMFALSEUB,
25400 IX86_BUILTIN_VPCOMTRUEUB,
25401
25402 IX86_BUILTIN_VPCOMEQUW,
25403 IX86_BUILTIN_VPCOMNEUW,
25404 IX86_BUILTIN_VPCOMLTUW,
25405 IX86_BUILTIN_VPCOMLEUW,
25406 IX86_BUILTIN_VPCOMGTUW,
25407 IX86_BUILTIN_VPCOMGEUW,
25408 IX86_BUILTIN_VPCOMFALSEUW,
25409 IX86_BUILTIN_VPCOMTRUEUW,
25410
25411 IX86_BUILTIN_VPCOMEQUD,
25412 IX86_BUILTIN_VPCOMNEUD,
25413 IX86_BUILTIN_VPCOMLTUD,
25414 IX86_BUILTIN_VPCOMLEUD,
25415 IX86_BUILTIN_VPCOMGTUD,
25416 IX86_BUILTIN_VPCOMGEUD,
25417 IX86_BUILTIN_VPCOMFALSEUD,
25418 IX86_BUILTIN_VPCOMTRUEUD,
25419
25420 IX86_BUILTIN_VPCOMEQUQ,
25421 IX86_BUILTIN_VPCOMNEUQ,
25422 IX86_BUILTIN_VPCOMLTUQ,
25423 IX86_BUILTIN_VPCOMLEUQ,
25424 IX86_BUILTIN_VPCOMGTUQ,
25425 IX86_BUILTIN_VPCOMGEUQ,
25426 IX86_BUILTIN_VPCOMFALSEUQ,
25427 IX86_BUILTIN_VPCOMTRUEUQ,
25428
25429 IX86_BUILTIN_VPCOMEQB,
25430 IX86_BUILTIN_VPCOMNEB,
25431 IX86_BUILTIN_VPCOMLTB,
25432 IX86_BUILTIN_VPCOMLEB,
25433 IX86_BUILTIN_VPCOMGTB,
25434 IX86_BUILTIN_VPCOMGEB,
25435 IX86_BUILTIN_VPCOMFALSEB,
25436 IX86_BUILTIN_VPCOMTRUEB,
25437
25438 IX86_BUILTIN_VPCOMEQW,
25439 IX86_BUILTIN_VPCOMNEW,
25440 IX86_BUILTIN_VPCOMLTW,
25441 IX86_BUILTIN_VPCOMLEW,
25442 IX86_BUILTIN_VPCOMGTW,
25443 IX86_BUILTIN_VPCOMGEW,
25444 IX86_BUILTIN_VPCOMFALSEW,
25445 IX86_BUILTIN_VPCOMTRUEW,
25446
25447 IX86_BUILTIN_VPCOMEQD,
25448 IX86_BUILTIN_VPCOMNED,
25449 IX86_BUILTIN_VPCOMLTD,
25450 IX86_BUILTIN_VPCOMLED,
25451 IX86_BUILTIN_VPCOMGTD,
25452 IX86_BUILTIN_VPCOMGED,
25453 IX86_BUILTIN_VPCOMFALSED,
25454 IX86_BUILTIN_VPCOMTRUED,
25455
25456 IX86_BUILTIN_VPCOMEQQ,
25457 IX86_BUILTIN_VPCOMNEQ,
25458 IX86_BUILTIN_VPCOMLTQ,
25459 IX86_BUILTIN_VPCOMLEQ,
25460 IX86_BUILTIN_VPCOMGTQ,
25461 IX86_BUILTIN_VPCOMGEQ,
25462 IX86_BUILTIN_VPCOMFALSEQ,
25463 IX86_BUILTIN_VPCOMTRUEQ,
25464
25465 /* LWP instructions. */
25466 IX86_BUILTIN_LLWPCB,
25467 IX86_BUILTIN_SLWPCB,
25468 IX86_BUILTIN_LWPVAL32,
25469 IX86_BUILTIN_LWPVAL64,
25470 IX86_BUILTIN_LWPINS32,
25471 IX86_BUILTIN_LWPINS64,
25472
25473 IX86_BUILTIN_CLZS,
25474
25475 /* BMI instructions. */
25476 IX86_BUILTIN_BEXTR32,
25477 IX86_BUILTIN_BEXTR64,
25478 IX86_BUILTIN_CTZS,
25479
25480 /* TBM instructions. */
25481 IX86_BUILTIN_BEXTRI32,
25482 IX86_BUILTIN_BEXTRI64,
25483
25484 /* BMI2 instructions. */
25485 IX86_BUILTIN_BZHI32,
25486 IX86_BUILTIN_BZHI64,
25487 IX86_BUILTIN_PDEP32,
25488 IX86_BUILTIN_PDEP64,
25489 IX86_BUILTIN_PEXT32,
25490 IX86_BUILTIN_PEXT64,
25491
25492 /* FSGSBASE instructions. */
25493 IX86_BUILTIN_RDFSBASE32,
25494 IX86_BUILTIN_RDFSBASE64,
25495 IX86_BUILTIN_RDGSBASE32,
25496 IX86_BUILTIN_RDGSBASE64,
25497 IX86_BUILTIN_WRFSBASE32,
25498 IX86_BUILTIN_WRFSBASE64,
25499 IX86_BUILTIN_WRGSBASE32,
25500 IX86_BUILTIN_WRGSBASE64,
25501
25502 /* RDRND instructions. */
25503 IX86_BUILTIN_RDRAND16_STEP,
25504 IX86_BUILTIN_RDRAND32_STEP,
25505 IX86_BUILTIN_RDRAND64_STEP,
25506
25507 /* F16C instructions. */
25508 IX86_BUILTIN_CVTPH2PS,
25509 IX86_BUILTIN_CVTPH2PS256,
25510 IX86_BUILTIN_CVTPS2PH,
25511 IX86_BUILTIN_CVTPS2PH256,
25512
25513 /* CFString built-in for Darwin. */
25514 IX86_BUILTIN_CFSTRING,
25515
25516 IX86_BUILTIN_MAX
25517 };
25518
25519 /* Table for the ix86 builtin decls. */
25520 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25521
25522 /* Table of all of the builtin functions that are possible with different ISAs
25523 but are waiting to be built until a function is declared to use that
25524 ISA. */
25525 struct builtin_isa {
25526 const char *name; /* function name */
25527 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25528 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25529 bool const_p; /* true if the declaration is constant */
25530 bool set_and_not_built_p; /* true if the builtin was recorded but the decl has not been built yet */
25531 };
25532
25533 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
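
/* Editor's note -- illustrative sketch, not part of the original source:
   an ix86_builtins_isa entry has two interesting states.  For a
   hypothetical AVX-only builtin registered while -mavx is not enabled,
   def_builtin below records roughly

     ix86_builtins[code] = NULL_TREE;
     ix86_builtins_isa[code].name = "__builtin_ia32_example";
     ix86_builtins_isa[code].isa = OPTION_MASK_ISA_AVX;
     ix86_builtins_isa[code].set_and_not_built_p = true;

   and ix86_add_new_builtins later turns such deferred entries into real
   decls once the ISA becomes available.  The builtin name and code used
   here are placeholders for illustration only.  */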
25534
25535
25536 /* Add an ix86 target builtin function with CODE, NAME and TCODE. Save the MASK
25537 of which isa_flags to use in the ix86_builtins_isa array. Store the
25538 function decl in the ix86_builtins array. Return the function decl or
25539 NULL_TREE if the builtin was not added.
25540
25541 If the front end has a special hook for builtin functions, delay adding
25542 builtin functions that aren't in the current ISA until the ISA is changed
25543 with function-specific optimization. Doing so can save about 300K for the
25544 default compiler. When the builtin is expanded, check at that time whether
25545 it is valid.
25546
25547 If the front end doesn't have a special hook, record all builtins, even if
25548 their instruction set isn't enabled in the current ISA, in case the user uses
25549 function-specific options for a different ISA, so that we don't get scope
25550 errors if a builtin is added in the middle of a function scope. */
25551
25552 static inline tree
25553 def_builtin (HOST_WIDE_INT mask, const char *name,
25554 enum ix86_builtin_func_type tcode,
25555 enum ix86_builtins code)
25556 {
25557 tree decl = NULL_TREE;
25558
25559 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25560 {
25561 ix86_builtins_isa[(int) code].isa = mask;
25562
25563 mask &= ~OPTION_MASK_ISA_64BIT;
25564 if (mask == 0
25565 || (mask & ix86_isa_flags) != 0
25566 || (lang_hooks.builtin_function
25567 == lang_hooks.builtin_function_ext_scope))
25568
25569 {
25570 tree type = ix86_get_builtin_func_type (tcode);
25571 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25572 NULL, NULL_TREE);
25573 ix86_builtins[(int) code] = decl;
25574 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25575 }
25576 else
25577 {
25578 ix86_builtins[(int) code] = NULL_TREE;
25579 ix86_builtins_isa[(int) code].tcode = tcode;
25580 ix86_builtins_isa[(int) code].name = name;
25581 ix86_builtins_isa[(int) code].const_p = false;
25582 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25583 }
25584 }
25585
25586 return decl;
25587 }
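
/* Editor's note -- usage sketch, assuming a placeholder name and code:
   the builtin-initialization code later in this file registers entries
   through def_builtin/def_builtin_const roughly like

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                        V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_EXAMPLE);

   If the requested ISA bits are not in ix86_isa_flags and the front end
   does not use the extended-scope builtin hook, no decl is built yet;
   the request is parked in ix86_builtins_isa until ix86_add_new_builtins
   is called with the new isa_flags.  */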
25588
25589 /* Like def_builtin, but also marks the function decl "const". */
25590
25591 static inline tree
25592 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25593 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25594 {
25595 tree decl = def_builtin (mask, name, tcode, code);
25596 if (decl)
25597 TREE_READONLY (decl) = 1;
25598 else
25599 ix86_builtins_isa[(int) code].const_p = true;
25600
25601 return decl;
25602 }
25603
25604 /* Add any new builtin functions for a given ISA that may not have been
25605 declared yet. This saves a bit of space compared to adding all of the
25606 declarations to the tree up front, whether or not they end up being used. */
25607
25608 static void
25609 ix86_add_new_builtins (HOST_WIDE_INT isa)
25610 {
25611 int i;
25612
25613 for (i = 0; i < (int) IX86_BUILTIN_MAX; i++)
25614 {
25615 if ((ix86_builtins_isa[i].isa & isa) != 0
25616 && ix86_builtins_isa[i].set_and_not_built_p)
25617 {
25618 tree decl, type;
25619
25620 /* Don't define the builtin again. */
25621 ix86_builtins_isa[i].set_and_not_built_p = false;
25622
25623 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25624 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25625 type, i, BUILT_IN_MD, NULL,
25626 NULL_TREE);
25627
25628 ix86_builtins[i] = decl;
25629 if (ix86_builtins_isa[i].const_p)
25630 TREE_READONLY (decl) = 1;
25631 }
25632 }
25633 }
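
/* Editor's note -- illustrative sketch of when deferred builtins are
   materialized: the ISA set can grow per function, e.g. for code like

     __attribute__((target ("avx")))
     void use_avx (void) { /* AVX builtins usable here.  */ }

   compiled without -mavx.  Switching the target adds OPTION_MASK_ISA_AVX
   to the active isa_flags, and entries still marked set_and_not_built_p
   are then declared via ix86_add_new_builtins.  The function above is a
   placeholder; the actual call site is elsewhere in this file.  */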
25634
25635 /* Bits for builtin_description.flag. */
25636
25637 /* Set when we don't support the comparison natively, and should
25638 swap the comparison operands in order to support it. */
25639 #define BUILTIN_DESC_SWAP_OPERANDS 1
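
/* Editor's note -- illustrative sketch with a placeholder name and code:
   a "greater than" compare that the hardware only provides as "less
   than" can be described by recording the comparison we can emit plus
   the swap flag, e.g. a hypothetical table entry

     { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_example_gt",
       IX86_BUILTIN_EXAMPLE_GT, LT, BUILTIN_DESC_SWAP_OPERANDS },

   so the expander generates B < A for the requested A > B.  */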
25640
25641 struct builtin_description
25642 {
25643 const HOST_WIDE_INT mask;
25644 const enum insn_code icode;
25645 const char *const name;
25646 const enum ix86_builtins code;
25647 const enum rtx_code comparison;
25648 const int flag;
25649 };
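
/* Editor's note -- field-by-field reading of an entry, using the first
   bdesc_comi row below as the example:

     { OPTION_MASK_ISA_SSE,       mask of isa_flags the builtin requires
       CODE_FOR_sse_comi,         insn pattern used at expansion time
       "__builtin_ia32_comieq",   user-visible builtin name
       IX86_BUILTIN_COMIEQSS,     enum ix86_builtins code
       UNEQ,                      rtx comparison code, or UNKNOWN
       0 },                       flag, e.g. BUILTIN_DESC_SWAP_OPERANDS

   Other tables (bdesc_special_args, bdesc_args) reuse the last slot to
   hold the (int)-cast ix86_builtin_func_type instead of a flag.  This
   is only an annotation of existing data, not a new table entry.  */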
25650
25651 static const struct builtin_description bdesc_comi[] =
25652 {
25653 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25654 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25655 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25656 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25657 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25658 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25659 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25660 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25661 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25662 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25663 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25664 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25665 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25666 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25667 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25668 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25669 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25670 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25671 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25672 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25673 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25674 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25675 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25676 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25677 };
25678
25679 static const struct builtin_description bdesc_pcmpestr[] =
25680 {
25681 /* SSE4.2 */
25682 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25683 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25684 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25685 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25686 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25687 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25688 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25689 };
25690
25691 static const struct builtin_description bdesc_pcmpistr[] =
25692 {
25693 /* SSE4.2 */
25694 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25695 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25696 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25697 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25698 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25699 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25700 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25701 };
25702
25703 /* Special builtins with variable number of arguments. */
25704 static const struct builtin_description bdesc_special_args[] =
25705 {
25706 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25707 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25708 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25709
25710 /* MMX */
25711 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25712
25713 /* 3DNow! */
25714 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25715
25716 /* SSE */
25717 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25718 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25719 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25720
25721 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25722 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25723 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25724 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25725
25726 /* SSE or 3DNow!A */
25727 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25728 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25729
25730 /* SSE2 */
25731 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25732 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25733 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25734 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25735 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25736 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25737 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25738 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25739 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25740
25741 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25743
25744 /* SSE3 */
25745 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25746
25747 /* SSE4.1 */
25748 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25749
25750 /* SSE4A */
25751 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25752 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25753
25754 /* AVX */
25755 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25757
25758 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25759 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25760 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25761 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25762 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25763
25764 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25765 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25767 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25768 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25769 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25770 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25771
25772 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25774 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25775
25776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25777 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25779 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25781 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25782 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25784
25785 /* AVX2 */
25786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25795
25796 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25797 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25798 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25799 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25800 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25801 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25802
25803 /* FSGSBASE */
25804 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25805 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25806 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25807 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25808 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25809 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25810 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25811 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25812 };
25813
25814 /* Builtins with variable number of arguments. */
25815 static const struct builtin_description bdesc_args[] =
25816 {
25817 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25818 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25819 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25820 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25821 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25822 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25823 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25824
25825 /* MMX */
25826 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25827 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25828 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25829 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25830 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25831 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25832
25833 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25834 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25835 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25836 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25837 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25838 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25839 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25840 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25841
25842 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25843 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25844
25845 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25846 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25847 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25848 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25849
25850 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25851 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25852 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25853 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25854 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25855 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25856
25857 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25858 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25859 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25860 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25861 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25862 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25863
25864 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25865 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25866 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25867
25868 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25869
25870 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25871 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25872 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25873 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25874 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25875 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25876
25877 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25878 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25879 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25880 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25881 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25882 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25883
25884 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25885 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25886 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25887 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25888
25889 /* 3DNow! */
25890 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25891 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25892 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25893 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25894
25895 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25896 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25897 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25898 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25899 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25900 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25901 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25902 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25903 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25904 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25905 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25906 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25907 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25908 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25909 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25910
25911 /* 3DNow!A */
25912 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25913 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25914 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
25915 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25916 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25917 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25918
25919 /* SSE */
25920 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
25921 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25922 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25923 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25924 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25925 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25926 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25927 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25928 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25929 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25930 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25931 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
25932
25933 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
25934
25935 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25936 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25937 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25938 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25939 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25940 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25941 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25942 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25943
25944 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25945 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25946 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25947 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25948 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25949 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25950 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25951 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25952 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25953 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25954 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25955 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25956 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
25957 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
25958 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
25959 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25960 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
25961 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
25962 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
25963 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25964 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
25965 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
25966
25967 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25968 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25969 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25970 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25971
25972 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25973 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25974 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25975 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25976
25977 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25978
25979 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25980 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25981 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25982 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25983 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
25984
25985 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
25986 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
25987 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
25988
25989 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
25990
25991 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25992 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25993 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
25994
25995 /* SSE MMX or 3DNow!A */
25996 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25997 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25998 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25999
26000 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26001 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26002 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26003 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26004
26005 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26006 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26007
26008 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26009
26010 /* SSE2 */
26011 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26012
26013 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26014 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26015 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26016 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26017 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26018
26019 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26021 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26022 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26023 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26024
26025 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26026
26027 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26028 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26029 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26030 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26031
26032 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26033 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26034 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26035
26036 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26037 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26038 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26039 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26040 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26041 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26042 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26043 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26044
26045 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26046 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26047 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26048 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26049 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26050 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26051 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26052 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26053 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26054 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26055 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26056 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26057 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26058 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26059 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26060 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26061 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26062 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26063 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26064 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26065
26066 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26067 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26068 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26069 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26070
26071 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26072 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26073 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26074 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26075
26076 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26077
26078 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26079 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26080 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26081
26082 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26083
26084 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26085 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26086 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26087 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26088 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26089 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26090 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26091 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26092
26093 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26094 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26095 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26096 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26097 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26098 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26099 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26100 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26101
26102 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26103 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26104
26105 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26106 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26107 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26108 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26109
26110 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26111 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26112
26113 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26115 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26117 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26118 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26119
26120 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26121 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26122 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26123 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26124
26125 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26126 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26127 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26128 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26129 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26130 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26131 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26132 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26133
26134 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26137
26138 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26139 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26140
26141 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26143
26144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26145
26146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26147 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26149 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26150
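  /* Note on the shift entries that follow: the immediate forms (psllwi128,
     pslldi128, ...) use the *_SI_COUNT prototypes and take a scalar count,
     while the register forms (psllw128, pslld128, ...) use the
     *_V8HI_COUNT / *_V4SI_COUNT / *_V2DI_COUNT prototypes and take the
     count in the low quadword of a vector operand; both expand through the
     same ashl/lshr/ashr patterns.  The whole-register byte shifts
     (pslldqi128/psrldqi128) use *_INT_CONVERT because the builtin's count
     is in bits; the byte count written with _mm_slli_si128 is multiplied
     by 8 in emmintrin.h before it reaches __builtin_ia32_pslldqi128.  */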
26151 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26152 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26153 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26154 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26155 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26156 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26157 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26158
26159 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26160 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26161 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26162 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26163 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26164 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26165 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26166
26167 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26168 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26169 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26170 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26171
26172 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26175
26176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26177
26178 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26179 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26180
26181 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26182
26183 /* SSE2 MMX */
26184 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26185 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26186
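  /* Illustrative sketch (not part of the original table): the vector
     prototypes above map onto GCC vector types declared with the
     vector_size attribute, so an entry such as paddsw128 can be exercised
     directly.  The typedef below mirrors the one emmintrin.h uses; compile
     with -msse2.

       typedef short __v8hi __attribute__ ((__vector_size__ (16)));

       __v8hi
       adds_epi16 (__v8hi a, __v8hi b)
       {
         return __builtin_ia32_paddsw128 (a, b);
       }

     This is essentially what _mm_adds_epi16 expands to.  */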
26187 /* SSE3 */
26188 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26189 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26190
26191 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26192 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26193 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26194 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26195 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26196 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26197
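  /* Illustrative sketch: the SSE3 horizontal-add entry above is what
     _mm_hadd_ps in pmmintrin.h reduces to.  With the usual __v4sf typedef
     and -msse3, a direct call looks like:

       typedef float __v4sf __attribute__ ((__vector_size__ (16)));

       __v4sf
       hadd_ps (__v4sf a, __v4sf b)
       {
         return __builtin_ia32_haddps (a, b);
       }  */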
26198 /* SSSE3 */
26199 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26200 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26201 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26202 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26203 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26204 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26205
26206 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26207 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26208 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26209 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26210 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26211 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26212 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26213 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26214 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26215 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26216 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26217 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26218 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26219 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26220 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26221 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26222 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26223 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26224 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26225 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26226 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26227 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26228 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26229 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26230
26231 /* SSSE3. */
26232 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26233 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26234
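  /* Note on the two palignr entries above: their *_INT_CONVERT prototypes
     mean the builtin's immediate is a bit count rather than a byte count;
     the byte offset passed to _mm_alignr_epi8 / _mm_alignr_pi8 is
     multiplied by 8 in tmmintrin.h before it reaches the builtin.  */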
26235 /* SSE4.1 */
26236 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26237 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26238 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26239 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26240 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26241 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26242 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26243 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26244 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26245 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26246
26247 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26248 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26249 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26250 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26251 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26252 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26253 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26254 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26255 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26256 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26257 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26258 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26259 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26260
26261 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26262 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26263 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26264 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26265 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26266 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26267 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26268 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26269 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26270 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26271 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26272 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26273
26274 /* SSE4.1 */
26275 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26276 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26277 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26278 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26279
26280 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26281 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26282 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26283 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26284
26285 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26286
26287 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26288 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26289 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26290 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26291
26292 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26293
26294 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26295 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26296 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26297
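  /* The three ptest entries above share one pattern and are told apart by
     the comparison field: EQ tests ZF (ptestz, i.e. (a & b) == 0), LTU
     tests CF (ptestc), and GTU the "neither flag set" case (ptestnzc).
     For illustration, _mm_testz_si128 in smmintrin.h is a thin wrapper
     around the first of them; with -msse4.1:

       typedef long long __v2di __attribute__ ((__vector_size__ (16)));

       int
       testz (__v2di mask, __v2di val)
       {
         return __builtin_ia32_ptestz128 (mask, val);
       }  */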
26298 /* SSE4.2 */
26299 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26300 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26301 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26302 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26303 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26304
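  /* Illustrative sketch: the crc32 entries take plain scalar arguments, so
     they can be called directly (this is all _mm_crc32_u8 and friends in
     smmintrin.h do).  With -msse4.2:

       unsigned int
       crc32_byte (unsigned int crc, unsigned char b)
       {
         return __builtin_ia32_crc32qi (crc, b);
       }  */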
26305 /* SSE4A */
26306 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26307 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26308 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26309 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26310
26311 /* AES */
26312 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26313 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26314
26315 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26316 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26317 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26318 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26319
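  /* Illustrative sketch: _mm_aesenc_si128 in wmmintrin.h boils down to the
     aesenc entry above.  With the usual __v2di typedef and -maes:

       typedef long long __v2di __attribute__ ((__vector_size__ (16)));

       __v2di
       aes_round (__v2di state, __v2di round_key)
       {
         return __builtin_ia32_aesenc128 (state, round_key);
       }  */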
26320 /* PCLMUL */
26321 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26322
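  /* Illustrative sketch: the pclmulqdq entry is what _mm_clmulepi64_si128
     in wmmintrin.h uses.  The third operand selects which 64-bit halves
     are multiplied and must be a compile-time constant; with -mpclmul:

       typedef long long __v2di __attribute__ ((__vector_size__ (16)));

       __v2di
       clmul_low (__v2di a, __v2di b)
       {
         return __builtin_ia32_pclmulqdq128 (a, b, 0x00);
       }  */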
26323 /* AVX */
26324 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26325 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26326 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26327 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26328 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26329 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26330 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26331 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26332 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26333 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26334 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26335 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26336 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26337 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26338 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26339 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26340 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26341 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26342 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26343 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26344 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26345 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26346 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26347 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26348 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26349 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26350
26351 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26352 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26353 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26354 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26355
26356 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26357 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26358 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26359 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26360 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26361 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26362 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26363 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26364 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26365 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26366 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26367 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26368 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26369 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26370 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26371 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26372 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26373 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26374 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26375 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26376 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26377 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26378 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26379 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26380 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26381 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26382 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26383 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26384 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26385 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26386 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26387 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26388 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26389 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26390
26391 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26392 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26393 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26394
26395 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26396 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26397 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26398 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26399 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26400
26401 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26402
26403 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26404 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26405
26406 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26407 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26408 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26409 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26410
26411 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26412
26413 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26414 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26415 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26416 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26417
26418 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26419
26420 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26421 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26422 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26423 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26424
26425 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26426 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26427 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26428 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26429 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26430 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26431
26432 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26433 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26434 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26435 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26436 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26437 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26439 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26440 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26441 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26442 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26443 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26445 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26447
26448 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26449 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26450
26451 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26452 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26453
26454 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26455
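  /* Illustrative sketch: the AVX cmppd/cmpps entries above take the
     comparison predicate as an immediate (hence the *_INT prototypes);
     _mm256_cmp_pd in avxintrin.h passes it straight through.  With -mavx:

       typedef double __v4df __attribute__ ((__vector_size__ (32)));

       __v4df
       cmp_lt (__v4df a, __v4df b)
       {
         return __builtin_ia32_cmppd256 (a, b, 0x01);
       }

     where 0x01 is the _CMP_LT_OS predicate.  */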
26456 /* AVX2 */
26457 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26458 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26459 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26460 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26461 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26462 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26463 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26464 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26465 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26466 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26467 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26468 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26469 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26470 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26471 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26472 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26473 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26474 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26475 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26476 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26477 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26478 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26479 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26480 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26481 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26482 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26483 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26484 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26485 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26486 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26487 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26488 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26489 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26490 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26491 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26492 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26493 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26494 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26495 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26496 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26497 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26498 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26499 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26500 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26501 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26502 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26503 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26504 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26505 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26506 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26507 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26508 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26509 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26510 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26511 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26512 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26513 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26514 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26515 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26516 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26517 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26518 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26519 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26520 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26521 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26522 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26523 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26524 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26525 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26526 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26527 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26528 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26529 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26530 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26531 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26532 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26533 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26534 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26535 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26536 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26537 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26538 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26539 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26540 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26541 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26542 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26543 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26544 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26545 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26546 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26547 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26548 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26549 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26550 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26551 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26552 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26553 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26554 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26555 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26556 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26557 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26558 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26559 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26560 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26561 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26562 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26563 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26564 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26565 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26566 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26567 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26568 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26569 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26570 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26571 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26572 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26573 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26574 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26575 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26576 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26577 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26578 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26579 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26580 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26581 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26582 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26583 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26584 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26585 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26586 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26587 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26588 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26589 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26590 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26591 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26592 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26593 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26594 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26595 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26596 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26597 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26598 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26599 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26600 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26601 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26602 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26603
26604 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26605
26606 /* BMI */
26607 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26608 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26609 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26610
26611 /* TBM */
26612 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26613 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26614
26615 /* F16C */
26616 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26617 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26618 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26619 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26620
26621 /* BMI2 */
26622 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26623 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26624 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26625 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26626 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26627 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26628 };
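/* Illustrative note (not part of the original source): the BMI2 entries
   above back the intrinsics in bmi2intrin.h.  For example, user code
   compiled with -mbmi2 such as

     #include <x86intrin.h>
     unsigned int
     even_bits (unsigned int mask)
     {
       return _pext_u32 (mask, 0x55555555u);
     }

   reaches CODE_FOR_bmi2_pext_si3 through __builtin_ia32_pext_si.  */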
26629
26630 /* FMA4 and XOP. */
26631 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26632 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26633 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26634 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26635 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26636 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26637 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26638 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26639 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26640 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26641 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26642 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26643 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26644 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26645 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26646 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26647 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26648 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26649 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26650 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26651 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26652 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26653 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26654 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26655 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26656 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26657 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26658 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26659 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26660 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26661 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26662 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26663 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26664 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26665 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26666 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26667 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26668 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26669 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26670 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26671 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26672 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26673 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26674 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26675 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26676 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26677 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26678 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26679 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26680 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26681 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26682 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26683
26684 static const struct builtin_description bdesc_multi_arg[] =
26685 {
26686 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26687 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26688 UNKNOWN, (int)MULTI_ARG_3_SF },
26689 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26690 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26691 UNKNOWN, (int)MULTI_ARG_3_DF },
26692
26693 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26694 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26695 UNKNOWN, (int)MULTI_ARG_3_SF },
26696 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26697 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26698 UNKNOWN, (int)MULTI_ARG_3_DF },
26699
26700 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26701 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26702 UNKNOWN, (int)MULTI_ARG_3_SF },
26703 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26704 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26705 UNKNOWN, (int)MULTI_ARG_3_DF },
26706 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26707 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26708 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26709 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26710 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26711 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26712
26713 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26714 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26715 UNKNOWN, (int)MULTI_ARG_3_SF },
26716 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26717 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26718 UNKNOWN, (int)MULTI_ARG_3_DF },
26719 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26720 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26721 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26722 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26723 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26724 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26725
26726 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26727 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26728 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26729 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26730 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26731 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26732 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26733
26734 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26735 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26736 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26737 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26738 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26739 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26740 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26741
26742 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26743
26744 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26745 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26746 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26747 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26748 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26749 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26750 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26751 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26752 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26753 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26754 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26755 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26756
26757 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26758 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26759 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26760 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26761 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26762 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26763 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26764 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26765 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26766 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26767 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26768 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26769 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26770 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26771 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26772 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26773
26774 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26775 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26776 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26777 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26778 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26779 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26780
26781 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26782 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26783 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26784 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26785 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26786 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26787 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26788 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26789 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26790 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26791 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26792 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26793 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26794 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26795 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26796
26797 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26798 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26799 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26800 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26801 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26802 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26803 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26804
26805 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26806 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26807 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26808 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26809 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26810 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26811 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26812
26813 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26814 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26815 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26816 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26817 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26818 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26819 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26820
26821 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26822 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26823 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26824 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26825 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26826 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26827 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26828
26829 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26830 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26831 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26832 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26833 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26834 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26835 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26836
26837 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26838 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26839 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26840 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26841 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26842 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26843 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26844
26845 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26846 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26847 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26848 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26849 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26850 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26851 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26852
26853 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26854 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26855 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26856 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26857 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26858 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
26859 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
26860
26861 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26862 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26863 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26864 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26865 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26866 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26867 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26868 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26869
26870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26878
26879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
26880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
26881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
26882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
26883
26884 };
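/* Illustrative note (not part of the original source): bdesc_multi_arg
   feeds ix86_expand_multi_arg_builtin below.  As an example, the FMA4
   intrinsic _mm_macc_ps from fma4intrin.h is a thin wrapper along the
   lines of

     extern __inline __m128
     _mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
     {
       return (__m128) __builtin_ia32_vfmaddps ((__v4sf) __A,
						 (__v4sf) __B,
						 (__v4sf) __C);
     }

   and expands through the MULTI_ARG_3_SF entry above.  */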
26885 \f
26886 /* TM vector builtins. */
26887
26888 /* Reuse the existing x86-specific `struct builtin_description' because
26889 we're lazy. Add casts to make them fit. */
26890 static const struct builtin_description bdesc_tm[] =
26891 {
26892 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26893 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26894 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26895 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26896 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26897 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26898 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26899
26900 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26901 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26902 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26903 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26904 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26905 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26906 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26907
26908 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26909 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26910 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26911 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26912 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26913 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26914 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26915
26916 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
26917 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
26918 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
26919 };
26920
26921 /* TM callbacks. */
26922
26923 /* Return the builtin decl needed to load a vector of TYPE. */
26924
26925 static tree
26926 ix86_builtin_tm_load (tree type)
26927 {
26928 if (TREE_CODE (type) == VECTOR_TYPE)
26929 {
26930 switch (tree_low_cst (TYPE_SIZE (type), 1))
26931 {
26932 case 64:
26933 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
26934 case 128:
26935 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
26936 case 256:
26937 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
26938 }
26939 }
26940 return NULL_TREE;
26941 }
26942
26943 /* Return the builtin decl needed to store a vector of TYPE. */
26944
26945 static tree
26946 ix86_builtin_tm_store (tree type)
26947 {
26948 if (TREE_CODE (type) == VECTOR_TYPE)
26949 {
26950 switch (tree_low_cst (TYPE_SIZE (type), 1))
26951 {
26952 case 64:
26953 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
26954 case 128:
26955 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
26956 case 256:
26957 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
26958 }
26959 }
26960 return NULL_TREE;
26961 }
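/* Illustrative note (not part of the original source): these two hooks
   are how the trans-mem pass picks a vector load/store builtin.  For a
   128-bit vector type such as

     typedef float v4sf __attribute__ ((vector_size (16)));

   tree_low_cst (TYPE_SIZE (type), 1) is 128, so the hooks return the
   decls registered below for __builtin__ITM_RM128 and
   __builtin__ITM_WM128.  */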
26962 \f
26963 /* Initialize the transactional memory vector load/store builtins. */
26964
26965 static void
26966 ix86_init_tm_builtins (void)
26967 {
26968 enum ix86_builtin_func_type ftype;
26969 const struct builtin_description *d;
26970 size_t i;
26971 tree decl;
26972 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
26973 tree attrs_log, attrs_type_log;
26974
26975 if (!flag_tm)
26976 return;
26977
26978 /* Use whatever attributes a normal TM load has. */
26979 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
26980 attrs_load = DECL_ATTRIBUTES (decl);
26981 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
26982 /* Use whatever attributes a normal TM store has. */
26983 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
26984 attrs_store = DECL_ATTRIBUTES (decl);
26985 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
26986 /* Use whatever attributes a normal TM log has. */
26987 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
26988 attrs_log = DECL_ATTRIBUTES (decl);
26989 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
26990
26991 for (i = 0, d = bdesc_tm;
26992 i < ARRAY_SIZE (bdesc_tm);
26993 i++, d++)
26994 {
26995 if ((d->mask & ix86_isa_flags) != 0
26996 || (lang_hooks.builtin_function
26997 == lang_hooks.builtin_function_ext_scope))
26998 {
26999 tree type, attrs, attrs_type;
27000 enum built_in_function code = (enum built_in_function) d->code;
27001
27002 ftype = (enum ix86_builtin_func_type) d->flag;
27003 type = ix86_get_builtin_func_type (ftype);
27004
27005 if (BUILTIN_TM_LOAD_P (code))
27006 {
27007 attrs = attrs_load;
27008 attrs_type = attrs_type_load;
27009 }
27010 else if (BUILTIN_TM_STORE_P (code))
27011 {
27012 attrs = attrs_store;
27013 attrs_type = attrs_type_store;
27014 }
27015 else
27016 {
27017 attrs = attrs_log;
27018 attrs_type = attrs_type_log;
27019 }
27020 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27021 /* The builtin without the prefix for
27022 calling it directly. */
27023 d->name + strlen ("__builtin_"),
27024 attrs);
27025 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27026 set the TYPE_ATTRIBUTES. */
27027 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27028
27029 set_builtin_decl (code, decl, false);
27030 }
27031 }
27032 }
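/* Illustrative note (not part of the original source): each TM builtin
   is registered under two names.  d->name is e.g. "__builtin__ITM_WM128",
   and d->name + strlen ("__builtin_") is "_ITM_WM128", so the libitm
   entry point can also be called directly under its unprefixed name.  */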
27033
27034 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27035    not in the current target ISA, so that the user can compile particular
27036    modules with target-specific options that differ from the command-line
27037    options.  */
27038 static void
27039 ix86_init_mmx_sse_builtins (void)
27040 {
27041 const struct builtin_description * d;
27042 enum ix86_builtin_func_type ftype;
27043 size_t i;
27044
27045 /* Add all special builtins with variable number of operands. */
27046 for (i = 0, d = bdesc_special_args;
27047 i < ARRAY_SIZE (bdesc_special_args);
27048 i++, d++)
27049 {
27050 if (d->name == 0)
27051 continue;
27052
27053 ftype = (enum ix86_builtin_func_type) d->flag;
27054 def_builtin (d->mask, d->name, ftype, d->code);
27055 }
27056
27057 /* Add all builtins with variable number of operands. */
27058 for (i = 0, d = bdesc_args;
27059 i < ARRAY_SIZE (bdesc_args);
27060 i++, d++)
27061 {
27062 if (d->name == 0)
27063 continue;
27064
27065 ftype = (enum ix86_builtin_func_type) d->flag;
27066 def_builtin_const (d->mask, d->name, ftype, d->code);
27067 }
27068
27069 /* pcmpestr[im] insns. */
27070 for (i = 0, d = bdesc_pcmpestr;
27071 i < ARRAY_SIZE (bdesc_pcmpestr);
27072 i++, d++)
27073 {
27074 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27075 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27076 else
27077 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27078 def_builtin_const (d->mask, d->name, ftype, d->code);
27079 }
27080
27081 /* pcmpistr[im] insns. */
27082 for (i = 0, d = bdesc_pcmpistr;
27083 i < ARRAY_SIZE (bdesc_pcmpistr);
27084 i++, d++)
27085 {
27086 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27087 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27088 else
27089 ftype = INT_FTYPE_V16QI_V16QI_INT;
27090 def_builtin_const (d->mask, d->name, ftype, d->code);
27091 }
27092
27093 /* comi/ucomi insns. */
27094 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27095 {
27096 if (d->mask == OPTION_MASK_ISA_SSE2)
27097 ftype = INT_FTYPE_V2DF_V2DF;
27098 else
27099 ftype = INT_FTYPE_V4SF_V4SF;
27100 def_builtin_const (d->mask, d->name, ftype, d->code);
27101 }
27102
27103 /* SSE */
27104 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27105 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27106 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27107 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27108
27109 /* SSE or 3DNow!A */
27110 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27111 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27112 IX86_BUILTIN_MASKMOVQ);
27113
27114 /* SSE2 */
27115 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27116 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27117
27118 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27119 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27120 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27121 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27122
27123 /* SSE3. */
27124 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27125 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27126 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27127 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27128
27129 /* AES */
27130 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27131 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27132 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27133 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27134 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27135 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27136 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27137 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27138 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27139 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27140 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27141 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27142
27143 /* PCLMUL */
27144 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27145 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27146
27147 /* RDRND */
27148 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27149 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27150 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27151 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27152 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27153 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27154 IX86_BUILTIN_RDRAND64_STEP);
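  /* Illustrative note (not part of the original source): the *_step
     builtins return nonzero on success and store the random value through
     the pointer, mirroring the RDRAND carry-flag protocol.  A typical use
     via immintrin.h is

       unsigned int r;
       while (!_rdrand32_step (&r))
	 continue;

     which compiles down to __builtin_ia32_rdrand32_step.  */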
27155
27156 /* AVX2 */
27157 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27158 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27159 IX86_BUILTIN_GATHERSIV2DF);
27160
27161 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27162 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27163 IX86_BUILTIN_GATHERSIV4DF);
27164
27165 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27166 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27167 IX86_BUILTIN_GATHERDIV2DF);
27168
27169 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27170 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27171 IX86_BUILTIN_GATHERDIV4DF);
27172
27173 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27174 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27175 IX86_BUILTIN_GATHERSIV4SF);
27176
27177 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27178 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27179 IX86_BUILTIN_GATHERSIV8SF);
27180
27181 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27182 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27183 IX86_BUILTIN_GATHERDIV4SF);
27184
27185 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27186 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27187 IX86_BUILTIN_GATHERDIV8SF);
27188
27189 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27190 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27191 IX86_BUILTIN_GATHERSIV2DI);
27192
27193 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27194 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27195 IX86_BUILTIN_GATHERSIV4DI);
27196
27197 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27198 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27199 IX86_BUILTIN_GATHERDIV2DI);
27200
27201 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27202 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27203 IX86_BUILTIN_GATHERDIV4DI);
27204
27205 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27206 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27207 IX86_BUILTIN_GATHERSIV4SI);
27208
27209 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27210 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27211 IX86_BUILTIN_GATHERSIV8SI);
27212
27213 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27214 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27215 IX86_BUILTIN_GATHERDIV4SI);
27216
27217 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27218 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27219 IX86_BUILTIN_GATHERDIV8SI);
27220
27221   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27222 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27223 IX86_BUILTIN_GATHERALTSIV4DF);
27224
27225   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27226 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27227 IX86_BUILTIN_GATHERALTDIV8SF);
27228
27229   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27230 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27231 IX86_BUILTIN_GATHERALTSIV4DI);
27232
27233   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27234 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27235 IX86_BUILTIN_GATHERALTDIV8SI);
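  /* Illustrative note (not part of the original source): with -mavx2 the
     gather builtins above are reached through the avx2intrin.h wrappers,
     e.g.

       #include <immintrin.h>
       __m256d
       gather4 (const double *base, __m128i idx)
       {
	 return _mm256_i32gather_pd (base, idx, 8);
       }

     which ultimately expands __builtin_ia32_gathersiv4df
     (V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT above).  The names gather4,
     base and idx are only for this sketch.  */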
27236
27237 /* MMX access to the vec_init patterns. */
27238 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27239 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27240
27241 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27242 V4HI_FTYPE_HI_HI_HI_HI,
27243 IX86_BUILTIN_VEC_INIT_V4HI);
27244
27245 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27246 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27247 IX86_BUILTIN_VEC_INIT_V8QI);
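  /* Illustrative note (not part of the original source): these vec_init
     builtins back the MMX set intrinsics; mmintrin.h defines roughly

       extern __inline __m64
       _mm_set_pi32 (int __i1, int __i0)
       {
	 return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
       }

     so IX86_BUILTIN_VEC_INIT_V2SI expands to the V2SI vec_init pattern.  */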
27248
27249 /* Access to the vec_extract patterns. */
27250 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27251 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27252 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27253 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27254 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27255 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27256 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27257 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27258 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27259 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27260
27261 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27262 "__builtin_ia32_vec_ext_v4hi",
27263 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27264
27265 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27266 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27267
27268 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27269 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
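  /* Illustrative note (not part of the original source): the vec_ext
     builtins are what the element-extract intrinsics lower to; e.g.
     emmintrin.h implements _mm_extract_epi16 roughly as

       (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi) __A, __N)

     which matches HI_FTYPE_V8HI_INT above.  */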
27270
27271 /* Access to the vec_set patterns. */
27272 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27273 "__builtin_ia32_vec_set_v2di",
27274 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27275
27276 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27277 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27278
27279 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27280 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27281
27282 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27283 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27284
27285 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27286 "__builtin_ia32_vec_set_v4hi",
27287 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27288
27289 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27290 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
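  /* Illustrative note (not part of the original source): likewise the
     vec_set builtins back the element-insert intrinsics; emmintrin.h
     implements _mm_insert_epi16 roughly as

       (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __A, __D, __N)

     matching V8HI_FTYPE_V8HI_HI_INT above.  */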
27291
27292   /* Add the FMA4 and XOP multi-arg builtins.  */
27293 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27294 {
27295 if (d->name == 0)
27296 continue;
27297
27298 ftype = (enum ix86_builtin_func_type) d->flag;
27299 def_builtin_const (d->mask, d->name, ftype, d->code);
27300 }
27301 }
27302
27303 /* Internal method for ix86_init_builtins. */
27304
27305 static void
27306 ix86_init_builtins_va_builtins_abi (void)
27307 {
27308 tree ms_va_ref, sysv_va_ref;
27309 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27310 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27311 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27312 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27313
27314 if (!TARGET_64BIT)
27315 return;
27316 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27317 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27318 ms_va_ref = build_reference_type (ms_va_list_type_node);
27319 sysv_va_ref =
27320 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27321
27322 fnvoid_va_end_ms =
27323 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27324 fnvoid_va_start_ms =
27325 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27326 fnvoid_va_end_sysv =
27327 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27328 fnvoid_va_start_sysv =
27329 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27330 NULL_TREE);
27331 fnvoid_va_copy_ms =
27332 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27333 NULL_TREE);
27334 fnvoid_va_copy_sysv =
27335 build_function_type_list (void_type_node, sysv_va_ref,
27336 sysv_va_ref, NULL_TREE);
27337
27338 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27339 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27340 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27341 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27342 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27343 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27344 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27345 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27346 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27347 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27348 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27349 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27350 }
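/* Illustrative sketch (not part of the original source): the builtins
   registered above let 64-bit code drive the "other" ABI's varargs
   machinery explicitly, e.g.

     void __attribute__ ((ms_abi))
     msabi_varargs (const char *fmt, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, fmt);
       __builtin_ms_va_end (ap);
     }

   va_start/va_end in an ms_abi function are lowered to these entries.  */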
27351
27352 static void
27353 ix86_init_builtin_types (void)
27354 {
27355 tree float128_type_node, float80_type_node;
27356
27357 /* The __float80 type. */
27358 float80_type_node = long_double_type_node;
27359 if (TYPE_MODE (float80_type_node) != XFmode)
27360 {
27361       /* long double is not XFmode here, so build a separate __float80 type.  */
27362 float80_type_node = make_node (REAL_TYPE);
27363
27364 TYPE_PRECISION (float80_type_node) = 80;
27365 layout_type (float80_type_node);
27366 }
27367 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27368
27369 /* The __float128 type. */
27370 float128_type_node = make_node (REAL_TYPE);
27371 TYPE_PRECISION (float128_type_node) = 128;
27372 layout_type (float128_type_node);
27373 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27374
27375 /* This macro is built by i386-builtin-types.awk. */
27376 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27377 }
27378
27379 static void
27380 ix86_init_builtins (void)
27381 {
27382 tree t;
27383
27384 ix86_init_builtin_types ();
27385
27386 /* TFmode support builtins. */
27387 def_builtin_const (0, "__builtin_infq",
27388 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27389 def_builtin_const (0, "__builtin_huge_valq",
27390 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27391
27392   /* We will expand them to a normal call if SSE2 isn't available, since
27393      they are used by libgcc.  */
27394 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27395 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27396 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27397 TREE_READONLY (t) = 1;
27398 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27399
27400 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27401 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27402 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27403 TREE_READONLY (t) = 1;
27404 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
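  /* Illustrative note (not part of the original source): these give C code
     access to the TFmode helpers, e.g.

       __float128
       magnitude (__float128 x)
       {
	 return __builtin_fabsq (x);
       }

     which is either expanded inline or, without SSE2, emitted as a call to
     __fabstf2 in libgcc as described above.  */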
27405
27406 ix86_init_tm_builtins ();
27407 ix86_init_mmx_sse_builtins ();
27408
27409 if (TARGET_LP64)
27410 ix86_init_builtins_va_builtins_abi ();
27411
27412 #ifdef SUBTARGET_INIT_BUILTINS
27413 SUBTARGET_INIT_BUILTINS;
27414 #endif
27415 }
27416
27417 /* Return the ix86 builtin for CODE. */
27418
27419 static tree
27420 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27421 {
27422 if (code >= IX86_BUILTIN_MAX)
27423 return error_mark_node;
27424
27425 return ix86_builtins[code];
27426 }
27427
27428 /* Errors in the source file can cause expand_expr to return const0_rtx
27429 where we expect a vector. To avoid crashing, use one of the vector
27430 clear instructions. */
27431 static rtx
27432 safe_vector_operand (rtx x, enum machine_mode mode)
27433 {
27434 if (x == const0_rtx)
27435 x = CONST0_RTX (mode);
27436 return x;
27437 }
27438
27439 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27440
27441 static rtx
27442 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27443 {
27444 rtx pat;
27445 tree arg0 = CALL_EXPR_ARG (exp, 0);
27446 tree arg1 = CALL_EXPR_ARG (exp, 1);
27447 rtx op0 = expand_normal (arg0);
27448 rtx op1 = expand_normal (arg1);
27449 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27450 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27451 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27452
27453 if (VECTOR_MODE_P (mode0))
27454 op0 = safe_vector_operand (op0, mode0);
27455 if (VECTOR_MODE_P (mode1))
27456 op1 = safe_vector_operand (op1, mode1);
27457
27458 if (optimize || !target
27459 || GET_MODE (target) != tmode
27460 || !insn_data[icode].operand[0].predicate (target, tmode))
27461 target = gen_reg_rtx (tmode);
27462
27463 if (GET_MODE (op1) == SImode && mode1 == TImode)
27464 {
27465 rtx x = gen_reg_rtx (V4SImode);
27466 emit_insn (gen_sse2_loadd (x, op1));
27467 op1 = gen_lowpart (TImode, x);
27468 }
27469
27470 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27471 op0 = copy_to_mode_reg (mode0, op0);
27472 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27473 op1 = copy_to_mode_reg (mode1, op1);
27474
27475 pat = GEN_FCN (icode) (target, op0, op1);
27476 if (! pat)
27477 return 0;
27478
27479 emit_insn (pat);
27480
27481 return target;
27482 }
27483
27484 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27485
27486 static rtx
27487 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27488 enum ix86_builtin_func_type m_type,
27489 enum rtx_code sub_code)
27490 {
27491 rtx pat;
27492 int i;
27493 int nargs;
27494 bool comparison_p = false;
27495 bool tf_p = false;
27496 bool last_arg_constant = false;
27497 int num_memory = 0;
27498 struct {
27499 rtx op;
27500 enum machine_mode mode;
27501 } args[4];
27502
27503 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27504
27505 switch (m_type)
27506 {
27507 case MULTI_ARG_4_DF2_DI_I:
27508 case MULTI_ARG_4_DF2_DI_I1:
27509 case MULTI_ARG_4_SF2_SI_I:
27510 case MULTI_ARG_4_SF2_SI_I1:
27511 nargs = 4;
27512 last_arg_constant = true;
27513 break;
27514
27515 case MULTI_ARG_3_SF:
27516 case MULTI_ARG_3_DF:
27517 case MULTI_ARG_3_SF2:
27518 case MULTI_ARG_3_DF2:
27519 case MULTI_ARG_3_DI:
27520 case MULTI_ARG_3_SI:
27521 case MULTI_ARG_3_SI_DI:
27522 case MULTI_ARG_3_HI:
27523 case MULTI_ARG_3_HI_SI:
27524 case MULTI_ARG_3_QI:
27525 case MULTI_ARG_3_DI2:
27526 case MULTI_ARG_3_SI2:
27527 case MULTI_ARG_3_HI2:
27528 case MULTI_ARG_3_QI2:
27529 nargs = 3;
27530 break;
27531
27532 case MULTI_ARG_2_SF:
27533 case MULTI_ARG_2_DF:
27534 case MULTI_ARG_2_DI:
27535 case MULTI_ARG_2_SI:
27536 case MULTI_ARG_2_HI:
27537 case MULTI_ARG_2_QI:
27538 nargs = 2;
27539 break;
27540
27541 case MULTI_ARG_2_DI_IMM:
27542 case MULTI_ARG_2_SI_IMM:
27543 case MULTI_ARG_2_HI_IMM:
27544 case MULTI_ARG_2_QI_IMM:
27545 nargs = 2;
27546 last_arg_constant = true;
27547 break;
27548
27549 case MULTI_ARG_1_SF:
27550 case MULTI_ARG_1_DF:
27551 case MULTI_ARG_1_SF2:
27552 case MULTI_ARG_1_DF2:
27553 case MULTI_ARG_1_DI:
27554 case MULTI_ARG_1_SI:
27555 case MULTI_ARG_1_HI:
27556 case MULTI_ARG_1_QI:
27557 case MULTI_ARG_1_SI_DI:
27558 case MULTI_ARG_1_HI_DI:
27559 case MULTI_ARG_1_HI_SI:
27560 case MULTI_ARG_1_QI_DI:
27561 case MULTI_ARG_1_QI_SI:
27562 case MULTI_ARG_1_QI_HI:
27563 nargs = 1;
27564 break;
27565
27566 case MULTI_ARG_2_DI_CMP:
27567 case MULTI_ARG_2_SI_CMP:
27568 case MULTI_ARG_2_HI_CMP:
27569 case MULTI_ARG_2_QI_CMP:
27570 nargs = 2;
27571 comparison_p = true;
27572 break;
27573
27574 case MULTI_ARG_2_SF_TF:
27575 case MULTI_ARG_2_DF_TF:
27576 case MULTI_ARG_2_DI_TF:
27577 case MULTI_ARG_2_SI_TF:
27578 case MULTI_ARG_2_HI_TF:
27579 case MULTI_ARG_2_QI_TF:
27580 nargs = 2;
27581 tf_p = true;
27582 break;
27583
27584 default:
27585 gcc_unreachable ();
27586 }
27587
27588 if (optimize || !target
27589 || GET_MODE (target) != tmode
27590 || !insn_data[icode].operand[0].predicate (target, tmode))
27591 target = gen_reg_rtx (tmode);
27592
27593 gcc_assert (nargs <= 4);
27594
27595 for (i = 0; i < nargs; i++)
27596 {
27597 tree arg = CALL_EXPR_ARG (exp, i);
27598 rtx op = expand_normal (arg);
27599 int adjust = (comparison_p) ? 1 : 0;
27600 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27601
27602 if (last_arg_constant && i == nargs - 1)
27603 {
27604 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27605 {
27606 enum insn_code new_icode = icode;
27607 switch (icode)
27608 {
27609 case CODE_FOR_xop_vpermil2v2df3:
27610 case CODE_FOR_xop_vpermil2v4sf3:
27611 case CODE_FOR_xop_vpermil2v4df3:
27612 case CODE_FOR_xop_vpermil2v8sf3:
27613 error ("the last argument must be a 2-bit immediate");
27614 return gen_reg_rtx (tmode);
27615 case CODE_FOR_xop_rotlv2di3:
27616 new_icode = CODE_FOR_rotlv2di3;
27617 goto xop_rotl;
27618 case CODE_FOR_xop_rotlv4si3:
27619 new_icode = CODE_FOR_rotlv4si3;
27620 goto xop_rotl;
27621 case CODE_FOR_xop_rotlv8hi3:
27622 new_icode = CODE_FOR_rotlv8hi3;
27623 goto xop_rotl;
27624 case CODE_FOR_xop_rotlv16qi3:
27625 new_icode = CODE_FOR_rotlv16qi3;
27626 xop_rotl:
27627 if (CONST_INT_P (op))
27628 {
27629 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27630 op = GEN_INT (INTVAL (op) & mask);
27631 gcc_checking_assert
27632 (insn_data[icode].operand[i + 1].predicate (op, mode));
27633 }
27634 else
27635 {
27636 gcc_checking_assert
27637 (nargs == 2
27638 && insn_data[new_icode].operand[0].mode == tmode
27639 && insn_data[new_icode].operand[1].mode == tmode
27640 && insn_data[new_icode].operand[2].mode == mode
27641 && insn_data[new_icode].operand[0].predicate
27642 == insn_data[icode].operand[0].predicate
27643 && insn_data[new_icode].operand[1].predicate
27644 == insn_data[icode].operand[1].predicate);
27645 icode = new_icode;
27646 goto non_constant;
27647 }
27648 break;
27649 default:
27650 gcc_unreachable ();
27651 }
27652 }
27653 }
27654 else
27655 {
27656 non_constant:
27657 if (VECTOR_MODE_P (mode))
27658 op = safe_vector_operand (op, mode);
27659
27660 /* If we aren't optimizing, only allow one memory operand to be
27661 generated. */
27662 if (memory_operand (op, mode))
27663 num_memory++;
27664
27665 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27666
27667 if (optimize
27668 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27669 || num_memory > 1)
27670 op = force_reg (mode, op);
27671 }
27672
27673 args[i].op = op;
27674 args[i].mode = mode;
27675 }
27676
27677 switch (nargs)
27678 {
27679 case 1:
27680 pat = GEN_FCN (icode) (target, args[0].op);
27681 break;
27682
27683 case 2:
27684 if (tf_p)
27685 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27686 GEN_INT ((int)sub_code));
27687 else if (! comparison_p)
27688 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27689 else
27690 {
27691 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27692 args[0].op,
27693 args[1].op);
27694
27695 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27696 }
27697 break;
27698
27699 case 3:
27700 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27701 break;
27702
27703 case 4:
27704 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27705 break;
27706
27707 default:
27708 gcc_unreachable ();
27709 }
27710
27711 if (! pat)
27712 return 0;
27713
27714 emit_insn (pat);
27715 return target;
27716 }
27717
27718 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27719 insns with vec_merge. */
27720
27721 static rtx
27722 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27723 rtx target)
27724 {
27725 rtx pat;
27726 tree arg0 = CALL_EXPR_ARG (exp, 0);
27727 rtx op1, op0 = expand_normal (arg0);
27728 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27729 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27730
27731 if (optimize || !target
27732 || GET_MODE (target) != tmode
27733 || !insn_data[icode].operand[0].predicate (target, tmode))
27734 target = gen_reg_rtx (tmode);
27735
27736 if (VECTOR_MODE_P (mode0))
27737 op0 = safe_vector_operand (op0, mode0);
27738
27739 if ((optimize && !register_operand (op0, mode0))
27740 || !insn_data[icode].operand[1].predicate (op0, mode0))
27741 op0 = copy_to_mode_reg (mode0, op0);
27742
27743 op1 = op0;
27744 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27745 op1 = copy_to_mode_reg (mode0, op1);
27746
27747 pat = GEN_FCN (icode) (target, op0, op1);
27748 if (! pat)
27749 return 0;
27750 emit_insn (pat);
27751 return target;
27752 }
27753
27754 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27755
27756 static rtx
27757 ix86_expand_sse_compare (const struct builtin_description *d,
27758 tree exp, rtx target, bool swap)
27759 {
27760 rtx pat;
27761 tree arg0 = CALL_EXPR_ARG (exp, 0);
27762 tree arg1 = CALL_EXPR_ARG (exp, 1);
27763 rtx op0 = expand_normal (arg0);
27764 rtx op1 = expand_normal (arg1);
27765 rtx op2;
27766 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27767 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27768 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27769 enum rtx_code comparison = d->comparison;
27770
27771 if (VECTOR_MODE_P (mode0))
27772 op0 = safe_vector_operand (op0, mode0);
27773 if (VECTOR_MODE_P (mode1))
27774 op1 = safe_vector_operand (op1, mode1);
27775
27776 /* Swap operands if we have a comparison that isn't available in
27777 hardware. */
27778 if (swap)
27779 {
27780 rtx tmp = gen_reg_rtx (mode1);
27781 emit_move_insn (tmp, op1);
27782 op1 = op0;
27783 op0 = tmp;
27784 }
27785
27786 if (optimize || !target
27787 || GET_MODE (target) != tmode
27788 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27789 target = gen_reg_rtx (tmode);
27790
27791 if ((optimize && !register_operand (op0, mode0))
27792 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27793 op0 = copy_to_mode_reg (mode0, op0);
27794 if ((optimize && !register_operand (op1, mode1))
27795 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27796 op1 = copy_to_mode_reg (mode1, op1);
27797
27798 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27799 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27800 if (! pat)
27801 return 0;
27802 emit_insn (pat);
27803 return target;
27804 }
27805
27806 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
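/* E.g. (illustrative) the _mm_comieq_sd intrinsic reaches this routine:
   a comisd sets the flags and the EQ test on them becomes the int
   result returned to the user.  */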
27807
27808 static rtx
27809 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27810 rtx target)
27811 {
27812 rtx pat;
27813 tree arg0 = CALL_EXPR_ARG (exp, 0);
27814 tree arg1 = CALL_EXPR_ARG (exp, 1);
27815 rtx op0 = expand_normal (arg0);
27816 rtx op1 = expand_normal (arg1);
27817 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27818 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27819 enum rtx_code comparison = d->comparison;
27820
27821 if (VECTOR_MODE_P (mode0))
27822 op0 = safe_vector_operand (op0, mode0);
27823 if (VECTOR_MODE_P (mode1))
27824 op1 = safe_vector_operand (op1, mode1);
27825
27826 /* Swap operands if we have a comparison that isn't available in
27827 hardware. */
27828 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27829 {
27830 rtx tmp = op1;
27831 op1 = op0;
27832 op0 = tmp;
27833 }
27834
27835 target = gen_reg_rtx (SImode);
27836 emit_move_insn (target, const0_rtx);
27837 target = gen_rtx_SUBREG (QImode, target, 0);
27838
27839 if ((optimize && !register_operand (op0, mode0))
27840 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27841 op0 = copy_to_mode_reg (mode0, op0);
27842 if ((optimize && !register_operand (op1, mode1))
27843 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27844 op1 = copy_to_mode_reg (mode1, op1);
27845
27846 pat = GEN_FCN (d->icode) (op0, op1);
27847 if (! pat)
27848 return 0;
27849 emit_insn (pat);
27850 emit_insn (gen_rtx_SET (VOIDmode,
27851 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27852 gen_rtx_fmt_ee (comparison, QImode,
27853 SET_DEST (pat),
27854 const0_rtx)));
27855
27856 return SUBREG_REG (target);
27857 }
27858
27859 /* Subroutine of ix86_expand_args_builtin to take care of round insns. */
27860
27861 static rtx
27862 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
27863 rtx target)
27864 {
27865 rtx pat;
27866 tree arg0 = CALL_EXPR_ARG (exp, 0);
27867 rtx op1, op0 = expand_normal (arg0);
27868 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27869 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27870
27871 if (optimize || target == 0
27872 || GET_MODE (target) != tmode
27873 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27874 target = gen_reg_rtx (tmode);
27875
27876 if (VECTOR_MODE_P (mode0))
27877 op0 = safe_vector_operand (op0, mode0);
27878
27879 if ((optimize && !register_operand (op0, mode0))
27880 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27881 op0 = copy_to_mode_reg (mode0, op0);
27882
27883 op1 = GEN_INT (d->comparison);
27884
27885 pat = GEN_FCN (d->icode) (target, op0, op1);
27886 if (! pat)
27887 return 0;
27888 emit_insn (pat);
27889 return target;
27890 }
27891
27892 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
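/* E.g. (illustrative) _mm_testz_si128 (a, b) comes through here: ptest
   sets ZF from a & b and the routine turns that flag into the 0/1 int
   result.  */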
27893
27894 static rtx
27895 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
27896 rtx target)
27897 {
27898 rtx pat;
27899 tree arg0 = CALL_EXPR_ARG (exp, 0);
27900 tree arg1 = CALL_EXPR_ARG (exp, 1);
27901 rtx op0 = expand_normal (arg0);
27902 rtx op1 = expand_normal (arg1);
27903 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27904 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27905 enum rtx_code comparison = d->comparison;
27906
27907 if (VECTOR_MODE_P (mode0))
27908 op0 = safe_vector_operand (op0, mode0);
27909 if (VECTOR_MODE_P (mode1))
27910 op1 = safe_vector_operand (op1, mode1);
27911
27912 target = gen_reg_rtx (SImode);
27913 emit_move_insn (target, const0_rtx);
27914 target = gen_rtx_SUBREG (QImode, target, 0);
27915
27916 if ((optimize && !register_operand (op0, mode0))
27917 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27918 op0 = copy_to_mode_reg (mode0, op0);
27919 if ((optimize && !register_operand (op1, mode1))
27920 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27921 op1 = copy_to_mode_reg (mode1, op1);
27922
27923 pat = GEN_FCN (d->icode) (op0, op1);
27924 if (! pat)
27925 return 0;
27926 emit_insn (pat);
27927 emit_insn (gen_rtx_SET (VOIDmode,
27928 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27929 gen_rtx_fmt_ee (comparison, QImode,
27930 SET_DEST (pat),
27931 const0_rtx)));
27932
27933 return SUBREG_REG (target);
27934 }
27935
27936 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
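/* Illustrative user-level forms that reach this routine (SSE4.2
   explicit-length string intrinsics; IMM must be a compile-time
   8-bit constant):

     int     idx = _mm_cmpestri (a, la, b, lb, IMM);   index result
     __m128i msk = _mm_cmpestrm (a, la, b, lb, IMM);   mask result
     int     zf  = _mm_cmpestrz (a, la, b, lb, IMM);   flag result
*/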
27937
27938 static rtx
27939 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
27940 tree exp, rtx target)
27941 {
27942 rtx pat;
27943 tree arg0 = CALL_EXPR_ARG (exp, 0);
27944 tree arg1 = CALL_EXPR_ARG (exp, 1);
27945 tree arg2 = CALL_EXPR_ARG (exp, 2);
27946 tree arg3 = CALL_EXPR_ARG (exp, 3);
27947 tree arg4 = CALL_EXPR_ARG (exp, 4);
27948 rtx scratch0, scratch1;
27949 rtx op0 = expand_normal (arg0);
27950 rtx op1 = expand_normal (arg1);
27951 rtx op2 = expand_normal (arg2);
27952 rtx op3 = expand_normal (arg3);
27953 rtx op4 = expand_normal (arg4);
27954 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
27955
27956 tmode0 = insn_data[d->icode].operand[0].mode;
27957 tmode1 = insn_data[d->icode].operand[1].mode;
27958 modev2 = insn_data[d->icode].operand[2].mode;
27959 modei3 = insn_data[d->icode].operand[3].mode;
27960 modev4 = insn_data[d->icode].operand[4].mode;
27961 modei5 = insn_data[d->icode].operand[5].mode;
27962 modeimm = insn_data[d->icode].operand[6].mode;
27963
27964 if (VECTOR_MODE_P (modev2))
27965 op0 = safe_vector_operand (op0, modev2);
27966 if (VECTOR_MODE_P (modev4))
27967 op2 = safe_vector_operand (op2, modev4);
27968
27969 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
27970 op0 = copy_to_mode_reg (modev2, op0);
27971 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
27972 op1 = copy_to_mode_reg (modei3, op1);
27973 if ((optimize && !register_operand (op2, modev4))
27974 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
27975 op2 = copy_to_mode_reg (modev4, op2);
27976 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
27977 op3 = copy_to_mode_reg (modei5, op3);
27978
27979 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
27980 {
27981 error ("the fifth argument must be an 8-bit immediate");
27982 return const0_rtx;
27983 }
27984
27985 if (d->code == IX86_BUILTIN_PCMPESTRI128)
27986 {
27987 if (optimize || !target
27988 || GET_MODE (target) != tmode0
27989 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
27990 target = gen_reg_rtx (tmode0);
27991
27992 scratch1 = gen_reg_rtx (tmode1);
27993
27994 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
27995 }
27996 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
27997 {
27998 if (optimize || !target
27999 || GET_MODE (target) != tmode1
28000 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28001 target = gen_reg_rtx (tmode1);
28002
28003 scratch0 = gen_reg_rtx (tmode0);
28004
28005 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28006 }
28007 else
28008 {
28009 gcc_assert (d->flag);
28010
28011 scratch0 = gen_reg_rtx (tmode0);
28012 scratch1 = gen_reg_rtx (tmode1);
28013
28014 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28015 }
28016
28017 if (! pat)
28018 return 0;
28019
28020 emit_insn (pat);
28021
28022 if (d->flag)
28023 {
28024 target = gen_reg_rtx (SImode);
28025 emit_move_insn (target, const0_rtx);
28026 target = gen_rtx_SUBREG (QImode, target, 0);
28027
28028 emit_insn
28029 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28030 gen_rtx_fmt_ee (EQ, QImode,
28031 gen_rtx_REG ((enum machine_mode) d->flag,
28032 FLAGS_REG),
28033 const0_rtx)));
28034 return SUBREG_REG (target);
28035 }
28036 else
28037 return target;
28038 }
28039
28040
28041 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28042
28043 static rtx
28044 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28045 tree exp, rtx target)
28046 {
28047 rtx pat;
28048 tree arg0 = CALL_EXPR_ARG (exp, 0);
28049 tree arg1 = CALL_EXPR_ARG (exp, 1);
28050 tree arg2 = CALL_EXPR_ARG (exp, 2);
28051 rtx scratch0, scratch1;
28052 rtx op0 = expand_normal (arg0);
28053 rtx op1 = expand_normal (arg1);
28054 rtx op2 = expand_normal (arg2);
28055 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28056
28057 tmode0 = insn_data[d->icode].operand[0].mode;
28058 tmode1 = insn_data[d->icode].operand[1].mode;
28059 modev2 = insn_data[d->icode].operand[2].mode;
28060 modev3 = insn_data[d->icode].operand[3].mode;
28061 modeimm = insn_data[d->icode].operand[4].mode;
28062
28063 if (VECTOR_MODE_P (modev2))
28064 op0 = safe_vector_operand (op0, modev2);
28065 if (VECTOR_MODE_P (modev3))
28066 op1 = safe_vector_operand (op1, modev3);
28067
28068 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28069 op0 = copy_to_mode_reg (modev2, op0);
28070 if ((optimize && !register_operand (op1, modev3))
28071 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28072 op1 = copy_to_mode_reg (modev3, op1);
28073
28074 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28075 {
28076 error ("the third argument must be an 8-bit immediate");
28077 return const0_rtx;
28078 }
28079
28080 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28081 {
28082 if (optimize || !target
28083 || GET_MODE (target) != tmode0
28084 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28085 target = gen_reg_rtx (tmode0);
28086
28087 scratch1 = gen_reg_rtx (tmode1);
28088
28089 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28090 }
28091 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28092 {
28093 if (optimize || !target
28094 || GET_MODE (target) != tmode1
28095 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28096 target = gen_reg_rtx (tmode1);
28097
28098 scratch0 = gen_reg_rtx (tmode0);
28099
28100 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28101 }
28102 else
28103 {
28104 gcc_assert (d->flag);
28105
28106 scratch0 = gen_reg_rtx (tmode0);
28107 scratch1 = gen_reg_rtx (tmode1);
28108
28109 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28110 }
28111
28112 if (! pat)
28113 return 0;
28114
28115 emit_insn (pat);
28116
28117 if (d->flag)
28118 {
28119 target = gen_reg_rtx (SImode);
28120 emit_move_insn (target, const0_rtx);
28121 target = gen_rtx_SUBREG (QImode, target, 0);
28122
28123 emit_insn
28124 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28125 gen_rtx_fmt_ee (EQ, QImode,
28126 gen_rtx_REG ((enum machine_mode) d->flag,
28127 FLAGS_REG),
28128 const0_rtx)));
28129 return SUBREG_REG (target);
28130 }
28131 else
28132 return target;
28133 }
28134
28135 /* Subroutine of ix86_expand_builtin to take care of insns with
28136 variable number of operands. */
28137
28138 static rtx
28139 ix86_expand_args_builtin (const struct builtin_description *d,
28140 tree exp, rtx target)
28141 {
28142 rtx pat, real_target;
28143 unsigned int i, nargs;
28144 unsigned int nargs_constant = 0;
28145 int num_memory = 0;
28146 struct
28147 {
28148 rtx op;
28149 enum machine_mode mode;
28150 } args[4];
28151 bool last_arg_count = false;
28152 enum insn_code icode = d->icode;
28153 const struct insn_data_d *insn_p = &insn_data[icode];
28154 enum machine_mode tmode = insn_p->operand[0].mode;
28155 enum machine_mode rmode = VOIDmode;
28156 bool swap = false;
28157 enum rtx_code comparison = d->comparison;
28158
28159 switch ((enum ix86_builtin_func_type) d->flag)
28160 {
28161 case V2DF_FTYPE_V2DF_ROUND:
28162 case V4DF_FTYPE_V4DF_ROUND:
28163 case V4SF_FTYPE_V4SF_ROUND:
28164 case V8SF_FTYPE_V8SF_ROUND:
28165 return ix86_expand_sse_round (d, exp, target);
28166 case INT_FTYPE_V8SF_V8SF_PTEST:
28167 case INT_FTYPE_V4DI_V4DI_PTEST:
28168 case INT_FTYPE_V4DF_V4DF_PTEST:
28169 case INT_FTYPE_V4SF_V4SF_PTEST:
28170 case INT_FTYPE_V2DI_V2DI_PTEST:
28171 case INT_FTYPE_V2DF_V2DF_PTEST:
28172 return ix86_expand_sse_ptest (d, exp, target);
28173 case FLOAT128_FTYPE_FLOAT128:
28174 case FLOAT_FTYPE_FLOAT:
28175 case INT_FTYPE_INT:
28176 case UINT64_FTYPE_INT:
28177 case UINT16_FTYPE_UINT16:
28178 case INT64_FTYPE_INT64:
28179 case INT64_FTYPE_V4SF:
28180 case INT64_FTYPE_V2DF:
28181 case INT_FTYPE_V16QI:
28182 case INT_FTYPE_V8QI:
28183 case INT_FTYPE_V8SF:
28184 case INT_FTYPE_V4DF:
28185 case INT_FTYPE_V4SF:
28186 case INT_FTYPE_V2DF:
28187 case INT_FTYPE_V32QI:
28188 case V16QI_FTYPE_V16QI:
28189 case V8SI_FTYPE_V8SF:
28190 case V8SI_FTYPE_V4SI:
28191 case V8HI_FTYPE_V8HI:
28192 case V8HI_FTYPE_V16QI:
28193 case V8QI_FTYPE_V8QI:
28194 case V8SF_FTYPE_V8SF:
28195 case V8SF_FTYPE_V8SI:
28196 case V8SF_FTYPE_V4SF:
28197 case V8SF_FTYPE_V8HI:
28198 case V4SI_FTYPE_V4SI:
28199 case V4SI_FTYPE_V16QI:
28200 case V4SI_FTYPE_V4SF:
28201 case V4SI_FTYPE_V8SI:
28202 case V4SI_FTYPE_V8HI:
28203 case V4SI_FTYPE_V4DF:
28204 case V4SI_FTYPE_V2DF:
28205 case V4HI_FTYPE_V4HI:
28206 case V4DF_FTYPE_V4DF:
28207 case V4DF_FTYPE_V4SI:
28208 case V4DF_FTYPE_V4SF:
28209 case V4DF_FTYPE_V2DF:
28210 case V4SF_FTYPE_V4SF:
28211 case V4SF_FTYPE_V4SI:
28212 case V4SF_FTYPE_V8SF:
28213 case V4SF_FTYPE_V4DF:
28214 case V4SF_FTYPE_V8HI:
28215 case V4SF_FTYPE_V2DF:
28216 case V2DI_FTYPE_V2DI:
28217 case V2DI_FTYPE_V16QI:
28218 case V2DI_FTYPE_V8HI:
28219 case V2DI_FTYPE_V4SI:
28220 case V2DF_FTYPE_V2DF:
28221 case V2DF_FTYPE_V4SI:
28222 case V2DF_FTYPE_V4DF:
28223 case V2DF_FTYPE_V4SF:
28224 case V2DF_FTYPE_V2SI:
28225 case V2SI_FTYPE_V2SI:
28226 case V2SI_FTYPE_V4SF:
28227 case V2SI_FTYPE_V2SF:
28228 case V2SI_FTYPE_V2DF:
28229 case V2SF_FTYPE_V2SF:
28230 case V2SF_FTYPE_V2SI:
28231 case V32QI_FTYPE_V32QI:
28232 case V32QI_FTYPE_V16QI:
28233 case V16HI_FTYPE_V16HI:
28234 case V16HI_FTYPE_V8HI:
28235 case V8SI_FTYPE_V8SI:
28236 case V16HI_FTYPE_V16QI:
28237 case V8SI_FTYPE_V16QI:
28238 case V4DI_FTYPE_V16QI:
28239 case V8SI_FTYPE_V8HI:
28240 case V4DI_FTYPE_V8HI:
28241 case V4DI_FTYPE_V4SI:
28242 case V4DI_FTYPE_V2DI:
28243 nargs = 1;
28244 break;
28245 case V4SF_FTYPE_V4SF_VEC_MERGE:
28246 case V2DF_FTYPE_V2DF_VEC_MERGE:
28247 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28248 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28249 case V16QI_FTYPE_V16QI_V16QI:
28250 case V16QI_FTYPE_V8HI_V8HI:
28251 case V8QI_FTYPE_V8QI_V8QI:
28252 case V8QI_FTYPE_V4HI_V4HI:
28253 case V8HI_FTYPE_V8HI_V8HI:
28254 case V8HI_FTYPE_V16QI_V16QI:
28255 case V8HI_FTYPE_V4SI_V4SI:
28256 case V8SF_FTYPE_V8SF_V8SF:
28257 case V8SF_FTYPE_V8SF_V8SI:
28258 case V4SI_FTYPE_V4SI_V4SI:
28259 case V4SI_FTYPE_V8HI_V8HI:
28260 case V4SI_FTYPE_V4SF_V4SF:
28261 case V4SI_FTYPE_V2DF_V2DF:
28262 case V4HI_FTYPE_V4HI_V4HI:
28263 case V4HI_FTYPE_V8QI_V8QI:
28264 case V4HI_FTYPE_V2SI_V2SI:
28265 case V4DF_FTYPE_V4DF_V4DF:
28266 case V4DF_FTYPE_V4DF_V4DI:
28267 case V4SF_FTYPE_V4SF_V4SF:
28268 case V4SF_FTYPE_V4SF_V4SI:
28269 case V4SF_FTYPE_V4SF_V2SI:
28270 case V4SF_FTYPE_V4SF_V2DF:
28271 case V4SF_FTYPE_V4SF_DI:
28272 case V4SF_FTYPE_V4SF_SI:
28273 case V2DI_FTYPE_V2DI_V2DI:
28274 case V2DI_FTYPE_V16QI_V16QI:
28275 case V2DI_FTYPE_V4SI_V4SI:
28276 case V2DI_FTYPE_V2DI_V16QI:
28277 case V2DI_FTYPE_V2DF_V2DF:
28278 case V2SI_FTYPE_V2SI_V2SI:
28279 case V2SI_FTYPE_V4HI_V4HI:
28280 case V2SI_FTYPE_V2SF_V2SF:
28281 case V2DF_FTYPE_V2DF_V2DF:
28282 case V2DF_FTYPE_V2DF_V4SF:
28283 case V2DF_FTYPE_V2DF_V2DI:
28284 case V2DF_FTYPE_V2DF_DI:
28285 case V2DF_FTYPE_V2DF_SI:
28286 case V2SF_FTYPE_V2SF_V2SF:
28287 case V1DI_FTYPE_V1DI_V1DI:
28288 case V1DI_FTYPE_V8QI_V8QI:
28289 case V1DI_FTYPE_V2SI_V2SI:
28290 case V32QI_FTYPE_V16HI_V16HI:
28291 case V16HI_FTYPE_V8SI_V8SI:
28292 case V32QI_FTYPE_V32QI_V32QI:
28293 case V16HI_FTYPE_V32QI_V32QI:
28294 case V16HI_FTYPE_V16HI_V16HI:
28295 case V8SI_FTYPE_V4DF_V4DF:
28296 case V8SI_FTYPE_V8SI_V8SI:
28297 case V8SI_FTYPE_V16HI_V16HI:
28298 case V4DI_FTYPE_V4DI_V4DI:
28299 case V4DI_FTYPE_V8SI_V8SI:
28300 if (comparison == UNKNOWN)
28301 return ix86_expand_binop_builtin (icode, exp, target);
28302 nargs = 2;
28303 break;
28304 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28305 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28306 gcc_assert (comparison != UNKNOWN);
28307 nargs = 2;
28308 swap = true;
28309 break;
28310 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28311 case V16HI_FTYPE_V16HI_SI_COUNT:
28312 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28313 case V8SI_FTYPE_V8SI_SI_COUNT:
28314 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28315 case V4DI_FTYPE_V4DI_INT_COUNT:
28316 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28317 case V8HI_FTYPE_V8HI_SI_COUNT:
28318 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28319 case V4SI_FTYPE_V4SI_SI_COUNT:
28320 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28321 case V4HI_FTYPE_V4HI_SI_COUNT:
28322 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28323 case V2DI_FTYPE_V2DI_SI_COUNT:
28324 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28325 case V2SI_FTYPE_V2SI_SI_COUNT:
28326 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28327 case V1DI_FTYPE_V1DI_SI_COUNT:
28328 nargs = 2;
28329 last_arg_count = true;
28330 break;
28331 case UINT64_FTYPE_UINT64_UINT64:
28332 case UINT_FTYPE_UINT_UINT:
28333 case UINT_FTYPE_UINT_USHORT:
28334 case UINT_FTYPE_UINT_UCHAR:
28335 case UINT16_FTYPE_UINT16_INT:
28336 case UINT8_FTYPE_UINT8_INT:
28337 nargs = 2;
28338 break;
28339 case V2DI_FTYPE_V2DI_INT_CONVERT:
28340 nargs = 2;
28341 rmode = V1TImode;
28342 nargs_constant = 1;
28343 break;
28344 case V4DI_FTYPE_V4DI_INT_CONVERT:
28345 nargs = 2;
28346 rmode = V2TImode;
28347 nargs_constant = 1;
28348 break;
28349 case V8HI_FTYPE_V8HI_INT:
28350 case V8HI_FTYPE_V8SF_INT:
28351 case V8HI_FTYPE_V4SF_INT:
28352 case V8SF_FTYPE_V8SF_INT:
28353 case V4SI_FTYPE_V4SI_INT:
28354 case V4SI_FTYPE_V8SI_INT:
28355 case V4HI_FTYPE_V4HI_INT:
28356 case V4DF_FTYPE_V4DF_INT:
28357 case V4SF_FTYPE_V4SF_INT:
28358 case V4SF_FTYPE_V8SF_INT:
28359 case V2DI_FTYPE_V2DI_INT:
28360 case V2DF_FTYPE_V2DF_INT:
28361 case V2DF_FTYPE_V4DF_INT:
28362 case V16HI_FTYPE_V16HI_INT:
28363 case V8SI_FTYPE_V8SI_INT:
28364 case V4DI_FTYPE_V4DI_INT:
28365 case V2DI_FTYPE_V4DI_INT:
28366 nargs = 2;
28367 nargs_constant = 1;
28368 break;
28369 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28370 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28371 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28372 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28373 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28374 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28375 nargs = 3;
28376 break;
28377 case V32QI_FTYPE_V32QI_V32QI_INT:
28378 case V16HI_FTYPE_V16HI_V16HI_INT:
28379 case V16QI_FTYPE_V16QI_V16QI_INT:
28380 case V4DI_FTYPE_V4DI_V4DI_INT:
28381 case V8HI_FTYPE_V8HI_V8HI_INT:
28382 case V8SI_FTYPE_V8SI_V8SI_INT:
28383 case V8SI_FTYPE_V8SI_V4SI_INT:
28384 case V8SF_FTYPE_V8SF_V8SF_INT:
28385 case V8SF_FTYPE_V8SF_V4SF_INT:
28386 case V4SI_FTYPE_V4SI_V4SI_INT:
28387 case V4DF_FTYPE_V4DF_V4DF_INT:
28388 case V4DF_FTYPE_V4DF_V2DF_INT:
28389 case V4SF_FTYPE_V4SF_V4SF_INT:
28390 case V2DI_FTYPE_V2DI_V2DI_INT:
28391 case V4DI_FTYPE_V4DI_V2DI_INT:
28392 case V2DF_FTYPE_V2DF_V2DF_INT:
28393 nargs = 3;
28394 nargs_constant = 1;
28395 break;
28396 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28397 nargs = 3;
28398 rmode = V4DImode;
28399 nargs_constant = 1;
28400 break;
28401 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28402 nargs = 3;
28403 rmode = V2DImode;
28404 nargs_constant = 1;
28405 break;
28406 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28407 nargs = 3;
28408 rmode = DImode;
28409 nargs_constant = 1;
28410 break;
28411 case V2DI_FTYPE_V2DI_UINT_UINT:
28412 nargs = 3;
28413 nargs_constant = 2;
28414 break;
28415 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28416 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28417 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28418 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28419 nargs = 4;
28420 nargs_constant = 1;
28421 break;
28422 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28423 nargs = 4;
28424 nargs_constant = 2;
28425 break;
28426 default:
28427 gcc_unreachable ();
28428 }
28429
28430 gcc_assert (nargs <= ARRAY_SIZE (args));
28431
28432 if (comparison != UNKNOWN)
28433 {
28434 gcc_assert (nargs == 2);
28435 return ix86_expand_sse_compare (d, exp, target, swap);
28436 }
28437
28438 if (rmode == VOIDmode || rmode == tmode)
28439 {
28440 if (optimize
28441 || target == 0
28442 || GET_MODE (target) != tmode
28443 || !insn_p->operand[0].predicate (target, tmode))
28444 target = gen_reg_rtx (tmode);
28445 real_target = target;
28446 }
28447 else
28448 {
28449 target = gen_reg_rtx (rmode);
28450 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28451 }
28452
28453 for (i = 0; i < nargs; i++)
28454 {
28455 tree arg = CALL_EXPR_ARG (exp, i);
28456 rtx op = expand_normal (arg);
28457 enum machine_mode mode = insn_p->operand[i + 1].mode;
28458 bool match = insn_p->operand[i + 1].predicate (op, mode);
28459
28460 if (last_arg_count && (i + 1) == nargs)
28461 {
28462 /* SIMD shift insns take either an 8-bit immediate or a
28463 register as the count, but the builtin functions take an int
28464 as the count. If the count doesn't match, we put it in a register. */
28465 if (!match)
28466 {
28467 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28468 if (!insn_p->operand[i + 1].predicate (op, mode))
28469 op = copy_to_reg (op);
28470 }
28471 }
28472 else if ((nargs - i) <= nargs_constant)
28473 {
28474 if (!match)
28475 switch (icode)
28476 {
28477 case CODE_FOR_avx2_inserti128:
28478 case CODE_FOR_avx2_extracti128:
28479 error ("the last argument must be an 1-bit immediate");
28480 return const0_rtx;
28481
28482 case CODE_FOR_sse4_1_roundpd:
28483 case CODE_FOR_sse4_1_roundps:
28484 case CODE_FOR_sse4_1_roundsd:
28485 case CODE_FOR_sse4_1_roundss:
28486 case CODE_FOR_sse4_1_blendps:
28487 case CODE_FOR_avx_blendpd256:
28488 case CODE_FOR_avx_vpermilv4df:
28489 case CODE_FOR_avx_roundpd256:
28490 case CODE_FOR_avx_roundps256:
28491 error ("the last argument must be a 4-bit immediate");
28492 return const0_rtx;
28493
28494 case CODE_FOR_sse4_1_blendpd:
28495 case CODE_FOR_avx_vpermilv2df:
28496 case CODE_FOR_xop_vpermil2v2df3:
28497 case CODE_FOR_xop_vpermil2v4sf3:
28498 case CODE_FOR_xop_vpermil2v4df3:
28499 case CODE_FOR_xop_vpermil2v8sf3:
28500 error ("the last argument must be a 2-bit immediate");
28501 return const0_rtx;
28502
28503 case CODE_FOR_avx_vextractf128v4df:
28504 case CODE_FOR_avx_vextractf128v8sf:
28505 case CODE_FOR_avx_vextractf128v8si:
28506 case CODE_FOR_avx_vinsertf128v4df:
28507 case CODE_FOR_avx_vinsertf128v8sf:
28508 case CODE_FOR_avx_vinsertf128v8si:
28509 error ("the last argument must be a 1-bit immediate");
28510 return const0_rtx;
28511
28512 case CODE_FOR_avx_vmcmpv2df3:
28513 case CODE_FOR_avx_vmcmpv4sf3:
28514 case CODE_FOR_avx_cmpv2df3:
28515 case CODE_FOR_avx_cmpv4sf3:
28516 case CODE_FOR_avx_cmpv4df3:
28517 case CODE_FOR_avx_cmpv8sf3:
28518 error ("the last argument must be a 5-bit immediate");
28519 return const0_rtx;
28520
28521 default:
28522 switch (nargs_constant)
28523 {
28524 case 2:
28525 if ((nargs - i) == nargs_constant)
28526 {
28527 error ("the next to last argument must be an 8-bit immediate");
28528 break;
28529 }
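/* FALLTHRU */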
28530 case 1:
28531 error ("the last argument must be an 8-bit immediate");
28532 break;
28533 default:
28534 gcc_unreachable ();
28535 }
28536 return const0_rtx;
28537 }
28538 }
28539 else
28540 {
28541 if (VECTOR_MODE_P (mode))
28542 op = safe_vector_operand (op, mode);
28543
28544 /* If we aren't optimizing, only allow one memory operand to
28545 be generated. */
28546 if (memory_operand (op, mode))
28547 num_memory++;
28548
28549 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28550 {
28551 if (optimize || !match || num_memory > 1)
28552 op = copy_to_mode_reg (mode, op);
28553 }
28554 else
28555 {
28556 op = copy_to_reg (op);
28557 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28558 }
28559 }
28560
28561 args[i].op = op;
28562 args[i].mode = mode;
28563 }
28564
28565 switch (nargs)
28566 {
28567 case 1:
28568 pat = GEN_FCN (icode) (real_target, args[0].op);
28569 break;
28570 case 2:
28571 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28572 break;
28573 case 3:
28574 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28575 args[2].op);
28576 break;
28577 case 4:
28578 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28579 args[2].op, args[3].op);
28580 break;
28581 default:
28582 gcc_unreachable ();
28583 }
28584
28585 if (! pat)
28586 return 0;
28587
28588 emit_insn (pat);
28589 return target;
28590 }
28591
28592 /* Subroutine of ix86_expand_builtin to take care of special insns
28593 with variable number of operands. */
28594
28595 static rtx
28596 ix86_expand_special_args_builtin (const struct builtin_description *d,
28597 tree exp, rtx target)
28598 {
28599 tree arg;
28600 rtx pat, op;
28601 unsigned int i, nargs, arg_adjust, memory;
28602 struct
28603 {
28604 rtx op;
28605 enum machine_mode mode;
28606 } args[3];
28607 enum insn_code icode = d->icode;
28608 bool last_arg_constant = false;
28609 const struct insn_data_d *insn_p = &insn_data[icode];
28610 enum machine_mode tmode = insn_p->operand[0].mode;
28611 enum { load, store } klass;
28612
28613 switch ((enum ix86_builtin_func_type) d->flag)
28614 {
28615 case VOID_FTYPE_VOID:
28616 if (icode == CODE_FOR_avx_vzeroupper)
28617 target = GEN_INT (vzeroupper_intrinsic);
28618 emit_insn (GEN_FCN (icode) (target));
28619 return 0;
28620 case VOID_FTYPE_UINT64:
28621 case VOID_FTYPE_UNSIGNED:
28622 nargs = 0;
28623 klass = store;
28624 memory = 0;
28625 break;
28626 case UINT64_FTYPE_VOID:
28627 case UNSIGNED_FTYPE_VOID:
28628 nargs = 0;
28629 klass = load;
28630 memory = 0;
28631 break;
28632 case UINT64_FTYPE_PUNSIGNED:
28633 case V2DI_FTYPE_PV2DI:
28634 case V4DI_FTYPE_PV4DI:
28635 case V32QI_FTYPE_PCCHAR:
28636 case V16QI_FTYPE_PCCHAR:
28637 case V8SF_FTYPE_PCV4SF:
28638 case V8SF_FTYPE_PCFLOAT:
28639 case V4SF_FTYPE_PCFLOAT:
28640 case V4DF_FTYPE_PCV2DF:
28641 case V4DF_FTYPE_PCDOUBLE:
28642 case V2DF_FTYPE_PCDOUBLE:
28643 case VOID_FTYPE_PVOID:
28644 nargs = 1;
28645 klass = load;
28646 memory = 0;
28647 break;
28648 case VOID_FTYPE_PV2SF_V4SF:
28649 case VOID_FTYPE_PV4DI_V4DI:
28650 case VOID_FTYPE_PV2DI_V2DI:
28651 case VOID_FTYPE_PCHAR_V32QI:
28652 case VOID_FTYPE_PCHAR_V16QI:
28653 case VOID_FTYPE_PFLOAT_V8SF:
28654 case VOID_FTYPE_PFLOAT_V4SF:
28655 case VOID_FTYPE_PDOUBLE_V4DF:
28656 case VOID_FTYPE_PDOUBLE_V2DF:
28657 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28658 case VOID_FTYPE_PINT_INT:
28659 nargs = 1;
28660 klass = store;
28661 /* Reserve memory operand for target. */
28662 memory = ARRAY_SIZE (args);
28663 break;
28664 case V4SF_FTYPE_V4SF_PCV2SF:
28665 case V2DF_FTYPE_V2DF_PCDOUBLE:
28666 nargs = 2;
28667 klass = load;
28668 memory = 1;
28669 break;
28670 case V8SF_FTYPE_PCV8SF_V8SI:
28671 case V4DF_FTYPE_PCV4DF_V4DI:
28672 case V4SF_FTYPE_PCV4SF_V4SI:
28673 case V2DF_FTYPE_PCV2DF_V2DI:
28674 case V8SI_FTYPE_PCV8SI_V8SI:
28675 case V4DI_FTYPE_PCV4DI_V4DI:
28676 case V4SI_FTYPE_PCV4SI_V4SI:
28677 case V2DI_FTYPE_PCV2DI_V2DI:
28678 nargs = 2;
28679 klass = load;
28680 memory = 0;
28681 break;
28682 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28683 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28684 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28685 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28686 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28687 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28688 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28689 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28690 nargs = 2;
28691 klass = store;
28692 /* Reserve memory operand for target. */
28693 memory = ARRAY_SIZE (args);
28694 break;
28695 case VOID_FTYPE_UINT_UINT_UINT:
28696 case VOID_FTYPE_UINT64_UINT_UINT:
28697 case UCHAR_FTYPE_UINT_UINT_UINT:
28698 case UCHAR_FTYPE_UINT64_UINT_UINT:
28699 nargs = 3;
28700 klass = load;
28701 memory = ARRAY_SIZE (args);
28702 last_arg_constant = true;
28703 break;
28704 default:
28705 gcc_unreachable ();
28706 }
28707
28708 gcc_assert (nargs <= ARRAY_SIZE (args));
28709
28710 if (klass == store)
28711 {
28712 arg = CALL_EXPR_ARG (exp, 0);
28713 op = expand_normal (arg);
28714 gcc_assert (target == 0);
28715 if (memory)
28716 {
28717 if (GET_MODE (op) != Pmode)
28718 op = convert_to_mode (Pmode, op, 1);
28719 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28720 }
28721 else
28722 target = force_reg (tmode, op);
28723 arg_adjust = 1;
28724 }
28725 else
28726 {
28727 arg_adjust = 0;
28728 if (optimize
28729 || target == 0
28730 || GET_MODE (target) != tmode
28731 || !insn_p->operand[0].predicate (target, tmode))
28732 target = gen_reg_rtx (tmode);
28733 }
28734
28735 for (i = 0; i < nargs; i++)
28736 {
28737 enum machine_mode mode = insn_p->operand[i + 1].mode;
28738 bool match;
28739
28740 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28741 op = expand_normal (arg);
28742 match = insn_p->operand[i + 1].predicate (op, mode);
28743
28744 if (last_arg_constant && (i + 1) == nargs)
28745 {
28746 if (!match)
28747 {
28748 if (icode == CODE_FOR_lwp_lwpvalsi3
28749 || icode == CODE_FOR_lwp_lwpinssi3
28750 || icode == CODE_FOR_lwp_lwpvaldi3
28751 || icode == CODE_FOR_lwp_lwpinsdi3)
28752 error ("the last argument must be a 32-bit immediate");
28753 else
28754 error ("the last argument must be an 8-bit immediate");
28755 return const0_rtx;
28756 }
28757 }
28758 else
28759 {
28760 if (i == memory)
28761 {
28762 /* This must be the memory operand. */
28763 if (GET_MODE (op) != Pmode)
28764 op = convert_to_mode (Pmode, op, 1);
28765 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28766 gcc_assert (GET_MODE (op) == mode
28767 || GET_MODE (op) == VOIDmode);
28768 }
28769 else
28770 {
28771 /* This must be a register. */
28772 if (VECTOR_MODE_P (mode))
28773 op = safe_vector_operand (op, mode);
28774
28775 gcc_assert (GET_MODE (op) == mode
28776 || GET_MODE (op) == VOIDmode);
28777 op = copy_to_mode_reg (mode, op);
28778 }
28779 }
28780
28781 args[i].op = op;
28782 args[i].mode = mode;
28783 }
28784
28785 switch (nargs)
28786 {
28787 case 0:
28788 pat = GEN_FCN (icode) (target);
28789 break;
28790 case 1:
28791 pat = GEN_FCN (icode) (target, args[0].op);
28792 break;
28793 case 2:
28794 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28795 break;
28796 case 3:
28797 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28798 break;
28799 default:
28800 gcc_unreachable ();
28801 }
28802
28803 if (! pat)
28804 return 0;
28805 emit_insn (pat);
28806 return klass == store ? 0 : target;
28807 }
28808
28809 /* Return the integer constant in ARG. Constrain it to be in the range
28810 of the subparts of VEC_TYPE; issue an error if not. */
28811
28812 static int
28813 get_element_number (tree vec_type, tree arg)
28814 {
28815 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
28816
28817 if (!host_integerp (arg, 1)
28818 || (elt = tree_low_cst (arg, 1), elt > max))
28819 {
28820 error ("selector must be an integer constant in the range 0..%wi", max);
28821 return 0;
28822 }
28823
28824 return elt;
28825 }
28826
28827 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28828 ix86_expand_vector_init. We DO have language-level syntax for this, in
28829 the form of (type){ init-list }. Except that since we can't place emms
28830 instructions from inside the compiler, we can't allow the use of MMX
28831 registers unless the user explicitly asks for it. So we do *not* define
28832 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
28833 we have builtins invoked by mmintrin.h that give us license to emit
28834 these sorts of instructions. */
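/* E.g. (illustrative) _mm_set_pi32 in mmintrin.h is implemented with
   __builtin_ia32_vec_init_v2si, which arrives here and is turned into
   an ix86_expand_vector_init sequence.  */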
28835
28836 static rtx
28837 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
28838 {
28839 enum machine_mode tmode = TYPE_MODE (type);
28840 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
28841 int i, n_elt = GET_MODE_NUNITS (tmode);
28842 rtvec v = rtvec_alloc (n_elt);
28843
28844 gcc_assert (VECTOR_MODE_P (tmode));
28845 gcc_assert (call_expr_nargs (exp) == n_elt);
28846
28847 for (i = 0; i < n_elt; ++i)
28848 {
28849 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
28850 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
28851 }
28852
28853 if (!target || !register_operand (target, tmode))
28854 target = gen_reg_rtx (tmode);
28855
28856 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
28857 return target;
28858 }
28859
28860 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28861 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
28862 had a language-level syntax for referencing vector elements. */
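/* E.g. (illustrative) _mm_extract_epi16 is implemented with
   __builtin_ia32_vec_ext_v8hi and ends up here as a single
   ix86_expand_vector_extract call.  */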
28863
28864 static rtx
28865 ix86_expand_vec_ext_builtin (tree exp, rtx target)
28866 {
28867 enum machine_mode tmode, mode0;
28868 tree arg0, arg1;
28869 int elt;
28870 rtx op0;
28871
28872 arg0 = CALL_EXPR_ARG (exp, 0);
28873 arg1 = CALL_EXPR_ARG (exp, 1);
28874
28875 op0 = expand_normal (arg0);
28876 elt = get_element_number (TREE_TYPE (arg0), arg1);
28877
28878 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28879 mode0 = TYPE_MODE (TREE_TYPE (arg0));
28880 gcc_assert (VECTOR_MODE_P (mode0));
28881
28882 op0 = force_reg (mode0, op0);
28883
28884 if (optimize || !target || !register_operand (target, tmode))
28885 target = gen_reg_rtx (tmode);
28886
28887 ix86_expand_vector_extract (true, target, op0, elt);
28888
28889 return target;
28890 }
28891
28892 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28893 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
28894 a language-level syntax for referencing vector elements. */
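/* E.g. (illustrative) _mm_insert_epi16 is implemented with
   __builtin_ia32_vec_set_v8hi; the copy below keeps the source
   operand unmodified, matching the intrinsic's value semantics.  */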
28895
28896 static rtx
28897 ix86_expand_vec_set_builtin (tree exp)
28898 {
28899 enum machine_mode tmode, mode1;
28900 tree arg0, arg1, arg2;
28901 int elt;
28902 rtx op0, op1, target;
28903
28904 arg0 = CALL_EXPR_ARG (exp, 0);
28905 arg1 = CALL_EXPR_ARG (exp, 1);
28906 arg2 = CALL_EXPR_ARG (exp, 2);
28907
28908 tmode = TYPE_MODE (TREE_TYPE (arg0));
28909 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
28910 gcc_assert (VECTOR_MODE_P (tmode));
28911
28912 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
28913 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
28914 elt = get_element_number (TREE_TYPE (arg0), arg2);
28915
28916 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
28917 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
28918
28919 op0 = force_reg (tmode, op0);
28920 op1 = force_reg (mode1, op1);
28921
28922 /* OP0 is the source of these builtin functions and shouldn't be
28923 modified. Create a copy, use it and return it as target. */
28924 target = gen_reg_rtx (tmode);
28925 emit_move_insn (target, op0);
28926 ix86_expand_vector_set (true, target, op1, elt);
28927
28928 return target;
28929 }
28930
28931 /* Expand an expression EXP that calls a built-in function,
28932 with result going to TARGET if that's convenient
28933 (and in mode MODE if that's convenient).
28934 SUBTARGET may be used as the target for computing one of EXP's operands.
28935 IGNORE is nonzero if the value is to be ignored. */
28936
28937 static rtx
28938 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
28939 enum machine_mode mode ATTRIBUTE_UNUSED,
28940 int ignore ATTRIBUTE_UNUSED)
28941 {
28942 const struct builtin_description *d;
28943 size_t i;
28944 enum insn_code icode;
28945 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
28946 tree arg0, arg1, arg2, arg3, arg4;
28947 rtx op0, op1, op2, op3, op4, pat;
28948 enum machine_mode mode0, mode1, mode2, mode3, mode4;
28949 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
28950
28951 /* Determine whether the builtin function is available under the current ISA.
28952 Originally the builtin was not created if it wasn't applicable to the
28953 current ISA based on the command line switches. With function specific
28954 options, we need to check in the context of the function making the call
28955 whether it is supported. */
28956 if (ix86_builtins_isa[fcode].isa
28957 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
28958 {
28959 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
28960 NULL, (enum fpmath_unit) 0, false);
28961
28962 if (!opts)
28963 error ("%qE needs unknown isa option", fndecl);
28964 else
28965 {
28966 gcc_assert (opts != NULL);
28967 error ("%qE needs isa option %s", fndecl, opts);
28968 free (opts);
28969 }
28970 return const0_rtx;
28971 }
28972
28973 switch (fcode)
28974 {
28975 case IX86_BUILTIN_MASKMOVQ:
28976 case IX86_BUILTIN_MASKMOVDQU:
28977 icode = (fcode == IX86_BUILTIN_MASKMOVQ
28978 ? CODE_FOR_mmx_maskmovq
28979 : CODE_FOR_sse2_maskmovdqu);
28980 /* Note the arg order is different from the operand order. */
28981 arg1 = CALL_EXPR_ARG (exp, 0);
28982 arg2 = CALL_EXPR_ARG (exp, 1);
28983 arg0 = CALL_EXPR_ARG (exp, 2);
28984 op0 = expand_normal (arg0);
28985 op1 = expand_normal (arg1);
28986 op2 = expand_normal (arg2);
28987 mode0 = insn_data[icode].operand[0].mode;
28988 mode1 = insn_data[icode].operand[1].mode;
28989 mode2 = insn_data[icode].operand[2].mode;
28990
28991 if (GET_MODE (op0) != Pmode)
28992 op0 = convert_to_mode (Pmode, op0, 1);
28993 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
28994
28995 if (!insn_data[icode].operand[0].predicate (op0, mode0))
28996 op0 = copy_to_mode_reg (mode0, op0);
28997 if (!insn_data[icode].operand[1].predicate (op1, mode1))
28998 op1 = copy_to_mode_reg (mode1, op1);
28999 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29000 op2 = copy_to_mode_reg (mode2, op2);
29001 pat = GEN_FCN (icode) (op0, op1, op2);
29002 if (! pat)
29003 return 0;
29004 emit_insn (pat);
29005 return 0;
29006
29007 case IX86_BUILTIN_LDMXCSR:
29008 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29009 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29010 emit_move_insn (target, op0);
29011 emit_insn (gen_sse_ldmxcsr (target));
29012 return 0;
29013
29014 case IX86_BUILTIN_STMXCSR:
29015 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29016 emit_insn (gen_sse_stmxcsr (target));
29017 return copy_to_mode_reg (SImode, target);
29018
29019 case IX86_BUILTIN_CLFLUSH:
29020 arg0 = CALL_EXPR_ARG (exp, 0);
29021 op0 = expand_normal (arg0);
29022 icode = CODE_FOR_sse2_clflush;
29023 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29024 {
29025 if (GET_MODE (op0) != Pmode)
29026 op0 = convert_to_mode (Pmode, op0, 1);
29027 op0 = force_reg (Pmode, op0);
29028 }
29029
29030 emit_insn (gen_sse2_clflush (op0));
29031 return 0;
29032
29033 case IX86_BUILTIN_MONITOR:
29034 arg0 = CALL_EXPR_ARG (exp, 0);
29035 arg1 = CALL_EXPR_ARG (exp, 1);
29036 arg2 = CALL_EXPR_ARG (exp, 2);
29037 op0 = expand_normal (arg0);
29038 op1 = expand_normal (arg1);
29039 op2 = expand_normal (arg2);
29040 if (!REG_P (op0))
29041 {
29042 if (GET_MODE (op0) != Pmode)
29043 op0 = convert_to_mode (Pmode, op0, 1);
29044 op0 = force_reg (Pmode, op0);
29045 }
29046 if (!REG_P (op1))
29047 op1 = copy_to_mode_reg (SImode, op1);
29048 if (!REG_P (op2))
29049 op2 = copy_to_mode_reg (SImode, op2);
29050 emit_insn (ix86_gen_monitor (op0, op1, op2));
29051 return 0;
29052
29053 case IX86_BUILTIN_MWAIT:
29054 arg0 = CALL_EXPR_ARG (exp, 0);
29055 arg1 = CALL_EXPR_ARG (exp, 1);
29056 op0 = expand_normal (arg0);
29057 op1 = expand_normal (arg1);
29058 if (!REG_P (op0))
29059 op0 = copy_to_mode_reg (SImode, op0);
29060 if (!REG_P (op1))
29061 op1 = copy_to_mode_reg (SImode, op1);
29062 emit_insn (gen_sse3_mwait (op0, op1));
29063 return 0;
29064
29065 case IX86_BUILTIN_VEC_INIT_V2SI:
29066 case IX86_BUILTIN_VEC_INIT_V4HI:
29067 case IX86_BUILTIN_VEC_INIT_V8QI:
29068 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29069
29070 case IX86_BUILTIN_VEC_EXT_V2DF:
29071 case IX86_BUILTIN_VEC_EXT_V2DI:
29072 case IX86_BUILTIN_VEC_EXT_V4SF:
29073 case IX86_BUILTIN_VEC_EXT_V4SI:
29074 case IX86_BUILTIN_VEC_EXT_V8HI:
29075 case IX86_BUILTIN_VEC_EXT_V2SI:
29076 case IX86_BUILTIN_VEC_EXT_V4HI:
29077 case IX86_BUILTIN_VEC_EXT_V16QI:
29078 return ix86_expand_vec_ext_builtin (exp, target);
29079
29080 case IX86_BUILTIN_VEC_SET_V2DI:
29081 case IX86_BUILTIN_VEC_SET_V4SF:
29082 case IX86_BUILTIN_VEC_SET_V4SI:
29083 case IX86_BUILTIN_VEC_SET_V8HI:
29084 case IX86_BUILTIN_VEC_SET_V4HI:
29085 case IX86_BUILTIN_VEC_SET_V16QI:
29086 return ix86_expand_vec_set_builtin (exp);
29087
29088 case IX86_BUILTIN_INFQ:
29089 case IX86_BUILTIN_HUGE_VALQ:
29090 {
29091 REAL_VALUE_TYPE inf;
29092 rtx tmp;
29093
29094 real_inf (&inf);
29095 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29096
29097 tmp = validize_mem (force_const_mem (mode, tmp));
29098
29099 if (target == 0)
29100 target = gen_reg_rtx (mode);
29101
29102 emit_move_insn (target, tmp);
29103 return target;
29104 }
29105
29106 case IX86_BUILTIN_LLWPCB:
29107 arg0 = CALL_EXPR_ARG (exp, 0);
29108 op0 = expand_normal (arg0);
29109 icode = CODE_FOR_lwp_llwpcb;
29110 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29111 {
29112 if (GET_MODE (op0) != Pmode)
29113 op0 = convert_to_mode (Pmode, op0, 1);
29114 op0 = force_reg (Pmode, op0);
29115 }
29116 emit_insn (gen_lwp_llwpcb (op0));
29117 return 0;
29118
29119 case IX86_BUILTIN_SLWPCB:
29120 icode = CODE_FOR_lwp_slwpcb;
29121 if (!target
29122 || !insn_data[icode].operand[0].predicate (target, Pmode))
29123 target = gen_reg_rtx (Pmode);
29124 emit_insn (gen_lwp_slwpcb (target));
29125 return target;
29126
29127 case IX86_BUILTIN_BEXTRI32:
29128 case IX86_BUILTIN_BEXTRI64:
29129 arg0 = CALL_EXPR_ARG (exp, 0);
29130 arg1 = CALL_EXPR_ARG (exp, 1);
29131 op0 = expand_normal (arg0);
29132 op1 = expand_normal (arg1);
29133 icode = (fcode == IX86_BUILTIN_BEXTRI32
29134 ? CODE_FOR_tbm_bextri_si
29135 : CODE_FOR_tbm_bextri_di);
29136 if (!CONST_INT_P (op1))
29137 {
29138 error ("last argument must be an immediate");
29139 return const0_rtx;
29140 }
29141 else
29142 {
29143 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29144 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29145 op1 = GEN_INT (length);
29146 op2 = GEN_INT (lsb_index);
29147 pat = GEN_FCN (icode) (target, op0, op1, op2);
29148 if (pat)
29149 emit_insn (pat);
29150 return target;
29151 }
29152
29153 case IX86_BUILTIN_RDRAND16_STEP:
29154 icode = CODE_FOR_rdrandhi_1;
29155 mode0 = HImode;
29156 goto rdrand_step;
29157
29158 case IX86_BUILTIN_RDRAND32_STEP:
29159 icode = CODE_FOR_rdrandsi_1;
29160 mode0 = SImode;
29161 goto rdrand_step;
29162
29163 case IX86_BUILTIN_RDRAND64_STEP:
29164 icode = CODE_FOR_rdranddi_1;
29165 mode0 = DImode;
29166
29167 rdrand_step:
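/* The *_step builtins store the random value through their pointer
   argument and return a 0/1 success indication: emit the rdrand,
   store its result to *arg0, then derive the return value from the
   carry flag with a conditional move.  */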
29168 op0 = gen_reg_rtx (mode0);
29169 emit_insn (GEN_FCN (icode) (op0));
29170
29171 arg0 = CALL_EXPR_ARG (exp, 0);
29172 op1 = expand_normal (arg0);
29173 if (!address_operand (op1, VOIDmode))
29174 {
29175 op1 = convert_memory_address (Pmode, op1);
29176 op1 = copy_addr_to_reg (op1);
29177 }
29178 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29179
29180 op1 = gen_reg_rtx (SImode);
29181 emit_move_insn (op1, CONST1_RTX (SImode));
29182
29183 /* Emit SImode conditional move. */
29184 if (mode0 == HImode)
29185 {
29186 op2 = gen_reg_rtx (SImode);
29187 emit_insn (gen_zero_extendhisi2 (op2, op0));
29188 }
29189 else if (mode0 == SImode)
29190 op2 = op0;
29191 else
29192 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29193
29194 if (target == 0)
29195 target = gen_reg_rtx (SImode);
29196
29197 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29198 const0_rtx);
29199 emit_insn (gen_rtx_SET (VOIDmode, target,
29200 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29201 return target;
29202
29203 case IX86_BUILTIN_GATHERSIV2DF:
29204 icode = CODE_FOR_avx2_gathersiv2df;
29205 goto gather_gen;
29206 case IX86_BUILTIN_GATHERSIV4DF:
29207 icode = CODE_FOR_avx2_gathersiv4df;
29208 goto gather_gen;
29209 case IX86_BUILTIN_GATHERDIV2DF:
29210 icode = CODE_FOR_avx2_gatherdiv2df;
29211 goto gather_gen;
29212 case IX86_BUILTIN_GATHERDIV4DF:
29213 icode = CODE_FOR_avx2_gatherdiv4df;
29214 goto gather_gen;
29215 case IX86_BUILTIN_GATHERSIV4SF:
29216 icode = CODE_FOR_avx2_gathersiv4sf;
29217 goto gather_gen;
29218 case IX86_BUILTIN_GATHERSIV8SF:
29219 icode = CODE_FOR_avx2_gathersiv8sf;
29220 goto gather_gen;
29221 case IX86_BUILTIN_GATHERDIV4SF:
29222 icode = CODE_FOR_avx2_gatherdiv4sf;
29223 goto gather_gen;
29224 case IX86_BUILTIN_GATHERDIV8SF:
29225 icode = CODE_FOR_avx2_gatherdiv8sf;
29226 goto gather_gen;
29227 case IX86_BUILTIN_GATHERSIV2DI:
29228 icode = CODE_FOR_avx2_gathersiv2di;
29229 goto gather_gen;
29230 case IX86_BUILTIN_GATHERSIV4DI:
29231 icode = CODE_FOR_avx2_gathersiv4di;
29232 goto gather_gen;
29233 case IX86_BUILTIN_GATHERDIV2DI:
29234 icode = CODE_FOR_avx2_gatherdiv2di;
29235 goto gather_gen;
29236 case IX86_BUILTIN_GATHERDIV4DI:
29237 icode = CODE_FOR_avx2_gatherdiv4di;
29238 goto gather_gen;
29239 case IX86_BUILTIN_GATHERSIV4SI:
29240 icode = CODE_FOR_avx2_gathersiv4si;
29241 goto gather_gen;
29242 case IX86_BUILTIN_GATHERSIV8SI:
29243 icode = CODE_FOR_avx2_gathersiv8si;
29244 goto gather_gen;
29245 case IX86_BUILTIN_GATHERDIV4SI:
29246 icode = CODE_FOR_avx2_gatherdiv4si;
29247 goto gather_gen;
29248 case IX86_BUILTIN_GATHERDIV8SI:
29249 icode = CODE_FOR_avx2_gatherdiv8si;
29250 goto gather_gen;
29251 case IX86_BUILTIN_GATHERALTSIV4DF:
29252 icode = CODE_FOR_avx2_gathersiv4df;
29253 goto gather_gen;
29254 case IX86_BUILTIN_GATHERALTDIV8SF:
29255 icode = CODE_FOR_avx2_gatherdiv8sf;
29256 goto gather_gen;
29257 case IX86_BUILTIN_GATHERALTSIV4DI:
29258 icode = CODE_FOR_avx2_gathersiv4di;
29259 goto gather_gen;
29260 case IX86_BUILTIN_GATHERALTDIV8SI:
29261 icode = CODE_FOR_avx2_gatherdiv8si;
29262 goto gather_gen;
29263
29264 gather_gen:
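/* All gather builtins take five arguments: source/merge vector,
   base pointer, index vector, mask vector and scale (cf. the
   _mm_mask_i32gather_pd family of intrinsics).  */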
29265 arg0 = CALL_EXPR_ARG (exp, 0);
29266 arg1 = CALL_EXPR_ARG (exp, 1);
29267 arg2 = CALL_EXPR_ARG (exp, 2);
29268 arg3 = CALL_EXPR_ARG (exp, 3);
29269 arg4 = CALL_EXPR_ARG (exp, 4);
29270 op0 = expand_normal (arg0);
29271 op1 = expand_normal (arg1);
29272 op2 = expand_normal (arg2);
29273 op3 = expand_normal (arg3);
29274 op4 = expand_normal (arg4);
29275 /* Note the arg order is different from the operand order. */
29276 mode0 = insn_data[icode].operand[1].mode;
29277 mode2 = insn_data[icode].operand[3].mode;
29278 mode3 = insn_data[icode].operand[4].mode;
29279 mode4 = insn_data[icode].operand[5].mode;
29280
29281 if (target == NULL_RTX
29282 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29283 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29284 else
29285 subtarget = target;
29286
29287 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29288 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29289 {
29290 rtx half = gen_reg_rtx (V4SImode);
29291 if (!nonimmediate_operand (op2, V8SImode))
29292 op2 = copy_to_mode_reg (V8SImode, op2);
29293 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29294 op2 = half;
29295 }
29296 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29297 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29298 {
29299 rtx (*gen) (rtx, rtx);
29300 rtx half = gen_reg_rtx (mode0);
29301 if (mode0 == V4SFmode)
29302 gen = gen_vec_extract_lo_v8sf;
29303 else
29304 gen = gen_vec_extract_lo_v8si;
29305 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29306 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29307 emit_insn (gen (half, op0));
29308 op0 = half;
29309 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29310 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29311 emit_insn (gen (half, op3));
29312 op3 = half;
29313 }
29314
29315 /* Force the memory operand to use only a base register here. We
29316 don't want to do this to memory operands of other builtin
29317 functions. */
29318 if (GET_MODE (op1) != Pmode)
29319 op1 = convert_to_mode (Pmode, op1, 1);
29320 op1 = force_reg (Pmode, op1);
29321
29322 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29323 op0 = copy_to_mode_reg (mode0, op0);
29324 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29325 op1 = copy_to_mode_reg (Pmode, op1);
29326 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29327 op2 = copy_to_mode_reg (mode2, op2);
29328 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29329 op3 = copy_to_mode_reg (mode3, op3);
29330 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29331 {
29332 error ("last argument must be scale 1, 2, 4, 8");
29333 return const0_rtx;
29334 }
29335
29336 /* Optimize. If mask is known to have all high bits set,
29337 replace op0 with pc_rtx to signal that the instruction
29338 overwrites the whole destination and doesn't use its
29339 previous contents. */
29340 if (optimize)
29341 {
29342 if (TREE_CODE (arg3) == VECTOR_CST)
29343 {
29344 tree elt;
29345 unsigned int negative = 0;
29346 for (elt = TREE_VECTOR_CST_ELTS (arg3);
29347 elt; elt = TREE_CHAIN (elt))
29348 {
29349 tree cst = TREE_VALUE (elt);
29350 if (TREE_CODE (cst) == INTEGER_CST
29351 && tree_int_cst_sign_bit (cst))
29352 negative++;
29353 else if (TREE_CODE (cst) == REAL_CST
29354 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29355 negative++;
29356 }
29357 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29358 op0 = pc_rtx;
29359 }
29360 else if (TREE_CODE (arg3) == SSA_NAME)
29361 {
29362 /* Recognize also when mask is like:
29363 __v2df src = _mm_setzero_pd ();
29364 __v2df mask = _mm_cmpeq_pd (src, src);
29365 or
29366 __v8sf src = _mm256_setzero_ps ();
29367 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29368 as that is a cheaper way to load all ones into
29369 a register than having to load a constant from
29370 memory. */
29371 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29372 if (is_gimple_call (def_stmt))
29373 {
29374 tree fndecl = gimple_call_fndecl (def_stmt);
29375 if (fndecl
29376 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29377 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29378 {
29379 case IX86_BUILTIN_CMPPD:
29380 case IX86_BUILTIN_CMPPS:
29381 case IX86_BUILTIN_CMPPD256:
29382 case IX86_BUILTIN_CMPPS256:
29383 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29384 break;
29385 /* FALLTHRU */
29386 case IX86_BUILTIN_CMPEQPD:
29387 case IX86_BUILTIN_CMPEQPS:
29388 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29389 && initializer_zerop (gimple_call_arg (def_stmt,
29390 1)))
29391 op0 = pc_rtx;
29392 break;
29393 default:
29394 break;
29395 }
29396 }
29397 }
29398 }
29399
29400 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29401 if (! pat)
29402 return const0_rtx;
29403 emit_insn (pat);
29404
29405 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29406 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29407 {
29408 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29409 ? V4SFmode : V4SImode;
29410 if (target == NULL_RTX)
29411 target = gen_reg_rtx (tmode);
29412 if (tmode == V4SFmode)
29413 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29414 else
29415 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29416 }
29417 else
29418 target = subtarget;
29419
29420 return target;
29421
29422 default:
29423 break;
29424 }
29425
29426 for (i = 0, d = bdesc_special_args;
29427 i < ARRAY_SIZE (bdesc_special_args);
29428 i++, d++)
29429 if (d->code == fcode)
29430 return ix86_expand_special_args_builtin (d, exp, target);
29431
29432 for (i = 0, d = bdesc_args;
29433 i < ARRAY_SIZE (bdesc_args);
29434 i++, d++)
29435 if (d->code == fcode)
29436 switch (fcode)
29437 {
29438 case IX86_BUILTIN_FABSQ:
29439 case IX86_BUILTIN_COPYSIGNQ:
29440 if (!TARGET_SSE2)
29441 /* Emit a normal call if SSE2 isn't available. */
29442 return expand_call (exp, target, ignore);
29443 default:
29444 return ix86_expand_args_builtin (d, exp, target);
29445 }
29446
29447 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29448 if (d->code == fcode)
29449 return ix86_expand_sse_comi (d, exp, target);
29450
29451 for (i = 0, d = bdesc_pcmpestr;
29452 i < ARRAY_SIZE (bdesc_pcmpestr);
29453 i++, d++)
29454 if (d->code == fcode)
29455 return ix86_expand_sse_pcmpestr (d, exp, target);
29456
29457 for (i = 0, d = bdesc_pcmpistr;
29458 i < ARRAY_SIZE (bdesc_pcmpistr);
29459 i++, d++)
29460 if (d->code == fcode)
29461 return ix86_expand_sse_pcmpistr (d, exp, target);
29462
29463 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29464 if (d->code == fcode)
29465 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29466 (enum ix86_builtin_func_type)
29467 d->flag, d->comparison);
29468
29469 gcc_unreachable ();
29470 }
29471
29472 /* Returns a function decl for a vectorized version of the builtin function
29473 FNDECL with result vector type TYPE_OUT and argument vector type TYPE_IN,
29474 or NULL_TREE if it is not available. */
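/* For example, when a loop such as "a[i] = sqrt (b[i])" over doubles is
   vectorized two elements at a time, the vectorizer queries this hook with
   the decl of BUILT_IN_SQRT and TYPE_OUT = TYPE_IN = a 2-element double
   vector type, and receives the decl of IX86_BUILTIN_SQRTPD below.  */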
29475
29476 static tree
29477 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29478 tree type_in)
29479 {
29480 enum machine_mode in_mode, out_mode;
29481 int in_n, out_n;
29482 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29483
29484 if (TREE_CODE (type_out) != VECTOR_TYPE
29485 || TREE_CODE (type_in) != VECTOR_TYPE
29486 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29487 return NULL_TREE;
29488
29489 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29490 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29491 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29492 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29493
29494 switch (fn)
29495 {
29496 case BUILT_IN_SQRT:
29497 if (out_mode == DFmode && in_mode == DFmode)
29498 {
29499 if (out_n == 2 && in_n == 2)
29500 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29501 else if (out_n == 4 && in_n == 4)
29502 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29503 }
29504 break;
29505
29506 case BUILT_IN_SQRTF:
29507 if (out_mode == SFmode && in_mode == SFmode)
29508 {
29509 if (out_n == 4 && in_n == 4)
29510 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29511 else if (out_n == 8 && in_n == 8)
29512 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29513 }
29514 break;
29515
29516 case BUILT_IN_IRINT:
29517 case BUILT_IN_LRINT:
29518 case BUILT_IN_LLRINT:
29519 if (out_mode == SImode && in_mode == DFmode)
29520 {
29521 if (out_n == 4 && in_n == 2)
29522 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29523 else if (out_n == 8 && in_n == 4)
29524 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29525 }
29526 break;
29527
29528 case BUILT_IN_IRINTF:
29529 case BUILT_IN_LRINTF:
29530 case BUILT_IN_LLRINTF:
29531 if (out_mode == SImode && in_mode == SFmode)
29532 {
29533 if (out_n == 4 && in_n == 4)
29534 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29535 else if (out_n == 8 && in_n == 8)
29536 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29537 }
29538 break;
29539
29540 case BUILT_IN_COPYSIGN:
29541 if (out_mode == DFmode && in_mode == DFmode)
29542 {
29543 if (out_n == 2 && in_n == 2)
29544 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29545 else if (out_n == 4 && in_n == 4)
29546 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29547 }
29548 break;
29549
29550 case BUILT_IN_COPYSIGNF:
29551 if (out_mode == SFmode && in_mode == SFmode)
29552 {
29553 if (out_n == 4 && in_n == 4)
29554 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29555 else if (out_n == 8 && in_n == 8)
29556 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29557 }
29558 break;
29559
29560 case BUILT_IN_FLOOR:
29561 /* The round insn does not trap on denormals. */
29562 if (flag_trapping_math || !TARGET_ROUND)
29563 break;
29564
29565 if (out_mode == DFmode && in_mode == DFmode)
29566 {
29567 if (out_n == 2 && in_n == 2)
29568 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29569 else if (out_n == 4 && in_n == 4)
29570 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29571 }
29572 break;
29573
29574 case BUILT_IN_FLOORF:
29575 /* The round insn does not trap on denormals. */
29576 if (flag_trapping_math || !TARGET_ROUND)
29577 break;
29578
29579 if (out_mode == SFmode && in_mode == SFmode)
29580 {
29581 if (out_n == 4 && in_n == 4)
29582 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29583 else if (out_n == 8 && in_n == 8)
29584 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29585 }
29586 break;
29587
29588 case BUILT_IN_CEIL:
29589 /* The round insn does not trap on denormals. */
29590 if (flag_trapping_math || !TARGET_ROUND)
29591 break;
29592
29593 if (out_mode == DFmode && in_mode == DFmode)
29594 {
29595 if (out_n == 2 && in_n == 2)
29596 return ix86_builtins[IX86_BUILTIN_CEILPD];
29597 else if (out_n == 4 && in_n == 4)
29598 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29599 }
29600 break;
29601
29602 case BUILT_IN_CEILF:
29603 /* The round insn does not trap on denormals. */
29604 if (flag_trapping_math || !TARGET_ROUND)
29605 break;
29606
29607 if (out_mode == SFmode && in_mode == SFmode)
29608 {
29609 if (out_n == 4 && in_n == 4)
29610 return ix86_builtins[IX86_BUILTIN_CEILPS];
29611 else if (out_n == 8 && in_n == 8)
29612 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29613 }
29614 break;
29615
29616 case BUILT_IN_TRUNC:
29617 /* The round insn does not trap on denormals. */
29618 if (flag_trapping_math || !TARGET_ROUND)
29619 break;
29620
29621 if (out_mode == DFmode && in_mode == DFmode)
29622 {
29623 if (out_n == 2 && in_n == 2)
29624 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29625 else if (out_n == 4 && in_n == 4)
29626 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29627 }
29628 break;
29629
29630 case BUILT_IN_TRUNCF:
29631 /* The round insn does not trap on denormals. */
29632 if (flag_trapping_math || !TARGET_ROUND)
29633 break;
29634
29635 if (out_mode == SFmode && in_mode == SFmode)
29636 {
29637 if (out_n == 4 && in_n == 4)
29638 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29639 else if (out_n == 8 && in_n == 8)
29640 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29641 }
29642 break;
29643
29644 case BUILT_IN_RINT:
29645 /* The round insn does not trap on denormals. */
29646 if (flag_trapping_math || !TARGET_ROUND)
29647 break;
29648
29649 if (out_mode == DFmode && in_mode == DFmode)
29650 {
29651 if (out_n == 2 && in_n == 2)
29652 return ix86_builtins[IX86_BUILTIN_RINTPD];
29653 else if (out_n == 4 && in_n == 4)
29654 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29655 }
29656 break;
29657
29658 case BUILT_IN_RINTF:
29659 /* The round insn does not trap on denormals. */
29660 if (flag_trapping_math || !TARGET_ROUND)
29661 break;
29662
29663 if (out_mode == SFmode && in_mode == SFmode)
29664 {
29665 if (out_n == 4 && in_n == 4)
29666 return ix86_builtins[IX86_BUILTIN_RINTPS];
29667 else if (out_n == 8 && in_n == 8)
29668 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29669 }
29670 break;
29671
29672 case BUILT_IN_ROUND:
29673 /* The round insn does not trap on denormals. */
29674 if (flag_trapping_math || !TARGET_ROUND)
29675 break;
29676
29677 if (out_mode == DFmode && in_mode == DFmode)
29678 {
29679 if (out_n == 2 && in_n == 2)
29680 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29681 else if (out_n == 4 && in_n == 4)
29682 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29683 }
29684 break;
29685
29686 case BUILT_IN_ROUNDF:
29687 /* The round insn does not trap on denormals. */
29688 if (flag_trapping_math || !TARGET_ROUND)
29689 break;
29690
29691 if (out_mode == SFmode && in_mode == SFmode)
29692 {
29693 if (out_n == 4 && in_n == 4)
29694 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29695 else if (out_n == 8 && in_n == 8)
29696 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29697 }
29698 break;
29699
29700 case BUILT_IN_FMA:
29701 if (out_mode == DFmode && in_mode == DFmode)
29702 {
29703 if (out_n == 2 && in_n == 2)
29704 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29705 if (out_n == 4 && in_n == 4)
29706 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
29707 }
29708 break;
29709
29710 case BUILT_IN_FMAF:
29711 if (out_mode == SFmode && in_mode == SFmode)
29712 {
29713 if (out_n == 4 && in_n == 4)
29714 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
29715 if (out_n == 8 && in_n == 8)
29716 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
29717 }
29718 break;
29719
29720 default:
29721 break;
29722 }
29723
29724 /* Dispatch to a handler for a vectorization library. */
29725 if (ix86_veclib_handler)
29726 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
29727 type_in);
29728
29729 return NULL_TREE;
29730 }
29731
29732 /* Handler for an SVML-style interface to
29733 a library with vectorized intrinsics. */
29734
29735 static tree
29736 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
29737 {
29738 char name[20];
29739 tree fntype, new_fndecl, args;
29740 unsigned arity;
29741 const char *bname;
29742 enum machine_mode el_mode, in_mode;
29743 int n, in_n;
29744
29745 /* The SVML library is suitable for unsafe math only. */
29746 if (!flag_unsafe_math_optimizations)
29747 return NULL_TREE;
29748
29749 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29750 n = TYPE_VECTOR_SUBPARTS (type_out);
29751 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29752 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29753 if (el_mode != in_mode
29754 || n != in_n)
29755 return NULL_TREE;
29756
29757 switch (fn)
29758 {
29759 case BUILT_IN_EXP:
29760 case BUILT_IN_LOG:
29761 case BUILT_IN_LOG10:
29762 case BUILT_IN_POW:
29763 case BUILT_IN_TANH:
29764 case BUILT_IN_TAN:
29765 case BUILT_IN_ATAN:
29766 case BUILT_IN_ATAN2:
29767 case BUILT_IN_ATANH:
29768 case BUILT_IN_CBRT:
29769 case BUILT_IN_SINH:
29770 case BUILT_IN_SIN:
29771 case BUILT_IN_ASINH:
29772 case BUILT_IN_ASIN:
29773 case BUILT_IN_COSH:
29774 case BUILT_IN_COS:
29775 case BUILT_IN_ACOSH:
29776 case BUILT_IN_ACOS:
29777 if (el_mode != DFmode || n != 2)
29778 return NULL_TREE;
29779 break;
29780
29781 case BUILT_IN_EXPF:
29782 case BUILT_IN_LOGF:
29783 case BUILT_IN_LOG10F:
29784 case BUILT_IN_POWF:
29785 case BUILT_IN_TANHF:
29786 case BUILT_IN_TANF:
29787 case BUILT_IN_ATANF:
29788 case BUILT_IN_ATAN2F:
29789 case BUILT_IN_ATANHF:
29790 case BUILT_IN_CBRTF:
29791 case BUILT_IN_SINHF:
29792 case BUILT_IN_SINF:
29793 case BUILT_IN_ASINHF:
29794 case BUILT_IN_ASINF:
29795 case BUILT_IN_COSHF:
29796 case BUILT_IN_COSF:
29797 case BUILT_IN_ACOSHF:
29798 case BUILT_IN_ACOSF:
29799 if (el_mode != SFmode || n != 4)
29800 return NULL_TREE;
29801 break;
29802
29803 default:
29804 return NULL_TREE;
29805 }
29806
29807 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29808
29809 if (fn == BUILT_IN_LOGF)
29810 strcpy (name, "vmlsLn4");
29811 else if (fn == BUILT_IN_LOG)
29812 strcpy (name, "vmldLn2");
29813 else if (n == 4)
29814 {
29815 sprintf (name, "vmls%s", bname+10);
29816 name[strlen (name)-1] = '4';
29817 }
29818 else
29819 sprintf (name, "vmld%s2", bname+10);
29820
29821 /* Convert to uppercase. */
29822 name[4] &= ~0x20;
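/* As an illustration of the mangling above: BUILT_IN_SIN becomes
   "vmldSin2", BUILT_IN_SINF becomes "vmlsSin4", and BUILT_IN_LOG and
   BUILT_IN_LOGF are special-cased to "vmldLn2" and "vmlsLn4".  */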
29823
29824 arity = 0;
29825 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29826 args;
29827 args = TREE_CHAIN (args))
29828 arity++;
29829
29830 if (arity == 1)
29831 fntype = build_function_type_list (type_out, type_in, NULL);
29832 else
29833 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29834
29835 /* Build a function declaration for the vectorized function. */
29836 new_fndecl = build_decl (BUILTINS_LOCATION,
29837 FUNCTION_DECL, get_identifier (name), fntype);
29838 TREE_PUBLIC (new_fndecl) = 1;
29839 DECL_EXTERNAL (new_fndecl) = 1;
29840 DECL_IS_NOVOPS (new_fndecl) = 1;
29841 TREE_READONLY (new_fndecl) = 1;
29842
29843 return new_fndecl;
29844 }
29845
29846 /* Handler for an ACML-style interface to
29847 a library with vectorized intrinsics. */
29848
29849 static tree
29850 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
29851 {
29852 char name[20] = "__vr.._";
29853 tree fntype, new_fndecl, args;
29854 unsigned arity;
29855 const char *bname;
29856 enum machine_mode el_mode, in_mode;
29857 int n, in_n;
29858
29859 /* The ACML library is 64-bit only and is suitable for unsafe math
29860 only, as it does not correctly support parts of IEEE (such as
29861 denormals) with the required precision. */
29862 if (!TARGET_64BIT
29863 || !flag_unsafe_math_optimizations)
29864 return NULL_TREE;
29865
29866 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29867 n = TYPE_VECTOR_SUBPARTS (type_out);
29868 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29869 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29870 if (el_mode != in_mode
29871 || n != in_n)
29872 return NULL_TREE;
29873
29874 switch (fn)
29875 {
29876 case BUILT_IN_SIN:
29877 case BUILT_IN_COS:
29878 case BUILT_IN_EXP:
29879 case BUILT_IN_LOG:
29880 case BUILT_IN_LOG2:
29881 case BUILT_IN_LOG10:
29882 name[4] = 'd';
29883 name[5] = '2';
29884 if (el_mode != DFmode
29885 || n != 2)
29886 return NULL_TREE;
29887 break;
29888
29889 case BUILT_IN_SINF:
29890 case BUILT_IN_COSF:
29891 case BUILT_IN_EXPF:
29892 case BUILT_IN_POWF:
29893 case BUILT_IN_LOGF:
29894 case BUILT_IN_LOG2F:
29895 case BUILT_IN_LOG10F:
29896 name[4] = 's';
29897 name[5] = '4';
29898 if (el_mode != SFmode
29899 || n != 4)
29900 return NULL_TREE;
29901 break;
29902
29903 default:
29904 return NULL_TREE;
29905 }
29906
29907 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
29908 sprintf (name + 7, "%s", bname+10);
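/* As an illustration of the mangling above: BUILT_IN_SIN becomes
   "__vrd2_sin" and BUILT_IN_SINF becomes "__vrs4_sinf".  */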
29909
29910 arity = 0;
29911 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
29912 args;
29913 args = TREE_CHAIN (args))
29914 arity++;
29915
29916 if (arity == 1)
29917 fntype = build_function_type_list (type_out, type_in, NULL);
29918 else
29919 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
29920
29921 /* Build a function declaration for the vectorized function. */
29922 new_fndecl = build_decl (BUILTINS_LOCATION,
29923 FUNCTION_DECL, get_identifier (name), fntype);
29924 TREE_PUBLIC (new_fndecl) = 1;
29925 DECL_EXTERNAL (new_fndecl) = 1;
29926 DECL_IS_NOVOPS (new_fndecl) = 1;
29927 TREE_READONLY (new_fndecl) = 1;
29928
29929 return new_fndecl;
29930 }
29931
29932 /* Returns a decl of a function that implements a gather load with
29933 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
29934 Return NULL_TREE if it is not available. */
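/* For example, a gather of a V4DF vector with SImode indices and scale 8
   maps to IX86_BUILTIN_GATHERALTSIV4DF, while the same gather with DImode
   indices maps to IX86_BUILTIN_GATHERDIV4DF.  The scale must be a power
   of two no larger than 8, i.e. 1, 2, 4 or 8.  */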
29935
29936 static tree
29937 ix86_vectorize_builtin_gather (const_tree mem_vectype,
29938 const_tree index_type, int scale)
29939 {
29940 bool si;
29941 enum ix86_builtins code;
29942
29943 if (! TARGET_AVX2)
29944 return NULL_TREE;
29945
29946 if ((TREE_CODE (index_type) != INTEGER_TYPE
29947 && !POINTER_TYPE_P (index_type))
29948 || (TYPE_MODE (index_type) != SImode
29949 && TYPE_MODE (index_type) != DImode))
29950 return NULL_TREE;
29951
29952 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
29953 return NULL_TREE;
29954
29955 /* v*gather* insn sign extends index to pointer mode. */
29956 if (TYPE_PRECISION (index_type) < POINTER_SIZE
29957 && TYPE_UNSIGNED (index_type))
29958 return NULL_TREE;
29959
29960 if (scale <= 0
29961 || scale > 8
29962 || (scale & (scale - 1)) != 0)
29963 return NULL_TREE;
29964
29965 si = TYPE_MODE (index_type) == SImode;
29966 switch (TYPE_MODE (mem_vectype))
29967 {
29968 case V2DFmode:
29969 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
29970 break;
29971 case V4DFmode:
29972 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
29973 break;
29974 case V2DImode:
29975 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
29976 break;
29977 case V4DImode:
29978 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
29979 break;
29980 case V4SFmode:
29981 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
29982 break;
29983 case V8SFmode:
29984 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
29985 break;
29986 case V4SImode:
29987 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
29988 break;
29989 case V8SImode:
29990 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
29991 break;
29992 default:
29993 return NULL_TREE;
29994 }
29995
29996 return ix86_builtins[code];
29997 }
29998
29999 /* Returns a decl of a target-specific builtin that implements the
30000 reciprocal of the function FN, or NULL_TREE if not available. */
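/* For example, under -ffast-math (unsafe, finite-only, non-trapping SSE
   math) the vectorized IX86_BUILTIN_SQRTPS_NR is mapped to
   IX86_BUILTIN_RSQRTPS_NR and the scalar BUILT_IN_SQRTF to
   IX86_BUILTIN_RSQRTF, so the caller can compute 1/sqrtf via the
   reciprocal square root instruction followed by refinement.  */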
30001
30002 static tree
30003 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30004 bool sqrt ATTRIBUTE_UNUSED)
30005 {
30006 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30007 && flag_finite_math_only && !flag_trapping_math
30008 && flag_unsafe_math_optimizations))
30009 return NULL_TREE;
30010
30011 if (md_fn)
30012 /* Machine dependent builtins. */
30013 switch (fn)
30014 {
30015 /* Vectorized version of sqrt to rsqrt conversion. */
30016 case IX86_BUILTIN_SQRTPS_NR:
30017 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30018
30019 case IX86_BUILTIN_SQRTPS_NR256:
30020 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30021
30022 default:
30023 return NULL_TREE;
30024 }
30025 else
30026 /* Normal builtins. */
30027 switch (fn)
30028 {
30029 /* Sqrt to rsqrt conversion. */
30030 case BUILT_IN_SQRTF:
30031 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30032
30033 default:
30034 return NULL_TREE;
30035 }
30036 }
30037 \f
30038 /* Helper for avx_vpermilps256_operand et al. This is also used by
30039 the expansion functions to turn the parallel back into a mask.
30040 The return value is 0 for no match and the imm8+1 for a match. */
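/* A worked example: for V8SFmode a PARALLEL of (1 0 3 2 5 4 7 6), i.e. a
   pairwise swap mirrored in the high lane, passes the mirror check, and
   the 128-bit loop below builds mask = 1 + (3 << 4) + (2 << 6) = 0xb1, so
   the function returns 0xb2 (the vpermilps immediate plus one).  */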
30041
30042 int
30043 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30044 {
30045 unsigned i, nelt = GET_MODE_NUNITS (mode);
30046 unsigned mask = 0;
30047 unsigned char ipar[8];
30048
30049 if (XVECLEN (par, 0) != (int) nelt)
30050 return 0;
30051
30052 /* Validate that all of the elements are constants, and not totally
30053 out of range. Copy the data into an integral array to make the
30054 subsequent checks easier. */
30055 for (i = 0; i < nelt; ++i)
30056 {
30057 rtx er = XVECEXP (par, 0, i);
30058 unsigned HOST_WIDE_INT ei;
30059
30060 if (!CONST_INT_P (er))
30061 return 0;
30062 ei = INTVAL (er);
30063 if (ei >= nelt)
30064 return 0;
30065 ipar[i] = ei;
30066 }
30067
30068 switch (mode)
30069 {
30070 case V4DFmode:
30071 /* In the 256-bit DFmode case, we can only move elements within
30072 a 128-bit lane. */
30073 for (i = 0; i < 2; ++i)
30074 {
30075 if (ipar[i] >= 2)
30076 return 0;
30077 mask |= ipar[i] << i;
30078 }
30079 for (i = 2; i < 4; ++i)
30080 {
30081 if (ipar[i] < 2)
30082 return 0;
30083 mask |= (ipar[i] - 2) << i;
30084 }
30085 break;
30086
30087 case V8SFmode:
30088 /* In the 256-bit SFmode case, we have full freedom of movement
30089 within the low 128-bit lane, but the high 128-bit lane must
30090 mirror the exact same pattern. */
30091 for (i = 0; i < 4; ++i)
30092 if (ipar[i] + 4 != ipar[i + 4])
30093 return 0;
30094 nelt = 4;
30095 /* FALLTHRU */
30096
30097 case V2DFmode:
30098 case V4SFmode:
30099 /* In the 128-bit case, we've full freedom in the placement of
30100 the elements from the source operand. */
30101 for (i = 0; i < nelt; ++i)
30102 mask |= ipar[i] << (i * (nelt / 2));
30103 break;
30104
30105 default:
30106 gcc_unreachable ();
30107 }
30108
30109 /* Make sure success has a non-zero value by adding one. */
30110 return mask + 1;
30111 }
30112
30113 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30114 the expansion functions to turn the parallel back into a mask.
30115 The return value is 0 for no match and the imm8+1 for a match. */
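/* A worked example: for V8SFmode a PARALLEL of (8 9 10 11 0 1 2 3)
   selects the low lane of the second operand followed by the low lane of
   the first operand.  Both halves are consecutive, the reconstruction
   below gives mask = 2 | (0 << 4) = 0x02, and the function returns 0x03
   (the vperm2f128 immediate plus one).  */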
30116
30117 int
30118 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30119 {
30120 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30121 unsigned mask = 0;
30122 unsigned char ipar[8];
30123
30124 if (XVECLEN (par, 0) != (int) nelt)
30125 return 0;
30126
30127 /* Validate that all of the elements are constants, and not totally
30128 out of range. Copy the data into an integral array to make the
30129 subsequent checks easier. */
30130 for (i = 0; i < nelt; ++i)
30131 {
30132 rtx er = XVECEXP (par, 0, i);
30133 unsigned HOST_WIDE_INT ei;
30134
30135 if (!CONST_INT_P (er))
30136 return 0;
30137 ei = INTVAL (er);
30138 if (ei >= 2 * nelt)
30139 return 0;
30140 ipar[i] = ei;
30141 }
30142
30143 /* Validate that the elements within each half of the permute are consecutive. */
30144 for (i = 0; i < nelt2 - 1; ++i)
30145 if (ipar[i] + 1 != ipar[i + 1])
30146 return 0;
30147 for (i = nelt2; i < nelt - 1; ++i)
30148 if (ipar[i] + 1 != ipar[i + 1])
30149 return 0;
30150
30151 /* Reconstruct the mask. */
30152 for (i = 0; i < 2; ++i)
30153 {
30154 unsigned e = ipar[i * nelt2];
30155 if (e % nelt2)
30156 return 0;
30157 e /= nelt2;
30158 mask |= e << (i * 4);
30159 }
30160
30161 /* Make sure success has a non-zero value by adding one. */
30162 return mask + 1;
30163 }
30164 \f
30165 /* Store OPERAND to memory after reload is completed. This means
30166 that we can't easily use assign_stack_local. */
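/* A minimal sketch of the two strategies used below: with a red zone the
   operand is stored at sp - RED_ZONE_SIZE without touching the stack
   pointer; without one it is pushed, e.g. for DImode on 64-bit the
   emitted RTL is roughly
     (set (mem:DI (pre_dec:DI (reg:DI sp))) operand)
   and the returned MEM refers to the new top of the stack.  */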
30167 rtx
30168 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30169 {
30170 rtx result;
30171
30172 gcc_assert (reload_completed);
30173 if (ix86_using_red_zone ())
30174 {
30175 result = gen_rtx_MEM (mode,
30176 gen_rtx_PLUS (Pmode,
30177 stack_pointer_rtx,
30178 GEN_INT (-RED_ZONE_SIZE)));
30179 emit_move_insn (result, operand);
30180 }
30181 else if (TARGET_64BIT)
30182 {
30183 switch (mode)
30184 {
30185 case HImode:
30186 case SImode:
30187 operand = gen_lowpart (DImode, operand);
30188 /* FALLTHRU */
30189 case DImode:
30190 emit_insn (
30191 gen_rtx_SET (VOIDmode,
30192 gen_rtx_MEM (DImode,
30193 gen_rtx_PRE_DEC (DImode,
30194 stack_pointer_rtx)),
30195 operand));
30196 break;
30197 default:
30198 gcc_unreachable ();
30199 }
30200 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30201 }
30202 else
30203 {
30204 switch (mode)
30205 {
30206 case DImode:
30207 {
30208 rtx operands[2];
30209 split_double_mode (mode, &operand, 1, operands, operands + 1);
30210 emit_insn (
30211 gen_rtx_SET (VOIDmode,
30212 gen_rtx_MEM (SImode,
30213 gen_rtx_PRE_DEC (Pmode,
30214 stack_pointer_rtx)),
30215 operands[1]));
30216 emit_insn (
30217 gen_rtx_SET (VOIDmode,
30218 gen_rtx_MEM (SImode,
30219 gen_rtx_PRE_DEC (Pmode,
30220 stack_pointer_rtx)),
30221 operands[0]));
30222 }
30223 break;
30224 case HImode:
30225 /* Store HImodes as SImodes. */
30226 operand = gen_lowpart (SImode, operand);
30227 /* FALLTHRU */
30228 case SImode:
30229 emit_insn (
30230 gen_rtx_SET (VOIDmode,
30231 gen_rtx_MEM (GET_MODE (operand),
30232 gen_rtx_PRE_DEC (SImode,
30233 stack_pointer_rtx)),
30234 operand));
30235 break;
30236 default:
30237 gcc_unreachable ();
30238 }
30239 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30240 }
30241 return result;
30242 }
30243
30244 /* Free operand from the memory. */
30245 void
30246 ix86_free_from_memory (enum machine_mode mode)
30247 {
30248 if (!ix86_using_red_zone ())
30249 {
30250 int size;
30251
30252 if (mode == DImode || TARGET_64BIT)
30253 size = 8;
30254 else
30255 size = 4;
30256 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30257 to a pop or add instruction if registers are available. */
30258 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30259 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30260 GEN_INT (size))));
30261 }
30262 }
30263
30264 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30265
30266 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30267 QImode must go into class Q_REGS.
30268 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
30269 movdf to do mem-to-mem moves through integer regs. */
30270
30271 static reg_class_t
30272 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30273 {
30274 enum machine_mode mode = GET_MODE (x);
30275
30276 /* We're only allowed to return a subclass of CLASS. Many of the
30277 following checks fail for NO_REGS, so eliminate that early. */
30278 if (regclass == NO_REGS)
30279 return NO_REGS;
30280
30281 /* All classes can load zeros. */
30282 if (x == CONST0_RTX (mode))
30283 return regclass;
30284
30285 /* Force constants into memory if we are loading a (nonzero) constant into
30286 an MMX or SSE register. This is because there are no MMX/SSE instructions
30287 to load from a constant. */
30288 if (CONSTANT_P (x)
30289 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30290 return NO_REGS;
30291
30292 /* Prefer SSE regs only, if we can use them for math. */
30293 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30294 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30295
30296 /* Floating-point constants need more complex checks. */
30297 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30298 {
30299 /* General regs can load everything. */
30300 if (reg_class_subset_p (regclass, GENERAL_REGS))
30301 return regclass;
30302
30303 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30304 zero above. We only want to wind up preferring 80387 registers if
30305 we plan on doing computation with them. */
30306 if (TARGET_80387
30307 && standard_80387_constant_p (x) > 0)
30308 {
30309 /* Limit class to non-sse. */
30310 if (regclass == FLOAT_SSE_REGS)
30311 return FLOAT_REGS;
30312 if (regclass == FP_TOP_SSE_REGS)
30313 return FP_TOP_REG;
30314 if (regclass == FP_SECOND_SSE_REGS)
30315 return FP_SECOND_REG;
30316 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30317 return regclass;
30318 }
30319
30320 return NO_REGS;
30321 }
30322
30323 /* Generally when we see PLUS here, it's the function invariant
30324 (plus soft-fp const_int). Which can only be computed into general
30325 regs. */
30326 if (GET_CODE (x) == PLUS)
30327 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30328
30329 /* QImode constants are easy to load, but non-constant QImode data
30330 must go into Q_REGS. */
30331 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30332 {
30333 if (reg_class_subset_p (regclass, Q_REGS))
30334 return regclass;
30335 if (reg_class_subset_p (Q_REGS, regclass))
30336 return Q_REGS;
30337 return NO_REGS;
30338 }
30339
30340 return regclass;
30341 }
30342
30343 /* Discourage putting floating-point values in SSE registers unless
30344 SSE math is being used, and likewise for the 387 registers. */
30345 static reg_class_t
30346 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30347 {
30348 enum machine_mode mode = GET_MODE (x);
30349
30350 /* Restrict the output reload class to the register bank that we are doing
30351 math on. If we would like not to return a subset of CLASS, reject this
30352 alternative: if reload cannot do this, it will still use its choice. */
30353 mode = GET_MODE (x);
30354 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30355 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30356
30357 if (X87_FLOAT_MODE_P (mode))
30358 {
30359 if (regclass == FP_TOP_SSE_REGS)
30360 return FP_TOP_REG;
30361 else if (regclass == FP_SECOND_SSE_REGS)
30362 return FP_SECOND_REG;
30363 else
30364 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30365 }
30366
30367 return regclass;
30368 }
30369
30370 static reg_class_t
30371 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30372 enum machine_mode mode, secondary_reload_info *sri)
30373 {
30374 /* Double-word spills from general registers to non-offsettable memory
30375 references (zero-extended addresses) require special handling. */
30376 if (TARGET_64BIT
30377 && MEM_P (x)
30378 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30379 && rclass == GENERAL_REGS
30380 && !offsettable_memref_p (x))
30381 {
30382 sri->icode = (in_p
30383 ? CODE_FOR_reload_noff_load
30384 : CODE_FOR_reload_noff_store);
30385 /* Add the cost of moving address to a temporary. */
30386 sri->extra_cost = 1;
30387
30388 return NO_REGS;
30389 }
30390
30391 /* QImode spills from non-QI registers require
30392 intermediate register on 32bit targets. */
30393 if (!TARGET_64BIT
30394 && !in_p && mode == QImode
30395 && (rclass == GENERAL_REGS
30396 || rclass == LEGACY_REGS
30397 || rclass == INDEX_REGS))
30398 {
30399 int regno;
30400
30401 if (REG_P (x))
30402 regno = REGNO (x);
30403 else
30404 regno = -1;
30405
30406 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30407 regno = true_regnum (x);
30408
30409 /* Return Q_REGS if the operand is in memory. */
30410 if (regno == -1)
30411 return Q_REGS;
30412 }
30413
30414 /* This condition handles the corner case where an expression involving
30415 pointers gets vectorized. We're trying to use the address of a
30416 stack slot as a vector initializer.
30417
30418 (set (reg:V2DI 74 [ vect_cst_.2 ])
30419 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30420
30421 Eventually frame gets turned into sp+offset like this:
30422
30423 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30424 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30425 (const_int 392 [0x188]))))
30426
30427 That later gets turned into:
30428
30429 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30430 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30431 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30432
30433 We'll have the following reload recorded:
30434
30435 Reload 0: reload_in (DI) =
30436 (plus:DI (reg/f:DI 7 sp)
30437 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30438 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30439 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30440 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30441 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30442 reload_reg_rtx: (reg:V2DI 22 xmm1)
30443
30444 Which isn't going to work since SSE instructions can't handle scalar
30445 additions. Returning GENERAL_REGS forces the addition into integer
30446 register and reload can handle subsequent reloads without problems. */
30447
30448 if (in_p && GET_CODE (x) == PLUS
30449 && SSE_CLASS_P (rclass)
30450 && SCALAR_INT_MODE_P (mode))
30451 return GENERAL_REGS;
30452
30453 return NO_REGS;
30454 }
30455
30456 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30457
30458 static bool
30459 ix86_class_likely_spilled_p (reg_class_t rclass)
30460 {
30461 switch (rclass)
30462 {
30463 case AREG:
30464 case DREG:
30465 case CREG:
30466 case BREG:
30467 case AD_REGS:
30468 case SIREG:
30469 case DIREG:
30470 case SSE_FIRST_REG:
30471 case FP_TOP_REG:
30472 case FP_SECOND_REG:
30473 return true;
30474
30475 default:
30476 break;
30477 }
30478
30479 return false;
30480 }
30481
30482 /* If we are copying between general and FP registers, we need a memory
30483 location. The same is true for SSE and MMX registers.
30484
30485 To optimize register_move_cost performance, allow inline variant.
30486
30487 The function can't work reliably when one of the CLASSES is a class containing
30488 registers from multiple units (SSE, MMX, integer). We avoid this by never
30489 combining those units in a single alternative in the machine description.
30490 Ensure that this constraint holds to avoid unexpected surprises.
30491
30492 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30493 enforce these sanity checks. */
30494
30495 static inline bool
30496 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30497 enum machine_mode mode, int strict)
30498 {
30499 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30500 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30501 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30502 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30503 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30504 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30505 {
30506 gcc_assert (!strict);
30507 return true;
30508 }
30509
30510 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30511 return true;
30512
30513 /* ??? This is a lie. We do have moves between mmx/general, and for
30514 mmx/sse2. But by saying we need secondary memory we discourage the
30515 register allocator from using the mmx registers unless needed. */
30516 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30517 return true;
30518
30519 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30520 {
30521 /* SSE1 doesn't have any direct moves from other classes. */
30522 if (!TARGET_SSE2)
30523 return true;
30524
30525 /* If the target says that inter-unit moves are more expensive
30526 than moving through memory, then don't generate them. */
30527 if (!TARGET_INTER_UNIT_MOVES)
30528 return true;
30529
30530 /* Between SSE and general, we have moves no larger than word size. */
30531 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30532 return true;
30533 }
30534
30535 return false;
30536 }
30537
30538 bool
30539 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30540 enum machine_mode mode, int strict)
30541 {
30542 return inline_secondary_memory_needed (class1, class2, mode, strict);
30543 }
30544
30545 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30546
30547 On the 80386, this is the size of MODE in words,
30548 except in the FP regs, where a single reg is always enough. */
30549
30550 static unsigned char
30551 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30552 {
30553 if (MAYBE_INTEGER_CLASS_P (rclass))
30554 {
30555 if (mode == XFmode)
30556 return (TARGET_64BIT ? 2 : 3);
30557 else if (mode == XCmode)
30558 return (TARGET_64BIT ? 4 : 6);
30559 else
30560 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30561 }
30562 else
30563 {
30564 if (COMPLEX_MODE_P (mode))
30565 return 2;
30566 else
30567 return 1;
30568 }
30569 }
30570
30571 /* Return true if the registers in CLASS cannot represent the change from
30572 modes FROM to TO. */
30573
30574 bool
30575 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30576 enum reg_class regclass)
30577 {
30578 if (from == to)
30579 return false;
30580
30581 /* x87 registers can't do subreg at all, as all values are reformatted
30582 to extended precision. */
30583 if (MAYBE_FLOAT_CLASS_P (regclass))
30584 return true;
30585
30586 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30587 {
30588 /* Vector registers do not support QI or HImode loads. If we don't
30589 disallow a change to these modes, reload will assume it's ok to
30590 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30591 the vec_dupv4hi pattern. */
30592 if (GET_MODE_SIZE (from) < 4)
30593 return true;
30594
30595 /* Vector registers do not support subreg with nonzero offsets, which
30596 are otherwise valid for integer registers. Since we can't see
30597 whether we have a nonzero offset from here, prohibit all
30598 nonparadoxical subregs changing size. */
30599 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30600 return true;
30601 }
30602
30603 return false;
30604 }
30605
30606 /* Return the cost of moving data of mode M between a
30607 register and memory. A value of 2 is the default; this cost is
30608 relative to those in `REGISTER_MOVE_COST'.
30609
30610 This function is used extensively by register_move_cost that is used to
30611 build tables at startup. Make it inline in this case.
30612 When IN is 2, return maximum of in and out move cost.
30613
30614 If moving between registers and memory is more expensive than
30615 between two registers, you should define this macro to express the
30616 relative cost.
30617
30618 Also model the increased cost of moving QImode registers in non
30619 Q_REGS classes.
30620 */
30621 static inline int
30622 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30623 int in)
30624 {
30625 int cost;
30626 if (FLOAT_CLASS_P (regclass))
30627 {
30628 int index;
30629 switch (mode)
30630 {
30631 case SFmode:
30632 index = 0;
30633 break;
30634 case DFmode:
30635 index = 1;
30636 break;
30637 case XFmode:
30638 index = 2;
30639 break;
30640 default:
30641 return 100;
30642 }
30643 if (in == 2)
30644 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30645 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30646 }
30647 if (SSE_CLASS_P (regclass))
30648 {
30649 int index;
30650 switch (GET_MODE_SIZE (mode))
30651 {
30652 case 4:
30653 index = 0;
30654 break;
30655 case 8:
30656 index = 1;
30657 break;
30658 case 16:
30659 index = 2;
30660 break;
30661 default:
30662 return 100;
30663 }
30664 if (in == 2)
30665 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30666 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30667 }
30668 if (MMX_CLASS_P (regclass))
30669 {
30670 int index;
30671 switch (GET_MODE_SIZE (mode))
30672 {
30673 case 4:
30674 index = 0;
30675 break;
30676 case 8:
30677 index = 1;
30678 break;
30679 default:
30680 return 100;
30681 }
30682 if (in == 2)
30683 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30684 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30685 }
30686 switch (GET_MODE_SIZE (mode))
30687 {
30688 case 1:
30689 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30690 {
30691 if (!in)
30692 return ix86_cost->int_store[0];
30693 if (TARGET_PARTIAL_REG_DEPENDENCY
30694 && optimize_function_for_speed_p (cfun))
30695 cost = ix86_cost->movzbl_load;
30696 else
30697 cost = ix86_cost->int_load[0];
30698 if (in == 2)
30699 return MAX (cost, ix86_cost->int_store[0]);
30700 return cost;
30701 }
30702 else
30703 {
30704 if (in == 2)
30705 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30706 if (in)
30707 return ix86_cost->movzbl_load;
30708 else
30709 return ix86_cost->int_store[0] + 4;
30710 }
30711 break;
30712 case 2:
30713 if (in == 2)
30714 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
30715 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
30716 default:
30717 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
30718 if (mode == TFmode)
30719 mode = XFmode;
30720 if (in == 2)
30721 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
30722 else if (in)
30723 cost = ix86_cost->int_load[2];
30724 else
30725 cost = ix86_cost->int_store[2];
30726 return (cost * (((int) GET_MODE_SIZE (mode)
30727 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
30728 }
30729 }
30730
30731 static int
30732 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
30733 bool in)
30734 {
30735 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
30736 }
30737
30738
30739 /* Return the cost of moving data from a register in class CLASS1 to
30740 one in class CLASS2.
30741
30742 It is not required that the cost always equal 2 when FROM is the same as TO;
30743 on some machines it is expensive to move between registers if they are not
30744 general registers. */
30745
30746 static int
30747 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
30748 reg_class_t class2_i)
30749 {
30750 enum reg_class class1 = (enum reg_class) class1_i;
30751 enum reg_class class2 = (enum reg_class) class2_i;
30752
30753 /* In case we require secondary memory, compute cost of the store followed
30754 by load. In order to avoid bad register allocation choices, we need
30755 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
30756
30757 if (inline_secondary_memory_needed (class1, class2, mode, 0))
30758 {
30759 int cost = 1;
30760
30761 cost += inline_memory_move_cost (mode, class1, 2);
30762 cost += inline_memory_move_cost (mode, class2, 2);
30763
30764 /* When copying from a general purpose register we may emit multiple
30765 stores followed by a single load, causing a memory size mismatch stall.
30766 Count this as an arbitrarily high cost of 20. */
30767 if (targetm.class_max_nregs (class1, mode)
30768 > targetm.class_max_nregs (class2, mode))
30769 cost += 20;
30770
30771 /* In the case of FP/MMX moves, the registers actually overlap, and we
30772 have to switch modes in order to treat them differently. */
30773 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
30774 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
30775 cost += 20;
30776
30777 return cost;
30778 }
30779
30780 /* Moves between SSE/MMX and integer unit are expensive. */
30781 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
30782 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30783
30784 /* ??? By keeping returned value relatively high, we limit the number
30785 of moves between integer and MMX/SSE registers for all targets.
30786 Additionally, high value prevents problem with x86_modes_tieable_p(),
30787 where integer modes in MMX/SSE registers are not tieable
30788 because of missing QImode and HImode moves to, from or between
30789 MMX/SSE registers. */
30790 return MAX (8, ix86_cost->mmxsse_to_integer);
30791
30792 if (MAYBE_FLOAT_CLASS_P (class1))
30793 return ix86_cost->fp_move;
30794 if (MAYBE_SSE_CLASS_P (class1))
30795 return ix86_cost->sse_move;
30796 if (MAYBE_MMX_CLASS_P (class1))
30797 return ix86_cost->mmx_move;
30798 return 2;
30799 }
30800
30801 /* Return TRUE if hard register REGNO can hold a value of machine-mode
30802 MODE. */
30803
30804 bool
30805 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
30806 {
30807 /* Flags and only flags can only hold CCmode values. */
30808 if (CC_REGNO_P (regno))
30809 return GET_MODE_CLASS (mode) == MODE_CC;
30810 if (GET_MODE_CLASS (mode) == MODE_CC
30811 || GET_MODE_CLASS (mode) == MODE_RANDOM
30812 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
30813 return false;
30814 if (FP_REGNO_P (regno))
30815 return VALID_FP_MODE_P (mode);
30816 if (SSE_REGNO_P (regno))
30817 {
30818 /* We implement the move patterns for all vector modes into and
30819 out of SSE registers, even when no operation instructions
30820 are available. OImode move is available only when AVX is
30821 enabled. */
30822 return ((TARGET_AVX && mode == OImode)
30823 || VALID_AVX256_REG_MODE (mode)
30824 || VALID_SSE_REG_MODE (mode)
30825 || VALID_SSE2_REG_MODE (mode)
30826 || VALID_MMX_REG_MODE (mode)
30827 || VALID_MMX_REG_MODE_3DNOW (mode));
30828 }
30829 if (MMX_REGNO_P (regno))
30830 {
30831 /* We implement the move patterns for 3DNOW modes even in MMX mode,
30832 so if the register is available at all, then we can move data of
30833 the given mode into or out of it. */
30834 return (VALID_MMX_REG_MODE (mode)
30835 || VALID_MMX_REG_MODE_3DNOW (mode));
30836 }
30837
30838 if (mode == QImode)
30839 {
30840 /* Take care for QImode values - they can be in non-QI regs,
30841 but then they do cause partial register stalls. */
30842 if (regno <= BX_REG || TARGET_64BIT)
30843 return true;
30844 if (!TARGET_PARTIAL_REG_STALL)
30845 return true;
30846 return !can_create_pseudo_p ();
30847 }
30848 /* We handle both integer and floats in the general purpose registers. */
30849 else if (VALID_INT_MODE_P (mode))
30850 return true;
30851 else if (VALID_FP_MODE_P (mode))
30852 return true;
30853 else if (VALID_DFP_MODE_P (mode))
30854 return true;
30855 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
30856 on to use that value in smaller contexts, this can easily force a
30857 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
30858 supporting DImode, allow it. */
30859 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
30860 return true;
30861
30862 return false;
30863 }
30864
30865 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
30866 tieable integer mode. */
30867
30868 static bool
30869 ix86_tieable_integer_mode_p (enum machine_mode mode)
30870 {
30871 switch (mode)
30872 {
30873 case HImode:
30874 case SImode:
30875 return true;
30876
30877 case QImode:
30878 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
30879
30880 case DImode:
30881 return TARGET_64BIT;
30882
30883 default:
30884 return false;
30885 }
30886 }
30887
30888 /* Return true if MODE1 is accessible in a register that can hold MODE2
30889 without copying. That is, all register classes that can hold MODE2
30890 can also hold MODE1. */
30891
30892 bool
30893 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
30894 {
30895 if (mode1 == mode2)
30896 return true;
30897
30898 if (ix86_tieable_integer_mode_p (mode1)
30899 && ix86_tieable_integer_mode_p (mode2))
30900 return true;
30901
30902 /* MODE2 being XFmode implies fp stack or general regs, which means we
30903 can tie any smaller floating point modes to it. Note that we do not
30904 tie this with TFmode. */
30905 if (mode2 == XFmode)
30906 return mode1 == SFmode || mode1 == DFmode;
30907
30908 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
30909 that we can tie it with SFmode. */
30910 if (mode2 == DFmode)
30911 return mode1 == SFmode;
30912
30913 /* If MODE2 is only appropriate for an SSE register, then tie with
30914 any other mode acceptable to SSE registers. */
30915 if (GET_MODE_SIZE (mode2) == 16
30916 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
30917 return (GET_MODE_SIZE (mode1) == 16
30918 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
30919
30920 /* If MODE2 is appropriate for an MMX register, then tie
30921 with any other mode acceptable to MMX registers. */
30922 if (GET_MODE_SIZE (mode2) == 8
30923 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
30924 return (GET_MODE_SIZE (mode1) == 8
30925 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
30926
30927 return false;
30928 }
30929
30930 /* Compute a (partial) cost for rtx X. Return true if the complete
30931 cost has been computed, and false if subexpressions should be
30932 scanned. In either case, *TOTAL contains the cost result. */
30933
30934 static bool
30935 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
30936 bool speed)
30937 {
30938 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
30939 enum machine_mode mode = GET_MODE (x);
30940 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
30941
30942 switch (code)
30943 {
30944 case CONST_INT:
30945 case CONST:
30946 case LABEL_REF:
30947 case SYMBOL_REF:
30948 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
30949 *total = 3;
30950 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
30951 *total = 2;
30952 else if (flag_pic && SYMBOLIC_CONST (x)
30953 && (!TARGET_64BIT
30954 || (GET_CODE (x) != LABEL_REF
30955 && (GET_CODE (x) != SYMBOL_REF
30956 || !SYMBOL_REF_LOCAL_P (x)))))
30957 *total = 1;
30958 else
30959 *total = 0;
30960 return true;
30961
30962 case CONST_DOUBLE:
30963 if (mode == VOIDmode)
30964 *total = 0;
30965 else
30966 switch (standard_80387_constant_p (x))
30967 {
30968 case 1: /* 0.0 */
30969 *total = 1;
30970 break;
30971 default: /* Other constants */
30972 *total = 2;
30973 break;
30974 case 0:
30975 case -1:
30976 /* Start with (MEM (SYMBOL_REF)), since that's where
30977 it'll probably end up. Add a penalty for size. */
30978 *total = (COSTS_N_INSNS (1)
30979 + (flag_pic != 0 && !TARGET_64BIT)
30980 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
30981 break;
30982 }
30983 return true;
30984
30985 case ZERO_EXTEND:
30986 /* Zero extension is often completely free on x86_64, so make
30987 it as cheap as possible. */
30988 if (TARGET_64BIT && mode == DImode
30989 && GET_MODE (XEXP (x, 0)) == SImode)
30990 *total = 1;
30991 else if (TARGET_ZERO_EXTEND_WITH_AND)
30992 *total = cost->add;
30993 else
30994 *total = cost->movzx;
30995 return false;
30996
30997 case SIGN_EXTEND:
30998 *total = cost->movsx;
30999 return false;
31000
31001 case ASHIFT:
31002 if (CONST_INT_P (XEXP (x, 1))
31003 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31004 {
31005 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31006 if (value == 1)
31007 {
31008 *total = cost->add;
31009 return false;
31010 }
31011 if ((value == 2 || value == 3)
31012 && cost->lea <= cost->shift_const)
31013 {
31014 *total = cost->lea;
31015 return false;
31016 }
31017 }
31018 /* FALLTHRU */
31019
31020 case ROTATE:
31021 case ASHIFTRT:
31022 case LSHIFTRT:
31023 case ROTATERT:
31024 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31025 {
31026 if (CONST_INT_P (XEXP (x, 1)))
31027 {
31028 if (INTVAL (XEXP (x, 1)) > 32)
31029 *total = cost->shift_const + COSTS_N_INSNS (2);
31030 else
31031 *total = cost->shift_const * 2;
31032 }
31033 else
31034 {
31035 if (GET_CODE (XEXP (x, 1)) == AND)
31036 *total = cost->shift_var * 2;
31037 else
31038 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31039 }
31040 }
31041 else
31042 {
31043 if (CONST_INT_P (XEXP (x, 1)))
31044 *total = cost->shift_const;
31045 else
31046 *total = cost->shift_var;
31047 }
31048 return false;
31049
31050 case FMA:
31051 {
31052 rtx sub;
31053
31054 gcc_assert (FLOAT_MODE_P (mode));
31055 gcc_assert (TARGET_FMA || TARGET_FMA4);
31056
31057 /* ??? SSE scalar/vector cost should be used here. */
31058 /* ??? Bald assumption that fma has the same cost as fmul. */
31059 *total = cost->fmul;
31060 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31061
31062 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31063 sub = XEXP (x, 0);
31064 if (GET_CODE (sub) == NEG)
31065 sub = XEXP (sub, 0);
31066 *total += rtx_cost (sub, FMA, 0, speed);
31067
31068 sub = XEXP (x, 2);
31069 if (GET_CODE (sub) == NEG)
31070 sub = XEXP (sub, 0);
31071 *total += rtx_cost (sub, FMA, 2, speed);
31072 return true;
31073 }
31074
31075 case MULT:
31076 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31077 {
31078 /* ??? SSE scalar cost should be used here. */
31079 *total = cost->fmul;
31080 return false;
31081 }
31082 else if (X87_FLOAT_MODE_P (mode))
31083 {
31084 *total = cost->fmul;
31085 return false;
31086 }
31087 else if (FLOAT_MODE_P (mode))
31088 {
31089 /* ??? SSE vector cost should be used here. */
31090 *total = cost->fmul;
31091 return false;
31092 }
31093 else
31094 {
31095 rtx op0 = XEXP (x, 0);
31096 rtx op1 = XEXP (x, 1);
31097 int nbits;
31098 if (CONST_INT_P (XEXP (x, 1)))
31099 {
31100 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31101 for (nbits = 0; value != 0; value &= value - 1)
31102 nbits++;
31103 }
31104 else
31105 /* This is arbitrary. */
31106 nbits = 7;
31107
31108 /* Compute costs correctly for widening multiplication. */
31109 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31110 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31111 == GET_MODE_SIZE (mode))
31112 {
31113 int is_mulwiden = 0;
31114 enum machine_mode inner_mode = GET_MODE (op0);
31115
31116 if (GET_CODE (op0) == GET_CODE (op1))
31117 is_mulwiden = 1, op1 = XEXP (op1, 0);
31118 else if (CONST_INT_P (op1))
31119 {
31120 if (GET_CODE (op0) == SIGN_EXTEND)
31121 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31122 == INTVAL (op1);
31123 else
31124 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31125 }
31126
31127 if (is_mulwiden)
31128 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31129 }
31130
31131 *total = (cost->mult_init[MODE_INDEX (mode)]
31132 + nbits * cost->mult_bit
31133 + rtx_cost (op0, outer_code, opno, speed)
31134 + rtx_cost (op1, outer_code, opno, speed));
31135
31136 return true;
31137 }
31138
31139 case DIV:
31140 case UDIV:
31141 case MOD:
31142 case UMOD:
31143 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31144 /* ??? SSE cost should be used here. */
31145 *total = cost->fdiv;
31146 else if (X87_FLOAT_MODE_P (mode))
31147 *total = cost->fdiv;
31148 else if (FLOAT_MODE_P (mode))
31149 /* ??? SSE vector cost should be used here. */
31150 *total = cost->fdiv;
31151 else
31152 *total = cost->divide[MODE_INDEX (mode)];
31153 return false;
31154
31155 case PLUS:
31156 if (GET_MODE_CLASS (mode) == MODE_INT
31157 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31158 {
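/* Address arithmetic that fits the x86 addressing modes is costed as a
   single LEA below.  An illustrative RTL form matching the first pattern:
     (plus:SI (plus:SI (mult:SI (reg:SI 1) (const_int 4))
                       (reg:SI 2))
              (const_int 16))  */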
31159 if (GET_CODE (XEXP (x, 0)) == PLUS
31160 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31161 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31162 && CONSTANT_P (XEXP (x, 1)))
31163 {
31164 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31165 if (val == 2 || val == 4 || val == 8)
31166 {
31167 *total = cost->lea;
31168 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31169 outer_code, opno, speed);
31170 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31171 outer_code, opno, speed);
31172 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31173 return true;
31174 }
31175 }
31176 else if (GET_CODE (XEXP (x, 0)) == MULT
31177 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31178 {
31179 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31180 if (val == 2 || val == 4 || val == 8)
31181 {
31182 *total = cost->lea;
31183 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31184 outer_code, opno, speed);
31185 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31186 return true;
31187 }
31188 }
31189 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31190 {
31191 *total = cost->lea;
31192 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31193 outer_code, opno, speed);
31194 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31195 outer_code, opno, speed);
31196 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31197 return true;
31198 }
31199 }
31200 /* FALLTHRU */
31201
31202 case MINUS:
31203 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31204 {
31205 /* ??? SSE cost should be used here. */
31206 *total = cost->fadd;
31207 return false;
31208 }
31209 else if (X87_FLOAT_MODE_P (mode))
31210 {
31211 *total = cost->fadd;
31212 return false;
31213 }
31214 else if (FLOAT_MODE_P (mode))
31215 {
31216 /* ??? SSE vector cost should be used here. */
31217 *total = cost->fadd;
31218 return false;
31219 }
31220 /* FALLTHRU */
31221
31222 case AND:
31223 case IOR:
31224 case XOR:
31225 if (!TARGET_64BIT && mode == DImode)
31226 {
31227 *total = (cost->add * 2
31228 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31229 << (GET_MODE (XEXP (x, 0)) != DImode))
31230 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31231 << (GET_MODE (XEXP (x, 1)) != DImode)));
31232 return true;
31233 }
31234 /* FALLTHRU */
31235
31236 case NEG:
31237 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31238 {
31239 /* ??? SSE cost should be used here. */
31240 *total = cost->fchs;
31241 return false;
31242 }
31243 else if (X87_FLOAT_MODE_P (mode))
31244 {
31245 *total = cost->fchs;
31246 return false;
31247 }
31248 else if (FLOAT_MODE_P (mode))
31249 {
31250 /* ??? SSE vector cost should be used here. */
31251 *total = cost->fchs;
31252 return false;
31253 }
31254 /* FALLTHRU */
31255
31256 case NOT:
31257 if (!TARGET_64BIT && mode == DImode)
31258 *total = cost->add * 2;
31259 else
31260 *total = cost->add;
31261 return false;
31262
31263 case COMPARE:
31264 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31265 && XEXP (XEXP (x, 0), 1) == const1_rtx
31266 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31267 && XEXP (x, 1) == const0_rtx)
31268 {
31269 /* This kind of construct is implemented using test[bwl].
31270 Treat it as if we had an AND. */
31271 *total = (cost->add
31272 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31273 + rtx_cost (const1_rtx, outer_code, opno, speed));
31274 return true;
31275 }
31276 return false;
31277
31278 case FLOAT_EXTEND:
31279 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31280 *total = 0;
31281 return false;
31282
31283 case ABS:
31284 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31285 /* ??? SSE cost should be used here. */
31286 *total = cost->fabs;
31287 else if (X87_FLOAT_MODE_P (mode))
31288 *total = cost->fabs;
31289 else if (FLOAT_MODE_P (mode))
31290 /* ??? SSE vector cost should be used here. */
31291 *total = cost->fabs;
31292 return false;
31293
31294 case SQRT:
31295 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31296 /* ??? SSE cost should be used here. */
31297 *total = cost->fsqrt;
31298 else if (X87_FLOAT_MODE_P (mode))
31299 *total = cost->fsqrt;
31300 else if (FLOAT_MODE_P (mode))
31301 /* ??? SSE vector cost should be used here. */
31302 *total = cost->fsqrt;
31303 return false;
31304
31305 case UNSPEC:
31306 if (XINT (x, 1) == UNSPEC_TP)
31307 *total = 0;
31308 return false;
31309
31310 case VEC_SELECT:
31311 case VEC_CONCAT:
31312 case VEC_MERGE:
31313 case VEC_DUPLICATE:
31314 /* ??? Assume all of these vector manipulation patterns are
31315 recognizable, in which case they all pretty much have the
31316 same cost. */
31317 *total = COSTS_N_INSNS (1);
31318 return true;
31319
31320 default:
31321 return false;
31322 }
31323 }
31324
31325 #if TARGET_MACHO
31326
31327 static int current_machopic_label_num;
31328
31329 /* Given a symbol name and its associated stub, write out the
31330 definition of the stub. */
31331
31332 void
31333 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31334 {
31335 unsigned int length;
31336 char *binder_name, *symbol_name, lazy_ptr_name[32];
31337 int label = ++current_machopic_label_num;
31338
31339 /* For 64-bit we shouldn't get here. */
31340 gcc_assert (!TARGET_64BIT);
31341
31342 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31343 symb = targetm.strip_name_encoding (symb);
31344
31345 length = strlen (stub);
31346 binder_name = XALLOCAVEC (char, length + 32);
31347 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31348
31349 length = strlen (symb);
31350 symbol_name = XALLOCAVEC (char, length + 32);
31351 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31352
31353 sprintf (lazy_ptr_name, "L%d$lz", label);
31354
31355 if (MACHOPIC_ATT_STUB)
31356 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31357 else if (MACHOPIC_PURE)
31358 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31359 else
31360 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31361
31362 fprintf (file, "%s:\n", stub);
31363 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31364
31365 if (MACHOPIC_ATT_STUB)
31366 {
31367 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31368 }
31369 else if (MACHOPIC_PURE)
31370 {
31371 /* PIC stub. */
31372 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31373 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31374 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31375 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31376 label, lazy_ptr_name, label);
31377 fprintf (file, "\tjmp\t*%%ecx\n");
31378 }
31379 else
31380 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31381
31382 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31383 it needs no stub-binding-helper. */
31384 if (MACHOPIC_ATT_STUB)
31385 return;
31386
31387 fprintf (file, "%s:\n", binder_name);
31388
31389 if (MACHOPIC_PURE)
31390 {
31391 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31392 fprintf (file, "\tpushl\t%%ecx\n");
31393 }
31394 else
31395 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31396
31397 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31398
31399 /* N.B. Keep the correspondence of these
31400 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31401 old-pic/new-pic/non-pic stubs; altering this will break
31402 compatibility with existing dylibs. */
31403 if (MACHOPIC_PURE)
31404 {
31405 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31406 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31407 }
31408 else
31409 /* 16-byte -mdynamic-no-pic stub. */
31410 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
31411
31412 fprintf (file, "%s:\n", lazy_ptr_name);
31413 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31414 fprintf (file, ASM_LONG "%s\n", binder_name);
31415 }
31416 #endif /* TARGET_MACHO */
31417
31418 /* Order the registers for register allocator. */
31419
31420 void
31421 x86_order_regs_for_local_alloc (void)
31422 {
31423 int pos = 0;
31424 int i;
31425
31426 /* First allocate the local general purpose registers. */
31427 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31428 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31429 reg_alloc_order [pos++] = i;
31430
31431 /* Global general purpose registers. */
31432 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31433 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31434 reg_alloc_order [pos++] = i;
31435
31436 /* x87 registers come first in case we are doing FP math
31437 using them. */
31438 if (!TARGET_SSE_MATH)
31439 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31440 reg_alloc_order [pos++] = i;
31441
31442 /* SSE registers. */
31443 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31444 reg_alloc_order [pos++] = i;
31445 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31446 reg_alloc_order [pos++] = i;
31447
31448 /* x87 registers. */
31449 if (TARGET_SSE_MATH)
31450 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31451 reg_alloc_order [pos++] = i;
31452
31453 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31454 reg_alloc_order [pos++] = i;
31455
31456 /* Initialize the rest of the array, as we do not allocate some registers
31457 at all. */
31458 while (pos < FIRST_PSEUDO_REGISTER)
31459 reg_alloc_order [pos++] = 0;
31460 }
31461
31462 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31463 in struct attribute_spec.handler. */
31464 static tree
31465 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31466 tree args,
31467 int flags ATTRIBUTE_UNUSED,
31468 bool *no_add_attrs)
31469 {
31470 if (TREE_CODE (*node) != FUNCTION_TYPE
31471 && TREE_CODE (*node) != METHOD_TYPE
31472 && TREE_CODE (*node) != FIELD_DECL
31473 && TREE_CODE (*node) != TYPE_DECL)
31474 {
31475 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31476 name);
31477 *no_add_attrs = true;
31478 return NULL_TREE;
31479 }
31480 if (TARGET_64BIT)
31481 {
31482 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31483 name);
31484 *no_add_attrs = true;
31485 return NULL_TREE;
31486 }
31487 if (is_attribute_p ("callee_pop_aggregate_return", name))
31488 {
31489 tree cst;
31490
31491 cst = TREE_VALUE (args);
31492 if (TREE_CODE (cst) != INTEGER_CST)
31493 {
31494 warning (OPT_Wattributes,
31495 "%qE attribute requires an integer constant argument",
31496 name);
31497 *no_add_attrs = true;
31498 }
31499 else if (compare_tree_int (cst, 0) != 0
31500 && compare_tree_int (cst, 1) != 0)
31501 {
31502 warning (OPT_Wattributes,
31503 "argument to %qE attribute is neither zero, nor one",
31504 name);
31505 *no_add_attrs = true;
31506 }
31507
31508 return NULL_TREE;
31509 }
31510
31511 return NULL_TREE;
31512 }
31513
31514 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
31515 struct attribute_spec.handler. */
31516 static tree
31517 ix86_handle_abi_attribute (tree *node, tree name,
31518 tree args ATTRIBUTE_UNUSED,
31519 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31520 {
31521 if (TREE_CODE (*node) != FUNCTION_TYPE
31522 && TREE_CODE (*node) != METHOD_TYPE
31523 && TREE_CODE (*node) != FIELD_DECL
31524 && TREE_CODE (*node) != TYPE_DECL)
31525 {
31526 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31527 name);
31528 *no_add_attrs = true;
31529 return NULL_TREE;
31530 }
31531
31532 /* The "ms_abi" and "sysv_abi" attributes are mutually exclusive. */
31533 if (is_attribute_p ("ms_abi", name))
31534 {
31535 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31536 {
31537 error ("ms_abi and sysv_abi attributes are not compatible");
31538 }
31539
31540 return NULL_TREE;
31541 }
31542 else if (is_attribute_p ("sysv_abi", name))
31543 {
31544 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31545 {
31546 error ("ms_abi and sysv_abi attributes are not compatible");
31547 }
31548
31549 return NULL_TREE;
31550 }
31551
31552 return NULL_TREE;
31553 }
31554
31555 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31556 struct attribute_spec.handler. */
31557 static tree
31558 ix86_handle_struct_attribute (tree *node, tree name,
31559 tree args ATTRIBUTE_UNUSED,
31560 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31561 {
31562 tree *type = NULL;
31563 if (DECL_P (*node))
31564 {
31565 if (TREE_CODE (*node) == TYPE_DECL)
31566 type = &TREE_TYPE (*node);
31567 }
31568 else
31569 type = node;
31570
31571 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31572 || TREE_CODE (*type) == UNION_TYPE)))
31573 {
31574 warning (OPT_Wattributes, "%qE attribute ignored",
31575 name);
31576 *no_add_attrs = true;
31577 }
31578
31579 else if ((is_attribute_p ("ms_struct", name)
31580 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31581 || ((is_attribute_p ("gcc_struct", name)
31582 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31583 {
31584 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31585 name);
31586 *no_add_attrs = true;
31587 }
31588
31589 return NULL_TREE;
31590 }
31591
31592 static tree
31593 ix86_handle_fndecl_attribute (tree *node, tree name,
31594 tree args ATTRIBUTE_UNUSED,
31595 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31596 {
31597 if (TREE_CODE (*node) != FUNCTION_DECL)
31598 {
31599 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31600 name);
31601 *no_add_attrs = true;
31602 }
31603 return NULL_TREE;
31604 }
31605
31606 static bool
31607 ix86_ms_bitfield_layout_p (const_tree record_type)
31608 {
31609 return ((TARGET_MS_BITFIELD_LAYOUT
31610 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31611 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31612 }
31613
31614 /* Returns an expression indicating where the this parameter is
31615 located on entry to the FUNCTION. */
31616
31617 static rtx
31618 x86_this_parameter (tree function)
31619 {
31620 tree type = TREE_TYPE (function);
31621 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31622 int nregs;
31623
31624 if (TARGET_64BIT)
31625 {
31626 const int *parm_regs;
31627
31628 if (ix86_function_type_abi (type) == MS_ABI)
31629 parm_regs = x86_64_ms_abi_int_parameter_registers;
31630 else
31631 parm_regs = x86_64_int_parameter_registers;
31632 return gen_rtx_REG (DImode, parm_regs[aggr]);
31633 }
31634
31635 nregs = ix86_function_regparm (type, function);
31636
31637 if (nregs > 0 && !stdarg_p (type))
31638 {
31639 int regno;
31640 unsigned int ccvt = ix86_get_callcvt (type);
31641
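/* Added note: THIS is passed in a register here: ECX for fastcall/thiscall, EAX for plain regparm. A hidden aggregate-return pointer occupies the first slot and pushes THIS to the next register, or onto the stack when no register remains. */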
31642 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31643 regno = aggr ? DX_REG : CX_REG;
31644 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31645 {
31646 regno = CX_REG;
31647 if (aggr)
31648 return gen_rtx_MEM (SImode,
31649 plus_constant (stack_pointer_rtx, 4));
31650 }
31651 else
31652 {
31653 regno = AX_REG;
31654 if (aggr)
31655 {
31656 regno = DX_REG;
31657 if (nregs == 1)
31658 return gen_rtx_MEM (SImode,
31659 plus_constant (stack_pointer_rtx, 4));
31660 }
31661 }
31662 return gen_rtx_REG (SImode, regno);
31663 }
31664
31665 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31666 }
31667
31668 /* Determine whether x86_output_mi_thunk can succeed. */
31669
31670 static bool
31671 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31672 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31673 HOST_WIDE_INT vcall_offset, const_tree function)
31674 {
31675 /* 64-bit can handle anything. */
31676 if (TARGET_64BIT)
31677 return true;
31678
31679 /* For 32-bit, everything's fine if we have one free register. */
31680 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31681 return true;
31682
31683 /* Need a free register for vcall_offset. */
31684 if (vcall_offset)
31685 return false;
31686
31687 /* Need a free register for GOT references. */
31688 if (flag_pic && !targetm.binds_local_p (function))
31689 return false;
31690
31691 /* Otherwise ok. */
31692 return true;
31693 }
31694
31695 /* Output the assembler code for a thunk function. THUNK_DECL is the
31696 declaration for the thunk function itself, FUNCTION is the decl for
31697 the target function. DELTA is an immediate constant offset to be
31698 added to THIS. If VCALL_OFFSET is nonzero, the word at
31699 *(*this + vcall_offset) should be added to THIS. */
31700
31701 static void
31702 x86_output_mi_thunk (FILE *file,
31703 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31704 HOST_WIDE_INT vcall_offset, tree function)
31705 {
31706 rtx this_param = x86_this_parameter (function);
31707 rtx this_reg, tmp, fnaddr;
31708
31709 emit_note (NOTE_INSN_PROLOGUE_END);
31710
31711 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
31712 pull it in now and let DELTA benefit. */
31713 if (REG_P (this_param))
31714 this_reg = this_param;
31715 else if (vcall_offset)
31716 {
31717 /* Put the this parameter into %eax. */
31718 this_reg = gen_rtx_REG (Pmode, AX_REG);
31719 emit_move_insn (this_reg, this_param);
31720 }
31721 else
31722 this_reg = NULL_RTX;
31723
31724 /* Adjust the this parameter by a fixed constant. */
31725 if (delta)
31726 {
31727 rtx delta_rtx = GEN_INT (delta);
31728 rtx delta_dst = this_reg ? this_reg : this_param;
31729
31730 if (TARGET_64BIT)
31731 {
31732 if (!x86_64_general_operand (delta_rtx, Pmode))
31733 {
31734 tmp = gen_rtx_REG (Pmode, R10_REG);
31735 emit_move_insn (tmp, delta_rtx);
31736 delta_rtx = tmp;
31737 }
31738 }
31739
31740 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
31741 }
31742
31743 /* Adjust the this parameter by a value stored in the vtable. */
31744 if (vcall_offset)
31745 {
31746 rtx vcall_addr, vcall_mem, this_mem;
31747 unsigned int tmp_regno;
31748
31749 if (TARGET_64BIT)
31750 tmp_regno = R10_REG;
31751 else
31752 {
31753 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
31754 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
31755 tmp_regno = AX_REG;
31756 else
31757 tmp_regno = CX_REG;
31758 }
31759 tmp = gen_rtx_REG (Pmode, tmp_regno);
31760
31761 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
31762 if (Pmode != ptr_mode)
31763 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
31764 emit_move_insn (tmp, this_mem);
31765
31766 /* Adjust the this parameter. */
31767 vcall_addr = plus_constant (tmp, vcall_offset);
31768 if (TARGET_64BIT
31769 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
31770 {
31771 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
31772 emit_move_insn (tmp2, GEN_INT (vcall_offset));
31773 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
31774 }
31775
31776 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
31777 if (Pmode != ptr_mode)
31778 emit_insn (gen_addsi_1_zext (this_reg,
31779 gen_rtx_REG (ptr_mode,
31780 REGNO (this_reg)),
31781 vcall_mem));
31782 else
31783 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
31784 }
31785
31786 /* If necessary, drop THIS back to its stack slot. */
31787 if (this_reg && this_reg != this_param)
31788 emit_move_insn (this_param, this_reg);
31789
31790 fnaddr = XEXP (DECL_RTL (function), 0);
31791 if (TARGET_64BIT)
31792 {
31793 if (!flag_pic || targetm.binds_local_p (function)
31794 || cfun->machine->call_abi == MS_ABI)
31795 ;
31796 else
31797 {
31798 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
31799 tmp = gen_rtx_CONST (Pmode, tmp);
31800 fnaddr = gen_rtx_MEM (Pmode, tmp);
31801 }
31802 }
31803 else
31804 {
31805 if (!flag_pic || targetm.binds_local_p (function))
31806 ;
31807 #if TARGET_MACHO
31808 else if (TARGET_MACHO)
31809 {
31810 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
31811 fnaddr = XEXP (fnaddr, 0);
31812 }
31813 #endif /* TARGET_MACHO */
31814 else
31815 {
31816 tmp = gen_rtx_REG (Pmode, CX_REG);
31817 output_set_got (tmp, NULL_RTX);
31818
31819 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
31820 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
31821 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
31822 }
31823 }
31824
31825 /* Our sibling call patterns do not allow memories, because we have no
31826 predicate that can distinguish between frame and non-frame memory.
31827 For our purposes here, we can get away with (ab)using a jump pattern,
31828 because we're going to do no optimization. */
31829 if (MEM_P (fnaddr))
31830 emit_jump_insn (gen_indirect_jump (fnaddr));
31831 else
31832 {
31833 tmp = gen_rtx_MEM (QImode, fnaddr);
31834 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
31835 tmp = emit_call_insn (tmp);
31836 SIBLING_CALL_P (tmp) = 1;
31837 }
31838 emit_barrier ();
31839
31840 /* Emit just enough of rest_of_compilation to get the insns emitted.
31841 Note that use_thunk calls assemble_start_function et al. */
31842 tmp = get_insns ();
31843 insn_locators_alloc ();
31844 shorten_branches (tmp);
31845 final_start_function (tmp, file, 1);
31846 final (tmp, file, 1);
31847 final_end_function ();
31848 }
31849
31850 static void
31851 x86_file_start (void)
31852 {
31853 default_file_start ();
31854 #if TARGET_MACHO
31855 darwin_file_start ();
31856 #endif
31857 if (X86_FILE_START_VERSION_DIRECTIVE)
31858 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
31859 if (X86_FILE_START_FLTUSED)
31860 fputs ("\t.global\t__fltused\n", asm_out_file);
31861 if (ix86_asm_dialect == ASM_INTEL)
31862 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
31863 }
31864
31865 int
31866 x86_field_alignment (tree field, int computed)
31867 {
31868 enum machine_mode mode;
31869 tree type = TREE_TYPE (field);
31870
31871 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
31872 return computed;
31873 mode = TYPE_MODE (strip_array_types (type));
31874 if (mode == DFmode || mode == DCmode
31875 || GET_MODE_CLASS (mode) == MODE_INT
31876 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
31877 return MIN (32, computed);
31878 return computed;
31879 }
31880
31881 /* Output assembler code to FILE to increment profiler label # LABELNO
31882 for profiling a function entry. */
31883 void
31884 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
31885 {
31886 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
31887 : MCOUNT_NAME);
31888
31889 if (TARGET_64BIT)
31890 {
31891 #ifndef NO_PROFILE_COUNTERS
31892 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
31893 #endif
31894
31895 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
31896 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
31897 else
31898 fprintf (file, "\tcall\t%s\n", mcount_name);
31899 }
31900 else if (flag_pic)
31901 {
31902 #ifndef NO_PROFILE_COUNTERS
31903 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
31904 LPREFIX, labelno);
31905 #endif
31906 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
31907 }
31908 else
31909 {
31910 #ifndef NO_PROFILE_COUNTERS
31911 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
31912 LPREFIX, labelno);
31913 #endif
31914 fprintf (file, "\tcall\t%s\n", mcount_name);
31915 }
31916 }
31917
31918 /* We don't have exact information about the insn sizes, but we may assume
31919 quite safely that we are informed about all 1 byte insns and memory
31920 address sizes. This is enough to eliminate unnecessary padding in
31921 99% of cases. */
31922
31923 static int
31924 min_insn_size (rtx insn)
31925 {
31926 int l = 0, len;
31927
31928 if (!INSN_P (insn) || !active_insn_p (insn))
31929 return 0;
31930
31931 /* Discard alignment directives we have emitted, and jump table data. */
31932 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
31933 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
31934 return 0;
31935 if (JUMP_TABLE_DATA_P (insn))
31936 return 0;
31937
31938 /* Important case - calls are always 5 bytes.
31939 It is common to have many calls in a row. */
31940 if (CALL_P (insn)
31941 && symbolic_reference_mentioned_p (PATTERN (insn))
31942 && !SIBLING_CALL_P (insn))
31943 return 5;
31944 len = get_attr_length (insn);
31945 if (len <= 1)
31946 return 1;
31947
31948 /* For normal instructions we rely on get_attr_length being exact,
31949 with a few exceptions. */
31950 if (!JUMP_P (insn))
31951 {
31952 enum attr_type type = get_attr_type (insn);
31953
31954 switch (type)
31955 {
31956 case TYPE_MULTI:
31957 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
31958 || asm_noperands (PATTERN (insn)) >= 0)
31959 return 0;
31960 break;
31961 case TYPE_OTHER:
31962 case TYPE_FCMP:
31963 break;
31964 default:
31965 /* Otherwise trust get_attr_length. */
31966 return len;
31967 }
31968
31969 l = get_attr_length_address (insn);
31970 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
31971 l = 4;
31972 }
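/* Added note: estimate one opcode byte plus the address bytes when an address length is known, otherwise assume two bytes. */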
31973 if (l)
31974 return 1+l;
31975 else
31976 return 2;
31977 }
31978
31979 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
31980
31981 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
31982 window. */
31983
31984 static void
31985 ix86_avoid_jump_mispredicts (void)
31986 {
31987 rtx insn, start = get_insns ();
31988 int nbytes = 0, njumps = 0;
31989 int isjump = 0;
31990
31991 /* Look for all minimal intervals of instructions containing 4 jumps.
31992 The intervals are bounded by START and INSN. NBYTES is the total
31993 size of the instructions in the interval, including INSN but not
31994 including START. When NBYTES is smaller than 16, it is possible
31995 that the end of START and INSN end up in the same 16-byte window.
31996 
31997 The smallest offset in the window at which INSN can start is the
31998 case where START ends at offset 0; the offset of INSN is then
31999 NBYTES - sizeof (INSN). We add a p2align to the 16-byte window
32000 with maxskip 15 - NBYTES + sizeof (INSN). */
32001 for (insn = start; insn; insn = NEXT_INSN (insn))
32002 {
32003 int min_size;
32004
32005 if (LABEL_P (insn))
32006 {
32007 int align = label_to_alignment (insn);
32008 int max_skip = label_to_max_skip (insn);
32009
32010 if (max_skip > 15)
32011 max_skip = 15;
32012 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32013 already in the current 16 byte page, because otherwise
32014 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32015 bytes to reach 16 byte boundary. */
32016 if (align <= 0
32017 || (align <= 3 && max_skip != (1 << align) - 1))
32018 max_skip = 0;
32019 if (dump_file)
32020 fprintf (dump_file, "Label %i with max_skip %i\n",
32021 INSN_UID (insn), max_skip);
32022 if (max_skip)
32023 {
32024 while (nbytes + max_skip >= 16)
32025 {
32026 start = NEXT_INSN (start);
32027 if ((JUMP_P (start)
32028 && GET_CODE (PATTERN (start)) != ADDR_VEC
32029 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32030 || CALL_P (start))
32031 njumps--, isjump = 1;
32032 else
32033 isjump = 0;
32034 nbytes -= min_insn_size (start);
32035 }
32036 }
32037 continue;
32038 }
32039
32040 min_size = min_insn_size (insn);
32041 nbytes += min_size;
32042 if (dump_file)
32043 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32044 INSN_UID (insn), min_size);
32045 if ((JUMP_P (insn)
32046 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32047 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32048 || CALL_P (insn))
32049 njumps++;
32050 else
32051 continue;
32052
32053 while (njumps > 3)
32054 {
32055 start = NEXT_INSN (start);
32056 if ((JUMP_P (start)
32057 && GET_CODE (PATTERN (start)) != ADDR_VEC
32058 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32059 || CALL_P (start))
32060 njumps--, isjump = 1;
32061 else
32062 isjump = 0;
32063 nbytes -= min_insn_size (start);
32064 }
32065 gcc_assert (njumps >= 0);
32066 if (dump_file)
32067 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32068 INSN_UID (start), INSN_UID (insn), nbytes);
32069
32070 if (njumps == 3 && isjump && nbytes < 16)
32071 {
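/* Added note: four jumps could otherwise end up in the same 16-byte window; pad so that INSN starts in the next window. */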
32072 int padsize = 15 - nbytes + min_insn_size (insn);
32073
32074 if (dump_file)
32075 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32076 INSN_UID (insn), padsize);
32077 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32078 }
32079 }
32080 }
32081 #endif
32082
32083 /* The AMD Athlon works faster
32084 when RET is not the destination of a conditional jump or directly preceded
32085 by another jump instruction. We avoid the penalty by inserting a NOP just
32086 before such RET instructions. */
32087 static void
32088 ix86_pad_returns (void)
32089 {
32090 edge e;
32091 edge_iterator ei;
32092
32093 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32094 {
32095 basic_block bb = e->src;
32096 rtx ret = BB_END (bb);
32097 rtx prev;
32098 bool replace = false;
32099
32100 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32101 || optimize_bb_for_size_p (bb))
32102 continue;
32103 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32104 if (active_insn_p (prev) || LABEL_P (prev))
32105 break;
32106 if (prev && LABEL_P (prev))
32107 {
32108 edge e;
32109 edge_iterator ei;
32110
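/* Added note: the RET is preceded by a label; if any predecessor reaches this block through a taken (non-fallthrough) branch, the RET is effectively a jump target and needs padding. */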
32111 FOR_EACH_EDGE (e, ei, bb->preds)
32112 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32113 && !(e->flags & EDGE_FALLTHRU))
32114 replace = true;
32115 }
32116 if (!replace)
32117 {
32118 prev = prev_active_insn (ret);
32119 if (prev
32120 && ((JUMP_P (prev) && any_condjump_p (prev))
32121 || CALL_P (prev)))
32122 replace = true;
32123 /* Empty functions get a branch mispredict even when
32124 the jump destination is not visible to us. */
32125 if (!prev && !optimize_function_for_size_p (cfun))
32126 replace = true;
32127 }
32128 if (replace)
32129 {
32130 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32131 delete_insn (ret);
32132 }
32133 }
32134 }
32135
32136 /* Count the minimum number of instructions in BB. Return 4 if the
32137 number of instructions >= 4. */
32138
32139 static int
32140 ix86_count_insn_bb (basic_block bb)
32141 {
32142 rtx insn;
32143 int insn_count = 0;
32144
32145 /* Count number of instructions in this block. Return 4 if the number
32146 of instructions >= 4. */
32147 FOR_BB_INSNS (bb, insn)
32148 {
32149 /* Return jumps only happen in exit blocks. */
32150 if (JUMP_P (insn)
32151 && ANY_RETURN_P (PATTERN (insn)))
32152 break;
32153
32154 if (NONDEBUG_INSN_P (insn)
32155 && GET_CODE (PATTERN (insn)) != USE
32156 && GET_CODE (PATTERN (insn)) != CLOBBER)
32157 {
32158 insn_count++;
32159 if (insn_count >= 4)
32160 return insn_count;
32161 }
32162 }
32163
32164 return insn_count;
32165 }
32166
32167
32168 /* Count the minimum number of instructions in code path in BB.
32169 Return 4 if the number of instructions >= 4. */
32170
32171 static int
32172 ix86_count_insn (basic_block bb)
32173 {
32174 edge e;
32175 edge_iterator ei;
32176 int min_prev_count;
32177
32178 /* Only bother counting instructions along paths with no
32179 more than 2 basic blocks between entry and exit. Given
32180 that BB has an edge to exit, determine if a predecessor
32181 of BB has an edge from entry. If so, compute the number
32182 of instructions in the predecessor block. If there
32183 happen to be multiple such blocks, compute the minimum. */
32184 min_prev_count = 4;
32185 FOR_EACH_EDGE (e, ei, bb->preds)
32186 {
32187 edge prev_e;
32188 edge_iterator prev_ei;
32189
32190 if (e->src == ENTRY_BLOCK_PTR)
32191 {
32192 min_prev_count = 0;
32193 break;
32194 }
32195 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32196 {
32197 if (prev_e->src == ENTRY_BLOCK_PTR)
32198 {
32199 int count = ix86_count_insn_bb (e->src);
32200 if (count < min_prev_count)
32201 min_prev_count = count;
32202 break;
32203 }
32204 }
32205 }
32206
32207 if (min_prev_count < 4)
32208 min_prev_count += ix86_count_insn_bb (bb);
32209
32210 return min_prev_count;
32211 }
32212
32213 /* Pad short functions to 4 instructions. */
32214
32215 static void
32216 ix86_pad_short_function (void)
32217 {
32218 edge e;
32219 edge_iterator ei;
32220
32221 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32222 {
32223 rtx ret = BB_END (e->src);
32224 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32225 {
32226 int insn_count = ix86_count_insn (e->src);
32227
32228 /* Pad short function. */
32229 if (insn_count < 4)
32230 {
32231 rtx insn = ret;
32232
32233 /* Find epilogue. */
32234 while (insn
32235 && (!NOTE_P (insn)
32236 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32237 insn = PREV_INSN (insn);
32238
32239 if (!insn)
32240 insn = ret;
32241
32242 /* Two NOPs count as one instruction. */
32243 insn_count = 2 * (4 - insn_count);
32244 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32245 }
32246 }
32247 }
32248 }
32249
32250 /* Implement machine specific optimizations. We implement padding of returns
32251 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
32252 static void
32253 ix86_reorg (void)
32254 {
32255 /* We are freeing block_for_insn in the toplev to keep compatibility
32256 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32257 compute_bb_for_insn ();
32258
32259 /* Run the vzeroupper optimization if needed. */
32260 if (TARGET_VZEROUPPER)
32261 move_or_delete_vzeroupper ();
32262
32263 if (optimize && optimize_function_for_speed_p (cfun))
32264 {
32265 if (TARGET_PAD_SHORT_FUNCTION)
32266 ix86_pad_short_function ();
32267 else if (TARGET_PAD_RETURNS)
32268 ix86_pad_returns ();
32269 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32270 if (TARGET_FOUR_JUMP_LIMIT)
32271 ix86_avoid_jump_mispredicts ();
32272 #endif
32273 }
32274 }
32275
32276 /* Return nonzero when a QImode register that must be represented via a REX
32277 prefix is used. */
32278 bool
32279 x86_extended_QIreg_mentioned_p (rtx insn)
32280 {
32281 int i;
32282 extract_insn_cached (insn);
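/* Added note: any general-purpose register above BX (SP, BP, SI, DI and the REX registers) needs a REX prefix when accessed in QImode. */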
32283 for (i = 0; i < recog_data.n_operands; i++)
32284 if (REG_P (recog_data.operand[i])
32285 && REGNO (recog_data.operand[i]) > BX_REG)
32286 return true;
32287 return false;
32288 }
32289
32290 /* Return nonzero when P points to a register encoded via a REX prefix.
32291 Called via for_each_rtx. */
32292 static int
32293 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32294 {
32295 unsigned int regno;
32296 if (!REG_P (*p))
32297 return 0;
32298 regno = REGNO (*p);
32299 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32300 }
32301
32302 /* Return true when INSN mentions a register that must be encoded using a
32303 REX prefix. */
32304 bool
32305 x86_extended_reg_mentioned_p (rtx insn)
32306 {
32307 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32308 extended_reg_mentioned_1, NULL);
32309 }
32310
32311 /* If profitable, negate (without causing overflow) integer constant
32312 of mode MODE at location LOC. Return true in this case. */
32313 bool
32314 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32315 {
32316 HOST_WIDE_INT val;
32317
32318 if (!CONST_INT_P (*loc))
32319 return false;
32320
32321 switch (mode)
32322 {
32323 case DImode:
32324 /* DImode x86_64 constants must fit in 32 bits. */
32325 gcc_assert (x86_64_immediate_operand (*loc, mode));
32326
32327 mode = SImode;
32328 break;
32329
32330 case SImode:
32331 case HImode:
32332 case QImode:
32333 break;
32334
32335 default:
32336 gcc_unreachable ();
32337 }
32338
32339 /* Avoid overflows. */
32340 if (mode_signbit_p (mode, *loc))
32341 return false;
32342
32343 val = INTVAL (*loc);
32344
32345 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32346 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32347 if ((val < 0 && val != -128)
32348 || val == 128)
32349 {
32350 *loc = GEN_INT (-val);
32351 return true;
32352 }
32353
32354 return false;
32355 }
32356
32357 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32358 optabs would emit if we didn't have TFmode patterns. */
32359
32360 void
32361 x86_emit_floatuns (rtx operands[2])
32362 {
32363 rtx neglab, donelab, i0, i1, f0, in, out;
32364 enum machine_mode mode, inmode;
32365
32366 inmode = GET_MODE (operands[1]);
32367 gcc_assert (inmode == SImode || inmode == DImode);
32368
32369 out = operands[0];
32370 in = force_reg (inmode, operands[1]);
32371 mode = GET_MODE (out);
32372 neglab = gen_label_rtx ();
32373 donelab = gen_label_rtx ();
32374 f0 = gen_reg_rtx (mode);
32375
32376 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32377
32378 expand_float (out, in, 0);
32379
32380 emit_jump_insn (gen_jump (donelab));
32381 emit_barrier ();
32382
32383 emit_label (neglab);
32384
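/* Added note: the input has its top bit set when viewed as signed. Halve it while folding the lost low bit back in (so the final rounding is correct), convert, then double the result. */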
32385 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32386 1, OPTAB_DIRECT);
32387 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32388 1, OPTAB_DIRECT);
32389 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32390
32391 expand_float (f0, i0, 0);
32392
32393 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32394
32395 emit_label (donelab);
32396 }
32397 \f
32398 /* AVX2 does support 32-byte integer vector operations,
32399 thus the longest vector we are faced with is V32QImode. */
32400 #define MAX_VECT_LEN 32
32401
32402 struct expand_vec_perm_d
32403 {
32404 rtx target, op0, op1;
32405 unsigned char perm[MAX_VECT_LEN];
32406 enum machine_mode vmode;
32407 unsigned char nelt;
32408 bool testing_p;
32409 };
32410
32411 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32412 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32413
32414 /* Get a vector mode of the same size as the original but with elements
32415 twice as wide. This is only guaranteed to apply to integral vectors. */
32416
32417 static inline enum machine_mode
32418 get_mode_wider_vector (enum machine_mode o)
32419 {
32420 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32421 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32422 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32423 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32424 return n;
32425 }
32426
32427 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32428 with all elements equal to VAR. Return true if successful. */
32429
32430 static bool
32431 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32432 rtx target, rtx val)
32433 {
32434 bool ok;
32435
32436 switch (mode)
32437 {
32438 case V2SImode:
32439 case V2SFmode:
32440 if (!mmx_ok)
32441 return false;
32442 /* FALLTHRU */
32443
32444 case V4DFmode:
32445 case V4DImode:
32446 case V8SFmode:
32447 case V8SImode:
32448 case V2DFmode:
32449 case V2DImode:
32450 case V4SFmode:
32451 case V4SImode:
32452 {
32453 rtx insn, dup;
32454
32455 /* First attempt to recognize VAL as-is. */
32456 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32457 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32458 if (recog_memoized (insn) < 0)
32459 {
32460 rtx seq;
32461 /* If that fails, force VAL into a register. */
32462
32463 start_sequence ();
32464 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32465 seq = get_insns ();
32466 end_sequence ();
32467 if (seq)
32468 emit_insn_before (seq, insn);
32469
32470 ok = recog_memoized (insn) >= 0;
32471 gcc_assert (ok);
32472 }
32473 }
32474 return true;
32475
32476 case V4HImode:
32477 if (!mmx_ok)
32478 return false;
32479 if (TARGET_SSE || TARGET_3DNOW_A)
32480 {
32481 rtx x;
32482
32483 val = gen_lowpart (SImode, val);
32484 x = gen_rtx_TRUNCATE (HImode, val);
32485 x = gen_rtx_VEC_DUPLICATE (mode, x);
32486 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32487 return true;
32488 }
32489 goto widen;
32490
32491 case V8QImode:
32492 if (!mmx_ok)
32493 return false;
32494 goto widen;
32495
32496 case V8HImode:
32497 if (TARGET_SSE2)
32498 {
32499 struct expand_vec_perm_d dperm;
32500 rtx tmp1, tmp2;
32501
32502 permute:
32503 memset (&dperm, 0, sizeof (dperm));
32504 dperm.target = target;
32505 dperm.vmode = mode;
32506 dperm.nelt = GET_MODE_NUNITS (mode);
32507 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32508
32509 /* Extend to SImode using a paradoxical SUBREG. */
32510 tmp1 = gen_reg_rtx (SImode);
32511 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32512
32513 /* Insert the SImode value as low element of a V4SImode vector. */
32514 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32515 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32516
32517 ok = (expand_vec_perm_1 (&dperm)
32518 || expand_vec_perm_broadcast_1 (&dperm));
32519 gcc_assert (ok);
32520 return ok;
32521 }
32522 goto widen;
32523
32524 case V16QImode:
32525 if (TARGET_SSE2)
32526 goto permute;
32527 goto widen;
32528
32529 widen:
32530 /* Replicate the value once into the next wider mode and recurse. */
32531 {
32532 enum machine_mode smode, wsmode, wvmode;
32533 rtx x;
32534
32535 smode = GET_MODE_INNER (mode);
32536 wvmode = get_mode_wider_vector (mode);
32537 wsmode = GET_MODE_INNER (wvmode);
32538
32539 val = convert_modes (wsmode, smode, val, true);
32540 x = expand_simple_binop (wsmode, ASHIFT, val,
32541 GEN_INT (GET_MODE_BITSIZE (smode)),
32542 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32543 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32544
32545 x = gen_lowpart (wvmode, target);
32546 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32547 gcc_assert (ok);
32548 return ok;
32549 }
32550
32551 case V16HImode:
32552 case V32QImode:
32553 {
32554 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32555 rtx x = gen_reg_rtx (hvmode);
32556
32557 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32558 gcc_assert (ok);
32559
32560 x = gen_rtx_VEC_CONCAT (mode, x, x);
32561 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32562 }
32563 return true;
32564
32565 default:
32566 return false;
32567 }
32568 }
32569
32570 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32571 whose ONE_VAR element is VAR, and other elements are zero. Return true
32572 if successful. */
32573
32574 static bool
32575 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32576 rtx target, rtx var, int one_var)
32577 {
32578 enum machine_mode vsimode;
32579 rtx new_target;
32580 rtx x, tmp;
32581 bool use_vector_set = false;
32582
32583 switch (mode)
32584 {
32585 case V2DImode:
32586 /* For SSE4.1, we normally use vector set. But if the second
32587 element is zero and inter-unit moves are OK, we use movq
32588 instead. */
32589 use_vector_set = (TARGET_64BIT
32590 && TARGET_SSE4_1
32591 && !(TARGET_INTER_UNIT_MOVES
32592 && one_var == 0));
32593 break;
32594 case V16QImode:
32595 case V4SImode:
32596 case V4SFmode:
32597 use_vector_set = TARGET_SSE4_1;
32598 break;
32599 case V8HImode:
32600 use_vector_set = TARGET_SSE2;
32601 break;
32602 case V4HImode:
32603 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32604 break;
32605 case V32QImode:
32606 case V16HImode:
32607 case V8SImode:
32608 case V8SFmode:
32609 case V4DFmode:
32610 use_vector_set = TARGET_AVX;
32611 break;
32612 case V4DImode:
32613 /* Use ix86_expand_vector_set in 64bit mode only. */
32614 use_vector_set = TARGET_AVX && TARGET_64BIT;
32615 break;
32616 default:
32617 break;
32618 }
32619
32620 if (use_vector_set)
32621 {
32622 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32623 var = force_reg (GET_MODE_INNER (mode), var);
32624 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32625 return true;
32626 }
32627
32628 switch (mode)
32629 {
32630 case V2SFmode:
32631 case V2SImode:
32632 if (!mmx_ok)
32633 return false;
32634 /* FALLTHRU */
32635
32636 case V2DFmode:
32637 case V2DImode:
32638 if (one_var != 0)
32639 return false;
32640 var = force_reg (GET_MODE_INNER (mode), var);
32641 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32642 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32643 return true;
32644
32645 case V4SFmode:
32646 case V4SImode:
32647 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32648 new_target = gen_reg_rtx (mode);
32649 else
32650 new_target = target;
32651 var = force_reg (GET_MODE_INNER (mode), var);
32652 x = gen_rtx_VEC_DUPLICATE (mode, var);
32653 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32654 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32655 if (one_var != 0)
32656 {
32657 /* We need to shuffle the value to the correct position, so
32658 create a new pseudo to store the intermediate result. */
32659
32660 /* With SSE2, we can use the integer shuffle insns. */
32661 if (mode != V4SFmode && TARGET_SSE2)
32662 {
32663 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32664 const1_rtx,
32665 GEN_INT (one_var == 1 ? 0 : 1),
32666 GEN_INT (one_var == 2 ? 0 : 1),
32667 GEN_INT (one_var == 3 ? 0 : 1)));
32668 if (target != new_target)
32669 emit_move_insn (target, new_target);
32670 return true;
32671 }
32672
32673 /* Otherwise convert the intermediate result to V4SFmode and
32674 use the SSE1 shuffle instructions. */
32675 if (mode != V4SFmode)
32676 {
32677 tmp = gen_reg_rtx (V4SFmode);
32678 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32679 }
32680 else
32681 tmp = new_target;
32682
32683 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32684 const1_rtx,
32685 GEN_INT (one_var == 1 ? 0 : 1),
32686 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32687 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32688
32689 if (mode != V4SFmode)
32690 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32691 else if (tmp != target)
32692 emit_move_insn (target, tmp);
32693 }
32694 else if (target != new_target)
32695 emit_move_insn (target, new_target);
32696 return true;
32697
32698 case V8HImode:
32699 case V16QImode:
32700 vsimode = V4SImode;
32701 goto widen;
32702 case V4HImode:
32703 case V8QImode:
32704 if (!mmx_ok)
32705 return false;
32706 vsimode = V2SImode;
32707 goto widen;
32708 widen:
32709 if (one_var != 0)
32710 return false;
32711
32712 /* Zero extend the variable element to SImode and recurse. */
32713 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
32714
32715 x = gen_reg_rtx (vsimode);
32716 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
32717 var, one_var))
32718 gcc_unreachable ();
32719
32720 emit_move_insn (target, gen_lowpart (mode, x));
32721 return true;
32722
32723 default:
32724 return false;
32725 }
32726 }
32727
32728 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32729 consisting of the values in VALS. It is known that all elements
32730 except ONE_VAR are constants. Return true if successful. */
32731
32732 static bool
32733 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
32734 rtx target, rtx vals, int one_var)
32735 {
32736 rtx var = XVECEXP (vals, 0, one_var);
32737 enum machine_mode wmode;
32738 rtx const_vec, x;
32739
32740 const_vec = copy_rtx (vals);
32741 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
32742 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
32743
32744 switch (mode)
32745 {
32746 case V2DFmode:
32747 case V2DImode:
32748 case V2SFmode:
32749 case V2SImode:
32750 /* For the two element vectors, it's just as easy to use
32751 the general case. */
32752 return false;
32753
32754 case V4DImode:
32755 /* Use ix86_expand_vector_set in 64bit mode only. */
32756 if (!TARGET_64BIT)
32757 return false;
32758 case V4DFmode:
32759 case V8SFmode:
32760 case V8SImode:
32761 case V16HImode:
32762 case V32QImode:
32763 case V4SFmode:
32764 case V4SImode:
32765 case V8HImode:
32766 case V4HImode:
32767 break;
32768
32769 case V16QImode:
32770 if (TARGET_SSE4_1)
32771 break;
32772 wmode = V8HImode;
32773 goto widen;
32774 case V8QImode:
32775 wmode = V4HImode;
32776 goto widen;
32777 widen:
32778 /* There's no way to set one QImode entry easily. Combine
32779 the variable value with its adjacent constant value, and
32780 promote to an HImode set. */
32781 x = XVECEXP (vals, 0, one_var ^ 1);
32782 if (one_var & 1)
32783 {
32784 var = convert_modes (HImode, QImode, var, true);
32785 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
32786 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32787 x = GEN_INT (INTVAL (x) & 0xff);
32788 }
32789 else
32790 {
32791 var = convert_modes (HImode, QImode, var, true);
32792 x = gen_int_mode (INTVAL (x) << 8, HImode);
32793 }
32794 if (x != const0_rtx)
32795 var = expand_simple_binop (HImode, IOR, var, x, var,
32796 1, OPTAB_LIB_WIDEN);
32797
32798 x = gen_reg_rtx (wmode);
32799 emit_move_insn (x, gen_lowpart (wmode, const_vec));
32800 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
32801
32802 emit_move_insn (target, gen_lowpart (mode, x));
32803 return true;
32804
32805 default:
32806 return false;
32807 }
32808
32809 emit_move_insn (target, const_vec);
32810 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32811 return true;
32812 }
32813
32814 /* A subroutine of ix86_expand_vector_init_general. Use vector
32815 concatenate to handle the most general case: all values variable,
32816 and none identical. */
32817
32818 static void
32819 ix86_expand_vector_init_concat (enum machine_mode mode,
32820 rtx target, rtx *ops, int n)
32821 {
32822 enum machine_mode cmode, hmode = VOIDmode;
32823 rtx first[8], second[4];
32824 rtvec v;
32825 int i, j;
32826
32827 switch (n)
32828 {
32829 case 2:
32830 switch (mode)
32831 {
32832 case V8SImode:
32833 cmode = V4SImode;
32834 break;
32835 case V8SFmode:
32836 cmode = V4SFmode;
32837 break;
32838 case V4DImode:
32839 cmode = V2DImode;
32840 break;
32841 case V4DFmode:
32842 cmode = V2DFmode;
32843 break;
32844 case V4SImode:
32845 cmode = V2SImode;
32846 break;
32847 case V4SFmode:
32848 cmode = V2SFmode;
32849 break;
32850 case V2DImode:
32851 cmode = DImode;
32852 break;
32853 case V2SImode:
32854 cmode = SImode;
32855 break;
32856 case V2DFmode:
32857 cmode = DFmode;
32858 break;
32859 case V2SFmode:
32860 cmode = SFmode;
32861 break;
32862 default:
32863 gcc_unreachable ();
32864 }
32865
32866 if (!register_operand (ops[1], cmode))
32867 ops[1] = force_reg (cmode, ops[1]);
32868 if (!register_operand (ops[0], cmode))
32869 ops[0] = force_reg (cmode, ops[0]);
32870 emit_insn (gen_rtx_SET (VOIDmode, target,
32871 gen_rtx_VEC_CONCAT (mode, ops[0],
32872 ops[1])));
32873 break;
32874
32875 case 4:
32876 switch (mode)
32877 {
32878 case V4DImode:
32879 cmode = V2DImode;
32880 break;
32881 case V4DFmode:
32882 cmode = V2DFmode;
32883 break;
32884 case V4SImode:
32885 cmode = V2SImode;
32886 break;
32887 case V4SFmode:
32888 cmode = V2SFmode;
32889 break;
32890 default:
32891 gcc_unreachable ();
32892 }
32893 goto half;
32894
32895 case 8:
32896 switch (mode)
32897 {
32898 case V8SImode:
32899 cmode = V2SImode;
32900 hmode = V4SImode;
32901 break;
32902 case V8SFmode:
32903 cmode = V2SFmode;
32904 hmode = V4SFmode;
32905 break;
32906 default:
32907 gcc_unreachable ();
32908 }
32909 goto half;
32910
32911 half:
32912 /* FIXME: We process inputs backward to help RA. PR 36222. */
32913 i = n - 1;
32914 j = (n >> 1) - 1;
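/* Added note: pair adjacent elements into CMODE vectors; those pairs are then concatenated (recursively when more than two remain) to form the target. */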
32915 for (; i > 0; i -= 2, j--)
32916 {
32917 first[j] = gen_reg_rtx (cmode);
32918 v = gen_rtvec (2, ops[i - 1], ops[i]);
32919 ix86_expand_vector_init (false, first[j],
32920 gen_rtx_PARALLEL (cmode, v));
32921 }
32922
32923 n >>= 1;
32924 if (n > 2)
32925 {
32926 gcc_assert (hmode != VOIDmode);
32927 for (i = j = 0; i < n; i += 2, j++)
32928 {
32929 second[j] = gen_reg_rtx (hmode);
32930 ix86_expand_vector_init_concat (hmode, second [j],
32931 &first [i], 2);
32932 }
32933 n >>= 1;
32934 ix86_expand_vector_init_concat (mode, target, second, n);
32935 }
32936 else
32937 ix86_expand_vector_init_concat (mode, target, first, n);
32938 break;
32939
32940 default:
32941 gcc_unreachable ();
32942 }
32943 }
32944
32945 /* A subroutine of ix86_expand_vector_init_general. Use vector
32946 interleave to handle the most general case: all values variable,
32947 and none identical. */
32948
32949 static void
32950 ix86_expand_vector_init_interleave (enum machine_mode mode,
32951 rtx target, rtx *ops, int n)
32952 {
32953 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
32954 int i, j;
32955 rtx op0, op1;
32956 rtx (*gen_load_even) (rtx, rtx, rtx);
32957 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
32958 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
32959
32960 switch (mode)
32961 {
32962 case V8HImode:
32963 gen_load_even = gen_vec_setv8hi;
32964 gen_interleave_first_low = gen_vec_interleave_lowv4si;
32965 gen_interleave_second_low = gen_vec_interleave_lowv2di;
32966 inner_mode = HImode;
32967 first_imode = V4SImode;
32968 second_imode = V2DImode;
32969 third_imode = VOIDmode;
32970 break;
32971 case V16QImode:
32972 gen_load_even = gen_vec_setv16qi;
32973 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
32974 gen_interleave_second_low = gen_vec_interleave_lowv4si;
32975 inner_mode = QImode;
32976 first_imode = V8HImode;
32977 second_imode = V4SImode;
32978 third_imode = V2DImode;
32979 break;
32980 default:
32981 gcc_unreachable ();
32982 }
32983
32984 for (i = 0; i < n; i++)
32985 {
32986 /* Extend the odd element to SImode using a paradoxical SUBREG. */
32987 op0 = gen_reg_rtx (SImode);
32988 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
32989
32990 /* Insert the SImode value as low element of V4SImode vector. */
32991 op1 = gen_reg_rtx (V4SImode);
32992 op0 = gen_rtx_VEC_MERGE (V4SImode,
32993 gen_rtx_VEC_DUPLICATE (V4SImode,
32994 op0),
32995 CONST0_RTX (V4SImode),
32996 const1_rtx);
32997 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
32998
32999 /* Cast the V4SImode vector back to a vector in the original mode. */
33000 op0 = gen_reg_rtx (mode);
33001 emit_move_insn (op0, gen_lowpart (mode, op1));
33002
33003 /* Load even elements into the second position. */
33004 emit_insn (gen_load_even (op0,
33005 force_reg (inner_mode,
33006 ops [i + i + 1]),
33007 const1_rtx));
33008
33009 /* Cast vector to FIRST_IMODE vector. */
33010 ops[i] = gen_reg_rtx (first_imode);
33011 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33012 }
33013
33014 /* Interleave low FIRST_IMODE vectors. */
33015 for (i = j = 0; i < n; i += 2, j++)
33016 {
33017 op0 = gen_reg_rtx (first_imode);
33018 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33019
33020 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33021 ops[j] = gen_reg_rtx (second_imode);
33022 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33023 }
33024
33025 /* Interleave low SECOND_IMODE vectors. */
33026 switch (second_imode)
33027 {
33028 case V4SImode:
33029 for (i = j = 0; i < n / 2; i += 2, j++)
33030 {
33031 op0 = gen_reg_rtx (second_imode);
33032 emit_insn (gen_interleave_second_low (op0, ops[i],
33033 ops[i + 1]));
33034
33035 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33036 vector. */
33037 ops[j] = gen_reg_rtx (third_imode);
33038 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33039 }
33040 second_imode = V2DImode;
33041 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33042 /* FALLTHRU */
33043
33044 case V2DImode:
33045 op0 = gen_reg_rtx (second_imode);
33046 emit_insn (gen_interleave_second_low (op0, ops[0],
33047 ops[1]));
33048
33049 /* Cast the SECOND_IMODE vector back to a vector in the original
33050 mode. */
33051 emit_insn (gen_rtx_SET (VOIDmode, target,
33052 gen_lowpart (mode, op0)));
33053 break;
33054
33055 default:
33056 gcc_unreachable ();
33057 }
33058 }
33059
33060 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33061 all values variable, and none identical. */
33062
33063 static void
33064 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33065 rtx target, rtx vals)
33066 {
33067 rtx ops[32], op0, op1;
33068 enum machine_mode half_mode = VOIDmode;
33069 int n, i;
33070
33071 switch (mode)
33072 {
33073 case V2SFmode:
33074 case V2SImode:
33075 if (!mmx_ok && !TARGET_SSE)
33076 break;
33077 /* FALLTHRU */
33078
33079 case V8SFmode:
33080 case V8SImode:
33081 case V4DFmode:
33082 case V4DImode:
33083 case V4SFmode:
33084 case V4SImode:
33085 case V2DFmode:
33086 case V2DImode:
33087 n = GET_MODE_NUNITS (mode);
33088 for (i = 0; i < n; i++)
33089 ops[i] = XVECEXP (vals, 0, i);
33090 ix86_expand_vector_init_concat (mode, target, ops, n);
33091 return;
33092
33093 case V32QImode:
33094 half_mode = V16QImode;
33095 goto half;
33096
33097 case V16HImode:
33098 half_mode = V8HImode;
33099 goto half;
33100
33101 half:
33102 n = GET_MODE_NUNITS (mode);
33103 for (i = 0; i < n; i++)
33104 ops[i] = XVECEXP (vals, 0, i);
33105 op0 = gen_reg_rtx (half_mode);
33106 op1 = gen_reg_rtx (half_mode);
33107 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33108 n >> 2);
33109 ix86_expand_vector_init_interleave (half_mode, op1,
33110 &ops [n >> 1], n >> 2);
33111 emit_insn (gen_rtx_SET (VOIDmode, target,
33112 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33113 return;
33114
33115 case V16QImode:
33116 if (!TARGET_SSE4_1)
33117 break;
33118 /* FALLTHRU */
33119
33120 case V8HImode:
33121 if (!TARGET_SSE2)
33122 break;
33123
33124 /* Don't use ix86_expand_vector_init_interleave if we can't
33125 move from GPR to SSE register directly. */
33126 if (!TARGET_INTER_UNIT_MOVES)
33127 break;
33128
33129 n = GET_MODE_NUNITS (mode);
33130 for (i = 0; i < n; i++)
33131 ops[i] = XVECEXP (vals, 0, i);
33132 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33133 return;
33134
33135 case V4HImode:
33136 case V8QImode:
33137 break;
33138
33139 default:
33140 gcc_unreachable ();
33141 }
33142
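/* Added note: fallback path that builds the vector in integer registers one word at a time and then assembles the words into the vector register. */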
33143 {
33144 int i, j, n_elts, n_words, n_elt_per_word;
33145 enum machine_mode inner_mode;
33146 rtx words[4], shift;
33147
33148 inner_mode = GET_MODE_INNER (mode);
33149 n_elts = GET_MODE_NUNITS (mode);
33150 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33151 n_elt_per_word = n_elts / n_words;
33152 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33153
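/* Each word-sized chunk is assembled from its elements, starting with the
   highest-numbered element of the chunk and repeatedly shifting left by the
   element width and ORing in the next lower element, so element
   I*N_ELT_PER_WORD ends up in the low bits.  Roughly:
	word = elt[last];
	for (each lower elt)  word = (word << shift) | elt;  */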
33154 for (i = 0; i < n_words; ++i)
33155 {
33156 rtx word = NULL_RTX;
33157
33158 for (j = 0; j < n_elt_per_word; ++j)
33159 {
33160 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33161 elt = convert_modes (word_mode, inner_mode, elt, true);
33162
33163 if (j == 0)
33164 word = elt;
33165 else
33166 {
33167 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33168 word, 1, OPTAB_LIB_WIDEN);
33169 word = expand_simple_binop (word_mode, IOR, word, elt,
33170 word, 1, OPTAB_LIB_WIDEN);
33171 }
33172 }
33173
33174 words[i] = word;
33175 }
33176
33177 if (n_words == 1)
33178 emit_move_insn (target, gen_lowpart (mode, words[0]));
33179 else if (n_words == 2)
33180 {
33181 rtx tmp = gen_reg_rtx (mode);
33182 emit_clobber (tmp);
33183 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33184 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33185 emit_move_insn (target, tmp);
33186 }
33187 else if (n_words == 4)
33188 {
33189 rtx tmp = gen_reg_rtx (V4SImode);
33190 gcc_assert (word_mode == SImode);
33191 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33192 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33193 emit_move_insn (target, gen_lowpart (mode, tmp));
33194 }
33195 else
33196 gcc_unreachable ();
33197 }
33198 }
33199
33200 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33201 instructions unless MMX_OK is true. */
33202
33203 void
33204 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33205 {
33206 enum machine_mode mode = GET_MODE (target);
33207 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33208 int n_elts = GET_MODE_NUNITS (mode);
33209 int n_var = 0, one_var = -1;
33210 bool all_same = true, all_const_zero = true;
33211 int i;
33212 rtx x;
33213
33214 for (i = 0; i < n_elts; ++i)
33215 {
33216 x = XVECEXP (vals, 0, i);
33217 if (!(CONST_INT_P (x)
33218 || GET_CODE (x) == CONST_DOUBLE
33219 || GET_CODE (x) == CONST_FIXED))
33220 n_var++, one_var = i;
33221 else if (x != CONST0_RTX (inner_mode))
33222 all_const_zero = false;
33223 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33224 all_same = false;
33225 }
33226
33227 /* Constants are best loaded from the constant pool. */
33228 if (n_var == 0)
33229 {
33230 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33231 return;
33232 }
33233
33234 /* If all values are identical, broadcast the value. */
33235 if (all_same
33236 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33237 XVECEXP (vals, 0, 0)))
33238 return;
33239
33240 /* Values where only one field is non-constant are best loaded from
33241 the pool and overwritten via move later. */
33242 if (n_var == 1)
33243 {
33244 if (all_const_zero
33245 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33246 XVECEXP (vals, 0, one_var),
33247 one_var))
33248 return;
33249
33250 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33251 return;
33252 }
33253
33254 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33255 }
33256
33257 void
33258 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33259 {
33260 enum machine_mode mode = GET_MODE (target);
33261 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33262 enum machine_mode half_mode;
33263 bool use_vec_merge = false;
33264 rtx tmp;
33265 static rtx (*gen_extract[6][2]) (rtx, rtx)
33266 = {
33267 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33268 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33269 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33270 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33271 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33272 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33273 };
33274 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33275 = {
33276 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33277 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33278 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33279 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33280 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33281 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33282 };
33283 int i, j, n;
33284
33285 switch (mode)
33286 {
33287 case V2SFmode:
33288 case V2SImode:
33289 if (mmx_ok)
33290 {
33291 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33292 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33293 if (elt == 0)
33294 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33295 else
33296 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33297 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33298 return;
33299 }
33300 break;
33301
33302 case V2DImode:
33303 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33304 if (use_vec_merge)
33305 break;
33306
33307 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33308 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33309 if (elt == 0)
33310 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33311 else
33312 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33313 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33314 return;
33315
33316 case V2DFmode:
33317 {
33318 rtx op0, op1;
33319
33320 /* For the two element vectors, we implement a VEC_CONCAT with
33321 the extraction of the other element. */
33322
33323 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33324 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33325
33326 if (elt == 0)
33327 op0 = val, op1 = tmp;
33328 else
33329 op0 = tmp, op1 = val;
33330
33331 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33332 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33333 }
33334 return;
33335
33336 case V4SFmode:
33337 use_vec_merge = TARGET_SSE4_1;
33338 if (use_vec_merge)
33339 break;
33340
33341 switch (elt)
33342 {
33343 case 0:
33344 use_vec_merge = true;
33345 break;
33346
33347 case 1:
33348 /* tmp = target = A B C D */
33349 tmp = copy_to_reg (target);
33350 /* target = A A B B */
33351 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33352 /* target = X A B B */
33353 ix86_expand_vector_set (false, target, val, 0);
33354 /* target = A X C D */
33355 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33356 const1_rtx, const0_rtx,
33357 GEN_INT (2+4), GEN_INT (3+4)));
33358 return;
33359
33360 case 2:
33361 /* tmp = target = A B C D */
33362 tmp = copy_to_reg (target);
33363 /* tmp = X B C D */
33364 ix86_expand_vector_set (false, tmp, val, 0);
33365 /* target = A B X D */
33366 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33367 const0_rtx, const1_rtx,
33368 GEN_INT (0+4), GEN_INT (3+4)));
33369 return;
33370
33371 case 3:
33372 /* tmp = target = A B C D */
33373 tmp = copy_to_reg (target);
33374 /* tmp = X B C D */
33375 ix86_expand_vector_set (false, tmp, val, 0);
33376 /* target = A B C X */
33377 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33378 const0_rtx, const1_rtx,
33379 GEN_INT (2+4), GEN_INT (0+4)));
33380 return;
33381
33382 default:
33383 gcc_unreachable ();
33384 }
33385 break;
33386
33387 case V4SImode:
33388 use_vec_merge = TARGET_SSE4_1;
33389 if (use_vec_merge)
33390 break;
33391
33392 /* Element 0 handled by vec_merge below. */
33393 if (elt == 0)
33394 {
33395 use_vec_merge = true;
33396 break;
33397 }
33398
33399 if (TARGET_SSE2)
33400 {
33401 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33402 store into element 0, then shuffle them back. */
33403
33404 rtx order[4];
33405
33406 order[0] = GEN_INT (elt);
33407 order[1] = const1_rtx;
33408 order[2] = const2_rtx;
33409 order[3] = GEN_INT (3);
33410 order[elt] = const0_rtx;
33411
33412 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33413 order[1], order[2], order[3]));
33414
33415 ix86_expand_vector_set (false, target, val, 0);
33416
33417 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33418 order[1], order[2], order[3]));
33419 }
33420 else
33421 {
33422 /* For SSE1, we have to reuse the V4SF code. */
33423 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33424 gen_lowpart (SFmode, val), elt);
33425 }
33426 return;
33427
33428 case V8HImode:
33429 use_vec_merge = TARGET_SSE2;
33430 break;
33431 case V4HImode:
33432 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33433 break;
33434
33435 case V16QImode:
33436 use_vec_merge = TARGET_SSE4_1;
33437 break;
33438
33439 case V8QImode:
33440 break;
33441
33442 case V32QImode:
33443 half_mode = V16QImode;
33444 j = 0;
33445 n = 16;
33446 goto half;
33447
33448 case V16HImode:
33449 half_mode = V8HImode;
33450 j = 1;
33451 n = 8;
33452 goto half;
33453
33454 case V8SImode:
33455 half_mode = V4SImode;
33456 j = 2;
33457 n = 4;
33458 goto half;
33459
33460 case V4DImode:
33461 half_mode = V2DImode;
33462 j = 3;
33463 n = 2;
33464 goto half;
33465
33466 case V8SFmode:
33467 half_mode = V4SFmode;
33468 j = 4;
33469 n = 4;
33470 goto half;
33471
33472 case V4DFmode:
33473 half_mode = V2DFmode;
33474 j = 5;
33475 n = 2;
33476 goto half;
33477
33478 half:
33479 /* Compute offset. */
33480 i = elt / n;
33481 elt %= n;
33482
33483 gcc_assert (i <= 1);
33484
33485 /* Extract the half. */
33486 tmp = gen_reg_rtx (half_mode);
33487 emit_insn (gen_extract[j][i] (tmp, target));
33488
33489 /* Put val in tmp at elt. */
33490 ix86_expand_vector_set (false, tmp, val, elt);
33491
33492 /* Put it back. */
33493 emit_insn (gen_insert[j][i] (target, target, tmp));
33494 return;
33495
33496 default:
33497 break;
33498 }
33499
33500 if (use_vec_merge)
33501 {
33502 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33503 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33504 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33505 }
33506 else
33507 {
33508 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33509
33510 emit_move_insn (mem, target);
33511
33512 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33513 emit_move_insn (tmp, val);
33514
33515 emit_move_insn (target, mem);
33516 }
33517 }
33518
33519 void
33520 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33521 {
33522 enum machine_mode mode = GET_MODE (vec);
33523 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33524 bool use_vec_extr = false;
33525 rtx tmp;
33526
33527 switch (mode)
33528 {
33529 case V2SImode:
33530 case V2SFmode:
33531 if (!mmx_ok)
33532 break;
33533 /* FALLTHRU */
33534
33535 case V2DFmode:
33536 case V2DImode:
33537 use_vec_extr = true;
33538 break;
33539
33540 case V4SFmode:
33541 use_vec_extr = TARGET_SSE4_1;
33542 if (use_vec_extr)
33543 break;
33544
33545 switch (elt)
33546 {
33547 case 0:
33548 tmp = vec;
33549 break;
33550
33551 case 1:
33552 case 3:
33553 tmp = gen_reg_rtx (mode);
33554 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33555 GEN_INT (elt), GEN_INT (elt),
33556 GEN_INT (elt+4), GEN_INT (elt+4)));
33557 break;
33558
33559 case 2:
33560 tmp = gen_reg_rtx (mode);
33561 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33562 break;
33563
33564 default:
33565 gcc_unreachable ();
33566 }
33567 vec = tmp;
33568 use_vec_extr = true;
33569 elt = 0;
33570 break;
33571
33572 case V4SImode:
33573 use_vec_extr = TARGET_SSE4_1;
33574 if (use_vec_extr)
33575 break;
33576
33577 if (TARGET_SSE2)
33578 {
33579 switch (elt)
33580 {
33581 case 0:
33582 tmp = vec;
33583 break;
33584
33585 case 1:
33586 case 3:
33587 tmp = gen_reg_rtx (mode);
33588 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33589 GEN_INT (elt), GEN_INT (elt),
33590 GEN_INT (elt), GEN_INT (elt)));
33591 break;
33592
33593 case 2:
33594 tmp = gen_reg_rtx (mode);
33595 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33596 break;
33597
33598 default:
33599 gcc_unreachable ();
33600 }
33601 vec = tmp;
33602 use_vec_extr = true;
33603 elt = 0;
33604 }
33605 else
33606 {
33607 /* For SSE1, we have to reuse the V4SF code. */
33608 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33609 gen_lowpart (V4SFmode, vec), elt);
33610 return;
33611 }
33612 break;
33613
33614 case V8HImode:
33615 use_vec_extr = TARGET_SSE2;
33616 break;
33617 case V4HImode:
33618 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33619 break;
33620
33621 case V16QImode:
33622 use_vec_extr = TARGET_SSE4_1;
33623 break;
33624
33625 case V8SFmode:
33626 if (TARGET_AVX)
33627 {
33628 tmp = gen_reg_rtx (V4SFmode);
33629 if (elt < 4)
33630 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33631 else
33632 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33633 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33634 return;
33635 }
33636 break;
33637
33638 case V4DFmode:
33639 if (TARGET_AVX)
33640 {
33641 tmp = gen_reg_rtx (V2DFmode);
33642 if (elt < 2)
33643 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33644 else
33645 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33646 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33647 return;
33648 }
33649 break;
33650
33651 case V32QImode:
33652 if (TARGET_AVX)
33653 {
33654 tmp = gen_reg_rtx (V16QImode);
33655 if (elt < 16)
33656 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33657 else
33658 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33659 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33660 return;
33661 }
33662 break;
33663
33664 case V16HImode:
33665 if (TARGET_AVX)
33666 {
33667 tmp = gen_reg_rtx (V8HImode);
33668 if (elt < 8)
33669 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33670 else
33671 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33672 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33673 return;
33674 }
33675 break;
33676
33677 case V8SImode:
33678 if (TARGET_AVX)
33679 {
33680 tmp = gen_reg_rtx (V4SImode);
33681 if (elt < 4)
33682 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33683 else
33684 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33685 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33686 return;
33687 }
33688 break;
33689
33690 case V4DImode:
33691 if (TARGET_AVX)
33692 {
33693 tmp = gen_reg_rtx (V2DImode);
33694 if (elt < 2)
33695 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33696 else
33697 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33698 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33699 return;
33700 }
33701 break;
33702
33703 case V8QImode:
33704 /* ??? Could extract the appropriate HImode element and shift. */
33705 default:
33706 break;
33707 }
33708
33709 if (use_vec_extr)
33710 {
33711 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
33712 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
33713
33714 /* Let the rtl optimizers know about the zero extension performed. */
33715 if (inner_mode == QImode || inner_mode == HImode)
33716 {
33717 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
33718 target = gen_lowpart (SImode, target);
33719 }
33720
33721 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33722 }
33723 else
33724 {
33725 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33726
33727 emit_move_insn (mem, vec);
33728
33729 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33730 emit_move_insn (target, tmp);
33731 }
33732 }
33733
33734 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
33735 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
33736 The upper bits of DEST are undefined, though they shouldn't cause
33737 exceptions (some bits from src or all zeros are ok). */
33738
33739 static void
33740 emit_reduc_half (rtx dest, rtx src, int i)
33741 {
33742 rtx tem;
33743 switch (GET_MODE (src))
33744 {
33745 case V4SFmode:
33746 if (i == 128)
33747 tem = gen_sse_movhlps (dest, src, src);
33748 else
33749 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
33750 GEN_INT (1 + 4), GEN_INT (1 + 4));
33751 break;
33752 case V2DFmode:
33753 tem = gen_vec_interleave_highv2df (dest, src, src);
33754 break;
33755 case V16QImode:
33756 case V8HImode:
33757 case V4SImode:
33758 case V2DImode:
33759 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
33760 gen_lowpart (V1TImode, src),
33761 GEN_INT (i / 2));
33762 break;
33763 case V8SFmode:
33764 if (i == 256)
33765 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
33766 else
33767 tem = gen_avx_shufps256 (dest, src, src,
33768 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
33769 break;
33770 case V4DFmode:
33771 if (i == 256)
33772 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
33773 else
33774 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
33775 break;
33776 case V32QImode:
33777 case V16HImode:
33778 case V8SImode:
33779 case V4DImode:
33780 if (i == 256)
33781 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
33782 gen_lowpart (V4DImode, src),
33783 gen_lowpart (V4DImode, src),
33784 const1_rtx);
33785 else
33786 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
33787 gen_lowpart (V2TImode, src),
33788 GEN_INT (i / 2));
33789 break;
33790 default:
33791 gcc_unreachable ();
33792 }
33793 emit_insn (tem);
33794 }
33795
33796 /* Expand a vector reduction. FN is the binary pattern to reduce;
33797 DEST is the destination; IN is the input vector. */
33798
33799 void
33800 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
33801 {
33802 rtx half, dst, vec = in;
33803 enum machine_mode mode = GET_MODE (in);
33804 int i;
33805
33806 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
33807 if (TARGET_SSE4_1
33808 && mode == V8HImode
33809 && fn == gen_uminv8hi3)
33810 {
33811 emit_insn (gen_sse4_1_phminposuw (dest, in));
33812 return;
33813 }
33814
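/* Fold the vector in half repeatedly: each emit_reduc_half copies the upper
   half of VEC onto the lower half, FN combines the two halves, and the width
   I is halved until element 0 carries the full reduction (the remaining
   elements hold don't-care values).  */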
33815 for (i = GET_MODE_BITSIZE (mode);
33816 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
33817 i >>= 1)
33818 {
33819 half = gen_reg_rtx (mode);
33820 emit_reduc_half (half, vec, i);
33821 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
33822 dst = dest;
33823 else
33824 dst = gen_reg_rtx (mode);
33825 emit_insn (fn (dst, half, vec));
33826 vec = dst;
33827 }
33828 }
33829 \f
33830 /* Target hook for scalar_mode_supported_p. */
33831 static bool
33832 ix86_scalar_mode_supported_p (enum machine_mode mode)
33833 {
33834 if (DECIMAL_FLOAT_MODE_P (mode))
33835 return default_decimal_float_supported_p ();
33836 else if (mode == TFmode)
33837 return true;
33838 else
33839 return default_scalar_mode_supported_p (mode);
33840 }
33841
33842 /* Implements target hook vector_mode_supported_p. */
33843 static bool
33844 ix86_vector_mode_supported_p (enum machine_mode mode)
33845 {
33846 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33847 return true;
33848 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33849 return true;
33850 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33851 return true;
33852 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
33853 return true;
33854 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
33855 return true;
33856 return false;
33857 }
33858
33859 /* Target hook for c_mode_for_suffix. */
33860 static enum machine_mode
33861 ix86_c_mode_for_suffix (char suffix)
33862 {
33863 if (suffix == 'q')
33864 return TFmode;
33865 if (suffix == 'w')
33866 return XFmode;
33867
33868 return VOIDmode;
33869 }
33870
33871 /* Worker function for TARGET_MD_ASM_CLOBBERS.
33872
33873 We do this in the new i386 backend to maintain source compatibility
33874 with the old cc0-based compiler. */
33875
33876 static tree
33877 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
33878 tree inputs ATTRIBUTE_UNUSED,
33879 tree clobbers)
33880 {
33881 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
33882 clobbers);
33883 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
33884 clobbers);
33885 return clobbers;
33886 }
33887
33888 /* Implements the targetm.asm.encode_section_info target hook. */
33889
33890 static void ATTRIBUTE_UNUSED
33891 ix86_encode_section_info (tree decl, rtx rtl, int first)
33892 {
33893 default_encode_section_info (decl, rtl, first);
33894
33895 if (TREE_CODE (decl) == VAR_DECL
33896 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
33897 && ix86_in_large_data_p (decl))
33898 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
33899 }
33900
33901 /* Worker function for REVERSE_CONDITION. */
33902
33903 enum rtx_code
33904 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
33905 {
33906 return (mode != CCFPmode && mode != CCFPUmode
33907 ? reverse_condition (code)
33908 : reverse_condition_maybe_unordered (code));
33909 }
33910
33911 /* Output code to perform an x87 FP register move, from OPERANDS[1]
33912 to OPERANDS[0]. */
33913
33914 const char *
33915 output_387_reg_move (rtx insn, rtx *operands)
33916 {
33917 if (REG_P (operands[0]))
33918 {
33919 if (REG_P (operands[1])
33920 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33921 {
33922 if (REGNO (operands[0]) == FIRST_STACK_REG)
33923 return output_387_ffreep (operands, 0);
33924 return "fstp\t%y0";
33925 }
33926 if (STACK_TOP_P (operands[0]))
33927 return "fld%Z1\t%y1";
33928 return "fst\t%y0";
33929 }
33930 else if (MEM_P (operands[0]))
33931 {
33932 gcc_assert (REG_P (operands[1]));
33933 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
33934 return "fstp%Z0\t%y0";
33935 else
33936 {
33937 /* There is no non-popping store to memory for XFmode.
33938 So if we need one, follow the store with a load. */
33939 if (GET_MODE (operands[0]) == XFmode)
33940 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
33941 else
33942 return "fst%Z0\t%y0";
33943 }
33944 }
33945 else
33946 gcc_unreachable();
33947 }
33948
33949 /* Output code to perform a conditional jump to LABEL, if C2 flag in
33950 FP status register is set. */
33951
33952 void
33953 ix86_emit_fp_unordered_jump (rtx label)
33954 {
33955 rtx reg = gen_reg_rtx (HImode);
33956 rtx temp;
33957
33958 emit_insn (gen_x86_fnstsw_1 (reg));
33959
33960 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
33961 {
33962 emit_insn (gen_x86_sahf_1 (reg));
33963
33964 temp = gen_rtx_REG (CCmode, FLAGS_REG);
33965 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
33966 }
33967 else
33968 {
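/* SAHF is not usable here, so test the C2 condition flag directly.  FNSTSW
   stores C0..C3 in the high byte of the status word, where C2 corresponds
   to the 0x04 bit (standard x87 status-word layout).  */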
33969 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
33970
33971 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
33972 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
33973 }
33974
33975 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
33976 gen_rtx_LABEL_REF (VOIDmode, label),
33977 pc_rtx);
33978 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
33979
33980 emit_jump_insn (temp);
33981 predict_jump (REG_BR_PROB_BASE * 10 / 100);
33982 }
33983
33984 /* Output code to perform a log1p XFmode calculation. */
33985
33986 void ix86_emit_i387_log1p (rtx op0, rtx op1)
33987 {
33988 rtx label1 = gen_label_rtx ();
33989 rtx label2 = gen_label_rtx ();
33990
33991 rtx tmp = gen_reg_rtx (XFmode);
33992 rtx tmp2 = gen_reg_rtx (XFmode);
33993 rtx test;
33994
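/* fyl2xp1 is only specified for |op1| below roughly 1 - sqrt(2)/2, which is
   the constant tested against below; larger magnitudes take the label1 path
   and use fyl2x on (1.0 + op1) instead.  */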
33995 emit_insn (gen_absxf2 (tmp, op1));
33996 test = gen_rtx_GE (VOIDmode, tmp,
33997 CONST_DOUBLE_FROM_REAL_VALUE (
33998 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
33999 XFmode));
34000 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34001
34002 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34003 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34004 emit_jump (label2);
34005
34006 emit_label (label1);
34007 emit_move_insn (tmp, CONST1_RTX (XFmode));
34008 emit_insn (gen_addxf3 (tmp, op1, tmp));
34009 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34010 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34011
34012 emit_label (label2);
34013 }
34014
34015 /* Emit code for round calculation. */
34016 void ix86_emit_i387_round (rtx op0, rtx op1)
34017 {
34018 enum machine_mode inmode = GET_MODE (op1);
34019 enum machine_mode outmode = GET_MODE (op0);
34020 rtx e1, e2, res, tmp, tmp1, half;
34021 rtx scratch = gen_reg_rtx (HImode);
34022 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34023 rtx jump_label = gen_label_rtx ();
34024 rtx insn;
34025 rtx (*gen_abs) (rtx, rtx);
34026 rtx (*gen_neg) (rtx, rtx);
34027
34028 switch (inmode)
34029 {
34030 case SFmode:
34031 gen_abs = gen_abssf2;
34032 break;
34033 case DFmode:
34034 gen_abs = gen_absdf2;
34035 break;
34036 case XFmode:
34037 gen_abs = gen_absxf2;
34038 break;
34039 default:
34040 gcc_unreachable ();
34041 }
34042
34043 switch (outmode)
34044 {
34045 case SFmode:
34046 gen_neg = gen_negsf2;
34047 break;
34048 case DFmode:
34049 gen_neg = gen_negdf2;
34050 break;
34051 case XFmode:
34052 gen_neg = gen_negxf2;
34053 break;
34054 case HImode:
34055 gen_neg = gen_neghi2;
34056 break;
34057 case SImode:
34058 gen_neg = gen_negsi2;
34059 break;
34060 case DImode:
34061 gen_neg = gen_negdi2;
34062 break;
34063 default:
34064 gcc_unreachable ();
34065 }
34066
34067 e1 = gen_reg_rtx (inmode);
34068 e2 = gen_reg_rtx (inmode);
34069 res = gen_reg_rtx (outmode);
34070
34071 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34072
34073 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34074
34075 /* scratch = fxam(op1) */
34076 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34077 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34078 UNSPEC_FXAM)));
34079 /* e1 = fabs(op1) */
34080 emit_insn (gen_abs (e1, op1));
34081
34082 /* e2 = e1 + 0.5 */
34083 half = force_reg (inmode, half);
34084 emit_insn (gen_rtx_SET (VOIDmode, e2,
34085 gen_rtx_PLUS (inmode, e1, half)));
34086
34087 /* res = floor(e2) */
34088 if (inmode != XFmode)
34089 {
34090 tmp1 = gen_reg_rtx (XFmode);
34091
34092 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34093 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34094 }
34095 else
34096 tmp1 = e2;
34097
34098 switch (outmode)
34099 {
34100 case SFmode:
34101 case DFmode:
34102 {
34103 rtx tmp0 = gen_reg_rtx (XFmode);
34104
34105 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34106
34107 emit_insn (gen_rtx_SET (VOIDmode, res,
34108 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34109 UNSPEC_TRUNC_NOOP)));
34110 }
34111 break;
34112 case XFmode:
34113 emit_insn (gen_frndintxf2_floor (res, tmp1));
34114 break;
34115 case HImode:
34116 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34117 break;
34118 case SImode:
34119 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34120 break;
34121 case DImode:
34122 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34123 break;
34124 default:
34125 gcc_unreachable ();
34126 }
34127
34128 /* flags = signbit(a) */
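/* FXAM sets C1 to the sign of its operand; after FNSTSW that flag is the
   0x02 bit of the status word's high byte, which is what the test below
   examines.  */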
34129 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34130
34131 /* if (flags) then res = -res */
34132 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34133 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34134 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34135 pc_rtx);
34136 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34137 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34138 JUMP_LABEL (insn) = jump_label;
34139
34140 emit_insn (gen_neg (res, res));
34141
34142 emit_label (jump_label);
34143 LABEL_NUSES (jump_label) = 1;
34144
34145 emit_move_insn (op0, res);
34146 }
34147
34148 /* Output code to perform a Newton-Raphson approximation of a single precision
34149 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34150
34151 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34152 {
34153 rtx x0, x1, e0, e1;
34154
34155 x0 = gen_reg_rtx (mode);
34156 e0 = gen_reg_rtx (mode);
34157 e1 = gen_reg_rtx (mode);
34158 x1 = gen_reg_rtx (mode);
34159
34160 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
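/* This is one Newton-Raphson step for 1/b: with x0 = rcp(b),
   x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, which is the expression above;
   RES is then a * x1.  */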
34161
34162 b = force_reg (mode, b);
34163
34164 /* x0 = rcp(b) estimate */
34165 emit_insn (gen_rtx_SET (VOIDmode, x0,
34166 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34167 UNSPEC_RCP)));
34168 /* e0 = x0 * b */
34169 emit_insn (gen_rtx_SET (VOIDmode, e0,
34170 gen_rtx_MULT (mode, x0, b)));
34171
34172 /* e0 = x0 * e0 */
34173 emit_insn (gen_rtx_SET (VOIDmode, e0,
34174 gen_rtx_MULT (mode, x0, e0)));
34175
34176 /* e1 = x0 + x0 */
34177 emit_insn (gen_rtx_SET (VOIDmode, e1,
34178 gen_rtx_PLUS (mode, x0, x0)));
34179
34180 /* x1 = e1 - e0 */
34181 emit_insn (gen_rtx_SET (VOIDmode, x1,
34182 gen_rtx_MINUS (mode, e1, e0)));
34183
34184 /* res = a * x1 */
34185 emit_insn (gen_rtx_SET (VOIDmode, res,
34186 gen_rtx_MULT (mode, a, x1)));
34187 }
34188
34189 /* Output code to perform a Newton-Raphson approximation of a
34190 single precision floating point [reciprocal] square root. */
34191
34192 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34193 bool recip)
34194 {
34195 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34196 REAL_VALUE_TYPE r;
34197
34198 x0 = gen_reg_rtx (mode);
34199 e0 = gen_reg_rtx (mode);
34200 e1 = gen_reg_rtx (mode);
34201 e2 = gen_reg_rtx (mode);
34202 e3 = gen_reg_rtx (mode);
34203
34204 real_from_integer (&r, VOIDmode, -3, -1, 0);
34205 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34206
34207 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34208 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34209
34210 if (VECTOR_MODE_P (mode))
34211 {
34212 mthree = ix86_build_const_vector (mode, true, mthree);
34213 mhalf = ix86_build_const_vector (mode, true, mhalf);
34214 }
34215
34216 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34217 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
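/* These are one Newton-Raphson step for 1/sqrt(a): with x0 = rsqrtss(a),
   x1 = x0 * (3 - a*x0*x0) / 2 = -0.5 * x0 * (a*x0*x0 - 3); multiplying by a
   once more gives the sqrt(a) form.  */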
34218
34219 a = force_reg (mode, a);
34220
34221 /* x0 = rsqrt(a) estimate */
34222 emit_insn (gen_rtx_SET (VOIDmode, x0,
34223 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34224 UNSPEC_RSQRT)));
34225
34226 /* If a == 0.0, filter out the infinite rsqrt estimate so that sqrt(0.0) does not produce a NaN. */
34227 if (!recip)
34228 {
34229 rtx zero, mask;
34230
34231 zero = gen_reg_rtx (mode);
34232 mask = gen_reg_rtx (mode);
34233
34234 zero = force_reg (mode, CONST0_RTX(mode));
34235 emit_insn (gen_rtx_SET (VOIDmode, mask,
34236 gen_rtx_NE (mode, zero, a)));
34237
34238 emit_insn (gen_rtx_SET (VOIDmode, x0,
34239 gen_rtx_AND (mode, x0, mask)));
34240 }
34241
34242 /* e0 = x0 * a */
34243 emit_insn (gen_rtx_SET (VOIDmode, e0,
34244 gen_rtx_MULT (mode, x0, a)));
34245 /* e1 = e0 * x0 */
34246 emit_insn (gen_rtx_SET (VOIDmode, e1,
34247 gen_rtx_MULT (mode, e0, x0)));
34248
34249 /* e2 = e1 - 3. */
34250 mthree = force_reg (mode, mthree);
34251 emit_insn (gen_rtx_SET (VOIDmode, e2,
34252 gen_rtx_PLUS (mode, e1, mthree)));
34253
34254 mhalf = force_reg (mode, mhalf);
34255 if (recip)
34256 /* e3 = -.5 * x0 */
34257 emit_insn (gen_rtx_SET (VOIDmode, e3,
34258 gen_rtx_MULT (mode, x0, mhalf)));
34259 else
34260 /* e3 = -.5 * e0 */
34261 emit_insn (gen_rtx_SET (VOIDmode, e3,
34262 gen_rtx_MULT (mode, e0, mhalf)));
34263 /* ret = e2 * e3 */
34264 emit_insn (gen_rtx_SET (VOIDmode, res,
34265 gen_rtx_MULT (mode, e2, e3)));
34266 }
34267
34268 #ifdef TARGET_SOLARIS
34269 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34270
34271 static void
34272 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34273 tree decl)
34274 {
34275 /* With Binutils 2.15, the "@unwind" marker must be specified on
34276 every occurrence of the ".eh_frame" section, not just the first
34277 one. */
34278 if (TARGET_64BIT
34279 && strcmp (name, ".eh_frame") == 0)
34280 {
34281 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34282 flags & SECTION_WRITE ? "aw" : "a");
34283 return;
34284 }
34285
34286 #ifndef USE_GAS
34287 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34288 {
34289 solaris_elf_asm_comdat_section (name, flags, decl);
34290 return;
34291 }
34292 #endif
34293
34294 default_elf_asm_named_section (name, flags, decl);
34295 }
34296 #endif /* TARGET_SOLARIS */
34297
34298 /* Return the mangling of TYPE if it is an extended fundamental type. */
34299
34300 static const char *
34301 ix86_mangle_type (const_tree type)
34302 {
34303 type = TYPE_MAIN_VARIANT (type);
34304
34305 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34306 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34307 return NULL;
34308
34309 switch (TYPE_MODE (type))
34310 {
34311 case TFmode:
34312 /* __float128 is "g". */
34313 return "g";
34314 case XFmode:
34315 /* "long double" or __float80 is "e". */
34316 return "e";
34317 default:
34318 return NULL;
34319 }
34320 }
34321
34322 /* For 32-bit code we can save PIC register setup by using
34323 __stack_chk_fail_local hidden function instead of calling
34324 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34325 register, so it is better to call __stack_chk_fail directly. */
34326
34327 static tree ATTRIBUTE_UNUSED
34328 ix86_stack_protect_fail (void)
34329 {
34330 return TARGET_64BIT
34331 ? default_external_stack_protect_fail ()
34332 : default_hidden_stack_protect_fail ();
34333 }
34334
34335 /* Select a format to encode pointers in exception handling data. CODE
34336 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34337 true if the symbol may be affected by dynamic relocations.
34338
34339 ??? All x86 object file formats are capable of representing this.
34340 After all, the relocation needed is the same as for the call insn.
34341 Whether or not a particular assembler allows us to enter such, I
34342 guess we'll have to see. */
34343 int
34344 asm_preferred_eh_data_format (int code, int global)
34345 {
34346 if (flag_pic)
34347 {
34348 int type = DW_EH_PE_sdata8;
34349 if (!TARGET_64BIT
34350 || ix86_cmodel == CM_SMALL_PIC
34351 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34352 type = DW_EH_PE_sdata4;
34353 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34354 }
34355 if (ix86_cmodel == CM_SMALL
34356 || (ix86_cmodel == CM_MEDIUM && code))
34357 return DW_EH_PE_udata4;
34358 return DW_EH_PE_absptr;
34359 }
34360 \f
34361 /* Expand copysign from SIGN to the positive value ABS_VALUE
34362 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34363 the sign-bit. */
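/* In effect RESULT = ABS_VALUE | (SIGN & signbit-mask): the AND below
   isolates the sign bit of SIGN and the IOR copies it onto ABS_VALUE.  */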
34364 static void
34365 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34366 {
34367 enum machine_mode mode = GET_MODE (sign);
34368 rtx sgn = gen_reg_rtx (mode);
34369 if (mask == NULL_RTX)
34370 {
34371 enum machine_mode vmode;
34372
34373 if (mode == SFmode)
34374 vmode = V4SFmode;
34375 else if (mode == DFmode)
34376 vmode = V2DFmode;
34377 else
34378 vmode = mode;
34379
34380 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34381 if (!VECTOR_MODE_P (mode))
34382 {
34383 /* We need to generate a scalar mode mask in this case. */
34384 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34385 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34386 mask = gen_reg_rtx (mode);
34387 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34388 }
34389 }
34390 else
34391 mask = gen_rtx_NOT (mode, mask);
34392 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34393 gen_rtx_AND (mode, mask, sign)));
34394 emit_insn (gen_rtx_SET (VOIDmode, result,
34395 gen_rtx_IOR (mode, abs_value, sgn)));
34396 }
34397
34398 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34399 mask for masking out the sign-bit is stored in *SMASK, if that is
34400 non-null. */
34401 static rtx
34402 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34403 {
34404 enum machine_mode vmode, mode = GET_MODE (op0);
34405 rtx xa, mask;
34406
34407 xa = gen_reg_rtx (mode);
34408 if (mode == SFmode)
34409 vmode = V4SFmode;
34410 else if (mode == DFmode)
34411 vmode = V2DFmode;
34412 else
34413 vmode = mode;
34414 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34415 if (!VECTOR_MODE_P (mode))
34416 {
34417 /* We need to generate a scalar mode mask in this case. */
34418 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34419 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34420 mask = gen_reg_rtx (mode);
34421 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34422 }
34423 emit_insn (gen_rtx_SET (VOIDmode, xa,
34424 gen_rtx_AND (mode, op0, mask)));
34425
34426 if (smask)
34427 *smask = mask;
34428
34429 return xa;
34430 }
34431
34432 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34433 swapping the operands if SWAP_OPERANDS is true. The expanded
34434 code is a forward jump to a newly created label in case the
34435 comparison is true. The generated label rtx is returned. */
34436 static rtx
34437 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34438 bool swap_operands)
34439 {
34440 rtx label, tmp;
34441
34442 if (swap_operands)
34443 {
34444 tmp = op0;
34445 op0 = op1;
34446 op1 = tmp;
34447 }
34448
34449 label = gen_label_rtx ();
34450 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34451 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34452 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34453 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34454 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34455 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34456 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34457 JUMP_LABEL (tmp) = label;
34458
34459 return label;
34460 }
34461
34462 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34463 using comparison code CODE. Operands are swapped for the comparison if
34464 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34465 static rtx
34466 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34467 bool swap_operands)
34468 {
34469 rtx (*insn)(rtx, rtx, rtx, rtx);
34470 enum machine_mode mode = GET_MODE (op0);
34471 rtx mask = gen_reg_rtx (mode);
34472
34473 if (swap_operands)
34474 {
34475 rtx tmp = op0;
34476 op0 = op1;
34477 op1 = tmp;
34478 }
34479
34480 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34481
34482 emit_insn (insn (mask, op0, op1,
34483 gen_rtx_fmt_ee (code, mode, op0, op1)));
34484 return mask;
34485 }
34486
34487 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34488 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34489 static rtx
34490 ix86_gen_TWO52 (enum machine_mode mode)
34491 {
34492 REAL_VALUE_TYPE TWO52r;
34493 rtx TWO52;
34494
34495 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34496 TWO52 = const_double_from_real_value (TWO52r, mode);
34497 TWO52 = force_reg (mode, TWO52);
34498
34499 return TWO52;
34500 }
34501
34502 /* Expand SSE sequence for computing lround from OP1 storing
34503 into OP0. */
34504 void
34505 ix86_expand_lround (rtx op0, rtx op1)
34506 {
34507 /* C code for the stuff we're doing below:
34508 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34509 return (long)tmp;
34510 */
34511 enum machine_mode mode = GET_MODE (op1);
34512 const struct real_format *fmt;
34513 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34514 rtx adj;
34515
34516 /* load nextafter (0.5, 0.0) */
34517 fmt = REAL_MODE_FORMAT (mode);
34518 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34519 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
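/* Using the largest representable value below 0.5 rather than 0.5 itself
   keeps inputs just under a halfway point (e.g. nextafter (0.5, 0.0)) from
   being rounded up by the addition before the truncating conversion.  */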
34520
34521 /* adj = copysign (0.5, op1) */
34522 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34523 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34524
34525 /* adj = op1 + adj */
34526 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34527
34528 /* op0 = (imode)adj */
34529 expand_fix (op0, adj, 0);
34530 }
34531
34532 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
34533 into OP0. */
34534 void
34535 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34536 {
34537 /* C code for the stuff we're doing below (for do_floor):
34538 xi = (long)op1;
34539 xi -= (double)xi > op1 ? 1 : 0;
34540 return xi;
34541 */
34542 enum machine_mode fmode = GET_MODE (op1);
34543 enum machine_mode imode = GET_MODE (op0);
34544 rtx ireg, freg, label, tmp;
34545
34546 /* reg = (long)op1 */
34547 ireg = gen_reg_rtx (imode);
34548 expand_fix (ireg, op1, 0);
34549
34550 /* freg = (double)reg */
34551 freg = gen_reg_rtx (fmode);
34552 expand_float (freg, ireg, 0);
34553
34554 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34555 label = ix86_expand_sse_compare_and_jump (UNLE,
34556 freg, op1, !do_floor);
34557 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34558 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34559 emit_move_insn (ireg, tmp);
34560
34561 emit_label (label);
34562 LABEL_NUSES (label) = 1;
34563
34564 emit_move_insn (op0, ireg);
34565 }
34566
34567 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34568 result in OPERAND0. */
34569 void
34570 ix86_expand_rint (rtx operand0, rtx operand1)
34571 {
34572 /* C code for the stuff we're doing below:
34573 xa = fabs (operand1);
34574 if (!isless (xa, 2**52))
34575 return operand1;
34576 xa = xa + 2**52 - 2**52;
34577 return copysign (xa, operand1);
34578 */
34579 enum machine_mode mode = GET_MODE (operand0);
34580 rtx res, xa, label, TWO52, mask;
34581
34582 res = gen_reg_rtx (mode);
34583 emit_move_insn (res, operand1);
34584
34585 /* xa = abs (operand1) */
34586 xa = ix86_expand_sse_fabs (res, &mask);
34587
34588 /* if (!isless (xa, TWO52)) goto label; */
34589 TWO52 = ix86_gen_TWO52 (mode);
34590 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34591
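/* Adding and then subtracting TWO52 (2**52 for DFmode, 2**23 for SFmode)
   rounds XA to an integer: once XA + TWO52 is formed, the fraction bits no
   longer fit in the significand and are rounded away, so the subtraction
   recovers XA rounded to an integer.  Valid because XA < TWO52 here.  */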
34592 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34593 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34594
34595 ix86_sse_copysign_to_positive (res, xa, res, mask);
34596
34597 emit_label (label);
34598 LABEL_NUSES (label) = 1;
34599
34600 emit_move_insn (operand0, res);
34601 }
34602
34603 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34604 into OPERAND0. */
34605 void
34606 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34607 {
34608 /* C code for the stuff we expand below.
34609 double xa = fabs (x), x2;
34610 if (!isless (xa, TWO52))
34611 return x;
34612 xa = xa + TWO52 - TWO52;
34613 x2 = copysign (xa, x);
34614 Compensate. Floor:
34615 if (x2 > x)
34616 x2 -= 1;
34617 Compensate. Ceil:
34618 if (x2 < x)
34619 x2 -= -1;
34620 return x2;
34621 */
34622 enum machine_mode mode = GET_MODE (operand0);
34623 rtx xa, TWO52, tmp, label, one, res, mask;
34624
34625 TWO52 = ix86_gen_TWO52 (mode);
34626
34627 /* Temporary for holding the result, initialized to the input
34628 operand to ease control flow. */
34629 res = gen_reg_rtx (mode);
34630 emit_move_insn (res, operand1);
34631
34632 /* xa = abs (operand1) */
34633 xa = ix86_expand_sse_fabs (res, &mask);
34634
34635 /* if (!isless (xa, TWO52)) goto label; */
34636 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34637
34638 /* xa = xa + TWO52 - TWO52; */
34639 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34640 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34641
34642 /* xa = copysign (xa, operand1) */
34643 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34644
34645 /* generate 1.0 or -1.0 */
34646 one = force_reg (mode,
34647 const_double_from_real_value (do_floor
34648 ? dconst1 : dconstm1, mode));
34649
34650 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34651 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34652 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34653 gen_rtx_AND (mode, one, tmp)));
34654 /* We always need to subtract here to preserve signed zero. */
34655 tmp = expand_simple_binop (mode, MINUS,
34656 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34657 emit_move_insn (res, tmp);
34658
34659 emit_label (label);
34660 LABEL_NUSES (label) = 1;
34661
34662 emit_move_insn (operand0, res);
34663 }
34664
34665 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34666 into OPERAND0. */
34667 void
34668 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34669 {
34670 /* C code for the stuff we expand below.
34671 double xa = fabs (x), x2;
34672 if (!isless (xa, TWO52))
34673 return x;
34674 x2 = (double)(long)x;
34675 Compensate. Floor:
34676 if (x2 > x)
34677 x2 -= 1;
34678 Compensate. Ceil:
34679 if (x2 < x)
34680 x2 += 1;
34681 if (HONOR_SIGNED_ZEROS (mode))
34682 return copysign (x2, x);
34683 return x2;
34684 */
34685 enum machine_mode mode = GET_MODE (operand0);
34686 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34687
34688 TWO52 = ix86_gen_TWO52 (mode);
34689
34690 /* Temporary for holding the result, initialized to the input
34691 operand to ease control flow. */
34692 res = gen_reg_rtx (mode);
34693 emit_move_insn (res, operand1);
34694
34695 /* xa = abs (operand1) */
34696 xa = ix86_expand_sse_fabs (res, &mask);
34697
34698 /* if (!isless (xa, TWO52)) goto label; */
34699 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34700
34701 /* xa = (double)(long)x */
34702 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34703 expand_fix (xi, res, 0);
34704 expand_float (xa, xi, 0);
34705
34706 /* generate 1.0 */
34707 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34708
34709 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34710 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34711 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34712 gen_rtx_AND (mode, one, tmp)));
34713 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
34714 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34715 emit_move_insn (res, tmp);
34716
34717 if (HONOR_SIGNED_ZEROS (mode))
34718 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34719
34720 emit_label (label);
34721 LABEL_NUSES (label) = 1;
34722
34723 emit_move_insn (operand0, res);
34724 }
34725
34726 /* Expand SSE sequence for computing round from OPERAND1 storing
34727 into OPERAND0. Sequence that works without relying on DImode truncation
34728 via cvttsd2siq that is only available on 64bit targets. */
34729 void
34730 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
34731 {
34732 /* C code for the stuff we expand below.
34733 double xa = fabs (x), xa2, x2;
34734 if (!isless (xa, TWO52))
34735 return x;
34736 Using the absolute value and copying back sign makes
34737 -0.0 -> -0.0 correct.
34738 xa2 = xa + TWO52 - TWO52;
34739 Compensate.
34740 dxa = xa2 - xa;
34741 if (dxa <= -0.5)
34742 xa2 += 1;
34743 else if (dxa > 0.5)
34744 xa2 -= 1;
34745 x2 = copysign (xa2, x);
34746 return x2;
34747 */
34748 enum machine_mode mode = GET_MODE (operand0);
34749 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
34750
34751 TWO52 = ix86_gen_TWO52 (mode);
34752
34753 /* Temporary for holding the result, initialized to the input
34754 operand to ease control flow. */
34755 res = gen_reg_rtx (mode);
34756 emit_move_insn (res, operand1);
34757
34758 /* xa = abs (operand1) */
34759 xa = ix86_expand_sse_fabs (res, &mask);
34760
34761 /* if (!isless (xa, TWO52)) goto label; */
34762 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34763
34764 /* xa2 = xa + TWO52 - TWO52; */
34765 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34766 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
34767
34768 /* dxa = xa2 - xa; */
34769 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
34770
34771 /* generate 0.5, 1.0 and -0.5 */
34772 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
34773 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
34774 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
34775 0, OPTAB_DIRECT);
34776
34777 /* Compensate. */
34778 tmp = gen_reg_rtx (mode);
34779 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
34780 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
34781 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34782 gen_rtx_AND (mode, one, tmp)));
34783 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34784 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
34785 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
34786 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34787 gen_rtx_AND (mode, one, tmp)));
34788 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34789
34790 /* res = copysign (xa2, operand1) */
34791 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
34792
34793 emit_label (label);
34794 LABEL_NUSES (label) = 1;
34795
34796 emit_move_insn (operand0, res);
34797 }
34798
34799 /* Expand SSE sequence for computing trunc from OPERAND1 storing
34800 into OPERAND0. */
34801 void
34802 ix86_expand_trunc (rtx operand0, rtx operand1)
34803 {
34804 /* C code for SSE variant we expand below.
34805 double xa = fabs (x), x2;
34806 if (!isless (xa, TWO52))
34807 return x;
34808 x2 = (double)(long)x;
34809 if (HONOR_SIGNED_ZEROS (mode))
34810 return copysign (x2, x);
34811 return x2;
34812 */
34813 enum machine_mode mode = GET_MODE (operand0);
34814 rtx xa, xi, TWO52, label, res, mask;
34815
34816 TWO52 = ix86_gen_TWO52 (mode);
34817
34818 /* Temporary for holding the result, initialized to the input
34819 operand to ease control flow. */
34820 res = gen_reg_rtx (mode);
34821 emit_move_insn (res, operand1);
34822
34823 /* xa = abs (operand1) */
34824 xa = ix86_expand_sse_fabs (res, &mask);
34825
34826 /* if (!isless (xa, TWO52)) goto label; */
34827 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34828
34829 /* x = (double)(long)x */
34830 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34831 expand_fix (xi, res, 0);
34832 expand_float (res, xi, 0);
34833
34834 if (HONOR_SIGNED_ZEROS (mode))
34835 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34836
34837 emit_label (label);
34838 LABEL_NUSES (label) = 1;
34839
34840 emit_move_insn (operand0, res);
34841 }
34842
34843 /* Expand SSE sequence for computing trunc from OPERAND1 storing into OPERAND0.
34844 Sequence that works without relying on DImode truncation via cvttsd2siq that is only available on 64bit targets. */
34845 void
34846 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
34847 {
34848 enum machine_mode mode = GET_MODE (operand0);
34849 rtx xa, mask, TWO52, label, one, res, smask, tmp;
34850
34851 /* C code for SSE variant we expand below.
34852 double xa = fabs (x), xa2, x2;
34853 if (!isless (xa, TWO52))
34854 return x;
34855 xa2 = xa + TWO52 - TWO52;
34856 Compensate:
34857 if (xa2 > xa)
34858 xa2 -= 1.0;
34859 x2 = copysign (xa2, x);
34860 return x2;
34861 */
34862
34863 TWO52 = ix86_gen_TWO52 (mode);
34864
34865 /* Temporary for holding the result, initialized to the input
34866 operand to ease control flow. */
34867 res = gen_reg_rtx (mode);
34868 emit_move_insn (res, operand1);
34869
34870 /* xa = abs (operand1) */
34871 xa = ix86_expand_sse_fabs (res, &smask);
34872
34873 /* if (!isless (xa, TWO52)) goto label; */
34874 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34875
34876 /* res = xa + TWO52 - TWO52; */
34877 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34878 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
34879 emit_move_insn (res, tmp);
34880
34881 /* generate 1.0 */
34882 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34883
34884 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
34885 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
34886 emit_insn (gen_rtx_SET (VOIDmode, mask,
34887 gen_rtx_AND (mode, mask, one)));
34888 tmp = expand_simple_binop (mode, MINUS,
34889 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
34890 emit_move_insn (res, tmp);
34891
34892 /* res = copysign (res, operand1) */
34893 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
34894
34895 emit_label (label);
34896 LABEL_NUSES (label) = 1;
34897
34898 emit_move_insn (operand0, res);
34899 }
34900
34901 /* Expand SSE sequence for computing round from OPERAND1 storing
34902 into OPERAND0. */
34903 void
34904 ix86_expand_round (rtx operand0, rtx operand1)
34905 {
34906 /* C code for the stuff we're doing below:
34907 double xa = fabs (x);
34908 if (!isless (xa, TWO52))
34909 return x;
34910 xa = (double)(long)(xa + nextafter (0.5, 0.0));
34911 return copysign (xa, x);
34912 */
34913 enum machine_mode mode = GET_MODE (operand0);
34914 rtx res, TWO52, xa, label, xi, half, mask;
34915 const struct real_format *fmt;
34916 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34917
34918 /* Temporary for holding the result, initialized to the input
34919 operand to ease control flow. */
34920 res = gen_reg_rtx (mode);
34921 emit_move_insn (res, operand1);
34922
34923 TWO52 = ix86_gen_TWO52 (mode);
34924 xa = ix86_expand_sse_fabs (res, &mask);
34925 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34926
34927 /* load nextafter (0.5, 0.0) */
34928 fmt = REAL_MODE_FORMAT (mode);
34929 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34930 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34931
34932 /* xa = xa + 0.5 */
34933 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
34934 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
34935
34936 /* xa = (double)(int64_t)xa */
34937 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34938 expand_fix (xi, xa, 0);
34939 expand_float (xa, xi, 0);
34940
34941 /* res = copysign (xa, operand1) */
34942 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
34943
34944 emit_label (label);
34945 LABEL_NUSES (label) = 1;
34946
34947 emit_move_insn (operand0, res);
34948 }
34949
34950 /* Expand SSE sequence for computing round
34951 from OP1 storing into OP0 using sse4 round insn. */
34952 void
34953 ix86_expand_round_sse4 (rtx op0, rtx op1)
34954 {
34955 enum machine_mode mode = GET_MODE (op0);
34956 rtx e1, e2, res, half;
34957 const struct real_format *fmt;
34958 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34959 rtx (*gen_copysign) (rtx, rtx, rtx);
34960 rtx (*gen_round) (rtx, rtx, rtx);
34961
34962 switch (mode)
34963 {
34964 case SFmode:
34965 gen_copysign = gen_copysignsf3;
34966 gen_round = gen_sse4_1_roundsf2;
34967 break;
34968 case DFmode:
34969 gen_copysign = gen_copysigndf3;
34970 gen_round = gen_sse4_1_rounddf2;
34971 break;
34972 default:
34973 gcc_unreachable ();
34974 }
34975
34976 /* round (a) = trunc (a + copysign (0.5, a)) */
34977
34978 /* load nextafter (0.5, 0.0) */
34979 fmt = REAL_MODE_FORMAT (mode);
34980 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34981 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34982 half = const_double_from_real_value (pred_half, mode);
34983
34984 /* e1 = copysign (0.5, op1) */
34985 e1 = gen_reg_rtx (mode);
34986 emit_insn (gen_copysign (e1, half, op1));
34987
34988 /* e2 = op1 + e1 */
34989 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
34990
34991 /* res = trunc (e2) */
34992 res = gen_reg_rtx (mode);
34993 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
34994
34995 emit_move_insn (op0, res);
34996 }
34997 \f
34998
34999 /* Table of valid machine attributes. */
35000 static const struct attribute_spec ix86_attribute_table[] =
35001 {
35002 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35003 affects_type_identity } */
35004 /* Stdcall attribute says callee is responsible for popping arguments
35005 if they are not variable. */
35006 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35007 true },
35008 /* Fastcall attribute says callee is responsible for popping arguments
35009 if they are not variable. */
35010 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35011 true },
35012 /* Thiscall attribute says callee is responsible for popping arguments
35013 if they are not variable. */
35014 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35015 true },
35016 /* Cdecl attribute says the callee is a normal C declaration */
35017 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35018 true },
35019 /* Regparm attribute specifies how many integer arguments are to be
35020 passed in registers. */
35021 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35022 true },
35023 /* Sseregparm attribute says we are using x86_64 calling conventions
35024 for FP arguments. */
35025 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35026 true },
35027 /* The transactional memory builtins are implicitly regparm or fastcall
35028 depending on the ABI. Override the generic do-nothing attribute that
35029 these builtins were declared with. */
35030 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35031 true },
35032 /* force_align_arg_pointer says this function realigns the stack at entry. */
35033 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35034 false, true, true, ix86_handle_cconv_attribute, false },
35035 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35036 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35037 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35038 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35039 false },
35040 #endif
35041 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35042 false },
35043 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35044 false },
35045 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35046 SUBTARGET_ATTRIBUTE_TABLE,
35047 #endif
35048 /* ms_abi and sysv_abi calling convention function attributes. */
35049 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35050 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35051 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35052 false },
35053 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35054 ix86_handle_callee_pop_aggregate_return, true },
35055 /* End element. */
35056 { NULL, 0, 0, false, false, false, NULL, false }
35057 };
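/* Example of how these attributes appear in user code (hypothetical
   declarations, shown only to illustrate the table entries above):

     int  __attribute__((fastcall))   f2 (int a, int b);
     int  __attribute__((regparm(3))) f3 (int a, int b, int c);
     void __attribute__((ms_abi))     g  (void);
     struct __attribute__((ms_struct)) s { char c; int i; };

   Each use is parsed generically and then routed to the handler listed in
   the matching table entry, e.g. ix86_handle_cconv_attribute for the
   calling-convention attributes above.  */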
35058
35059 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35060 static int
35061 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35062 tree vectype ATTRIBUTE_UNUSED,
35063 int misalign ATTRIBUTE_UNUSED)
35064 {
35065 switch (type_of_cost)
35066 {
35067 case scalar_stmt:
35068 return ix86_cost->scalar_stmt_cost;
35069
35070 case scalar_load:
35071 return ix86_cost->scalar_load_cost;
35072
35073 case scalar_store:
35074 return ix86_cost->scalar_store_cost;
35075
35076 case vector_stmt:
35077 return ix86_cost->vec_stmt_cost;
35078
35079 case vector_load:
35080 return ix86_cost->vec_align_load_cost;
35081
35082 case vector_store:
35083 return ix86_cost->vec_store_cost;
35084
35085 case vec_to_scalar:
35086 return ix86_cost->vec_to_scalar_cost;
35087
35088 case scalar_to_vec:
35089 return ix86_cost->scalar_to_vec_cost;
35090
35091 case unaligned_load:
35092 case unaligned_store:
35093 return ix86_cost->vec_unalign_load_cost;
35094
35095 case cond_branch_taken:
35096 return ix86_cost->cond_taken_branch_cost;
35097
35098 case cond_branch_not_taken:
35099 return ix86_cost->cond_not_taken_branch_cost;
35100
35101 case vec_perm:
35102 return 1;
35103
35104 default:
35105 gcc_unreachable ();
35106 }
35107 }
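/* Roughly speaking, when the vectorizer models a misaligned V4SF load it
   calls this hook with unaligned_load and charges vec_unalign_load_cost,
   versus scalar_load_cost for each of the four scalar loads the vector
   load would replace; the relative weights come from the active ix86_cost
   tuning table.  */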
35108
35109
35110 /* Return a vector mode with twice as many elements as VMODE. */
35111 /* ??? Consider moving this to a table generated by genmodes.c. */
35112
35113 static enum machine_mode
35114 doublesize_vector_mode (enum machine_mode vmode)
35115 {
35116 switch (vmode)
35117 {
35118 case V2SFmode: return V4SFmode;
35119 case V1DImode: return V2DImode;
35120 case V2SImode: return V4SImode;
35121 case V4HImode: return V8HImode;
35122 case V8QImode: return V16QImode;
35123
35124 case V2DFmode: return V4DFmode;
35125 case V4SFmode: return V8SFmode;
35126 case V2DImode: return V4DImode;
35127 case V4SImode: return V8SImode;
35128 case V8HImode: return V16HImode;
35129 case V16QImode: return V32QImode;
35130
35131 case V4DFmode: return V8DFmode;
35132 case V8SFmode: return V16SFmode;
35133 case V4DImode: return V8DImode;
35134 case V8SImode: return V16SImode;
35135 case V16HImode: return V32HImode;
35136 case V32QImode: return V64QImode;
35137
35138 default:
35139 gcc_unreachable ();
35140 }
35141 }
35142
35143 /* Construct (set target (vec_select op0 (parallel perm))) and
35144 return true if that's a valid instruction in the active ISA. */
35145
35146 static bool
35147 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35148 {
35149 rtx rperm[MAX_VECT_LEN], x;
35150 unsigned i;
35151
35152 for (i = 0; i < nelt; ++i)
35153 rperm[i] = GEN_INT (perm[i]);
35154
35155 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35156 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35157 x = gen_rtx_SET (VOIDmode, target, x);
35158
35159 x = emit_insn (x);
35160 if (recog_memoized (x) < 0)
35161 {
35162 remove_insn (x);
35163 return false;
35164 }
35165 return true;
35166 }
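/* For example, with TARGET and OP0 in V4SImode and PERM == { 2, 3, 0, 1 },
   the emitted pattern is
     (set target (vec_select:V4SI op0 (parallel [2 3 0 1])))
   which recog_memoized accepts and which ends up as pshufd $0x4e (the
   immediate packs the four 2-bit selectors 2, 3, 0, 1).  */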
35167
35168 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35169
35170 static bool
35171 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35172 const unsigned char *perm, unsigned nelt)
35173 {
35174 enum machine_mode v2mode;
35175 rtx x;
35176
35177 v2mode = doublesize_vector_mode (GET_MODE (op0));
35178 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35179 return expand_vselect (target, x, perm, nelt);
35180 }
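/* For example, interleaving the low halves of two V4SImode operands is
   expressed as PERM == { 0, 4, 1, 5 } applied to (vec_concat:V8SI op0 op1),
   which matches the punpckldq pattern.  */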
35181
35182 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35183 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35184
35185 static bool
35186 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35187 {
35188 enum machine_mode vmode = d->vmode;
35189 unsigned i, mask, nelt = d->nelt;
35190 rtx target, op0, op1, x;
35191 rtx rperm[32], vperm;
35192
35193 if (d->op0 == d->op1)
35194 return false;
35195 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35196 ;
35197 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35198 ;
35199 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35200 ;
35201 else
35202 return false;
35203
35204 /* This is a blend, not a permute. Elements must stay in their
35205 respective lanes. */
35206 for (i = 0; i < nelt; ++i)
35207 {
35208 unsigned e = d->perm[i];
35209 if (!(e == i || e == i + nelt))
35210 return false;
35211 }
35212
35213 if (d->testing_p)
35214 return true;
35215
35216 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35217 decision should be extracted elsewhere, so that we only try that
35218 sequence once all budget==3 options have been tried. */
35219 target = d->target;
35220 op0 = d->op0;
35221 op1 = d->op1;
35222 mask = 0;
35223
35224 switch (vmode)
35225 {
35226 case V4DFmode:
35227 case V8SFmode:
35228 case V2DFmode:
35229 case V4SFmode:
35230 case V8HImode:
35231 case V8SImode:
35232 for (i = 0; i < nelt; ++i)
35233 mask |= (d->perm[i] >= nelt) << i;
35234 break;
35235
35236 case V2DImode:
35237 for (i = 0; i < 2; ++i)
35238 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35239 vmode = V8HImode;
35240 goto do_subreg;
35241
35242 case V4SImode:
35243 for (i = 0; i < 4; ++i)
35244 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35245 vmode = V8HImode;
35246 goto do_subreg;
35247
35248 case V16QImode:
35249 /* See if bytes move in pairs so we can use pblendw with
35250 an immediate argument, rather than pblendvb with a vector
35251 argument. */
35252 for (i = 0; i < 16; i += 2)
35253 if (d->perm[i] + 1 != d->perm[i + 1])
35254 {
35255 use_pblendvb:
35256 for (i = 0; i < nelt; ++i)
35257 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35258
35259 finish_pblendvb:
35260 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35261 vperm = force_reg (vmode, vperm);
35262
35263 if (GET_MODE_SIZE (vmode) == 16)
35264 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35265 else
35266 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35267 return true;
35268 }
35269
35270 for (i = 0; i < 8; ++i)
35271 mask |= (d->perm[i * 2] >= 16) << i;
35272 vmode = V8HImode;
35273 /* FALLTHRU */
35274
35275 do_subreg:
35276 target = gen_lowpart (vmode, target);
35277 op0 = gen_lowpart (vmode, op0);
35278 op1 = gen_lowpart (vmode, op1);
35279 break;
35280
35281 case V32QImode:
35282 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35283 for (i = 0; i < 32; i += 2)
35284 if (d->perm[i] + 1 != d->perm[i + 1])
35285 goto use_pblendvb;
35286 /* See if bytes move in quadruplets. If yes, vpblendd
35287 with immediate can be used. */
35288 for (i = 0; i < 32; i += 4)
35289 if (d->perm[i] + 2 != d->perm[i + 2])
35290 break;
35291 if (i < 32)
35292 {
35293 /* See if bytes move the same in both lanes. If yes,
35294 vpblendw with immediate can be used. */
35295 for (i = 0; i < 16; i += 2)
35296 if (d->perm[i] + 16 != d->perm[i + 16])
35297 goto use_pblendvb;
35298
35299 /* Use vpblendw. */
35300 for (i = 0; i < 16; ++i)
35301 mask |= (d->perm[i * 2] >= 32) << i;
35302 vmode = V16HImode;
35303 goto do_subreg;
35304 }
35305
35306 /* Use vpblendd. */
35307 for (i = 0; i < 8; ++i)
35308 mask |= (d->perm[i * 4] >= 32) << i;
35309 vmode = V8SImode;
35310 goto do_subreg;
35311
35312 case V16HImode:
35313 /* See if words move in pairs. If yes, vpblendd can be used. */
35314 for (i = 0; i < 16; i += 2)
35315 if (d->perm[i] + 1 != d->perm[i + 1])
35316 break;
35317 if (i < 16)
35318 {
35319 /* See if words move the same in both lanes. If not,
35320 vpblendvb must be used. */
35321 for (i = 0; i < 8; i++)
35322 if (d->perm[i] + 8 != d->perm[i + 8])
35323 {
35324 /* Use vpblendvb. */
35325 for (i = 0; i < 32; ++i)
35326 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35327
35328 vmode = V32QImode;
35329 nelt = 32;
35330 target = gen_lowpart (vmode, target);
35331 op0 = gen_lowpart (vmode, op0);
35332 op1 = gen_lowpart (vmode, op1);
35333 goto finish_pblendvb;
35334 }
35335
35336 /* Use vpblendw. */
35337 for (i = 0; i < 16; ++i)
35338 mask |= (d->perm[i] >= 16) << i;
35339 break;
35340 }
35341
35342 /* Use vpblendd. */
35343 for (i = 0; i < 8; ++i)
35344 mask |= (d->perm[i * 2] >= 16) << i;
35345 vmode = V8SImode;
35346 goto do_subreg;
35347
35348 case V4DImode:
35349 /* Use vpblendd. */
35350 for (i = 0; i < 4; ++i)
35351 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35352 vmode = V8SImode;
35353 goto do_subreg;
35354
35355 default:
35356 gcc_unreachable ();
35357 }
35358
35359 /* This matches five different patterns with the different modes. */
35360 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35361 x = gen_rtx_SET (VOIDmode, target, x);
35362 emit_insn (x);
35363
35364 return true;
35365 }
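/* For example, a V8HImode blend with d->perm == { 0, 9, 2, 11, 4, 13, 6, 15 }
   takes the odd elements from op1, so the loop above computes mask == 0xaa
   and the vec_merge above is emitted as pblendw $0xaa (vpblendw with AVX).  */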
35366
35367 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35368 in terms of the variable form of vpermilps.
35369
35370 Note that we will have already failed the immediate input vpermilps,
35371 which requires that the high and low part shuffle be identical; the
35372 variable form doesn't require that. */
35373
35374 static bool
35375 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35376 {
35377 rtx rperm[8], vperm;
35378 unsigned i;
35379
35380 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35381 return false;
35382
35383 /* We can only permute within the 128-bit lane. */
35384 for (i = 0; i < 8; ++i)
35385 {
35386 unsigned e = d->perm[i];
35387 if (i < 4 ? e >= 4 : e < 4)
35388 return false;
35389 }
35390
35391 if (d->testing_p)
35392 return true;
35393
35394 for (i = 0; i < 8; ++i)
35395 {
35396 unsigned e = d->perm[i];
35397
35398 /* Within each 128-bit lane, the elements of op0 are numbered
35399 from 0 and the elements of op1 are numbered from 4. */
35400 if (e >= 8 + 4)
35401 e -= 8;
35402 else if (e >= 4)
35403 e -= 4;
35404
35405 rperm[i] = GEN_INT (e);
35406 }
35407
35408 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35409 vperm = force_reg (V8SImode, vperm);
35410 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35411
35412 return true;
35413 }
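/* For example, the single-operand V8SFmode permutation
   { 1, 0, 3, 2, 6, 7, 4, 5 } stays within the 128-bit lanes but shuffles
   the two lanes differently, so the immediate form is not usable; the loop
   above builds the selector vector { 1, 0, 3, 2, 2, 3, 0, 1 } and a single
   variable vpermilps implements it.  */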
35414
35415 /* Return true if permutation D can be performed as VMODE permutation
35416 instead. */
35417
35418 static bool
35419 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35420 {
35421 unsigned int i, j, chunk;
35422
35423 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35424 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35425 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35426 return false;
35427
35428 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35429 return true;
35430
35431 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35432 for (i = 0; i < d->nelt; i += chunk)
35433 if (d->perm[i] & (chunk - 1))
35434 return false;
35435 else
35436 for (j = 1; j < chunk; ++j)
35437 if (d->perm[i] + j != d->perm[i + j])
35438 return false;
35439
35440 return true;
35441 }
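/* For example, the V16QImode permutation
     { 4 5 6 7  0 1 2 3  12 13 14 15  8 9 10 11 }
   moves bytes in aligned groups of four, so it is also expressible as the
   V4SImode permutation { 1, 0, 3, 2 } and this function returns true for
   vmode == V4SImode.  */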
35442
35443 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35444 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35445
35446 static bool
35447 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35448 {
35449 unsigned i, nelt, eltsz, mask;
35450 unsigned char perm[32];
35451 enum machine_mode vmode = V16QImode;
35452 rtx rperm[32], vperm, target, op0, op1;
35453
35454 nelt = d->nelt;
35455
35456 if (d->op0 != d->op1)
35457 {
35458 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35459 {
35460 if (TARGET_AVX2
35461 && valid_perm_using_mode_p (V2TImode, d))
35462 {
35463 if (d->testing_p)
35464 return true;
35465
35466 /* Use vperm2i128 insn. The pattern uses
35467 V4DImode instead of V2TImode. */
35468 target = gen_lowpart (V4DImode, d->target);
35469 op0 = gen_lowpart (V4DImode, d->op0);
35470 op1 = gen_lowpart (V4DImode, d->op1);
35471 rperm[0]
35472 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
35473 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
35474 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35475 return true;
35476 }
35477 return false;
35478 }
35479 }
35480 else
35481 {
35482 if (GET_MODE_SIZE (d->vmode) == 16)
35483 {
35484 if (!TARGET_SSSE3)
35485 return false;
35486 }
35487 else if (GET_MODE_SIZE (d->vmode) == 32)
35488 {
35489 if (!TARGET_AVX2)
35490 return false;
35491
35492 /* V4DImode should be already handled through
35493 expand_vselect by vpermq instruction. */
35494 gcc_assert (d->vmode != V4DImode);
35495
35496 vmode = V32QImode;
35497 if (d->vmode == V8SImode
35498 || d->vmode == V16HImode
35499 || d->vmode == V32QImode)
35500 {
35501 /* First see if vpermq can be used for
35502 V8SImode/V16HImode/V32QImode. */
35503 if (valid_perm_using_mode_p (V4DImode, d))
35504 {
35505 for (i = 0; i < 4; i++)
35506 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35507 if (d->testing_p)
35508 return true;
35509 return expand_vselect (gen_lowpart (V4DImode, d->target),
35510 gen_lowpart (V4DImode, d->op0),
35511 perm, 4);
35512 }
35513
35514 /* Next see if vpermd can be used. */
35515 if (valid_perm_using_mode_p (V8SImode, d))
35516 vmode = V8SImode;
35517 }
35518
35519 if (vmode == V32QImode)
35520 {
35521 /* vpshufb only works within 128-bit lanes; it is not
35522 possible to shuffle bytes across the lanes. */
35523 for (i = 0; i < nelt; ++i)
35524 if ((d->perm[i] ^ i) & (nelt / 2))
35525 return false;
35526 }
35527 }
35528 else
35529 return false;
35530 }
35531
35532 if (d->testing_p)
35533 return true;
35534
35535 if (vmode == V8SImode)
35536 for (i = 0; i < 8; ++i)
35537 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35538 else
35539 {
35540 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35541 if (d->op0 != d->op1)
35542 mask = 2 * nelt - 1;
35543 else if (vmode == V16QImode)
35544 mask = nelt - 1;
35545 else
35546 mask = nelt / 2 - 1;
35547
35548 for (i = 0; i < nelt; ++i)
35549 {
35550 unsigned j, e = d->perm[i] & mask;
35551 for (j = 0; j < eltsz; ++j)
35552 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35553 }
35554 }
35555
35556 vperm = gen_rtx_CONST_VECTOR (vmode,
35557 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35558 vperm = force_reg (vmode, vperm);
35559
35560 target = gen_lowpart (vmode, d->target);
35561 op0 = gen_lowpart (vmode, d->op0);
35562 if (d->op0 == d->op1)
35563 {
35564 if (vmode == V16QImode)
35565 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35566 else if (vmode == V32QImode)
35567 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35568 else
35569 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35570 }
35571 else
35572 {
35573 op1 = gen_lowpart (vmode, d->op1);
35574 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35575 }
35576
35577 return true;
35578 }
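/* For example, the single-operand V8HImode permutation
   { 0, 7, 1, 6, 2, 5, 3, 4 } cannot be done with pshufd or
   pshuflw/pshufhw alone; the loop above expands each word index into two
   byte indexes, giving the pshufb control vector
     { 0 1  14 15  2 3  12 13  4 5  10 11  6 7  8 9 }.  */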
35579
35580 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35581 in a single instruction. */
35582
35583 static bool
35584 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35585 {
35586 unsigned i, nelt = d->nelt;
35587 unsigned char perm2[MAX_VECT_LEN];
35588
35589 /* Check plain VEC_SELECT first, because AVX has instructions that could
35590 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35591 input where SEL+CONCAT may not. */
35592 if (d->op0 == d->op1)
35593 {
35594 int mask = nelt - 1;
35595 bool identity_perm = true;
35596 bool broadcast_perm = true;
35597
35598 for (i = 0; i < nelt; i++)
35599 {
35600 perm2[i] = d->perm[i] & mask;
35601 if (perm2[i] != i)
35602 identity_perm = false;
35603 if (perm2[i])
35604 broadcast_perm = false;
35605 }
35606
35607 if (identity_perm)
35608 {
35609 if (!d->testing_p)
35610 emit_move_insn (d->target, d->op0);
35611 return true;
35612 }
35613 else if (broadcast_perm && TARGET_AVX2)
35614 {
35615 /* Use vpbroadcast{b,w,d}. */
35616 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35617 switch (d->vmode)
35618 {
35619 case V32QImode:
35620 op = gen_lowpart (V16QImode, op);
35621 gen = gen_avx2_pbroadcastv32qi;
35622 break;
35623 case V16HImode:
35624 op = gen_lowpart (V8HImode, op);
35625 gen = gen_avx2_pbroadcastv16hi;
35626 break;
35627 case V8SImode:
35628 op = gen_lowpart (V4SImode, op);
35629 gen = gen_avx2_pbroadcastv8si;
35630 break;
35631 case V16QImode:
35632 gen = gen_avx2_pbroadcastv16qi;
35633 break;
35634 case V8HImode:
35635 gen = gen_avx2_pbroadcastv8hi;
35636 break;
35637 /* For other modes, prefer the other shuffles this function creates. */
35638 default: break;
35639 }
35640 if (gen != NULL)
35641 {
35642 if (!d->testing_p)
35643 emit_insn (gen (d->target, op));
35644 return true;
35645 }
35646 }
35647
35648 if (expand_vselect (d->target, d->op0, perm2, nelt))
35649 return true;
35650
35651 /* There are plenty of patterns in sse.md that are written for
35652 SEL+CONCAT and are not replicated for a single op. Perhaps
35653 that should be changed, to avoid the nastiness here. */
35654
35655 /* Recognize interleave style patterns, which means incrementing
35656 every other permutation operand. */
35657 for (i = 0; i < nelt; i += 2)
35658 {
35659 perm2[i] = d->perm[i] & mask;
35660 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35661 }
35662 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35663 return true;
35664
35665 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35666 if (nelt >= 4)
35667 {
35668 for (i = 0; i < nelt; i += 4)
35669 {
35670 perm2[i + 0] = d->perm[i + 0] & mask;
35671 perm2[i + 1] = d->perm[i + 1] & mask;
35672 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35673 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35674 }
35675
35676 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35677 return true;
35678 }
35679 }
35680
35681 /* Finally, try the fully general two operand permute. */
35682 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35683 return true;
35684
35685 /* Recognize interleave style patterns with reversed operands. */
35686 if (d->op0 != d->op1)
35687 {
35688 for (i = 0; i < nelt; ++i)
35689 {
35690 unsigned e = d->perm[i];
35691 if (e >= nelt)
35692 e -= nelt;
35693 else
35694 e += nelt;
35695 perm2[i] = e;
35696 }
35697
35698 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35699 return true;
35700 }
35701
35702 /* Try the SSE4.1 blend variable merge instructions. */
35703 if (expand_vec_perm_blend (d))
35704 return true;
35705
35706 /* Try one of the AVX vpermil variable permutations. */
35707 if (expand_vec_perm_vpermil (d))
35708 return true;
35709
35710 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35711 vpshufb, vpermd or vpermq variable permutation. */
35712 if (expand_vec_perm_pshufb (d))
35713 return true;
35714
35715 return false;
35716 }
35717
35718 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35719 in terms of a pair of pshuflw + pshufhw instructions. */
35720
35721 static bool
35722 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35723 {
35724 unsigned char perm2[MAX_VECT_LEN];
35725 unsigned i;
35726 bool ok;
35727
35728 if (d->vmode != V8HImode || d->op0 != d->op1)
35729 return false;
35730
35731 /* The two permutations only operate in 64-bit lanes. */
35732 for (i = 0; i < 4; ++i)
35733 if (d->perm[i] >= 4)
35734 return false;
35735 for (i = 4; i < 8; ++i)
35736 if (d->perm[i] < 4)
35737 return false;
35738
35739 if (d->testing_p)
35740 return true;
35741
35742 /* Emit the pshuflw. */
35743 memcpy (perm2, d->perm, 4);
35744 for (i = 4; i < 8; ++i)
35745 perm2[i] = i;
35746 ok = expand_vselect (d->target, d->op0, perm2, 8);
35747 gcc_assert (ok);
35748
35749 /* Emit the pshufhw. */
35750 memcpy (perm2 + 4, d->perm + 4, 4);
35751 for (i = 0; i < 4; ++i)
35752 perm2[i] = i;
35753 ok = expand_vselect (d->target, d->target, perm2, 8);
35754 gcc_assert (ok);
35755
35756 return true;
35757 }
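/* For example, the V8HImode permutation { 2, 1, 3, 0, 5, 7, 4, 6 } keeps
   the low four words in the low half and the high four in the high half,
   so it is emitted as pshuflw with { 2, 1, 3, 0, 4, 5, 6, 7 } followed by
   pshufhw with { 0, 1, 2, 3, 5, 7, 4, 6 }.  */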
35758
35759 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35760 the permutation using the SSSE3 palignr instruction. This succeeds
35761 when all of the elements in PERM fit within one vector and we merely
35762 need to shift them down so that a single vector permutation has a
35763 chance to succeed. */
35764
35765 static bool
35766 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
35767 {
35768 unsigned i, nelt = d->nelt;
35769 unsigned min, max;
35770 bool in_order, ok;
35771 rtx shift;
35772
35773 /* Even with AVX, palignr only operates on 128-bit vectors. */
35774 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35775 return false;
35776
35777 min = nelt, max = 0;
35778 for (i = 0; i < nelt; ++i)
35779 {
35780 unsigned e = d->perm[i];
35781 if (e < min)
35782 min = e;
35783 if (e > max)
35784 max = e;
35785 }
35786 if (min == 0 || max - min >= nelt)
35787 return false;
35788
35789 /* Given that we have SSSE3, we know we'll be able to implement the
35790 single operand permutation after the palignr with pshufb. */
35791 if (d->testing_p)
35792 return true;
35793
35794 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
35795 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
35796 gen_lowpart (TImode, d->op1),
35797 gen_lowpart (TImode, d->op0), shift));
35798
35799 d->op0 = d->op1 = d->target;
35800
35801 in_order = true;
35802 for (i = 0; i < nelt; ++i)
35803 {
35804 unsigned e = d->perm[i] - min;
35805 if (e != i)
35806 in_order = false;
35807 d->perm[i] = e;
35808 }
35809
35810 /* Test for the degenerate case where the alignment by itself
35811 produces the desired permutation. */
35812 if (in_order)
35813 return true;
35814
35815 ok = expand_vec_perm_1 (d);
35816 gcc_assert (ok);
35817
35818 return ok;
35819 }
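/* For example, for V8HImode with d->perm == { 3, 4, 5, 6, 7, 8, 9, 10 }
   (eight consecutive words starting at element 3 of the op0/op1
   concatenation), min == 3, so the palignr above shifts the pair down by
   three words.  The remaining permutation is then the identity, and the
   in_order shortcut returns without needing the follow-up pshufb.  */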
35820
35821 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35822 a two vector permutation into a single vector permutation by using
35823 an interleave operation to merge the vectors. */
35824
35825 static bool
35826 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
35827 {
35828 struct expand_vec_perm_d dremap, dfinal;
35829 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
35830 unsigned HOST_WIDE_INT contents;
35831 unsigned char remap[2 * MAX_VECT_LEN];
35832 rtx seq;
35833 bool ok, same_halves = false;
35834
35835 if (GET_MODE_SIZE (d->vmode) == 16)
35836 {
35837 if (d->op0 == d->op1)
35838 return false;
35839 }
35840 else if (GET_MODE_SIZE (d->vmode) == 32)
35841 {
35842 if (!TARGET_AVX)
35843 return false;
35844 /* For 32-byte modes allow even d->op0 == d->op1.
35845 The lack of cross-lane shuffling in some instructions
35846 might prevent a single insn shuffle. */
35847 }
35848 else
35849 return false;
35850
35851 /* Examine from whence the elements come. */
35852 contents = 0;
35853 for (i = 0; i < nelt; ++i)
35854 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
35855
35856 memset (remap, 0xff, sizeof (remap));
35857 dremap = *d;
35858
35859 if (GET_MODE_SIZE (d->vmode) == 16)
35860 {
35861 unsigned HOST_WIDE_INT h1, h2, h3, h4;
35862
35863 /* Split the two input vectors into 4 halves. */
35864 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
35865 h2 = h1 << nelt2;
35866 h3 = h2 << nelt2;
35867 h4 = h3 << nelt2;
35868
35869 /* If all elements come from the low halves, use interleave low; similarly,
35870 use interleave high for the high halves. If the elements are from
35871 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
35872 if ((contents & (h1 | h3)) == contents)
35873 {
35874 /* punpckl* */
35875 for (i = 0; i < nelt2; ++i)
35876 {
35877 remap[i] = i * 2;
35878 remap[i + nelt] = i * 2 + 1;
35879 dremap.perm[i * 2] = i;
35880 dremap.perm[i * 2 + 1] = i + nelt;
35881 }
35882 if (!TARGET_SSE2 && d->vmode == V4SImode)
35883 dremap.vmode = V4SFmode;
35884 }
35885 else if ((contents & (h2 | h4)) == contents)
35886 {
35887 /* punpckh* */
35888 for (i = 0; i < nelt2; ++i)
35889 {
35890 remap[i + nelt2] = i * 2;
35891 remap[i + nelt + nelt2] = i * 2 + 1;
35892 dremap.perm[i * 2] = i + nelt2;
35893 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
35894 }
35895 if (!TARGET_SSE2 && d->vmode == V4SImode)
35896 dremap.vmode = V4SFmode;
35897 }
35898 else if ((contents & (h1 | h4)) == contents)
35899 {
35900 /* shufps */
35901 for (i = 0; i < nelt2; ++i)
35902 {
35903 remap[i] = i;
35904 remap[i + nelt + nelt2] = i + nelt2;
35905 dremap.perm[i] = i;
35906 dremap.perm[i + nelt2] = i + nelt + nelt2;
35907 }
35908 if (nelt != 4)
35909 {
35910 /* shufpd */
35911 dremap.vmode = V2DImode;
35912 dremap.nelt = 2;
35913 dremap.perm[0] = 0;
35914 dremap.perm[1] = 3;
35915 }
35916 }
35917 else if ((contents & (h2 | h3)) == contents)
35918 {
35919 /* shufps */
35920 for (i = 0; i < nelt2; ++i)
35921 {
35922 remap[i + nelt2] = i;
35923 remap[i + nelt] = i + nelt2;
35924 dremap.perm[i] = i + nelt2;
35925 dremap.perm[i + nelt2] = i + nelt;
35926 }
35927 if (nelt != 4)
35928 {
35929 /* shufpd */
35930 dremap.vmode = V2DImode;
35931 dremap.nelt = 2;
35932 dremap.perm[0] = 1;
35933 dremap.perm[1] = 2;
35934 }
35935 }
35936 else
35937 return false;
35938 }
35939 else
35940 {
35941 unsigned int nelt4 = nelt / 4, nzcnt = 0;
35942 unsigned HOST_WIDE_INT q[8];
35943 unsigned int nonzero_halves[4];
35944
35945 /* Split the two input vectors into 8 quarters. */
35946 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
35947 for (i = 1; i < 8; ++i)
35948 q[i] = q[0] << (nelt4 * i);
35949 for (i = 0; i < 4; ++i)
35950 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
35951 {
35952 nonzero_halves[nzcnt] = i;
35953 ++nzcnt;
35954 }
35955
35956 if (nzcnt == 1)
35957 {
35958 gcc_assert (d->op0 == d->op1);
35959 nonzero_halves[1] = nonzero_halves[0];
35960 same_halves = true;
35961 }
35962 else if (d->op0 == d->op1)
35963 {
35964 gcc_assert (nonzero_halves[0] == 0);
35965 gcc_assert (nonzero_halves[1] == 1);
35966 }
35967
35968 if (nzcnt <= 2)
35969 {
35970 if (d->perm[0] / nelt2 == nonzero_halves[1])
35971 {
35972 /* Attempt to increase the likelihood that the dfinal
35973 shuffle will be intra-lane. */
35974 char tmph = nonzero_halves[0];
35975 nonzero_halves[0] = nonzero_halves[1];
35976 nonzero_halves[1] = tmph;
35977 }
35978
35979 /* vperm2f128 or vperm2i128. */
35980 for (i = 0; i < nelt2; ++i)
35981 {
35982 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
35983 remap[i + nonzero_halves[0] * nelt2] = i;
35984 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
35985 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
35986 }
35987
35988 if (d->vmode != V8SFmode
35989 && d->vmode != V4DFmode
35990 && d->vmode != V8SImode)
35991 {
35992 dremap.vmode = V8SImode;
35993 dremap.nelt = 8;
35994 for (i = 0; i < 4; ++i)
35995 {
35996 dremap.perm[i] = i + nonzero_halves[0] * 4;
35997 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
35998 }
35999 }
36000 }
36001 else if (d->op0 == d->op1)
36002 return false;
36003 else if (TARGET_AVX2
36004 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36005 {
36006 /* vpunpckl* */
36007 for (i = 0; i < nelt4; ++i)
36008 {
36009 remap[i] = i * 2;
36010 remap[i + nelt] = i * 2 + 1;
36011 remap[i + nelt2] = i * 2 + nelt2;
36012 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36013 dremap.perm[i * 2] = i;
36014 dremap.perm[i * 2 + 1] = i + nelt;
36015 dremap.perm[i * 2 + nelt2] = i + nelt2;
36016 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36017 }
36018 }
36019 else if (TARGET_AVX2
36020 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36021 {
36022 /* vpunpckh* */
36023 for (i = 0; i < nelt4; ++i)
36024 {
36025 remap[i + nelt4] = i * 2;
36026 remap[i + nelt + nelt4] = i * 2 + 1;
36027 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36028 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36029 dremap.perm[i * 2] = i + nelt4;
36030 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36031 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36032 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36033 }
36034 }
36035 else
36036 return false;
36037 }
36038
36039 /* Use the remapping array set up above to move the elements from their
36040 swizzled locations into their final destinations. */
36041 dfinal = *d;
36042 for (i = 0; i < nelt; ++i)
36043 {
36044 unsigned e = remap[d->perm[i]];
36045 gcc_assert (e < nelt);
36046 /* If same_halves is true, both halves of the remapped vector are the
36047 same. Avoid cross-lane accesses if possible. */
36048 if (same_halves && i >= nelt2)
36049 {
36050 gcc_assert (e < nelt2);
36051 dfinal.perm[i] = e + nelt2;
36052 }
36053 else
36054 dfinal.perm[i] = e;
36055 }
36056 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36057 dfinal.op1 = dfinal.op0;
36058 dremap.target = dfinal.op0;
36059
36060 /* Test if the final remap can be done with a single insn. For V4SFmode or
36061 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36062 start_sequence ();
36063 ok = expand_vec_perm_1 (&dfinal);
36064 seq = get_insns ();
36065 end_sequence ();
36066
36067 if (!ok)
36068 return false;
36069
36070 if (d->testing_p)
36071 return true;
36072
36073 if (dremap.vmode != dfinal.vmode)
36074 {
36075 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36076 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36077 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36078 }
36079
36080 ok = expand_vec_perm_1 (&dremap);
36081 gcc_assert (ok);
36082
36083 emit_insn (seq);
36084 return true;
36085 }
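/* For example, the two-operand V4SImode permutation { 1, 5, 0, 4 } draws
   only on the low halves of both inputs, so the code above first emits
   punpckldq (dremap.perm == { 0, 4, 1, 5 }) into a temporary and then a
   pshufd with dfinal.perm == { 2, 3, 0, 1 } to put the interleaved
   elements into the requested order.  */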
36086
36087 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36088 a single vector cross-lane permutation into vpermq followed
36089 by any of the single insn permutations. */
36090
36091 static bool
36092 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36093 {
36094 struct expand_vec_perm_d dremap, dfinal;
36095 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36096 unsigned contents[2];
36097 bool ok;
36098
36099 if (!(TARGET_AVX2
36100 && (d->vmode == V32QImode || d->vmode == V16HImode)
36101 && d->op0 == d->op1))
36102 return false;
36103
36104 contents[0] = 0;
36105 contents[1] = 0;
36106 for (i = 0; i < nelt2; ++i)
36107 {
36108 contents[0] |= 1u << (d->perm[i] / nelt4);
36109 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36110 }
36111
36112 for (i = 0; i < 2; ++i)
36113 {
36114 unsigned int cnt = 0;
36115 for (j = 0; j < 4; ++j)
36116 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36117 return false;
36118 }
36119
36120 if (d->testing_p)
36121 return true;
36122
36123 dremap = *d;
36124 dremap.vmode = V4DImode;
36125 dremap.nelt = 4;
36126 dremap.target = gen_reg_rtx (V4DImode);
36127 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36128 dremap.op1 = dremap.op0;
36129 for (i = 0; i < 2; ++i)
36130 {
36131 unsigned int cnt = 0;
36132 for (j = 0; j < 4; ++j)
36133 if ((contents[i] & (1u << j)) != 0)
36134 dremap.perm[2 * i + cnt++] = j;
36135 for (; cnt < 2; ++cnt)
36136 dremap.perm[2 * i + cnt] = 0;
36137 }
36138
36139 dfinal = *d;
36140 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36141 dfinal.op1 = dfinal.op0;
36142 for (i = 0, j = 0; i < nelt; ++i)
36143 {
36144 if (i == nelt2)
36145 j = 2;
36146 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36147 if ((d->perm[i] / nelt4) == dremap.perm[j])
36148 ;
36149 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36150 dfinal.perm[i] |= nelt4;
36151 else
36152 gcc_unreachable ();
36153 }
36154
36155 ok = expand_vec_perm_1 (&dremap);
36156 gcc_assert (ok);
36157
36158 ok = expand_vec_perm_1 (&dfinal);
36159 gcc_assert (ok);
36160
36161 return true;
36162 }
36163
36164 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36165 a two vector permutation using 2 intra-lane interleave insns
36166 and cross-lane shuffle for 32-byte vectors. */
36167
36168 static bool
36169 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36170 {
36171 unsigned i, nelt;
36172 rtx (*gen) (rtx, rtx, rtx);
36173
36174 if (d->op0 == d->op1)
36175 return false;
36176 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36177 ;
36178 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36179 ;
36180 else
36181 return false;
36182
36183 nelt = d->nelt;
36184 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36185 return false;
36186 for (i = 0; i < nelt; i += 2)
36187 if (d->perm[i] != d->perm[0] + i / 2
36188 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36189 return false;
36190
36191 if (d->testing_p)
36192 return true;
36193
36194 switch (d->vmode)
36195 {
36196 case V32QImode:
36197 if (d->perm[0])
36198 gen = gen_vec_interleave_highv32qi;
36199 else
36200 gen = gen_vec_interleave_lowv32qi;
36201 break;
36202 case V16HImode:
36203 if (d->perm[0])
36204 gen = gen_vec_interleave_highv16hi;
36205 else
36206 gen = gen_vec_interleave_lowv16hi;
36207 break;
36208 case V8SImode:
36209 if (d->perm[0])
36210 gen = gen_vec_interleave_highv8si;
36211 else
36212 gen = gen_vec_interleave_lowv8si;
36213 break;
36214 case V4DImode:
36215 if (d->perm[0])
36216 gen = gen_vec_interleave_highv4di;
36217 else
36218 gen = gen_vec_interleave_lowv4di;
36219 break;
36220 case V8SFmode:
36221 if (d->perm[0])
36222 gen = gen_vec_interleave_highv8sf;
36223 else
36224 gen = gen_vec_interleave_lowv8sf;
36225 break;
36226 case V4DFmode:
36227 if (d->perm[0])
36228 gen = gen_vec_interleave_highv4df;
36229 else
36230 gen = gen_vec_interleave_lowv4df;
36231 break;
36232 default:
36233 gcc_unreachable ();
36234 }
36235
36236 emit_insn (gen (d->target, d->op0, d->op1));
36237 return true;
36238 }
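/* For example, the V4DFmode permutation { 0, 4, 1, 5 } passes the checks
   above with d->perm[0] == 0 and is emitted as vec_interleave_lowv4df,
   which, per the comment above, expands into two intra-lane interleave
   insns and a cross-lane shuffle.  */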
36239
36240 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36241 permutation with two pshufb insns and an ior. We should have already
36242 failed all two instruction sequences. */
36243
36244 static bool
36245 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36246 {
36247 rtx rperm[2][16], vperm, l, h, op, m128;
36248 unsigned int i, nelt, eltsz;
36249
36250 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36251 return false;
36252 gcc_assert (d->op0 != d->op1);
36253
36254 nelt = d->nelt;
36255 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36256
36257 /* Generate two permutation masks. If the required element is within
36258 the given vector it is shuffled into the proper lane. If the required
36259 element is in the other vector, force a zero into the lane by setting
36260 bit 7 in the permutation mask. */
36261 m128 = GEN_INT (-128);
36262 for (i = 0; i < nelt; ++i)
36263 {
36264 unsigned j, e = d->perm[i];
36265 unsigned which = (e >= nelt);
36266 if (e >= nelt)
36267 e -= nelt;
36268
36269 for (j = 0; j < eltsz; ++j)
36270 {
36271 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36272 rperm[1-which][i*eltsz + j] = m128;
36273 }
36274 }
36275
36276 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36277 vperm = force_reg (V16QImode, vperm);
36278
36279 l = gen_reg_rtx (V16QImode);
36280 op = gen_lowpart (V16QImode, d->op0);
36281 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36282
36283 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36284 vperm = force_reg (V16QImode, vperm);
36285
36286 h = gen_reg_rtx (V16QImode);
36287 op = gen_lowpart (V16QImode, d->op1);
36288 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36289
36290 op = gen_lowpart (V16QImode, d->target);
36291 emit_insn (gen_iorv16qi3 (op, l, h));
36292
36293 return true;
36294 }
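/* For example, the V8HImode extract-even permutation
   { 0, 2, 4, 6, 8, 10, 12, 14 } produces the two byte masks
     for op0: { 0 1  4 5  8 9  12 13  -128 x8 }
     for op1: { -128 x8  0 1  4 5  8 9  12 13 }
   so each pshufb keeps only the words owned by its operand (zeroing the
   rest via bit 7) and the final por merges the two halves.  */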
36295
36296 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36297 with two vpshufb insns, vpermq and vpor. We should have already failed
36298 all two or three instruction sequences. */
36299
36300 static bool
36301 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36302 {
36303 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36304 unsigned int i, nelt, eltsz;
36305
36306 if (!TARGET_AVX2
36307 || d->op0 != d->op1
36308 || (d->vmode != V32QImode && d->vmode != V16HImode))
36309 return false;
36310
36311 if (d->testing_p)
36312 return true;
36313
36314 nelt = d->nelt;
36315 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36316
36317 /* Generate two permutation masks. If the required element is within
36318 the same lane, it is shuffled in. If the required element is from the
36319 other lane, force a zero by setting bit 7 in the permutation mask.
36320 The other mask has a non-negative element wherever an element is
36321 requested from the other lane, but that element is also moved to the
36322 other lane, so that the result of vpshufb can have its two V2TImode
36323 halves swapped. */
36324 m128 = GEN_INT (-128);
36325 for (i = 0; i < nelt; ++i)
36326 {
36327 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36328 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36329
36330 for (j = 0; j < eltsz; ++j)
36331 {
36332 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36333 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36334 }
36335 }
36336
36337 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36338 vperm = force_reg (V32QImode, vperm);
36339
36340 h = gen_reg_rtx (V32QImode);
36341 op = gen_lowpart (V32QImode, d->op0);
36342 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36343
36344 /* Swap the 128-bit lanes of h into hp. */
36345 hp = gen_reg_rtx (V4DImode);
36346 op = gen_lowpart (V4DImode, h);
36347 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36348 const1_rtx));
36349
36350 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36351 vperm = force_reg (V32QImode, vperm);
36352
36353 l = gen_reg_rtx (V32QImode);
36354 op = gen_lowpart (V32QImode, d->op0);
36355 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36356
36357 op = gen_lowpart (V32QImode, d->target);
36358 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36359
36360 return true;
36361 }
36362
36363 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36364 and extract-odd permutations of two V32QImode or V16HImode operands
36365 with two vpshufb insns, vpor and vpermq. We should have already
36366 failed all two or three instruction sequences. */
36367
36368 static bool
36369 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36370 {
36371 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36372 unsigned int i, nelt, eltsz;
36373
36374 if (!TARGET_AVX2
36375 || d->op0 == d->op1
36376 || (d->vmode != V32QImode && d->vmode != V16HImode))
36377 return false;
36378
36379 for (i = 0; i < d->nelt; ++i)
36380 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36381 return false;
36382
36383 if (d->testing_p)
36384 return true;
36385
36386 nelt = d->nelt;
36387 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36388
36389 /* Generate two permutation masks. In the first permutation mask
36390 the first quarter will contain indexes for the first half
36391 of the op0, the second quarter will contain bit 7 set, third quarter
36392 will contain indexes for the second half of the op0 and the
36393 last quarter bit 7 set. In the second permutation mask
36394 the first quarter will contain bit 7 set, the second quarter
36395 indexes for the first half of the op1, the third quarter bit 7 set
36396 and last quarter indexes for the second half of the op1.
36397 I.e. the first mask e.g. for V32QImode extract even will be:
36398 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36399 (all values masked with 0xf except for -128) and second mask
36400 for extract even will be
36401 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36402 m128 = GEN_INT (-128);
36403 for (i = 0; i < nelt; ++i)
36404 {
36405 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36406 unsigned which = d->perm[i] >= nelt;
36407 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36408
36409 for (j = 0; j < eltsz; ++j)
36410 {
36411 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36412 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36413 }
36414 }
36415
36416 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36417 vperm = force_reg (V32QImode, vperm);
36418
36419 l = gen_reg_rtx (V32QImode);
36420 op = gen_lowpart (V32QImode, d->op0);
36421 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36422
36423 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36424 vperm = force_reg (V32QImode, vperm);
36425
36426 h = gen_reg_rtx (V32QImode);
36427 op = gen_lowpart (V32QImode, d->op1);
36428 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36429
36430 ior = gen_reg_rtx (V32QImode);
36431 emit_insn (gen_iorv32qi3 (ior, l, h));
36432
36433 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36434 op = gen_lowpart (V4DImode, d->target);
36435 ior = gen_lowpart (V4DImode, ior);
36436 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36437 const1_rtx, GEN_INT (3)));
36438
36439 return true;
36440 }
36441
36442 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36443 and extract-odd permutations. */
36444
36445 static bool
36446 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36447 {
36448 rtx t1, t2, t3;
36449
36450 switch (d->vmode)
36451 {
36452 case V4DFmode:
36453 t1 = gen_reg_rtx (V4DFmode);
36454 t2 = gen_reg_rtx (V4DFmode);
36455
36456 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36457 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36458 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36459
36460 /* Now an unpck[lh]pd will produce the result required. */
36461 if (odd)
36462 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36463 else
36464 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36465 emit_insn (t3);
36466 break;
36467
36468 case V8SFmode:
36469 {
36470 int mask = odd ? 0xdd : 0x88;
36471
36472 t1 = gen_reg_rtx (V8SFmode);
36473 t2 = gen_reg_rtx (V8SFmode);
36474 t3 = gen_reg_rtx (V8SFmode);
36475
36476 /* Shuffle within the 128-bit lanes to produce:
36477 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36478 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36479 GEN_INT (mask)));
36480
36481 /* Shuffle the lanes around to produce:
36482 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36483 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36484 GEN_INT (0x3)));
36485
36486 /* Shuffle within the 128-bit lanes to produce:
36487 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36488 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36489
36490 /* Shuffle within the 128-bit lanes to produce:
36491 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36492 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36493
36494 /* Shuffle the lanes around to produce:
36495 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36496 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36497 GEN_INT (0x20)));
36498 }
36499 break;
36500
36501 case V2DFmode:
36502 case V4SFmode:
36503 case V2DImode:
36504 case V4SImode:
36505 /* These are always directly implementable by expand_vec_perm_1. */
36506 gcc_unreachable ();
36507
36508 case V8HImode:
36509 if (TARGET_SSSE3)
36510 return expand_vec_perm_pshufb2 (d);
36511 else
36512 {
36513 /* We need 2*log2(N)-1 operations to achieve odd/even
36514 with interleave. */
36515 t1 = gen_reg_rtx (V8HImode);
36516 t2 = gen_reg_rtx (V8HImode);
36517 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36518 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36519 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36520 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36521 if (odd)
36522 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36523 else
36524 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36525 emit_insn (t3);
36526 }
36527 break;
36528
36529 case V16QImode:
36530 if (TARGET_SSSE3)
36531 return expand_vec_perm_pshufb2 (d);
36532 else
36533 {
36534 t1 = gen_reg_rtx (V16QImode);
36535 t2 = gen_reg_rtx (V16QImode);
36536 t3 = gen_reg_rtx (V16QImode);
36537 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36538 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36539 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36540 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36541 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36542 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36543 if (odd)
36544 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36545 else
36546 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36547 emit_insn (t3);
36548 }
36549 break;
36550
36551 case V16HImode:
36552 case V32QImode:
36553 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36554
36555 case V4DImode:
36556 if (!TARGET_AVX2)
36557 {
36558 struct expand_vec_perm_d d_copy = *d;
36559 d_copy.vmode = V4DFmode;
36560 d_copy.target = gen_lowpart (V4DFmode, d->target);
36561 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36562 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36563 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36564 }
36565
36566 t1 = gen_reg_rtx (V4DImode);
36567 t2 = gen_reg_rtx (V4DImode);
36568
36569 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36570 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36571 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36572
36573 /* Now a vpunpck[lh]qdq will produce the result required. */
36574 if (odd)
36575 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36576 else
36577 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36578 emit_insn (t3);
36579 break;
36580
36581 case V8SImode:
36582 if (!TARGET_AVX2)
36583 {
36584 struct expand_vec_perm_d d_copy = *d;
36585 d_copy.vmode = V8SFmode;
36586 d_copy.target = gen_lowpart (V8SFmode, d->target);
36587 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36588 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36589 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36590 }
36591
36592 t1 = gen_reg_rtx (V8SImode);
36593 t2 = gen_reg_rtx (V8SImode);
36594
36595 /* Shuffle the lanes around into
36596 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36597 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36598 gen_lowpart (V4DImode, d->op0),
36599 gen_lowpart (V4DImode, d->op1),
36600 GEN_INT (0x20)));
36601 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36602 gen_lowpart (V4DImode, d->op0),
36603 gen_lowpart (V4DImode, d->op1),
36604 GEN_INT (0x31)));
36605
36606 /* Swap the 2nd and 3rd position in each lane into
36607 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
36608 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36609 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36610 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36611 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36612
36613 /* Now a vpunpck[lh]qdq will produce
36614 { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f } respectively. */
36615 if (odd)
36616 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36617 gen_lowpart (V4DImode, t1),
36618 gen_lowpart (V4DImode, t2));
36619 else
36620 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36621 gen_lowpart (V4DImode, t1),
36622 gen_lowpart (V4DImode, t2));
36623 emit_insn (t3);
36624 break;
36625
36626 default:
36627 gcc_unreachable ();
36628 }
36629
36630 return true;
36631 }
36632
36633 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36634 extract-even and extract-odd permutations. */
36635
36636 static bool
36637 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36638 {
36639 unsigned i, odd, nelt = d->nelt;
36640
36641 odd = d->perm[0];
36642 if (odd != 0 && odd != 1)
36643 return false;
36644
36645 for (i = 1; i < nelt; ++i)
36646 if (d->perm[i] != 2 * i + odd)
36647 return false;
36648
36649 return expand_vec_perm_even_odd_1 (d, odd);
36650 }
36651
36652 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36653 permutations. We assume that expand_vec_perm_1 has already failed. */
36654
36655 static bool
36656 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36657 {
36658 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36659 enum machine_mode vmode = d->vmode;
36660 unsigned char perm2[4];
36661 rtx op0 = d->op0;
36662 bool ok;
36663
36664 switch (vmode)
36665 {
36666 case V4DFmode:
36667 case V8SFmode:
36668 /* These are special-cased in sse.md so that we can optionally
36669 use the vbroadcast instruction. They expand to two insns
36670 if the input happens to be in a register. */
36671 gcc_unreachable ();
36672
36673 case V2DFmode:
36674 case V2DImode:
36675 case V4SFmode:
36676 case V4SImode:
36677 /* These are always implementable using standard shuffle patterns. */
36678 gcc_unreachable ();
36679
36680 case V8HImode:
36681 case V16QImode:
36682 /* These can be implemented via interleave. We save one insn by
36683 stopping once we have promoted to V4SImode and then use pshufd. */
36684 do
36685 {
36686 optab otab = vec_interleave_low_optab;
36687
36688 if (elt >= nelt2)
36689 {
36690 otab = vec_interleave_high_optab;
36691 elt -= nelt2;
36692 }
36693 nelt2 /= 2;
36694
36695 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
36696 vmode = get_mode_wider_vector (vmode);
36697 op0 = gen_lowpart (vmode, op0);
36698 }
36699 while (vmode != V4SImode);
36700
36701 memset (perm2, elt, 4);
36702 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36703 gcc_assert (ok);
36704 return true;
36705
36706 case V32QImode:
36707 case V16HImode:
36708 case V8SImode:
36709 case V4DImode:
36710 /* For AVX2, broadcasts of the first element should already have been
36711 handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
36712 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36713 return false;
36714
36715 default:
36716 gcc_unreachable ();
36717 }
36718 }
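/* For example, broadcasting element 5 of a V8HImode vector takes one trip
   through the loop above: elt >= nelt2 selects vec_interleave_high, which
   duplicates words 4..7, and after widening to V4SImode the final pshufd
   with { 1, 1, 1, 1 } replicates the dword holding two copies of the
   original element 5.  */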
36719
36720 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36721 broadcast permutations. */
36722
36723 static bool
36724 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
36725 {
36726 unsigned i, elt, nelt = d->nelt;
36727
36728 if (d->op0 != d->op1)
36729 return false;
36730
36731 elt = d->perm[0];
36732 for (i = 1; i < nelt; ++i)
36733 if (d->perm[i] != elt)
36734 return false;
36735
36736 return expand_vec_perm_broadcast_1 (d);
36737 }
36738
36739 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
36740 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
36741 all the shorter instruction sequences. */
36742
36743 static bool
36744 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
36745 {
36746 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
36747 unsigned int i, nelt, eltsz;
36748 bool used[4];
36749
36750 if (!TARGET_AVX2
36751 || d->op0 == d->op1
36752 || (d->vmode != V32QImode && d->vmode != V16HImode))
36753 return false;
36754
36755 if (d->testing_p)
36756 return true;
36757
36758 nelt = d->nelt;
36759 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36760
36761 /* Generate 4 permutation masks. If the required element is within
36762 the same lane, it is shuffled in. If the required element is from the
36763 other lane, force a zero by setting bit 7 in the permutation mask.
36764 The other mask has a non-negative element wherever an element is
36765 requested from the other lane, but that element is also moved to the
36766 other lane, so that the result of vpshufb can have its two V2TImode
36767 halves swapped. */
36768 m128 = GEN_INT (-128);
36769 for (i = 0; i < 32; ++i)
36770 {
36771 rperm[0][i] = m128;
36772 rperm[1][i] = m128;
36773 rperm[2][i] = m128;
36774 rperm[3][i] = m128;
36775 }
36776 used[0] = false;
36777 used[1] = false;
36778 used[2] = false;
36779 used[3] = false;
36780 for (i = 0; i < nelt; ++i)
36781 {
36782 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36783 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36784 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
36785
36786 for (j = 0; j < eltsz; ++j)
36787 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
36788 used[which] = true;
36789 }
36790
36791 for (i = 0; i < 2; ++i)
36792 {
36793 if (!used[2 * i + 1])
36794 {
36795 h[i] = NULL_RTX;
36796 continue;
36797 }
36798 vperm = gen_rtx_CONST_VECTOR (V32QImode,
36799 gen_rtvec_v (32, rperm[2 * i + 1]));
36800 vperm = force_reg (V32QImode, vperm);
36801 h[i] = gen_reg_rtx (V32QImode);
36802 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36803 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
36804 }
36805
36806 /* Swap the 128-bit lanes of h[X]. */
36807 for (i = 0; i < 2; ++i)
36808 {
36809 if (h[i] == NULL_RTX)
36810 continue;
36811 op = gen_reg_rtx (V4DImode);
36812 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
36813 const2_rtx, GEN_INT (3), const0_rtx,
36814 const1_rtx));
36815 h[i] = gen_lowpart (V32QImode, op);
36816 }
36817
36818 for (i = 0; i < 2; ++i)
36819 {
36820 if (!used[2 * i])
36821 {
36822 l[i] = NULL_RTX;
36823 continue;
36824 }
36825 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
36826 vperm = force_reg (V32QImode, vperm);
36827 l[i] = gen_reg_rtx (V32QImode);
36828 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
36829 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
36830 }
36831
36832 for (i = 0; i < 2; ++i)
36833 {
36834 if (h[i] && l[i])
36835 {
36836 op = gen_reg_rtx (V32QImode);
36837 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
36838 l[i] = op;
36839 }
36840 else if (h[i])
36841 l[i] = h[i];
36842 }
36843
36844 gcc_assert (l[0] && l[1]);
36845 op = gen_lowpart (V32QImode, d->target);
36846 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
36847 return true;
36848 }
36849
36850 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
36851 With all of the interface bits taken care of, perform the expansion
36852 in D and return true on success. */
36853
36854 static bool
36855 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
36856 {
36857 /* Try a single instruction expansion. */
36858 if (expand_vec_perm_1 (d))
36859 return true;
36860
36861 /* Try sequences of two instructions. */
36862
36863 if (expand_vec_perm_pshuflw_pshufhw (d))
36864 return true;
36865
36866 if (expand_vec_perm_palignr (d))
36867 return true;
36868
36869 if (expand_vec_perm_interleave2 (d))
36870 return true;
36871
36872 if (expand_vec_perm_broadcast (d))
36873 return true;
36874
36875 if (expand_vec_perm_vpermq_perm_1 (d))
36876 return true;
36877
36878 /* Try sequences of three instructions. */
36879
36880 if (expand_vec_perm_pshufb2 (d))
36881 return true;
36882
36883 if (expand_vec_perm_interleave3 (d))
36884 return true;
36885
36886 /* Try sequences of four instructions. */
36887
36888 if (expand_vec_perm_vpshufb2_vpermq (d))
36889 return true;
36890
36891 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
36892 return true;
36893
36894 /* ??? Look for narrow permutations whose element orderings would
36895 allow the promotion to a wider mode. */
36896
36897 /* ??? Look for sequences of interleave or a wider permute that place
36898 the data into the correct lanes for a half-vector shuffle like
36899 pshuf[lh]w or vpermilps. */
36900
36901 /* ??? Look for sequences of interleave that produce the desired results.
36902 The combinatorics of punpck[lh] get pretty ugly... */
36903
36904 if (expand_vec_perm_even_odd (d))
36905 return true;
36906
36907 /* Even longer sequences. */
36908 if (expand_vec_perm_vpshufb4_vpermq2 (d))
36909 return true;
36910
36911 return false;
36912 }
36913
36914 bool
36915 ix86_expand_vec_perm_const (rtx operands[4])
36916 {
36917 struct expand_vec_perm_d d;
36918 unsigned char perm[MAX_VECT_LEN];
36919 int i, nelt, which;
36920 rtx sel;
36921
36922 d.target = operands[0];
36923 d.op0 = operands[1];
36924 d.op1 = operands[2];
36925 sel = operands[3];
36926
36927 d.vmode = GET_MODE (d.target);
36928 gcc_assert (VECTOR_MODE_P (d.vmode));
36929 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
36930 d.testing_p = false;
36931
36932 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
36933 gcc_assert (XVECLEN (sel, 0) == nelt);
36934 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
36935
36936 for (i = which = 0; i < nelt; ++i)
36937 {
36938 rtx e = XVECEXP (sel, 0, i);
36939 int ei = INTVAL (e) & (2 * nelt - 1);
36940
36941 which |= (ei < nelt ? 1 : 2);
36942 d.perm[i] = ei;
36943 perm[i] = ei;
36944 }
36945
36946 switch (which)
36947 {
36948 default:
36949 gcc_unreachable ();
36950
36951 case 3:
36952 if (!rtx_equal_p (d.op0, d.op1))
36953 break;
36954
36955 /* The elements of PERM do not suggest that only the first operand
36956 is used, but both operands are identical. Allow easier matching
36957 of the permutation by folding the permutation into the single
36958 input vector. */
36959 for (i = 0; i < nelt; ++i)
36960 if (d.perm[i] >= nelt)
36961 d.perm[i] -= nelt;
36962 /* FALLTHRU */
36963
36964 case 1:
36965 d.op1 = d.op0;
36966 break;
36967
36968 case 2:
36969 for (i = 0; i < nelt; ++i)
36970 d.perm[i] -= nelt;
36971 d.op0 = d.op1;
36972 break;
36973 }
36974
36975 if (ix86_expand_vec_perm_const_1 (&d))
36976 return true;
36977
36978 /* If the mask says both arguments are needed, but they are the same,
36979 the above tried to expand with d.op0 == d.op1. If that didn't work,
36980 retry with d.op0 != d.op1 as that is what testing has been done with. */
36981 if (which == 3 && d.op0 == d.op1)
36982 {
36983 rtx seq;
36984 bool ok;
36985
36986 memcpy (d.perm, perm, sizeof (perm));
36987 d.op1 = gen_reg_rtx (d.vmode);
36988 start_sequence ();
36989 ok = ix86_expand_vec_perm_const_1 (&d);
36990 seq = get_insns ();
36991 end_sequence ();
36992 if (ok)
36993 {
36994 emit_move_insn (d.op1, d.op0);
36995 emit_insn (seq);
36996 return true;
36997 }
36998 }
36999
37000 return false;
37001 }
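
/* Editorial illustration (not part of the original source): a minimal,
   self-contained sketch of how a constant selector is classified by the
   `which' bits and folded onto a single operand, mirroring the switch
   above.  The helper name and the sample selector are hypothetical.  */
#if 0
#include <stdio.h>

/* Classify a selector for NELT-element vectors: bit 0 of the result is
   set when some index reads the first operand, bit 1 when some index
   reads the second.  When only the second operand (or two identical
   operands) is referenced, indices are rebased into 0..NELT-1.  */
static unsigned
classify_and_fold (unsigned char *perm, unsigned nelt, int same_operands)
{
  unsigned i, which = 0;

  for (i = 0; i < nelt; ++i)
    {
      perm[i] &= 2 * nelt - 1;
      which |= (perm[i] < nelt ? 1 : 2);
    }

  if (which == 2)
    for (i = 0; i < nelt; ++i)
      perm[i] -= nelt;
  else if (which == 3 && same_operands)
    for (i = 0; i < nelt; ++i)
      if (perm[i] >= nelt)
        perm[i] -= nelt;

  return which;
}

int
main (void)
{
  /* A V4SI interleave-low selector {0, 4, 1, 5}: elements come from
     both operands, so WHICH is 3.  */
  unsigned char sel[4] = { 0, 4, 1, 5 };
  unsigned which = classify_and_fold (sel, 4, 0);

  printf ("which = %u, perm = {%d,%d,%d,%d}\n",
          which, sel[0], sel[1], sel[2], sel[3]);
  return 0;
}
#endif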
37002
37003 /* Implement targetm.vectorize.vec_perm_const_ok. */
37004
37005 static bool
37006 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37007 const unsigned char *sel)
37008 {
37009 struct expand_vec_perm_d d;
37010 unsigned int i, nelt, which;
37011 bool ret, one_vec;
37012
37013 d.vmode = vmode;
37014 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37015 d.testing_p = true;
37016
37017 /* Given sufficient ISA support we can just return true here
37018 for selected vector modes. */
37019 if (GET_MODE_SIZE (d.vmode) == 16)
37020 {
37021 /* All implementable with a single vpperm insn. */
37022 if (TARGET_XOP)
37023 return true;
37024 /* All implementable with 2 pshufb + 1 ior. */
37025 if (TARGET_SSSE3)
37026 return true;
37027 /* All implementable with shufpd or unpck[lh]pd. */
37028 if (d.nelt == 2)
37029 return true;
37030 }
37031
37032 /* Extract the values from the vector CST into the permutation
37033 array in D. */
37034 memcpy (d.perm, sel, nelt);
37035 for (i = which = 0; i < nelt; ++i)
37036 {
37037 unsigned char e = d.perm[i];
37038 gcc_assert (e < 2 * nelt);
37039 which |= (e < nelt ? 1 : 2);
37040 }
37041
37042 /* For all elements from second vector, fold the elements to first. */
37043 if (which == 2)
37044 for (i = 0; i < nelt; ++i)
37045 d.perm[i] -= nelt;
37046
37047 /* Check whether the mask can be applied to the vector type. */
37048 one_vec = (which != 3);
37049
37050 /* Implementable with shufps or pshufd. */
37051 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37052 return true;
37053
37054 /* Otherwise we have to go through the motions and see if we can
37055 figure out how to generate the requested permutation. */
37056 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37057 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37058 if (!one_vec)
37059 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37060
37061 start_sequence ();
37062 ret = ix86_expand_vec_perm_const_1 (&d);
37063 end_sequence ();
37064
37065 return ret;
37066 }
37067
37068 void
37069 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37070 {
37071 struct expand_vec_perm_d d;
37072 unsigned i, nelt;
37073
37074 d.target = targ;
37075 d.op0 = op0;
37076 d.op1 = op1;
37077 d.vmode = GET_MODE (targ);
37078 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37079 d.testing_p = false;
37080
37081 for (i = 0; i < nelt; ++i)
37082 d.perm[i] = i * 2 + odd;
37083
37084 /* We'll either be able to implement the permutation directly... */
37085 if (expand_vec_perm_1 (&d))
37086 return;
37087
37088 /* ... or we use the special-case patterns. */
37089 expand_vec_perm_even_odd_1 (&d, odd);
37090 }
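
/* Editorial illustration (not part of the original source): the selector
   built by ix86_expand_vec_extract_even_odd above is simply i * 2 + ODD
   across the concatenated operands; the values printed below are for two
   hypothetical V4SI inputs and ODD == 1.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* OP0 provides indices 0..3 and OP1 indices 4..7; extracting the odd
     elements therefore uses the selector {1, 3, 5, 7}.  */
  unsigned nelt = 4, odd = 1, i;

  for (i = 0; i < nelt; ++i)
    printf ("%u ", i * 2 + odd);
  printf ("\n");
  return 0;
}
#endif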
37091
37092 /* Expand an insert into a vector register through pinsr insn.
37093 Return true if successful. */
37094
37095 bool
37096 ix86_expand_pinsr (rtx *operands)
37097 {
37098 rtx dst = operands[0];
37099 rtx src = operands[3];
37100
37101 unsigned int size = INTVAL (operands[1]);
37102 unsigned int pos = INTVAL (operands[2]);
37103
37104 if (GET_CODE (dst) == SUBREG)
37105 {
37106 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37107 dst = SUBREG_REG (dst);
37108 }
37109
37110 if (GET_CODE (src) == SUBREG)
37111 src = SUBREG_REG (src);
37112
37113 switch (GET_MODE (dst))
37114 {
37115 case V16QImode:
37116 case V8HImode:
37117 case V4SImode:
37118 case V2DImode:
37119 {
37120 enum machine_mode srcmode, dstmode;
37121 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37122
37123 srcmode = mode_for_size (size, MODE_INT, 0);
37124
37125 switch (srcmode)
37126 {
37127 case QImode:
37128 if (!TARGET_SSE4_1)
37129 return false;
37130 dstmode = V16QImode;
37131 pinsr = gen_sse4_1_pinsrb;
37132 break;
37133
37134 case HImode:
37135 if (!TARGET_SSE2)
37136 return false;
37137 dstmode = V8HImode;
37138 pinsr = gen_sse2_pinsrw;
37139 break;
37140
37141 case SImode:
37142 if (!TARGET_SSE4_1)
37143 return false;
37144 dstmode = V4SImode;
37145 pinsr = gen_sse4_1_pinsrd;
37146 break;
37147
37148 case DImode:
37149 gcc_assert (TARGET_64BIT);
37150 if (!TARGET_SSE4_1)
37151 return false;
37152 dstmode = V2DImode;
37153 pinsr = gen_sse4_1_pinsrq;
37154 break;
37155
37156 default:
37157 return false;
37158 }
37159
37160 dst = gen_lowpart (dstmode, dst);
37161 src = gen_lowpart (srcmode, src);
37162
37163 pos /= size;
37164
37165 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37166 return true;
37167 }
37168
37169 default:
37170 return false;
37171 }
37172 }
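
/* Editorial illustration (not part of the original source): the kind of
   element insertion the expander above maps to a pinsr instruction,
   written with SSE2 intrinsics.  A 16-bit insert at bit position 32 has
   pos / size == 2, i.e. lane 2 and selector mask 1 << 2.  Assumes an
   SSE2-capable target; the values are hypothetical.  */
#if 0
#include <emmintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Insert the 16-bit value 0x1234 into lane 2 of a V8HI vector; the
     resulting pinsrw is the same instruction the expander selects for
     an equivalent bit-field insertion.  */
  __m128i v = _mm_set1_epi16 (0);
  short out[8];

  v = _mm_insert_epi16 (v, 0x1234, 2);
  _mm_storeu_si128 ((__m128i *) out, v);
  printf ("lane 2 = 0x%x\n", out[2]);
  return 0;
}
#endif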
37173 \f
37174 /* This function returns the calling-ABI-specific va_list type node.
37175 It returns the FNDECL specific va_list type. */
37176
37177 static tree
37178 ix86_fn_abi_va_list (tree fndecl)
37179 {
37180 if (!TARGET_64BIT)
37181 return va_list_type_node;
37182 gcc_assert (fndecl != NULL_TREE);
37183
37184 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37185 return ms_va_list_type_node;
37186 else
37187 return sysv_va_list_type_node;
37188 }
37189
37190 /* Returns the canonical va_list type specified by TYPE. If there
37191 is no valid TYPE provided, it returns NULL_TREE. */
37192
37193 static tree
37194 ix86_canonical_va_list_type (tree type)
37195 {
37196 tree wtype, htype;
37197
37198 /* Resolve references and pointers to va_list type. */
37199 if (TREE_CODE (type) == MEM_REF)
37200 type = TREE_TYPE (type);
37201 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37202 type = TREE_TYPE (type);
37203 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37204 type = TREE_TYPE (type);
37205
37206 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37207 {
37208 wtype = va_list_type_node;
37209 gcc_assert (wtype != NULL_TREE);
37210 htype = type;
37211 if (TREE_CODE (wtype) == ARRAY_TYPE)
37212 {
37213 /* If va_list is an array type, the argument may have decayed
37214 to a pointer type, e.g. by being passed to another function.
37215 In that case, unwrap both types so that we can compare the
37216 underlying records. */
37217 if (TREE_CODE (htype) == ARRAY_TYPE
37218 || POINTER_TYPE_P (htype))
37219 {
37220 wtype = TREE_TYPE (wtype);
37221 htype = TREE_TYPE (htype);
37222 }
37223 }
37224 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37225 return va_list_type_node;
37226 wtype = sysv_va_list_type_node;
37227 gcc_assert (wtype != NULL_TREE);
37228 htype = type;
37229 if (TREE_CODE (wtype) == ARRAY_TYPE)
37230 {
37231 /* If va_list is an array type, the argument may have decayed
37232 to a pointer type, e.g. by being passed to another function.
37233 In that case, unwrap both types so that we can compare the
37234 underlying records. */
37235 if (TREE_CODE (htype) == ARRAY_TYPE
37236 || POINTER_TYPE_P (htype))
37237 {
37238 wtype = TREE_TYPE (wtype);
37239 htype = TREE_TYPE (htype);
37240 }
37241 }
37242 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37243 return sysv_va_list_type_node;
37244 wtype = ms_va_list_type_node;
37245 gcc_assert (wtype != NULL_TREE);
37246 htype = type;
37247 if (TREE_CODE (wtype) == ARRAY_TYPE)
37248 {
37249 /* If va_list is an array type, the argument may have decayed
37250 to a pointer type, e.g. by being passed to another function.
37251 In that case, unwrap both types so that we can compare the
37252 underlying records. */
37253 if (TREE_CODE (htype) == ARRAY_TYPE
37254 || POINTER_TYPE_P (htype))
37255 {
37256 wtype = TREE_TYPE (wtype);
37257 htype = TREE_TYPE (htype);
37258 }
37259 }
37260 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37261 return ms_va_list_type_node;
37262 return NULL_TREE;
37263 }
37264 return std_canonical_va_list_type (type);
37265 }
37266
37267 /* Iterate through the target-specific builtin types for va_list.
37268 IDX denotes the iterator, *PTREE is set to the result type of
37269 the va_list builtin, and *PNAME to its internal type.
37270 Returns zero if there is no element for this index, otherwise
37271 IDX should be increased upon the next call.
37272 Note, do not iterate a base builtin's name like __builtin_va_list.
37273 Used from c_common_nodes_and_builtins. */
37274
37275 static int
37276 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37277 {
37278 if (TARGET_64BIT)
37279 {
37280 switch (idx)
37281 {
37282 default:
37283 break;
37284
37285 case 0:
37286 *ptree = ms_va_list_type_node;
37287 *pname = "__builtin_ms_va_list";
37288 return 1;
37289
37290 case 1:
37291 *ptree = sysv_va_list_type_node;
37292 *pname = "__builtin_sysv_va_list";
37293 return 1;
37294 }
37295 }
37296
37297 return 0;
37298 }
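
/* Editorial illustration (not part of the original source): an ordinary
   variadic function on x86-64 GNU/Linux uses the SysV va_list that the
   hooks above canonicalize; a function declared with
   __attribute__((ms_abi)) would instead use the __builtin_ms_va_list
   type enumerated above (with __builtin_ms_va_start/__builtin_ms_va_end
   in place of the standard macros).  The function below is hypothetical.  */
#if 0
#include <stdarg.h>
#include <stdio.h>

static int
sum_ints (int count, ...)
{
  va_list ap;
  int i, total = 0;

  va_start (ap, count);
  for (i = 0; i < count; ++i)
    total += va_arg (ap, int);
  va_end (ap);
  return total;
}

int
main (void)
{
  printf ("%d\n", sum_ints (3, 1, 2, 3));
  return 0;
}
#endif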
37299
37300 #undef TARGET_SCHED_DISPATCH
37301 #define TARGET_SCHED_DISPATCH has_dispatch
37302 #undef TARGET_SCHED_DISPATCH_DO
37303 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37304 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37305 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37306
37307 /* The size of the dispatch window is the total number of bytes of
37308 object code allowed in a window. */
37309 #define DISPATCH_WINDOW_SIZE 16
37310
37311 /* Number of dispatch windows considered for scheduling. */
37312 #define MAX_DISPATCH_WINDOWS 3
37313
37314 /* Maximum number of instructions in a window. */
37315 #define MAX_INSN 4
37316
37317 /* Maximum number of immediate operands in a window. */
37318 #define MAX_IMM 4
37319
37320 /* Maximum number of immediate bits allowed in a window. */
37321 #define MAX_IMM_SIZE 128
37322
37323 /* Maximum number of 32 bit immediates allowed in a window. */
37324 #define MAX_IMM_32 4
37325
37326 /* Maximum number of 64 bit immediates allowed in a window. */
37327 #define MAX_IMM_64 2
37328
37329 /* Maximum total of loads or prefetches allowed in a window. */
37330 #define MAX_LOAD 2
37331
37332 /* Maximum total of stores allowed in a window. */
37333 #define MAX_STORE 1
37334
37335 #undef BIG
37336 #define BIG 100
37337
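/* Editorial illustration (not part of the original source): a minimal
   sketch of the byte and instruction budgets defined above, applied to a
   toy instruction stream.  The byte lengths are made up; only the limits
   (16 bytes, 4 instructions per window) come from the defines.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* Budgets taken from DISPATCH_WINDOW_SIZE and MAX_INSN above.  */
  const int window_bytes = 16, window_insns = 4;
  /* Hypothetical instruction byte lengths for one basic block.  */
  int len[] = { 3, 7, 2, 5, 6, 4, 4, 1 };
  int n = sizeof (len) / sizeof (len[0]);
  int i, window = 0, bytes = 0, insns = 0;

  for (i = 0; i < n; i++)
    {
      /* Open a new window when either budget would be exceeded.  */
      if (insns + 1 > window_insns || bytes + len[i] > window_bytes)
        {
          window++;
          bytes = insns = 0;
        }
      bytes += len[i];
      insns++;
      printf ("insn %d (%d bytes) -> window %d\n", i, len[i], window);
    }
  return 0;
}
#endif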
37338
37339 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
37340 enum dispatch_group {
37341 disp_no_group = 0,
37342 disp_load,
37343 disp_store,
37344 disp_load_store,
37345 disp_prefetch,
37346 disp_imm,
37347 disp_imm_32,
37348 disp_imm_64,
37349 disp_branch,
37350 disp_cmp,
37351 disp_jcc,
37352 disp_last
37353 };
37354
37355 /* Number of allowable groups in a dispatch window. It is an array
37356 indexed by dispatch_group enum. 100 is used as a big number,
37357 because the number of these kinds of operations does not have any
37358 effect on the dispatch window, but we need them for other reasons in
37359 the table. */
37360 static unsigned int num_allowable_groups[disp_last] = {
37361 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37362 };
37363
37364 char group_name[disp_last + 1][16] = {
37365 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37366 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37367 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37368 };
37369
37370 /* Instruction path. */
37371 enum insn_path {
37372 no_path = 0,
37373 path_single, /* Single micro op. */
37374 path_double, /* Double micro op. */
37375 path_multi, /* Instructions with more than 2 micro ops. */
37376 last_path
37377 };
37378
37379 /* sched_insn_info defines a window to the instructions scheduled in
37380 the basic block. It contains a pointer to the insn_info table and
37381 the instruction scheduled.
37382
37383 Windows are allocated for each basic block and are linked
37384 together. */
37385 typedef struct sched_insn_info_s {
37386 rtx insn;
37387 enum dispatch_group group;
37388 enum insn_path path;
37389 int byte_len;
37390 int imm_bytes;
37391 } sched_insn_info;
37392
37393 /* Linked list of dispatch windows. This is a two-way list of
37394 dispatch windows of a basic block. It contains information about
37395 the number of uops in the window and the total number of
37396 instructions and of bytes in the object code for this dispatch
37397 window. */
37398 typedef struct dispatch_windows_s {
37399 int num_insn; /* Number of insns in the window. */
37400 int num_uops; /* Number of uops in the window. */
37401 int window_size; /* Number of bytes in the window. */
37402 int window_num; /* Window number, either 0 or 1. */
37403 int num_imm; /* Number of immediates in the window. */
37404 int num_imm_32; /* Number of 32 bit immediates in the window. */
37405 int num_imm_64; /* Number of 64 bit immediates in the window. */
37406 int imm_size; /* Total size in bytes of immediates in the window. */
37407 int num_loads; /* Total memory loads in the window. */
37408 int num_stores; /* Total memory stores in the window. */
37409 int violation; /* Violation exists in window. */
37410 sched_insn_info *window; /* Pointer to the window. */
37411 struct dispatch_windows_s *next;
37412 struct dispatch_windows_s *prev;
37413 } dispatch_windows;
37414
37415 /* Immediate values used in an insn. */
37416 typedef struct imm_info_s
37417 {
37418 int imm;
37419 int imm32;
37420 int imm64;
37421 } imm_info;
37422
37423 static dispatch_windows *dispatch_window_list;
37424 static dispatch_windows *dispatch_window_list1;
37425
37426 /* Get dispatch group of insn. */
37427
37428 static enum dispatch_group
37429 get_mem_group (rtx insn)
37430 {
37431 enum attr_memory memory;
37432
37433 if (INSN_CODE (insn) < 0)
37434 return disp_no_group;
37435 memory = get_attr_memory (insn);
37436 if (memory == MEMORY_STORE)
37437 return disp_store;
37438
37439 if (memory == MEMORY_LOAD)
37440 return disp_load;
37441
37442 if (memory == MEMORY_BOTH)
37443 return disp_load_store;
37444
37445 return disp_no_group;
37446 }
37447
37448 /* Return true if insn is a compare instruction. */
37449
37450 static bool
37451 is_cmp (rtx insn)
37452 {
37453 enum attr_type type;
37454
37455 type = get_attr_type (insn);
37456 return (type == TYPE_TEST
37457 || type == TYPE_ICMP
37458 || type == TYPE_FCMP
37459 || GET_CODE (PATTERN (insn)) == COMPARE);
37460 }
37461
37462 /* Return true if a dispatch violation was encountered. */
37463
37464 static bool
37465 dispatch_violation (void)
37466 {
37467 if (dispatch_window_list->next)
37468 return dispatch_window_list->next->violation;
37469 return dispatch_window_list->violation;
37470 }
37471
37472 /* Return true if insn is a branch instruction. */
37473
37474 static bool
37475 is_branch (rtx insn)
37476 {
37477 return (CALL_P (insn) || JUMP_P (insn));
37478 }
37479
37480 /* Return true if insn is a prefetch instruction. */
37481
37482 static bool
37483 is_prefetch (rtx insn)
37484 {
37485 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37486 }
37487
37488 /* This function initializes a dispatch window and the list container holding a
37489 pointer to the window. */
37490
37491 static void
37492 init_window (int window_num)
37493 {
37494 int i;
37495 dispatch_windows *new_list;
37496
37497 if (window_num == 0)
37498 new_list = dispatch_window_list;
37499 else
37500 new_list = dispatch_window_list1;
37501
37502 new_list->num_insn = 0;
37503 new_list->num_uops = 0;
37504 new_list->window_size = 0;
37505 new_list->next = NULL;
37506 new_list->prev = NULL;
37507 new_list->window_num = window_num;
37508 new_list->num_imm = 0;
37509 new_list->num_imm_32 = 0;
37510 new_list->num_imm_64 = 0;
37511 new_list->imm_size = 0;
37512 new_list->num_loads = 0;
37513 new_list->num_stores = 0;
37514 new_list->violation = false;
37515
37516 for (i = 0; i < MAX_INSN; i++)
37517 {
37518 new_list->window[i].insn = NULL;
37519 new_list->window[i].group = disp_no_group;
37520 new_list->window[i].path = no_path;
37521 new_list->window[i].byte_len = 0;
37522 new_list->window[i].imm_bytes = 0;
37523 }
37524 return;
37525 }
37526
37527 /* This function allocates and initializes a dispatch window and the
37528 list container holding a pointer to the window. */
37529
37530 static dispatch_windows *
37531 allocate_window (void)
37532 {
37533 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37534 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37535
37536 return new_list;
37537 }
37538
37539 /* This routine initializes the dispatch scheduling information. It
37540 initiates building dispatch scheduler tables and constructs the
37541 first dispatch window. */
37542
37543 static void
37544 init_dispatch_sched (void)
37545 {
37546 /* Allocate a dispatch list and a window. */
37547 dispatch_window_list = allocate_window ();
37548 dispatch_window_list1 = allocate_window ();
37549 init_window (0);
37550 init_window (1);
37551 }
37552
37553 /* This function returns true if a branch is detected. End of a basic block
37554 does not have to be a branch, but here we assume only branches end a
37555 window. */
37556
37557 static bool
37558 is_end_basic_block (enum dispatch_group group)
37559 {
37560 return group == disp_branch;
37561 }
37562
37563 /* This function is called when the end of a window processing is reached. */
37564
37565 static void
37566 process_end_window (void)
37567 {
37568 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37569 if (dispatch_window_list->next)
37570 {
37571 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37572 gcc_assert (dispatch_window_list->window_size
37573 + dispatch_window_list1->window_size <= 48);
37574 init_window (1);
37575 }
37576 init_window (0);
37577 }
37578
37579 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37580 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37581 for 48 bytes of instructions. Note that these windows are not dispatch
37582 windows whose size is DISPATCH_WINDOW_SIZE. */
37583
37584 static dispatch_windows *
37585 allocate_next_window (int window_num)
37586 {
37587 if (window_num == 0)
37588 {
37589 if (dispatch_window_list->next)
37590 init_window (1);
37591 init_window (0);
37592 return dispatch_window_list;
37593 }
37594
37595 dispatch_window_list->next = dispatch_window_list1;
37596 dispatch_window_list1->prev = dispatch_window_list;
37597
37598 return dispatch_window_list1;
37599 }
37600
37601 /* Increment the immediate operand counts for a constant found while walking an insn's RTL. */
37602
37603 static int
37604 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37605 {
37606 if (*in_rtx == 0)
37607 return 0;
37608
37609 switch (GET_CODE (*in_rtx))
37610 {
37611 case CONST:
37612 case SYMBOL_REF:
37613 case CONST_INT:
37614 (imm_values->imm)++;
37615 if (x86_64_immediate_operand (*in_rtx, SImode))
37616 (imm_values->imm32)++;
37617 else
37618 (imm_values->imm64)++;
37619 break;
37620
37621 case CONST_DOUBLE:
37622 (imm_values->imm)++;
37623 (imm_values->imm64)++;
37624 break;
37625
37626 case CODE_LABEL:
37627 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37628 {
37629 (imm_values->imm)++;
37630 (imm_values->imm32)++;
37631 }
37632 break;
37633
37634 default:
37635 break;
37636 }
37637
37638 return 0;
37639 }
37640
37641 /* Compute number of immediate operands of an instruction. */
37642
37643 static void
37644 find_constant (rtx in_rtx, imm_info *imm_values)
37645 {
37646 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37647 (rtx_function) find_constant_1, (void *) imm_values);
37648 }
37649
37650 /* Return total size of immediate operands of an instruction along with number
37651 of corresponding immediate operands. It initializes its parameters to zero
37652 before calling FIND_CONSTANT.
37653 INSN is the input instruction. IMM is the total of immediates.
37654 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37655 bit immediates. */
37656
37657 static int
37658 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37659 {
37660 imm_info imm_values = {0, 0, 0};
37661
37662 find_constant (insn, &imm_values);
37663 *imm = imm_values.imm;
37664 *imm32 = imm_values.imm32;
37665 *imm64 = imm_values.imm64;
37666 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37667 }
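
/* Editorial illustration (not part of the original source): the byte
   accounting performed above; each 32-bit immediate contributes 4 bytes
   and each 64-bit immediate 8 bytes towards MAX_IMM_SIZE.  The counts
   below are hypothetical.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* Two 32-bit immediates and one 64-bit immediate in one insn.  */
  int imm32 = 2, imm64 = 1;
  int imm_size = imm32 * 4 + imm64 * 8;

  printf ("immediate bytes = %d\n", imm_size);   /* 16 */
  return 0;
}
#endif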
37668
37669 /* This function indicates if an operand of an instruction is an
37670 immediate. */
37671
37672 static bool
37673 has_immediate (rtx insn)
37674 {
37675 int num_imm_operand;
37676 int num_imm32_operand;
37677 int num_imm64_operand;
37678
37679 if (insn)
37680 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37681 &num_imm64_operand);
37682 return false;
37683 }
37684
37685 /* Return single or double path for instructions. */
37686
37687 static enum insn_path
37688 get_insn_path (rtx insn)
37689 {
37690 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37691
37692 if ((int)path == 0)
37693 return path_single;
37694
37695 if ((int)path == 1)
37696 return path_double;
37697
37698 return path_multi;
37699 }
37700
37701 /* Return insn dispatch group. */
37702
37703 static enum dispatch_group
37704 get_insn_group (rtx insn)
37705 {
37706 enum dispatch_group group = get_mem_group (insn);
37707 if (group)
37708 return group;
37709
37710 if (is_branch (insn))
37711 return disp_branch;
37712
37713 if (is_cmp (insn))
37714 return disp_cmp;
37715
37716 if (has_immediate (insn))
37717 return disp_imm;
37718
37719 if (is_prefetch (insn))
37720 return disp_prefetch;
37721
37722 return disp_no_group;
37723 }
37724
37725 /* Count number of GROUP restricted instructions in a dispatch
37726 window WINDOW_LIST. */
37727
37728 static int
37729 count_num_restricted (rtx insn, dispatch_windows *window_list)
37730 {
37731 enum dispatch_group group = get_insn_group (insn);
37732 int imm_size;
37733 int num_imm_operand;
37734 int num_imm32_operand;
37735 int num_imm64_operand;
37736
37737 if (group == disp_no_group)
37738 return 0;
37739
37740 if (group == disp_imm)
37741 {
37742 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37743 &num_imm64_operand);
37744 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
37745 || num_imm_operand + window_list->num_imm > MAX_IMM
37746 || (num_imm32_operand > 0
37747 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
37748 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
37749 || (num_imm64_operand > 0
37750 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
37751 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
37752 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
37753 && num_imm64_operand > 0
37754 && ((window_list->num_imm_64 > 0
37755 && window_list->num_insn >= 2)
37756 || window_list->num_insn >= 3)))
37757 return BIG;
37758
37759 return 1;
37760 }
37761
37762 if ((group == disp_load_store
37763 && (window_list->num_loads >= MAX_LOAD
37764 || window_list->num_stores >= MAX_STORE))
37765 || ((group == disp_load
37766 || group == disp_prefetch)
37767 && window_list->num_loads >= MAX_LOAD)
37768 || (group == disp_store
37769 && window_list->num_stores >= MAX_STORE))
37770 return BIG;
37771
37772 return 1;
37773 }
37774
37775 /* This function returns true if insn satisfies dispatch rules on the
37776 last window scheduled. */
37777
37778 static bool
37779 fits_dispatch_window (rtx insn)
37780 {
37781 dispatch_windows *window_list = dispatch_window_list;
37782 dispatch_windows *window_list_next = dispatch_window_list->next;
37783 unsigned int num_restrict;
37784 enum dispatch_group group = get_insn_group (insn);
37785 enum insn_path path = get_insn_path (insn);
37786 int sum;
37787
37788 /* Make disp_cmp and disp_jcc get scheduled as late as possible. These
37789 instructions should be given the lowest priority in the
37790 scheduling process in the Haifa scheduler to make sure they will be
37791 scheduled in the same dispatch window as the reference to them. */
37792 if (group == disp_jcc || group == disp_cmp)
37793 return false;
37794
37795 /* Check nonrestricted. */
37796 if (group == disp_no_group || group == disp_branch)
37797 return true;
37798
37799 /* Get last dispatch window. */
37800 if (window_list_next)
37801 window_list = window_list_next;
37802
37803 if (window_list->window_num == 1)
37804 {
37805 sum = window_list->prev->window_size + window_list->window_size;
37806
37807 if (sum == 32
37808 || (min_insn_size (insn) + sum) >= 48)
37809 /* Window 1 is full. Go for next window. */
37810 return true;
37811 }
37812
37813 num_restrict = count_num_restricted (insn, window_list);
37814
37815 if (num_restrict > num_allowable_groups[group])
37816 return false;
37817
37818 /* See if it fits in the first window. */
37819 if (window_list->window_num == 0)
37820 {
37821 /* The first window should have only single and double path
37822 uops. */
37823 if (path == path_double
37824 && (window_list->num_uops + 2) > MAX_INSN)
37825 return false;
37826 else if (path != path_single)
37827 return false;
37828 }
37829 return true;
37830 }
37831
37832 /* Add an instruction INSN with NUM_UOPS micro-operations to the
37833 dispatch window WINDOW_LIST. */
37834
37835 static void
37836 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
37837 {
37838 int byte_len = min_insn_size (insn);
37839 int num_insn = window_list->num_insn;
37840 int imm_size;
37841 sched_insn_info *window = window_list->window;
37842 enum dispatch_group group = get_insn_group (insn);
37843 enum insn_path path = get_insn_path (insn);
37844 int num_imm_operand;
37845 int num_imm32_operand;
37846 int num_imm64_operand;
37847
37848 if (!window_list->violation && group != disp_cmp
37849 && !fits_dispatch_window (insn))
37850 window_list->violation = true;
37851
37852 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37853 &num_imm64_operand);
37854
37855 /* Initialize window with new instruction. */
37856 window[num_insn].insn = insn;
37857 window[num_insn].byte_len = byte_len;
37858 window[num_insn].group = group;
37859 window[num_insn].path = path;
37860 window[num_insn].imm_bytes = imm_size;
37861
37862 window_list->window_size += byte_len;
37863 window_list->num_insn = num_insn + 1;
37864 window_list->num_uops = window_list->num_uops + num_uops;
37865 window_list->imm_size += imm_size;
37866 window_list->num_imm += num_imm_operand;
37867 window_list->num_imm_32 += num_imm32_operand;
37868 window_list->num_imm_64 += num_imm64_operand;
37869
37870 if (group == disp_store)
37871 window_list->num_stores += 1;
37872 else if (group == disp_load
37873 || group == disp_prefetch)
37874 window_list->num_loads += 1;
37875 else if (group == disp_load_store)
37876 {
37877 window_list->num_stores += 1;
37878 window_list->num_loads += 1;
37879 }
37880 }
37881
37882 /* Adds a scheduled instruction, INSN, to the current dispatch window.
37883 If the total bytes of instructions or the number of instructions in
37884 the window exceed the allowed maximum, it allocates a new window. */
37885
37886 static void
37887 add_to_dispatch_window (rtx insn)
37888 {
37889 int byte_len;
37890 dispatch_windows *window_list;
37891 dispatch_windows *next_list;
37892 dispatch_windows *window0_list;
37893 enum insn_path path;
37894 enum dispatch_group insn_group;
37895 bool insn_fits;
37896 int num_insn;
37897 int num_uops;
37898 int window_num;
37899 int insn_num_uops;
37900 int sum;
37901
37902 if (INSN_CODE (insn) < 0)
37903 return;
37904
37905 byte_len = min_insn_size (insn);
37906 window_list = dispatch_window_list;
37907 next_list = window_list->next;
37908 path = get_insn_path (insn);
37909 insn_group = get_insn_group (insn);
37910
37911 /* Get the last dispatch window. */
37912 if (next_list)
37913 window_list = dispatch_window_list->next;
37914
37915 if (path == path_single)
37916 insn_num_uops = 1;
37917 else if (path == path_double)
37918 insn_num_uops = 2;
37919 else
37920 insn_num_uops = (int) path;
37921
37922 /* If the current window is full, get a new window.
37923 Window number zero is full if MAX_INSN uops are scheduled in it.
37924 Window number one is full if window zero's bytes plus window
37925 one's bytes reach 32, or if adding the bytes of the new instruction
37926 brings the total to 48 or more, or if it already has MAX_INSN
37927 instructions in it. */
37928 num_insn = window_list->num_insn;
37929 num_uops = window_list->num_uops;
37930 window_num = window_list->window_num;
37931 insn_fits = fits_dispatch_window (insn);
37932
37933 if (num_insn >= MAX_INSN
37934 || num_uops + insn_num_uops > MAX_INSN
37935 || !(insn_fits))
37936 {
37937 window_num = ~window_num & 1;
37938 window_list = allocate_next_window (window_num);
37939 }
37940
37941 if (window_num == 0)
37942 {
37943 add_insn_window (insn, window_list, insn_num_uops);
37944 if (window_list->num_insn >= MAX_INSN
37945 && insn_group == disp_branch)
37946 {
37947 process_end_window ();
37948 return;
37949 }
37950 }
37951 else if (window_num == 1)
37952 {
37953 window0_list = window_list->prev;
37954 sum = window0_list->window_size + window_list->window_size;
37955 if (sum == 32
37956 || (byte_len + sum) >= 48)
37957 {
37958 process_end_window ();
37959 window_list = dispatch_window_list;
37960 }
37961
37962 add_insn_window (insn, window_list, insn_num_uops);
37963 }
37964 else
37965 gcc_unreachable ();
37966
37967 if (is_end_basic_block (insn_group))
37968 {
37969 /* The end of the basic block is reached; do end-basic-block processing. */
37970 process_end_window ();
37971 return;
37972 }
37973 }
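
/* Editorial illustration (not part of the original source): the rollover
   rule applied above when scheduling into window 1.  The window pair is
   treated as full when it already holds 32 bytes, or when the incoming
   instruction would bring it to 48 bytes or more.  The byte counts below
   are hypothetical.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int window0_bytes = 16, window1_bytes = 20, next_insn_len = 13;
  int sum = window0_bytes + window1_bytes;

  if (sum == 32 || sum + next_insn_len >= 48)
    printf ("window pair is full; end-of-window processing would run\n");
  else
    printf ("the instruction still fits in window 1\n");
  return 0;
}
#endif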
37974
37975 /* Print the dispatch window, WINDOW_NUM, to FILE. */
37976
37977 DEBUG_FUNCTION static void
37978 debug_dispatch_window_file (FILE *file, int window_num)
37979 {
37980 dispatch_windows *list;
37981 int i;
37982
37983 if (window_num == 0)
37984 list = dispatch_window_list;
37985 else
37986 list = dispatch_window_list1;
37987
37988 fprintf (file, "Window #%d:\n", list->window_num);
37989 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
37990 list->num_insn, list->num_uops, list->window_size);
37991 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
37992 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
37993
37994 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
37995 list->num_stores);
37996 fprintf (file, " insn info:\n");
37997
37998 for (i = 0; i < MAX_INSN; i++)
37999 {
38000 if (!list->window[i].insn)
38001 break;
38002 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38003 i, group_name[list->window[i].group],
38004 i, (void *)list->window[i].insn,
38005 i, list->window[i].path,
38006 i, list->window[i].byte_len,
38007 i, list->window[i].imm_bytes);
38008 }
38009 }
38010
38011 /* Print to stdout a dispatch window. */
38012
38013 DEBUG_FUNCTION void
38014 debug_dispatch_window (int window_num)
38015 {
38016 debug_dispatch_window_file (stdout, window_num);
38017 }
38018
38019 /* Print INSN dispatch information to FILE. */
38020
38021 DEBUG_FUNCTION static void
38022 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38023 {
38024 int byte_len;
38025 enum insn_path path;
38026 enum dispatch_group group;
38027 int imm_size;
38028 int num_imm_operand;
38029 int num_imm32_operand;
38030 int num_imm64_operand;
38031
38032 if (INSN_CODE (insn) < 0)
38033 return;
38034
38035 byte_len = min_insn_size (insn);
38036 path = get_insn_path (insn);
38037 group = get_insn_group (insn);
38038 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38039 &num_imm64_operand);
38040
38041 fprintf (file, " insn info:\n");
38042 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38043 group_name[group], path, byte_len);
38044 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38045 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38046 }
38047
38048 /* Print to stdout the status of the ready list with respect to
38049 dispatch windows. */
38050
38051 DEBUG_FUNCTION void
38052 debug_ready_dispatch (void)
38053 {
38054 int i;
38055 int no_ready = number_in_ready ();
38056
38057 fprintf (stdout, "Number of ready: %d\n", no_ready);
38058
38059 for (i = 0; i < no_ready; i++)
38060 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38061 }
38062
38063 /* This routine is the driver of the dispatch scheduler. */
38064
38065 static void
38066 do_dispatch (rtx insn, int mode)
38067 {
38068 if (mode == DISPATCH_INIT)
38069 init_dispatch_sched ();
38070 else if (mode == ADD_TO_DISPATCH_WINDOW)
38071 add_to_dispatch_window (insn);
38072 }
38073
38074 /* Return TRUE if Dispatch Scheduling is supported. */
38075
38076 static bool
38077 has_dispatch (rtx insn, int action)
38078 {
38079 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38080 && flag_dispatch_scheduler)
38081 switch (action)
38082 {
38083 default:
38084 return false;
38085
38086 case IS_DISPATCH_ON:
38087 return true;
38088 break;
38089
38090 case IS_CMP:
38091 return is_cmp (insn);
38092
38093 case DISPATCH_VIOLATION:
38094 return dispatch_violation ();
38095
38096 case FITS_DISPATCH_WINDOW:
38097 return fits_dispatch_window (insn);
38098 }
38099
38100 return false;
38101 }
38102
38103 /* Implementation of reassociation_width target hook used by
38104 reassoc phase to identify parallelism level in reassociated
38105 tree. The statement's tree_code is passed in OPC. The arguments'
38106 type is passed in MODE.
38107
38108 Currently parallel reassociation is enabled for Atom
38109 processors only and we set reassociation width to be 2
38110 because Atom may issue up to 2 instructions per cycle.
38111
38112 Return value should be fixed if parallel reassociation is
38113 enabled for other processors. */
38114
38115 static int
38116 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38117 enum machine_mode mode)
38118 {
38119 int res = 1;
38120
38121 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38122 res = 2;
38123 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38124 res = 2;
38125
38126 return res;
38127 }
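
/* Editorial illustration (not part of the original source): with a
   reassociation width of 2, a serial summation chain may be rewritten
   into two independent partial sums, shown here by hand.  The function
   below is hypothetical.  */
#if 0
#include <stdio.h>

static int
sum_width2 (const int *a, int n)
{
  int s0 = 0, s1 = 0, i;

  /* Two accumulators give two independent dependence chains, matching
     a reassociation width of 2.  */
  for (i = 0; i + 1 < n; i += 2)
    {
      s0 += a[i];
      s1 += a[i + 1];
    }
  if (i < n)
    s0 += a[i];
  return s0 + s1;
}

int
main (void)
{
  int a[] = { 1, 2, 3, 4, 5 };

  printf ("%d\n", sum_width2 (a, 5));   /* 15 */
  return 0;
}
#endif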
38128
38129 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38130 place emms and femms instructions. */
38131
38132 static enum machine_mode
38133 ix86_preferred_simd_mode (enum machine_mode mode)
38134 {
38135 if (!TARGET_SSE)
38136 return word_mode;
38137
38138 switch (mode)
38139 {
38140 case QImode:
38141 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38142 case HImode:
38143 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38144 case SImode:
38145 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38146 case DImode:
38147 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38148
38149 case SFmode:
38150 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38151 return V8SFmode;
38152 else
38153 return V4SFmode;
38154
38155 case DFmode:
38156 if (!TARGET_VECTORIZE_DOUBLE)
38157 return word_mode;
38158 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38159 return V4DFmode;
38160 else if (TARGET_SSE2)
38161 return V2DFmode;
38162 /* FALLTHRU */
38163
38164 default:
38165 return word_mode;
38166 }
38167 }
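
/* Editorial illustration (not part of the original source): GCC
   vector-extension types corresponding to the 128-bit and 256-bit float
   vector modes (V4SFmode and V8SFmode) chosen above; the type names are
   hypothetical.  */
#if 0
#include <stdio.h>

typedef float v4sf __attribute__ ((vector_size (16)));
typedef float v8sf __attribute__ ((vector_size (32)));

int
main (void)
{
  v4sf a = { 1, 2, 3, 4 };
  v8sf b = { 1, 2, 3, 4, 5, 6, 7, 8 };

  printf ("%zu %zu\n", sizeof (a), sizeof (b));   /* 16 32 */
  return 0;
}
#endif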
38168
38169 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38170 vectors. */
38171
38172 static unsigned int
38173 ix86_autovectorize_vector_sizes (void)
38174 {
38175 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38176 }
38177
38178 /* Initialize the GCC target structure. */
38179 #undef TARGET_RETURN_IN_MEMORY
38180 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38181
38182 #undef TARGET_LEGITIMIZE_ADDRESS
38183 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38184
38185 #undef TARGET_ATTRIBUTE_TABLE
38186 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38187 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38188 # undef TARGET_MERGE_DECL_ATTRIBUTES
38189 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38190 #endif
38191
38192 #undef TARGET_COMP_TYPE_ATTRIBUTES
38193 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38194
38195 #undef TARGET_INIT_BUILTINS
38196 #define TARGET_INIT_BUILTINS ix86_init_builtins
38197 #undef TARGET_BUILTIN_DECL
38198 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38199 #undef TARGET_EXPAND_BUILTIN
38200 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38201
38202 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38203 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38204 ix86_builtin_vectorized_function
38205
38206 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38207 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38208
38209 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38210 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38211
38212 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38213 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38214
38215 #undef TARGET_BUILTIN_RECIPROCAL
38216 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38217
38218 #undef TARGET_ASM_FUNCTION_EPILOGUE
38219 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38220
38221 #undef TARGET_ENCODE_SECTION_INFO
38222 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38223 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38224 #else
38225 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38226 #endif
38227
38228 #undef TARGET_ASM_OPEN_PAREN
38229 #define TARGET_ASM_OPEN_PAREN ""
38230 #undef TARGET_ASM_CLOSE_PAREN
38231 #define TARGET_ASM_CLOSE_PAREN ""
38232
38233 #undef TARGET_ASM_BYTE_OP
38234 #define TARGET_ASM_BYTE_OP ASM_BYTE
38235
38236 #undef TARGET_ASM_ALIGNED_HI_OP
38237 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38238 #undef TARGET_ASM_ALIGNED_SI_OP
38239 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38240 #ifdef ASM_QUAD
38241 #undef TARGET_ASM_ALIGNED_DI_OP
38242 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38243 #endif
38244
38245 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38246 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38247
38248 #undef TARGET_ASM_UNALIGNED_HI_OP
38249 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38250 #undef TARGET_ASM_UNALIGNED_SI_OP
38251 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38252 #undef TARGET_ASM_UNALIGNED_DI_OP
38253 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38254
38255 #undef TARGET_PRINT_OPERAND
38256 #define TARGET_PRINT_OPERAND ix86_print_operand
38257 #undef TARGET_PRINT_OPERAND_ADDRESS
38258 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38259 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38260 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38261 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38262 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38263
38264 #undef TARGET_SCHED_INIT_GLOBAL
38265 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38266 #undef TARGET_SCHED_ADJUST_COST
38267 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38268 #undef TARGET_SCHED_ISSUE_RATE
38269 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38270 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38271 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38272 ia32_multipass_dfa_lookahead
38273
38274 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38275 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38276
38277 #ifdef HAVE_AS_TLS
38278 #undef TARGET_HAVE_TLS
38279 #define TARGET_HAVE_TLS true
38280 #endif
38281 #undef TARGET_CANNOT_FORCE_CONST_MEM
38282 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38283 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38284 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38285
38286 #undef TARGET_DELEGITIMIZE_ADDRESS
38287 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38288
38289 #undef TARGET_MS_BITFIELD_LAYOUT_P
38290 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38291
38292 #if TARGET_MACHO
38293 #undef TARGET_BINDS_LOCAL_P
38294 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38295 #endif
38296 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38297 #undef TARGET_BINDS_LOCAL_P
38298 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38299 #endif
38300
38301 #undef TARGET_ASM_OUTPUT_MI_THUNK
38302 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38303 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38304 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38305
38306 #undef TARGET_ASM_FILE_START
38307 #define TARGET_ASM_FILE_START x86_file_start
38308
38309 #undef TARGET_OPTION_OVERRIDE
38310 #define TARGET_OPTION_OVERRIDE ix86_option_override
38311
38312 #undef TARGET_REGISTER_MOVE_COST
38313 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38314 #undef TARGET_MEMORY_MOVE_COST
38315 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38316 #undef TARGET_RTX_COSTS
38317 #define TARGET_RTX_COSTS ix86_rtx_costs
38318 #undef TARGET_ADDRESS_COST
38319 #define TARGET_ADDRESS_COST ix86_address_cost
38320
38321 #undef TARGET_FIXED_CONDITION_CODE_REGS
38322 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38323 #undef TARGET_CC_MODES_COMPATIBLE
38324 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38325
38326 #undef TARGET_MACHINE_DEPENDENT_REORG
38327 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38328
38329 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38330 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38331
38332 #undef TARGET_BUILD_BUILTIN_VA_LIST
38333 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38334
38335 #undef TARGET_ENUM_VA_LIST_P
38336 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38337
38338 #undef TARGET_FN_ABI_VA_LIST
38339 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38340
38341 #undef TARGET_CANONICAL_VA_LIST_TYPE
38342 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38343
38344 #undef TARGET_EXPAND_BUILTIN_VA_START
38345 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38346
38347 #undef TARGET_MD_ASM_CLOBBERS
38348 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38349
38350 #undef TARGET_PROMOTE_PROTOTYPES
38351 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38352 #undef TARGET_STRUCT_VALUE_RTX
38353 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38354 #undef TARGET_SETUP_INCOMING_VARARGS
38355 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38356 #undef TARGET_MUST_PASS_IN_STACK
38357 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38358 #undef TARGET_FUNCTION_ARG_ADVANCE
38359 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38360 #undef TARGET_FUNCTION_ARG
38361 #define TARGET_FUNCTION_ARG ix86_function_arg
38362 #undef TARGET_FUNCTION_ARG_BOUNDARY
38363 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38364 #undef TARGET_PASS_BY_REFERENCE
38365 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38366 #undef TARGET_INTERNAL_ARG_POINTER
38367 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38368 #undef TARGET_UPDATE_STACK_BOUNDARY
38369 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38370 #undef TARGET_GET_DRAP_RTX
38371 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38372 #undef TARGET_STRICT_ARGUMENT_NAMING
38373 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38374 #undef TARGET_STATIC_CHAIN
38375 #define TARGET_STATIC_CHAIN ix86_static_chain
38376 #undef TARGET_TRAMPOLINE_INIT
38377 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38378 #undef TARGET_RETURN_POPS_ARGS
38379 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38380
38381 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38382 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38383
38384 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38385 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38386
38387 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38388 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38389
38390 #undef TARGET_C_MODE_FOR_SUFFIX
38391 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38392
38393 #ifdef HAVE_AS_TLS
38394 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38395 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38396 #endif
38397
38398 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38399 #undef TARGET_INSERT_ATTRIBUTES
38400 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38401 #endif
38402
38403 #undef TARGET_MANGLE_TYPE
38404 #define TARGET_MANGLE_TYPE ix86_mangle_type
38405
38406 #ifndef TARGET_MACHO
38407 #undef TARGET_STACK_PROTECT_FAIL
38408 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38409 #endif
38410
38411 #undef TARGET_FUNCTION_VALUE
38412 #define TARGET_FUNCTION_VALUE ix86_function_value
38413
38414 #undef TARGET_FUNCTION_VALUE_REGNO_P
38415 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38416
38417 #undef TARGET_PROMOTE_FUNCTION_MODE
38418 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38419
38420 #undef TARGET_SECONDARY_RELOAD
38421 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38422
38423 #undef TARGET_CLASS_MAX_NREGS
38424 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38425
38426 #undef TARGET_PREFERRED_RELOAD_CLASS
38427 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38428 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38429 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38430 #undef TARGET_CLASS_LIKELY_SPILLED_P
38431 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38432
38433 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38434 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38435 ix86_builtin_vectorization_cost
38436 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38437 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38438 ix86_vectorize_vec_perm_const_ok
38439 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38440 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38441 ix86_preferred_simd_mode
38442 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38443 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38444 ix86_autovectorize_vector_sizes
38445
38446 #undef TARGET_SET_CURRENT_FUNCTION
38447 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38448
38449 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38450 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38451
38452 #undef TARGET_OPTION_SAVE
38453 #define TARGET_OPTION_SAVE ix86_function_specific_save
38454
38455 #undef TARGET_OPTION_RESTORE
38456 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38457
38458 #undef TARGET_OPTION_PRINT
38459 #define TARGET_OPTION_PRINT ix86_function_specific_print
38460
38461 #undef TARGET_CAN_INLINE_P
38462 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38463
38464 #undef TARGET_EXPAND_TO_RTL_HOOK
38465 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38466
38467 #undef TARGET_LEGITIMATE_ADDRESS_P
38468 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38469
38470 #undef TARGET_LEGITIMATE_CONSTANT_P
38471 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38472
38473 #undef TARGET_FRAME_POINTER_REQUIRED
38474 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38475
38476 #undef TARGET_CAN_ELIMINATE
38477 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38478
38479 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38480 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38481
38482 #undef TARGET_ASM_CODE_END
38483 #define TARGET_ASM_CODE_END ix86_code_end
38484
38485 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38486 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38487
38488 #if TARGET_MACHO
38489 #undef TARGET_INIT_LIBFUNCS
38490 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38491 #endif
38492
38493 struct gcc_target targetm = TARGET_INITIALIZER;
38494 \f
38495 #include "gt-i386.h"