gcc/config/i386/i386-expand.c
1 /* Copyright (C) 1988-2019 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "params.h"
62 #include "cselib.h"
63 #include "sched-int.h"
64 #include "opts.h"
65 #include "tree-pass.h"
66 #include "context.h"
67 #include "pass_manager.h"
68 #include "target-globals.h"
69 #include "gimple-iterator.h"
70 #include "tree-vectorizer.h"
71 #include "shrink-wrap.h"
72 #include "builtins.h"
73 #include "rtl-iter.h"
74 #include "tree-iterator.h"
75 #include "dbgcnt.h"
76 #include "case-cfn-macros.h"
77 #include "dojump.h"
78 #include "fold-const-call.h"
79 #include "tree-vrp.h"
80 #include "tree-ssanames.h"
81 #include "selftest.h"
82 #include "selftest-rtl.h"
83 #include "print-rtl.h"
84 #include "intl.h"
85 #include "ifcvt.h"
86 #include "symbol-summary.h"
87 #include "ipa-prop.h"
88 #include "ipa-fnsummary.h"
89 #include "wide-int-bitmask.h"
90 #include "tree-vector-builder.h"
91 #include "debug.h"
92 #include "dwarf2out.h"
93 #include "i386-options.h"
94 #include "i386-builtins.h"
95 #include "i386-expand.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
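/* For example, a DImode operand is split into two SImode halves at byte
   offsets 0 and 4, and a TImode operand into two DImode halves at byte
   offsets 0 and 8 (low half first, since x86 is little-endian).  */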
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109
110 switch (mode)
111 {
112 case E_TImode:
113 half_mode = DImode;
114 break;
115 case E_DImode:
116 half_mode = SImode;
117 break;
118 default:
119 gcc_unreachable ();
120 }
121
122 byte = GET_MODE_SIZE (half_mode);
123
124 while (num--)
125 {
126 rtx op = operands[num];
127
 128 /* simplify_subreg refuses to split volatile memory addresses,
 129 but we still have to handle them. */
130 if (MEM_P (op))
131 {
132 lo_half[num] = adjust_address (op, half_mode, 0);
133 hi_half[num] = adjust_address (op, half_mode, byte);
134 }
135 else
136 {
137 lo_half[num] = simplify_gen_subreg (half_mode, op,
138 GET_MODE (op) == VOIDmode
139 ? mode : GET_MODE (op), 0);
140 hi_half[num] = simplify_gen_subreg (half_mode, op,
141 GET_MODE (op) == VOIDmode
142 ? mode : GET_MODE (op), byte);
143 }
144 }
145 }
146
147 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
148 for the target. */
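/* For example, clearing %eax normally emits "xorl %eax, %eax" together
   with a FLAGS_REG clobber; "movl $0, %eax" is generated instead only
   when TARGET_USE_MOV0 is set and we are not optimizing for size.  */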
149
150 void
151 ix86_expand_clear (rtx dest)
152 {
153 rtx tmp;
154
155 /* We play register width games, which are only valid after reload. */
156 gcc_assert (reload_completed);
157
158 /* Avoid HImode and its attendant prefix byte. */
159 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
160 dest = gen_rtx_REG (SImode, REGNO (dest));
161 tmp = gen_rtx_SET (dest, const0_rtx);
162
163 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
164 {
165 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
166 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
167 }
168
169 emit_insn (tmp);
170 }
171
172 void
173 ix86_expand_move (machine_mode mode, rtx operands[])
174 {
175 rtx op0, op1;
176 rtx tmp, addend = NULL_RTX;
177 enum tls_model model;
178
179 op0 = operands[0];
180 op1 = operands[1];
181
182 switch (GET_CODE (op1))
183 {
184 case CONST:
185 tmp = XEXP (op1, 0);
186
187 if (GET_CODE (tmp) != PLUS
188 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
189 break;
190
191 op1 = XEXP (tmp, 0);
192 addend = XEXP (tmp, 1);
193 /* FALLTHRU */
194
195 case SYMBOL_REF:
196 model = SYMBOL_REF_TLS_MODEL (op1);
197
198 if (model)
199 op1 = legitimize_tls_address (op1, model, true);
200 else if (ix86_force_load_from_GOT_p (op1))
201 {
202 /* Load the external function address via GOT slot to avoid PLT. */
203 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
204 (TARGET_64BIT
205 ? UNSPEC_GOTPCREL
206 : UNSPEC_GOT));
207 op1 = gen_rtx_CONST (Pmode, op1);
208 op1 = gen_const_mem (Pmode, op1);
209 set_mem_alias_set (op1, ix86_GOT_alias_set ());
210 }
211 else
212 {
213 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
214 if (tmp)
215 {
216 op1 = tmp;
217 if (!addend)
218 break;
219 }
220 else
221 {
222 op1 = operands[1];
223 break;
224 }
225 }
226
227 if (addend)
228 {
229 op1 = force_operand (op1, NULL_RTX);
230 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
231 op0, 1, OPTAB_DIRECT);
232 }
233 else
234 op1 = force_operand (op1, op0);
235
236 if (op1 == op0)
237 return;
238
239 op1 = convert_to_mode (mode, op1, 1);
240
241 default:
242 break;
243 }
244
245 if ((flag_pic || MACHOPIC_INDIRECT)
246 && symbolic_operand (op1, mode))
247 {
248 if (TARGET_MACHO && !TARGET_64BIT)
249 {
250 #if TARGET_MACHO
251 /* dynamic-no-pic */
252 if (MACHOPIC_INDIRECT)
253 {
254 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
255 ? op0 : gen_reg_rtx (Pmode);
256 op1 = machopic_indirect_data_reference (op1, temp);
257 if (MACHOPIC_PURE)
258 op1 = machopic_legitimize_pic_address (op1, mode,
259 temp == op1 ? 0 : temp);
260 }
261 if (op0 != op1 && GET_CODE (op0) != MEM)
262 {
263 rtx insn = gen_rtx_SET (op0, op1);
264 emit_insn (insn);
265 return;
266 }
267 if (GET_CODE (op0) == MEM)
268 op1 = force_reg (Pmode, op1);
269 else
270 {
271 rtx temp = op0;
272 if (GET_CODE (temp) != REG)
273 temp = gen_reg_rtx (Pmode);
274 temp = legitimize_pic_address (op1, temp);
275 if (temp == op0)
276 return;
277 op1 = temp;
278 }
279 /* dynamic-no-pic */
280 #endif
281 }
282 else
283 {
284 if (MEM_P (op0))
285 op1 = force_reg (mode, op1);
286 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
287 {
288 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
289 op1 = legitimize_pic_address (op1, reg);
290 if (op0 == op1)
291 return;
292 op1 = convert_to_mode (mode, op1, 1);
293 }
294 }
295 }
296 else
297 {
298 if (MEM_P (op0)
299 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
300 || !push_operand (op0, mode))
301 && MEM_P (op1))
302 op1 = force_reg (mode, op1);
303
304 if (push_operand (op0, mode)
305 && ! general_no_elim_operand (op1, mode))
306 op1 = copy_to_mode_reg (mode, op1);
307
 308 /* Force large constants in 64bit compilation into a register
 309 to get them CSEd. */
310 if (can_create_pseudo_p ()
311 && (mode == DImode) && TARGET_64BIT
312 && immediate_operand (op1, mode)
313 && !x86_64_zext_immediate_operand (op1, VOIDmode)
314 && !register_operand (op0, mode)
315 && optimize)
316 op1 = copy_to_mode_reg (mode, op1);
317
318 if (can_create_pseudo_p ()
319 && CONST_DOUBLE_P (op1))
320 {
321 /* If we are loading a floating point constant to a register,
322 force the value to memory now, since we'll get better code
323 out the back end. */
324
325 op1 = validize_mem (force_const_mem (mode, op1));
326 if (!register_operand (op0, mode))
327 {
328 rtx temp = gen_reg_rtx (mode);
329 emit_insn (gen_rtx_SET (temp, op1));
330 emit_move_insn (op0, temp);
331 return;
332 }
333 }
334 }
335
336 emit_insn (gen_rtx_SET (op0, op1));
337 }
338
339 void
340 ix86_expand_vector_move (machine_mode mode, rtx operands[])
341 {
342 rtx op0 = operands[0], op1 = operands[1];
 343 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
 344 psABI, since the biggest alignment under the IA MCU psABI is 4 bytes. */
345 unsigned int align = (TARGET_IAMCU
346 ? GET_MODE_BITSIZE (mode)
347 : GET_MODE_ALIGNMENT (mode));
348
349 if (push_operand (op0, VOIDmode))
350 op0 = emit_move_resolve_push (mode, op0);
351
 352 /* Force constants other than zero into memory. We do not know how
 353 the instructions used to build constants modify the upper 64 bits
 354 of the register; once we have that information we may be able
 355 to handle some of them more efficiently. */
356 if (can_create_pseudo_p ()
357 && (CONSTANT_P (op1)
358 || (SUBREG_P (op1)
359 && CONSTANT_P (SUBREG_REG (op1))))
360 && ((register_operand (op0, mode)
361 && !standard_sse_constant_p (op1, mode))
362 /* ix86_expand_vector_move_misalign() does not like constants. */
363 || (SSE_REG_MODE_P (mode)
364 && MEM_P (op0)
365 && MEM_ALIGN (op0) < align)))
366 {
367 if (SUBREG_P (op1))
368 {
369 machine_mode imode = GET_MODE (SUBREG_REG (op1));
370 rtx r = force_const_mem (imode, SUBREG_REG (op1));
371 if (r)
372 r = validize_mem (r);
373 else
374 r = force_reg (imode, SUBREG_REG (op1));
375 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
376 }
377 else
378 op1 = validize_mem (force_const_mem (mode, op1));
379 }
380
 381 /* We need to check memory alignment for SSE modes, since an attribute
 382 can make operands unaligned. */
383 if (can_create_pseudo_p ()
384 && SSE_REG_MODE_P (mode)
385 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
386 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
387 {
388 rtx tmp[2];
389
390 /* ix86_expand_vector_move_misalign() does not like both
391 arguments in memory. */
392 if (!register_operand (op0, mode)
393 && !register_operand (op1, mode))
394 op1 = force_reg (mode, op1);
395
396 tmp[0] = op0; tmp[1] = op1;
397 ix86_expand_vector_move_misalign (mode, tmp);
398 return;
399 }
400
401 /* Make operand1 a register if it isn't already. */
402 if (can_create_pseudo_p ()
403 && !register_operand (op0, mode)
404 && !register_operand (op1, mode))
405 {
406 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
407 return;
408 }
409
410 emit_insn (gen_rtx_SET (op0, op1));
411 }
412
413 /* Split 32-byte AVX unaligned load and store if needed. */
414
415 static void
416 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
417 {
418 rtx m;
419 rtx (*extract) (rtx, rtx, rtx);
420 machine_mode mode;
421
422 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
423 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
424 {
425 emit_insn (gen_rtx_SET (op0, op1));
426 return;
427 }
428
429 rtx orig_op0 = NULL_RTX;
430 mode = GET_MODE (op0);
431 switch (GET_MODE_CLASS (mode))
432 {
433 case MODE_VECTOR_INT:
434 case MODE_INT:
435 if (mode != V32QImode)
436 {
437 if (!MEM_P (op0))
438 {
439 orig_op0 = op0;
440 op0 = gen_reg_rtx (V32QImode);
441 }
442 else
443 op0 = gen_lowpart (V32QImode, op0);
444 op1 = gen_lowpart (V32QImode, op1);
445 mode = V32QImode;
446 }
447 break;
448 case MODE_VECTOR_FLOAT:
449 break;
450 default:
451 gcc_unreachable ();
452 }
453
454 switch (mode)
455 {
456 default:
457 gcc_unreachable ();
458 case E_V32QImode:
459 extract = gen_avx_vextractf128v32qi;
460 mode = V16QImode;
461 break;
462 case E_V8SFmode:
463 extract = gen_avx_vextractf128v8sf;
464 mode = V4SFmode;
465 break;
466 case E_V4DFmode:
467 extract = gen_avx_vextractf128v4df;
468 mode = V2DFmode;
469 break;
470 }
471
472 if (MEM_P (op1))
473 {
474 rtx r = gen_reg_rtx (mode);
475 m = adjust_address (op1, mode, 0);
476 emit_move_insn (r, m);
477 m = adjust_address (op1, mode, 16);
478 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
479 emit_move_insn (op0, r);
480 }
481 else if (MEM_P (op0))
482 {
483 m = adjust_address (op0, mode, 0);
484 emit_insn (extract (m, op1, const0_rtx));
485 m = adjust_address (op0, mode, 16);
486 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
487 }
488 else
489 gcc_unreachable ();
490
491 if (orig_op0)
492 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
493 }
494
495 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
496 straight to ix86_expand_vector_move. */
497 /* Code generation for scalar reg-reg moves of single and double precision data:
498 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
499 movaps reg, reg
500 else
501 movss reg, reg
502 if (x86_sse_partial_reg_dependency == true)
503 movapd reg, reg
504 else
505 movsd reg, reg
506
507 Code generation for scalar loads of double precision data:
508 if (x86_sse_split_regs == true)
509 movlpd mem, reg (gas syntax)
510 else
511 movsd mem, reg
512
513 Code generation for unaligned packed loads of single precision data
514 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
515 if (x86_sse_unaligned_move_optimal)
516 movups mem, reg
517
518 if (x86_sse_partial_reg_dependency == true)
519 {
520 xorps reg, reg
521 movlps mem, reg
522 movhps mem+8, reg
523 }
524 else
525 {
526 movlps mem, reg
527 movhps mem+8, reg
528 }
529
530 Code generation for unaligned packed loads of double precision data
531 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
532 if (x86_sse_unaligned_move_optimal)
533 movupd mem, reg
534
535 if (x86_sse_split_regs == true)
536 {
537 movlpd mem, reg
538 movhpd mem+8, reg
539 }
540 else
541 {
542 movsd mem, reg
543 movhpd mem+8, reg
544 }
545 */
546
547 void
548 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
549 {
550 rtx op0, op1, m;
551
552 op0 = operands[0];
553 op1 = operands[1];
554
555 /* Use unaligned load/store for AVX512 or when optimizing for size. */
556 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
557 {
558 emit_insn (gen_rtx_SET (op0, op1));
559 return;
560 }
561
562 if (TARGET_AVX)
563 {
564 if (GET_MODE_SIZE (mode) == 32)
565 ix86_avx256_split_vector_move_misalign (op0, op1);
566 else
567 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
568 emit_insn (gen_rtx_SET (op0, op1));
569 return;
570 }
571
572 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
573 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
574 {
575 emit_insn (gen_rtx_SET (op0, op1));
576 return;
577 }
578
579 /* ??? If we have typed data, then it would appear that using
580 movdqu is the only way to get unaligned data loaded with
581 integer type. */
582 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
583 {
584 emit_insn (gen_rtx_SET (op0, op1));
585 return;
586 }
587
588 if (MEM_P (op1))
589 {
590 if (TARGET_SSE2 && mode == V2DFmode)
591 {
592 rtx zero;
593
594 /* When SSE registers are split into halves, we can avoid
595 writing to the top half twice. */
596 if (TARGET_SSE_SPLIT_REGS)
597 {
598 emit_clobber (op0);
599 zero = op0;
600 }
601 else
602 {
603 /* ??? Not sure about the best option for the Intel chips.
604 The following would seem to satisfy; the register is
605 entirely cleared, breaking the dependency chain. We
606 then store to the upper half, with a dependency depth
607 of one. A rumor has it that Intel recommends two movsd
608 followed by an unpacklpd, but this is unconfirmed. And
609 given that the dependency depth of the unpacklpd would
610 still be one, I'm not sure why this would be better. */
611 zero = CONST0_RTX (V2DFmode);
612 }
613
614 m = adjust_address (op1, DFmode, 0);
615 emit_insn (gen_sse2_loadlpd (op0, zero, m));
616 m = adjust_address (op1, DFmode, 8);
617 emit_insn (gen_sse2_loadhpd (op0, op0, m));
618 }
619 else
620 {
621 rtx t;
622
623 if (mode != V4SFmode)
624 t = gen_reg_rtx (V4SFmode);
625 else
626 t = op0;
627
628 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
629 emit_move_insn (t, CONST0_RTX (V4SFmode));
630 else
631 emit_clobber (t);
632
633 m = adjust_address (op1, V2SFmode, 0);
634 emit_insn (gen_sse_loadlps (t, t, m));
635 m = adjust_address (op1, V2SFmode, 8);
636 emit_insn (gen_sse_loadhps (t, t, m));
637 if (mode != V4SFmode)
638 emit_move_insn (op0, gen_lowpart (mode, t));
639 }
640 }
641 else if (MEM_P (op0))
642 {
643 if (TARGET_SSE2 && mode == V2DFmode)
644 {
645 m = adjust_address (op0, DFmode, 0);
646 emit_insn (gen_sse2_storelpd (m, op1));
647 m = adjust_address (op0, DFmode, 8);
648 emit_insn (gen_sse2_storehpd (m, op1));
649 }
650 else
651 {
652 if (mode != V4SFmode)
653 op1 = gen_lowpart (V4SFmode, op1);
654
655 m = adjust_address (op0, V2SFmode, 0);
656 emit_insn (gen_sse_storelps (m, op1));
657 m = adjust_address (op0, V2SFmode, 8);
658 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
659 }
660 }
661 else
662 gcc_unreachable ();
663 }
664
665 /* Move bits 64:95 to bits 32:63. */
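/* Expressed as a V4SImode select this is the permutation { 0, 2, 0, 0 },
   i.e. element 2 (bits 64:95) is copied into element 1 (bits 32:63) while
   element 0 is kept; roughly a pshufd with immediate 0x08.  */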
666
667 void
668 ix86_move_vector_high_sse_to_mmx (rtx op)
669 {
670 rtx mask = gen_rtx_PARALLEL (VOIDmode,
671 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
672 GEN_INT (0), GEN_INT (0)));
673 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
674 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
675 rtx insn = gen_rtx_SET (dest, op);
676 emit_insn (insn);
677 }
678
679 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
680
681 void
682 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
683 {
684 rtx op0 = operands[0];
685 rtx op1 = operands[1];
686 rtx op2 = operands[2];
687
688 machine_mode dmode = GET_MODE (op0);
689 machine_mode smode = GET_MODE (op1);
690 machine_mode inner_dmode = GET_MODE_INNER (dmode);
691 machine_mode inner_smode = GET_MODE_INNER (smode);
692
693 /* Get the corresponding SSE mode for destination. */
694 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
695 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
696 nunits).require ();
697 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
698 nunits / 2).require ();
699
700 /* Get the corresponding SSE mode for source. */
701 nunits = 16 / GET_MODE_SIZE (inner_smode);
702 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
703 nunits).require ();
704
705 /* Generate SSE pack with signed/unsigned saturation. */
706 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
707 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
708 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
709
710 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
711 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
712 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
713 op1, op2));
714 emit_insn (insn);
715
716 ix86_move_vector_high_sse_to_mmx (op0);
717 }
718
719 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
720
721 void
722 ix86_split_mmx_punpck (rtx operands[], bool high_p)
723 {
724 rtx op0 = operands[0];
725 rtx op1 = operands[1];
726 rtx op2 = operands[2];
727 machine_mode mode = GET_MODE (op0);
728 rtx mask;
729 /* The corresponding SSE mode. */
730 machine_mode sse_mode, double_sse_mode;
731
732 switch (mode)
733 {
734 case E_V8QImode:
735 sse_mode = V16QImode;
736 double_sse_mode = V32QImode;
737 mask = gen_rtx_PARALLEL (VOIDmode,
738 gen_rtvec (16,
739 GEN_INT (0), GEN_INT (16),
740 GEN_INT (1), GEN_INT (17),
741 GEN_INT (2), GEN_INT (18),
742 GEN_INT (3), GEN_INT (19),
743 GEN_INT (4), GEN_INT (20),
744 GEN_INT (5), GEN_INT (21),
745 GEN_INT (6), GEN_INT (22),
746 GEN_INT (7), GEN_INT (23)));
747 break;
748
749 case E_V4HImode:
750 sse_mode = V8HImode;
751 double_sse_mode = V16HImode;
752 mask = gen_rtx_PARALLEL (VOIDmode,
753 gen_rtvec (8,
754 GEN_INT (0), GEN_INT (8),
755 GEN_INT (1), GEN_INT (9),
756 GEN_INT (2), GEN_INT (10),
757 GEN_INT (3), GEN_INT (11)));
758 break;
759
760 case E_V2SImode:
761 sse_mode = V4SImode;
762 double_sse_mode = V8SImode;
763 mask = gen_rtx_PARALLEL (VOIDmode,
764 gen_rtvec (4,
765 GEN_INT (0), GEN_INT (4),
766 GEN_INT (1), GEN_INT (5)));
767 break;
768
769 default:
770 gcc_unreachable ();
771 }
772
773 /* Generate SSE punpcklXX. */
774 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
775 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
776 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
777
778 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
779 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
780 rtx insn = gen_rtx_SET (dest, op2);
781 emit_insn (insn);
782
783 if (high_p)
784 {
785 /* Move bits 64:127 to bits 0:63. */
786 mask = gen_rtx_PARALLEL (VOIDmode,
787 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
788 GEN_INT (0), GEN_INT (0)));
789 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
790 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
791 insn = gen_rtx_SET (dest, op1);
792 emit_insn (insn);
793 }
794 }
795
796 /* Helper function of ix86_fixup_binary_operands to canonicalize
797 operand order. Returns true if the operands should be swapped. */
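/* For example, with a commutative PLUS where dst happens to equal src2,
   swapping src1 and src2 lets src1 match dst, so the two-operand x86 form
   "add src, dst" (dst = dst + src) can be used without an extra move.  */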
798
799 static bool
800 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
801 rtx operands[])
802 {
803 rtx dst = operands[0];
804 rtx src1 = operands[1];
805 rtx src2 = operands[2];
806
807 /* If the operation is not commutative, we can't do anything. */
808 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
809 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
810 return false;
811
812 /* Highest priority is that src1 should match dst. */
813 if (rtx_equal_p (dst, src1))
814 return false;
815 if (rtx_equal_p (dst, src2))
816 return true;
817
818 /* Next highest priority is that immediate constants come second. */
819 if (immediate_operand (src2, mode))
820 return false;
821 if (immediate_operand (src1, mode))
822 return true;
823
824 /* Lowest priority is that memory references should come second. */
825 if (MEM_P (src2))
826 return false;
827 if (MEM_P (src1))
828 return true;
829
830 return false;
831 }
832
833
834 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
835 destination to use for the operation. If different from the true
836 destination in operands[0], a copy operation will be required. */
837
838 rtx
839 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
840 rtx operands[])
841 {
842 rtx dst = operands[0];
843 rtx src1 = operands[1];
844 rtx src2 = operands[2];
845
846 /* Canonicalize operand order. */
847 if (ix86_swap_binary_operands_p (code, mode, operands))
848 {
849 /* It is invalid to swap operands of different modes. */
850 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
851
852 std::swap (src1, src2);
853 }
854
 855 /* The two source operands cannot both be in memory. */
856 if (MEM_P (src1) && MEM_P (src2))
857 {
858 /* Optimization: Only read from memory once. */
859 if (rtx_equal_p (src1, src2))
860 {
861 src2 = force_reg (mode, src2);
862 src1 = src2;
863 }
864 else if (rtx_equal_p (dst, src1))
865 src2 = force_reg (mode, src2);
866 else
867 src1 = force_reg (mode, src1);
868 }
869
870 /* If the destination is memory, and we do not have matching source
871 operands, do things in registers. */
872 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
873 dst = gen_reg_rtx (mode);
874
875 /* Source 1 cannot be a constant. */
876 if (CONSTANT_P (src1))
877 src1 = force_reg (mode, src1);
878
879 /* Source 1 cannot be a non-matching memory. */
880 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
881 src1 = force_reg (mode, src1);
882
883 /* Improve address combine. */
884 if (code == PLUS
885 && GET_MODE_CLASS (mode) == MODE_INT
886 && MEM_P (src2))
887 src2 = force_reg (mode, src2);
888
889 operands[1] = src1;
890 operands[2] = src2;
891 return dst;
892 }
893
894 /* Similarly, but assume that the destination has already been
895 set up properly. */
896
897 void
898 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
899 machine_mode mode, rtx operands[])
900 {
901 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
902 gcc_assert (dst == operands[0]);
903 }
904
 905 /* Attempt to expand a binary operator. Make the expansion closer to the
 906 actual machine than just general_operand, which would allow 3 separate
 907 memory references (one output, two input) in a single insn. */
908
909 void
910 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
911 rtx operands[])
912 {
913 rtx src1, src2, dst, op, clob;
914
915 dst = ix86_fixup_binary_operands (code, mode, operands);
916 src1 = operands[1];
917 src2 = operands[2];
918
919 /* Emit the instruction. */
920
921 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
922
923 if (reload_completed
924 && code == PLUS
925 && !rtx_equal_p (dst, src1))
926 {
927 /* This is going to be an LEA; avoid splitting it later. */
928 emit_insn (op);
929 }
930 else
931 {
932 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
933 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
934 }
935
936 /* Fix up the destination if needed. */
937 if (dst != operands[0])
938 emit_move_insn (operands[0], dst);
939 }
940
941 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
942 the given OPERANDS. */
943
944 void
945 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
946 rtx operands[])
947 {
948 rtx op1 = NULL_RTX, op2 = NULL_RTX;
949 if (SUBREG_P (operands[1]))
950 {
951 op1 = operands[1];
952 op2 = operands[2];
953 }
954 else if (SUBREG_P (operands[2]))
955 {
956 op1 = operands[2];
957 op2 = operands[1];
958 }
 959 /* Optimize (__m128i) d | (__m128i) e and similar code
 960 when d and e are float vectors into a float vector logical
 961 insn. In C/C++, without using intrinsics there is no other way
 962 to express a vector logical operation on float vectors than
 963 to cast them temporarily to integer vectors. */
964 if (op1
965 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
966 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
967 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
968 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
969 && SUBREG_BYTE (op1) == 0
970 && (GET_CODE (op2) == CONST_VECTOR
971 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
972 && SUBREG_BYTE (op2) == 0))
973 && can_create_pseudo_p ())
974 {
975 rtx dst;
976 switch (GET_MODE (SUBREG_REG (op1)))
977 {
978 case E_V4SFmode:
979 case E_V8SFmode:
980 case E_V16SFmode:
981 case E_V2DFmode:
982 case E_V4DFmode:
983 case E_V8DFmode:
984 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
985 if (GET_CODE (op2) == CONST_VECTOR)
986 {
987 op2 = gen_lowpart (GET_MODE (dst), op2);
988 op2 = force_reg (GET_MODE (dst), op2);
989 }
990 else
991 {
992 op1 = operands[1];
993 op2 = SUBREG_REG (operands[2]);
994 if (!vector_operand (op2, GET_MODE (dst)))
995 op2 = force_reg (GET_MODE (dst), op2);
996 }
997 op1 = SUBREG_REG (op1);
998 if (!vector_operand (op1, GET_MODE (dst)))
999 op1 = force_reg (GET_MODE (dst), op1);
1000 emit_insn (gen_rtx_SET (dst,
1001 gen_rtx_fmt_ee (code, GET_MODE (dst),
1002 op1, op2)));
1003 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1004 return;
1005 default:
1006 break;
1007 }
1008 }
1009 if (!vector_operand (operands[1], mode))
1010 operands[1] = force_reg (mode, operands[1]);
1011 if (!vector_operand (operands[2], mode))
1012 operands[2] = force_reg (mode, operands[2]);
1013 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1014 emit_insn (gen_rtx_SET (operands[0],
1015 gen_rtx_fmt_ee (code, mode, operands[1],
1016 operands[2])));
1017 }
1018
1019 /* Return TRUE or FALSE depending on whether the binary operator meets the
1020 appropriate constraints. */
1021
1022 bool
1023 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1024 rtx operands[3])
1025 {
1026 rtx dst = operands[0];
1027 rtx src1 = operands[1];
1028 rtx src2 = operands[2];
1029
 1030 /* The two source operands cannot both be in memory. */
1031 if (MEM_P (src1) && MEM_P (src2))
1032 return false;
1033
1034 /* Canonicalize operand order for commutative operators. */
1035 if (ix86_swap_binary_operands_p (code, mode, operands))
1036 std::swap (src1, src2);
1037
1038 /* If the destination is memory, we must have a matching source operand. */
1039 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1040 return false;
1041
1042 /* Source 1 cannot be a constant. */
1043 if (CONSTANT_P (src1))
1044 return false;
1045
1046 /* Source 1 cannot be a non-matching memory. */
1047 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1048 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1049 return (code == AND
1050 && (mode == HImode
1051 || mode == SImode
1052 || (TARGET_64BIT && mode == DImode))
1053 && satisfies_constraint_L (src2));
1054
1055 return true;
1056 }
1057
 1058 /* Attempt to expand a unary operator. Make the expansion closer to the
 1059 actual machine than just general_operand, which would allow 2 separate
 1060 memory references (one output, one input) in a single insn. */
1061
1062 void
1063 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1064 rtx operands[])
1065 {
1066 bool matching_memory = false;
1067 rtx src, dst, op, clob;
1068
1069 dst = operands[0];
1070 src = operands[1];
1071
1072 /* If the destination is memory, and we do not have matching source
1073 operands, do things in registers. */
1074 if (MEM_P (dst))
1075 {
1076 if (rtx_equal_p (dst, src))
1077 matching_memory = true;
1078 else
1079 dst = gen_reg_rtx (mode);
1080 }
1081
1082 /* When source operand is memory, destination must match. */
1083 if (MEM_P (src) && !matching_memory)
1084 src = force_reg (mode, src);
1085
1086 /* Emit the instruction. */
1087
1088 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1089
1090 if (code == NOT)
1091 emit_insn (op);
1092 else
1093 {
1094 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1095 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1096 }
1097
1098 /* Fix up the destination if needed. */
1099 if (dst != operands[0])
1100 emit_move_insn (operands[0], dst);
1101 }
1102
 1103 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1104
1105 static void
1106 predict_jump (int prob)
1107 {
1108 rtx_insn *insn = get_last_insn ();
1109 gcc_assert (JUMP_P (insn));
1110 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1111 }
1112
1113 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1114 divisor are within the range [0-255]. */
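/* Sketch of the emitted sequence (illustrative): OR the two inputs into a
   scratch register and test the result against ~0xFF; if no high bits are
   set, branch to a path that uses the narrow HImode/QImode divide (the
   quotient ends up in AL and the remainder in AH), otherwise fall through
   to the ordinary full-width divmod.  */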
1115
1116 void
1117 ix86_split_idivmod (machine_mode mode, rtx operands[],
1118 bool unsigned_p)
1119 {
1120 rtx_code_label *end_label, *qimode_label;
1121 rtx div, mod;
1122 rtx_insn *insn;
1123 rtx scratch, tmp0, tmp1, tmp2;
1124 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1125 rtx (*gen_zero_extend) (rtx, rtx);
1126 rtx (*gen_test_ccno_1) (rtx, rtx);
1127
1128 switch (mode)
1129 {
1130 case E_SImode:
1131 if (GET_MODE (operands[0]) == SImode)
1132 {
1133 if (GET_MODE (operands[1]) == SImode)
1134 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1135 else
1136 gen_divmod4_1
1137 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1138 gen_zero_extend = gen_zero_extendqisi2;
1139 }
1140 else
1141 {
1142 gen_divmod4_1
1143 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1144 gen_zero_extend = gen_zero_extendqidi2;
1145 }
1146 gen_test_ccno_1 = gen_testsi_ccno_1;
1147 break;
1148 case E_DImode:
1149 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1150 gen_test_ccno_1 = gen_testdi_ccno_1;
1151 gen_zero_extend = gen_zero_extendqidi2;
1152 break;
1153 default:
1154 gcc_unreachable ();
1155 }
1156
1157 end_label = gen_label_rtx ();
1158 qimode_label = gen_label_rtx ();
1159
1160 scratch = gen_reg_rtx (mode);
1161
 1162 /* Use 8bit unsigned divmod if the dividend and divisor are both within
 1163 the range [0-255]. */
1164 emit_move_insn (scratch, operands[2]);
1165 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1166 scratch, 1, OPTAB_DIRECT);
1167 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
1168 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1169 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1170 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1171 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1172 pc_rtx);
1173 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1174 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1175 JUMP_LABEL (insn) = qimode_label;
1176
 1177 /* Generate the original signed/unsigned divmod. */
1178 div = gen_divmod4_1 (operands[0], operands[1],
1179 operands[2], operands[3]);
1180 emit_insn (div);
1181
1182 /* Branch to the end. */
1183 emit_jump_insn (gen_jump (end_label));
1184 emit_barrier ();
1185
1186 /* Generate 8bit unsigned divide. */
1187 emit_label (qimode_label);
1188 /* Don't use operands[0] for result of 8bit divide since not all
1189 registers support QImode ZERO_EXTRACT. */
1190 tmp0 = lowpart_subreg (HImode, scratch, mode);
1191 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1192 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1193 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1194
1195 if (unsigned_p)
1196 {
1197 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1198 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1199 }
1200 else
1201 {
1202 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1203 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1204 }
1205 if (mode == SImode)
1206 {
1207 if (GET_MODE (operands[0]) != SImode)
1208 div = gen_rtx_ZERO_EXTEND (DImode, div);
1209 if (GET_MODE (operands[1]) != SImode)
1210 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1211 }
1212
1213 /* Extract remainder from AH. */
1214 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1215 tmp0, GEN_INT (8), GEN_INT (8));
1216 if (REG_P (operands[1]))
1217 insn = emit_move_insn (operands[1], tmp1);
1218 else
1219 {
1220 /* Need a new scratch register since the old one has result
1221 of 8bit divide. */
1222 scratch = gen_reg_rtx (GET_MODE (operands[1]));
1223 emit_move_insn (scratch, tmp1);
1224 insn = emit_move_insn (operands[1], scratch);
1225 }
1226 set_unique_reg_note (insn, REG_EQUAL, mod);
1227
1228 /* Zero extend quotient from AL. */
1229 tmp1 = gen_lowpart (QImode, tmp0);
1230 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
1231 set_unique_reg_note (insn, REG_EQUAL, div);
1232
1233 emit_label (end_label);
1234 }
1235
 1236 /* Emit x86 binary operator CODE in mode MODE, where the first operand
 1237 matches the destination. The RTX includes a clobber of FLAGS_REG. */
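/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits
   (parallel [(set dst (plus:SI dst src))
              (clobber (reg:CC FLAGS_REG))]).  */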
1238
1239 void
1240 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1241 rtx dst, rtx src)
1242 {
1243 rtx op, clob;
1244
1245 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1246 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1247
1248 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1249 }
1250
1251 /* Return true if regno1 def is nearest to the insn. */
1252
1253 static bool
1254 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1255 {
1256 rtx_insn *prev = insn;
1257 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1258
1259 if (insn == start)
1260 return false;
1261 while (prev && prev != start)
1262 {
1263 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1264 {
1265 prev = PREV_INSN (prev);
1266 continue;
1267 }
1268 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1269 return true;
1270 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1271 return false;
1272 prev = PREV_INSN (prev);
1273 }
1274
1275 /* None of the regs is defined in the bb. */
1276 return false;
1277 }
1278
 1279 /* Split lea instructions into a sequence of instructions
 1280 which are executed on the ALU, to avoid AGU stalls.
 1281 It is assumed that the flags register may be clobbered
 1282 at the lea position. */
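/* Illustrative example: "leal 4(%ebx,%ecx,4), %eax" may be replaced by
   something like "movl %ecx, %eax; sall $2, %eax; addl %ebx, %eax;
   addl $4, %eax", trading the AGU-bound lea for plain ALU instructions.  */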
1283
1284 void
1285 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1286 {
1287 unsigned int regno0, regno1, regno2;
1288 struct ix86_address parts;
1289 rtx target, tmp;
1290 int ok, adds;
1291
1292 ok = ix86_decompose_address (operands[1], &parts);
1293 gcc_assert (ok);
1294
1295 target = gen_lowpart (mode, operands[0]);
1296
1297 regno0 = true_regnum (target);
1298 regno1 = INVALID_REGNUM;
1299 regno2 = INVALID_REGNUM;
1300
1301 if (parts.base)
1302 {
1303 parts.base = gen_lowpart (mode, parts.base);
1304 regno1 = true_regnum (parts.base);
1305 }
1306
1307 if (parts.index)
1308 {
1309 parts.index = gen_lowpart (mode, parts.index);
1310 regno2 = true_regnum (parts.index);
1311 }
1312
1313 if (parts.disp)
1314 parts.disp = gen_lowpart (mode, parts.disp);
1315
1316 if (parts.scale > 1)
1317 {
1318 /* Case r1 = r1 + ... */
1319 if (regno1 == regno0)
1320 {
 1321 /* If we have a case like r1 = r1 + C * r2 then we
 1322 would have to use multiplication, which is very
 1323 expensive. Assume the cost model is wrong if we
 1324 see such a case here. */
1325 gcc_assert (regno2 != regno0);
1326
1327 for (adds = parts.scale; adds > 0; adds--)
1328 ix86_emit_binop (PLUS, mode, target, parts.index);
1329 }
1330 else
1331 {
1332 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1333 if (regno0 != regno2)
1334 emit_insn (gen_rtx_SET (target, parts.index));
1335
1336 /* Use shift for scaling. */
1337 ix86_emit_binop (ASHIFT, mode, target,
1338 GEN_INT (exact_log2 (parts.scale)));
1339
1340 if (parts.base)
1341 ix86_emit_binop (PLUS, mode, target, parts.base);
1342
1343 if (parts.disp && parts.disp != const0_rtx)
1344 ix86_emit_binop (PLUS, mode, target, parts.disp);
1345 }
1346 }
1347 else if (!parts.base && !parts.index)
1348 {
 1349 gcc_assert (parts.disp);
1350 emit_insn (gen_rtx_SET (target, parts.disp));
1351 }
1352 else
1353 {
1354 if (!parts.base)
1355 {
1356 if (regno0 != regno2)
1357 emit_insn (gen_rtx_SET (target, parts.index));
1358 }
1359 else if (!parts.index)
1360 {
1361 if (regno0 != regno1)
1362 emit_insn (gen_rtx_SET (target, parts.base));
1363 }
1364 else
1365 {
1366 if (regno0 == regno1)
1367 tmp = parts.index;
1368 else if (regno0 == regno2)
1369 tmp = parts.base;
1370 else
1371 {
1372 rtx tmp1;
1373
1374 /* Find better operand for SET instruction, depending
1375 on which definition is farther from the insn. */
1376 if (find_nearest_reg_def (insn, regno1, regno2))
1377 tmp = parts.index, tmp1 = parts.base;
1378 else
1379 tmp = parts.base, tmp1 = parts.index;
1380
1381 emit_insn (gen_rtx_SET (target, tmp));
1382
1383 if (parts.disp && parts.disp != const0_rtx)
1384 ix86_emit_binop (PLUS, mode, target, parts.disp);
1385
1386 ix86_emit_binop (PLUS, mode, target, tmp1);
1387 return;
1388 }
1389
1390 ix86_emit_binop (PLUS, mode, target, tmp);
1391 }
1392
1393 if (parts.disp && parts.disp != const0_rtx)
1394 ix86_emit_binop (PLUS, mode, target, parts.disp);
1395 }
1396 }
1397
1398 /* Post-reload splitter for converting an SF or DFmode value in an
1399 SSE register into an unsigned SImode. */
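/* The idea (sketch): compare the input against 0x1p31; for lanes that are
   >= 0x1p31, subtract 0x1p31 before the signed truncating conversion and
   then xor the integer result with 0x80000000, reconstructed here by
   shifting the all-ones compare mask left by 31.  */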
1400
1401 void
1402 ix86_split_convert_uns_si_sse (rtx operands[])
1403 {
1404 machine_mode vecmode;
1405 rtx value, large, zero_or_two31, input, two31, x;
1406
1407 large = operands[1];
1408 zero_or_two31 = operands[2];
1409 input = operands[3];
1410 two31 = operands[4];
1411 vecmode = GET_MODE (large);
1412 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1413
1414 /* Load up the value into the low element. We must ensure that the other
1415 elements are valid floats -- zero is the easiest such value. */
1416 if (MEM_P (input))
1417 {
1418 if (vecmode == V4SFmode)
1419 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1420 else
1421 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1422 }
1423 else
1424 {
1425 input = gen_rtx_REG (vecmode, REGNO (input));
1426 emit_move_insn (value, CONST0_RTX (vecmode));
1427 if (vecmode == V4SFmode)
1428 emit_insn (gen_sse_movss (value, value, input));
1429 else
1430 emit_insn (gen_sse2_movsd (value, value, input));
1431 }
1432
1433 emit_move_insn (large, two31);
1434 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1435
1436 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1437 emit_insn (gen_rtx_SET (large, x));
1438
1439 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1440 emit_insn (gen_rtx_SET (zero_or_two31, x));
1441
1442 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1443 emit_insn (gen_rtx_SET (value, x));
1444
1445 large = gen_rtx_REG (V4SImode, REGNO (large));
1446 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1447
1448 x = gen_rtx_REG (V4SImode, REGNO (value));
1449 if (vecmode == V4SFmode)
1450 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1451 else
1452 emit_insn (gen_sse2_cvttpd2dq (x, value));
1453 value = x;
1454
1455 emit_insn (gen_xorv4si3 (value, value, large));
1456 }
1457
1458 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1459 machine_mode mode, rtx target,
1460 rtx var, int one_var);
1461
1462 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1463 Expects the 64-bit DImode to be supplied in a pair of integral
1464 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1465 -mfpmath=sse, !optimize_size only. */
1466
1467 void
1468 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1469 {
1470 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1471 rtx int_xmm, fp_xmm;
1472 rtx biases, exponents;
1473 rtx x;
1474
1475 int_xmm = gen_reg_rtx (V4SImode);
1476 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1477 emit_insn (gen_movdi_to_sse (int_xmm, input));
1478 else if (TARGET_SSE_SPLIT_REGS)
1479 {
1480 emit_clobber (int_xmm);
1481 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1482 }
1483 else
1484 {
1485 x = gen_reg_rtx (V2DImode);
1486 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1487 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1488 }
1489
1490 x = gen_rtx_CONST_VECTOR (V4SImode,
1491 gen_rtvec (4, GEN_INT (0x43300000UL),
1492 GEN_INT (0x45300000UL),
1493 const0_rtx, const0_rtx));
1494 exponents = validize_mem (force_const_mem (V4SImode, x));
1495
1496 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1497 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1498
1499 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1500 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1501 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1502 (0x1.0p84 + double(fp_value_hi_xmm)).
1503 Note these exponents differ by 32. */
1504
1505 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1506
1507 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1508 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1509 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1510 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1511 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1512 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1513 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1514 biases = validize_mem (force_const_mem (V2DFmode, biases));
1515 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1516
1517 /* Add the upper and lower DFmode values together. */
1518 if (TARGET_SSE3)
1519 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1520 else
1521 {
1522 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1523 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1524 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1525 }
1526
1527 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1528 }
1529
1530 /* Not used, but eases macroization of patterns. */
1531 void
1532 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1533 {
1534 gcc_unreachable ();
1535 }
1536
1537 /* Convert an unsigned SImode value into a DFmode. Only currently used
1538 for SSE, but applicable anywhere. */
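/* Sketch of the computation: the result is
   (double) (int) (input - 0x80000000) + 0x1p31, which equals
   (double) input exactly for every 32-bit unsigned input.  */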
1539
1540 void
1541 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1542 {
1543 REAL_VALUE_TYPE TWO31r;
1544 rtx x, fp;
1545
1546 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1547 NULL, 1, OPTAB_DIRECT);
1548
1549 fp = gen_reg_rtx (DFmode);
1550 emit_insn (gen_floatsidf2 (fp, x));
1551
1552 real_ldexp (&TWO31r, &dconst1, 31);
1553 x = const_double_from_real_value (TWO31r, DFmode);
1554
1555 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1556 if (x != target)
1557 emit_move_insn (target, x);
1558 }
1559
1560 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1561 32-bit mode; otherwise we have a direct convert instruction. */
1562
1563 void
1564 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1565 {
1566 REAL_VALUE_TYPE TWO32r;
1567 rtx fp_lo, fp_hi, x;
1568
1569 fp_lo = gen_reg_rtx (DFmode);
1570 fp_hi = gen_reg_rtx (DFmode);
1571
1572 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1573
1574 real_ldexp (&TWO32r, &dconst1, 32);
1575 x = const_double_from_real_value (TWO32r, DFmode);
1576 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1577
1578 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1579
1580 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1581 0, OPTAB_DIRECT);
1582 if (x != target)
1583 emit_move_insn (target, x);
1584 }
1585
1586 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1587 For x86_32, -mfpmath=sse, !optimize_size only. */
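/* Sketch of the computation: the result is
   (float) (input >> 16) * 0x1p16 + (float) (input & 0xffff); each half
   fits in 16 bits, so the intermediate conversions are exact.  */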
1588 void
1589 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1590 {
1591 REAL_VALUE_TYPE ONE16r;
1592 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1593
1594 real_ldexp (&ONE16r, &dconst1, 16);
1595 x = const_double_from_real_value (ONE16r, SFmode);
1596 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1597 NULL, 0, OPTAB_DIRECT);
1598 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1599 NULL, 0, OPTAB_DIRECT);
1600 fp_hi = gen_reg_rtx (SFmode);
1601 fp_lo = gen_reg_rtx (SFmode);
1602 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1603 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1604 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1605 0, OPTAB_DIRECT);
1606 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1607 0, OPTAB_DIRECT);
1608 if (!rtx_equal_p (target, fp_hi))
1609 emit_move_insn (target, fp_hi);
1610 }
1611
1612 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1613 a vector of unsigned ints VAL to vector of floats TARGET. */
1614
1615 void
1616 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1617 {
1618 rtx tmp[8];
1619 REAL_VALUE_TYPE TWO16r;
1620 machine_mode intmode = GET_MODE (val);
1621 machine_mode fltmode = GET_MODE (target);
1622 rtx (*cvt) (rtx, rtx);
1623
1624 if (intmode == V4SImode)
1625 cvt = gen_floatv4siv4sf2;
1626 else
1627 cvt = gen_floatv8siv8sf2;
1628 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1629 tmp[0] = force_reg (intmode, tmp[0]);
1630 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1631 OPTAB_DIRECT);
1632 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1633 NULL_RTX, 1, OPTAB_DIRECT);
1634 tmp[3] = gen_reg_rtx (fltmode);
1635 emit_insn (cvt (tmp[3], tmp[1]));
1636 tmp[4] = gen_reg_rtx (fltmode);
1637 emit_insn (cvt (tmp[4], tmp[2]));
1638 real_ldexp (&TWO16r, &dconst1, 16);
1639 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1640 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1641 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1642 OPTAB_DIRECT);
1643 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1644 OPTAB_DIRECT);
1645 if (tmp[7] != target)
1646 emit_move_insn (target, tmp[7]);
1647 }
1648
1649 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1650 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1651 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1652 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1653
1654 rtx
1655 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1656 {
1657 REAL_VALUE_TYPE TWO31r;
1658 rtx two31r, tmp[4];
1659 machine_mode mode = GET_MODE (val);
1660 machine_mode scalarmode = GET_MODE_INNER (mode);
1661 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1662 rtx (*cmp) (rtx, rtx, rtx, rtx);
1663 int i;
1664
1665 for (i = 0; i < 3; i++)
1666 tmp[i] = gen_reg_rtx (mode);
1667 real_ldexp (&TWO31r, &dconst1, 31);
1668 two31r = const_double_from_real_value (TWO31r, scalarmode);
1669 two31r = ix86_build_const_vector (mode, 1, two31r);
1670 two31r = force_reg (mode, two31r);
1671 switch (mode)
1672 {
1673 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1674 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1675 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1676 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1677 default: gcc_unreachable ();
1678 }
1679 tmp[3] = gen_rtx_LE (mode, two31r, val);
1680 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1681 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1682 0, OPTAB_DIRECT);
1683 if (intmode == V4SImode || TARGET_AVX2)
1684 *xorp = expand_simple_binop (intmode, ASHIFT,
1685 gen_lowpart (intmode, tmp[0]),
1686 GEN_INT (31), NULL_RTX, 0,
1687 OPTAB_DIRECT);
1688 else
1689 {
1690 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
1691 two31 = ix86_build_const_vector (intmode, 1, two31);
1692 *xorp = expand_simple_binop (intmode, AND,
1693 gen_lowpart (intmode, tmp[0]),
1694 two31, NULL_RTX, 0,
1695 OPTAB_DIRECT);
1696 }
1697 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1698 0, OPTAB_DIRECT);
1699 }
1700
1701 /* Generate code for floating point ABS or NEG. */
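/* With SSE this becomes a bitwise mask operation: ABS is an AND with a
   mask that clears the sign bit and NEG is an XOR with the sign-bit mask
   (e.g. andps/xorps or andpd/xorpd); for values held in integer registers
   the splitter below flips or clears the sign bit directly.  */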
1702
1703 void
1704 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1705 rtx operands[])
1706 {
1707 rtx set, dst, src;
1708 bool use_sse = false;
1709 bool vector_mode = VECTOR_MODE_P (mode);
1710 machine_mode vmode = mode;
1711 rtvec par;
1712
1713 if (vector_mode)
1714 use_sse = true;
1715 else if (mode == TFmode)
1716 use_sse = true;
1717 else if (TARGET_SSE_MATH)
1718 {
1719 use_sse = SSE_FLOAT_MODE_P (mode);
1720 if (mode == SFmode)
1721 vmode = V4SFmode;
1722 else if (mode == DFmode)
1723 vmode = V2DFmode;
1724 }
1725
1726 dst = operands[0];
1727 src = operands[1];
1728
1729 set = gen_rtx_fmt_e (code, mode, src);
1730 set = gen_rtx_SET (dst, set);
1731
1732 if (use_sse)
1733 {
1734 rtx mask, use, clob;
1735
1736 /* NEG and ABS performed with SSE use bitwise mask operations.
1737 Create the appropriate mask now. */
1738 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1739 use = gen_rtx_USE (VOIDmode, mask);
1740 if (vector_mode)
1741 par = gen_rtvec (2, set, use);
1742 else
1743 {
1744 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1745 par = gen_rtvec (3, set, use, clob);
1746 }
1747 }
1748 else
1749 {
1750 rtx clob;
1751
 1752 /* The sign of FP values can also be changed using the integer unit. */
1753 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1754 par = gen_rtvec (2, set, clob);
1755 }
1756
1757 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1758 }
1759
1760 /* Deconstruct a floating point ABS or NEG operation
1761 with integer registers into integer operations. */
1762
1763 void
1764 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1765 rtx operands[])
1766 {
1767 enum rtx_code absneg_op;
1768 rtx dst, set;
1769
1770 gcc_assert (operands_match_p (operands[0], operands[1]));
1771
1772 switch (mode)
1773 {
1774 case E_SFmode:
1775 dst = gen_lowpart (SImode, operands[0]);
1776
1777 if (code == ABS)
1778 {
1779 set = gen_int_mode (0x7fffffff, SImode);
1780 absneg_op = AND;
1781 }
1782 else
1783 {
1784 set = gen_int_mode (0x80000000, SImode);
1785 absneg_op = XOR;
1786 }
1787 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1788 break;
1789
1790 case E_DFmode:
1791 if (TARGET_64BIT)
1792 {
1793 dst = gen_lowpart (DImode, operands[0]);
1794 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1795
1796 if (code == ABS)
1797 set = const0_rtx;
1798 else
1799 set = gen_rtx_NOT (DImode, dst);
1800 }
1801 else
1802 {
1803 dst = gen_highpart (SImode, operands[0]);
1804
1805 if (code == ABS)
1806 {
1807 set = gen_int_mode (0x7fffffff, SImode);
1808 absneg_op = AND;
1809 }
1810 else
1811 {
1812 set = gen_int_mode (0x80000000, SImode);
1813 absneg_op = XOR;
1814 }
1815 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1816 }
1817 break;
1818
1819 case E_XFmode:
1820 dst = gen_rtx_REG (SImode,
1821 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1822 if (code == ABS)
1823 {
1824 set = GEN_INT (0x7fff);
1825 absneg_op = AND;
1826 }
1827 else
1828 {
1829 set = GEN_INT (0x8000);
1830 absneg_op = XOR;
1831 }
1832 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1833 break;
1834
1835 default:
1836 gcc_unreachable ();
1837 }
1838
1839 set = gen_rtx_SET (dst, set);
1840
1841 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1842 rtvec par = gen_rtvec (2, set, clob);
1843
1844 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1845 }
1846
1847 /* Expand a copysign operation. Special case operand 0 being a constant. */
1848
1849 void
1850 ix86_expand_copysign (rtx operands[])
1851 {
1852 machine_mode mode, vmode;
1853 rtx dest, op0, op1, mask;
1854
1855 dest = operands[0];
1856 op0 = operands[1];
1857 op1 = operands[2];
1858
1859 mode = GET_MODE (dest);
1860
1861 if (mode == SFmode)
1862 vmode = V4SFmode;
1863 else if (mode == DFmode)
1864 vmode = V2DFmode;
1865 else if (mode == TFmode)
1866 vmode = mode;
1867 else
1868 gcc_unreachable ();
1869
1870 mask = ix86_build_signbit_mask (vmode, 0, 0);
1871
1872 if (CONST_DOUBLE_P (op0))
1873 {
1874 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1875 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1876
1877 if (mode == SFmode || mode == DFmode)
1878 {
1879 if (op0 == CONST0_RTX (mode))
1880 op0 = CONST0_RTX (vmode);
1881 else
1882 {
1883 rtx v = ix86_build_const_vector (vmode, false, op0);
1884
1885 op0 = force_reg (vmode, v);
1886 }
1887 }
1888 else if (op0 != CONST0_RTX (mode))
1889 op0 = force_reg (mode, op0);
1890
1891 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1892 }
1893 else
1894 {
1895 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1896
1897 emit_insn (gen_copysign3_var
1898 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1899 }
1900 }
1901
1902 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1903 be a constant, and so has already been expanded into a vector constant. */
1904
1905 void
1906 ix86_split_copysign_const (rtx operands[])
1907 {
1908 machine_mode mode, vmode;
1909 rtx dest, op0, mask, x;
1910
1911 dest = operands[0];
1912 op0 = operands[1];
1913 mask = operands[3];
1914
1915 mode = GET_MODE (dest);
1916 vmode = GET_MODE (mask);
1917
1918 dest = lowpart_subreg (vmode, dest, mode);
1919 x = gen_rtx_AND (vmode, dest, mask);
1920 emit_insn (gen_rtx_SET (dest, x));
1921
1922 if (op0 != CONST0_RTX (vmode))
1923 {
1924 x = gen_rtx_IOR (vmode, dest, op0);
1925 emit_insn (gen_rtx_SET (dest, x));
1926 }
1927 }
1928
1929 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1930 so we have to do two masks. */
1931
1932 void
1933 ix86_split_copysign_var (rtx operands[])
1934 {
1935 machine_mode mode, vmode;
1936 rtx dest, scratch, op0, op1, mask, nmask, x;
1937
1938 dest = operands[0];
1939 scratch = operands[1];
1940 op0 = operands[2];
1941 op1 = operands[3];
1942 nmask = operands[4];
1943 mask = operands[5];
1944
1945 mode = GET_MODE (dest);
1946 vmode = GET_MODE (mask);
1947
1948 if (rtx_equal_p (op0, op1))
1949 {
1950 /* Shouldn't happen often (it's useless, obviously), but when it does
1951 we'd generate incorrect code if we continue below. */
1952 emit_move_insn (dest, op0);
1953 return;
1954 }
1955
1956 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1957 {
1958 gcc_assert (REGNO (op1) == REGNO (scratch));
1959
1960 x = gen_rtx_AND (vmode, scratch, mask);
1961 emit_insn (gen_rtx_SET (scratch, x));
1962
1963 dest = mask;
1964 op0 = lowpart_subreg (vmode, op0, mode);
1965 x = gen_rtx_NOT (vmode, dest);
1966 x = gen_rtx_AND (vmode, x, op0);
1967 emit_insn (gen_rtx_SET (dest, x));
1968 }
1969 else
1970 {
1971 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1972 {
1973 x = gen_rtx_AND (vmode, scratch, mask);
1974 }
1975 else /* alternative 2,4 */
1976 {
1977 gcc_assert (REGNO (mask) == REGNO (scratch));
1978 op1 = lowpart_subreg (vmode, op1, mode);
1979 x = gen_rtx_AND (vmode, scratch, op1);
1980 }
1981 emit_insn (gen_rtx_SET (scratch, x));
1982
1983 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1984 {
1985 dest = lowpart_subreg (vmode, op0, mode);
1986 x = gen_rtx_AND (vmode, dest, nmask);
1987 }
1988 else /* alternative 3,4 */
1989 {
1990 gcc_assert (REGNO (nmask) == REGNO (dest));
1991 dest = nmask;
1992 op0 = lowpart_subreg (vmode, op0, mode);
1993 x = gen_rtx_AND (vmode, dest, op0);
1994 }
1995 emit_insn (gen_rtx_SET (dest, x));
1996 }
1997
1998 x = gen_rtx_IOR (vmode, dest, scratch);
1999 emit_insn (gen_rtx_SET (dest, x));
2000 }
2001
2002 /* Expand an xorsign operation. */
2003
2004 void
2005 ix86_expand_xorsign (rtx operands[])
2006 {
2007 machine_mode mode, vmode;
2008 rtx dest, op0, op1, mask;
2009
2010 dest = operands[0];
2011 op0 = operands[1];
2012 op1 = operands[2];
2013
2014 mode = GET_MODE (dest);
2015
2016 if (mode == SFmode)
2017 vmode = V4SFmode;
2018 else if (mode == DFmode)
2019 vmode = V2DFmode;
2020 else
2021 gcc_unreachable ();
2022
2023 mask = ix86_build_signbit_mask (vmode, 0, 0);
2024
2025 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2026 }
2027
2028 /* Deconstruct an xorsign operation into bit masks. */
2029
2030 void
2031 ix86_split_xorsign (rtx operands[])
2032 {
2033 machine_mode mode, vmode;
2034 rtx dest, op0, mask, x;
2035
2036 dest = operands[0];
2037 op0 = operands[1];
2038 mask = operands[3];
2039
2040 mode = GET_MODE (dest);
2041 vmode = GET_MODE (mask);
2042
2043 dest = lowpart_subreg (vmode, dest, mode);
2044 x = gen_rtx_AND (vmode, dest, mask);
2045 emit_insn (gen_rtx_SET (dest, x));
2046
2047 op0 = lowpart_subreg (vmode, op0, mode);
2048 x = gen_rtx_XOR (vmode, dest, op0);
2049 emit_insn (gen_rtx_SET (dest, x));
2050 }
2051
2052 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2053
2054 void
2055 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2056 {
2057 machine_mode mode = GET_MODE (op0);
2058 rtx tmp;
2059
2060 	  /* Handle the special case of a vector comparison with a boolean result;
2061 	     transform it using the ptest instruction.  */
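	  /* E.g. a V2DI equality test "if (a == b) goto label" becomes, roughly,
	     tmp = a ^ b (pxor), then "ptest tmp, tmp" and "je label"; ZF is set
	     iff every bit of a ^ b is zero.  */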
2062 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2063 {
2064 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2065 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2066
2067 gcc_assert (code == EQ || code == NE);
2068 	      /* Generate XOR since we can't check that one operand is a zero vector.  */
2069 tmp = gen_reg_rtx (mode);
2070 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2071 tmp = gen_lowpart (p_mode, tmp);
2072 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2073 gen_rtx_UNSPEC (CCmode,
2074 gen_rtvec (2, tmp, tmp),
2075 UNSPEC_PTEST)));
2076 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2077 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2078 gen_rtx_LABEL_REF (VOIDmode, label),
2079 pc_rtx);
2080 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2081 return;
2082 }
2083
2084 switch (mode)
2085 {
2086 case E_SFmode:
2087 case E_DFmode:
2088 case E_XFmode:
2089 case E_QImode:
2090 case E_HImode:
2091 case E_SImode:
2092 simple:
2093 tmp = ix86_expand_compare (code, op0, op1);
2094 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2095 gen_rtx_LABEL_REF (VOIDmode, label),
2096 pc_rtx);
2097 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2098 return;
2099
2100 case E_DImode:
2101 if (TARGET_64BIT)
2102 goto simple;
2103 	      /* For a 32-bit target, a DImode comparison may be performed in
2104 		 SSE registers.  To allow this we must avoid the split into
2105 		 SImode, which is achieved by doing the xor in DImode and
2106 		 then comparing against zero (a form the STV pass
2107 		 recognizes).  We don't use the xor form when optimizing
2108 		 for size.  */
2109 if (!optimize_insn_for_size_p ()
2110 && TARGET_STV
2111 && (code == EQ || code == NE))
2112 {
2113 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2114 op1 = const0_rtx;
2115 }
2116 /* FALLTHRU */
2117 case E_TImode:
2118 	      /* Expand a double-word branch into multiple compare+branch.  */
2119 {
2120 rtx lo[2], hi[2];
2121 rtx_code_label *label2;
2122 enum rtx_code code1, code2, code3;
2123 machine_mode submode;
2124
2125 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2126 {
2127 std::swap (op0, op1);
2128 code = swap_condition (code);
2129 }
2130
2131 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2132 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2133
2134 submode = mode == DImode ? SImode : DImode;
2135
2136 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2137 avoid two branches. This costs one extra insn, so disable when
2138 optimizing for size. */
2139
2140 if ((code == EQ || code == NE)
2141 && (!optimize_insn_for_size_p ()
2142 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2143 {
2144 rtx xor0, xor1;
2145
2146 xor1 = hi[0];
2147 if (hi[1] != const0_rtx)
2148 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2149 NULL_RTX, 0, OPTAB_WIDEN);
2150
2151 xor0 = lo[0];
2152 if (lo[1] != const0_rtx)
2153 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2154 NULL_RTX, 0, OPTAB_WIDEN);
2155
2156 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2157 NULL_RTX, 0, OPTAB_WIDEN);
2158
2159 ix86_expand_branch (code, tmp, const0_rtx, label);
2160 return;
2161 }
2162
2163 	  /* Otherwise, if we are doing a less-than or greater-or-equal
2164 	     comparison, op1 is a constant and its low word is zero, then we
2165 	     can just examine the high word.  Similarly for a low word of -1
2166 	     and a less-or-equal or greater-than comparison.  */
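	  /* E.g. for unsigned "a < 0x500000000" the low word of the constant
	     is zero, so the test reduces to hi (a) < 5.  */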
2167
2168 if (CONST_INT_P (hi[1]))
2169 switch (code)
2170 {
2171 case LT: case LTU: case GE: case GEU:
2172 if (lo[1] == const0_rtx)
2173 {
2174 ix86_expand_branch (code, hi[0], hi[1], label);
2175 return;
2176 }
2177 break;
2178 case LE: case LEU: case GT: case GTU:
2179 if (lo[1] == constm1_rtx)
2180 {
2181 ix86_expand_branch (code, hi[0], hi[1], label);
2182 return;
2183 }
2184 break;
2185 default:
2186 break;
2187 }
2188
2189 /* Emulate comparisons that do not depend on Zero flag with
2190 double-word subtraction. Note that only Overflow, Sign
2191 and Carry flags are valid, so swap arguments and condition
2192 of comparisons that would otherwise test Zero flag. */
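	  /* E.g. a signed DImode "a < b" on a 32-bit target becomes, roughly:
	       cmpl  lo(b), lo(a)	; CF from lo(a) - lo(b)
	       sbbl  hi(b), hi(a)	; hi(a) - hi(b) - CF, into a scratch
	       jl    label		; branch on Sign/Overflow of that subtract  */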
2193
2194 switch (code)
2195 {
2196 case LE: case LEU: case GT: case GTU:
2197 std::swap (lo[0], lo[1]);
2198 std::swap (hi[0], hi[1]);
2199 code = swap_condition (code);
2200 /* FALLTHRU */
2201
2202 case LT: case LTU: case GE: case GEU:
2203 {
2204 bool uns = (code == LTU || code == GEU);
2205 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2206 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2207
2208 if (!nonimmediate_operand (lo[0], submode))
2209 lo[0] = force_reg (submode, lo[0]);
2210 if (!x86_64_general_operand (lo[1], submode))
2211 lo[1] = force_reg (submode, lo[1]);
2212
2213 if (!register_operand (hi[0], submode))
2214 hi[0] = force_reg (submode, hi[0]);
2215 if ((uns && !nonimmediate_operand (hi[1], submode))
2216 || (!uns && !x86_64_general_operand (hi[1], submode)))
2217 hi[1] = force_reg (submode, hi[1]);
2218
2219 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2220
2221 tmp = gen_rtx_SCRATCH (submode);
2222 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2223
2224 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2225 ix86_expand_branch (code, tmp, const0_rtx, label);
2226 return;
2227 }
2228
2229 default:
2230 break;
2231 }
2232
2233 /* Otherwise, we need two or three jumps. */
2234
2235 label2 = gen_label_rtx ();
2236
2237 code1 = code;
2238 code2 = swap_condition (code);
2239 code3 = unsigned_condition (code);
2240
2241 switch (code)
2242 {
2243 case LT: case GT: case LTU: case GTU:
2244 break;
2245
2246 case LE: code1 = LT; code2 = GT; break;
2247 case GE: code1 = GT; code2 = LT; break;
2248 case LEU: code1 = LTU; code2 = GTU; break;
2249 case GEU: code1 = GTU; code2 = LTU; break;
2250
2251 case EQ: code1 = UNKNOWN; code2 = NE; break;
2252 case NE: code2 = UNKNOWN; break;
2253
2254 default:
2255 gcc_unreachable ();
2256 }
2257
2258 /*
2259 * a < b =>
2260 * if (hi(a) < hi(b)) goto true;
2261 * if (hi(a) > hi(b)) goto false;
2262 * if (lo(a) < lo(b)) goto true;
2263 * false:
2264 */
2265
2266 if (code1 != UNKNOWN)
2267 ix86_expand_branch (code1, hi[0], hi[1], label);
2268 if (code2 != UNKNOWN)
2269 ix86_expand_branch (code2, hi[0], hi[1], label2);
2270
2271 ix86_expand_branch (code3, lo[0], lo[1], label);
2272
2273 if (code2 != UNKNOWN)
2274 emit_label (label2);
2275 return;
2276 }
2277
2278 default:
2279 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2280 goto simple;
2281 }
2282 }
2283
2284 /* Figure out whether to use unordered fp comparisons. */
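/* Roughly: with TARGET_IEEE_FP, comparisons such as EQ, NE and the UN*
   variants must not raise an invalid-operand exception on NaN inputs, so
   they use the quiet (non-trapping) compare; LT, LE, GT and GE are
   signaling comparisons and may use the ordered form.  */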
2285
2286 static bool
2287 ix86_unordered_fp_compare (enum rtx_code code)
2288 {
2289 if (!TARGET_IEEE_FP)
2290 return false;
2291
2292 switch (code)
2293 {
2294 case GT:
2295 case GE:
2296 case LT:
2297 case LE:
2298 return false;
2299
2300 case EQ:
2301 case NE:
2302
2303 case LTGT:
2304 case UNORDERED:
2305 case ORDERED:
2306 case UNLT:
2307 case UNLE:
2308 case UNGT:
2309 case UNGE:
2310 case UNEQ:
2311 return true;
2312
2313 default:
2314 gcc_unreachable ();
2315 }
2316 }
2317
2318 /* Return a comparison we can do that is equivalent to
2319    swap_condition (code), except possibly for orderedness.
2320    Never change orderedness if TARGET_IEEE_FP, returning
2321    UNKNOWN in that case if necessary.  */
2322
2323 static enum rtx_code
2324 ix86_fp_swap_condition (enum rtx_code code)
2325 {
2326 switch (code)
2327 {
2328 case GT: /* GTU - CF=0 & ZF=0 */
2329 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2330 case GE: /* GEU - CF=0 */
2331 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2332 case UNLT: /* LTU - CF=1 */
2333 return TARGET_IEEE_FP ? UNKNOWN : GT;
2334 case UNLE: /* LEU - CF=1 | ZF=1 */
2335 return TARGET_IEEE_FP ? UNKNOWN : GE;
2336 default:
2337 return swap_condition (code);
2338 }
2339 }
2340
2341 /* Return the cost of comparison CODE using the best strategy for performance.
2342    All of the following functions use the number of instructions as the cost
2343    metric.  In the future this should be tweaked to compute bytes for
2344    optimize_size and to account for instruction performance on various CPUs.  */
2345
2346 static int
2347 ix86_fp_comparison_cost (enum rtx_code code)
2348 {
2349 int arith_cost;
2350
2351 /* The cost of code using bit-twiddling on %ah. */
2352 switch (code)
2353 {
2354 case UNLE:
2355 case UNLT:
2356 case LTGT:
2357 case GT:
2358 case GE:
2359 case UNORDERED:
2360 case ORDERED:
2361 case UNEQ:
2362 arith_cost = 4;
2363 break;
2364 case LT:
2365 case NE:
2366 case EQ:
2367 case UNGE:
2368 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2369 break;
2370 case LE:
2371 case UNGT:
2372 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2373 break;
2374 default:
2375 gcc_unreachable ();
2376 }
2377
2378 switch (ix86_fp_comparison_strategy (code))
2379 {
2380 case IX86_FPCMP_COMI:
2381 return arith_cost > 4 ? 3 : 2;
2382 case IX86_FPCMP_SAHF:
2383 return arith_cost > 4 ? 4 : 3;
2384 default:
2385 return arith_cost;
2386 }
2387 }
2388
2389 /* Swap, force into registers, or otherwise massage the two operands
2390 to a fp comparison. The operands are updated in place; the new
2391 comparison code is returned. */
2392
2393 static enum rtx_code
2394 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2395 {
2396 bool unordered_compare = ix86_unordered_fp_compare (code);
2397 rtx op0 = *pop0, op1 = *pop1;
2398 machine_mode op_mode = GET_MODE (op0);
2399 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2400
2401 /* All of the unordered compare instructions only work on registers.
2402 The same is true of the fcomi compare instructions. The XFmode
2403 compare instructions require registers except when comparing
2404 against zero or when converting operand 1 from fixed point to
2405 floating point. */
2406
2407 if (!is_sse
2408 && (unordered_compare
2409 || (op_mode == XFmode
2410 && ! (standard_80387_constant_p (op0) == 1
2411 || standard_80387_constant_p (op1) == 1)
2412 && GET_CODE (op1) != FLOAT)
2413 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2414 {
2415 op0 = force_reg (op_mode, op0);
2416 op1 = force_reg (op_mode, op1);
2417 }
2418 else
2419 {
2420 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2421 things around if they appear profitable, otherwise force op0
2422 into a register. */
2423
2424 if (standard_80387_constant_p (op0) == 0
2425 || (MEM_P (op0)
2426 && ! (standard_80387_constant_p (op1) == 0
2427 || MEM_P (op1))))
2428 {
2429 enum rtx_code new_code = ix86_fp_swap_condition (code);
2430 if (new_code != UNKNOWN)
2431 {
2432 std::swap (op0, op1);
2433 code = new_code;
2434 }
2435 }
2436
2437 if (!REG_P (op0))
2438 op0 = force_reg (op_mode, op0);
2439
2440 if (CONSTANT_P (op1))
2441 {
2442 int tmp = standard_80387_constant_p (op1);
2443 if (tmp == 0)
2444 op1 = validize_mem (force_const_mem (op_mode, op1));
2445 else if (tmp == 1)
2446 {
2447 if (TARGET_CMOVE)
2448 op1 = force_reg (op_mode, op1);
2449 }
2450 else
2451 op1 = force_reg (op_mode, op1);
2452 }
2453 }
2454
2455 /* Try to rearrange the comparison to make it cheaper. */
2456 if (ix86_fp_comparison_cost (code)
2457 > ix86_fp_comparison_cost (swap_condition (code))
2458 && (REG_P (op1) || can_create_pseudo_p ()))
2459 {
2460 std::swap (op0, op1);
2461 code = swap_condition (code);
2462 if (!REG_P (op0))
2463 op0 = force_reg (op_mode, op0);
2464 }
2465
2466 *pop0 = op0;
2467 *pop1 = op1;
2468 return code;
2469 }
2470
2471 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2472
2473 static rtx
2474 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2475 {
2476 bool unordered_compare = ix86_unordered_fp_compare (code);
2477 machine_mode cmp_mode;
2478 rtx tmp, scratch;
2479
2480 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2481
2482 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2483 if (unordered_compare)
2484 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2485
2486 /* Do fcomi/sahf based test when profitable. */
2487 switch (ix86_fp_comparison_strategy (code))
2488 {
2489 case IX86_FPCMP_COMI:
2490 cmp_mode = CCFPmode;
2491 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2492 break;
2493
2494 case IX86_FPCMP_SAHF:
2495 cmp_mode = CCFPmode;
2496 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2497 scratch = gen_reg_rtx (HImode);
2498 emit_insn (gen_rtx_SET (scratch, tmp));
2499 emit_insn (gen_x86_sahf_1 (scratch));
2500 break;
2501
2502 case IX86_FPCMP_ARITH:
2503 cmp_mode = CCNOmode;
2504 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2505 scratch = gen_reg_rtx (HImode);
2506 emit_insn (gen_rtx_SET (scratch, tmp));
2507
2508       /* In the unordered case, we have to check C2 for NaNs, which
2509 	 doesn't happen to work out to anything nice combination-wise.
2510 	 So do some bit twiddling on the value we've got in AH to come
2511 	 up with an appropriate set of condition codes.  */
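      /* For reference: after fnstsw the x87 condition codes appear in %ah
	 roughly as C0 = 0x01, C1 = 0x02, C2 = 0x04, C3 = 0x40, so the 0x45
	 masks below select C3|C2|C0 and 0x40 selects C3 alone.  */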
2512
2513 switch (code)
2514 {
2515 case GT:
2516 case UNGT:
2517 if (code == GT || !TARGET_IEEE_FP)
2518 {
2519 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2520 code = EQ;
2521 }
2522 else
2523 {
2524 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2525 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2526 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2527 cmp_mode = CCmode;
2528 code = GEU;
2529 }
2530 break;
2531 case LT:
2532 case UNLT:
2533 if (code == LT && TARGET_IEEE_FP)
2534 {
2535 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2536 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2537 cmp_mode = CCmode;
2538 code = EQ;
2539 }
2540 else
2541 {
2542 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2543 code = NE;
2544 }
2545 break;
2546 case GE:
2547 case UNGE:
2548 if (code == GE || !TARGET_IEEE_FP)
2549 {
2550 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2551 code = EQ;
2552 }
2553 else
2554 {
2555 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2556 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2557 code = NE;
2558 }
2559 break;
2560 case LE:
2561 case UNLE:
2562 if (code == LE && TARGET_IEEE_FP)
2563 {
2564 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2565 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2566 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2567 cmp_mode = CCmode;
2568 code = LTU;
2569 }
2570 else
2571 {
2572 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2573 code = NE;
2574 }
2575 break;
2576 case EQ:
2577 case UNEQ:
2578 if (code == EQ && TARGET_IEEE_FP)
2579 {
2580 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2581 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2582 cmp_mode = CCmode;
2583 code = EQ;
2584 }
2585 else
2586 {
2587 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2588 code = NE;
2589 }
2590 break;
2591 case NE:
2592 case LTGT:
2593 if (code == NE && TARGET_IEEE_FP)
2594 {
2595 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2596 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2597 GEN_INT (0x40)));
2598 code = NE;
2599 }
2600 else
2601 {
2602 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2603 code = EQ;
2604 }
2605 break;
2606
2607 case UNORDERED:
2608 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2609 code = NE;
2610 break;
2611 case ORDERED:
2612 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2613 code = EQ;
2614 break;
2615
2616 default:
2617 gcc_unreachable ();
2618 }
2619 break;
2620
2621 default:
2622       gcc_unreachable ();
2623 }
2624
2625 /* Return the test that should be put into the flags user, i.e.
2626 the bcc, scc, or cmov instruction. */
2627 return gen_rtx_fmt_ee (code, VOIDmode,
2628 gen_rtx_REG (cmp_mode, FLAGS_REG),
2629 const0_rtx);
2630 }
2631
2632 /* Generate insn patterns to do an integer compare of OPERANDS. */
2633
2634 static rtx
2635 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2636 {
2637 machine_mode cmpmode;
2638 rtx tmp, flags;
2639
2640 cmpmode = SELECT_CC_MODE (code, op0, op1);
2641 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2642
2643 /* This is very simple, but making the interface the same as in the
2644 FP case makes the rest of the code easier. */
2645 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2646 emit_insn (gen_rtx_SET (flags, tmp));
2647
2648 /* Return the test that should be put into the flags user, i.e.
2649 the bcc, scc, or cmov instruction. */
2650 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2651 }
2652
2653 static rtx
2654 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2655 {
2656 rtx ret;
2657
2658 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2659 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2660
2661 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2662 {
2663 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2664 ret = ix86_expand_fp_compare (code, op0, op1);
2665 }
2666 else
2667 ret = ix86_expand_int_compare (code, op0, op1);
2668
2669 return ret;
2670 }
2671
2672 void
2673 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2674 {
2675 rtx ret;
2676
2677 gcc_assert (GET_MODE (dest) == QImode);
2678
2679 ret = ix86_expand_compare (code, op0, op1);
2680 PUT_MODE (ret, QImode);
2681 emit_insn (gen_rtx_SET (dest, ret));
2682 }
2683
2684 /* Expand a comparison that sets or clears the carry flag.  Return true
2685    when successful and set *POP to the comparison operation.  */
2686 static bool
2687 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2688 {
2689 machine_mode mode
2690 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2691
2692 /* Do not handle double-mode compares that go through special path. */
2693 if (mode == (TARGET_64BIT ? TImode : DImode))
2694 return false;
2695
2696 if (SCALAR_FLOAT_MODE_P (mode))
2697 {
2698 rtx compare_op;
2699 rtx_insn *compare_seq;
2700
2701 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2702
2703       /* Shortcut:  the following common codes never translate
2704 	 into carry-flag compares.  */
2705 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2706 || code == ORDERED || code == UNORDERED)
2707 return false;
2708
2709       /* These comparisons require the zero flag; swap the operands so they don't.  */
2710 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2711 && !TARGET_IEEE_FP)
2712 {
2713 std::swap (op0, op1);
2714 code = swap_condition (code);
2715 }
2716
2717       /* Try to expand the comparison and verify that we end up with a
2718 	 carry-flag-based comparison.  This fails only when we decide to
2719 	 expand the comparison using arithmetic, which is not a common
2720 	 scenario.  */
2721 start_sequence ();
2722 compare_op = ix86_expand_fp_compare (code, op0, op1);
2723 compare_seq = get_insns ();
2724 end_sequence ();
2725
2726 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2727 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2728 else
2729 code = GET_CODE (compare_op);
2730
2731 if (code != LTU && code != GEU)
2732 return false;
2733
2734 emit_insn (compare_seq);
2735 *pop = compare_op;
2736 return true;
2737 }
2738
2739 if (!INTEGRAL_MODE_P (mode))
2740 return false;
2741
2742 switch (code)
2743 {
2744 case LTU:
2745 case GEU:
2746 break;
2747
2748 /* Convert a==0 into (unsigned)a<1. */
2749 case EQ:
2750 case NE:
2751 if (op1 != const0_rtx)
2752 return false;
2753 op1 = const1_rtx;
2754 code = (code == EQ ? LTU : GEU);
2755 break;
2756
2757     /* Convert a>b into b<a or a>=b+1.  */
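    /* E.g. unsigned "a > 5" becomes "a >= 6", and "a <= 5" becomes "a < 6".  */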
2758 case GTU:
2759 case LEU:
2760 if (CONST_INT_P (op1))
2761 {
2762 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2763 	  /* Bail out on overflow.  We could still swap the operands, but that
2764 	     would force loading the constant into a register.  */
2765 if (op1 == const0_rtx
2766 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2767 return false;
2768 code = (code == GTU ? GEU : LTU);
2769 }
2770 else
2771 {
2772 std::swap (op0, op1);
2773 code = (code == GTU ? LTU : GEU);
2774 }
2775 break;
2776
2777 /* Convert a>=0 into (unsigned)a<0x80000000. */
2778 case LT:
2779 case GE:
2780 if (mode == DImode || op1 != const0_rtx)
2781 return false;
2782 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2783 code = (code == LT ? GEU : LTU);
2784 break;
2785 case LE:
2786 case GT:
2787 if (mode == DImode || op1 != constm1_rtx)
2788 return false;
2789 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2790 code = (code == LE ? GEU : LTU);
2791 break;
2792
2793 default:
2794 return false;
2795 }
2796 /* Swapping operands may cause constant to appear as first operand. */
2797 if (!nonimmediate_operand (op0, VOIDmode))
2798 {
2799 if (!can_create_pseudo_p ())
2800 return false;
2801 op0 = force_reg (mode, op0);
2802 }
2803 *pop = ix86_expand_compare (code, op0, op1);
2804 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2805 return true;
2806 }
2807
2808 /* Expand a conditional increment or decrement using adc/sbb instructions.
2809    The default case using setcc followed by the conditional move can be
2810    done by generic code.  */
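/* E.g. "x = x + (a < b)" for unsigned operands can be emitted as, roughly,
   "cmpl b, a" followed by "adcl $0, x"; the decrement case uses the matching
   sbb form.  */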
2811 bool
2812 ix86_expand_int_addcc (rtx operands[])
2813 {
2814 enum rtx_code code = GET_CODE (operands[1]);
2815 rtx flags;
2816 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2817 rtx compare_op;
2818 rtx val = const0_rtx;
2819 bool fpcmp = false;
2820 machine_mode mode;
2821 rtx op0 = XEXP (operands[1], 0);
2822 rtx op1 = XEXP (operands[1], 1);
2823
2824 if (operands[3] != const1_rtx
2825 && operands[3] != constm1_rtx)
2826 return false;
2827 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2828 return false;
2829 code = GET_CODE (compare_op);
2830
2831 flags = XEXP (compare_op, 0);
2832
2833 if (GET_MODE (flags) == CCFPmode)
2834 {
2835 fpcmp = true;
2836 code = ix86_fp_compare_code_to_integer (code);
2837 }
2838
2839 if (code != LTU)
2840 {
2841 val = constm1_rtx;
2842 if (fpcmp)
2843 PUT_CODE (compare_op,
2844 reverse_condition_maybe_unordered
2845 (GET_CODE (compare_op)));
2846 else
2847 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2848 }
2849
2850 mode = GET_MODE (operands[0]);
2851
2852 /* Construct either adc or sbb insn. */
2853 if ((code == LTU) == (operands[3] == constm1_rtx))
2854 insn = gen_sub3_carry;
2855 else
2856 insn = gen_add3_carry;
2857
2858 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2859
2860 return true;
2861 }
2862
2863 bool
2864 ix86_expand_int_movcc (rtx operands[])
2865 {
2866 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2867 rtx_insn *compare_seq;
2868 rtx compare_op;
2869 machine_mode mode = GET_MODE (operands[0]);
2870 bool sign_bit_compare_p = false;
2871 rtx op0 = XEXP (operands[1], 0);
2872 rtx op1 = XEXP (operands[1], 1);
2873
2874 if (GET_MODE (op0) == TImode
2875 || (GET_MODE (op0) == DImode
2876 && !TARGET_64BIT))
2877 return false;
2878
2879 start_sequence ();
2880 compare_op = ix86_expand_compare (code, op0, op1);
2881 compare_seq = get_insns ();
2882 end_sequence ();
2883
2884 compare_code = GET_CODE (compare_op);
2885
2886 if ((op1 == const0_rtx && (code == GE || code == LT))
2887 || (op1 == constm1_rtx && (code == GT || code == LE)))
2888 sign_bit_compare_p = true;
2889
2890 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2891 HImode insns, we'd be swallowed in word prefix ops. */
2892
2893 if ((mode != HImode || TARGET_FAST_PREFIX)
2894 && (mode != (TARGET_64BIT ? TImode : DImode))
2895 && CONST_INT_P (operands[2])
2896 && CONST_INT_P (operands[3]))
2897 {
2898 rtx out = operands[0];
2899 HOST_WIDE_INT ct = INTVAL (operands[2]);
2900 HOST_WIDE_INT cf = INTVAL (operands[3]);
2901 HOST_WIDE_INT diff;
2902
2903 diff = ct - cf;
2904 	  /* Sign-bit compares are better done using shifts than by using
2905 	     sbb.  */
2906 if (sign_bit_compare_p
2907 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2908 {
2909 /* Detect overlap between destination and compare sources. */
2910 rtx tmp = out;
2911
2912 if (!sign_bit_compare_p)
2913 {
2914 rtx flags;
2915 bool fpcmp = false;
2916
2917 compare_code = GET_CODE (compare_op);
2918
2919 flags = XEXP (compare_op, 0);
2920
2921 if (GET_MODE (flags) == CCFPmode)
2922 {
2923 fpcmp = true;
2924 compare_code
2925 = ix86_fp_compare_code_to_integer (compare_code);
2926 }
2927
2928 /* To simplify rest of code, restrict to the GEU case. */
2929 if (compare_code == LTU)
2930 {
2931 std::swap (ct, cf);
2932 compare_code = reverse_condition (compare_code);
2933 code = reverse_condition (code);
2934 }
2935 else
2936 {
2937 if (fpcmp)
2938 PUT_CODE (compare_op,
2939 reverse_condition_maybe_unordered
2940 (GET_CODE (compare_op)));
2941 else
2942 PUT_CODE (compare_op,
2943 reverse_condition (GET_CODE (compare_op)));
2944 }
2945 diff = ct - cf;
2946
2947 if (reg_overlap_mentioned_p (out, op0)
2948 || reg_overlap_mentioned_p (out, op1))
2949 tmp = gen_reg_rtx (mode);
2950
2951 if (mode == DImode)
2952 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2953 else
2954 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2955 flags, compare_op));
2956 }
2957 else
2958 {
2959 if (code == GT || code == GE)
2960 code = reverse_condition (code);
2961 else
2962 {
2963 std::swap (ct, cf);
2964 diff = ct - cf;
2965 }
2966 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2967 }
2968
2969 if (diff == 1)
2970 {
2971 /*
2972 * cmpl op0,op1
2973 * sbbl dest,dest
2974 * [addl dest, ct]
2975 *
2976 * Size 5 - 8.
2977 */
2978 if (ct)
2979 tmp = expand_simple_binop (mode, PLUS,
2980 tmp, GEN_INT (ct),
2981 copy_rtx (tmp), 1, OPTAB_DIRECT);
2982 }
2983 else if (cf == -1)
2984 {
2985 /*
2986 * cmpl op0,op1
2987 * sbbl dest,dest
2988 * orl $ct, dest
2989 *
2990 * Size 8.
2991 */
2992 tmp = expand_simple_binop (mode, IOR,
2993 tmp, GEN_INT (ct),
2994 copy_rtx (tmp), 1, OPTAB_DIRECT);
2995 }
2996 else if (diff == -1 && ct)
2997 {
2998 /*
2999 * cmpl op0,op1
3000 * sbbl dest,dest
3001 * notl dest
3002 * [addl dest, cf]
3003 *
3004 * Size 8 - 11.
3005 */
3006 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3007 if (cf)
3008 tmp = expand_simple_binop (mode, PLUS,
3009 copy_rtx (tmp), GEN_INT (cf),
3010 copy_rtx (tmp), 1, OPTAB_DIRECT);
3011 }
3012 else
3013 {
3014 /*
3015 * cmpl op0,op1
3016 * sbbl dest,dest
3017 * [notl dest]
3018 * andl cf - ct, dest
3019 * [addl dest, ct]
3020 *
3021 * Size 8 - 11.
3022 */
3023
3024 if (cf == 0)
3025 {
3026 cf = ct;
3027 ct = 0;
3028 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3029 }
3030
3031 tmp = expand_simple_binop (mode, AND,
3032 copy_rtx (tmp),
3033 gen_int_mode (cf - ct, mode),
3034 copy_rtx (tmp), 1, OPTAB_DIRECT);
3035 if (ct)
3036 tmp = expand_simple_binop (mode, PLUS,
3037 copy_rtx (tmp), GEN_INT (ct),
3038 copy_rtx (tmp), 1, OPTAB_DIRECT);
3039 }
3040
3041 if (!rtx_equal_p (tmp, out))
3042 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3043
3044 return true;
3045 }
3046
3047 if (diff < 0)
3048 {
3049 machine_mode cmp_mode = GET_MODE (op0);
3050 enum rtx_code new_code;
3051
3052 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3053 {
3054 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3055
3056 	      /* We may be reversing an unordered compare to a normal compare,
3057 		 which is not valid in general (we may convert a non-trapping
3058 		 condition into a trapping one); however, on i386 we currently
3059 		 emit all comparisons unordered.  */
3060 new_code = reverse_condition_maybe_unordered (code);
3061 }
3062 else
3063 new_code = ix86_reverse_condition (code, cmp_mode);
3064 if (new_code != UNKNOWN)
3065 {
3066 std::swap (ct, cf);
3067 diff = -diff;
3068 code = new_code;
3069 }
3070 }
3071
3072 compare_code = UNKNOWN;
3073 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3074 && CONST_INT_P (op1))
3075 {
3076 if (op1 == const0_rtx
3077 && (code == LT || code == GE))
3078 compare_code = code;
3079 else if (op1 == constm1_rtx)
3080 {
3081 if (code == LE)
3082 compare_code = LT;
3083 else if (code == GT)
3084 compare_code = GE;
3085 }
3086 }
3087
3088 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3089 if (compare_code != UNKNOWN
3090 && GET_MODE (op0) == GET_MODE (out)
3091 && (cf == -1 || ct == -1))
3092 {
3093 /* If lea code below could be used, only optimize
3094 if it results in a 2 insn sequence. */
3095
3096 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3097 || diff == 3 || diff == 5 || diff == 9)
3098 || (compare_code == LT && ct == -1)
3099 || (compare_code == GE && cf == -1))
3100 {
3101 /*
3102 * notl op1 (if necessary)
3103 * sarl $31, op1
3104 * orl cf, op1
3105 */
3106 if (ct != -1)
3107 {
3108 cf = ct;
3109 ct = -1;
3110 code = reverse_condition (code);
3111 }
3112
3113 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3114
3115 out = expand_simple_binop (mode, IOR,
3116 out, GEN_INT (cf),
3117 out, 1, OPTAB_DIRECT);
3118 if (out != operands[0])
3119 emit_move_insn (operands[0], out);
3120
3121 return true;
3122 }
3123 }
3124
3125
3126 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3127 || diff == 3 || diff == 5 || diff == 9)
3128 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3129 && (mode != DImode
3130 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3131 {
3132 /*
3133 * xorl dest,dest
3134 * cmpl op1,op2
3135 * setcc dest
3136 * lea cf(dest*(ct-cf)),dest
3137 *
3138 * Size 14.
3139 *
3140 * This also catches the degenerate setcc-only case.
3141 */
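	  /* E.g. with ct = 6 and cf = 2 (diff = 4) this emits, roughly:
	       xorl %eax,%eax ; cmpl ... ; setcc %al ; leal 2(,%eax,4), %eax
	     (register choice illustrative).  */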
3142
3143 rtx tmp;
3144 int nops;
3145
3146 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3147
3148 nops = 0;
3149 	  /* On x86_64 the lea instruction operates on Pmode, so we need
3150 	     the arithmetic done in the proper mode to match.  */
3151 if (diff == 1)
3152 tmp = copy_rtx (out);
3153 else
3154 {
3155 rtx out1;
3156 out1 = copy_rtx (out);
3157 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3158 nops++;
3159 if (diff & 1)
3160 {
3161 tmp = gen_rtx_PLUS (mode, tmp, out1);
3162 nops++;
3163 }
3164 }
3165 if (cf != 0)
3166 {
3167 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3168 nops++;
3169 }
3170 if (!rtx_equal_p (tmp, out))
3171 {
3172 if (nops == 1)
3173 out = force_operand (tmp, copy_rtx (out));
3174 else
3175 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3176 }
3177 if (!rtx_equal_p (out, operands[0]))
3178 emit_move_insn (operands[0], copy_rtx (out));
3179
3180 return true;
3181 }
3182
3183 /*
3184 * General case: Jumpful:
3185 * xorl dest,dest cmpl op1, op2
3186 * cmpl op1, op2 movl ct, dest
3187 * setcc dest jcc 1f
3188 * decl dest movl cf, dest
3189 * andl (cf-ct),dest 1:
3190 * addl ct,dest
3191 *
3192 * Size 20. Size 14.
3193 *
3194 * This is reasonably steep, but branch mispredict costs are
3195 * high on modern cpus, so consider failing only if optimizing
3196 * for space.
3197 */
3198
3199 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3200 && BRANCH_COST (optimize_insn_for_speed_p (),
3201 false) >= 2)
3202 {
3203 if (cf == 0)
3204 {
3205 machine_mode cmp_mode = GET_MODE (op0);
3206 enum rtx_code new_code;
3207
3208 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3209 {
3210 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3211
3212 	      /* We may be reversing an unordered compare to a normal compare,
3213 		 which is not valid in general (we may convert a non-trapping
3214 		 condition into a trapping one); however, on i386 we currently
3215 		 emit all comparisons unordered.  */
3216 new_code = reverse_condition_maybe_unordered (code);
3217 }
3218 else
3219 {
3220 new_code = ix86_reverse_condition (code, cmp_mode);
3221 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3222 compare_code = reverse_condition (compare_code);
3223 }
3224
3225 if (new_code != UNKNOWN)
3226 {
3227 cf = ct;
3228 ct = 0;
3229 code = new_code;
3230 }
3231 }
3232
3233 if (compare_code != UNKNOWN)
3234 {
3235 /* notl op1 (if needed)
3236 sarl $31, op1
3237 andl (cf-ct), op1
3238 addl ct, op1
3239
3240 For x < 0 (resp. x <= -1) there will be no notl,
3241 so if possible swap the constants to get rid of the
3242 complement.
3243 True/false will be -1/0 while code below (store flag
3244 followed by decrement) is 0/-1, so the constants need
3245 to be exchanged once more. */
3246
3247 if (compare_code == GE || !cf)
3248 {
3249 code = reverse_condition (code);
3250 compare_code = LT;
3251 }
3252 else
3253 std::swap (ct, cf);
3254
3255 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3256 }
3257 else
3258 {
3259 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3260
3261 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3262 constm1_rtx,
3263 copy_rtx (out), 1, OPTAB_DIRECT);
3264 }
3265
3266 out = expand_simple_binop (mode, AND, copy_rtx (out),
3267 gen_int_mode (cf - ct, mode),
3268 copy_rtx (out), 1, OPTAB_DIRECT);
3269 if (ct)
3270 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3271 copy_rtx (out), 1, OPTAB_DIRECT);
3272 if (!rtx_equal_p (out, operands[0]))
3273 emit_move_insn (operands[0], copy_rtx (out));
3274
3275 return true;
3276 }
3277 }
3278
3279 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3280 {
3281 /* Try a few things more with specific constants and a variable. */
3282
3283 optab op;
3284 rtx var, orig_out, out, tmp;
3285
3286 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3287 return false;
3288
3289 /* If one of the two operands is an interesting constant, load a
3290 constant with the above and mask it in with a logical operation. */
3291
3292 if (CONST_INT_P (operands[2]))
3293 {
3294 var = operands[3];
3295 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3296 operands[3] = constm1_rtx, op = and_optab;
3297 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3298 operands[3] = const0_rtx, op = ior_optab;
3299 else
3300 return false;
3301 }
3302 else if (CONST_INT_P (operands[3]))
3303 {
3304 var = operands[2];
3305 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3306 operands[2] = constm1_rtx, op = and_optab;
3307 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3308 operands[2] = const0_rtx, op = ior_optab;
3309 else
3310 return false;
3311 }
3312 else
3313 return false;
3314
3315 orig_out = operands[0];
3316 tmp = gen_reg_rtx (mode);
3317 operands[0] = tmp;
3318
3319 /* Recurse to get the constant loaded. */
3320 if (!ix86_expand_int_movcc (operands))
3321 return false;
3322
3323 /* Mask in the interesting variable. */
3324 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3325 OPTAB_WIDEN);
3326 if (!rtx_equal_p (out, orig_out))
3327 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3328
3329 return true;
3330 }
3331
3332 /*
3333 * For comparison with above,
3334 *
3335 * movl cf,dest
3336 * movl ct,tmp
3337 * cmpl op1,op2
3338 * cmovcc tmp,dest
3339 *
3340 * Size 15.
3341 */
3342
3343 if (! nonimmediate_operand (operands[2], mode))
3344 operands[2] = force_reg (mode, operands[2]);
3345 if (! nonimmediate_operand (operands[3], mode))
3346 operands[3] = force_reg (mode, operands[3]);
3347
3348 if (! register_operand (operands[2], VOIDmode)
3349 && (mode == QImode
3350 || ! register_operand (operands[3], VOIDmode)))
3351 operands[2] = force_reg (mode, operands[2]);
3352
3353 if (mode == QImode
3354 && ! register_operand (operands[3], VOIDmode))
3355 operands[3] = force_reg (mode, operands[3]);
3356
3357 emit_insn (compare_seq);
3358 emit_insn (gen_rtx_SET (operands[0],
3359 gen_rtx_IF_THEN_ELSE (mode,
3360 compare_op, operands[2],
3361 operands[3])));
3362 return true;
3363 }
3364
3365 /* Detect conditional moves that exactly match min/max operational
3366 semantics. Note that this is IEEE safe, as long as we don't
3367 interchange the operands.
3368
3369 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3370 and TRUE if the operation is successful and instructions are emitted. */
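/* E.g. code LT with cmp_op0 == if_true and cmp_op1 == if_false is
   "a < b ? a : b", i.e. min (a, b); UNGE with the arms swapped reduces to
   the same form, and the mirrored matches give max (a, b).  */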
3371
3372 static bool
3373 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3374 rtx cmp_op1, rtx if_true, rtx if_false)
3375 {
3376 machine_mode mode;
3377 bool is_min;
3378 rtx tmp;
3379
3380 if (code == LT)
3381 ;
3382 else if (code == UNGE)
3383 std::swap (if_true, if_false);
3384 else
3385 return false;
3386
3387 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3388 is_min = true;
3389 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3390 is_min = false;
3391 else
3392 return false;
3393
3394 mode = GET_MODE (dest);
3395
3396 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3397 but MODE may be a vector mode and thus not appropriate. */
3398 if (!flag_finite_math_only || flag_signed_zeros)
3399 {
3400 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3401 rtvec v;
3402
3403 if_true = force_reg (mode, if_true);
3404 v = gen_rtvec (2, if_true, if_false);
3405 tmp = gen_rtx_UNSPEC (mode, v, u);
3406 }
3407 else
3408 {
3409 code = is_min ? SMIN : SMAX;
3410 if (MEM_P (if_true) && MEM_P (if_false))
3411 if_true = force_reg (mode, if_true);
3412 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3413 }
3414
3415 emit_insn (gen_rtx_SET (dest, tmp));
3416 return true;
3417 }
3418
3419 /* Expand an SSE comparison. Return the register with the result. */
3420
3421 static rtx
3422 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3423 rtx op_true, rtx op_false)
3424 {
3425 machine_mode mode = GET_MODE (dest);
3426 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3427
3428   /* In general the result of the comparison can differ from the operands' type.  */
3429 machine_mode cmp_mode;
3430
3431 /* In AVX512F the result of comparison is an integer mask. */
3432 bool maskcmp = false;
3433 rtx x;
3434
3435 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
3436 {
3437 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3438 cmp_mode = int_mode_for_size (nbits, 0).require ();
3439 maskcmp = true;
3440 }
3441 else
3442 cmp_mode = cmp_ops_mode;
3443
3444 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3445
3446 int (*op1_predicate)(rtx, machine_mode)
3447 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3448
3449 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3450 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3451
3452 if (optimize
3453 || (maskcmp && cmp_mode != mode)
3454 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3455 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3456 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3457
3458 /* Compare patterns for int modes are unspec in AVX512F only. */
3459 if (maskcmp && (code == GT || code == EQ))
3460 {
3461 rtx (*gen)(rtx, rtx, rtx);
3462
3463 switch (cmp_ops_mode)
3464 {
3465 case E_V64QImode:
3466 gcc_assert (TARGET_AVX512BW);
3467 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
3468 break;
3469 case E_V32HImode:
3470 gcc_assert (TARGET_AVX512BW);
3471 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
3472 break;
3473 case E_V16SImode:
3474 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
3475 break;
3476 case E_V8DImode:
3477 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
3478 break;
3479 default:
3480 gen = NULL;
3481 }
3482
3483 if (gen)
3484 {
3485 emit_insn (gen (dest, cmp_op0, cmp_op1));
3486 return dest;
3487 }
3488 }
3489 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3490
3491 if (cmp_mode != mode && !maskcmp)
3492 {
3493 x = force_reg (cmp_ops_mode, x);
3494 convert_move (dest, x, false);
3495 }
3496 else
3497 emit_insn (gen_rtx_SET (dest, x));
3498
3499 return dest;
3500 }
3501
3502 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3503 operations. This is used for both scalar and vector conditional moves. */
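/* When no blend instruction applies, the fallback at the end of this
   function materializes, roughly, dest = (cmp & op_true) | (~cmp & op_false)
   using AND, NOT/AND and IOR.  */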
3504
3505 void
3506 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3507 {
3508 machine_mode mode = GET_MODE (dest);
3509 machine_mode cmpmode = GET_MODE (cmp);
3510
3511 /* In AVX512F the result of comparison is an integer mask. */
3512 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
3513
3514 rtx t2, t3, x;
3515
3516   /* If we have an integer mask and an FP value then we need
3517      to cast the mask to FP mode.  */
3518 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3519 {
3520 cmp = force_reg (cmpmode, cmp);
3521 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3522 }
3523
3524 if (maskcmp)
3525 {
3526 rtx (*gen) (rtx, rtx) = NULL;
3527 if ((op_true == CONST0_RTX (mode)
3528 && vector_all_ones_operand (op_false, mode))
3529 || (op_false == CONST0_RTX (mode)
3530 && vector_all_ones_operand (op_true, mode)))
3531 switch (mode)
3532 {
3533 case E_V64QImode:
3534 if (TARGET_AVX512BW)
3535 gen = gen_avx512bw_cvtmask2bv64qi;
3536 break;
3537 case E_V32QImode:
3538 if (TARGET_AVX512VL && TARGET_AVX512BW)
3539 gen = gen_avx512vl_cvtmask2bv32qi;
3540 break;
3541 case E_V16QImode:
3542 if (TARGET_AVX512VL && TARGET_AVX512BW)
3543 gen = gen_avx512vl_cvtmask2bv16qi;
3544 break;
3545 case E_V32HImode:
3546 if (TARGET_AVX512BW)
3547 gen = gen_avx512bw_cvtmask2wv32hi;
3548 break;
3549 case E_V16HImode:
3550 if (TARGET_AVX512VL && TARGET_AVX512BW)
3551 gen = gen_avx512vl_cvtmask2wv16hi;
3552 break;
3553 case E_V8HImode:
3554 if (TARGET_AVX512VL && TARGET_AVX512BW)
3555 gen = gen_avx512vl_cvtmask2wv8hi;
3556 break;
3557 case E_V16SImode:
3558 if (TARGET_AVX512DQ)
3559 gen = gen_avx512f_cvtmask2dv16si;
3560 break;
3561 case E_V8SImode:
3562 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3563 gen = gen_avx512vl_cvtmask2dv8si;
3564 break;
3565 case E_V4SImode:
3566 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3567 gen = gen_avx512vl_cvtmask2dv4si;
3568 break;
3569 case E_V8DImode:
3570 if (TARGET_AVX512DQ)
3571 gen = gen_avx512f_cvtmask2qv8di;
3572 break;
3573 case E_V4DImode:
3574 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3575 gen = gen_avx512vl_cvtmask2qv4di;
3576 break;
3577 case E_V2DImode:
3578 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3579 gen = gen_avx512vl_cvtmask2qv2di;
3580 break;
3581 default:
3582 break;
3583 }
3584 if (gen && SCALAR_INT_MODE_P (cmpmode))
3585 {
3586 cmp = force_reg (cmpmode, cmp);
3587 if (op_true == CONST0_RTX (mode))
3588 {
3589 rtx (*gen_not) (rtx, rtx);
3590 switch (cmpmode)
3591 {
3592 case E_QImode: gen_not = gen_knotqi; break;
3593 case E_HImode: gen_not = gen_knothi; break;
3594 case E_SImode: gen_not = gen_knotsi; break;
3595 case E_DImode: gen_not = gen_knotdi; break;
3596 default: gcc_unreachable ();
3597 }
3598 rtx n = gen_reg_rtx (cmpmode);
3599 emit_insn (gen_not (n, cmp));
3600 cmp = n;
3601 }
3602 emit_insn (gen (dest, cmp));
3603 return;
3604 }
3605 }
3606 else if (vector_all_ones_operand (op_true, mode)
3607 && op_false == CONST0_RTX (mode))
3608 {
3609 emit_insn (gen_rtx_SET (dest, cmp));
3610 return;
3611 }
3612 else if (op_false == CONST0_RTX (mode))
3613 {
3614 op_true = force_reg (mode, op_true);
3615 x = gen_rtx_AND (mode, cmp, op_true);
3616 emit_insn (gen_rtx_SET (dest, x));
3617 return;
3618 }
3619 else if (op_true == CONST0_RTX (mode))
3620 {
3621 op_false = force_reg (mode, op_false);
3622 x = gen_rtx_NOT (mode, cmp);
3623 x = gen_rtx_AND (mode, x, op_false);
3624 emit_insn (gen_rtx_SET (dest, x));
3625 return;
3626 }
3627 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3628 {
3629 op_false = force_reg (mode, op_false);
3630 x = gen_rtx_IOR (mode, cmp, op_false);
3631 emit_insn (gen_rtx_SET (dest, x));
3632 return;
3633 }
3634 else if (TARGET_XOP)
3635 {
3636 op_true = force_reg (mode, op_true);
3637
3638 if (!nonimmediate_operand (op_false, mode))
3639 op_false = force_reg (mode, op_false);
3640
3641 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3642 op_true,
3643 op_false)));
3644 return;
3645 }
3646
3647 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3648 rtx d = dest;
3649
3650 if (!vector_operand (op_true, mode))
3651 op_true = force_reg (mode, op_true);
3652
3653 op_false = force_reg (mode, op_false);
3654
3655 switch (mode)
3656 {
3657 case E_V4SFmode:
3658 if (TARGET_SSE4_1)
3659 gen = gen_sse4_1_blendvps;
3660 break;
3661 case E_V2DFmode:
3662 if (TARGET_SSE4_1)
3663 gen = gen_sse4_1_blendvpd;
3664 break;
3665 case E_SFmode:
3666 if (TARGET_SSE4_1)
3667 {
3668 gen = gen_sse4_1_blendvss;
3669 op_true = force_reg (mode, op_true);
3670 }
3671 break;
3672 case E_DFmode:
3673 if (TARGET_SSE4_1)
3674 {
3675 gen = gen_sse4_1_blendvsd;
3676 op_true = force_reg (mode, op_true);
3677 }
3678 break;
3679 case E_V16QImode:
3680 case E_V8HImode:
3681 case E_V4SImode:
3682 case E_V2DImode:
3683 if (TARGET_SSE4_1)
3684 {
3685 gen = gen_sse4_1_pblendvb;
3686 if (mode != V16QImode)
3687 d = gen_reg_rtx (V16QImode);
3688 op_false = gen_lowpart (V16QImode, op_false);
3689 op_true = gen_lowpart (V16QImode, op_true);
3690 cmp = gen_lowpart (V16QImode, cmp);
3691 }
3692 break;
3693 case E_V8SFmode:
3694 if (TARGET_AVX)
3695 gen = gen_avx_blendvps256;
3696 break;
3697 case E_V4DFmode:
3698 if (TARGET_AVX)
3699 gen = gen_avx_blendvpd256;
3700 break;
3701 case E_V32QImode:
3702 case E_V16HImode:
3703 case E_V8SImode:
3704 case E_V4DImode:
3705 if (TARGET_AVX2)
3706 {
3707 gen = gen_avx2_pblendvb;
3708 if (mode != V32QImode)
3709 d = gen_reg_rtx (V32QImode);
3710 op_false = gen_lowpart (V32QImode, op_false);
3711 op_true = gen_lowpart (V32QImode, op_true);
3712 cmp = gen_lowpart (V32QImode, cmp);
3713 }
3714 break;
3715
3716 case E_V64QImode:
3717 gen = gen_avx512bw_blendmv64qi;
3718 break;
3719 case E_V32HImode:
3720 gen = gen_avx512bw_blendmv32hi;
3721 break;
3722 case E_V16SImode:
3723 gen = gen_avx512f_blendmv16si;
3724 break;
3725 case E_V8DImode:
3726 gen = gen_avx512f_blendmv8di;
3727 break;
3728 case E_V8DFmode:
3729 gen = gen_avx512f_blendmv8df;
3730 break;
3731 case E_V16SFmode:
3732 gen = gen_avx512f_blendmv16sf;
3733 break;
3734
3735 default:
3736 break;
3737 }
3738
3739 if (gen != NULL)
3740 {
3741 emit_insn (gen (d, op_false, op_true, cmp));
3742 if (d != dest)
3743 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3744 }
3745 else
3746 {
3747 op_true = force_reg (mode, op_true);
3748
3749 t2 = gen_reg_rtx (mode);
3750 if (optimize)
3751 t3 = gen_reg_rtx (mode);
3752 else
3753 t3 = dest;
3754
3755 x = gen_rtx_AND (mode, op_true, cmp);
3756 emit_insn (gen_rtx_SET (t2, x));
3757
3758 x = gen_rtx_NOT (mode, cmp);
3759 x = gen_rtx_AND (mode, x, op_false);
3760 emit_insn (gen_rtx_SET (t3, x));
3761
3762 x = gen_rtx_IOR (mode, t3, t2);
3763 emit_insn (gen_rtx_SET (dest, x));
3764 }
3765 }
3766
3767 /* Swap, force into registers, or otherwise massage the two operands
3768 to an sse comparison with a mask result. Thus we differ a bit from
3769 ix86_prepare_fp_compare_args which expects to produce a flags result.
3770
3771 The DEST operand exists to help determine whether to commute commutative
3772 operators. The POP0/POP1 operands are updated in place. The new
3773 comparison code is returned, or UNKNOWN if not implementable. */
3774
3775 static enum rtx_code
3776 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3777 rtx *pop0, rtx *pop1)
3778 {
3779 switch (code)
3780 {
3781 case LTGT:
3782 case UNEQ:
3783 /* AVX supports all the needed comparisons. */
3784 if (TARGET_AVX)
3785 break;
3786 /* We have no LTGT as an operator. We could implement it with
3787 NE & ORDERED, but this requires an extra temporary. It's
3788 not clear that it's worth it. */
3789 return UNKNOWN;
3790
3791 case LT:
3792 case LE:
3793 case UNGT:
3794 case UNGE:
3795 /* These are supported directly. */
3796 break;
3797
3798 case EQ:
3799 case NE:
3800 case UNORDERED:
3801 case ORDERED:
3802 /* AVX has 3 operand comparisons, no need to swap anything. */
3803 if (TARGET_AVX)
3804 break;
3805 /* For commutative operators, try to canonicalize the destination
3806 operand to be first in the comparison - this helps reload to
3807 avoid extra moves. */
3808 if (!dest || !rtx_equal_p (dest, *pop1))
3809 break;
3810 /* FALLTHRU */
3811
3812 case GE:
3813 case GT:
3814 case UNLE:
3815 case UNLT:
3816 /* These are not supported directly before AVX, and furthermore
3817 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3818 comparison operands to transform into something that is
3819 supported. */
3820 std::swap (*pop0, *pop1);
3821 code = swap_condition (code);
3822 break;
3823
3824 default:
3825 gcc_unreachable ();
3826 }
3827
3828 return code;
3829 }
3830
3831 /* Expand a floating-point conditional move. Return true if successful. */
3832
3833 bool
3834 ix86_expand_fp_movcc (rtx operands[])
3835 {
3836 machine_mode mode = GET_MODE (operands[0]);
3837 enum rtx_code code = GET_CODE (operands[1]);
3838 rtx tmp, compare_op;
3839 rtx op0 = XEXP (operands[1], 0);
3840 rtx op1 = XEXP (operands[1], 1);
3841
3842 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3843 {
3844 machine_mode cmode;
3845
3846 /* Since we've no cmove for sse registers, don't force bad register
3847 allocation just to gain access to it. Deny movcc when the
3848 comparison mode doesn't match the move mode. */
3849 cmode = GET_MODE (op0);
3850 if (cmode == VOIDmode)
3851 cmode = GET_MODE (op1);
3852 if (cmode != mode)
3853 return false;
3854
3855 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3856 if (code == UNKNOWN)
3857 return false;
3858
3859 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3860 operands[2], operands[3]))
3861 return true;
3862
3863 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3864 operands[2], operands[3]);
3865 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3866 return true;
3867 }
3868
3869 if (GET_MODE (op0) == TImode
3870 || (GET_MODE (op0) == DImode
3871 && !TARGET_64BIT))
3872 return false;
3873
3874 /* The floating point conditional move instructions don't directly
3875 support conditions resulting from a signed integer comparison. */
3876
3877 compare_op = ix86_expand_compare (code, op0, op1);
3878 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3879 {
3880 tmp = gen_reg_rtx (QImode);
3881 ix86_expand_setcc (tmp, code, op0, op1);
3882
3883 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3884 }
3885
3886 emit_insn (gen_rtx_SET (operands[0],
3887 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3888 operands[2], operands[3])));
3889
3890 return true;
3891 }
3892
3893 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3894
3895 static int
3896 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3897 {
3898 switch (code)
3899 {
3900 case EQ:
3901 return 0;
3902 case LT:
3903 case LTU:
3904 return 1;
3905 case LE:
3906 case LEU:
3907 return 2;
3908 case NE:
3909 return 4;
3910 case GE:
3911 case GEU:
3912 return 5;
3913 case GT:
3914 case GTU:
3915 return 6;
3916 default:
3917 gcc_unreachable ();
3918 }
3919 }
3920
3921 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
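/* These immediates are intended to match the vcmpps/vcmppd predicate
   encodings, e.g. 0x00 = EQ_OQ, 0x01 = LT_OS, 0x03 = UNORD_Q, 0x0d = GE_OS,
   0x0e = GT_OS.  */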
3922
3923 static int
3924 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3925 {
3926 switch (code)
3927 {
3928 case EQ:
3929 return 0x00;
3930 case NE:
3931 return 0x04;
3932 case GT:
3933 return 0x0e;
3934 case LE:
3935 return 0x02;
3936 case GE:
3937 return 0x0d;
3938 case LT:
3939 return 0x01;
3940 case UNLE:
3941 return 0x0a;
3942 case UNLT:
3943 return 0x09;
3944 case UNGE:
3945 return 0x05;
3946 case UNGT:
3947 return 0x06;
3948 case UNEQ:
3949 return 0x18;
3950 case LTGT:
3951 return 0x0c;
3952 case ORDERED:
3953 return 0x07;
3954 case UNORDERED:
3955 return 0x03;
3956 default:
3957 gcc_unreachable ();
3958 }
3959 }
3960
3961 /* Return immediate value to be used in UNSPEC_PCMP
3962 for comparison CODE in MODE. */
3963
3964 static int
3965 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3966 {
3967 if (FLOAT_MODE_P (mode))
3968 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3969 return ix86_int_cmp_code_to_pcmp_immediate (code);
3970 }
3971
3972 /* Expand AVX-512 vector comparison. */
3973
3974 bool
3975 ix86_expand_mask_vec_cmp (rtx operands[])
3976 {
3977 machine_mode mask_mode = GET_MODE (operands[0]);
3978 machine_mode cmp_mode = GET_MODE (operands[2]);
3979 enum rtx_code code = GET_CODE (operands[1]);
3980 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3981 int unspec_code;
3982 rtx unspec;
3983
3984 switch (code)
3985 {
3986 case LEU:
3987 case GTU:
3988 case GEU:
3989 case LTU:
3990 unspec_code = UNSPEC_UNSIGNED_PCMP;
3991 break;
3992
3993 default:
3994 unspec_code = UNSPEC_PCMP;
3995 }
3996
3997 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
3998 operands[3], imm),
3999 unspec_code);
4000 emit_insn (gen_rtx_SET (operands[0], unspec));
4001
4002 return true;
4003 }
4004
4005 /* Expand fp vector comparison. */
4006
4007 bool
4008 ix86_expand_fp_vec_cmp (rtx operands[])
4009 {
4010 enum rtx_code code = GET_CODE (operands[1]);
4011 rtx cmp;
4012
4013 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4014 &operands[2], &operands[3]);
4015 if (code == UNKNOWN)
4016 {
4017 rtx temp;
4018 switch (GET_CODE (operands[1]))
4019 {
4020 case LTGT:
4021 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4022 operands[3], NULL, NULL);
4023 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4024 operands[3], NULL, NULL);
4025 code = AND;
4026 break;
4027 case UNEQ:
4028 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4029 operands[3], NULL, NULL);
4030 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4031 operands[3], NULL, NULL);
4032 code = IOR;
4033 break;
4034 default:
4035 gcc_unreachable ();
4036 }
4037 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4038 OPTAB_DIRECT);
4039 }
4040 else
4041 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4042 operands[1], operands[2]);
4043
4044 if (operands[0] != cmp)
4045 emit_move_insn (operands[0], cmp);
4046
4047 return true;
4048 }
4049
4050 static rtx
4051 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4052 rtx op_true, rtx op_false, bool *negate)
4053 {
4054 machine_mode data_mode = GET_MODE (dest);
4055 machine_mode mode = GET_MODE (cop0);
4056 rtx x;
4057
4058 *negate = false;
4059
4060 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4061 if (TARGET_XOP
4062 && (mode == V16QImode || mode == V8HImode
4063 || mode == V4SImode || mode == V2DImode))
4064 ;
4065 else
4066 {
4067 /* Canonicalize the comparison to EQ, GT, GTU. */
4068 switch (code)
4069 {
4070 case EQ:
4071 case GT:
4072 case GTU:
4073 break;
4074
4075 case NE:
4076 case LE:
4077 case LEU:
4078 code = reverse_condition (code);
4079 *negate = true;
4080 break;
4081
4082 case GE:
4083 case GEU:
4084 code = reverse_condition (code);
4085 *negate = true;
4086 /* FALLTHRU */
4087
4088 case LT:
4089 case LTU:
4090 std::swap (cop0, cop1);
4091 code = swap_condition (code);
4092 break;
4093
4094 default:
4095 gcc_unreachable ();
4096 }
4097
4098 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4099 if (mode == V2DImode)
4100 {
4101 switch (code)
4102 {
4103 case EQ:
4104 /* SSE4.1 supports EQ. */
4105 if (!TARGET_SSE4_1)
4106 return NULL;
4107 break;
4108
4109 case GT:
4110 case GTU:
4111 /* SSE4.2 supports GT/GTU. */
4112 if (!TARGET_SSE4_2)
4113 return NULL;
4114 break;
4115
4116 default:
4117 gcc_unreachable ();
4118 }
4119 }
4120
4121 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4122 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4123 if (*negate)
4124 std::swap (optrue, opfalse);
4125
4126 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4127 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4128 min (x, y) == x). While we add one instruction (the minimum),
4129 we remove the need for two instructions in the negation, as the
4130 result already has the desired form.
4131 When using masks, do it for SI/DImode element types, as it is shorter
4132 than the two subtractions. */
4133 if ((code != EQ
4134 && GET_MODE_SIZE (mode) != 64
4135 && vector_all_ones_operand (opfalse, data_mode)
4136 && optrue == CONST0_RTX (data_mode))
4137 || (code == GTU
4138 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4139 /* Don't do it if not using integer masks and we'd end up with
4140 the right values in the registers though. */
4141 && (GET_MODE_SIZE (mode) == 64
4142 || !vector_all_ones_operand (optrue, data_mode)
4143 || opfalse != CONST0_RTX (data_mode))))
4144 {
4145 rtx (*gen) (rtx, rtx, rtx) = NULL;
4146
4147 switch (mode)
4148 {
4149 case E_V16SImode:
4150 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4151 break;
4152 case E_V8DImode:
4153 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4154 cop0 = force_reg (mode, cop0);
4155 cop1 = force_reg (mode, cop1);
4156 break;
4157 case E_V32QImode:
4158 if (TARGET_AVX2)
4159 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4160 break;
4161 case E_V16HImode:
4162 if (TARGET_AVX2)
4163 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4164 break;
4165 case E_V8SImode:
4166 if (TARGET_AVX2)
4167 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4168 break;
4169 case E_V4DImode:
4170 if (TARGET_AVX512VL)
4171 {
4172 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4173 cop0 = force_reg (mode, cop0);
4174 cop1 = force_reg (mode, cop1);
4175 }
4176 break;
4177 case E_V16QImode:
4178 if (code == GTU && TARGET_SSE2)
4179 gen = gen_uminv16qi3;
4180 else if (code == GT && TARGET_SSE4_1)
4181 gen = gen_sminv16qi3;
4182 break;
4183 case E_V8HImode:
4184 if (code == GTU && TARGET_SSE4_1)
4185 gen = gen_uminv8hi3;
4186 else if (code == GT && TARGET_SSE2)
4187 gen = gen_sminv8hi3;
4188 break;
4189 case E_V4SImode:
4190 if (TARGET_SSE4_1)
4191 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4192 break;
4193 case E_V2DImode:
4194 if (TARGET_AVX512VL)
4195 {
4196 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4197 cop0 = force_reg (mode, cop0);
4198 cop1 = force_reg (mode, cop1);
4199 }
4200 break;
4201 default:
4202 break;
4203 }
4204
4205 if (gen)
4206 {
4207 rtx tem = gen_reg_rtx (mode);
4208 if (!vector_operand (cop0, mode))
4209 cop0 = force_reg (mode, cop0);
4210 if (!vector_operand (cop1, mode))
4211 cop1 = force_reg (mode, cop1);
4212 *negate = !*negate;
4213 emit_insn (gen (tem, cop0, cop1));
4214 cop1 = tem;
4215 code = EQ;
4216 }
4217 }
4218
4219 /* Unsigned parallel compare is not supported by the hardware.
4220 Play some tricks to turn this into a signed comparison:
4221 either against 0, or between operands biased by the sign bit. */
4222 if (code == GTU)
4223 {
4224 cop0 = force_reg (mode, cop0);
4225
4226 switch (mode)
4227 {
4228 case E_V16SImode:
4229 case E_V8DImode:
4230 case E_V8SImode:
4231 case E_V4DImode:
4232 case E_V4SImode:
4233 case E_V2DImode:
4234 {
4235 rtx t1, t2, mask;
4236
4237 /* Subtract (-(INT MAX) - 1) from both operands to make
4238 them signed. */
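/* E.g. for V4SImode elements, 1 GTU 0xffffffff becomes
   0x80000001 GT 0x7fffffff after the bias, which is false,
   matching the unsigned result.  */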
4239 mask = ix86_build_signbit_mask (mode, true, false);
4240 t1 = gen_reg_rtx (mode);
4241 emit_insn (gen_sub3_insn (t1, cop0, mask));
4242
4243 t2 = gen_reg_rtx (mode);
4244 emit_insn (gen_sub3_insn (t2, cop1, mask));
4245
4246 cop0 = t1;
4247 cop1 = t2;
4248 code = GT;
4249 }
4250 break;
4251
4252 case E_V64QImode:
4253 case E_V32HImode:
4254 case E_V32QImode:
4255 case E_V16HImode:
4256 case E_V16QImode:
4257 case E_V8HImode:
4258 /* Perform a parallel unsigned saturating subtraction. */
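/* cop0 GTU cop1 is equivalent to the saturating subtraction
   cop0 - cop1 being nonzero, hence the EQ against zero with the
   result negated.  */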
4259 x = gen_reg_rtx (mode);
4260 emit_insn (gen_rtx_SET
4261 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4262 cop0 = x;
4263 cop1 = CONST0_RTX (mode);
4264 code = EQ;
4265 *negate = !*negate;
4266 break;
4267
4268 default:
4269 gcc_unreachable ();
4270 }
4271 }
4272 }
4273
4274 if (*negate)
4275 std::swap (op_true, op_false);
4276
4277 /* Allow the comparison to be done in one mode, but the movcc to
4278 happen in another mode. */
4279 if (data_mode == mode)
4280 {
4281 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4282 op_true, op_false);
4283 }
4284 else
4285 {
4286 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4287 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4288 op_true, op_false);
4289 if (GET_MODE (x) == mode)
4290 x = gen_lowpart (data_mode, x);
4291 }
4292
4293 return x;
4294 }
4295
4296 /* Expand integer vector comparison. */
4297
4298 bool
4299 ix86_expand_int_vec_cmp (rtx operands[])
4300 {
4301 rtx_code code = GET_CODE (operands[1]);
4302 bool negate = false;
4303 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4304 operands[3], NULL, NULL, &negate);
4305
4306 if (!cmp)
4307 return false;
4308
4309 if (negate)
4310 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4311 CONST0_RTX (GET_MODE (cmp)),
4312 NULL, NULL, &negate);
4313
4314 gcc_assert (!negate);
4315
4316 if (operands[0] != cmp)
4317 emit_move_insn (operands[0], cmp);
4318
4319 return true;
4320 }
4321
4322 /* Expand a floating-point vector conditional move; a vcond operation
4323 rather than a movcc operation. */
4324
4325 bool
4326 ix86_expand_fp_vcond (rtx operands[])
4327 {
4328 enum rtx_code code = GET_CODE (operands[3]);
4329 rtx cmp;
4330
4331 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4332 &operands[4], &operands[5]);
4333 if (code == UNKNOWN)
4334 {
4335 rtx temp;
4336 switch (GET_CODE (operands[3]))
4337 {
4338 case LTGT:
4339 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4340 operands[5], operands[0], operands[0]);
4341 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4342 operands[5], operands[1], operands[2]);
4343 code = AND;
4344 break;
4345 case UNEQ:
4346 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4347 operands[5], operands[0], operands[0]);
4348 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4349 operands[5], operands[1], operands[2]);
4350 code = IOR;
4351 break;
4352 default:
4353 gcc_unreachable ();
4354 }
4355 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4356 OPTAB_DIRECT);
4357 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4358 return true;
4359 }
4360
4361 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4362 operands[5], operands[1], operands[2]))
4363 return true;
4364
4365 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4366 operands[1], operands[2]);
4367 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4368 return true;
4369 }
4370
4371 /* Expand a signed/unsigned integral vector conditional move. */
4372
4373 bool
4374 ix86_expand_int_vcond (rtx operands[])
4375 {
4376 machine_mode data_mode = GET_MODE (operands[0]);
4377 machine_mode mode = GET_MODE (operands[4]);
4378 enum rtx_code code = GET_CODE (operands[3]);
4379 bool negate = false;
4380 rtx x, cop0, cop1;
4381
4382 cop0 = operands[4];
4383 cop1 = operands[5];
4384
4385 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4386 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
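/* The x >= 0 ? 0 : -1 (resp. 1) forms are handled the same way,
   which is why the operand indices below depend on (code == LT).  */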
4387 if ((code == LT || code == GE)
4388 && data_mode == mode
4389 && cop1 == CONST0_RTX (mode)
4390 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4391 && GET_MODE_UNIT_SIZE (data_mode) > 1
4392 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4393 && (GET_MODE_SIZE (data_mode) == 16
4394 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4395 {
4396 rtx negop = operands[2 - (code == LT)];
4397 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4398 if (negop == CONST1_RTX (data_mode))
4399 {
4400 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4401 operands[0], 1, OPTAB_DIRECT);
4402 if (res != operands[0])
4403 emit_move_insn (operands[0], res);
4404 return true;
4405 }
4406 else if (GET_MODE_INNER (data_mode) != DImode
4407 && vector_all_ones_operand (negop, data_mode))
4408 {
4409 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4410 operands[0], 0, OPTAB_DIRECT);
4411 if (res != operands[0])
4412 emit_move_insn (operands[0], res);
4413 return true;
4414 }
4415 }
4416
4417 if (!nonimmediate_operand (cop1, mode))
4418 cop1 = force_reg (mode, cop1);
4419 if (!general_operand (operands[1], data_mode))
4420 operands[1] = force_reg (data_mode, operands[1]);
4421 if (!general_operand (operands[2], data_mode))
4422 operands[2] = force_reg (data_mode, operands[2]);
4423
4424 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4425 operands[1], operands[2], &negate);
4426
4427 if (!x)
4428 return false;
4429
4430 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4431 operands[2-negate]);
4432 return true;
4433 }
4434
4435 static bool
4436 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4437 struct expand_vec_perm_d *d)
4438 {
4439 /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
4440 expanders, so the args are either in D, or in OP0, OP1, etc. */
4441 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4442 machine_mode maskmode = mode;
4443 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4444
4445 switch (mode)
4446 {
4447 case E_V8HImode:
4448 if (TARGET_AVX512VL && TARGET_AVX512BW)
4449 gen = gen_avx512vl_vpermt2varv8hi3;
4450 break;
4451 case E_V16HImode:
4452 if (TARGET_AVX512VL && TARGET_AVX512BW)
4453 gen = gen_avx512vl_vpermt2varv16hi3;
4454 break;
4455 case E_V64QImode:
4456 if (TARGET_AVX512VBMI)
4457 gen = gen_avx512bw_vpermt2varv64qi3;
4458 break;
4459 case E_V32HImode:
4460 if (TARGET_AVX512BW)
4461 gen = gen_avx512bw_vpermt2varv32hi3;
4462 break;
4463 case E_V4SImode:
4464 if (TARGET_AVX512VL)
4465 gen = gen_avx512vl_vpermt2varv4si3;
4466 break;
4467 case E_V8SImode:
4468 if (TARGET_AVX512VL)
4469 gen = gen_avx512vl_vpermt2varv8si3;
4470 break;
4471 case E_V16SImode:
4472 if (TARGET_AVX512F)
4473 gen = gen_avx512f_vpermt2varv16si3;
4474 break;
4475 case E_V4SFmode:
4476 if (TARGET_AVX512VL)
4477 {
4478 gen = gen_avx512vl_vpermt2varv4sf3;
4479 maskmode = V4SImode;
4480 }
4481 break;
4482 case E_V8SFmode:
4483 if (TARGET_AVX512VL)
4484 {
4485 gen = gen_avx512vl_vpermt2varv8sf3;
4486 maskmode = V8SImode;
4487 }
4488 break;
4489 case E_V16SFmode:
4490 if (TARGET_AVX512F)
4491 {
4492 gen = gen_avx512f_vpermt2varv16sf3;
4493 maskmode = V16SImode;
4494 }
4495 break;
4496 case E_V2DImode:
4497 if (TARGET_AVX512VL)
4498 gen = gen_avx512vl_vpermt2varv2di3;
4499 break;
4500 case E_V4DImode:
4501 if (TARGET_AVX512VL)
4502 gen = gen_avx512vl_vpermt2varv4di3;
4503 break;
4504 case E_V8DImode:
4505 if (TARGET_AVX512F)
4506 gen = gen_avx512f_vpermt2varv8di3;
4507 break;
4508 case E_V2DFmode:
4509 if (TARGET_AVX512VL)
4510 {
4511 gen = gen_avx512vl_vpermt2varv2df3;
4512 maskmode = V2DImode;
4513 }
4514 break;
4515 case E_V4DFmode:
4516 if (TARGET_AVX512VL)
4517 {
4518 gen = gen_avx512vl_vpermt2varv4df3;
4519 maskmode = V4DImode;
4520 }
4521 break;
4522 case E_V8DFmode:
4523 if (TARGET_AVX512F)
4524 {
4525 gen = gen_avx512f_vpermt2varv8df3;
4526 maskmode = V8DImode;
4527 }
4528 break;
4529 default:
4530 break;
4531 }
4532
4533 if (gen == NULL)
4534 return false;
4535
4536 /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
4537 expanders, so the args are either in D, or in OP0, OP1, etc. */
4538 if (d)
4539 {
4540 rtx vec[64];
4541 target = d->target;
4542 op0 = d->op0;
4543 op1 = d->op1;
4544 for (int i = 0; i < d->nelt; ++i)
4545 vec[i] = GEN_INT (d->perm[i]);
4546 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4547 }
4548
4549 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4550 return true;
4551 }
4552
4553 /* Expand a variable vector permutation. */
4554
4555 void
4556 ix86_expand_vec_perm (rtx operands[])
4557 {
4558 rtx target = operands[0];
4559 rtx op0 = operands[1];
4560 rtx op1 = operands[2];
4561 rtx mask = operands[3];
4562 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4563 machine_mode mode = GET_MODE (op0);
4564 machine_mode maskmode = GET_MODE (mask);
4565 int w, e, i;
4566 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4567
4568 /* Number of elements in the vector. */
4569 w = GET_MODE_NUNITS (mode);
4570 e = GET_MODE_UNIT_SIZE (mode);
4571 gcc_assert (w <= 64);
4572
4573 if (TARGET_AVX512F && one_operand_shuffle)
4574 {
4575 rtx (*gen) (rtx, rtx, rtx) = NULL;
4576 switch (mode)
4577 {
4578 case E_V16SImode:
4579 gen = gen_avx512f_permvarv16si;
4580 break;
4581 case E_V16SFmode:
4582 gen = gen_avx512f_permvarv16sf;
4583 break;
4584 case E_V8DImode:
4585 gen = gen_avx512f_permvarv8di;
4586 break;
4587 case E_V8DFmode:
4588 gen = gen_avx512f_permvarv8df;
4589 break;
4590 default:
4591 break;
4592 }
4593 if (gen != NULL)
4594 {
4595 emit_insn (gen (target, op0, mask));
4596 return;
4597 }
4598 }
4599
4600 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4601 return;
4602
4603 if (TARGET_AVX2)
4604 {
4605 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4606 {
4607 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4608 a constant shuffle operand.  With a tiny bit of effort we can
4609 use VPERMD instead. A re-interpretation stall for V4DFmode is
4610 unfortunate but there's no avoiding it.
4611 Similarly, for V16HImode we don't have instructions for variable
4612 shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq; vpor
4613 after preparing suitable masks. */
4614
4615 if (mode == V16HImode)
4616 {
4617 maskmode = mode = V32QImode;
4618 w = 32;
4619 e = 1;
4620 }
4621 else
4622 {
4623 maskmode = mode = V8SImode;
4624 w = 8;
4625 e = 4;
4626 }
4627 t1 = gen_reg_rtx (maskmode);
4628
4629 /* Replicate the low bits of the V4DImode mask into V8SImode:
4630 mask = { A B C D }
4631 t1 = { A A B B C C D D }. */
4632 for (i = 0; i < w / 2; ++i)
4633 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4634 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4635 vt = force_reg (maskmode, vt);
4636 mask = gen_lowpart (maskmode, mask);
4637 if (maskmode == V8SImode)
4638 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4639 else
4640 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4641
4642 /* Multiply the shuffle indices by two. */
4643 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4644 OPTAB_DIRECT);
4645
4646 /* Add one to the odd shuffle indices:
4647 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4648 for (i = 0; i < w / 2; ++i)
4649 {
4650 vec[i * 2] = const0_rtx;
4651 vec[i * 2 + 1] = const1_rtx;
4652 }
4653 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4654 vt = validize_mem (force_const_mem (maskmode, vt));
4655 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4656 OPTAB_DIRECT);
4657
4658 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4659 operands[3] = mask = t1;
4660 target = gen_reg_rtx (mode);
4661 op0 = gen_lowpart (mode, op0);
4662 op1 = gen_lowpart (mode, op1);
4663 }
4664
4665 switch (mode)
4666 {
4667 case E_V8SImode:
4668 /* The VPERMD and VPERMPS instructions already properly ignore
4669 the high bits of the shuffle elements. No need for us to
4670 perform an AND ourselves. */
4671 if (one_operand_shuffle)
4672 {
4673 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4674 if (target != operands[0])
4675 emit_move_insn (operands[0],
4676 gen_lowpart (GET_MODE (operands[0]), target));
4677 }
4678 else
4679 {
4680 t1 = gen_reg_rtx (V8SImode);
4681 t2 = gen_reg_rtx (V8SImode);
4682 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4683 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4684 goto merge_two;
4685 }
4686 return;
4687
4688 case E_V8SFmode:
4689 mask = gen_lowpart (V8SImode, mask);
4690 if (one_operand_shuffle)
4691 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4692 else
4693 {
4694 t1 = gen_reg_rtx (V8SFmode);
4695 t2 = gen_reg_rtx (V8SFmode);
4696 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4697 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4698 goto merge_two;
4699 }
4700 return;
4701
4702 case E_V4SImode:
4703 /* By combining the two 128-bit input vectors into one 256-bit
4704 input vector, we can use VPERMD and VPERMPS for the full
4705 two-operand shuffle. */
4706 t1 = gen_reg_rtx (V8SImode);
4707 t2 = gen_reg_rtx (V8SImode);
4708 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4709 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4710 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4711 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4712 return;
4713
4714 case E_V4SFmode:
4715 t1 = gen_reg_rtx (V8SFmode);
4716 t2 = gen_reg_rtx (V8SImode);
4717 mask = gen_lowpart (V4SImode, mask);
4718 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4719 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4720 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4721 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4722 return;
4723
4724 case E_V32QImode:
4725 t1 = gen_reg_rtx (V32QImode);
4726 t2 = gen_reg_rtx (V32QImode);
4727 t3 = gen_reg_rtx (V32QImode);
4728 vt2 = GEN_INT (-128);
4729 vt = gen_const_vec_duplicate (V32QImode, vt2);
4730 vt = force_reg (V32QImode, vt);
4731 for (i = 0; i < 32; i++)
4732 vec[i] = i < 16 ? vt2 : const0_rtx;
4733 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4734 vt2 = force_reg (V32QImode, vt2);
4735 /* From mask create two adjusted masks, which contain the same
4736 bits as mask in the low 7 bits of each vector element.
4737 The first mask will have the most significant bit clear
4738 if it requests element from the same 128-bit lane
4739 and MSB set if it requests element from the other 128-bit lane.
4740 The second mask will have the opposite values of the MSB,
4741 and additionally will have its 128-bit lanes swapped.
4742 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4743 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4744 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4745 stands for the other 12 bytes. */
4746 /* The bit whether element is from the same lane or the other
4747 lane is bit 4, so shift it up by 3 to the MSB position. */
4748 t5 = gen_reg_rtx (V4DImode);
4749 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4750 GEN_INT (3)));
4751 /* Clear MSB bits from the mask just in case it had them set. */
4752 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4753 /* After this t1 will have MSB set for elements from other lane. */
4754 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4755 /* Clear bits other than MSB. */
4756 emit_insn (gen_andv32qi3 (t1, t1, vt));
4757 /* Or in the lower bits from mask into t3. */
4758 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4759 /* And invert MSB bits in t1, so MSB is set for elements from the same
4760 lane. */
4761 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4762 /* Swap 128-bit lanes in t3. */
4763 t6 = gen_reg_rtx (V4DImode);
4764 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4765 const2_rtx, GEN_INT (3),
4766 const0_rtx, const1_rtx));
4767 /* And or in the lower bits from mask into t1. */
4768 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4769 if (one_operand_shuffle)
4770 {
4771 /* Each of these shuffles will put 0s in places where
4772 element from the other 128-bit lane is needed, otherwise
4773 will shuffle in the requested value. */
4774 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4775 gen_lowpart (V32QImode, t6)));
4776 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4777 /* For t3 the 128-bit lanes are swapped again. */
4778 t7 = gen_reg_rtx (V4DImode);
4779 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4780 const2_rtx, GEN_INT (3),
4781 const0_rtx, const1_rtx));
4782 /* And oring both together leads to the result. */
4783 emit_insn (gen_iorv32qi3 (target, t1,
4784 gen_lowpart (V32QImode, t7)));
4785 if (target != operands[0])
4786 emit_move_insn (operands[0],
4787 gen_lowpart (GET_MODE (operands[0]), target));
4788 return;
4789 }
4790
4791 t4 = gen_reg_rtx (V32QImode);
4792 /* Similar to the one_operand_shuffle code above,
4793 just repeated twice, once for each operand.  The merge_two:
4794 code below will merge the two results together. */
4795 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4796 gen_lowpart (V32QImode, t6)));
4797 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4798 gen_lowpart (V32QImode, t6)));
4799 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4800 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4801 t7 = gen_reg_rtx (V4DImode);
4802 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4803 const2_rtx, GEN_INT (3),
4804 const0_rtx, const1_rtx));
4805 t8 = gen_reg_rtx (V4DImode);
4806 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4807 const2_rtx, GEN_INT (3),
4808 const0_rtx, const1_rtx));
4809 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4810 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4811 t1 = t4;
4812 t2 = t3;
4813 goto merge_two;
4814
4815 default:
4816 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4817 break;
4818 }
4819 }
4820
4821 if (TARGET_XOP)
4822 {
4823 /* The XOP VPPERM insn supports three inputs. By ignoring the
4824 one_operand_shuffle special case, we avoid creating another
4825 set of constant vectors in memory. */
4826 one_operand_shuffle = false;
4827
4828 /* mask = mask & {2*w-1, ...} */
4829 vt = GEN_INT (2*w - 1);
4830 }
4831 else
4832 {
4833 /* mask = mask & {w-1, ...} */
4834 vt = GEN_INT (w - 1);
4835 }
4836
4837 vt = gen_const_vec_duplicate (maskmode, vt);
4838 mask = expand_simple_binop (maskmode, AND, mask, vt,
4839 NULL_RTX, 0, OPTAB_DIRECT);
4840
4841 /* For non-QImode operations, convert the word permutation control
4842 into a byte permutation control. */
4843 if (mode != V16QImode)
4844 {
4845 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4846 GEN_INT (exact_log2 (e)),
4847 NULL_RTX, 0, OPTAB_DIRECT);
4848
4849 /* Convert mask to vector of chars. */
4850 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4851
4852 /* Replicate each of the input bytes into byte positions:
4853 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4854 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4855 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4856 for (i = 0; i < 16; ++i)
4857 vec[i] = GEN_INT (i/e * e);
4858 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4859 vt = validize_mem (force_const_mem (V16QImode, vt));
4860 if (TARGET_XOP)
4861 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4862 else
4863 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4864
4865 /* Convert it into the byte positions by doing
4866 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4867 for (i = 0; i < 16; ++i)
4868 vec[i] = GEN_INT (i % e);
4869 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4870 vt = validize_mem (force_const_mem (V16QImode, vt));
4871 emit_insn (gen_addv16qi3 (mask, mask, vt));
4872 }
4873
4874 /* The actual shuffle operations all operate on V16QImode. */
4875 op0 = gen_lowpart (V16QImode, op0);
4876 op1 = gen_lowpart (V16QImode, op1);
4877
4878 if (TARGET_XOP)
4879 {
4880 if (GET_MODE (target) != V16QImode)
4881 target = gen_reg_rtx (V16QImode);
4882 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4883 if (target != operands[0])
4884 emit_move_insn (operands[0],
4885 gen_lowpart (GET_MODE (operands[0]), target));
4886 }
4887 else if (one_operand_shuffle)
4888 {
4889 if (GET_MODE (target) != V16QImode)
4890 target = gen_reg_rtx (V16QImode);
4891 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4892 if (target != operands[0])
4893 emit_move_insn (operands[0],
4894 gen_lowpart (GET_MODE (operands[0]), target));
4895 }
4896 else
4897 {
4898 rtx xops[6];
4899 bool ok;
4900
4901 /* Shuffle the two input vectors independently. */
4902 t1 = gen_reg_rtx (V16QImode);
4903 t2 = gen_reg_rtx (V16QImode);
4904 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4905 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4906
4907 merge_two:
4908 /* Then merge them together. The key is whether any given control
4909 element contained a bit set that indicates the second word. */
4910 mask = operands[3];
4911 vt = GEN_INT (w);
4912 if (maskmode == V2DImode && !TARGET_SSE4_1)
4913 {
4914 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4915 more shuffle to convert the V2DI input mask into a V4SI
4916 input mask.  At that point the masking that ix86_expand_int_vcond
4917 does will work as desired. */
4918 rtx t3 = gen_reg_rtx (V4SImode);
4919 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4920 const0_rtx, const0_rtx,
4921 const2_rtx, const2_rtx));
4922 mask = t3;
4923 maskmode = V4SImode;
4924 e = w = 4;
4925 }
4926
4927 vt = gen_const_vec_duplicate (maskmode, vt);
4928 vt = force_reg (maskmode, vt);
4929 mask = expand_simple_binop (maskmode, AND, mask, vt,
4930 NULL_RTX, 0, OPTAB_DIRECT);
4931
4932 if (GET_MODE (target) != mode)
4933 target = gen_reg_rtx (mode);
4934 xops[0] = target;
4935 xops[1] = gen_lowpart (mode, t2);
4936 xops[2] = gen_lowpart (mode, t1);
4937 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4938 xops[4] = mask;
4939 xops[5] = vt;
4940 ok = ix86_expand_int_vcond (xops);
4941 gcc_assert (ok);
4942 if (target != operands[0])
4943 emit_move_insn (operands[0],
4944 gen_lowpart (GET_MODE (operands[0]), target));
4945 }
4946 }
4947
4948 /* Unpack SRC into DEST, which has the next wider integer vector type.  UNSIGNED_P is
4949 true if we should do zero extension, else sign extension. HIGH_P is
4950 true if we want the N/2 high elements, else the low elements. */
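/* For example, a V16QImode SRC with SSE4.1 uses pmovzxbw/pmovsxbw for
   the low half, while without SSE4.1 the fallback below interleaves SRC
   with zero (zero extension) or with its per-element sign mask (sign
   extension) instead.  */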
4951
4952 void
4953 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4954 {
4955 machine_mode imode = GET_MODE (src);
4956 rtx tmp;
4957
4958 if (TARGET_SSE4_1)
4959 {
4960 rtx (*unpack)(rtx, rtx);
4961 rtx (*extract)(rtx, rtx) = NULL;
4962 machine_mode halfmode = BLKmode;
4963
4964 switch (imode)
4965 {
4966 case E_V64QImode:
4967 if (unsigned_p)
4968 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4969 else
4970 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4971 halfmode = V32QImode;
4972 extract
4973 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4974 break;
4975 case E_V32QImode:
4976 if (unsigned_p)
4977 unpack = gen_avx2_zero_extendv16qiv16hi2;
4978 else
4979 unpack = gen_avx2_sign_extendv16qiv16hi2;
4980 halfmode = V16QImode;
4981 extract
4982 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4983 break;
4984 case E_V32HImode:
4985 if (unsigned_p)
4986 unpack = gen_avx512f_zero_extendv16hiv16si2;
4987 else
4988 unpack = gen_avx512f_sign_extendv16hiv16si2;
4989 halfmode = V16HImode;
4990 extract
4991 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4992 break;
4993 case E_V16HImode:
4994 if (unsigned_p)
4995 unpack = gen_avx2_zero_extendv8hiv8si2;
4996 else
4997 unpack = gen_avx2_sign_extendv8hiv8si2;
4998 halfmode = V8HImode;
4999 extract
5000 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5001 break;
5002 case E_V16SImode:
5003 if (unsigned_p)
5004 unpack = gen_avx512f_zero_extendv8siv8di2;
5005 else
5006 unpack = gen_avx512f_sign_extendv8siv8di2;
5007 halfmode = V8SImode;
5008 extract
5009 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5010 break;
5011 case E_V8SImode:
5012 if (unsigned_p)
5013 unpack = gen_avx2_zero_extendv4siv4di2;
5014 else
5015 unpack = gen_avx2_sign_extendv4siv4di2;
5016 halfmode = V4SImode;
5017 extract
5018 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5019 break;
5020 case E_V16QImode:
5021 if (unsigned_p)
5022 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5023 else
5024 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5025 break;
5026 case E_V8HImode:
5027 if (unsigned_p)
5028 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5029 else
5030 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5031 break;
5032 case E_V4SImode:
5033 if (unsigned_p)
5034 unpack = gen_sse4_1_zero_extendv2siv2di2;
5035 else
5036 unpack = gen_sse4_1_sign_extendv2siv2di2;
5037 break;
5038 default:
5039 gcc_unreachable ();
5040 }
5041
5042 if (GET_MODE_SIZE (imode) >= 32)
5043 {
5044 tmp = gen_reg_rtx (halfmode);
5045 emit_insn (extract (tmp, src));
5046 }
5047 else if (high_p)
5048 {
5049 /* Shift higher 8 bytes to lower 8 bytes. */
5050 tmp = gen_reg_rtx (V1TImode);
5051 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5052 GEN_INT (64)));
5053 tmp = gen_lowpart (imode, tmp);
5054 }
5055 else
5056 tmp = src;
5057
5058 emit_insn (unpack (dest, tmp));
5059 }
5060 else
5061 {
5062 rtx (*unpack)(rtx, rtx, rtx);
5063
5064 switch (imode)
5065 {
5066 case E_V16QImode:
5067 if (high_p)
5068 unpack = gen_vec_interleave_highv16qi;
5069 else
5070 unpack = gen_vec_interleave_lowv16qi;
5071 break;
5072 case E_V8HImode:
5073 if (high_p)
5074 unpack = gen_vec_interleave_highv8hi;
5075 else
5076 unpack = gen_vec_interleave_lowv8hi;
5077 break;
5078 case E_V4SImode:
5079 if (high_p)
5080 unpack = gen_vec_interleave_highv4si;
5081 else
5082 unpack = gen_vec_interleave_lowv4si;
5083 break;
5084 default:
5085 gcc_unreachable ();
5086 }
5087
5088 if (unsigned_p)
5089 tmp = force_reg (imode, CONST0_RTX (imode));
5090 else
5091 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5092 src, pc_rtx, pc_rtx);
5093
5094 rtx tmp2 = gen_reg_rtx (imode);
5095 emit_insn (unpack (tmp2, src, tmp));
5096 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5097 }
5098 }
5099
5100 /* Split OPERAND into half-mode parts stored in PARTS.  Similar to
5101 split_double_mode, but works for floating point parameters and
5102 non-offsettable memories.  For pushes, it returns just stack offsets;
5103 the values will be saved in the right order.  At most four parts are generated. */
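/* For example, on a 32-bit target an XFmode value is split into three
   SImode parts and a TFmode value into four, while on a 64-bit target
   an XFmode or TFmode value is split into two parts.  */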
5104
5105 static int
5106 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5107 {
5108 int size;
5109
5110 if (!TARGET_64BIT)
5111 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5112 else
5113 size = (GET_MODE_SIZE (mode) + 4) / 8;
5114
5115 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5116 gcc_assert (size >= 2 && size <= 4);
5117
5118 /* Optimize constant pool reference to immediates. This is used by fp
5119 moves, which force all constants to memory to allow combining. */
5120 if (MEM_P (operand) && MEM_READONLY_P (operand))
5121 operand = avoid_constant_pool_reference (operand);
5122
5123 if (MEM_P (operand) && !offsettable_memref_p (operand))
5124 {
5125 /* The only non-offsettable memories we handle are pushes. */
5126 int ok = push_operand (operand, VOIDmode);
5127
5128 gcc_assert (ok);
5129
5130 operand = copy_rtx (operand);
5131 PUT_MODE (operand, word_mode);
5132 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5133 return size;
5134 }
5135
5136 if (GET_CODE (operand) == CONST_VECTOR)
5137 {
5138 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5139 /* Caution: if we looked through a constant pool memory above,
5140 the operand may actually have a different mode now. That's
5141 ok, since we want to pun this all the way back to an integer. */
5142 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5143 gcc_assert (operand != NULL);
5144 mode = imode;
5145 }
5146
5147 if (!TARGET_64BIT)
5148 {
5149 if (mode == DImode)
5150 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5151 else
5152 {
5153 int i;
5154
5155 if (REG_P (operand))
5156 {
5157 gcc_assert (reload_completed);
5158 for (i = 0; i < size; i++)
5159 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5160 }
5161 else if (offsettable_memref_p (operand))
5162 {
5163 operand = adjust_address (operand, SImode, 0);
5164 parts[0] = operand;
5165 for (i = 1; i < size; i++)
5166 parts[i] = adjust_address (operand, SImode, 4 * i);
5167 }
5168 else if (CONST_DOUBLE_P (operand))
5169 {
5170 const REAL_VALUE_TYPE *r;
5171 long l[4];
5172
5173 r = CONST_DOUBLE_REAL_VALUE (operand);
5174 switch (mode)
5175 {
5176 case E_TFmode:
5177 real_to_target (l, r, mode);
5178 parts[3] = gen_int_mode (l[3], SImode);
5179 parts[2] = gen_int_mode (l[2], SImode);
5180 break;
5181 case E_XFmode:
5182 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5183 long double may not be 80-bit. */
5184 real_to_target (l, r, mode);
5185 parts[2] = gen_int_mode (l[2], SImode);
5186 break;
5187 case E_DFmode:
5188 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5189 break;
5190 default:
5191 gcc_unreachable ();
5192 }
5193 parts[1] = gen_int_mode (l[1], SImode);
5194 parts[0] = gen_int_mode (l[0], SImode);
5195 }
5196 else
5197 gcc_unreachable ();
5198 }
5199 }
5200 else
5201 {
5202 if (mode == TImode)
5203 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5204 if (mode == XFmode || mode == TFmode)
5205 {
5206 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5207 if (REG_P (operand))
5208 {
5209 gcc_assert (reload_completed);
5210 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5211 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5212 }
5213 else if (offsettable_memref_p (operand))
5214 {
5215 operand = adjust_address (operand, DImode, 0);
5216 parts[0] = operand;
5217 parts[1] = adjust_address (operand, upper_mode, 8);
5218 }
5219 else if (CONST_DOUBLE_P (operand))
5220 {
5221 long l[4];
5222
5223 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5224
5225 /* real_to_target puts 32-bit pieces in each long. */
5226 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5227 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5228 << 32), DImode);
5229
5230 if (upper_mode == SImode)
5231 parts[1] = gen_int_mode (l[2], SImode);
5232 else
5233 parts[1]
5234 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5235 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5236 << 32), DImode);
5237 }
5238 else
5239 gcc_unreachable ();
5240 }
5241 }
5242
5243 return size;
5244 }
5245
5246 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5247 The destination parts are placed in operands 2-5 and the source parts
5248 in operands 6-9, in the correct order, and the required move insns
5249 are emitted here. */
5250
5251 void
5252 ix86_split_long_move (rtx operands[])
5253 {
5254 rtx part[2][4];
5255 int nparts, i, j;
5256 int push = 0;
5257 int collisions = 0;
5258 machine_mode mode = GET_MODE (operands[0]);
5259 bool collisionparts[4];
5260
5261 /* The DFmode expanders may ask us to move a double.
5262 For a 64-bit target this is a single move.  By hiding the fact
5263 here we simplify the i386.md splitters. */
5264 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5265 {
5266 /* Optimize constant pool reference to immediates. This is used by
5267 fp moves, which force all constants to memory to allow combining. */
5268
5269 if (MEM_P (operands[1])
5270 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5271 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5272 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5273 if (push_operand (operands[0], VOIDmode))
5274 {
5275 operands[0] = copy_rtx (operands[0]);
5276 PUT_MODE (operands[0], word_mode);
5277 }
5278 else
5279 operands[0] = gen_lowpart (DImode, operands[0]);
5280 operands[1] = gen_lowpart (DImode, operands[1]);
5281 emit_move_insn (operands[0], operands[1]);
5282 return;
5283 }
5284
5285 /* The only non-offsettable memory we handle is a push. */
5286 if (push_operand (operands[0], VOIDmode))
5287 push = 1;
5288 else
5289 gcc_assert (!MEM_P (operands[0])
5290 || offsettable_memref_p (operands[0]));
5291
5292 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5293 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5294
5295 /* When emitting push, take care for source operands on the stack. */
5296 if (push && MEM_P (operands[1])
5297 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5298 {
5299 rtx src_base = XEXP (part[1][nparts - 1], 0);
5300
5301 /* Compensate for the stack decrement by 4. */
5302 if (!TARGET_64BIT && nparts == 3
5303 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5304 src_base = plus_constant (Pmode, src_base, 4);
5305
5306 /* src_base refers to the stack pointer and is
5307 automatically decreased by emitted push. */
5308 for (i = 0; i < nparts; i++)
5309 part[1][i] = change_address (part[1][i],
5310 GET_MODE (part[1][i]), src_base);
5311 }
5312
5313 /* We need to do the copy in the right order in case an address register
5314 of the source overlaps the destination. */
5315 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5316 {
5317 rtx tmp;
5318
5319 for (i = 0; i < nparts; i++)
5320 {
5321 collisionparts[i]
5322 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5323 if (collisionparts[i])
5324 collisions++;
5325 }
5326
5327 /* Collision in the middle part can be handled by reordering. */
5328 if (collisions == 1 && nparts == 3 && collisionparts [1])
5329 {
5330 std::swap (part[0][1], part[0][2]);
5331 std::swap (part[1][1], part[1][2]);
5332 }
5333 else if (collisions == 1
5334 && nparts == 4
5335 && (collisionparts [1] || collisionparts [2]))
5336 {
5337 if (collisionparts [1])
5338 {
5339 std::swap (part[0][1], part[0][2]);
5340 std::swap (part[1][1], part[1][2]);
5341 }
5342 else
5343 {
5344 std::swap (part[0][2], part[0][3]);
5345 std::swap (part[1][2], part[1][3]);
5346 }
5347 }
5348
5349 /* If there are more collisions, we can't handle it by reordering.
5350 Do an lea to the last part and use only one colliding move. */
5351 else if (collisions > 1)
5352 {
5353 rtx base, addr;
5354
5355 collisions = 1;
5356
5357 base = part[0][nparts - 1];
5358
5359 /* Handle the case when the last part isn't valid for lea.
5360 Happens in 64-bit mode storing the 12-byte XFmode. */
5361 if (GET_MODE (base) != Pmode)
5362 base = gen_rtx_REG (Pmode, REGNO (base));
5363
5364 addr = XEXP (part[1][0], 0);
5365 if (TARGET_TLS_DIRECT_SEG_REFS)
5366 {
5367 struct ix86_address parts;
5368 int ok = ix86_decompose_address (addr, &parts);
5369 gcc_assert (ok);
5370 /* It is not valid to use %gs: or %fs: in lea. */
5371 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5372 }
5373 emit_insn (gen_rtx_SET (base, addr));
5374 part[1][0] = replace_equiv_address (part[1][0], base);
5375 for (i = 1; i < nparts; i++)
5376 {
5377 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5378 part[1][i] = replace_equiv_address (part[1][i], tmp);
5379 }
5380 }
5381 }
5382
5383 if (push)
5384 {
5385 if (!TARGET_64BIT)
5386 {
5387 if (nparts == 3)
5388 {
5389 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5390 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5391 emit_move_insn (part[0][2], part[1][2]);
5392 }
5393 else if (nparts == 4)
5394 {
5395 emit_move_insn (part[0][3], part[1][3]);
5396 emit_move_insn (part[0][2], part[1][2]);
5397 }
5398 }
5399 else
5400 {
5401 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
5402 register, that is OK: we will just use the larger counterpart.  We also
5403 retype memory; this comes from an attempt to avoid the REX prefix when
5404 moving the second half of a TFmode value. */
5405 if (GET_MODE (part[1][1]) == SImode)
5406 {
5407 switch (GET_CODE (part[1][1]))
5408 {
5409 case MEM:
5410 part[1][1] = adjust_address (part[1][1], DImode, 0);
5411 break;
5412
5413 case REG:
5414 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5415 break;
5416
5417 default:
5418 gcc_unreachable ();
5419 }
5420
5421 if (GET_MODE (part[1][0]) == SImode)
5422 part[1][0] = part[1][1];
5423 }
5424 }
5425 emit_move_insn (part[0][1], part[1][1]);
5426 emit_move_insn (part[0][0], part[1][0]);
5427 return;
5428 }
5429
5430 /* Choose correct order to not overwrite the source before it is copied. */
5431 if ((REG_P (part[0][0])
5432 && REG_P (part[1][1])
5433 && (REGNO (part[0][0]) == REGNO (part[1][1])
5434 || (nparts == 3
5435 && REGNO (part[0][0]) == REGNO (part[1][2]))
5436 || (nparts == 4
5437 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5438 || (collisions > 0
5439 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5440 {
5441 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5442 {
5443 operands[2 + i] = part[0][j];
5444 operands[6 + i] = part[1][j];
5445 }
5446 }
5447 else
5448 {
5449 for (i = 0; i < nparts; i++)
5450 {
5451 operands[2 + i] = part[0][i];
5452 operands[6 + i] = part[1][i];
5453 }
5454 }
5455
5456 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5457 if (optimize_insn_for_size_p ())
5458 {
5459 for (j = 0; j < nparts - 1; j++)
5460 if (CONST_INT_P (operands[6 + j])
5461 && operands[6 + j] != const0_rtx
5462 && REG_P (operands[2 + j]))
5463 for (i = j; i < nparts - 1; i++)
5464 if (CONST_INT_P (operands[7 + i])
5465 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5466 operands[7 + i] = operands[2 + j];
5467 }
5468
5469 for (i = 0; i < nparts; i++)
5470 emit_move_insn (operands[2 + i], operands[6 + i]);
5471
5472 return;
5473 }
5474
5475 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5476 left shift by a constant, either using a single shift or
5477 a sequence of add instructions. */
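/* For instance, a shift by 1 is always emitted as a single add, and a
   shift by 2 becomes two adds when two adds are no more expensive than
   one constant shift (and we are not optimizing for size).  */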
5478
5479 static void
5480 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5481 {
5482 if (count == 1
5483 || (count * ix86_cost->add <= ix86_cost->shift_const
5484 && !optimize_insn_for_size_p ()))
5485 {
5486 while (count-- > 0)
5487 emit_insn (gen_add2_insn (operand, operand));
5488 }
5489 else
5490 {
5491 rtx (*insn)(rtx, rtx, rtx);
5492
5493 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5494 emit_insn (insn (operand, operand, GEN_INT (count)));
5495 }
5496 }
5497
5498 void
5499 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5500 {
5501 rtx (*gen_ashl3)(rtx, rtx, rtx);
5502 rtx (*gen_shld)(rtx, rtx, rtx);
5503 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5504 machine_mode half_mode;
5505
5506 rtx low[2], high[2];
5507 int count;
5508
5509 if (CONST_INT_P (operands[2]))
5510 {
5511 split_double_mode (mode, operands, 2, low, high);
5512 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5513
5514 if (count >= half_width)
5515 {
5516 emit_move_insn (high[0], low[1]);
5517 emit_move_insn (low[0], const0_rtx);
5518
5519 if (count > half_width)
5520 ix86_expand_ashl_const (high[0], count - half_width, mode);
5521 }
5522 else
5523 {
5524 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5525
5526 if (!rtx_equal_p (operands[0], operands[1]))
5527 emit_move_insn (operands[0], operands[1]);
5528
5529 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5530 ix86_expand_ashl_const (low[0], count, mode);
5531 }
5532 return;
5533 }
5534
5535 split_double_mode (mode, operands, 1, low, high);
5536 half_mode = mode == DImode ? SImode : DImode;
5537
5538 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5539
5540 if (operands[1] == const1_rtx)
5541 {
5542 /* Assuming we've chosen QImode-capable registers, then 1 << N
5543 can be done with two 32/64-bit shifts, no branches, no cmoves. */
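/* Testing bit 5 (bit 6 for a TImode shift) of the count selects which
   half receives the 1; the final shifts then use only the low bits of
   the count.  */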
5544 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5545 {
5546 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5547
5548 ix86_expand_clear (low[0]);
5549 ix86_expand_clear (high[0]);
5550 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5551
5552 d = gen_lowpart (QImode, low[0]);
5553 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5554 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5555 emit_insn (gen_rtx_SET (d, s));
5556
5557 d = gen_lowpart (QImode, high[0]);
5558 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5559 s = gen_rtx_NE (QImode, flags, const0_rtx);
5560 emit_insn (gen_rtx_SET (d, s));
5561 }
5562
5563 /* Otherwise, we can get the same results by manually performing
5564 a bit extract operation on bit 5/6, and then performing the two
5565 shifts. The two methods of getting 0/1 into low/high are exactly
5566 the same size. Avoiding the shift in the bit extract case helps
5567 pentium4 a bit; no one else seems to care much either way. */
5568 else
5569 {
5570 rtx (*gen_lshr3)(rtx, rtx, rtx);
5571 rtx (*gen_and3)(rtx, rtx, rtx);
5572 rtx (*gen_xor3)(rtx, rtx, rtx);
5573 HOST_WIDE_INT bits;
5574 rtx x;
5575
5576 if (mode == DImode)
5577 {
5578 gen_lshr3 = gen_lshrsi3;
5579 gen_and3 = gen_andsi3;
5580 gen_xor3 = gen_xorsi3;
5581 bits = 5;
5582 }
5583 else
5584 {
5585 gen_lshr3 = gen_lshrdi3;
5586 gen_and3 = gen_anddi3;
5587 gen_xor3 = gen_xordi3;
5588 bits = 6;
5589 }
5590
5591 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5592 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5593 else
5594 x = gen_lowpart (half_mode, operands[2]);
5595 emit_insn (gen_rtx_SET (high[0], x));
5596
5597 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5598 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5599 emit_move_insn (low[0], high[0]);
5600 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5601 }
5602
5603 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5604 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5605 return;
5606 }
5607
5608 if (operands[1] == constm1_rtx)
5609 {
5610 /* For -1 << N, we can avoid the shld instruction, because we
5611 know that we're shifting 0...31/63 ones into a -1. */
5612 emit_move_insn (low[0], constm1_rtx);
5613 if (optimize_insn_for_size_p ())
5614 emit_move_insn (high[0], low[0]);
5615 else
5616 emit_move_insn (high[0], constm1_rtx);
5617 }
5618 else
5619 {
5620 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5621
5622 if (!rtx_equal_p (operands[0], operands[1]))
5623 emit_move_insn (operands[0], operands[1]);
5624
5625 split_double_mode (mode, operands, 1, low, high);
5626 emit_insn (gen_shld (high[0], low[0], operands[2]));
5627 }
5628
5629 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5630
5631 if (TARGET_CMOVE && scratch)
5632 {
5633 ix86_expand_clear (scratch);
5634 emit_insn (gen_x86_shift_adj_1
5635 (half_mode, high[0], low[0], operands[2], scratch));
5636 }
5637 else
5638 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5639 }
5640
5641 void
5642 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5643 {
5644 rtx (*gen_ashr3)(rtx, rtx, rtx)
5645 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5646 rtx (*gen_shrd)(rtx, rtx, rtx);
5647 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5648
5649 rtx low[2], high[2];
5650 int count;
5651
5652 if (CONST_INT_P (operands[2]))
5653 {
5654 split_double_mode (mode, operands, 2, low, high);
5655 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5656
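/* An arithmetic shift by the full width minus one simply replicates
   the sign bit into both halves.  */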
5657 if (count == GET_MODE_BITSIZE (mode) - 1)
5658 {
5659 emit_move_insn (high[0], high[1]);
5660 emit_insn (gen_ashr3 (high[0], high[0],
5661 GEN_INT (half_width - 1)));
5662 emit_move_insn (low[0], high[0]);
5663
5664 }
5665 else if (count >= half_width)
5666 {
5667 emit_move_insn (low[0], high[1]);
5668 emit_move_insn (high[0], low[0]);
5669 emit_insn (gen_ashr3 (high[0], high[0],
5670 GEN_INT (half_width - 1)));
5671
5672 if (count > half_width)
5673 emit_insn (gen_ashr3 (low[0], low[0],
5674 GEN_INT (count - half_width)));
5675 }
5676 else
5677 {
5678 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5679
5680 if (!rtx_equal_p (operands[0], operands[1]))
5681 emit_move_insn (operands[0], operands[1]);
5682
5683 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5684 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5685 }
5686 }
5687 else
5688 {
5689 machine_mode half_mode;
5690
5691 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5692
5693 if (!rtx_equal_p (operands[0], operands[1]))
5694 emit_move_insn (operands[0], operands[1]);
5695
5696 split_double_mode (mode, operands, 1, low, high);
5697 half_mode = mode == DImode ? SImode : DImode;
5698
5699 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5700 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5701
5702 if (TARGET_CMOVE && scratch)
5703 {
5704 emit_move_insn (scratch, high[0]);
5705 emit_insn (gen_ashr3 (scratch, scratch,
5706 GEN_INT (half_width - 1)));
5707 emit_insn (gen_x86_shift_adj_1
5708 (half_mode, low[0], high[0], operands[2], scratch));
5709 }
5710 else
5711 emit_insn (gen_x86_shift_adj_3
5712 (half_mode, low[0], high[0], operands[2]));
5713 }
5714 }
5715
5716 void
5717 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5718 {
5719 rtx (*gen_lshr3)(rtx, rtx, rtx)
5720 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5721 rtx (*gen_shrd)(rtx, rtx, rtx);
5722 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5723
5724 rtx low[2], high[2];
5725 int count;
5726
5727 if (CONST_INT_P (operands[2]))
5728 {
5729 split_double_mode (mode, operands, 2, low, high);
5730 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5731
5732 if (count >= half_width)
5733 {
5734 emit_move_insn (low[0], high[1]);
5735 ix86_expand_clear (high[0]);
5736
5737 if (count > half_width)
5738 emit_insn (gen_lshr3 (low[0], low[0],
5739 GEN_INT (count - half_width)));
5740 }
5741 else
5742 {
5743 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5744
5745 if (!rtx_equal_p (operands[0], operands[1]))
5746 emit_move_insn (operands[0], operands[1]);
5747
5748 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5749 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5750 }
5751 }
5752 else
5753 {
5754 machine_mode half_mode;
5755
5756 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5757
5758 if (!rtx_equal_p (operands[0], operands[1]))
5759 emit_move_insn (operands[0], operands[1]);
5760
5761 split_double_mode (mode, operands, 1, low, high);
5762 half_mode = mode == DImode ? SImode : DImode;
5763
5764 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5765 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5766
5767 if (TARGET_CMOVE && scratch)
5768 {
5769 ix86_expand_clear (scratch);
5770 emit_insn (gen_x86_shift_adj_1
5771 (half_mode, low[0], high[0], operands[2], scratch));
5772 }
5773 else
5774 emit_insn (gen_x86_shift_adj_2
5775 (half_mode, low[0], high[0], operands[2]));
5776 }
5777 }
5778
5779 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5780 DImode for constant loop counts. */
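/* For example, a constant count of 0x100000000 on a 64-bit target yields
   DImode, while a constant 0x1000 yields SImode.  */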
5781
5782 static machine_mode
5783 counter_mode (rtx count_exp)
5784 {
5785 if (GET_MODE (count_exp) != VOIDmode)
5786 return GET_MODE (count_exp);
5787 if (!CONST_INT_P (count_exp))
5788 return Pmode;
5789 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5790 return DImode;
5791 return SImode;
5792 }
5793
5794 /* When ISSETMEM is FALSE, output a simple loop to copy memory from SRCPTR
5795 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
5796 COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
5797 loop to set memory to VALUE (supposed to be in MODE).
5798
5799 The size is rounded down to a whole multiple of the chunk size moved at once.
5800 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
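/* Roughly, for the copy case with UNROLL == 1 the emitted code behaves
   like:
       size = count & -piece_size;
       iter = 0;
       do { dest[iter] = src[iter]; iter += piece_size; }
       while (iter < size);
       destptr += iter;  srcptr += iter;
   with branch probabilities attached to the jumps.  */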
5801
5802
5803 static void
5804 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5805 rtx destptr, rtx srcptr, rtx value,
5806 rtx count, machine_mode mode, int unroll,
5807 int expected_size, bool issetmem)
5808 {
5809 rtx_code_label *out_label, *top_label;
5810 rtx iter, tmp;
5811 machine_mode iter_mode = counter_mode (count);
5812 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5813 rtx piece_size = GEN_INT (piece_size_n);
5814 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5815 rtx size;
5816 int i;
5817
5818 top_label = gen_label_rtx ();
5819 out_label = gen_label_rtx ();
5820 iter = gen_reg_rtx (iter_mode);
5821
5822 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5823 NULL, 1, OPTAB_DIRECT);
5824 /* Those two should combine. */
5825 if (piece_size == const1_rtx)
5826 {
5827 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5828 true, out_label);
5829 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5830 }
5831 emit_move_insn (iter, const0_rtx);
5832
5833 emit_label (top_label);
5834
5835 tmp = convert_modes (Pmode, iter_mode, iter, true);
5836
5837 /* This assert could be relaxed; in that case we'd need to compute the
5838 smallest power of two containing PIECE_SIZE_N and pass it to
5839 offset_address. */
5840 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5841 destmem = offset_address (destmem, tmp, piece_size_n);
5842 destmem = adjust_address (destmem, mode, 0);
5843
5844 if (!issetmem)
5845 {
5846 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5847 srcmem = adjust_address (srcmem, mode, 0);
5848
5849 /* When unrolling for chips that reorder memory reads and writes,
5850 we can save registers by using a single temporary.
5851 Also, using 4 temporaries is overkill in 32-bit mode. */
5852 if (!TARGET_64BIT && 0)
5853 {
5854 for (i = 0; i < unroll; i++)
5855 {
5856 if (i)
5857 {
5858 destmem = adjust_address (copy_rtx (destmem), mode,
5859 GET_MODE_SIZE (mode));
5860 srcmem = adjust_address (copy_rtx (srcmem), mode,
5861 GET_MODE_SIZE (mode));
5862 }
5863 emit_move_insn (destmem, srcmem);
5864 }
5865 }
5866 else
5867 {
5868 rtx tmpreg[4];
5869 gcc_assert (unroll <= 4);
5870 for (i = 0; i < unroll; i++)
5871 {
5872 tmpreg[i] = gen_reg_rtx (mode);
5873 if (i)
5874 srcmem = adjust_address (copy_rtx (srcmem), mode,
5875 GET_MODE_SIZE (mode));
5876 emit_move_insn (tmpreg[i], srcmem);
5877 }
5878 for (i = 0; i < unroll; i++)
5879 {
5880 if (i)
5881 destmem = adjust_address (copy_rtx (destmem), mode,
5882 GET_MODE_SIZE (mode));
5883 emit_move_insn (destmem, tmpreg[i]);
5884 }
5885 }
5886 }
5887 else
5888 for (i = 0; i < unroll; i++)
5889 {
5890 if (i)
5891 destmem = adjust_address (copy_rtx (destmem), mode,
5892 GET_MODE_SIZE (mode));
5893 emit_move_insn (destmem, value);
5894 }
5895
5896 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5897 true, OPTAB_LIB_WIDEN);
5898 if (tmp != iter)
5899 emit_move_insn (iter, tmp);
5900
5901 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5902 true, top_label);
5903 if (expected_size != -1)
5904 {
5905 expected_size /= GET_MODE_SIZE (mode) * unroll;
5906 if (expected_size == 0)
5907 predict_jump (0);
5908 else if (expected_size > REG_BR_PROB_BASE)
5909 predict_jump (REG_BR_PROB_BASE - 1);
5910 else
5911 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5912 / expected_size);
5913 }
5914 else
5915 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5916 iter = ix86_zero_extend_to_Pmode (iter);
5917 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5918 true, OPTAB_LIB_WIDEN);
5919 if (tmp != destptr)
5920 emit_move_insn (destptr, tmp);
5921 if (!issetmem)
5922 {
5923 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5924 true, OPTAB_LIB_WIDEN);
5925 if (tmp != srcptr)
5926 emit_move_insn (srcptr, tmp);
5927 }
5928 emit_label (out_label);
5929 }
5930
5931 /* Divide COUNTREG by SCALE.  When COUNTREG is not a constant, SCALE must be a power of two, since the division is emitted as a logical right shift. */
5932 static rtx
5933 scale_counter (rtx countreg, int scale)
5934 {
5935 rtx sc;
5936
5937 if (scale == 1)
5938 return countreg;
5939 if (CONST_INT_P (countreg))
5940 return GEN_INT (INTVAL (countreg) / scale);
5941 gcc_assert (REG_P (countreg));
5942
5943 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5944 GEN_INT (exact_log2 (scale)),
5945 NULL, 1, OPTAB_DIRECT);
5946 return sc;
5947 }
5948
5949 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5950 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5951 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5952 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size;
5953 ORIG_VALUE is the original value passed to memset to fill the memory with.
5954 Other arguments have the same meaning as for the previous function. */
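/* DESTEXP and SRCEXP computed below describe the final values of the
   pointer registers after the rep insn, as expected by the rep_stos and
   rep_mov patterns.  */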
5955
5956 static void
5957 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5958 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5959 rtx count,
5960 machine_mode mode, bool issetmem)
5961 {
5962 rtx destexp;
5963 rtx srcexp;
5964 rtx countreg;
5965 HOST_WIDE_INT rounded_count;
5966
5967 /* If possible, it is shorter to use rep movs.
5968 TODO: Maybe it is better to move this logic to decide_alg. */
5969 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5970 && (!issetmem || orig_value == const0_rtx))
5971 mode = SImode;
5972
5973 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5974 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5975
5976 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5977 GET_MODE_SIZE (mode)));
5978 if (mode != QImode)
5979 {
5980 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5981 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5982 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5983 }
5984 else
5985 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5986 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5987 {
5988 rounded_count
5989 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5990 destmem = shallow_copy_rtx (destmem);
5991 set_mem_size (destmem, rounded_count);
5992 }
5993 else if (MEM_SIZE_KNOWN_P (destmem))
5994 clear_mem_size (destmem);
5995
5996 if (issetmem)
5997 {
5998 value = force_reg (mode, gen_lowpart (mode, value));
5999 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6000 }
6001 else
6002 {
6003 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6004 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6005 if (mode != QImode)
6006 {
6007 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6008 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6009 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6010 }
6011 else
6012 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6013 if (CONST_INT_P (count))
6014 {
6015 rounded_count
6016 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6017 srcmem = shallow_copy_rtx (srcmem);
6018 set_mem_size (srcmem, rounded_count);
6019 }
6020 else
6021 {
6022 if (MEM_SIZE_KNOWN_P (srcmem))
6023 clear_mem_size (srcmem);
6024 }
6025 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6026 destexp, srcexp));
6027 }
6028 }
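/* For illustration: for a zeroing memset with a constant COUNT divisible by
   4, the QImode request is widened to SImode above, so the emitted sequence
   is roughly

     mov  $COUNT/4, %ecx
     rep stosl                 (instead of "rep stosb" with %ecx = COUNT)

   with the destination in %edi and the value in %eax, as required by the
   rep-prefixed patterns.  */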
6029
6030 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6031 DESTMEM.
6032 SRCMEM is passed by pointer so it can be updated on return.
6033 The return value is the updated DESTMEM. */
6034 static rtx
6035 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6036 HOST_WIDE_INT size_to_move)
6037 {
6038 rtx dst = destmem, src = *srcmem, adjust, tempreg;
6039 enum insn_code code;
6040 machine_mode move_mode;
6041 int piece_size, i;
6042
6043 /* Find the widest mode in which we could perform moves.
6044 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6045 it until a move of that size is supported. */
6046 piece_size = 1 << floor_log2 (size_to_move);
6047 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6048 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6049 {
6050 gcc_assert (piece_size > 1);
6051 piece_size >>= 1;
6052 }
6053
6054 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6055 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6056 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6057 {
6058 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6059 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6060 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6061 {
6062 move_mode = word_mode;
6063 piece_size = GET_MODE_SIZE (move_mode);
6064 code = optab_handler (mov_optab, move_mode);
6065 }
6066 }
6067 gcc_assert (code != CODE_FOR_nothing);
6068
6069 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6070 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6071
6072 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6073 gcc_assert (size_to_move % piece_size == 0);
6074 adjust = GEN_INT (piece_size);
6075 for (i = 0; i < size_to_move; i += piece_size)
6076 {
6077 /* We move from memory to memory, so we'll need to do it via
6078 a temporary register. */
6079 tempreg = gen_reg_rtx (move_mode);
6080 emit_insn (GEN_FCN (code) (tempreg, src));
6081 emit_insn (GEN_FCN (code) (dst, tempreg));
6082
6083 emit_move_insn (destptr,
6084 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6085 emit_move_insn (srcptr,
6086 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6087
6088 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6089 piece_size);
6090 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6091 piece_size);
6092 }
6093
6094 /* Update DST and SRC rtx. */
6095 *srcmem = src;
6096 return dst;
6097 }
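/* For illustration: callers pass SIZE_TO_MOVE values that are multiples of
   the piece size eventually chosen (typically a single power-of-two chunk
   from the epilogue or prologue code).  E.g. SIZE_TO_MOVE == 8 on a 64-bit
   target becomes one DImode load/store pair through a temporary register,
   while SIZE_TO_MOVE == 16 with SSE2 enabled is performed as a single
   16-byte vector move (V2DImode) thanks to the mode_for_vector step
   above.  */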
6098
6099 /* Helper function for the string operations below. Test whether VARIABLE
6100 is aligned to VALUE bytes; if it is, jump to the label. */
6101
6102 static rtx_code_label *
6103 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6104 {
6105 rtx_code_label *label = gen_label_rtx ();
6106 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6107 if (GET_MODE (variable) == DImode)
6108 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6109 else
6110 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6111 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6112 1, label);
6113 if (epilogue)
6114 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6115 else
6116 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6117 return label;
6118 }
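/* For illustration: with VALUE == 4 this emits roughly

     test $4, %reg
     je   .Llabel

   so the returned label is reached when the 4 bit of VARIABLE is clear, and
   the code the caller emits between this call and emit_label handles the
   4-byte piece that is needed when the bit is set.  */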
6119
6120
6121 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6122
6123 static void
6124 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6125 rtx destptr, rtx srcptr, rtx count, int max_size)
6126 {
6127 rtx src, dest;
6128 if (CONST_INT_P (count))
6129 {
6130 HOST_WIDE_INT countval = INTVAL (count);
6131 HOST_WIDE_INT epilogue_size = countval % max_size;
6132 int i;
6133
6134 /* For now MAX_SIZE should be a power of 2. This assert could be
6135 relaxed, but it'll require a bit more complicated epilogue
6136 expanding. */
6137 gcc_assert ((max_size & (max_size - 1)) == 0);
6138 for (i = max_size; i >= 1; i >>= 1)
6139 {
6140 if (epilogue_size & i)
6141 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6142 }
6143 return;
6144 }
6145 if (max_size > 8)
6146 {
6147 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6148 count, 1, OPTAB_DIRECT);
6149 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6150 count, QImode, 1, 4, false);
6151 return;
6152 }
6153
6154 /* When single stringop instructions are cheap, we can use them to increase
6155 the dest and src pointers for free. Otherwise we save code size by
6156 maintaining an offset (zero is readily available from the preceding rep
6157 operation) and using x86 addressing modes. */
6158 if (TARGET_SINGLE_STRINGOP)
6159 {
6160 if (max_size > 4)
6161 {
6162 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6163 src = change_address (srcmem, SImode, srcptr);
6164 dest = change_address (destmem, SImode, destptr);
6165 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6166 emit_label (label);
6167 LABEL_NUSES (label) = 1;
6168 }
6169 if (max_size > 2)
6170 {
6171 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6172 src = change_address (srcmem, HImode, srcptr);
6173 dest = change_address (destmem, HImode, destptr);
6174 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6175 emit_label (label);
6176 LABEL_NUSES (label) = 1;
6177 }
6178 if (max_size > 1)
6179 {
6180 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6181 src = change_address (srcmem, QImode, srcptr);
6182 dest = change_address (destmem, QImode, destptr);
6183 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6184 emit_label (label);
6185 LABEL_NUSES (label) = 1;
6186 }
6187 }
6188 else
6189 {
6190 rtx offset = force_reg (Pmode, const0_rtx);
6191 rtx tmp;
6192
6193 if (max_size > 4)
6194 {
6195 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6196 src = change_address (srcmem, SImode, srcptr);
6197 dest = change_address (destmem, SImode, destptr);
6198 emit_move_insn (dest, src);
6199 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6200 true, OPTAB_LIB_WIDEN);
6201 if (tmp != offset)
6202 emit_move_insn (offset, tmp);
6203 emit_label (label);
6204 LABEL_NUSES (label) = 1;
6205 }
6206 if (max_size > 2)
6207 {
6208 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6209 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6210 src = change_address (srcmem, HImode, tmp);
6211 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6212 dest = change_address (destmem, HImode, tmp);
6213 emit_move_insn (dest, src);
6214 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6215 true, OPTAB_LIB_WIDEN);
6216 if (tmp != offset)
6217 emit_move_insn (offset, tmp);
6218 emit_label (label);
6219 LABEL_NUSES (label) = 1;
6220 }
6221 if (max_size > 1)
6222 {
6223 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6224 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6225 src = change_address (srcmem, QImode, tmp);
6226 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6227 dest = change_address (destmem, QImode, tmp);
6228 emit_move_insn (dest, src);
6229 emit_label (label);
6230 LABEL_NUSES (label) = 1;
6231 }
6232 }
6233 }
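/* For illustration: with a constant COUNT of 23 and MAX_SIZE of 16 the
   epilogue size is 23 % 16 == 7, so the loop above emits straight-line
   copies of 4, 2 and 1 bytes.  With a non-constant COUNT the same piece
   sizes are instead emitted behind ix86_expand_aligntest guards on the low
   bits of COUNT (or via a byte loop when MAX_SIZE > 8).  */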
6234
6235 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6236 with value PROMOTED_VAL.
6237 DESTPTR is the register holding the destination address.
6238 The return value is the updated DESTMEM. */
6239 static rtx
6240 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6241 HOST_WIDE_INT size_to_move)
6242 {
6243 rtx dst = destmem, adjust;
6244 enum insn_code code;
6245 machine_mode move_mode;
6246 int piece_size, i;
6247
6248 /* Find the widest mode in which we could perform moves.
6249 Start with the mode of PROMOTED_VAL and narrow it if SIZE_TO_MOVE
6250 is smaller than that mode's size. */
6251 move_mode = GET_MODE (promoted_val);
6252 if (move_mode == VOIDmode)
6253 move_mode = QImode;
6254 if (size_to_move < GET_MODE_SIZE (move_mode))
6255 {
6256 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6257 move_mode = int_mode_for_size (move_bits, 0).require ();
6258 promoted_val = gen_lowpart (move_mode, promoted_val);
6259 }
6260 piece_size = GET_MODE_SIZE (move_mode);
6261 code = optab_handler (mov_optab, move_mode);
6262 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6263
6264 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6265
6266 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6267 gcc_assert (size_to_move % piece_size == 0);
6268 adjust = GEN_INT (piece_size);
6269 for (i = 0; i < size_to_move; i += piece_size)
6270 {
6271 if (piece_size <= GET_MODE_SIZE (word_mode))
6272 {
6273 emit_insn (gen_strset (destptr, dst, promoted_val));
6274 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6275 piece_size);
6276 continue;
6277 }
6278
6279 emit_insn (GEN_FCN (code) (dst, promoted_val));
6280
6281 emit_move_insn (destptr,
6282 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6283
6284 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6285 piece_size);
6286 }
6287
6288 /* Update DST rtx. */
6289 return dst;
6290 }
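/* For illustration: with PROMOTED_VAL in SImode and SIZE_TO_MOVE == 2, the
   value is first narrowed to HImode and a single 2-byte store is emitted;
   with a 16-byte vector PROMOTED_VAL and SIZE_TO_MOVE == 32, two full-width
   vector stores are emitted, with DESTPTR advanced by 16 after each.  */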
6291 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6292 static void
6293 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6294 rtx count, int max_size)
6295 {
6296 count = expand_simple_binop (counter_mode (count), AND, count,
6297 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6298 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6299 gen_lowpart (QImode, value), count, QImode,
6300 1, max_size / 2, true);
6301 }
6302
6303 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6304 static void
6305 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6306 rtx count, int max_size)
6307 {
6308 rtx dest;
6309
6310 if (CONST_INT_P (count))
6311 {
6312 HOST_WIDE_INT countval = INTVAL (count);
6313 HOST_WIDE_INT epilogue_size = countval % max_size;
6314 int i;
6315
6316 /* For now MAX_SIZE should be a power of 2. This assert could be
6317 relaxed, but it'll require a bit more complicated epilogue
6318 expanding. */
6319 gcc_assert ((max_size & (max_size - 1)) == 0);
6320 for (i = max_size; i >= 1; i >>= 1)
6321 {
6322 if (epilogue_size & i)
6323 {
6324 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6325 destmem = emit_memset (destmem, destptr, vec_value, i);
6326 else
6327 destmem = emit_memset (destmem, destptr, value, i);
6328 }
6329 }
6330 return;
6331 }
6332 if (max_size > 32)
6333 {
6334 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6335 return;
6336 }
6337 if (max_size > 16)
6338 {
6339 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6340 if (TARGET_64BIT)
6341 {
6342 dest = change_address (destmem, DImode, destptr);
6343 emit_insn (gen_strset (destptr, dest, value));
6344 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6345 emit_insn (gen_strset (destptr, dest, value));
6346 }
6347 else
6348 {
6349 dest = change_address (destmem, SImode, destptr);
6350 emit_insn (gen_strset (destptr, dest, value));
6351 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6352 emit_insn (gen_strset (destptr, dest, value));
6353 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6354 emit_insn (gen_strset (destptr, dest, value));
6355 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6356 emit_insn (gen_strset (destptr, dest, value));
6357 }
6358 emit_label (label);
6359 LABEL_NUSES (label) = 1;
6360 }
6361 if (max_size > 8)
6362 {
6363 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6364 if (TARGET_64BIT)
6365 {
6366 dest = change_address (destmem, DImode, destptr);
6367 emit_insn (gen_strset (destptr, dest, value));
6368 }
6369 else
6370 {
6371 dest = change_address (destmem, SImode, destptr);
6372 emit_insn (gen_strset (destptr, dest, value));
6373 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6374 emit_insn (gen_strset (destptr, dest, value));
6375 }
6376 emit_label (label);
6377 LABEL_NUSES (label) = 1;
6378 }
6379 if (max_size > 4)
6380 {
6381 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6382 dest = change_address (destmem, SImode, destptr);
6383 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6384 emit_label (label);
6385 LABEL_NUSES (label) = 1;
6386 }
6387 if (max_size > 2)
6388 {
6389 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6390 dest = change_address (destmem, HImode, destptr);
6391 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6392 emit_label (label);
6393 LABEL_NUSES (label) = 1;
6394 }
6395 if (max_size > 1)
6396 {
6397 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6398 dest = change_address (destmem, QImode, destptr);
6399 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6400 emit_label (label);
6401 LABEL_NUSES (label) = 1;
6402 }
6403 }
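/* For illustration: for a non-constant COUNT with MAX_SIZE == 16 on a
   64-bit target the code above expands to roughly

     if (count & 8) store 8 bytes;
     if (count & 4) store 4 bytes;
     if (count & 2) store 2 bytes;
     if (count & 1) store 1 byte;

   each store using the promoted VALUE at the matching width.  */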
6404
6405 /* Adjust COUNTER by the VALUE. */
6406 static void
6407 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6408 {
6409 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6410 }
6411
6412 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6413 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6414 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6415 ignored.
6416 Return value is updated DESTMEM. */
6417
6418 static rtx
6419 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6420 rtx destptr, rtx srcptr, rtx value,
6421 rtx vec_value, rtx count, int align,
6422 int desired_alignment, bool issetmem)
6423 {
6424 int i;
6425 for (i = 1; i < desired_alignment; i <<= 1)
6426 {
6427 if (align <= i)
6428 {
6429 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6430 if (issetmem)
6431 {
6432 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6433 destmem = emit_memset (destmem, destptr, vec_value, i);
6434 else
6435 destmem = emit_memset (destmem, destptr, value, i);
6436 }
6437 else
6438 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6439 ix86_adjust_counter (count, i);
6440 emit_label (label);
6441 LABEL_NUSES (label) = 1;
6442 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6443 }
6444 }
6445 return destmem;
6446 }
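/* For illustration: with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits
   three guarded pieces of 1, 2 and 4 bytes, each predicated on the
   corresponding low bit of DESTPTR, and decrements COUNT by each piece that
   is actually written, so that afterwards DESTPTR is 8-byte aligned.  */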
6447
6448 /* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6449 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6450 and jump to DONE_LABEL. */
6451 static void
6452 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6453 rtx destptr, rtx srcptr,
6454 rtx value, rtx vec_value,
6455 rtx count, int size,
6456 rtx done_label, bool issetmem)
6457 {
6458 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6459 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6460 rtx modesize;
6461 int n;
6462
6463 /* If we do not have a vector value to copy, we must reduce the size. */
6464 if (issetmem)
6465 {
6466 if (!vec_value)
6467 {
6468 if (GET_MODE (value) == VOIDmode && size > 8)
6469 mode = Pmode;
6470 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6471 mode = GET_MODE (value);
6472 }
6473 else
6474 mode = GET_MODE (vec_value), value = vec_value;
6475 }
6476 else
6477 {
6478 /* Choose appropriate vector mode. */
6479 if (size >= 32)
6480 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6481 else if (size >= 16)
6482 mode = TARGET_SSE ? V16QImode : DImode;
6483 srcmem = change_address (srcmem, mode, srcptr);
6484 }
6485 destmem = change_address (destmem, mode, destptr);
6486 modesize = GEN_INT (GET_MODE_SIZE (mode));
6487 gcc_assert (GET_MODE_SIZE (mode) <= size);
6488 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6489 {
6490 if (issetmem)
6491 emit_move_insn (destmem, gen_lowpart (mode, value));
6492 else
6493 {
6494 emit_move_insn (destmem, srcmem);
6495 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6496 }
6497 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6498 }
6499
6500 destmem = offset_address (destmem, count, 1);
6501 destmem = offset_address (destmem, GEN_INT (-2 * size),
6502 GET_MODE_SIZE (mode));
6503 if (!issetmem)
6504 {
6505 srcmem = offset_address (srcmem, count, 1);
6506 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6507 GET_MODE_SIZE (mode));
6508 }
6509 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6510 {
6511 if (issetmem)
6512 emit_move_insn (destmem, gen_lowpart (mode, value));
6513 else
6514 {
6515 emit_move_insn (destmem, srcmem);
6516 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6517 }
6518 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6519 }
6520 emit_jump_insn (gen_jump (done_label));
6521 emit_barrier ();
6522
6523 emit_label (label);
6524 LABEL_NUSES (label) = 1;
6525 }
6526
6527 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
6528 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6529 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
6530 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6531 DONE_LABEL is a label after the whole copying sequence. The label is created
6532 on demand if *DONE_LABEL is NULL.
6533 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for
6534 the new bounds after the initial copies.
6535
6536 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6537 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6538 we will dispatch to a library call for large blocks.
6539
6540 In pseudocode we do:
6541
6542 if (COUNT < SIZE)
6543 {
6544 Assume that SIZE is 4. Bigger sizes are handled analogously
6545 if (COUNT & 4)
6546 {
6547 copy 4 bytes from SRCPTR to DESTPTR
6548 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6549 goto done_label
6550 }
6551 if (!COUNT)
6552 goto done_label;
6553 copy 1 byte from SRCPTR to DESTPTR
6554 if (COUNT & 2)
6555 {
6556 copy 2 bytes from SRCPTR to DESTPTR
6557 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6558 }
6559 }
6560 else
6561 {
6562 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6563 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
6564
6565 OLD_DESTPTR = DESTPTR;
6566 Align DESTPTR up to DESIRED_ALIGN
6567 SRCPTR += DESTPTR - OLD_DESTPTR
6568 COUNT -= DESTPTR - OLD_DESTPTR
6569 if (DYNAMIC_CHECK)
6570 Round COUNT down to multiple of SIZE
6571 << optional caller supplied zero size guard is here >>
6572 << optional caller supplied dynamic check is here >>
6573 << caller supplied main copy loop is here >>
6574 }
6575 done_label:
6576 */
6577 static void
6578 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6579 rtx *destptr, rtx *srcptr,
6580 machine_mode mode,
6581 rtx value, rtx vec_value,
6582 rtx *count,
6583 rtx_code_label **done_label,
6584 int size,
6585 int desired_align,
6586 int align,
6587 unsigned HOST_WIDE_INT *min_size,
6588 bool dynamic_check,
6589 bool issetmem)
6590 {
6591 rtx_code_label *loop_label = NULL, *label;
6592 int n;
6593 rtx modesize;
6594 int prolog_size = 0;
6595 rtx mode_value;
6596
6597 /* Choose the proper value to copy. */
6598 if (issetmem && VECTOR_MODE_P (mode))
6599 mode_value = vec_value;
6600 else
6601 mode_value = value;
6602 gcc_assert (GET_MODE_SIZE (mode) <= size);
6603
6604 /* See if block is big or small, handle small blocks. */
6605 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6606 {
6607 int size2 = size;
6608 loop_label = gen_label_rtx ();
6609
6610 if (!*done_label)
6611 *done_label = gen_label_rtx ();
6612
6613 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6614 1, loop_label);
6615 size2 >>= 1;
6616
6617 /* Handle sizes > 3. */
6618 for (;size2 > 2; size2 >>= 1)
6619 expand_small_cpymem_or_setmem (destmem, srcmem,
6620 *destptr, *srcptr,
6621 value, vec_value,
6622 *count,
6623 size2, *done_label, issetmem);
6624 /* Nothing to copy? Jump to DONE_LABEL if so. */
6625 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6626 1, *done_label);
6627
6628 /* Do a byte copy. */
6629 destmem = change_address (destmem, QImode, *destptr);
6630 if (issetmem)
6631 emit_move_insn (destmem, gen_lowpart (QImode, value));
6632 else
6633 {
6634 srcmem = change_address (srcmem, QImode, *srcptr);
6635 emit_move_insn (destmem, srcmem);
6636 }
6637
6638 /* Handle sizes 2 and 3. */
6639 label = ix86_expand_aligntest (*count, 2, false);
6640 destmem = change_address (destmem, HImode, *destptr);
6641 destmem = offset_address (destmem, *count, 1);
6642 destmem = offset_address (destmem, GEN_INT (-2), 2);
6643 if (issetmem)
6644 emit_move_insn (destmem, gen_lowpart (HImode, value));
6645 else
6646 {
6647 srcmem = change_address (srcmem, HImode, *srcptr);
6648 srcmem = offset_address (srcmem, *count, 1);
6649 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6650 emit_move_insn (destmem, srcmem);
6651 }
6652
6653 emit_label (label);
6654 LABEL_NUSES (label) = 1;
6655 emit_jump_insn (gen_jump (*done_label));
6656 emit_barrier ();
6657 }
6658 else
6659 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6660 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6661
6662 /* Start memcpy for COUNT >= SIZE. */
6663 if (loop_label)
6664 {
6665 emit_label (loop_label);
6666 LABEL_NUSES (loop_label) = 1;
6667 }
6668
6669 /* Copy first desired_align bytes. */
6670 if (!issetmem)
6671 srcmem = change_address (srcmem, mode, *srcptr);
6672 destmem = change_address (destmem, mode, *destptr);
6673 modesize = GEN_INT (GET_MODE_SIZE (mode));
6674 for (n = 0; prolog_size < desired_align - align; n++)
6675 {
6676 if (issetmem)
6677 emit_move_insn (destmem, mode_value);
6678 else
6679 {
6680 emit_move_insn (destmem, srcmem);
6681 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6682 }
6683 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6684 prolog_size += GET_MODE_SIZE (mode);
6685 }
6686
6687
6688 /* Copy last SIZE bytes. */
6689 destmem = offset_address (destmem, *count, 1);
6690 destmem = offset_address (destmem,
6691 GEN_INT (-size - prolog_size),
6692 1);
6693 if (issetmem)
6694 emit_move_insn (destmem, mode_value);
6695 else
6696 {
6697 srcmem = offset_address (srcmem, *count, 1);
6698 srcmem = offset_address (srcmem,
6699 GEN_INT (-size - prolog_size),
6700 1);
6701 emit_move_insn (destmem, srcmem);
6702 }
6703 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6704 {
6705 destmem = offset_address (destmem, modesize, 1);
6706 if (issetmem)
6707 emit_move_insn (destmem, mode_value);
6708 else
6709 {
6710 srcmem = offset_address (srcmem, modesize, 1);
6711 emit_move_insn (destmem, srcmem);
6712 }
6713 }
6714
6715 /* Align destination. */
6716 if (desired_align > 1 && desired_align > align)
6717 {
6718 rtx saveddest = *destptr;
6719
6720 gcc_assert (desired_align <= size);
6721 /* Align destptr up, place it to new register. */
6722 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6723 GEN_INT (prolog_size),
6724 NULL_RTX, 1, OPTAB_DIRECT);
6725 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6726 REG_POINTER (*destptr) = 1;
6727 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6728 GEN_INT (-desired_align),
6729 *destptr, 1, OPTAB_DIRECT);
6730 /* See how many bytes we skipped. */
6731 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6732 *destptr,
6733 saveddest, 1, OPTAB_DIRECT);
6734 /* Adjust srcptr and count. */
6735 if (!issetmem)
6736 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6737 saveddest, *srcptr, 1, OPTAB_DIRECT);
6738 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6739 saveddest, *count, 1, OPTAB_DIRECT);
6740 /* We copied at most size + prolog_size. */
6741 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6742 *min_size
6743 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6744 else
6745 *min_size = 0;
6746
6747 /* Our loops always round down the block size, but for dispatch to
6748 library we need precise value. */
6749 if (dynamic_check)
6750 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6751 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6752 }
6753 else
6754 {
6755 gcc_assert (prolog_size == 0);
6756 /* Decrease count, so we won't end up copying last word twice. */
6757 if (!CONST_INT_P (*count))
6758 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6759 constm1_rtx, *count, 1, OPTAB_DIRECT);
6760 else
6761 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6762 (unsigned HOST_WIDE_INT)size));
6763 if (*min_size)
6764 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6765 }
6766 }
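/* For illustration: for a memcpy with SIZE == 16 and an unknown COUNT this
   first handles blocks smaller than 16 bytes with overlapping 8/4/2/1-byte
   copies that jump straight to DONE_LABEL; for larger blocks it copies the
   first DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes with possibly
   misaligned moves, then rounds DESTPTR up (adjusting SRCPTR, COUNT and
   MIN_SIZE to match) so that the caller's main loop can run on aligned
   SIZE-byte chunks without a separate epilogue.  */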
6767
6768
6769 /* This function is like the previous one, except here we know how many bytes
6770 need to be copied. That allows us to update alignment not only of DST, which
6771 is returned, but also of SRC, which is passed as a pointer for that
6772 reason. */
6773 static rtx
6774 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6775 rtx srcreg, rtx value, rtx vec_value,
6776 int desired_align, int align_bytes,
6777 bool issetmem)
6778 {
6779 rtx src = NULL;
6780 rtx orig_dst = dst;
6781 rtx orig_src = NULL;
6782 int piece_size = 1;
6783 int copied_bytes = 0;
6784
6785 if (!issetmem)
6786 {
6787 gcc_assert (srcp != NULL);
6788 src = *srcp;
6789 orig_src = src;
6790 }
6791
6792 for (piece_size = 1;
6793 piece_size <= desired_align && copied_bytes < align_bytes;
6794 piece_size <<= 1)
6795 {
6796 if (align_bytes & piece_size)
6797 {
6798 if (issetmem)
6799 {
6800 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6801 dst = emit_memset (dst, destreg, vec_value, piece_size);
6802 else
6803 dst = emit_memset (dst, destreg, value, piece_size);
6804 }
6805 else
6806 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6807 copied_bytes += piece_size;
6808 }
6809 }
6810 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6811 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6812 if (MEM_SIZE_KNOWN_P (orig_dst))
6813 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6814
6815 if (!issetmem)
6816 {
6817 int src_align_bytes = get_mem_align_offset (src, desired_align
6818 * BITS_PER_UNIT);
6819 if (src_align_bytes >= 0)
6820 src_align_bytes = desired_align - src_align_bytes;
6821 if (src_align_bytes >= 0)
6822 {
6823 unsigned int src_align;
6824 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6825 {
6826 if ((src_align_bytes & (src_align - 1))
6827 == (align_bytes & (src_align - 1)))
6828 break;
6829 }
6830 if (src_align > (unsigned int) desired_align)
6831 src_align = desired_align;
6832 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6833 set_mem_align (src, src_align * BITS_PER_UNIT);
6834 }
6835 if (MEM_SIZE_KNOWN_P (orig_src))
6836 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6837 *srcp = src;
6838 }
6839
6840 return dst;
6841 }
6842
6843 /* Return true if ALG can be used in current context.
6844 Assume we expand memset if MEMSET is true. */
6845 static bool
6846 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6847 {
6848 if (alg == no_stringop)
6849 return false;
6850 if (alg == vector_loop)
6851 return TARGET_SSE || TARGET_AVX;
6852 /* Algorithms using the rep prefix want at least edi and ecx;
6853 additionally, memset wants eax and memcpy wants esi. Don't
6854 consider such algorithms if the user has appropriated those
6855 registers for their own purposes, or if we have a non-default
6856 address space, since some string insns cannot override the segment. */
6857 if (alg == rep_prefix_1_byte
6858 || alg == rep_prefix_4_byte
6859 || alg == rep_prefix_8_byte)
6860 {
6861 if (have_as)
6862 return false;
6863 if (fixed_regs[CX_REG]
6864 || fixed_regs[DI_REG]
6865 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6866 return false;
6867 }
6868 return true;
6869 }
6870
6871 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6872 static enum stringop_alg
6873 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6874 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6875 bool memset, bool zero_memset, bool have_as,
6876 int *dynamic_check, bool *noalign, bool recur)
6877 {
6878 const struct stringop_algs *algs;
6879 bool optimize_for_speed;
6880 int max = 0;
6881 const struct processor_costs *cost;
6882 int i;
6883 bool any_alg_usable_p = false;
6884
6885 *noalign = false;
6886 *dynamic_check = -1;
6887
6888 /* Even if the string operation call is cold, we still might spend a lot
6889 of time processing large blocks. */
6890 if (optimize_function_for_size_p (cfun)
6891 || (optimize_insn_for_size_p ()
6892 && (max_size < 256
6893 || (expected_size != -1 && expected_size < 256))))
6894 optimize_for_speed = false;
6895 else
6896 optimize_for_speed = true;
6897
6898 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6899 if (memset)
6900 algs = &cost->memset[TARGET_64BIT != 0];
6901 else
6902 algs = &cost->memcpy[TARGET_64BIT != 0];
6903
6904 /* See maximal size for user defined algorithm. */
6905 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6906 {
6907 enum stringop_alg candidate = algs->size[i].alg;
6908 bool usable = alg_usable_p (candidate, memset, have_as);
6909 any_alg_usable_p |= usable;
6910
6911 if (candidate != libcall && candidate && usable)
6912 max = algs->size[i].max;
6913 }
6914
6915 /* If expected size is not known but max size is small enough
6916 so inline version is a win, set expected size into
6917 the range. */
6918 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6919 && expected_size == -1)
6920 expected_size = min_size / 2 + max_size / 2;
6921
6922 /* If user specified the algorithm, honor it if possible. */
6923 if (ix86_stringop_alg != no_stringop
6924 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6925 return ix86_stringop_alg;
6926 /* rep; movq or rep; movl is the smallest variant. */
6927 else if (!optimize_for_speed)
6928 {
6929 *noalign = true;
6930 if (!count || (count & 3) || (memset && !zero_memset))
6931 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6932 ? rep_prefix_1_byte : loop_1_byte;
6933 else
6934 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6935 ? rep_prefix_4_byte : loop;
6936 }
6937 /* Very tiny blocks are best handled via the loop; REP is expensive to
6938 set up. */
6939 else if (expected_size != -1 && expected_size < 4)
6940 return loop_1_byte;
6941 else if (expected_size != -1)
6942 {
6943 enum stringop_alg alg = libcall;
6944 bool alg_noalign = false;
6945 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6946 {
6947 /* We get here if the algorithms that were not libcall-based
6948 were rep-prefix based and we are unable to use rep prefixes
6949 based on global register usage. Break out of the loop and
6950 use the heuristic below. */
6951 if (algs->size[i].max == 0)
6952 break;
6953 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6954 {
6955 enum stringop_alg candidate = algs->size[i].alg;
6956
6957 if (candidate != libcall
6958 && alg_usable_p (candidate, memset, have_as))
6959 {
6960 alg = candidate;
6961 alg_noalign = algs->size[i].noalign;
6962 }
6963 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6964 last non-libcall inline algorithm. */
6965 if (TARGET_INLINE_ALL_STRINGOPS)
6966 {
6967 /* When the current size is best to be copied by a libcall,
6968 but we are still forced to inline, run the heuristic below
6969 that will pick code for medium sized blocks. */
6970 if (alg != libcall)
6971 {
6972 *noalign = alg_noalign;
6973 return alg;
6974 }
6975 else if (!any_alg_usable_p)
6976 break;
6977 }
6978 else if (alg_usable_p (candidate, memset, have_as))
6979 {
6980 *noalign = algs->size[i].noalign;
6981 return candidate;
6982 }
6983 }
6984 }
6985 }
6986 /* When asked to inline the call anyway, try to pick a meaningful choice.
6987 We look for the maximal size of block that is faster to copy by hand and
6988 take blocks of at most that size, guessing that the average size will
6989 be roughly half of the block.
6990
6991 If this turns out to be bad, we might simply specify the preferred
6992 choice in ix86_costs. */
6993 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6994 && (algs->unknown_size == libcall
6995 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6996 {
6997 enum stringop_alg alg;
6998 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6999
7000 /* If there aren't any usable algorithms or if recursing already,
7001 then recursing on smaller sizes or same size isn't going to
7002 find anything. Just return the simple byte-at-a-time copy loop. */
7003 if (!any_alg_usable_p || recur)
7004 {
7005 /* Pick something reasonable. */
7006 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7007 *dynamic_check = 128;
7008 return loop_1_byte;
7009 }
7010 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7011 zero_memset, have_as, dynamic_check, noalign, true);
7012 gcc_assert (*dynamic_check == -1);
7013 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7014 *dynamic_check = max;
7015 else
7016 gcc_assert (alg != libcall);
7017 return alg;
7018 }
7019 return (alg_usable_p (algs->unknown_size, memset, have_as)
7020 ? algs->unknown_size : libcall);
7021 }
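/* For illustration: the choice ultimately comes from the per-CPU cost
   tables (ix86_cost->memcpy / ix86_cost->memset) unless the user forces an
   algorithm with -mstringop-strategy=.  When optimizing for size a
   rep-prefixed variant is preferred whenever it is usable, and when the
   expected size is tiny (less than 4 bytes) the plain byte loop wins
   because the rep setup overhead would dominate.  */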
7022
7023 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7024 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7025 static int
7026 decide_alignment (int align,
7027 enum stringop_alg alg,
7028 int expected_size,
7029 machine_mode move_mode)
7030 {
7031 int desired_align = 0;
7032
7033 gcc_assert (alg != no_stringop);
7034
7035 if (alg == libcall)
7036 return 0;
7037 if (move_mode == VOIDmode)
7038 return 0;
7039
7040 desired_align = GET_MODE_SIZE (move_mode);
7041 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
7042 copying a whole cacheline at once. */
7043 if (TARGET_PENTIUMPRO
7044 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7045 desired_align = 8;
7046
7047 if (optimize_size)
7048 desired_align = 1;
7049 if (desired_align < align)
7050 desired_align = align;
7051 if (expected_size != -1 && expected_size < 4)
7052 desired_align = align;
7053
7054 return desired_align;
7055 }
7056
7057
7058 /* Helper function for memset. For QImode value 0xXY produce
7059 0xXYXYXYXY of the width specified by MODE. This is essentially
7060 a * 0x01010101, but we can do slightly better than
7061 synth_mult by unwinding the sequence by hand on CPUs with
7062 slow multiply. */
7063 static rtx
7064 promote_duplicated_reg (machine_mode mode, rtx val)
7065 {
7066 machine_mode valmode = GET_MODE (val);
7067 rtx tmp;
7068 int nops = mode == DImode ? 3 : 2;
7069
7070 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7071 if (val == const0_rtx)
7072 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7073 if (CONST_INT_P (val))
7074 {
7075 HOST_WIDE_INT v = INTVAL (val) & 255;
7076
7077 v |= v << 8;
7078 v |= v << 16;
7079 if (mode == DImode)
7080 v |= (v << 16) << 16;
7081 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7082 }
7083
7084 if (valmode == VOIDmode)
7085 valmode = QImode;
7086 if (valmode != QImode)
7087 val = gen_lowpart (QImode, val);
7088 if (mode == QImode)
7089 return val;
7090 if (!TARGET_PARTIAL_REG_STALL)
7091 nops--;
7092 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7093 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7094 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7095 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7096 {
7097 rtx reg = convert_modes (mode, QImode, val, true);
7098 tmp = promote_duplicated_reg (mode, const1_rtx);
7099 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7100 OPTAB_DIRECT);
7101 }
7102 else
7103 {
7104 rtx reg = convert_modes (mode, QImode, val, true);
7105
7106 if (!TARGET_PARTIAL_REG_STALL)
7107 if (mode == SImode)
7108 emit_insn (gen_insvsi_1 (reg, reg));
7109 else
7110 emit_insn (gen_insvdi_1 (reg, reg));
7111 else
7112 {
7113 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7114 NULL, 1, OPTAB_DIRECT);
7115 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7116 OPTAB_DIRECT);
7117 }
7118 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7119 NULL, 1, OPTAB_DIRECT);
7120 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7121 if (mode == SImode)
7122 return reg;
7123 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7124 NULL, 1, OPTAB_DIRECT);
7125 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7126 return reg;
7127 }
7128 }
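/* For illustration: for VAL == 0xAB and MODE == SImode the result register
   holds 0xABABABAB (0xABABABABABABABAB for DImode).  For a non-constant
   VAL this is the same as multiplying by 0x01010101 (0x0101010101010101),
   and the cost comparison above picks either that multiply or the
   shift/or (or insv) ladder, whichever the tuning tables say is cheaper.  */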
7129
7130 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7131 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7132 alignment from ALIGN to DESIRED_ALIGN. */
7133 static rtx
7134 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7135 int align)
7136 {
7137 rtx promoted_val;
7138
7139 if (TARGET_64BIT
7140 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7141 promoted_val = promote_duplicated_reg (DImode, val);
7142 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7143 promoted_val = promote_duplicated_reg (SImode, val);
7144 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7145 promoted_val = promote_duplicated_reg (HImode, val);
7146 else
7147 promoted_val = val;
7148
7149 return promoted_val;
7150 }
7151
7152 /* Copy the address to a Pmode register. This is used for x32 to
7153 truncate DImode TLS address to a SImode register. */
7154
7155 static rtx
7156 ix86_copy_addr_to_reg (rtx addr)
7157 {
7158 rtx reg;
7159 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7160 {
7161 reg = copy_addr_to_reg (addr);
7162 REG_POINTER (reg) = 1;
7163 return reg;
7164 }
7165 else
7166 {
7167 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7168 reg = copy_to_mode_reg (DImode, addr);
7169 REG_POINTER (reg) = 1;
7170 return gen_rtx_SUBREG (SImode, reg, 0);
7171 }
7172 }
7173
7174 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
7175 operations when profitable. The code depends upon architecture, block size
7176 and alignment, but always has one of the following overall structures:
7177
7178 Aligned move sequence:
7179
7180 1) Prologue guard: Conditional that jumps up to epilogues for small
7181 blocks that can be handled by the epilogue alone. This is faster
7182 but also needed for correctness, since the prologue assumes the block
7183 is larger than the desired alignment.
7184
7185 Optional dynamic check for size and libcall for large
7186 blocks is emitted here too, with -minline-stringops-dynamically.
7187
7188 2) Prologue: copy first few bytes in order to get destination
7189 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7190 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7191 copied. We emit either a jump tree on power of two sized
7192 blocks, or a byte loop.
7193
7194 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7195 with specified algorithm.
7196
7197 4) Epilogue: code copying tail of the block that is too small to be
7198 handled by main body (or up to size guarded by prologue guard).
7199
7200 Misaligned move sequence
7201
7202 1) misaligned move prologue/epilogue containing:
7203 a) Prologue handling small memory blocks and jumping to done_label
7204 (skipped if blocks are known to be large enough)
7205 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
7206 is needed, done by one possibly misaligned move
7207 (skipped if alignment is not needed)
7208 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
7209
7210 2) Zero size guard dispatching to done_label, if needed
7211
7212 3) Dispatch to a library call, if needed,
7213
7214 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7215 with the specified algorithm. */
7216 bool
7217 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7218 rtx align_exp, rtx expected_align_exp,
7219 rtx expected_size_exp, rtx min_size_exp,
7220 rtx max_size_exp, rtx probable_max_size_exp,
7221 bool issetmem)
7222 {
7223 rtx destreg;
7224 rtx srcreg = NULL;
7225 rtx_code_label *label = NULL;
7226 rtx tmp;
7227 rtx_code_label *jump_around_label = NULL;
7228 HOST_WIDE_INT align = 1;
7229 unsigned HOST_WIDE_INT count = 0;
7230 HOST_WIDE_INT expected_size = -1;
7231 int size_needed = 0, epilogue_size_needed;
7232 int desired_align = 0, align_bytes = 0;
7233 enum stringop_alg alg;
7234 rtx promoted_val = NULL;
7235 rtx vec_promoted_val = NULL;
7236 bool force_loopy_epilogue = false;
7237 int dynamic_check;
7238 bool need_zero_guard = false;
7239 bool noalign;
7240 machine_mode move_mode = VOIDmode;
7241 machine_mode wider_mode;
7242 int unroll_factor = 1;
7243 /* TODO: Once value ranges are available, fill in proper data. */
7244 unsigned HOST_WIDE_INT min_size = 0;
7245 unsigned HOST_WIDE_INT max_size = -1;
7246 unsigned HOST_WIDE_INT probable_max_size = -1;
7247 bool misaligned_prologue_used = false;
7248 bool have_as;
7249
7250 if (CONST_INT_P (align_exp))
7251 align = INTVAL (align_exp);
7252 /* i386 can do misaligned access at a reasonably increased cost. */
7253 if (CONST_INT_P (expected_align_exp)
7254 && INTVAL (expected_align_exp) > align)
7255 align = INTVAL (expected_align_exp);
7256 /* ALIGN is the minimum of destination and source alignment, but we care here
7257 just about destination alignment. */
7258 else if (!issetmem
7259 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7260 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7261
7262 if (CONST_INT_P (count_exp))
7263 {
7264 min_size = max_size = probable_max_size = count = expected_size
7265 = INTVAL (count_exp);
7266 /* When COUNT is 0, there is nothing to do. */
7267 if (!count)
7268 return true;
7269 }
7270 else
7271 {
7272 if (min_size_exp)
7273 min_size = INTVAL (min_size_exp);
7274 if (max_size_exp)
7275 max_size = INTVAL (max_size_exp);
7276 if (probable_max_size_exp)
7277 probable_max_size = INTVAL (probable_max_size_exp);
7278 if (CONST_INT_P (expected_size_exp))
7279 expected_size = INTVAL (expected_size_exp);
7280 }
7281
7282 /* Make sure we don't need to care about overflow later on. */
7283 if (count > (HOST_WIDE_INT_1U << 30))
7284 return false;
7285
7286 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7287 if (!issetmem)
7288 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7289
7290 /* Step 0: Decide on preferred algorithm, desired alignment and
7291 size of chunks to be copied by main loop. */
7292 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7293 issetmem,
7294 issetmem && val_exp == const0_rtx, have_as,
7295 &dynamic_check, &noalign, false);
7296
7297 if (dump_file)
7298 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7299 stringop_alg_names[alg]);
7300
7301 if (alg == libcall)
7302 return false;
7303 gcc_assert (alg != no_stringop);
7304
7305 /* For now the vector version of memset is generated only for memory zeroing,
7306 as creating the promoted vector value is very cheap in this case. */
7307 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7308 alg = unrolled_loop;
7309
7310 if (!count)
7311 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7312 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7313 if (!issetmem)
7314 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7315
7316 unroll_factor = 1;
7317 move_mode = word_mode;
7318 switch (alg)
7319 {
7320 case libcall:
7321 case no_stringop:
7322 case last_alg:
7323 gcc_unreachable ();
7324 case loop_1_byte:
7325 need_zero_guard = true;
7326 move_mode = QImode;
7327 break;
7328 case loop:
7329 need_zero_guard = true;
7330 break;
7331 case unrolled_loop:
7332 need_zero_guard = true;
7333 unroll_factor = (TARGET_64BIT ? 4 : 2);
7334 break;
7335 case vector_loop:
7336 need_zero_guard = true;
7337 unroll_factor = 4;
7338 /* Find the widest supported mode. */
7339 move_mode = word_mode;
7340 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7341 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7342 move_mode = wider_mode;
7343
7344 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
7345 move_mode = TImode;
7346
7347 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7348 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7349 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7350 {
7351 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7352 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7353 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7354 move_mode = word_mode;
7355 }
7356 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7357 break;
7358 case rep_prefix_8_byte:
7359 move_mode = DImode;
7360 break;
7361 case rep_prefix_4_byte:
7362 move_mode = SImode;
7363 break;
7364 case rep_prefix_1_byte:
7365 move_mode = QImode;
7366 break;
7367 }
7368 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7369 epilogue_size_needed = size_needed;
7370
7371 /* If we are going to call any library calls conditionally, make sure any
7372 pending stack adjustments happen before the first conditional branch,
7373 otherwise they will be emitted before the library call only and won't
7374 happen from the other branches. */
7375 if (dynamic_check != -1)
7376 do_pending_stack_adjust ();
7377
7378 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7379 if (!TARGET_ALIGN_STRINGOPS || noalign)
7380 align = desired_align;
7381
7382 /* Step 1: Prologue guard. */
7383
7384 /* Alignment code needs count to be in register. */
7385 if (CONST_INT_P (count_exp) && desired_align > align)
7386 {
7387 if (INTVAL (count_exp) > desired_align
7388 && INTVAL (count_exp) > size_needed)
7389 {
7390 align_bytes
7391 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7392 if (align_bytes <= 0)
7393 align_bytes = 0;
7394 else
7395 align_bytes = desired_align - align_bytes;
7396 }
7397 if (align_bytes == 0)
7398 count_exp = force_reg (counter_mode (count_exp), count_exp);
7399 }
7400 gcc_assert (desired_align >= 1 && align >= 1);
7401
7402 /* Misaligned move sequences handle both prologue and epilogue at once.
7403 Default code generation results in smaller code for large alignments
7404 and also avoids redundant work when sizes are known precisely. */
7405 misaligned_prologue_used
7406 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7407 && MAX (desired_align, epilogue_size_needed) <= 32
7408 && desired_align <= epilogue_size_needed
7409 && ((desired_align > align && !align_bytes)
7410 || (!count && epilogue_size_needed > 1)));
7411
7412 /* Do the cheap promotion to allow better CSE across the
7413 main loop and epilogue (i.e. one load of the big constant in
7414 front of all the code).
7415 For now the misaligned move sequences do not have a fast path
7416 without broadcasting. */
7417 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7418 {
7419 if (alg == vector_loop)
7420 {
7421 gcc_assert (val_exp == const0_rtx);
7422 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7423 promoted_val = promote_duplicated_reg_to_size (val_exp,
7424 GET_MODE_SIZE (word_mode),
7425 desired_align, align);
7426 }
7427 else
7428 {
7429 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7430 desired_align, align);
7431 }
7432 }
7433 /* Misaligned move sequences handle both prologues and epilogues at once.
7434 Default code generation results in smaller code for large alignments and
7435 also avoids redundant work when sizes are known precisely. */
7436 if (misaligned_prologue_used)
7437 {
7438 /* The misaligned move prologue handles small blocks by itself. */
7439 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7440 (dst, src, &destreg, &srcreg,
7441 move_mode, promoted_val, vec_promoted_val,
7442 &count_exp,
7443 &jump_around_label,
7444 desired_align < align
7445 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7446 desired_align, align, &min_size, dynamic_check, issetmem);
7447 if (!issetmem)
7448 src = change_address (src, BLKmode, srcreg);
7449 dst = change_address (dst, BLKmode, destreg);
7450 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7451 epilogue_size_needed = 0;
7452 if (need_zero_guard
7453 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7454 {
7455 /* It is possible that we copied enough so the main loop will not
7456 execute. */
7457 gcc_assert (size_needed > 1);
7458 if (jump_around_label == NULL_RTX)
7459 jump_around_label = gen_label_rtx ();
7460 emit_cmp_and_jump_insns (count_exp,
7461 GEN_INT (size_needed),
7462 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7463 if (expected_size == -1
7464 || expected_size < (desired_align - align) / 2 + size_needed)
7465 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7466 else
7467 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7468 }
7469 }
7470 /* Ensure that alignment prologue won't copy past end of block. */
7471 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7472 {
7473 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7474 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7475 Make sure it is power of 2. */
7476 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7477
7478 /* To improve performance of small blocks, we jump around the VAL
7479 promoting code. This means that if the promoted VAL is not constant,
7480 we might not use it in the epilogue and have to use the byte
7481 loop variant. */
7482 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7483 force_loopy_epilogue = true;
7484 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7485 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7486 {
7487 /* If main algorithm works on QImode, no epilogue is needed.
7488 For small sizes just don't align anything. */
7489 if (size_needed == 1)
7490 desired_align = align;
7491 else
7492 goto epilogue;
7493 }
7494 else if (!count
7495 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7496 {
7497 label = gen_label_rtx ();
7498 emit_cmp_and_jump_insns (count_exp,
7499 GEN_INT (epilogue_size_needed),
7500 LTU, 0, counter_mode (count_exp), 1, label);
7501 if (expected_size == -1 || expected_size < epilogue_size_needed)
7502 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7503 else
7504 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7505 }
7506 }
7507
7508 /* Emit code to decide on runtime whether library call or inline should be
7509 used. */
7510 if (dynamic_check != -1)
7511 {
7512 if (!issetmem && CONST_INT_P (count_exp))
7513 {
7514 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7515 {
7516 emit_block_copy_via_libcall (dst, src, count_exp);
7517 count_exp = const0_rtx;
7518 goto epilogue;
7519 }
7520 }
7521 else
7522 {
7523 rtx_code_label *hot_label = gen_label_rtx ();
7524 if (jump_around_label == NULL_RTX)
7525 jump_around_label = gen_label_rtx ();
7526 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7527 LEU, 0, counter_mode (count_exp),
7528 1, hot_label);
7529 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7530 if (issetmem)
7531 set_storage_via_libcall (dst, count_exp, val_exp);
7532 else
7533 emit_block_copy_via_libcall (dst, src, count_exp);
7534 emit_jump (jump_around_label);
7535 emit_label (hot_label);
7536 }
7537 }
7538
7539 /* Step 2: Alignment prologue. */
7540 /* Do the expensive promotion once we branched off the small blocks. */
7541 if (issetmem && !promoted_val)
7542 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7543 desired_align, align);
7544
7545 if (desired_align > align && !misaligned_prologue_used)
7546 {
7547 if (align_bytes == 0)
7548 {
7549 /* Except for the first move in the prologue, we no longer know
7550 the constant offset in aliasing info. It doesn't seem worth
7551 the pain to maintain it for the first move, so throw away
7552 the info early. */
7553 dst = change_address (dst, BLKmode, destreg);
7554 if (!issetmem)
7555 src = change_address (src, BLKmode, srcreg);
7556 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7557 promoted_val, vec_promoted_val,
7558 count_exp, align, desired_align,
7559 issetmem);
7560 /* At most desired_align - align bytes are copied. */
7561 if (min_size < (unsigned)(desired_align - align))
7562 min_size = 0;
7563 else
7564 min_size -= desired_align - align;
7565 }
7566 else
7567 {
7568 /* If we know how many bytes need to be stored before dst is
7569 sufficiently aligned, maintain aliasing info accurately. */
7570 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7571 srcreg,
7572 promoted_val,
7573 vec_promoted_val,
7574 desired_align,
7575 align_bytes,
7576 issetmem);
7577
7578 count_exp = plus_constant (counter_mode (count_exp),
7579 count_exp, -align_bytes);
7580 count -= align_bytes;
7581 min_size -= align_bytes;
7582 max_size -= align_bytes;
7583 }
7584 if (need_zero_guard
7585 && min_size < (unsigned HOST_WIDE_INT) size_needed
7586 && (count < (unsigned HOST_WIDE_INT) size_needed
7587 || (align_bytes == 0
7588 && count < ((unsigned HOST_WIDE_INT) size_needed
7589 + desired_align - align))))
7590 {
7591 /* It is possible that we copied enough so the main loop will not
7592 execute. */
7593 gcc_assert (size_needed > 1);
7594 if (label == NULL_RTX)
7595 label = gen_label_rtx ();
7596 emit_cmp_and_jump_insns (count_exp,
7597 GEN_INT (size_needed),
7598 LTU, 0, counter_mode (count_exp), 1, label);
7599 if (expected_size == -1
7600 || expected_size < (desired_align - align) / 2 + size_needed)
7601 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7602 else
7603 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7604 }
7605 }
7606 if (label && size_needed == 1)
7607 {
7608 emit_label (label);
7609 LABEL_NUSES (label) = 1;
7610 label = NULL;
7611 epilogue_size_needed = 1;
7612 if (issetmem)
7613 promoted_val = val_exp;
7614 }
7615 else if (label == NULL_RTX && !misaligned_prologue_used)
7616 epilogue_size_needed = size_needed;
7617
7618 /* Step 3: Main loop. */
7619
7620 switch (alg)
7621 {
7622 case libcall:
7623 case no_stringop:
7624 case last_alg:
7625 gcc_unreachable ();
7626 case loop_1_byte:
7627 case loop:
7628 case unrolled_loop:
7629 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7630 count_exp, move_mode, unroll_factor,
7631 expected_size, issetmem);
7632 break;
7633 case vector_loop:
7634 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7635 vec_promoted_val, count_exp, move_mode,
7636 unroll_factor, expected_size, issetmem);
7637 break;
7638 case rep_prefix_8_byte:
7639 case rep_prefix_4_byte:
7640 case rep_prefix_1_byte:
7641 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7642 val_exp, count_exp, move_mode, issetmem);
7643 break;
7644 }
7645 /* Properly adjust the offsets of the src and dest memory for aliasing. */
7646 if (CONST_INT_P (count_exp))
7647 {
7648 if (!issetmem)
7649 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7650 (count / size_needed) * size_needed);
7651 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7652 (count / size_needed) * size_needed);
7653 }
7654 else
7655 {
7656 if (!issetmem)
7657 src = change_address (src, BLKmode, srcreg);
7658 dst = change_address (dst, BLKmode, destreg);
7659 }
7660
7661 /* Step 4: Epilogue to copy the remaining bytes. */
7662 epilogue:
7663 if (label)
7664 {
7665 /* When the main loop is done, COUNT_EXP might hold the original count,
7666 while we want to copy only COUNT_EXP % SIZE_NEEDED bytes.
7667 The epilogue code will actually copy COUNT_EXP % EPILOGUE_SIZE_NEEDED
7668 bytes. Compensate if needed. */
7669
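      /* A worked example (editorial, illustrative only): if the main loop
	 advances by SIZE_NEEDED == 16 bytes per iteration and the runtime
	 count is 37, only 37 % 16 == 5 bytes remain for the epilogue, which
	 is what masking COUNT_EXP with SIZE_NEEDED - 1 == 15 below leaves
	 in the register.  */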
7670 if (size_needed < epilogue_size_needed)
7671 {
7672 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7673 GEN_INT (size_needed - 1), count_exp, 1,
7674 OPTAB_DIRECT);
7675 if (tmp != count_exp)
7676 emit_move_insn (count_exp, tmp);
7677 }
7678 emit_label (label);
7679 LABEL_NUSES (label) = 1;
7680 }
7681
7682 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7683 {
7684 if (force_loopy_epilogue)
7685 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7686 epilogue_size_needed);
7687 else
7688 {
7689 if (issetmem)
7690 expand_setmem_epilogue (dst, destreg, promoted_val,
7691 vec_promoted_val, count_exp,
7692 epilogue_size_needed);
7693 else
7694 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7695 epilogue_size_needed);
7696 }
7697 }
7698 if (jump_around_label)
7699 emit_label (jump_around_label);
7700 return true;
7701 }
7702
7703
7704 /* Expand the appropriate insns for doing strlen if not just doing
7705 repnz; scasb
7706
7707 out = result, initialized with the start address
7708 align_rtx = alignment of the address.
7709 scratch = scratch register, initialized with the start address when
7710 not aligned, otherwise undefined
7711
7712 This is just the body. It needs the initializations mentioned above and
7713 some address computation at the end. These things are done in i386.md. */
7714
7715 static void
7716 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7717 {
7718 int align;
7719 rtx tmp;
7720 rtx_code_label *align_2_label = NULL;
7721 rtx_code_label *align_3_label = NULL;
7722 rtx_code_label *align_4_label = gen_label_rtx ();
7723 rtx_code_label *end_0_label = gen_label_rtx ();
7724 rtx mem;
7725 rtx tmpreg = gen_reg_rtx (SImode);
7726 rtx scratch = gen_reg_rtx (SImode);
7727 rtx cmp;
7728
7729 align = 0;
7730 if (CONST_INT_P (align_rtx))
7731 align = INTVAL (align_rtx);
7732
7733 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7734
7735 /* Is there a known alignment and is it less than 4? */
7736 if (align < 4)
7737 {
7738 rtx scratch1 = gen_reg_rtx (Pmode);
7739 emit_move_insn (scratch1, out);
7740 /* Is there a known alignment and is it not 2? */
7741 if (align != 2)
7742 {
7743 align_3_label = gen_label_rtx (); /* Label used when (address & 3) == 3 */
7744 align_2_label = gen_label_rtx (); /* Label used when (address & 3) == 2 */
7745
7746 /* Leave just the 3 lower bits. */
7747 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7748 NULL_RTX, 0, OPTAB_WIDEN);
7749
7750 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7751 Pmode, 1, align_4_label);
7752 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7753 Pmode, 1, align_2_label);
7754 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7755 Pmode, 1, align_3_label);
7756 }
7757 else
7758 {
7759 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7760 check whether the address is already 4-byte aligned. */
7761
7762 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7763 NULL_RTX, 0, OPTAB_WIDEN);
7764
7765 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7766 Pmode, 1, align_4_label);
7767 }
7768
7769 mem = change_address (src, QImode, out);
7770
7771 /* Now compare the bytes. */
7772
7773 /* Compare the first n unaligned bytes one byte at a time. */
7774 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7775 QImode, 1, end_0_label);
7776
7777 /* Increment the address. */
7778 emit_insn (gen_add2_insn (out, const1_rtx));
7779
7780 /* Not needed with an alignment of 2. */
7781 if (align != 2)
7782 {
7783 emit_label (align_2_label);
7784
7785 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7786 end_0_label);
7787
7788 emit_insn (gen_add2_insn (out, const1_rtx));
7789
7790 emit_label (align_3_label);
7791 }
7792
7793 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7794 end_0_label);
7795
7796 emit_insn (gen_add2_insn (out, const1_rtx));
7797 }
7798
7799 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
7800 align this loop: doing so only bloats the code and does not help
7801 speed. */
7802 emit_label (align_4_label);
7803
7804 mem = change_address (src, SImode, out);
7805 emit_move_insn (scratch, mem);
7806 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7807
7808 /* This formula yields a nonzero result iff one of the bytes is zero.
7809 This saves three branches inside the loop and many cycles. */
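      /* A worked example of the zero-byte test (editorial, illustrative
	 only): with scratch == 0x41004141 the three insns below compute
	   (0x41004141 - 0x01010101) & ~0x41004141 & 0x80808080
	     == 0x3fff4040 & 0xbeffbebe & 0x80808080
	     == 0x00800000,
	 which is nonzero because byte 2 of the word is zero; for a word
	 such as 0x41424344 with no zero byte the result is 0.  */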
7810
7811 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7812 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7813 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7814 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7815 gen_int_mode (0x80808080, SImode)));
7816 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7817 align_4_label);
7818
7819 if (TARGET_CMOVE)
7820 {
7821 rtx reg = gen_reg_rtx (SImode);
7822 rtx reg2 = gen_reg_rtx (Pmode);
7823 emit_move_insn (reg, tmpreg);
7824 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7825
7826 /* If zero is not in the first two bytes, move two bytes forward. */
7827 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7828 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7829 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7830 emit_insn (gen_rtx_SET (tmpreg,
7831 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7832 reg,
7833 tmpreg)));
7834 /* Emit lea manually to avoid clobbering of flags. */
7835 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7836
7837 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7838 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7839 emit_insn (gen_rtx_SET (out,
7840 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7841 reg2,
7842 out)));
7843 }
7844 else
7845 {
7846 rtx_code_label *end_2_label = gen_label_rtx ();
7847 /* Is zero in the first two bytes? */
7848
7849 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7850 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7851 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7852 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7853 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7854 pc_rtx);
7855 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7856 JUMP_LABEL (tmp) = end_2_label;
7857
7858 /* Not in the first two. Move two bytes forward. */
7859 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7860 emit_insn (gen_add2_insn (out, const2_rtx));
7861
7862 emit_label (end_2_label);
7863
7864 }
7865
7866 /* Avoid a branch in fixing the byte. */
7867 tmpreg = gen_lowpart (QImode, tmpreg);
7868 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7869 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7870 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7871 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7872
7873 emit_label (end_0_label);
7874 }
7875
7876 /* Expand strlen. */
7877
7878 bool
7879 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7880 {
7881 if (TARGET_UNROLL_STRLEN
7882 && TARGET_INLINE_ALL_STRINGOPS
7883 && eoschar == const0_rtx
7884 && optimize > 1)
7885 {
7886 /* The generic case of the strlen expander is long. Avoid expanding
7887 it unless TARGET_INLINE_ALL_STRINGOPS. */
7888 rtx addr = force_reg (Pmode, XEXP (src, 0));
7889 /* It seems that some optimizers do not combine a call like
7890 foo(strlen(bar), strlen(bar));
7891 when the move and the subtraction are done here, although the length
7892 is computed just once when these instructions are emitted inside
7893 output_strlen_unroll(). But since &bar[strlen(bar)] is often used,
7894 and this uses one fewer register for the lifetime of
7895 output_strlen_unroll(), this is better. */
7896
7897 emit_move_insn (out, addr);
7898
7899 ix86_expand_strlensi_unroll_1 (out, src, align);
7900
7901 /* strlensi_unroll_1 returns the address of the zero at the end of
7902 the string, like memchr(), so compute the length by subtracting
7903 the start address. */
7904 emit_insn (gen_sub2_insn (out, addr));
7905 return true;
7906 }
7907 else
7908 return false;
7909 }
7910
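/* Editorial sketch (not part of the original file): a plain C rendition of
   the word-at-a-time scan that ix86_expand_strlensi_unroll_1 emits as RTL,
   shown only to document the algorithm.  The function name is made up and
   it assumes <string.h> and <stdint.h>; it is kept inside #if 0 so it is
   never compiled.  */

#if 0
static size_t
strlen_by_words_sketch (const char *s)
{
  const char *p = s;

  /* Check 1..3 bytes until P is 4-byte aligned.  */
  while (((uintptr_t) p & 3) != 0)
    {
      if (*p == '\0')
	return (size_t) (p - s);
      p++;
    }

  /* Scan one 32-bit word per iteration; the expression is nonzero iff
     one of the four bytes of X is zero.  */
  for (;;)
    {
      unsigned int x;
      memcpy (&x, p, sizeof x);
      if (((x - 0x01010101U) & ~x & 0x80808080U) != 0)
	break;
      p += 4;
    }

  /* Locate the zero byte inside the final word.  */
  while (*p != '\0')
    p++;
  return (size_t) (p - s);
}
#endif
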
7911 /* For a given symbol (function), construct code to compute the address
7912 of its PLT entry in the large x86-64 PIC model. */
7913
7914 static rtx
7915 construct_plt_address (rtx symbol)
7916 {
7917 rtx tmp, unspec;
7918
7919 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7920 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7921 gcc_assert (Pmode == DImode);
7922
7923 tmp = gen_reg_rtx (Pmode);
7924 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7925
7926 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7927 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7928 return tmp;
7929 }
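
/* Editorial note: for a call in the large PIC model the address built above
   typically materializes as something like (register choice illustrative):

	movabs	$func@PLTOFF, %r10
	add	%r15, %r10		# %r15 holds the GOT base
	call	*%r10

   i.e. a 64-bit PLT offset added to the GOT/PIC base register.  */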
7930
7931 /* Additional registers clobbered by SysV calls (preserved by MS ABI calls). */
7932
7933 static int const x86_64_ms_sysv_extra_clobbered_registers
7934 [NUM_X86_64_MS_CLOBBERED_REGS] =
7935 {
7936 SI_REG, DI_REG,
7937 XMM6_REG, XMM7_REG,
7938 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7939 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7940 };
7941
7942 rtx_insn *
7943 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7944 rtx callarg2,
7945 rtx pop, bool sibcall)
7946 {
7947 rtx vec[3];
7948 rtx use = NULL, call;
7949 unsigned int vec_len = 0;
7950 tree fndecl;
7951
7952 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7953 {
7954 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7955 if (fndecl
7956 && (lookup_attribute ("interrupt",
7957 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7958 error ("interrupt service routine cannot be called directly");
7959 }
7960 else
7961 fndecl = NULL_TREE;
7962
7963 if (pop == const0_rtx)
7964 pop = NULL;
7965 gcc_assert (!TARGET_64BIT || !pop);
7966
7967 if (TARGET_MACHO && !TARGET_64BIT)
7968 {
7969 #if TARGET_MACHO
7970 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7971 fnaddr = machopic_indirect_call_target (fnaddr);
7972 #endif
7973 }
7974 else
7975 {
7976 /* Static functions and indirect calls don't need the PIC register. Also,
7977 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
7978 attribute, making it an indirect call. */
7979 rtx addr = XEXP (fnaddr, 0);
7980 if (flag_pic
7981 && GET_CODE (addr) == SYMBOL_REF
7982 && !SYMBOL_REF_LOCAL_P (addr))
7983 {
7984 if (flag_plt
7985 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7986 || !lookup_attribute ("noplt",
7987 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7988 {
7989 if (!TARGET_64BIT
7990 || (ix86_cmodel == CM_LARGE_PIC
7991 && DEFAULT_ABI != MS_ABI))
7992 {
7993 use_reg (&use, gen_rtx_REG (Pmode,
7994 REAL_PIC_OFFSET_TABLE_REGNUM));
7995 if (ix86_use_pseudo_pic_reg ())
7996 emit_move_insn (gen_rtx_REG (Pmode,
7997 REAL_PIC_OFFSET_TABLE_REGNUM),
7998 pic_offset_table_rtx);
7999 }
8000 }
8001 else if (!TARGET_PECOFF && !TARGET_MACHO)
8002 {
8003 if (TARGET_64BIT)
8004 {
8005 fnaddr = gen_rtx_UNSPEC (Pmode,
8006 gen_rtvec (1, addr),
8007 UNSPEC_GOTPCREL);
8008 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8009 }
8010 else
8011 {
8012 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8013 UNSPEC_GOT);
8014 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8015 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8016 fnaddr);
8017 }
8018 fnaddr = gen_const_mem (Pmode, fnaddr);
8019 /* Pmode may not be the same as word_mode for x32, which
8020 doesn't support indirect branch via 32-bit memory slot.
8021 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8022 indirect branch via x32 GOT slot is OK. */
8023 if (GET_MODE (fnaddr) != word_mode)
8024 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8025 fnaddr = gen_rtx_MEM (QImode, fnaddr);
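	      /* Editorial note: with -fno-plt (or the "noplt" attribute) the
		 address built above makes the call go through the GOT, e.g.
		 "call *foo@GOTPCREL(%rip)" on 64-bit targets or
		 "call *foo@GOT(%ebx)" on 32-bit targets (register and
		 symbol names illustrative).  */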
8026 }
8027 }
8028 }
8029
8030 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8031 parameters passed in vector registers. */
8032 if (TARGET_64BIT
8033 && (INTVAL (callarg2) > 0
8034 || (INTVAL (callarg2) == 0
8035 && (TARGET_SSE || !flag_skip_rax_setup))))
8036 {
8037 rtx al = gen_rtx_REG (QImode, AX_REG);
8038 emit_move_insn (al, callarg2);
8039 use_reg (&use, al);
8040 }
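  /* Editorial note: CALLARG2 here is the vector-register count that the
     SysV ABI passes in %al for variadic calls; e.g. a call such as
     printf ("%f", x) is preceded by something like "movb $1, %al"
     (the example call is illustrative).  */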
8041
8042 if (ix86_cmodel == CM_LARGE_PIC
8043 && !TARGET_PECOFF
8044 && MEM_P (fnaddr)
8045 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8046 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8047 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8048 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8049 branch via x32 GOT slot is OK. */
8050 else if (!(TARGET_X32
8051 && MEM_P (fnaddr)
8052 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8053 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8054 && (sibcall
8055 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8056 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8057 {
8058 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8059 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8060 }
8061
8062 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8063
8064 if (retval)
8065 call = gen_rtx_SET (retval, call);
8066 vec[vec_len++] = call;
8067
8068 if (pop)
8069 {
8070 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8071 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8072 vec[vec_len++] = pop;
8073 }
8074
8075 if (cfun->machine->no_caller_saved_registers
8076 && (!fndecl
8077 || (!TREE_THIS_VOLATILE (fndecl)
8078 && !lookup_attribute ("no_caller_saved_registers",
8079 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8080 {
8081 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8082 bool is_64bit_ms_abi = (TARGET_64BIT
8083 && ix86_function_abi (fndecl) == MS_ABI);
8084 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8085
8086 /* If there are no caller-saved registers, add clobbers for all
8087 registers that are clobbered by a call which returns. */
8088 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8089 if (!fixed_regs[i]
8090 && (ix86_call_used_regs[i] == 1
8091 || (ix86_call_used_regs[i] & c_mask))
8092 && !STACK_REGNO_P (i)
8093 && !MMX_REGNO_P (i))
8094 clobber_reg (&use,
8095 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8096 }
8097 else if (TARGET_64BIT_MS_ABI
8098 && (!callarg2 || INTVAL (callarg2) != -2))
8099 {
8100 unsigned i;
8101
8102 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8103 {
8104 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8105 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8106
8107 clobber_reg (&use, gen_rtx_REG (mode, regno));
8108 }
8109
8110 /* Set here, but it may get cleared later. */
8111 if (TARGET_CALL_MS2SYSV_XLOGUES)
8112 {
8113 if (!TARGET_SSE)
8114 ;
8115
8116 /* Don't break hot-patched functions. */
8117 else if (ix86_function_ms_hook_prologue (current_function_decl))
8118 ;
8119
8120 /* TODO: Cases not yet examined. */
8121 else if (flag_split_stack)
8122 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8123
8124 else
8125 {
8126 gcc_assert (!reload_completed);
8127 cfun->machine->call_ms2sysv = true;
8128 }
8129 }
8130 }
8131
8132 if (vec_len > 1)
8133 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8134 rtx_insn *call_insn = emit_call_insn (call);
8135 if (use)
8136 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8137
8138 return call_insn;
8139 }
8140
8141 /* Split a simple return that pops POPC bytes from the stack into an
8142 indirect branch with a stack adjustment. */
8143
8144 void
8145 ix86_split_simple_return_pop_internal (rtx popc)
8146 {
8147 struct machine_function *m = cfun->machine;
8148 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8149 rtx_insn *insn;
8150
8151 /* There is no "pascal" calling convention in any 64bit ABI. */
8152 gcc_assert (!TARGET_64BIT);
8153
8154 insn = emit_insn (gen_pop (ecx));
8155 m->fs.cfa_offset -= UNITS_PER_WORD;
8156 m->fs.sp_offset -= UNITS_PER_WORD;
8157
8158 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8159 x = gen_rtx_SET (stack_pointer_rtx, x);
8160 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8161 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8162 RTX_FRAME_RELATED_P (insn) = 1;
8163
8164 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8165 x = gen_rtx_SET (stack_pointer_rtx, x);
8166 insn = emit_insn (x);
8167 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8168 RTX_FRAME_RELATED_P (insn) = 1;
8169
8170 /* Now return address is in ECX. */
8171 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8172 }
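
/* Editorial note: the sequence emitted above corresponds to

	popl	%ecx		# return address -> %ecx
	addl	$POPC, %esp	# drop the POPC bytes of arguments
	jmp	*%ecx

   which is why the CFA notes above track both stack adjustments.  */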
8173
8174 /* Errors in the source file can cause expand_expr to return const0_rtx
8175 where we expect a vector. To avoid crashing, use one of the vector
8176 clear instructions. */
8177
8178 static rtx
8179 safe_vector_operand (rtx x, machine_mode mode)
8180 {
8181 if (x == const0_rtx)
8182 x = CONST0_RTX (mode);
8183 return x;
8184 }
8185
8186 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8187
8188 static rtx
8189 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8190 {
8191 rtx pat;
8192 tree arg0 = CALL_EXPR_ARG (exp, 0);
8193 tree arg1 = CALL_EXPR_ARG (exp, 1);
8194 rtx op0 = expand_normal (arg0);
8195 rtx op1 = expand_normal (arg1);
8196 machine_mode tmode = insn_data[icode].operand[0].mode;
8197 machine_mode mode0 = insn_data[icode].operand[1].mode;
8198 machine_mode mode1 = insn_data[icode].operand[2].mode;
8199
8200 if (VECTOR_MODE_P (mode0))
8201 op0 = safe_vector_operand (op0, mode0);
8202 if (VECTOR_MODE_P (mode1))
8203 op1 = safe_vector_operand (op1, mode1);
8204
8205 if (optimize || !target
8206 || GET_MODE (target) != tmode
8207 || !insn_data[icode].operand[0].predicate (target, tmode))
8208 target = gen_reg_rtx (tmode);
8209
8210 if (GET_MODE (op1) == SImode && mode1 == TImode)
8211 {
8212 rtx x = gen_reg_rtx (V4SImode);
8213 emit_insn (gen_sse2_loadd (x, op1));
8214 op1 = gen_lowpart (TImode, x);
8215 }
8216
8217 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8218 op0 = copy_to_mode_reg (mode0, op0);
8219 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8220 op1 = copy_to_mode_reg (mode1, op1);
8221
8222 pat = GEN_FCN (icode) (target, op0, op1);
8223 if (! pat)
8224 return 0;
8225
8226 emit_insn (pat);
8227
8228 return target;
8229 }
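
/* Editorial example (the builtin/icode pairing is an assumption): for
   __builtin_ia32_paddw128, whose icode is CODE_FOR_addv8hi3, the routine
   above expands the two V8HImode arguments, forces them into operands the
   predicates accept, and emits a single

	(set (reg:V8HI target) (plus:V8HI op0 op1))

   insn.  */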
8230
8231 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8232
8233 static rtx
8234 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8235 enum ix86_builtin_func_type m_type,
8236 enum rtx_code sub_code)
8237 {
8238 rtx pat;
8239 int i;
8240 int nargs;
8241 bool comparison_p = false;
8242 bool tf_p = false;
8243 bool last_arg_constant = false;
8244 int num_memory = 0;
8245 struct {
8246 rtx op;
8247 machine_mode mode;
8248 } args[4];
8249
8250 machine_mode tmode = insn_data[icode].operand[0].mode;
8251
8252 switch (m_type)
8253 {
8254 case MULTI_ARG_4_DF2_DI_I:
8255 case MULTI_ARG_4_DF2_DI_I1:
8256 case MULTI_ARG_4_SF2_SI_I:
8257 case MULTI_ARG_4_SF2_SI_I1:
8258 nargs = 4;
8259 last_arg_constant = true;
8260 break;
8261
8262 case MULTI_ARG_3_SF:
8263 case MULTI_ARG_3_DF:
8264 case MULTI_ARG_3_SF2:
8265 case MULTI_ARG_3_DF2:
8266 case MULTI_ARG_3_DI:
8267 case MULTI_ARG_3_SI:
8268 case MULTI_ARG_3_SI_DI:
8269 case MULTI_ARG_3_HI:
8270 case MULTI_ARG_3_HI_SI:
8271 case MULTI_ARG_3_QI:
8272 case MULTI_ARG_3_DI2:
8273 case MULTI_ARG_3_SI2:
8274 case MULTI_ARG_3_HI2:
8275 case MULTI_ARG_3_QI2:
8276 nargs = 3;
8277 break;
8278
8279 case MULTI_ARG_2_SF:
8280 case MULTI_ARG_2_DF:
8281 case MULTI_ARG_2_DI:
8282 case MULTI_ARG_2_SI:
8283 case MULTI_ARG_2_HI:
8284 case MULTI_ARG_2_QI:
8285 nargs = 2;
8286 break;
8287
8288 case MULTI_ARG_2_DI_IMM:
8289 case MULTI_ARG_2_SI_IMM:
8290 case MULTI_ARG_2_HI_IMM:
8291 case MULTI_ARG_2_QI_IMM:
8292 nargs = 2;
8293 last_arg_constant = true;
8294 break;
8295
8296 case MULTI_ARG_1_SF:
8297 case MULTI_ARG_1_DF:
8298 case MULTI_ARG_1_SF2:
8299 case MULTI_ARG_1_DF2:
8300 case MULTI_ARG_1_DI:
8301 case MULTI_ARG_1_SI:
8302 case MULTI_ARG_1_HI:
8303 case MULTI_ARG_1_QI:
8304 case MULTI_ARG_1_SI_DI:
8305 case MULTI_ARG_1_HI_DI:
8306 case MULTI_ARG_1_HI_SI:
8307 case MULTI_ARG_1_QI_DI:
8308 case MULTI_ARG_1_QI_SI:
8309 case MULTI_ARG_1_QI_HI:
8310 nargs = 1;
8311 break;
8312
8313 case MULTI_ARG_2_DI_CMP:
8314 case MULTI_ARG_2_SI_CMP:
8315 case MULTI_ARG_2_HI_CMP:
8316 case MULTI_ARG_2_QI_CMP:
8317 nargs = 2;
8318 comparison_p = true;
8319 break;
8320
8321 case MULTI_ARG_2_SF_TF:
8322 case MULTI_ARG_2_DF_TF:
8323 case MULTI_ARG_2_DI_TF:
8324 case MULTI_ARG_2_SI_TF:
8325 case MULTI_ARG_2_HI_TF:
8326 case MULTI_ARG_2_QI_TF:
8327 nargs = 2;
8328 tf_p = true;
8329 break;
8330
8331 default:
8332 gcc_unreachable ();
8333 }
8334
8335 if (optimize || !target
8336 || GET_MODE (target) != tmode
8337 || !insn_data[icode].operand[0].predicate (target, tmode))
8338 target = gen_reg_rtx (tmode);
8339 else if (memory_operand (target, tmode))
8340 num_memory++;
8341
8342 gcc_assert (nargs <= 4);
8343
8344 for (i = 0; i < nargs; i++)
8345 {
8346 tree arg = CALL_EXPR_ARG (exp, i);
8347 rtx op = expand_normal (arg);
8348 int adjust = (comparison_p) ? 1 : 0;
8349 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8350
8351 if (last_arg_constant && i == nargs - 1)
8352 {
8353 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8354 {
8355 enum insn_code new_icode = icode;
8356 switch (icode)
8357 {
8358 case CODE_FOR_xop_vpermil2v2df3:
8359 case CODE_FOR_xop_vpermil2v4sf3:
8360 case CODE_FOR_xop_vpermil2v4df3:
8361 case CODE_FOR_xop_vpermil2v8sf3:
8362 error ("the last argument must be a 2-bit immediate");
8363 return gen_reg_rtx (tmode);
8364 case CODE_FOR_xop_rotlv2di3:
8365 new_icode = CODE_FOR_rotlv2di3;
8366 goto xop_rotl;
8367 case CODE_FOR_xop_rotlv4si3:
8368 new_icode = CODE_FOR_rotlv4si3;
8369 goto xop_rotl;
8370 case CODE_FOR_xop_rotlv8hi3:
8371 new_icode = CODE_FOR_rotlv8hi3;
8372 goto xop_rotl;
8373 case CODE_FOR_xop_rotlv16qi3:
8374 new_icode = CODE_FOR_rotlv16qi3;
8375 xop_rotl:
8376 if (CONST_INT_P (op))
8377 {
8378 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8379 op = GEN_INT (INTVAL (op) & mask);
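		  /* E.g. an XOP rotate such as _mm_roti_epi32 (x, 37) is
		     reduced here to a rotate by 37 & 31 == 5 so that the
		     immediate satisfies the insn predicate (intrinsic name
		     given for illustration only).  */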
8380 gcc_checking_assert
8381 (insn_data[icode].operand[i + 1].predicate (op, mode));
8382 }
8383 else
8384 {
8385 gcc_checking_assert
8386 (nargs == 2
8387 && insn_data[new_icode].operand[0].mode == tmode
8388 && insn_data[new_icode].operand[1].mode == tmode
8389 && insn_data[new_icode].operand[2].mode == mode
8390 && insn_data[new_icode].operand[0].predicate
8391 == insn_data[icode].operand[0].predicate
8392 && insn_data[new_icode].operand[1].predicate
8393 == insn_data[icode].operand[1].predicate);
8394 icode = new_icode;
8395 goto non_constant;
8396 }
8397 break;
8398 default:
8399 gcc_unreachable ();
8400 }
8401 }
8402 }
8403 else
8404 {
8405 non_constant:
8406 if (VECTOR_MODE_P (mode))
8407 op = safe_vector_operand (op, mode);
8408
8409 /* If we aren't optimizing, only allow one memory operand to be
8410 generated. */
8411 if (memory_operand (op, mode))
8412 num_memory++;
8413
8414 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8415
8416 if (optimize
8417 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8418 || num_memory > 1)
8419 op = force_reg (mode, op);
8420 }
8421
8422 args[i].op = op;
8423 args[i].mode = mode;
8424 }
8425
8426 switch (nargs)
8427 {
8428 case 1:
8429 pat = GEN_FCN (icode) (target, args[0].op);
8430 break;
8431
8432 case 2:
8433 if (tf_p)
8434 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8435 GEN_INT ((int)sub_code));
8436 else if (! comparison_p)
8437 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8438 else
8439 {
8440 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8441 args[0].op,
8442 args[1].op);
8443
8444 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8445 }
8446 break;
8447
8448 case 3:
8449 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8450 break;
8451
8452 case 4:
8453 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8454 break;
8455
8456 default:
8457 gcc_unreachable ();
8458 }
8459
8460 if (! pat)
8461 return 0;
8462
8463 emit_insn (pat);
8464 return target;
8465 }
8466
8467 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8468 insns with vec_merge. */
8469
8470 static rtx
8471 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8472 rtx target)
8473 {
8474 rtx pat;
8475 tree arg0 = CALL_EXPR_ARG (exp, 0);
8476 rtx op1, op0 = expand_normal (arg0);
8477 machine_mode tmode = insn_data[icode].operand[0].mode;
8478 machine_mode mode0 = insn_data[icode].operand[1].mode;
8479
8480 if (optimize || !target
8481 || GET_MODE (target) != tmode
8482 || !insn_data[icode].operand[0].predicate (target, tmode))
8483 target = gen_reg_rtx (tmode);
8484
8485 if (VECTOR_MODE_P (mode0))
8486 op0 = safe_vector_operand (op0, mode0);
8487
8488 if ((optimize && !register_operand (op0, mode0))
8489 || !insn_data[icode].operand[1].predicate (op0, mode0))
8490 op0 = copy_to_mode_reg (mode0, op0);
8491
8492 op1 = op0;
8493 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8494 op1 = copy_to_mode_reg (mode0, op1);
8495
8496 pat = GEN_FCN (icode) (target, op0, op1);
8497 if (! pat)
8498 return 0;
8499 emit_insn (pat);
8500 return target;
8501 }
8502
8503 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8504
8505 static rtx
8506 ix86_expand_sse_compare (const struct builtin_description *d,
8507 tree exp, rtx target, bool swap)
8508 {
8509 rtx pat;
8510 tree arg0 = CALL_EXPR_ARG (exp, 0);
8511 tree arg1 = CALL_EXPR_ARG (exp, 1);
8512 rtx op0 = expand_normal (arg0);
8513 rtx op1 = expand_normal (arg1);
8514 rtx op2;
8515 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8516 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8517 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8518 enum rtx_code comparison = d->comparison;
8519
8520 if (VECTOR_MODE_P (mode0))
8521 op0 = safe_vector_operand (op0, mode0);
8522 if (VECTOR_MODE_P (mode1))
8523 op1 = safe_vector_operand (op1, mode1);
8524
8525 /* Swap operands if we have a comparison that isn't available in
8526 hardware. */
8527 if (swap)
8528 std::swap (op0, op1);
8529
8530 if (optimize || !target
8531 || GET_MODE (target) != tmode
8532 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8533 target = gen_reg_rtx (tmode);
8534
8535 if ((optimize && !register_operand (op0, mode0))
8536 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8537 op0 = copy_to_mode_reg (mode0, op0);
8538 if ((optimize && !register_operand (op1, mode1))
8539 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8540 op1 = copy_to_mode_reg (mode1, op1);
8541
8542 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8543 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8544 if (! pat)
8545 return 0;
8546 emit_insn (pat);
8547 return target;
8548 }
8549
8550 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8551
8552 static rtx
8553 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8554 rtx target)
8555 {
8556 rtx pat;
8557 tree arg0 = CALL_EXPR_ARG (exp, 0);
8558 tree arg1 = CALL_EXPR_ARG (exp, 1);
8559 rtx op0 = expand_normal (arg0);
8560 rtx op1 = expand_normal (arg1);
8561 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8562 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8563 enum rtx_code comparison = d->comparison;
8564
8565 if (VECTOR_MODE_P (mode0))
8566 op0 = safe_vector_operand (op0, mode0);
8567 if (VECTOR_MODE_P (mode1))
8568 op1 = safe_vector_operand (op1, mode1);
8569
8570 /* Swap operands if we have a comparison that isn't available in
8571 hardware. */
8572 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8573 std::swap (op0, op1);
8574
8575 target = gen_reg_rtx (SImode);
8576 emit_move_insn (target, const0_rtx);
8577 target = gen_rtx_SUBREG (QImode, target, 0);
8578
8579 if ((optimize && !register_operand (op0, mode0))
8580 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8581 op0 = copy_to_mode_reg (mode0, op0);
8582 if ((optimize && !register_operand (op1, mode1))
8583 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8584 op1 = copy_to_mode_reg (mode1, op1);
8585
8586 pat = GEN_FCN (d->icode) (op0, op1);
8587 if (! pat)
8588 return 0;
8589 emit_insn (pat);
8590 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8591 gen_rtx_fmt_ee (comparison, QImode,
8592 SET_DEST (pat),
8593 const0_rtx)));
8594
8595 return SUBREG_REG (target);
8596 }
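
/* Editorial note: the shape of the code produced above is, roughly,

	xorl	%eax, %eax
	comiss	%xmm1, %xmm0
	set<cc>	%al

   i.e. the whole register is cleared first and only the low byte is set
   from the flags (registers and condition code illustrative).  */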
8597
8598 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8599
8600 static rtx
8601 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8602 rtx target)
8603 {
8604 rtx pat;
8605 tree arg0 = CALL_EXPR_ARG (exp, 0);
8606 rtx op1, op0 = expand_normal (arg0);
8607 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8608 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8609
8610 if (optimize || target == 0
8611 || GET_MODE (target) != tmode
8612 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8613 target = gen_reg_rtx (tmode);
8614
8615 if (VECTOR_MODE_P (mode0))
8616 op0 = safe_vector_operand (op0, mode0);
8617
8618 if ((optimize && !register_operand (op0, mode0))
8619 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8620 op0 = copy_to_mode_reg (mode0, op0);
8621
8622 op1 = GEN_INT (d->comparison);
8623
8624 pat = GEN_FCN (d->icode) (target, op0, op1);
8625 if (! pat)
8626 return 0;
8627 emit_insn (pat);
8628 return target;
8629 }
8630
8631 static rtx
8632 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8633 tree exp, rtx target)
8634 {
8635 rtx pat;
8636 tree arg0 = CALL_EXPR_ARG (exp, 0);
8637 tree arg1 = CALL_EXPR_ARG (exp, 1);
8638 rtx op0 = expand_normal (arg0);
8639 rtx op1 = expand_normal (arg1);
8640 rtx op2;
8641 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8642 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8643 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8644
8645 if (optimize || target == 0
8646 || GET_MODE (target) != tmode
8647 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8648 target = gen_reg_rtx (tmode);
8649
8650 op0 = safe_vector_operand (op0, mode0);
8651 op1 = safe_vector_operand (op1, mode1);
8652
8653 if ((optimize && !register_operand (op0, mode0))
8654 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8655 op0 = copy_to_mode_reg (mode0, op0);
8656 if ((optimize && !register_operand (op1, mode1))
8657 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8658 op1 = copy_to_mode_reg (mode1, op1);
8659
8660 op2 = GEN_INT (d->comparison);
8661
8662 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8663 if (! pat)
8664 return 0;
8665 emit_insn (pat);
8666 return target;
8667 }
8668
8669 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8670
8671 static rtx
8672 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8673 rtx target)
8674 {
8675 rtx pat;
8676 tree arg0 = CALL_EXPR_ARG (exp, 0);
8677 tree arg1 = CALL_EXPR_ARG (exp, 1);
8678 rtx op0 = expand_normal (arg0);
8679 rtx op1 = expand_normal (arg1);
8680 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8681 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8682 enum rtx_code comparison = d->comparison;
8683
8684 if (VECTOR_MODE_P (mode0))
8685 op0 = safe_vector_operand (op0, mode0);
8686 if (VECTOR_MODE_P (mode1))
8687 op1 = safe_vector_operand (op1, mode1);
8688
8689 target = gen_reg_rtx (SImode);
8690 emit_move_insn (target, const0_rtx);
8691 target = gen_rtx_SUBREG (QImode, target, 0);
8692
8693 if ((optimize && !register_operand (op0, mode0))
8694 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8695 op0 = copy_to_mode_reg (mode0, op0);
8696 if ((optimize && !register_operand (op1, mode1))
8697 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8698 op1 = copy_to_mode_reg (mode1, op1);
8699
8700 pat = GEN_FCN (d->icode) (op0, op1);
8701 if (! pat)
8702 return 0;
8703 emit_insn (pat);
8704 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8705 gen_rtx_fmt_ee (comparison, QImode,
8706 SET_DEST (pat),
8707 const0_rtx)));
8708
8709 return SUBREG_REG (target);
8710 }
8711
8712 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8713
8714 static rtx
8715 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8716 tree exp, rtx target)
8717 {
8718 rtx pat;
8719 tree arg0 = CALL_EXPR_ARG (exp, 0);
8720 tree arg1 = CALL_EXPR_ARG (exp, 1);
8721 tree arg2 = CALL_EXPR_ARG (exp, 2);
8722 tree arg3 = CALL_EXPR_ARG (exp, 3);
8723 tree arg4 = CALL_EXPR_ARG (exp, 4);
8724 rtx scratch0, scratch1;
8725 rtx op0 = expand_normal (arg0);
8726 rtx op1 = expand_normal (arg1);
8727 rtx op2 = expand_normal (arg2);
8728 rtx op3 = expand_normal (arg3);
8729 rtx op4 = expand_normal (arg4);
8730 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8731
8732 tmode0 = insn_data[d->icode].operand[0].mode;
8733 tmode1 = insn_data[d->icode].operand[1].mode;
8734 modev2 = insn_data[d->icode].operand[2].mode;
8735 modei3 = insn_data[d->icode].operand[3].mode;
8736 modev4 = insn_data[d->icode].operand[4].mode;
8737 modei5 = insn_data[d->icode].operand[5].mode;
8738 modeimm = insn_data[d->icode].operand[6].mode;
8739
8740 if (VECTOR_MODE_P (modev2))
8741 op0 = safe_vector_operand (op0, modev2);
8742 if (VECTOR_MODE_P (modev4))
8743 op2 = safe_vector_operand (op2, modev4);
8744
8745 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8746 op0 = copy_to_mode_reg (modev2, op0);
8747 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8748 op1 = copy_to_mode_reg (modei3, op1);
8749 if ((optimize && !register_operand (op2, modev4))
8750 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8751 op2 = copy_to_mode_reg (modev4, op2);
8752 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8753 op3 = copy_to_mode_reg (modei5, op3);
8754
8755 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8756 {
8757 error ("the fifth argument must be an 8-bit immediate");
8758 return const0_rtx;
8759 }
8760
8761 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8762 {
8763 if (optimize || !target
8764 || GET_MODE (target) != tmode0
8765 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8766 target = gen_reg_rtx (tmode0);
8767
8768 scratch1 = gen_reg_rtx (tmode1);
8769
8770 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8771 }
8772 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8773 {
8774 if (optimize || !target
8775 || GET_MODE (target) != tmode1
8776 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8777 target = gen_reg_rtx (tmode1);
8778
8779 scratch0 = gen_reg_rtx (tmode0);
8780
8781 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8782 }
8783 else
8784 {
8785 gcc_assert (d->flag);
8786
8787 scratch0 = gen_reg_rtx (tmode0);
8788 scratch1 = gen_reg_rtx (tmode1);
8789
8790 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8791 }
8792
8793 if (! pat)
8794 return 0;
8795
8796 emit_insn (pat);
8797
8798 if (d->flag)
8799 {
8800 target = gen_reg_rtx (SImode);
8801 emit_move_insn (target, const0_rtx);
8802 target = gen_rtx_SUBREG (QImode, target, 0);
8803
8804 emit_insn
8805 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8806 gen_rtx_fmt_ee (EQ, QImode,
8807 gen_rtx_REG ((machine_mode) d->flag,
8808 FLAGS_REG),
8809 const0_rtx)));
8810 return SUBREG_REG (target);
8811 }
8812 else
8813 return target;
8814 }
8815
8816
8817 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8818
8819 static rtx
8820 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8821 tree exp, rtx target)
8822 {
8823 rtx pat;
8824 tree arg0 = CALL_EXPR_ARG (exp, 0);
8825 tree arg1 = CALL_EXPR_ARG (exp, 1);
8826 tree arg2 = CALL_EXPR_ARG (exp, 2);
8827 rtx scratch0, scratch1;
8828 rtx op0 = expand_normal (arg0);
8829 rtx op1 = expand_normal (arg1);
8830 rtx op2 = expand_normal (arg2);
8831 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8832
8833 tmode0 = insn_data[d->icode].operand[0].mode;
8834 tmode1 = insn_data[d->icode].operand[1].mode;
8835 modev2 = insn_data[d->icode].operand[2].mode;
8836 modev3 = insn_data[d->icode].operand[3].mode;
8837 modeimm = insn_data[d->icode].operand[4].mode;
8838
8839 if (VECTOR_MODE_P (modev2))
8840 op0 = safe_vector_operand (op0, modev2);
8841 if (VECTOR_MODE_P (modev3))
8842 op1 = safe_vector_operand (op1, modev3);
8843
8844 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8845 op0 = copy_to_mode_reg (modev2, op0);
8846 if ((optimize && !register_operand (op1, modev3))
8847 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8848 op1 = copy_to_mode_reg (modev3, op1);
8849
8850 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8851 {
8852 error ("the third argument must be an 8-bit immediate");
8853 return const0_rtx;
8854 }
8855
8856 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8857 {
8858 if (optimize || !target
8859 || GET_MODE (target) != tmode0
8860 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8861 target = gen_reg_rtx (tmode0);
8862
8863 scratch1 = gen_reg_rtx (tmode1);
8864
8865 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8866 }
8867 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8868 {
8869 if (optimize || !target
8870 || GET_MODE (target) != tmode1
8871 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8872 target = gen_reg_rtx (tmode1);
8873
8874 scratch0 = gen_reg_rtx (tmode0);
8875
8876 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8877 }
8878 else
8879 {
8880 gcc_assert (d->flag);
8881
8882 scratch0 = gen_reg_rtx (tmode0);
8883 scratch1 = gen_reg_rtx (tmode1);
8884
8885 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8886 }
8887
8888 if (! pat)
8889 return 0;
8890
8891 emit_insn (pat);
8892
8893 if (d->flag)
8894 {
8895 target = gen_reg_rtx (SImode);
8896 emit_move_insn (target, const0_rtx);
8897 target = gen_rtx_SUBREG (QImode, target, 0);
8898
8899 emit_insn
8900 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8901 gen_rtx_fmt_ee (EQ, QImode,
8902 gen_rtx_REG ((machine_mode) d->flag,
8903 FLAGS_REG),
8904 const0_rtx)));
8905 return SUBREG_REG (target);
8906 }
8907 else
8908 return target;
8909 }
8910
8911 /* Fix up modeless constants to fit the required mode. */
8912
8913 static rtx
8914 fixup_modeless_constant (rtx x, machine_mode mode)
8915 {
8916 if (GET_MODE (x) == VOIDmode)
8917 x = convert_to_mode (mode, x, 1);
8918 return x;
8919 }
8920
8921 /* Subroutine of ix86_expand_builtin to take care of insns with
8922 variable number of operands. */
8923
8924 static rtx
8925 ix86_expand_args_builtin (const struct builtin_description *d,
8926 tree exp, rtx target)
8927 {
8928 rtx pat, real_target;
8929 unsigned int i, nargs;
8930 unsigned int nargs_constant = 0;
8931 unsigned int mask_pos = 0;
8932 int num_memory = 0;
8933 struct
8934 {
8935 rtx op;
8936 machine_mode mode;
8937 } args[6];
8938 bool second_arg_count = false;
8939 enum insn_code icode = d->icode;
8940 const struct insn_data_d *insn_p = &insn_data[icode];
8941 machine_mode tmode = insn_p->operand[0].mode;
8942 machine_mode rmode = VOIDmode;
8943 bool swap = false;
8944 enum rtx_code comparison = d->comparison;
8945
8946 switch ((enum ix86_builtin_func_type) d->flag)
8947 {
8948 case V2DF_FTYPE_V2DF_ROUND:
8949 case V4DF_FTYPE_V4DF_ROUND:
8950 case V8DF_FTYPE_V8DF_ROUND:
8951 case V4SF_FTYPE_V4SF_ROUND:
8952 case V8SF_FTYPE_V8SF_ROUND:
8953 case V16SF_FTYPE_V16SF_ROUND:
8954 case V4SI_FTYPE_V4SF_ROUND:
8955 case V8SI_FTYPE_V8SF_ROUND:
8956 case V16SI_FTYPE_V16SF_ROUND:
8957 return ix86_expand_sse_round (d, exp, target);
8958 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8959 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8960 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8961 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8962 case INT_FTYPE_V8SF_V8SF_PTEST:
8963 case INT_FTYPE_V4DI_V4DI_PTEST:
8964 case INT_FTYPE_V4DF_V4DF_PTEST:
8965 case INT_FTYPE_V4SF_V4SF_PTEST:
8966 case INT_FTYPE_V2DI_V2DI_PTEST:
8967 case INT_FTYPE_V2DF_V2DF_PTEST:
8968 return ix86_expand_sse_ptest (d, exp, target);
8969 case FLOAT128_FTYPE_FLOAT128:
8970 case FLOAT_FTYPE_FLOAT:
8971 case INT_FTYPE_INT:
8972 case UINT_FTYPE_UINT:
8973 case UINT16_FTYPE_UINT16:
8974 case UINT64_FTYPE_INT:
8975 case UINT64_FTYPE_UINT64:
8976 case INT64_FTYPE_INT64:
8977 case INT64_FTYPE_V4SF:
8978 case INT64_FTYPE_V2DF:
8979 case INT_FTYPE_V16QI:
8980 case INT_FTYPE_V8QI:
8981 case INT_FTYPE_V8SF:
8982 case INT_FTYPE_V4DF:
8983 case INT_FTYPE_V4SF:
8984 case INT_FTYPE_V2DF:
8985 case INT_FTYPE_V32QI:
8986 case V16QI_FTYPE_V16QI:
8987 case V8SI_FTYPE_V8SF:
8988 case V8SI_FTYPE_V4SI:
8989 case V8HI_FTYPE_V8HI:
8990 case V8HI_FTYPE_V16QI:
8991 case V8QI_FTYPE_V8QI:
8992 case V8SF_FTYPE_V8SF:
8993 case V8SF_FTYPE_V8SI:
8994 case V8SF_FTYPE_V4SF:
8995 case V8SF_FTYPE_V8HI:
8996 case V4SI_FTYPE_V4SI:
8997 case V4SI_FTYPE_V16QI:
8998 case V4SI_FTYPE_V4SF:
8999 case V4SI_FTYPE_V8SI:
9000 case V4SI_FTYPE_V8HI:
9001 case V4SI_FTYPE_V4DF:
9002 case V4SI_FTYPE_V2DF:
9003 case V4HI_FTYPE_V4HI:
9004 case V4DF_FTYPE_V4DF:
9005 case V4DF_FTYPE_V4SI:
9006 case V4DF_FTYPE_V4SF:
9007 case V4DF_FTYPE_V2DF:
9008 case V4SF_FTYPE_V4SF:
9009 case V4SF_FTYPE_V4SI:
9010 case V4SF_FTYPE_V8SF:
9011 case V4SF_FTYPE_V4DF:
9012 case V4SF_FTYPE_V8HI:
9013 case V4SF_FTYPE_V2DF:
9014 case V2DI_FTYPE_V2DI:
9015 case V2DI_FTYPE_V16QI:
9016 case V2DI_FTYPE_V8HI:
9017 case V2DI_FTYPE_V4SI:
9018 case V2DF_FTYPE_V2DF:
9019 case V2DF_FTYPE_V4SI:
9020 case V2DF_FTYPE_V4DF:
9021 case V2DF_FTYPE_V4SF:
9022 case V2DF_FTYPE_V2SI:
9023 case V2SI_FTYPE_V2SI:
9024 case V2SI_FTYPE_V4SF:
9025 case V2SI_FTYPE_V2SF:
9026 case V2SI_FTYPE_V2DF:
9027 case V2SF_FTYPE_V2SF:
9028 case V2SF_FTYPE_V2SI:
9029 case V32QI_FTYPE_V32QI:
9030 case V32QI_FTYPE_V16QI:
9031 case V16HI_FTYPE_V16HI:
9032 case V16HI_FTYPE_V8HI:
9033 case V8SI_FTYPE_V8SI:
9034 case V16HI_FTYPE_V16QI:
9035 case V8SI_FTYPE_V16QI:
9036 case V4DI_FTYPE_V16QI:
9037 case V8SI_FTYPE_V8HI:
9038 case V4DI_FTYPE_V8HI:
9039 case V4DI_FTYPE_V4SI:
9040 case V4DI_FTYPE_V2DI:
9041 case UQI_FTYPE_UQI:
9042 case UHI_FTYPE_UHI:
9043 case USI_FTYPE_USI:
9044 case USI_FTYPE_UQI:
9045 case USI_FTYPE_UHI:
9046 case UDI_FTYPE_UDI:
9047 case UHI_FTYPE_V16QI:
9048 case USI_FTYPE_V32QI:
9049 case UDI_FTYPE_V64QI:
9050 case V16QI_FTYPE_UHI:
9051 case V32QI_FTYPE_USI:
9052 case V64QI_FTYPE_UDI:
9053 case V8HI_FTYPE_UQI:
9054 case V16HI_FTYPE_UHI:
9055 case V32HI_FTYPE_USI:
9056 case V4SI_FTYPE_UQI:
9057 case V8SI_FTYPE_UQI:
9058 case V4SI_FTYPE_UHI:
9059 case V8SI_FTYPE_UHI:
9060 case UQI_FTYPE_V8HI:
9061 case UHI_FTYPE_V16HI:
9062 case USI_FTYPE_V32HI:
9063 case UQI_FTYPE_V4SI:
9064 case UQI_FTYPE_V8SI:
9065 case UHI_FTYPE_V16SI:
9066 case UQI_FTYPE_V2DI:
9067 case UQI_FTYPE_V4DI:
9068 case UQI_FTYPE_V8DI:
9069 case V16SI_FTYPE_UHI:
9070 case V2DI_FTYPE_UQI:
9071 case V4DI_FTYPE_UQI:
9072 case V16SI_FTYPE_INT:
9073 case V16SF_FTYPE_V8SF:
9074 case V16SI_FTYPE_V8SI:
9075 case V16SF_FTYPE_V4SF:
9076 case V16SI_FTYPE_V4SI:
9077 case V16SI_FTYPE_V16SF:
9078 case V16SI_FTYPE_V16SI:
9079 case V64QI_FTYPE_V64QI:
9080 case V32HI_FTYPE_V32HI:
9081 case V16SF_FTYPE_V16SF:
9082 case V8DI_FTYPE_UQI:
9083 case V8DI_FTYPE_V8DI:
9084 case V8DF_FTYPE_V4DF:
9085 case V8DF_FTYPE_V2DF:
9086 case V8DF_FTYPE_V8DF:
9087 case V4DI_FTYPE_V4DI:
9088 case V16HI_FTYPE_V16SF:
9089 case V8HI_FTYPE_V8SF:
9090 case V8HI_FTYPE_V4SF:
9091 nargs = 1;
9092 break;
9093 case V4SF_FTYPE_V4SF_VEC_MERGE:
9094 case V2DF_FTYPE_V2DF_VEC_MERGE:
9095 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9096 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9097 case V16QI_FTYPE_V16QI_V16QI:
9098 case V16QI_FTYPE_V8HI_V8HI:
9099 case V16SF_FTYPE_V16SF_V16SF:
9100 case V8QI_FTYPE_V8QI_V8QI:
9101 case V8QI_FTYPE_V4HI_V4HI:
9102 case V8HI_FTYPE_V8HI_V8HI:
9103 case V8HI_FTYPE_V16QI_V16QI:
9104 case V8HI_FTYPE_V4SI_V4SI:
9105 case V8SF_FTYPE_V8SF_V8SF:
9106 case V8SF_FTYPE_V8SF_V8SI:
9107 case V8DF_FTYPE_V8DF_V8DF:
9108 case V4SI_FTYPE_V4SI_V4SI:
9109 case V4SI_FTYPE_V8HI_V8HI:
9110 case V4SI_FTYPE_V2DF_V2DF:
9111 case V4HI_FTYPE_V4HI_V4HI:
9112 case V4HI_FTYPE_V8QI_V8QI:
9113 case V4HI_FTYPE_V2SI_V2SI:
9114 case V4DF_FTYPE_V4DF_V4DF:
9115 case V4DF_FTYPE_V4DF_V4DI:
9116 case V4SF_FTYPE_V4SF_V4SF:
9117 case V4SF_FTYPE_V4SF_V4SI:
9118 case V4SF_FTYPE_V4SF_V2SI:
9119 case V4SF_FTYPE_V4SF_V2DF:
9120 case V4SF_FTYPE_V4SF_UINT:
9121 case V4SF_FTYPE_V4SF_DI:
9122 case V4SF_FTYPE_V4SF_SI:
9123 case V2DI_FTYPE_V2DI_V2DI:
9124 case V2DI_FTYPE_V16QI_V16QI:
9125 case V2DI_FTYPE_V4SI_V4SI:
9126 case V2DI_FTYPE_V2DI_V16QI:
9127 case V2SI_FTYPE_V2SI_V2SI:
9128 case V2SI_FTYPE_V4HI_V4HI:
9129 case V2SI_FTYPE_V2SF_V2SF:
9130 case V2DF_FTYPE_V2DF_V2DF:
9131 case V2DF_FTYPE_V2DF_V4SF:
9132 case V2DF_FTYPE_V2DF_V2DI:
9133 case V2DF_FTYPE_V2DF_DI:
9134 case V2DF_FTYPE_V2DF_SI:
9135 case V2DF_FTYPE_V2DF_UINT:
9136 case V2SF_FTYPE_V2SF_V2SF:
9137 case V1DI_FTYPE_V1DI_V1DI:
9138 case V1DI_FTYPE_V8QI_V8QI:
9139 case V1DI_FTYPE_V2SI_V2SI:
9140 case V32QI_FTYPE_V16HI_V16HI:
9141 case V16HI_FTYPE_V8SI_V8SI:
9142 case V64QI_FTYPE_V64QI_V64QI:
9143 case V32QI_FTYPE_V32QI_V32QI:
9144 case V16HI_FTYPE_V32QI_V32QI:
9145 case V16HI_FTYPE_V16HI_V16HI:
9146 case V8SI_FTYPE_V4DF_V4DF:
9147 case V8SI_FTYPE_V8SI_V8SI:
9148 case V8SI_FTYPE_V16HI_V16HI:
9149 case V4DI_FTYPE_V4DI_V4DI:
9150 case V4DI_FTYPE_V8SI_V8SI:
9151 case V8DI_FTYPE_V64QI_V64QI:
9152 if (comparison == UNKNOWN)
9153 return ix86_expand_binop_builtin (icode, exp, target);
9154 nargs = 2;
9155 break;
9156 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9157 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9158 gcc_assert (comparison != UNKNOWN);
9159 nargs = 2;
9160 swap = true;
9161 break;
9162 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9163 case V16HI_FTYPE_V16HI_SI_COUNT:
9164 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9165 case V8SI_FTYPE_V8SI_SI_COUNT:
9166 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9167 case V4DI_FTYPE_V4DI_INT_COUNT:
9168 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9169 case V8HI_FTYPE_V8HI_SI_COUNT:
9170 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9171 case V4SI_FTYPE_V4SI_SI_COUNT:
9172 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9173 case V4HI_FTYPE_V4HI_SI_COUNT:
9174 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9175 case V2DI_FTYPE_V2DI_SI_COUNT:
9176 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9177 case V2SI_FTYPE_V2SI_SI_COUNT:
9178 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9179 case V1DI_FTYPE_V1DI_SI_COUNT:
9180 nargs = 2;
9181 second_arg_count = true;
9182 break;
9183 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9184 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9185 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9186 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9187 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9188 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9189 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9190 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9191 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9192 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9193 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9194 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9195 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9196 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9197 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9198 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9199 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9200 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9201 nargs = 4;
9202 second_arg_count = true;
9203 break;
9204 case UINT64_FTYPE_UINT64_UINT64:
9205 case UINT_FTYPE_UINT_UINT:
9206 case UINT_FTYPE_UINT_USHORT:
9207 case UINT_FTYPE_UINT_UCHAR:
9208 case UINT16_FTYPE_UINT16_INT:
9209 case UINT8_FTYPE_UINT8_INT:
9210 case UQI_FTYPE_UQI_UQI:
9211 case UHI_FTYPE_UHI_UHI:
9212 case USI_FTYPE_USI_USI:
9213 case UDI_FTYPE_UDI_UDI:
9214 case V16SI_FTYPE_V8DF_V8DF:
9215 case V32HI_FTYPE_V16SF_V16SF:
9216 case V16HI_FTYPE_V8SF_V8SF:
9217 case V8HI_FTYPE_V4SF_V4SF:
9218 case V16HI_FTYPE_V16SF_UHI:
9219 case V8HI_FTYPE_V8SF_UQI:
9220 case V8HI_FTYPE_V4SF_UQI:
9221 nargs = 2;
9222 break;
9223 case V2DI_FTYPE_V2DI_INT_CONVERT:
9224 nargs = 2;
9225 rmode = V1TImode;
9226 nargs_constant = 1;
9227 break;
9228 case V4DI_FTYPE_V4DI_INT_CONVERT:
9229 nargs = 2;
9230 rmode = V2TImode;
9231 nargs_constant = 1;
9232 break;
9233 case V8DI_FTYPE_V8DI_INT_CONVERT:
9234 nargs = 2;
9235 rmode = V4TImode;
9236 nargs_constant = 1;
9237 break;
9238 case V8HI_FTYPE_V8HI_INT:
9239 case V8HI_FTYPE_V8SF_INT:
9240 case V16HI_FTYPE_V16SF_INT:
9241 case V8HI_FTYPE_V4SF_INT:
9242 case V8SF_FTYPE_V8SF_INT:
9243 case V4SF_FTYPE_V16SF_INT:
9244 case V16SF_FTYPE_V16SF_INT:
9245 case V4SI_FTYPE_V4SI_INT:
9246 case V4SI_FTYPE_V8SI_INT:
9247 case V4HI_FTYPE_V4HI_INT:
9248 case V4DF_FTYPE_V4DF_INT:
9249 case V4DF_FTYPE_V8DF_INT:
9250 case V4SF_FTYPE_V4SF_INT:
9251 case V4SF_FTYPE_V8SF_INT:
9252 case V2DI_FTYPE_V2DI_INT:
9253 case V2DF_FTYPE_V2DF_INT:
9254 case V2DF_FTYPE_V4DF_INT:
9255 case V16HI_FTYPE_V16HI_INT:
9256 case V8SI_FTYPE_V8SI_INT:
9257 case V16SI_FTYPE_V16SI_INT:
9258 case V4SI_FTYPE_V16SI_INT:
9259 case V4DI_FTYPE_V4DI_INT:
9260 case V2DI_FTYPE_V4DI_INT:
9261 case V4DI_FTYPE_V8DI_INT:
9262 case UQI_FTYPE_UQI_UQI_CONST:
9263 case UHI_FTYPE_UHI_UQI:
9264 case USI_FTYPE_USI_UQI:
9265 case UDI_FTYPE_UDI_UQI:
9266 nargs = 2;
9267 nargs_constant = 1;
9268 break;
9269 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9270 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9271 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9272 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9273 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9274 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9275 case UHI_FTYPE_V16SI_V16SI_UHI:
9276 case UQI_FTYPE_V8DI_V8DI_UQI:
9277 case V16HI_FTYPE_V16SI_V16HI_UHI:
9278 case V16QI_FTYPE_V16SI_V16QI_UHI:
9279 case V16QI_FTYPE_V8DI_V16QI_UQI:
9280 case V16SF_FTYPE_V16SF_V16SF_UHI:
9281 case V16SF_FTYPE_V4SF_V16SF_UHI:
9282 case V16SI_FTYPE_SI_V16SI_UHI:
9283 case V16SI_FTYPE_V16HI_V16SI_UHI:
9284 case V16SI_FTYPE_V16QI_V16SI_UHI:
9285 case V8SF_FTYPE_V4SF_V8SF_UQI:
9286 case V4DF_FTYPE_V2DF_V4DF_UQI:
9287 case V8SI_FTYPE_V4SI_V8SI_UQI:
9288 case V8SI_FTYPE_SI_V8SI_UQI:
9289 case V4SI_FTYPE_V4SI_V4SI_UQI:
9290 case V4SI_FTYPE_SI_V4SI_UQI:
9291 case V4DI_FTYPE_V2DI_V4DI_UQI:
9292 case V4DI_FTYPE_DI_V4DI_UQI:
9293 case V2DI_FTYPE_V2DI_V2DI_UQI:
9294 case V2DI_FTYPE_DI_V2DI_UQI:
9295 case V64QI_FTYPE_V64QI_V64QI_UDI:
9296 case V64QI_FTYPE_V16QI_V64QI_UDI:
9297 case V64QI_FTYPE_QI_V64QI_UDI:
9298 case V32QI_FTYPE_V32QI_V32QI_USI:
9299 case V32QI_FTYPE_V16QI_V32QI_USI:
9300 case V32QI_FTYPE_QI_V32QI_USI:
9301 case V16QI_FTYPE_V16QI_V16QI_UHI:
9302 case V16QI_FTYPE_QI_V16QI_UHI:
9303 case V32HI_FTYPE_V8HI_V32HI_USI:
9304 case V32HI_FTYPE_HI_V32HI_USI:
9305 case V16HI_FTYPE_V8HI_V16HI_UHI:
9306 case V16HI_FTYPE_HI_V16HI_UHI:
9307 case V8HI_FTYPE_V8HI_V8HI_UQI:
9308 case V8HI_FTYPE_HI_V8HI_UQI:
9309 case V8SF_FTYPE_V8HI_V8SF_UQI:
9310 case V4SF_FTYPE_V8HI_V4SF_UQI:
9311 case V8SI_FTYPE_V8SF_V8SI_UQI:
9312 case V4SI_FTYPE_V4SF_V4SI_UQI:
9313 case V4DI_FTYPE_V4SF_V4DI_UQI:
9314 case V2DI_FTYPE_V4SF_V2DI_UQI:
9315 case V4SF_FTYPE_V4DI_V4SF_UQI:
9316 case V4SF_FTYPE_V2DI_V4SF_UQI:
9317 case V4DF_FTYPE_V4DI_V4DF_UQI:
9318 case V2DF_FTYPE_V2DI_V2DF_UQI:
9319 case V16QI_FTYPE_V8HI_V16QI_UQI:
9320 case V16QI_FTYPE_V16HI_V16QI_UHI:
9321 case V16QI_FTYPE_V4SI_V16QI_UQI:
9322 case V16QI_FTYPE_V8SI_V16QI_UQI:
9323 case V8HI_FTYPE_V4SI_V8HI_UQI:
9324 case V8HI_FTYPE_V8SI_V8HI_UQI:
9325 case V16QI_FTYPE_V2DI_V16QI_UQI:
9326 case V16QI_FTYPE_V4DI_V16QI_UQI:
9327 case V8HI_FTYPE_V2DI_V8HI_UQI:
9328 case V8HI_FTYPE_V4DI_V8HI_UQI:
9329 case V4SI_FTYPE_V2DI_V4SI_UQI:
9330 case V4SI_FTYPE_V4DI_V4SI_UQI:
9331 case V32QI_FTYPE_V32HI_V32QI_USI:
9332 case UHI_FTYPE_V16QI_V16QI_UHI:
9333 case USI_FTYPE_V32QI_V32QI_USI:
9334 case UDI_FTYPE_V64QI_V64QI_UDI:
9335 case UQI_FTYPE_V8HI_V8HI_UQI:
9336 case UHI_FTYPE_V16HI_V16HI_UHI:
9337 case USI_FTYPE_V32HI_V32HI_USI:
9338 case UQI_FTYPE_V4SI_V4SI_UQI:
9339 case UQI_FTYPE_V8SI_V8SI_UQI:
9340 case UQI_FTYPE_V2DI_V2DI_UQI:
9341 case UQI_FTYPE_V4DI_V4DI_UQI:
9342 case V4SF_FTYPE_V2DF_V4SF_UQI:
9343 case V4SF_FTYPE_V4DF_V4SF_UQI:
9344 case V16SI_FTYPE_V16SI_V16SI_UHI:
9345 case V16SI_FTYPE_V4SI_V16SI_UHI:
9346 case V2DI_FTYPE_V4SI_V2DI_UQI:
9347 case V2DI_FTYPE_V8HI_V2DI_UQI:
9348 case V2DI_FTYPE_V16QI_V2DI_UQI:
9349 case V4DI_FTYPE_V4DI_V4DI_UQI:
9350 case V4DI_FTYPE_V4SI_V4DI_UQI:
9351 case V4DI_FTYPE_V8HI_V4DI_UQI:
9352 case V4DI_FTYPE_V16QI_V4DI_UQI:
9353 case V4DI_FTYPE_V4DF_V4DI_UQI:
9354 case V2DI_FTYPE_V2DF_V2DI_UQI:
9355 case V4SI_FTYPE_V4DF_V4SI_UQI:
9356 case V4SI_FTYPE_V2DF_V4SI_UQI:
9357 case V4SI_FTYPE_V8HI_V4SI_UQI:
9358 case V4SI_FTYPE_V16QI_V4SI_UQI:
9359 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9360 case V8DF_FTYPE_V2DF_V8DF_UQI:
9361 case V8DF_FTYPE_V4DF_V8DF_UQI:
9362 case V8DF_FTYPE_V8DF_V8DF_UQI:
9363 case V8SF_FTYPE_V8SF_V8SF_UQI:
9364 case V8SF_FTYPE_V8SI_V8SF_UQI:
9365 case V4DF_FTYPE_V4DF_V4DF_UQI:
9366 case V4SF_FTYPE_V4SF_V4SF_UQI:
9367 case V2DF_FTYPE_V2DF_V2DF_UQI:
9368 case V2DF_FTYPE_V4SF_V2DF_UQI:
9369 case V2DF_FTYPE_V4SI_V2DF_UQI:
9370 case V4SF_FTYPE_V4SI_V4SF_UQI:
9371 case V4DF_FTYPE_V4SF_V4DF_UQI:
9372 case V4DF_FTYPE_V4SI_V4DF_UQI:
9373 case V8SI_FTYPE_V8SI_V8SI_UQI:
9374 case V8SI_FTYPE_V8HI_V8SI_UQI:
9375 case V8SI_FTYPE_V16QI_V8SI_UQI:
9376 case V8DF_FTYPE_V8SI_V8DF_UQI:
9377 case V8DI_FTYPE_DI_V8DI_UQI:
9378 case V16SF_FTYPE_V8SF_V16SF_UHI:
9379 case V16SI_FTYPE_V8SI_V16SI_UHI:
9380 case V16HI_FTYPE_V16HI_V16HI_UHI:
9381 case V8HI_FTYPE_V16QI_V8HI_UQI:
9382 case V16HI_FTYPE_V16QI_V16HI_UHI:
9383 case V32HI_FTYPE_V32HI_V32HI_USI:
9384 case V32HI_FTYPE_V32QI_V32HI_USI:
9385 case V8DI_FTYPE_V16QI_V8DI_UQI:
9386 case V8DI_FTYPE_V2DI_V8DI_UQI:
9387 case V8DI_FTYPE_V4DI_V8DI_UQI:
9388 case V8DI_FTYPE_V8DI_V8DI_UQI:
9389 case V8DI_FTYPE_V8HI_V8DI_UQI:
9390 case V8DI_FTYPE_V8SI_V8DI_UQI:
9391 case V8HI_FTYPE_V8DI_V8HI_UQI:
9392 case V8SI_FTYPE_V8DI_V8SI_UQI:
9393 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9394 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9395 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9396 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9397 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9398 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9399 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9400 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9401 case V32HI_FTYPE_V16SF_V16SF_USI:
9402 case V16HI_FTYPE_V8SF_V8SF_UHI:
9403 case V8HI_FTYPE_V4SF_V4SF_UQI:
9404 case V16HI_FTYPE_V16SF_V16HI_UHI:
9405 case V8HI_FTYPE_V8SF_V8HI_UQI:
9406 case V8HI_FTYPE_V4SF_V8HI_UQI:
9407 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9408 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9409 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9410 nargs = 3;
9411 break;
9412 case V32QI_FTYPE_V32QI_V32QI_INT:
9413 case V16HI_FTYPE_V16HI_V16HI_INT:
9414 case V16QI_FTYPE_V16QI_V16QI_INT:
9415 case V4DI_FTYPE_V4DI_V4DI_INT:
9416 case V8HI_FTYPE_V8HI_V8HI_INT:
9417 case V8SI_FTYPE_V8SI_V8SI_INT:
9418 case V8SI_FTYPE_V8SI_V4SI_INT:
9419 case V8SF_FTYPE_V8SF_V8SF_INT:
9420 case V8SF_FTYPE_V8SF_V4SF_INT:
9421 case V4SI_FTYPE_V4SI_V4SI_INT:
9422 case V4DF_FTYPE_V4DF_V4DF_INT:
9423 case V16SF_FTYPE_V16SF_V16SF_INT:
9424 case V16SF_FTYPE_V16SF_V4SF_INT:
9425 case V16SI_FTYPE_V16SI_V4SI_INT:
9426 case V4DF_FTYPE_V4DF_V2DF_INT:
9427 case V4SF_FTYPE_V4SF_V4SF_INT:
9428 case V2DI_FTYPE_V2DI_V2DI_INT:
9429 case V4DI_FTYPE_V4DI_V2DI_INT:
9430 case V2DF_FTYPE_V2DF_V2DF_INT:
9431 case UQI_FTYPE_V8DI_V8UDI_INT:
9432 case UQI_FTYPE_V8DF_V8DF_INT:
9433 case UQI_FTYPE_V2DF_V2DF_INT:
9434 case UQI_FTYPE_V4SF_V4SF_INT:
9435 case UHI_FTYPE_V16SI_V16SI_INT:
9436 case UHI_FTYPE_V16SF_V16SF_INT:
9437 case V64QI_FTYPE_V64QI_V64QI_INT:
9438 case V32HI_FTYPE_V32HI_V32HI_INT:
9439 case V16SI_FTYPE_V16SI_V16SI_INT:
9440 case V8DI_FTYPE_V8DI_V8DI_INT:
9441 nargs = 3;
9442 nargs_constant = 1;
9443 break;
9444 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9445 nargs = 3;
9446 rmode = V4DImode;
9447 nargs_constant = 1;
9448 break;
9449 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9450 nargs = 3;
9451 rmode = V2DImode;
9452 nargs_constant = 1;
9453 break;
9454 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9455 nargs = 3;
9456 rmode = DImode;
9457 nargs_constant = 1;
9458 break;
9459 case V2DI_FTYPE_V2DI_UINT_UINT:
9460 nargs = 3;
9461 nargs_constant = 2;
9462 break;
9463 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9464 nargs = 3;
9465 rmode = V8DImode;
9466 nargs_constant = 1;
9467 break;
9468 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9469 nargs = 5;
9470 rmode = V8DImode;
9471 mask_pos = 2;
9472 nargs_constant = 1;
9473 break;
9474 case QI_FTYPE_V8DF_INT_UQI:
9475 case QI_FTYPE_V4DF_INT_UQI:
9476 case QI_FTYPE_V2DF_INT_UQI:
9477 case HI_FTYPE_V16SF_INT_UHI:
9478 case QI_FTYPE_V8SF_INT_UQI:
9479 case QI_FTYPE_V4SF_INT_UQI:
9480 case V4SI_FTYPE_V4SI_V4SI_UHI:
9481 case V8SI_FTYPE_V8SI_V8SI_UHI:
9482 nargs = 3;
9483 mask_pos = 1;
9484 nargs_constant = 1;
9485 break;
9486 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9487 nargs = 5;
9488 rmode = V4DImode;
9489 mask_pos = 2;
9490 nargs_constant = 1;
9491 break;
9492 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9493 nargs = 5;
9494 rmode = V2DImode;
9495 mask_pos = 2;
9496 nargs_constant = 1;
9497 break;
9498 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9499 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9500 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9501 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9502 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9503 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9504 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9505 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9506 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9507 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9508 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9509 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9510 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9511 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9512 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9513 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9514 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9515 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9516 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9517 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9518 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9519 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9520 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9521 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9522 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9523 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9524 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9525 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9526 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9527 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9528 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9529 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9530 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9531 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9532 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9533 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9534 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9535 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9536 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9537 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9538 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9539 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9540 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9541 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9542 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9543 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9544 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9545 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9546 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9547 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9548 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9549 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9550 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9551 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9552 nargs = 4;
9553 break;
9554 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9555 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9556 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9557 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9558 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9559 nargs = 4;
9560 nargs_constant = 1;
9561 break;
9562 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9563 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9564 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9565 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9566 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9567 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9568 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9569 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9570 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9571 case USI_FTYPE_V32QI_V32QI_INT_USI:
9572 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9573 case USI_FTYPE_V32HI_V32HI_INT_USI:
9574 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9575 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9576 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
9577 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
9578 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
9579 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
9580 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
9581 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
9582 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
9583 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
9584 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
9585 nargs = 4;
9586 mask_pos = 1;
9587 nargs_constant = 1;
9588 break;
9589 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9590 nargs = 4;
9591 nargs_constant = 2;
9592 break;
9593 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9594 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9595 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9596 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9597 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9598 nargs = 4;
9599 break;
9600 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9601 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9602 mask_pos = 1;
9603 nargs = 4;
9604 nargs_constant = 1;
9605 break;
9606 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9607 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9608 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9609 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9610 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9611 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9612 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9613 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9614 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9615 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9616 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9617 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9618 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9619 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9620 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9621 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9622 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9623 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9624 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9625 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9626 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9627 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9628 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9629 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9630 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9631 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9632 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9633 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9634 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9635 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9636 nargs = 4;
9637 mask_pos = 2;
9638 nargs_constant = 1;
9639 break;
9640 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9641 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9642 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9643 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9644 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9645 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9646 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9647 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9648 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9649 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9650 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9651 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9652 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9653 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9654 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9655 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9656 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9657 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9658 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9659 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9660 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9661 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9662 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9663 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9664 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9665 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9666 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9667 nargs = 5;
9668 mask_pos = 2;
9669 nargs_constant = 1;
9670 break;
9671 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9672 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9673 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9674 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9675 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9676 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9677 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9678 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9679 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9680 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9681 nargs = 5;
9682 mask_pos = 1;
9683 nargs_constant = 1;
9684 break;
9685 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9686 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9687 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9688 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9689 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9690 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9691 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9692 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9693 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9694 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9695 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9696 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9697 nargs = 5;
9698 mask_pos = 1;
9699 nargs_constant = 2;
9700 break;
9701
9702 default:
9703 gcc_unreachable ();
9704 }
9705
9706 gcc_assert (nargs <= ARRAY_SIZE (args));
9707
9708 if (comparison != UNKNOWN)
9709 {
9710 gcc_assert (nargs == 2);
9711 return ix86_expand_sse_compare (d, exp, target, swap);
9712 }
9713
9714 if (rmode == VOIDmode || rmode == tmode)
9715 {
9716 if (optimize
9717 || target == 0
9718 || GET_MODE (target) != tmode
9719 || !insn_p->operand[0].predicate (target, tmode))
9720 target = gen_reg_rtx (tmode);
9721 else if (memory_operand (target, tmode))
9722 num_memory++;
9723 real_target = target;
9724 }
9725 else
9726 {
9727 real_target = gen_reg_rtx (tmode);
9728 target = lowpart_subreg (rmode, real_target, tmode);
9729 }
9730
9731 for (i = 0; i < nargs; i++)
9732 {
9733 tree arg = CALL_EXPR_ARG (exp, i);
9734 rtx op = expand_normal (arg);
9735 machine_mode mode = insn_p->operand[i + 1].mode;
9736 bool match = insn_p->operand[i + 1].predicate (op, mode);
9737
9738 if (second_arg_count && i == 1)
9739 {
9740 /* SIMD shift insns take either an 8-bit immediate or a
9741 register as the count operand, but the builtin functions
9742 take an int. If the count doesn't match, put it in a
9743 register. The instructions use a 64-bit count; if op is
9744 only 32-bit, zero-extend it, since negative shift counts
9745 are undefined behavior and zero-extension is more
9746 efficient. */
9747 if (!match)
9748 {
9749 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9750 op = convert_modes (mode, GET_MODE (op), op, 1);
9751 else
9752 op = lowpart_subreg (mode, op, GET_MODE (op));
9753 if (!insn_p->operand[i + 1].predicate (op, mode))
9754 op = copy_to_reg (op);
9755 }
9756 }
9757 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9758 || (!mask_pos && (nargs - i) <= nargs_constant))
9759 {
9760 if (!match)
9761 switch (icode)
9762 {
9763 case CODE_FOR_avx_vinsertf128v4di:
9764 case CODE_FOR_avx_vextractf128v4di:
9765 error ("the last argument must be a 1-bit immediate");
9766 return const0_rtx;
9767
9768 case CODE_FOR_avx512f_cmpv8di3_mask:
9769 case CODE_FOR_avx512f_cmpv16si3_mask:
9770 case CODE_FOR_avx512f_ucmpv8di3_mask:
9771 case CODE_FOR_avx512f_ucmpv16si3_mask:
9772 case CODE_FOR_avx512vl_cmpv4di3_mask:
9773 case CODE_FOR_avx512vl_cmpv8si3_mask:
9774 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9775 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9776 case CODE_FOR_avx512vl_cmpv2di3_mask:
9777 case CODE_FOR_avx512vl_cmpv4si3_mask:
9778 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9779 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9780 error ("the last argument must be a 3-bit immediate");
9781 return const0_rtx;
9782
9783 case CODE_FOR_sse4_1_roundsd:
9784 case CODE_FOR_sse4_1_roundss:
9785
9786 case CODE_FOR_sse4_1_roundpd:
9787 case CODE_FOR_sse4_1_roundps:
9788 case CODE_FOR_avx_roundpd256:
9789 case CODE_FOR_avx_roundps256:
9790
9791 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9792 case CODE_FOR_sse4_1_roundps_sfix:
9793 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9794 case CODE_FOR_avx_roundps_sfix256:
9795
9796 case CODE_FOR_sse4_1_blendps:
9797 case CODE_FOR_avx_blendpd256:
9798 case CODE_FOR_avx_vpermilv4df:
9799 case CODE_FOR_avx_vpermilv4df_mask:
9800 case CODE_FOR_avx512f_getmantv8df_mask:
9801 case CODE_FOR_avx512f_getmantv16sf_mask:
9802 case CODE_FOR_avx512vl_getmantv8sf_mask:
9803 case CODE_FOR_avx512vl_getmantv4df_mask:
9804 case CODE_FOR_avx512vl_getmantv4sf_mask:
9805 case CODE_FOR_avx512vl_getmantv2df_mask:
9806 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9807 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9808 case CODE_FOR_avx512dq_rangepv4df_mask:
9809 case CODE_FOR_avx512dq_rangepv8sf_mask:
9810 case CODE_FOR_avx512dq_rangepv2df_mask:
9811 case CODE_FOR_avx512dq_rangepv4sf_mask:
9812 case CODE_FOR_avx_shufpd256_mask:
9813 error ("the last argument must be a 4-bit immediate");
9814 return const0_rtx;
9815
9816 case CODE_FOR_sha1rnds4:
9817 case CODE_FOR_sse4_1_blendpd:
9818 case CODE_FOR_avx_vpermilv2df:
9819 case CODE_FOR_avx_vpermilv2df_mask:
9820 case CODE_FOR_xop_vpermil2v2df3:
9821 case CODE_FOR_xop_vpermil2v4sf3:
9822 case CODE_FOR_xop_vpermil2v4df3:
9823 case CODE_FOR_xop_vpermil2v8sf3:
9824 case CODE_FOR_avx512f_vinsertf32x4_mask:
9825 case CODE_FOR_avx512f_vinserti32x4_mask:
9826 case CODE_FOR_avx512f_vextractf32x4_mask:
9827 case CODE_FOR_avx512f_vextracti32x4_mask:
9828 case CODE_FOR_sse2_shufpd:
9829 case CODE_FOR_sse2_shufpd_mask:
9830 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9831 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9832 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9833 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9834 error ("the last argument must be a 2-bit immediate");
9835 return const0_rtx;
9836
9837 case CODE_FOR_avx_vextractf128v4df:
9838 case CODE_FOR_avx_vextractf128v8sf:
9839 case CODE_FOR_avx_vextractf128v8si:
9840 case CODE_FOR_avx_vinsertf128v4df:
9841 case CODE_FOR_avx_vinsertf128v8sf:
9842 case CODE_FOR_avx_vinsertf128v8si:
9843 case CODE_FOR_avx512f_vinsertf64x4_mask:
9844 case CODE_FOR_avx512f_vinserti64x4_mask:
9845 case CODE_FOR_avx512f_vextractf64x4_mask:
9846 case CODE_FOR_avx512f_vextracti64x4_mask:
9847 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9848 case CODE_FOR_avx512dq_vinserti32x8_mask:
9849 case CODE_FOR_avx512vl_vinsertv4df:
9850 case CODE_FOR_avx512vl_vinsertv4di:
9851 case CODE_FOR_avx512vl_vinsertv8sf:
9852 case CODE_FOR_avx512vl_vinsertv8si:
9853 error ("the last argument must be a 1-bit immediate");
9854 return const0_rtx;
9855
9856 case CODE_FOR_avx_vmcmpv2df3:
9857 case CODE_FOR_avx_vmcmpv4sf3:
9858 case CODE_FOR_avx_cmpv2df3:
9859 case CODE_FOR_avx_cmpv4sf3:
9860 case CODE_FOR_avx_cmpv4df3:
9861 case CODE_FOR_avx_cmpv8sf3:
9862 case CODE_FOR_avx512f_cmpv8df3_mask:
9863 case CODE_FOR_avx512f_cmpv16sf3_mask:
9864 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9865 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9866 error ("the last argument must be a 5-bit immediate");
9867 return const0_rtx;
9868
9869 default:
9870 switch (nargs_constant)
9871 {
9872 case 2:
9873 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9874 || (!mask_pos && (nargs - i) == nargs_constant))
9875 {
9876 error ("the next to last argument must be an 8-bit immediate");
9877 break;
9878 }
9879 /* FALLTHRU */
9880 case 1:
9881 error ("the last argument must be an 8-bit immediate");
9882 break;
9883 default:
9884 gcc_unreachable ();
9885 }
9886 return const0_rtx;
9887 }
9888 }
9889 else
9890 {
9891 if (VECTOR_MODE_P (mode))
9892 op = safe_vector_operand (op, mode);
9893
9894 /* If we aren't optimizing, only allow one memory operand to
9895 be generated. */
9896 if (memory_operand (op, mode))
9897 num_memory++;
9898
9899 op = fixup_modeless_constant (op, mode);
9900
9901 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9902 {
9903 if (optimize || !match || num_memory > 1)
9904 op = copy_to_mode_reg (mode, op);
9905 }
9906 else
9907 {
9908 op = copy_to_reg (op);
9909 op = lowpart_subreg (mode, op, GET_MODE (op));
9910 }
9911 }
9912
9913 args[i].op = op;
9914 args[i].mode = mode;
9915 }
9916
9917 switch (nargs)
9918 {
9919 case 1:
9920 pat = GEN_FCN (icode) (real_target, args[0].op);
9921 break;
9922 case 2:
9923 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9924 break;
9925 case 3:
9926 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9927 args[2].op);
9928 break;
9929 case 4:
9930 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9931 args[2].op, args[3].op);
9932 break;
9933 case 5:
9934 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9935 args[2].op, args[3].op, args[4].op);
9936 break;
9937 case 6:
9938 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9939 args[2].op, args[3].op, args[4].op,
9940 args[5].op);
9941 break;
9942 default:
9943 gcc_unreachable ();
9944 }
9945
9946 if (! pat)
9947 return 0;
9948
9949 emit_insn (pat);
9950 return target;
9951 }
9952
9953 /* Transform a pattern of the following layout:
9954 (set A
9955 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9956 into:
9957 (set A B)
9958 i.e. strip the UNSPEC_EMBEDDED_ROUNDING wrapper from SET_SRC. */
9959
9960 static rtx
9961 ix86_erase_embedded_rounding (rtx pat)
9962 {
9963 if (GET_CODE (pat) == INSN)
9964 pat = PATTERN (pat);
9965
9966 gcc_assert (GET_CODE (pat) == SET);
9967 rtx src = SET_SRC (pat);
9968 gcc_assert (XVECLEN (src, 0) == 2);
9969 rtx p0 = XVECEXP (src, 0, 0);
9970 gcc_assert (GET_CODE (src) == UNSPEC
9971 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9972 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9973 return res;
9974 }
9975
9976 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9977 with rounding. */
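/* Illustrative example, assuming the usual avx512fintrin.h wrapper: a call
   such as

     int r = _mm_comi_round_ss (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);

   reaches this routine; the predicate tables below pick COMI vs. UCOMI and
   the flags test to emit.  */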
9978 static rtx
9979 ix86_expand_sse_comi_round (const struct builtin_description *d,
9980 tree exp, rtx target)
9981 {
9982 rtx pat, set_dst;
9983 tree arg0 = CALL_EXPR_ARG (exp, 0);
9984 tree arg1 = CALL_EXPR_ARG (exp, 1);
9985 tree arg2 = CALL_EXPR_ARG (exp, 2);
9986 tree arg3 = CALL_EXPR_ARG (exp, 3);
9987 rtx op0 = expand_normal (arg0);
9988 rtx op1 = expand_normal (arg1);
9989 rtx op2 = expand_normal (arg2);
9990 rtx op3 = expand_normal (arg3);
9991 enum insn_code icode = d->icode;
9992 const struct insn_data_d *insn_p = &insn_data[icode];
9993 machine_mode mode0 = insn_p->operand[0].mode;
9994 machine_mode mode1 = insn_p->operand[1].mode;
9995
9996 /* See avxintrin.h for values. */
9997 static const enum rtx_code comparisons[32] =
9998 {
9999 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10000 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10001 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10002 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
10003 };
10004 static const bool ordereds[32] =
10005 {
10006 true, true, true, false, false, false, false, true,
10007 false, false, false, true, true, true, true, false,
10008 true, true, true, false, false, false, false, true,
10009 false, false, false, true, true, true, true, false
10010 };
10011 static const bool non_signalings[32] =
10012 {
10013 true, false, false, true, true, false, false, true,
10014 true, false, false, true, true, false, false, true,
10015 false, true, true, false, false, true, true, false,
10016 false, true, true, false, false, true, true, false
10017 };
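/* For reference, assuming the predicate encodings from avxintrin.h:
   entry 0 corresponds to _CMP_EQ_OQ (EQ, ordered, non-signaling),
   entry 1 to _CMP_LT_OS (LT, ordered, signaling) and entry 3 to
   _CMP_UNORD_Q (UNORDERED, non-signaling).  */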
10018
10019 if (!CONST_INT_P (op2))
10020 {
10021 error ("the third argument must be a comparison constant");
10022 return const0_rtx;
10023 }
10024 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10025 {
10026 error ("incorrect comparison mode");
10027 return const0_rtx;
10028 }
10029
10030 if (!insn_p->operand[2].predicate (op3, SImode))
10031 {
10032 error ("incorrect rounding operand");
10033 return const0_rtx;
10034 }
10035
10036 if (VECTOR_MODE_P (mode0))
10037 op0 = safe_vector_operand (op0, mode0);
10038 if (VECTOR_MODE_P (mode1))
10039 op1 = safe_vector_operand (op1, mode1);
10040
10041 enum rtx_code comparison = comparisons[INTVAL (op2)];
10042 bool ordered = ordereds[INTVAL (op2)];
10043 bool non_signaling = non_signalings[INTVAL (op2)];
10044 rtx const_val = const0_rtx;
10045
10046 bool check_unordered = false;
10047 machine_mode mode = CCFPmode;
10048 switch (comparison)
10049 {
10050 case ORDERED:
10051 if (!ordered)
10052 {
10053 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10054 if (!non_signaling)
10055 ordered = true;
10056 mode = CCSmode;
10057 }
10058 else
10059 {
10060 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10061 if (non_signaling)
10062 ordered = false;
10063 mode = CCPmode;
10064 }
10065 comparison = NE;
10066 break;
10067 case UNORDERED:
10068 if (ordered)
10069 {
10070 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10071 if (non_signaling)
10072 ordered = false;
10073 mode = CCSmode;
10074 }
10075 else
10076 {
10077 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10078 if (!non_signaling)
10079 ordered = true;
10080 mode = CCPmode;
10081 }
10082 comparison = EQ;
10083 break;
10084
10085 case LE: /* -> GE */
10086 case LT: /* -> GT */
10087 case UNGE: /* -> UNLE */
10088 case UNGT: /* -> UNLT */
10089 std::swap (op0, op1);
10090 comparison = swap_condition (comparison);
10091 /* FALLTHRU */
10092 case GT:
10093 case GE:
10094 case UNEQ:
10095 case UNLT:
10096 case UNLE:
10097 case LTGT:
10098 /* These are supported by CCFPmode. NB: Use ordered/signaling
10099 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10100 with NAN operands. */
10101 if (ordered == non_signaling)
10102 ordered = !ordered;
10103 break;
10104 case EQ:
10105 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10106 _CMP_EQ_OQ/_CMP_EQ_OS. */
10107 check_unordered = true;
10108 mode = CCZmode;
10109 break;
10110 case NE:
10111 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10112 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10113 gcc_assert (!ordered);
10114 check_unordered = true;
10115 mode = CCZmode;
10116 const_val = const1_rtx;
10117 break;
10118 default:
10119 gcc_unreachable ();
10120 }
10121
10122 target = gen_reg_rtx (SImode);
10123 emit_move_insn (target, const_val);
10124 target = gen_rtx_SUBREG (QImode, target, 0);
10125
10126 if ((optimize && !register_operand (op0, mode0))
10127 || !insn_p->operand[0].predicate (op0, mode0))
10128 op0 = copy_to_mode_reg (mode0, op0);
10129 if ((optimize && !register_operand (op1, mode1))
10130 || !insn_p->operand[1].predicate (op1, mode1))
10131 op1 = copy_to_mode_reg (mode1, op1);
10132
10133 /*
10134 1. COMI: ordered and signaling.
10135 2. UCOMI: unordered and non-signaling.
10136 */
10137 if (non_signaling)
10138 icode = (icode == CODE_FOR_sse_comi_round
10139 ? CODE_FOR_sse_ucomi_round
10140 : CODE_FOR_sse2_ucomi_round);
10141
10142 pat = GEN_FCN (icode) (op0, op1, op3);
10143 if (! pat)
10144 return 0;
10145
10146 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10147 if (INTVAL (op3) == NO_ROUND)
10148 {
10149 pat = ix86_erase_embedded_rounding (pat);
10150 if (! pat)
10151 return 0;
10152
10153 set_dst = SET_DEST (pat);
10154 }
10155 else
10156 {
10157 gcc_assert (GET_CODE (pat) == SET);
10158 set_dst = SET_DEST (pat);
10159 }
10160
10161 emit_insn (pat);
10162
10163 rtx_code_label *label = NULL;
10164
10165 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
10166 with NAN operands. */
10167 if (check_unordered)
10168 {
10169 gcc_assert (comparison == EQ || comparison == NE);
10170
10171 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10172 label = gen_label_rtx ();
10173 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10174 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10175 gen_rtx_LABEL_REF (VOIDmode, label),
10176 pc_rtx);
10177 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10178 }
10179
10180 /* NB: Set CCFPmode and check a different CCmode which is a subset
10181 of CCFPmode. */
10182 if (GET_MODE (set_dst) != mode)
10183 {
10184 gcc_assert (mode == CCAmode || mode == CCCmode
10185 || mode == CCOmode || mode == CCPmode
10186 || mode == CCSmode || mode == CCZmode);
10187 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10188 }
10189
10190 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10191 gen_rtx_fmt_ee (comparison, QImode,
10192 set_dst,
10193 const0_rtx)));
10194
10195 if (label)
10196 emit_label (label);
10197
10198 return SUBREG_REG (target);
10199 }
10200
10201 static rtx
10202 ix86_expand_round_builtin (const struct builtin_description *d,
10203 tree exp, rtx target)
10204 {
10205 rtx pat;
10206 unsigned int i, nargs;
10207 struct
10208 {
10209 rtx op;
10210 machine_mode mode;
10211 } args[6];
10212 enum insn_code icode = d->icode;
10213 const struct insn_data_d *insn_p = &insn_data[icode];
10214 machine_mode tmode = insn_p->operand[0].mode;
10215 unsigned int nargs_constant = 0;
10216 unsigned int redundant_embed_rnd = 0;
10217
10218 switch ((enum ix86_builtin_func_type) d->flag)
10219 {
10220 case UINT64_FTYPE_V2DF_INT:
10221 case UINT64_FTYPE_V4SF_INT:
10222 case UINT_FTYPE_V2DF_INT:
10223 case UINT_FTYPE_V4SF_INT:
10224 case INT64_FTYPE_V2DF_INT:
10225 case INT64_FTYPE_V4SF_INT:
10226 case INT_FTYPE_V2DF_INT:
10227 case INT_FTYPE_V4SF_INT:
10228 nargs = 2;
10229 break;
10230 case V4SF_FTYPE_V4SF_UINT_INT:
10231 case V4SF_FTYPE_V4SF_UINT64_INT:
10232 case V2DF_FTYPE_V2DF_UINT64_INT:
10233 case V4SF_FTYPE_V4SF_INT_INT:
10234 case V4SF_FTYPE_V4SF_INT64_INT:
10235 case V2DF_FTYPE_V2DF_INT64_INT:
10236 case V4SF_FTYPE_V4SF_V4SF_INT:
10237 case V2DF_FTYPE_V2DF_V2DF_INT:
10238 case V4SF_FTYPE_V4SF_V2DF_INT:
10239 case V2DF_FTYPE_V2DF_V4SF_INT:
10240 nargs = 3;
10241 break;
10242 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10243 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10244 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10245 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10246 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10247 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10248 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10249 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10250 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10251 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10252 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10253 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10254 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10255 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10256 nargs = 4;
10257 break;
10258 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10259 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10260 nargs_constant = 2;
10261 nargs = 4;
10262 break;
10263 case INT_FTYPE_V4SF_V4SF_INT_INT:
10264 case INT_FTYPE_V2DF_V2DF_INT_INT:
10265 return ix86_expand_sse_comi_round (d, exp, target);
10266 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10267 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10268 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10269 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10270 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10271 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10272 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10273 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10274 nargs = 5;
10275 break;
10276 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10277 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10278 nargs_constant = 4;
10279 nargs = 5;
10280 break;
10281 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10282 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10283 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10284 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10285 nargs_constant = 3;
10286 nargs = 5;
10287 break;
10288 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10289 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10290 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10291 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10292 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10293 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10294 nargs = 6;
10295 nargs_constant = 4;
10296 break;
10297 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10298 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10299 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10300 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10301 nargs = 6;
10302 nargs_constant = 3;
10303 break;
10304 default:
10305 gcc_unreachable ();
10306 }
10307 gcc_assert (nargs <= ARRAY_SIZE (args));
10308
10309 if (optimize
10310 || target == 0
10311 || GET_MODE (target) != tmode
10312 || !insn_p->operand[0].predicate (target, tmode))
10313 target = gen_reg_rtx (tmode);
10314
10315 for (i = 0; i < nargs; i++)
10316 {
10317 tree arg = CALL_EXPR_ARG (exp, i);
10318 rtx op = expand_normal (arg);
10319 machine_mode mode = insn_p->operand[i + 1].mode;
10320 bool match = insn_p->operand[i + 1].predicate (op, mode);
10321
10322 if (i == nargs - nargs_constant)
10323 {
10324 if (!match)
10325 {
10326 switch (icode)
10327 {
10328 case CODE_FOR_avx512f_getmantv8df_mask_round:
10329 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10330 case CODE_FOR_avx512f_vgetmantv2df_round:
10331 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10332 case CODE_FOR_avx512f_vgetmantv4sf_round:
10333 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10334 error ("the immediate argument must be a 4-bit immediate");
10335 return const0_rtx;
10336 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10337 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10338 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10339 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10340 error ("the immediate argument must be a 5-bit immediate");
10341 return const0_rtx;
10342 default:
10343 error ("the immediate argument must be an 8-bit immediate");
10344 return const0_rtx;
10345 }
10346 }
10347 }
10348 else if (i == nargs - 1)
10349 {
10350 if (!insn_p->operand[nargs].predicate (op, SImode))
10351 {
10352 error ("incorrect rounding operand");
10353 return const0_rtx;
10354 }
10355
10356 /* If there is no rounding, use the normal version of the pattern. */
10357 if (INTVAL (op) == NO_ROUND)
10358 redundant_embed_rnd = 1;
10359 }
10360 else
10361 {
10362 if (VECTOR_MODE_P (mode))
10363 op = safe_vector_operand (op, mode);
10364
10365 op = fixup_modeless_constant (op, mode);
10366
10367 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10368 {
10369 if (optimize || !match)
10370 op = copy_to_mode_reg (mode, op);
10371 }
10372 else
10373 {
10374 op = copy_to_reg (op);
10375 op = lowpart_subreg (mode, op, GET_MODE (op));
10376 }
10377 }
10378
10379 args[i].op = op;
10380 args[i].mode = mode;
10381 }
10382
10383 switch (nargs)
10384 {
10385 case 1:
10386 pat = GEN_FCN (icode) (target, args[0].op);
10387 break;
10388 case 2:
10389 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10390 break;
10391 case 3:
10392 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10393 args[2].op);
10394 break;
10395 case 4:
10396 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10397 args[2].op, args[3].op);
10398 break;
10399 case 5:
10400 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10401 args[2].op, args[3].op, args[4].op);
10402 break;
10403 case 6:
10404 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10405 args[2].op, args[3].op, args[4].op,
10406 args[5].op);
10407 break;
10408 default:
10409 gcc_unreachable ();
10410 }
10411
10412 if (!pat)
10413 return 0;
10414
10415 if (redundant_embed_rnd)
10416 pat = ix86_erase_embedded_rounding (pat);
10417
10418 emit_insn (pat);
10419 return target;
10420 }
10421
10422 /* Subroutine of ix86_expand_builtin to take care of special insns
10423 with variable number of operands. */
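/* Illustrative example, assuming the usual avxintrin.h wrapper: the
   "special" builtins handled here include the non-temporal stores, e.g.

     _mm256_stream_si256 ((__m256i *) p, v);

   which expands __builtin_ia32_movntdq256 and takes the aligned_mem path
   below so the generated MEM keeps the strict mode alignment.  */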
10424
10425 static rtx
10426 ix86_expand_special_args_builtin (const struct builtin_description *d,
10427 tree exp, rtx target)
10428 {
10429 tree arg;
10430 rtx pat, op;
10431 unsigned int i, nargs, arg_adjust, memory;
10432 bool aligned_mem = false;
10433 struct
10434 {
10435 rtx op;
10436 machine_mode mode;
10437 } args[3];
10438 enum insn_code icode = d->icode;
10439 bool last_arg_constant = false;
10440 const struct insn_data_d *insn_p = &insn_data[icode];
10441 machine_mode tmode = insn_p->operand[0].mode;
10442 enum { load, store } klass;
10443
10444 switch ((enum ix86_builtin_func_type) d->flag)
10445 {
10446 case VOID_FTYPE_VOID:
10447 emit_insn (GEN_FCN (icode) (target));
10448 return 0;
10449 case VOID_FTYPE_UINT64:
10450 case VOID_FTYPE_UNSIGNED:
10451 nargs = 0;
10452 klass = store;
10453 memory = 0;
10454 break;
10455
10456 case INT_FTYPE_VOID:
10457 case USHORT_FTYPE_VOID:
10458 case UINT64_FTYPE_VOID:
10459 case UINT_FTYPE_VOID:
10460 case UNSIGNED_FTYPE_VOID:
10461 nargs = 0;
10462 klass = load;
10463 memory = 0;
10464 break;
10465 case UINT64_FTYPE_PUNSIGNED:
10466 case V2DI_FTYPE_PV2DI:
10467 case V4DI_FTYPE_PV4DI:
10468 case V32QI_FTYPE_PCCHAR:
10469 case V16QI_FTYPE_PCCHAR:
10470 case V8SF_FTYPE_PCV4SF:
10471 case V8SF_FTYPE_PCFLOAT:
10472 case V4SF_FTYPE_PCFLOAT:
10473 case V4DF_FTYPE_PCV2DF:
10474 case V4DF_FTYPE_PCDOUBLE:
10475 case V2DF_FTYPE_PCDOUBLE:
10476 case VOID_FTYPE_PVOID:
10477 case V8DI_FTYPE_PV8DI:
10478 nargs = 1;
10479 klass = load;
10480 memory = 0;
10481 switch (icode)
10482 {
10483 case CODE_FOR_sse4_1_movntdqa:
10484 case CODE_FOR_avx2_movntdqa:
10485 case CODE_FOR_avx512f_movntdqa:
10486 aligned_mem = true;
10487 break;
10488 default:
10489 break;
10490 }
10491 break;
10492 case VOID_FTYPE_PV2SF_V4SF:
10493 case VOID_FTYPE_PV8DI_V8DI:
10494 case VOID_FTYPE_PV4DI_V4DI:
10495 case VOID_FTYPE_PV2DI_V2DI:
10496 case VOID_FTYPE_PCHAR_V32QI:
10497 case VOID_FTYPE_PCHAR_V16QI:
10498 case VOID_FTYPE_PFLOAT_V16SF:
10499 case VOID_FTYPE_PFLOAT_V8SF:
10500 case VOID_FTYPE_PFLOAT_V4SF:
10501 case VOID_FTYPE_PDOUBLE_V8DF:
10502 case VOID_FTYPE_PDOUBLE_V4DF:
10503 case VOID_FTYPE_PDOUBLE_V2DF:
10504 case VOID_FTYPE_PLONGLONG_LONGLONG:
10505 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10506 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10507 case VOID_FTYPE_PINT_INT:
10508 nargs = 1;
10509 klass = store;
10510 /* Reserve memory operand for target. */
10511 memory = ARRAY_SIZE (args);
10512 switch (icode)
10513 {
10514 /* These builtins and instructions require the memory
10515 to be properly aligned. */
10516 case CODE_FOR_avx_movntv4di:
10517 case CODE_FOR_sse2_movntv2di:
10518 case CODE_FOR_avx_movntv8sf:
10519 case CODE_FOR_sse_movntv4sf:
10520 case CODE_FOR_sse4a_vmmovntv4sf:
10521 case CODE_FOR_avx_movntv4df:
10522 case CODE_FOR_sse2_movntv2df:
10523 case CODE_FOR_sse4a_vmmovntv2df:
10524 case CODE_FOR_sse2_movntidi:
10525 case CODE_FOR_sse_movntq:
10526 case CODE_FOR_sse2_movntisi:
10527 case CODE_FOR_avx512f_movntv16sf:
10528 case CODE_FOR_avx512f_movntv8df:
10529 case CODE_FOR_avx512f_movntv8di:
10530 aligned_mem = true;
10531 break;
10532 default:
10533 break;
10534 }
10535 break;
10536 case VOID_FTYPE_PVOID_PCVOID:
10537 nargs = 1;
10538 klass = store;
10539 memory = 0;
10540
10541 break;
10542 case V4SF_FTYPE_V4SF_PCV2SF:
10543 case V2DF_FTYPE_V2DF_PCDOUBLE:
10544 nargs = 2;
10545 klass = load;
10546 memory = 1;
10547 break;
10548 case V8SF_FTYPE_PCV8SF_V8SI:
10549 case V4DF_FTYPE_PCV4DF_V4DI:
10550 case V4SF_FTYPE_PCV4SF_V4SI:
10551 case V2DF_FTYPE_PCV2DF_V2DI:
10552 case V8SI_FTYPE_PCV8SI_V8SI:
10553 case V4DI_FTYPE_PCV4DI_V4DI:
10554 case V4SI_FTYPE_PCV4SI_V4SI:
10555 case V2DI_FTYPE_PCV2DI_V2DI:
10556 case VOID_FTYPE_INT_INT64:
10557 nargs = 2;
10558 klass = load;
10559 memory = 0;
10560 break;
10561 case VOID_FTYPE_PV8DF_V8DF_UQI:
10562 case VOID_FTYPE_PV4DF_V4DF_UQI:
10563 case VOID_FTYPE_PV2DF_V2DF_UQI:
10564 case VOID_FTYPE_PV16SF_V16SF_UHI:
10565 case VOID_FTYPE_PV8SF_V8SF_UQI:
10566 case VOID_FTYPE_PV4SF_V4SF_UQI:
10567 case VOID_FTYPE_PV8DI_V8DI_UQI:
10568 case VOID_FTYPE_PV4DI_V4DI_UQI:
10569 case VOID_FTYPE_PV2DI_V2DI_UQI:
10570 case VOID_FTYPE_PV16SI_V16SI_UHI:
10571 case VOID_FTYPE_PV8SI_V8SI_UQI:
10572 case VOID_FTYPE_PV4SI_V4SI_UQI:
10573 case VOID_FTYPE_PV64QI_V64QI_UDI:
10574 case VOID_FTYPE_PV32HI_V32HI_USI:
10575 case VOID_FTYPE_PV32QI_V32QI_USI:
10576 case VOID_FTYPE_PV16QI_V16QI_UHI:
10577 case VOID_FTYPE_PV16HI_V16HI_UHI:
10578 case VOID_FTYPE_PV8HI_V8HI_UQI:
10579 switch (icode)
10580 {
10581 /* These builtins and instructions require the memory
10582 to be properly aligned. */
10583 case CODE_FOR_avx512f_storev16sf_mask:
10584 case CODE_FOR_avx512f_storev16si_mask:
10585 case CODE_FOR_avx512f_storev8df_mask:
10586 case CODE_FOR_avx512f_storev8di_mask:
10587 case CODE_FOR_avx512vl_storev8sf_mask:
10588 case CODE_FOR_avx512vl_storev8si_mask:
10589 case CODE_FOR_avx512vl_storev4df_mask:
10590 case CODE_FOR_avx512vl_storev4di_mask:
10591 case CODE_FOR_avx512vl_storev4sf_mask:
10592 case CODE_FOR_avx512vl_storev4si_mask:
10593 case CODE_FOR_avx512vl_storev2df_mask:
10594 case CODE_FOR_avx512vl_storev2di_mask:
10595 aligned_mem = true;
10596 break;
10597 default:
10598 break;
10599 }
10600 /* FALLTHRU */
10601 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10602 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10603 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10604 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10605 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10606 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10607 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10608 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10609 case VOID_FTYPE_PV8SI_V8DI_UQI:
10610 case VOID_FTYPE_PV8HI_V8DI_UQI:
10611 case VOID_FTYPE_PV16HI_V16SI_UHI:
10612 case VOID_FTYPE_PV16QI_V8DI_UQI:
10613 case VOID_FTYPE_PV16QI_V16SI_UHI:
10614 case VOID_FTYPE_PV4SI_V4DI_UQI:
10615 case VOID_FTYPE_PV4SI_V2DI_UQI:
10616 case VOID_FTYPE_PV8HI_V4DI_UQI:
10617 case VOID_FTYPE_PV8HI_V2DI_UQI:
10618 case VOID_FTYPE_PV8HI_V8SI_UQI:
10619 case VOID_FTYPE_PV8HI_V4SI_UQI:
10620 case VOID_FTYPE_PV16QI_V4DI_UQI:
10621 case VOID_FTYPE_PV16QI_V2DI_UQI:
10622 case VOID_FTYPE_PV16QI_V8SI_UQI:
10623 case VOID_FTYPE_PV16QI_V4SI_UQI:
10624 case VOID_FTYPE_PCHAR_V64QI_UDI:
10625 case VOID_FTYPE_PCHAR_V32QI_USI:
10626 case VOID_FTYPE_PCHAR_V16QI_UHI:
10627 case VOID_FTYPE_PSHORT_V32HI_USI:
10628 case VOID_FTYPE_PSHORT_V16HI_UHI:
10629 case VOID_FTYPE_PSHORT_V8HI_UQI:
10630 case VOID_FTYPE_PINT_V16SI_UHI:
10631 case VOID_FTYPE_PINT_V8SI_UQI:
10632 case VOID_FTYPE_PINT_V4SI_UQI:
10633 case VOID_FTYPE_PINT64_V8DI_UQI:
10634 case VOID_FTYPE_PINT64_V4DI_UQI:
10635 case VOID_FTYPE_PINT64_V2DI_UQI:
10636 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10637 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10638 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10639 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10640 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10641 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10642 case VOID_FTYPE_PV32QI_V32HI_USI:
10643 case VOID_FTYPE_PV16QI_V16HI_UHI:
10644 case VOID_FTYPE_PV8QI_V8HI_UQI:
10645 nargs = 2;
10646 klass = store;
10647 /* Reserve memory operand for target. */
10648 memory = ARRAY_SIZE (args);
10649 break;
10650 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10651 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10652 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10653 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10654 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10655 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10656 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10657 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10658 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10659 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10660 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10661 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10662 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10663 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10664 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10665 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10666 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10667 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10668 switch (icode)
10669 {
10670 /* These builtins and instructions require the memory
10671 to be properly aligned. */
10672 case CODE_FOR_avx512f_loadv16sf_mask:
10673 case CODE_FOR_avx512f_loadv16si_mask:
10674 case CODE_FOR_avx512f_loadv8df_mask:
10675 case CODE_FOR_avx512f_loadv8di_mask:
10676 case CODE_FOR_avx512vl_loadv8sf_mask:
10677 case CODE_FOR_avx512vl_loadv8si_mask:
10678 case CODE_FOR_avx512vl_loadv4df_mask:
10679 case CODE_FOR_avx512vl_loadv4di_mask:
10680 case CODE_FOR_avx512vl_loadv4sf_mask:
10681 case CODE_FOR_avx512vl_loadv4si_mask:
10682 case CODE_FOR_avx512vl_loadv2df_mask:
10683 case CODE_FOR_avx512vl_loadv2di_mask:
10684 case CODE_FOR_avx512bw_loadv64qi_mask:
10685 case CODE_FOR_avx512vl_loadv32qi_mask:
10686 case CODE_FOR_avx512vl_loadv16qi_mask:
10687 case CODE_FOR_avx512bw_loadv32hi_mask:
10688 case CODE_FOR_avx512vl_loadv16hi_mask:
10689 case CODE_FOR_avx512vl_loadv8hi_mask:
10690 aligned_mem = true;
10691 break;
10692 default:
10693 break;
10694 }
10695 /* FALLTHRU */
10696 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10697 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10698 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10699 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10700 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10701 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10702 case V16SI_FTYPE_PCINT_V16SI_UHI:
10703 case V8SI_FTYPE_PCINT_V8SI_UQI:
10704 case V4SI_FTYPE_PCINT_V4SI_UQI:
10705 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10706 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10707 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10708 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10709 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10710 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10711 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10712 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10713 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10714 nargs = 3;
10715 klass = load;
10716 memory = 0;
10717 break;
10718 case VOID_FTYPE_UINT_UINT_UINT:
10719 case VOID_FTYPE_UINT64_UINT_UINT:
10720 case UCHAR_FTYPE_UINT_UINT_UINT:
10721 case UCHAR_FTYPE_UINT64_UINT_UINT:
10722 nargs = 3;
10723 klass = load;
10724 memory = ARRAY_SIZE (args);
10725 last_arg_constant = true;
10726 break;
10727 default:
10728 gcc_unreachable ();
10729 }
10730
10731 gcc_assert (nargs <= ARRAY_SIZE (args));
10732
10733 if (klass == store)
10734 {
10735 arg = CALL_EXPR_ARG (exp, 0);
10736 op = expand_normal (arg);
10737 gcc_assert (target == 0);
10738 if (memory)
10739 {
10740 op = ix86_zero_extend_to_Pmode (op);
10741 target = gen_rtx_MEM (tmode, op);
10742 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10743 on it. Try to improve it using get_pointer_alignment,
10744 and if the special builtin is one that requires strict
10745 mode alignment, also from its GET_MODE_ALIGNMENT.
10746 Failure to do so could lead to ix86_legitimate_combined_insn
10747 rejecting all changes to such insns. */
10748 unsigned int align = get_pointer_alignment (arg);
10749 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10750 align = GET_MODE_ALIGNMENT (tmode);
10751 if (MEM_ALIGN (target) < align)
10752 set_mem_align (target, align);
10753 }
10754 else
10755 target = force_reg (tmode, op);
10756 arg_adjust = 1;
10757 }
10758 else
10759 {
10760 arg_adjust = 0;
10761 if (optimize
10762 || target == 0
10763 || !register_operand (target, tmode)
10764 || GET_MODE (target) != tmode)
10765 target = gen_reg_rtx (tmode);
10766 }
10767
10768 for (i = 0; i < nargs; i++)
10769 {
10770 machine_mode mode = insn_p->operand[i + 1].mode;
10771 bool match;
10772
10773 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10774 op = expand_normal (arg);
10775 match = insn_p->operand[i + 1].predicate (op, mode);
10776
10777 if (last_arg_constant && (i + 1) == nargs)
10778 {
10779 if (!match)
10780 {
10781 if (icode == CODE_FOR_lwp_lwpvalsi3
10782 || icode == CODE_FOR_lwp_lwpinssi3
10783 || icode == CODE_FOR_lwp_lwpvaldi3
10784 || icode == CODE_FOR_lwp_lwpinsdi3)
10785 error ("the last argument must be a 32-bit immediate");
10786 else
10787 error ("the last argument must be an 8-bit immediate");
10788 return const0_rtx;
10789 }
10790 }
10791 else
10792 {
10793 if (i == memory)
10794 {
10795 /* This must be the memory operand. */
10796 op = ix86_zero_extend_to_Pmode (op);
10797 op = gen_rtx_MEM (mode, op);
10798 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10799 on it. Try to improve it using get_pointer_alignment,
10800 and if the special builtin is one that requires strict
10801 mode alignment, also from its GET_MODE_ALIGNMENT.
10802 Failure to do so could lead to ix86_legitimate_combined_insn
10803 rejecting all changes to such insns. */
10804 unsigned int align = get_pointer_alignment (arg);
10805 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10806 align = GET_MODE_ALIGNMENT (mode);
10807 if (MEM_ALIGN (op) < align)
10808 set_mem_align (op, align);
10809 }
10810 else
10811 {
10812 /* This must be a register. */
10813 if (VECTOR_MODE_P (mode))
10814 op = safe_vector_operand (op, mode);
10815
10816 op = fixup_modeless_constant (op, mode);
10817
10818 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10819 op = copy_to_mode_reg (mode, op);
10820 else
10821 {
10822 op = copy_to_reg (op);
10823 op = lowpart_subreg (mode, op, GET_MODE (op));
10824 }
10825 }
10826 }
10827
10828 args[i].op = op;
10829 args[i].mode = mode;
10830 }
10831
10832 switch (nargs)
10833 {
10834 case 0:
10835 pat = GEN_FCN (icode) (target);
10836 break;
10837 case 1:
10838 pat = GEN_FCN (icode) (target, args[0].op);
10839 break;
10840 case 2:
10841 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10842 break;
10843 case 3:
10844 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10845 break;
10846 default:
10847 gcc_unreachable ();
10848 }
10849
10850 if (! pat)
10851 return 0;
10852 emit_insn (pat);
10853 return klass == store ? 0 : target;
10854 }
10855
10856 /* Return the integer constant in ARG. Constrain it to be in the range
10857 of the subparts of VEC_TYPE; issue an error if not. */
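/* For example, for a V4SF vector type TYPE_VECTOR_SUBPARTS is 4, so only
   selectors 0..3 are accepted; anything else reports the error below and
   0 is returned.  */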
10858
10859 static int
10860 get_element_number (tree vec_type, tree arg)
10861 {
10862 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10863
10864 if (!tree_fits_uhwi_p (arg)
10865 || (elt = tree_to_uhwi (arg), elt > max))
10866 {
10867 error ("selector must be an integer constant in the range "
10868 "[0, %wi]", max);
10869 return 0;
10870 }
10871
10872 return elt;
10873 }
10874
10875 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10876 ix86_expand_vector_init. We DO have language-level syntax for this, in
10877 the form of (type){ init-list }. Except that since we can't place emms
10878 instructions from inside the compiler, we can't allow the use of MMX
10879 registers unless the user explicitly asks for it. So we do *not* define
10880 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10881 we have builtins invoked by mmintrin.h that give us license to emit
10882 these sorts of instructions. */
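/* Illustrative example, assuming the usual mmintrin.h definition:

     _mm_set_pi32 (__i1, __i0)
       -> (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1)

   funnels into this routine with one scalar argument per element of the
   V2SI result.  */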
10883
10884 static rtx
10885 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10886 {
10887 machine_mode tmode = TYPE_MODE (type);
10888 machine_mode inner_mode = GET_MODE_INNER (tmode);
10889 int i, n_elt = GET_MODE_NUNITS (tmode);
10890 rtvec v = rtvec_alloc (n_elt);
10891
10892 gcc_assert (VECTOR_MODE_P (tmode));
10893 gcc_assert (call_expr_nargs (exp) == n_elt);
10894
10895 for (i = 0; i < n_elt; ++i)
10896 {
10897 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10898 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10899 }
10900
10901 if (!target || !register_operand (target, tmode))
10902 target = gen_reg_rtx (tmode);
10903
10904 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10905 return target;
10906 }
10907
10908 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10909 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10910 had a language-level syntax for referencing vector elements. */
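/* Illustrative example, assuming the usual xmmintrin.h definition:

     _mm_extract_pi16 (a, n) -> __builtin_ia32_vec_ext_v4hi ((__v4hi) a, n)

   is handled here, with the selector N validated by get_element_number.  */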
10911
10912 static rtx
10913 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10914 {
10915 machine_mode tmode, mode0;
10916 tree arg0, arg1;
10917 int elt;
10918 rtx op0;
10919
10920 arg0 = CALL_EXPR_ARG (exp, 0);
10921 arg1 = CALL_EXPR_ARG (exp, 1);
10922
10923 op0 = expand_normal (arg0);
10924 elt = get_element_number (TREE_TYPE (arg0), arg1);
10925
10926 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10927 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10928 gcc_assert (VECTOR_MODE_P (mode0));
10929
10930 op0 = force_reg (mode0, op0);
10931
10932 if (optimize || !target || !register_operand (target, tmode))
10933 target = gen_reg_rtx (tmode);
10934
10935 ix86_expand_vector_extract (true, target, op0, elt);
10936
10937 return target;
10938 }
10939
10940 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10941 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10942 a language-level syntax for referencing vector elements. */
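/* Illustrative example, assuming the usual xmmintrin.h definition:

     _mm_insert_pi16 (a, d, n)
       -> (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi) a, d, n)

   is handled here; the copy made below keeps the original operand A
   unmodified.  */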
10943
10944 static rtx
10945 ix86_expand_vec_set_builtin (tree exp)
10946 {
10947 machine_mode tmode, mode1;
10948 tree arg0, arg1, arg2;
10949 int elt;
10950 rtx op0, op1, target;
10951
10952 arg0 = CALL_EXPR_ARG (exp, 0);
10953 arg1 = CALL_EXPR_ARG (exp, 1);
10954 arg2 = CALL_EXPR_ARG (exp, 2);
10955
10956 tmode = TYPE_MODE (TREE_TYPE (arg0));
10957 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10958 gcc_assert (VECTOR_MODE_P (tmode));
10959
10960 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10961 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10962 elt = get_element_number (TREE_TYPE (arg0), arg2);
10963
10964 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10965 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10966
10967 op0 = force_reg (tmode, op0);
10968 op1 = force_reg (mode1, op1);
10969
10970 /* OP0 is the source of these builtin functions and shouldn't be
10971 modified. Create a copy, use it and return it as the target. */
10972 target = gen_reg_rtx (tmode);
10973 emit_move_insn (target, op0);
10974 ix86_expand_vector_set (true, target, op1, elt);
10975
10976 return target;
10977 }
10978
10979 /* Expand an expression EXP that calls a built-in function,
10980 with result going to TARGET if that's convenient
10981 (and in mode MODE if that's convenient).
10982 SUBTARGET may be used as the target for computing one of EXP's operands.
10983 IGNORE is nonzero if the value is to be ignored. */
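/* Illustrative example, assuming the usual pmmintrin.h wrapper:

     _mm_mwait (cx, ax) -> __builtin_ia32_mwait (cx, ax)

   arrives here as EXP and is dispatched on its IX86_BUILTIN_* code by the
   switch statements below.  */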
10984
10985 rtx
10986 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10987 machine_mode mode, int ignore)
10988 {
10989 size_t i;
10990 enum insn_code icode, icode2;
10991 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10992 tree arg0, arg1, arg2, arg3, arg4;
10993 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10994 machine_mode mode0, mode1, mode2, mode3, mode4;
10995 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
10996
10997 /* For CPU builtins that can be folded, fold first and expand the fold. */
10998 switch (fcode)
10999 {
11000 case IX86_BUILTIN_CPU_INIT:
11001 {
11002 /* Make it call __cpu_indicator_init in libgcc. */
11003 tree call_expr, fndecl, type;
11004 type = build_function_type_list (integer_type_node, NULL_TREE);
11005 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11006 call_expr = build_call_expr (fndecl, 0);
11007 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11008 }
11009 case IX86_BUILTIN_CPU_IS:
11010 case IX86_BUILTIN_CPU_SUPPORTS:
11011 {
11012 tree arg0 = CALL_EXPR_ARG (exp, 0);
11013 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11014 gcc_assert (fold_expr != NULL_TREE);
11015 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11016 }
11017 }
11018
11019 HOST_WIDE_INT isa = ix86_isa_flags;
11020 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11021 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11022 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11023 /* The general case is we require all the ISAs specified in bisa{,2}
11024 to be enabled.
11025 The exceptions are:
11026 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11027 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11028 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11029 where for each such pair it is sufficient if either of the ISAs is
11030 enabled, and any other options OR'ed in with such a pair must be enabled too. */
11031 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11032 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11033 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11034 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11035 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11036 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11037 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11038 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11039 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11040 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11041 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11042 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11043 /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
11044 MMX is disabled. NB: Since MMX intrinsics are marked with
11045 SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
11046 enabled. */
11047 if (TARGET_MMX || TARGET_MMX_WITH_SSE)
11048 {
11049 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
11050 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
11051 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
11052 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
11053 if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
11054 == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
11055 && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
11056 isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
11057 if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
11058 == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
11059 && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
11060 isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
11061 }
11062 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11063 {
11064 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11065 if (TARGET_ABI_X32)
11066 bisa |= OPTION_MASK_ABI_X32;
11067 else
11068 bisa |= OPTION_MASK_ABI_64;
11069 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11070 (enum fpmath_unit) 0, false, add_abi_p);
11071 if (!opts)
11072 error ("%qE needs unknown isa option", fndecl);
11073 else
11074 {
11075 gcc_assert (opts != NULL);
11076 error ("%qE needs isa option %s", fndecl, opts);
11077 free (opts);
11078 }
11079 return expand_call (exp, target, ignore);
11080 }
11081
11082 switch (fcode)
11083 {
11084 case IX86_BUILTIN_MASKMOVQ:
11085 case IX86_BUILTIN_MASKMOVDQU:
11086 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11087 ? CODE_FOR_mmx_maskmovq
11088 : CODE_FOR_sse2_maskmovdqu);
11089 /* Note the arg order is different from the operand order. */
11090 arg1 = CALL_EXPR_ARG (exp, 0);
11091 arg2 = CALL_EXPR_ARG (exp, 1);
11092 arg0 = CALL_EXPR_ARG (exp, 2);
11093 op0 = expand_normal (arg0);
11094 op1 = expand_normal (arg1);
11095 op2 = expand_normal (arg2);
11096 mode0 = insn_data[icode].operand[0].mode;
11097 mode1 = insn_data[icode].operand[1].mode;
11098 mode2 = insn_data[icode].operand[2].mode;
11099
11100 op0 = ix86_zero_extend_to_Pmode (op0);
11101 op0 = gen_rtx_MEM (mode1, op0);
11102
11103 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11104 op0 = copy_to_mode_reg (mode0, op0);
11105 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11106 op1 = copy_to_mode_reg (mode1, op1);
11107 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11108 op2 = copy_to_mode_reg (mode2, op2);
11109 pat = GEN_FCN (icode) (op0, op1, op2);
11110 if (! pat)
11111 return 0;
11112 emit_insn (pat);
11113 return 0;
11114
11115 case IX86_BUILTIN_LDMXCSR:
11116 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11117 target = assign_386_stack_local (SImode, SLOT_TEMP);
11118 emit_move_insn (target, op0);
11119 emit_insn (gen_sse_ldmxcsr (target));
11120 return 0;
11121
11122 case IX86_BUILTIN_STMXCSR:
11123 target = assign_386_stack_local (SImode, SLOT_TEMP);
11124 emit_insn (gen_sse_stmxcsr (target));
11125 return copy_to_mode_reg (SImode, target);
11126
11127 case IX86_BUILTIN_CLFLUSH:
11128 arg0 = CALL_EXPR_ARG (exp, 0);
11129 op0 = expand_normal (arg0);
11130 icode = CODE_FOR_sse2_clflush;
11131 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11132 op0 = ix86_zero_extend_to_Pmode (op0);
11133
11134 emit_insn (gen_sse2_clflush (op0));
11135 return 0;
11136
11137 case IX86_BUILTIN_CLWB:
11138 arg0 = CALL_EXPR_ARG (exp, 0);
11139 op0 = expand_normal (arg0);
11140 icode = CODE_FOR_clwb;
11141 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11142 op0 = ix86_zero_extend_to_Pmode (op0);
11143
11144 emit_insn (gen_clwb (op0));
11145 return 0;
11146
11147 case IX86_BUILTIN_CLFLUSHOPT:
11148 arg0 = CALL_EXPR_ARG (exp, 0);
11149 op0 = expand_normal (arg0);
11150 icode = CODE_FOR_clflushopt;
11151 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11152 op0 = ix86_zero_extend_to_Pmode (op0);
11153
11154 emit_insn (gen_clflushopt (op0));
11155 return 0;
11156
11157 case IX86_BUILTIN_MONITOR:
11158 case IX86_BUILTIN_MONITORX:
11159 arg0 = CALL_EXPR_ARG (exp, 0);
11160 arg1 = CALL_EXPR_ARG (exp, 1);
11161 arg2 = CALL_EXPR_ARG (exp, 2);
11162 op0 = expand_normal (arg0);
11163 op1 = expand_normal (arg1);
11164 op2 = expand_normal (arg2);
11165 if (!REG_P (op0))
11166 op0 = ix86_zero_extend_to_Pmode (op0);
11167 if (!REG_P (op1))
11168 op1 = copy_to_mode_reg (SImode, op1);
11169 if (!REG_P (op2))
11170 op2 = copy_to_mode_reg (SImode, op2);
11171
11172 emit_insn (fcode == IX86_BUILTIN_MONITOR
11173 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11174 : gen_monitorx (Pmode, op0, op1, op2));
11175 return 0;
11176
11177 case IX86_BUILTIN_MWAIT:
11178 arg0 = CALL_EXPR_ARG (exp, 0);
11179 arg1 = CALL_EXPR_ARG (exp, 1);
11180 op0 = expand_normal (arg0);
11181 op1 = expand_normal (arg1);
11182 if (!REG_P (op0))
11183 op0 = copy_to_mode_reg (SImode, op0);
11184 if (!REG_P (op1))
11185 op1 = copy_to_mode_reg (SImode, op1);
11186 emit_insn (gen_sse3_mwait (op0, op1));
11187 return 0;
11188
11189 case IX86_BUILTIN_MWAITX:
11190 arg0 = CALL_EXPR_ARG (exp, 0);
11191 arg1 = CALL_EXPR_ARG (exp, 1);
11192 arg2 = CALL_EXPR_ARG (exp, 2);
11193 op0 = expand_normal (arg0);
11194 op1 = expand_normal (arg1);
11195 op2 = expand_normal (arg2);
11196 if (!REG_P (op0))
11197 op0 = copy_to_mode_reg (SImode, op0);
11198 if (!REG_P (op1))
11199 op1 = copy_to_mode_reg (SImode, op1);
11200 if (!REG_P (op2))
11201 op2 = copy_to_mode_reg (SImode, op2);
11202 emit_insn (gen_mwaitx (op0, op1, op2));
11203 return 0;
11204
11205 case IX86_BUILTIN_UMONITOR:
11206 arg0 = CALL_EXPR_ARG (exp, 0);
11207 op0 = expand_normal (arg0);
11208
11209 op0 = ix86_zero_extend_to_Pmode (op0);
11210 emit_insn (gen_umonitor (Pmode, op0));
11211 return 0;
11212
11213 case IX86_BUILTIN_UMWAIT:
11214 case IX86_BUILTIN_TPAUSE:
11215 arg0 = CALL_EXPR_ARG (exp, 0);
11216 arg1 = CALL_EXPR_ARG (exp, 1);
11217 op0 = expand_normal (arg0);
11218 op1 = expand_normal (arg1);
11219
11220 if (!REG_P (op0))
11221 op0 = copy_to_mode_reg (SImode, op0);
11222
11223 op1 = force_reg (DImode, op1);
11224
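/* UMWAIT and TPAUSE take their 64-bit TSC deadline in EDX:EAX, so on
64-bit targets split the DImode operand into explicit low and high
SImode halves for the *_rex64 patterns. */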
11225 if (TARGET_64BIT)
11226 {
11227 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11228 NULL, 1, OPTAB_DIRECT);
11229 switch (fcode)
11230 {
11231 case IX86_BUILTIN_UMWAIT:
11232 icode = CODE_FOR_umwait_rex64;
11233 break;
11234 case IX86_BUILTIN_TPAUSE:
11235 icode = CODE_FOR_tpause_rex64;
11236 break;
11237 default:
11238 gcc_unreachable ();
11239 }
11240
11241 op2 = gen_lowpart (SImode, op2);
11242 op1 = gen_lowpart (SImode, op1);
11243 pat = GEN_FCN (icode) (op0, op1, op2);
11244 }
11245 else
11246 {
11247 switch (fcode)
11248 {
11249 case IX86_BUILTIN_UMWAIT:
11250 icode = CODE_FOR_umwait;
11251 break;
11252 case IX86_BUILTIN_TPAUSE:
11253 icode = CODE_FOR_tpause;
11254 break;
11255 default:
11256 gcc_unreachable ();
11257 }
11258 pat = GEN_FCN (icode) (op0, op1);
11259 }
11260
11261 if (!pat)
11262 return 0;
11263
11264 emit_insn (pat);
11265
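/* Both instructions report their status in the carry flag; copy that
flag into a QImode register as the intrinsic's return value. */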
11266 if (target == 0
11267 || !register_operand (target, QImode))
11268 target = gen_reg_rtx (QImode);
11269
11270 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11271 const0_rtx);
11272 emit_insn (gen_rtx_SET (target, pat));
11273
11274 return target;
11275
11276 case IX86_BUILTIN_CLZERO:
11277 arg0 = CALL_EXPR_ARG (exp, 0);
11278 op0 = expand_normal (arg0);
11279 if (!REG_P (op0))
11280 op0 = ix86_zero_extend_to_Pmode (op0);
11281 emit_insn (gen_clzero (Pmode, op0));
11282 return 0;
11283
11284 case IX86_BUILTIN_CLDEMOTE:
11285 arg0 = CALL_EXPR_ARG (exp, 0);
11286 op0 = expand_normal (arg0);
11287 icode = CODE_FOR_cldemote;
11288 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11289 op0 = ix86_zero_extend_to_Pmode (op0);
11290
11291 emit_insn (gen_cldemote (op0));
11292 return 0;
11293
11294 case IX86_BUILTIN_VEC_INIT_V2SI:
11295 case IX86_BUILTIN_VEC_INIT_V4HI:
11296 case IX86_BUILTIN_VEC_INIT_V8QI:
11297 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11298
11299 case IX86_BUILTIN_VEC_EXT_V2DF:
11300 case IX86_BUILTIN_VEC_EXT_V2DI:
11301 case IX86_BUILTIN_VEC_EXT_V4SF:
11302 case IX86_BUILTIN_VEC_EXT_V4SI:
11303 case IX86_BUILTIN_VEC_EXT_V8HI:
11304 case IX86_BUILTIN_VEC_EXT_V2SI:
11305 case IX86_BUILTIN_VEC_EXT_V4HI:
11306 case IX86_BUILTIN_VEC_EXT_V16QI:
11307 return ix86_expand_vec_ext_builtin (exp, target);
11308
11309 case IX86_BUILTIN_VEC_SET_V2DI:
11310 case IX86_BUILTIN_VEC_SET_V4SF:
11311 case IX86_BUILTIN_VEC_SET_V4SI:
11312 case IX86_BUILTIN_VEC_SET_V8HI:
11313 case IX86_BUILTIN_VEC_SET_V4HI:
11314 case IX86_BUILTIN_VEC_SET_V16QI:
11315 return ix86_expand_vec_set_builtin (exp);
11316
11317 case IX86_BUILTIN_NANQ:
11318 case IX86_BUILTIN_NANSQ:
11319 return expand_call (exp, target, ignore);
11320
11321 case IX86_BUILTIN_RDPID:
11322
11323 op0 = gen_reg_rtx (word_mode);
11324
11325 if (TARGET_64BIT)
11326 {
11327 insn = gen_rdpid_rex64 (op0);
11328 op0 = convert_to_mode (SImode, op0, 1);
11329 }
11330 else
11331 insn = gen_rdpid (op0);
11332
11333 emit_insn (insn);
11334
11335 if (target == 0
11336 || !register_operand (target, SImode))
11337 target = gen_reg_rtx (SImode);
11338
11339 emit_move_insn (target, op0);
11340 return target;
11341
11342 case IX86_BUILTIN_2INTERSECTD512:
11343 case IX86_BUILTIN_2INTERSECTQ512:
11344 case IX86_BUILTIN_2INTERSECTD256:
11345 case IX86_BUILTIN_2INTERSECTQ256:
11346 case IX86_BUILTIN_2INTERSECTD128:
11347 case IX86_BUILTIN_2INTERSECTQ128:
11348 arg0 = CALL_EXPR_ARG (exp, 0);
11349 arg1 = CALL_EXPR_ARG (exp, 1);
11350 arg2 = CALL_EXPR_ARG (exp, 2);
11351 arg3 = CALL_EXPR_ARG (exp, 3);
11352 op0 = expand_normal (arg0);
11353 op1 = expand_normal (arg1);
11354 op2 = expand_normal (arg2);
11355 op3 = expand_normal (arg3);
11356
11357 if (!address_operand (op0, VOIDmode))
11358 {
11359 op0 = convert_memory_address (Pmode, op0);
11360 op0 = copy_addr_to_reg (op0);
11361 }
11362 if (!address_operand (op1, VOIDmode))
11363 {
11364 op1 = convert_memory_address (Pmode, op1);
11365 op1 = copy_addr_to_reg (op1);
11366 }
11367
11368 switch (fcode)
11369 {
11370 case IX86_BUILTIN_2INTERSECTD512:
11371 mode4 = P2HImode;
11372 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11373 break;
11374 case IX86_BUILTIN_2INTERSECTQ512:
11375 mode4 = P2QImode;
11376 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11377 break;
11378 case IX86_BUILTIN_2INTERSECTD256:
11379 mode4 = P2QImode;
11380 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11381 break;
11382 case IX86_BUILTIN_2INTERSECTQ256:
11383 mode4 = P2QImode;
11384 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11385 break;
11386 case IX86_BUILTIN_2INTERSECTD128:
11387 mode4 = P2QImode;
11388 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11389 break;
11390 case IX86_BUILTIN_2INTERSECTQ128:
11391 mode4 = P2QImode;
11392 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11393 break;
11394 default:
11395 gcc_unreachable ();
11396 }
11397
11398 mode2 = insn_data[icode].operand[1].mode;
11399 mode3 = insn_data[icode].operand[2].mode;
11400 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11401 op2 = copy_to_mode_reg (mode2, op2);
11402 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11403 op3 = copy_to_mode_reg (mode3, op3);
11404
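/* The vp2intersect patterns produce a pair of mask registers; store
the low part of the pair to *arg0 and the high part to *arg1. */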
11405 op4 = gen_reg_rtx (mode4);
11406 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11407 mode0 = mode4 == P2HImode ? HImode : QImode;
11408 emit_move_insn (gen_rtx_MEM (mode0, op0),
11409 gen_lowpart (mode0, op4));
11410 emit_move_insn (gen_rtx_MEM (mode0, op1),
11411 gen_highpart (mode0, op4));
11412
11413 return 0;
11414
11415 case IX86_BUILTIN_RDPMC:
11416 case IX86_BUILTIN_RDTSC:
11417 case IX86_BUILTIN_RDTSCP:
11418 case IX86_BUILTIN_XGETBV:
11419
11420 op0 = gen_reg_rtx (DImode);
11421 op1 = gen_reg_rtx (DImode);
11422
11423 if (fcode == IX86_BUILTIN_RDPMC)
11424 {
11425 arg0 = CALL_EXPR_ARG (exp, 0);
11426 op2 = expand_normal (arg0);
11427 if (!register_operand (op2, SImode))
11428 op2 = copy_to_mode_reg (SImode, op2);
11429
11430 insn = (TARGET_64BIT
11431 ? gen_rdpmc_rex64 (op0, op1, op2)
11432 : gen_rdpmc (op0, op2));
11433 emit_insn (insn);
11434 }
11435 else if (fcode == IX86_BUILTIN_XGETBV)
11436 {
11437 arg0 = CALL_EXPR_ARG (exp, 0);
11438 op2 = expand_normal (arg0);
11439 if (!register_operand (op2, SImode))
11440 op2 = copy_to_mode_reg (SImode, op2);
11441
11442 insn = (TARGET_64BIT
11443 ? gen_xgetbv_rex64 (op0, op1, op2)
11444 : gen_xgetbv (op0, op2));
11445 emit_insn (insn);
11446 }
11447 else if (fcode == IX86_BUILTIN_RDTSC)
11448 {
11449 insn = (TARGET_64BIT
11450 ? gen_rdtsc_rex64 (op0, op1)
11451 : gen_rdtsc (op0));
11452 emit_insn (insn);
11453 }
11454 else
11455 {
11456 op2 = gen_reg_rtx (SImode);
11457
11458 insn = (TARGET_64BIT
11459 ? gen_rdtscp_rex64 (op0, op1, op2)
11460 : gen_rdtscp (op0, op2));
11461 emit_insn (insn);
11462
11463 arg0 = CALL_EXPR_ARG (exp, 0);
11464 op4 = expand_normal (arg0);
11465 if (!address_operand (op4, VOIDmode))
11466 {
11467 op4 = convert_memory_address (Pmode, op4);
11468 op4 = copy_addr_to_reg (op4);
11469 }
11470 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11471 }
11472
11473 if (target == 0
11474 || !register_operand (target, DImode))
11475 target = gen_reg_rtx (DImode);
11476
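/* On 64-bit targets these patterns return the low and high 32 bits in
two separate registers; combine them into a single DImode value. */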
11477 if (TARGET_64BIT)
11478 {
11479 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11480 op1, 1, OPTAB_DIRECT);
11481 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11482 op0, 1, OPTAB_DIRECT);
11483 }
11484
11485 emit_move_insn (target, op0);
11486 return target;
11487
11488 case IX86_BUILTIN_ENQCMD:
11489 case IX86_BUILTIN_ENQCMDS:
11490 case IX86_BUILTIN_MOVDIR64B:
11491
11492 arg0 = CALL_EXPR_ARG (exp, 0);
11493 arg1 = CALL_EXPR_ARG (exp, 1);
11494 op0 = expand_normal (arg0);
11495 op1 = expand_normal (arg1);
11496
11497 op0 = ix86_zero_extend_to_Pmode (op0);
11498 if (!address_operand (op1, VOIDmode))
11499 {
11500 op1 = convert_memory_address (Pmode, op1);
11501 op1 = copy_addr_to_reg (op1);
11502 }
11503 op1 = gen_rtx_MEM (XImode, op1);
11504
11505 if (fcode == IX86_BUILTIN_MOVDIR64B)
11506 {
11507 emit_insn (gen_movdir64b (Pmode, op0, op1));
11508 return 0;
11509 }
11510 else
11511 {
11512 rtx pat;
11513
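/* ENQCMD/ENQCMDS return a status bit: compare the pattern's
flag-register destination against zero and write the result into the
low byte of a zero-initialized SImode register, which is what the
intrinsic returns. */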
11514 target = gen_reg_rtx (SImode);
11515 emit_move_insn (target, const0_rtx);
11516 target = gen_rtx_SUBREG (QImode, target, 0);
11517
11518 if (fcode == IX86_BUILTIN_ENQCMD)
11519 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11520 else
11521 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11522
11523 emit_insn (pat);
11524
11525 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11526 gen_rtx_fmt_ee (EQ, QImode,
11527 SET_DEST (pat),
11528 const0_rtx)));
11529
11530 return SUBREG_REG (target);
11531 }
11532
11533 case IX86_BUILTIN_FXSAVE:
11534 case IX86_BUILTIN_FXRSTOR:
11535 case IX86_BUILTIN_FXSAVE64:
11536 case IX86_BUILTIN_FXRSTOR64:
11537 case IX86_BUILTIN_FNSTENV:
11538 case IX86_BUILTIN_FLDENV:
11539 mode0 = BLKmode;
11540 switch (fcode)
11541 {
11542 case IX86_BUILTIN_FXSAVE:
11543 icode = CODE_FOR_fxsave;
11544 break;
11545 case IX86_BUILTIN_FXRSTOR:
11546 icode = CODE_FOR_fxrstor;
11547 break;
11548 case IX86_BUILTIN_FXSAVE64:
11549 icode = CODE_FOR_fxsave64;
11550 break;
11551 case IX86_BUILTIN_FXRSTOR64:
11552 icode = CODE_FOR_fxrstor64;
11553 break;
11554 case IX86_BUILTIN_FNSTENV:
11555 icode = CODE_FOR_fnstenv;
11556 break;
11557 case IX86_BUILTIN_FLDENV:
11558 icode = CODE_FOR_fldenv;
11559 break;
11560 default:
11561 gcc_unreachable ();
11562 }
11563
11564 arg0 = CALL_EXPR_ARG (exp, 0);
11565 op0 = expand_normal (arg0);
11566
11567 if (!address_operand (op0, VOIDmode))
11568 {
11569 op0 = convert_memory_address (Pmode, op0);
11570 op0 = copy_addr_to_reg (op0);
11571 }
11572 op0 = gen_rtx_MEM (mode0, op0);
11573
11574 pat = GEN_FCN (icode) (op0);
11575 if (pat)
11576 emit_insn (pat);
11577 return 0;
11578
11579 case IX86_BUILTIN_XSETBV:
11580 arg0 = CALL_EXPR_ARG (exp, 0);
11581 arg1 = CALL_EXPR_ARG (exp, 1);
11582 op0 = expand_normal (arg0);
11583 op1 = expand_normal (arg1);
11584
11585 if (!REG_P (op0))
11586 op0 = copy_to_mode_reg (SImode, op0);
11587
11588 op1 = force_reg (DImode, op1);
11589
11590 if (TARGET_64BIT)
11591 {
11592 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11593 NULL, 1, OPTAB_DIRECT);
11594
11595 icode = CODE_FOR_xsetbv_rex64;
11596
11597 op2 = gen_lowpart (SImode, op2);
11598 op1 = gen_lowpart (SImode, op1);
11599 pat = GEN_FCN (icode) (op0, op1, op2);
11600 }
11601 else
11602 {
11603 icode = CODE_FOR_xsetbv;
11604
11605 pat = GEN_FCN (icode) (op0, op1);
11606 }
11607 if (pat)
11608 emit_insn (pat);
11609 return 0;
11610
11611 case IX86_BUILTIN_XSAVE:
11612 case IX86_BUILTIN_XRSTOR:
11613 case IX86_BUILTIN_XSAVE64:
11614 case IX86_BUILTIN_XRSTOR64:
11615 case IX86_BUILTIN_XSAVEOPT:
11616 case IX86_BUILTIN_XSAVEOPT64:
11617 case IX86_BUILTIN_XSAVES:
11618 case IX86_BUILTIN_XRSTORS:
11619 case IX86_BUILTIN_XSAVES64:
11620 case IX86_BUILTIN_XRSTORS64:
11621 case IX86_BUILTIN_XSAVEC:
11622 case IX86_BUILTIN_XSAVEC64:
11623 arg0 = CALL_EXPR_ARG (exp, 0);
11624 arg1 = CALL_EXPR_ARG (exp, 1);
11625 op0 = expand_normal (arg0);
11626 op1 = expand_normal (arg1);
11627
11628 if (!address_operand (op0, VOIDmode))
11629 {
11630 op0 = convert_memory_address (Pmode, op0);
11631 op0 = copy_addr_to_reg (op0);
11632 }
11633 op0 = gen_rtx_MEM (BLKmode, op0);
11634
11635 op1 = force_reg (DImode, op1);
11636
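/* The xsave/xrstor family takes its 64-bit feature mask in EDX:EAX;
on 64-bit targets split the DImode mask into SImode halves for the
*_rex64 patterns. */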
11637 if (TARGET_64BIT)
11638 {
11639 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11640 NULL, 1, OPTAB_DIRECT);
11641 switch (fcode)
11642 {
11643 case IX86_BUILTIN_XSAVE:
11644 icode = CODE_FOR_xsave_rex64;
11645 break;
11646 case IX86_BUILTIN_XRSTOR:
11647 icode = CODE_FOR_xrstor_rex64;
11648 break;
11649 case IX86_BUILTIN_XSAVE64:
11650 icode = CODE_FOR_xsave64;
11651 break;
11652 case IX86_BUILTIN_XRSTOR64:
11653 icode = CODE_FOR_xrstor64;
11654 break;
11655 case IX86_BUILTIN_XSAVEOPT:
11656 icode = CODE_FOR_xsaveopt_rex64;
11657 break;
11658 case IX86_BUILTIN_XSAVEOPT64:
11659 icode = CODE_FOR_xsaveopt64;
11660 break;
11661 case IX86_BUILTIN_XSAVES:
11662 icode = CODE_FOR_xsaves_rex64;
11663 break;
11664 case IX86_BUILTIN_XRSTORS:
11665 icode = CODE_FOR_xrstors_rex64;
11666 break;
11667 case IX86_BUILTIN_XSAVES64:
11668 icode = CODE_FOR_xsaves64;
11669 break;
11670 case IX86_BUILTIN_XRSTORS64:
11671 icode = CODE_FOR_xrstors64;
11672 break;
11673 case IX86_BUILTIN_XSAVEC:
11674 icode = CODE_FOR_xsavec_rex64;
11675 break;
11676 case IX86_BUILTIN_XSAVEC64:
11677 icode = CODE_FOR_xsavec64;
11678 break;
11679 default:
11680 gcc_unreachable ();
11681 }
11682
11683 op2 = gen_lowpart (SImode, op2);
11684 op1 = gen_lowpart (SImode, op1);
11685 pat = GEN_FCN (icode) (op0, op1, op2);
11686 }
11687 else
11688 {
11689 switch (fcode)
11690 {
11691 case IX86_BUILTIN_XSAVE:
11692 icode = CODE_FOR_xsave;
11693 break;
11694 case IX86_BUILTIN_XRSTOR:
11695 icode = CODE_FOR_xrstor;
11696 break;
11697 case IX86_BUILTIN_XSAVEOPT:
11698 icode = CODE_FOR_xsaveopt;
11699 break;
11700 case IX86_BUILTIN_XSAVES:
11701 icode = CODE_FOR_xsaves;
11702 break;
11703 case IX86_BUILTIN_XRSTORS:
11704 icode = CODE_FOR_xrstors;
11705 break;
11706 case IX86_BUILTIN_XSAVEC:
11707 icode = CODE_FOR_xsavec;
11708 break;
11709 default:
11710 gcc_unreachable ();
11711 }
11712 pat = GEN_FCN (icode) (op0, op1);
11713 }
11714
11715 if (pat)
11716 emit_insn (pat);
11717 return 0;
11718
11719 case IX86_BUILTIN_LLWPCB:
11720 arg0 = CALL_EXPR_ARG (exp, 0);
11721 op0 = expand_normal (arg0);
11722 icode = CODE_FOR_lwp_llwpcb;
11723 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11724 op0 = ix86_zero_extend_to_Pmode (op0);
11725 emit_insn (gen_lwp_llwpcb (op0));
11726 return 0;
11727
11728 case IX86_BUILTIN_SLWPCB:
11729 icode = CODE_FOR_lwp_slwpcb;
11730 if (!target
11731 || !insn_data[icode].operand[0].predicate (target, Pmode))
11732 target = gen_reg_rtx (Pmode);
11733 emit_insn (gen_lwp_slwpcb (target));
11734 return target;
11735
11736 case IX86_BUILTIN_BEXTRI32:
11737 case IX86_BUILTIN_BEXTRI64:
11738 arg0 = CALL_EXPR_ARG (exp, 0);
11739 arg1 = CALL_EXPR_ARG (exp, 1);
11740 op0 = expand_normal (arg0);
11741 op1 = expand_normal (arg1);
11742 icode = (fcode == IX86_BUILTIN_BEXTRI32
11743 ? CODE_FOR_tbm_bextri_si
11744 : CODE_FOR_tbm_bextri_di);
11745 if (!CONST_INT_P (op1))
11746 {
11747 error ("last argument must be an immediate");
11748 return const0_rtx;
11749 }
11750 else
11751 {
11752 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11753 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11754 op1 = GEN_INT (length);
11755 op2 = GEN_INT (lsb_index);
11756
11757 mode1 = insn_data[icode].operand[1].mode;
11758 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11759 op0 = copy_to_mode_reg (mode1, op0);
11760
11761 mode0 = insn_data[icode].operand[0].mode;
11762 if (target == 0
11763 || !register_operand (target, mode0))
11764 target = gen_reg_rtx (mode0);
11765
11766 pat = GEN_FCN (icode) (target, op0, op1, op2);
11767 if (pat)
11768 emit_insn (pat);
11769 return target;
11770 }
11771
11772 case IX86_BUILTIN_RDRAND16_STEP:
11773 icode = CODE_FOR_rdrandhi_1;
11774 mode0 = HImode;
11775 goto rdrand_step;
11776
11777 case IX86_BUILTIN_RDRAND32_STEP:
11778 icode = CODE_FOR_rdrandsi_1;
11779 mode0 = SImode;
11780 goto rdrand_step;
11781
11782 case IX86_BUILTIN_RDRAND64_STEP:
11783 icode = CODE_FOR_rdranddi_1;
11784 mode0 = DImode;
11785
11786 rdrand_step:
11787 arg0 = CALL_EXPR_ARG (exp, 0);
11788 op1 = expand_normal (arg0);
11789 if (!address_operand (op1, VOIDmode))
11790 {
11791 op1 = convert_memory_address (Pmode, op1);
11792 op1 = copy_addr_to_reg (op1);
11793 }
11794
11795 op0 = gen_reg_rtx (mode0);
11796 emit_insn (GEN_FCN (icode) (op0));
11797
11798 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11799
11800 op1 = gen_reg_rtx (SImode);
11801 emit_move_insn (op1, CONST1_RTX (SImode));
11802
11803 /* Emit SImode conditional move. */
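/* RDRAND zeroes its destination when it fails (CF clear), so selecting
between the zero-extended result and the constant 1 yields the 0/1
success value the *_step intrinsics return. */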
11804 if (mode0 == HImode)
11805 {
11806 if (TARGET_ZERO_EXTEND_WITH_AND
11807 && optimize_function_for_speed_p (cfun))
11808 {
11809 op2 = force_reg (SImode, const0_rtx);
11810
11811 emit_insn (gen_movstricthi
11812 (gen_lowpart (HImode, op2), op0));
11813 }
11814 else
11815 {
11816 op2 = gen_reg_rtx (SImode);
11817
11818 emit_insn (gen_zero_extendhisi2 (op2, op0));
11819 }
11820 }
11821 else if (mode0 == SImode)
11822 op2 = op0;
11823 else
11824 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11825
11826 if (target == 0
11827 || !register_operand (target, SImode))
11828 target = gen_reg_rtx (SImode);
11829
11830 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11831 const0_rtx);
11832 emit_insn (gen_rtx_SET (target,
11833 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11834 return target;
11835
11836 case IX86_BUILTIN_RDSEED16_STEP:
11837 icode = CODE_FOR_rdseedhi_1;
11838 mode0 = HImode;
11839 goto rdseed_step;
11840
11841 case IX86_BUILTIN_RDSEED32_STEP:
11842 icode = CODE_FOR_rdseedsi_1;
11843 mode0 = SImode;
11844 goto rdseed_step;
11845
11846 case IX86_BUILTIN_RDSEED64_STEP:
11847 icode = CODE_FOR_rdseeddi_1;
11848 mode0 = DImode;
11849
11850 rdseed_step:
11851 arg0 = CALL_EXPR_ARG (exp, 0);
11852 op1 = expand_normal (arg0);
11853 if (!address_operand (op1, VOIDmode))
11854 {
11855 op1 = convert_memory_address (Pmode, op1);
11856 op1 = copy_addr_to_reg (op1);
11857 }
11858
11859 op0 = gen_reg_rtx (mode0);
11860 emit_insn (GEN_FCN (icode) (op0));
11861
11862 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11863
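/* RDSEED reports success in the carry flag; read CF back into a QImode
register and zero-extend it to form the SImode return value. */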
11864 op2 = gen_reg_rtx (QImode);
11865
11866 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11867 const0_rtx);
11868 emit_insn (gen_rtx_SET (op2, pat));
11869
11870 if (target == 0
11871 || !register_operand (target, SImode))
11872 target = gen_reg_rtx (SImode);
11873
11874 emit_insn (gen_zero_extendqisi2 (target, op2));
11875 return target;
11876
11877 case IX86_BUILTIN_SBB32:
11878 icode = CODE_FOR_subborrowsi;
11879 icode2 = CODE_FOR_subborrowsi_0;
11880 mode0 = SImode;
11881 mode1 = DImode;
11882 mode2 = CCmode;
11883 goto handlecarry;
11884
11885 case IX86_BUILTIN_SBB64:
11886 icode = CODE_FOR_subborrowdi;
11887 icode2 = CODE_FOR_subborrowdi_0;
11888 mode0 = DImode;
11889 mode1 = TImode;
11890 mode2 = CCmode;
11891 goto handlecarry;
11892
11893 case IX86_BUILTIN_ADDCARRYX32:
11894 icode = CODE_FOR_addcarrysi;
11895 icode2 = CODE_FOR_addcarrysi_0;
11896 mode0 = SImode;
11897 mode1 = DImode;
11898 mode2 = CCCmode;
11899 goto handlecarry;
11900
11901 case IX86_BUILTIN_ADDCARRYX64:
11902 icode = CODE_FOR_addcarrydi;
11903 icode2 = CODE_FOR_addcarrydi_0;
11904 mode0 = DImode;
11905 mode1 = TImode;
11906 mode2 = CCCmode;
11907
11908 handlecarry:
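/* Expansion of the add-with-carry / subtract-with-borrow intrinsics:
rematerialize the incoming carry into CF, emit the add/sub that
consumes it, store the sum to *sum_out and return the resulting
carry as an unsigned char. */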
11909 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11910 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11911 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11912 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
11913
11914 op1 = expand_normal (arg0);
11915 if (!integer_zerop (arg0))
11916 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11917
11918 op2 = expand_normal (arg1);
11919 if (!register_operand (op2, mode0))
11920 op2 = copy_to_mode_reg (mode0, op2);
11921
11922 op3 = expand_normal (arg2);
11923 if (!register_operand (op3, mode0))
11924 op3 = copy_to_mode_reg (mode0, op3);
11925
11926 op4 = expand_normal (arg3);
11927 if (!address_operand (op4, VOIDmode))
11928 {
11929 op4 = convert_memory_address (Pmode, op4);
11930 op4 = copy_addr_to_reg (op4);
11931 }
11932
11933 op0 = gen_reg_rtx (mode0);
11934 if (integer_zerop (arg0))
11935 {
11936 /* If arg0 is 0, optimize right away into an add or sub
11937 instruction that sets the CCCmode flags. */
11938 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11939 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11940 }
11941 else
11942 {
11943 /* Generate CF from input operand. */
11944 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11945
11946 /* Generate instruction that consumes CF. */
11947 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11948 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11949 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11950 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11951 }
11952
11953 /* Return current CF value. */
11954 if (target == 0)
11955 target = gen_reg_rtx (QImode);
11956
11957 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11958 emit_insn (gen_rtx_SET (target, pat));
11959
11960 /* Store the result. */
11961 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11962
11963 return target;
11964
11965 case IX86_BUILTIN_READ_FLAGS:
11966 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11967
11968 if (optimize
11969 || target == NULL_RTX
11970 || !nonimmediate_operand (target, word_mode)
11971 || GET_MODE (target) != word_mode)
11972 target = gen_reg_rtx (word_mode);
11973
11974 emit_insn (gen_pop (target));
11975 return target;
11976
11977 case IX86_BUILTIN_WRITE_FLAGS:
11978
11979 arg0 = CALL_EXPR_ARG (exp, 0);
11980 op0 = expand_normal (arg0);
11981 if (!general_no_elim_operand (op0, word_mode))
11982 op0 = copy_to_mode_reg (word_mode, op0);
11983
11984 emit_insn (gen_push (op0));
11985 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11986 return 0;
11987
11988 case IX86_BUILTIN_KTESTC8:
11989 icode = CODE_FOR_ktestqi;
11990 mode3 = CCCmode;
11991 goto kortest;
11992
11993 case IX86_BUILTIN_KTESTZ8:
11994 icode = CODE_FOR_ktestqi;
11995 mode3 = CCZmode;
11996 goto kortest;
11997
11998 case IX86_BUILTIN_KTESTC16:
11999 icode = CODE_FOR_ktesthi;
12000 mode3 = CCCmode;
12001 goto kortest;
12002
12003 case IX86_BUILTIN_KTESTZ16:
12004 icode = CODE_FOR_ktesthi;
12005 mode3 = CCZmode;
12006 goto kortest;
12007
12008 case IX86_BUILTIN_KTESTC32:
12009 icode = CODE_FOR_ktestsi;
12010 mode3 = CCCmode;
12011 goto kortest;
12012
12013 case IX86_BUILTIN_KTESTZ32:
12014 icode = CODE_FOR_ktestsi;
12015 mode3 = CCZmode;
12016 goto kortest;
12017
12018 case IX86_BUILTIN_KTESTC64:
12019 icode = CODE_FOR_ktestdi;
12020 mode3 = CCCmode;
12021 goto kortest;
12022
12023 case IX86_BUILTIN_KTESTZ64:
12024 icode = CODE_FOR_ktestdi;
12025 mode3 = CCZmode;
12026 goto kortest;
12027
12028 case IX86_BUILTIN_KORTESTC8:
12029 icode = CODE_FOR_kortestqi;
12030 mode3 = CCCmode;
12031 goto kortest;
12032
12033 case IX86_BUILTIN_KORTESTZ8:
12034 icode = CODE_FOR_kortestqi;
12035 mode3 = CCZmode;
12036 goto kortest;
12037
12038 case IX86_BUILTIN_KORTESTC16:
12039 icode = CODE_FOR_kortesthi;
12040 mode3 = CCCmode;
12041 goto kortest;
12042
12043 case IX86_BUILTIN_KORTESTZ16:
12044 icode = CODE_FOR_kortesthi;
12045 mode3 = CCZmode;
12046 goto kortest;
12047
12048 case IX86_BUILTIN_KORTESTC32:
12049 icode = CODE_FOR_kortestsi;
12050 mode3 = CCCmode;
12051 goto kortest;
12052
12053 case IX86_BUILTIN_KORTESTZ32:
12054 icode = CODE_FOR_kortestsi;
12055 mode3 = CCZmode;
12056 goto kortest;
12057
12058 case IX86_BUILTIN_KORTESTC64:
12059 icode = CODE_FOR_kortestdi;
12060 mode3 = CCCmode;
12061 goto kortest;
12062
12063 case IX86_BUILTIN_KORTESTZ64:
12064 icode = CODE_FOR_kortestdi;
12065 mode3 = CCZmode;
12066
12067 kortest:
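/* KORTEST sets ZF when the OR of the two masks is all zeros and CF
when it is all ones; KTEST sets ZF/CF from AND-type combinations of
the masks. mode3 (CCZmode or CCCmode) selects which flag this
builtin returns. */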
12068 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12069 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12070 op0 = expand_normal (arg0);
12071 op1 = expand_normal (arg1);
12072
12073 mode0 = insn_data[icode].operand[0].mode;
12074 mode1 = insn_data[icode].operand[1].mode;
12075
12076 if (GET_MODE (op0) != VOIDmode)
12077 op0 = force_reg (GET_MODE (op0), op0);
12078
12079 op0 = gen_lowpart (mode0, op0);
12080
12081 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12082 op0 = copy_to_mode_reg (mode0, op0);
12083
12084 if (GET_MODE (op1) != VOIDmode)
12085 op1 = force_reg (GET_MODE (op1), op1);
12086
12087 op1 = gen_lowpart (mode1, op1);
12088
12089 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12090 op1 = copy_to_mode_reg (mode1, op1);
12091
12092 target = gen_reg_rtx (QImode);
12093
12094 /* Emit the kortest/ktest instruction. */
12095 emit_insn (GEN_FCN (icode) (op0, op1));
12096 /* And use setcc to return result from flags. */
12097 ix86_expand_setcc (target, EQ,
12098 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12099 return target;
12100
12101 case IX86_BUILTIN_GATHERSIV2DF:
12102 icode = CODE_FOR_avx2_gathersiv2df;
12103 goto gather_gen;
12104 case IX86_BUILTIN_GATHERSIV4DF:
12105 icode = CODE_FOR_avx2_gathersiv4df;
12106 goto gather_gen;
12107 case IX86_BUILTIN_GATHERDIV2DF:
12108 icode = CODE_FOR_avx2_gatherdiv2df;
12109 goto gather_gen;
12110 case IX86_BUILTIN_GATHERDIV4DF:
12111 icode = CODE_FOR_avx2_gatherdiv4df;
12112 goto gather_gen;
12113 case IX86_BUILTIN_GATHERSIV4SF:
12114 icode = CODE_FOR_avx2_gathersiv4sf;
12115 goto gather_gen;
12116 case IX86_BUILTIN_GATHERSIV8SF:
12117 icode = CODE_FOR_avx2_gathersiv8sf;
12118 goto gather_gen;
12119 case IX86_BUILTIN_GATHERDIV4SF:
12120 icode = CODE_FOR_avx2_gatherdiv4sf;
12121 goto gather_gen;
12122 case IX86_BUILTIN_GATHERDIV8SF:
12123 icode = CODE_FOR_avx2_gatherdiv8sf;
12124 goto gather_gen;
12125 case IX86_BUILTIN_GATHERSIV2DI:
12126 icode = CODE_FOR_avx2_gathersiv2di;
12127 goto gather_gen;
12128 case IX86_BUILTIN_GATHERSIV4DI:
12129 icode = CODE_FOR_avx2_gathersiv4di;
12130 goto gather_gen;
12131 case IX86_BUILTIN_GATHERDIV2DI:
12132 icode = CODE_FOR_avx2_gatherdiv2di;
12133 goto gather_gen;
12134 case IX86_BUILTIN_GATHERDIV4DI:
12135 icode = CODE_FOR_avx2_gatherdiv4di;
12136 goto gather_gen;
12137 case IX86_BUILTIN_GATHERSIV4SI:
12138 icode = CODE_FOR_avx2_gathersiv4si;
12139 goto gather_gen;
12140 case IX86_BUILTIN_GATHERSIV8SI:
12141 icode = CODE_FOR_avx2_gathersiv8si;
12142 goto gather_gen;
12143 case IX86_BUILTIN_GATHERDIV4SI:
12144 icode = CODE_FOR_avx2_gatherdiv4si;
12145 goto gather_gen;
12146 case IX86_BUILTIN_GATHERDIV8SI:
12147 icode = CODE_FOR_avx2_gatherdiv8si;
12148 goto gather_gen;
12149 case IX86_BUILTIN_GATHERALTSIV4DF:
12150 icode = CODE_FOR_avx2_gathersiv4df;
12151 goto gather_gen;
12152 case IX86_BUILTIN_GATHERALTDIV8SF:
12153 icode = CODE_FOR_avx2_gatherdiv8sf;
12154 goto gather_gen;
12155 case IX86_BUILTIN_GATHERALTSIV4DI:
12156 icode = CODE_FOR_avx2_gathersiv4di;
12157 goto gather_gen;
12158 case IX86_BUILTIN_GATHERALTDIV8SI:
12159 icode = CODE_FOR_avx2_gatherdiv8si;
12160 goto gather_gen;
12161 case IX86_BUILTIN_GATHER3SIV16SF:
12162 icode = CODE_FOR_avx512f_gathersiv16sf;
12163 goto gather_gen;
12164 case IX86_BUILTIN_GATHER3SIV8DF:
12165 icode = CODE_FOR_avx512f_gathersiv8df;
12166 goto gather_gen;
12167 case IX86_BUILTIN_GATHER3DIV16SF:
12168 icode = CODE_FOR_avx512f_gatherdiv16sf;
12169 goto gather_gen;
12170 case IX86_BUILTIN_GATHER3DIV8DF:
12171 icode = CODE_FOR_avx512f_gatherdiv8df;
12172 goto gather_gen;
12173 case IX86_BUILTIN_GATHER3SIV16SI:
12174 icode = CODE_FOR_avx512f_gathersiv16si;
12175 goto gather_gen;
12176 case IX86_BUILTIN_GATHER3SIV8DI:
12177 icode = CODE_FOR_avx512f_gathersiv8di;
12178 goto gather_gen;
12179 case IX86_BUILTIN_GATHER3DIV16SI:
12180 icode = CODE_FOR_avx512f_gatherdiv16si;
12181 goto gather_gen;
12182 case IX86_BUILTIN_GATHER3DIV8DI:
12183 icode = CODE_FOR_avx512f_gatherdiv8di;
12184 goto gather_gen;
12185 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12186 icode = CODE_FOR_avx512f_gathersiv8df;
12187 goto gather_gen;
12188 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12189 icode = CODE_FOR_avx512f_gatherdiv16sf;
12190 goto gather_gen;
12191 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12192 icode = CODE_FOR_avx512f_gathersiv8di;
12193 goto gather_gen;
12194 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12195 icode = CODE_FOR_avx512f_gatherdiv16si;
12196 goto gather_gen;
12197 case IX86_BUILTIN_GATHER3SIV2DF:
12198 icode = CODE_FOR_avx512vl_gathersiv2df;
12199 goto gather_gen;
12200 case IX86_BUILTIN_GATHER3SIV4DF:
12201 icode = CODE_FOR_avx512vl_gathersiv4df;
12202 goto gather_gen;
12203 case IX86_BUILTIN_GATHER3DIV2DF:
12204 icode = CODE_FOR_avx512vl_gatherdiv2df;
12205 goto gather_gen;
12206 case IX86_BUILTIN_GATHER3DIV4DF:
12207 icode = CODE_FOR_avx512vl_gatherdiv4df;
12208 goto gather_gen;
12209 case IX86_BUILTIN_GATHER3SIV4SF:
12210 icode = CODE_FOR_avx512vl_gathersiv4sf;
12211 goto gather_gen;
12212 case IX86_BUILTIN_GATHER3SIV8SF:
12213 icode = CODE_FOR_avx512vl_gathersiv8sf;
12214 goto gather_gen;
12215 case IX86_BUILTIN_GATHER3DIV4SF:
12216 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12217 goto gather_gen;
12218 case IX86_BUILTIN_GATHER3DIV8SF:
12219 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12220 goto gather_gen;
12221 case IX86_BUILTIN_GATHER3SIV2DI:
12222 icode = CODE_FOR_avx512vl_gathersiv2di;
12223 goto gather_gen;
12224 case IX86_BUILTIN_GATHER3SIV4DI:
12225 icode = CODE_FOR_avx512vl_gathersiv4di;
12226 goto gather_gen;
12227 case IX86_BUILTIN_GATHER3DIV2DI:
12228 icode = CODE_FOR_avx512vl_gatherdiv2di;
12229 goto gather_gen;
12230 case IX86_BUILTIN_GATHER3DIV4DI:
12231 icode = CODE_FOR_avx512vl_gatherdiv4di;
12232 goto gather_gen;
12233 case IX86_BUILTIN_GATHER3SIV4SI:
12234 icode = CODE_FOR_avx512vl_gathersiv4si;
12235 goto gather_gen;
12236 case IX86_BUILTIN_GATHER3SIV8SI:
12237 icode = CODE_FOR_avx512vl_gathersiv8si;
12238 goto gather_gen;
12239 case IX86_BUILTIN_GATHER3DIV4SI:
12240 icode = CODE_FOR_avx512vl_gatherdiv4si;
12241 goto gather_gen;
12242 case IX86_BUILTIN_GATHER3DIV8SI:
12243 icode = CODE_FOR_avx512vl_gatherdiv8si;
12244 goto gather_gen;
12245 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12246 icode = CODE_FOR_avx512vl_gathersiv4df;
12247 goto gather_gen;
12248 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12249 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12250 goto gather_gen;
12251 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12252 icode = CODE_FOR_avx512vl_gathersiv4di;
12253 goto gather_gen;
12254 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12255 icode = CODE_FOR_avx512vl_gatherdiv8si;
12256 goto gather_gen;
12257 case IX86_BUILTIN_SCATTERSIV16SF:
12258 icode = CODE_FOR_avx512f_scattersiv16sf;
12259 goto scatter_gen;
12260 case IX86_BUILTIN_SCATTERSIV8DF:
12261 icode = CODE_FOR_avx512f_scattersiv8df;
12262 goto scatter_gen;
12263 case IX86_BUILTIN_SCATTERDIV16SF:
12264 icode = CODE_FOR_avx512f_scatterdiv16sf;
12265 goto scatter_gen;
12266 case IX86_BUILTIN_SCATTERDIV8DF:
12267 icode = CODE_FOR_avx512f_scatterdiv8df;
12268 goto scatter_gen;
12269 case IX86_BUILTIN_SCATTERSIV16SI:
12270 icode = CODE_FOR_avx512f_scattersiv16si;
12271 goto scatter_gen;
12272 case IX86_BUILTIN_SCATTERSIV8DI:
12273 icode = CODE_FOR_avx512f_scattersiv8di;
12274 goto scatter_gen;
12275 case IX86_BUILTIN_SCATTERDIV16SI:
12276 icode = CODE_FOR_avx512f_scatterdiv16si;
12277 goto scatter_gen;
12278 case IX86_BUILTIN_SCATTERDIV8DI:
12279 icode = CODE_FOR_avx512f_scatterdiv8di;
12280 goto scatter_gen;
12281 case IX86_BUILTIN_SCATTERSIV8SF:
12282 icode = CODE_FOR_avx512vl_scattersiv8sf;
12283 goto scatter_gen;
12284 case IX86_BUILTIN_SCATTERSIV4SF:
12285 icode = CODE_FOR_avx512vl_scattersiv4sf;
12286 goto scatter_gen;
12287 case IX86_BUILTIN_SCATTERSIV4DF:
12288 icode = CODE_FOR_avx512vl_scattersiv4df;
12289 goto scatter_gen;
12290 case IX86_BUILTIN_SCATTERSIV2DF:
12291 icode = CODE_FOR_avx512vl_scattersiv2df;
12292 goto scatter_gen;
12293 case IX86_BUILTIN_SCATTERDIV8SF:
12294 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12295 goto scatter_gen;
12296 case IX86_BUILTIN_SCATTERDIV4SF:
12297 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12298 goto scatter_gen;
12299 case IX86_BUILTIN_SCATTERDIV4DF:
12300 icode = CODE_FOR_avx512vl_scatterdiv4df;
12301 goto scatter_gen;
12302 case IX86_BUILTIN_SCATTERDIV2DF:
12303 icode = CODE_FOR_avx512vl_scatterdiv2df;
12304 goto scatter_gen;
12305 case IX86_BUILTIN_SCATTERSIV8SI:
12306 icode = CODE_FOR_avx512vl_scattersiv8si;
12307 goto scatter_gen;
12308 case IX86_BUILTIN_SCATTERSIV4SI:
12309 icode = CODE_FOR_avx512vl_scattersiv4si;
12310 goto scatter_gen;
12311 case IX86_BUILTIN_SCATTERSIV4DI:
12312 icode = CODE_FOR_avx512vl_scattersiv4di;
12313 goto scatter_gen;
12314 case IX86_BUILTIN_SCATTERSIV2DI:
12315 icode = CODE_FOR_avx512vl_scattersiv2di;
12316 goto scatter_gen;
12317 case IX86_BUILTIN_SCATTERDIV8SI:
12318 icode = CODE_FOR_avx512vl_scatterdiv8si;
12319 goto scatter_gen;
12320 case IX86_BUILTIN_SCATTERDIV4SI:
12321 icode = CODE_FOR_avx512vl_scatterdiv4si;
12322 goto scatter_gen;
12323 case IX86_BUILTIN_SCATTERDIV4DI:
12324 icode = CODE_FOR_avx512vl_scatterdiv4di;
12325 goto scatter_gen;
12326 case IX86_BUILTIN_SCATTERDIV2DI:
12327 icode = CODE_FOR_avx512vl_scatterdiv2di;
12328 goto scatter_gen;
12329 case IX86_BUILTIN_GATHERPFDPD:
12330 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12331 goto vec_prefetch_gen;
12332 case IX86_BUILTIN_SCATTERALTSIV8DF:
12333 icode = CODE_FOR_avx512f_scattersiv8df;
12334 goto scatter_gen;
12335 case IX86_BUILTIN_SCATTERALTDIV16SF:
12336 icode = CODE_FOR_avx512f_scatterdiv16sf;
12337 goto scatter_gen;
12338 case IX86_BUILTIN_SCATTERALTSIV8DI:
12339 icode = CODE_FOR_avx512f_scattersiv8di;
12340 goto scatter_gen;
12341 case IX86_BUILTIN_SCATTERALTDIV16SI:
12342 icode = CODE_FOR_avx512f_scatterdiv16si;
12343 goto scatter_gen;
12344 case IX86_BUILTIN_SCATTERALTSIV4DF:
12345 icode = CODE_FOR_avx512vl_scattersiv4df;
12346 goto scatter_gen;
12347 case IX86_BUILTIN_SCATTERALTDIV8SF:
12348 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12349 goto scatter_gen;
12350 case IX86_BUILTIN_SCATTERALTSIV4DI:
12351 icode = CODE_FOR_avx512vl_scattersiv4di;
12352 goto scatter_gen;
12353 case IX86_BUILTIN_SCATTERALTDIV8SI:
12354 icode = CODE_FOR_avx512vl_scatterdiv8si;
12355 goto scatter_gen;
12356 case IX86_BUILTIN_SCATTERALTSIV2DF:
12357 icode = CODE_FOR_avx512vl_scattersiv2df;
12358 goto scatter_gen;
12359 case IX86_BUILTIN_SCATTERALTDIV4SF:
12360 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12361 goto scatter_gen;
12362 case IX86_BUILTIN_SCATTERALTSIV2DI:
12363 icode = CODE_FOR_avx512vl_scattersiv2di;
12364 goto scatter_gen;
12365 case IX86_BUILTIN_SCATTERALTDIV4SI:
12366 icode = CODE_FOR_avx512vl_scatterdiv4si;
12367 goto scatter_gen;
12368 case IX86_BUILTIN_GATHERPFDPS:
12369 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12370 goto vec_prefetch_gen;
12371 case IX86_BUILTIN_GATHERPFQPD:
12372 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12373 goto vec_prefetch_gen;
12374 case IX86_BUILTIN_GATHERPFQPS:
12375 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12376 goto vec_prefetch_gen;
12377 case IX86_BUILTIN_SCATTERPFDPD:
12378 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12379 goto vec_prefetch_gen;
12380 case IX86_BUILTIN_SCATTERPFDPS:
12381 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12382 goto vec_prefetch_gen;
12383 case IX86_BUILTIN_SCATTERPFQPD:
12384 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12385 goto vec_prefetch_gen;
12386 case IX86_BUILTIN_SCATTERPFQPS:
12387 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12388 goto vec_prefetch_gen;
12389
12390 gather_gen:
12391 rtx half;
12392 rtx (*gen) (rtx, rtx);
12393
12394 arg0 = CALL_EXPR_ARG (exp, 0);
12395 arg1 = CALL_EXPR_ARG (exp, 1);
12396 arg2 = CALL_EXPR_ARG (exp, 2);
12397 arg3 = CALL_EXPR_ARG (exp, 3);
12398 arg4 = CALL_EXPR_ARG (exp, 4);
12399 op0 = expand_normal (arg0);
12400 op1 = expand_normal (arg1);
12401 op2 = expand_normal (arg2);
12402 op3 = expand_normal (arg3);
12403 op4 = expand_normal (arg4);
12404 /* Note the arg order is different from the operand order. */
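/* For the gather patterns operand 0 is the destination, operand 1 the
merge/source vector (arg0), operand 2 the base address (arg1),
operand 3 the index vector (arg2), operand 4 the mask (arg3) and
operand 5 the scale (arg4). */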
12405 mode0 = insn_data[icode].operand[1].mode;
12406 mode2 = insn_data[icode].operand[3].mode;
12407 mode3 = insn_data[icode].operand[4].mode;
12408 mode4 = insn_data[icode].operand[5].mode;
12409
12410 if (target == NULL_RTX
12411 || GET_MODE (target) != insn_data[icode].operand[0].mode
12412 || !insn_data[icode].operand[0].predicate (target,
12413 GET_MODE (target)))
12414 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12415 else
12416 subtarget = target;
12417
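/* The *ALT* variants pair an index vector with a data vector of a
different element count; extract the low half of whichever operand
has twice as many elements so the operands match the insn's modes. */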
12418 switch (fcode)
12419 {
12420 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12421 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12422 half = gen_reg_rtx (V8SImode);
12423 if (!nonimmediate_operand (op2, V16SImode))
12424 op2 = copy_to_mode_reg (V16SImode, op2);
12425 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12426 op2 = half;
12427 break;
12428 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12429 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12430 case IX86_BUILTIN_GATHERALTSIV4DF:
12431 case IX86_BUILTIN_GATHERALTSIV4DI:
12432 half = gen_reg_rtx (V4SImode);
12433 if (!nonimmediate_operand (op2, V8SImode))
12434 op2 = copy_to_mode_reg (V8SImode, op2);
12435 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12436 op2 = half;
12437 break;
12438 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12439 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12440 half = gen_reg_rtx (mode0);
12441 if (mode0 == V8SFmode)
12442 gen = gen_vec_extract_lo_v16sf;
12443 else
12444 gen = gen_vec_extract_lo_v16si;
12445 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12446 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12447 emit_insn (gen (half, op0));
12448 op0 = half;
12449 op3 = lowpart_subreg (QImode, op3, HImode);
12450 break;
12451 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12452 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12453 case IX86_BUILTIN_GATHERALTDIV8SF:
12454 case IX86_BUILTIN_GATHERALTDIV8SI:
12455 half = gen_reg_rtx (mode0);
12456 if (mode0 == V4SFmode)
12457 gen = gen_vec_extract_lo_v8sf;
12458 else
12459 gen = gen_vec_extract_lo_v8si;
12460 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12461 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12462 emit_insn (gen (half, op0));
12463 op0 = half;
12464 if (VECTOR_MODE_P (GET_MODE (op3)))
12465 {
12466 half = gen_reg_rtx (mode0);
12467 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12468 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12469 emit_insn (gen (half, op3));
12470 op3 = half;
12471 }
12472 break;
12473 default:
12474 break;
12475 }
12476
12477 /* Force the memory operand to be addressed through a base register
12478 here. We don't want to do this for the memory operands of other
12479 builtin functions. */
12480 op1 = ix86_zero_extend_to_Pmode (op1);
12481
12482 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12483 op0 = copy_to_mode_reg (mode0, op0);
12484 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12485 op1 = copy_to_mode_reg (Pmode, op1);
12486 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12487 op2 = copy_to_mode_reg (mode2, op2);
12488
12489 op3 = fixup_modeless_constant (op3, mode3);
12490
12491 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12492 {
12493 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12494 op3 = copy_to_mode_reg (mode3, op3);
12495 }
12496 else
12497 {
12498 op3 = copy_to_reg (op3);
12499 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12500 }
12501 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12502 {
12503 error ("the last argument must be scale 1, 2, 4, 8");
12504 return const0_rtx;
12505 }
12506
12507 /* Optimize. If the mask is known to have the high bit set in
12508 every element, replace op0 with pc_rtx to signal that the
12509 instruction overwrites the whole destination and doesn't use
12510 its previous contents. */
12511 if (optimize)
12512 {
12513 if (TREE_CODE (arg3) == INTEGER_CST)
12514 {
12515 if (integer_all_onesp (arg3))
12516 op0 = pc_rtx;
12517 }
12518 else if (TREE_CODE (arg3) == VECTOR_CST)
12519 {
12520 unsigned int negative = 0;
12521 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12522 {
12523 tree cst = VECTOR_CST_ELT (arg3, i);
12524 if (TREE_CODE (cst) == INTEGER_CST
12525 && tree_int_cst_sign_bit (cst))
12526 negative++;
12527 else if (TREE_CODE (cst) == REAL_CST
12528 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12529 negative++;
12530 }
12531 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12532 op0 = pc_rtx;
12533 }
12534 else if (TREE_CODE (arg3) == SSA_NAME
12535 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12536 {
12537 /* Recognize also when mask is like:
12538 __v2df src = _mm_setzero_pd ();
12539 __v2df mask = _mm_cmpeq_pd (src, src);
12540 or
12541 __v8sf src = _mm256_setzero_ps ();
12542 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12543 as that is a cheaper way to load all ones into
12544 a register than having to load a constant from
12545 memory. */
12546 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12547 if (is_gimple_call (def_stmt))
12548 {
12549 tree fndecl = gimple_call_fndecl (def_stmt);
12550 if (fndecl
12551 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12552 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
12553 {
12554 case IX86_BUILTIN_CMPPD:
12555 case IX86_BUILTIN_CMPPS:
12556 case IX86_BUILTIN_CMPPD256:
12557 case IX86_BUILTIN_CMPPS256:
12558 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12559 break;
12560 /* FALLTHRU */
12561 case IX86_BUILTIN_CMPEQPD:
12562 case IX86_BUILTIN_CMPEQPS:
12563 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12564 && initializer_zerop (gimple_call_arg (def_stmt,
12565 1)))
12566 op0 = pc_rtx;
12567 break;
12568 default:
12569 break;
12570 }
12571 }
12572 }
12573 }
12574
12575 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12576 if (! pat)
12577 return const0_rtx;
12578 emit_insn (pat);
12579
12580 switch (fcode)
12581 {
12582 case IX86_BUILTIN_GATHER3DIV16SF:
12583 if (target == NULL_RTX)
12584 target = gen_reg_rtx (V8SFmode);
12585 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12586 break;
12587 case IX86_BUILTIN_GATHER3DIV16SI:
12588 if (target == NULL_RTX)
12589 target = gen_reg_rtx (V8SImode);
12590 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12591 break;
12592 case IX86_BUILTIN_GATHER3DIV8SF:
12593 case IX86_BUILTIN_GATHERDIV8SF:
12594 if (target == NULL_RTX)
12595 target = gen_reg_rtx (V4SFmode);
12596 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12597 break;
12598 case IX86_BUILTIN_GATHER3DIV8SI:
12599 case IX86_BUILTIN_GATHERDIV8SI:
12600 if (target == NULL_RTX)
12601 target = gen_reg_rtx (V4SImode);
12602 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12603 break;
12604 default:
12605 target = subtarget;
12606 break;
12607 }
12608 return target;
12609
12610 scatter_gen:
12611 arg0 = CALL_EXPR_ARG (exp, 0);
12612 arg1 = CALL_EXPR_ARG (exp, 1);
12613 arg2 = CALL_EXPR_ARG (exp, 2);
12614 arg3 = CALL_EXPR_ARG (exp, 3);
12615 arg4 = CALL_EXPR_ARG (exp, 4);
12616 op0 = expand_normal (arg0);
12617 op1 = expand_normal (arg1);
12618 op2 = expand_normal (arg2);
12619 op3 = expand_normal (arg3);
12620 op4 = expand_normal (arg4);
12621 mode1 = insn_data[icode].operand[1].mode;
12622 mode2 = insn_data[icode].operand[2].mode;
12623 mode3 = insn_data[icode].operand[3].mode;
12624 mode4 = insn_data[icode].operand[4].mode;
12625
12626 /* Scatter instruction stores operand op3 to memory with
12627 indices from op2 and scale from op4 under writemask op1.
12628 If index operand op2 has more elements than source operand
12629 op3, only its low half needs to be used, and vice versa. */
12630 switch (fcode)
12631 {
12632 case IX86_BUILTIN_SCATTERALTSIV8DF:
12633 case IX86_BUILTIN_SCATTERALTSIV8DI:
12634 half = gen_reg_rtx (V8SImode);
12635 if (!nonimmediate_operand (op2, V16SImode))
12636 op2 = copy_to_mode_reg (V16SImode, op2);
12637 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12638 op2 = half;
12639 break;
12640 case IX86_BUILTIN_SCATTERALTDIV16SF:
12641 case IX86_BUILTIN_SCATTERALTDIV16SI:
12642 half = gen_reg_rtx (mode3);
12643 if (mode3 == V8SFmode)
12644 gen = gen_vec_extract_lo_v16sf;
12645 else
12646 gen = gen_vec_extract_lo_v16si;
12647 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12648 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12649 emit_insn (gen (half, op3));
12650 op3 = half;
12651 break;
12652 case IX86_BUILTIN_SCATTERALTSIV4DF:
12653 case IX86_BUILTIN_SCATTERALTSIV4DI:
12654 half = gen_reg_rtx (V4SImode);
12655 if (!nonimmediate_operand (op2, V8SImode))
12656 op2 = copy_to_mode_reg (V8SImode, op2);
12657 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12658 op2 = half;
12659 break;
12660 case IX86_BUILTIN_SCATTERALTDIV8SF:
12661 case IX86_BUILTIN_SCATTERALTDIV8SI:
12662 half = gen_reg_rtx (mode3);
12663 if (mode3 == V4SFmode)
12664 gen = gen_vec_extract_lo_v8sf;
12665 else
12666 gen = gen_vec_extract_lo_v8si;
12667 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12668 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12669 emit_insn (gen (half, op3));
12670 op3 = half;
12671 break;
12672 case IX86_BUILTIN_SCATTERALTSIV2DF:
12673 case IX86_BUILTIN_SCATTERALTSIV2DI:
12674 if (!nonimmediate_operand (op2, V4SImode))
12675 op2 = copy_to_mode_reg (V4SImode, op2);
12676 break;
12677 case IX86_BUILTIN_SCATTERALTDIV4SF:
12678 case IX86_BUILTIN_SCATTERALTDIV4SI:
12679 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12680 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12681 break;
12682 default:
12683 break;
12684 }
12685
12686 /* Force the memory operand to be addressed through a base register
12687 here. We don't want to do this for the memory operands of other
12688 builtin functions. */
12689 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12690
12691 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12692 op0 = copy_to_mode_reg (Pmode, op0);
12693
12694 op1 = fixup_modeless_constant (op1, mode1);
12695
12696 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12697 {
12698 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12699 op1 = copy_to_mode_reg (mode1, op1);
12700 }
12701 else
12702 {
12703 op1 = copy_to_reg (op1);
12704 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12705 }
12706
12707 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12708 op2 = copy_to_mode_reg (mode2, op2);
12709
12710 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12711 op3 = copy_to_mode_reg (mode3, op3);
12712
12713 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12714 {
12715 error ("the last argument must be scale 1, 2, 4, 8");
12716 return const0_rtx;
12717 }
12718
12719 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12720 if (! pat)
12721 return const0_rtx;
12722
12723 emit_insn (pat);
12724 return 0;
12725
12726 vec_prefetch_gen:
12727 arg0 = CALL_EXPR_ARG (exp, 0);
12728 arg1 = CALL_EXPR_ARG (exp, 1);
12729 arg2 = CALL_EXPR_ARG (exp, 2);
12730 arg3 = CALL_EXPR_ARG (exp, 3);
12731 arg4 = CALL_EXPR_ARG (exp, 4);
12732 op0 = expand_normal (arg0);
12733 op1 = expand_normal (arg1);
12734 op2 = expand_normal (arg2);
12735 op3 = expand_normal (arg3);
12736 op4 = expand_normal (arg4);
12737 mode0 = insn_data[icode].operand[0].mode;
12738 mode1 = insn_data[icode].operand[1].mode;
12739 mode3 = insn_data[icode].operand[3].mode;
12740 mode4 = insn_data[icode].operand[4].mode;
12741
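/* For the gather/scatter prefetch patterns operand 0 is the mask,
operand 1 the index vector, operand 2 the base address, operand 3
the scale and operand 4 the prefetch hint. */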
12742 op0 = fixup_modeless_constant (op0, mode0);
12743
12744 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12745 {
12746 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12747 op0 = copy_to_mode_reg (mode0, op0);
12748 }
12749 else
12750 {
12751 op0 = copy_to_reg (op0);
12752 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12753 }
12754
12755 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12756 op1 = copy_to_mode_reg (mode1, op1);
12757
12758 /* Force the memory operand to be addressed through a base register
12759 here. We don't want to do this for the memory operands of other
12760 builtin functions. */
12761 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12762
12763 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12764 op2 = copy_to_mode_reg (Pmode, op2);
12765
12766 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12767 {
12768 error ("the fourth argument must be scale 1, 2, 4, 8");
12769 return const0_rtx;
12770 }
12771
12772 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12773 {
12774 error ("incorrect hint operand");
12775 return const0_rtx;
12776 }
12777
12778 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12779 if (! pat)
12780 return const0_rtx;
12781
12782 emit_insn (pat);
12783
12784 return 0;
12785
12786 case IX86_BUILTIN_XABORT:
12787 icode = CODE_FOR_xabort;
12788 arg0 = CALL_EXPR_ARG (exp, 0);
12789 op0 = expand_normal (arg0);
12790 mode0 = insn_data[icode].operand[0].mode;
12791 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12792 {
12793 error ("the argument to %<xabort%> intrinsic must "
12794 "be an 8-bit immediate");
12795 return const0_rtx;
12796 }
12797 emit_insn (gen_xabort (op0));
12798 return 0;
12799
12800 case IX86_BUILTIN_RSTORSSP:
12801 case IX86_BUILTIN_CLRSSBSY:
12802 arg0 = CALL_EXPR_ARG (exp, 0);
12803 op0 = expand_normal (arg0);
12804 icode = (fcode == IX86_BUILTIN_RSTORSSP
12805 ? CODE_FOR_rstorssp
12806 : CODE_FOR_clrssbsy);
12807 if (!address_operand (op0, VOIDmode))
12808 {
12809 op1 = convert_memory_address (Pmode, op0);
12810 op0 = copy_addr_to_reg (op1);
12811 }
12812 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12813 return 0;
12814
12815 case IX86_BUILTIN_WRSSD:
12816 case IX86_BUILTIN_WRSSQ:
12817 case IX86_BUILTIN_WRUSSD:
12818 case IX86_BUILTIN_WRUSSQ:
12819 arg0 = CALL_EXPR_ARG (exp, 0);
12820 op0 = expand_normal (arg0);
12821 arg1 = CALL_EXPR_ARG (exp, 1);
12822 op1 = expand_normal (arg1);
12823 switch (fcode)
12824 {
12825 case IX86_BUILTIN_WRSSD:
12826 icode = CODE_FOR_wrsssi;
12827 mode = SImode;
12828 break;
12829 case IX86_BUILTIN_WRSSQ:
12830 icode = CODE_FOR_wrssdi;
12831 mode = DImode;
12832 break;
12833 case IX86_BUILTIN_WRUSSD:
12834 icode = CODE_FOR_wrusssi;
12835 mode = SImode;
12836 break;
12837 case IX86_BUILTIN_WRUSSQ:
12838 icode = CODE_FOR_wrussdi;
12839 mode = DImode;
12840 break;
12841 }
12842 op0 = force_reg (mode, op0);
12843 if (!address_operand (op1, VOIDmode))
12844 {
12845 op2 = convert_memory_address (Pmode, op1);
12846 op1 = copy_addr_to_reg (op2);
12847 }
12848 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12849 return 0;
12850
12851 default:
12852 break;
12853 }
12854
12855 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12856 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12857 {
12858 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12859 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12860 target);
12861 }
12862
12863 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12864 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12865 {
12866 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12867 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12868 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12869 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12870 int masked = 1;
12871 machine_mode mode, wide_mode, nar_mode;
12872
12873 nar_mode = V4SFmode;
12874 mode = V16SFmode;
12875 wide_mode = V64SFmode;
12876 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12877 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12878
12879 switch (fcode)
12880 {
12881 case IX86_BUILTIN_4FMAPS:
12882 fcn = gen_avx5124fmaddps_4fmaddps;
12883 masked = 0;
12884 goto v4fma_expand;
12885
12886 case IX86_BUILTIN_4DPWSSD:
12887 nar_mode = V4SImode;
12888 mode = V16SImode;
12889 wide_mode = V64SImode;
12890 fcn = gen_avx5124vnniw_vp4dpwssd;
12891 masked = 0;
12892 goto v4fma_expand;
12893
12894 case IX86_BUILTIN_4DPWSSDS:
12895 nar_mode = V4SImode;
12896 mode = V16SImode;
12897 wide_mode = V64SImode;
12898 fcn = gen_avx5124vnniw_vp4dpwssds;
12899 masked = 0;
12900 goto v4fma_expand;
12901
12902 case IX86_BUILTIN_4FNMAPS:
12903 fcn = gen_avx5124fmaddps_4fnmaddps;
12904 masked = 0;
12905 goto v4fma_expand;
12906
12907 case IX86_BUILTIN_4FNMAPS_MASK:
12908 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12909 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12910 goto v4fma_expand;
12911
12912 case IX86_BUILTIN_4DPWSSD_MASK:
12913 nar_mode = V4SImode;
12914 mode = V16SImode;
12915 wide_mode = V64SImode;
12916 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12917 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12918 goto v4fma_expand;
12919
12920 case IX86_BUILTIN_4DPWSSDS_MASK:
12921 nar_mode = V4SImode;
12922 mode = V16SImode;
12923 wide_mode = V64SImode;
12924 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12925 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12926 goto v4fma_expand;
12927
12928 case IX86_BUILTIN_4FMAPS_MASK:
12929 {
12930 tree args[4];
12931 rtx ops[4];
12932 rtx wide_reg;
12933 rtx accum;
12934 rtx addr;
12935 rtx mem;
12936
12937 v4fma_expand:
12938 wide_reg = gen_reg_rtx (wide_mode);
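/* Assemble the four 512-bit source vectors into a single V64SFmode
(or V64SImode) pseudo by storing each one into its own 64-byte
slice; the 4fmadd/4dpwssd patterns read all four sources from that
wide register. */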
12939 for (i = 0; i < 4; i++)
12940 {
12941 args[i] = CALL_EXPR_ARG (exp, i);
12942 ops[i] = expand_normal (args[i]);
12943
12944 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12945 ops[i]);
12946 }
12947
12948 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12949 accum = force_reg (mode, accum);
12950
12951 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12952 addr = force_reg (Pmode, addr);
12953
12954 mem = gen_rtx_MEM (nar_mode, addr);
12955
12956 target = gen_reg_rtx (mode);
12957
12958 emit_move_insn (target, accum);
12959
12960 if (! masked)
12961 emit_insn (fcn (target, accum, wide_reg, mem));
12962 else
12963 {
12964 rtx merge, mask;
12965 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12966
12967 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12968
12969 if (CONST_INT_P (mask))
12970 mask = fixup_modeless_constant (mask, HImode);
12971
12972 mask = force_reg (HImode, mask);
12973
12974 if (GET_MODE (mask) != HImode)
12975 mask = gen_rtx_SUBREG (HImode, mask, 0);
12976
12977 /* If merge is 0 then we're about to emit z-masked variant. */
12978 if (const0_operand (merge, mode))
12979 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12980 /* If merge is the same as accum then emit merge-masked variant. */
12981 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12982 {
12983 merge = force_reg (mode, merge);
12984 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12985 }
12986 /* Merging with something unknown can happen if we z-mask with -O0. */
12987 else
12988 {
12989 target = gen_reg_rtx (mode);
12990 emit_move_insn (target, merge);
12991 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12992 }
12993 }
12994 return target;
12995 }
12996
12997 case IX86_BUILTIN_4FNMASS:
12998 fcn = gen_avx5124fmaddps_4fnmaddss;
12999 masked = 0;
13000 goto s4fma_expand;
13001
13002 case IX86_BUILTIN_4FMASS:
13003 fcn = gen_avx5124fmaddps_4fmaddss;
13004 masked = 0;
13005 goto s4fma_expand;
13006
13007 case IX86_BUILTIN_4FNMASS_MASK:
13008 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13009 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13010 goto s4fma_expand;
13011
13012 case IX86_BUILTIN_4FMASS_MASK:
13013 {
13014 tree args[4];
13015 rtx ops[4];
13016 rtx wide_reg;
13017 rtx accum;
13018 rtx addr;
13019 rtx mem;
13020
13021 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13022 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13023
13024 s4fma_expand:
13025 mode = V4SFmode;
13026 wide_reg = gen_reg_rtx (V64SFmode);
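/* The scalar forms only use element 0 of each V4SF argument; copy that
element out and place it at the start of the corresponding 64-byte
slice of the V64SFmode register. */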
13027 for (i = 0; i < 4; i++)
13028 {
13029 rtx tmp;
13030 args[i] = CALL_EXPR_ARG (exp, i);
13031 ops[i] = expand_normal (args[i]);
13032
13033 tmp = gen_reg_rtx (SFmode);
13034 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13035
13036 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13037 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13038 }
13039
13040 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13041 accum = force_reg (V4SFmode, accum);
13042
13043 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13044 addr = force_reg (Pmode, addr);
13045
13046 mem = gen_rtx_MEM (V4SFmode, addr);
13047
13048 target = gen_reg_rtx (V4SFmode);
13049
13050 emit_move_insn (target, accum);
13051
13052 if (! masked)
13053 emit_insn (fcn (target, accum, wide_reg, mem));
13054 else
13055 {
13056 rtx merge, mask;
13057 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13058
13059 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13060
13061 if (CONST_INT_P (mask))
13062 mask = fixup_modeless_constant (mask, QImode);
13063
13064 mask = force_reg (QImode, mask);
13065
13066 if (GET_MODE (mask) != QImode)
13067 mask = gen_rtx_SUBREG (QImode, mask, 0);
13068
13069 /* If merge is 0 then we're about to emit z-masked variant. */
13070 if (const0_operand (merge, mode))
13071 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13072 /* If merge is the same as accum then emit merge-masked
13073 variant. */
13074 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13075 {
13076 merge = force_reg (mode, merge);
13077 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13078 }
13079 /* Merging with something unknown can happen if we z-mask
13080 with -O0. */
13081 else
13082 {
13083 target = gen_reg_rtx (mode);
13084 emit_move_insn (target, merge);
13085 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13086 }
13087 }
13088 return target;
13089 }
13090 case IX86_BUILTIN_RDPID:
13091 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13092 target);
13093 case IX86_BUILTIN_FABSQ:
13094 case IX86_BUILTIN_COPYSIGNQ:
13095 if (!TARGET_SSE)
13096 /* Emit a normal call if SSE isn't available. */
13097 return expand_call (exp, target, ignore);
13098 /* FALLTHRU */
13099 default:
13100 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13101 }
13102 }
13103
13104 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13105 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13106 {
13107 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13108 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13109 }
13110
13111 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13112 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13113 {
13114 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13115 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13116 }
13117
13118 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13119 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13120 {
13121 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13122 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13123 }
13124
13125 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13126 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13127 {
13128 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13129 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13130 }
13131
13132 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13133 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13134 {
13135 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13136 const struct builtin_description *d = bdesc_multi_arg + i;
13137 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13138 (enum ix86_builtin_func_type)
13139 d->flag, d->comparison);
13140 }
13141
13142 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13143 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13144 {
13145 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13146 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13147 target);
13148 }
13149
13150 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13151 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13152 {
13153 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13154 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13155 target);
13156 }
13157
13158 gcc_unreachable ();
13159 }
13160
13161 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13162 fill target with val via vec_duplicate. */
13163
13164 static bool
13165 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13166 {
13167 bool ok;
13168 rtx_insn *insn;
13169 rtx dup;
13170
13171 /* First attempt to recognize VAL as-is. */
13172 dup = gen_vec_duplicate (mode, val);
13173 insn = emit_insn (gen_rtx_SET (target, dup));
13174 if (recog_memoized (insn) < 0)
13175 {
13176 rtx_insn *seq;
13177 machine_mode innermode = GET_MODE_INNER (mode);
13178 rtx reg;
13179
13180 /* If that fails, force VAL into a register. */
13181
13182 start_sequence ();
13183 reg = force_reg (innermode, val);
13184 if (GET_MODE (reg) != innermode)
13185 reg = gen_lowpart (innermode, reg);
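/* Rewrite the source of the insn emitted above so it duplicates the
   register, and emit the forcing sequence before that insn. */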
13186 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13187 seq = get_insns ();
13188 end_sequence ();
13189 if (seq)
13190 emit_insn_before (seq, insn);
13191
13192 ok = recog_memoized (insn) >= 0;
13193 gcc_assert (ok);
13194 }
13195 return true;
13196 }
13197
13198 /* Get a vector mode of the same size as the original but with elements
13199 twice as wide. This is only guaranteed to apply to integral vectors. */
13200
13201 static machine_mode
13202 get_mode_wider_vector (machine_mode o)
13203 {
13204 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13205 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13206 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13207 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13208 return n;
13209 }
13210
13211 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13212 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13213
13214 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13215 with all elements equal to VAR. Return true if successful. */
13216
13217 static bool
13218 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13219 rtx target, rtx val)
13220 {
13221 bool ok;
13222
13223 switch (mode)
13224 {
13225 case E_V2SImode:
13226 case E_V2SFmode:
13227 if (!mmx_ok)
13228 return false;
13229 /* FALLTHRU */
13230
13231 case E_V4DFmode:
13232 case E_V4DImode:
13233 case E_V8SFmode:
13234 case E_V8SImode:
13235 case E_V2DFmode:
13236 case E_V2DImode:
13237 case E_V4SFmode:
13238 case E_V4SImode:
13239 case E_V16SImode:
13240 case E_V8DImode:
13241 case E_V16SFmode:
13242 case E_V8DFmode:
13243 return ix86_vector_duplicate_value (mode, target, val);
13244
13245 case E_V4HImode:
13246 if (!mmx_ok)
13247 return false;
13248 if (TARGET_SSE || TARGET_3DNOW_A)
13249 {
13250 rtx x;
13251
13252 val = gen_lowpart (SImode, val);
13253 x = gen_rtx_TRUNCATE (HImode, val);
13254 x = gen_rtx_VEC_DUPLICATE (mode, x);
13255 emit_insn (gen_rtx_SET (target, x));
13256 return true;
13257 }
13258 goto widen;
13259
13260 case E_V8QImode:
13261 if (!mmx_ok)
13262 return false;
13263 goto widen;
13264
13265 case E_V8HImode:
13266 if (TARGET_AVX2)
13267 return ix86_vector_duplicate_value (mode, target, val);
13268
13269 if (TARGET_SSE2)
13270 {
13271 struct expand_vec_perm_d dperm;
13272 rtx tmp1, tmp2;
13273
13274 permute:
13275 memset (&dperm, 0, sizeof (dperm));
13276 dperm.target = target;
13277 dperm.vmode = mode;
13278 dperm.nelt = GET_MODE_NUNITS (mode);
13279 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13280 dperm.one_operand_p = true;
13281
13282 /* Extend to SImode using a paradoxical SUBREG. */
13283 tmp1 = gen_reg_rtx (SImode);
13284 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13285
13286 /* Insert the SImode value as low element of a V4SImode vector. */
13287 tmp2 = gen_reg_rtx (V4SImode);
13288 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13289 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13290
13291 ok = (expand_vec_perm_1 (&dperm)
13292 || expand_vec_perm_broadcast_1 (&dperm));
13293 gcc_assert (ok);
13294 return ok;
13295 }
13296 goto widen;
13297
13298 case E_V16QImode:
13299 if (TARGET_AVX2)
13300 return ix86_vector_duplicate_value (mode, target, val);
13301
13302 if (TARGET_SSE2)
13303 goto permute;
13304 goto widen;
13305
13306 widen:
13307 /* Replicate the value once into the next wider mode and recurse. */
13308 {
13309 machine_mode smode, wsmode, wvmode;
13310 rtx x;
13311
13312 smode = GET_MODE_INNER (mode);
13313 wvmode = get_mode_wider_vector (mode);
13314 wsmode = GET_MODE_INNER (wvmode);
13315
13316 val = convert_modes (wsmode, smode, val, true);
13317 x = expand_simple_binop (wsmode, ASHIFT, val,
13318 GEN_INT (GET_MODE_BITSIZE (smode)),
13319 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13320 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
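/* VAL now holds two copies of the original narrow value packed into one
   wider scalar (e.g. 0xab becomes 0xabab), so duplicating it in the wider
   vector mode replicates the narrow value in every element. */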
13321
13322 x = gen_reg_rtx (wvmode);
13323 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13324 gcc_assert (ok);
13325 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13326 return ok;
13327 }
13328
13329 case E_V16HImode:
13330 case E_V32QImode:
13331 if (TARGET_AVX2)
13332 return ix86_vector_duplicate_value (mode, target, val);
13333 else
13334 {
13335 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13336 rtx x = gen_reg_rtx (hvmode);
13337
13338 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13339 gcc_assert (ok);
13340
13341 x = gen_rtx_VEC_CONCAT (mode, x, x);
13342 emit_insn (gen_rtx_SET (target, x));
13343 }
13344 return true;
13345
13346 case E_V64QImode:
13347 case E_V32HImode:
13348 if (TARGET_AVX512BW)
13349 return ix86_vector_duplicate_value (mode, target, val);
13350 else
13351 {
13352 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13353 rtx x = gen_reg_rtx (hvmode);
13354
13355 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13356 gcc_assert (ok);
13357
13358 x = gen_rtx_VEC_CONCAT (mode, x, x);
13359 emit_insn (gen_rtx_SET (target, x));
13360 }
13361 return true;
13362
13363 default:
13364 return false;
13365 }
13366 }
13367
13368 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13369 whose ONE_VAR element is VAR, and other elements are zero. Return true
13370 if successful. */
13371
13372 static bool
13373 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13374 rtx target, rtx var, int one_var)
13375 {
13376 machine_mode vsimode;
13377 rtx new_target;
13378 rtx x, tmp;
13379 bool use_vector_set = false;
13380 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13381
13382 switch (mode)
13383 {
13384 case E_V2DImode:
13385 /* For SSE4.1, we normally use vector set. But if the second
13386 element is zero and inter-unit moves are OK, we use movq
13387 instead. */
13388 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13389 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13390 && one_var == 0));
13391 break;
13392 case E_V16QImode:
13393 case E_V4SImode:
13394 case E_V4SFmode:
13395 use_vector_set = TARGET_SSE4_1;
13396 break;
13397 case E_V8HImode:
13398 use_vector_set = TARGET_SSE2;
13399 break;
13400 case E_V4HImode:
13401 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13402 break;
13403 case E_V32QImode:
13404 case E_V16HImode:
13405 use_vector_set = TARGET_AVX;
13406 break;
13407 case E_V8SImode:
13408 use_vector_set = TARGET_AVX;
13409 gen_vec_set_0 = gen_vec_setv8si_0;
13410 break;
13411 case E_V8SFmode:
13412 use_vector_set = TARGET_AVX;
13413 gen_vec_set_0 = gen_vec_setv8sf_0;
13414 break;
13415 case E_V4DFmode:
13416 use_vector_set = TARGET_AVX;
13417 gen_vec_set_0 = gen_vec_setv4df_0;
13418 break;
13419 case E_V4DImode:
13420 /* Use ix86_expand_vector_set in 64bit mode only. */
13421 use_vector_set = TARGET_AVX && TARGET_64BIT;
13422 gen_vec_set_0 = gen_vec_setv4di_0;
13423 break;
13424 case E_V16SImode:
13425 use_vector_set = TARGET_AVX512F && one_var == 0;
13426 gen_vec_set_0 = gen_vec_setv16si_0;
13427 break;
13428 case E_V16SFmode:
13429 use_vector_set = TARGET_AVX512F && one_var == 0;
13430 gen_vec_set_0 = gen_vec_setv16sf_0;
13431 break;
13432 case E_V8DFmode:
13433 use_vector_set = TARGET_AVX512F && one_var == 0;
13434 gen_vec_set_0 = gen_vec_setv8df_0;
13435 break;
13436 case E_V8DImode:
13437 /* Use ix86_expand_vector_set in 64bit mode only. */
13438 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13439 gen_vec_set_0 = gen_vec_setv8di_0;
13440 break;
13441 default:
13442 break;
13443 }
13444
13445 if (use_vector_set)
13446 {
13447 if (gen_vec_set_0 && one_var == 0)
13448 {
13449 var = force_reg (GET_MODE_INNER (mode), var);
13450 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13451 return true;
13452 }
13453 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13454 var = force_reg (GET_MODE_INNER (mode), var);
13455 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13456 return true;
13457 }
13458
13459 switch (mode)
13460 {
13461 case E_V2SFmode:
13462 case E_V2SImode:
13463 if (!mmx_ok)
13464 return false;
13465 /* FALLTHRU */
13466
13467 case E_V2DFmode:
13468 case E_V2DImode:
13469 if (one_var != 0)
13470 return false;
13471 var = force_reg (GET_MODE_INNER (mode), var);
13472 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13473 emit_insn (gen_rtx_SET (target, x));
13474 return true;
13475
13476 case E_V4SFmode:
13477 case E_V4SImode:
13478 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13479 new_target = gen_reg_rtx (mode);
13480 else
13481 new_target = target;
13482 var = force_reg (GET_MODE_INNER (mode), var);
13483 x = gen_rtx_VEC_DUPLICATE (mode, var);
13484 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13485 emit_insn (gen_rtx_SET (new_target, x));
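/* NEW_TARGET now has VAR in element 0 and zeros in all other elements. */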
13486 if (one_var != 0)
13487 {
13488 /* We need to shuffle the value to the correct position, so
13489 create a new pseudo to store the intermediate result. */
13490
13491 /* With SSE2, we can use the integer shuffle insns. */
13492 if (mode != V4SFmode && TARGET_SSE2)
13493 {
13494 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13495 const1_rtx,
13496 GEN_INT (one_var == 1 ? 0 : 1),
13497 GEN_INT (one_var == 2 ? 0 : 1),
13498 GEN_INT (one_var == 3 ? 0 : 1)));
13499 if (target != new_target)
13500 emit_move_insn (target, new_target);
13501 return true;
13502 }
13503
13504 /* Otherwise convert the intermediate result to V4SFmode and
13505 use the SSE1 shuffle instructions. */
13506 if (mode != V4SFmode)
13507 {
13508 tmp = gen_reg_rtx (V4SFmode);
13509 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13510 }
13511 else
13512 tmp = new_target;
13513
13514 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13515 const1_rtx,
13516 GEN_INT (one_var == 1 ? 0 : 1),
13517 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13518 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13519
13520 if (mode != V4SFmode)
13521 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13522 else if (tmp != target)
13523 emit_move_insn (target, tmp);
13524 }
13525 else if (target != new_target)
13526 emit_move_insn (target, new_target);
13527 return true;
13528
13529 case E_V8HImode:
13530 case E_V16QImode:
13531 vsimode = V4SImode;
13532 goto widen;
13533 case E_V4HImode:
13534 case E_V8QImode:
13535 if (!mmx_ok)
13536 return false;
13537 vsimode = V2SImode;
13538 goto widen;
13539 widen:
13540 if (one_var != 0)
13541 return false;
13542
13543 /* Zero extend the variable element to SImode and recurse. */
13544 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13545
13546 x = gen_reg_rtx (vsimode);
13547 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13548 var, one_var))
13549 gcc_unreachable ();
13550
13551 emit_move_insn (target, gen_lowpart (mode, x));
13552 return true;
13553
13554 default:
13555 return false;
13556 }
13557 }
13558
13559 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13560 consisting of the values in VALS. It is known that all elements
13561 except ONE_VAR are constants. Return true if successful. */
13562
13563 static bool
13564 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13565 rtx target, rtx vals, int one_var)
13566 {
13567 rtx var = XVECEXP (vals, 0, one_var);
13568 machine_mode wmode;
13569 rtx const_vec, x;
13570
13571 const_vec = copy_rtx (vals);
13572 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13573 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13574
13575 switch (mode)
13576 {
13577 case E_V2DFmode:
13578 case E_V2DImode:
13579 case E_V2SFmode:
13580 case E_V2SImode:
13581 /* For the two element vectors, it's just as easy to use
13582 the general case. */
13583 return false;
13584
13585 case E_V4DImode:
13586 /* Use ix86_expand_vector_set in 64bit mode only. */
13587 if (!TARGET_64BIT)
13588 return false;
13589 /* FALLTHRU */
13590 case E_V4DFmode:
13591 case E_V8SFmode:
13592 case E_V8SImode:
13593 case E_V16HImode:
13594 case E_V32QImode:
13595 case E_V4SFmode:
13596 case E_V4SImode:
13597 case E_V8HImode:
13598 case E_V4HImode:
13599 break;
13600
13601 case E_V16QImode:
13602 if (TARGET_SSE4_1)
13603 break;
13604 wmode = V8HImode;
13605 goto widen;
13606 case E_V8QImode:
13607 wmode = V4HImode;
13608 goto widen;
13609 widen:
13610 /* There's no way to set one QImode entry easily. Combine
13611 the variable value with its adjacent constant value, and
13612 promote to an HImode set. */
13613 x = XVECEXP (vals, 0, one_var ^ 1);
13614 if (one_var & 1)
13615 {
13616 var = convert_modes (HImode, QImode, var, true);
13617 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13618 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13619 x = GEN_INT (INTVAL (x) & 0xff);
13620 }
13621 else
13622 {
13623 var = convert_modes (HImode, QImode, var, true);
13624 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13625 }
13626 if (x != const0_rtx)
13627 var = expand_simple_binop (HImode, IOR, var, x, var,
13628 1, OPTAB_LIB_WIDEN);
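/* VAR is now an HImode value holding the variable byte next to its
   constant neighbor. */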
13629
13630 x = gen_reg_rtx (wmode);
13631 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13632 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13633
13634 emit_move_insn (target, gen_lowpart (mode, x));
13635 return true;
13636
13637 default:
13638 return false;
13639 }
13640
13641 emit_move_insn (target, const_vec);
13642 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13643 return true;
13644 }
13645
13646 /* A subroutine of ix86_expand_vector_init_general. Use vector
13647 concatenate to handle the most general case: all values variable,
13648 and none identical. */
13649
13650 static void
13651 ix86_expand_vector_init_concat (machine_mode mode,
13652 rtx target, rtx *ops, int n)
13653 {
13654 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
13655 rtx first[16], second[8], third[4];
13656 rtvec v;
13657 int i, j;
13658
13659 switch (n)
13660 {
13661 case 2:
13662 switch (mode)
13663 {
13664 case E_V16SImode:
13665 cmode = V8SImode;
13666 break;
13667 case E_V16SFmode:
13668 cmode = V8SFmode;
13669 break;
13670 case E_V8DImode:
13671 cmode = V4DImode;
13672 break;
13673 case E_V8DFmode:
13674 cmode = V4DFmode;
13675 break;
13676 case E_V8SImode:
13677 cmode = V4SImode;
13678 break;
13679 case E_V8SFmode:
13680 cmode = V4SFmode;
13681 break;
13682 case E_V4DImode:
13683 cmode = V2DImode;
13684 break;
13685 case E_V4DFmode:
13686 cmode = V2DFmode;
13687 break;
13688 case E_V4SImode:
13689 cmode = V2SImode;
13690 break;
13691 case E_V4SFmode:
13692 cmode = V2SFmode;
13693 break;
13694 case E_V2DImode:
13695 cmode = DImode;
13696 break;
13697 case E_V2SImode:
13698 cmode = SImode;
13699 break;
13700 case E_V2DFmode:
13701 cmode = DFmode;
13702 break;
13703 case E_V2SFmode:
13704 cmode = SFmode;
13705 break;
13706 default:
13707 gcc_unreachable ();
13708 }
13709
13710 if (!register_operand (ops[1], cmode))
13711 ops[1] = force_reg (cmode, ops[1]);
13712 if (!register_operand (ops[0], cmode))
13713 ops[0] = force_reg (cmode, ops[0]);
13714 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13715 ops[1])));
13716 break;
13717
13718 case 4:
13719 switch (mode)
13720 {
13721 case E_V4DImode:
13722 cmode = V2DImode;
13723 break;
13724 case E_V4DFmode:
13725 cmode = V2DFmode;
13726 break;
13727 case E_V4SImode:
13728 cmode = V2SImode;
13729 break;
13730 case E_V4SFmode:
13731 cmode = V2SFmode;
13732 break;
13733 default:
13734 gcc_unreachable ();
13735 }
13736 goto half;
13737
13738 case 8:
13739 switch (mode)
13740 {
13741 case E_V8DImode:
13742 cmode = V2DImode;
13743 hmode = V4DImode;
13744 break;
13745 case E_V8DFmode:
13746 cmode = V2DFmode;
13747 hmode = V4DFmode;
13748 break;
13749 case E_V8SImode:
13750 cmode = V2SImode;
13751 hmode = V4SImode;
13752 break;
13753 case E_V8SFmode:
13754 cmode = V2SFmode;
13755 hmode = V4SFmode;
13756 break;
13757 default:
13758 gcc_unreachable ();
13759 }
13760 goto half;
13761
13762 case 16:
13763 switch (mode)
13764 {
13765 case E_V16SImode:
13766 cmode = V2SImode;
13767 hmode = V4SImode;
13768 gmode = V8SImode;
13769 break;
13770 case E_V16SFmode:
13771 cmode = V2SFmode;
13772 hmode = V4SFmode;
13773 gmode = V8SFmode;
13774 break;
13775 default:
13776 gcc_unreachable ();
13777 }
13778 goto half;
13779
13780 half:
13781 /* FIXME: We process inputs backward to help RA. PR 36222. */
13782 i = n - 1;
13783 j = (n >> 1) - 1;
13784 for (; i > 0; i -= 2, j--)
13785 {
13786 first[j] = gen_reg_rtx (cmode);
13787 v = gen_rtvec (2, ops[i - 1], ops[i]);
13788 ix86_expand_vector_init (false, first[j],
13789 gen_rtx_PARALLEL (cmode, v));
13790 }
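/* FIRST now holds n/2 CMODE vectors, each initialized from an adjacent
   pair of the input operands. */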
13791
13792 n >>= 1;
13793 if (n > 4)
13794 {
13795 gcc_assert (hmode != VOIDmode);
13796 gcc_assert (gmode != VOIDmode);
13797 for (i = j = 0; i < n; i += 2, j++)
13798 {
13799 second[j] = gen_reg_rtx (hmode);
13800 ix86_expand_vector_init_concat (hmode, second [j],
13801 &first [i], 2);
13802 }
13803 n >>= 1;
13804 for (i = j = 0; i < n; i += 2, j++)
13805 {
13806 third[j] = gen_reg_rtx (gmode);
13807 ix86_expand_vector_init_concat (gmode, third[j],
13808 &second[i], 2);
13809 }
13810 n >>= 1;
13811 ix86_expand_vector_init_concat (mode, target, third, n);
13812 }
13813 else if (n > 2)
13814 {
13815 gcc_assert (hmode != VOIDmode);
13816 for (i = j = 0; i < n; i += 2, j++)
13817 {
13818 second[j] = gen_reg_rtx (hmode);
13819 ix86_expand_vector_init_concat (hmode, second [j],
13820 &first [i], 2);
13821 }
13822 n >>= 1;
13823 ix86_expand_vector_init_concat (mode, target, second, n);
13824 }
13825 else
13826 ix86_expand_vector_init_concat (mode, target, first, n);
13827 break;
13828
13829 default:
13830 gcc_unreachable ();
13831 }
13832 }
13833
13834 /* A subroutine of ix86_expand_vector_init_general. Use vector
13835 interleave to handle the most general case: all values variable,
13836 and none identical. */
13837
13838 static void
13839 ix86_expand_vector_init_interleave (machine_mode mode,
13840 rtx target, rtx *ops, int n)
13841 {
13842 machine_mode first_imode, second_imode, third_imode, inner_mode;
13843 int i, j;
13844 rtx op0, op1;
13845 rtx (*gen_load_even) (rtx, rtx, rtx);
13846 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13847 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13848
13849 switch (mode)
13850 {
13851 case E_V8HImode:
13852 gen_load_even = gen_vec_setv8hi;
13853 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13854 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13855 inner_mode = HImode;
13856 first_imode = V4SImode;
13857 second_imode = V2DImode;
13858 third_imode = VOIDmode;
13859 break;
13860 case E_V16QImode:
13861 gen_load_even = gen_vec_setv16qi;
13862 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13863 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13864 inner_mode = QImode;
13865 first_imode = V8HImode;
13866 second_imode = V4SImode;
13867 third_imode = V2DImode;
13868 break;
13869 default:
13870 gcc_unreachable ();
13871 }
13872
13873 for (i = 0; i < n; i++)
13874 {
13875 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13876 op0 = gen_reg_rtx (SImode);
13877 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13878
13879 /* Insert the SImode value as low element of V4SImode vector. */
13880 op1 = gen_reg_rtx (V4SImode);
13881 op0 = gen_rtx_VEC_MERGE (V4SImode,
13882 gen_rtx_VEC_DUPLICATE (V4SImode,
13883 op0),
13884 CONST0_RTX (V4SImode),
13885 const1_rtx);
13886 emit_insn (gen_rtx_SET (op1, op0));
13887
13888 /* Cast the V4SImode vector back to a vector in the original mode. */
13889 op0 = gen_reg_rtx (mode);
13890 emit_move_insn (op0, gen_lowpart (mode, op1));
13891
13892 /* Load even elements into the second position. */
13893 emit_insn (gen_load_even (op0,
13894 force_reg (inner_mode,
13895 ops [i + i + 1]),
13896 const1_rtx));
13897
13898 /* Cast vector to FIRST_IMODE vector. */
13899 ops[i] = gen_reg_rtx (first_imode);
13900 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13901 }
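/* Each ops[i] now holds input elements 2*i and 2*i+1 in its two lowest
   INNER_MODE positions, recast as a FIRST_IMODE vector. */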
13902
13903 /* Interleave low FIRST_IMODE vectors. */
13904 for (i = j = 0; i < n; i += 2, j++)
13905 {
13906 op0 = gen_reg_rtx (first_imode);
13907 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13908
13909 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13910 ops[j] = gen_reg_rtx (second_imode);
13911 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13912 }
13913
13914 /* Interleave low SECOND_IMODE vectors. */
13915 switch (second_imode)
13916 {
13917 case E_V4SImode:
13918 for (i = j = 0; i < n / 2; i += 2, j++)
13919 {
13920 op0 = gen_reg_rtx (second_imode);
13921 emit_insn (gen_interleave_second_low (op0, ops[i],
13922 ops[i + 1]));
13923
13924 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13925 vector. */
13926 ops[j] = gen_reg_rtx (third_imode);
13927 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13928 }
13929 second_imode = V2DImode;
13930 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13931 /* FALLTHRU */
13932
13933 case E_V2DImode:
13934 op0 = gen_reg_rtx (second_imode);
13935 emit_insn (gen_interleave_second_low (op0, ops[0],
13936 ops[1]));
13937
13938 /* Cast the SECOND_IMODE vector back to a vector in the original
13939 mode. */
13940 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13941 break;
13942
13943 default:
13944 gcc_unreachable ();
13945 }
13946 }
13947
13948 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13949 all values variable, and none identical. */
13950
13951 static void
13952 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13953 rtx target, rtx vals)
13954 {
13955 rtx ops[64], op0, op1, op2, op3, op4, op5;
13956 machine_mode half_mode = VOIDmode;
13957 machine_mode quarter_mode = VOIDmode;
13958 int n, i;
13959
13960 switch (mode)
13961 {
13962 case E_V2SFmode:
13963 case E_V2SImode:
13964 if (!mmx_ok && !TARGET_SSE)
13965 break;
13966 /* FALLTHRU */
13967
13968 case E_V16SImode:
13969 case E_V16SFmode:
13970 case E_V8DFmode:
13971 case E_V8DImode:
13972 case E_V8SFmode:
13973 case E_V8SImode:
13974 case E_V4DFmode:
13975 case E_V4DImode:
13976 case E_V4SFmode:
13977 case E_V4SImode:
13978 case E_V2DFmode:
13979 case E_V2DImode:
13980 n = GET_MODE_NUNITS (mode);
13981 for (i = 0; i < n; i++)
13982 ops[i] = XVECEXP (vals, 0, i);
13983 ix86_expand_vector_init_concat (mode, target, ops, n);
13984 return;
13985
13986 case E_V2TImode:
13987 for (i = 0; i < 2; i++)
13988 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13989 op0 = gen_reg_rtx (V4DImode);
13990 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13991 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13992 return;
13993
13994 case E_V4TImode:
13995 for (i = 0; i < 4; i++)
13996 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13997 ops[4] = gen_reg_rtx (V4DImode);
13998 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13999 ops[5] = gen_reg_rtx (V4DImode);
14000 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14001 op0 = gen_reg_rtx (V8DImode);
14002 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14003 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14004 return;
14005
14006 case E_V32QImode:
14007 half_mode = V16QImode;
14008 goto half;
14009
14010 case E_V16HImode:
14011 half_mode = V8HImode;
14012 goto half;
14013
14014 half:
14015 n = GET_MODE_NUNITS (mode);
14016 for (i = 0; i < n; i++)
14017 ops[i] = XVECEXP (vals, 0, i);
14018 op0 = gen_reg_rtx (half_mode);
14019 op1 = gen_reg_rtx (half_mode);
14020 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14021 n >> 2);
14022 ix86_expand_vector_init_interleave (half_mode, op1,
14023 &ops [n >> 1], n >> 2);
14024 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14025 return;
14026
14027 case E_V64QImode:
14028 quarter_mode = V16QImode;
14029 half_mode = V32QImode;
14030 goto quarter;
14031
14032 case E_V32HImode:
14033 quarter_mode = V8HImode;
14034 half_mode = V16HImode;
14035 goto quarter;
14036
14037 quarter:
14038 n = GET_MODE_NUNITS (mode);
14039 for (i = 0; i < n; i++)
14040 ops[i] = XVECEXP (vals, 0, i);
14041 op0 = gen_reg_rtx (quarter_mode);
14042 op1 = gen_reg_rtx (quarter_mode);
14043 op2 = gen_reg_rtx (quarter_mode);
14044 op3 = gen_reg_rtx (quarter_mode);
14045 op4 = gen_reg_rtx (half_mode);
14046 op5 = gen_reg_rtx (half_mode);
14047 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14048 n >> 3);
14049 ix86_expand_vector_init_interleave (quarter_mode, op1,
14050 &ops [n >> 2], n >> 3);
14051 ix86_expand_vector_init_interleave (quarter_mode, op2,
14052 &ops [n >> 1], n >> 3);
14053 ix86_expand_vector_init_interleave (quarter_mode, op3,
14054 &ops [(n >> 1) | (n >> 2)], n >> 3);
14055 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14056 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14057 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14058 return;
14059
14060 case E_V16QImode:
14061 if (!TARGET_SSE4_1)
14062 break;
14063 /* FALLTHRU */
14064
14065 case E_V8HImode:
14066 if (!TARGET_SSE2)
14067 break;
14068
14069 /* Don't use ix86_expand_vector_init_interleave if we can't
14070 move from GPR to SSE register directly. */
14071 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14072 break;
14073
14074 n = GET_MODE_NUNITS (mode);
14075 for (i = 0; i < n; i++)
14076 ops[i] = XVECEXP (vals, 0, i);
14077 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14078 return;
14079
14080 case E_V4HImode:
14081 case E_V8QImode:
14082 break;
14083
14084 default:
14085 gcc_unreachable ();
14086 }
14087
14088 {
14089 int i, j, n_elts, n_words, n_elt_per_word;
14090 machine_mode inner_mode;
14091 rtx words[4], shift;
14092
14093 inner_mode = GET_MODE_INNER (mode);
14094 n_elts = GET_MODE_NUNITS (mode);
14095 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14096 n_elt_per_word = n_elts / n_words;
14097 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14098
14099 for (i = 0; i < n_words; ++i)
14100 {
14101 rtx word = NULL_RTX;
14102
14103 for (j = 0; j < n_elt_per_word; ++j)
14104 {
14105 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14106 elt = convert_modes (word_mode, inner_mode, elt, true);
14107
14108 if (j == 0)
14109 word = elt;
14110 else
14111 {
14112 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14113 word, 1, OPTAB_LIB_WIDEN);
14114 word = expand_simple_binop (word_mode, IOR, word, elt,
14115 word, 1, OPTAB_LIB_WIDEN);
14116 }
14117 }
14118
14119 words[i] = word;
14120 }
14121
14122 if (n_words == 1)
14123 emit_move_insn (target, gen_lowpart (mode, words[0]));
14124 else if (n_words == 2)
14125 {
14126 rtx tmp = gen_reg_rtx (mode);
14127 emit_clobber (tmp);
14128 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14129 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14130 emit_move_insn (target, tmp);
14131 }
14132 else if (n_words == 4)
14133 {
14134 rtx tmp = gen_reg_rtx (V4SImode);
14135 gcc_assert (word_mode == SImode);
14136 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14137 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14138 emit_move_insn (target, gen_lowpart (mode, tmp));
14139 }
14140 else
14141 gcc_unreachable ();
14142 }
14143 }
14144
14145 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14146 instructions unless MMX_OK is true. */
14147
14148 void
14149 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14150 {
14151 machine_mode mode = GET_MODE (target);
14152 machine_mode inner_mode = GET_MODE_INNER (mode);
14153 int n_elts = GET_MODE_NUNITS (mode);
14154 int n_var = 0, one_var = -1;
14155 bool all_same = true, all_const_zero = true;
14156 int i;
14157 rtx x;
14158
14159 /* Handle first initialization from vector elts. */
14160 if (n_elts != XVECLEN (vals, 0))
14161 {
14162 rtx subtarget = target;
14163 x = XVECEXP (vals, 0, 0);
14164 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14165 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14166 {
14167 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14168 if (inner_mode == QImode || inner_mode == HImode)
14169 {
14170 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14171 mode = mode_for_vector (SImode, n_bits / 4).require ();
14172 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14173 ops[0] = gen_lowpart (inner_mode, ops[0]);
14174 ops[1] = gen_lowpart (inner_mode, ops[1]);
14175 subtarget = gen_reg_rtx (mode);
14176 }
14177 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14178 if (subtarget != target)
14179 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14180 return;
14181 }
14182 gcc_unreachable ();
14183 }
14184
14185 for (i = 0; i < n_elts; ++i)
14186 {
14187 x = XVECEXP (vals, 0, i);
14188 if (!(CONST_SCALAR_INT_P (x)
14189 || CONST_DOUBLE_P (x)
14190 || CONST_FIXED_P (x)))
14191 n_var++, one_var = i;
14192 else if (x != CONST0_RTX (inner_mode))
14193 all_const_zero = false;
14194 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14195 all_same = false;
14196 }
14197
14198 /* Constants are best loaded from the constant pool. */
14199 if (n_var == 0)
14200 {
14201 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14202 return;
14203 }
14204
14205 /* If all values are identical, broadcast the value. */
14206 if (all_same
14207 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14208 XVECEXP (vals, 0, 0)))
14209 return;
14210
14211 /* Values where only one field is non-constant are best loaded from
14212 the pool and overwritten via move later. */
14213 if (n_var == 1)
14214 {
14215 if (all_const_zero
14216 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14217 XVECEXP (vals, 0, one_var),
14218 one_var))
14219 return;
14220
14221 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14222 return;
14223 }
14224
14225 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14226 }
14227
14228 void
14229 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14230 {
14231 machine_mode mode = GET_MODE (target);
14232 machine_mode inner_mode = GET_MODE_INNER (mode);
14233 machine_mode half_mode;
14234 bool use_vec_merge = false;
14235 rtx tmp;
14236 static rtx (*gen_extract[6][2]) (rtx, rtx)
14237 = {
14238 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14239 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14240 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14241 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14242 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14243 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14244 };
14245 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14246 = {
14247 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14248 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14249 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14250 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14251 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14252 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14253 };
14254 int i, j, n;
14255 machine_mode mmode = VOIDmode;
14256 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14257
14258 switch (mode)
14259 {
14260 case E_V2SFmode:
14261 case E_V2SImode:
14262 if (mmx_ok)
14263 {
14264 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14265 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14266 if (elt == 0)
14267 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14268 else
14269 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14270 emit_insn (gen_rtx_SET (target, tmp));
14271 return;
14272 }
14273 break;
14274
14275 case E_V2DImode:
14276 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14277 if (use_vec_merge)
14278 break;
14279
14280 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14281 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14282 if (elt == 0)
14283 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14284 else
14285 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14286 emit_insn (gen_rtx_SET (target, tmp));
14287 return;
14288
14289 case E_V2DFmode:
14290 /* NB: For ELT == 0, use standard scalar operation patterns which
14291 preserve the rest of the vector for combiner:
14292
14293 (vec_merge:V2DF
14294 (vec_duplicate:V2DF (reg:DF))
14295 (reg:V2DF)
14296 (const_int 1))
14297 */
14298 if (elt == 0)
14299 goto do_vec_merge;
14300
14301 {
14302 rtx op0, op1;
14303
14304 /* For the two element vectors, we implement a VEC_CONCAT with
14305 the extraction of the other element. */
14306
14307 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14308 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14309
14310 if (elt == 0)
14311 op0 = val, op1 = tmp;
14312 else
14313 op0 = tmp, op1 = val;
14314
14315 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14316 emit_insn (gen_rtx_SET (target, tmp));
14317 }
14318 return;
14319
14320 case E_V4SFmode:
14321 use_vec_merge = TARGET_SSE4_1;
14322 if (use_vec_merge)
14323 break;
14324
14325 switch (elt)
14326 {
14327 case 0:
14328 use_vec_merge = true;
14329 break;
14330
14331 case 1:
14332 /* tmp = target = A B C D */
14333 tmp = copy_to_reg (target);
14334 /* target = A A B B */
14335 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14336 /* target = X A B B */
14337 ix86_expand_vector_set (false, target, val, 0);
14338 /* target = A X C D */
14339 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14340 const1_rtx, const0_rtx,
14341 GEN_INT (2+4), GEN_INT (3+4)));
14342 return;
14343
14344 case 2:
14345 /* tmp = target = A B C D */
14346 tmp = copy_to_reg (target);
14347 /* tmp = X B C D */
14348 ix86_expand_vector_set (false, tmp, val, 0);
14349 /* target = A B X D */
14350 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14351 const0_rtx, const1_rtx,
14352 GEN_INT (0+4), GEN_INT (3+4)));
14353 return;
14354
14355 case 3:
14356 /* tmp = target = A B C D */
14357 tmp = copy_to_reg (target);
14358 /* tmp = X B C D */
14359 ix86_expand_vector_set (false, tmp, val, 0);
14360 /* target = A B C X */
14361 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14362 const0_rtx, const1_rtx,
14363 GEN_INT (2+4), GEN_INT (0+4)));
14364 return;
14365
14366 default:
14367 gcc_unreachable ();
14368 }
14369 break;
14370
14371 case E_V4SImode:
14372 use_vec_merge = TARGET_SSE4_1;
14373 if (use_vec_merge)
14374 break;
14375
14376 /* Element 0 handled by vec_merge below. */
14377 if (elt == 0)
14378 {
14379 use_vec_merge = true;
14380 break;
14381 }
14382
14383 if (TARGET_SSE2)
14384 {
14385 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14386 store into element 0, then shuffle them back. */
14387
14388 rtx order[4];
14389
14390 order[0] = GEN_INT (elt);
14391 order[1] = const1_rtx;
14392 order[2] = const2_rtx;
14393 order[3] = GEN_INT (3);
14394 order[elt] = const0_rtx;
14395
14396 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14397 order[1], order[2], order[3]));
14398
14399 ix86_expand_vector_set (false, target, val, 0);
14400
14401 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14402 order[1], order[2], order[3]));
14403 }
14404 else
14405 {
14406 /* For SSE1, we have to reuse the V4SF code. */
14407 rtx t = gen_reg_rtx (V4SFmode);
14408 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14409 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14410 emit_move_insn (target, gen_lowpart (mode, t));
14411 }
14412 return;
14413
14414 case E_V8HImode:
14415 use_vec_merge = TARGET_SSE2;
14416 break;
14417 case E_V4HImode:
14418 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14419 break;
14420
14421 case E_V16QImode:
14422 use_vec_merge = TARGET_SSE4_1;
14423 break;
14424
14425 case E_V8QImode:
14426 break;
14427
14428 case E_V32QImode:
14429 half_mode = V16QImode;
14430 j = 0;
14431 n = 16;
14432 goto half;
14433
14434 case E_V16HImode:
14435 half_mode = V8HImode;
14436 j = 1;
14437 n = 8;
14438 goto half;
14439
14440 case E_V8SImode:
14441 half_mode = V4SImode;
14442 j = 2;
14443 n = 4;
14444 goto half;
14445
14446 case E_V4DImode:
14447 half_mode = V2DImode;
14448 j = 3;
14449 n = 2;
14450 goto half;
14451
14452 case E_V8SFmode:
14453 half_mode = V4SFmode;
14454 j = 4;
14455 n = 4;
14456 goto half;
14457
14458 case E_V4DFmode:
14459 half_mode = V2DFmode;
14460 j = 5;
14461 n = 2;
14462 goto half;
14463
14464 half:
14465 /* Compute offset. */
14466 i = elt / n;
14467 elt %= n;
14468
14469 gcc_assert (i <= 1);
14470
14471 /* Extract the half. */
14472 tmp = gen_reg_rtx (half_mode);
14473 emit_insn (gen_extract[j][i] (tmp, target));
14474
14475 /* Put val in tmp at elt. */
14476 ix86_expand_vector_set (false, tmp, val, elt);
14477
14478 /* Put it back. */
14479 emit_insn (gen_insert[j][i] (target, target, tmp));
14480 return;
14481
14482 case E_V8DFmode:
14483 if (TARGET_AVX512F)
14484 {
14485 mmode = QImode;
14486 gen_blendm = gen_avx512f_blendmv8df;
14487 }
14488 break;
14489
14490 case E_V8DImode:
14491 if (TARGET_AVX512F)
14492 {
14493 mmode = QImode;
14494 gen_blendm = gen_avx512f_blendmv8di;
14495 }
14496 break;
14497
14498 case E_V16SFmode:
14499 if (TARGET_AVX512F)
14500 {
14501 mmode = HImode;
14502 gen_blendm = gen_avx512f_blendmv16sf;
14503 }
14504 break;
14505
14506 case E_V16SImode:
14507 if (TARGET_AVX512F)
14508 {
14509 mmode = HImode;
14510 gen_blendm = gen_avx512f_blendmv16si;
14511 }
14512 break;
14513
14514 case E_V32HImode:
14515 if (TARGET_AVX512BW)
14516 {
14517 mmode = SImode;
14518 gen_blendm = gen_avx512bw_blendmv32hi;
14519 }
14520 else if (TARGET_AVX512F)
14521 {
14522 half_mode = E_V8HImode;
14523 n = 8;
14524 goto quarter;
14525 }
14526 break;
14527
14528 case E_V64QImode:
14529 if (TARGET_AVX512BW)
14530 {
14531 mmode = DImode;
14532 gen_blendm = gen_avx512bw_blendmv64qi;
14533 }
14534 else if (TARGET_AVX512F)
14535 {
14536 half_mode = E_V16QImode;
14537 n = 16;
14538 goto quarter;
14539 }
14540 break;
14541
14542 quarter:
14543 /* Compute offset. */
14544 i = elt / n;
14545 elt %= n;
14546
14547 gcc_assert (i <= 3);
14548
14549 {
14550 /* Extract the quarter. */
14551 tmp = gen_reg_rtx (V4SImode);
14552 rtx tmp2 = gen_lowpart (V16SImode, target);
14553 rtx mask = gen_reg_rtx (QImode);
14554
14555 emit_move_insn (mask, constm1_rtx);
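/* An all-ones mask makes the masked extract behave like an unmasked
   one. */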
14556 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14557 tmp, mask));
14558
14559 tmp2 = gen_reg_rtx (half_mode);
14560 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14561 tmp = tmp2;
14562
14563 /* Put val in tmp at elt. */
14564 ix86_expand_vector_set (false, tmp, val, elt);
14565
14566 /* Put it back. */
14567 tmp2 = gen_reg_rtx (V16SImode);
14568 rtx tmp3 = gen_lowpart (V16SImode, target);
14569 mask = gen_reg_rtx (HImode);
14570 emit_move_insn (mask, constm1_rtx);
14571 tmp = gen_lowpart (V4SImode, tmp);
14572 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14573 tmp3, mask));
14574 emit_move_insn (target, gen_lowpart (mode, tmp2));
14575 }
14576 return;
14577
14578 default:
14579 break;
14580 }
14581
14582 if (mmode != VOIDmode)
14583 {
14584 tmp = gen_reg_rtx (mode);
14585 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14586 /* The avx512*_blendm<mode> expanders have different operand order
14587 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14588 elements where the mask is set and second input operand otherwise,
14589 in {sse,avx}*_*blend* the first input operand is used for elements
14590 where the mask is clear and second input operand otherwise. */
14591 emit_insn (gen_blendm (target, target, tmp,
14592 force_reg (mmode,
14593 gen_int_mode (HOST_WIDE_INT_1U << elt,
14594 mmode))));
14595 }
14596 else if (use_vec_merge)
14597 {
14598 do_vec_merge:
14599 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14600 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14601 GEN_INT (HOST_WIDE_INT_1U << elt));
14602 emit_insn (gen_rtx_SET (target, tmp));
14603 }
14604 else
14605 {
14606 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14607
14608 emit_move_insn (mem, target);
14609
14610 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14611 emit_move_insn (tmp, val);
14612
14613 emit_move_insn (target, mem);
14614 }
14615 }
14616
14617 void
14618 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14619 {
14620 machine_mode mode = GET_MODE (vec);
14621 machine_mode inner_mode = GET_MODE_INNER (mode);
14622 bool use_vec_extr = false;
14623 rtx tmp;
14624
14625 switch (mode)
14626 {
14627 case E_V2SImode:
14628 case E_V2SFmode:
14629 if (!mmx_ok)
14630 break;
14631 /* FALLTHRU */
14632
14633 case E_V2DFmode:
14634 case E_V2DImode:
14635 case E_V2TImode:
14636 case E_V4TImode:
14637 use_vec_extr = true;
14638 break;
14639
14640 case E_V4SFmode:
14641 use_vec_extr = TARGET_SSE4_1;
14642 if (use_vec_extr)
14643 break;
14644
14645 switch (elt)
14646 {
14647 case 0:
14648 tmp = vec;
14649 break;
14650
14651 case 1:
14652 case 3:
14653 tmp = gen_reg_rtx (mode);
14654 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14655 GEN_INT (elt), GEN_INT (elt),
14656 GEN_INT (elt+4), GEN_INT (elt+4)));
14657 break;
14658
14659 case 2:
14660 tmp = gen_reg_rtx (mode);
14661 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14662 break;
14663
14664 default:
14665 gcc_unreachable ();
14666 }
14667 vec = tmp;
14668 use_vec_extr = true;
14669 elt = 0;
14670 break;
14671
14672 case E_V4SImode:
14673 use_vec_extr = TARGET_SSE4_1;
14674 if (use_vec_extr)
14675 break;
14676
14677 if (TARGET_SSE2)
14678 {
14679 switch (elt)
14680 {
14681 case 0:
14682 tmp = vec;
14683 break;
14684
14685 case 1:
14686 case 3:
14687 tmp = gen_reg_rtx (mode);
14688 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14689 GEN_INT (elt), GEN_INT (elt),
14690 GEN_INT (elt), GEN_INT (elt)));
14691 break;
14692
14693 case 2:
14694 tmp = gen_reg_rtx (mode);
14695 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14696 break;
14697
14698 default:
14699 gcc_unreachable ();
14700 }
14701 vec = tmp;
14702 use_vec_extr = true;
14703 elt = 0;
14704 }
14705 else
14706 {
14707 /* For SSE1, we have to reuse the V4SF code. */
14708 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14709 gen_lowpart (V4SFmode, vec), elt);
14710 return;
14711 }
14712 break;
14713
14714 case E_V8HImode:
14715 use_vec_extr = TARGET_SSE2;
14716 break;
14717 case E_V4HImode:
14718 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14719 break;
14720
14721 case E_V16QImode:
14722 use_vec_extr = TARGET_SSE4_1;
14723 break;
14724
14725 case E_V8SFmode:
14726 if (TARGET_AVX)
14727 {
14728 tmp = gen_reg_rtx (V4SFmode);
14729 if (elt < 4)
14730 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14731 else
14732 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14733 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14734 return;
14735 }
14736 break;
14737
14738 case E_V4DFmode:
14739 if (TARGET_AVX)
14740 {
14741 tmp = gen_reg_rtx (V2DFmode);
14742 if (elt < 2)
14743 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14744 else
14745 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14746 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14747 return;
14748 }
14749 break;
14750
14751 case E_V32QImode:
14752 if (TARGET_AVX)
14753 {
14754 tmp = gen_reg_rtx (V16QImode);
14755 if (elt < 16)
14756 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14757 else
14758 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14759 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14760 return;
14761 }
14762 break;
14763
14764 case E_V16HImode:
14765 if (TARGET_AVX)
14766 {
14767 tmp = gen_reg_rtx (V8HImode);
14768 if (elt < 8)
14769 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14770 else
14771 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14772 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14773 return;
14774 }
14775 break;
14776
14777 case E_V8SImode:
14778 if (TARGET_AVX)
14779 {
14780 tmp = gen_reg_rtx (V4SImode);
14781 if (elt < 4)
14782 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14783 else
14784 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14785 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14786 return;
14787 }
14788 break;
14789
14790 case E_V4DImode:
14791 if (TARGET_AVX)
14792 {
14793 tmp = gen_reg_rtx (V2DImode);
14794 if (elt < 2)
14795 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14796 else
14797 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14798 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14799 return;
14800 }
14801 break;
14802
14803 case E_V32HImode:
14804 if (TARGET_AVX512BW)
14805 {
14806 tmp = gen_reg_rtx (V16HImode);
14807 if (elt < 16)
14808 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14809 else
14810 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14811 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14812 return;
14813 }
14814 break;
14815
14816 case E_V64QImode:
14817 if (TARGET_AVX512BW)
14818 {
14819 tmp = gen_reg_rtx (V32QImode);
14820 if (elt < 32)
14821 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14822 else
14823 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14824 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14825 return;
14826 }
14827 break;
14828
14829 case E_V16SFmode:
14830 tmp = gen_reg_rtx (V8SFmode);
14831 if (elt < 8)
14832 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14833 else
14834 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14835 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14836 return;
14837
14838 case E_V8DFmode:
14839 tmp = gen_reg_rtx (V4DFmode);
14840 if (elt < 4)
14841 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14842 else
14843 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14844 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14845 return;
14846
14847 case E_V16SImode:
14848 tmp = gen_reg_rtx (V8SImode);
14849 if (elt < 8)
14850 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14851 else
14852 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14853 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14854 return;
14855
14856 case E_V8DImode:
14857 tmp = gen_reg_rtx (V4DImode);
14858 if (elt < 4)
14859 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14860 else
14861 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14862 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14863 return;
14864
14865 case E_V8QImode:
14866 /* ??? Could extract the appropriate HImode element and shift. */
14867 default:
14868 break;
14869 }
14870
14871 if (use_vec_extr)
14872 {
14873 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14874 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14875
14876 /* Let the rtl optimizers know about the zero extension performed. */
14877 if (inner_mode == QImode || inner_mode == HImode)
14878 {
14879 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14880 target = gen_lowpart (SImode, target);
14881 }
14882
14883 emit_insn (gen_rtx_SET (target, tmp));
14884 }
14885 else
14886 {
14887 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14888
14889 emit_move_insn (mem, vec);
14890
14891 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14892 emit_move_insn (target, tmp);
14893 }
14894 }
14895
14896 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14897 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14898 The upper bits of DEST are undefined, though they shouldn't cause
14899 exceptions (some bits from src or all zeros are ok). */
14900
14901 static void
14902 emit_reduc_half (rtx dest, rtx src, int i)
14903 {
14904 rtx tem, d = dest;
14905 switch (GET_MODE (src))
14906 {
14907 case E_V4SFmode:
14908 if (i == 128)
14909 tem = gen_sse_movhlps (dest, src, src);
14910 else
14911 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14912 GEN_INT (1 + 4), GEN_INT (1 + 4));
14913 break;
14914 case E_V2DFmode:
14915 tem = gen_vec_interleave_highv2df (dest, src, src);
14916 break;
14917 case E_V16QImode:
14918 case E_V8HImode:
14919 case E_V4SImode:
14920 case E_V2DImode:
14921 d = gen_reg_rtx (V1TImode);
14922 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14923 GEN_INT (i / 2));
14924 break;
14925 case E_V8SFmode:
14926 if (i == 256)
14927 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14928 else
14929 tem = gen_avx_shufps256 (dest, src, src,
14930 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14931 break;
14932 case E_V4DFmode:
14933 if (i == 256)
14934 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14935 else
14936 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14937 break;
14938 case E_V32QImode:
14939 case E_V16HImode:
14940 case E_V8SImode:
14941 case E_V4DImode:
14942 if (i == 256)
14943 {
14944 if (GET_MODE (dest) != V4DImode)
14945 d = gen_reg_rtx (V4DImode);
14946 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14947 gen_lowpart (V4DImode, src),
14948 const1_rtx);
14949 }
14950 else
14951 {
14952 d = gen_reg_rtx (V2TImode);
14953 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14954 GEN_INT (i / 2));
14955 }
14956 break;
14957 case E_V64QImode:
14958 case E_V32HImode:
14959 case E_V16SImode:
14960 case E_V16SFmode:
14961 case E_V8DImode:
14962 case E_V8DFmode:
14963 if (i > 128)
14964 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14965 gen_lowpart (V16SImode, src),
14966 gen_lowpart (V16SImode, src),
14967 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14968 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14969 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14970 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14971 GEN_INT (0xC), GEN_INT (0xD),
14972 GEN_INT (0xE), GEN_INT (0xF),
14973 GEN_INT (0x10), GEN_INT (0x11),
14974 GEN_INT (0x12), GEN_INT (0x13),
14975 GEN_INT (0x14), GEN_INT (0x15),
14976 GEN_INT (0x16), GEN_INT (0x17));
14977 else
14978 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14979 gen_lowpart (V16SImode, src),
14980 GEN_INT (i == 128 ? 0x2 : 0x1),
14981 GEN_INT (0x3),
14982 GEN_INT (0x3),
14983 GEN_INT (0x3),
14984 GEN_INT (i == 128 ? 0x6 : 0x5),
14985 GEN_INT (0x7),
14986 GEN_INT (0x7),
14987 GEN_INT (0x7),
14988 GEN_INT (i == 128 ? 0xA : 0x9),
14989 GEN_INT (0xB),
14990 GEN_INT (0xB),
14991 GEN_INT (0xB),
14992 GEN_INT (i == 128 ? 0xE : 0xD),
14993 GEN_INT (0xF),
14994 GEN_INT (0xF),
14995 GEN_INT (0xF));
14996 break;
14997 default:
14998 gcc_unreachable ();
14999 }
15000 emit_insn (tem);
15001 if (d != dest)
15002 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15003 }
15004
15005 /* Expand a vector reduction. FN is the binary pattern to reduce;
15006 DEST is the destination; IN is the input vector. */
15007
15008 void
15009 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15010 {
15011 rtx half, dst, vec = in;
15012 machine_mode mode = GET_MODE (in);
15013 int i;
15014
15015 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15016 if (TARGET_SSE4_1
15017 && mode == V8HImode
15018 && fn == gen_uminv8hi3)
15019 {
15020 emit_insn (gen_sse4_1_phminposuw (dest, in));
15021 return;
15022 }
15023
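/* Repeatedly fold the high half of the live part of the vector onto the
   low half and combine the halves with FN until element 0 holds the
   reduction result. */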
15024 for (i = GET_MODE_BITSIZE (mode);
15025 i > GET_MODE_UNIT_BITSIZE (mode);
15026 i >>= 1)
15027 {
15028 half = gen_reg_rtx (mode);
15029 emit_reduc_half (half, vec, i);
15030 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15031 dst = dest;
15032 else
15033 dst = gen_reg_rtx (mode);
15034 emit_insn (fn (dst, half, vec));
15035 vec = dst;
15036 }
15037 }
15038
15039 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
15040 the FP status register is set. */
15041
15042 void
15043 ix86_emit_fp_unordered_jump (rtx label)
15044 {
15045 rtx reg = gen_reg_rtx (HImode);
15046 rtx_insn *insn;
15047 rtx temp;
15048
15049 emit_insn (gen_x86_fnstsw_1 (reg));
15050
15051 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15052 {
15053 emit_insn (gen_x86_sahf_1 (reg));
15054
15055 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15056 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15057 }
15058 else
15059 {
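/* Test bit 2 of the high byte of the FP status word, i.e. the C2 flag. */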
15060 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15061
15062 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15063 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15064 }
15065
15066 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15067 gen_rtx_LABEL_REF (VOIDmode, label),
15068 pc_rtx);
15069 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15070 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15071 JUMP_LABEL (insn) = label;
15072 }
15073
15074 /* Output code to perform a sinh XFmode calculation. */
15075
15076 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15077 {
15078 rtx e1 = gen_reg_rtx (XFmode);
15079 rtx e2 = gen_reg_rtx (XFmode);
15080 rtx scratch = gen_reg_rtx (HImode);
15081 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15082 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15083 rtx cst1, tmp;
15084 rtx_code_label *jump_label = gen_label_rtx ();
15085 rtx_insn *insn;
15086
15087 /* scratch = fxam (op1) */
15088 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15089
15090 /* e1 = expm1 (|op1|) */
15091 emit_insn (gen_absxf2 (e2, op1));
15092 emit_insn (gen_expm1xf2 (e1, e2));
15093
15094 /* e2 = e1 / (e1 + 1.0) + e1 */
15095 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15096 emit_insn (gen_addxf3 (e2, e1, cst1));
15097 emit_insn (gen_divxf3 (e2, e1, e2));
15098 emit_insn (gen_addxf3 (e2, e2, e1));
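/* Since e1 = exp (|op1|) - 1, we now have
   e2 = e1 + e1 / (e1 + 1.0) = exp (|op1|) - exp (-|op1|) = 2 * sinh (|op1|). */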
15099
15100 /* flags = signbit (op1) */
15101 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15102
15103 /* if (flags) then e2 = -e2 */
15104 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15105 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15106 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15107 pc_rtx);
15108 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15109 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15110 JUMP_LABEL (insn) = jump_label;
15111
15112 emit_insn (gen_negxf2 (e2, e2));
15113
15114 emit_label (jump_label);
15115 LABEL_NUSES (jump_label) = 1;
15116
15117 /* op0 = 0.5 * e2 */
15118 half = force_reg (XFmode, half);
15119 emit_insn (gen_mulxf3 (op0, e2, half));
15120 }
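
/* For reference, the identity behind the expansion above: with
   u = expm1 (|x|) = exp (|x|) - 1,

       sinh (|x|) = (exp (|x|) - exp (-|x|)) / 2
                  = (u / (u + 1) + u) / 2

   which avoids the cancellation in exp (|x|) - exp (-|x|) for small |x|.
   The FXAM C1 bit (0x02 in the high status-word byte) supplies the sign of
   x, which the conditional negation puts back before the final scaling.  */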
15121
15122 /* Output code to perform a cosh XFmode calculation. */
15123
15124 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15125 {
15126 rtx e1 = gen_reg_rtx (XFmode);
15127 rtx e2 = gen_reg_rtx (XFmode);
15128 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15129 rtx cst1;
15130
15131 /* e1 = exp (op1) */
15132 emit_insn (gen_expxf2 (e1, op1));
15133
15134 /* e2 = e1 + 1.0 / e1 */
15135 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15136 emit_insn (gen_divxf3 (e2, cst1, e1));
15137 emit_insn (gen_addxf3 (e2, e1, e2));
15138
15139 /* op0 = 0.5 * e2 */
15140 half = force_reg (XFmode, half);
15141 emit_insn (gen_mulxf3 (op0, e2, half));
15142 }
15143
15144 /* Output code to perform a tanh XFmode calculation. */
15145
15146 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15147 {
15148 rtx e1 = gen_reg_rtx (XFmode);
15149 rtx e2 = gen_reg_rtx (XFmode);
15150 rtx scratch = gen_reg_rtx (HImode);
15151 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15152 rtx cst2, tmp;
15153 rtx_code_label *jump_label = gen_label_rtx ();
15154 rtx_insn *insn;
15155
15156 /* scratch = fxam (op1) */
15157 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15158
15159 /* e1 = expm1 (-|2 * op1|) */
15160 emit_insn (gen_addxf3 (e2, op1, op1));
15161 emit_insn (gen_absxf2 (e2, e2));
15162 emit_insn (gen_negxf2 (e2, e2));
15163 emit_insn (gen_expm1xf2 (e1, e2));
15164
15165 /* e2 = e1 / (e1 + 2.0) */
15166 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15167 emit_insn (gen_addxf3 (e2, e1, cst2));
15168 emit_insn (gen_divxf3 (e2, e1, e2));
15169
15170 /* flags = signbit (op1) */
15171 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15172
15173 /* if (!flags) then e2 = -e2 */
15174 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15175 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15176 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15177 pc_rtx);
15178 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15179 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15180 JUMP_LABEL (insn) = jump_label;
15181
15182 emit_insn (gen_negxf2 (e2, e2));
15183
15184 emit_label (jump_label);
15185 LABEL_NUSES (jump_label) = 1;
15186
15187 emit_move_insn (op0, e2);
15188 }
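
/* For reference, the identity behind the expansion above: with
   u = expm1 (-|2 * x|),

       tanh (|x|) = (1 - exp (-2|x|)) / (1 + exp (-2|x|)) = -u / (u + 2)

   so e2 = u / (u + 2) is negated exactly when x is non-negative, while the
   negative-x case already carries the correct sign.  */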
15189
15190 /* Output code to perform an asinh XFmode calculation. */
15191
15192 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15193 {
15194 rtx e1 = gen_reg_rtx (XFmode);
15195 rtx e2 = gen_reg_rtx (XFmode);
15196 rtx scratch = gen_reg_rtx (HImode);
15197 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15198 rtx cst1, tmp;
15199 rtx_code_label *jump_label = gen_label_rtx ();
15200 rtx_insn *insn;
15201
15202 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15203 emit_insn (gen_mulxf3 (e1, op1, op1));
15204 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15205 emit_insn (gen_addxf3 (e2, e1, cst1));
15206 emit_insn (gen_sqrtxf2 (e2, e2));
15207 emit_insn (gen_addxf3 (e2, e2, cst1));
15208
15209 /* e1 = e1 / e2 */
15210 emit_insn (gen_divxf3 (e1, e1, e2));
15211
15212 /* scratch = fxam (op1) */
15213 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15214
15215 /* e1 = e1 + |op1| */
15216 emit_insn (gen_absxf2 (e2, op1));
15217 emit_insn (gen_addxf3 (e1, e1, e2));
15218
15219 /* e2 = log1p (e1) */
15220 ix86_emit_i387_log1p (e2, e1);
15221
15222 /* flags = signbit (op1) */
15223 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15224
15225 /* if (flags) then e2 = -e2 */
15226 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15227 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15228 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15229 pc_rtx);
15230 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15231 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15232 JUMP_LABEL (insn) = jump_label;
15233
15234 emit_insn (gen_negxf2 (e2, e2));
15235
15236 emit_label (jump_label);
15237 LABEL_NUSES (jump_label) = 1;
15238
15239 emit_move_insn (op0, e2);
15240 }
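
/* For reference, the identity behind the expansion above:

       asinh (|x|) = log (|x| + sqrt (x*x + 1))
                   = log1p (|x| + x*x / (sqrt (x*x + 1) + 1))

   since x*x / (sqrt (x*x + 1) + 1) equals sqrt (x*x + 1) - 1.  The log1p
   form stays accurate for small |x|; the sign of x is restored afterwards
   from the FXAM sign bit.  */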
15241
15242 /* Output code to perform an acosh XFmode calculation. */
15243
15244 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15245 {
15246 rtx e1 = gen_reg_rtx (XFmode);
15247 rtx e2 = gen_reg_rtx (XFmode);
15248 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15249
15250 /* e2 = sqrt (op1 + 1.0) */
15251 emit_insn (gen_addxf3 (e2, op1, cst1));
15252 emit_insn (gen_sqrtxf2 (e2, e2));
15253
15254 /* e1 = sqrt (op1 - 1.0) */
15255 emit_insn (gen_subxf3 (e1, op1, cst1));
15256 emit_insn (gen_sqrtxf2 (e1, e1));
15257
15258 /* e1 = e1 * e2 */
15259 emit_insn (gen_mulxf3 (e1, e1, e2));
15260
15261 /* e1 = e1 + op1 */
15262 emit_insn (gen_addxf3 (e1, e1, op1));
15263
15264 /* op0 = log (e1) */
15265 emit_insn (gen_logxf2 (op0, e1));
15266 }
15267
15268 /* Output code to perform an atanh XFmode calculation. */
15269
15270 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15271 {
15272 rtx e1 = gen_reg_rtx (XFmode);
15273 rtx e2 = gen_reg_rtx (XFmode);
15274 rtx scratch = gen_reg_rtx (HImode);
15275 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15276 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15277 rtx cst1, tmp;
15278 rtx_code_label *jump_label = gen_label_rtx ();
15279 rtx_insn *insn;
15280
15281 /* scratch = fxam (op1) */
15282 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15283
15284 /* e2 = |op1| */
15285 emit_insn (gen_absxf2 (e2, op1));
15286
15287 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15288 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15289 emit_insn (gen_addxf3 (e1, e2, cst1));
15290 emit_insn (gen_addxf3 (e2, e2, e2));
15291 emit_insn (gen_negxf2 (e2, e2));
15292 emit_insn (gen_divxf3 (e1, e2, e1));
15293
15294 /* e2 = log1p (e1) */
15295 ix86_emit_i387_log1p (e2, e1);
15296
15297 /* flags = signbit (op1) */
15298 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15299
15300 /* if (!flags) then e2 = -e2 */
15301 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15302 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15303 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15304 pc_rtx);
15305 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15306 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15307 JUMP_LABEL (insn) = jump_label;
15308
15309 emit_insn (gen_negxf2 (e2, e2));
15310
15311 emit_label (jump_label);
15312 LABEL_NUSES (jump_label) = 1;
15313
15314 /* op0 = 0.5 * e2 */
15315 half = force_reg (XFmode, half);
15316 emit_insn (gen_mulxf3 (op0, e2, half));
15317 }
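
/* For reference, the identity behind the expansion above: with
   t = -(2 * |x|) / (|x| + 1),

       log1p (t) = log ((1 - |x|) / (1 + |x|)) = -2 * atanh (|x|)

   so negating the result for non-negative x and multiplying by 0.5 yields
   atanh (x) in both sign cases.  */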
15318
15319 /* Output code to perform a log1p XFmode calculation. */
15320
15321 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15322 {
15323 rtx_code_label *label1 = gen_label_rtx ();
15324 rtx_code_label *label2 = gen_label_rtx ();
15325
15326 rtx tmp = gen_reg_rtx (XFmode);
15327 rtx res = gen_reg_rtx (XFmode);
15328 rtx cst, cstln2, cst1;
15329 rtx_insn *insn;
15330
15331 cst = const_double_from_real_value
15332 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15333 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15334
15335 emit_insn (gen_absxf2 (tmp, op1));
15336
15337 cst = force_reg (XFmode, cst);
15338 ix86_expand_branch (GE, tmp, cst, label1);
15339 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15340 insn = get_last_insn ();
15341 JUMP_LABEL (insn) = label1;
15342
15343 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15344 emit_jump (label2);
15345
15346 emit_label (label1);
15347 LABEL_NUSES (label1) = 1;
15348
15349 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15350 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15351 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15352
15353 emit_label (label2);
15354 LABEL_NUSES (label2) = 1;
15355
15356 emit_move_insn (op0, res);
15357 }
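
/* For reference: the threshold constant above is 1 - sqrt (2) / 2 (about
   0.2928932...), the input magnitude limit documented for the i387 fyl2xp1
   instruction.  Below it, fyl2xp1 computes log2 (1 + x) directly and
   accurately; at or above it, the plain fyl2x path on (x + 1) is used
   instead.  Both paths pass fldln2 (ln 2) as the Y operand, so the base-2
   logarithm computed by the hardware comes out scaled to the natural
   logarithm that log1p returns.  */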
15358
15359 /* Emit code for round calculation. */
15360 void ix86_emit_i387_round (rtx op0, rtx op1)
15361 {
15362 machine_mode inmode = GET_MODE (op1);
15363 machine_mode outmode = GET_MODE (op0);
15364 rtx e1 = gen_reg_rtx (XFmode);
15365 rtx e2 = gen_reg_rtx (XFmode);
15366 rtx scratch = gen_reg_rtx (HImode);
15367 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15368 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15369 rtx res = gen_reg_rtx (outmode);
15370 rtx_code_label *jump_label = gen_label_rtx ();
15371 rtx (*floor_insn) (rtx, rtx);
15372 rtx (*neg_insn) (rtx, rtx);
15373 rtx_insn *insn;
15374 rtx tmp;
15375
15376 switch (inmode)
15377 {
15378 case E_SFmode:
15379 case E_DFmode:
15380 tmp = gen_reg_rtx (XFmode);
15381
15382 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15383 op1 = tmp;
15384 break;
15385 case E_XFmode:
15386 break;
15387 default:
15388 gcc_unreachable ();
15389 }
15390
15391 switch (outmode)
15392 {
15393 case E_SFmode:
15394 floor_insn = gen_frndintxf2_floor;
15395 neg_insn = gen_negsf2;
15396 break;
15397 case E_DFmode:
15398 floor_insn = gen_frndintxf2_floor;
15399 neg_insn = gen_negdf2;
15400 break;
15401 case E_XFmode:
15402 floor_insn = gen_frndintxf2_floor;
15403 neg_insn = gen_negxf2;
15404 break;
15405 case E_HImode:
15406 floor_insn = gen_lfloorxfhi2;
15407 neg_insn = gen_neghi2;
15408 break;
15409 case E_SImode:
15410 floor_insn = gen_lfloorxfsi2;
15411 neg_insn = gen_negsi2;
15412 break;
15413 case E_DImode:
15414 floor_insn = gen_lfloorxfdi2;
15415 neg_insn = gen_negdi2;
15416 break;
15417 default:
15418 gcc_unreachable ();
15419 }
15420
15421 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15422
15423 /* scratch = fxam(op1) */
15424 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15425
15426 /* e1 = fabs(op1) */
15427 emit_insn (gen_absxf2 (e1, op1));
15428
15429 /* e2 = e1 + 0.5 */
15430 half = force_reg (XFmode, half);
15431 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15432
15433 /* res = floor(e2) */
15434 switch (outmode)
15435 {
15436 case E_SFmode:
15437 case E_DFmode:
15438 {
15439 tmp = gen_reg_rtx (XFmode);
15440
15441 emit_insn (floor_insn (tmp, e2));
15442 emit_insn (gen_rtx_SET (res,
15443 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15444 UNSPEC_TRUNC_NOOP)));
15445 }
15446 break;
15447 default:
15448 emit_insn (floor_insn (res, e2));
15449 }
15450
15451 /* flags = signbit(a) */
15452 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15453
15454 /* if (flags) then res = -res */
15455 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15456 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15457 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15458 pc_rtx);
15459 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15460 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15461 JUMP_LABEL (insn) = jump_label;
15462
15463 emit_insn (neg_insn (res, res));
15464
15465 emit_label (jump_label);
15466 LABEL_NUSES (jump_label) = 1;
15467
15468 emit_move_insn (op0, res);
15469 }
15470
15471 /* Output code to perform a Newton-Raphson approximation of a single precision
15472 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15473
15474 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15475 {
15476 rtx x0, x1, e0, e1;
15477
15478 x0 = gen_reg_rtx (mode);
15479 e0 = gen_reg_rtx (mode);
15480 e1 = gen_reg_rtx (mode);
15481 x1 = gen_reg_rtx (mode);
15482
15483 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15484
15485 b = force_reg (mode, b);
15486
15487 /* x0 = rcp(b) estimate */
15488 if (mode == V16SFmode || mode == V8DFmode)
15489 {
15490 if (TARGET_AVX512ER)
15491 {
15492 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15493 UNSPEC_RCP28)));
15494 /* res = a * x0 */
15495 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15496 return;
15497 }
15498 else
15499 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15500 UNSPEC_RCP14)));
15501 }
15502 else
15503 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15504 UNSPEC_RCP)));
15505
15506 /* e0 = x0 * b */
15507 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15508
15509 /* e0 = x0 * e0 */
15510 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15511
15512 /* e1 = x0 + x0 */
15513 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15514
15515 /* x1 = e1 - e0 */
15516 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15517
15518 /* res = a * x1 */
15519 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15520 }
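
/* Reference sketch of the math above (illustrative scalar pseudo-C; the
   rcp_estimate helper is hypothetical): one Newton-Raphson step for the
   reciprocal of b refines the hardware estimate, roughly squaring its
   relative error (about 12 good bits -> about 23):

       x0 = rcp_estimate (b);         -- hardware reciprocal estimate
       x1 = x0 * (2.0f - b * x0);     -- == (x0 + x0) - b * x0 * x0
       return a * x1;                 -- approximates a / b

   The expansion above emits the right-hand form, which maps directly onto
   the mult/plus/minus sets generated.  The AVX512ER rcp28 path skips the
   iteration because its estimate is already accurate enough.  */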
15521
15522 /* Output code to perform a Newton-Raphson approximation of a
15523 single precision floating point [reciprocal] square root. */
15524
15525 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15526 {
15527 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15528 REAL_VALUE_TYPE r;
15529 int unspec;
15530
15531 x0 = gen_reg_rtx (mode);
15532 e0 = gen_reg_rtx (mode);
15533 e1 = gen_reg_rtx (mode);
15534 e2 = gen_reg_rtx (mode);
15535 e3 = gen_reg_rtx (mode);
15536
15537 if (TARGET_AVX512ER && mode == V16SFmode)
15538 {
15539 if (recip)
15540 /* res = rsqrt28(a) estimate */
15541 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15542 UNSPEC_RSQRT28)));
15543 else
15544 {
15545 /* x0 = rsqrt28(a) estimate */
15546 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15547 UNSPEC_RSQRT28)));
15548 /* res = rcp28(x0) estimate */
15549 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15550 UNSPEC_RCP28)));
15551 }
15552 return;
15553 }
15554
15555 real_from_integer (&r, VOIDmode, -3, SIGNED);
15556 mthree = const_double_from_real_value (r, SFmode);
15557
15558 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15559 mhalf = const_double_from_real_value (r, SFmode);
15560 unspec = UNSPEC_RSQRT;
15561
15562 if (VECTOR_MODE_P (mode))
15563 {
15564 mthree = ix86_build_const_vector (mode, true, mthree);
15565 mhalf = ix86_build_const_vector (mode, true, mhalf);
15566 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15567 if (GET_MODE_SIZE (mode) == 64)
15568 unspec = UNSPEC_RSQRT14;
15569 }
15570
15571 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15572 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15573
15574 a = force_reg (mode, a);
15575
15576 /* x0 = rsqrt(a) estimate */
15577 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15578 unspec)));
15579
15580 /* If a == 0.0, mask out the infinite rsqrt estimate to prevent a NaN for sqrt (0.0). */
15581 if (!recip)
15582 {
15583 rtx zero = force_reg (mode, CONST0_RTX(mode));
15584 rtx mask;
15585
15586 /* Handle masked compare. */
15587 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15588 {
15589 mask = gen_reg_rtx (HImode);
15590 /* Imm value 0x4 corresponds to not-equal comparison. */
15591 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15592 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15593 }
15594 else
15595 {
15596 mask = gen_reg_rtx (mode);
15597 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15598 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15599 }
15600 }
15601
15602 /* e0 = x0 * a */
15603 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15604 /* e1 = e0 * x0 */
15605 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15606
15607 /* e2 = e1 - 3. */
15608 mthree = force_reg (mode, mthree);
15609 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15610
15611 mhalf = force_reg (mode, mhalf);
15612 if (recip)
15613 /* e3 = -.5 * x0 */
15614 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15615 else
15616 /* e3 = -.5 * e0 */
15617 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15618 /* ret = e2 * e3 */
15619 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15620 }
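
/* Reference for the expansion above (illustrative): one Newton-Raphson step
   for rsqrt, starting from the estimate x0 = rsqrt (a), is

       x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3)

   and sqrt (a) = a * rsqrt (a), which is why the non-reciprocal case
   multiplies by e0 = a * x0 instead of x0.  The zero mask inserted before
   the iteration keeps sqrt (0.0) from evaluating 0.0 * inf.  */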
15621
15622 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15623 mask for masking out the sign-bit is stored in *SMASK, if that is
15624 non-null. */
15625
15626 static rtx
15627 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15628 {
15629 machine_mode vmode, mode = GET_MODE (op0);
15630 rtx xa, mask;
15631
15632 xa = gen_reg_rtx (mode);
15633 if (mode == SFmode)
15634 vmode = V4SFmode;
15635 else if (mode == DFmode)
15636 vmode = V2DFmode;
15637 else
15638 vmode = mode;
15639 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15640 if (!VECTOR_MODE_P (mode))
15641 {
15642 /* We need to generate a scalar mode mask in this case. */
15643 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15644 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15645 mask = gen_reg_rtx (mode);
15646 emit_insn (gen_rtx_SET (mask, tmp));
15647 }
15648 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15649
15650 if (smask)
15651 *smask = mask;
15652
15653 return xa;
15654 }
15655
15656 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15657 swapping the operands if SWAP_OPERANDS is true. The expanded
15658 code is a forward jump to a newly created label in case the
15659 comparison is true. The generated label rtx is returned. */
15660 static rtx_code_label *
15661 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15662 bool swap_operands)
15663 {
15664 bool unordered_compare = ix86_unordered_fp_compare (code);
15665 rtx_code_label *label;
15666 rtx tmp, reg;
15667
15668 if (swap_operands)
15669 std::swap (op0, op1);
15670
15671 label = gen_label_rtx ();
15672 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15673 if (unordered_compare)
15674 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15675 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15676 emit_insn (gen_rtx_SET (reg, tmp));
15677 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15678 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15679 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15680 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15681 JUMP_LABEL (tmp) = label;
15682
15683 return label;
15684 }
15685
15686 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15687 using comparison code CODE. Operands are swapped for the comparison if
15688 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15689 static rtx
15690 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15691 bool swap_operands)
15692 {
15693 rtx (*insn)(rtx, rtx, rtx, rtx);
15694 machine_mode mode = GET_MODE (op0);
15695 rtx mask = gen_reg_rtx (mode);
15696
15697 if (swap_operands)
15698 std::swap (op0, op1);
15699
15700 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15701
15702 emit_insn (insn (mask, op0, op1,
15703 gen_rtx_fmt_ee (code, mode, op0, op1)));
15704 return mask;
15705 }
15706
15707 /* Expand copysign from SIGN to the positive value ABS_VALUE
15708 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15709 the sign-bit. */
15710
15711 static void
15712 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15713 {
15714 machine_mode mode = GET_MODE (sign);
15715 rtx sgn = gen_reg_rtx (mode);
15716 if (mask == NULL_RTX)
15717 {
15718 machine_mode vmode;
15719
15720 if (mode == SFmode)
15721 vmode = V4SFmode;
15722 else if (mode == DFmode)
15723 vmode = V2DFmode;
15724 else
15725 vmode = mode;
15726
15727 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15728 if (!VECTOR_MODE_P (mode))
15729 {
15730 /* We need to generate a scalar mode mask in this case. */
15731 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15732 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15733 mask = gen_reg_rtx (mode);
15734 emit_insn (gen_rtx_SET (mask, tmp));
15735 }
15736 }
15737 else
15738 mask = gen_rtx_NOT (mode, mask);
15739 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15740 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15741 }
15742
15743 /* Expand SSE sequence for computing lround from OP1 storing
15744 into OP0. */
15745
15746 void
15747 ix86_expand_lround (rtx op0, rtx op1)
15748 {
15749 /* C code for the stuff we're doing below:
15750 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15751 return (long)tmp;
15752 */
15753 machine_mode mode = GET_MODE (op1);
15754 const struct real_format *fmt;
15755 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15756 rtx adj;
15757
15758 /* load nextafter (0.5, 0.0) */
15759 fmt = REAL_MODE_FORMAT (mode);
15760 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15761 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15762
15763 /* adj = copysign (0.5, op1) */
15764 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15765 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15766
15767 /* adj = op1 + adj */
15768 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15769
15770 /* op0 = (imode)adj */
15771 expand_fix (op0, adj, 0);
15772 }
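
/* Note on the constant above: PRED_HALF is nextafter (0.5, 0.0), i.e.
   0.5 - 2**(-p-1) for a p-bit significand.  Adding plain 0.5 could round up
   in the addition itself -- e.g. for the largest double below 0.5 the sum
   rounds to 1.0, so lround would return 1 where 0 is expected.  Using the
   predecessor of 0.5 avoids that while halfway cases still round away from
   zero after the truncating conversion.  */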
15773
15774 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
15775 storing into OPERAND0. */
15776
15777 void
15778 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15779 {
15780 /* C code for the stuff we're doing below (for do_floor):
15781 xi = (long)op1;
15782 xi -= (double)xi > op1 ? 1 : 0;
15783 return xi;
15784 */
15785 machine_mode fmode = GET_MODE (op1);
15786 machine_mode imode = GET_MODE (op0);
15787 rtx ireg, freg, tmp;
15788 rtx_code_label *label;
15789
15790 /* reg = (long)op1 */
15791 ireg = gen_reg_rtx (imode);
15792 expand_fix (ireg, op1, 0);
15793
15794 /* freg = (double)reg */
15795 freg = gen_reg_rtx (fmode);
15796 expand_float (freg, ireg, 0);
15797
15798 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15799 label = ix86_expand_sse_compare_and_jump (UNLE,
15800 freg, op1, !do_floor);
15801 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15802 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15803 emit_move_insn (ireg, tmp);
15804
15805 emit_label (label);
15806 LABEL_NUSES (label) = 1;
15807
15808 emit_move_insn (op0, ireg);
15809 }
15810
15811 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15812 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
15813
15814 static rtx
15815 ix86_gen_TWO52 (machine_mode mode)
15816 {
15817 REAL_VALUE_TYPE TWO52r;
15818 rtx TWO52;
15819
15820 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15821 TWO52 = const_double_from_real_value (TWO52r, mode);
15822 TWO52 = force_reg (mode, TWO52);
15823
15824 return TWO52;
15825 }
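
/* Reference for how TWO52 is used by the expanders below: for a double with
   0 <= xa < 2**52 the ulp of xa + 2**52 is exactly 1, so the addition rounds
   xa to an integer (in the current rounding mode, round-to-nearest-even by
   default) and subtracting 2**52 recovers that integer exactly.  For
   example, 3.7 + 2**52 rounds to 4503599627370500.0 and the subtraction
   yields 4.0.  For SFmode the same trick uses 2**23.  */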
15826
15827 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15828
15829 void
15830 ix86_expand_rint (rtx operand0, rtx operand1)
15831 {
15832 /* C code for the stuff we're doing below:
15833 xa = fabs (operand1);
15834 if (!isless (xa, 2**52))
15835 return operand1;
15836 two52 = 2**52;
15837 if (flag_rounding_math)
15838 {
15839 two52 = copysign (two52, operand1);
15840 xa = operand1;
15841 }
15842 xa = xa + two52 - two52;
15843 return copysign (xa, operand1);
15844 */
15845 machine_mode mode = GET_MODE (operand0);
15846 rtx res, xa, TWO52, two52, mask;
15847 rtx_code_label *label;
15848
15849 res = gen_reg_rtx (mode);
15850 emit_move_insn (res, operand1);
15851
15852 /* xa = abs (operand1) */
15853 xa = ix86_expand_sse_fabs (res, &mask);
15854
15855 /* if (!isless (xa, TWO52)) goto label; */
15856 TWO52 = ix86_gen_TWO52 (mode);
15857 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15858
15859 two52 = TWO52;
15860 if (flag_rounding_math)
15861 {
15862 two52 = gen_reg_rtx (mode);
15863 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15864 xa = res;
15865 }
15866
15867 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15868 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15869
15870 ix86_sse_copysign_to_positive (res, xa, res, mask);
15871
15872 emit_label (label);
15873 LABEL_NUSES (label) = 1;
15874
15875 emit_move_insn (operand0, res);
15876 }
15877
15878 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15879 into OPERAND0. */
15880 void
15881 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15882 {
15883 /* C code for the stuff we expand below.
15884 double xa = fabs (x), x2;
15885 if (!isless (xa, TWO52))
15886 return x;
15887 xa = xa + TWO52 - TWO52;
15888 x2 = copysign (xa, x);
15889 Compensate. Floor:
15890 if (x2 > x)
15891 x2 -= 1;
15892 Compensate. Ceil:
15893 if (x2 < x)
15894 x2 += 1;
15895 if (HONOR_SIGNED_ZEROS (mode))
15896 x2 = copysign (x2, x);
15897 return x2;
15898 */
15899 machine_mode mode = GET_MODE (operand0);
15900 rtx xa, TWO52, tmp, one, res, mask;
15901 rtx_code_label *label;
15902
15903 TWO52 = ix86_gen_TWO52 (mode);
15904
15905 /* Temporary for holding the result, initialized to the input
15906 operand to ease control flow. */
15907 res = gen_reg_rtx (mode);
15908 emit_move_insn (res, operand1);
15909
15910 /* xa = abs (operand1) */
15911 xa = ix86_expand_sse_fabs (res, &mask);
15912
15913 /* if (!isless (xa, TWO52)) goto label; */
15914 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15915
15916 /* xa = xa + TWO52 - TWO52; */
15917 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15918 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15919
15920 /* xa = copysign (xa, operand1) */
15921 ix86_sse_copysign_to_positive (xa, xa, res, mask);
15922
15923 /* generate 1.0 */
15924 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15925
15926 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15927 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15928 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15929 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15930 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15931 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
15932 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15933 emit_move_insn (res, tmp);
15934
15935 emit_label (label);
15936 LABEL_NUSES (label) = 1;
15937
15938 emit_move_insn (operand0, res);
15939 }
15940
15941 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15942 into OPERAND0. */
15943 void
15944 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15945 {
15946 /* C code for the stuff we expand below.
15947 double xa = fabs (x), x2;
15948 if (!isless (xa, TWO52))
15949 return x;
15950 x2 = (double)(long)x;
15951 Compensate. Floor:
15952 if (x2 > x)
15953 x2 -= 1;
15954 Compensate. Ceil:
15955 if (x2 < x)
15956 x2 += 1;
15957 if (HONOR_SIGNED_ZEROS (mode))
15958 return copysign (x2, x);
15959 return x2;
15960 */
15961 machine_mode mode = GET_MODE (operand0);
15962 rtx xa, xi, TWO52, tmp, one, res, mask;
15963 rtx_code_label *label;
15964
15965 TWO52 = ix86_gen_TWO52 (mode);
15966
15967 /* Temporary for holding the result, initialized to the input
15968 operand to ease control flow. */
15969 res = gen_reg_rtx (mode);
15970 emit_move_insn (res, operand1);
15971
15972 /* xa = abs (operand1) */
15973 xa = ix86_expand_sse_fabs (res, &mask);
15974
15975 /* if (!isless (xa, TWO52)) goto label; */
15976 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15977
15978 /* xa = (double)(long)x */
15979 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15980 expand_fix (xi, res, 0);
15981 expand_float (xa, xi, 0);
15982
15983 /* generate 1.0 */
15984 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15985
15986 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15987 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15988 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15989 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15990 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15991 emit_move_insn (res, tmp);
15992
15993 if (HONOR_SIGNED_ZEROS (mode))
15994 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15995
15996 emit_label (label);
15997 LABEL_NUSES (label) = 1;
15998
15999 emit_move_insn (operand0, res);
16000 }
16001
16002 /* Expand SSE sequence for computing round from OPERAND1 storing
16003 into OPERAND0. Sequence that works without relying on DImode truncation
16004 via cvttsd2siq that is only available on 64bit targets. */
16005 void
16006 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16007 {
16008 /* C code for the stuff we expand below.
16009 double xa = fabs (x), xa2, x2;
16010 if (!isless (xa, TWO52))
16011 return x;
16012 Using the absolute value and copying back sign makes
16013 -0.0 -> -0.0 correct.
16014 xa2 = xa + TWO52 - TWO52;
16015 Compensate.
16016 dxa = xa2 - xa;
16017 if (dxa <= -0.5)
16018 xa2 += 1;
16019 else if (dxa > 0.5)
16020 xa2 -= 1;
16021 x2 = copysign (xa2, x);
16022 return x2;
16023 */
16024 machine_mode mode = GET_MODE (operand0);
16025 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16026 rtx_code_label *label;
16027
16028 TWO52 = ix86_gen_TWO52 (mode);
16029
16030 /* Temporary for holding the result, initialized to the input
16031 operand to ease control flow. */
16032 res = gen_reg_rtx (mode);
16033 emit_move_insn (res, operand1);
16034
16035 /* xa = abs (operand1) */
16036 xa = ix86_expand_sse_fabs (res, &mask);
16037
16038 /* if (!isless (xa, TWO52)) goto label; */
16039 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16040
16041 /* xa2 = xa + TWO52 - TWO52; */
16042 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16043 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16044
16045 /* dxa = xa2 - xa; */
16046 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16047
16048 /* generate 0.5, 1.0 and -0.5 */
16049 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16050 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16051 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16052 0, OPTAB_DIRECT);
16053
16054 /* Compensate. */
16055 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16056 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16057 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16058 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16059 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16060 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16061 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16062 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16063
16064 /* res = copysign (xa2, operand1) */
16065 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16066
16067 emit_label (label);
16068 LABEL_NUSES (label) = 1;
16069
16070 emit_move_insn (operand0, res);
16071 }
16072
16073 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16074 into OPERAND0. */
16075 void
16076 ix86_expand_trunc (rtx operand0, rtx operand1)
16077 {
16078 /* C code for SSE variant we expand below.
16079 double xa = fabs (x), x2;
16080 if (!isless (xa, TWO52))
16081 return x;
16082 x2 = (double)(long)x;
16083 if (HONOR_SIGNED_ZEROS (mode))
16084 return copysign (x2, x);
16085 return x2;
16086 */
16087 machine_mode mode = GET_MODE (operand0);
16088 rtx xa, xi, TWO52, res, mask;
16089 rtx_code_label *label;
16090
16091 TWO52 = ix86_gen_TWO52 (mode);
16092
16093 /* Temporary for holding the result, initialized to the input
16094 operand to ease control flow. */
16095 res = gen_reg_rtx (mode);
16096 emit_move_insn (res, operand1);
16097
16098 /* xa = abs (operand1) */
16099 xa = ix86_expand_sse_fabs (res, &mask);
16100
16101 /* if (!isless (xa, TWO52)) goto label; */
16102 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16103
16104 /* x = (double)(long)x */
16105 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16106 expand_fix (xi, res, 0);
16107 expand_float (res, xi, 0);
16108
16109 if (HONOR_SIGNED_ZEROS (mode))
16110 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16111
16112 emit_label (label);
16113 LABEL_NUSES (label) = 1;
16114
16115 emit_move_insn (operand0, res);
16116 }
16117
16118 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
16119 OPERAND0. Works without relying on DImode truncation via cvttsd2siq. */
16120 void
16121 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16122 {
16123 machine_mode mode = GET_MODE (operand0);
16124 rtx xa, mask, TWO52, one, res, smask, tmp;
16125 rtx_code_label *label;
16126
16127 /* C code for SSE variant we expand below.
16128 double xa = fabs (x), xa2, x2;
16129 if (!isless (xa, TWO52))
16130 return x;
16131 xa2 = xa + TWO52 - TWO52;
16132 Compensate:
16133 if (xa2 > xa)
16134 xa2 -= 1.0;
16135 x2 = copysign (xa2, x);
16136 return x2;
16137 */
16138
16139 TWO52 = ix86_gen_TWO52 (mode);
16140
16141 /* Temporary for holding the result, initialized to the input
16142 operand to ease control flow. */
16143 res = gen_reg_rtx (mode);
16144 emit_move_insn (res, operand1);
16145
16146 /* xa = abs (operand1) */
16147 xa = ix86_expand_sse_fabs (res, &smask);
16148
16149 /* if (!isless (xa, TWO52)) goto label; */
16150 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16151
16152 /* res = xa + TWO52 - TWO52; */
16153 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16154 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16155 emit_move_insn (res, tmp);
16156
16157 /* generate 1.0 */
16158 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16159
16160 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16161 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16162 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16163 tmp = expand_simple_binop (mode, MINUS,
16164 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16165 emit_move_insn (res, tmp);
16166
16167 /* res = copysign (res, operand1) */
16168 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16169
16170 emit_label (label);
16171 LABEL_NUSES (label) = 1;
16172
16173 emit_move_insn (operand0, res);
16174 }
16175
16176 /* Expand SSE sequence for computing round from OPERAND1 storing
16177 into OPERAND0. */
16178 void
16179 ix86_expand_round (rtx operand0, rtx operand1)
16180 {
16181 /* C code for the stuff we're doing below:
16182 double xa = fabs (x);
16183 if (!isless (xa, TWO52))
16184 return x;
16185 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16186 return copysign (xa, x);
16187 */
16188 machine_mode mode = GET_MODE (operand0);
16189 rtx res, TWO52, xa, xi, half, mask;
16190 rtx_code_label *label;
16191 const struct real_format *fmt;
16192 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16193
16194 /* Temporary for holding the result, initialized to the input
16195 operand to ease control flow. */
16196 res = gen_reg_rtx (mode);
16197 emit_move_insn (res, operand1);
16198
16199 TWO52 = ix86_gen_TWO52 (mode);
16200 xa = ix86_expand_sse_fabs (res, &mask);
16201 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16202
16203 /* load nextafter (0.5, 0.0) */
16204 fmt = REAL_MODE_FORMAT (mode);
16205 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16206 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16207
16208 /* xa = xa + 0.5 */
16209 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16210 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16211
16212 /* xa = (double)(int64_t)xa */
16213 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16214 expand_fix (xi, xa, 0);
16215 expand_float (xa, xi, 0);
16216
16217 /* res = copysign (xa, operand1) */
16218 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16219
16220 emit_label (label);
16221 LABEL_NUSES (label) = 1;
16222
16223 emit_move_insn (operand0, res);
16224 }
16225
16226 /* Expand SSE sequence for computing round
16227 from OP1 storing into OP0 using sse4 round insn. */
16228 void
16229 ix86_expand_round_sse4 (rtx op0, rtx op1)
16230 {
16231 machine_mode mode = GET_MODE (op0);
16232 rtx e1, e2, res, half;
16233 const struct real_format *fmt;
16234 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16235 rtx (*gen_copysign) (rtx, rtx, rtx);
16236 rtx (*gen_round) (rtx, rtx, rtx);
16237
16238 switch (mode)
16239 {
16240 case E_SFmode:
16241 gen_copysign = gen_copysignsf3;
16242 gen_round = gen_sse4_1_roundsf2;
16243 break;
16244 case E_DFmode:
16245 gen_copysign = gen_copysigndf3;
16246 gen_round = gen_sse4_1_rounddf2;
16247 break;
16248 default:
16249 gcc_unreachable ();
16250 }
16251
16252 /* round (a) = trunc (a + copysign (0.5, a)) */
16253
16254 /* load nextafter (0.5, 0.0) */
16255 fmt = REAL_MODE_FORMAT (mode);
16256 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16257 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16258 half = const_double_from_real_value (pred_half, mode);
16259
16260 /* e1 = copysign (0.5, op1) */
16261 e1 = gen_reg_rtx (mode);
16262 emit_insn (gen_copysign (e1, half, op1));
16263
16264 /* e2 = op1 + e1 */
16265 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16266
16267 /* res = trunc (e2) */
16268 res = gen_reg_rtx (mode);
16269 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16270
16271 emit_move_insn (op0, res);
16272 }
16273
16274 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16275 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16276 insn every time. */
16277
16278 static GTY(()) rtx_insn *vselect_insn;
16279
16280 /* Initialize vselect_insn. */
16281
16282 static void
16283 init_vselect_insn (void)
16284 {
16285 unsigned i;
16286 rtx x;
16287
16288 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16289 for (i = 0; i < MAX_VECT_LEN; ++i)
16290 XVECEXP (x, 0, i) = const0_rtx;
16291 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16292 const0_rtx), x);
16293 x = gen_rtx_SET (const0_rtx, x);
16294 start_sequence ();
16295 vselect_insn = emit_insn (x);
16296 end_sequence ();
16297 }
16298
16299 /* Construct (set target (vec_select op0 (parallel perm))) and
16300 return true if that's a valid instruction in the active ISA. */
16301
16302 static bool
16303 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16304 unsigned nelt, bool testing_p)
16305 {
16306 unsigned int i;
16307 rtx x, save_vconcat;
16308 int icode;
16309
16310 if (vselect_insn == NULL_RTX)
16311 init_vselect_insn ();
16312
16313 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16314 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16315 for (i = 0; i < nelt; ++i)
16316 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16317 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16318 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16319 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16320 SET_DEST (PATTERN (vselect_insn)) = target;
16321 icode = recog_memoized (vselect_insn);
16322
16323 if (icode >= 0 && !testing_p)
16324 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16325
16326 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16327 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16328 INSN_CODE (vselect_insn) = -1;
16329
16330 return icode >= 0;
16331 }
16332
16333 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16334
16335 static bool
16336 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16337 const unsigned char *perm, unsigned nelt,
16338 bool testing_p)
16339 {
16340 machine_mode v2mode;
16341 rtx x;
16342 bool ok;
16343
16344 if (vselect_insn == NULL_RTX)
16345 init_vselect_insn ();
16346
16347 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16348 return false;
16349 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16350 PUT_MODE (x, v2mode);
16351 XEXP (x, 0) = op0;
16352 XEXP (x, 1) = op1;
16353 ok = expand_vselect (target, x, perm, nelt, testing_p);
16354 XEXP (x, 0) = const0_rtx;
16355 XEXP (x, 1) = const0_rtx;
16356 return ok;
16357 }
16358
16359 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16360 using movss or movsd. */
16361 static bool
16362 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16363 {
16364 machine_mode vmode = d->vmode;
16365 unsigned i, nelt = d->nelt;
16366 rtx x;
16367
16368 if (d->one_operand_p)
16369 return false;
16370
16371 if (!(TARGET_SSE && vmode == V4SFmode)
16372 && !(TARGET_SSE2 && vmode == V2DFmode))
16373 return false;
16374
16375 /* Only the first element is changed. */
16376 if (d->perm[0] != nelt && d->perm[0] != 0)
16377 return false;
16378 for (i = 1; i < nelt; ++i)
16379 if (d->perm[i] != i + nelt - d->perm[0])
16380 return false;
16381
16382 if (d->testing_p)
16383 return true;
16384
16385 if (d->perm[0] == nelt)
16386 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16387 else
16388 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16389
16390 emit_insn (gen_rtx_SET (d->target, x));
16391
16392 return true;
16393 }
16394
16395 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16396 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16397
16398 static bool
16399 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16400 {
16401 machine_mode mmode, vmode = d->vmode;
16402 unsigned i, mask, nelt = d->nelt;
16403 rtx target, op0, op1, maskop, x;
16404 rtx rperm[32], vperm;
16405
16406 if (d->one_operand_p)
16407 return false;
16408 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16409 && (TARGET_AVX512BW
16410 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16411 ;
16412 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16413 ;
16414 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16415 ;
16416 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16417 ;
16418 else
16419 return false;
16420
16421 /* This is a blend, not a permute. Elements must stay in their
16422 respective lanes. */
16423 for (i = 0; i < nelt; ++i)
16424 {
16425 unsigned e = d->perm[i];
16426 if (!(e == i || e == i + nelt))
16427 return false;
16428 }
16429
16430 if (d->testing_p)
16431 return true;
16432
16433 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16434 decision should be extracted elsewhere, so that we only try that
16435 sequence once all budget==3 options have been tried. */
16436 target = d->target;
16437 op0 = d->op0;
16438 op1 = d->op1;
16439 mask = 0;
16440
16441 switch (vmode)
16442 {
16443 case E_V8DFmode:
16444 case E_V16SFmode:
16445 case E_V4DFmode:
16446 case E_V8SFmode:
16447 case E_V2DFmode:
16448 case E_V4SFmode:
16449 case E_V8HImode:
16450 case E_V8SImode:
16451 case E_V32HImode:
16452 case E_V64QImode:
16453 case E_V16SImode:
16454 case E_V8DImode:
16455 for (i = 0; i < nelt; ++i)
16456 mask |= (d->perm[i] >= nelt) << i;
16457 break;
16458
16459 case E_V2DImode:
16460 for (i = 0; i < 2; ++i)
16461 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16462 vmode = V8HImode;
16463 goto do_subreg;
16464
16465 case E_V4SImode:
16466 for (i = 0; i < 4; ++i)
16467 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16468 vmode = V8HImode;
16469 goto do_subreg;
16470
16471 case E_V16QImode:
16472 /* See if bytes move in pairs so we can use pblendw with
16473 an immediate argument, rather than pblendvb with a vector
16474 argument. */
16475 for (i = 0; i < 16; i += 2)
16476 if (d->perm[i] + 1 != d->perm[i + 1])
16477 {
16478 use_pblendvb:
16479 for (i = 0; i < nelt; ++i)
16480 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16481
16482 finish_pblendvb:
16483 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16484 vperm = force_reg (vmode, vperm);
16485
16486 if (GET_MODE_SIZE (vmode) == 16)
16487 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16488 else
16489 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16490 if (target != d->target)
16491 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16492 return true;
16493 }
16494
16495 for (i = 0; i < 8; ++i)
16496 mask |= (d->perm[i * 2] >= 16) << i;
16497 vmode = V8HImode;
16498 /* FALLTHRU */
16499
16500 do_subreg:
16501 target = gen_reg_rtx (vmode);
16502 op0 = gen_lowpart (vmode, op0);
16503 op1 = gen_lowpart (vmode, op1);
16504 break;
16505
16506 case E_V32QImode:
16507 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16508 for (i = 0; i < 32; i += 2)
16509 if (d->perm[i] + 1 != d->perm[i + 1])
16510 goto use_pblendvb;
16511 /* See if bytes move in quadruplets. If yes, vpblendd
16512 with immediate can be used. */
16513 for (i = 0; i < 32; i += 4)
16514 if (d->perm[i] + 2 != d->perm[i + 2])
16515 break;
16516 if (i < 32)
16517 {
16518 /* See if bytes move the same in both lanes. If yes,
16519 vpblendw with immediate can be used. */
16520 for (i = 0; i < 16; i += 2)
16521 if (d->perm[i] + 16 != d->perm[i + 16])
16522 goto use_pblendvb;
16523
16524 /* Use vpblendw. */
16525 for (i = 0; i < 16; ++i)
16526 mask |= (d->perm[i * 2] >= 32) << i;
16527 vmode = V16HImode;
16528 goto do_subreg;
16529 }
16530
16531 /* Use vpblendd. */
16532 for (i = 0; i < 8; ++i)
16533 mask |= (d->perm[i * 4] >= 32) << i;
16534 vmode = V8SImode;
16535 goto do_subreg;
16536
16537 case E_V16HImode:
16538 /* See if words move in pairs. If yes, vpblendd can be used. */
16539 for (i = 0; i < 16; i += 2)
16540 if (d->perm[i] + 1 != d->perm[i + 1])
16541 break;
16542 if (i < 16)
16543 {
16544 /* See if words move the same in both lanes. If not,
16545 vpblendvb must be used. */
16546 for (i = 0; i < 8; i++)
16547 if (d->perm[i] + 8 != d->perm[i + 8])
16548 {
16549 /* Use vpblendvb. */
16550 for (i = 0; i < 32; ++i)
16551 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16552
16553 vmode = V32QImode;
16554 nelt = 32;
16555 target = gen_reg_rtx (vmode);
16556 op0 = gen_lowpart (vmode, op0);
16557 op1 = gen_lowpart (vmode, op1);
16558 goto finish_pblendvb;
16559 }
16560
16561 /* Use vpblendw. */
16562 for (i = 0; i < 16; ++i)
16563 mask |= (d->perm[i] >= 16) << i;
16564 break;
16565 }
16566
16567 /* Use vpblendd. */
16568 for (i = 0; i < 8; ++i)
16569 mask |= (d->perm[i * 2] >= 16) << i;
16570 vmode = V8SImode;
16571 goto do_subreg;
16572
16573 case E_V4DImode:
16574 /* Use vpblendd. */
16575 for (i = 0; i < 4; ++i)
16576 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16577 vmode = V8SImode;
16578 goto do_subreg;
16579
16580 default:
16581 gcc_unreachable ();
16582 }
16583
16584 switch (vmode)
16585 {
16586 case E_V8DFmode:
16587 case E_V8DImode:
16588 mmode = QImode;
16589 break;
16590 case E_V16SFmode:
16591 case E_V16SImode:
16592 mmode = HImode;
16593 break;
16594 case E_V32HImode:
16595 mmode = SImode;
16596 break;
16597 case E_V64QImode:
16598 mmode = DImode;
16599 break;
16600 default:
16601 mmode = VOIDmode;
16602 }
16603
16604 if (mmode != VOIDmode)
16605 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16606 else
16607 maskop = GEN_INT (mask);
16608
16609 /* This matches five different patterns with the different modes. */
16610 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16611 x = gen_rtx_SET (target, x);
16612 emit_insn (x);
16613 if (target != d->target)
16614 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16615
16616 return true;
16617 }
16618
16619 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16620 in terms of the variable form of vpermilps.
16621
16622 Note that we will have already failed the immediate input vpermilps,
16623 which requires that the high and low part shuffle be identical; the
16624 variable form doesn't require that. */
16625
16626 static bool
16627 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16628 {
16629 rtx rperm[8], vperm;
16630 unsigned i;
16631
16632 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16633 return false;
16634
16635 /* We can only permute within the 128-bit lane. */
16636 for (i = 0; i < 8; ++i)
16637 {
16638 unsigned e = d->perm[i];
16639 if (i < 4 ? e >= 4 : e < 4)
16640 return false;
16641 }
16642
16643 if (d->testing_p)
16644 return true;
16645
16646 for (i = 0; i < 8; ++i)
16647 {
16648 unsigned e = d->perm[i];
16649
16650 /* Within each 128-bit lane, the elements of op0 are numbered
16651 from 0 and the elements of op1 are numbered from 4. */
16652 if (e >= 8 + 4)
16653 e -= 8;
16654 else if (e >= 4)
16655 e -= 4;
16656
16657 rperm[i] = GEN_INT (e);
16658 }
16659
16660 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16661 vperm = force_reg (V8SImode, vperm);
16662 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16663
16664 return true;
16665 }
16666
16667 /* Return true if permutation D can be performed as VMODE permutation
16668 instead. */
16669
16670 static bool
16671 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16672 {
16673 unsigned int i, j, chunk;
16674
16675 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16676 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16677 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16678 return false;
16679
16680 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16681 return true;
16682
16683 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16684 for (i = 0; i < d->nelt; i += chunk)
16685 if (d->perm[i] & (chunk - 1))
16686 return false;
16687 else
16688 for (j = 1; j < chunk; ++j)
16689 if (d->perm[i] + j != d->perm[i + j])
16690 return false;
16691
16692 return true;
16693 }
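
/* Worked example for the check above (illustrative): the V16QImode
   permutation { 2, 3, 0, 1, 6, 7, 4, 5, ... } moves bytes only in aligned,
   consecutive pairs, so with VMODE == V8HImode the chunk size is 2, every
   d->perm[i] at a chunk start is even, and the permutation is equally
   expressible as the V8HImode permutation { 1, 0, 3, 2, ... }.  A
   permutation such as { 1, 2, ... } fails the chunk-alignment test and
   cannot be widened.  */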
16694
16695 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16696 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16697
16698 static bool
16699 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16700 {
16701 unsigned i, nelt, eltsz, mask;
16702 unsigned char perm[64];
16703 machine_mode vmode = V16QImode;
16704 rtx rperm[64], vperm, target, op0, op1;
16705
16706 nelt = d->nelt;
16707
16708 if (!d->one_operand_p)
16709 {
16710 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16711 {
16712 if (TARGET_AVX2
16713 && valid_perm_using_mode_p (V2TImode, d))
16714 {
16715 if (d->testing_p)
16716 return true;
16717
16718 /* Use vperm2i128 insn. The pattern uses
16719 V4DImode instead of V2TImode. */
16720 target = d->target;
16721 if (d->vmode != V4DImode)
16722 target = gen_reg_rtx (V4DImode);
16723 op0 = gen_lowpart (V4DImode, d->op0);
16724 op1 = gen_lowpart (V4DImode, d->op1);
16725 rperm[0]
16726 = GEN_INT ((d->perm[0] / (nelt / 2))
16727 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16728 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16729 if (target != d->target)
16730 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16731 return true;
16732 }
16733 return false;
16734 }
16735 }
16736 else
16737 {
16738 if (GET_MODE_SIZE (d->vmode) == 16)
16739 {
16740 if (!TARGET_SSSE3)
16741 return false;
16742 }
16743 else if (GET_MODE_SIZE (d->vmode) == 32)
16744 {
16745 if (!TARGET_AVX2)
16746 return false;
16747
16748 /* V4DImode should be already handled through
16749 expand_vselect by vpermq instruction. */
16750 gcc_assert (d->vmode != V4DImode);
16751
16752 vmode = V32QImode;
16753 if (d->vmode == V8SImode
16754 || d->vmode == V16HImode
16755 || d->vmode == V32QImode)
16756 {
16757 /* First see if vpermq can be used for
16758 V8SImode/V16HImode/V32QImode. */
16759 if (valid_perm_using_mode_p (V4DImode, d))
16760 {
16761 for (i = 0; i < 4; i++)
16762 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16763 if (d->testing_p)
16764 return true;
16765 target = gen_reg_rtx (V4DImode);
16766 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16767 perm, 4, false))
16768 {
16769 emit_move_insn (d->target,
16770 gen_lowpart (d->vmode, target));
16771 return true;
16772 }
16773 return false;
16774 }
16775
16776 /* Next see if vpermd can be used. */
16777 if (valid_perm_using_mode_p (V8SImode, d))
16778 vmode = V8SImode;
16779 }
16780 /* Or if vpermps can be used. */
16781 else if (d->vmode == V8SFmode)
16782 vmode = V8SImode;
16783
16784 if (vmode == V32QImode)
16785 {
16786 /* vpshufb only works within 128-bit lanes; it is not
16787 possible to shuffle bytes across lanes. */
16788 for (i = 0; i < nelt; ++i)
16789 if ((d->perm[i] ^ i) & (nelt / 2))
16790 return false;
16791 }
16792 }
16793 else if (GET_MODE_SIZE (d->vmode) == 64)
16794 {
16795 if (!TARGET_AVX512BW)
16796 return false;
16797
16798 /* If vpermq didn't work, vpshufb won't work either. */
16799 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16800 return false;
16801
16802 vmode = V64QImode;
16803 if (d->vmode == V16SImode
16804 || d->vmode == V32HImode
16805 || d->vmode == V64QImode)
16806 {
16807 /* First see if vpermq can be used for
16808 V16SImode/V32HImode/V64QImode. */
16809 if (valid_perm_using_mode_p (V8DImode, d))
16810 {
16811 for (i = 0; i < 8; i++)
16812 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16813 if (d->testing_p)
16814 return true;
16815 target = gen_reg_rtx (V8DImode);
16816 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16817 perm, 8, false))
16818 {
16819 emit_move_insn (d->target,
16820 gen_lowpart (d->vmode, target));
16821 return true;
16822 }
16823 return false;
16824 }
16825
16826 /* Next see if vpermd can be used. */
16827 if (valid_perm_using_mode_p (V16SImode, d))
16828 vmode = V16SImode;
16829 }
16830 /* Or if vpermps can be used. */
16831 else if (d->vmode == V16SFmode)
16832 vmode = V16SImode;
16833 if (vmode == V64QImode)
16834 {
16835 /* vpshufb only works within 128-bit lanes; it is not
16836 possible to shuffle bytes across lanes. */
16837 for (i = 0; i < nelt; ++i)
16838 if ((d->perm[i] ^ i) & (nelt / 4))
16839 return false;
16840 }
16841 }
16842 else
16843 return false;
16844 }
16845
16846 if (d->testing_p)
16847 return true;
16848
16849 if (vmode == V8SImode)
16850 for (i = 0; i < 8; ++i)
16851 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16852 else if (vmode == V16SImode)
16853 for (i = 0; i < 16; ++i)
16854 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16855 else
16856 {
16857 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16858 if (!d->one_operand_p)
16859 mask = 2 * nelt - 1;
16860 else if (vmode == V16QImode)
16861 mask = nelt - 1;
16862 else if (vmode == V64QImode)
16863 mask = nelt / 4 - 1;
16864 else
16865 mask = nelt / 2 - 1;
16866
16867 for (i = 0; i < nelt; ++i)
16868 {
16869 unsigned j, e = d->perm[i] & mask;
16870 for (j = 0; j < eltsz; ++j)
16871 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16872 }
16873 }
16874
16875 vperm = gen_rtx_CONST_VECTOR (vmode,
16876 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16877 vperm = force_reg (vmode, vperm);
16878
16879 target = d->target;
16880 if (d->vmode != vmode)
16881 target = gen_reg_rtx (vmode);
16882 op0 = gen_lowpart (vmode, d->op0);
16883 if (d->one_operand_p)
16884 {
16885 if (vmode == V16QImode)
16886 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16887 else if (vmode == V32QImode)
16888 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16889 else if (vmode == V64QImode)
16890 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16891 else if (vmode == V8SFmode)
16892 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16893 else if (vmode == V8SImode)
16894 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16895 else if (vmode == V16SFmode)
16896 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16897 else if (vmode == V16SImode)
16898 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16899 else
16900 gcc_unreachable ();
16901 }
16902 else
16903 {
16904 op1 = gen_lowpart (vmode, d->op1);
16905 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16906 }
16907 if (target != d->target)
16908 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16909
16910 return true;
16911 }
16912
16913 /* For V*[QHS]Imode permutations, check whether the same permutation
16914 can instead be performed in a 2x, 4x or 8x wider inner mode. */
16915
16916 static bool
16917 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16918 struct expand_vec_perm_d *nd)
16919 {
16920 int i;
16921 machine_mode mode = VOIDmode;
16922
16923 switch (d->vmode)
16924 {
16925 case E_V16QImode: mode = V8HImode; break;
16926 case E_V32QImode: mode = V16HImode; break;
16927 case E_V64QImode: mode = V32HImode; break;
16928 case E_V8HImode: mode = V4SImode; break;
16929 case E_V16HImode: mode = V8SImode; break;
16930 case E_V32HImode: mode = V16SImode; break;
16931 case E_V4SImode: mode = V2DImode; break;
16932 case E_V8SImode: mode = V4DImode; break;
16933 case E_V16SImode: mode = V8DImode; break;
16934 default: return false;
16935 }
16936 for (i = 0; i < d->nelt; i += 2)
16937 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16938 return false;
16939 nd->vmode = mode;
16940 nd->nelt = d->nelt / 2;
16941 for (i = 0; i < nd->nelt; i++)
16942 nd->perm[i] = d->perm[2 * i] / 2;
16943 if (GET_MODE_INNER (mode) != DImode)
16944 canonicalize_vector_int_perm (nd, nd);
16945 if (nd != d)
16946 {
16947 nd->one_operand_p = d->one_operand_p;
16948 nd->testing_p = d->testing_p;
16949 if (d->op0 == d->op1)
16950 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16951 else
16952 {
16953 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16954 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16955 }
16956 if (d->testing_p)
16957 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16958 else
16959 nd->target = gen_reg_rtx (nd->vmode);
16960 }
16961 return true;
16962 }
16963
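/* A worked example (illustrative only): the V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } keeps each
   even/odd byte pair together, so it is rewritten as the V8HImode
   permutation { 1, 0, 3, 2, 5, 4, 7, 6 }; that one starts with an odd
   element, so the recursion stops there rather than widening further
   to V4SImode.  */
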
16964 /* Try to expand one-operand permutation with constant mask. */
16965
16966 static bool
16967 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16968 {
16969 machine_mode mode = GET_MODE (d->op0);
16970 machine_mode maskmode = mode;
16971 rtx (*gen) (rtx, rtx, rtx) = NULL;
16972 rtx target, op0, mask;
16973 rtx vec[64];
16974
16975 if (!rtx_equal_p (d->op0, d->op1))
16976 return false;
16977
16978 if (!TARGET_AVX512F)
16979 return false;
16980
16981 switch (mode)
16982 {
16983 case E_V16SImode:
16984 gen = gen_avx512f_permvarv16si;
16985 break;
16986 case E_V16SFmode:
16987 gen = gen_avx512f_permvarv16sf;
16988 maskmode = V16SImode;
16989 break;
16990 case E_V8DImode:
16991 gen = gen_avx512f_permvarv8di;
16992 break;
16993 case E_V8DFmode:
16994 gen = gen_avx512f_permvarv8df;
16995 maskmode = V8DImode;
16996 break;
16997 default:
16998 return false;
16999 }
17000
17001 target = d->target;
17002 op0 = d->op0;
17003 for (int i = 0; i < d->nelt; ++i)
17004 vec[i] = GEN_INT (d->perm[i]);
17005 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17006 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17007 return true;
17008 }
17009
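/* An intrinsics-level sketch of the single-insn form targeted above
   (illustrative only, outside the compiler, assuming <immintrin.h>):
   e.g. reversing a V16SImode vector becomes one vpermd with the
   selector forced into a register,

     __m512i idx = _mm512_setr_epi32 (15, 14, 13, 12, 11, 10, 9, 8,
                                      7, 6, 5, 4, 3, 2, 1, 0);
     __m512i rev = _mm512_permutexvar_epi32 (idx, x);  */
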
17010 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17011
17012 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
17013 in a single instruction. */
17014
17015 static bool
17016 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17017 {
17018 unsigned i, nelt = d->nelt;
17019 struct expand_vec_perm_d nd;
17020
17021 /* Check plain VEC_SELECT first, because AVX has instructions that could
17022 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17023 input where SEL+CONCAT may not. */
17024 if (d->one_operand_p)
17025 {
17026 int mask = nelt - 1;
17027 bool identity_perm = true;
17028 bool broadcast_perm = true;
17029
17030 for (i = 0; i < nelt; i++)
17031 {
17032 nd.perm[i] = d->perm[i] & mask;
17033 if (nd.perm[i] != i)
17034 identity_perm = false;
17035 if (nd.perm[i])
17036 broadcast_perm = false;
17037 }
17038
17039 if (identity_perm)
17040 {
17041 if (!d->testing_p)
17042 emit_move_insn (d->target, d->op0);
17043 return true;
17044 }
17045 else if (broadcast_perm && TARGET_AVX2)
17046 {
17047 /* Use vpbroadcast{b,w,d}. */
17048 rtx (*gen) (rtx, rtx) = NULL;
17049 switch (d->vmode)
17050 {
17051 case E_V64QImode:
17052 if (TARGET_AVX512BW)
17053 gen = gen_avx512bw_vec_dupv64qi_1;
17054 break;
17055 case E_V32QImode:
17056 gen = gen_avx2_pbroadcastv32qi_1;
17057 break;
17058 case E_V32HImode:
17059 if (TARGET_AVX512BW)
17060 gen = gen_avx512bw_vec_dupv32hi_1;
17061 break;
17062 case E_V16HImode:
17063 gen = gen_avx2_pbroadcastv16hi_1;
17064 break;
17065 case E_V16SImode:
17066 if (TARGET_AVX512F)
17067 gen = gen_avx512f_vec_dupv16si_1;
17068 break;
17069 case E_V8SImode:
17070 gen = gen_avx2_pbroadcastv8si_1;
17071 break;
17072 case E_V16QImode:
17073 gen = gen_avx2_pbroadcastv16qi;
17074 break;
17075 case E_V8HImode:
17076 gen = gen_avx2_pbroadcastv8hi;
17077 break;
17078 case E_V16SFmode:
17079 if (TARGET_AVX512F)
17080 gen = gen_avx512f_vec_dupv16sf_1;
17081 break;
17082 case E_V8SFmode:
17083 gen = gen_avx2_vec_dupv8sf_1;
17084 break;
17085 case E_V8DFmode:
17086 if (TARGET_AVX512F)
17087 gen = gen_avx512f_vec_dupv8df_1;
17088 break;
17089 case E_V8DImode:
17090 if (TARGET_AVX512F)
17091 gen = gen_avx512f_vec_dupv8di_1;
17092 break;
17093 /* For other modes prefer other shuffles this function creates. */
17094 default: break;
17095 }
17096 if (gen != NULL)
17097 {
17098 if (!d->testing_p)
17099 emit_insn (gen (d->target, d->op0));
17100 return true;
17101 }
17102 }
17103
17104 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17105 return true;
17106
17107 /* There are plenty of patterns in sse.md that are written for
17108 SEL+CONCAT and are not replicated for a single op. Perhaps
17109 that should be changed, to avoid the nastiness here. */
17110
17111 /* Recognize interleave style patterns, which means adding NELT to
17112 every other permutation element. */
17113 for (i = 0; i < nelt; i += 2)
17114 {
17115 nd.perm[i] = d->perm[i] & mask;
17116 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17117 }
17118 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17119 d->testing_p))
17120 return true;
17121
17122 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17123 if (nelt >= 4)
17124 {
17125 for (i = 0; i < nelt; i += 4)
17126 {
17127 nd.perm[i + 0] = d->perm[i + 0] & mask;
17128 nd.perm[i + 1] = d->perm[i + 1] & mask;
17129 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17130 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17131 }
17132
17133 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17134 d->testing_p))
17135 return true;
17136 }
17137 }
17138
17139 /* Try movss/movsd instructions. */
17140 if (expand_vec_perm_movs (d))
17141 return true;
17142
17143 /* Finally, try the fully general two operand permute. */
17144 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17145 d->testing_p))
17146 return true;
17147
17148 /* Recognize interleave style patterns with reversed operands. */
17149 if (!d->one_operand_p)
17150 {
17151 for (i = 0; i < nelt; ++i)
17152 {
17153 unsigned e = d->perm[i];
17154 if (e >= nelt)
17155 e -= nelt;
17156 else
17157 e += nelt;
17158 nd.perm[i] = e;
17159 }
17160
17161 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17162 d->testing_p))
17163 return true;
17164 }
17165
17166 /* Try the SSE4.1 blend variable merge instructions. */
17167 if (expand_vec_perm_blend (d))
17168 return true;
17169
17170 /* Try one of the AVX vpermil variable permutations. */
17171 if (expand_vec_perm_vpermil (d))
17172 return true;
17173
17174 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17175 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17176 if (expand_vec_perm_pshufb (d))
17177 return true;
17178
17179 /* Try the AVX2 vpalignr instruction. */
17180 if (expand_vec_perm_palignr (d, true))
17181 return true;
17182
17183 /* Try the AVX512F vperm{s,d} instructions. */
17184 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17185 return true;
17186
17187 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17188 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17189 return true;
17190
17191 /* See if we can get the same permutation in different vector integer
17192 mode. */
17193 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17194 {
17195 if (!d->testing_p)
17196 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17197 return true;
17198 }
17199 return false;
17200 }
17201
17202 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
17203 in terms of a pair of pshuflw + pshufhw instructions. */
17204
17205 static bool
17206 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17207 {
17208 unsigned char perm2[MAX_VECT_LEN];
17209 unsigned i;
17210 bool ok;
17211
17212 if (d->vmode != V8HImode || !d->one_operand_p)
17213 return false;
17214
17215 /* The two permutations only operate in 64-bit lanes. */
17216 for (i = 0; i < 4; ++i)
17217 if (d->perm[i] >= 4)
17218 return false;
17219 for (i = 4; i < 8; ++i)
17220 if (d->perm[i] < 4)
17221 return false;
17222
17223 if (d->testing_p)
17224 return true;
17225
17226 /* Emit the pshuflw. */
17227 memcpy (perm2, d->perm, 4);
17228 for (i = 4; i < 8; ++i)
17229 perm2[i] = i;
17230 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17231 gcc_assert (ok);
17232
17233 /* Emit the pshufhw. */
17234 memcpy (perm2 + 4, d->perm + 4, 4);
17235 for (i = 0; i < 4; ++i)
17236 perm2[i] = i;
17237 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17238 gcc_assert (ok);
17239
17240 return true;
17241 }
17242
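/* An intrinsics-level sketch of the pair emitted above (illustrative
   only, outside the compiler): for the V8HImode permutation
   { 2, 1, 0, 3, 5, 4, 7, 6 } the two insns are

     __m128i t = _mm_shufflelo_epi16 (x, _MM_SHUFFLE (3, 0, 1, 2));
     __m128i r = _mm_shufflehi_epi16 (t, _MM_SHUFFLE (2, 3, 0, 1));

   where the first insn fixes the low quadword and the second the
   high quadword.  */
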
17243 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17244 the permutation using the SSSE3 palignr instruction. This succeeds
17245 when all of the elements in PERM fit within one vector and we merely
17246 need to shift them down so that a single vector permutation has a
17247 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17248 the vpalignr instruction itself can perform the requested permutation. */
17249
17250 static bool
17251 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17252 {
17253 unsigned i, nelt = d->nelt;
17254 unsigned min, max, minswap, maxswap;
17255 bool in_order, ok, swap = false;
17256 rtx shift, target;
17257 struct expand_vec_perm_d dcopy;
17258
17259 /* Even with AVX, palignr only operates on 128-bit vectors;
17260 with AVX2, palignr operates on both 128-bit lanes. */
17261 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17262 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17263 return false;
17264
17265 min = 2 * nelt;
17266 max = 0;
17267 minswap = 2 * nelt;
17268 maxswap = 0;
17269 for (i = 0; i < nelt; ++i)
17270 {
17271 unsigned e = d->perm[i];
17272 unsigned eswap = d->perm[i] ^ nelt;
17273 if (GET_MODE_SIZE (d->vmode) == 32)
17274 {
17275 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17276 eswap = e ^ (nelt / 2);
17277 }
17278 if (e < min)
17279 min = e;
17280 if (e > max)
17281 max = e;
17282 if (eswap < minswap)
17283 minswap = eswap;
17284 if (eswap > maxswap)
17285 maxswap = eswap;
17286 }
17287 if (min == 0
17288 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17289 {
17290 if (d->one_operand_p
17291 || minswap == 0
17292 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17293 ? nelt / 2 : nelt))
17294 return false;
17295 swap = true;
17296 min = minswap;
17297 max = maxswap;
17298 }
17299
17300 /* Given that we have SSSE3, we know we'll be able to implement the
17301 single operand permutation after the palignr with pshufb for
17302 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17303 first. */
17304 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17305 return true;
17306
17307 dcopy = *d;
17308 if (swap)
17309 {
17310 dcopy.op0 = d->op1;
17311 dcopy.op1 = d->op0;
17312 for (i = 0; i < nelt; ++i)
17313 dcopy.perm[i] ^= nelt;
17314 }
17315
17316 in_order = true;
17317 for (i = 0; i < nelt; ++i)
17318 {
17319 unsigned e = dcopy.perm[i];
17320 if (GET_MODE_SIZE (d->vmode) == 32
17321 && e >= nelt
17322 && (e & (nelt / 2 - 1)) < min)
17323 e = e - min - (nelt / 2);
17324 else
17325 e = e - min;
17326 if (e != i)
17327 in_order = false;
17328 dcopy.perm[i] = e;
17329 }
17330 dcopy.one_operand_p = true;
17331
17332 if (single_insn_only_p && !in_order)
17333 return false;
17334
17335 /* For AVX2, test whether we can permute the result in one instruction. */
17336 if (d->testing_p)
17337 {
17338 if (in_order)
17339 return true;
17340 dcopy.op1 = dcopy.op0;
17341 return expand_vec_perm_1 (&dcopy);
17342 }
17343
17344 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17345 if (GET_MODE_SIZE (d->vmode) == 16)
17346 {
17347 target = gen_reg_rtx (TImode);
17348 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17349 gen_lowpart (TImode, dcopy.op0), shift));
17350 }
17351 else
17352 {
17353 target = gen_reg_rtx (V2TImode);
17354 emit_insn (gen_avx2_palignrv2ti (target,
17355 gen_lowpart (V2TImode, dcopy.op1),
17356 gen_lowpart (V2TImode, dcopy.op0),
17357 shift));
17358 }
17359
17360 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17361
17362 /* Test for the degenerate case where the alignment by itself
17363 produces the desired permutation. */
17364 if (in_order)
17365 {
17366 emit_move_insn (d->target, dcopy.op0);
17367 return true;
17368 }
17369
17370 ok = expand_vec_perm_1 (&dcopy);
17371 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17372
17373 return ok;
17374 }
17375
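/* An intrinsics-level sketch of the palignr step above (illustrative
   only, outside the compiler): for the two-operand V4SImode
   permutation { 3, 4, 5, 6 }, min is 3, so the emitted rotation is
   equivalent to

     __m128i r = _mm_alignr_epi8 (op1, op0, 12);

   which yields elements 3 4 5 6 already in order, so no trailing
   one-operand shuffle is needed in this particular case.  */
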
17376 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17377 the permutation using the SSE4_1 pblendv instruction. Potentially
17378 reduces the permutation from 2 pshufb plus an ior to 1 pshufb plus a pblendv. */
17379
17380 static bool
17381 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17382 {
17383 unsigned i, which, nelt = d->nelt;
17384 struct expand_vec_perm_d dcopy, dcopy1;
17385 machine_mode vmode = d->vmode;
17386 bool ok;
17387
17388 /* Use the same checks as in expand_vec_perm_blend. */
17389 if (d->one_operand_p)
17390 return false;
17391 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17392 ;
17393 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17394 ;
17395 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17396 ;
17397 else
17398 return false;
17399
17400 /* Figure out which permutation elements do not stay in their
17401 respective lanes. */
17402 for (i = 0, which = 0; i < nelt; ++i)
17403 {
17404 unsigned e = d->perm[i];
17405 if (e != i)
17406 which |= (e < nelt ? 1 : 2);
17407 }
17408 /* We can pblend the part where elements do not stay in their
17409 respective lanes only when these elements all come from one
17410 half of the permutation.
17411 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not in their respective
17412 lanes, but both 8 and 9 are >= 8.
17413 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not in their
17414 respective lanes and 8 >= 8, but 2 is not. */
17415 if (which != 1 && which != 2)
17416 return false;
17417 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17418 return true;
17419
17420 /* First we apply a one-operand permutation to the part whose
17421 elements do not stay in their respective lanes. */
17422 dcopy = *d;
17423 if (which == 2)
17424 dcopy.op0 = dcopy.op1 = d->op1;
17425 else
17426 dcopy.op0 = dcopy.op1 = d->op0;
17427 if (!d->testing_p)
17428 dcopy.target = gen_reg_rtx (vmode);
17429 dcopy.one_operand_p = true;
17430
17431 for (i = 0; i < nelt; ++i)
17432 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17433
17434 ok = expand_vec_perm_1 (&dcopy);
17435 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17436 return false;
17437 else
17438 gcc_assert (ok);
17439 if (d->testing_p)
17440 return true;
17441
17442 /* Next we put permuted elements into their positions. */
17443 dcopy1 = *d;
17444 if (which == 2)
17445 dcopy1.op1 = dcopy.target;
17446 else
17447 dcopy1.op0 = dcopy.target;
17448
17449 for (i = 0; i < nelt; ++i)
17450 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17451
17452 ok = expand_vec_perm_blend (&dcopy1);
17453 gcc_assert (ok);
17454
17455 return true;
17456 }
17457
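/* An intrinsics-level sketch of the two steps above (illustrative
   only, outside the compiler): for the V8HImode permutation
   { 0 1 8 3 4 5 9 7 } only positions 2 and 6 take elements from the
   second operand, so with SHUF standing for a byte selector that
   moves b[0] to lane 2 and b[1] to lane 6 this becomes

     __m128i t = _mm_shuffle_epi8 (b, SHUF);
     __m128i r = _mm_blend_epi16 (a, t, 0x44);

   where SHUF is a stand-in for the constant the first
   expand_vec_perm_1 call materializes.  */
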
17458 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17459
17460 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17461 a two vector permutation into a single vector permutation by using
17462 an interleave operation to merge the vectors. */
17463
17464 static bool
17465 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17466 {
17467 struct expand_vec_perm_d dremap, dfinal;
17468 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17469 unsigned HOST_WIDE_INT contents;
17470 unsigned char remap[2 * MAX_VECT_LEN];
17471 rtx_insn *seq;
17472 bool ok, same_halves = false;
17473
17474 if (GET_MODE_SIZE (d->vmode) == 16)
17475 {
17476 if (d->one_operand_p)
17477 return false;
17478 }
17479 else if (GET_MODE_SIZE (d->vmode) == 32)
17480 {
17481 if (!TARGET_AVX)
17482 return false;
17483 /* For 32-byte modes allow even d->one_operand_p.
17484 The lack of cross-lane shuffling in some instructions
17485 might prevent a single insn shuffle. */
17486 dfinal = *d;
17487 dfinal.testing_p = true;
17488 /* If expand_vec_perm_interleave3 can expand this into
17489 a 3 insn sequence, give up and let it be expanded as
17490 a 3 insn sequence. While that is one insn longer,
17491 it doesn't need a memory operand, and in the common
17492 case where both the interleave low and interleave high
17493 permutations with the same operands are adjacent, the
17494 pair needs only 4 insns after CSE. */
17495 if (expand_vec_perm_interleave3 (&dfinal))
17496 return false;
17497 }
17498 else
17499 return false;
17500
17501 /* Examine from whence the elements come. */
17502 contents = 0;
17503 for (i = 0; i < nelt; ++i)
17504 contents |= HOST_WIDE_INT_1U << d->perm[i];
17505
17506 memset (remap, 0xff, sizeof (remap));
17507 dremap = *d;
17508
17509 if (GET_MODE_SIZE (d->vmode) == 16)
17510 {
17511 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17512
17513 /* Split the two input vectors into 4 halves. */
17514 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17515 h2 = h1 << nelt2;
17516 h3 = h2 << nelt2;
17517 h4 = h3 << nelt2;
17518
17519 /* If all the elements come from the low halves, use interleave low,
17520 and similarly interleave high for the high halves. If the elements are
17521 from mis-matched halves, use shufps for V4SF/V4SI or a DImode shuffle. */
17522 if ((contents & (h1 | h3)) == contents)
17523 {
17524 /* punpckl* */
17525 for (i = 0; i < nelt2; ++i)
17526 {
17527 remap[i] = i * 2;
17528 remap[i + nelt] = i * 2 + 1;
17529 dremap.perm[i * 2] = i;
17530 dremap.perm[i * 2 + 1] = i + nelt;
17531 }
17532 if (!TARGET_SSE2 && d->vmode == V4SImode)
17533 dremap.vmode = V4SFmode;
17534 }
17535 else if ((contents & (h2 | h4)) == contents)
17536 {
17537 /* punpckh* */
17538 for (i = 0; i < nelt2; ++i)
17539 {
17540 remap[i + nelt2] = i * 2;
17541 remap[i + nelt + nelt2] = i * 2 + 1;
17542 dremap.perm[i * 2] = i + nelt2;
17543 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17544 }
17545 if (!TARGET_SSE2 && d->vmode == V4SImode)
17546 dremap.vmode = V4SFmode;
17547 }
17548 else if ((contents & (h1 | h4)) == contents)
17549 {
17550 /* shufps */
17551 for (i = 0; i < nelt2; ++i)
17552 {
17553 remap[i] = i;
17554 remap[i + nelt + nelt2] = i + nelt2;
17555 dremap.perm[i] = i;
17556 dremap.perm[i + nelt2] = i + nelt + nelt2;
17557 }
17558 if (nelt != 4)
17559 {
17560 /* shufpd */
17561 dremap.vmode = V2DImode;
17562 dremap.nelt = 2;
17563 dremap.perm[0] = 0;
17564 dremap.perm[1] = 3;
17565 }
17566 }
17567 else if ((contents & (h2 | h3)) == contents)
17568 {
17569 /* shufps */
17570 for (i = 0; i < nelt2; ++i)
17571 {
17572 remap[i + nelt2] = i;
17573 remap[i + nelt] = i + nelt2;
17574 dremap.perm[i] = i + nelt2;
17575 dremap.perm[i + nelt2] = i + nelt;
17576 }
17577 if (nelt != 4)
17578 {
17579 /* shufpd */
17580 dremap.vmode = V2DImode;
17581 dremap.nelt = 2;
17582 dremap.perm[0] = 1;
17583 dremap.perm[1] = 2;
17584 }
17585 }
17586 else
17587 return false;
17588 }
17589 else
17590 {
17591 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17592 unsigned HOST_WIDE_INT q[8];
17593 unsigned int nonzero_halves[4];
17594
17595 /* Split the two input vectors into 8 quarters. */
17596 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17597 for (i = 1; i < 8; ++i)
17598 q[i] = q[0] << (nelt4 * i);
17599 for (i = 0; i < 4; ++i)
17600 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17601 {
17602 nonzero_halves[nzcnt] = i;
17603 ++nzcnt;
17604 }
17605
17606 if (nzcnt == 1)
17607 {
17608 gcc_assert (d->one_operand_p);
17609 nonzero_halves[1] = nonzero_halves[0];
17610 same_halves = true;
17611 }
17612 else if (d->one_operand_p)
17613 {
17614 gcc_assert (nonzero_halves[0] == 0);
17615 gcc_assert (nonzero_halves[1] == 1);
17616 }
17617
17618 if (nzcnt <= 2)
17619 {
17620 if (d->perm[0] / nelt2 == nonzero_halves[1])
17621 {
17622 /* Attempt to increase the likelihood that dfinal
17623 shuffle will be intra-lane. */
17624 std::swap (nonzero_halves[0], nonzero_halves[1]);
17625 }
17626
17627 /* vperm2f128 or vperm2i128. */
17628 for (i = 0; i < nelt2; ++i)
17629 {
17630 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17631 remap[i + nonzero_halves[0] * nelt2] = i;
17632 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17633 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17634 }
17635
17636 if (d->vmode != V8SFmode
17637 && d->vmode != V4DFmode
17638 && d->vmode != V8SImode)
17639 {
17640 dremap.vmode = V8SImode;
17641 dremap.nelt = 8;
17642 for (i = 0; i < 4; ++i)
17643 {
17644 dremap.perm[i] = i + nonzero_halves[0] * 4;
17645 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17646 }
17647 }
17648 }
17649 else if (d->one_operand_p)
17650 return false;
17651 else if (TARGET_AVX2
17652 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17653 {
17654 /* vpunpckl* */
17655 for (i = 0; i < nelt4; ++i)
17656 {
17657 remap[i] = i * 2;
17658 remap[i + nelt] = i * 2 + 1;
17659 remap[i + nelt2] = i * 2 + nelt2;
17660 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17661 dremap.perm[i * 2] = i;
17662 dremap.perm[i * 2 + 1] = i + nelt;
17663 dremap.perm[i * 2 + nelt2] = i + nelt2;
17664 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17665 }
17666 }
17667 else if (TARGET_AVX2
17668 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17669 {
17670 /* vpunpckh* */
17671 for (i = 0; i < nelt4; ++i)
17672 {
17673 remap[i + nelt4] = i * 2;
17674 remap[i + nelt + nelt4] = i * 2 + 1;
17675 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17676 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17677 dremap.perm[i * 2] = i + nelt4;
17678 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17679 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17680 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17681 }
17682 }
17683 else
17684 return false;
17685 }
17686
17687 /* Use the remapping array set up above to move the elements from their
17688 swizzled locations into their final destinations. */
17689 dfinal = *d;
17690 for (i = 0; i < nelt; ++i)
17691 {
17692 unsigned e = remap[d->perm[i]];
17693 gcc_assert (e < nelt);
17694 /* If same_halves is true, both halves of the remapped vector are the
17695 same. Avoid cross-lane accesses if possible. */
17696 if (same_halves && i >= nelt2)
17697 {
17698 gcc_assert (e < nelt2);
17699 dfinal.perm[i] = e + nelt2;
17700 }
17701 else
17702 dfinal.perm[i] = e;
17703 }
17704 if (!d->testing_p)
17705 {
17706 dremap.target = gen_reg_rtx (dremap.vmode);
17707 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17708 }
17709 dfinal.op1 = dfinal.op0;
17710 dfinal.one_operand_p = true;
17711
17712 /* Test if the final remap can be done with a single insn. For V4SFmode or
17713 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17714 start_sequence ();
17715 ok = expand_vec_perm_1 (&dfinal);
17716 seq = get_insns ();
17717 end_sequence ();
17718
17719 if (!ok)
17720 return false;
17721
17722 if (d->testing_p)
17723 return true;
17724
17725 if (dremap.vmode != dfinal.vmode)
17726 {
17727 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17728 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17729 }
17730
17731 ok = expand_vec_perm_1 (&dremap);
17732 gcc_assert (ok);
17733
17734 emit_insn (seq);
17735 return true;
17736 }
17737
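/* A worked example (illustrative only): for the two-operand V4SImode
   permutation { 1, 5, 0, 4 } all elements come from the low halves,
   so dremap becomes the interleave-low permutation { 0, 4, 1, 5 }
   (one punpckldq) and, through the remap table, dfinal becomes the
   one-operand permutation { 2, 3, 0, 1 } (one pshufd).  */
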
17738 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17739 a single vector cross-lane permutation into vpermq followed
17740 by any of the single insn permutations. */
17741
17742 static bool
17743 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17744 {
17745 struct expand_vec_perm_d dremap, dfinal;
17746 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17747 unsigned contents[2];
17748 bool ok;
17749
17750 if (!(TARGET_AVX2
17751 && (d->vmode == V32QImode || d->vmode == V16HImode)
17752 && d->one_operand_p))
17753 return false;
17754
17755 contents[0] = 0;
17756 contents[1] = 0;
17757 for (i = 0; i < nelt2; ++i)
17758 {
17759 contents[0] |= 1u << (d->perm[i] / nelt4);
17760 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17761 }
17762
17763 for (i = 0; i < 2; ++i)
17764 {
17765 unsigned int cnt = 0;
17766 for (j = 0; j < 4; ++j)
17767 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17768 return false;
17769 }
17770
17771 if (d->testing_p)
17772 return true;
17773
17774 dremap = *d;
17775 dremap.vmode = V4DImode;
17776 dremap.nelt = 4;
17777 dremap.target = gen_reg_rtx (V4DImode);
17778 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17779 dremap.op1 = dremap.op0;
17780 dremap.one_operand_p = true;
17781 for (i = 0; i < 2; ++i)
17782 {
17783 unsigned int cnt = 0;
17784 for (j = 0; j < 4; ++j)
17785 if ((contents[i] & (1u << j)) != 0)
17786 dremap.perm[2 * i + cnt++] = j;
17787 for (; cnt < 2; ++cnt)
17788 dremap.perm[2 * i + cnt] = 0;
17789 }
17790
17791 dfinal = *d;
17792 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17793 dfinal.op1 = dfinal.op0;
17794 dfinal.one_operand_p = true;
17795 for (i = 0, j = 0; i < nelt; ++i)
17796 {
17797 if (i == nelt2)
17798 j = 2;
17799 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17800 if ((d->perm[i] / nelt4) == dremap.perm[j])
17801 ;
17802 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17803 dfinal.perm[i] |= nelt4;
17804 else
17805 gcc_unreachable ();
17806 }
17807
17808 ok = expand_vec_perm_1 (&dremap);
17809 gcc_assert (ok);
17810
17811 ok = expand_vec_perm_1 (&dfinal);
17812 gcc_assert (ok);
17813
17814 return true;
17815 }
17816
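/* An intrinsics-level sketch of the two steps above (illustrative
   only, outside the compiler): dremap gathers the (at most two per
   half) 64-bit quarters each half needs and dfinal finishes within
   the 128-bit lanes, roughly

     __m256i q = _mm256_permute4x64_epi64 (x, QIMM);
     __m256i r = _mm256_shuffle_epi8 (q, INLANE);

   where QIMM and INLANE stand for the constants computed above.  */
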
17817 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17818
17819 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
17820 a vector permutation using two instructions, vperm2f128 resp.
17821 vperm2i128 followed by any single in-lane permutation. */
17822
17823 static bool
17824 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17825 {
17826 struct expand_vec_perm_d dfirst, dsecond;
17827 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17828 bool ok;
17829
17830 if (!TARGET_AVX
17831 || GET_MODE_SIZE (d->vmode) != 32
17832 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17833 return false;
17834
17835 dsecond = *d;
17836 dsecond.one_operand_p = false;
17837 dsecond.testing_p = true;
17838
17839 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17840 immediate. For perm < 16 the second permutation uses
17841 d->op0 as first operand, for perm >= 16 it uses d->op1
17842 as first operand. The second operand is the result of
17843 vperm2[fi]128. */
17844 for (perm = 0; perm < 32; perm++)
17845 {
17846 /* Ignore permutations which do not move anything cross-lane. */
17847 if (perm < 16)
17848 {
17849 /* The second shuffle for e.g. V4DFmode has
17850 0123 and ABCD operands.
17851 Ignore AB23, as 23 is already in the second lane
17852 of the first operand. */
17853 if ((perm & 0xc) == (1 << 2)) continue;
17854 /* And 01CD, as 01 is in the first lane of the first
17855 operand. */
17856 if ((perm & 3) == 0) continue;
17857 /* And 4567, as then the vperm2[fi]128 doesn't change
17858 anything on the original 4567 second operand. */
17859 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17860 }
17861 else
17862 {
17863 /* The second shuffle for e.g. V4DFmode has
17864 4567 and ABCD operands.
17865 Ignore AB67, as 67 is already in the second lane
17866 of the first operand. */
17867 if ((perm & 0xc) == (3 << 2)) continue;
17868 /* And 45CD, as 45 is in the first lane of the first
17869 operand. */
17870 if ((perm & 3) == 2) continue;
17871 /* And 0123, as then the vperm2[fi]128 doesn't change
17872 anything on the original 0123 first operand. */
17873 if ((perm & 0xf) == (1 << 2)) continue;
17874 }
17875
17876 for (i = 0; i < nelt; i++)
17877 {
17878 j = d->perm[i] / nelt2;
17879 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17880 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17881 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17882 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17883 else
17884 break;
17885 }
17886
17887 if (i == nelt)
17888 {
17889 start_sequence ();
17890 ok = expand_vec_perm_1 (&dsecond);
17891 end_sequence ();
17892 }
17893 else
17894 ok = false;
17895
17896 if (ok)
17897 {
17898 if (d->testing_p)
17899 return true;
17900
17901 /* Found a usable second shuffle. dfirst will be
17902 vperm2f128 on d->op0 and d->op1. */
17903 dsecond.testing_p = false;
17904 dfirst = *d;
17905 dfirst.target = gen_reg_rtx (d->vmode);
17906 for (i = 0; i < nelt; i++)
17907 dfirst.perm[i] = (i & (nelt2 - 1))
17908 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17909
17910 canonicalize_perm (&dfirst);
17911 ok = expand_vec_perm_1 (&dfirst);
17912 gcc_assert (ok);
17913
17914 /* And dsecond is some single insn shuffle, taking
17915 d->op0 and result of vperm2f128 (if perm < 16) or
17916 d->op1 and result of vperm2f128 (otherwise). */
17917 if (perm >= 16)
17918 dsecond.op0 = dsecond.op1;
17919 dsecond.op1 = dfirst.target;
17920
17921 ok = expand_vec_perm_1 (&dsecond);
17922 gcc_assert (ok);
17923
17924 return true;
17925 }
17926
17927 /* For one operand, the only useful vperm2f128 permutation is 0x01
17928 aka lanes swap. */
17929 if (d->one_operand_p)
17930 return false;
17931 }
17932
17933 return false;
17934 }
17935
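/* An intrinsics-level sketch of one accepted case (illustrative only,
   outside the compiler): with immediate 0x21 the first insn forms
   { a_hi, b_lo } for V8SFmode,

     __m256 t = _mm256_permute2f128_ps (a, b, 0x21);

   after which any in-lane two-source shuffle of a (or b) and t that
   expand_vec_perm_1 accepts completes the permutation.  */
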
17936 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17937 a two vector permutation using 2 intra-lane interleave insns
17938 and cross-lane shuffle for 32-byte vectors. */
17939
17940 static bool
17941 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17942 {
17943 unsigned i, nelt;
17944 rtx (*gen) (rtx, rtx, rtx);
17945
17946 if (d->one_operand_p)
17947 return false;
17948 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17949 ;
17950 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17951 ;
17952 else
17953 return false;
17954
17955 nelt = d->nelt;
17956 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17957 return false;
17958 for (i = 0; i < nelt; i += 2)
17959 if (d->perm[i] != d->perm[0] + i / 2
17960 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17961 return false;
17962
17963 if (d->testing_p)
17964 return true;
17965
17966 switch (d->vmode)
17967 {
17968 case E_V32QImode:
17969 if (d->perm[0])
17970 gen = gen_vec_interleave_highv32qi;
17971 else
17972 gen = gen_vec_interleave_lowv32qi;
17973 break;
17974 case E_V16HImode:
17975 if (d->perm[0])
17976 gen = gen_vec_interleave_highv16hi;
17977 else
17978 gen = gen_vec_interleave_lowv16hi;
17979 break;
17980 case E_V8SImode:
17981 if (d->perm[0])
17982 gen = gen_vec_interleave_highv8si;
17983 else
17984 gen = gen_vec_interleave_lowv8si;
17985 break;
17986 case E_V4DImode:
17987 if (d->perm[0])
17988 gen = gen_vec_interleave_highv4di;
17989 else
17990 gen = gen_vec_interleave_lowv4di;
17991 break;
17992 case E_V8SFmode:
17993 if (d->perm[0])
17994 gen = gen_vec_interleave_highv8sf;
17995 else
17996 gen = gen_vec_interleave_lowv8sf;
17997 break;
17998 case E_V4DFmode:
17999 if (d->perm[0])
18000 gen = gen_vec_interleave_highv4df;
18001 else
18002 gen = gen_vec_interleave_lowv4df;
18003 break;
18004 default:
18005 gcc_unreachable ();
18006 }
18007
18008 emit_insn (gen (d->target, d->op0, d->op1));
18009 return true;
18010 }
18011
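/* An intrinsics-level sketch for V8SImode (illustrative only, outside
   the compiler): the interleave-low permutation
   { 0, 8, 1, 9, 2, 10, 3, 11 } expands to roughly

     __m256i lo = _mm256_unpacklo_epi32 (a, b);
     __m256i hi = _mm256_unpackhi_epi32 (a, b);
     __m256i r  = _mm256_permute2x128_si256 (lo, hi, 0x20);

   i.e. two intra-lane interleaves plus the cross-lane shuffle named
   in the comment above.  */
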
18012 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
18013 a single vector permutation using a single intra-lane vector
18014 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18015 the non-swapped and swapped vectors together. */
18016
18017 static bool
18018 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18019 {
18020 struct expand_vec_perm_d dfirst, dsecond;
18021 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18022 rtx_insn *seq;
18023 bool ok;
18024 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18025
18026 if (!TARGET_AVX
18027 || TARGET_AVX2
18028 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18029 || !d->one_operand_p)
18030 return false;
18031
18032 dfirst = *d;
18033 for (i = 0; i < nelt; i++)
18034 dfirst.perm[i] = 0xff;
18035 for (i = 0, msk = 0; i < nelt; i++)
18036 {
18037 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18038 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18039 return false;
18040 dfirst.perm[j] = d->perm[i];
18041 if (j != i)
18042 msk |= (1 << i);
18043 }
18044 for (i = 0; i < nelt; i++)
18045 if (dfirst.perm[i] == 0xff)
18046 dfirst.perm[i] = i;
18047
18048 if (!d->testing_p)
18049 dfirst.target = gen_reg_rtx (dfirst.vmode);
18050
18051 start_sequence ();
18052 ok = expand_vec_perm_1 (&dfirst);
18053 seq = get_insns ();
18054 end_sequence ();
18055
18056 if (!ok)
18057 return false;
18058
18059 if (d->testing_p)
18060 return true;
18061
18062 emit_insn (seq);
18063
18064 dsecond = *d;
18065 dsecond.op0 = dfirst.target;
18066 dsecond.op1 = dfirst.target;
18067 dsecond.one_operand_p = true;
18068 dsecond.target = gen_reg_rtx (dsecond.vmode);
18069 for (i = 0; i < nelt; i++)
18070 dsecond.perm[i] = i ^ nelt2;
18071
18072 ok = expand_vec_perm_1 (&dsecond);
18073 gcc_assert (ok);
18074
18075 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18076 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18077 return true;
18078 }
18079
18080 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
18081 permutation using two vperm2f128, followed by a vshufpd insn blending
18082 the two vectors together. */
18083
18084 static bool
18085 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18086 {
18087 struct expand_vec_perm_d dfirst, dsecond, dthird;
18088 bool ok;
18089
18090 if (!TARGET_AVX || (d->vmode != V4DFmode))
18091 return false;
18092
18093 if (d->testing_p)
18094 return true;
18095
18096 dfirst = *d;
18097 dsecond = *d;
18098 dthird = *d;
18099
18100 dfirst.perm[0] = (d->perm[0] & ~1);
18101 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18102 dfirst.perm[2] = (d->perm[2] & ~1);
18103 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18104 dsecond.perm[0] = (d->perm[1] & ~1);
18105 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18106 dsecond.perm[2] = (d->perm[3] & ~1);
18107 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18108 dthird.perm[0] = (d->perm[0] % 2);
18109 dthird.perm[1] = (d->perm[1] % 2) + 4;
18110 dthird.perm[2] = (d->perm[2] % 2) + 2;
18111 dthird.perm[3] = (d->perm[3] % 2) + 6;
18112
18113 dfirst.target = gen_reg_rtx (dfirst.vmode);
18114 dsecond.target = gen_reg_rtx (dsecond.vmode);
18115 dthird.op0 = dfirst.target;
18116 dthird.op1 = dsecond.target;
18117 dthird.one_operand_p = false;
18118
18119 canonicalize_perm (&dfirst);
18120 canonicalize_perm (&dsecond);
18121
18122 ok = expand_vec_perm_1 (&dfirst)
18123 && expand_vec_perm_1 (&dsecond)
18124 && expand_vec_perm_1 (&dthird);
18125
18126 gcc_assert (ok);
18127
18128 return true;
18129 }
18130
18131 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18132 permutation with two pshufb insns and an ior. We should have already
18133 failed all two instruction sequences. */
18134
18135 static bool
18136 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18137 {
18138 rtx rperm[2][16], vperm, l, h, op, m128;
18139 unsigned int i, nelt, eltsz;
18140
18141 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18142 return false;
18143 gcc_assert (!d->one_operand_p);
18144
18145 if (d->testing_p)
18146 return true;
18147
18148 nelt = d->nelt;
18149 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18150
18151 /* Generate two permutation masks. If the required element is within
18152 the given vector it is shuffled into the proper lane. If the required
18153 element is in the other vector, force a zero into the lane by setting
18154 bit 7 in the permutation mask. */
18155 m128 = GEN_INT (-128);
18156 for (i = 0; i < nelt; ++i)
18157 {
18158 unsigned j, e = d->perm[i];
18159 unsigned which = (e >= nelt);
18160 if (e >= nelt)
18161 e -= nelt;
18162
18163 for (j = 0; j < eltsz; ++j)
18164 {
18165 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18166 rperm[1-which][i*eltsz + j] = m128;
18167 }
18168 }
18169
18170 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18171 vperm = force_reg (V16QImode, vperm);
18172
18173 l = gen_reg_rtx (V16QImode);
18174 op = gen_lowpart (V16QImode, d->op0);
18175 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18176
18177 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18178 vperm = force_reg (V16QImode, vperm);
18179
18180 h = gen_reg_rtx (V16QImode);
18181 op = gen_lowpart (V16QImode, d->op1);
18182 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18183
18184 op = d->target;
18185 if (d->vmode != V16QImode)
18186 op = gen_reg_rtx (V16QImode);
18187 emit_insn (gen_iorv16qi3 (op, l, h));
18188 if (op != d->target)
18189 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18190
18191 return true;
18192 }
18193
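/* An intrinsics-level sketch of the sequence above (illustrative only,
   outside the compiler):

     __m128i lo = _mm_shuffle_epi8 (a, MASK_A);
     __m128i hi = _mm_shuffle_epi8 (b, MASK_B);
     __m128i r  = _mm_or_si128 (lo, hi);

   MASK_A and MASK_B stand for the two constants built above; a byte
   with bit 7 set makes pshufb produce zero in that lane, so the two
   partial results can simply be or-ed together.  */
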
18194 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
18195 with two vpshufb insns, vpermq and vpor. We should have already failed
18196 all two or three instruction sequences. */
18197
18198 static bool
18199 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18200 {
18201 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18202 unsigned int i, nelt, eltsz;
18203
18204 if (!TARGET_AVX2
18205 || !d->one_operand_p
18206 || (d->vmode != V32QImode && d->vmode != V16HImode))
18207 return false;
18208
18209 if (d->testing_p)
18210 return true;
18211
18212 nelt = d->nelt;
18213 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18214
18215 /* Generate two permutation masks. If the required element is within
18216 the same lane, it is shuffled in. If the required element is from the
18217 other lane, force a zero by setting bit 7 in the permutation mask.
18218 The other mask has non-negative elements where an element is
18219 requested from the other lane but is also moved to the other lane,
18220 so that the result of vpshufb can have the two V2TImode halves
18221 swapped. */
18222 m128 = GEN_INT (-128);
18223 for (i = 0; i < nelt; ++i)
18224 {
18225 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18226 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18227
18228 for (j = 0; j < eltsz; ++j)
18229 {
18230 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18231 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18232 }
18233 }
18234
18235 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18236 vperm = force_reg (V32QImode, vperm);
18237
18238 h = gen_reg_rtx (V32QImode);
18239 op = gen_lowpart (V32QImode, d->op0);
18240 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18241
18242 /* Swap the 128-bit lanes of h into hp. */
18243 hp = gen_reg_rtx (V4DImode);
18244 op = gen_lowpart (V4DImode, h);
18245 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18246 const1_rtx));
18247
18248 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18249 vperm = force_reg (V32QImode, vperm);
18250
18251 l = gen_reg_rtx (V32QImode);
18252 op = gen_lowpart (V32QImode, d->op0);
18253 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18254
18255 op = d->target;
18256 if (d->vmode != V32QImode)
18257 op = gen_reg_rtx (V32QImode);
18258 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18259 if (op != d->target)
18260 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18261
18262 return true;
18263 }
18264
18265 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18266 and extract-odd permutations of two V32QImode or V16HImode operands
18267 with two vpshufb insns, vpor and vpermq. We should have already
18268 failed all two or three instruction sequences. */
18269
18270 static bool
18271 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18272 {
18273 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18274 unsigned int i, nelt, eltsz;
18275
18276 if (!TARGET_AVX2
18277 || d->one_operand_p
18278 || (d->vmode != V32QImode && d->vmode != V16HImode))
18279 return false;
18280
18281 for (i = 0; i < d->nelt; ++i)
18282 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18283 return false;
18284
18285 if (d->testing_p)
18286 return true;
18287
18288 nelt = d->nelt;
18289 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18290
18291 /* Generate two permutation masks. In the first permutation mask
18292 the first quarter will contain indexes for the first half
18293 of the op0, the second quarter will contain bit 7 set, third quarter
18294 will contain indexes for the second half of the op0 and the
18295 last quarter bit 7 set. In the second permutation mask
18296 the first quarter will contain bit 7 set, the second quarter
18297 indexes for the first half of the op1, the third quarter bit 7 set
18298 and last quarter indexes for the second half of the op1.
18299 I.e. the first mask e.g. for V32QImode extract even will be:
18300 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18301 (all values masked with 0xf except for -128) and second mask
18302 for extract even will be
18303 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18304 m128 = GEN_INT (-128);
18305 for (i = 0; i < nelt; ++i)
18306 {
18307 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18308 unsigned which = d->perm[i] >= nelt;
18309 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18310
18311 for (j = 0; j < eltsz; ++j)
18312 {
18313 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18314 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18315 }
18316 }
18317
18318 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18319 vperm = force_reg (V32QImode, vperm);
18320
18321 l = gen_reg_rtx (V32QImode);
18322 op = gen_lowpart (V32QImode, d->op0);
18323 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18324
18325 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18326 vperm = force_reg (V32QImode, vperm);
18327
18328 h = gen_reg_rtx (V32QImode);
18329 op = gen_lowpart (V32QImode, d->op1);
18330 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18331
18332 ior = gen_reg_rtx (V32QImode);
18333 emit_insn (gen_iorv32qi3 (ior, l, h));
18334
18335 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18336 op = gen_reg_rtx (V4DImode);
18337 ior = gen_lowpart (V4DImode, ior);
18338 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18339 const1_rtx, GEN_INT (3)));
18340 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18341
18342 return true;
18343 }
18344
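/* An intrinsics-level sketch of the V32QImode extract-even case
   (illustrative only, outside the compiler):

     __m256i lo = _mm256_shuffle_epi8 (a, MASK0);
     __m256i hi = _mm256_shuffle_epi8 (b, MASK1);
     __m256i t  = _mm256_or_si256 (lo, hi);
     __m256i r  = _mm256_permute4x64_epi64 (t, 0xd8);

   MASK0 and MASK1 stand for the two constants described above, and
   0xd8 is the { 0, 2, 1, 3 } quarter permutation.  */
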
18345 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18346 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18347 with two "and" and "pack" or two "shift" and "pack" insns. We should
18348 have already failed all two instruction sequences. */
18349
18350 static bool
18351 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18352 {
18353 rtx op, dop0, dop1, t;
18354 unsigned i, odd, c, s, nelt = d->nelt;
18355 bool end_perm = false;
18356 machine_mode half_mode;
18357 rtx (*gen_and) (rtx, rtx, rtx);
18358 rtx (*gen_pack) (rtx, rtx, rtx);
18359 rtx (*gen_shift) (rtx, rtx, rtx);
18360
18361 if (d->one_operand_p)
18362 return false;
18363
18364 switch (d->vmode)
18365 {
18366 case E_V8HImode:
18367 /* Required for "pack". */
18368 if (!TARGET_SSE4_1)
18369 return false;
18370 c = 0xffff;
18371 s = 16;
18372 half_mode = V4SImode;
18373 gen_and = gen_andv4si3;
18374 gen_pack = gen_sse4_1_packusdw;
18375 gen_shift = gen_lshrv4si3;
18376 break;
18377 case E_V16QImode:
18378 /* No check as all instructions are SSE2. */
18379 c = 0xff;
18380 s = 8;
18381 half_mode = V8HImode;
18382 gen_and = gen_andv8hi3;
18383 gen_pack = gen_sse2_packuswb;
18384 gen_shift = gen_lshrv8hi3;
18385 break;
18386 case E_V16HImode:
18387 if (!TARGET_AVX2)
18388 return false;
18389 c = 0xffff;
18390 s = 16;
18391 half_mode = V8SImode;
18392 gen_and = gen_andv8si3;
18393 gen_pack = gen_avx2_packusdw;
18394 gen_shift = gen_lshrv8si3;
18395 end_perm = true;
18396 break;
18397 case E_V32QImode:
18398 if (!TARGET_AVX2)
18399 return false;
18400 c = 0xff;
18401 s = 8;
18402 half_mode = V16HImode;
18403 gen_and = gen_andv16hi3;
18404 gen_pack = gen_avx2_packuswb;
18405 gen_shift = gen_lshrv16hi3;
18406 end_perm = true;
18407 break;
18408 default:
18409 /* Only for V8HI, V16QI, V16HI and V32QI modes is this more profitable
18410 than general shuffles. */
18411 return false;
18412 }
18413
18414 /* Check that permutation is even or odd. */
18415 odd = d->perm[0];
18416 if (odd > 1)
18417 return false;
18418
18419 for (i = 1; i < nelt; ++i)
18420 if (d->perm[i] != 2 * i + odd)
18421 return false;
18422
18423 if (d->testing_p)
18424 return true;
18425
18426 dop0 = gen_reg_rtx (half_mode);
18427 dop1 = gen_reg_rtx (half_mode);
18428 if (odd == 0)
18429 {
18430 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18431 t = force_reg (half_mode, t);
18432 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18433 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18434 }
18435 else
18436 {
18437 emit_insn (gen_shift (dop0,
18438 gen_lowpart (half_mode, d->op0),
18439 GEN_INT (s)));
18440 emit_insn (gen_shift (dop1,
18441 gen_lowpart (half_mode, d->op1),
18442 GEN_INT (s)));
18443 }
18444 /* For the AVX2 256-bit case we need to permute the pack result. */
18445 if (TARGET_AVX2 && end_perm)
18446 {
18447 op = gen_reg_rtx (d->vmode);
18448 t = gen_reg_rtx (V4DImode);
18449 emit_insn (gen_pack (op, dop0, dop1));
18450 emit_insn (gen_avx2_permv4di_1 (t,
18451 gen_lowpart (V4DImode, op),
18452 const0_rtx,
18453 const2_rtx,
18454 const1_rtx,
18455 GEN_INT (3)));
18456 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18457 }
18458 else
18459 emit_insn (gen_pack (d->target, dop0, dop1));
18460
18461 return true;
18462 }
18463
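/* An intrinsics-level sketch of the V16QImode case (illustrative only,
   outside the compiler):

     __m128i m    = _mm_set1_epi16 (0x00ff);
     __m128i even = _mm_packus_epi16 (_mm_and_si128 (a, m),
                                      _mm_and_si128 (b, m));
     __m128i odd  = _mm_packus_epi16 (_mm_srli_epi16 (a, 8),
                                      _mm_srli_epi16 (b, 8));

   i.e. mask or shift each 16-bit element so its high byte is zero,
   then let packuswb drop the high bytes.  */
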
18464 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18465 and extract-odd permutations of two V64QI operands
18466 with two "shift", two "trunc" and one "concat" insns for "odd",
18467 and two "trunc" and one "concat" insn for "even".
18468 We should have already failed all two-instruction sequences. */
18469
18470 static bool
18471 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18472 {
18473 rtx t1, t2, t3, t4;
18474 unsigned i, odd, nelt = d->nelt;
18475
18476 if (!TARGET_AVX512BW
18477 || d->one_operand_p
18478 || d->vmode != V64QImode)
18479 return false;
18480
18481 /* Check that permutation is even or odd. */
18482 odd = d->perm[0];
18483 if (odd > 1)
18484 return false;
18485
18486 for (i = 1; i < nelt; ++i)
18487 if (d->perm[i] != 2 * i + odd)
18488 return false;
18489
18490 if (d->testing_p)
18491 return true;
18492
18493
18494 if (odd)
18495 {
18496 t1 = gen_reg_rtx (V32HImode);
18497 t2 = gen_reg_rtx (V32HImode);
18498 emit_insn (gen_lshrv32hi3 (t1,
18499 gen_lowpart (V32HImode, d->op0),
18500 GEN_INT (8)));
18501 emit_insn (gen_lshrv32hi3 (t2,
18502 gen_lowpart (V32HImode, d->op1),
18503 GEN_INT (8)));
18504 }
18505 else
18506 {
18507 t1 = gen_lowpart (V32HImode, d->op0);
18508 t2 = gen_lowpart (V32HImode, d->op1);
18509 }
18510
18511 t3 = gen_reg_rtx (V32QImode);
18512 t4 = gen_reg_rtx (V32QImode);
18513 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18514 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18515 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18516
18517 return true;
18518 }
18519
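/* An intrinsics-level sketch of the V64QImode extract-even case
   (illustrative only, outside the compiler, with a and b viewed as
   __m512i):

     __m256i lo = _mm512_cvtepi16_epi8 (a);
     __m256i hi = _mm512_cvtepi16_epi8 (b);
     __m512i r  = _mm512_inserti64x4 (_mm512_castsi256_si512 (lo),
                                      hi, 1);

   The extract-odd case first shifts every 16-bit element right by 8,
   e.g. _mm512_srli_epi16 (a, 8), before the same truncate/concat.  */
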
18520 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
18521 and extract-odd permutations. */
18522
18523 static bool
18524 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18525 {
18526 rtx t1, t2, t3, t4, t5;
18527
18528 switch (d->vmode)
18529 {
18530 case E_V4DFmode:
18531 if (d->testing_p)
18532 break;
18533 t1 = gen_reg_rtx (V4DFmode);
18534 t2 = gen_reg_rtx (V4DFmode);
18535
18536 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18537 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18538 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18539
18540 /* Now an unpck[lh]pd will produce the result required. */
18541 if (odd)
18542 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18543 else
18544 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18545 emit_insn (t3);
18546 break;
18547
18548 case E_V8SFmode:
18549 {
18550 int mask = odd ? 0xdd : 0x88;
18551
18552 if (d->testing_p)
18553 break;
18554 t1 = gen_reg_rtx (V8SFmode);
18555 t2 = gen_reg_rtx (V8SFmode);
18556 t3 = gen_reg_rtx (V8SFmode);
18557
18558 /* Shuffle within the 128-bit lanes to produce:
18559 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18560 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18561 GEN_INT (mask)));
18562
18563 /* Shuffle the lanes around to produce:
18564 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18565 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18566 GEN_INT (0x3)));
18567
18568 /* Shuffle within the 128-bit lanes to produce:
18569 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18570 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18571
18572 /* Shuffle within the 128-bit lanes to produce:
18573 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18574 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18575
18576 /* Shuffle the lanes around to produce:
18577 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18578 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18579 GEN_INT (0x20)));
18580 }
18581 break;
18582
18583 case E_V2DFmode:
18584 case E_V4SFmode:
18585 case E_V2DImode:
18586 case E_V4SImode:
18587 /* These are always directly implementable by expand_vec_perm_1. */
18588 gcc_unreachable ();
18589
18590 case E_V8HImode:
18591 if (TARGET_SSE4_1)
18592 return expand_vec_perm_even_odd_pack (d);
18593 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18594 return expand_vec_perm_pshufb2 (d);
18595 else
18596 {
18597 if (d->testing_p)
18598 break;
18599 /* We need 2*log2(N)-1 operations to achieve odd/even
18600 with interleave. */
18601 t1 = gen_reg_rtx (V8HImode);
18602 t2 = gen_reg_rtx (V8HImode);
18603 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18604 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18605 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18606 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18607 if (odd)
18608 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18609 else
18610 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18611 emit_insn (t3);
18612 }
18613 break;
18614
18615 case E_V16QImode:
18616 return expand_vec_perm_even_odd_pack (d);
18617
18618 case E_V16HImode:
18619 case E_V32QImode:
18620 return expand_vec_perm_even_odd_pack (d);
18621
18622 case E_V64QImode:
18623 return expand_vec_perm_even_odd_trunc (d);
18624
18625 case E_V4DImode:
18626 if (!TARGET_AVX2)
18627 {
18628 struct expand_vec_perm_d d_copy = *d;
18629 d_copy.vmode = V4DFmode;
18630 if (d->testing_p)
18631 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18632 else
18633 d_copy.target = gen_reg_rtx (V4DFmode);
18634 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18635 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18636 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18637 {
18638 if (!d->testing_p)
18639 emit_move_insn (d->target,
18640 gen_lowpart (V4DImode, d_copy.target));
18641 return true;
18642 }
18643 return false;
18644 }
18645
18646 if (d->testing_p)
18647 break;
18648
18649 t1 = gen_reg_rtx (V4DImode);
18650 t2 = gen_reg_rtx (V4DImode);
18651
18652 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18653 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18654 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18655
18656 /* Now a vpunpck[lh]qdq will produce the result required. */
18657 if (odd)
18658 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18659 else
18660 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18661 emit_insn (t3);
18662 break;
18663
18664 case E_V8SImode:
18665 if (!TARGET_AVX2)
18666 {
18667 struct expand_vec_perm_d d_copy = *d;
18668 d_copy.vmode = V8SFmode;
18669 if (d->testing_p)
18670 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18671 else
18672 d_copy.target = gen_reg_rtx (V8SFmode);
18673 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18674 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18675 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18676 {
18677 if (!d->testing_p)
18678 emit_move_insn (d->target,
18679 gen_lowpart (V8SImode, d_copy.target));
18680 return true;
18681 }
18682 return false;
18683 }
18684
18685 if (d->testing_p)
18686 break;
18687
18688 t1 = gen_reg_rtx (V8SImode);
18689 t2 = gen_reg_rtx (V8SImode);
18690 t3 = gen_reg_rtx (V4DImode);
18691 t4 = gen_reg_rtx (V4DImode);
18692 t5 = gen_reg_rtx (V4DImode);
18693
18694 /* Shuffle the lanes around into
18695 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18696 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18697 gen_lowpart (V4DImode, d->op1),
18698 GEN_INT (0x20)));
18699 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18700 gen_lowpart (V4DImode, d->op1),
18701 GEN_INT (0x31)));
18702
18703 /* Swap the 2nd and 3rd position in each lane into
18704 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18705 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18706 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18707 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18708 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18709
18710 /* Now a vpunpck[lh]qdq will produce
18711 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18712 if (odd)
18713 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18714 gen_lowpart (V4DImode, t2));
18715 else
18716 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18717 gen_lowpart (V4DImode, t2));
18718 emit_insn (t3);
18719 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18720 break;
18721
18722 default:
18723 gcc_unreachable ();
18724 }
18725
18726 return true;
18727 }
18728
18729 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
18730 extract-even and extract-odd permutations. */
18731
18732 static bool
18733 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18734 {
18735 unsigned i, odd, nelt = d->nelt;
18736
18737 odd = d->perm[0];
18738 if (odd != 0 && odd != 1)
18739 return false;
18740
18741 for (i = 1; i < nelt; ++i)
18742 if (d->perm[i] != 2 * i + odd)
18743 return false;
18744
18745 return expand_vec_perm_even_odd_1 (d, odd);
18746 }
18747
18748 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
18749 permutations. We assume that expand_vec_perm_1 has already failed. */
18750
18751 static bool
18752 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18753 {
18754 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18755 machine_mode vmode = d->vmode;
18756 unsigned char perm2[4];
18757 rtx op0 = d->op0, dest;
18758 bool ok;
18759
18760 switch (vmode)
18761 {
18762 case E_V4DFmode:
18763 case E_V8SFmode:
18764 /* These are special-cased in sse.md so that we can optionally
18765 use the vbroadcast instruction. They expand to two insns
18766 if the input happens to be in a register. */
18767 gcc_unreachable ();
18768
18769 case E_V2DFmode:
18770 case E_V2DImode:
18771 case E_V4SFmode:
18772 case E_V4SImode:
18773 /* These are always implementable using standard shuffle patterns. */
18774 gcc_unreachable ();
18775
18776 case E_V8HImode:
18777 case E_V16QImode:
18778 /* These can be implemented via interleave. We save one insn by
18779 stopping once we have promoted to V4SImode and then using pshufd. */
18780 if (d->testing_p)
18781 return true;
18782 do
18783 {
18784 rtx dest;
18785 rtx (*gen) (rtx, rtx, rtx)
18786 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18787 : gen_vec_interleave_lowv8hi;
18788
18789 if (elt >= nelt2)
18790 {
18791 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18792 : gen_vec_interleave_highv8hi;
18793 elt -= nelt2;
18794 }
18795 nelt2 /= 2;
18796
18797 dest = gen_reg_rtx (vmode);
18798 emit_insn (gen (dest, op0, op0));
18799 vmode = get_mode_wider_vector (vmode);
18800 op0 = gen_lowpart (vmode, dest);
18801 }
18802 while (vmode != V4SImode);
18803
18804 memset (perm2, elt, 4);
18805 dest = gen_reg_rtx (V4SImode);
18806 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18807 gcc_assert (ok);
18808 if (!d->testing_p)
18809 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18810 return true;
18811
18812 case E_V64QImode:
18813 case E_V32QImode:
18814 case E_V16HImode:
18815 case E_V8SImode:
18816 case E_V4DImode:
18817 /* For AVX2 broadcasts of the first element vpbroadcast* or
18818 vpermq should be used by expand_vec_perm_1. */
18819 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18820 return false;
18821
18822 default:
18823 gcc_unreachable ();
18824 }
18825 }
18826
18827 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
18828 broadcast permutations. */
18829
18830 static bool
18831 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18832 {
18833 unsigned i, elt, nelt = d->nelt;
18834
18835 if (!d->one_operand_p)
18836 return false;
18837
18838 elt = d->perm[0];
18839 for (i = 1; i < nelt; ++i)
18840 if (d->perm[i] != elt)
18841 return false;
18842
18843 return expand_vec_perm_broadcast_1 (d);
18844 }
18845
18846 /* Implement arbitrary permutations of two V64QImode operands
18847 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18848 static bool
18849 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18850 {
18851 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18852 return false;
18853
18854 if (d->testing_p)
18855 return true;
18856
18857 struct expand_vec_perm_d ds[2];
18858 rtx rperm[128], vperm, target0, target1;
18859 unsigned int i, nelt;
18860 machine_mode vmode;
18861
18862 nelt = d->nelt;
18863 vmode = V64QImode;
18864
18865 for (i = 0; i < 2; i++)
18866 {
18867 ds[i] = *d;
18868 ds[i].vmode = V32HImode;
18869 ds[i].nelt = 32;
18870 ds[i].target = gen_reg_rtx (V32HImode);
18871 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
18872 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
18873 }
18874
18875 /* Prepare permutations such that the first one takes care of
18876 putting the even bytes into the right positions or one position
18877 higher (ds[0]) and the second one takes care of putting the odd
18878 bytes into the right positions or one position lower
18879 (ds[1]). */
18880
18881 for (i = 0; i < nelt; i++)
18882 {
18883 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
18884 if (i & 1)
18885 {
18886 rperm[i] = constm1_rtx;
18887 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18888 }
18889 else
18890 {
18891 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18892 rperm[i + 64] = constm1_rtx;
18893 }
18894 }
18895
18896 bool ok = expand_vec_perm_1 (&ds[0]);
18897 gcc_assert (ok);
18898 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
18899
18900 ok = expand_vec_perm_1 (&ds[1]);
18901 gcc_assert (ok);
18902 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
18903
18904 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
18905 vperm = force_reg (vmode, vperm);
18906 target0 = gen_reg_rtx (V64QImode);
18907 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
18908
18909 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
18910 vperm = force_reg (vmode, vperm);
18911 target1 = gen_reg_rtx (V64QImode);
18912 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
18913
18914 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
18915 return true;
18916 }
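
/* Editorial sketch (not part of this file): a plain-C model of the index
   computation in expand_vec_perm_vpermt2_vpshub2 above, under the
   assumption nelt == 64; the helper name is hypothetical.  Byte index
   p = perm[i] lives in word p / 2, which vperm[it]2w moves into place;
   byte p & 1 of that word is then picked out by one of the two vpshufb
   masks (even result bytes from the first shuffle, odd ones from the
   second), and the two results are or'ed together.  */

static void
model_vpermt2w_vpshufb_split (const unsigned char perm[64],
			      unsigned char word_perm[2][32],
			      signed char byte_sel[2][64])
{
  for (unsigned i = 0; i < 64; i++)
    {
      /* Word-level permutation handled by vperm[it]2w.  */
      word_perm[i & 1][i / 2] = perm[i] / 2;

      /* Byte-level fixup handled by vpshufb; -1 produces a zero byte
	 so that the final vpor keeps only the wanted half.  */
      byte_sel[0][i] = (i & 1) ? -1 : (i & 14) + (perm[i] & 1);
      byte_sel[1][i] = (i & 1) ? (i & 14) + (perm[i] & 1) : -1;
    }
}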
18917
18918 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
18919 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
18920 all the shorter instruction sequences. */
18921
18922 static bool
18923 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
18924 {
18925 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
18926 unsigned int i, nelt, eltsz;
18927 bool used[4];
18928
18929 if (!TARGET_AVX2
18930 || d->one_operand_p
18931 || (d->vmode != V32QImode && d->vmode != V16HImode))
18932 return false;
18933
18934 if (d->testing_p)
18935 return true;
18936
18937 nelt = d->nelt;
18938 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18939
18940 /* Generate 4 permutation masks. If the required element is within
18941 the same lane, it is shuffled in. If the required element is from
18942 the other lane, force a zero by setting bit 7 in the permutation
18943 mask. The other mask has a non-negative element whenever an element
18944 is requested from the other lane; that element is also moved to the
18945 other lane, so that the result of vpshufb can have its two V2TImode
18946 halves swapped. */
18947 m128 = GEN_INT (-128);
18948 for (i = 0; i < 32; ++i)
18949 {
18950 rperm[0][i] = m128;
18951 rperm[1][i] = m128;
18952 rperm[2][i] = m128;
18953 rperm[3][i] = m128;
18954 }
18955 used[0] = false;
18956 used[1] = false;
18957 used[2] = false;
18958 used[3] = false;
18959 for (i = 0; i < nelt; ++i)
18960 {
18961 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18962 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18963 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
18964
18965 for (j = 0; j < eltsz; ++j)
18966 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
18967 used[which] = true;
18968 }
18969
18970 for (i = 0; i < 2; ++i)
18971 {
18972 if (!used[2 * i + 1])
18973 {
18974 h[i] = NULL_RTX;
18975 continue;
18976 }
18977 vperm = gen_rtx_CONST_VECTOR (V32QImode,
18978 gen_rtvec_v (32, rperm[2 * i + 1]));
18979 vperm = force_reg (V32QImode, vperm);
18980 h[i] = gen_reg_rtx (V32QImode);
18981 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
18982 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
18983 }
18984
18985 /* Swap the 128-bit lanes of h[X]. */
18986 for (i = 0; i < 2; ++i)
18987 {
18988 if (h[i] == NULL_RTX)
18989 continue;
18990 op = gen_reg_rtx (V4DImode);
18991 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
18992 const2_rtx, GEN_INT (3), const0_rtx,
18993 const1_rtx));
18994 h[i] = gen_lowpart (V32QImode, op);
18995 }
18996
18997 for (i = 0; i < 2; ++i)
18998 {
18999 if (!used[2 * i])
19000 {
19001 l[i] = NULL_RTX;
19002 continue;
19003 }
19004 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19005 vperm = force_reg (V32QImode, vperm);
19006 l[i] = gen_reg_rtx (V32QImode);
19007 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19008 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19009 }
19010
19011 for (i = 0; i < 2; ++i)
19012 {
19013 if (h[i] && l[i])
19014 {
19015 op = gen_reg_rtx (V32QImode);
19016 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19017 l[i] = op;
19018 }
19019 else if (h[i])
19020 l[i] = h[i];
19021 }
19022
19023 gcc_assert (l[0] && l[1]);
19024 op = d->target;
19025 if (d->vmode != V32QImode)
19026 op = gen_reg_rtx (V32QImode);
19027 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19028 if (op != d->target)
19029 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19030 return true;
19031 }
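
/* Editorial sketch (not part of this file): how expand_vec_perm_vpshufb4_vpermq2
   above classifies each destination byte of a two-operand V32QImode
   permutation into one of its four vpshufb masks; the helper name is
   hypothetical.  Masks 0 and 2 select from operand 0 and operand 1
   respectively; the +1 variants collect bytes whose source sits in the
   other 128-bit lane, which the following vpermq lane swap then fixes up.  */

static unsigned
classify_v32qi_perm_byte (unsigned i, unsigned p)
{
  /* i is the destination byte (0..31), p is d->perm[i] (0..63).  */
  unsigned cross_lane = (p ^ i) & 16;	/* source and destination lanes differ  */
  unsigned from_op1 = (p & 32) ? 2 : 0;	/* byte comes from the second operand  */
  return from_op1 + (cross_lane ? 1 : 0);
}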
19032
19033 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19034 taken care of, perform the expansion in D and return true on success. */
19035
19036 static bool
19037 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19038 {
19039 /* Try a single instruction expansion. */
19040 if (expand_vec_perm_1 (d))
19041 return true;
19042
19043 /* Try sequences of two instructions. */
19044
19045 if (expand_vec_perm_pshuflw_pshufhw (d))
19046 return true;
19047
19048 if (expand_vec_perm_palignr (d, false))
19049 return true;
19050
19051 if (expand_vec_perm_interleave2 (d))
19052 return true;
19053
19054 if (expand_vec_perm_broadcast (d))
19055 return true;
19056
19057 if (expand_vec_perm_vpermq_perm_1 (d))
19058 return true;
19059
19060 if (expand_vec_perm_vperm2f128 (d))
19061 return true;
19062
19063 if (expand_vec_perm_pblendv (d))
19064 return true;
19065
19066 /* Try sequences of three instructions. */
19067
19068 if (expand_vec_perm_even_odd_pack (d))
19069 return true;
19070
19071 if (expand_vec_perm_2vperm2f128_vshuf (d))
19072 return true;
19073
19074 if (expand_vec_perm_pshufb2 (d))
19075 return true;
19076
19077 if (expand_vec_perm_interleave3 (d))
19078 return true;
19079
19080 if (expand_vec_perm_vperm2f128_vblend (d))
19081 return true;
19082
19083 /* Try sequences of four instructions. */
19084
19085 if (expand_vec_perm_even_odd_trunc (d))
19086 return true;
19087 if (expand_vec_perm_vpshufb2_vpermq (d))
19088 return true;
19089
19090 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19091 return true;
19092
19093 if (expand_vec_perm_vpermt2_vpshub2 (d))
19094 return true;
19095
19096 /* ??? Look for narrow permutations whose element orderings would
19097 allow the promotion to a wider mode. */
19098
19099 /* ??? Look for sequences of interleave or a wider permute that place
19100 the data into the correct lanes for a half-vector shuffle like
19101 pshuf[lh]w or vpermilps. */
19102
19103 /* ??? Look for sequences of interleave that produce the desired results.
19104 The combinatorics of punpck[lh] get pretty ugly... */
19105
19106 if (expand_vec_perm_even_odd (d))
19107 return true;
19108
19109 /* Even longer sequences. */
19110 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19111 return true;
19112
19113 /* See if we can get the same permutation in different vector integer
19114 mode. */
19115 struct expand_vec_perm_d nd;
19116 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19117 {
19118 if (!d->testing_p)
19119 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19120 return true;
19121 }
19122
19123 return false;
19124 }
19125
19126 /* If a permutation only uses one operand, make it clear. Returns true
19127 if the permutation references both operands. */
19128
19129 static bool
19130 canonicalize_perm (struct expand_vec_perm_d *d)
19131 {
19132 int i, which, nelt = d->nelt;
19133
19134 for (i = which = 0; i < nelt; ++i)
19135 which |= (d->perm[i] < nelt ? 1 : 2);
19136
19137 d->one_operand_p = true;
19138 switch (which)
19139 {
19140 default:
19141 gcc_unreachable ();
19142
19143 case 3:
19144 if (!rtx_equal_p (d->op0, d->op1))
19145 {
19146 d->one_operand_p = false;
19147 break;
19148 }
19149 /* The elements of PERM do not suggest that only the first operand
19150 is used, but both operands are identical. Allow easier matching
19151 of the permutation by folding the permutation into the single
19152 input vector. */
19153 /* FALLTHRU */
19154
19155 case 2:
19156 for (i = 0; i < nelt; ++i)
19157 d->perm[i] &= nelt - 1;
19158 d->op0 = d->op1;
19159 break;
19160
19161 case 1:
19162 d->op1 = d->op0;
19163 break;
19164 }
19165
19166 return (which == 3);
19167 }
19168
19169 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19170
19171 bool
19172 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19173 rtx op1, const vec_perm_indices &sel)
19174 {
19175 struct expand_vec_perm_d d;
19176 unsigned char perm[MAX_VECT_LEN];
19177 unsigned int i, nelt, which;
19178 bool two_args;
19179
19180 d.target = target;
19181 d.op0 = op0;
19182 d.op1 = op1;
19183
19184 d.vmode = vmode;
19185 gcc_assert (VECTOR_MODE_P (d.vmode));
19186 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19187 d.testing_p = !target;
19188
19189 gcc_assert (sel.length () == nelt);
19190 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19191
19192 /* Given sufficient ISA support we can just return true here
19193 for selected vector modes. */
19194 switch (d.vmode)
19195 {
19196 case E_V16SFmode:
19197 case E_V16SImode:
19198 case E_V8DImode:
19199 case E_V8DFmode:
19200 if (!TARGET_AVX512F)
19201 return false;
19202 /* All implementable with a single vperm[it]2 insn. */
19203 if (d.testing_p)
19204 return true;
19205 break;
19206 case E_V32HImode:
19207 if (!TARGET_AVX512BW)
19208 return false;
19209 if (d.testing_p)
19210 /* All implementable with a single vperm[it]2 insn. */
19211 return true;
19212 break;
19213 case E_V64QImode:
19214 if (!TARGET_AVX512BW)
19215 return false;
19216 if (d.testing_p)
19217 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19218 return true;
19219 break;
19220 case E_V8SImode:
19221 case E_V8SFmode:
19222 case E_V4DFmode:
19223 case E_V4DImode:
19224 if (!TARGET_AVX)
19225 return false;
19226 if (d.testing_p && TARGET_AVX512VL)
19227 /* All implementable with a single vperm[it]2 insn. */
19228 return true;
19229 break;
19230 case E_V16HImode:
19231 if (!TARGET_SSE2)
19232 return false;
19233 if (d.testing_p && TARGET_AVX2)
19234 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19235 return true;
19236 break;
19237 case E_V32QImode:
19238 if (!TARGET_SSE2)
19239 return false;
19240 if (d.testing_p && TARGET_AVX2)
19241 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19242 return true;
19243 break;
19244 case E_V8HImode:
19245 case E_V16QImode:
19246 if (!TARGET_SSE2)
19247 return false;
19248 /* Fall through. */
19249 case E_V4SImode:
19250 case E_V4SFmode:
19251 if (!TARGET_SSE)
19252 return false;
19253 /* All implementable with a single vpperm insn. */
19254 if (d.testing_p && TARGET_XOP)
19255 return true;
19256 /* All implementable with 2 pshufb + 1 ior. */
19257 if (d.testing_p && TARGET_SSSE3)
19258 return true;
19259 break;
19260 case E_V2DImode:
19261 case E_V2DFmode:
19262 if (!TARGET_SSE)
19263 return false;
19264 /* All implementable with shufpd or unpck[lh]pd. */
19265 if (d.testing_p)
19266 return true;
19267 break;
19268 default:
19269 return false;
19270 }
19271
19272 for (i = which = 0; i < nelt; ++i)
19273 {
19274 unsigned char e = sel[i];
19275 gcc_assert (e < 2 * nelt);
19276 d.perm[i] = e;
19277 perm[i] = e;
19278 which |= (e < nelt ? 1 : 2);
19279 }
19280
19281 if (d.testing_p)
19282 {
19283 /* If all elements are from the second vector, fold them to the first. */
19284 if (which == 2)
19285 for (i = 0; i < nelt; ++i)
19286 d.perm[i] -= nelt;
19287
19288 /* Check whether the mask can be applied to the vector type. */
19289 d.one_operand_p = (which != 3);
19290
19291 /* Implementable with shufps or pshufd. */
19292 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19293 return true;
19294
19295 /* Otherwise we have to go through the motions and see if we can
19296 figure out how to generate the requested permutation. */
19297 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19298 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19299 if (!d.one_operand_p)
19300 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19301
19302 start_sequence ();
19303 bool ret = ix86_expand_vec_perm_const_1 (&d);
19304 end_sequence ();
19305
19306 return ret;
19307 }
19308
19309 two_args = canonicalize_perm (&d);
19310
19311 if (ix86_expand_vec_perm_const_1 (&d))
19312 return true;
19313
19314 /* If the selector says both arguments are needed, but the operands are the
19315 same, the above tried to expand with one_operand_p set and a flattened
19316 selector. If that didn't work, retry without one_operand_p; we succeeded
19317 with that during testing. */
19318 if (two_args && d.one_operand_p)
19319 {
19320 d.one_operand_p = false;
19321 memcpy (d.perm, perm, sizeof (perm));
19322 return ix86_expand_vec_perm_const_1 (&d);
19323 }
19324
19325 return false;
19326 }
19327
19328 void
19329 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19330 {
19331 struct expand_vec_perm_d d;
19332 unsigned i, nelt;
19333
19334 d.target = targ;
19335 d.op0 = op0;
19336 d.op1 = op1;
19337 d.vmode = GET_MODE (targ);
19338 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19339 d.one_operand_p = false;
19340 d.testing_p = false;
19341
19342 for (i = 0; i < nelt; ++i)
19343 d.perm[i] = i * 2 + odd;
19344
19345 /* We'll either be able to implement the permutation directly... */
19346 if (expand_vec_perm_1 (&d))
19347 return;
19348
19349 /* ... or we use the special-case patterns. */
19350 expand_vec_perm_even_odd_1 (&d, odd);
19351 }
19352
19353 static void
19354 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19355 {
19356 struct expand_vec_perm_d d;
19357 unsigned i, nelt, base;
19358 bool ok;
19359
19360 d.target = targ;
19361 d.op0 = op0;
19362 d.op1 = op1;
19363 d.vmode = GET_MODE (targ);
19364 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19365 d.one_operand_p = false;
19366 d.testing_p = false;
19367
19368 base = high_p ? nelt / 2 : 0;
19369 for (i = 0; i < nelt / 2; ++i)
19370 {
19371 d.perm[i * 2] = i + base;
19372 d.perm[i * 2 + 1] = i + base + nelt;
19373 }
19374
19375 /* Note that for AVX this isn't one instruction. */
19376 ok = ix86_expand_vec_perm_const_1 (&d);
19377 gcc_assert (ok);
19378 }
19379
19380
19381 /* Expand a vector operation CODE for a V*QImode in terms of the
19382 same operation on V*HImode. */
19383
19384 void
19385 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19386 {
19387 machine_mode qimode = GET_MODE (dest);
19388 machine_mode himode;
19389 rtx (*gen_il) (rtx, rtx, rtx);
19390 rtx (*gen_ih) (rtx, rtx, rtx);
19391 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19392 struct expand_vec_perm_d d;
19393 bool ok, full_interleave;
19394 bool uns_p = false;
19395 int i;
19396
19397 switch (qimode)
19398 {
19399 case E_V16QImode:
19400 himode = V8HImode;
19401 gen_il = gen_vec_interleave_lowv16qi;
19402 gen_ih = gen_vec_interleave_highv16qi;
19403 break;
19404 case E_V32QImode:
19405 himode = V16HImode;
19406 gen_il = gen_avx2_interleave_lowv32qi;
19407 gen_ih = gen_avx2_interleave_highv32qi;
19408 break;
19409 case E_V64QImode:
19410 himode = V32HImode;
19411 gen_il = gen_avx512bw_interleave_lowv64qi;
19412 gen_ih = gen_avx512bw_interleave_highv64qi;
19413 break;
19414 default:
19415 gcc_unreachable ();
19416 }
19417
19418 op2_l = op2_h = op2;
19419 switch (code)
19420 {
19421 case MULT:
19422 /* Unpack data such that we've got a source byte in each low byte of
19423 each word. We don't care what goes into the high byte of each word.
19424 Rather than trying to get zero in there, the most convenient approach
19425 is to let it be a copy of the low byte. */
19426 op2_l = gen_reg_rtx (qimode);
19427 op2_h = gen_reg_rtx (qimode);
19428 emit_insn (gen_il (op2_l, op2, op2));
19429 emit_insn (gen_ih (op2_h, op2, op2));
19430
19431 op1_l = gen_reg_rtx (qimode);
19432 op1_h = gen_reg_rtx (qimode);
19433 emit_insn (gen_il (op1_l, op1, op1));
19434 emit_insn (gen_ih (op1_h, op1, op1));
19435 full_interleave = qimode == V16QImode;
19436 break;
19437
19438 case ASHIFT:
19439 case LSHIFTRT:
19440 uns_p = true;
19441 /* FALLTHRU */
19442 case ASHIFTRT:
19443 op1_l = gen_reg_rtx (himode);
19444 op1_h = gen_reg_rtx (himode);
19445 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19446 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19447 full_interleave = true;
19448 break;
19449 default:
19450 gcc_unreachable ();
19451 }
19452
19453 /* Perform the operation. */
19454 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19455 1, OPTAB_DIRECT);
19456 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19457 1, OPTAB_DIRECT);
19458 gcc_assert (res_l && res_h);
19459
19460 /* Merge the data back into the right place. */
19461 d.target = dest;
19462 d.op0 = gen_lowpart (qimode, res_l);
19463 d.op1 = gen_lowpart (qimode, res_h);
19464 d.vmode = qimode;
19465 d.nelt = GET_MODE_NUNITS (qimode);
19466 d.one_operand_p = false;
19467 d.testing_p = false;
19468
19469 if (full_interleave)
19470 {
19471 /* For SSE2, we used a full interleave, so the desired
19472 results are in the even elements. */
19473 for (i = 0; i < d.nelt; ++i)
19474 d.perm[i] = i * 2;
19475 }
19476 else
19477 {
19478 /* For AVX, the interleave used above was not cross-lane. So the
19479 extraction is of the even elements, but with the second and third
19480 quarters swapped. Happily, that is even one insn shorter than plain
19481 even extraction. For AVX512BW we have 4 lanes. We extract the even
19482 elements from within a lane, always first from the first and then from
19483 the second source operand; the index bits above the low 4 bits remain the same.
19484 Thus, for d.nelt == 32 we want permutation
19485 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19486 and for d.nelt == 64 we want permutation
19487 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19488 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
19489 for (i = 0; i < d.nelt; ++i)
19490 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19491 }
19492
19493 ok = ix86_expand_vec_perm_const_1 (&d);
19494 gcc_assert (ok);
19495
19496 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19497 gen_rtx_fmt_ee (code, qimode, op1, op2));
19498 }
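
/* Editorial sketch (not part of this file): the closed-form permutation
   index used in ix86_expand_vecop_qihi for the non-full-interleave case,
   spelled out for d.nelt == 32 (V32QImode); the helper name is
   hypothetical.  Filling the array this way yields
   0,2,..,14, 32,34,..,46, 16,18,..,30, 48,50,..,62 as stated in the
   comment above.  */

static void
even_extract_perm_v32qi (unsigned char perm[32])
{
  for (int i = 0; i < 32; ++i)
    perm[i] = ((i * 2) & 14)		/* even byte within a 16-byte lane  */
	      + ((i & 8) ? 32 : 0)	/* second source for the odd octets  */
	      + (i & ~15);		/* keep the lane index  */
}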
19499
19500 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19501 if op is CONST_VECTOR with all odd elements equal to their
19502 preceding element. */
19503
19504 static bool
19505 const_vector_equal_evenodd_p (rtx op)
19506 {
19507 machine_mode mode = GET_MODE (op);
19508 int i, nunits = GET_MODE_NUNITS (mode);
19509 if (GET_CODE (op) != CONST_VECTOR
19510 || nunits != CONST_VECTOR_NUNITS (op))
19511 return false;
19512 for (i = 0; i < nunits; i += 2)
19513 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19514 return false;
19515 return true;
19516 }
19517
19518 void
19519 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19520 bool uns_p, bool odd_p)
19521 {
19522 machine_mode mode = GET_MODE (op1);
19523 machine_mode wmode = GET_MODE (dest);
19524 rtx x;
19525 rtx orig_op1 = op1, orig_op2 = op2;
19526
19527 if (!nonimmediate_operand (op1, mode))
19528 op1 = force_reg (mode, op1);
19529 if (!nonimmediate_operand (op2, mode))
19530 op2 = force_reg (mode, op2);
19531
19532 /* We only play even/odd games with vectors of SImode. */
19533 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19534
19535 /* If we're looking for the odd results, shift those members down to
19536 the even slots. For some cpus this is faster than a PSHUFD. */
19537 if (odd_p)
19538 {
19539 /* For XOP use vpmacsdqh, but only for smult, as it is only
19540 signed. */
19541 if (TARGET_XOP && mode == V4SImode && !uns_p)
19542 {
19543 x = force_reg (wmode, CONST0_RTX (wmode));
19544 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19545 return;
19546 }
19547
19548 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19549 if (!const_vector_equal_evenodd_p (orig_op1))
19550 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19551 x, NULL, 1, OPTAB_DIRECT);
19552 if (!const_vector_equal_evenodd_p (orig_op2))
19553 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19554 x, NULL, 1, OPTAB_DIRECT);
19555 op1 = gen_lowpart (mode, op1);
19556 op2 = gen_lowpart (mode, op2);
19557 }
19558
19559 if (mode == V16SImode)
19560 {
19561 if (uns_p)
19562 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19563 else
19564 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19565 }
19566 else if (mode == V8SImode)
19567 {
19568 if (uns_p)
19569 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19570 else
19571 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19572 }
19573 else if (uns_p)
19574 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19575 else if (TARGET_SSE4_1)
19576 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19577 else
19578 {
19579 rtx s1, s2, t0, t1, t2;
19580
19581 /* The easiest way to implement this without PMULDQ is to go through
19582 the motions as if we were performing a full 64-bit multiply, except
19583 that we need to do less shuffling of the elements. */
19584
19585 /* Compute the sign-extension, aka highparts, of the two operands. */
19586 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19587 op1, pc_rtx, pc_rtx);
19588 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19589 op2, pc_rtx, pc_rtx);
19590
19591 /* Multiply LO(A) * HI(B), and vice-versa. */
19592 t1 = gen_reg_rtx (wmode);
19593 t2 = gen_reg_rtx (wmode);
19594 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19595 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19596
19597 /* Multiply LO(A) * LO(B). */
19598 t0 = gen_reg_rtx (wmode);
19599 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19600
19601 /* Combine and shift the highparts into place. */
19602 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19603 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19604 1, OPTAB_DIRECT);
19605
19606 /* Combine high and low parts. */
19607 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19608 return;
19609 }
19610 emit_insn (x);
19611 }
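
/* Editorial sketch (not part of this file): the scalar identity behind the
   PMULDQ-less path of ix86_expand_mul_widen_evenodd above, written with
   stdint types; the helper name is hypothetical.  For 32-bit a and b, the
   signed product equals the unsigned product corrected by
   -((a < 0 ? b : 0) + (b < 0 ? a : 0)) << 32, taken modulo 2^64.  The
   vector code derives the two conditional terms from the all-ones compare
   masks s1 and s2.  */

#include <stdint.h>

static uint64_t
signed_mul_via_unsigned (int32_t a, int32_t b)
{
  uint64_t corr_a = a < 0 ? (uint32_t) b : 0;	/* contribution of s1  */
  uint64_t corr_b = b < 0 ? (uint32_t) a : 0;	/* contribution of s2  */
  uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;
  return lo - ((corr_a + corr_b) << 32);	/* == (uint64_t) ((int64_t) a * b)  */
}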
19612
19613 void
19614 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19615 bool uns_p, bool high_p)
19616 {
19617 machine_mode wmode = GET_MODE (dest);
19618 machine_mode mode = GET_MODE (op1);
19619 rtx t1, t2, t3, t4, mask;
19620
19621 switch (mode)
19622 {
19623 case E_V4SImode:
19624 t1 = gen_reg_rtx (mode);
19625 t2 = gen_reg_rtx (mode);
19626 if (TARGET_XOP && !uns_p)
19627 {
19628 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19629 shuffle the elements once so that all elements are in the right
19630 place for immediate use: { A C B D }. */
19631 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19632 const1_rtx, GEN_INT (3)));
19633 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19634 const1_rtx, GEN_INT (3)));
19635 }
19636 else
19637 {
19638 /* Put the elements into place for the multiply. */
19639 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19640 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19641 high_p = false;
19642 }
19643 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19644 break;
19645
19646 case E_V8SImode:
19647 /* Shuffle the elements between the lanes. After this we
19648 have { A B E F | C D G H } for each operand. */
19649 t1 = gen_reg_rtx (V4DImode);
19650 t2 = gen_reg_rtx (V4DImode);
19651 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19652 const0_rtx, const2_rtx,
19653 const1_rtx, GEN_INT (3)));
19654 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19655 const0_rtx, const2_rtx,
19656 const1_rtx, GEN_INT (3)));
19657
19658 /* Shuffle the elements within the lanes. After this we
19659 have { A A B B | C C D D } or { E E F F | G G H H }. */
19660 t3 = gen_reg_rtx (V8SImode);
19661 t4 = gen_reg_rtx (V8SImode);
19662 mask = GEN_INT (high_p
19663 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19664 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19665 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19666 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19667
19668 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19669 break;
19670
19671 case E_V8HImode:
19672 case E_V16HImode:
19673 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19674 uns_p, OPTAB_DIRECT);
19675 t2 = expand_binop (mode,
19676 uns_p ? umul_highpart_optab : smul_highpart_optab,
19677 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19678 gcc_assert (t1 && t2);
19679
19680 t3 = gen_reg_rtx (mode);
19681 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19682 emit_move_insn (dest, gen_lowpart (wmode, t3));
19683 break;
19684
19685 case E_V16QImode:
19686 case E_V32QImode:
19687 case E_V32HImode:
19688 case E_V16SImode:
19689 case E_V64QImode:
19690 t1 = gen_reg_rtx (wmode);
19691 t2 = gen_reg_rtx (wmode);
19692 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19693 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19694
19695 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19696 break;
19697
19698 default:
19699 gcc_unreachable ();
19700 }
19701 }
19702
19703 void
19704 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19705 {
19706 rtx res_1, res_2, res_3, res_4;
19707
19708 res_1 = gen_reg_rtx (V4SImode);
19709 res_2 = gen_reg_rtx (V4SImode);
19710 res_3 = gen_reg_rtx (V2DImode);
19711 res_4 = gen_reg_rtx (V2DImode);
19712 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19713 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19714
19715 /* Move the results in element 2 down to element 1; we don't care
19716 what goes in elements 2 and 3. Then we can merge the parts
19717 back together with an interleave.
19718
19719 Note that two other sequences were tried:
19720 (1) Use interleaves at the start instead of psrldq, which allows
19721 us to use a single shufps to merge things back at the end.
19722 (2) Use shufps here to combine the two vectors, then pshufd to
19723 put the elements in the correct order.
19724 In both cases the cost of the reformatting stall was too high
19725 and the overall sequence slower. */
19726
19727 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19728 const0_rtx, const2_rtx,
19729 const0_rtx, const0_rtx));
19730 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19731 const0_rtx, const2_rtx,
19732 const0_rtx, const0_rtx));
19733 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19734
19735 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19736 }
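
/* Editorial sketch (not part of this file): why ix86_expand_sse2_mulv4si3
   above only needs the low halves of the widened even/odd products.  A
   32-bit modular multiply depends only on the low 32 bits of the full
   product, so each result lane is simply the truncated 64-bit unsigned
   product.  The helper name is hypothetical.  */

#include <stdint.h>

static void
mulv4si_model (uint32_t dest[4], const uint32_t a[4], const uint32_t b[4])
{
  for (int i = 0; i < 4; ++i)
    dest[i] = (uint32_t) ((uint64_t) a[i] * b[i]);	/* low half of the widened product  */
}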
19737
19738 void
19739 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19740 {
19741 machine_mode mode = GET_MODE (op0);
19742 rtx t1, t2, t3, t4, t5, t6;
19743
19744 if (TARGET_AVX512DQ && mode == V8DImode)
19745 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19746 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19747 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19748 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19749 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19750 else if (TARGET_XOP && mode == V2DImode)
19751 {
19752 /* op1: A,B,C,D, op2: E,F,G,H */
19753 op1 = gen_lowpart (V4SImode, op1);
19754 op2 = gen_lowpart (V4SImode, op2);
19755
19756 t1 = gen_reg_rtx (V4SImode);
19757 t2 = gen_reg_rtx (V4SImode);
19758 t3 = gen_reg_rtx (V2DImode);
19759 t4 = gen_reg_rtx (V2DImode);
19760
19761 /* t1: B,A,D,C */
19762 emit_insn (gen_sse2_pshufd_1 (t1, op1,
19763 GEN_INT (1),
19764 GEN_INT (0),
19765 GEN_INT (3),
19766 GEN_INT (2)));
19767
19768 /* t2: (B*E),(A*F),(D*G),(C*H) */
19769 emit_insn (gen_mulv4si3 (t2, t1, op2));
19770
19771 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19772 emit_insn (gen_xop_phadddq (t3, t2));
19773
19774 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19775 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19776
19777 /* Multiply the lower parts and add everything together. */
19778 t5 = gen_reg_rtx (V2DImode);
19779 emit_insn (gen_vec_widen_umult_even_v4si (t5,
19780 gen_lowpart (V4SImode, op1),
19781 gen_lowpart (V4SImode, op2)));
19782 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19783
19784 }
19785 else
19786 {
19787 machine_mode nmode;
19788 rtx (*umul) (rtx, rtx, rtx);
19789
19790 if (mode == V2DImode)
19791 {
19792 umul = gen_vec_widen_umult_even_v4si;
19793 nmode = V4SImode;
19794 }
19795 else if (mode == V4DImode)
19796 {
19797 umul = gen_vec_widen_umult_even_v8si;
19798 nmode = V8SImode;
19799 }
19800 else if (mode == V8DImode)
19801 {
19802 umul = gen_vec_widen_umult_even_v16si;
19803 nmode = V16SImode;
19804 }
19805 else
19806 gcc_unreachable ();
19807
19808
19809 /* Multiply low parts. */
19810 t1 = gen_reg_rtx (mode);
19811 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19812
19813 /* Shift input vectors right 32 bits so we can multiply high parts. */
19814 t6 = GEN_INT (32);
19815 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19816 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19817
19818 /* Multiply high parts by low parts. */
19819 t4 = gen_reg_rtx (mode);
19820 t5 = gen_reg_rtx (mode);
19821 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19822 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19823
19824 /* Combine and shift the highparts back. */
19825 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19826 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19827
19828 /* Combine high and low parts. */
19829 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19830 }
19831
19832 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19833 gen_rtx_MULT (mode, op1, op2));
19834 }
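
/* Editorial sketch (not part of this file): the scalar decomposition used
   by the generic path of ix86_expand_sse2_mulvxdi3 above; the helper name
   is hypothetical.  With a = ah * 2^32 + al and b = bh * 2^32 + bl,
   a * b modulo 2^64 is al * bl + ((ah * bl + bh * al) << 32); the ah * bh
   term is shifted out entirely, so three widening multiplies suffice.  */

#include <stdint.h>

static uint64_t
mul64_from_32x32 (uint64_t a, uint64_t b)
{
  uint64_t al = (uint32_t) a, ah = a >> 32;
  uint64_t bl = (uint32_t) b, bh = b >> 32;
  uint64_t lo = al * bl;		/* vec_widen_umult_even of the low parts  */
  uint64_t cross = ah * bl + bh * al;	/* high parts times low parts  */
  return lo + (cross << 32);		/* == a * b mod 2^64  */
}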
19835
19836 /* Return true if control transfer instruction INSN
19837 should be encoded with the notrack prefix. */
19838
19839 bool
19840 ix86_notrack_prefixed_insn_p (rtx insn)
19841 {
19842 if (!insn || !((flag_cf_protection & CF_BRANCH)))
19843 return false;
19844
19845 if (CALL_P (insn))
19846 {
19847 rtx call = get_call_rtx_from (insn);
19848 gcc_assert (call != NULL_RTX);
19849 rtx addr = XEXP (call, 0);
19850
19851 /* Do not emit 'notrack' if it's not an indirect call. */
19852 if (MEM_P (addr)
19853 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19854 return false;
19855 else
19856 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19857 }
19858
19859 if (JUMP_P (insn) && !flag_cet_switch)
19860 {
19861 rtx target = JUMP_LABEL (insn);
19862 if (target == NULL_RTX || ANY_RETURN_P (target))
19863 return false;
19864
19865 /* Check whether the jump targets a switch table. */
19866 rtx_insn *label = as_a<rtx_insn *> (target);
19867 rtx_insn *table = next_insn (label);
19868 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
19869 return false;
19870 else
19871 return true;
19872 }
19873 return false;
19874 }
19875
19876 /* Calculate integer abs() using only SSE2 instructions. */
19877
19878 void
19879 ix86_expand_sse2_abs (rtx target, rtx input)
19880 {
19881 machine_mode mode = GET_MODE (target);
19882 rtx tmp0, tmp1, x;
19883
19884 switch (mode)
19885 {
19886 case E_V2DImode:
19887 case E_V4DImode:
19888 /* For 64-bit signed integer X, with SSE4.2 use
19889 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
19890 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
19891 32 and, since an arithmetic right shift is unimplemented, form the sign
19892 mask with a logical right shift followed by a negation. */
19893 if (TARGET_SSE4_2)
19894 {
19895 tmp0 = gen_reg_rtx (mode);
19896 tmp1 = gen_reg_rtx (mode);
19897 emit_move_insn (tmp1, CONST0_RTX (mode));
19898 if (mode == E_V2DImode)
19899 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
19900 else
19901 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
19902 }
19903 else
19904 {
19905 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
19906 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
19907 - 1), NULL, 0, OPTAB_DIRECT);
19908 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
19909 }
19910
19911 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
19912 NULL, 0, OPTAB_DIRECT);
19913 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
19914 target, 0, OPTAB_DIRECT);
19915 break;
19916
19917 case E_V4SImode:
19918 /* For 32-bit signed integer X, the best way to calculate the absolute
19919 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
19920 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
19921 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
19922 NULL, 0, OPTAB_DIRECT);
19923 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
19924 NULL, 0, OPTAB_DIRECT);
19925 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
19926 target, 0, OPTAB_DIRECT);
19927 break;
19928
19929 case E_V8HImode:
19930 /* For 16-bit signed integer X, the best way to calculate the absolute
19931 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
19932 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
19933
19934 x = expand_simple_binop (mode, SMAX, tmp0, input,
19935 target, 0, OPTAB_DIRECT);
19936 break;
19937
19938 case E_V16QImode:
19939 /* For 8-bit signed integer X, the best way to calculate the absolute
19940 value of X is min ((unsigned char) X, (unsigned char) (-X)),
19941 as SSE2 provides the PMINUB insn. */
19942 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
19943
19944 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
19945 target, 0, OPTAB_DIRECT);
19946 break;
19947
19948 default:
19949 gcc_unreachable ();
19950 }
19951
19952 if (x != target)
19953 emit_move_insn (target, x);
19954 }
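
/* Editorial sketch (not part of this file): the scalar identities behind
   ix86_expand_sse2_abs above, assuming arithmetic right shift of negative
   values; the helper names are hypothetical.  As with the hardware
   instructions, the most negative value wraps to itself (or to its
   unsigned magnitude in the byte case).  */

#include <stdint.h>

static int32_t
abs_via_shift (int32_t x)		/* V4SImode path  */
{
  int32_t sign = x >> 31;		/* 0 for non-negative, -1 for negative  */
  return (x ^ sign) - sign;
}

static int16_t
abs_via_smax (int16_t x)		/* V8HImode path: PMAXSW  */
{
  int16_t neg = -x;
  return x > neg ? x : neg;
}

static uint8_t
abs_via_umin (int8_t x)			/* V16QImode path: PMINUB  */
{
  uint8_t a = (uint8_t) x, b = (uint8_t) -x;
  return a < b ? a : b;
}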
19955
19956 /* Expand an extract from a vector register through pextr insn.
19957 Return true if successful. */
19958
19959 bool
19960 ix86_expand_pextr (rtx *operands)
19961 {
19962 rtx dst = operands[0];
19963 rtx src = operands[1];
19964
19965 unsigned int size = INTVAL (operands[2]);
19966 unsigned int pos = INTVAL (operands[3]);
19967
19968 if (SUBREG_P (dst))
19969 {
19970 /* Reject non-lowpart subregs. */
19971 if (SUBREG_BYTE (dst) > 0)
19972 return false;
19973 dst = SUBREG_REG (dst);
19974 }
19975
19976 if (SUBREG_P (src))
19977 {
19978 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
19979 src = SUBREG_REG (src);
19980 }
19981
19982 switch (GET_MODE (src))
19983 {
19984 case E_V16QImode:
19985 case E_V8HImode:
19986 case E_V4SImode:
19987 case E_V2DImode:
19988 case E_V1TImode:
19989 case E_TImode:
19990 {
19991 machine_mode srcmode, dstmode;
19992 rtx d, pat;
19993
19994 if (!int_mode_for_size (size, 0).exists (&dstmode))
19995 return false;
19996
19997 switch (dstmode)
19998 {
19999 case E_QImode:
20000 if (!TARGET_SSE4_1)
20001 return false;
20002 srcmode = V16QImode;
20003 break;
20004
20005 case E_HImode:
20006 if (!TARGET_SSE2)
20007 return false;
20008 srcmode = V8HImode;
20009 break;
20010
20011 case E_SImode:
20012 if (!TARGET_SSE4_1)
20013 return false;
20014 srcmode = V4SImode;
20015 break;
20016
20017 case E_DImode:
20018 gcc_assert (TARGET_64BIT);
20019 if (!TARGET_SSE4_1)
20020 return false;
20021 srcmode = V2DImode;
20022 break;
20023
20024 default:
20025 return false;
20026 }
20027
20028 /* Reject extractions from misaligned positions. */
20029 if (pos & (size-1))
20030 return false;
20031
20032 if (GET_MODE (dst) == dstmode)
20033 d = dst;
20034 else
20035 d = gen_reg_rtx (dstmode);
20036
20037 /* Construct insn pattern. */
20038 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20039 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20040
20041 /* Let the rtl optimizers know about the zero extension performed. */
20042 if (dstmode == QImode || dstmode == HImode)
20043 {
20044 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20045 d = gen_lowpart (SImode, d);
20046 }
20047
20048 emit_insn (gen_rtx_SET (d, pat));
20049
20050 if (d != dst)
20051 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20052 return true;
20053 }
20054
20055 default:
20056 return false;
20057 }
20058 }
20059
20060 /* Expand an insert into a vector register through pinsr insn.
20061 Return true if successful. */
20062
20063 bool
20064 ix86_expand_pinsr (rtx *operands)
20065 {
20066 rtx dst = operands[0];
20067 rtx src = operands[3];
20068
20069 unsigned int size = INTVAL (operands[1]);
20070 unsigned int pos = INTVAL (operands[2]);
20071
20072 if (SUBREG_P (dst))
20073 {
20074 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20075 dst = SUBREG_REG (dst);
20076 }
20077
20078 switch (GET_MODE (dst))
20079 {
20080 case E_V16QImode:
20081 case E_V8HImode:
20082 case E_V4SImode:
20083 case E_V2DImode:
20084 case E_V1TImode:
20085 case E_TImode:
20086 {
20087 machine_mode srcmode, dstmode;
20088 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20089 rtx d;
20090
20091 if (!int_mode_for_size (size, 0).exists (&srcmode))
20092 return false;
20093
20094 switch (srcmode)
20095 {
20096 case E_QImode:
20097 if (!TARGET_SSE4_1)
20098 return false;
20099 dstmode = V16QImode;
20100 pinsr = gen_sse4_1_pinsrb;
20101 break;
20102
20103 case E_HImode:
20104 if (!TARGET_SSE2)
20105 return false;
20106 dstmode = V8HImode;
20107 pinsr = gen_sse2_pinsrw;
20108 break;
20109
20110 case E_SImode:
20111 if (!TARGET_SSE4_1)
20112 return false;
20113 dstmode = V4SImode;
20114 pinsr = gen_sse4_1_pinsrd;
20115 break;
20116
20117 case E_DImode:
20118 gcc_assert (TARGET_64BIT);
20119 if (!TARGET_SSE4_1)
20120 return false;
20121 dstmode = V2DImode;
20122 pinsr = gen_sse4_1_pinsrq;
20123 break;
20124
20125 default:
20126 return false;
20127 }
20128
20129 /* Reject insertions to misaligned positions. */
20130 if (pos & (size-1))
20131 return false;
20132
20133 if (SUBREG_P (src))
20134 {
20135 unsigned int srcpos = SUBREG_BYTE (src);
20136
20137 if (srcpos > 0)
20138 {
20139 rtx extr_ops[4];
20140
20141 extr_ops[0] = gen_reg_rtx (srcmode);
20142 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20143 extr_ops[2] = GEN_INT (size);
20144 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20145
20146 if (!ix86_expand_pextr (extr_ops))
20147 return false;
20148
20149 src = extr_ops[0];
20150 }
20151 else
20152 src = gen_lowpart (srcmode, SUBREG_REG (src));
20153 }
20154
20155 if (GET_MODE (dst) == dstmode)
20156 d = dst;
20157 else
20158 d = gen_reg_rtx (dstmode);
20159
20160 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20161 gen_lowpart (srcmode, src),
20162 GEN_INT (1 << (pos / size))));
20163 if (d != dst)
20164 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20165 return true;
20166 }
20167
20168 default:
20169 return false;
20170 }
20171 }
20172
20173 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
20174 of the upper against the lower halves until we are down to SSE register size. */
20175
20176 machine_mode
20177 ix86_split_reduction (machine_mode mode)
20178 {
20179 /* Reduce lowpart against highpart until we reach SSE reg width to
20180 avoid cross-lane operations. */
20181 switch (mode)
20182 {
20183 case E_V8DImode:
20184 case E_V4DImode:
20185 return V2DImode;
20186 case E_V16SImode:
20187 case E_V8SImode:
20188 return V4SImode;
20189 case E_V32HImode:
20190 case E_V16HImode:
20191 return V8HImode;
20192 case E_V64QImode:
20193 case E_V32QImode:
20194 return V16QImode;
20195 case E_V16SFmode:
20196 case E_V8SFmode:
20197 return V4SFmode;
20198 case E_V8DFmode:
20199 case E_V4DFmode:
20200 return V2DFmode;
20201 default:
20202 return mode;
20203 }
20204 }
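
/* Editorial sketch (not part of this file): the reduction shape that
   ix86_split_reduction above asks for, modelled on plain arrays; the
   helper name is hypothetical.  An 8-element vector is reduced by adding
   its high half onto its low half, after which the remaining 4-element
   (SSE-sized) reduction stays within a single 128-bit register.  */

static int
sum_v8si_split (const int v[8])
{
  int half[4];
  for (int i = 0; i < 4; ++i)
    half[i] = v[i] + v[i + 4];		/* lowpart against highpart  */
  /* The rest of the reduction now fits in one SSE register.  */
  return half[0] + half[1] + half[2] + half[3];
}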
20205
20206 /* Generate a call to __divmoddi4. */
20207
20208 void
20209 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20210 rtx op0, rtx op1,
20211 rtx *quot_p, rtx *rem_p)
20212 {
20213 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20214
20215 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20216 mode, op0, mode, op1, mode,
20217 XEXP (rem, 0), Pmode);
20218 *quot_p = quot;
20219 *rem_p = rem;
20220 }
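
/* Editorial sketch (not part of this file): the C-level shape of the
   libcall emitted by ix86_expand_divmod_libfunc for DImode, assuming the
   usual libgcc prototype for __divmoddi4 (quotient returned, remainder
   stored through the pointer argument); the wrapper name is
   hypothetical.  */

extern long long __divmoddi4 (long long, long long, long long *);

static void
divmod_di_model (long long a, long long b, long long *quot, long long *rem)
{
  long long r;
  *quot = __divmoddi4 (a, b, &r);	/* one call computes both results  */
  *rem = r;
}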
20221
20222 #include "gt-i386-expand.h"