1/* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3, or (at your option)
8any later version.
9
10GCC is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with GCC; see the file COPYING3. If not see
17<http://www.gnu.org/licenses/>. */
18
19#define IN_TARGET_CODE 1
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "backend.h"
25#include "rtl.h"
26#include "tree.h"
27#include "memmodel.h"
28#include "gimple.h"
29#include "cfghooks.h"
30#include "cfgloop.h"
31#include "df.h"
32#include "tm_p.h"
33#include "stringpool.h"
34#include "expmed.h"
35#include "optabs.h"
36#include "regs.h"
37#include "emit-rtl.h"
38#include "recog.h"
39#include "cgraph.h"
40#include "diagnostic.h"
41#include "cfgbuild.h"
42#include "alias.h"
43#include "fold-const.h"
44#include "attribs.h"
45#include "calls.h"
46#include "stor-layout.h"
47#include "varasm.h"
48#include "output.h"
49#include "insn-attr.h"
50#include "flags.h"
51#include "except.h"
52#include "explow.h"
53#include "expr.h"
54#include "cfgrtl.h"
55#include "common/common-target.h"
56#include "langhooks.h"
57#include "reload.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "tm-constrs.h"
61#include "cselib.h"
62#include "sched-int.h"
63#include "opts.h"
64#include "tree-pass.h"
65#include "context.h"
66#include "pass_manager.h"
67#include "target-globals.h"
68#include "gimple-iterator.h"
69#include "tree-vectorizer.h"
70#include "shrink-wrap.h"
71#include "builtins.h"
72#include "rtl-iter.h"
73#include "tree-iterator.h"
74#include "dbgcnt.h"
75#include "case-cfn-macros.h"
76#include "dojump.h"
77#include "fold-const-call.h"
78#include "tree-vrp.h"
79#include "tree-ssanames.h"
80#include "selftest.h"
81#include "selftest-rtl.h"
82#include "print-rtl.h"
83#include "intl.h"
84#include "ifcvt.h"
85#include "symbol-summary.h"
86#include "ipa-prop.h"
87#include "ipa-fnsummary.h"
88#include "wide-int-bitmask.h"
89#include "tree-vector-builder.h"
90#include "debug.h"
91#include "dwarf2out.h"
92#include "i386-options.h"
93#include "i386-builtins.h"
94#include "i386-expand.h"
95
96/* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
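/* Illustrative example (hypothetical caller, not taken from the callers
   in this file):

     rtx lo[1], hi[1];
     split_double_mode (TImode, &op, 1, lo, hi);

   For a TImode REG this yields (subreg:DI op 0) and (subreg:DI op 8);
   for an offsettable MEM it yields two adjusted DImode MEMs at byte
   offsets 0 and 8.  */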
102void
103split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105{
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157 hi_half[num] = simplify_gen_subreg (half_mode, op,
158 GET_MODE (op) == VOIDmode
159 ? mode : GET_MODE (op), byte);
160 }
161 }
162}
163
164/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165 for the target. */
166
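/* Illustrative sketch: when !TARGET_USE_MOV0 or when optimizing for
   size this emits
     (parallel [(set (reg) (const_int 0))
                (clobber (reg:CC flags))])
   i.e. "xor reg, reg"; otherwise a plain (set (reg) (const_int 0)),
   i.e. "mov $0, reg", is used.  */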
167void
168ix86_expand_clear (rtx dest)
169{
170 rtx tmp;
171
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed);
174
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177 dest = gen_rtx_REG (SImode, REGNO (dest));
178 tmp = gen_rtx_SET (dest, const0_rtx);
179
180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181 {
182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184 }
185
186 emit_insn (tmp);
187}
188
189void
190ix86_expand_move (machine_mode mode, rtx operands[])
191{
192 rtx op0, op1;
193 rtx tmp, addend = NULL_RTX;
194 enum tls_model model;
195
196 op0 = operands[0];
197 op1 = operands[1];
198
199 /* Avoid complex sets of likely spilled hard registers before reload. */
200 if (!ix86_hardreg_mov_ok (op0, op1))
201 {
202 tmp = gen_reg_rtx (mode);
203 operands[0] = tmp;
204 ix86_expand_move (mode, operands);
205 operands[0] = op0;
206 operands[1] = tmp;
207 op1 = tmp;
208 }
209
210 switch (GET_CODE (op1))
211 {
212 case CONST:
213 tmp = XEXP (op1, 0);
214
215 if (GET_CODE (tmp) != PLUS
216 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
217 break;
218
219 op1 = XEXP (tmp, 0);
220 addend = XEXP (tmp, 1);
221 /* FALLTHRU */
222
223 case SYMBOL_REF:
224 model = SYMBOL_REF_TLS_MODEL (op1);
225
226 if (model)
227 op1 = legitimize_tls_address (op1, model, true);
228 else if (ix86_force_load_from_GOT_p (op1))
229 {
230 /* Load the external function address via GOT slot to avoid PLT. */
231 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
232 (TARGET_64BIT
233 ? UNSPEC_GOTPCREL
234 : UNSPEC_GOT));
235 op1 = gen_rtx_CONST (Pmode, op1);
236 op1 = gen_const_mem (Pmode, op1);
237 set_mem_alias_set (op1, ix86_GOT_alias_set ());
238 }
239 else
240 {
241 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
242 if (tmp)
243 {
244 op1 = tmp;
245 if (!addend)
246 break;
247 }
248 else
249 {
250 op1 = operands[1];
251 break;
252 }
253 }
254
255 if (addend)
256 {
257 op1 = force_operand (op1, NULL_RTX);
258 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
259 op0, 1, OPTAB_DIRECT);
260 }
261 else
262 op1 = force_operand (op1, op0);
263
264 if (op1 == op0)
265 return;
266
267 op1 = convert_to_mode (mode, op1, 1);
268
269 default:
270 break;
271 }
272
273 if ((flag_pic || MACHOPIC_INDIRECT)
274 && symbolic_operand (op1, mode))
275 {
276 if (TARGET_MACHO && !TARGET_64BIT)
277 {
278#if TARGET_MACHO
279 /* dynamic-no-pic */
280 if (MACHOPIC_INDIRECT)
281 {
282 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
283 ? op0 : gen_reg_rtx (Pmode);
284 op1 = machopic_indirect_data_reference (op1, temp);
285 if (MACHOPIC_PURE)
286 op1 = machopic_legitimize_pic_address (op1, mode,
287 temp == op1 ? 0 : temp);
288 }
289 if (op0 != op1 && GET_CODE (op0) != MEM)
290 {
291 rtx insn = gen_rtx_SET (op0, op1);
292 emit_insn (insn);
293 return;
294 }
295 if (GET_CODE (op0) == MEM)
296 op1 = force_reg (Pmode, op1);
297 else
298 {
299 rtx temp = op0;
300 if (GET_CODE (temp) != REG)
301 temp = gen_reg_rtx (Pmode);
302 temp = legitimize_pic_address (op1, temp);
303 if (temp == op0)
304 return;
305 op1 = temp;
306 }
307 /* dynamic-no-pic */
308#endif
309 }
310 else
311 {
312 if (MEM_P (op0))
313 op1 = force_reg (mode, op1);
314 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
315 {
316 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
317 op1 = legitimize_pic_address (op1, reg);
318 if (op0 == op1)
319 return;
320 op1 = convert_to_mode (mode, op1, 1);
321 }
322 }
323 }
324 else
325 {
326 if (MEM_P (op0)
327 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
328 || !push_operand (op0, mode))
329 && MEM_P (op1))
330 op1 = force_reg (mode, op1);
331
332 if (push_operand (op0, mode)
333 && ! general_no_elim_operand (op1, mode))
334 op1 = copy_to_mode_reg (mode, op1);
335
336 /* Force large constants in 64bit compilation into register
337 to get them CSEed. */
338 if (can_create_pseudo_p ()
339 && (mode == DImode) && TARGET_64BIT
340 && immediate_operand (op1, mode)
341 && !x86_64_zext_immediate_operand (op1, VOIDmode)
342 && !register_operand (op0, mode)
343 && optimize)
344 op1 = copy_to_mode_reg (mode, op1);
345
346 if (can_create_pseudo_p ()
347 && CONST_DOUBLE_P (op1))
348 {
349 /* If we are loading a floating point constant to a register,
350 force the value to memory now, since we'll get better code
351 out the back end. */
352
353 op1 = validize_mem (force_const_mem (mode, op1));
354 if (!register_operand (op0, mode))
355 {
356 rtx temp = gen_reg_rtx (mode);
357 emit_insn (gen_rtx_SET (temp, op1));
358 emit_move_insn (op0, temp);
359 return;
360 }
361 }
362 }
363
364 emit_insn (gen_rtx_SET (op0, op1));
365}
366
367void
368ix86_expand_vector_move (machine_mode mode, rtx operands[])
369{
370 rtx op0 = operands[0], op1 = operands[1];
371 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
372 psABI since the biggest alignment is 4 bytes for IA MCU psABI. */
373 unsigned int align = (TARGET_IAMCU
374 ? GET_MODE_BITSIZE (mode)
375 : GET_MODE_ALIGNMENT (mode));
376
377 if (push_operand (op0, VOIDmode))
378 op0 = emit_move_resolve_push (mode, op0);
379
380 /* Force constants other than zero into memory. We do not know how
381 the instructions used to build constants modify the upper 64 bits
382 of the register; once we have that information we may be able
383 to handle some of them more efficiently. */
384 if (can_create_pseudo_p ()
385 && (CONSTANT_P (op1)
386 || (SUBREG_P (op1)
387 && CONSTANT_P (SUBREG_REG (op1))))
388 && ((register_operand (op0, mode)
389 && !standard_sse_constant_p (op1, mode))
390 /* ix86_expand_vector_move_misalign() does not like constants. */
391 || (SSE_REG_MODE_P (mode)
392 && MEM_P (op0)
393 && MEM_ALIGN (op0) < align)))
394 {
395 if (SUBREG_P (op1))
396 {
397 machine_mode imode = GET_MODE (SUBREG_REG (op1));
398 rtx r = force_const_mem (imode, SUBREG_REG (op1));
399 if (r)
400 r = validize_mem (r);
401 else
402 r = force_reg (imode, SUBREG_REG (op1));
403 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
404 }
405 else
406 op1 = validize_mem (force_const_mem (mode, op1));
407 }
408
409 /* We need to check memory alignment for SSE mode since attribute
410 can make operands unaligned. */
411 if (can_create_pseudo_p ()
412 && SSE_REG_MODE_P (mode)
413 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
414 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
415 {
416 rtx tmp[2];
417
418 /* ix86_expand_vector_move_misalign() does not like both
419 arguments in memory. */
420 if (!register_operand (op0, mode)
421 && !register_operand (op1, mode))
422 op1 = force_reg (mode, op1);
423
424 tmp[0] = op0; tmp[1] = op1;
425 ix86_expand_vector_move_misalign (mode, tmp);
426 return;
427 }
428
429 /* Make operand1 a register if it isn't already. */
430 if (can_create_pseudo_p ()
431 && !register_operand (op0, mode)
432 && !register_operand (op1, mode))
433 {
434 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
435 return;
436 }
437
438 emit_insn (gen_rtx_SET (op0, op1));
439}
440
441/* Split 32-byte AVX unaligned load and store if needed. */
442
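/* Illustrative sketch (assuming the corresponding split tuning flags
   are enabled): a misaligned 32-byte load becomes a 16-byte load of
   the low half followed by a VEC_CONCAT with the high half, and a
   misaligned 32-byte store becomes two vextractf128 stores to mem
   and mem+16.  */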
443static void
444ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
445{
446 rtx m;
447 rtx (*extract) (rtx, rtx, rtx);
448 machine_mode mode;
449
450 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
451 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
452 {
453 emit_insn (gen_rtx_SET (op0, op1));
454 return;
455 }
456
457 rtx orig_op0 = NULL_RTX;
458 mode = GET_MODE (op0);
459 switch (GET_MODE_CLASS (mode))
460 {
461 case MODE_VECTOR_INT:
462 case MODE_INT:
463 if (mode != V32QImode)
464 {
465 if (!MEM_P (op0))
466 {
467 orig_op0 = op0;
468 op0 = gen_reg_rtx (V32QImode);
469 }
470 else
471 op0 = gen_lowpart (V32QImode, op0);
472 op1 = gen_lowpart (V32QImode, op1);
473 mode = V32QImode;
474 }
475 break;
476 case MODE_VECTOR_FLOAT:
477 break;
478 default:
479 gcc_unreachable ();
480 }
481
482 switch (mode)
483 {
484 default:
485 gcc_unreachable ();
486 case E_V32QImode:
487 extract = gen_avx_vextractf128v32qi;
488 mode = V16QImode;
489 break;
490 case E_V8SFmode:
491 extract = gen_avx_vextractf128v8sf;
492 mode = V4SFmode;
493 break;
494 case E_V4DFmode:
495 extract = gen_avx_vextractf128v4df;
496 mode = V2DFmode;
497 break;
498 }
499
500 if (MEM_P (op1))
501 {
502 rtx r = gen_reg_rtx (mode);
503 m = adjust_address (op1, mode, 0);
504 emit_move_insn (r, m);
505 m = adjust_address (op1, mode, 16);
506 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
507 emit_move_insn (op0, r);
508 }
509 else if (MEM_P (op0))
510 {
511 m = adjust_address (op0, mode, 0);
512 emit_insn (extract (m, op1, const0_rtx));
513 m = adjust_address (op0, mode, 16);
514 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
515 }
516 else
517 gcc_unreachable ();
518
519 if (orig_op0)
520 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
521}
522
523/* Implement the movmisalign patterns for SSE. Non-SSE modes go
524 straight to ix86_expand_vector_move. */
525/* Code generation for scalar reg-reg moves of single and double precision data:
526 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
527 movaps reg, reg
528 else
529 movss reg, reg
530 if (x86_sse_partial_reg_dependency == true)
531 movapd reg, reg
532 else
533 movsd reg, reg
534
535 Code generation for scalar loads of double precision data:
536 if (x86_sse_split_regs == true)
537 movlpd mem, reg (gas syntax)
538 else
539 movsd mem, reg
540
541 Code generation for unaligned packed loads of single precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
543 if (x86_sse_unaligned_move_optimal)
544 movups mem, reg
545
546 if (x86_sse_partial_reg_dependency == true)
547 {
548 xorps reg, reg
549 movlps mem, reg
550 movhps mem+8, reg
551 }
552 else
553 {
554 movlps mem, reg
555 movhps mem+8, reg
556 }
557
558 Code generation for unaligned packed loads of double precision data
559 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
560 if (x86_sse_unaligned_move_optimal)
561 movupd mem, reg
562
563 if (x86_sse_split_regs == true)
564 {
565 movlpd mem, reg
566 movhpd mem+8, reg
567 }
568 else
569 {
570 movsd mem, reg
571 movhpd mem+8, reg
572 }
573 */
574
575void
576ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
577{
578 rtx op0, op1, m;
579
580 op0 = operands[0];
581 op1 = operands[1];
582
583 /* Use unaligned load/store for AVX512 or when optimizing for size. */
584 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
585 {
586 emit_insn (gen_rtx_SET (op0, op1));
587 return;
588 }
589
590 if (TARGET_AVX)
591 {
592 if (GET_MODE_SIZE (mode) == 32)
593 ix86_avx256_split_vector_move_misalign (op0, op1);
594 else
595 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
596 emit_insn (gen_rtx_SET (op0, op1));
597 return;
598 }
599
600 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
601 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
602 {
603 emit_insn (gen_rtx_SET (op0, op1));
604 return;
605 }
606
607 /* ??? If we have typed data, then it would appear that using
608 movdqu is the only way to get unaligned data loaded with
609 integer type. */
610 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
611 {
612 emit_insn (gen_rtx_SET (op0, op1));
613 return;
614 }
615
616 if (MEM_P (op1))
617 {
618 if (TARGET_SSE2 && mode == V2DFmode)
619 {
620 rtx zero;
621
622 /* When SSE registers are split into halves, we can avoid
623 writing to the top half twice. */
624 if (TARGET_SSE_SPLIT_REGS)
625 {
626 emit_clobber (op0);
627 zero = op0;
628 }
629 else
630 {
631 /* ??? Not sure about the best option for the Intel chips.
632 The following would seem to satisfy; the register is
633 entirely cleared, breaking the dependency chain. We
634 then store to the upper half, with a dependency depth
635 of one. A rumor has it that Intel recommends two movsd
636 followed by an unpacklpd, but this is unconfirmed. And
637 given that the dependency depth of the unpacklpd would
638 still be one, I'm not sure why this would be better. */
639 zero = CONST0_RTX (V2DFmode);
640 }
641
642 m = adjust_address (op1, DFmode, 0);
643 emit_insn (gen_sse2_loadlpd (op0, zero, m));
644 m = adjust_address (op1, DFmode, 8);
645 emit_insn (gen_sse2_loadhpd (op0, op0, m));
646 }
647 else
648 {
649 rtx t;
650
651 if (mode != V4SFmode)
652 t = gen_reg_rtx (V4SFmode);
653 else
654 t = op0;
655
656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
657 emit_move_insn (t, CONST0_RTX (V4SFmode));
658 else
659 emit_clobber (t);
660
661 m = adjust_address (op1, V2SFmode, 0);
662 emit_insn (gen_sse_loadlps (t, t, m));
663 m = adjust_address (op1, V2SFmode, 8);
664 emit_insn (gen_sse_loadhps (t, t, m));
665 if (mode != V4SFmode)
666 emit_move_insn (op0, gen_lowpart (mode, t));
667 }
668 }
669 else if (MEM_P (op0))
670 {
671 if (TARGET_SSE2 && mode == V2DFmode)
672 {
673 m = adjust_address (op0, DFmode, 0);
674 emit_insn (gen_sse2_storelpd (m, op1));
675 m = adjust_address (op0, DFmode, 8);
676 emit_insn (gen_sse2_storehpd (m, op1));
677 }
678 else
679 {
680 if (mode != V4SFmode)
681 op1 = gen_lowpart (V4SFmode, op1);
682
683 m = adjust_address (op0, V2SFmode, 0);
684 emit_insn (gen_sse_storelps (m, op1));
685 m = adjust_address (op0, V2SFmode, 8);
686 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
687 }
688 }
689 else
690 gcc_unreachable ();
691}
692
693/* Move bits 64:95 to bits 32:63. */
694
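/* Illustrative sketch: the vec_select below uses the permutation
   { 0, 2, 0, 0 } on a V4SImode view of OP, so element 2 (bits 64:95)
   is copied into element 1 (bits 32:63); the two upper elements are
   don't-care for the 64-bit MMX result.  */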
695void
696ix86_move_vector_high_sse_to_mmx (rtx op)
697{
698 rtx mask = gen_rtx_PARALLEL (VOIDmode,
699 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
700 GEN_INT (0), GEN_INT (0)));
701 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
702 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
703 rtx insn = gen_rtx_SET (dest, op);
704 emit_insn (insn);
705}
706
707/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
708
709void
710ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
711{
712 rtx op0 = operands[0];
713 rtx op1 = operands[1];
714 rtx op2 = operands[2];
715
716 machine_mode dmode = GET_MODE (op0);
717 machine_mode smode = GET_MODE (op1);
718 machine_mode inner_dmode = GET_MODE_INNER (dmode);
719 machine_mode inner_smode = GET_MODE_INNER (smode);
720
721 /* Get the corresponding SSE mode for destination. */
722 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
723 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
724 nunits).require ();
725 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
726 nunits / 2).require ();
727
728 /* Get the corresponding SSE mode for source. */
729 nunits = 16 / GET_MODE_SIZE (inner_smode);
730 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
731 nunits).require ();
732
733 /* Generate SSE pack with signed/unsigned saturation. */
734 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
735 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
736 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
737
738 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
739 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
740 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
741 op1, op2));
742 emit_insn (insn);
743
744 ix86_move_vector_high_sse_to_mmx (op0);
745}
746
747/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
748
749void
750ix86_split_mmx_punpck (rtx operands[], bool high_p)
751{
752 rtx op0 = operands[0];
753 rtx op1 = operands[1];
754 rtx op2 = operands[2];
755 machine_mode mode = GET_MODE (op0);
756 rtx mask;
757 /* The corresponding SSE mode. */
758 machine_mode sse_mode, double_sse_mode;
759
760 switch (mode)
761 {
762 case E_V8QImode:
763 sse_mode = V16QImode;
764 double_sse_mode = V32QImode;
765 mask = gen_rtx_PARALLEL (VOIDmode,
766 gen_rtvec (16,
767 GEN_INT (0), GEN_INT (16),
768 GEN_INT (1), GEN_INT (17),
769 GEN_INT (2), GEN_INT (18),
770 GEN_INT (3), GEN_INT (19),
771 GEN_INT (4), GEN_INT (20),
772 GEN_INT (5), GEN_INT (21),
773 GEN_INT (6), GEN_INT (22),
774 GEN_INT (7), GEN_INT (23)));
775 break;
776
777 case E_V4HImode:
778 sse_mode = V8HImode;
779 double_sse_mode = V16HImode;
780 mask = gen_rtx_PARALLEL (VOIDmode,
781 gen_rtvec (8,
782 GEN_INT (0), GEN_INT (8),
783 GEN_INT (1), GEN_INT (9),
784 GEN_INT (2), GEN_INT (10),
785 GEN_INT (3), GEN_INT (11)));
786 break;
787
788 case E_V2SImode:
789 sse_mode = V4SImode;
790 double_sse_mode = V8SImode;
791 mask = gen_rtx_PARALLEL (VOIDmode,
792 gen_rtvec (4,
793 GEN_INT (0), GEN_INT (4),
794 GEN_INT (1), GEN_INT (5)));
795 break;
796
797 default:
798 gcc_unreachable ();
799 }
800
801 /* Generate SSE punpcklXX. */
802 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
803 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
804 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
805
806 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
807 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
808 rtx insn = gen_rtx_SET (dest, op2);
809 emit_insn (insn);
810
811 if (high_p)
812 {
813 /* Move bits 64:127 to bits 0:63. */
814 mask = gen_rtx_PARALLEL (VOIDmode,
815 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
816 GEN_INT (0), GEN_INT (0)));
817 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
818 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
819 insn = gen_rtx_SET (dest, op1);
820 emit_insn (insn);
821 }
822}
823
824/* Helper function of ix86_fixup_binary_operands to canonicalize
825 operand order. Returns true if the operands should be swapped. */
826
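/* Illustrative example: for dst = src1 + src2 with dst == src2, e.g.
   (plus (mem) (reg)) where the reg is also the destination, swapping
   makes src1 match dst so the two-address "add mem, reg" form can be
   used; immediates and memory references are likewise pushed into the
   second position.  */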
827static bool
828ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
829 rtx operands[])
830{
831 rtx dst = operands[0];
832 rtx src1 = operands[1];
833 rtx src2 = operands[2];
834
835 /* If the operation is not commutative, we can't do anything. */
836 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
837 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
838 return false;
839
840 /* Highest priority is that src1 should match dst. */
841 if (rtx_equal_p (dst, src1))
842 return false;
843 if (rtx_equal_p (dst, src2))
844 return true;
845
846 /* Next highest priority is that immediate constants come second. */
847 if (immediate_operand (src2, mode))
848 return false;
849 if (immediate_operand (src1, mode))
850 return true;
851
852 /* Lowest priority is that memory references should come second. */
853 if (MEM_P (src2))
854 return false;
855 if (MEM_P (src1))
856 return true;
857
858 return false;
859}
860
861
862/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
863 destination to use for the operation. If different from the true
864 destination in operands[0], a copy operation will be required. */
865
866rtx
867ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
868 rtx operands[])
869{
870 rtx dst = operands[0];
871 rtx src1 = operands[1];
872 rtx src2 = operands[2];
873
874 /* Canonicalize operand order. */
875 if (ix86_swap_binary_operands_p (code, mode, operands))
876 {
877 /* It is invalid to swap operands of different modes. */
878 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
879
880 std::swap (src1, src2);
881 }
882
883 /* Both source operands cannot be in memory. */
884 if (MEM_P (src1) && MEM_P (src2))
885 {
886 /* Optimization: Only read from memory once. */
887 if (rtx_equal_p (src1, src2))
888 {
889 src2 = force_reg (mode, src2);
890 src1 = src2;
891 }
892 else if (rtx_equal_p (dst, src1))
893 src2 = force_reg (mode, src2);
894 else
895 src1 = force_reg (mode, src1);
896 }
897
898 /* If the destination is memory, and we do not have matching source
899 operands, do things in registers. */
900 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
901 dst = gen_reg_rtx (mode);
902
903 /* Source 1 cannot be a constant. */
904 if (CONSTANT_P (src1))
905 src1 = force_reg (mode, src1);
906
907 /* Source 1 cannot be a non-matching memory. */
908 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
909 src1 = force_reg (mode, src1);
910
911 /* Improve address combine. */
912 if (code == PLUS
913 && GET_MODE_CLASS (mode) == MODE_INT
914 && MEM_P (src2))
915 src2 = force_reg (mode, src2);
916
917 operands[1] = src1;
918 operands[2] = src2;
919 return dst;
920}
921
922/* Similarly, but assume that the destination has already been
923 set up properly. */
924
925void
926ix86_fixup_binary_operands_no_copy (enum rtx_code code,
927 machine_mode mode, rtx operands[])
928{
929 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
930 gcc_assert (dst == operands[0]);
931}
932
933/* Attempt to expand a binary operator. Make the expansion closer to the
934 actual machine than just general_operand, which will allow 3 separate
935 memory references (one output, two inputs) in a single insn. */
936
937void
938ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
939 rtx operands[])
940{
941 rtx src1, src2, dst, op, clob;
942
943 dst = ix86_fixup_binary_operands (code, mode, operands);
944 src1 = operands[1];
945 src2 = operands[2];
946
947 /* Emit the instruction. */
948
949 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
950
951 if (reload_completed
952 && code == PLUS
953 && !rtx_equal_p (dst, src1))
954 {
955 /* This is going to be an LEA; avoid splitting it later. */
956 emit_insn (op);
957 }
958 else
959 {
960 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
961 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
962 }
963
964 /* Fix up the destination if needed. */
965 if (dst != operands[0])
966 emit_move_insn (operands[0], dst);
967}
968
969/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
970 the given OPERANDS. */
971
972void
973ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
974 rtx operands[])
975{
976 rtx op1 = NULL_RTX, op2 = NULL_RTX;
977 if (SUBREG_P (operands[1]))
978 {
979 op1 = operands[1];
980 op2 = operands[2];
981 }
982 else if (SUBREG_P (operands[2]))
983 {
984 op1 = operands[2];
985 op2 = operands[1];
986 }
987 /* Optimize (__m128i) d | (__m128i) e and similar code
988 when d and e are float vectors into float vector logical
989 insn. In C/C++ without using intrinsics there is no other way
990 to express vector logical operation on float vectors than
991 to cast them temporarily to integer vectors. */
992 if (op1
993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
994 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
997 && SUBREG_BYTE (op1) == 0
998 && (GET_CODE (op2) == CONST_VECTOR
999 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1000 && SUBREG_BYTE (op2) == 0))
1001 && can_create_pseudo_p ())
1002 {
1003 rtx dst;
1004 switch (GET_MODE (SUBREG_REG (op1)))
1005 {
1006 case E_V4SFmode:
1007 case E_V8SFmode:
1008 case E_V16SFmode:
1009 case E_V2DFmode:
1010 case E_V4DFmode:
1011 case E_V8DFmode:
1012 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1013 if (GET_CODE (op2) == CONST_VECTOR)
1014 {
1015 op2 = gen_lowpart (GET_MODE (dst), op2);
1016 op2 = force_reg (GET_MODE (dst), op2);
1017 }
1018 else
1019 {
1020 op1 = operands[1];
1021 op2 = SUBREG_REG (operands[2]);
1022 if (!vector_operand (op2, GET_MODE (dst)))
1023 op2 = force_reg (GET_MODE (dst), op2);
1024 }
1025 op1 = SUBREG_REG (op1);
1026 if (!vector_operand (op1, GET_MODE (dst)))
1027 op1 = force_reg (GET_MODE (dst), op1);
1028 emit_insn (gen_rtx_SET (dst,
1029 gen_rtx_fmt_ee (code, GET_MODE (dst),
1030 op1, op2)));
1031 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1032 return;
1033 default:
1034 break;
1035 }
1036 }
1037 if (!vector_operand (operands[1], mode))
1038 operands[1] = force_reg (mode, operands[1]);
1039 if (!vector_operand (operands[2], mode))
1040 operands[2] = force_reg (mode, operands[2]);
1041 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1042 emit_insn (gen_rtx_SET (operands[0],
1043 gen_rtx_fmt_ee (code, mode, operands[1],
1044 operands[2])));
1045}
1046
1047/* Return TRUE or FALSE depending on whether the binary operator meets the
1048 appropriate constraints. */
1049
1050bool
1051ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1052 rtx operands[3])
1053{
1054 rtx dst = operands[0];
1055 rtx src1 = operands[1];
1056 rtx src2 = operands[2];
1057
1058 /* Both source operands cannot be in memory. */
1059 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1060 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1061 return false;
1062
1063 /* Canonicalize operand order for commutative operators. */
1064 if (ix86_swap_binary_operands_p (code, mode, operands))
1065 std::swap (src1, src2);
1066
1067 /* If the destination is memory, we must have a matching source operand. */
1068 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1069 return false;
1070
1071 /* Source 1 cannot be a constant. */
1072 if (CONSTANT_P (src1))
1073 return false;
1074
1075 /* Source 1 cannot be a non-matching memory. */
1076 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1077 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1078 return (code == AND
1079 && (mode == HImode
1080 || mode == SImode
1081 || (TARGET_64BIT && mode == DImode))
1082 && satisfies_constraint_L (src2));
1083
1084 return true;
1085}
1086
1087/* Attempt to expand a unary operator. Make the expansion closer to the
1088 actual machine than just general_operand, which will allow 2 separate
1089 memory references (one output, one input) in a single insn. */
1090
1091void
1092ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1093 rtx operands[])
1094{
1095 bool matching_memory = false;
1096 rtx src, dst, op, clob;
1097
1098 dst = operands[0];
1099 src = operands[1];
1100
1101 /* If the destination is memory, and we do not have matching source
1102 operands, do things in registers. */
1103 if (MEM_P (dst))
1104 {
1105 if (rtx_equal_p (dst, src))
1106 matching_memory = true;
1107 else
1108 dst = gen_reg_rtx (mode);
1109 }
1110
1111 /* When source operand is memory, destination must match. */
1112 if (MEM_P (src) && !matching_memory)
1113 src = force_reg (mode, src);
1114
1115 /* Emit the instruction. */
1116
1117 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1118
1119 if (code == NOT)
1120 emit_insn (op);
1121 else
1122 {
1123 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1124 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1125 }
1126
1127 /* Fix up the destination if needed. */
1128 if (dst != operands[0])
1129 emit_move_insn (operands[0], dst);
1130}
1131
1132/* Predict just emitted jump instruction to be taken with probability PROB. */
1133
1134static void
1135predict_jump (int prob)
1136{
1137 rtx_insn *insn = get_last_insn ();
1138 gcc_assert (JUMP_P (insn));
1139 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1140}
1141
1142/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1143 divisor are within the range [0-255]. */
1144
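/* Illustrative sketch of the emitted sequence (operands[2] is the
   dividend, operands[3] the divisor):

       or      dividend, divisor -> scratch
       test    $-0x100, scratch          ; any bit above 7 set?
       je      .Lqimode                  ; no: both fit in 8 bits
       <full 32/64-bit div/idiv>
       jmp     .Lend
     .Lqimode:
       <8-bit unsigned divide; quotient in AL, remainder in AH>
     .Lend:                                                        */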
1145void
1146ix86_split_idivmod (machine_mode mode, rtx operands[],
1147 bool unsigned_p)
1148{
1149 rtx_code_label *end_label, *qimode_label;
1150 rtx div, mod;
1151 rtx_insn *insn;
1152 rtx scratch, tmp0, tmp1, tmp2;
1153 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1154
1155 switch (mode)
1156 {
1157 case E_SImode:
1158 if (GET_MODE (operands[0]) == SImode)
1159 {
1160 if (GET_MODE (operands[1]) == SImode)
1161 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1162 else
1163 gen_divmod4_1
1164 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1165 }
1166 else
1167 gen_divmod4_1
1168 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1169 break;
1170
1171 case E_DImode:
1172 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1173 break;
1174
1175 default:
1176 gcc_unreachable ();
1177 }
1178
1179 end_label = gen_label_rtx ();
1180 qimode_label = gen_label_rtx ();
1181
1182 scratch = gen_reg_rtx (mode);
1183
1184 /* Use 8-bit unsigned divmod if dividend and divisor are within
1185 the range [0-255]. */
1186 emit_move_insn (scratch, operands[2]);
1187 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1188 scratch, 1, OPTAB_DIRECT);
1189 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1190 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1191 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1192 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1193 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1194 pc_rtx);
1195 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1196 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1197 JUMP_LABEL (insn) = qimode_label;
1198
1199 /* Generate original signed/unsigned divmod. */
1200 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1201 operands[2], operands[3]));
1202
1203 /* Branch to the end. */
1204 emit_jump_insn (gen_jump (end_label));
1205 emit_barrier ();
1206
1207 /* Generate 8bit unsigned divide. */
1208 emit_label (qimode_label);
1209 /* Don't use operands[0] for result of 8bit divide since not all
1210 registers support QImode ZERO_EXTRACT. */
1211 tmp0 = lowpart_subreg (HImode, scratch, mode);
1212 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1213 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1214 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1215
1216 if (unsigned_p)
1217 {
1218 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1219 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1220 }
1221 else
1222 {
1223 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1224 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1225 }
1226 if (mode == SImode)
1227 {
1228 if (GET_MODE (operands[0]) != SImode)
1229 div = gen_rtx_ZERO_EXTEND (DImode, div);
1230 if (GET_MODE (operands[1]) != SImode)
1231 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1232 }
1233
1234 /* Extract remainder from AH. */
1235 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1236 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1237 GEN_INT (8), GEN_INT (8));
1238 insn = emit_move_insn (operands[1], tmp1);
1239 set_unique_reg_note (insn, REG_EQUAL, mod);
1240
1241 /* Zero extend quotient from AL. */
1242 tmp1 = gen_lowpart (QImode, tmp0);
1243 insn = emit_insn (gen_extend_insn
1244 (operands[0], tmp1,
1245 GET_MODE (operands[0]), QImode, 1));
1246 set_unique_reg_note (insn, REG_EQUAL, div);
1247
1248 emit_label (end_label);
1249}
1250
1251/* Emit x86 binary operand CODE in mode MODE, where the first operand
1252 matches destination. RTX includes clobber of FLAGS_REG. */
1253
1254void
1255ix86_emit_binop (enum rtx_code code, machine_mode mode,
1256 rtx dst, rtx src)
1257{
1258 rtx op, clob;
1259
1260 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1261 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1262
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1264}
1265
1266/* Return true if regno1 def is nearest to the insn. */
1267
1268static bool
1269find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1270{
1271 rtx_insn *prev = insn;
1272 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1273
1274 if (insn == start)
1275 return false;
1276 while (prev && prev != start)
1277 {
1278 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1279 {
1280 prev = PREV_INSN (prev);
1281 continue;
1282 }
1283 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1284 return true;
1285 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1286 return false;
1287 prev = PREV_INSN (prev);
1288 }
1289
1290 /* None of the regs is defined in the bb. */
1291 return false;
1292}
1293
1294/* Split lea instructions into a sequence of instructions
1295 which are executed on ALU to avoid AGU stalls.
1296 It is assumed that it is allowed to clobber flags register
1297 at lea position. */
1298
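/* Illustrative example: "lea 4(%rbx,%rcx,4), %rax", i.e.
   rax = rbx + 4*rcx + 4, is split into roughly
       mov  %rcx, %rax
       shl  $2, %rax
       add  %rbx, %rax
       add  $4, %rax
   where each ALU instruction clobbers the flags register.  */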
1299void
1300ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1301{
1302 unsigned int regno0, regno1, regno2;
1303 struct ix86_address parts;
1304 rtx target, tmp;
1305 int ok, adds;
1306
1307 ok = ix86_decompose_address (operands[1], &parts);
1308 gcc_assert (ok);
1309
1310 target = gen_lowpart (mode, operands[0]);
1311
1312 regno0 = true_regnum (target);
1313 regno1 = INVALID_REGNUM;
1314 regno2 = INVALID_REGNUM;
1315
1316 if (parts.base)
1317 {
1318 parts.base = gen_lowpart (mode, parts.base);
1319 regno1 = true_regnum (parts.base);
1320 }
1321
1322 if (parts.index)
1323 {
1324 parts.index = gen_lowpart (mode, parts.index);
1325 regno2 = true_regnum (parts.index);
1326 }
1327
1328 if (parts.disp)
1329 parts.disp = gen_lowpart (mode, parts.disp);
1330
1331 if (parts.scale > 1)
1332 {
1333 /* Case r1 = r1 + ... */
1334 if (regno1 == regno0)
1335 {
1336 /* If we have a case r1 = r1 + C * r2 then we
1337 should use multiplication which is very
1338 expensive. Assume cost model is wrong if we
1339 have such case here. */
1340 gcc_assert (regno2 != regno0);
1341
1342 for (adds = parts.scale; adds > 0; adds--)
1343 ix86_emit_binop (PLUS, mode, target, parts.index);
1344 }
1345 else
1346 {
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0 != regno2)
1349 emit_insn (gen_rtx_SET (target, parts.index));
1350
1351 /* Use shift for scaling. */
1352 ix86_emit_binop (ASHIFT, mode, target,
1353 GEN_INT (exact_log2 (parts.scale)));
1354
1355 if (parts.base)
1356 ix86_emit_binop (PLUS, mode, target, parts.base);
1357
1358 if (parts.disp && parts.disp != const0_rtx)
1359 ix86_emit_binop (PLUS, mode, target, parts.disp);
1360 }
1361 }
1362 else if (!parts.base && !parts.index)
1363 {
1364 gcc_assert(parts.disp);
1365 emit_insn (gen_rtx_SET (target, parts.disp));
1366 }
1367 else
1368 {
1369 if (!parts.base)
1370 {
1371 if (regno0 != regno2)
1372 emit_insn (gen_rtx_SET (target, parts.index));
1373 }
1374 else if (!parts.index)
1375 {
1376 if (regno0 != regno1)
1377 emit_insn (gen_rtx_SET (target, parts.base));
1378 }
1379 else
1380 {
1381 if (regno0 == regno1)
1382 tmp = parts.index;
1383 else if (regno0 == regno2)
1384 tmp = parts.base;
1385 else
1386 {
1387 rtx tmp1;
1388
1389 /* Find better operand for SET instruction, depending
1390 on which definition is farther from the insn. */
1391 if (find_nearest_reg_def (insn, regno1, regno2))
1392 tmp = parts.index, tmp1 = parts.base;
1393 else
1394 tmp = parts.base, tmp1 = parts.index;
1395
1396 emit_insn (gen_rtx_SET (target, tmp));
1397
1398 if (parts.disp && parts.disp != const0_rtx)
1399 ix86_emit_binop (PLUS, mode, target, parts.disp);
1400
1401 ix86_emit_binop (PLUS, mode, target, tmp1);
1402 return;
1403 }
1404
1405 ix86_emit_binop (PLUS, mode, target, tmp);
1406 }
1407
1408 if (parts.disp && parts.disp != const0_rtx)
1409 ix86_emit_binop (PLUS, mode, target, parts.disp);
1410 }
1411}
1412
1413/* Post-reload splitter for converting an SF or DFmode value in an
1414 SSE register into an unsigned SImode. */
1415
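/* Illustrative sketch of the algorithm, for a value V in [0, 2^32):
     large         = (V >= 2^31) ? all-ones : 0     (mask compare)
     zero_or_two31 = large & 2^31
     V            -= zero_or_two31                  (now V < 2^31)
     result        = (int) V ^ (large << 31)        (restore bit 31)
   so small values take the plain signed conversion and large ones get
   bit 31 XORed back in afterwards.  */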
1416void
1417ix86_split_convert_uns_si_sse (rtx operands[])
1418{
1419 machine_mode vecmode;
1420 rtx value, large, zero_or_two31, input, two31, x;
1421
1422 large = operands[1];
1423 zero_or_two31 = operands[2];
1424 input = operands[3];
1425 two31 = operands[4];
1426 vecmode = GET_MODE (large);
1427 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1428
1429 /* Load up the value into the low element. We must ensure that the other
1430 elements are valid floats -- zero is the easiest such value. */
1431 if (MEM_P (input))
1432 {
1433 if (vecmode == V4SFmode)
1434 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1435 else
1436 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1437 }
1438 else
1439 {
1440 input = gen_rtx_REG (vecmode, REGNO (input));
1441 emit_move_insn (value, CONST0_RTX (vecmode));
1442 if (vecmode == V4SFmode)
1443 emit_insn (gen_sse_movss (value, value, input));
1444 else
1445 emit_insn (gen_sse2_movsd (value, value, input));
1446 }
1447
1448 emit_move_insn (large, two31);
1449 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1450
1451 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1452 emit_insn (gen_rtx_SET (large, x));
1453
1454 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1455 emit_insn (gen_rtx_SET (zero_or_two31, x));
1456
1457 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1458 emit_insn (gen_rtx_SET (value, x));
1459
1460 large = gen_rtx_REG (V4SImode, REGNO (large));
1461 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1462
1463 x = gen_rtx_REG (V4SImode, REGNO (value));
1464 if (vecmode == V4SFmode)
1465 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1466 else
1467 emit_insn (gen_sse2_cvttpd2dq (x, value));
1468 value = x;
1469
1470 emit_insn (gen_xorv4si3 (value, value, large));
1471}
1472
1473static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1474 machine_mode mode, rtx target,
1475 rtx var, int one_var);
1476
1477/* Convert an unsigned DImode value into a DFmode, using only SSE.
1478 Expects the 64-bit DImode to be supplied in a pair of integral
1479 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1480 -mfpmath=sse, !optimize_size only. */
1481
1482void
1483ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1484{
1485 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1486 rtx int_xmm, fp_xmm;
1487 rtx biases, exponents;
1488 rtx x;
1489
1490 int_xmm = gen_reg_rtx (V4SImode);
1491 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1492 emit_insn (gen_movdi_to_sse (int_xmm, input));
1493 else if (TARGET_SSE_SPLIT_REGS)
1494 {
1495 emit_clobber (int_xmm);
1496 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1497 }
1498 else
1499 {
1500 x = gen_reg_rtx (V2DImode);
1501 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1502 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1503 }
1504
1505 x = gen_rtx_CONST_VECTOR (V4SImode,
1506 gen_rtvec (4, GEN_INT (0x43300000UL),
1507 GEN_INT (0x45300000UL),
1508 const0_rtx, const0_rtx));
1509 exponents = validize_mem (force_const_mem (V4SImode, x));
1510
1511 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1512 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1513
1514 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1515 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1516 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1517 (0x1.0p84 + double(fp_value_hi_xmm)).
1518 Note these exponents differ by 32. */
1519
1520 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1521
1522 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1523 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1524 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1525 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1526 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1527 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1528 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1529 biases = validize_mem (force_const_mem (V2DFmode, biases));
1530 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1531
1532 /* Add the upper and lower DFmode values together. */
1533 if (TARGET_SSE3)
1534 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1535 else
1536 {
1537 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1538 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1539 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1540 }
1541
1542 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1543}
1544
1545/* Not used, but eases macroization of patterns. */
1546void
1547ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1548{
1549 gcc_unreachable ();
1550}
1551
1552/* Convert an unsigned SImode value into a DFmode. Only currently used
1553 for SSE, but applicable anywhere. */
1554
1555void
1556ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1557{
1558 REAL_VALUE_TYPE TWO31r;
1559 rtx x, fp;
1560
1561 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1562 NULL, 1, OPTAB_DIRECT);
1563
1564 fp = gen_reg_rtx (DFmode);
1565 emit_insn (gen_floatsidf2 (fp, x));
1566
1567 real_ldexp (&TWO31r, &dconst1, 31);
1568 x = const_double_from_real_value (TWO31r, DFmode);
1569
1570 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1571 if (x != target)
1572 emit_move_insn (target, x);
1573}
1574
1575/* Convert a signed DImode value into a DFmode. Only used for SSE in
1576 32-bit mode; otherwise we have a direct convert instruction. */
1577
1578void
1579ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1580{
1581 REAL_VALUE_TYPE TWO32r;
1582 rtx fp_lo, fp_hi, x;
1583
1584 fp_lo = gen_reg_rtx (DFmode);
1585 fp_hi = gen_reg_rtx (DFmode);
1586
1587 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1588
1589 real_ldexp (&TWO32r, &dconst1, 32);
1590 x = const_double_from_real_value (TWO32r, DFmode);
1591 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1592
1593 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1594
1595 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1596 0, OPTAB_DIRECT);
1597 if (x != target)
1598 emit_move_insn (target, x);
1599}
1600
1601/* Convert an unsigned SImode value into a SFmode, using only SSE.
1602 For x86_32, -mfpmath=sse, !optimize_size only. */
1603void
1604ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1605{
1606 REAL_VALUE_TYPE ONE16r;
1607 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1608
1609 real_ldexp (&ONE16r, &dconst1, 16);
1610 x = const_double_from_real_value (ONE16r, SFmode);
1611 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1612 NULL, 0, OPTAB_DIRECT);
1613 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1614 NULL, 0, OPTAB_DIRECT);
1615 fp_hi = gen_reg_rtx (SFmode);
1616 fp_lo = gen_reg_rtx (SFmode);
1617 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1618 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1619 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1620 0, OPTAB_DIRECT);
1621 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1622 0, OPTAB_DIRECT);
1623 if (!rtx_equal_p (target, fp_hi))
1624 emit_move_insn (target, fp_hi);
1625}
1626
1627/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1628 a vector of unsigned ints VAL to vector of floats TARGET. */
1629
1630void
1631ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1632{
1633 rtx tmp[8];
1634 REAL_VALUE_TYPE TWO16r;
1635 machine_mode intmode = GET_MODE (val);
1636 machine_mode fltmode = GET_MODE (target);
1637 rtx (*cvt) (rtx, rtx);
1638
1639 if (intmode == V4SImode)
1640 cvt = gen_floatv4siv4sf2;
1641 else
1642 cvt = gen_floatv8siv8sf2;
1643 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1644 tmp[0] = force_reg (intmode, tmp[0]);
1645 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1646 OPTAB_DIRECT);
1647 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1648 NULL_RTX, 1, OPTAB_DIRECT);
1649 tmp[3] = gen_reg_rtx (fltmode);
1650 emit_insn (cvt (tmp[3], tmp[1]));
1651 tmp[4] = gen_reg_rtx (fltmode);
1652 emit_insn (cvt (tmp[4], tmp[2]));
1653 real_ldexp (&TWO16r, &dconst1, 16);
1654 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1655 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1656 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1657 OPTAB_DIRECT);
1658 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1659 OPTAB_DIRECT);
1660 if (tmp[7] != target)
1661 emit_move_insn (target, tmp[7]);
1662}
1663
1664/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1665 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1666 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1667 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1668
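/* Illustrative sketch: for each lane x,
     x <  0x1p31  ->  result lane is x,          *XORP lane is 0
     x >= 0x1p31  ->  result lane is x - 0x1p31, *XORP lane is 0x80000000
   so the caller can apply the signed fix_trunc pattern to the result
   and XOR the returned mask back in.  */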
1669rtx
1670ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1671{
1672 REAL_VALUE_TYPE TWO31r;
1673 rtx two31r, tmp[4];
1674 machine_mode mode = GET_MODE (val);
1675 machine_mode scalarmode = GET_MODE_INNER (mode);
1676 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1677 rtx (*cmp) (rtx, rtx, rtx, rtx);
1678 int i;
1679
1680 for (i = 0; i < 3; i++)
1681 tmp[i] = gen_reg_rtx (mode);
1682 real_ldexp (&TWO31r, &dconst1, 31);
1683 two31r = const_double_from_real_value (TWO31r, scalarmode);
1684 two31r = ix86_build_const_vector (mode, 1, two31r);
1685 two31r = force_reg (mode, two31r);
1686 switch (mode)
1687 {
1688 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1689 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1690 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1691 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1692 default: gcc_unreachable ();
1693 }
1694 tmp[3] = gen_rtx_LE (mode, two31r, val);
1695 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1696 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1697 0, OPTAB_DIRECT);
1698 if (intmode == V4SImode || TARGET_AVX2)
1699 *xorp = expand_simple_binop (intmode, ASHIFT,
1700 gen_lowpart (intmode, tmp[0]),
1701 GEN_INT (31), NULL_RTX, 0,
1702 OPTAB_DIRECT);
1703 else
1704 {
1705 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1706 two31 = ix86_build_const_vector (intmode, 1, two31);
1707 *xorp = expand_simple_binop (intmode, AND,
1708 gen_lowpart (intmode, tmp[0]),
1709 two31, NULL_RTX, 0,
1710 OPTAB_DIRECT);
1711 }
1712 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1713 0, OPTAB_DIRECT);
1714}
1715
1716/* Generate code for floating point ABS or NEG. */
1717
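/* Illustrative sketch: with SSE math both operations become bitwise
   mask operations on the sign bit, roughly
     ABS:  andps .LC_absmask(%rip), %xmm0    ; ~sign bit in each lane
     NEG:  xorps .LC_signmask(%rip), %xmm0   ;  sign bit in each lane
   (label names hypothetical); the integer-register variant is handled
   by ix86_split_fp_absneg_operator below.  */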
1718void
1719ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1720 rtx operands[])
1721{
1722 rtx set, dst, src;
1723 bool use_sse = false;
1724 bool vector_mode = VECTOR_MODE_P (mode);
1725 machine_mode vmode = mode;
1726 rtvec par;
1727
1728 if (vector_mode || mode == TFmode)
1729 use_sse = true;
1730 else if (TARGET_SSE_MATH)
1731 {
1732 use_sse = SSE_FLOAT_MODE_P (mode);
1733 if (mode == SFmode)
1734 vmode = V4SFmode;
1735 else if (mode == DFmode)
1736 vmode = V2DFmode;
1737 }
1738
1739 dst = operands[0];
1740 src = operands[1];
1741
1742 set = gen_rtx_fmt_e (code, mode, src);
1743 set = gen_rtx_SET (dst, set);
1744
1745 if (use_sse)
1746 {
1747 rtx mask, use, clob;
1748
1749 /* NEG and ABS performed with SSE use bitwise mask operations.
1750 Create the appropriate mask now. */
1751 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1752 use = gen_rtx_USE (VOIDmode, mask);
1753 if (vector_mode || mode == TFmode)
1754 par = gen_rtvec (2, set, use);
1755 else
1756 {
1757 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1758 par = gen_rtvec (3, set, use, clob);
1759 }
1760 }
1761 else
1762 {
1763 rtx clob;
1764
1765 /* Changing of sign for FP values is doable using integer unit too. */
1766 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1767 par = gen_rtvec (2, set, clob);
1768 }
1769
1770 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1771}
1772
1773/* Deconstruct a floating point ABS or NEG operation
1774 with integer registers into integer operations. */
1775
1776void
1777ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1778 rtx operands[])
1779{
1780 enum rtx_code absneg_op;
1781 rtx dst, set;
1782
1783 gcc_assert (operands_match_p (operands[0], operands[1]));
1784
1785 switch (mode)
1786 {
1787 case E_SFmode:
1788 dst = gen_lowpart (SImode, operands[0]);
1789
1790 if (code == ABS)
1791 {
1792 set = gen_int_mode (0x7fffffff, SImode);
1793 absneg_op = AND;
1794 }
1795 else
1796 {
1797 set = gen_int_mode (0x80000000, SImode);
1798 absneg_op = XOR;
1799 }
1800 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1801 break;
1802
1803 case E_DFmode:
1804 if (TARGET_64BIT)
1805 {
1806 dst = gen_lowpart (DImode, operands[0]);
1807 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1808
1809 if (code == ABS)
1810 set = const0_rtx;
1811 else
1812 set = gen_rtx_NOT (DImode, dst);
1813 }
1814 else
1815 {
1816 dst = gen_highpart (SImode, operands[0]);
1817
1818 if (code == ABS)
1819 {
1820 set = gen_int_mode (0x7fffffff, SImode);
1821 absneg_op = AND;
1822 }
1823 else
1824 {
1825 set = gen_int_mode (0x80000000, SImode);
1826 absneg_op = XOR;
1827 }
1828 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1829 }
1830 break;
1831
1832 case E_XFmode:
1833 dst = gen_rtx_REG (SImode,
1834 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1835 if (code == ABS)
1836 {
1837 set = GEN_INT (0x7fff);
1838 absneg_op = AND;
1839 }
1840 else
1841 {
1842 set = GEN_INT (0x8000);
1843 absneg_op = XOR;
1844 }
1845 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1846 break;
1847
1848 default:
1849 gcc_unreachable ();
1850 }
1851
1852 set = gen_rtx_SET (dst, set);
1853
1854 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1855 rtvec par = gen_rtvec (2, set, clob);
1856
1857 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1858}
1859
1860/* Expand a copysign operation. Special case operand 0 being a constant. */
1861
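/* Illustrative sketch: copysign (x, y) is expanded as
     (x & ~signbit_mask) | (y & signbit_mask)
   via the copysign3_const / copysign3_var patterns; when x is a
   constant its absolute value is folded at expand time, so only the
   AND of y with the sign-bit mask and the final OR remain.  */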
1862void
1863ix86_expand_copysign (rtx operands[])
1864{
1865 machine_mode mode, vmode;
1866 rtx dest, op0, op1, mask;
1867
1868 dest = operands[0];
1869 op0 = operands[1];
1870 op1 = operands[2];
1871
1872 mode = GET_MODE (dest);
1873
1874 if (mode == SFmode)
1875 vmode = V4SFmode;
1876 else if (mode == DFmode)
1877 vmode = V2DFmode;
1878 else if (mode == TFmode)
1879 vmode = mode;
1880 else
1881 gcc_unreachable ();
1882
1883 mask = ix86_build_signbit_mask (vmode, 0, 0);
1884
1885 if (CONST_DOUBLE_P (op0))
1886 {
1887 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1888 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1889
1890 if (mode == SFmode || mode == DFmode)
1891 {
1892 if (op0 == CONST0_RTX (mode))
1893 op0 = CONST0_RTX (vmode);
1894 else
1895 {
1896 rtx v = ix86_build_const_vector (vmode, false, op0);
1897
1898 op0 = force_reg (vmode, v);
1899 }
1900 }
1901 else if (op0 != CONST0_RTX (mode))
1902 op0 = force_reg (mode, op0);
1903
1904 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1905 }
1906 else
1907 {
1908 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1909
1910 emit_insn (gen_copysign3_var
1911 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1912 }
1913}
1914
1915/* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1916 be a constant, and so has already been expanded into a vector constant. */
1917
1918void
1919ix86_split_copysign_const (rtx operands[])
1920{
1921 machine_mode mode, vmode;
1922 rtx dest, op0, mask, x;
1923
1924 dest = operands[0];
1925 op0 = operands[1];
1926 mask = operands[3];
1927
1928 mode = GET_MODE (dest);
1929 vmode = GET_MODE (mask);
1930
1931 dest = lowpart_subreg (vmode, dest, mode);
1932 x = gen_rtx_AND (vmode, dest, mask);
1933 emit_insn (gen_rtx_SET (dest, x));
1934
1935 if (op0 != CONST0_RTX (vmode))
1936 {
1937 x = gen_rtx_IOR (vmode, dest, op0);
1938 emit_insn (gen_rtx_SET (dest, x));
1939 }
1940}
1941
1942/* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1943 so we have to do two masks. */
1944
1945void
1946ix86_split_copysign_var (rtx operands[])
1947{
1948 machine_mode mode, vmode;
1949 rtx dest, scratch, op0, op1, mask, nmask, x;
1950
1951 dest = operands[0];
1952 scratch = operands[1];
1953 op0 = operands[2];
1954 op1 = operands[3];
1955 nmask = operands[4];
1956 mask = operands[5];
1957
1958 mode = GET_MODE (dest);
1959 vmode = GET_MODE (mask);
1960
1961 if (rtx_equal_p (op0, op1))
1962 {
1963 /* Shouldn't happen often (it's useless, obviously), but when it does
1964 we'd generate incorrect code if we continue below. */
1965 emit_move_insn (dest, op0);
1966 return;
1967 }
1968
1969 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1970 {
1971 gcc_assert (REGNO (op1) == REGNO (scratch));
1972
1973 x = gen_rtx_AND (vmode, scratch, mask);
1974 emit_insn (gen_rtx_SET (scratch, x));
1975
1976 dest = mask;
1977 op0 = lowpart_subreg (vmode, op0, mode);
1978 x = gen_rtx_NOT (vmode, dest);
1979 x = gen_rtx_AND (vmode, x, op0);
1980 emit_insn (gen_rtx_SET (dest, x));
1981 }
1982 else
1983 {
1984 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1985 {
1986 x = gen_rtx_AND (vmode, scratch, mask);
1987 }
1988 else /* alternative 2,4 */
1989 {
1990 gcc_assert (REGNO (mask) == REGNO (scratch));
1991 op1 = lowpart_subreg (vmode, op1, mode);
1992 x = gen_rtx_AND (vmode, scratch, op1);
1993 }
1994 emit_insn (gen_rtx_SET (scratch, x));
1995
1996 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1997 {
1998 dest = lowpart_subreg (vmode, op0, mode);
1999 x = gen_rtx_AND (vmode, dest, nmask);
2000 }
2001 else /* alternative 3,4 */
2002 {
2003 gcc_assert (REGNO (nmask) == REGNO (dest));
2004 dest = nmask;
2005 op0 = lowpart_subreg (vmode, op0, mode);
2006 x = gen_rtx_AND (vmode, dest, op0);
2007 }
2008 emit_insn (gen_rtx_SET (dest, x));
2009 }
2010
2011 x = gen_rtx_IOR (vmode, dest, scratch);
2012 emit_insn (gen_rtx_SET (dest, x));
2013}
2014
2015/* Expand an xorsign operation. */
2016
2017void
2018ix86_expand_xorsign (rtx operands[])
2019{
2020 machine_mode mode, vmode;
2021 rtx dest, op0, op1, mask;
2022
2023 dest = operands[0];
2024 op0 = operands[1];
2025 op1 = operands[2];
2026
2027 mode = GET_MODE (dest);
2028
2029 if (mode == SFmode)
987a3082 2030 vmode = V4SFmode;
2bf6d935 2031 else if (mode == DFmode)
987a3082 2032 vmode = V2DFmode;
2033 else
2034 gcc_unreachable ();
2035
2036 mask = ix86_build_signbit_mask (vmode, 0, 0);
2037
987a3082 2038 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2039}
2040
2041/* Deconstruct an xorsign operation into bit masks. */
2042
2043void
2044ix86_split_xorsign (rtx operands[])
2045{
2046 machine_mode mode, vmode;
2047 rtx dest, op0, mask, x;
2048
2049 dest = operands[0];
2050 op0 = operands[1];
2051 mask = operands[3];
2052
2053 mode = GET_MODE (dest);
2054 vmode = GET_MODE (mask);
2055
2056 dest = lowpart_subreg (vmode, dest, mode);
2057 x = gen_rtx_AND (vmode, dest, mask);
2058 emit_insn (gen_rtx_SET (dest, x));
2059
2060 op0 = lowpart_subreg (vmode, op0, mode);
2061 x = gen_rtx_XOR (vmode, dest, op0);
2062 emit_insn (gen_rtx_SET (dest, x));
2063}
2064
2065static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2066
2067void
2068ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2069{
2070 machine_mode mode = GET_MODE (op0);
2071 rtx tmp;
2072
 2073  /* Handle the special case of a vector comparison with a boolean result;
 2074     transform it using the ptest instruction.  */
2075 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2076 {
2077 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2078 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2079
2080 gcc_assert (code == EQ || code == NE);
 2081      /* Generate XOR since we can't check that one operand is a zero vector.  */
2082 tmp = gen_reg_rtx (mode);
2083 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2084 tmp = gen_lowpart (p_mode, tmp);
2085 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2086 gen_rtx_UNSPEC (CCmode,
2087 gen_rtvec (2, tmp, tmp),
2088 UNSPEC_PTEST)));
2089 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2090 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2091 gen_rtx_LABEL_REF (VOIDmode, label),
2092 pc_rtx);
2093 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2094 return;
2095 }
2096
2097 switch (mode)
2098 {
2099 case E_SFmode:
2100 case E_DFmode:
2101 case E_XFmode:
2102 case E_QImode:
2103 case E_HImode:
2104 case E_SImode:
2105 simple:
2106 tmp = ix86_expand_compare (code, op0, op1);
2107 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2108 gen_rtx_LABEL_REF (VOIDmode, label),
2109 pc_rtx);
2110 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2111 return;
2112
2113 case E_DImode:
2114 if (TARGET_64BIT)
2115 goto simple;
 2116      /* For a 32-bit target, a DImode comparison may be performed in
 2117	 SSE registers.  To allow this we avoid splitting to SImode,
 2118	 which is achieved by doing the xor in DImode and then
 2119	 comparing against zero (a pattern recognized by the STV
 2120	 pass).  We don't use the xor form when optimizing
 2121	 for size.  */
2122 if (!optimize_insn_for_size_p ()
2123 && TARGET_STV
2124 && (code == EQ || code == NE))
2125 {
2126 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2127 op1 = const0_rtx;
2128 }
2129 /* FALLTHRU */
2130 case E_TImode:
 2131      /* Expand a double-word branch into multiple compare+branch.  */
2132 {
2133 rtx lo[2], hi[2];
2134 rtx_code_label *label2;
2135 enum rtx_code code1, code2, code3;
2136 machine_mode submode;
2137
2138 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2139 {
2140 std::swap (op0, op1);
2141 code = swap_condition (code);
2142 }
2143
2144 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2145 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2146
2147 submode = mode == DImode ? SImode : DImode;
2148
2149 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2150 avoid two branches. This costs one extra insn, so disable when
2151 optimizing for size. */
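	/* For example, a DImode "a == b" on a 32-bit target becomes two
	   xors and an ior of the half words followed by a single branch
	   on the zero flag, instead of two compare-and-branch pairs.  */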
2152
2153 if ((code == EQ || code == NE)
2154 && (!optimize_insn_for_size_p ()
2155 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2156 {
2157 rtx xor0, xor1;
2158
2159 xor1 = hi[0];
2160 if (hi[1] != const0_rtx)
2161 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2162 NULL_RTX, 0, OPTAB_WIDEN);
2163
2164 xor0 = lo[0];
2165 if (lo[1] != const0_rtx)
2166 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2167 NULL_RTX, 0, OPTAB_WIDEN);
2168
2169 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2170 NULL_RTX, 0, OPTAB_WIDEN);
2171
2172 ix86_expand_branch (code, tmp, const0_rtx, label);
2173 return;
2174 }
2175
2176 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2177 op1 is a constant and the low word is zero, then we can just
2178 examine the high word. Similarly for low word -1 and
2179 less-or-equal-than or greater-than. */
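	/* For example, an unsigned DImode "a < 0x500000000" on a 32-bit
	   target only needs "hi(a) < 5", because the low word of the
	   constant is zero.  */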
2180
2181 if (CONST_INT_P (hi[1]))
2182 switch (code)
2183 {
2184 case LT: case LTU: case GE: case GEU:
2185 if (lo[1] == const0_rtx)
2186 {
2187 ix86_expand_branch (code, hi[0], hi[1], label);
2188 return;
2189 }
2190 break;
2191 case LE: case LEU: case GT: case GTU:
2192 if (lo[1] == constm1_rtx)
2193 {
2194 ix86_expand_branch (code, hi[0], hi[1], label);
2195 return;
2196 }
2197 break;
2198 default:
2199 break;
2200 }
2201
2202 /* Emulate comparisons that do not depend on Zero flag with
2203 double-word subtraction. Note that only Overflow, Sign
2204 and Carry flags are valid, so swap arguments and condition
2205 of comparisons that would otherwise test Zero flag. */
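	/* That is, "a < b" is emitted as a compare of the low words
	   followed by a subtract-with-borrow of the high words; the
	   branch then tests the carry flag (unsigned, CCCmode) or the
	   sign/overflow flags (signed, CCGZmode) left by the sbb.  */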
2206
2207 switch (code)
2208 {
2209 case LE: case LEU: case GT: case GTU:
2210 std::swap (lo[0], lo[1]);
2211 std::swap (hi[0], hi[1]);
2212 code = swap_condition (code);
2213 /* FALLTHRU */
2214
2215 case LT: case LTU: case GE: case GEU:
2216 {
2bf6d935 2217 bool uns = (code == LTU || code == GEU);
2218 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2219 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2220
2221 if (!nonimmediate_operand (lo[0], submode))
2222 lo[0] = force_reg (submode, lo[0]);
2223 if (!x86_64_general_operand (lo[1], submode))
2224 lo[1] = force_reg (submode, lo[1]);
2225
2226 if (!register_operand (hi[0], submode))
2227 hi[0] = force_reg (submode, hi[0]);
2228 if ((uns && !nonimmediate_operand (hi[1], submode))
2229 || (!uns && !x86_64_general_operand (hi[1], submode)))
2230 hi[1] = force_reg (submode, hi[1]);
2231
987a3082 2232 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2bf6d935 2233
2234 tmp = gen_rtx_SCRATCH (submode);
2235 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2bf6d935 2236
987a3082 2237 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2238 ix86_expand_branch (code, tmp, const0_rtx, label);
2239 return;
2240 }
2241
2242 default:
2243 break;
2244 }
2245
2246 /* Otherwise, we need two or three jumps. */
2247
2248 label2 = gen_label_rtx ();
2249
2250 code1 = code;
2251 code2 = swap_condition (code);
2252 code3 = unsigned_condition (code);
2253
2254 switch (code)
2255 {
2256 case LT: case GT: case LTU: case GTU:
2257 break;
2258
2259 case LE: code1 = LT; code2 = GT; break;
2260 case GE: code1 = GT; code2 = LT; break;
2261 case LEU: code1 = LTU; code2 = GTU; break;
2262 case GEU: code1 = GTU; code2 = LTU; break;
2263
2264 case EQ: code1 = UNKNOWN; code2 = NE; break;
2265 case NE: code2 = UNKNOWN; break;
2266
2267 default:
2268 gcc_unreachable ();
2269 }
2270
2271 /*
2272 * a < b =>
2273 * if (hi(a) < hi(b)) goto true;
2274 * if (hi(a) > hi(b)) goto false;
2275 * if (lo(a) < lo(b)) goto true;
2276 * false:
2277 */
2278
2279 if (code1 != UNKNOWN)
2280 ix86_expand_branch (code1, hi[0], hi[1], label);
2281 if (code2 != UNKNOWN)
2282 ix86_expand_branch (code2, hi[0], hi[1], label2);
2283
2284 ix86_expand_branch (code3, lo[0], lo[1], label);
2285
2286 if (code2 != UNKNOWN)
2287 emit_label (label2);
2288 return;
2289 }
2290
2291 default:
2292 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2293 goto simple;
2294 }
2295}
2296
2297/* Figure out whether to use unordered fp comparisons. */
2298
2299static bool
2300ix86_unordered_fp_compare (enum rtx_code code)
2301{
2302 if (!TARGET_IEEE_FP)
2303 return false;
2304
2305 switch (code)
2306 {
2307 case LT:
2308 case LE:
2309 case GT:
2310 case GE:
2311 case LTGT:
2312 return false;
2313
2314 case EQ:
2315 case NE:
2316
2317 case UNORDERED:
2318 case ORDERED:
2319 case UNLT:
2320 case UNLE:
2321 case UNGT:
2322 case UNGE:
2323 case UNEQ:
2324 return true;
2325
2326 default:
2327 gcc_unreachable ();
2328 }
2329}
2330
 2331/* Return a comparison that we can do and that is equivalent to
 2332   swap_condition (code), except possibly for orderedness.
 2333   Never change orderedness if TARGET_IEEE_FP, returning
 2334   UNKNOWN in that case if necessary.  */
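/* Note: the flag annotations below reflect how fcomi/comi map the FP
   result onto EFLAGS: CF=1 if below, ZF=1 if equal, and ZF=PF=CF=1 if
   unordered, so ordinary FP tests look like unsigned integer tests on
   those flags.  */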
2335
2336static enum rtx_code
2337ix86_fp_swap_condition (enum rtx_code code)
2338{
2339 switch (code)
2340 {
2341 case GT: /* GTU - CF=0 & ZF=0 */
2342 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2343 case GE: /* GEU - CF=0 */
2344 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2345 case UNLT: /* LTU - CF=1 */
2346 return TARGET_IEEE_FP ? UNKNOWN : GT;
2347 case UNLE: /* LEU - CF=1 | ZF=1 */
2348 return TARGET_IEEE_FP ? UNKNOWN : GE;
2349 default:
2350 return swap_condition (code);
2351 }
2352}
2353
2354/* Return cost of comparison CODE using the best strategy for performance.
 2355   All of the following functions use the number of instructions as the cost metric.
 2356   In the future this should be tweaked to compute bytes for optimize_size and
2357 take into account performance of various instructions on various CPUs. */
2358
2359static int
2360ix86_fp_comparison_cost (enum rtx_code code)
2361{
2362 int arith_cost;
2363
2364 /* The cost of code using bit-twiddling on %ah. */
2365 switch (code)
2366 {
2367 case UNLE:
2368 case UNLT:
2369 case LTGT:
2370 case GT:
2371 case GE:
2372 case UNORDERED:
2373 case ORDERED:
2374 case UNEQ:
2375 arith_cost = 4;
2376 break;
2377 case LT:
2378 case NE:
2379 case EQ:
2380 case UNGE:
2381 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2382 break;
2383 case LE:
2384 case UNGT:
2385 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2386 break;
2387 default:
2388 gcc_unreachable ();
2389 }
2390
2391 switch (ix86_fp_comparison_strategy (code))
2392 {
2393 case IX86_FPCMP_COMI:
2394 return arith_cost > 4 ? 3 : 2;
2395 case IX86_FPCMP_SAHF:
2396 return arith_cost > 4 ? 4 : 3;
2397 default:
2398 return arith_cost;
2399 }
2400}
2401
2402/* Swap, force into registers, or otherwise massage the two operands
2403 to a fp comparison. The operands are updated in place; the new
2404 comparison code is returned. */
2405
2406static enum rtx_code
2407ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2408{
2409 bool unordered_compare = ix86_unordered_fp_compare (code);
2410 rtx op0 = *pop0, op1 = *pop1;
2411 machine_mode op_mode = GET_MODE (op0);
2412 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2413
2414 /* All of the unordered compare instructions only work on registers.
2415 The same is true of the fcomi compare instructions. The XFmode
2416 compare instructions require registers except when comparing
2417 against zero or when converting operand 1 from fixed point to
2418 floating point. */
2419
2420 if (!is_sse
2421 && (unordered_compare
2422 || (op_mode == XFmode
2423 && ! (standard_80387_constant_p (op0) == 1
2424 || standard_80387_constant_p (op1) == 1)
2425 && GET_CODE (op1) != FLOAT)
2426 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2427 {
2428 op0 = force_reg (op_mode, op0);
2429 op1 = force_reg (op_mode, op1);
2430 }
2431 else
2432 {
2433 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2434 things around if they appear profitable, otherwise force op0
2435 into a register. */
2436
2437 if (standard_80387_constant_p (op0) == 0
2438 || (MEM_P (op0)
2439 && ! (standard_80387_constant_p (op1) == 0
2440 || MEM_P (op1))))
2441 {
2442 enum rtx_code new_code = ix86_fp_swap_condition (code);
2443 if (new_code != UNKNOWN)
2444 {
2445 std::swap (op0, op1);
2446 code = new_code;
2447 }
2448 }
2449
2450 if (!REG_P (op0))
2451 op0 = force_reg (op_mode, op0);
2452
2453 if (CONSTANT_P (op1))
2454 {
2455 int tmp = standard_80387_constant_p (op1);
2456 if (tmp == 0)
2457 op1 = validize_mem (force_const_mem (op_mode, op1));
2458 else if (tmp == 1)
2459 {
2460 if (TARGET_CMOVE)
2461 op1 = force_reg (op_mode, op1);
2462 }
2463 else
2464 op1 = force_reg (op_mode, op1);
2465 }
2466 }
2467
2468 /* Try to rearrange the comparison to make it cheaper. */
2469 if (ix86_fp_comparison_cost (code)
2470 > ix86_fp_comparison_cost (swap_condition (code))
2471 && (REG_P (op1) || can_create_pseudo_p ()))
2472 {
2473 std::swap (op0, op1);
2474 code = swap_condition (code);
2475 if (!REG_P (op0))
2476 op0 = force_reg (op_mode, op0);
2477 }
2478
2479 *pop0 = op0;
2480 *pop1 = op1;
2481 return code;
2482}
2483
2484/* Generate insn patterns to do a floating point compare of OPERANDS. */
2485
2486static rtx
2487ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2488{
2489 bool unordered_compare = ix86_unordered_fp_compare (code);
2490 machine_mode cmp_mode;
2491 rtx tmp, scratch;
2492
2493 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2494
2495 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2496 if (unordered_compare)
2497 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2498
2499 /* Do fcomi/sahf based test when profitable. */
2500 switch (ix86_fp_comparison_strategy (code))
2501 {
2502 case IX86_FPCMP_COMI:
2503 cmp_mode = CCFPmode;
2504 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2505 break;
2506
2507 case IX86_FPCMP_SAHF:
2508 cmp_mode = CCFPmode;
2509 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2510 scratch = gen_reg_rtx (HImode);
2511 emit_insn (gen_rtx_SET (scratch, tmp));
2512 emit_insn (gen_x86_sahf_1 (scratch));
2513 break;
2514
2515 case IX86_FPCMP_ARITH:
2516 cmp_mode = CCNOmode;
2517 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2518 scratch = gen_reg_rtx (HImode);
2519 emit_insn (gen_rtx_SET (scratch, tmp));
2520
 2521      /* In the unordered case, we have to check C2 for NaNs, which
2522 doesn't happen to work out to anything nice combination-wise.
2523 So do some bit twiddling on the value we've got in AH to come
2524 up with an appropriate set of condition codes. */
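	 /* For reference: after fnstsw the relevant bits in AH are
	    C0 = 0x01, C2 = 0x04 and C3 = 0x40.  fcom sets C0 when the
	    operands compare below, C3 when they compare equal, and all
	    of C0/C2/C3 when they are unordered, which is where the masks
	    0x45, 0x44, 0x40, 0x05, 0x04 and 0x01 used below come from.  */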
2525
2526 switch (code)
2527 {
2528 case GT:
2529 case UNGT:
2530 if (code == GT || !TARGET_IEEE_FP)
2531 {
2532 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2533 code = EQ;
2534 }
2535 else
2536 {
2537 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2538 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2539 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2540 cmp_mode = CCmode;
2541 code = GEU;
2542 }
2543 break;
2544 case LT:
2545 case UNLT:
2546 if (code == LT && TARGET_IEEE_FP)
2547 {
2548 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2549 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2550 cmp_mode = CCmode;
2551 code = EQ;
2552 }
2553 else
2554 {
2555 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2556 code = NE;
2557 }
2558 break;
2559 case GE:
2560 case UNGE:
2561 if (code == GE || !TARGET_IEEE_FP)
2562 {
2563 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2564 code = EQ;
2565 }
2566 else
2567 {
2568 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2569 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2570 code = NE;
2571 }
2572 break;
2573 case LE:
2574 case UNLE:
2575 if (code == LE && TARGET_IEEE_FP)
2576 {
2577 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2578 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2579 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2580 cmp_mode = CCmode;
2581 code = LTU;
2582 }
2583 else
2584 {
2585 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2586 code = NE;
2587 }
2588 break;
2589 case EQ:
2590 case UNEQ:
2591 if (code == EQ && TARGET_IEEE_FP)
2592 {
2593 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2594 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2595 cmp_mode = CCmode;
2596 code = EQ;
2597 }
2598 else
2599 {
2600 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2601 code = NE;
2602 }
2603 break;
2604 case NE:
2605 case LTGT:
2606 if (code == NE && TARGET_IEEE_FP)
2607 {
2608 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2609 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2610 GEN_INT (0x40)));
2611 code = NE;
2612 }
2613 else
2614 {
2615 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2616 code = EQ;
2617 }
2618 break;
2619
2620 case UNORDERED:
2621 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2622 code = NE;
2623 break;
2624 case ORDERED:
2625 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2626 code = EQ;
2627 break;
2628
2629 default:
2630 gcc_unreachable ();
2631 }
2632 break;
2633
2634 default:
2635 gcc_unreachable();
2636 }
2637
2638 /* Return the test that should be put into the flags user, i.e.
2639 the bcc, scc, or cmov instruction. */
2640 return gen_rtx_fmt_ee (code, VOIDmode,
2641 gen_rtx_REG (cmp_mode, FLAGS_REG),
2642 const0_rtx);
2643}
2644
2645/* Generate insn patterns to do an integer compare of OPERANDS. */
2646
2647static rtx
2648ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2649{
2650 machine_mode cmpmode;
2651 rtx tmp, flags;
2652
2653 cmpmode = SELECT_CC_MODE (code, op0, op1);
2654 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2655
2656 /* This is very simple, but making the interface the same as in the
2657 FP case makes the rest of the code easier. */
2658 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2659 emit_insn (gen_rtx_SET (flags, tmp));
2660
2661 /* Return the test that should be put into the flags user, i.e.
2662 the bcc, scc, or cmov instruction. */
2663 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2664}
2665
2666static rtx
2667ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2668{
2669 rtx ret;
2670
2671 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2672 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2673
2674 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2675 {
2676 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2677 ret = ix86_expand_fp_compare (code, op0, op1);
2678 }
2679 else
2680 ret = ix86_expand_int_compare (code, op0, op1);
2681
2682 return ret;
2683}
2684
2685void
2686ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2687{
2688 rtx ret;
2689
2690 gcc_assert (GET_MODE (dest) == QImode);
2691
2692 ret = ix86_expand_compare (code, op0, op1);
2693 PUT_MODE (ret, QImode);
2694 emit_insn (gen_rtx_SET (dest, ret));
2695}
2696
2697/* Expand comparison setting or clearing carry flag. Return true when
2698 successful and set pop for the operation. */
2699static bool
2700ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2701{
2702 machine_mode mode
2703 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2704
2705 /* Do not handle double-mode compares that go through special path. */
2706 if (mode == (TARGET_64BIT ? TImode : DImode))
2707 return false;
2708
2709 if (SCALAR_FLOAT_MODE_P (mode))
2710 {
2711 rtx compare_op;
2712 rtx_insn *compare_seq;
2713
2714 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2715
 2716      /* Shortcut: the following common codes never translate
 2717	 into carry-flag comparisons.  */
2718 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2719 || code == ORDERED || code == UNORDERED)
2720 return false;
2721
2722 /* These comparisons require zero flag; swap operands so they won't. */
2723 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2724 && !TARGET_IEEE_FP)
2725 {
2726 std::swap (op0, op1);
2727 code = swap_condition (code);
2728 }
2729
 2730      /* Try to expand the comparison and verify that we end up with
 2731	 a carry-flag based comparison.  This fails only when we decide
 2732	 to expand the comparison using arithmetic, which is not a
 2733	 common scenario.  */
2734 start_sequence ();
2735 compare_op = ix86_expand_fp_compare (code, op0, op1);
2736 compare_seq = get_insns ();
2737 end_sequence ();
2738
2739 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2740 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2741 else
2742 code = GET_CODE (compare_op);
2743
2744 if (code != LTU && code != GEU)
2745 return false;
2746
2747 emit_insn (compare_seq);
2748 *pop = compare_op;
2749 return true;
2750 }
2751
2752 if (!INTEGRAL_MODE_P (mode))
2753 return false;
2754
2755 switch (code)
2756 {
2757 case LTU:
2758 case GEU:
2759 break;
2760
2761 /* Convert a==0 into (unsigned)a<1. */
2762 case EQ:
2763 case NE:
2764 if (op1 != const0_rtx)
2765 return false;
2766 op1 = const1_rtx;
2767 code = (code == EQ ? LTU : GEU);
2768 break;
2769
2770 /* Convert a>b into b<a or a>=b-1. */
2771 case GTU:
2772 case LEU:
2773 if (CONST_INT_P (op1))
2774 {
2775 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
 2776	  /* Bail out on overflow.  We could still swap the operands, but that
 2777	     would force loading the constant into a register.  */
2778 if (op1 == const0_rtx
2779 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2780 return false;
2781 code = (code == GTU ? GEU : LTU);
2782 }
2783 else
2784 {
2785 std::swap (op0, op1);
2786 code = (code == GTU ? LTU : GEU);
2787 }
2788 break;
2789
2790 /* Convert a>=0 into (unsigned)a<0x80000000. */
2791 case LT:
2792 case GE:
2793 if (mode == DImode || op1 != const0_rtx)
2794 return false;
2795 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2796 code = (code == LT ? GEU : LTU);
2797 break;
2798 case LE:
2799 case GT:
2800 if (mode == DImode || op1 != constm1_rtx)
2801 return false;
2802 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2803 code = (code == LE ? GEU : LTU);
2804 break;
2805
2806 default:
2807 return false;
2808 }
 2809   /* Swapping operands may cause a constant to appear as the first operand.  */
2810 if (!nonimmediate_operand (op0, VOIDmode))
2811 {
2812 if (!can_create_pseudo_p ())
2813 return false;
2814 op0 = force_reg (mode, op0);
2815 }
2816 *pop = ix86_expand_compare (code, op0, op1);
2817 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2818 return true;
2819}
2820
 2821/* Expand conditional increment or decrement using adc/sbb instructions.
 2822   The default case using setcc followed by a conditional move can be
 2823   done by generic code.  */
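/* For example, an unsigned "x += (a < b)" can be emitted as a compare
   of a and b followed by "adc $0, x", letting the carry flag produced
   by the compare feed directly into the addition.  */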
2824bool
2825ix86_expand_int_addcc (rtx operands[])
2826{
2827 enum rtx_code code = GET_CODE (operands[1]);
2828 rtx flags;
987a3082 2829 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2830 rtx compare_op;
2831 rtx val = const0_rtx;
2832 bool fpcmp = false;
2833 machine_mode mode;
2834 rtx op0 = XEXP (operands[1], 0);
2835 rtx op1 = XEXP (operands[1], 1);
2836
2837 if (operands[3] != const1_rtx
2838 && operands[3] != constm1_rtx)
2839 return false;
2840 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2841 return false;
2842 code = GET_CODE (compare_op);
2843
2844 flags = XEXP (compare_op, 0);
2845
2846 if (GET_MODE (flags) == CCFPmode)
2847 {
2848 fpcmp = true;
2849 code = ix86_fp_compare_code_to_integer (code);
2850 }
2851
2852 if (code != LTU)
2853 {
2854 val = constm1_rtx;
2855 if (fpcmp)
2856 PUT_CODE (compare_op,
2857 reverse_condition_maybe_unordered
2858 (GET_CODE (compare_op)));
2859 else
2860 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2861 }
2862
2863 mode = GET_MODE (operands[0]);
2864
2865 /* Construct either adc or sbb insn. */
2866 if ((code == LTU) == (operands[3] == constm1_rtx))
987a3082 2867 insn = gen_sub3_carry;
2bf6d935 2868 else
2869 insn = gen_add3_carry;
2870
2871 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2872
2873 return true;
2874}
2875
2876bool
2877ix86_expand_int_movcc (rtx operands[])
2878{
2879 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2880 rtx_insn *compare_seq;
2881 rtx compare_op;
2882 machine_mode mode = GET_MODE (operands[0]);
2883 bool sign_bit_compare_p = false;
2884 rtx op0 = XEXP (operands[1], 0);
2885 rtx op1 = XEXP (operands[1], 1);
2886
2887 if (GET_MODE (op0) == TImode
2888 || (GET_MODE (op0) == DImode
2889 && !TARGET_64BIT))
2890 return false;
2891
2892 start_sequence ();
2893 compare_op = ix86_expand_compare (code, op0, op1);
2894 compare_seq = get_insns ();
2895 end_sequence ();
2896
2897 compare_code = GET_CODE (compare_op);
2898
2899 if ((op1 == const0_rtx && (code == GE || code == LT))
2900 || (op1 == constm1_rtx && (code == GT || code == LE)))
2901 sign_bit_compare_p = true;
2902
2903 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2904 HImode insns, we'd be swallowed in word prefix ops. */
2905
2906 if ((mode != HImode || TARGET_FAST_PREFIX)
2907 && (mode != (TARGET_64BIT ? TImode : DImode))
2908 && CONST_INT_P (operands[2])
2909 && CONST_INT_P (operands[3]))
2910 {
2911 rtx out = operands[0];
2912 HOST_WIDE_INT ct = INTVAL (operands[2]);
2913 HOST_WIDE_INT cf = INTVAL (operands[3]);
2914 HOST_WIDE_INT diff;
2915
2916 diff = ct - cf;
 2917      /* Sign-bit compares are better done using shifts than by
 2918	 using sbb.  */
2919 if (sign_bit_compare_p
2920 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2921 {
2922 /* Detect overlap between destination and compare sources. */
2923 rtx tmp = out;
2924
2925 if (!sign_bit_compare_p)
2926 {
2927 rtx flags;
2928 bool fpcmp = false;
2929
2930 compare_code = GET_CODE (compare_op);
2931
2932 flags = XEXP (compare_op, 0);
2933
2934 if (GET_MODE (flags) == CCFPmode)
2935 {
2936 fpcmp = true;
2937 compare_code
2938 = ix86_fp_compare_code_to_integer (compare_code);
2939 }
2940
2941 /* To simplify rest of code, restrict to the GEU case. */
2942 if (compare_code == LTU)
2943 {
2944 std::swap (ct, cf);
2945 compare_code = reverse_condition (compare_code);
2946 code = reverse_condition (code);
2947 }
2948 else
2949 {
2950 if (fpcmp)
2951 PUT_CODE (compare_op,
2952 reverse_condition_maybe_unordered
2953 (GET_CODE (compare_op)));
2954 else
2955 PUT_CODE (compare_op,
2956 reverse_condition (GET_CODE (compare_op)));
2957 }
2958 diff = ct - cf;
2959
2960 if (reg_overlap_mentioned_p (out, op0)
2961 || reg_overlap_mentioned_p (out, op1))
2962 tmp = gen_reg_rtx (mode);
2963
2964 if (mode == DImode)
2965 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2966 else
2967 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2968 flags, compare_op));
2969 }
2970 else
2971 {
2972 if (code == GT || code == GE)
2973 code = reverse_condition (code);
2974 else
2975 {
2976 std::swap (ct, cf);
2977 diff = ct - cf;
2978 }
2979 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2980 }
2981
2982 if (diff == 1)
2983 {
2984 /*
2985 * cmpl op0,op1
2986 * sbbl dest,dest
2987 * [addl dest, ct]
2988 *
2989 * Size 5 - 8.
2990 */
2991 if (ct)
2992 tmp = expand_simple_binop (mode, PLUS,
2993 tmp, GEN_INT (ct),
2994 copy_rtx (tmp), 1, OPTAB_DIRECT);
2995 }
2996 else if (cf == -1)
2997 {
2998 /*
2999 * cmpl op0,op1
3000 * sbbl dest,dest
3001 * orl $ct, dest
3002 *
3003 * Size 8.
3004 */
3005 tmp = expand_simple_binop (mode, IOR,
3006 tmp, GEN_INT (ct),
3007 copy_rtx (tmp), 1, OPTAB_DIRECT);
3008 }
3009 else if (diff == -1 && ct)
3010 {
3011 /*
3012 * cmpl op0,op1
3013 * sbbl dest,dest
3014 * notl dest
3015 * [addl dest, cf]
3016 *
3017 * Size 8 - 11.
3018 */
3019 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3020 if (cf)
3021 tmp = expand_simple_binop (mode, PLUS,
3022 copy_rtx (tmp), GEN_INT (cf),
3023 copy_rtx (tmp), 1, OPTAB_DIRECT);
3024 }
3025 else
3026 {
3027 /*
3028 * cmpl op0,op1
3029 * sbbl dest,dest
3030 * [notl dest]
3031 * andl cf - ct, dest
3032 * [addl dest, ct]
3033 *
3034 * Size 8 - 11.
3035 */
3036
3037 if (cf == 0)
3038 {
3039 cf = ct;
3040 ct = 0;
3041 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3042 }
3043
3044 tmp = expand_simple_binop (mode, AND,
3045 copy_rtx (tmp),
3046 gen_int_mode (cf - ct, mode),
3047 copy_rtx (tmp), 1, OPTAB_DIRECT);
3048 if (ct)
3049 tmp = expand_simple_binop (mode, PLUS,
3050 copy_rtx (tmp), GEN_INT (ct),
3051 copy_rtx (tmp), 1, OPTAB_DIRECT);
3052 }
3053
3054 if (!rtx_equal_p (tmp, out))
3055 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3056
3057 return true;
3058 }
3059
3060 if (diff < 0)
3061 {
3062 machine_mode cmp_mode = GET_MODE (op0);
3063 enum rtx_code new_code;
3064
3065 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3066 {
3067 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3068
3069 /* We may be reversing a non-trapping
3070 comparison to a trapping comparison. */
3071 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3072 && code != EQ && code != NE
3073 && code != ORDERED && code != UNORDERED)
3074 new_code = UNKNOWN;
3075 else
3076 new_code = reverse_condition_maybe_unordered (code);
3077 }
3078 else
3079 new_code = ix86_reverse_condition (code, cmp_mode);
3080 if (new_code != UNKNOWN)
3081 {
3082 std::swap (ct, cf);
3083 diff = -diff;
3084 code = new_code;
3085 }
3086 }
3087
3088 compare_code = UNKNOWN;
3089 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3090 && CONST_INT_P (op1))
3091 {
3092 if (op1 == const0_rtx
3093 && (code == LT || code == GE))
3094 compare_code = code;
3095 else if (op1 == constm1_rtx)
3096 {
3097 if (code == LE)
3098 compare_code = LT;
3099 else if (code == GT)
3100 compare_code = GE;
3101 }
3102 }
3103
3104 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3105 if (compare_code != UNKNOWN
3106 && GET_MODE (op0) == GET_MODE (out)
3107 && (cf == -1 || ct == -1))
3108 {
3109 /* If lea code below could be used, only optimize
3110 if it results in a 2 insn sequence. */
3111
3112 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3113 || diff == 3 || diff == 5 || diff == 9)
3114 || (compare_code == LT && ct == -1)
3115 || (compare_code == GE && cf == -1))
3116 {
3117 /*
3118 * notl op1 (if necessary)
3119 * sarl $31, op1
3120 * orl cf, op1
3121 */
3122 if (ct != -1)
3123 {
3124 cf = ct;
3125 ct = -1;
3126 code = reverse_condition (code);
3127 }
3128
3129 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3130
3131 out = expand_simple_binop (mode, IOR,
3132 out, GEN_INT (cf),
3133 out, 1, OPTAB_DIRECT);
3134 if (out != operands[0])
3135 emit_move_insn (operands[0], out);
3136
3137 return true;
3138 }
3139 }
3140
3141
3142 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3143 || diff == 3 || diff == 5 || diff == 9)
3144 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3145 && (mode != DImode
3146 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3147 {
3148 /*
3149 * xorl dest,dest
3150 * cmpl op1,op2
3151 * setcc dest
3152 * lea cf(dest*(ct-cf)),dest
3153 *
3154 * Size 14.
3155 *
3156 * This also catches the degenerate setcc-only case.
3157 */
3158
3159 rtx tmp;
3160 int nops;
3161
3162 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3163
3164 nops = 0;
 3165	  /* On x86_64 the lea instruction operates on Pmode, so we need
 3166	     to get the arithmetic done in the proper mode to match.  */
3167 if (diff == 1)
3168 tmp = copy_rtx (out);
3169 else
3170 {
3171 rtx out1;
3172 out1 = copy_rtx (out);
3173 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3174 nops++;
3175 if (diff & 1)
3176 {
3177 tmp = gen_rtx_PLUS (mode, tmp, out1);
3178 nops++;
3179 }
3180 }
3181 if (cf != 0)
3182 {
c3185b64 3183 tmp = plus_constant (mode, tmp, cf);
3184 nops++;
3185 }
3186 if (!rtx_equal_p (tmp, out))
3187 {
3188 if (nops == 1)
3189 out = force_operand (tmp, copy_rtx (out));
3190 else
3191 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3192 }
3193 if (!rtx_equal_p (out, operands[0]))
3194 emit_move_insn (operands[0], copy_rtx (out));
3195
3196 return true;
3197 }
3198
3199 /*
3200 * General case: Jumpful:
3201 * xorl dest,dest cmpl op1, op2
3202 * cmpl op1, op2 movl ct, dest
3203 * setcc dest jcc 1f
3204 * decl dest movl cf, dest
3205 * andl (cf-ct),dest 1:
3206 * addl ct,dest
3207 *
3208 * Size 20. Size 14.
3209 *
3210 * This is reasonably steep, but branch mispredict costs are
 3211       * high on modern CPUs, so consider failing only if optimizing
3212 * for space.
3213 */
3214
3215 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3216 && BRANCH_COST (optimize_insn_for_speed_p (),
3217 false) >= 2)
3218 {
3219 if (cf == 0)
3220 {
3221 machine_mode cmp_mode = GET_MODE (op0);
3222 enum rtx_code new_code;
3223
3224 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3225 {
3226 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3227
3228 /* We may be reversing a non-trapping
3229 comparison to a trapping comparison. */
3230 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3231 && code != EQ && code != NE
3232 && code != ORDERED && code != UNORDERED)
3233 new_code = UNKNOWN;
3234 else
3235 new_code = reverse_condition_maybe_unordered (code);
3236
3237 }
3238 else
3239 {
3240 new_code = ix86_reverse_condition (code, cmp_mode);
3241 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3242 compare_code = reverse_condition (compare_code);
3243 }
3244
3245 if (new_code != UNKNOWN)
3246 {
3247 cf = ct;
3248 ct = 0;
3249 code = new_code;
3250 }
3251 }
3252
3253 if (compare_code != UNKNOWN)
3254 {
3255 /* notl op1 (if needed)
3256 sarl $31, op1
3257 andl (cf-ct), op1
3258 addl ct, op1
3259
3260 For x < 0 (resp. x <= -1) there will be no notl,
3261 so if possible swap the constants to get rid of the
3262 complement.
3263 True/false will be -1/0 while code below (store flag
3264 followed by decrement) is 0/-1, so the constants need
3265 to be exchanged once more. */
3266
3267 if (compare_code == GE || !cf)
3268 {
3269 code = reverse_condition (code);
3270 compare_code = LT;
3271 }
3272 else
3273 std::swap (ct, cf);
3274
3275 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3276 }
3277 else
3278 {
3279 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3280
3281 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3282 constm1_rtx,
3283 copy_rtx (out), 1, OPTAB_DIRECT);
3284 }
3285
3286 out = expand_simple_binop (mode, AND, copy_rtx (out),
3287 gen_int_mode (cf - ct, mode),
3288 copy_rtx (out), 1, OPTAB_DIRECT);
3289 if (ct)
3290 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3291 copy_rtx (out), 1, OPTAB_DIRECT);
3292 if (!rtx_equal_p (out, operands[0]))
3293 emit_move_insn (operands[0], copy_rtx (out));
3294
3295 return true;
3296 }
3297 }
3298
3299 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3300 {
3301 /* Try a few things more with specific constants and a variable. */
3302
3303 optab op;
3304 rtx var, orig_out, out, tmp;
3305
3306 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3307 return false;
3308
 3309      /* If one of the two operands is an interesting constant (0 or -1), load
 3310	 that constant by recursing, then mask the variable in with a logical operation.  */
3311
3312 if (CONST_INT_P (operands[2]))
3313 {
3314 var = operands[3];
3315 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3316 operands[3] = constm1_rtx, op = and_optab;
3317 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3318 operands[3] = const0_rtx, op = ior_optab;
3319 else
3320 return false;
3321 }
3322 else if (CONST_INT_P (operands[3]))
3323 {
3324 var = operands[2];
3325 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3326 {
3327 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3328 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3329 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3330 operands[1] = simplify_gen_relational (LT, VOIDmode,
3331 GET_MODE (op0),
3332 op0, const0_rtx);
3333
3334 operands[2] = constm1_rtx;
3335 op = and_optab;
3336 }
3337 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3338 operands[2] = const0_rtx, op = ior_optab;
3339 else
3340 return false;
3341 }
3342 else
3343 return false;
3344
3345 orig_out = operands[0];
3346 tmp = gen_reg_rtx (mode);
3347 operands[0] = tmp;
3348
3349 /* Recurse to get the constant loaded. */
3350 if (!ix86_expand_int_movcc (operands))
3351 return false;
3352
3353 /* Mask in the interesting variable. */
3354 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3355 OPTAB_WIDEN);
3356 if (!rtx_equal_p (out, orig_out))
3357 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3358
3359 return true;
3360 }
3361
3362 /*
3363 * For comparison with above,
3364 *
3365 * movl cf,dest
3366 * movl ct,tmp
3367 * cmpl op1,op2
3368 * cmovcc tmp,dest
3369 *
3370 * Size 15.
3371 */
3372
3373 if (! nonimmediate_operand (operands[2], mode))
3374 operands[2] = force_reg (mode, operands[2]);
3375 if (! nonimmediate_operand (operands[3], mode))
3376 operands[3] = force_reg (mode, operands[3]);
3377
3378 if (! register_operand (operands[2], VOIDmode)
3379 && (mode == QImode
3380 || ! register_operand (operands[3], VOIDmode)))
3381 operands[2] = force_reg (mode, operands[2]);
3382
3383 if (mode == QImode
3384 && ! register_operand (operands[3], VOIDmode))
3385 operands[3] = force_reg (mode, operands[3]);
3386
3387 emit_insn (compare_seq);
3388 emit_insn (gen_rtx_SET (operands[0],
3389 gen_rtx_IF_THEN_ELSE (mode,
3390 compare_op, operands[2],
3391 operands[3])));
3392 return true;
3393}
3394
3395/* Detect conditional moves that exactly match min/max operational
3396 semantics. Note that this is IEEE safe, as long as we don't
3397 interchange the operands.
3398
3399 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3400 and TRUE if the operation is successful and instructions are emitted. */
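/* Note that minss/minps and maxss/maxps return their second source
   operand when the inputs are unordered (or both zero), which is why
   the operand order must be preserved.  */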
3401
3402static bool
3403ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3404 rtx cmp_op1, rtx if_true, rtx if_false)
3405{
3406 machine_mode mode;
3407 bool is_min;
3408 rtx tmp;
3409
3410 if (code == LT)
3411 ;
3412 else if (code == UNGE)
3413 std::swap (if_true, if_false);
3414 else
3415 return false;
3416
3417 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3418 is_min = true;
3419 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3420 is_min = false;
3421 else
3422 return false;
3423
3424 mode = GET_MODE (dest);
3425
3426 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3427 but MODE may be a vector mode and thus not appropriate. */
3428 if (!flag_finite_math_only || flag_signed_zeros)
3429 {
3430 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3431 rtvec v;
3432
3433 if_true = force_reg (mode, if_true);
3434 v = gen_rtvec (2, if_true, if_false);
3435 tmp = gen_rtx_UNSPEC (mode, v, u);
3436 }
3437 else
3438 {
3439 code = is_min ? SMIN : SMAX;
3440 if (MEM_P (if_true) && MEM_P (if_false))
3441 if_true = force_reg (mode, if_true);
3442 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3443 }
3444
3445 emit_insn (gen_rtx_SET (dest, tmp));
3446 return true;
3447}
3448
 3449/* Return true if MODE is valid for a vector compare into a mask register;
 3450   the same holds for a conditional vector move with a mask register.  */
3451static bool
3452ix86_valid_mask_cmp_mode (machine_mode mode)
3453{
 3454   /* XOP has its own vector conditional move.  */
a8654147 3455 if (TARGET_XOP && !TARGET_AVX512F)
3456 return false;
3457
 3458   /* AVX512F is needed for mask operations.  */
3459 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3460 return false;
3461
 3462   /* AVX512BW is needed for vector QImode/HImode,
 3463      AVX512VL is needed for 128/256-bit vectors.  */
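	 /* E.g. a 128-bit V16QImode compare into a mask needs both
	    AVX512BW and AVX512VL, while a 512-bit V16SImode compare
	    needs only AVX512F.  */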
3464 machine_mode inner_mode = GET_MODE_INNER (mode);
3465 int vector_size = GET_MODE_SIZE (mode);
3466 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3467 return false;
3468
3469 return vector_size == 64 || TARGET_AVX512VL;
3470}
3471
3472/* Expand an SSE comparison. Return the register with the result. */
3473
3474static rtx
3475ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3476 rtx op_true, rtx op_false)
3477{
3478 machine_mode mode = GET_MODE (dest);
3479 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3480
 3481   /* In the general case the result of the comparison can differ from the operands' type.  */
3482 machine_mode cmp_mode;
3483
3484 /* In AVX512F the result of comparison is an integer mask. */
3485 bool maskcmp = false;
3486 rtx x;
3487
8b905e9b 3488 if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3489 {
3490 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
2bf6d935 3491 maskcmp = true;
8b905e9b 3492 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3493 }
3494 else
3495 cmp_mode = cmp_ops_mode;
3496
3497 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3498
3499 int (*op1_predicate)(rtx, machine_mode)
3500 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3501
3502 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3503 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3504
3505 if (optimize
3506 || (maskcmp && cmp_mode != mode)
3507 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3508 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3509 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3510
99e4891e 3511 if (maskcmp)
3512 {
3513 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3514 gcc_assert (ok);
3515 return dest;
3516 }
3517
3518 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3519
3520 if (cmp_mode != mode && !maskcmp)
3521 {
3522 x = force_reg (cmp_ops_mode, x);
3523 convert_move (dest, x, false);
3524 }
3525 else
3526 emit_insn (gen_rtx_SET (dest, x));
3527
3528 return dest;
3529}
3530
3531/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3532 operations. This is used for both scalar and vector conditional moves. */
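/* On SSE4.1/AVX targets this usually becomes a single blendv/pblendvb
   (or a vec_merge with a mask register on AVX512); otherwise it falls
   back to the classic (cmp & op_true) | (~cmp & op_false) and/andn/ior
   sequence emitted at the end of this function.  */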
3533
3534void
3535ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3536{
3537 machine_mode mode = GET_MODE (dest);
3538 machine_mode cmpmode = GET_MODE (cmp);
3539
9b5d50b7 3540 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3541 if (rtx_equal_p (op_true, op_false))
3542 {
3543 emit_move_insn (dest, op_true);
3544 return;
3545 }
3546
2bf6d935 3547 /* In AVX512F the result of comparison is an integer mask. */
8b905e9b 3548 bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3549
3550 rtx t2, t3, x;
3551
 3552   /* If we have an integer mask and an FP value then we need
 3553      to cast the mask to the FP mode.  */
3554 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3555 {
3556 cmp = force_reg (cmpmode, cmp);
3557 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3558 }
3559
3560 if (maskcmp)
3561 {
3562 /* Using vector move with mask register. */
3563 cmp = force_reg (cmpmode, cmp);
3564 /* Optimize for mask zero. */
3565 op_true = (op_true != CONST0_RTX (mode)
3566 ? force_reg (mode, op_true) : op_true);
3567 op_false = (op_false != CONST0_RTX (mode)
3568 ? force_reg (mode, op_false) : op_false);
3569 if (op_true == CONST0_RTX (mode))
2bf6d935 3570 {
3571 rtx (*gen_not) (rtx, rtx);
3572 switch (cmpmode)
2bf6d935 3573 {
3574 case E_QImode: gen_not = gen_knotqi; break;
3575 case E_HImode: gen_not = gen_knothi; break;
3576 case E_SImode: gen_not = gen_knotsi; break;
3577 case E_DImode: gen_not = gen_knotdi; break;
3578 default: gcc_unreachable ();
2bf6d935 3579 }
3580 rtx n = gen_reg_rtx (cmpmode);
3581 emit_insn (gen_not (n, cmp));
3582 cmp = n;
 3583	  /* Swap op_true and op_false.  */
3584 std::swap (op_true, op_false);
2bf6d935 3585 }
3586
3587 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3588 emit_insn (gen_rtx_SET (dest, vec_merge));
3589 return;
3590 }
3591 else if (vector_all_ones_operand (op_true, mode)
3592 && op_false == CONST0_RTX (mode))
3593 {
3594 emit_insn (gen_rtx_SET (dest, cmp));
3595 return;
3596 }
3597 else if (op_false == CONST0_RTX (mode))
3598 {
3599 op_true = force_reg (mode, op_true);
3600 x = gen_rtx_AND (mode, cmp, op_true);
3601 emit_insn (gen_rtx_SET (dest, x));
3602 return;
3603 }
3604 else if (op_true == CONST0_RTX (mode))
3605 {
3606 op_false = force_reg (mode, op_false);
3607 x = gen_rtx_NOT (mode, cmp);
3608 x = gen_rtx_AND (mode, x, op_false);
3609 emit_insn (gen_rtx_SET (dest, x));
3610 return;
3611 }
3612 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3613 {
3614 op_false = force_reg (mode, op_false);
3615 x = gen_rtx_IOR (mode, cmp, op_false);
3616 emit_insn (gen_rtx_SET (dest, x));
3617 return;
3618 }
3619 else if (TARGET_XOP)
3620 {
3621 op_true = force_reg (mode, op_true);
3622
3623 if (!nonimmediate_operand (op_false, mode))
3624 op_false = force_reg (mode, op_false);
3625
3626 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3627 op_true,
3628 op_false)));
3629 return;
3630 }
3631
3632 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3633 rtx d = dest;
3634
3635 if (!vector_operand (op_true, mode))
3636 op_true = force_reg (mode, op_true);
3637
3638 op_false = force_reg (mode, op_false);
3639
3640 switch (mode)
3641 {
3642 case E_V4SFmode:
3643 if (TARGET_SSE4_1)
3644 gen = gen_sse4_1_blendvps;
3645 break;
3646 case E_V2DFmode:
3647 if (TARGET_SSE4_1)
3648 gen = gen_sse4_1_blendvpd;
3649 break;
3650 case E_SFmode:
3651 if (TARGET_SSE4_1)
3652 {
3653 gen = gen_sse4_1_blendvss;
3654 op_true = force_reg (mode, op_true);
3655 }
3656 break;
3657 case E_DFmode:
3658 if (TARGET_SSE4_1)
3659 {
3660 gen = gen_sse4_1_blendvsd;
3661 op_true = force_reg (mode, op_true);
3662 }
3663 break;
3664 case E_V16QImode:
3665 case E_V8HImode:
3666 case E_V4SImode:
3667 case E_V2DImode:
3668 if (TARGET_SSE4_1)
3669 {
3670 gen = gen_sse4_1_pblendvb;
3671 if (mode != V16QImode)
3672 d = gen_reg_rtx (V16QImode);
3673 op_false = gen_lowpart (V16QImode, op_false);
3674 op_true = gen_lowpart (V16QImode, op_true);
3675 cmp = gen_lowpart (V16QImode, cmp);
3676 }
3677 break;
3678 case E_V8SFmode:
3679 if (TARGET_AVX)
3680 gen = gen_avx_blendvps256;
3681 break;
3682 case E_V4DFmode:
3683 if (TARGET_AVX)
3684 gen = gen_avx_blendvpd256;
3685 break;
3686 case E_V32QImode:
3687 case E_V16HImode:
3688 case E_V8SImode:
3689 case E_V4DImode:
3690 if (TARGET_AVX2)
3691 {
3692 gen = gen_avx2_pblendvb;
3693 if (mode != V32QImode)
3694 d = gen_reg_rtx (V32QImode);
3695 op_false = gen_lowpart (V32QImode, op_false);
3696 op_true = gen_lowpart (V32QImode, op_true);
3697 cmp = gen_lowpart (V32QImode, cmp);
3698 }
3699 break;
3700
3701 case E_V64QImode:
3702 gen = gen_avx512bw_blendmv64qi;
3703 break;
3704 case E_V32HImode:
3705 gen = gen_avx512bw_blendmv32hi;
3706 break;
3707 case E_V16SImode:
3708 gen = gen_avx512f_blendmv16si;
3709 break;
3710 case E_V8DImode:
3711 gen = gen_avx512f_blendmv8di;
3712 break;
3713 case E_V8DFmode:
3714 gen = gen_avx512f_blendmv8df;
3715 break;
3716 case E_V16SFmode:
3717 gen = gen_avx512f_blendmv16sf;
3718 break;
3719
3720 default:
3721 break;
3722 }
3723
3724 if (gen != NULL)
3725 {
3726 emit_insn (gen (d, op_false, op_true, cmp));
3727 if (d != dest)
3728 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3729 }
3730 else
3731 {
3732 op_true = force_reg (mode, op_true);
3733
3734 t2 = gen_reg_rtx (mode);
3735 if (optimize)
3736 t3 = gen_reg_rtx (mode);
3737 else
3738 t3 = dest;
3739
3740 x = gen_rtx_AND (mode, op_true, cmp);
3741 emit_insn (gen_rtx_SET (t2, x));
3742
3743 x = gen_rtx_NOT (mode, cmp);
3744 x = gen_rtx_AND (mode, x, op_false);
3745 emit_insn (gen_rtx_SET (t3, x));
3746
3747 x = gen_rtx_IOR (mode, t3, t2);
3748 emit_insn (gen_rtx_SET (dest, x));
3749 }
3750}
3751
3752/* Swap, force into registers, or otherwise massage the two operands
 3753   to an SSE comparison with a mask result.  Thus we differ a bit from
3754 ix86_prepare_fp_compare_args which expects to produce a flags result.
3755
3756 The DEST operand exists to help determine whether to commute commutative
3757 operators. The POP0/POP1 operands are updated in place. The new
3758 comparison code is returned, or UNKNOWN if not implementable. */
3759
3760static enum rtx_code
3761ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3762 rtx *pop0, rtx *pop1)
3763{
3764 switch (code)
3765 {
3766 case LTGT:
3767 case UNEQ:
3768 /* AVX supports all the needed comparisons. */
3769 if (TARGET_AVX)
3770 break;
3771 /* We have no LTGT as an operator. We could implement it with
3772 NE & ORDERED, but this requires an extra temporary. It's
3773 not clear that it's worth it. */
3774 return UNKNOWN;
3775
3776 case LT:
3777 case LE:
3778 case UNGT:
3779 case UNGE:
3780 /* These are supported directly. */
3781 break;
3782
3783 case EQ:
3784 case NE:
3785 case UNORDERED:
3786 case ORDERED:
3787 /* AVX has 3 operand comparisons, no need to swap anything. */
3788 if (TARGET_AVX)
3789 break;
3790 /* For commutative operators, try to canonicalize the destination
3791 operand to be first in the comparison - this helps reload to
3792 avoid extra moves. */
3793 if (!dest || !rtx_equal_p (dest, *pop1))
3794 break;
3795 /* FALLTHRU */
3796
3797 case GE:
3798 case GT:
3799 case UNLE:
3800 case UNLT:
3801 /* These are not supported directly before AVX, and furthermore
3802 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3803 comparison operands to transform into something that is
3804 supported. */
3805 std::swap (*pop0, *pop1);
3806 code = swap_condition (code);
3807 break;
3808
3809 default:
3810 gcc_unreachable ();
3811 }
3812
3813 return code;
3814}
3815
3816/* Expand a floating-point conditional move. Return true if successful. */
3817
3818bool
3819ix86_expand_fp_movcc (rtx operands[])
3820{
3821 machine_mode mode = GET_MODE (operands[0]);
3822 enum rtx_code code = GET_CODE (operands[1]);
3823 rtx tmp, compare_op;
3824 rtx op0 = XEXP (operands[1], 0);
3825 rtx op1 = XEXP (operands[1], 1);
3826
3827 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3828 {
3829 machine_mode cmode;
3830
 3831      /* Since we have no cmove for SSE registers, don't force bad register
 3832	 allocation just to gain access to it.  Deny movcc when the
 3833	 comparison mode doesn't match the move mode.  */
3834 cmode = GET_MODE (op0);
3835 if (cmode == VOIDmode)
3836 cmode = GET_MODE (op1);
3837 if (cmode != mode)
3838 return false;
3839
3840 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3841 if (code == UNKNOWN)
3842 return false;
3843
3844 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3845 operands[2], operands[3]))
3846 return true;
3847
3848 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3849 operands[2], operands[3]);
3850 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3851 return true;
3852 }
3853
3854 if (GET_MODE (op0) == TImode
3855 || (GET_MODE (op0) == DImode
3856 && !TARGET_64BIT))
3857 return false;
3858
3859 /* The floating point conditional move instructions don't directly
3860 support conditions resulting from a signed integer comparison. */
3861
3862 compare_op = ix86_expand_compare (code, op0, op1);
3863 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3864 {
3865 tmp = gen_reg_rtx (QImode);
3866 ix86_expand_setcc (tmp, code, op0, op1);
3867
3868 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3869 }
3870
3871 emit_insn (gen_rtx_SET (operands[0],
3872 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3873 operands[2], operands[3])));
3874
3875 return true;
3876}
3877
3878/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
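/* These values follow the VPCMP[U]{B,W,D,Q} immediate encoding:
   0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (i.e. GE), 6 = NLE (i.e. GT).  */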
3879
3880static int
3881ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3882{
3883 switch (code)
3884 {
3885 case EQ:
3886 return 0;
3887 case LT:
3888 case LTU:
3889 return 1;
3890 case LE:
3891 case LEU:
3892 return 2;
3893 case NE:
3894 return 4;
3895 case GE:
3896 case GEU:
3897 return 5;
3898 case GT:
3899 case GTU:
3900 return 6;
3901 default:
3902 gcc_unreachable ();
3903 }
3904}
3905
3906/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3907
3908static int
3909ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3910{
3911 switch (code)
3912 {
3913 case EQ:
3914 return 0x00;
3915 case NE:
3916 return 0x04;
3917 case GT:
3918 return 0x0e;
3919 case LE:
3920 return 0x02;
3921 case GE:
3922 return 0x0d;
3923 case LT:
3924 return 0x01;
3925 case UNLE:
3926 return 0x0a;
3927 case UNLT:
3928 return 0x09;
3929 case UNGE:
3930 return 0x05;
3931 case UNGT:
3932 return 0x06;
3933 case UNEQ:
3934 return 0x18;
3935 case LTGT:
3936 return 0x0c;
3937 case ORDERED:
3938 return 0x07;
3939 case UNORDERED:
3940 return 0x03;
3941 default:
3942 gcc_unreachable ();
3943 }
3944}
3945
3946/* Return immediate value to be used in UNSPEC_PCMP
3947 for comparison CODE in MODE. */
3948
3949static int
3950ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3951{
3952 if (FLOAT_MODE_P (mode))
3953 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3954 return ix86_int_cmp_code_to_pcmp_immediate (code);
3955}
3956
3957/* Expand AVX-512 vector comparison. */
3958
3959bool
99e4891e 3960ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
2bf6d935 3961{
99e4891e 3962 machine_mode mask_mode = GET_MODE (dest);
3963 machine_mode cmp_mode = GET_MODE (cmp_op0);
3964 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3965 int unspec_code;
3966 rtx unspec;
3967
3968 switch (code)
3969 {
3970 case LEU:
3971 case GTU:
3972 case GEU:
3973 case LTU:
3974 unspec_code = UNSPEC_UNSIGNED_PCMP;
3975 break;
3976
3977 default:
3978 unspec_code = UNSPEC_PCMP;
3979 }
3980
99e4891e 3981 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
2bf6d935 3982 unspec_code);
99e4891e 3983 emit_insn (gen_rtx_SET (dest, unspec));
3984
3985 return true;
3986}
3987
3988/* Expand fp vector comparison. */
3989
3990bool
3991ix86_expand_fp_vec_cmp (rtx operands[])
3992{
3993 enum rtx_code code = GET_CODE (operands[1]);
3994 rtx cmp;
3995
3996 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3997 &operands[2], &operands[3]);
3998 if (code == UNKNOWN)
3999 {
4000 rtx temp;
4001 switch (GET_CODE (operands[1]))
4002 {
4003 case LTGT:
4004 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4005 operands[3], NULL, NULL);
4006 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4007 operands[3], NULL, NULL);
4008 code = AND;
4009 break;
4010 case UNEQ:
4011 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4012 operands[3], NULL, NULL);
4013 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4014 operands[3], NULL, NULL);
4015 code = IOR;
4016 break;
4017 default:
4018 gcc_unreachable ();
4019 }
4020 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4021 OPTAB_DIRECT);
4022 }
4023 else
4024 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4025 operands[1], operands[2]);
4026
4027 if (operands[0] != cmp)
4028 emit_move_insn (operands[0], cmp);
4029
4030 return true;
4031}
4032
4033static rtx
4034ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4035 rtx op_true, rtx op_false, bool *negate)
4036{
4037 machine_mode data_mode = GET_MODE (dest);
4038 machine_mode mode = GET_MODE (cop0);
4039 rtx x;
4040
4041 *negate = false;
4042
4043 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4044 if (TARGET_XOP
4045 && (mode == V16QImode || mode == V8HImode
4046 || mode == V4SImode || mode == V2DImode))
4047 ;
4048 /* AVX512F supports all of the comparisons
4049 on all 128/256/512-bit vector int types. */
4050 else if (ix86_valid_mask_cmp_mode (mode))
4051 ;
4052 else
4053 {
4054 /* Canonicalize the comparison to EQ, GT, GTU. */
4055 switch (code)
4056 {
4057 case EQ:
4058 case GT:
4059 case GTU:
4060 break;
4061
4062 case NE:
4063 case LE:
4064 case LEU:
4065 code = reverse_condition (code);
4066 *negate = true;
4067 break;
4068
4069 case GE:
4070 case GEU:
4071 code = reverse_condition (code);
4072 *negate = true;
4073 /* FALLTHRU */
4074
4075 case LT:
4076 case LTU:
4077 std::swap (cop0, cop1);
4078 code = swap_condition (code);
4079 break;
4080
4081 default:
4082 gcc_unreachable ();
4083 }
4084
4085 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4086 if (mode == V2DImode)
4087 {
4088 switch (code)
4089 {
4090 case EQ:
4091 /* SSE4.1 supports EQ. */
4092 if (!TARGET_SSE4_1)
4093 return NULL;
4094 break;
4095
4096 case GT:
4097 case GTU:
4098 /* SSE4.2 supports GT/GTU. */
4099 if (!TARGET_SSE4_2)
4100 return NULL;
4101 break;
4102
4103 default:
4104 gcc_unreachable ();
4105 }
4106 }
4107
4108 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4109 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4110 if (*negate)
4111 std::swap (optrue, opfalse);
4112
4113 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4114 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4115 min (x, y) == x). While we add one instruction (the minimum),
4116 we remove the need for two instructions in the negation, as the
4117 result is already in the desired form.
4118 When using masks, do it for SI/DImode element types, as it is shorter
4119 than the two subtractions. */
4120 if ((code != EQ
4121 && GET_MODE_SIZE (mode) != 64
4122 && vector_all_ones_operand (opfalse, data_mode)
4123 && optrue == CONST0_RTX (data_mode))
4124 || (code == GTU
4125 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4126 /* Don't do it if not using integer masks and we'd end up with
4127 the right values in the registers though. */
4128 && (GET_MODE_SIZE (mode) == 64
4129 || !vector_all_ones_operand (optrue, data_mode)
4130 || opfalse != CONST0_RTX (data_mode))))
4131 {
4132 rtx (*gen) (rtx, rtx, rtx) = NULL;
4133
4134 switch (mode)
4135 {
4136 case E_V16SImode:
4137 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4138 break;
4139 case E_V8DImode:
4140 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4141 cop0 = force_reg (mode, cop0);
4142 cop1 = force_reg (mode, cop1);
4143 break;
4144 case E_V32QImode:
4145 if (TARGET_AVX2)
4146 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4147 break;
4148 case E_V16HImode:
4149 if (TARGET_AVX2)
4150 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4151 break;
4152 case E_V8SImode:
4153 if (TARGET_AVX2)
4154 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4155 break;
4156 case E_V4DImode:
4157 if (TARGET_AVX512VL)
4158 {
4159 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4160 cop0 = force_reg (mode, cop0);
4161 cop1 = force_reg (mode, cop1);
4162 }
4163 break;
4164 case E_V16QImode:
4165 if (code == GTU && TARGET_SSE2)
4166 gen = gen_uminv16qi3;
4167 else if (code == GT && TARGET_SSE4_1)
4168 gen = gen_sminv16qi3;
4169 break;
4170 case E_V8HImode:
4171 if (code == GTU && TARGET_SSE4_1)
4172 gen = gen_uminv8hi3;
4173 else if (code == GT && TARGET_SSE2)
4174 gen = gen_sminv8hi3;
4175 break;
4176 case E_V4SImode:
4177 if (TARGET_SSE4_1)
4178 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4179 break;
4180 case E_V2DImode:
4181 if (TARGET_AVX512VL)
4182 {
4183 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4184 cop0 = force_reg (mode, cop0);
4185 cop1 = force_reg (mode, cop1);
4186 }
4187 break;
4188 default:
4189 break;
4190 }
4191
4192 if (gen)
4193 {
4194 rtx tem = gen_reg_rtx (mode);
4195 if (!vector_operand (cop0, mode))
4196 cop0 = force_reg (mode, cop0);
4197 if (!vector_operand (cop1, mode))
4198 cop1 = force_reg (mode, cop1);
4199 *negate = !*negate;
4200 emit_insn (gen (tem, cop0, cop1));
4201 cop1 = tem;
4202 code = EQ;
4203 }
4204 }
4205
4206 /* Unsigned parallel compare is not supported by the hardware.
4207 Play some tricks to turn this into a signed comparison
4208 against 0. */
4209 if (code == GTU)
4210 {
4211 cop0 = force_reg (mode, cop0);
4212
4213 switch (mode)
4214 {
4215 case E_V16SImode:
4216 case E_V8DImode:
4217 case E_V8SImode:
4218 case E_V4DImode:
4219 case E_V4SImode:
4220 case E_V2DImode:
4221 {
4222 rtx t1, t2, mask;
4223
4224 /* Subtract (-(INT MAX) - 1) from both operands to make
4225 them signed. */
4226 mask = ix86_build_signbit_mask (mode, true, false);
4227 t1 = gen_reg_rtx (mode);
4228 emit_insn (gen_sub3_insn (t1, cop0, mask));
4229
4230 t2 = gen_reg_rtx (mode);
4231 emit_insn (gen_sub3_insn (t2, cop1, mask));
4232
4233 cop0 = t1;
4234 cop1 = t2;
4235 code = GT;
4236 }
4237 break;
4238
4239 case E_V64QImode:
4240 case E_V32HImode:
4241 case E_V32QImode:
4242 case E_V16HImode:
4243 case E_V16QImode:
4244 case E_V8HImode:
4245 /* Perform a parallel unsigned saturating subtraction. */
4246 x = gen_reg_rtx (mode);
4247 emit_insn (gen_rtx_SET
4248 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4249 cop0 = x;
4250 cop1 = CONST0_RTX (mode);
4251 code = EQ;
4252 *negate = !*negate;
4253 break;
4254
4255 default:
4256 gcc_unreachable ();
4257 }
4258 }
4259 }
4260
4261 if (*negate)
4262 std::swap (op_true, op_false);
4263
4264 /* Allow the comparison to be done in one mode, but the movcc to
4265 happen in another mode. */
4266 if (data_mode == mode)
4267 {
4268 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4269 op_true, op_false);
4270 }
4271 else
4272 {
4273 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4274 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4275 op_true, op_false);
4276 if (GET_MODE (x) == mode)
4277 x = gen_lowpart (data_mode, x);
4278 }
4279
4280 return x;
4281}
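
/* Illustrative sketches, not part of GCC: scalar models of the two tricks
   used above when the target lacks a native unsigned vector compare.  They
   assume 32-bit unsigned int and two's complement arithmetic, as the
   element-wise code does.  */
#if 0
/* GTU via signed GT: flipping the sign bit of both operands (the same as
   subtracting INT_MIN element-wise) preserves the unsigned ordering, so
   a >u b  iff  (a ^ 0x80000000) >s (b ^ 0x80000000).  */
static int
model_gtu_via_signed_gt (unsigned int a, unsigned int b)
{
  int sa = (int) (a ^ 0x80000000u);
  int sb = (int) (b ^ 0x80000000u);
  return sa > sb;
}

/* GTU via unsigned minimum: a >u b  iff  umin (a, b) != a, which is what
   the umin + EQ + negate sequence above computes.  */
static int
model_gtu_via_umin (unsigned int a, unsigned int b)
{
  unsigned int m = a < b ? a : b;	/* umin */
  return m != a;
}
#endif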
4282
4283/* Expand integer vector comparison. */
4284
4285bool
4286ix86_expand_int_vec_cmp (rtx operands[])
4287{
4288 rtx_code code = GET_CODE (operands[1]);
4289 bool negate = false;
4290 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4291 operands[3], NULL, NULL, &negate);
4292
4293 if (!cmp)
4294 return false;
4295
4296 if (negate)
4297 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4298 CONST0_RTX (GET_MODE (cmp)),
4299 NULL, NULL, &negate);
4300
4301 gcc_assert (!negate);
4302
4303 if (operands[0] != cmp)
4304 emit_move_insn (operands[0], cmp);
4305
4306 return true;
4307}
4308
4309/* Expand a floating-point vector conditional move; a vcond operation
4310 rather than a movcc operation. */
4311
4312bool
4313ix86_expand_fp_vcond (rtx operands[])
4314{
4315 enum rtx_code code = GET_CODE (operands[3]);
4316 rtx cmp;
4317
4318 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4319 &operands[4], &operands[5]);
4320 if (code == UNKNOWN)
4321 {
4322 rtx temp;
4323 switch (GET_CODE (operands[3]))
4324 {
4325 case LTGT:
4326 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4327 operands[5], operands[0], operands[0]);
4328 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4329 operands[5], operands[1], operands[2]);
4330 code = AND;
4331 break;
4332 case UNEQ:
4333 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4334 operands[5], operands[0], operands[0]);
4335 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4336 operands[5], operands[1], operands[2]);
4337 code = IOR;
4338 break;
4339 default:
4340 gcc_unreachable ();
4341 }
4342 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4343 OPTAB_DIRECT);
4344 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4345 return true;
4346 }
4347
4348 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4349 operands[5], operands[1], operands[2]))
4350 return true;
4351
4352 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4353 operands[1], operands[2]);
4354 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4355 return true;
4356}
4357
4358/* Expand a signed/unsigned integral vector conditional move. */
4359
4360bool
4361ix86_expand_int_vcond (rtx operands[])
4362{
4363 machine_mode data_mode = GET_MODE (operands[0]);
4364 machine_mode mode = GET_MODE (operands[4]);
4365 enum rtx_code code = GET_CODE (operands[3]);
4366 bool negate = false;
4367 rtx x, cop0, cop1;
4368
4369 cop0 = operands[4];
4370 cop1 = operands[5];
4371
4372 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4373 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4374 if ((code == LT || code == GE)
4375 && data_mode == mode
4376 && cop1 == CONST0_RTX (mode)
4377 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4378 && GET_MODE_UNIT_SIZE (data_mode) > 1
4379 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4380 && (GET_MODE_SIZE (data_mode) == 16
4381 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4382 {
4383 rtx negop = operands[2 - (code == LT)];
4384 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4385 if (negop == CONST1_RTX (data_mode))
4386 {
4387 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4388 operands[0], 1, OPTAB_DIRECT);
4389 if (res != operands[0])
4390 emit_move_insn (operands[0], res);
4391 return true;
4392 }
4393 else if (GET_MODE_INNER (data_mode) != DImode
4394 && vector_all_ones_operand (negop, data_mode))
4395 {
4396 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4397 operands[0], 0, OPTAB_DIRECT);
4398 if (res != operands[0])
4399 emit_move_insn (operands[0], res);
4400 return true;
4401 }
4402 }
4403
4404 if (!nonimmediate_operand (cop1, mode))
4405 cop1 = force_reg (mode, cop1);
4406 if (!general_operand (operands[1], data_mode))
4407 operands[1] = force_reg (data_mode, operands[1]);
4408 if (!general_operand (operands[2], data_mode))
4409 operands[2] = force_reg (data_mode, operands[2]);
4410
4411 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4412 operands[1], operands[2], &negate);
4413
4414 if (!x)
4415 return false;
4416
4417 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4418 operands[2-negate]);
4419 return true;
4420}
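
/* An illustrative sketch, not part of GCC: the scalar form of the shift
   special cases above, shown for 32-bit elements.  Right-shifting a
   negative signed value is implementation-defined in ISO C but is
   arithmetic on the targets this file generates code for.  */
#if 0
static int
model_lt0_all_ones (int x)
{
  return x >> 31;			/* x < 0 ? -1 : 0  */
}

static int
model_lt0_one (int x)
{
  return (int) ((unsigned int) x >> 31);	/* x < 0 ? 1 : 0  */
}
#endif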
4421
4422static bool
4423ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4424 struct expand_vec_perm_d *d)
4425{
4426 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4427 expander, so args are either in d, or in op0, op1 etc. */
4428 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4429 machine_mode maskmode = mode;
4430 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4431
4432 switch (mode)
4433 {
4434 case E_V8HImode:
4435 if (TARGET_AVX512VL && TARGET_AVX512BW)
4436 gen = gen_avx512vl_vpermt2varv8hi3;
4437 break;
4438 case E_V16HImode:
4439 if (TARGET_AVX512VL && TARGET_AVX512BW)
4440 gen = gen_avx512vl_vpermt2varv16hi3;
4441 break;
4442 case E_V64QImode:
4443 if (TARGET_AVX512VBMI)
4444 gen = gen_avx512bw_vpermt2varv64qi3;
4445 break;
4446 case E_V32HImode:
4447 if (TARGET_AVX512BW)
4448 gen = gen_avx512bw_vpermt2varv32hi3;
4449 break;
4450 case E_V4SImode:
4451 if (TARGET_AVX512VL)
4452 gen = gen_avx512vl_vpermt2varv4si3;
4453 break;
4454 case E_V8SImode:
4455 if (TARGET_AVX512VL)
4456 gen = gen_avx512vl_vpermt2varv8si3;
4457 break;
4458 case E_V16SImode:
4459 if (TARGET_AVX512F)
4460 gen = gen_avx512f_vpermt2varv16si3;
4461 break;
4462 case E_V4SFmode:
4463 if (TARGET_AVX512VL)
4464 {
4465 gen = gen_avx512vl_vpermt2varv4sf3;
4466 maskmode = V4SImode;
4467 }
4468 break;
4469 case E_V8SFmode:
4470 if (TARGET_AVX512VL)
4471 {
4472 gen = gen_avx512vl_vpermt2varv8sf3;
4473 maskmode = V8SImode;
4474 }
4475 break;
4476 case E_V16SFmode:
4477 if (TARGET_AVX512F)
4478 {
4479 gen = gen_avx512f_vpermt2varv16sf3;
4480 maskmode = V16SImode;
4481 }
4482 break;
4483 case E_V2DImode:
4484 if (TARGET_AVX512VL)
4485 gen = gen_avx512vl_vpermt2varv2di3;
4486 break;
4487 case E_V4DImode:
4488 if (TARGET_AVX512VL)
4489 gen = gen_avx512vl_vpermt2varv4di3;
4490 break;
4491 case E_V8DImode:
4492 if (TARGET_AVX512F)
4493 gen = gen_avx512f_vpermt2varv8di3;
4494 break;
4495 case E_V2DFmode:
4496 if (TARGET_AVX512VL)
4497 {
4498 gen = gen_avx512vl_vpermt2varv2df3;
4499 maskmode = V2DImode;
4500 }
4501 break;
4502 case E_V4DFmode:
4503 if (TARGET_AVX512VL)
4504 {
4505 gen = gen_avx512vl_vpermt2varv4df3;
4506 maskmode = V4DImode;
4507 }
4508 break;
4509 case E_V8DFmode:
4510 if (TARGET_AVX512F)
4511 {
4512 gen = gen_avx512f_vpermt2varv8df3;
4513 maskmode = V8DImode;
4514 }
4515 break;
4516 default:
4517 break;
4518 }
4519
4520 if (gen == NULL)
4521 return false;
4522
4523 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4524 expander, so args are either in d, or in op0, op1 etc. */
4525 if (d)
4526 {
4527 rtx vec[64];
4528 target = d->target;
4529 op0 = d->op0;
4530 op1 = d->op1;
4531 for (int i = 0; i < d->nelt; ++i)
4532 vec[i] = GEN_INT (d->perm[i]);
4533 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4534 }
4535
4536 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4537 return true;
4538}
4539
4540/* Expand a variable vector permutation. */
4541
4542void
4543ix86_expand_vec_perm (rtx operands[])
4544{
4545 rtx target = operands[0];
4546 rtx op0 = operands[1];
4547 rtx op1 = operands[2];
4548 rtx mask = operands[3];
4549 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4550 machine_mode mode = GET_MODE (op0);
4551 machine_mode maskmode = GET_MODE (mask);
4552 int w, e, i;
4553 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4554
4555 /* Number of elements in the vector. */
4556 w = GET_MODE_NUNITS (mode);
4557 e = GET_MODE_UNIT_SIZE (mode);
4558 gcc_assert (w <= 64);
4559
4560 if (TARGET_AVX512F && one_operand_shuffle)
4561 {
4562 rtx (*gen) (rtx, rtx, rtx) = NULL;
4563 switch (mode)
4564 {
4565 case E_V16SImode:
4566 gen = gen_avx512f_permvarv16si;
4567 break;
4568 case E_V16SFmode:
4569 gen = gen_avx512f_permvarv16sf;
4570 break;
4571 case E_V8DImode:
4572 gen = gen_avx512f_permvarv8di;
4573 break;
4574 case E_V8DFmode:
4575 gen = gen_avx512f_permvarv8df;
4576 break;
4577 default:
4578 break;
4579 }
4580 if (gen != NULL)
4581 {
4582 emit_insn (gen (target, op0, mask));
4583 return;
4584 }
4585 }
4586
4587 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4588 return;
4589
4590 if (TARGET_AVX2)
4591 {
4592 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4593 {
4594 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4595 a constant shuffle operand. With a tiny bit of effort we can
4596 use VPERMD instead. A re-interpretation stall for V4DFmode is
4597 unfortunate but there's no avoiding it.
4598 Similarly, for V16HImode we don't have instructions for variable
4599 shuffling, while for V32QImode we can, after preparing suitable
4600 masks, use vpshufb; vpshufb; vpermq; vpor. */
4601
4602 if (mode == V16HImode)
4603 {
4604 maskmode = mode = V32QImode;
4605 w = 32;
4606 e = 1;
4607 }
4608 else
4609 {
4610 maskmode = mode = V8SImode;
4611 w = 8;
4612 e = 4;
4613 }
4614 t1 = gen_reg_rtx (maskmode);
4615
4616 /* Replicate the low bits of the V4DImode mask into V8SImode:
4617 mask = { A B C D }
4618 t1 = { A A B B C C D D }. */
4619 for (i = 0; i < w / 2; ++i)
4620 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4621 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4622 vt = force_reg (maskmode, vt);
4623 mask = gen_lowpart (maskmode, mask);
4624 if (maskmode == V8SImode)
4625 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4626 else
4627 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4628
4629 /* Multiply the shuffle indices by two. */
4630 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4631 OPTAB_DIRECT);
4632
4633 /* Add one to the odd shuffle indices:
4634 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4635 for (i = 0; i < w / 2; ++i)
4636 {
4637 vec[i * 2] = const0_rtx;
4638 vec[i * 2 + 1] = const1_rtx;
4639 }
4640 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4641 vt = validize_mem (force_const_mem (maskmode, vt));
4642 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4643 OPTAB_DIRECT);
4644
4645 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4646 operands[3] = mask = t1;
4647 target = gen_reg_rtx (mode);
4648 op0 = gen_lowpart (mode, op0);
4649 op1 = gen_lowpart (mode, op1);
4650 }
4651
4652 switch (mode)
4653 {
4654 case E_V8SImode:
4655 /* The VPERMD and VPERMPS instructions already properly ignore
4656 the high bits of the shuffle elements. No need for us to
4657 perform an AND ourselves. */
4658 if (one_operand_shuffle)
4659 {
4660 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4661 if (target != operands[0])
4662 emit_move_insn (operands[0],
4663 gen_lowpart (GET_MODE (operands[0]), target));
4664 }
4665 else
4666 {
4667 t1 = gen_reg_rtx (V8SImode);
4668 t2 = gen_reg_rtx (V8SImode);
4669 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4670 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4671 goto merge_two;
4672 }
4673 return;
4674
4675 case E_V8SFmode:
4676 mask = gen_lowpart (V8SImode, mask);
4677 if (one_operand_shuffle)
4678 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4679 else
4680 {
4681 t1 = gen_reg_rtx (V8SFmode);
4682 t2 = gen_reg_rtx (V8SFmode);
4683 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4684 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4685 goto merge_two;
4686 }
4687 return;
4688
4689 case E_V4SImode:
4690 /* By combining the two 128-bit input vectors into one 256-bit
4691 input vector, we can use VPERMD and VPERMPS for the full
4692 two-operand shuffle. */
4693 t1 = gen_reg_rtx (V8SImode);
4694 t2 = gen_reg_rtx (V8SImode);
4695 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4696 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4697 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4698 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4699 return;
4700
4701 case E_V4SFmode:
4702 t1 = gen_reg_rtx (V8SFmode);
4703 t2 = gen_reg_rtx (V8SImode);
4704 mask = gen_lowpart (V4SImode, mask);
4705 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4706 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4707 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4708 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4709 return;
4710
4711 case E_V32QImode:
4712 t1 = gen_reg_rtx (V32QImode);
4713 t2 = gen_reg_rtx (V32QImode);
4714 t3 = gen_reg_rtx (V32QImode);
4715 vt2 = GEN_INT (-128);
4716 vt = gen_const_vec_duplicate (V32QImode, vt2);
4717 vt = force_reg (V32QImode, vt);
4718 for (i = 0; i < 32; i++)
4719 vec[i] = i < 16 ? vt2 : const0_rtx;
4720 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4721 vt2 = force_reg (V32QImode, vt2);
4722 /* From mask create two adjusted masks, which contain the same
4723 bits as mask in the low 7 bits of each vector element.
4724 The first mask will have the most significant bit clear
4725 if it requests element from the same 128-bit lane
4726 and MSB set if it requests element from the other 128-bit lane.
4727 The second mask will have the opposite values of the MSB,
4728 and additionally will have its 128-bit lanes swapped.
4729 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4730 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4731 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4732 stands for the other 12 bytes. */
4733 /* The bit whether element is from the same lane or the other
4734 lane is bit 4, so shift it up by 3 to the MSB position. */
4735 t5 = gen_reg_rtx (V4DImode);
4736 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4737 GEN_INT (3)));
4738 /* Clear MSB bits from the mask just in case it had them set. */
4739 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4740 /* After this t1 will have MSB set for elements from other lane. */
4741 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4742 /* Clear bits other than MSB. */
4743 emit_insn (gen_andv32qi3 (t1, t1, vt));
4744 /* Or in the lower bits from mask into t3. */
4745 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4746 /* And invert MSB bits in t1, so MSB is set for elements from the same
4747 lane. */
4748 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4749 /* Swap 128-bit lanes in t3. */
4750 t6 = gen_reg_rtx (V4DImode);
4751 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4752 const2_rtx, GEN_INT (3),
4753 const0_rtx, const1_rtx));
4754 /* And or in the lower bits from mask into t1. */
4755 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4756 if (one_operand_shuffle)
4757 {
4758 /* Each of these shuffles will put 0s in places where
4759 element from the other 128-bit lane is needed, otherwise
4760 will shuffle in the requested value. */
4761 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4762 gen_lowpart (V32QImode, t6)));
4763 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4764 /* For t3 the 128-bit lanes are swapped again. */
4765 t7 = gen_reg_rtx (V4DImode);
4766 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4767 const2_rtx, GEN_INT (3),
4768 const0_rtx, const1_rtx));
4769 /* And oring both together leads to the result. */
4770 emit_insn (gen_iorv32qi3 (target, t1,
4771 gen_lowpart (V32QImode, t7)));
4772 if (target != operands[0])
4773 emit_move_insn (operands[0],
4774 gen_lowpart (GET_MODE (operands[0]), target));
4775 return;
4776 }
4777
4778 t4 = gen_reg_rtx (V32QImode);
4779 /* Similarly to the one_operand_shuffle code above,
4780 just repeated twice, once for each operand. The merge_two:
4781 code will merge the two results together. */
4782 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4783 gen_lowpart (V32QImode, t6)));
4784 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4785 gen_lowpart (V32QImode, t6)));
4786 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4787 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4788 t7 = gen_reg_rtx (V4DImode);
4789 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4790 const2_rtx, GEN_INT (3),
4791 const0_rtx, const1_rtx));
4792 t8 = gen_reg_rtx (V4DImode);
4793 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4794 const2_rtx, GEN_INT (3),
4795 const0_rtx, const1_rtx));
4796 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4797 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4798 t1 = t4;
4799 t2 = t3;
4800 goto merge_two;
4801
4802 default:
4803 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4804 break;
4805 }
4806 }
4807
4808 if (TARGET_XOP)
4809 {
4810 /* The XOP VPPERM insn supports three inputs. By ignoring the
4811 one_operand_shuffle special case, we avoid creating another
4812 set of constant vectors in memory. */
4813 one_operand_shuffle = false;
4814
4815 /* mask = mask & {2*w-1, ...} */
4816 vt = GEN_INT (2*w - 1);
4817 }
4818 else
4819 {
4820 /* mask = mask & {w-1, ...} */
4821 vt = GEN_INT (w - 1);
4822 }
4823
4824 vt = gen_const_vec_duplicate (maskmode, vt);
4825 mask = expand_simple_binop (maskmode, AND, mask, vt,
4826 NULL_RTX, 0, OPTAB_DIRECT);
4827
4828 /* For non-QImode operations, convert the word permutation control
4829 into a byte permutation control. */
4830 if (mode != V16QImode)
4831 {
4832 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4833 GEN_INT (exact_log2 (e)),
4834 NULL_RTX, 0, OPTAB_DIRECT);
4835
4836 /* Convert mask to vector of chars. */
4837 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4838
4839 /* Replicate each of the input bytes into byte positions:
4840 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4841 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4842 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4843 for (i = 0; i < 16; ++i)
4844 vec[i] = GEN_INT (i/e * e);
4845 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4846 vt = validize_mem (force_const_mem (V16QImode, vt));
4847 if (TARGET_XOP)
4848 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4849 else
4850 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4851
4852 /* Convert it into the byte positions by doing
4853 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4854 for (i = 0; i < 16; ++i)
4855 vec[i] = GEN_INT (i % e);
4856 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4857 vt = validize_mem (force_const_mem (V16QImode, vt));
4858 emit_insn (gen_addv16qi3 (mask, mask, vt));
4859 }
4860
4861 /* The actual shuffle operations all operate on V16QImode. */
4862 op0 = gen_lowpart (V16QImode, op0);
4863 op1 = gen_lowpart (V16QImode, op1);
4864
4865 if (TARGET_XOP)
4866 {
4867 if (GET_MODE (target) != V16QImode)
4868 target = gen_reg_rtx (V16QImode);
4869 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4870 if (target != operands[0])
4871 emit_move_insn (operands[0],
4872 gen_lowpart (GET_MODE (operands[0]), target));
4873 }
4874 else if (one_operand_shuffle)
4875 {
4876 if (GET_MODE (target) != V16QImode)
4877 target = gen_reg_rtx (V16QImode);
4878 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4879 if (target != operands[0])
4880 emit_move_insn (operands[0],
4881 gen_lowpart (GET_MODE (operands[0]), target));
4882 }
4883 else
4884 {
4885 rtx xops[6];
4886 bool ok;
4887
4888 /* Shuffle the two input vectors independently. */
4889 t1 = gen_reg_rtx (V16QImode);
4890 t2 = gen_reg_rtx (V16QImode);
4891 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4892 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4893
4894 merge_two:
4895 /* Then merge them together. The key is whether any given control
4896 element contained a bit set that indicates the second word. */
4897 mask = operands[3];
4898 vt = GEN_INT (w);
4899 if (maskmode == V2DImode && !TARGET_SSE4_1)
4900 {
4901 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4902 more shuffle to convert the V2DI input mask into a V4SI
4903 input mask. At which point the masking that expand_int_vcond
4904 performs will work as desired. */
4905 rtx t3 = gen_reg_rtx (V4SImode);
4906 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4907 const0_rtx, const0_rtx,
4908 const2_rtx, const2_rtx));
4909 mask = t3;
4910 maskmode = V4SImode;
4911 e = w = 4;
4912 }
4913
4914 vt = gen_const_vec_duplicate (maskmode, vt);
4915 vt = force_reg (maskmode, vt);
4916 mask = expand_simple_binop (maskmode, AND, mask, vt,
4917 NULL_RTX, 0, OPTAB_DIRECT);
4918
4919 if (GET_MODE (target) != mode)
4920 target = gen_reg_rtx (mode);
4921 xops[0] = target;
4922 xops[1] = gen_lowpart (mode, t2);
4923 xops[2] = gen_lowpart (mode, t1);
4924 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4925 xops[4] = mask;
4926 xops[5] = vt;
4927 ok = ix86_expand_int_vcond (xops);
4928 gcc_assert (ok);
4929 if (target != operands[0])
4930 emit_move_insn (operands[0],
4931 gen_lowpart (GET_MODE (operands[0]), target));
4932 }
4933}
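
/* An illustrative sketch, not part of GCC: how the code above rewrites a
   V4SI word permutation control as the V16QI byte control that pshufb
   expects.  Element index k becomes the byte indices 4k, 4k+1, 4k+2 and
   4k+3; the helper below is a standalone model of that expansion.  */
#if 0
static void
model_word_to_byte_control (const unsigned char widx[4],
                            unsigned char bidx[16])
{
  for (int i = 0; i < 16; i++)
    {
      unsigned char k = (unsigned char) (widx[i / 4] & 3);   /* mask &= w - 1 */
      bidx[i] = (unsigned char) (k * 4 + i % 4);             /* scale and offset */
    }
}
#endif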
4934
4935 /* Unpack SRC into DEST as the next wider integer vector type. UNSIGNED_P is
4936 true if we should do zero extension, else sign extension. HIGH_P is
4937 true if we want the N/2 high elements, else the low elements. */
4938
4939void
4940ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4941{
4942 machine_mode imode = GET_MODE (src);
4943 rtx tmp;
4944
4945 if (TARGET_SSE4_1)
4946 {
4947 rtx (*unpack)(rtx, rtx);
4948 rtx (*extract)(rtx, rtx) = NULL;
4949 machine_mode halfmode = BLKmode;
4950
4951 switch (imode)
4952 {
4953 case E_V64QImode:
4954 if (unsigned_p)
4955 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4956 else
4957 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4958 halfmode = V32QImode;
4959 extract
4960 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4961 break;
4962 case E_V32QImode:
4963 if (unsigned_p)
4964 unpack = gen_avx2_zero_extendv16qiv16hi2;
4965 else
4966 unpack = gen_avx2_sign_extendv16qiv16hi2;
4967 halfmode = V16QImode;
4968 extract
4969 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4970 break;
4971 case E_V32HImode:
4972 if (unsigned_p)
4973 unpack = gen_avx512f_zero_extendv16hiv16si2;
4974 else
4975 unpack = gen_avx512f_sign_extendv16hiv16si2;
4976 halfmode = V16HImode;
4977 extract
4978 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4979 break;
4980 case E_V16HImode:
4981 if (unsigned_p)
4982 unpack = gen_avx2_zero_extendv8hiv8si2;
4983 else
4984 unpack = gen_avx2_sign_extendv8hiv8si2;
4985 halfmode = V8HImode;
4986 extract
4987 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4988 break;
4989 case E_V16SImode:
4990 if (unsigned_p)
4991 unpack = gen_avx512f_zero_extendv8siv8di2;
4992 else
4993 unpack = gen_avx512f_sign_extendv8siv8di2;
4994 halfmode = V8SImode;
4995 extract
4996 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4997 break;
4998 case E_V8SImode:
4999 if (unsigned_p)
5000 unpack = gen_avx2_zero_extendv4siv4di2;
5001 else
5002 unpack = gen_avx2_sign_extendv4siv4di2;
5003 halfmode = V4SImode;
5004 extract
5005 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5006 break;
5007 case E_V16QImode:
5008 if (unsigned_p)
5009 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5010 else
5011 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5012 break;
5013 case E_V8HImode:
5014 if (unsigned_p)
5015 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5016 else
5017 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5018 break;
5019 case E_V4SImode:
5020 if (unsigned_p)
5021 unpack = gen_sse4_1_zero_extendv2siv2di2;
5022 else
5023 unpack = gen_sse4_1_sign_extendv2siv2di2;
5024 break;
5025 default:
5026 gcc_unreachable ();
5027 }
5028
5029 if (GET_MODE_SIZE (imode) >= 32)
5030 {
5031 tmp = gen_reg_rtx (halfmode);
5032 emit_insn (extract (tmp, src));
5033 }
5034 else if (high_p)
5035 {
5036 /* Shift higher 8 bytes to lower 8 bytes. */
5037 tmp = gen_reg_rtx (V1TImode);
5038 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5039 GEN_INT (64)));
5040 tmp = gen_lowpart (imode, tmp);
5041 }
5042 else
5043 tmp = src;
5044
5045 emit_insn (unpack (dest, tmp));
5046 }
5047 else
5048 {
5049 rtx (*unpack)(rtx, rtx, rtx);
5050
5051 switch (imode)
5052 {
5053 case E_V16QImode:
5054 if (high_p)
5055 unpack = gen_vec_interleave_highv16qi;
5056 else
5057 unpack = gen_vec_interleave_lowv16qi;
5058 break;
5059 case E_V8HImode:
5060 if (high_p)
5061 unpack = gen_vec_interleave_highv8hi;
5062 else
5063 unpack = gen_vec_interleave_lowv8hi;
5064 break;
5065 case E_V4SImode:
5066 if (high_p)
5067 unpack = gen_vec_interleave_highv4si;
5068 else
5069 unpack = gen_vec_interleave_lowv4si;
5070 break;
5071 default:
5072 gcc_unreachable ();
5073 }
5074
5075 if (unsigned_p)
5076 tmp = force_reg (imode, CONST0_RTX (imode));
5077 else
5078 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5079 src, pc_rtx, pc_rtx);
5080
5081 rtx tmp2 = gen_reg_rtx (imode);
5082 emit_insn (unpack (tmp2, src, tmp));
5083 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5084 }
5085}
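
/* An illustrative sketch, not part of GCC: the pre-SSE4.1 widening above
   pairs each narrow element with either zero (zero extension) or with a
   mask of its sign bits, 0 > x ? -1 : 0 (sign extension); read as one
   wider little-endian element, the pair equals the extended value.  */
#if 0
static long long
model_widen_element (int x, int unsigned_p)
{
  unsigned int lo = (unsigned int) x;
  unsigned int hi = unsigned_p ? 0u : (x < 0 ? 0xffffffffu : 0u);
  return (long long) (((unsigned long long) hi << 32) | lo);
}
#endif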
5086
5087/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5088 but works for floating point parameters and non-offsettable memories.
5089 For pushes, it returns just stack offsets; the values will be saved
5090 in the right order. At most four parts are generated. */
5091
5092static int
5093ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5094{
5095 int size;
5096
5097 if (!TARGET_64BIT)
5098 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5099 else
5100 size = (GET_MODE_SIZE (mode) + 4) / 8;
5101
5102 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5103 gcc_assert (size >= 2 && size <= 4);
5104
5105 /* Optimize constant pool reference to immediates. This is used by fp
5106 moves, that force all constants to memory to allow combining. */
5107 if (MEM_P (operand) && MEM_READONLY_P (operand))
5108 operand = avoid_constant_pool_reference (operand);
5109
5110 if (MEM_P (operand) && !offsettable_memref_p (operand))
5111 {
5112 /* The only non-offsettable memories we handle are pushes. */
5113 int ok = push_operand (operand, VOIDmode);
5114
5115 gcc_assert (ok);
5116
5117 operand = copy_rtx (operand);
5118 PUT_MODE (operand, word_mode);
5119 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5120 return size;
5121 }
5122
5123 if (GET_CODE (operand) == CONST_VECTOR)
5124 {
5125 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5126 /* Caution: if we looked through a constant pool memory above,
5127 the operand may actually have a different mode now. That's
5128 ok, since we want to pun this all the way back to an integer. */
5129 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5130 gcc_assert (operand != NULL);
5131 mode = imode;
5132 }
5133
5134 if (!TARGET_64BIT)
5135 {
5136 if (mode == DImode)
5137 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5138 else
5139 {
5140 int i;
5141
5142 if (REG_P (operand))
5143 {
5144 gcc_assert (reload_completed);
5145 for (i = 0; i < size; i++)
5146 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5147 }
5148 else if (offsettable_memref_p (operand))
5149 {
5150 operand = adjust_address (operand, SImode, 0);
5151 parts[0] = operand;
5152 for (i = 1; i < size; i++)
5153 parts[i] = adjust_address (operand, SImode, 4 * i);
5154 }
5155 else if (CONST_DOUBLE_P (operand))
5156 {
5157 const REAL_VALUE_TYPE *r;
5158 long l[4];
5159
5160 r = CONST_DOUBLE_REAL_VALUE (operand);
5161 switch (mode)
5162 {
5163 case E_TFmode:
5164 real_to_target (l, r, mode);
5165 parts[3] = gen_int_mode (l[3], SImode);
5166 parts[2] = gen_int_mode (l[2], SImode);
5167 break;
5168 case E_XFmode:
5169 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5170 long double may not be 80-bit. */
5171 real_to_target (l, r, mode);
5172 parts[2] = gen_int_mode (l[2], SImode);
5173 break;
5174 case E_DFmode:
5175 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5176 break;
5177 default:
5178 gcc_unreachable ();
5179 }
5180 parts[1] = gen_int_mode (l[1], SImode);
5181 parts[0] = gen_int_mode (l[0], SImode);
5182 }
5183 else
5184 gcc_unreachable ();
5185 }
5186 }
5187 else
5188 {
5189 if (mode == TImode)
5190 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5191 if (mode == XFmode || mode == TFmode)
5192 {
5193 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5194 if (REG_P (operand))
5195 {
5196 gcc_assert (reload_completed);
5197 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5198 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5199 }
5200 else if (offsettable_memref_p (operand))
5201 {
5202 operand = adjust_address (operand, DImode, 0);
5203 parts[0] = operand;
5204 parts[1] = adjust_address (operand, upper_mode, 8);
5205 }
5206 else if (CONST_DOUBLE_P (operand))
5207 {
5208 long l[4];
5209
5210 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5211
5212 /* real_to_target puts 32-bit pieces in each long. */
5213 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5214 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5215 << 32), DImode);
5216
5217 if (upper_mode == SImode)
5218 parts[1] = gen_int_mode (l[2], SImode);
5219 else
5220 parts[1]
5221 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5222 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5223 << 32), DImode);
5224 }
5225 else
5226 gcc_unreachable ();
5227 }
5228 }
5229
5230 return size;
5231}
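
/* An illustrative sketch, not part of GCC: the DFmode constant case above
   for a 32-bit target.  The IEEE-754 double 1.0 has the bit pattern
   0x3ff0000000000000, so it splits into parts[0] = 0x00000000 and
   parts[1] = 0x3ff00000.  The model assumes the host double is also
   little-endian IEEE-754.  */
#if 0
#include <string.h>

static void
model_split_double (double d, unsigned int part[2])
{
  unsigned long long bits;
  memcpy (&bits, &d, sizeof bits);              /* raw bit pattern */
  part[0] = (unsigned int) (bits & 0xffffffffu);        /* low SImode word */
  part[1] = (unsigned int) (bits >> 32);                /* high SImode word */
}
#endif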
5232
5233 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5234 All required insns are emitted by this function; the caller never
5235 needs to emit normal moves itself. Operands 2-4 contain the input values
5236 in the correct order; operands 5-7 contain the output values. */
5237
5238void
5239ix86_split_long_move (rtx operands[])
5240{
5241 rtx part[2][4];
5242 int nparts, i, j;
5243 int push = 0;
5244 int collisions = 0;
5245 machine_mode mode = GET_MODE (operands[0]);
5246 bool collisionparts[4];
5247
5248 /* The DFmode expanders may ask us to move a double.
5249 For a 64-bit target this is a single move. By hiding that fact
5250 here we simplify the i386.md splitters. */
5251 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5252 {
5253 /* Optimize constant pool reference to immediates. This is used by
5254 fp moves, that force all constants to memory to allow combining. */
5255
5256 if (MEM_P (operands[1])
5257 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5258 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5259 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5260 if (push_operand (operands[0], VOIDmode))
5261 {
5262 operands[0] = copy_rtx (operands[0]);
5263 PUT_MODE (operands[0], word_mode);
5264 }
5265 else
5266 operands[0] = gen_lowpart (DImode, operands[0]);
5267 operands[1] = gen_lowpart (DImode, operands[1]);
5268 emit_move_insn (operands[0], operands[1]);
5269 return;
5270 }
5271
5272 /* The only non-offsettable memory we handle is push. */
5273 if (push_operand (operands[0], VOIDmode))
5274 push = 1;
5275 else
5276 gcc_assert (!MEM_P (operands[0])
5277 || offsettable_memref_p (operands[0]));
5278
5279 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5280 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5281
5282 /* When emitting a push, take care of source operands on the stack. */
5283 if (push && MEM_P (operands[1])
5284 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5285 {
5286 rtx src_base = XEXP (part[1][nparts - 1], 0);
5287
5288 /* Compensate for the stack decrement by 4. */
5289 if (!TARGET_64BIT && nparts == 3
5290 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5291 src_base = plus_constant (Pmode, src_base, 4);
5292
5293 /* src_base refers to the stack pointer and is
5294 automatically decreased by emitted push. */
5295 for (i = 0; i < nparts; i++)
5296 part[1][i] = change_address (part[1][i],
5297 GET_MODE (part[1][i]), src_base);
5298 }
5299
5300 /* We need to do copy in the right order in case an address register
5301 of the source overlaps the destination. */
5302 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5303 {
5304 rtx tmp;
5305
5306 for (i = 0; i < nparts; i++)
5307 {
5308 collisionparts[i]
5309 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5310 if (collisionparts[i])
5311 collisions++;
5312 }
5313
5314 /* Collision in the middle part can be handled by reordering. */
5315 if (collisions == 1 && nparts == 3 && collisionparts [1])
5316 {
5317 std::swap (part[0][1], part[0][2]);
5318 std::swap (part[1][1], part[1][2]);
5319 }
5320 else if (collisions == 1
5321 && nparts == 4
5322 && (collisionparts [1] || collisionparts [2]))
5323 {
5324 if (collisionparts [1])
5325 {
5326 std::swap (part[0][1], part[0][2]);
5327 std::swap (part[1][1], part[1][2]);
5328 }
5329 else
5330 {
5331 std::swap (part[0][2], part[0][3]);
5332 std::swap (part[1][2], part[1][3]);
5333 }
5334 }
5335
5336 /* If there are more collisions, we can't handle it by reordering.
5337 Do an lea to the last part and use only one colliding move. */
5338 else if (collisions > 1)
5339 {
5340 rtx base, addr;
5341
5342 collisions = 1;
5343
5344 base = part[0][nparts - 1];
5345
5346 /* Handle the case when the last part isn't valid for lea.
5347 Happens in 64-bit mode storing the 12-byte XFmode. */
5348 if (GET_MODE (base) != Pmode)
5349 base = gen_rtx_REG (Pmode, REGNO (base));
5350
5351 addr = XEXP (part[1][0], 0);
5352 if (TARGET_TLS_DIRECT_SEG_REFS)
5353 {
5354 struct ix86_address parts;
5355 int ok = ix86_decompose_address (addr, &parts);
5356 gcc_assert (ok);
5357 /* It is not valid to use %gs: or %fs: in lea. */
5358 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5359 }
5360 emit_insn (gen_rtx_SET (base, addr));
5361 part[1][0] = replace_equiv_address (part[1][0], base);
5362 for (i = 1; i < nparts; i++)
5363 {
5364 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5365 part[1][i] = replace_equiv_address (part[1][i], tmp);
5366 }
5367 }
5368 }
5369
5370 if (push)
5371 {
5372 if (!TARGET_64BIT)
5373 {
5374 if (nparts == 3)
5375 {
5376 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5377 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5378 emit_move_insn (part[0][2], part[1][2]);
5379 }
5380 else if (nparts == 4)
5381 {
5382 emit_move_insn (part[0][3], part[1][3]);
5383 emit_move_insn (part[0][2], part[1][2]);
5384 }
5385 }
5386 else
5387 {
5388 /* In 64-bit mode we don't have a 32-bit push available. In case this is
5389 a register, that is OK - we will just use the larger counterpart. We also
5390 retype memory - these come from an attempt to avoid a REX prefix on
5391 moving the second half of a TFmode value. */
5392 if (GET_MODE (part[1][1]) == SImode)
5393 {
5394 switch (GET_CODE (part[1][1]))
5395 {
5396 case MEM:
5397 part[1][1] = adjust_address (part[1][1], DImode, 0);
5398 break;
5399
5400 case REG:
5401 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5402 break;
5403
5404 default:
5405 gcc_unreachable ();
5406 }
5407
5408 if (GET_MODE (part[1][0]) == SImode)
5409 part[1][0] = part[1][1];
5410 }
5411 }
5412 emit_move_insn (part[0][1], part[1][1]);
5413 emit_move_insn (part[0][0], part[1][0]);
5414 return;
5415 }
5416
5417 /* Choose correct order to not overwrite the source before it is copied. */
5418 if ((REG_P (part[0][0])
5419 && REG_P (part[1][1])
5420 && (REGNO (part[0][0]) == REGNO (part[1][1])
5421 || (nparts == 3
5422 && REGNO (part[0][0]) == REGNO (part[1][2]))
5423 || (nparts == 4
5424 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5425 || (collisions > 0
5426 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5427 {
5428 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5429 {
5430 operands[2 + i] = part[0][j];
5431 operands[6 + i] = part[1][j];
5432 }
5433 }
5434 else
5435 {
5436 for (i = 0; i < nparts; i++)
5437 {
5438 operands[2 + i] = part[0][i];
5439 operands[6 + i] = part[1][i];
5440 }
5441 }
5442
5443 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5444 if (optimize_insn_for_size_p ())
5445 {
5446 for (j = 0; j < nparts - 1; j++)
5447 if (CONST_INT_P (operands[6 + j])
5448 && operands[6 + j] != const0_rtx
5449 && REG_P (operands[2 + j]))
5450 for (i = j; i < nparts - 1; i++)
5451 if (CONST_INT_P (operands[7 + i])
5452 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5453 operands[7 + i] = operands[2 + j];
5454 }
5455
5456 for (i = 0; i < nparts; i++)
5457 emit_move_insn (operands[2 + i], operands[6 + i]);
5458
5459 return;
5460}
5461
5462/* Helper function of ix86_split_ashl used to generate an SImode/DImode
5463 left shift by a constant, either using a single shift or
5464 a sequence of add instructions. */
5465
5466static void
5467ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5468{
5469 if (count == 1
5470 || (count * ix86_cost->add <= ix86_cost->shift_const
5471 && !optimize_insn_for_size_p ()))
5472 {
5473 while (count-- > 0)
5474 emit_insn (gen_add2_insn (operand, operand));
5475 }
5476 else
5477 {
5478 rtx (*insn)(rtx, rtx, rtx);
5479
5480 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5481 emit_insn (insn (operand, operand, GEN_INT (count)));
5482 }
5483}
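
/* An illustrative sketch, not part of GCC: the add-based left shift chosen
   above when COUNT is 1, or when COUNT additions are estimated to cost no
   more than one constant shift and we are not optimizing for size.  */
#if 0
static unsigned int
model_ashl_by_adds (unsigned int x, int count)
{
  while (count-- > 0)
    x += x;				/* x <<= 1 */
  return x;
}
#endif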
5484
5485void
5486ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5487{
5488 rtx (*gen_ashl3)(rtx, rtx, rtx);
5489 rtx (*gen_shld)(rtx, rtx, rtx);
5490 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5491 machine_mode half_mode;
5492
5493 rtx low[2], high[2];
5494 int count;
5495
5496 if (CONST_INT_P (operands[2]))
5497 {
5498 split_double_mode (mode, operands, 2, low, high);
5499 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5500
5501 if (count >= half_width)
5502 {
5503 emit_move_insn (high[0], low[1]);
5504 emit_move_insn (low[0], const0_rtx);
5505
5506 if (count > half_width)
5507 ix86_expand_ashl_const (high[0], count - half_width, mode);
5508 }
5509 else
5510 {
5511 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5512
5513 if (!rtx_equal_p (operands[0], operands[1]))
5514 emit_move_insn (operands[0], operands[1]);
5515
5516 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5517 ix86_expand_ashl_const (low[0], count, mode);
5518 }
5519 return;
5520 }
5521
5522 split_double_mode (mode, operands, 1, low, high);
5523 half_mode = mode == DImode ? SImode : DImode;
5524
5525 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5526
5527 if (operands[1] == const1_rtx)
5528 {
5529 /* Assuming we've chosen QImode-capable registers, 1 << N
5530 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5531 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5532 {
5533 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5534
5535 ix86_expand_clear (low[0]);
5536 ix86_expand_clear (high[0]);
5537 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5538
5539 d = gen_lowpart (QImode, low[0]);
5540 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5541 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5542 emit_insn (gen_rtx_SET (d, s));
5543
5544 d = gen_lowpart (QImode, high[0]);
5545 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5546 s = gen_rtx_NE (QImode, flags, const0_rtx);
5547 emit_insn (gen_rtx_SET (d, s));
5548 }
5549
5550 /* Otherwise, we can get the same results by manually performing
5551 a bit extract operation on bit 5/6, and then performing the two
5552 shifts. The two methods of getting 0/1 into low/high are exactly
5553 the same size. Avoiding the shift in the bit extract case helps
5554 pentium4 a bit; no one else seems to care much either way. */
5555 else
5556 {
5557 rtx (*gen_lshr3)(rtx, rtx, rtx);
5558 rtx (*gen_and3)(rtx, rtx, rtx);
5559 rtx (*gen_xor3)(rtx, rtx, rtx);
5560 HOST_WIDE_INT bits;
5561 rtx x;
5562
5563 if (mode == DImode)
5564 {
5565 gen_lshr3 = gen_lshrsi3;
5566 gen_and3 = gen_andsi3;
5567 gen_xor3 = gen_xorsi3;
5568 bits = 5;
5569 }
5570 else
5571 {
5572 gen_lshr3 = gen_lshrdi3;
5573 gen_and3 = gen_anddi3;
5574 gen_xor3 = gen_xordi3;
5575 bits = 6;
5576 }
5577
5578 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5579 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5580 else
5581 x = gen_lowpart (half_mode, operands[2]);
5582 emit_insn (gen_rtx_SET (high[0], x));
5583
5584 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5585 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5586 emit_move_insn (low[0], high[0]);
5587 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5588 }
5589
5590 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5591 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5592 return;
5593 }
5594
5595 if (operands[1] == constm1_rtx)
5596 {
5597 /* For -1 << N, we can avoid the shld instruction, because we
5598 know that we're shifting 0...31/63 ones into a -1. */
5599 emit_move_insn (low[0], constm1_rtx);
5600 if (optimize_insn_for_size_p ())
5601 emit_move_insn (high[0], low[0]);
5602 else
5603 emit_move_insn (high[0], constm1_rtx);
5604 }
5605 else
5606 {
5607 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5608
5609 if (!rtx_equal_p (operands[0], operands[1]))
5610 emit_move_insn (operands[0], operands[1]);
5611
5612 split_double_mode (mode, operands, 1, low, high);
5613 emit_insn (gen_shld (high[0], low[0], operands[2]));
5614 }
5615
5616 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5617
5618 if (TARGET_CMOVE && scratch)
5619 {
5620 ix86_expand_clear (scratch);
5621 emit_insn (gen_x86_shift_adj_1
5622 (half_mode, high[0], low[0], operands[2], scratch));
5623 }
5624 else
5625 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5626}
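
/* An illustrative sketch, not part of GCC: the double-word left shift
   emitted above for a 64-bit value kept in two 32-bit halves with a
   variable count.  SHLD shifts bits from the low half into the high half,
   and the final adjustment (gen_x86_shift_adj_1/2) handles counts of 32
   or more, since the hardware shifts mask the count to 0..31.  */
#if 0
static void
model_split_ashl (unsigned int *lo, unsigned int *hi, unsigned int count)
{
  unsigned int c = count & 31;
  if (c)
    *hi = (*hi << c) | (*lo >> (32 - c));	/* shld hi, lo, c */
  *lo <<= c;					/* shl lo, c */
  if (count & 32)				/* shift_adj: count >= 32 */
    {
      *hi = *lo;
      *lo = 0;
    }
}
#endif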
5627
5628void
5629ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5630{
5631 rtx (*gen_ashr3)(rtx, rtx, rtx)
5632 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5633 rtx (*gen_shrd)(rtx, rtx, rtx);
5634 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5635
5636 rtx low[2], high[2];
5637 int count;
5638
5639 if (CONST_INT_P (operands[2]))
5640 {
5641 split_double_mode (mode, operands, 2, low, high);
5642 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5643
5644 if (count == GET_MODE_BITSIZE (mode) - 1)
5645 {
5646 emit_move_insn (high[0], high[1]);
5647 emit_insn (gen_ashr3 (high[0], high[0],
5648 GEN_INT (half_width - 1)));
5649 emit_move_insn (low[0], high[0]);
5650
5651 }
5652 else if (count >= half_width)
5653 {
5654 emit_move_insn (low[0], high[1]);
5655 emit_move_insn (high[0], low[0]);
5656 emit_insn (gen_ashr3 (high[0], high[0],
5657 GEN_INT (half_width - 1)));
5658
5659 if (count > half_width)
5660 emit_insn (gen_ashr3 (low[0], low[0],
5661 GEN_INT (count - half_width)));
5662 }
5663 else
5664 {
5665 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5666
5667 if (!rtx_equal_p (operands[0], operands[1]))
5668 emit_move_insn (operands[0], operands[1]);
5669
5670 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5671 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5672 }
5673 }
5674 else
5675 {
5676 machine_mode half_mode;
5677
5678 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5679
5680 if (!rtx_equal_p (operands[0], operands[1]))
5681 emit_move_insn (operands[0], operands[1]);
5682
5683 split_double_mode (mode, operands, 1, low, high);
5684 half_mode = mode == DImode ? SImode : DImode;
5685
5686 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5687 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5688
5689 if (TARGET_CMOVE && scratch)
5690 {
5691 emit_move_insn (scratch, high[0]);
5692 emit_insn (gen_ashr3 (scratch, scratch,
5693 GEN_INT (half_width - 1)));
5694 emit_insn (gen_x86_shift_adj_1
5695 (half_mode, low[0], high[0], operands[2], scratch));
5696 }
5697 else
5698 emit_insn (gen_x86_shift_adj_3
5699 (half_mode, low[0], high[0], operands[2]));
5700 }
5701}
5702
5703void
5704ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5705{
5706 rtx (*gen_lshr3)(rtx, rtx, rtx)
5707 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5708 rtx (*gen_shrd)(rtx, rtx, rtx);
5709 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5710
5711 rtx low[2], high[2];
5712 int count;
5713
5714 if (CONST_INT_P (operands[2]))
5715 {
5716 split_double_mode (mode, operands, 2, low, high);
5717 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5718
5719 if (count >= half_width)
5720 {
5721 emit_move_insn (low[0], high[1]);
5722 ix86_expand_clear (high[0]);
5723
5724 if (count > half_width)
5725 emit_insn (gen_lshr3 (low[0], low[0],
5726 GEN_INT (count - half_width)));
5727 }
5728 else
5729 {
5730 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5731
5732 if (!rtx_equal_p (operands[0], operands[1]))
5733 emit_move_insn (operands[0], operands[1]);
5734
5735 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5736 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5737 }
5738 }
5739 else
5740 {
5741 machine_mode half_mode;
5742
5743 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5744
5745 if (!rtx_equal_p (operands[0], operands[1]))
5746 emit_move_insn (operands[0], operands[1]);
5747
5748 split_double_mode (mode, operands, 1, low, high);
5749 half_mode = mode == DImode ? SImode : DImode;
5750
5751 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5752 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5753
5754 if (TARGET_CMOVE && scratch)
5755 {
5756 ix86_expand_clear (scratch);
5757 emit_insn (gen_x86_shift_adj_1
5758 (half_mode, low[0], high[0], operands[2], scratch));
5759 }
5760 else
5761 emit_insn (gen_x86_shift_adj_2
5762 (half_mode, low[0], high[0], operands[2]));
5763 }
5764}
5765
5766/* Return mode for the memcpy/memset loop counter. Prefer SImode over
5767 DImode for constant loop counts. */
5768
5769static machine_mode
5770counter_mode (rtx count_exp)
5771{
5772 if (GET_MODE (count_exp) != VOIDmode)
5773 return GET_MODE (count_exp);
5774 if (!CONST_INT_P (count_exp))
5775 return Pmode;
5776 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5777 return DImode;
5778 return SImode;
5779}
5780
5781 /* When ISSETMEM is FALSE, output a simple loop to copy memory from SRCPTR
5782 to DESTPTR in chunks of MODE unrolled UNROLL times; the overall size is
5783 COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
5784 loop to set memory to VALUE (assumed to be in MODE).
5785
5786 The size is rounded down to a whole number of chunks moved at once.
5787 SRCMEM and DESTMEM provide MEM rtxes to supply proper aliasing info. */
5788
5789
5790static void
76715c32 5791expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
2bf6d935
ML
5792 rtx destptr, rtx srcptr, rtx value,
5793 rtx count, machine_mode mode, int unroll,
5794 int expected_size, bool issetmem)
5795{
5796 rtx_code_label *out_label, *top_label;
5797 rtx iter, tmp;
5798 machine_mode iter_mode = counter_mode (count);
5799 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5800 rtx piece_size = GEN_INT (piece_size_n);
5801 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5802 rtx size;
5803 int i;
5804
5805 top_label = gen_label_rtx ();
5806 out_label = gen_label_rtx ();
5807 iter = gen_reg_rtx (iter_mode);
5808
5809 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5810 NULL, 1, OPTAB_DIRECT);
5811 /* Those two should combine. */
5812 if (piece_size == const1_rtx)
5813 {
5814 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5815 true, out_label);
5816 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5817 }
5818 emit_move_insn (iter, const0_rtx);
5819
5820 emit_label (top_label);
5821
5822 tmp = convert_modes (Pmode, iter_mode, iter, true);
5823
5824 /* This assert could be relaxed - in that case we'll need to compute
5825 the smallest power of two containing PIECE_SIZE_N and pass it to
5826 offset_address. */
5827 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5828 destmem = offset_address (destmem, tmp, piece_size_n);
5829 destmem = adjust_address (destmem, mode, 0);
5830
5831 if (!issetmem)
5832 {
5833 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5834 srcmem = adjust_address (srcmem, mode, 0);
5835
5836 /* When unrolling for chips that reorder memory reads and writes,
5837 we can save registers by using a single temporary.
5838 Using 4 temporaries is also overkill in 32-bit mode. */
5839 if (!TARGET_64BIT && 0)
5840 {
5841 for (i = 0; i < unroll; i++)
5842 {
5843 if (i)
5844 {
5845 destmem = adjust_address (copy_rtx (destmem), mode,
5846 GET_MODE_SIZE (mode));
5847 srcmem = adjust_address (copy_rtx (srcmem), mode,
5848 GET_MODE_SIZE (mode));
5849 }
5850 emit_move_insn (destmem, srcmem);
5851 }
5852 }
5853 else
5854 {
5855 rtx tmpreg[4];
5856 gcc_assert (unroll <= 4);
5857 for (i = 0; i < unroll; i++)
5858 {
5859 tmpreg[i] = gen_reg_rtx (mode);
5860 if (i)
5861 srcmem = adjust_address (copy_rtx (srcmem), mode,
5862 GET_MODE_SIZE (mode));
5863 emit_move_insn (tmpreg[i], srcmem);
5864 }
5865 for (i = 0; i < unroll; i++)
5866 {
5867 if (i)
5868 destmem = adjust_address (copy_rtx (destmem), mode,
5869 GET_MODE_SIZE (mode));
5870 emit_move_insn (destmem, tmpreg[i]);
5871 }
5872 }
5873 }
5874 else
5875 for (i = 0; i < unroll; i++)
5876 {
5877 if (i)
5878 destmem = adjust_address (copy_rtx (destmem), mode,
5879 GET_MODE_SIZE (mode));
5880 emit_move_insn (destmem, value);
5881 }
5882
5883 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5884 true, OPTAB_LIB_WIDEN);
5885 if (tmp != iter)
5886 emit_move_insn (iter, tmp);
5887
5888 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5889 true, top_label);
5890 if (expected_size != -1)
5891 {
5892 expected_size /= GET_MODE_SIZE (mode) * unroll;
5893 if (expected_size == 0)
5894 predict_jump (0);
5895 else if (expected_size > REG_BR_PROB_BASE)
5896 predict_jump (REG_BR_PROB_BASE - 1);
5897 else
5898 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5899 / expected_size);
5900 }
5901 else
5902 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5903 iter = ix86_zero_extend_to_Pmode (iter);
5904 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5905 true, OPTAB_LIB_WIDEN);
5906 if (tmp != destptr)
5907 emit_move_insn (destptr, tmp);
5908 if (!issetmem)
5909 {
5910 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5911 true, OPTAB_LIB_WIDEN);
5912 if (tmp != srcptr)
5913 emit_move_insn (srcptr, tmp);
5914 }
5915 emit_label (out_label);
5916}
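
/* An illustrative sketch, not part of GCC: the C-level shape of the copy
   loop generated above for the !issetmem case.  COUNT is rounded down to
   a multiple of the chunk size (mode size times unroll factor, assumed to
   be a power of two); the remainder is left for the caller's epilogue.  */
#if 0
#include <stddef.h>
#include <string.h>

static void
model_cpymem_via_loop (char *dst, const char *src, size_t count,
                       size_t piece_size)
{
  size_t size = count & ~(piece_size - 1);	/* rounded-down main part */
  for (size_t iter = 0; iter < size; iter += piece_size)
    memcpy (dst + iter, src + iter, piece_size);	/* unrolled body */
  /* Bytes [size, count) are copied by separate epilogue code.  */
}
#endif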
5917
5918/* Divide COUNTREG by SCALE. */
5919static rtx
5920scale_counter (rtx countreg, int scale)
5921{
5922 rtx sc;
5923
5924 if (scale == 1)
5925 return countreg;
5926 if (CONST_INT_P (countreg))
5927 return GEN_INT (INTVAL (countreg) / scale);
5928 gcc_assert (REG_P (countreg));
5929
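 /* Callers pass a power-of-two SCALE (a mode size), so the division is
 just a logical right shift by log2 (SCALE); e.g. with SCALE == 4 a
 byte count becomes a count of SImode words. */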
5930 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5931 GEN_INT (exact_log2 (scale)),
5932 NULL, 1, OPTAB_DIRECT);
5933 return sc;
5934}
5935
5936/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5937 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5938 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5939 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5940 ORIG_VALUE is the original value passed to memset to fill the memory with.
5941 Other arguments have the same meaning as for the previous function. */
5942
5943static void
5944expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5945 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5946 rtx count,
5947 machine_mode mode, bool issetmem)
5948{
5949 rtx destexp;
5950 rtx srcexp;
5951 rtx countreg;
5952 HOST_WIDE_INT rounded_count;
5953
5954 /* If possible, it is shorter to use rep movs.
5955 TODO: Maybe it is better to move this logic to decide_alg. */
5956 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5957 && (!issetmem || orig_value == const0_rtx))
5958 mode = SImode;
5959
5960 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5961 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5962
5963 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5964 GET_MODE_SIZE (mode)));
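 /* DESTEXP (and SRCEXP below) describe the final pointer value,
 DESTPTR + COUNTREG * GET_MODE_SIZE (MODE); the rep_stos/rep_mov
 patterns use these expressions for the pointer-register updates. */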
5965 if (mode != QImode)
5966 {
5967 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5968 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5969 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5970 }
5971 else
5972 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5973 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5974 {
5975 rounded_count
5976 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5977 destmem = shallow_copy_rtx (destmem);
5978 set_mem_size (destmem, rounded_count);
5979 }
5980 else if (MEM_SIZE_KNOWN_P (destmem))
5981 clear_mem_size (destmem);
5982
5983 if (issetmem)
5984 {
5985 value = force_reg (mode, gen_lowpart (mode, value));
5986 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5987 }
5988 else
5989 {
5990 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5991 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5992 if (mode != QImode)
5993 {
5994 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5995 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5996 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5997 }
5998 else
5999 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6000 if (CONST_INT_P (count))
6001 {
6002 rounded_count
6003 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6004 srcmem = shallow_copy_rtx (srcmem);
6005 set_mem_size (srcmem, rounded_count);
6006 }
6007 else
6008 {
6009 if (MEM_SIZE_KNOWN_P (srcmem))
6010 clear_mem_size (srcmem);
6011 }
6012 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6013 destexp, srcexp));
6014 }
6015}
6016
6017/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6018 DESTMEM.
6019 SRCMEM is passed by pointer so it can be updated on return.
6020 The return value is the updated DESTMEM. */
6021static rtx
6022emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6023 HOST_WIDE_INT size_to_move)
6024{
6025 rtx dst = destmem, src = *srcmem, tempreg;
6026 enum insn_code code;
6027 machine_mode move_mode;
6028 int piece_size, i;
6029
6030 /* Find the widest mode in which we could perform moves.
6031 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
6032 it until a move of that size is supported. */
6033 piece_size = 1 << floor_log2 (size_to_move);
6034 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6035 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6036 {
6037 gcc_assert (piece_size > 1);
6038 piece_size >>= 1;
6039 }
6040
6041 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6042 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6043 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6044 {
6045 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6046 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6047 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6048 {
6049 move_mode = word_mode;
6050 piece_size = GET_MODE_SIZE (move_mode);
6051 code = optab_handler (mov_optab, move_mode);
6052 }
6053 }
6054 gcc_assert (code != CODE_FOR_nothing);
6055
6056 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6057 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6058
6059 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6060 gcc_assert (size_to_move % piece_size == 0);
6061
6062 for (i = 0; i < size_to_move; i += piece_size)
6063 {
6064 /* We move from memory to memory, so we'll need to do it via
6065 a temporary register. */
6066 tempreg = gen_reg_rtx (move_mode);
6067 emit_insn (GEN_FCN (code) (tempreg, src));
6068 emit_insn (GEN_FCN (code) (dst, tempreg));
6069
6070 emit_move_insn (destptr,
6071 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6072 emit_move_insn (srcptr,
6073 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6074
6075 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6076 piece_size);
6077 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6078 piece_size);
6079 }
6080
6081 /* Update DST and SRC rtx. */
6082 *srcmem = src;
6083 return dst;
6084}
6085
6086/* Helper function for the string operations below. Emit a test of
6087 VARIABLE & VALUE; if the result is zero, jump to the returned label. */
6088
6089static rtx_code_label *
6090ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6091{
6092 rtx_code_label *label = gen_label_rtx ();
6093 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6094 if (GET_MODE (variable) == DImode)
6095 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6096 else
6097 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6098 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6099 1, label);
6100 if (epilogue)
6101 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6102 else
6103 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6104 return label;
6105}
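/* A typical use, as in the epilogues below (sketch):

     rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
     ... emit a 4-byte move here; it runs only when (count & 4) != 0 ...
     emit_label (label);
     LABEL_NUSES (label) = 1;  */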
6106
6107
6108/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6109
6110static void
6111expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6112 rtx destptr, rtx srcptr, rtx count, int max_size)
6113{
6114 rtx src, dest;
6115 if (CONST_INT_P (count))
6116 {
6117 HOST_WIDE_INT countval = INTVAL (count);
6118 HOST_WIDE_INT epilogue_size = countval % max_size;
6119 int i;
6120
6121 /* For now MAX_SIZE should be a power of 2. This assert could be
6122 relaxed, but it'll require a bit more complicated epilogue
6123 expanding. */
6124 gcc_assert ((max_size & (max_size - 1)) == 0);
6125 for (i = max_size; i >= 1; i >>= 1)
6126 {
6127 if (epilogue_size & i)
6128 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6129 }
6130 return;
6131 }
6132 if (max_size > 8)
6133 {
6134 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6135 count, 1, OPTAB_DIRECT);
6136 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6137 count, QImode, 1, 4, false);
6138 return;
6139 }
6140
6141 /* When single-move string instructions are available, we can cheaply
6142 advance the dest and src pointers. Otherwise we save code size by
6143 maintaining an offset (zero is readily available from the preceding rep
6144 operation) and using x86 addressing modes. */
6145 if (TARGET_SINGLE_STRINGOP)
6146 {
6147 if (max_size > 4)
6148 {
6149 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6150 src = change_address (srcmem, SImode, srcptr);
6151 dest = change_address (destmem, SImode, destptr);
6152 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6153 emit_label (label);
6154 LABEL_NUSES (label) = 1;
6155 }
6156 if (max_size > 2)
6157 {
6158 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6159 src = change_address (srcmem, HImode, srcptr);
6160 dest = change_address (destmem, HImode, destptr);
6161 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6162 emit_label (label);
6163 LABEL_NUSES (label) = 1;
6164 }
6165 if (max_size > 1)
6166 {
6167 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6168 src = change_address (srcmem, QImode, srcptr);
6169 dest = change_address (destmem, QImode, destptr);
6170 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6171 emit_label (label);
6172 LABEL_NUSES (label) = 1;
6173 }
6174 }
6175 else
6176 {
6177 rtx offset = force_reg (Pmode, const0_rtx);
6178 rtx tmp;
6179
6180 if (max_size > 4)
6181 {
6182 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6183 src = change_address (srcmem, SImode, srcptr);
6184 dest = change_address (destmem, SImode, destptr);
6185 emit_move_insn (dest, src);
6186 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6187 true, OPTAB_LIB_WIDEN);
6188 if (tmp != offset)
6189 emit_move_insn (offset, tmp);
6190 emit_label (label);
6191 LABEL_NUSES (label) = 1;
6192 }
6193 if (max_size > 2)
6194 {
6195 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6196 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6197 src = change_address (srcmem, HImode, tmp);
6198 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6199 dest = change_address (destmem, HImode, tmp);
6200 emit_move_insn (dest, src);
6201 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6202 true, OPTAB_LIB_WIDEN);
6203 if (tmp != offset)
6204 emit_move_insn (offset, tmp);
6205 emit_label (label);
6206 LABEL_NUSES (label) = 1;
6207 }
6208 if (max_size > 1)
6209 {
6210 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6211 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6212 src = change_address (srcmem, QImode, tmp);
6213 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6214 dest = change_address (destmem, QImode, tmp);
6215 emit_move_insn (dest, src);
6216 emit_label (label);
6217 LABEL_NUSES (label) = 1;
6218 }
6219 }
6220}
6221
6222/* This function emits moves to fill SIZE_TO_MOVE bytes starting at DESTMEM
6223 with the value PROMOTED_VAL.
6224 The return value is the updated DESTMEM. */
6226static rtx
6227emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6228 HOST_WIDE_INT size_to_move)
6229{
6230 rtx dst = destmem;
6231 enum insn_code code;
6232 machine_mode move_mode;
6233 int piece_size, i;
6234
6235 /* Use the mode of PROMOTED_VAL for the moves (QImode if it is a
6236 VOIDmode constant), narrowing it when SIZE_TO_MOVE is smaller
6237 than that mode. */
6238 move_mode = GET_MODE (promoted_val);
6239 if (move_mode == VOIDmode)
6240 move_mode = QImode;
6241 if (size_to_move < GET_MODE_SIZE (move_mode))
6242 {
6243 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6244 move_mode = int_mode_for_size (move_bits, 0).require ();
6245 promoted_val = gen_lowpart (move_mode, promoted_val);
6246 }
6247 piece_size = GET_MODE_SIZE (move_mode);
6248 code = optab_handler (mov_optab, move_mode);
6249 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6250
6251 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6252
6253 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6254 gcc_assert (size_to_move % piece_size == 0);
6255
6256 for (i = 0; i < size_to_move; i += piece_size)
6257 {
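 /* Word-sized and smaller pieces go through the strset pattern, which
 updates DESTPTR itself; wider (vector) pieces use a plain move and
 DESTPTR is advanced explicitly below. */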
6258 if (piece_size <= GET_MODE_SIZE (word_mode))
6259 {
6260 emit_insn (gen_strset (destptr, dst, promoted_val));
6261 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6262 piece_size);
6263 continue;
6264 }
6265
6266 emit_insn (GEN_FCN (code) (dst, promoted_val));
6267
6268 emit_move_insn (destptr,
6269 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6270
6271 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6272 piece_size);
6273 }
6274
6275 /* Update DST rtx. */
6276 return dst;
6277}
6278/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6279static void
6280expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6281 rtx count, int max_size)
6282{
6283 count = expand_simple_binop (counter_mode (count), AND, count,
6284 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6285 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6286 gen_lowpart (QImode, value), count, QImode,
6287 1, max_size / 2, true);
6288}
6289
6290/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6291static void
6292expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6293 rtx count, int max_size)
6294{
6295 rtx dest;
6296
6297 if (CONST_INT_P (count))
6298 {
6299 HOST_WIDE_INT countval = INTVAL (count);
6300 HOST_WIDE_INT epilogue_size = countval % max_size;
6301 int i;
6302
6303 /* For now MAX_SIZE should be a power of 2. This assert could be
6304 relaxed, but it'll require a bit more complicated epilogue
6305 expanding. */
6306 gcc_assert ((max_size & (max_size - 1)) == 0);
6307 for (i = max_size; i >= 1; i >>= 1)
6308 {
6309 if (epilogue_size & i)
6310 {
6311 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6312 destmem = emit_memset (destmem, destptr, vec_value, i);
6313 else
6314 destmem = emit_memset (destmem, destptr, value, i);
6315 }
6316 }
6317 return;
6318 }
6319 if (max_size > 32)
6320 {
6321 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6322 return;
6323 }
6324 if (max_size > 16)
6325 {
6326 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6327 if (TARGET_64BIT)
6328 {
6329 dest = change_address (destmem, DImode, destptr);
6330 emit_insn (gen_strset (destptr, dest, value));
6331 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6332 emit_insn (gen_strset (destptr, dest, value));
6333 }
6334 else
6335 {
6336 dest = change_address (destmem, SImode, destptr);
6337 emit_insn (gen_strset (destptr, dest, value));
6338 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6339 emit_insn (gen_strset (destptr, dest, value));
6340 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6341 emit_insn (gen_strset (destptr, dest, value));
6342 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6343 emit_insn (gen_strset (destptr, dest, value));
6344 }
6345 emit_label (label);
6346 LABEL_NUSES (label) = 1;
6347 }
6348 if (max_size > 8)
6349 {
6350 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6351 if (TARGET_64BIT)
6352 {
6353 dest = change_address (destmem, DImode, destptr);
6354 emit_insn (gen_strset (destptr, dest, value));
6355 }
6356 else
6357 {
6358 dest = change_address (destmem, SImode, destptr);
6359 emit_insn (gen_strset (destptr, dest, value));
6360 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6361 emit_insn (gen_strset (destptr, dest, value));
6362 }
6363 emit_label (label);
6364 LABEL_NUSES (label) = 1;
6365 }
6366 if (max_size > 4)
6367 {
6368 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6369 dest = change_address (destmem, SImode, destptr);
6370 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6371 emit_label (label);
6372 LABEL_NUSES (label) = 1;
6373 }
6374 if (max_size > 2)
6375 {
6376 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6377 dest = change_address (destmem, HImode, destptr);
6378 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6379 emit_label (label);
6380 LABEL_NUSES (label) = 1;
6381 }
6382 if (max_size > 1)
6383 {
6384 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6385 dest = change_address (destmem, QImode, destptr);
6386 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6387 emit_label (label);
6388 LABEL_NUSES (label) = 1;
6389 }
6390}
6391
6392/* Decrease COUNTREG by VALUE. */
6393static void
6394ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6395{
6396 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6397}
6398
6399/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6400 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6401 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6402 ignored.
6403 Return value is updated DESTMEM. */
6404
6405static rtx
6406expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6407 rtx destptr, rtx srcptr, rtx value,
6408 rtx vec_value, rtx count, int align,
6409 int desired_alignment, bool issetmem)
6410{
6411 int i;
6412 for (i = 1; i < desired_alignment; i <<= 1)
6413 {
6414 if (align <= i)
6415 {
6416 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6417 if (issetmem)
6418 {
6419 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6420 destmem = emit_memset (destmem, destptr, vec_value, i);
6421 else
6422 destmem = emit_memset (destmem, destptr, value, i);
6423 }
6424 else
6425 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6426 ix86_adjust_counter (count, i);
6427 emit_label (label);
6428 LABEL_NUSES (label) = 1;
6429 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6430 }
6431 }
6432 return destmem;
6433}
6434
6435/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
6436 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6437 and jump to DONE_LABEL. */
6438static void
6439expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6440 rtx destptr, rtx srcptr,
6441 rtx value, rtx vec_value,
6442 rtx count, int size,
6443 rtx done_label, bool issetmem)
6444{
6445 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6446 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6447 rtx modesize;
6448 int n;
6449
6450 /* If we do not have a vector value to copy, we must reduce the size. */
6451 if (issetmem)
6452 {
6453 if (!vec_value)
6454 {
6455 if (GET_MODE (value) == VOIDmode && size > 8)
6456 mode = Pmode;
6457 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6458 mode = GET_MODE (value);
6459 }
6460 else
6461 mode = GET_MODE (vec_value), value = vec_value;
6462 }
6463 else
6464 {
6465 /* Choose appropriate vector mode. */
6466 if (size >= 32)
6467 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6468 else if (size >= 16)
6469 mode = TARGET_SSE ? V16QImode : DImode;
6470 srcmem = change_address (srcmem, mode, srcptr);
6471 }
6472 destmem = change_address (destmem, mode, destptr);
6473 modesize = GEN_INT (GET_MODE_SIZE (mode));
6474 gcc_assert (GET_MODE_SIZE (mode) <= size);
6475 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6476 {
6477 if (issetmem)
6478 emit_move_insn (destmem, gen_lowpart (mode, value));
6479 else
6480 {
6481 emit_move_insn (destmem, srcmem);
6482 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6483 }
6484 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6485 }
6486
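 /* Now handle the tail: after the loop above DESTMEM points SIZE bytes
 into the block, so adding COUNT and then -2*SIZE re-points it at byte
 COUNT - SIZE from the start. For a block of SIZE..2*SIZE-1 bytes the
 head and tail stores overlap, which is harmless here and avoids a
 variable-length loop. */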
6487 destmem = offset_address (destmem, count, 1);
6488 destmem = offset_address (destmem, GEN_INT (-2 * size),
6489 GET_MODE_SIZE (mode));
6490 if (!issetmem)
6491 {
6492 srcmem = offset_address (srcmem, count, 1);
6493 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6494 GET_MODE_SIZE (mode));
6495 }
6496 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6497 {
6498 if (issetmem)
6499 emit_move_insn (destmem, gen_lowpart (mode, value));
6500 else
6501 {
6502 emit_move_insn (destmem, srcmem);
6503 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6504 }
6505 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6506 }
6507 emit_jump_insn (gen_jump (done_label));
6508 emit_barrier ();
6509
6510 emit_label (label);
6511 LABEL_NUSES (label) = 1;
6512}
6513
6514/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
6515 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6516 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so we can
6517 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6518 DONE_LABEL is a label after the whole copying sequence. The label is created
6519 on demand if *DONE_LABEL is NULL.
6520 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
6521 bounds after the initial copies.
6522
6523 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6524 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6525 we will dispatch to a library call for large blocks.
6526
6527 In pseudocode we do:
6528
6529 if (COUNT < SIZE)
6530 {
6531 Assume that SIZE is 4. Bigger sizes are handled analogously
6532 if (COUNT & 4)
6533 {
6534 copy 4 bytes from SRCPTR to DESTPTR
6535 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6536 goto done_label
6537 }
6538 if (!COUNT)
6539 goto done_label;
6540 copy 1 byte from SRCPTR to DESTPTR
6541 if (COUNT & 2)
6542 {
6543 copy 2 bytes from SRCPTR to DESTPTR
6544 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6545 }
6546 }
6547 else
6548 {
6549 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6550 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6551
6552 OLD_DESPTR = DESTPTR;
6553 Align DESTPTR up to DESIRED_ALIGN
6554 SRCPTR += DESTPTR - OLD_DESTPTR
6555 COUNT -= DEST_PTR - OLD_DESTPTR
6556 if (DYNAMIC_CHECK)
6557 Round COUNT down to multiple of SIZE
6558 << optional caller supplied zero size guard is here >>
6559 << optional caller supplied dynamic check is here >>
6560 << caller supplied main copy loop is here >>
6561 }
6562 done_label:
6563 */
6564static void
6565expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6566 rtx *destptr, rtx *srcptr,
6567 machine_mode mode,
6568 rtx value, rtx vec_value,
6569 rtx *count,
6570 rtx_code_label **done_label,
6571 int size,
6572 int desired_align,
6573 int align,
6574 unsigned HOST_WIDE_INT *min_size,
6575 bool dynamic_check,
6576 bool issetmem)
6577{
6578 rtx_code_label *loop_label = NULL, *label;
6579 int n;
6580 rtx modesize;
6581 int prolog_size = 0;
6582 rtx mode_value;
6583
6584 /* Choose the proper value to copy. */
6585 if (issetmem && VECTOR_MODE_P (mode))
6586 mode_value = vec_value;
6587 else
6588 mode_value = value;
6589 gcc_assert (GET_MODE_SIZE (mode) <= size);
6590
6591 /* See if block is big or small, handle small blocks. */
6592 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6593 {
6594 int size2 = size;
6595 loop_label = gen_label_rtx ();
6596
6597 if (!*done_label)
6598 *done_label = gen_label_rtx ();
6599
6600 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6601 1, loop_label);
6602 size2 >>= 1;
6603
6604 /* Handle sizes > 3. */
6605 for (;size2 > 2; size2 >>= 1)
6606 expand_small_cpymem_or_setmem (destmem, srcmem,
6607 *destptr, *srcptr,
6608 value, vec_value,
6609 *count,
6610 size2, *done_label, issetmem);
6611 /* Nothing to copy? Jump to DONE_LABEL if so */
6612 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6613 1, *done_label);
6614
6615 /* Do a byte copy. */
6616 destmem = change_address (destmem, QImode, *destptr);
6617 if (issetmem)
6618 emit_move_insn (destmem, gen_lowpart (QImode, value));
6619 else
6620 {
6621 srcmem = change_address (srcmem, QImode, *srcptr);
6622 emit_move_insn (destmem, srcmem);
6623 }
6624
6625 /* Handle sizes 2 and 3. */
6626 label = ix86_expand_aligntest (*count, 2, false);
6627 destmem = change_address (destmem, HImode, *destptr);
6628 destmem = offset_address (destmem, *count, 1);
6629 destmem = offset_address (destmem, GEN_INT (-2), 2);
6630 if (issetmem)
6631 emit_move_insn (destmem, gen_lowpart (HImode, value));
6632 else
6633 {
6634 srcmem = change_address (srcmem, HImode, *srcptr);
6635 srcmem = offset_address (srcmem, *count, 1);
6636 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6637 emit_move_insn (destmem, srcmem);
6638 }
6639
6640 emit_label (label);
6641 LABEL_NUSES (label) = 1;
6642 emit_jump_insn (gen_jump (*done_label));
6643 emit_barrier ();
6644 }
6645 else
6646 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6647 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6648
6649 /* Start memcpy for COUNT >= SIZE. */
6650 if (loop_label)
6651 {
6652 emit_label (loop_label);
6653 LABEL_NUSES (loop_label) = 1;
6654 }
6655
6656 /* Copy first desired_align bytes. */
6657 if (!issetmem)
6658 srcmem = change_address (srcmem, mode, *srcptr);
6659 destmem = change_address (destmem, mode, *destptr);
6660 modesize = GEN_INT (GET_MODE_SIZE (mode));
6661 for (n = 0; prolog_size < desired_align - align; n++)
6662 {
6663 if (issetmem)
6664 emit_move_insn (destmem, mode_value);
6665 else
6666 {
6667 emit_move_insn (destmem, srcmem);
6668 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6669 }
6670 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6671 prolog_size += GET_MODE_SIZE (mode);
6672 }
6673
6674
6675 /* Copy last SIZE bytes. */
6676 destmem = offset_address (destmem, *count, 1);
6677 destmem = offset_address (destmem,
6678 GEN_INT (-size - prolog_size),
6679 1);
6680 if (issetmem)
6681 emit_move_insn (destmem, mode_value);
6682 else
6683 {
6684 srcmem = offset_address (srcmem, *count, 1);
6685 srcmem = offset_address (srcmem,
6686 GEN_INT (-size - prolog_size),
6687 1);
6688 emit_move_insn (destmem, srcmem);
6689 }
6690 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6691 {
6692 destmem = offset_address (destmem, modesize, 1);
6693 if (issetmem)
6694 emit_move_insn (destmem, mode_value);
6695 else
6696 {
6697 srcmem = offset_address (srcmem, modesize, 1);
6698 emit_move_insn (destmem, srcmem);
6699 }
6700 }
6701
6702 /* Align destination. */
6703 if (desired_align > 1 && desired_align > align)
6704 {
6705 rtx saveddest = *destptr;
6706
6707 gcc_assert (desired_align <= size);
6708 /* Align destptr up, place it to new register. */
6709 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6710 GEN_INT (prolog_size),
6711 NULL_RTX, 1, OPTAB_DIRECT);
6712 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6713 REG_POINTER (*destptr) = 1;
6714 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6715 GEN_INT (-desired_align),
6716 *destptr, 1, OPTAB_DIRECT);
6717 /* See how many bytes we skipped. */
6718 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6719 *destptr,
6720 saveddest, 1, OPTAB_DIRECT);
6721 /* Adjust srcptr and count. */
6722 if (!issetmem)
6723 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6724 saveddest, *srcptr, 1, OPTAB_DIRECT);
6725 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6726 saveddest, *count, 1, OPTAB_DIRECT);
6727 /* We copied at most size + prolog_size. */
6728 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6729 *min_size
6730 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6731 else
6732 *min_size = 0;
6733
6734 /* Our loops always round down the block size, but for dispatch to
6735 the library call we need the precise value. */
6736 if (dynamic_check)
6737 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6738 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6739 }
6740 else
6741 {
6742 gcc_assert (prolog_size == 0);
6743 /* Decrease count, so we won't end up copying last word twice. */
6744 if (!CONST_INT_P (*count))
6745 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6746 constm1_rtx, *count, 1, OPTAB_DIRECT);
6747 else
6748 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6749 (unsigned HOST_WIDE_INT)size));
6750 if (*min_size)
6751 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6752 }
6753}
6754
6755
6756/* This function is like the previous one, except here we know how many bytes
6757 need to be copied. That allows us to update alignment not only of DST, which
6758 is returned, but also of SRC, which is passed as a pointer for that
6759 reason. */
6760static rtx
6761expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6762 rtx srcreg, rtx value, rtx vec_value,
6763 int desired_align, int align_bytes,
6764 bool issetmem)
6765{
6766 rtx src = NULL;
6767 rtx orig_dst = dst;
6768 rtx orig_src = NULL;
6769 int piece_size = 1;
6770 int copied_bytes = 0;
6771
6772 if (!issetmem)
6773 {
6774 gcc_assert (srcp != NULL);
6775 src = *srcp;
6776 orig_src = src;
6777 }
6778
6779 for (piece_size = 1;
6780 piece_size <= desired_align && copied_bytes < align_bytes;
6781 piece_size <<= 1)
6782 {
6783 if (align_bytes & piece_size)
6784 {
6785 if (issetmem)
6786 {
6787 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6788 dst = emit_memset (dst, destreg, vec_value, piece_size);
6789 else
6790 dst = emit_memset (dst, destreg, value, piece_size);
6791 }
6792 else
6793 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6794 copied_bytes += piece_size;
6795 }
6796 }
6797 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6798 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6799 if (MEM_SIZE_KNOWN_P (orig_dst))
6800 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6801
6802 if (!issetmem)
6803 {
6804 int src_align_bytes = get_mem_align_offset (src, desired_align
6805 * BITS_PER_UNIT);
6806 if (src_align_bytes >= 0)
6807 src_align_bytes = desired_align - src_align_bytes;
6808 if (src_align_bytes >= 0)
6809 {
6810 unsigned int src_align;
6811 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6812 {
6813 if ((src_align_bytes & (src_align - 1))
6814 == (align_bytes & (src_align - 1)))
6815 break;
6816 }
6817 if (src_align > (unsigned int) desired_align)
6818 src_align = desired_align;
6819 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6820 set_mem_align (src, src_align * BITS_PER_UNIT);
6821 }
6822 if (MEM_SIZE_KNOWN_P (orig_src))
6823 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6824 *srcp = src;
6825 }
6826
6827 return dst;
6828}
6829
6830/* Return true if ALG can be used in current context.
6831 Assume we expand memset if MEMSET is true. */
6832static bool
6833alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6834{
6835 if (alg == no_stringop)
6836 return false;
6837 if (alg == vector_loop)
6838 return TARGET_SSE || TARGET_AVX;
6839 /* Algorithms using the rep prefix want at least edi and ecx;
6840 additionally, memset wants eax and memcpy wants esi. Don't
6841 consider such algorithms if the user has appropriated those
6842 registers for their own purposes, or if we have a non-default
6843 address space, since some string insns cannot override the segment. */
6844 if (alg == rep_prefix_1_byte
6845 || alg == rep_prefix_4_byte
6846 || alg == rep_prefix_8_byte)
6847 {
6848 if (have_as)
6849 return false;
6850 if (fixed_regs[CX_REG]
6851 || fixed_regs[DI_REG]
6852 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6853 return false;
6854 }
6855 return true;
6856}
6857
6858/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6859static enum stringop_alg
6860decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6861 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6862 bool memset, bool zero_memset, bool have_as,
6863 int *dynamic_check, bool *noalign, bool recur)
6864{
6865 const struct stringop_algs *algs;
6866 bool optimize_for_speed;
6867 int max = 0;
6868 const struct processor_costs *cost;
6869 int i;
6870 bool any_alg_usable_p = false;
6871
6872 *noalign = false;
6873 *dynamic_check = -1;
6874
6875 /* Even if the string operation call is cold, we still might spend a lot
6876 of time processing large blocks. */
6877 if (optimize_function_for_size_p (cfun)
6878 || (optimize_insn_for_size_p ()
6879 && (max_size < 256
6880 || (expected_size != -1 && expected_size < 256))))
6881 optimize_for_speed = false;
6882 else
6883 optimize_for_speed = true;
6884
6885 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6886 if (memset)
6887 algs = &cost->memset[TARGET_64BIT != 0];
6888 else
6889 algs = &cost->memcpy[TARGET_64BIT != 0];
6890
6891 /* See maximal size for user defined algorithm. */
6892 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6893 {
6894 enum stringop_alg candidate = algs->size[i].alg;
6895 bool usable = alg_usable_p (candidate, memset, have_as);
6896 any_alg_usable_p |= usable;
6897
6898 if (candidate != libcall && candidate && usable)
6899 max = algs->size[i].max;
6900 }
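 /* MAX now holds the largest block size for which the cost tables enable
 some usable non-libcall algorithm; -1 means there is no upper bound. */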
6901
6902 /* If the expected size is not known but the max size is small enough
6903 that the inline version is a win, set the expected size to the
6904 middle of the range. */
6905 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6906 && expected_size == -1)
6907 expected_size = min_size / 2 + max_size / 2;
6908
6909 /* If user specified the algorithm, honor it if possible. */
6910 if (ix86_stringop_alg != no_stringop
6911 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6912 return ix86_stringop_alg;
6913 /* rep; movq or rep; movl is the smallest variant. */
6914 else if (!optimize_for_speed)
6915 {
6916 *noalign = true;
6917 if (!count || (count & 3) || (memset && !zero_memset))
6918 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6919 ? rep_prefix_1_byte : loop_1_byte;
6920 else
6921 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6922 ? rep_prefix_4_byte : loop;
6923 }
6924 /* Very tiny blocks are best handled via the loop; REP is expensive to
6925 set up. */
6926 else if (expected_size != -1 && expected_size < 4)
6927 return loop_1_byte;
6928 else if (expected_size != -1)
6929 {
6930 enum stringop_alg alg = libcall;
6931 bool alg_noalign = false;
6932 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6933 {
6934 /* We get here if the algorithms that were not libcall-based
6935 were rep-prefix based and we are unable to use rep prefixes
6936 based on global register usage. Break out of the loop and
6937 use the heuristic below. */
6938 if (algs->size[i].max == 0)
6939 break;
6940 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6941 {
6942 enum stringop_alg candidate = algs->size[i].alg;
6943
6944 if (candidate != libcall
6945 && alg_usable_p (candidate, memset, have_as))
6946 {
6947 alg = candidate;
6948 alg_noalign = algs->size[i].noalign;
6949 }
6950 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6951 last non-libcall inline algorithm. */
6952 if (TARGET_INLINE_ALL_STRINGOPS)
6953 {
6954 /* When the current size is best to be copied by a libcall,
6955 but we are still forced to inline, run the heuristic below
6956 that will pick code for medium sized blocks. */
6957 if (alg != libcall)
6958 {
6959 *noalign = alg_noalign;
6960 return alg;
6961 }
6962 else if (!any_alg_usable_p)
6963 break;
6964 }
6965 else if (alg_usable_p (candidate, memset, have_as))
6966 {
6967 *noalign = algs->size[i].noalign;
6968 return candidate;
6969 }
6970 }
6971 }
6972 }
6973 /* When asked to inline the call anyway, try to pick a meaningful choice.
6974 We look for the maximal size of block that is faster to copy by hand and
6975 take blocks of at most that size, guessing that the average size will
6976 be roughly half of the block.
6977
6978 If this turns out to be bad, we might simply specify the preferred
6979 choice in ix86_costs. */
6980 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6981 && (algs->unknown_size == libcall
6982 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6983 {
6984 enum stringop_alg alg;
6985 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6986
6987 /* If there aren't any usable algorithms or if recursing already,
6988 then recursing on smaller sizes or same size isn't going to
6989 find anything. Just return the simple byte-at-a-time copy loop. */
6990 if (!any_alg_usable_p || recur)
6991 {
6992 /* Pick something reasonable. */
6993 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6994 *dynamic_check = 128;
6995 return loop_1_byte;
6996 }
6997 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6998 zero_memset, have_as, dynamic_check, noalign, true);
6999 gcc_assert (*dynamic_check == -1);
7000 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7001 *dynamic_check = max;
7002 else
7003 gcc_assert (alg != libcall);
7004 return alg;
7005 }
7006 return (alg_usable_p (algs->unknown_size, memset, have_as)
7007 ? algs->unknown_size : libcall);
7008}
7009
7010/* Decide on alignment. We know that the operand is already aligned to ALIGN
7011 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7012static int
7013decide_alignment (int align,
7014 enum stringop_alg alg,
7015 int expected_size,
7016 machine_mode move_mode)
7017{
7018 int desired_align = 0;
7019
7020 gcc_assert (alg != no_stringop);
7021
7022 if (alg == libcall)
7023 return 0;
7024 if (move_mode == VOIDmode)
7025 return 0;
7026
7027 desired_align = GET_MODE_SIZE (move_mode);
7028 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7029 copying a whole cache line at once. */
7030 if (TARGET_PENTIUMPRO
7031 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7032 desired_align = 8;
7033
7034 if (optimize_size)
7035 desired_align = 1;
7036 if (desired_align < align)
7037 desired_align = align;
7038 if (expected_size != -1 && expected_size < 4)
7039 desired_align = align;
7040
7041 return desired_align;
7042}
7043
7044
7045/* Helper function for memset. For a QImode value 0xXY produce
7046 0xXYXYXYXY of the width specified by MODE. This is essentially
7047 val * 0x01010101, but we can do slightly better than
7048 synth_mult by unwinding the sequence by hand on CPUs with
7049 slow multiply. */
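/* For example: QImode 0xAB becomes SImode 0xABABABAB or DImode
   0xABABABABABABABAB, built for constants by replicating the low byte
   with shifts and ORs, and for registers by either the multiply or the
   shift/IOR sequence below. */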
7050static rtx
7051promote_duplicated_reg (machine_mode mode, rtx val)
7052{
7053 machine_mode valmode = GET_MODE (val);
7054 rtx tmp;
7055 int nops = mode == DImode ? 3 : 2;
7056
7057 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7058 if (val == const0_rtx)
7059 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7060 if (CONST_INT_P (val))
7061 {
7062 HOST_WIDE_INT v = INTVAL (val) & 255;
7063
7064 v |= v << 8;
7065 v |= v << 16;
7066 if (mode == DImode)
7067 v |= (v << 16) << 16;
7068 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7069 }
7070
7071 if (valmode == VOIDmode)
7072 valmode = QImode;
7073 if (valmode != QImode)
7074 val = gen_lowpart (QImode, val);
7075 if (mode == QImode)
7076 return val;
7077 if (!TARGET_PARTIAL_REG_STALL)
7078 nops--;
7079 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7080 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7081 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7082 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7083 {
7084 rtx reg = convert_modes (mode, QImode, val, true);
7085 tmp = promote_duplicated_reg (mode, const1_rtx);
7086 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7087 OPTAB_DIRECT);
7088 }
7089 else
7090 {
7091 rtx reg = convert_modes (mode, QImode, val, true);
7092
7093 if (!TARGET_PARTIAL_REG_STALL)
7094 emit_insn (gen_insv_1 (mode, reg, reg));
7095 else
7096 {
7097 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7098 NULL, 1, OPTAB_DIRECT);
7099 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7100 OPTAB_DIRECT);
7101 }
7102 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7103 NULL, 1, OPTAB_DIRECT);
7104 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7105 if (mode == SImode)
7106 return reg;
7107 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7108 NULL, 1, OPTAB_DIRECT);
7109 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7110 return reg;
7111 }
7112}
7113
7114/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7115 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7116 alignment from ALIGN to DESIRED_ALIGN. */
7117static rtx
7118promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7119 int align)
7120{
7121 rtx promoted_val;
7122
7123 if (TARGET_64BIT
7124 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7125 promoted_val = promote_duplicated_reg (DImode, val);
7126 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7127 promoted_val = promote_duplicated_reg (SImode, val);
7128 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7129 promoted_val = promote_duplicated_reg (HImode, val);
7130 else
7131 promoted_val = val;
7132
7133 return promoted_val;
7134}
7135
7136/* Copy the address to a Pmode register. This is used for x32 to
7137 truncate DImode TLS address to a SImode register. */
7138
7139static rtx
7140ix86_copy_addr_to_reg (rtx addr)
7141{
7142 rtx reg;
7143 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7144 {
7145 reg = copy_addr_to_reg (addr);
7146 REG_POINTER (reg) = 1;
7147 return reg;
7148 }
7149 else
7150 {
7151 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7152 reg = copy_to_mode_reg (DImode, addr);
7153 REG_POINTER (reg) = 1;
7154 return gen_rtx_SUBREG (SImode, reg, 0);
7155 }
7156}
7157
7158/* Expand string move (memcpy) or store (memset) operation. Use i386 string
7159 operations when profitable. The code depends upon architecture, block size
7160 and alignment, but always has one of the following overall structures:
7161
7162 Aligned move sequence:
7163
7164 1) Prologue guard: Conditional that jumps ahead to the epilogue for small
7165 blocks that can be handled by the epilogue alone. This is faster
7166 but also needed for correctness, since the prologue assumes the block
7167 is larger than the desired alignment.
7168
7169 Optional dynamic check for size and libcall for large
7170 blocks is emitted here too, with -minline-stringops-dynamically.
7171
7172 2) Prologue: copy first few bytes in order to get destination
7173 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7174 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7175 copied. We emit either a jump tree on power of two sized
7176 blocks, or a byte loop.
7177
7178 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7179 with specified algorithm.
7180
7181 4) Epilogue: code copying tail of the block that is too small to be
7182 handled by main body (or up to size guarded by prologue guard).
7183
7184 Misaligned move sequence
7185
7186 1) misaligned move prologue/epilogue containing:
7187 a) Prologue handling small memory blocks and jumping to done_label
7188 (skipped if blocks are known to be large enough)
7189 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
7190 is needed, done by one possibly misaligned move
7191 (skipped if alignment is not needed)
7192 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7193
7194 2) Zero size guard dispatching to done_label, if needed
7195
7196 3) Dispatch to a library call, if needed
7197
7198 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7199 with the specified algorithm. */
7200bool
7201ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7202 rtx align_exp, rtx expected_align_exp,
7203 rtx expected_size_exp, rtx min_size_exp,
7204 rtx max_size_exp, rtx probable_max_size_exp,
7205 bool issetmem)
7206{
7207 rtx destreg;
7208 rtx srcreg = NULL;
7209 rtx_code_label *label = NULL;
7210 rtx tmp;
7211 rtx_code_label *jump_around_label = NULL;
7212 HOST_WIDE_INT align = 1;
7213 unsigned HOST_WIDE_INT count = 0;
7214 HOST_WIDE_INT expected_size = -1;
7215 int size_needed = 0, epilogue_size_needed;
7216 int desired_align = 0, align_bytes = 0;
7217 enum stringop_alg alg;
7218 rtx promoted_val = NULL;
7219 rtx vec_promoted_val = NULL;
7220 bool force_loopy_epilogue = false;
7221 int dynamic_check;
7222 bool need_zero_guard = false;
7223 bool noalign;
7224 machine_mode move_mode = VOIDmode;
7225 machine_mode wider_mode;
7226 int unroll_factor = 1;
7227 /* TODO: Once value ranges are available, fill in proper data. */
7228 unsigned HOST_WIDE_INT min_size = 0;
7229 unsigned HOST_WIDE_INT max_size = -1;
7230 unsigned HOST_WIDE_INT probable_max_size = -1;
7231 bool misaligned_prologue_used = false;
7232 bool have_as;
7233
7234 if (CONST_INT_P (align_exp))
7235 align = INTVAL (align_exp);
7236 /* i386 can do misaligned access at reasonably increased cost. */
7237 if (CONST_INT_P (expected_align_exp)
7238 && INTVAL (expected_align_exp) > align)
7239 align = INTVAL (expected_align_exp);
7240 /* ALIGN is the minimum of destination and source alignment, but we care here
7241 just about destination alignment. */
7242 else if (!issetmem
7243 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7244 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7245
7246 if (CONST_INT_P (count_exp))
7247 {
7248 min_size = max_size = probable_max_size = count = expected_size
7249 = INTVAL (count_exp);
7250 /* When COUNT is 0, there is nothing to do. */
7251 if (!count)
7252 return true;
7253 }
7254 else
7255 {
7256 if (min_size_exp)
7257 min_size = INTVAL (min_size_exp);
7258 if (max_size_exp)
7259 max_size = INTVAL (max_size_exp);
7260 if (probable_max_size_exp)
7261 probable_max_size = INTVAL (probable_max_size_exp);
7262 if (CONST_INT_P (expected_size_exp))
7263 expected_size = INTVAL (expected_size_exp);
7264 }
7265
7266 /* Make sure we don't need to care about overflow later on. */
7267 if (count > (HOST_WIDE_INT_1U << 30))
7268 return false;
7269
7270 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7271 if (!issetmem)
7272 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7273
7274 /* Step 0: Decide on preferred algorithm, desired alignment and
7275 size of chunks to be copied by main loop. */
7276 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7277 issetmem,
7278 issetmem && val_exp == const0_rtx, have_as,
7279 &dynamic_check, &noalign, false);
7280
7281 if (dump_file)
7282 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7283 stringop_alg_names[alg]);
7284
7285 if (alg == libcall)
7286 return false;
7287 gcc_assert (alg != no_stringop);
7288
7289 /* For now the vector version of memset is generated only for memory zeroing,
7290 as creating the promoted vector value is very cheap in this case. */
7291 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7292 alg = unrolled_loop;
7293
7294 if (!count)
7295 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7296 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7297 if (!issetmem)
7298 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7299
7300 unroll_factor = 1;
7301 move_mode = word_mode;
7302 switch (alg)
7303 {
7304 case libcall:
7305 case no_stringop:
7306 case last_alg:
7307 gcc_unreachable ();
7308 case loop_1_byte:
7309 need_zero_guard = true;
7310 move_mode = QImode;
7311 break;
7312 case loop:
7313 need_zero_guard = true;
7314 break;
7315 case unrolled_loop:
7316 need_zero_guard = true;
7317 unroll_factor = (TARGET_64BIT ? 4 : 2);
7318 break;
7319 case vector_loop:
7320 need_zero_guard = true;
7321 unroll_factor = 4;
7322 /* Find the widest supported mode. */
7323 move_mode = word_mode;
7324 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7325 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7326 move_mode = wider_mode;
7327
7328 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7329 move_mode = TImode;
7330
7331 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7332 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7333 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7334 {
7335 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7336 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7337 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7338 move_mode = word_mode;
7339 }
7340 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7341 break;
7342 case rep_prefix_8_byte:
7343 move_mode = DImode;
7344 break;
7345 case rep_prefix_4_byte:
7346 move_mode = SImode;
7347 break;
7348 case rep_prefix_1_byte:
7349 move_mode = QImode;
7350 break;
7351 }
7352 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7353 epilogue_size_needed = size_needed;
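 /* SIZE_NEEDED is the number of bytes consumed by one iteration of the
 main loop (or one chunk of the rep sequence); the epilogue must be able
 to handle any remainder smaller than that. */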
7354
7355 /* If we are going to emit any library calls conditionally, make sure any
7356 pending stack adjustments happen before the first conditional branch,
7357 otherwise they would be emitted only before the library call and would
7358 not happen on the other branches. */
7359 if (dynamic_check != -1)
7360 do_pending_stack_adjust ();
7361
7362 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7363 if (!TARGET_ALIGN_STRINGOPS || noalign)
7364 align = desired_align;
7365
7366 /* Step 1: Prologue guard. */
7367
7368 /* Alignment code needs count to be in register. */
7369 if (CONST_INT_P (count_exp) && desired_align > align)
7370 {
7371 if (INTVAL (count_exp) > desired_align
7372 && INTVAL (count_exp) > size_needed)
7373 {
7374 align_bytes
7375 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7376 if (align_bytes <= 0)
7377 align_bytes = 0;
7378 else
7379 align_bytes = desired_align - align_bytes;
7380 }
7381 if (align_bytes == 0)
7382 count_exp = force_reg (counter_mode (count_exp), count_exp);
7383 }
7384 gcc_assert (desired_align >= 1 && align >= 1);
7385
7386 /* Misaligned move sequences handle both prologue and epilogue at once.
7387 Default code generation results in smaller code for large alignments
7388 and also avoids redundant work when sizes are known precisely. */
7389 misaligned_prologue_used
7390 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7391 && MAX (desired_align, epilogue_size_needed) <= 32
7392 && desired_align <= epilogue_size_needed
7393 && ((desired_align > align && !align_bytes)
7394 || (!count && epilogue_size_needed > 1)));
7395
7396 /* Do the cheap promotion to allow better CSE across the
7397 main loop and epilogue (i.e. one load of the big constant in
7398 front of all the code).
7399 For now the misaligned move sequences do not have a fast path
7400 without broadcasting. */
7401 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7402 {
7403 if (alg == vector_loop)
7404 {
7405 gcc_assert (val_exp == const0_rtx);
7406 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7407 promoted_val = promote_duplicated_reg_to_size (val_exp,
7408 GET_MODE_SIZE (word_mode),
7409 desired_align, align);
7410 }
7411 else
7412 {
7413 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7414 desired_align, align);
7415 }
7416 }
7417 /* Misaligned move sequences handles both prologues and epilogues at once.
7418 Default code generation results in smaller code for large alignments and
7419 also avoids redundant job when sizes are known precisely. */
7420 if (misaligned_prologue_used)
7421 {
7422 /* Misaligned move prologue handled small blocks by itself. */
7423 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7424 (dst, src, &destreg, &srcreg,
7425 move_mode, promoted_val, vec_promoted_val,
7426 &count_exp,
7427 &jump_around_label,
7428 desired_align < align
7429 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7430 desired_align, align, &min_size, dynamic_check, issetmem);
7431 if (!issetmem)
7432 src = change_address (src, BLKmode, srcreg);
7433 dst = change_address (dst, BLKmode, destreg);
7434 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7435 epilogue_size_needed = 0;
7436 if (need_zero_guard
7437 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7438 {
7439 /* It is possible that we copied enough so the main loop will not
7440 execute. */
7441 gcc_assert (size_needed > 1);
7442 if (jump_around_label == NULL_RTX)
7443 jump_around_label = gen_label_rtx ();
7444 emit_cmp_and_jump_insns (count_exp,
7445 GEN_INT (size_needed),
7446 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7447 if (expected_size == -1
7448 || expected_size < (desired_align - align) / 2 + size_needed)
7449 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7450 else
7451 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7452 }
7453 }
7454 /* Ensure that alignment prologue won't copy past end of block. */
7455 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7456 {
7457 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7458 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
7459 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
7460 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7461
7462 /* To improve performance of small blocks, we jump around the VAL
7463 promoting code. This means that if the promoted VAL is not constant,
7464 we might not use it in the epilogue and have to use the byte
7465 loop variant. */
7466 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7467 force_loopy_epilogue = true;
7468 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7469 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7470 {
7471 /* If main algorithm works on QImode, no epilogue is needed.
7472 For small sizes just don't align anything. */
7473 if (size_needed == 1)
7474 desired_align = align;
7475 else
7476 goto epilogue;
7477 }
7478 else if (!count
7479 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7480 {
7481 label = gen_label_rtx ();
7482 emit_cmp_and_jump_insns (count_exp,
7483 GEN_INT (epilogue_size_needed),
7484 LTU, 0, counter_mode (count_exp), 1, label);
7485 if (expected_size == -1 || expected_size < epilogue_size_needed)
7486 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7487 else
7488 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7489 }
7490 }
7491
7492 /* Emit code to decide on runtime whether library call or inline should be
7493 used. */
7494 if (dynamic_check != -1)
7495 {
7496 if (!issetmem && CONST_INT_P (count_exp))
7497 {
7498 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7499 {
7500 emit_block_copy_via_libcall (dst, src, count_exp);
7501 count_exp = const0_rtx;
7502 goto epilogue;
7503 }
7504 }
7505 else
7506 {
7507 rtx_code_label *hot_label = gen_label_rtx ();
7508 if (jump_around_label == NULL_RTX)
7509 jump_around_label = gen_label_rtx ();
7510 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7511 LEU, 0, counter_mode (count_exp),
7512 1, hot_label);
7513 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7514 if (issetmem)
7515 set_storage_via_libcall (dst, count_exp, val_exp);
7516 else
7517 emit_block_copy_via_libcall (dst, src, count_exp);
7518 emit_jump (jump_around_label);
7519 emit_label (hot_label);
7520 }
7521 }
7522
7523 /* Step 2: Alignment prologue. */
7524 /* Do the expensive promotion once we branched off the small blocks. */
7525 if (issetmem && !promoted_val)
7526 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7527 desired_align, align);
7528
7529 if (desired_align > align && !misaligned_prologue_used)
7530 {
7531 if (align_bytes == 0)
7532 {
7533 /* Except for the first move in the prologue, we no longer know
7534 the constant offset in the aliasing info. It doesn't seem worth
7535 the pain to maintain it for the first move, so throw away
7536 the info early. */
7537 dst = change_address (dst, BLKmode, destreg);
7538 if (!issetmem)
7539 src = change_address (src, BLKmode, srcreg);
7540 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7541 promoted_val, vec_promoted_val,
7542 count_exp, align, desired_align,
7543 issetmem);
7544 /* At most desired_align - align bytes are copied. */
7545 if (min_size < (unsigned)(desired_align - align))
7546 min_size = 0;
7547 else
7548 min_size -= desired_align - align;
7549 }
7550 else
7551 {
7552 /* If we know how many bytes need to be stored before dst is
7553 sufficiently aligned, maintain aliasing info accurately. */
7554 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7555 srcreg,
7556 promoted_val,
7557 vec_promoted_val,
7558 desired_align,
7559 align_bytes,
7560 issetmem);
7561
7562 count_exp = plus_constant (counter_mode (count_exp),
7563 count_exp, -align_bytes);
7564 count -= align_bytes;
7565 min_size -= align_bytes;
7566 max_size -= align_bytes;
7567 }
7568 if (need_zero_guard
7569 && min_size < (unsigned HOST_WIDE_INT) size_needed
7570 && (count < (unsigned HOST_WIDE_INT) size_needed
7571 || (align_bytes == 0
7572 && count < ((unsigned HOST_WIDE_INT) size_needed
7573 + desired_align - align))))
7574 {
7575 /* It is possible that we copied enough so the main loop will not
7576 execute. */
7577 gcc_assert (size_needed > 1);
7578 if (label == NULL_RTX)
7579 label = gen_label_rtx ();
7580 emit_cmp_and_jump_insns (count_exp,
7581 GEN_INT (size_needed),
7582 LTU, 0, counter_mode (count_exp), 1, label);
7583 if (expected_size == -1
7584 || expected_size < (desired_align - align) / 2 + size_needed)
7585 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7586 else
7587 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7588 }
7589 }
7590 if (label && size_needed == 1)
7591 {
7592 emit_label (label);
7593 LABEL_NUSES (label) = 1;
7594 label = NULL;
7595 epilogue_size_needed = 1;
7596 if (issetmem)
7597 promoted_val = val_exp;
7598 }
7599 else if (label == NULL_RTX && !misaligned_prologue_used)
7600 epilogue_size_needed = size_needed;
7601
7602 /* Step 3: Main loop. */
7603
7604 switch (alg)
7605 {
7606 case libcall:
7607 case no_stringop:
7608 case last_alg:
7609 gcc_unreachable ();
7610 case loop_1_byte:
7611 case loop:
7612 case unrolled_loop:
76715c32 7613 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7614 count_exp, move_mode, unroll_factor,
7615 expected_size, issetmem);
7616 break;
7617 case vector_loop:
76715c32 7618 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7619 vec_promoted_val, count_exp, move_mode,
7620 unroll_factor, expected_size, issetmem);
7621 break;
7622 case rep_prefix_8_byte:
7623 case rep_prefix_4_byte:
7624 case rep_prefix_1_byte:
76715c32 7625 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7626 val_exp, count_exp, move_mode, issetmem);
7627 break;
7628 }
7629	  /* Properly adjust the offsets of the src and dest memory for aliasing.  */
7630 if (CONST_INT_P (count_exp))
7631 {
7632 if (!issetmem)
7633 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7634 (count / size_needed) * size_needed);
7635 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7636 (count / size_needed) * size_needed);
7637 }
7638 else
7639 {
7640 if (!issetmem)
7641 src = change_address (src, BLKmode, srcreg);
7642 dst = change_address (dst, BLKmode, destreg);
7643 }
7644
7645 /* Step 4: Epilogue to copy the remaining bytes. */
7646 epilogue:
7647 if (label)
7648 {
7649	      /* When the main loop is done, COUNT_EXP might hold the original count,
7650	 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
7651		 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
7652		 bytes.  Compensate if needed.  */
7653
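	      /* Worked example (illustrative): with SIZE_NEEDED == 16 and a
		 runtime count of 37, the main loop handles 32 bytes, so the
		 epilogue must copy 37 & 15 == 5 bytes; that remainder is what
		 the masking below leaves in COUNT_EXP.  */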
7654 if (size_needed < epilogue_size_needed)
7655 {
7656 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7657 GEN_INT (size_needed - 1), count_exp, 1,
7658 OPTAB_DIRECT);
7659 if (tmp != count_exp)
7660 emit_move_insn (count_exp, tmp);
7661 }
7662 emit_label (label);
7663 LABEL_NUSES (label) = 1;
7664 }
7665
7666 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7667 {
7668 if (force_loopy_epilogue)
7669 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7670 epilogue_size_needed);
7671 else
7672 {
7673 if (issetmem)
7674 expand_setmem_epilogue (dst, destreg, promoted_val,
7675 vec_promoted_val, count_exp,
7676 epilogue_size_needed);
7677 else
76715c32 7678 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7679 epilogue_size_needed);
7680 }
7681 }
7682 if (jump_around_label)
7683 emit_label (jump_around_label);
7684 return true;
7685}
7686
7687/* Expand cmpstrn or memcmp. */
7688
7689bool
7690ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
7691 rtx length, rtx align, bool is_cmpstrn)
7692{
7693 /* Expand strncmp and memcmp only with -minline-all-stringops since
7694 "repz cmpsb" can be much slower than strncmp and memcmp functions
7695 implemented with vector instructions, see
7696
7697 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7698 */
7699 if (!TARGET_INLINE_ALL_STRINGOPS)
7700 return false;
7701
7702 /* Can't use this if the user has appropriated ecx, esi or edi. */
7703 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
7704 return false;
7705
7706 if (is_cmpstrn)
7707 {
7708 /* For strncmp, length is the maximum length, which can be larger
7709 than actual string lengths. We can expand the cmpstrn pattern
7710 to "repz cmpsb" only if one of the strings is a constant so
7711 that expand_builtin_strncmp() can write the length argument to
7712 be the minimum of the const string length and the actual length
7713	 argument.  Otherwise, "repz cmpsb" may read past the terminating zero byte.  */
7714 tree t1 = MEM_EXPR (src1);
7715 tree t2 = MEM_EXPR (src2);
7716 if (!((t1 && TREE_CODE (t1) == MEM_REF
7717 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
7718 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
7719 == STRING_CST))
7720 || (t2 && TREE_CODE (t2) == MEM_REF
7721 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
7722 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
7723 == STRING_CST))))
7724 return false;
7725 }
7726
7727 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
7728 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
7729 if (addr1 != XEXP (src1, 0))
7730 src1 = replace_equiv_address_nv (src1, addr1);
7731 if (addr2 != XEXP (src2, 0))
7732 src2 = replace_equiv_address_nv (src2, addr2);
7733
7734	  /* NB: Make a copy of the data length so that the cmpstrnqi patterns
7735	     do not change the original data length.  */
7736 length = ix86_zero_extend_to_Pmode (length);
7737 rtx lengthreg = gen_reg_rtx (Pmode);
7738 emit_move_insn (lengthreg, length);
7739
7740 /* If we are testing strict equality, we can use known alignment to
7741 good advantage. This may be possible with combine, particularly
7742 once cc0 is dead. */
7743 if (CONST_INT_P (length))
7744 {
7745 if (length == const0_rtx)
7746 {
7747 emit_move_insn (result, const0_rtx);
7748 return true;
7749 }
7750 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
7751 src1, src2));
7752 }
7753 else
7754 {
7755 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
7756 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
7757 src1, src2));
7758 }
7759
7760 rtx out = gen_lowpart (QImode, result);
7761 emit_insn (gen_cmpintqi (out));
7762 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
7763
7764 return true;
7765}
7766
7767/* Expand the appropriate insns for doing strlen if not just doing
7768 repnz; scasb
7769
7770 out = result, initialized with the start address
7771 align_rtx = alignment of the address.
7772	   scratch = scratch register, initialized with the start address when
7773		not aligned, otherwise undefined
7774	
7775	   This is just the body.  It needs the initializations mentioned above and
7776	   some address computation at the end.  These things are done in i386.md.  */
7777
7778static void
7779ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7780{
7781 int align;
7782 rtx tmp;
7783 rtx_code_label *align_2_label = NULL;
7784 rtx_code_label *align_3_label = NULL;
7785 rtx_code_label *align_4_label = gen_label_rtx ();
7786 rtx_code_label *end_0_label = gen_label_rtx ();
7787 rtx mem;
7788 rtx tmpreg = gen_reg_rtx (SImode);
7789 rtx scratch = gen_reg_rtx (SImode);
7790 rtx cmp;
7791
7792 align = 0;
7793 if (CONST_INT_P (align_rtx))
7794 align = INTVAL (align_rtx);
7795
7796 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7797
7798 /* Is there a known alignment and is it less than 4? */
7799 if (align < 4)
7800 {
7801 rtx scratch1 = gen_reg_rtx (Pmode);
7802 emit_move_insn (scratch1, out);
7803 /* Is there a known alignment and is it not 2? */
7804 if (align != 2)
7805 {
7806 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7807 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7808
7809 /* Leave just the 3 lower bits. */
7810 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7811 NULL_RTX, 0, OPTAB_WIDEN);
7812
7813 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7814 Pmode, 1, align_4_label);
7815 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7816 Pmode, 1, align_2_label);
7817 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7818 Pmode, 1, align_3_label);
7819 }
7820 else
7821 {
7822	          /* Since the alignment is 2, we have to check 2 or 0 bytes;
7823		     check whether it is aligned to a 4-byte boundary.  */
7824
7825 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7826 NULL_RTX, 0, OPTAB_WIDEN);
7827
7828 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7829 Pmode, 1, align_4_label);
7830 }
7831
7832 mem = change_address (src, QImode, out);
7833
7834 /* Now compare the bytes. */
7835
7836	      /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7837 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7838 QImode, 1, end_0_label);
7839
7840 /* Increment the address. */
d9330fb5 7841 emit_insn (gen_add2_insn (out, const1_rtx));
7842
7843 /* Not needed with an alignment of 2 */
7844 if (align != 2)
7845 {
7846 emit_label (align_2_label);
7847
7848 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7849 end_0_label);
7850
d9330fb5 7851 emit_insn (gen_add2_insn (out, const1_rtx));
7852
7853 emit_label (align_3_label);
7854 }
7855
7856 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7857 end_0_label);
7858
d9330fb5 7859 emit_insn (gen_add2_insn (out, const1_rtx));
7860 }
7861
7862	  /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
7863	     align this loop; it only makes the program larger and does not help
7864	     to speed it up.  */
7865 emit_label (align_4_label);
7866
7867 mem = change_address (src, SImode, out);
7868 emit_move_insn (scratch, mem);
d9330fb5 7869 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7870
7871	  /* This formula yields a nonzero result iff one of the bytes is zero.
7872	     This saves three branches inside the loop and many cycles.  */
7873
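	  /* The insns below compute (x - 0x01010101) & ~x & 0x80808080, the
	     classic "word contains a zero byte" test.  Worked example
	     (illustrative): for x = 0x11002233, x - 0x01010101 = 0x0FFF2132,
	     ~x = 0xEEFFDDCC, and the final AND gives 0x00800000 != 0, flagging
	     the zero byte; for x = 0x11223344 the result is 0 and the loop
	     continues.  */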
7874 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7875 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7876 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7877 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7878 gen_int_mode (0x80808080, SImode)));
7879 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7880 align_4_label);
7881
7882 if (TARGET_CMOVE)
7883 {
7884 rtx reg = gen_reg_rtx (SImode);
7885 rtx reg2 = gen_reg_rtx (Pmode);
7886 emit_move_insn (reg, tmpreg);
7887 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7888
7889 /* If zero is not in the first two bytes, move two bytes forward. */
7890 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7891 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7892 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7893 emit_insn (gen_rtx_SET (tmpreg,
7894 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7895 reg,
7896 tmpreg)));
7897 /* Emit lea manually to avoid clobbering of flags. */
c3185b64 7898 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7899
7900 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7901 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7902 emit_insn (gen_rtx_SET (out,
7903 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7904 reg2,
7905 out)));
7906 }
7907 else
7908 {
7909 rtx_code_label *end_2_label = gen_label_rtx ();
7910 /* Is zero in the first two bytes? */
7911
7912 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7913 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7914 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7915 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7916 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7917 pc_rtx);
7918 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7919 JUMP_LABEL (tmp) = end_2_label;
7920
7921 /* Not in the first two. Move two bytes forward. */
7922 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
d9330fb5 7923 emit_insn (gen_add2_insn (out, const2_rtx));
7924
7925 emit_label (end_2_label);
7926
7927 }
7928
7929	  /* Avoid a branch when fixing up the byte position.  */
7930 tmpreg = gen_lowpart (QImode, tmpreg);
7931 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7932 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7933 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
d9330fb5 7934 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
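	  /* One reading of the branch-free fixup above (illustrative): OUT is
	     four bytes past the start of the byte pair being examined; the add
	     sets the carry flag exactly when the 0x80 marker is in the low byte
	     of TMPREG, i.e. when the first byte of the pair is the zero, and
	     the subtract-with-borrow then takes 4 rather than 3 off OUT, so OUT
	     ends up pointing at the terminating zero byte in either case.  */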
7935
7936 emit_label (end_0_label);
7937}
7938
7939/* Expand strlen. */
7940
7941bool
7942ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7943{
7944	  if (TARGET_UNROLL_STRLEN
7945 && TARGET_INLINE_ALL_STRINGOPS
7946 && eoschar == const0_rtx
7947 && optimize > 1)
7948 {
7949	      /* The generic case of the strlen expander is long.  Avoid expanding
7950		 it unless TARGET_INLINE_ALL_STRINGOPS.  */
7951 rtx addr = force_reg (Pmode, XEXP (src, 0));
7952	      /* Well, it seems that some optimizers do not combine a call like
7953		 foo(strlen(bar), strlen(bar));
7954		 when the move and the subtraction are done here.  They do calculate
7955		 the length just once when these instructions are done inside of
7956		 output_strlen_unroll().  But since &bar[strlen(bar)] is often used
7957		 and this way one fewer register is live for the lifetime of
7958		 output_strlen_unroll(), this is better.  */
7959
7960 emit_move_insn (out, addr);
7961
7962 ix86_expand_strlensi_unroll_1 (out, src, align);
7963
7964 /* strlensi_unroll_1 returns the address of the zero at the end of
7965 the string, like memchr(), so compute the length by subtracting
7966 the start address. */
d9330fb5 7967 emit_insn (gen_sub2_insn (out, addr));
7968 return true;
7969 }
7970 else
7971 return false;
7972}
7973
7974	/* For a given symbol (function), construct code to compute the address of
7975	   its PLT entry in the large x86-64 PIC model.  */
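/* Roughly (illustrative sketch; the exact register depends on the PIC
   register in use), the generated sequence is
	movabs $symbol@PLTOFF, %tmp
	add    %pic_reg, %tmp
   i.e. the 64-bit offset of the PLT entry from the GOT base plus the GOT
   base held in the PIC register.  */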
7976
7977static rtx
7978construct_plt_address (rtx symbol)
7979{
7980 rtx tmp, unspec;
7981
7982 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7983 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7984 gcc_assert (Pmode == DImode);
7985
7986 tmp = gen_reg_rtx (Pmode);
7987 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7988
7989 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
d9330fb5 7990 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7991 return tmp;
7992}
7993
7994	/* Additional registers that are clobbered by SYSV calls but are call-saved in the MS ABI.  */
7995
7996static int const x86_64_ms_sysv_extra_clobbered_registers
7997 [NUM_X86_64_MS_CLOBBERED_REGS] =
7998{
7999 SI_REG, DI_REG,
8000 XMM6_REG, XMM7_REG,
8001 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
8002 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
8003};
8004
8005rtx_insn *
8006ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
8007 rtx callarg2,
8008 rtx pop, bool sibcall)
8009{
8010 rtx vec[3];
8011 rtx use = NULL, call;
8012 unsigned int vec_len = 0;
8013 tree fndecl;
8014
8015 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8016 {
8017 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
8018 if (fndecl
8019 && (lookup_attribute ("interrupt",
8020 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
a9c697b8 8021 error ("interrupt service routine cannot be called directly");
8022 }
8023 else
8024 fndecl = NULL_TREE;
8025
8026 if (pop == const0_rtx)
8027 pop = NULL;
8028 gcc_assert (!TARGET_64BIT || !pop);
8029
8030 if (TARGET_MACHO && !TARGET_64BIT)
8031 {
8032#if TARGET_MACHO
8033 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8034 fnaddr = machopic_indirect_call_target (fnaddr);
8035#endif
8036 }
8037 else
8038 {
8039 /* Static functions and indirect calls don't need the pic register. Also,
8040 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8041 it an indirect call. */
8042 rtx addr = XEXP (fnaddr, 0);
8043 if (flag_pic
8044 && GET_CODE (addr) == SYMBOL_REF
8045 && !SYMBOL_REF_LOCAL_P (addr))
8046 {
8047 if (flag_plt
8048 && (SYMBOL_REF_DECL (addr) == NULL_TREE
8049 || !lookup_attribute ("noplt",
8050 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8051 {
8052 if (!TARGET_64BIT
8053 || (ix86_cmodel == CM_LARGE_PIC
8054 && DEFAULT_ABI != MS_ABI))
8055 {
8056 use_reg (&use, gen_rtx_REG (Pmode,
8057 REAL_PIC_OFFSET_TABLE_REGNUM));
8058 if (ix86_use_pseudo_pic_reg ())
8059 emit_move_insn (gen_rtx_REG (Pmode,
8060 REAL_PIC_OFFSET_TABLE_REGNUM),
8061 pic_offset_table_rtx);
8062 }
8063 }
8064 else if (!TARGET_PECOFF && !TARGET_MACHO)
8065 {
8066 if (TARGET_64BIT)
8067 {
8068 fnaddr = gen_rtx_UNSPEC (Pmode,
8069 gen_rtvec (1, addr),
8070 UNSPEC_GOTPCREL);
8071 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8072 }
8073 else
8074 {
8075 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8076 UNSPEC_GOT);
8077 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8078 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8079 fnaddr);
8080 }
8081 fnaddr = gen_const_mem (Pmode, fnaddr);
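	      /* (Illustrative) On 64-bit targets, for example, the resulting
		 call is an indirect call through the GOT slot, along the lines
		 of "call *foo@GOTPCREL(%rip)", which is how -fno-plt and
		 "noplt" calls avoid the PLT.  */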
8082 /* Pmode may not be the same as word_mode for x32, which
8083 doesn't support indirect branch via 32-bit memory slot.
8084 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8085 indirect branch via x32 GOT slot is OK. */
8086 if (GET_MODE (fnaddr) != word_mode)
8087 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8088 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8089 }
8090 }
8091 }
8092
8093 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8094 parameters passed in vector registers. */
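  /* In the x86-64 SysV ABI, %al must hold an upper bound on the number of
     vector registers actually used to pass arguments to a varargs function;
     CALLARG2 carries that value here.  Illustrative example: a call to
     printf with two double arguments would be preceded by "movl $2, %eax".  */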
8095 if (TARGET_64BIT
8096 && (INTVAL (callarg2) > 0
8097 || (INTVAL (callarg2) == 0
8098 && (TARGET_SSE || !flag_skip_rax_setup))))
8099 {
8100 rtx al = gen_rtx_REG (QImode, AX_REG);
8101 emit_move_insn (al, callarg2);
8102 use_reg (&use, al);
8103 }
8104
8105 if (ix86_cmodel == CM_LARGE_PIC
8106 && !TARGET_PECOFF
8107 && MEM_P (fnaddr)
8108 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8109 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8110 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8111 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8112 branch via x32 GOT slot is OK. */
8113 else if (!(TARGET_X32
8114 && MEM_P (fnaddr)
8115 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8116 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8117 && (sibcall
8118 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8119 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8120 {
8121 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8122 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8123 }
8124
8125 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8126
8127 if (retval)
8128 call = gen_rtx_SET (retval, call);
8129 vec[vec_len++] = call;
8130
8131 if (pop)
8132 {
8133 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8134 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8135 vec[vec_len++] = pop;
8136 }
8137
8138 if (cfun->machine->no_caller_saved_registers
8139 && (!fndecl
8140 || (!TREE_THIS_VOLATILE (fndecl)
8141 && !lookup_attribute ("no_caller_saved_registers",
8142 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8143 {
8144 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8145 bool is_64bit_ms_abi = (TARGET_64BIT
8146 && ix86_function_abi (fndecl) == MS_ABI);
8147 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8148
8149 /* If there are no caller-saved registers, add all registers
8150 that are clobbered by the call which returns. */
8151 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8152 if (!fixed_regs[i]
8153 && (ix86_call_used_regs[i] == 1
8154 || (ix86_call_used_regs[i] & c_mask))
8155 && !STACK_REGNO_P (i)
8156 && !MMX_REGNO_P (i))
8157 clobber_reg (&use,
8158 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8159 }
8160 else if (TARGET_64BIT_MS_ABI
8161 && (!callarg2 || INTVAL (callarg2) != -2))
8162 {
8163 unsigned i;
8164
8165 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8166 {
8167 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8168 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8169
8170 clobber_reg (&use, gen_rtx_REG (mode, regno));
8171 }
8172
8173 /* Set here, but it may get cleared later. */
8174 if (TARGET_CALL_MS2SYSV_XLOGUES)
8175 {
8176 if (!TARGET_SSE)
8177 ;
8178
8179 /* Don't break hot-patched functions. */
8180 else if (ix86_function_ms_hook_prologue (current_function_decl))
8181 ;
8182
8183 /* TODO: Cases not yet examined. */
8184 else if (flag_split_stack)
8185 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8186
8187 else
8188 {
8189 gcc_assert (!reload_completed);
8190 cfun->machine->call_ms2sysv = true;
8191 }
8192 }
8193 }
8194
8195 if (vec_len > 1)
8196 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8197 rtx_insn *call_insn = emit_call_insn (call);
8198 if (use)
8199 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8200
8201 return call_insn;
8202}
8203
8204	/* Split a simple return that pops POPC bytes from the stack into an
8205	   indirect branch with a stack adjustment.  */
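/* Illustratively, a 32-bit "ret $N" is split into a sequence along the
   lines of
	popl  %ecx		// return address -> %ecx
	addl  $N, %esp		// drop the N bytes of on-stack arguments
	jmp   *%ecx
   with the CFA notes added below keeping the unwind information
   consistent.  */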
8206
8207void
8208ix86_split_simple_return_pop_internal (rtx popc)
8209{
8210 struct machine_function *m = cfun->machine;
8211 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8212 rtx_insn *insn;
8213
8214 /* There is no "pascal" calling convention in any 64bit ABI. */
8215 gcc_assert (!TARGET_64BIT);
8216
8217 insn = emit_insn (gen_pop (ecx));
8218 m->fs.cfa_offset -= UNITS_PER_WORD;
8219 m->fs.sp_offset -= UNITS_PER_WORD;
8220
8221 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8222 x = gen_rtx_SET (stack_pointer_rtx, x);
8223 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8224 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8225 RTX_FRAME_RELATED_P (insn) = 1;
8226
8227 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8228 x = gen_rtx_SET (stack_pointer_rtx, x);
8229 insn = emit_insn (x);
8230 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8231 RTX_FRAME_RELATED_P (insn) = 1;
8232
8233 /* Now return address is in ECX. */
8234 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8235}
8236
8237/* Errors in the source file can cause expand_expr to return const0_rtx
8238 where we expect a vector. To avoid crashing, use one of the vector
8239 clear instructions. */
8240
8241static rtx
8242safe_vector_operand (rtx x, machine_mode mode)
8243{
8244 if (x == const0_rtx)
8245 x = CONST0_RTX (mode);
8246 return x;
8247}
8248
8249/* Subroutine of ix86_expand_builtin to take care of binop insns. */
8250
8251static rtx
8252ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8253{
8254 rtx pat;
8255 tree arg0 = CALL_EXPR_ARG (exp, 0);
8256 tree arg1 = CALL_EXPR_ARG (exp, 1);
8257 rtx op0 = expand_normal (arg0);
8258 rtx op1 = expand_normal (arg1);
8259 machine_mode tmode = insn_data[icode].operand[0].mode;
8260 machine_mode mode0 = insn_data[icode].operand[1].mode;
8261 machine_mode mode1 = insn_data[icode].operand[2].mode;
8262
8263 if (VECTOR_MODE_P (mode0))
8264 op0 = safe_vector_operand (op0, mode0);
8265 if (VECTOR_MODE_P (mode1))
8266 op1 = safe_vector_operand (op1, mode1);
8267
8268 if (optimize || !target
8269 || GET_MODE (target) != tmode
8270 || !insn_data[icode].operand[0].predicate (target, tmode))
8271 target = gen_reg_rtx (tmode);
8272
8273 if (GET_MODE (op1) == SImode && mode1 == TImode)
8274 {
8275 rtx x = gen_reg_rtx (V4SImode);
8276 emit_insn (gen_sse2_loadd (x, op1));
8277 op1 = gen_lowpart (TImode, x);
8278 }
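  /* (Illustrative) The block above handles builtins whose second argument is
     a 32-bit integer while the insn pattern expects a TImode operand: the
     value is loaded into the low element of an XMM register via sse2_loadd
     (a movd-style load that zeroes the upper elements) and the register is
     then viewed as TImode.  */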
8279
8280 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8281 op0 = copy_to_mode_reg (mode0, op0);
8282 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8283 op1 = copy_to_mode_reg (mode1, op1);
8284
8285 pat = GEN_FCN (icode) (target, op0, op1);
8286 if (! pat)
8287 return 0;
8288
8289 emit_insn (pat);
8290
8291 return target;
8292}
8293
8294/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8295
8296static rtx
8297ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8298 enum ix86_builtin_func_type m_type,
8299 enum rtx_code sub_code)
8300{
8301 rtx pat;
8302 int i;
8303 int nargs;
8304 bool comparison_p = false;
8305 bool tf_p = false;
8306 bool last_arg_constant = false;
8307 int num_memory = 0;
8308 struct {
8309 rtx op;
8310 machine_mode mode;
8311 } args[4];
8312
8313 machine_mode tmode = insn_data[icode].operand[0].mode;
8314
8315 switch (m_type)
8316 {
8317 case MULTI_ARG_4_DF2_DI_I:
8318 case MULTI_ARG_4_DF2_DI_I1:
8319 case MULTI_ARG_4_SF2_SI_I:
8320 case MULTI_ARG_4_SF2_SI_I1:
8321 nargs = 4;
8322 last_arg_constant = true;
8323 break;
8324
8325 case MULTI_ARG_3_SF:
8326 case MULTI_ARG_3_DF:
8327 case MULTI_ARG_3_SF2:
8328 case MULTI_ARG_3_DF2:
8329 case MULTI_ARG_3_DI:
8330 case MULTI_ARG_3_SI:
8331 case MULTI_ARG_3_SI_DI:
8332 case MULTI_ARG_3_HI:
8333 case MULTI_ARG_3_HI_SI:
8334 case MULTI_ARG_3_QI:
8335 case MULTI_ARG_3_DI2:
8336 case MULTI_ARG_3_SI2:
8337 case MULTI_ARG_3_HI2:
8338 case MULTI_ARG_3_QI2:
8339 nargs = 3;
8340 break;
8341
8342 case MULTI_ARG_2_SF:
8343 case MULTI_ARG_2_DF:
8344 case MULTI_ARG_2_DI:
8345 case MULTI_ARG_2_SI:
8346 case MULTI_ARG_2_HI:
8347 case MULTI_ARG_2_QI:
8348 nargs = 2;
8349 break;
8350
8351 case MULTI_ARG_2_DI_IMM:
8352 case MULTI_ARG_2_SI_IMM:
8353 case MULTI_ARG_2_HI_IMM:
8354 case MULTI_ARG_2_QI_IMM:
8355 nargs = 2;
8356 last_arg_constant = true;
8357 break;
8358
8359 case MULTI_ARG_1_SF:
8360 case MULTI_ARG_1_DF:
8361 case MULTI_ARG_1_SF2:
8362 case MULTI_ARG_1_DF2:
8363 case MULTI_ARG_1_DI:
8364 case MULTI_ARG_1_SI:
8365 case MULTI_ARG_1_HI:
8366 case MULTI_ARG_1_QI:
8367 case MULTI_ARG_1_SI_DI:
8368 case MULTI_ARG_1_HI_DI:
8369 case MULTI_ARG_1_HI_SI:
8370 case MULTI_ARG_1_QI_DI:
8371 case MULTI_ARG_1_QI_SI:
8372 case MULTI_ARG_1_QI_HI:
8373 nargs = 1;
8374 break;
8375
8376 case MULTI_ARG_2_DI_CMP:
8377 case MULTI_ARG_2_SI_CMP:
8378 case MULTI_ARG_2_HI_CMP:
8379 case MULTI_ARG_2_QI_CMP:
8380 nargs = 2;
8381 comparison_p = true;
8382 break;
8383
8384 case MULTI_ARG_2_SF_TF:
8385 case MULTI_ARG_2_DF_TF:
8386 case MULTI_ARG_2_DI_TF:
8387 case MULTI_ARG_2_SI_TF:
8388 case MULTI_ARG_2_HI_TF:
8389 case MULTI_ARG_2_QI_TF:
8390 nargs = 2;
8391 tf_p = true;
8392 break;
8393
8394 default:
8395 gcc_unreachable ();
8396 }
8397
8398 if (optimize || !target
8399 || GET_MODE (target) != tmode
8400 || !insn_data[icode].operand[0].predicate (target, tmode))
8401 target = gen_reg_rtx (tmode);
8402 else if (memory_operand (target, tmode))
8403 num_memory++;
8404
8405 gcc_assert (nargs <= 4);
8406
8407 for (i = 0; i < nargs; i++)
8408 {
8409 tree arg = CALL_EXPR_ARG (exp, i);
8410 rtx op = expand_normal (arg);
8411 int adjust = (comparison_p) ? 1 : 0;
8412 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8413
8414 if (last_arg_constant && i == nargs - 1)
8415 {
8416 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8417 {
8418 enum insn_code new_icode = icode;
8419 switch (icode)
8420 {
8421 case CODE_FOR_xop_vpermil2v2df3:
8422 case CODE_FOR_xop_vpermil2v4sf3:
8423 case CODE_FOR_xop_vpermil2v4df3:
8424 case CODE_FOR_xop_vpermil2v8sf3:
8425 error ("the last argument must be a 2-bit immediate");
8426 return gen_reg_rtx (tmode);
8427 case CODE_FOR_xop_rotlv2di3:
8428 new_icode = CODE_FOR_rotlv2di3;
8429 goto xop_rotl;
8430 case CODE_FOR_xop_rotlv4si3:
8431 new_icode = CODE_FOR_rotlv4si3;
8432 goto xop_rotl;
8433 case CODE_FOR_xop_rotlv8hi3:
8434 new_icode = CODE_FOR_rotlv8hi3;
8435 goto xop_rotl;
8436 case CODE_FOR_xop_rotlv16qi3:
8437 new_icode = CODE_FOR_rotlv16qi3;
8438 xop_rotl:
8439 if (CONST_INT_P (op))
8440 {
8441 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8442 op = GEN_INT (INTVAL (op) & mask);
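		      /* For example (illustrative), for V4SImode lanes a
			 rotate count of 35 is reduced to 35 & 31 == 3, which
			 is the equivalent in-range rotate.  */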
8443 gcc_checking_assert
8444 (insn_data[icode].operand[i + 1].predicate (op, mode));
8445 }
8446 else
8447 {
8448 gcc_checking_assert
8449 (nargs == 2
8450 && insn_data[new_icode].operand[0].mode == tmode
8451 && insn_data[new_icode].operand[1].mode == tmode
8452 && insn_data[new_icode].operand[2].mode == mode
8453 && insn_data[new_icode].operand[0].predicate
8454 == insn_data[icode].operand[0].predicate
8455 && insn_data[new_icode].operand[1].predicate
8456 == insn_data[icode].operand[1].predicate);
8457 icode = new_icode;
8458 goto non_constant;
8459 }
8460 break;
8461 default:
8462 gcc_unreachable ();
8463 }
8464 }
8465 }
8466 else
8467 {
8468 non_constant:
8469 if (VECTOR_MODE_P (mode))
8470 op = safe_vector_operand (op, mode);
8471
8472 /* If we aren't optimizing, only allow one memory operand to be
8473 generated. */
8474 if (memory_operand (op, mode))
8475 num_memory++;
8476
8477 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8478
8479 if (optimize
8480 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8481 || num_memory > 1)
8482 op = force_reg (mode, op);
8483 }
8484
8485 args[i].op = op;
8486 args[i].mode = mode;
8487 }
8488
8489 switch (nargs)
8490 {
8491 case 1:
8492 pat = GEN_FCN (icode) (target, args[0].op);
8493 break;
8494
8495 case 2:
8496 if (tf_p)
8497 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8498 GEN_INT ((int)sub_code));
8499 else if (! comparison_p)
8500 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8501 else
8502 {
8503 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8504 args[0].op,
8505 args[1].op);
8506
8507 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8508 }
8509 break;
8510
8511 case 3:
8512 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8513 break;
8514
8515 case 4:
8516 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8517 break;
8518
8519 default:
8520 gcc_unreachable ();
8521 }
8522
8523 if (! pat)
8524 return 0;
8525
8526 emit_insn (pat);
8527 return target;
8528}
8529
8530/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8531 insns with vec_merge. */
8532
8533static rtx
8534ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8535 rtx target)
8536{
8537 rtx pat;
8538 tree arg0 = CALL_EXPR_ARG (exp, 0);
8539 rtx op1, op0 = expand_normal (arg0);
8540 machine_mode tmode = insn_data[icode].operand[0].mode;
8541 machine_mode mode0 = insn_data[icode].operand[1].mode;
8542
8543 if (optimize || !target
8544 || GET_MODE (target) != tmode
8545 || !insn_data[icode].operand[0].predicate (target, tmode))
8546 target = gen_reg_rtx (tmode);
8547
8548 if (VECTOR_MODE_P (mode0))
8549 op0 = safe_vector_operand (op0, mode0);
8550
8551 if ((optimize && !register_operand (op0, mode0))
8552 || !insn_data[icode].operand[1].predicate (op0, mode0))
8553 op0 = copy_to_mode_reg (mode0, op0);
8554
8555 op1 = op0;
8556 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8557 op1 = copy_to_mode_reg (mode0, op1);
8558
8559 pat = GEN_FCN (icode) (target, op0, op1);
8560 if (! pat)
8561 return 0;
8562 emit_insn (pat);
8563 return target;
8564}
8565
8566/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8567
8568static rtx
8569ix86_expand_sse_compare (const struct builtin_description *d,
8570 tree exp, rtx target, bool swap)
8571{
8572 rtx pat;
8573 tree arg0 = CALL_EXPR_ARG (exp, 0);
8574 tree arg1 = CALL_EXPR_ARG (exp, 1);
8575 rtx op0 = expand_normal (arg0);
8576 rtx op1 = expand_normal (arg1);
8577 rtx op2;
8578 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8579 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8580 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8581 enum rtx_code comparison = d->comparison;
8582
8583 if (VECTOR_MODE_P (mode0))
8584 op0 = safe_vector_operand (op0, mode0);
8585 if (VECTOR_MODE_P (mode1))
8586 op1 = safe_vector_operand (op1, mode1);
8587
8588 /* Swap operands if we have a comparison that isn't available in
8589 hardware. */
8590 if (swap)
8591 std::swap (op0, op1);
8592
8593 if (optimize || !target
8594 || GET_MODE (target) != tmode
8595 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8596 target = gen_reg_rtx (tmode);
8597
8598 if ((optimize && !register_operand (op0, mode0))
8599 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8600 op0 = copy_to_mode_reg (mode0, op0);
8601 if ((optimize && !register_operand (op1, mode1))
8602 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8603 op1 = copy_to_mode_reg (mode1, op1);
8604
8605 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8606 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8607 if (! pat)
8608 return 0;
8609 emit_insn (pat);
8610 return target;
8611}
8612
8613/* Subroutine of ix86_expand_builtin to take care of comi insns. */
8614
8615static rtx
8616ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8617 rtx target)
8618{
8619 rtx pat;
8620 tree arg0 = CALL_EXPR_ARG (exp, 0);
8621 tree arg1 = CALL_EXPR_ARG (exp, 1);
8622 rtx op0 = expand_normal (arg0);
8623 rtx op1 = expand_normal (arg1);
8624 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8625 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8626 enum rtx_code comparison = d->comparison;
8627
8628 if (VECTOR_MODE_P (mode0))
8629 op0 = safe_vector_operand (op0, mode0);
8630 if (VECTOR_MODE_P (mode1))
8631 op1 = safe_vector_operand (op1, mode1);
8632
8633 /* Swap operands if we have a comparison that isn't available in
8634 hardware. */
8635 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8636 std::swap (op0, op1);
8637
8638 target = gen_reg_rtx (SImode);
8639 emit_move_insn (target, const0_rtx);
8640 target = gen_rtx_SUBREG (QImode, target, 0);
8641
8642 if ((optimize && !register_operand (op0, mode0))
8643 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8644 op0 = copy_to_mode_reg (mode0, op0);
8645 if ((optimize && !register_operand (op1, mode1))
8646 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8647 op1 = copy_to_mode_reg (mode1, op1);
8648
8649 pat = GEN_FCN (d->icode) (op0, op1);
8650 if (! pat)
8651 return 0;
8652 emit_insn (pat);
8653 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8654 gen_rtx_fmt_ee (comparison, QImode,
8655 SET_DEST (pat),
8656 const0_rtx)));
8657
8658 return SUBREG_REG (target);
8659}
8660
8661/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8662
8663static rtx
8664ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8665 rtx target)
8666{
8667 rtx pat;
8668 tree arg0 = CALL_EXPR_ARG (exp, 0);
8669 rtx op1, op0 = expand_normal (arg0);
8670 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8671 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8672
8673 if (optimize || target == 0
8674 || GET_MODE (target) != tmode
8675 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8676 target = gen_reg_rtx (tmode);
8677
8678 if (VECTOR_MODE_P (mode0))
8679 op0 = safe_vector_operand (op0, mode0);
8680
8681 if ((optimize && !register_operand (op0, mode0))
8682 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8683 op0 = copy_to_mode_reg (mode0, op0);
8684
8685 op1 = GEN_INT (d->comparison);
8686
8687 pat = GEN_FCN (d->icode) (target, op0, op1);
8688 if (! pat)
8689 return 0;
8690 emit_insn (pat);
8691 return target;
8692}
8693
8694static rtx
8695ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8696 tree exp, rtx target)
8697{
8698 rtx pat;
8699 tree arg0 = CALL_EXPR_ARG (exp, 0);
8700 tree arg1 = CALL_EXPR_ARG (exp, 1);
8701 rtx op0 = expand_normal (arg0);
8702 rtx op1 = expand_normal (arg1);
8703 rtx op2;
8704 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8705 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8706 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8707
8708 if (optimize || target == 0
8709 || GET_MODE (target) != tmode
8710 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8711 target = gen_reg_rtx (tmode);
8712
8713 op0 = safe_vector_operand (op0, mode0);
8714 op1 = safe_vector_operand (op1, mode1);
8715
8716 if ((optimize && !register_operand (op0, mode0))
8717 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8718 op0 = copy_to_mode_reg (mode0, op0);
8719 if ((optimize && !register_operand (op1, mode1))
8720 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8721 op1 = copy_to_mode_reg (mode1, op1);
8722
8723 op2 = GEN_INT (d->comparison);
8724
8725 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8726 if (! pat)
8727 return 0;
8728 emit_insn (pat);
8729 return target;
8730}
8731
8732/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8733
8734static rtx
8735ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8736 rtx target)
8737{
8738 rtx pat;
8739 tree arg0 = CALL_EXPR_ARG (exp, 0);
8740 tree arg1 = CALL_EXPR_ARG (exp, 1);
8741 rtx op0 = expand_normal (arg0);
8742 rtx op1 = expand_normal (arg1);
8743 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8744 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8745 enum rtx_code comparison = d->comparison;
8746
8747 if (VECTOR_MODE_P (mode0))
8748 op0 = safe_vector_operand (op0, mode0);
8749 if (VECTOR_MODE_P (mode1))
8750 op1 = safe_vector_operand (op1, mode1);
8751
8752 target = gen_reg_rtx (SImode);
8753 emit_move_insn (target, const0_rtx);
8754 target = gen_rtx_SUBREG (QImode, target, 0);
8755
8756 if ((optimize && !register_operand (op0, mode0))
8757 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8758 op0 = copy_to_mode_reg (mode0, op0);
8759 if ((optimize && !register_operand (op1, mode1))
8760 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8761 op1 = copy_to_mode_reg (mode1, op1);
8762
8763 pat = GEN_FCN (d->icode) (op0, op1);
8764 if (! pat)
8765 return 0;
8766 emit_insn (pat);
8767 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8768 gen_rtx_fmt_ee (comparison, QImode,
8769 SET_DEST (pat),
8770 const0_rtx)));
8771
8772 return SUBREG_REG (target);
8773}
8774
8775/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8776
8777static rtx
8778ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8779 tree exp, rtx target)
8780{
8781 rtx pat;
8782 tree arg0 = CALL_EXPR_ARG (exp, 0);
8783 tree arg1 = CALL_EXPR_ARG (exp, 1);
8784 tree arg2 = CALL_EXPR_ARG (exp, 2);
8785 tree arg3 = CALL_EXPR_ARG (exp, 3);
8786 tree arg4 = CALL_EXPR_ARG (exp, 4);
8787 rtx scratch0, scratch1;
8788 rtx op0 = expand_normal (arg0);
8789 rtx op1 = expand_normal (arg1);
8790 rtx op2 = expand_normal (arg2);
8791 rtx op3 = expand_normal (arg3);
8792 rtx op4 = expand_normal (arg4);
8793 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8794
8795 tmode0 = insn_data[d->icode].operand[0].mode;
8796 tmode1 = insn_data[d->icode].operand[1].mode;
8797 modev2 = insn_data[d->icode].operand[2].mode;
8798 modei3 = insn_data[d->icode].operand[3].mode;
8799 modev4 = insn_data[d->icode].operand[4].mode;
8800 modei5 = insn_data[d->icode].operand[5].mode;
8801 modeimm = insn_data[d->icode].operand[6].mode;
8802
8803 if (VECTOR_MODE_P (modev2))
8804 op0 = safe_vector_operand (op0, modev2);
8805 if (VECTOR_MODE_P (modev4))
8806 op2 = safe_vector_operand (op2, modev4);
8807
8808 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8809 op0 = copy_to_mode_reg (modev2, op0);
8810 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8811 op1 = copy_to_mode_reg (modei3, op1);
8812 if ((optimize && !register_operand (op2, modev4))
8813 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8814 op2 = copy_to_mode_reg (modev4, op2);
8815 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8816 op3 = copy_to_mode_reg (modei5, op3);
8817
8818 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8819 {
8820 error ("the fifth argument must be an 8-bit immediate");
8821 return const0_rtx;
8822 }
8823
8824 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8825 {
8826 if (optimize || !target
8827 || GET_MODE (target) != tmode0
8828 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8829 target = gen_reg_rtx (tmode0);
8830
8831 scratch1 = gen_reg_rtx (tmode1);
8832
8833 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8834 }
8835 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8836 {
8837 if (optimize || !target
8838 || GET_MODE (target) != tmode1
8839 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8840 target = gen_reg_rtx (tmode1);
8841
8842 scratch0 = gen_reg_rtx (tmode0);
8843
8844 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8845 }
8846 else
8847 {
8848 gcc_assert (d->flag);
8849
8850 scratch0 = gen_reg_rtx (tmode0);
8851 scratch1 = gen_reg_rtx (tmode1);
8852
8853 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8854 }
8855
8856 if (! pat)
8857 return 0;
8858
8859 emit_insn (pat);
8860
8861 if (d->flag)
8862 {
8863 target = gen_reg_rtx (SImode);
8864 emit_move_insn (target, const0_rtx);
8865 target = gen_rtx_SUBREG (QImode, target, 0);
8866
8867 emit_insn
8868 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8869 gen_rtx_fmt_ee (EQ, QImode,
8870 gen_rtx_REG ((machine_mode) d->flag,
8871 FLAGS_REG),
8872 const0_rtx)));
8873 return SUBREG_REG (target);
8874 }
8875 else
8876 return target;
8877}
8878
8879
8880/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8881
8882static rtx
8883ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8884 tree exp, rtx target)
8885{
8886 rtx pat;
8887 tree arg0 = CALL_EXPR_ARG (exp, 0);
8888 tree arg1 = CALL_EXPR_ARG (exp, 1);
8889 tree arg2 = CALL_EXPR_ARG (exp, 2);
8890 rtx scratch0, scratch1;
8891 rtx op0 = expand_normal (arg0);
8892 rtx op1 = expand_normal (arg1);
8893 rtx op2 = expand_normal (arg2);
8894 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8895
8896 tmode0 = insn_data[d->icode].operand[0].mode;
8897 tmode1 = insn_data[d->icode].operand[1].mode;
8898 modev2 = insn_data[d->icode].operand[2].mode;
8899 modev3 = insn_data[d->icode].operand[3].mode;
8900 modeimm = insn_data[d->icode].operand[4].mode;
8901
8902 if (VECTOR_MODE_P (modev2))
8903 op0 = safe_vector_operand (op0, modev2);
8904 if (VECTOR_MODE_P (modev3))
8905 op1 = safe_vector_operand (op1, modev3);
8906
8907 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8908 op0 = copy_to_mode_reg (modev2, op0);
8909 if ((optimize && !register_operand (op1, modev3))
8910 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8911 op1 = copy_to_mode_reg (modev3, op1);
8912
8913 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8914 {
8915 error ("the third argument must be an 8-bit immediate");
8916 return const0_rtx;
8917 }
8918
8919 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8920 {
8921 if (optimize || !target
8922 || GET_MODE (target) != tmode0
8923 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8924 target = gen_reg_rtx (tmode0);
8925
8926 scratch1 = gen_reg_rtx (tmode1);
8927
8928 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8929 }
8930 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8931 {
8932 if (optimize || !target
8933 || GET_MODE (target) != tmode1
8934 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8935 target = gen_reg_rtx (tmode1);
8936
8937 scratch0 = gen_reg_rtx (tmode0);
8938
8939 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8940 }
8941 else
8942 {
8943 gcc_assert (d->flag);
8944
8945 scratch0 = gen_reg_rtx (tmode0);
8946 scratch1 = gen_reg_rtx (tmode1);
8947
8948 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8949 }
8950
8951 if (! pat)
8952 return 0;
8953
8954 emit_insn (pat);
8955
8956 if (d->flag)
8957 {
8958 target = gen_reg_rtx (SImode);
8959 emit_move_insn (target, const0_rtx);
8960 target = gen_rtx_SUBREG (QImode, target, 0);
8961
8962 emit_insn
8963 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8964 gen_rtx_fmt_ee (EQ, QImode,
8965 gen_rtx_REG ((machine_mode) d->flag,
8966 FLAGS_REG),
8967 const0_rtx)));
8968 return SUBREG_REG (target);
8969 }
8970 else
8971 return target;
8972}
8973
8974	/* Fix up modeless constants to fit the required mode.  */
8975
8976static rtx
8977fixup_modeless_constant (rtx x, machine_mode mode)
8978{
8979 if (GET_MODE (x) == VOIDmode)
8980 x = convert_to_mode (mode, x, 1);
8981 return x;
8982}
8983
8984/* Subroutine of ix86_expand_builtin to take care of insns with
8985 variable number of operands. */
8986
8987static rtx
8988ix86_expand_args_builtin (const struct builtin_description *d,
8989 tree exp, rtx target)
8990{
8991 rtx pat, real_target;
8992 unsigned int i, nargs;
8993 unsigned int nargs_constant = 0;
8994 unsigned int mask_pos = 0;
8995 int num_memory = 0;
8996 struct
8997 {
8998 rtx op;
8999 machine_mode mode;
9000 } args[6];
9001 bool second_arg_count = false;
9002 enum insn_code icode = d->icode;
9003 const struct insn_data_d *insn_p = &insn_data[icode];
9004 machine_mode tmode = insn_p->operand[0].mode;
9005 machine_mode rmode = VOIDmode;
9006 bool swap = false;
9007 enum rtx_code comparison = d->comparison;
9008
9009 switch ((enum ix86_builtin_func_type) d->flag)
9010 {
9011 case V2DF_FTYPE_V2DF_ROUND:
9012 case V4DF_FTYPE_V4DF_ROUND:
9013 case V8DF_FTYPE_V8DF_ROUND:
9014 case V4SF_FTYPE_V4SF_ROUND:
9015 case V8SF_FTYPE_V8SF_ROUND:
9016 case V16SF_FTYPE_V16SF_ROUND:
9017 case V4SI_FTYPE_V4SF_ROUND:
9018 case V8SI_FTYPE_V8SF_ROUND:
9019 case V16SI_FTYPE_V16SF_ROUND:
9020 return ix86_expand_sse_round (d, exp, target);
9021 case V4SI_FTYPE_V2DF_V2DF_ROUND:
9022 case V8SI_FTYPE_V4DF_V4DF_ROUND:
9023 case V16SI_FTYPE_V8DF_V8DF_ROUND:
9024 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
9025 case INT_FTYPE_V8SF_V8SF_PTEST:
9026 case INT_FTYPE_V4DI_V4DI_PTEST:
9027 case INT_FTYPE_V4DF_V4DF_PTEST:
9028 case INT_FTYPE_V4SF_V4SF_PTEST:
9029 case INT_FTYPE_V2DI_V2DI_PTEST:
9030 case INT_FTYPE_V2DF_V2DF_PTEST:
9031 return ix86_expand_sse_ptest (d, exp, target);
9032 case FLOAT128_FTYPE_FLOAT128:
9033 case FLOAT_FTYPE_FLOAT:
9034 case INT_FTYPE_INT:
9035 case UINT_FTYPE_UINT:
9036 case UINT16_FTYPE_UINT16:
9037 case UINT64_FTYPE_INT:
9038 case UINT64_FTYPE_UINT64:
9039 case INT64_FTYPE_INT64:
9040 case INT64_FTYPE_V4SF:
9041 case INT64_FTYPE_V2DF:
9042 case INT_FTYPE_V16QI:
9043 case INT_FTYPE_V8QI:
9044 case INT_FTYPE_V8SF:
9045 case INT_FTYPE_V4DF:
9046 case INT_FTYPE_V4SF:
9047 case INT_FTYPE_V2DF:
9048 case INT_FTYPE_V32QI:
9049 case V16QI_FTYPE_V16QI:
9050 case V8SI_FTYPE_V8SF:
9051 case V8SI_FTYPE_V4SI:
9052 case V8HI_FTYPE_V8HI:
9053 case V8HI_FTYPE_V16QI:
9054 case V8QI_FTYPE_V8QI:
9055 case V8SF_FTYPE_V8SF:
9056 case V8SF_FTYPE_V8SI:
9057 case V8SF_FTYPE_V4SF:
9058 case V8SF_FTYPE_V8HI:
9059 case V4SI_FTYPE_V4SI:
9060 case V4SI_FTYPE_V16QI:
9061 case V4SI_FTYPE_V4SF:
9062 case V4SI_FTYPE_V8SI:
9063 case V4SI_FTYPE_V8HI:
9064 case V4SI_FTYPE_V4DF:
9065 case V4SI_FTYPE_V2DF:
9066 case V4HI_FTYPE_V4HI:
9067 case V4DF_FTYPE_V4DF:
9068 case V4DF_FTYPE_V4SI:
9069 case V4DF_FTYPE_V4SF:
9070 case V4DF_FTYPE_V2DF:
9071 case V4SF_FTYPE_V4SF:
9072 case V4SF_FTYPE_V4SI:
9073 case V4SF_FTYPE_V8SF:
9074 case V4SF_FTYPE_V4DF:
9075 case V4SF_FTYPE_V8HI:
9076 case V4SF_FTYPE_V2DF:
9077 case V2DI_FTYPE_V2DI:
9078 case V2DI_FTYPE_V16QI:
9079 case V2DI_FTYPE_V8HI:
9080 case V2DI_FTYPE_V4SI:
9081 case V2DF_FTYPE_V2DF:
9082 case V2DF_FTYPE_V4SI:
9083 case V2DF_FTYPE_V4DF:
9084 case V2DF_FTYPE_V4SF:
9085 case V2DF_FTYPE_V2SI:
9086 case V2SI_FTYPE_V2SI:
9087 case V2SI_FTYPE_V4SF:
9088 case V2SI_FTYPE_V2SF:
9089 case V2SI_FTYPE_V2DF:
9090 case V2SF_FTYPE_V2SF:
9091 case V2SF_FTYPE_V2SI:
9092 case V32QI_FTYPE_V32QI:
9093 case V32QI_FTYPE_V16QI:
9094 case V16HI_FTYPE_V16HI:
9095 case V16HI_FTYPE_V8HI:
9096 case V8SI_FTYPE_V8SI:
9097 case V16HI_FTYPE_V16QI:
9098 case V8SI_FTYPE_V16QI:
9099 case V4DI_FTYPE_V16QI:
9100 case V8SI_FTYPE_V8HI:
9101 case V4DI_FTYPE_V8HI:
9102 case V4DI_FTYPE_V4SI:
9103 case V4DI_FTYPE_V2DI:
9104 case UQI_FTYPE_UQI:
9105 case UHI_FTYPE_UHI:
9106 case USI_FTYPE_USI:
9107 case USI_FTYPE_UQI:
9108 case USI_FTYPE_UHI:
9109 case UDI_FTYPE_UDI:
9110 case UHI_FTYPE_V16QI:
9111 case USI_FTYPE_V32QI:
9112 case UDI_FTYPE_V64QI:
9113 case V16QI_FTYPE_UHI:
9114 case V32QI_FTYPE_USI:
9115 case V64QI_FTYPE_UDI:
9116 case V8HI_FTYPE_UQI:
9117 case V16HI_FTYPE_UHI:
9118 case V32HI_FTYPE_USI:
9119 case V4SI_FTYPE_UQI:
9120 case V8SI_FTYPE_UQI:
9121 case V4SI_FTYPE_UHI:
9122 case V8SI_FTYPE_UHI:
9123 case UQI_FTYPE_V8HI:
9124 case UHI_FTYPE_V16HI:
9125 case USI_FTYPE_V32HI:
9126 case UQI_FTYPE_V4SI:
9127 case UQI_FTYPE_V8SI:
9128 case UHI_FTYPE_V16SI:
9129 case UQI_FTYPE_V2DI:
9130 case UQI_FTYPE_V4DI:
9131 case UQI_FTYPE_V8DI:
9132 case V16SI_FTYPE_UHI:
9133 case V2DI_FTYPE_UQI:
9134 case V4DI_FTYPE_UQI:
9135 case V16SI_FTYPE_INT:
9136 case V16SF_FTYPE_V8SF:
9137 case V16SI_FTYPE_V8SI:
9138 case V16SF_FTYPE_V4SF:
9139 case V16SI_FTYPE_V4SI:
9140 case V16SI_FTYPE_V16SF:
9141 case V16SI_FTYPE_V16SI:
9142 case V64QI_FTYPE_V64QI:
9143 case V32HI_FTYPE_V32HI:
9144 case V16SF_FTYPE_V16SF:
9145 case V8DI_FTYPE_UQI:
9146 case V8DI_FTYPE_V8DI:
9147 case V8DF_FTYPE_V4DF:
9148 case V8DF_FTYPE_V2DF:
9149 case V8DF_FTYPE_V8DF:
9150 case V4DI_FTYPE_V4DI:
9151 case V16HI_FTYPE_V16SF:
9152 case V8HI_FTYPE_V8SF:
9153 case V8HI_FTYPE_V4SF:
9154 nargs = 1;
9155 break;
9156 case V4SF_FTYPE_V4SF_VEC_MERGE:
9157 case V2DF_FTYPE_V2DF_VEC_MERGE:
9158 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9159 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9160 case V16QI_FTYPE_V16QI_V16QI:
9161 case V16QI_FTYPE_V8HI_V8HI:
9162 case V16SF_FTYPE_V16SF_V16SF:
9163 case V8QI_FTYPE_V8QI_V8QI:
9164 case V8QI_FTYPE_V4HI_V4HI:
9165 case V8HI_FTYPE_V8HI_V8HI:
9166 case V8HI_FTYPE_V16QI_V16QI:
9167 case V8HI_FTYPE_V4SI_V4SI:
9168 case V8SF_FTYPE_V8SF_V8SF:
9169 case V8SF_FTYPE_V8SF_V8SI:
9170 case V8DF_FTYPE_V8DF_V8DF:
9171 case V4SI_FTYPE_V4SI_V4SI:
9172 case V4SI_FTYPE_V8HI_V8HI:
9173 case V4SI_FTYPE_V2DF_V2DF:
9174 case V4HI_FTYPE_V4HI_V4HI:
9175 case V4HI_FTYPE_V8QI_V8QI:
9176 case V4HI_FTYPE_V2SI_V2SI:
9177 case V4DF_FTYPE_V4DF_V4DF:
9178 case V4DF_FTYPE_V4DF_V4DI:
9179 case V4SF_FTYPE_V4SF_V4SF:
9180 case V4SF_FTYPE_V4SF_V4SI:
9181 case V4SF_FTYPE_V4SF_V2SI:
9182 case V4SF_FTYPE_V4SF_V2DF:
9183 case V4SF_FTYPE_V4SF_UINT:
9184 case V4SF_FTYPE_V4SF_DI:
9185 case V4SF_FTYPE_V4SF_SI:
9186 case V2DI_FTYPE_V2DI_V2DI:
9187 case V2DI_FTYPE_V16QI_V16QI:
9188 case V2DI_FTYPE_V4SI_V4SI:
9189 case V2DI_FTYPE_V2DI_V16QI:
9190 case V2SI_FTYPE_V2SI_V2SI:
9191 case V2SI_FTYPE_V4HI_V4HI:
9192 case V2SI_FTYPE_V2SF_V2SF:
9193 case V2DF_FTYPE_V2DF_V2DF:
9194 case V2DF_FTYPE_V2DF_V4SF:
9195 case V2DF_FTYPE_V2DF_V2DI:
9196 case V2DF_FTYPE_V2DF_DI:
9197 case V2DF_FTYPE_V2DF_SI:
9198 case V2DF_FTYPE_V2DF_UINT:
9199 case V2SF_FTYPE_V2SF_V2SF:
9200 case V1DI_FTYPE_V1DI_V1DI:
9201 case V1DI_FTYPE_V8QI_V8QI:
9202 case V1DI_FTYPE_V2SI_V2SI:
9203 case V32QI_FTYPE_V16HI_V16HI:
9204 case V16HI_FTYPE_V8SI_V8SI:
9205 case V64QI_FTYPE_V64QI_V64QI:
9206 case V32QI_FTYPE_V32QI_V32QI:
9207 case V16HI_FTYPE_V32QI_V32QI:
9208 case V16HI_FTYPE_V16HI_V16HI:
9209 case V8SI_FTYPE_V4DF_V4DF:
9210 case V8SI_FTYPE_V8SI_V8SI:
9211 case V8SI_FTYPE_V16HI_V16HI:
9212 case V4DI_FTYPE_V4DI_V4DI:
9213 case V4DI_FTYPE_V8SI_V8SI:
9214 case V8DI_FTYPE_V64QI_V64QI:
9215 if (comparison == UNKNOWN)
9216 return ix86_expand_binop_builtin (icode, exp, target);
9217 nargs = 2;
9218 break;
9219 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9220 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9221 gcc_assert (comparison != UNKNOWN);
9222 nargs = 2;
9223 swap = true;
9224 break;
9225 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9226 case V16HI_FTYPE_V16HI_SI_COUNT:
9227 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9228 case V8SI_FTYPE_V8SI_SI_COUNT:
9229 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9230 case V4DI_FTYPE_V4DI_INT_COUNT:
9231 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9232 case V8HI_FTYPE_V8HI_SI_COUNT:
9233 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9234 case V4SI_FTYPE_V4SI_SI_COUNT:
9235 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9236 case V4HI_FTYPE_V4HI_SI_COUNT:
9237 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9238 case V2DI_FTYPE_V2DI_SI_COUNT:
9239 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9240 case V2SI_FTYPE_V2SI_SI_COUNT:
9241 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9242 case V1DI_FTYPE_V1DI_SI_COUNT:
9243 nargs = 2;
9244 second_arg_count = true;
9245 break;
9246 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9247 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9248 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9249 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9250 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9251 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9252 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9253 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9254 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9255 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9256 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9257 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9258 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9259 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9260 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9261 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9262 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9263 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9264 nargs = 4;
9265 second_arg_count = true;
9266 break;
9267 case UINT64_FTYPE_UINT64_UINT64:
9268 case UINT_FTYPE_UINT_UINT:
9269 case UINT_FTYPE_UINT_USHORT:
9270 case UINT_FTYPE_UINT_UCHAR:
9271 case UINT16_FTYPE_UINT16_INT:
9272 case UINT8_FTYPE_UINT8_INT:
9273 case UQI_FTYPE_UQI_UQI:
9274 case UHI_FTYPE_UHI_UHI:
9275 case USI_FTYPE_USI_USI:
9276 case UDI_FTYPE_UDI_UDI:
9277 case V16SI_FTYPE_V8DF_V8DF:
9278 case V32HI_FTYPE_V16SF_V16SF:
9279 case V16HI_FTYPE_V8SF_V8SF:
9280 case V8HI_FTYPE_V4SF_V4SF:
9281 case V16HI_FTYPE_V16SF_UHI:
9282 case V8HI_FTYPE_V8SF_UQI:
9283 case V8HI_FTYPE_V4SF_UQI:
9284 nargs = 2;
9285 break;
9286 case V2DI_FTYPE_V2DI_INT_CONVERT:
9287 nargs = 2;
9288 rmode = V1TImode;
9289 nargs_constant = 1;
9290 break;
9291 case V4DI_FTYPE_V4DI_INT_CONVERT:
9292 nargs = 2;
9293 rmode = V2TImode;
9294 nargs_constant = 1;
9295 break;
9296 case V8DI_FTYPE_V8DI_INT_CONVERT:
9297 nargs = 2;
9298 rmode = V4TImode;
9299 nargs_constant = 1;
9300 break;
9301 case V8HI_FTYPE_V8HI_INT:
9302 case V8HI_FTYPE_V8SF_INT:
9303 case V16HI_FTYPE_V16SF_INT:
9304 case V8HI_FTYPE_V4SF_INT:
9305 case V8SF_FTYPE_V8SF_INT:
9306 case V4SF_FTYPE_V16SF_INT:
9307 case V16SF_FTYPE_V16SF_INT:
9308 case V4SI_FTYPE_V4SI_INT:
9309 case V4SI_FTYPE_V8SI_INT:
9310 case V4HI_FTYPE_V4HI_INT:
9311 case V4DF_FTYPE_V4DF_INT:
9312 case V4DF_FTYPE_V8DF_INT:
9313 case V4SF_FTYPE_V4SF_INT:
9314 case V4SF_FTYPE_V8SF_INT:
9315 case V2DI_FTYPE_V2DI_INT:
9316 case V2DF_FTYPE_V2DF_INT:
9317 case V2DF_FTYPE_V4DF_INT:
9318 case V16HI_FTYPE_V16HI_INT:
9319 case V8SI_FTYPE_V8SI_INT:
9320 case V16SI_FTYPE_V16SI_INT:
9321 case V4SI_FTYPE_V16SI_INT:
9322 case V4DI_FTYPE_V4DI_INT:
9323 case V2DI_FTYPE_V4DI_INT:
9324 case V4DI_FTYPE_V8DI_INT:
9325 case UQI_FTYPE_UQI_UQI_CONST:
9326 case UHI_FTYPE_UHI_UQI:
9327 case USI_FTYPE_USI_UQI:
9328 case UDI_FTYPE_UDI_UQI:
9329 nargs = 2;
9330 nargs_constant = 1;
9331 break;
9332 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9333 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9334 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9335 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9336 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9337 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9338 case UHI_FTYPE_V16SI_V16SI_UHI:
9339 case UQI_FTYPE_V8DI_V8DI_UQI:
9340 case V16HI_FTYPE_V16SI_V16HI_UHI:
9341 case V16QI_FTYPE_V16SI_V16QI_UHI:
9342 case V16QI_FTYPE_V8DI_V16QI_UQI:
9343 case V16SF_FTYPE_V16SF_V16SF_UHI:
9344 case V16SF_FTYPE_V4SF_V16SF_UHI:
9345 case V16SI_FTYPE_SI_V16SI_UHI:
9346 case V16SI_FTYPE_V16HI_V16SI_UHI:
9347 case V16SI_FTYPE_V16QI_V16SI_UHI:
9348 case V8SF_FTYPE_V4SF_V8SF_UQI:
9349 case V4DF_FTYPE_V2DF_V4DF_UQI:
9350 case V8SI_FTYPE_V4SI_V8SI_UQI:
9351 case V8SI_FTYPE_SI_V8SI_UQI:
9352 case V4SI_FTYPE_V4SI_V4SI_UQI:
9353 case V4SI_FTYPE_SI_V4SI_UQI:
9354 case V4DI_FTYPE_V2DI_V4DI_UQI:
9355 case V4DI_FTYPE_DI_V4DI_UQI:
9356 case V2DI_FTYPE_V2DI_V2DI_UQI:
9357 case V2DI_FTYPE_DI_V2DI_UQI:
9358 case V64QI_FTYPE_V64QI_V64QI_UDI:
9359 case V64QI_FTYPE_V16QI_V64QI_UDI:
9360 case V64QI_FTYPE_QI_V64QI_UDI:
9361 case V32QI_FTYPE_V32QI_V32QI_USI:
9362 case V32QI_FTYPE_V16QI_V32QI_USI:
9363 case V32QI_FTYPE_QI_V32QI_USI:
9364 case V16QI_FTYPE_V16QI_V16QI_UHI:
9365 case V16QI_FTYPE_QI_V16QI_UHI:
9366 case V32HI_FTYPE_V8HI_V32HI_USI:
9367 case V32HI_FTYPE_HI_V32HI_USI:
9368 case V16HI_FTYPE_V8HI_V16HI_UHI:
9369 case V16HI_FTYPE_HI_V16HI_UHI:
9370 case V8HI_FTYPE_V8HI_V8HI_UQI:
9371 case V8HI_FTYPE_HI_V8HI_UQI:
9372 case V8SF_FTYPE_V8HI_V8SF_UQI:
9373 case V4SF_FTYPE_V8HI_V4SF_UQI:
9374 case V8SI_FTYPE_V8SF_V8SI_UQI:
9375 case V4SI_FTYPE_V4SF_V4SI_UQI:
9376 case V4DI_FTYPE_V4SF_V4DI_UQI:
9377 case V2DI_FTYPE_V4SF_V2DI_UQI:
9378 case V4SF_FTYPE_V4DI_V4SF_UQI:
9379 case V4SF_FTYPE_V2DI_V4SF_UQI:
9380 case V4DF_FTYPE_V4DI_V4DF_UQI:
9381 case V2DF_FTYPE_V2DI_V2DF_UQI:
9382 case V16QI_FTYPE_V8HI_V16QI_UQI:
9383 case V16QI_FTYPE_V16HI_V16QI_UHI:
9384 case V16QI_FTYPE_V4SI_V16QI_UQI:
9385 case V16QI_FTYPE_V8SI_V16QI_UQI:
9386 case V8HI_FTYPE_V4SI_V8HI_UQI:
9387 case V8HI_FTYPE_V8SI_V8HI_UQI:
9388 case V16QI_FTYPE_V2DI_V16QI_UQI:
9389 case V16QI_FTYPE_V4DI_V16QI_UQI:
9390 case V8HI_FTYPE_V2DI_V8HI_UQI:
9391 case V8HI_FTYPE_V4DI_V8HI_UQI:
9392 case V4SI_FTYPE_V2DI_V4SI_UQI:
9393 case V4SI_FTYPE_V4DI_V4SI_UQI:
9394 case V32QI_FTYPE_V32HI_V32QI_USI:
9395 case UHI_FTYPE_V16QI_V16QI_UHI:
9396 case USI_FTYPE_V32QI_V32QI_USI:
9397 case UDI_FTYPE_V64QI_V64QI_UDI:
9398 case UQI_FTYPE_V8HI_V8HI_UQI:
9399 case UHI_FTYPE_V16HI_V16HI_UHI:
9400 case USI_FTYPE_V32HI_V32HI_USI:
9401 case UQI_FTYPE_V4SI_V4SI_UQI:
9402 case UQI_FTYPE_V8SI_V8SI_UQI:
9403 case UQI_FTYPE_V2DI_V2DI_UQI:
9404 case UQI_FTYPE_V4DI_V4DI_UQI:
9405 case V4SF_FTYPE_V2DF_V4SF_UQI:
9406 case V4SF_FTYPE_V4DF_V4SF_UQI:
9407 case V16SI_FTYPE_V16SI_V16SI_UHI:
9408 case V16SI_FTYPE_V4SI_V16SI_UHI:
9409 case V2DI_FTYPE_V4SI_V2DI_UQI:
9410 case V2DI_FTYPE_V8HI_V2DI_UQI:
9411 case V2DI_FTYPE_V16QI_V2DI_UQI:
9412 case V4DI_FTYPE_V4DI_V4DI_UQI:
9413 case V4DI_FTYPE_V4SI_V4DI_UQI:
9414 case V4DI_FTYPE_V8HI_V4DI_UQI:
9415 case V4DI_FTYPE_V16QI_V4DI_UQI:
9416 case V4DI_FTYPE_V4DF_V4DI_UQI:
9417 case V2DI_FTYPE_V2DF_V2DI_UQI:
9418 case V4SI_FTYPE_V4DF_V4SI_UQI:
9419 case V4SI_FTYPE_V2DF_V4SI_UQI:
9420 case V4SI_FTYPE_V8HI_V4SI_UQI:
9421 case V4SI_FTYPE_V16QI_V4SI_UQI:
9422 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9423 case V8DF_FTYPE_V2DF_V8DF_UQI:
9424 case V8DF_FTYPE_V4DF_V8DF_UQI:
9425 case V8DF_FTYPE_V8DF_V8DF_UQI:
9426 case V8SF_FTYPE_V8SF_V8SF_UQI:
9427 case V8SF_FTYPE_V8SI_V8SF_UQI:
9428 case V4DF_FTYPE_V4DF_V4DF_UQI:
9429 case V4SF_FTYPE_V4SF_V4SF_UQI:
9430 case V2DF_FTYPE_V2DF_V2DF_UQI:
9431 case V2DF_FTYPE_V4SF_V2DF_UQI:
9432 case V2DF_FTYPE_V4SI_V2DF_UQI:
9433 case V4SF_FTYPE_V4SI_V4SF_UQI:
9434 case V4DF_FTYPE_V4SF_V4DF_UQI:
9435 case V4DF_FTYPE_V4SI_V4DF_UQI:
9436 case V8SI_FTYPE_V8SI_V8SI_UQI:
9437 case V8SI_FTYPE_V8HI_V8SI_UQI:
9438 case V8SI_FTYPE_V16QI_V8SI_UQI:
9439 case V8DF_FTYPE_V8SI_V8DF_UQI:
9440 case V8DI_FTYPE_DI_V8DI_UQI:
9441 case V16SF_FTYPE_V8SF_V16SF_UHI:
9442 case V16SI_FTYPE_V8SI_V16SI_UHI:
9443 case V16HI_FTYPE_V16HI_V16HI_UHI:
9444 case V8HI_FTYPE_V16QI_V8HI_UQI:
9445 case V16HI_FTYPE_V16QI_V16HI_UHI:
9446 case V32HI_FTYPE_V32HI_V32HI_USI:
9447 case V32HI_FTYPE_V32QI_V32HI_USI:
9448 case V8DI_FTYPE_V16QI_V8DI_UQI:
9449 case V8DI_FTYPE_V2DI_V8DI_UQI:
9450 case V8DI_FTYPE_V4DI_V8DI_UQI:
9451 case V8DI_FTYPE_V8DI_V8DI_UQI:
9452 case V8DI_FTYPE_V8HI_V8DI_UQI:
9453 case V8DI_FTYPE_V8SI_V8DI_UQI:
9454 case V8HI_FTYPE_V8DI_V8HI_UQI:
9455 case V8SI_FTYPE_V8DI_V8SI_UQI:
9456 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9457 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9458 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9459 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9460 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9461 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9462 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9463 case V8HI_FTYPE_V8HI_V8HI_V8HI:
4f0e90fa
HL
9464 case V32HI_FTYPE_V16SF_V16SF_USI:
9465 case V16HI_FTYPE_V8SF_V8SF_UHI:
9466 case V8HI_FTYPE_V4SF_V4SF_UQI:
9467 case V16HI_FTYPE_V16SF_V16HI_UHI:
9468 case V8HI_FTYPE_V8SF_V8HI_UQI:
9469 case V8HI_FTYPE_V4SF_V8HI_UQI:
9470 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9471 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9472 case V4SF_FTYPE_V4SF_V8HI_V8HI:
2bf6d935
ML
9473 nargs = 3;
9474 break;
9475 case V32QI_FTYPE_V32QI_V32QI_INT:
9476 case V16HI_FTYPE_V16HI_V16HI_INT:
9477 case V16QI_FTYPE_V16QI_V16QI_INT:
9478 case V4DI_FTYPE_V4DI_V4DI_INT:
9479 case V8HI_FTYPE_V8HI_V8HI_INT:
9480 case V8SI_FTYPE_V8SI_V8SI_INT:
9481 case V8SI_FTYPE_V8SI_V4SI_INT:
9482 case V8SF_FTYPE_V8SF_V8SF_INT:
9483 case V8SF_FTYPE_V8SF_V4SF_INT:
9484 case V4SI_FTYPE_V4SI_V4SI_INT:
9485 case V4DF_FTYPE_V4DF_V4DF_INT:
9486 case V16SF_FTYPE_V16SF_V16SF_INT:
9487 case V16SF_FTYPE_V16SF_V4SF_INT:
9488 case V16SI_FTYPE_V16SI_V4SI_INT:
9489 case V4DF_FTYPE_V4DF_V2DF_INT:
9490 case V4SF_FTYPE_V4SF_V4SF_INT:
9491 case V2DI_FTYPE_V2DI_V2DI_INT:
9492 case V4DI_FTYPE_V4DI_V2DI_INT:
9493 case V2DF_FTYPE_V2DF_V2DF_INT:
9494 case UQI_FTYPE_V8DI_V8UDI_INT:
9495 case UQI_FTYPE_V8DF_V8DF_INT:
9496 case UQI_FTYPE_V2DF_V2DF_INT:
9497 case UQI_FTYPE_V4SF_V4SF_INT:
9498 case UHI_FTYPE_V16SI_V16SI_INT:
9499 case UHI_FTYPE_V16SF_V16SF_INT:
9500 case V64QI_FTYPE_V64QI_V64QI_INT:
9501 case V32HI_FTYPE_V32HI_V32HI_INT:
9502 case V16SI_FTYPE_V16SI_V16SI_INT:
9503 case V8DI_FTYPE_V8DI_V8DI_INT:
9504 nargs = 3;
9505 nargs_constant = 1;
9506 break;
9507 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9508 nargs = 3;
9509 rmode = V4DImode;
9510 nargs_constant = 1;
9511 break;
9512 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9513 nargs = 3;
9514 rmode = V2DImode;
9515 nargs_constant = 1;
9516 break;
9517 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9518 nargs = 3;
9519 rmode = DImode;
9520 nargs_constant = 1;
9521 break;
9522 case V2DI_FTYPE_V2DI_UINT_UINT:
9523 nargs = 3;
9524 nargs_constant = 2;
9525 break;
9526 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9527 nargs = 3;
9528 rmode = V8DImode;
9529 nargs_constant = 1;
9530 break;
9531 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9532 nargs = 5;
9533 rmode = V8DImode;
9534 mask_pos = 2;
9535 nargs_constant = 1;
9536 break;
9537 case QI_FTYPE_V8DF_INT_UQI:
9538 case QI_FTYPE_V4DF_INT_UQI:
9539 case QI_FTYPE_V2DF_INT_UQI:
9540 case HI_FTYPE_V16SF_INT_UHI:
9541 case QI_FTYPE_V8SF_INT_UQI:
9542 case QI_FTYPE_V4SF_INT_UQI:
9543 case V4SI_FTYPE_V4SI_V4SI_UHI:
9544 case V8SI_FTYPE_V8SI_V8SI_UHI:
9545 nargs = 3;
9546 mask_pos = 1;
9547 nargs_constant = 1;
9548 break;
9549 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9550 nargs = 5;
9551 rmode = V4DImode;
9552 mask_pos = 2;
9553 nargs_constant = 1;
9554 break;
9555 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9556 nargs = 5;
9557 rmode = V2DImode;
9558 mask_pos = 2;
9559 nargs_constant = 1;
9560 break;
9561 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9562 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9563 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9564 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9565 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9566 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9567 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9568 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9569 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9570 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9571 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9572 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9573 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9574 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9575 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9576 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9577 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9578 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9579 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9580 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9581 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9582 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9583 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9584 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9585 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9586 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9587 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9588 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9589 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9590 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9591 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9592 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9593 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9594 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9595 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9596 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9597 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9598 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9599 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9600 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9601 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9602 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9603 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9604 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9605 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9606 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9607 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9608 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9609 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9610 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9611 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
4f0e90fa
HL
9612 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9613 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9614 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
2bf6d935
ML
9615 nargs = 4;
9616 break;
9617 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9618 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9619 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9620 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9621 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9622 nargs = 4;
9623 nargs_constant = 1;
9624 break;
9625 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9626 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9627 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9628 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9629 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9630 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9631 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9632 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9633 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9634 case USI_FTYPE_V32QI_V32QI_INT_USI:
9635 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9636 case USI_FTYPE_V32HI_V32HI_INT_USI:
9637 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9638 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
2bf6d935
ML
9639 nargs = 4;
9640 mask_pos = 1;
9641 nargs_constant = 1;
9642 break;
9643 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9644 nargs = 4;
9645 nargs_constant = 2;
9646 break;
9647 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9648 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
4f0e90fa
HL
9649 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9650 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9651 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
2bf6d935
ML
9652 nargs = 4;
9653 break;
9654 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9655 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9656 mask_pos = 1;
9657 nargs = 4;
9658 nargs_constant = 1;
9659 break;
9660 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9661 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9662 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9663 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9664 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9665 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9666 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9667 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9668 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9669 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9670 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9671 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9672 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9673 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9674 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9675 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9676 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9677 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9678 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9679 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9680 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9681 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9682 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9683 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9684 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9685 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9686 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9687 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9688 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9689 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9690 nargs = 4;
9691 mask_pos = 2;
9692 nargs_constant = 1;
9693 break;
9694 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9695 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9696 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9697 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9698 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9699 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9700 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9701 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9702 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9703 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9704 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9705 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9706 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9707 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9708 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9709 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9710 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9711 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9712 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9713 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9714 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9715 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9716 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9717 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9718 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9719 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9720 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9721 nargs = 5;
9722 mask_pos = 2;
9723 nargs_constant = 1;
9724 break;
9725 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9726 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9727 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9728 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9729 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9730 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9731 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9732 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9733 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9734 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9735 nargs = 5;
9736 mask_pos = 1;
9737 nargs_constant = 1;
9738 break;
9739 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9740 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9741 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9742 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9743 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9744 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9745 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9746 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9747 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9748 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9749 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9750 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9751 nargs = 5;
9752 mask_pos = 1;
9753 nargs_constant = 2;
9754 break;
9755
9756 default:
9757 gcc_unreachable ();
9758 }
9759
9760 gcc_assert (nargs <= ARRAY_SIZE (args));
9761
9762 if (comparison != UNKNOWN)
9763 {
9764 gcc_assert (nargs == 2);
9765 return ix86_expand_sse_compare (d, exp, target, swap);
9766 }
9767
9768 if (rmode == VOIDmode || rmode == tmode)
9769 {
9770 if (optimize
9771 || target == 0
9772 || GET_MODE (target) != tmode
9773 || !insn_p->operand[0].predicate (target, tmode))
9774 target = gen_reg_rtx (tmode);
9775 else if (memory_operand (target, tmode))
9776 num_memory++;
9777 real_target = target;
9778 }
9779 else
9780 {
9781 real_target = gen_reg_rtx (tmode);
9782 target = lowpart_subreg (rmode, real_target, tmode);
9783 }
9784
9785 for (i = 0; i < nargs; i++)
9786 {
9787 tree arg = CALL_EXPR_ARG (exp, i);
9788 rtx op = expand_normal (arg);
9789 machine_mode mode = insn_p->operand[i + 1].mode;
9790 bool match = insn_p->operand[i + 1].predicate (op, mode);
9791
9792 if (second_arg_count && i == 1)
9793 {
 9794 /* SIMD shift insns take either an 8-bit immediate or a
 9795 register as the count. But the builtin functions take an
 9796 int as the count. If the count doesn't match, put it in
 9797 a register. The instructions use a 64-bit count; if op
 9798 is only 32-bit, zero-extend it, since negative shift
 9799 counts are undefined behavior and zero-extension is more
 9800 efficient. */
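	      /* Editor's note (illustrative, not part of the original
		 source): a count of -1 arriving as a 32-bit int becomes
		 0xffffffff after the zero-extension below.  Because a
		 negative count is already undefined at the builtin level,
		 that difference is unobservable for valid programs, and on
		 x86-64 the zero-extension is an ordinary 32-bit move rather
		 than an extra sign-extension instruction.  */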
9801 if (!match)
9802 {
9803 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9804 op = convert_modes (mode, GET_MODE (op), op, 1);
9805 else
9806 op = lowpart_subreg (mode, op, GET_MODE (op));
9807 if (!insn_p->operand[i + 1].predicate (op, mode))
9808 op = copy_to_reg (op);
9809 }
9810 }
9811 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9812 (!mask_pos && (nargs - i) <= nargs_constant))
9813 {
9814 if (!match)
9815 switch (icode)
9816 {
9817 case CODE_FOR_avx_vinsertf128v4di:
9818 case CODE_FOR_avx_vextractf128v4di:
9819 error ("the last argument must be an 1-bit immediate");
9820 return const0_rtx;
9821
9822 case CODE_FOR_avx512f_cmpv8di3_mask:
9823 case CODE_FOR_avx512f_cmpv16si3_mask:
9824 case CODE_FOR_avx512f_ucmpv8di3_mask:
9825 case CODE_FOR_avx512f_ucmpv16si3_mask:
9826 case CODE_FOR_avx512vl_cmpv4di3_mask:
9827 case CODE_FOR_avx512vl_cmpv8si3_mask:
9828 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9829 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9830 case CODE_FOR_avx512vl_cmpv2di3_mask:
9831 case CODE_FOR_avx512vl_cmpv4si3_mask:
9832 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9833 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9834 error ("the last argument must be a 3-bit immediate");
9835 return const0_rtx;
9836
9837 case CODE_FOR_sse4_1_roundsd:
9838 case CODE_FOR_sse4_1_roundss:
9839
9840 case CODE_FOR_sse4_1_roundpd:
9841 case CODE_FOR_sse4_1_roundps:
9842 case CODE_FOR_avx_roundpd256:
9843 case CODE_FOR_avx_roundps256:
9844
9845 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9846 case CODE_FOR_sse4_1_roundps_sfix:
9847 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9848 case CODE_FOR_avx_roundps_sfix256:
9849
9850 case CODE_FOR_sse4_1_blendps:
9851 case CODE_FOR_avx_blendpd256:
9852 case CODE_FOR_avx_vpermilv4df:
9853 case CODE_FOR_avx_vpermilv4df_mask:
9854 case CODE_FOR_avx512f_getmantv8df_mask:
9855 case CODE_FOR_avx512f_getmantv16sf_mask:
9856 case CODE_FOR_avx512vl_getmantv8sf_mask:
9857 case CODE_FOR_avx512vl_getmantv4df_mask:
9858 case CODE_FOR_avx512vl_getmantv4sf_mask:
9859 case CODE_FOR_avx512vl_getmantv2df_mask:
9860 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9861 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9862 case CODE_FOR_avx512dq_rangepv4df_mask:
9863 case CODE_FOR_avx512dq_rangepv8sf_mask:
9864 case CODE_FOR_avx512dq_rangepv2df_mask:
9865 case CODE_FOR_avx512dq_rangepv4sf_mask:
9866 case CODE_FOR_avx_shufpd256_mask:
9867 error ("the last argument must be a 4-bit immediate");
9868 return const0_rtx;
9869
9870 case CODE_FOR_sha1rnds4:
9871 case CODE_FOR_sse4_1_blendpd:
9872 case CODE_FOR_avx_vpermilv2df:
9873 case CODE_FOR_avx_vpermilv2df_mask:
9874 case CODE_FOR_xop_vpermil2v2df3:
9875 case CODE_FOR_xop_vpermil2v4sf3:
9876 case CODE_FOR_xop_vpermil2v4df3:
9877 case CODE_FOR_xop_vpermil2v8sf3:
9878 case CODE_FOR_avx512f_vinsertf32x4_mask:
9879 case CODE_FOR_avx512f_vinserti32x4_mask:
9880 case CODE_FOR_avx512f_vextractf32x4_mask:
9881 case CODE_FOR_avx512f_vextracti32x4_mask:
9882 case CODE_FOR_sse2_shufpd:
9883 case CODE_FOR_sse2_shufpd_mask:
9884 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9885 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9886 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9887 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9888 error ("the last argument must be a 2-bit immediate");
9889 return const0_rtx;
9890
9891 case CODE_FOR_avx_vextractf128v4df:
9892 case CODE_FOR_avx_vextractf128v8sf:
9893 case CODE_FOR_avx_vextractf128v8si:
9894 case CODE_FOR_avx_vinsertf128v4df:
9895 case CODE_FOR_avx_vinsertf128v8sf:
9896 case CODE_FOR_avx_vinsertf128v8si:
9897 case CODE_FOR_avx512f_vinsertf64x4_mask:
9898 case CODE_FOR_avx512f_vinserti64x4_mask:
9899 case CODE_FOR_avx512f_vextractf64x4_mask:
9900 case CODE_FOR_avx512f_vextracti64x4_mask:
9901 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9902 case CODE_FOR_avx512dq_vinserti32x8_mask:
9903 case CODE_FOR_avx512vl_vinsertv4df:
9904 case CODE_FOR_avx512vl_vinsertv4di:
9905 case CODE_FOR_avx512vl_vinsertv8sf:
9906 case CODE_FOR_avx512vl_vinsertv8si:
9907 error ("the last argument must be a 1-bit immediate");
9908 return const0_rtx;
9909
9910 case CODE_FOR_avx_vmcmpv2df3:
9911 case CODE_FOR_avx_vmcmpv4sf3:
9912 case CODE_FOR_avx_cmpv2df3:
9913 case CODE_FOR_avx_cmpv4sf3:
9914 case CODE_FOR_avx_cmpv4df3:
9915 case CODE_FOR_avx_cmpv8sf3:
9916 case CODE_FOR_avx512f_cmpv8df3_mask:
9917 case CODE_FOR_avx512f_cmpv16sf3_mask:
9918 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9919 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9920 error ("the last argument must be a 5-bit immediate");
9921 return const0_rtx;
9922
9923 default:
9924 switch (nargs_constant)
9925 {
9926 case 2:
9927 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9928 (!mask_pos && (nargs - i) == nargs_constant))
9929 {
9930 error ("the next to last argument must be an 8-bit immediate");
9931 break;
9932 }
9933 /* FALLTHRU */
9934 case 1:
9935 error ("the last argument must be an 8-bit immediate");
9936 break;
9937 default:
9938 gcc_unreachable ();
9939 }
9940 return const0_rtx;
9941 }
9942 }
9943 else
9944 {
9945 if (VECTOR_MODE_P (mode))
9946 op = safe_vector_operand (op, mode);
9947
9948 /* If we aren't optimizing, only allow one memory operand to
9949 be generated. */
9950 if (memory_operand (op, mode))
9951 num_memory++;
9952
9953 op = fixup_modeless_constant (op, mode);
9954
9955 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9956 {
9957 if (optimize || !match || num_memory > 1)
9958 op = copy_to_mode_reg (mode, op);
9959 }
9960 else
9961 {
9962 op = copy_to_reg (op);
9963 op = lowpart_subreg (mode, op, GET_MODE (op));
9964 }
9965 }
9966
9967 args[i].op = op;
9968 args[i].mode = mode;
9969 }
9970
9971 switch (nargs)
9972 {
9973 case 1:
9974 pat = GEN_FCN (icode) (real_target, args[0].op);
9975 break;
9976 case 2:
9977 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9978 break;
9979 case 3:
9980 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9981 args[2].op);
9982 break;
9983 case 4:
9984 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9985 args[2].op, args[3].op);
9986 break;
9987 case 5:
9988 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9989 args[2].op, args[3].op, args[4].op);
9990 break;
9991 case 6:
9992 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9993 args[2].op, args[3].op, args[4].op,
9994 args[5].op);
9995 break;
9996 default:
9997 gcc_unreachable ();
9998 }
9999
10000 if (! pat)
10001 return 0;
10002
10003 emit_insn (pat);
10004 return target;
10005}
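/* Editor's note (illustrative, not part of the original source): the
   immediate checks in the switch above are what reject, for example, a
   call such as __builtin_ia32_roundpd (x, k) where k is not a
   compile-time constant: CODE_FOR_sse4_1_roundpd sits in the 4-bit
   group, so the user sees "the last argument must be a 4-bit
   immediate", which is why the corresponding _mm_round_pd intrinsic
   requires a constant rounding mode.  */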
10006
10007/* Transform a pattern of the following layout:
10008 (set A
10009 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10010 into:
10011 (set A B)
10012 */
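/* Editor's illustration (not part of the original source): with
   B == (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)) and C the rounding
   immediate, the routine below rewrites

     (set (reg:V2DF 100)
          (unspec:V2DF [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                        (const_int 4)] UNSPEC_EMBEDDED_ROUNDING))

   into

     (set (reg:V2DF 100) (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))

   dropping the now-redundant rounding operand.  The register numbers
   and the const_int value are made up for the example.  */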
10013
10014static rtx
10015ix86_erase_embedded_rounding (rtx pat)
10016{
10017 if (GET_CODE (pat) == INSN)
10018 pat = PATTERN (pat);
10019
10020 gcc_assert (GET_CODE (pat) == SET);
10021 rtx src = SET_SRC (pat);
10022 gcc_assert (XVECLEN (src, 0) == 2);
10023 rtx p0 = XVECEXP (src, 0, 0);
10024 gcc_assert (GET_CODE (src) == UNSPEC
10025 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
10026 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
10027 return res;
10028}
10029
10030/* Subroutine of ix86_expand_round_builtin to take care of comi insns
10031 with rounding. */
10032static rtx
10033ix86_expand_sse_comi_round (const struct builtin_description *d,
10034 tree exp, rtx target)
10035{
10036 rtx pat, set_dst;
10037 tree arg0 = CALL_EXPR_ARG (exp, 0);
10038 tree arg1 = CALL_EXPR_ARG (exp, 1);
10039 tree arg2 = CALL_EXPR_ARG (exp, 2);
10040 tree arg3 = CALL_EXPR_ARG (exp, 3);
10041 rtx op0 = expand_normal (arg0);
10042 rtx op1 = expand_normal (arg1);
10043 rtx op2 = expand_normal (arg2);
10044 rtx op3 = expand_normal (arg3);
10045 enum insn_code icode = d->icode;
10046 const struct insn_data_d *insn_p = &insn_data[icode];
10047 machine_mode mode0 = insn_p->operand[0].mode;
10048 machine_mode mode1 = insn_p->operand[1].mode;
2bf6d935
ML
10049
10050 /* See avxintrin.h for values. */
467e9f38 10051 static const enum rtx_code comparisons[32] =
2bf6d935 10052 {
467e9f38
L
10053 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10054 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10055 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10056 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
2bf6d935 10057 };
467e9f38
L
10058 static const bool ordereds[32] =
10059 {
10060 true, true, true, false, false, false, false, true,
10061 false, false, false, true, true, true, true, false,
10062 true, true, true, false, false, false, false, true,
10063 false, false, false, true, true, true, true, false
10064 };
10065 static const bool non_signalings[32] =
2bf6d935
ML
10066 {
10067 true, false, false, true, true, false, false, true,
10068 true, false, false, true, true, false, false, true,
10069 false, true, true, false, false, true, true, false,
10070 false, true, true, false, false, true, true, false
10071 };
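  /* Editor's note (illustrative, not part of the original source): the
     index into these three tables is the _CMP_* predicate value from
     avxintrin.h, e.g. _CMP_EQ_OQ == 0 (ordered, non-signaling EQ),
     _CMP_LT_OS == 1 (ordered, signaling LT), _CMP_UNORD_Q == 3 and
     _CMP_TRUE_UQ == 15; indices 16-31 repeat the same predicates with
     the opposite signaling/quiet variants.  */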
10072
10073 if (!CONST_INT_P (op2))
10074 {
10075 error ("the third argument must be comparison constant");
10076 return const0_rtx;
10077 }
10078 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10079 {
10080 error ("incorrect comparison mode");
10081 return const0_rtx;
10082 }
10083
10084 if (!insn_p->operand[2].predicate (op3, SImode))
10085 {
10086 error ("incorrect rounding operand");
10087 return const0_rtx;
10088 }
10089
2bf6d935
ML
10090 if (VECTOR_MODE_P (mode0))
10091 op0 = safe_vector_operand (op0, mode0);
10092 if (VECTOR_MODE_P (mode1))
10093 op1 = safe_vector_operand (op1, mode1);
10094
467e9f38
L
10095 enum rtx_code comparison = comparisons[INTVAL (op2)];
10096 bool ordered = ordereds[INTVAL (op2)];
10097 bool non_signaling = non_signalings[INTVAL (op2)];
10098 rtx const_val = const0_rtx;
10099
10100 bool check_unordered = false;
10101 machine_mode mode = CCFPmode;
10102 switch (comparison)
10103 {
10104 case ORDERED:
10105 if (!ordered)
10106 {
10107 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10108 if (!non_signaling)
10109 ordered = true;
10110 mode = CCSmode;
10111 }
10112 else
10113 {
10114 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10115 if (non_signaling)
10116 ordered = false;
10117 mode = CCPmode;
10118 }
10119 comparison = NE;
10120 break;
10121 case UNORDERED:
10122 if (ordered)
10123 {
10124 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10125 if (non_signaling)
10126 ordered = false;
10127 mode = CCSmode;
10128 }
10129 else
10130 {
10131 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10132 if (!non_signaling)
10133 ordered = true;
10134 mode = CCPmode;
10135 }
10136 comparison = EQ;
10137 break;
10138
10139 case LE: /* -> GE */
10140 case LT: /* -> GT */
10141 case UNGE: /* -> UNLE */
10142 case UNGT: /* -> UNLT */
10143 std::swap (op0, op1);
10144 comparison = swap_condition (comparison);
10145 /* FALLTHRU */
10146 case GT:
10147 case GE:
10148 case UNEQ:
10149 case UNLT:
10150 case UNLE:
10151 case LTGT:
10152 /* These are supported by CCFPmode. NB: Use ordered/signaling
10153 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10154 with NAN operands. */
10155 if (ordered == non_signaling)
10156 ordered = !ordered;
10157 break;
10158 case EQ:
10159 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10160 _CMP_EQ_OQ/_CMP_EQ_OS. */
10161 check_unordered = true;
10162 mode = CCZmode;
10163 break;
10164 case NE:
10165 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10166 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10167 gcc_assert (!ordered);
10168 check_unordered = true;
10169 mode = CCZmode;
10170 const_val = const1_rtx;
10171 break;
10172 default:
10173 gcc_unreachable ();
10174 }
10175
2bf6d935 10176 target = gen_reg_rtx (SImode);
467e9f38 10177 emit_move_insn (target, const_val);
2bf6d935
ML
10178 target = gen_rtx_SUBREG (QImode, target, 0);
10179
10180 if ((optimize && !register_operand (op0, mode0))
10181 || !insn_p->operand[0].predicate (op0, mode0))
10182 op0 = copy_to_mode_reg (mode0, op0);
10183 if ((optimize && !register_operand (op1, mode1))
10184 || !insn_p->operand[1].predicate (op1, mode1))
10185 op1 = copy_to_mode_reg (mode1, op1);
10186
467e9f38
L
10187 /*
10188 1. COMI: ordered and signaling.
10189 2. UCOMI: unordered and non-signaling.
10190 */
10191 if (non_signaling)
10192 icode = (icode == CODE_FOR_sse_comi_round
10193 ? CODE_FOR_sse_ucomi_round
10194 : CODE_FOR_sse2_ucomi_round);
2bf6d935
ML
10195
10196 pat = GEN_FCN (icode) (op0, op1, op3);
10197 if (! pat)
10198 return 0;
10199
10200 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10201 if (INTVAL (op3) == NO_ROUND)
10202 {
10203 pat = ix86_erase_embedded_rounding (pat);
10204 if (! pat)
10205 return 0;
10206
10207 set_dst = SET_DEST (pat);
10208 }
10209 else
10210 {
10211 gcc_assert (GET_CODE (pat) == SET);
10212 set_dst = SET_DEST (pat);
10213 }
10214
10215 emit_insn (pat);
467e9f38
L
10216
10217 rtx_code_label *label = NULL;
10218
10219 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10220 sufficient with NAN operands. */
10221 if (check_unordered)
10222 {
10223 gcc_assert (comparison == EQ || comparison == NE);
10224
10225 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10226 label = gen_label_rtx ();
10227 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10228 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10229 gen_rtx_LABEL_REF (VOIDmode, label),
10230 pc_rtx);
10231 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10232 }
10233
10234 /* NB: Set CCFPmode and check a different CCmode which is a
10235 subset of CCFPmode. */
10236 if (GET_MODE (set_dst) != mode)
10237 {
10238 gcc_assert (mode == CCAmode || mode == CCCmode
10239 || mode == CCOmode || mode == CCPmode
10240 || mode == CCSmode || mode == CCZmode);
10241 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10242 }
10243
2bf6d935
ML
10244 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10245 gen_rtx_fmt_ee (comparison, QImode,
10246 set_dst,
10247 const0_rtx)));
10248
467e9f38
L
10249 if (label)
10250 emit_label (label);
10251
2bf6d935
ML
10252 return SUBREG_REG (target);
10253}
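/* Editor's sketch (not part of the original source): this helper is
   reached from ix86_expand_round_builtin for the
   INT_FTYPE_V2DF_V2DF_INT_INT / INT_FTYPE_V4SF_V4SF_INT_INT builtins,
   i.e. the AVX-512F _mm_comi_round_sd/_mm_comi_round_ss intrinsics.
   A minimal user-level example (assuming -mavx512f):

     #include <immintrin.h>

     int
     ordered_gt (__m128d a, __m128d b)
     {
       return _mm_comi_round_sd (a, b, _CMP_GT_OS, _MM_FROUND_NO_EXC);
     }

   _CMP_GT_OS (14) selects the ordered, signaling GT entry in the
   tables above, so the signaling COMI form is kept; passing
   _MM_FROUND_CUR_DIRECTION instead of _MM_FROUND_NO_EXC takes the
   NO_ROUND path that erases the embedded rounding operand.  */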
10254
10255static rtx
10256ix86_expand_round_builtin (const struct builtin_description *d,
10257 tree exp, rtx target)
10258{
10259 rtx pat;
10260 unsigned int i, nargs;
10261 struct
10262 {
10263 rtx op;
10264 machine_mode mode;
10265 } args[6];
10266 enum insn_code icode = d->icode;
10267 const struct insn_data_d *insn_p = &insn_data[icode];
10268 machine_mode tmode = insn_p->operand[0].mode;
10269 unsigned int nargs_constant = 0;
10270 unsigned int redundant_embed_rnd = 0;
10271
10272 switch ((enum ix86_builtin_func_type) d->flag)
10273 {
10274 case UINT64_FTYPE_V2DF_INT:
10275 case UINT64_FTYPE_V4SF_INT:
10276 case UINT_FTYPE_V2DF_INT:
10277 case UINT_FTYPE_V4SF_INT:
10278 case INT64_FTYPE_V2DF_INT:
10279 case INT64_FTYPE_V4SF_INT:
10280 case INT_FTYPE_V2DF_INT:
10281 case INT_FTYPE_V4SF_INT:
10282 nargs = 2;
10283 break;
10284 case V4SF_FTYPE_V4SF_UINT_INT:
10285 case V4SF_FTYPE_V4SF_UINT64_INT:
10286 case V2DF_FTYPE_V2DF_UINT64_INT:
10287 case V4SF_FTYPE_V4SF_INT_INT:
10288 case V4SF_FTYPE_V4SF_INT64_INT:
10289 case V2DF_FTYPE_V2DF_INT64_INT:
10290 case V4SF_FTYPE_V4SF_V4SF_INT:
10291 case V2DF_FTYPE_V2DF_V2DF_INT:
10292 case V4SF_FTYPE_V4SF_V2DF_INT:
10293 case V2DF_FTYPE_V2DF_V4SF_INT:
10294 nargs = 3;
10295 break;
10296 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10297 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10298 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10299 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10300 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10301 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10302 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10303 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10304 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10305 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10306 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10307 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10308 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10309 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10310 nargs = 4;
10311 break;
10312 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10313 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10314 nargs_constant = 2;
10315 nargs = 4;
10316 break;
10317 case INT_FTYPE_V4SF_V4SF_INT_INT:
10318 case INT_FTYPE_V2DF_V2DF_INT_INT:
10319 return ix86_expand_sse_comi_round (d, exp, target);
10320 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10321 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10322 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10323 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10324 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10325 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
93103603 10326 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
2bf6d935
ML
10327 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10328 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
93103603 10329 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
2bf6d935
ML
10330 nargs = 5;
10331 break;
10332 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10333 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
93103603
SP
10334 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
10335 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
2bf6d935
ML
10336 nargs_constant = 4;
10337 nargs = 5;
10338 break;
10339 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10340 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10341 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10342 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10343 nargs_constant = 3;
10344 nargs = 5;
10345 break;
10346 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10347 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10348 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10349 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10350 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10351 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10352 nargs = 6;
10353 nargs_constant = 4;
10354 break;
10355 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10356 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10357 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10358 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10359 nargs = 6;
10360 nargs_constant = 3;
10361 break;
10362 default:
10363 gcc_unreachable ();
10364 }
10365 gcc_assert (nargs <= ARRAY_SIZE (args));
10366
10367 if (optimize
10368 || target == 0
10369 || GET_MODE (target) != tmode
10370 || !insn_p->operand[0].predicate (target, tmode))
10371 target = gen_reg_rtx (tmode);
10372
10373 for (i = 0; i < nargs; i++)
10374 {
10375 tree arg = CALL_EXPR_ARG (exp, i);
10376 rtx op = expand_normal (arg);
10377 machine_mode mode = insn_p->operand[i + 1].mode;
10378 bool match = insn_p->operand[i + 1].predicate (op, mode);
10379
10380 if (i == nargs - nargs_constant)
10381 {
10382 if (!match)
10383 {
10384 switch (icode)
10385 {
10386 case CODE_FOR_avx512f_getmantv8df_mask_round:
10387 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10388 case CODE_FOR_avx512f_vgetmantv2df_round:
10389 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10390 case CODE_FOR_avx512f_vgetmantv4sf_round:
10391 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10392 error ("the immediate argument must be a 4-bit immediate");
10393 return const0_rtx;
10394 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10395 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10396 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10397 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10398 error ("the immediate argument must be a 5-bit immediate");
10399 return const0_rtx;
10400 default:
10401 error ("the immediate argument must be an 8-bit immediate");
10402 return const0_rtx;
10403 }
10404 }
10405 }
10406 else if (i == nargs-1)
10407 {
10408 if (!insn_p->operand[nargs].predicate (op, SImode))
10409 {
10410 error ("incorrect rounding operand");
10411 return const0_rtx;
10412 }
10413
10414 /* If there is no rounding, use the normal version of the pattern. */
10415 if (INTVAL (op) == NO_ROUND)
10416 redundant_embed_rnd = 1;
10417 }
10418 else
10419 {
10420 if (VECTOR_MODE_P (mode))
10421 op = safe_vector_operand (op, mode);
10422
10423 op = fixup_modeless_constant (op, mode);
10424
10425 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10426 {
10427 if (optimize || !match)
10428 op = copy_to_mode_reg (mode, op);
10429 }
10430 else
10431 {
10432 op = copy_to_reg (op);
10433 op = lowpart_subreg (mode, op, GET_MODE (op));
10434 }
10435 }
10436
10437 args[i].op = op;
10438 args[i].mode = mode;
10439 }
10440
10441 switch (nargs)
10442 {
10443 case 1:
10444 pat = GEN_FCN (icode) (target, args[0].op);
10445 break;
10446 case 2:
10447 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10448 break;
10449 case 3:
10450 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10451 args[2].op);
10452 break;
10453 case 4:
10454 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10455 args[2].op, args[3].op);
10456 break;
10457 case 5:
10458 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10459 args[2].op, args[3].op, args[4].op);
10460 break;
10461 case 6:
10462 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10463 args[2].op, args[3].op, args[4].op,
10464 args[5].op);
10465 break;
10466 default:
10467 gcc_unreachable ();
10468 }
10469
10470 if (!pat)
10471 return 0;
10472
10473 if (redundant_embed_rnd)
10474 pat = ix86_erase_embedded_rounding (pat);
10475
10476 emit_insn (pat);
10477 return target;
10478}
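/* Editor's sketch (not part of the original source): a typical user of
   the round-builtin expander above is an explicitly rounded AVX-512F
   arithmetic intrinsic, e.g. (assuming -mavx512f):

     #include <immintrin.h>

     __m512d
     add_round_toward_zero (__m512d a, __m512d b)
     {
       return _mm512_add_round_pd (a, b,
                                   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
     }

   The last argument is the rounding operand checked against
   insn_p->operand[nargs].predicate above; with
   _MM_FROUND_CUR_DIRECTION the operand matches NO_ROUND,
   redundant_embed_rnd is set, and ix86_erase_embedded_rounding falls
   back to the normal pattern.  */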
10479
10480/* Subroutine of ix86_expand_builtin to take care of special insns
10481 with variable number of operands. */
10482
10483static rtx
10484ix86_expand_special_args_builtin (const struct builtin_description *d,
10485 tree exp, rtx target)
10486{
10487 tree arg;
10488 rtx pat, op;
10489 unsigned int i, nargs, arg_adjust, memory;
10490 bool aligned_mem = false;
10491 struct
10492 {
10493 rtx op;
10494 machine_mode mode;
10495 } args[3];
10496 enum insn_code icode = d->icode;
2bf6d935
ML
10497 const struct insn_data_d *insn_p = &insn_data[icode];
10498 machine_mode tmode = insn_p->operand[0].mode;
10499 enum { load, store } klass;
10500
10501 switch ((enum ix86_builtin_func_type) d->flag)
10502 {
10503 case VOID_FTYPE_VOID:
10504 emit_insn (GEN_FCN (icode) (target));
10505 return 0;
10506 case VOID_FTYPE_UINT64:
10507 case VOID_FTYPE_UNSIGNED:
10508 nargs = 0;
10509 klass = store;
10510 memory = 0;
10511 break;
10512
10513 case INT_FTYPE_VOID:
10514 case USHORT_FTYPE_VOID:
10515 case UINT64_FTYPE_VOID:
10516 case UINT_FTYPE_VOID:
299a53d7 10517 case UINT8_FTYPE_VOID:
2bf6d935
ML
10518 case UNSIGNED_FTYPE_VOID:
10519 nargs = 0;
10520 klass = load;
10521 memory = 0;
10522 break;
10523 case UINT64_FTYPE_PUNSIGNED:
10524 case V2DI_FTYPE_PV2DI:
10525 case V4DI_FTYPE_PV4DI:
10526 case V32QI_FTYPE_PCCHAR:
10527 case V16QI_FTYPE_PCCHAR:
10528 case V8SF_FTYPE_PCV4SF:
10529 case V8SF_FTYPE_PCFLOAT:
10530 case V4SF_FTYPE_PCFLOAT:
10531 case V4DF_FTYPE_PCV2DF:
10532 case V4DF_FTYPE_PCDOUBLE:
10533 case V2DF_FTYPE_PCDOUBLE:
10534 case VOID_FTYPE_PVOID:
10535 case V8DI_FTYPE_PV8DI:
10536 nargs = 1;
10537 klass = load;
10538 memory = 0;
10539 switch (icode)
10540 {
10541 case CODE_FOR_sse4_1_movntdqa:
10542 case CODE_FOR_avx2_movntdqa:
10543 case CODE_FOR_avx512f_movntdqa:
10544 aligned_mem = true;
10545 break;
10546 default:
10547 break;
10548 }
10549 break;
10550 case VOID_FTYPE_PV2SF_V4SF:
10551 case VOID_FTYPE_PV8DI_V8DI:
10552 case VOID_FTYPE_PV4DI_V4DI:
10553 case VOID_FTYPE_PV2DI_V2DI:
10554 case VOID_FTYPE_PCHAR_V32QI:
10555 case VOID_FTYPE_PCHAR_V16QI:
10556 case VOID_FTYPE_PFLOAT_V16SF:
10557 case VOID_FTYPE_PFLOAT_V8SF:
10558 case VOID_FTYPE_PFLOAT_V4SF:
10559 case VOID_FTYPE_PDOUBLE_V8DF:
10560 case VOID_FTYPE_PDOUBLE_V4DF:
10561 case VOID_FTYPE_PDOUBLE_V2DF:
10562 case VOID_FTYPE_PLONGLONG_LONGLONG:
10563 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10564 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10565 case VOID_FTYPE_PINT_INT:
10566 nargs = 1;
10567 klass = store;
10568 /* Reserve memory operand for target. */
10569 memory = ARRAY_SIZE (args);
10570 switch (icode)
10571 {
10572 /* These builtins and instructions require the memory
10573 to be properly aligned. */
10574 case CODE_FOR_avx_movntv4di:
10575 case CODE_FOR_sse2_movntv2di:
10576 case CODE_FOR_avx_movntv8sf:
10577 case CODE_FOR_sse_movntv4sf:
10578 case CODE_FOR_sse4a_vmmovntv4sf:
10579 case CODE_FOR_avx_movntv4df:
10580 case CODE_FOR_sse2_movntv2df:
10581 case CODE_FOR_sse4a_vmmovntv2df:
10582 case CODE_FOR_sse2_movntidi:
10583 case CODE_FOR_sse_movntq:
10584 case CODE_FOR_sse2_movntisi:
10585 case CODE_FOR_avx512f_movntv16sf:
10586 case CODE_FOR_avx512f_movntv8df:
10587 case CODE_FOR_avx512f_movntv8di:
10588 aligned_mem = true;
10589 break;
10590 default:
10591 break;
10592 }
10593 break;
10594 case VOID_FTYPE_PVOID_PCVOID:
10595 nargs = 1;
10596 klass = store;
10597 memory = 0;
10598
10599 break;
10600 case V4SF_FTYPE_V4SF_PCV2SF:
10601 case V2DF_FTYPE_V2DF_PCDOUBLE:
10602 nargs = 2;
10603 klass = load;
10604 memory = 1;
10605 break;
10606 case V8SF_FTYPE_PCV8SF_V8SI:
10607 case V4DF_FTYPE_PCV4DF_V4DI:
10608 case V4SF_FTYPE_PCV4SF_V4SI:
10609 case V2DF_FTYPE_PCV2DF_V2DI:
10610 case V8SI_FTYPE_PCV8SI_V8SI:
10611 case V4DI_FTYPE_PCV4DI_V4DI:
10612 case V4SI_FTYPE_PCV4SI_V4SI:
10613 case V2DI_FTYPE_PCV2DI_V2DI:
10614 case VOID_FTYPE_INT_INT64:
10615 nargs = 2;
10616 klass = load;
10617 memory = 0;
10618 break;
10619 case VOID_FTYPE_PV8DF_V8DF_UQI:
10620 case VOID_FTYPE_PV4DF_V4DF_UQI:
10621 case VOID_FTYPE_PV2DF_V2DF_UQI:
10622 case VOID_FTYPE_PV16SF_V16SF_UHI:
10623 case VOID_FTYPE_PV8SF_V8SF_UQI:
10624 case VOID_FTYPE_PV4SF_V4SF_UQI:
10625 case VOID_FTYPE_PV8DI_V8DI_UQI:
10626 case VOID_FTYPE_PV4DI_V4DI_UQI:
10627 case VOID_FTYPE_PV2DI_V2DI_UQI:
10628 case VOID_FTYPE_PV16SI_V16SI_UHI:
10629 case VOID_FTYPE_PV8SI_V8SI_UQI:
10630 case VOID_FTYPE_PV4SI_V4SI_UQI:
10631 case VOID_FTYPE_PV64QI_V64QI_UDI:
10632 case VOID_FTYPE_PV32HI_V32HI_USI:
10633 case VOID_FTYPE_PV32QI_V32QI_USI:
10634 case VOID_FTYPE_PV16QI_V16QI_UHI:
10635 case VOID_FTYPE_PV16HI_V16HI_UHI:
10636 case VOID_FTYPE_PV8HI_V8HI_UQI:
10637 switch (icode)
10638 {
10639 /* These builtins and instructions require the memory
10640 to be properly aligned. */
10641 case CODE_FOR_avx512f_storev16sf_mask:
10642 case CODE_FOR_avx512f_storev16si_mask:
10643 case CODE_FOR_avx512f_storev8df_mask:
10644 case CODE_FOR_avx512f_storev8di_mask:
10645 case CODE_FOR_avx512vl_storev8sf_mask:
10646 case CODE_FOR_avx512vl_storev8si_mask:
10647 case CODE_FOR_avx512vl_storev4df_mask:
10648 case CODE_FOR_avx512vl_storev4di_mask:
10649 case CODE_FOR_avx512vl_storev4sf_mask:
10650 case CODE_FOR_avx512vl_storev4si_mask:
10651 case CODE_FOR_avx512vl_storev2df_mask:
10652 case CODE_FOR_avx512vl_storev2di_mask:
10653 aligned_mem = true;
10654 break;
10655 default:
10656 break;
10657 }
10658 /* FALLTHRU */
10659 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10660 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10661 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10662 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10663 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10664 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10665 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10666 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10667 case VOID_FTYPE_PV8SI_V8DI_UQI:
10668 case VOID_FTYPE_PV8HI_V8DI_UQI:
10669 case VOID_FTYPE_PV16HI_V16SI_UHI:
4a948703 10670 case VOID_FTYPE_PUDI_V8DI_UQI:
2bf6d935
ML
10671 case VOID_FTYPE_PV16QI_V16SI_UHI:
10672 case VOID_FTYPE_PV4SI_V4DI_UQI:
4a948703 10673 case VOID_FTYPE_PUDI_V2DI_UQI:
10674 case VOID_FTYPE_PUDI_V4DI_UQI:
10675 case VOID_FTYPE_PUSI_V2DI_UQI:
2bf6d935 10676 case VOID_FTYPE_PV8HI_V8SI_UQI:
4a948703 10677 case VOID_FTYPE_PUDI_V4SI_UQI:
10678 case VOID_FTYPE_PUSI_V4DI_UQI:
10679 case VOID_FTYPE_PUHI_V2DI_UQI:
10680 case VOID_FTYPE_PUDI_V8SI_UQI:
10681 case VOID_FTYPE_PUSI_V4SI_UQI:
2bf6d935
ML
10682 case VOID_FTYPE_PCHAR_V64QI_UDI:
10683 case VOID_FTYPE_PCHAR_V32QI_USI:
10684 case VOID_FTYPE_PCHAR_V16QI_UHI:
10685 case VOID_FTYPE_PSHORT_V32HI_USI:
10686 case VOID_FTYPE_PSHORT_V16HI_UHI:
10687 case VOID_FTYPE_PSHORT_V8HI_UQI:
10688 case VOID_FTYPE_PINT_V16SI_UHI:
10689 case VOID_FTYPE_PINT_V8SI_UQI:
10690 case VOID_FTYPE_PINT_V4SI_UQI:
10691 case VOID_FTYPE_PINT64_V8DI_UQI:
10692 case VOID_FTYPE_PINT64_V4DI_UQI:
10693 case VOID_FTYPE_PINT64_V2DI_UQI:
10694 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10695 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10696 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10697 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10698 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10699 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10700 case VOID_FTYPE_PV32QI_V32HI_USI:
10701 case VOID_FTYPE_PV16QI_V16HI_UHI:
4a948703 10702 case VOID_FTYPE_PUDI_V8HI_UQI:
2bf6d935
ML
10703 nargs = 2;
10704 klass = store;
10705 /* Reserve memory operand for target. */
10706 memory = ARRAY_SIZE (args);
10707 break;
10708 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10709 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10710 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10711 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10712 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10713 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10714 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10715 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10716 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10717 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10718 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10719 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10720 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10721 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10722 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10723 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10724 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10725 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10726 switch (icode)
10727 {
10728 /* These builtins and instructions require the memory
10729 to be properly aligned. */
10730 case CODE_FOR_avx512f_loadv16sf_mask:
10731 case CODE_FOR_avx512f_loadv16si_mask:
10732 case CODE_FOR_avx512f_loadv8df_mask:
10733 case CODE_FOR_avx512f_loadv8di_mask:
10734 case CODE_FOR_avx512vl_loadv8sf_mask:
10735 case CODE_FOR_avx512vl_loadv8si_mask:
10736 case CODE_FOR_avx512vl_loadv4df_mask:
10737 case CODE_FOR_avx512vl_loadv4di_mask:
10738 case CODE_FOR_avx512vl_loadv4sf_mask:
10739 case CODE_FOR_avx512vl_loadv4si_mask:
10740 case CODE_FOR_avx512vl_loadv2df_mask:
10741 case CODE_FOR_avx512vl_loadv2di_mask:
10742 case CODE_FOR_avx512bw_loadv64qi_mask:
10743 case CODE_FOR_avx512vl_loadv32qi_mask:
10744 case CODE_FOR_avx512vl_loadv16qi_mask:
10745 case CODE_FOR_avx512bw_loadv32hi_mask:
10746 case CODE_FOR_avx512vl_loadv16hi_mask:
10747 case CODE_FOR_avx512vl_loadv8hi_mask:
10748 aligned_mem = true;
10749 break;
10750 default:
10751 break;
10752 }
10753 /* FALLTHRU */
10754 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10755 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10756 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10757 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10758 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10759 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10760 case V16SI_FTYPE_PCINT_V16SI_UHI:
10761 case V8SI_FTYPE_PCINT_V8SI_UQI:
10762 case V4SI_FTYPE_PCINT_V4SI_UQI:
10763 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10764 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10765 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10766 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10767 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10768 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10769 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10770 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10771 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10772 nargs = 3;
10773 klass = load;
10774 memory = 0;
10775 break;
2bf6d935
ML
10776 default:
10777 gcc_unreachable ();
10778 }
10779
10780 gcc_assert (nargs <= ARRAY_SIZE (args));
10781
10782 if (klass == store)
10783 {
10784 arg = CALL_EXPR_ARG (exp, 0);
10785 op = expand_normal (arg);
10786 gcc_assert (target == 0);
10787 if (memory)
10788 {
10789 op = ix86_zero_extend_to_Pmode (op);
10790 target = gen_rtx_MEM (tmode, op);
10791 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10792 on it. Try to improve it using get_pointer_alignment,
10793 and if the special builtin is one that requires strict
10794 mode alignment, also from its GET_MODE_ALIGNMENT.
10795 Failure to do so could lead to ix86_legitimate_combined_insn
10796 rejecting all changes to such insns. */
10797 unsigned int align = get_pointer_alignment (arg);
10798 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10799 align = GET_MODE_ALIGNMENT (tmode);
10800 if (MEM_ALIGN (target) < align)
10801 set_mem_align (target, align);
10802 }
10803 else
10804 target = force_reg (tmode, op);
10805 arg_adjust = 1;
10806 }
10807 else
10808 {
10809 arg_adjust = 0;
10810 if (optimize
10811 || target == 0
10812 || !register_operand (target, tmode)
10813 || GET_MODE (target) != tmode)
10814 target = gen_reg_rtx (tmode);
10815 }
10816
10817 for (i = 0; i < nargs; i++)
10818 {
10819 machine_mode mode = insn_p->operand[i + 1].mode;
2bf6d935
ML
10820
10821 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10822 op = expand_normal (arg);
2bf6d935 10823
776a37f6 10824 if (i == memory)
2bf6d935 10825 {
776a37f6 10826 /* This must be the memory operand. */
10827 op = ix86_zero_extend_to_Pmode (op);
10828 op = gen_rtx_MEM (mode, op);
10829 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10830 on it. Try to improve it using get_pointer_alignment,
10831 and if the special builtin is one that requires strict
10832 mode alignment, also from its GET_MODE_ALIGNMENT.
10833 Failure to do so could lead to ix86_legitimate_combined_insn
10834 rejecting all changes to such insns. */
10835 unsigned int align = get_pointer_alignment (arg);
10836 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10837 align = GET_MODE_ALIGNMENT (mode);
10838 if (MEM_ALIGN (op) < align)
10839 set_mem_align (op, align);
2bf6d935
ML
10840 }
10841 else
10842 {
776a37f6 10843 /* This must be a register. */
10844 if (VECTOR_MODE_P (mode))
10845 op = safe_vector_operand (op, mode);
2bf6d935 10846
776a37f6 10847 op = fixup_modeless_constant (op, mode);
2bf6d935 10848
776a37f6 10849 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10850 op = copy_to_mode_reg (mode, op);
10851 else
10852 {
10853 op = copy_to_reg (op);
10854 op = lowpart_subreg (mode, op, GET_MODE (op));
2bf6d935
ML
10855 }
10856 }
10857
10858 args[i].op = op;
10859 args[i].mode = mode;
10860 }
10861
10862 switch (nargs)
10863 {
10864 case 0:
10865 pat = GEN_FCN (icode) (target);
10866 break;
10867 case 1:
10868 pat = GEN_FCN (icode) (target, args[0].op);
10869 break;
10870 case 2:
10871 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10872 break;
10873 case 3:
10874 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10875 break;
10876 default:
10877 gcc_unreachable ();
10878 }
10879
10880 if (! pat)
10881 return 0;
10882 emit_insn (pat);
10883 return klass == store ? 0 : target;
10884}
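/* Editor's sketch (not part of the original source): the aligned_mem
   handling above covers the non-temporal store builtins; for example
   the AVX _mm256_stream_pd intrinsic (CODE_FOR_avx_movntv4df) stores
   through a pointer that must be 32-byte aligned, and the expander
   raises MEM_ALIGN on the generated MEM accordingly:

     #include <immintrin.h>

     void
     stream_store (double *p, __m256d v)
     {
       _mm256_stream_pd (p, v);
     }
   */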
10885
10886/* Return the integer constant in ARG. Constrain it to be in the range
10887 of the subparts of VEC_TYPE; issue an error if not. */
10888
10889static int
10890get_element_number (tree vec_type, tree arg)
10891{
10892 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10893
10894 if (!tree_fits_uhwi_p (arg)
10895 || (elt = tree_to_uhwi (arg), elt > max))
10896 {
a9c697b8
MS
10897 error ("selector must be an integer constant in the range "
10898 "[0, %wi]", max);
2bf6d935
ML
10899 return 0;
10900 }
10901
10902 return elt;
10903}
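/* Editor's note (illustrative): for a V4SF argument,
   TYPE_VECTOR_SUBPARTS is 4, so max is 3; a selector of 4 triggers the
   error above and the function falls back to returning element
   number 0.  */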
10904
10905/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10906 ix86_expand_vector_init. We DO have language-level syntax for this, in
10907 the form of (type){ init-list }. Except that since we can't place emms
10908 instructions from inside the compiler, we can't allow the use of MMX
10909 registers unless the user explicitly asks for it. So we do *not* define
10910 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10911 we have builtins invoked by mmintrin.h that give us license to emit
10912 these sorts of instructions. */
10913
10914static rtx
10915ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10916{
10917 machine_mode tmode = TYPE_MODE (type);
10918 machine_mode inner_mode = GET_MODE_INNER (tmode);
10919 int i, n_elt = GET_MODE_NUNITS (tmode);
10920 rtvec v = rtvec_alloc (n_elt);
10921
10922 gcc_assert (VECTOR_MODE_P (tmode));
10923 gcc_assert (call_expr_nargs (exp) == n_elt);
10924
10925 for (i = 0; i < n_elt; ++i)
10926 {
10927 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10928 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10929 }
10930
10931 if (!target || !register_operand (target, tmode))
10932 target = gen_reg_rtx (tmode);
10933
10934 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10935 return target;
10936}
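/* Editor's sketch (not part of the original source): the MMX init
   builtins funnelled through the routine above are the ones used by
   <mmintrin.h>, e.g. _mm_set_pi32, which GCC implements on top of
   __builtin_ia32_vec_init_v2si (assuming -mmmx):

     #include <mmintrin.h>

     __m64
     make_pair (int hi, int lo)
     {
       return _mm_set_pi32 (hi, lo);
     }

   This is the builtin-level spelling of the language-level
   (__v2si) { lo, hi } initializer mentioned in the comment above.  */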
10937
10938/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10939 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10940 had a language-level syntax for referencing vector elements. */
10941
10942static rtx
10943ix86_expand_vec_ext_builtin (tree exp, rtx target)
10944{
10945 machine_mode tmode, mode0;
10946 tree arg0, arg1;
10947 int elt;
10948 rtx op0;
10949
10950 arg0 = CALL_EXPR_ARG (exp, 0);
10951 arg1 = CALL_EXPR_ARG (exp, 1);
10952
10953 op0 = expand_normal (arg0);
10954 elt = get_element_number (TREE_TYPE (arg0), arg1);
10955
10956 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10957 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10958 gcc_assert (VECTOR_MODE_P (mode0));
10959
10960 op0 = force_reg (mode0, op0);
10961
10962 if (optimize || !target || !register_operand (target, tmode))
10963 target = gen_reg_rtx (tmode);
10964
10965 ix86_expand_vector_extract (true, target, op0, elt);
10966
10967 return target;
10968}
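/* Editor's sketch (not part of the original source): an extract wrapper
   that reaches the routine above is e.g. _mm_extract_pi16 from
   <xmmintrin.h>, which GCC implements via __builtin_ia32_vec_ext_v4hi
   (assuming -msse):

     #include <xmmintrin.h>

     int
     third_element (__m64 v)
     {
       return _mm_extract_pi16 (v, 2);
     }

   The element number 2 is validated by get_element_number against the
   four V4HI subparts.  */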
10969
10970/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10971 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10972 a language-level syntax for referencing vector elements. */
10973
10974static rtx
10975ix86_expand_vec_set_builtin (tree exp)
10976{
10977 machine_mode tmode, mode1;
10978 tree arg0, arg1, arg2;
10979 int elt;
10980 rtx op0, op1, target;
10981
10982 arg0 = CALL_EXPR_ARG (exp, 0);
10983 arg1 = CALL_EXPR_ARG (exp, 1);
10984 arg2 = CALL_EXPR_ARG (exp, 2);
10985
10986 tmode = TYPE_MODE (TREE_TYPE (arg0));
10987 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10988 gcc_assert (VECTOR_MODE_P (tmode));
10989
10990 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10991 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10992 elt = get_element_number (TREE_TYPE (arg0), arg2);
10993
10994 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10995 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10996
10997 op0 = force_reg (tmode, op0);
10998 op1 = force_reg (mode1, op1);
10999
11000 /* OP0 is the source of these builtin functions and shouldn't be
11001 modified. Create a copy, use it and return it as target. */
11002 target = gen_reg_rtx (tmode);
11003 emit_move_insn (target, op0);
11004 ix86_expand_vector_set (true, target, op1, elt);
11005
11006 return target;
11007}
11008
11009/* Expand an expression EXP that calls a built-in function,
11010 with result going to TARGET if that's convenient
11011 (and in mode MODE if that's convenient).
11012 SUBTARGET may be used as the target for computing one of EXP's operands.
11013 IGNORE is nonzero if the value is to be ignored. */
11014
11015rtx
11016ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
11017 machine_mode mode, int ignore)
11018{
11019 size_t i;
11020 enum insn_code icode, icode2;
11021 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11022 tree arg0, arg1, arg2, arg3, arg4;
11023 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
11024 machine_mode mode0, mode1, mode2, mode3, mode4;
4d732405 11025 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
2bf6d935
ML
11026
11027 /* For CPU builtins that can be folded, fold first and expand the fold. */
11028 switch (fcode)
11029 {
11030 case IX86_BUILTIN_CPU_INIT:
11031 {
11032 /* Make it call __cpu_indicator_init in libgcc. */
11033 tree call_expr, fndecl, type;
11034 type = build_function_type_list (integer_type_node, NULL_TREE);
11035 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11036 call_expr = build_call_expr (fndecl, 0);
11037 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11038 }
11039 case IX86_BUILTIN_CPU_IS:
11040 case IX86_BUILTIN_CPU_SUPPORTS:
11041 {
11042 tree arg0 = CALL_EXPR_ARG (exp, 0);
11043 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11044 gcc_assert (fold_expr != NULL_TREE);
11045 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11046 }
11047 }
11048
11049 HOST_WIDE_INT isa = ix86_isa_flags;
11050 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11051 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11052 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11053 /* The general case is we require all the ISAs specified in bisa{,2}
11054 to be enabled.
11055 The exceptions are:
11056 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11057 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11058 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
ca813880 11059 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11060 OPTION_MASK_ISA2_AVXVNNI
a13d6ec8
JJ
11061 where for each such pair it is sufficient if either of the ISAs is
11062 enabled; if the pair is ored with other options, those others must
11062 be enabled as well.
11063 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
2bf6d935
ML
11064 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11065 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11066 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11067 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
db3f0d21 11068
2bf6d935
ML
11069 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11070 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11071 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11072 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
db3f0d21 11073
2bf6d935
ML
11074 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11075 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11076 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11077 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
6058b874 11078
ca813880 11079 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11080 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11081 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
11082 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11083 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11084 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
11085 {
11086 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
11087 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
11088 }
11089
db3f0d21
UB
11090 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
11091 /* __builtin_ia32_maskmovq requires MMX registers. */
6058b874 11092 && fcode != IX86_BUILTIN_MASKMOVQ)
a13d6ec8
JJ
11093 {
11094 bisa &= ~OPTION_MASK_ISA_MMX;
11095 bisa |= OPTION_MASK_ISA_SSE2;
ecfdb16c 11096 }
6058b874 11097
2bf6d935
ML
11098 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11099 {
11100 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11101 if (TARGET_ABI_X32)
11102 bisa |= OPTION_MASK_ABI_X32;
11103 else
11104 bisa |= OPTION_MASK_ABI_64;
11105 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
46e6341f
JJ
11106 (enum fpmath_unit) 0,
11107 (enum prefer_vector_width) 0,
11108 false, add_abi_p);
2bf6d935
ML
11109 if (!opts)
11110 error ("%qE needs unknown isa option", fndecl);
11111 else
11112 {
11113 gcc_assert (opts != NULL);
11114 error ("%qE needs isa option %s", fndecl, opts);
11115 free (opts);
11116 }
11117 return expand_call (exp, target, ignore);
11118 }
11119
11120 switch (fcode)
11121 {
11122 case IX86_BUILTIN_MASKMOVQ:
11123 case IX86_BUILTIN_MASKMOVDQU:
11124 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11125 ? CODE_FOR_mmx_maskmovq
11126 : CODE_FOR_sse2_maskmovdqu);
11127 /* Note the arg order is different from the operand order. */
11128 arg1 = CALL_EXPR_ARG (exp, 0);
11129 arg2 = CALL_EXPR_ARG (exp, 1);
11130 arg0 = CALL_EXPR_ARG (exp, 2);
11131 op0 = expand_normal (arg0);
11132 op1 = expand_normal (arg1);
11133 op2 = expand_normal (arg2);
11134 mode0 = insn_data[icode].operand[0].mode;
11135 mode1 = insn_data[icode].operand[1].mode;
11136 mode2 = insn_data[icode].operand[2].mode;
11137
11138 op0 = ix86_zero_extend_to_Pmode (op0);
11139 op0 = gen_rtx_MEM (mode1, op0);
11140
11141 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11142 op0 = copy_to_mode_reg (mode0, op0);
11143 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11144 op1 = copy_to_mode_reg (mode1, op1);
11145 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11146 op2 = copy_to_mode_reg (mode2, op2);
11147 pat = GEN_FCN (icode) (op0, op1, op2);
11148 if (! pat)
11149 return 0;
11150 emit_insn (pat);
11151 return 0;
11152
11153 case IX86_BUILTIN_LDMXCSR:
11154 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11155 target = assign_386_stack_local (SImode, SLOT_TEMP);
11156 emit_move_insn (target, op0);
11157 emit_insn (gen_sse_ldmxcsr (target));
11158 return 0;
11159
11160 case IX86_BUILTIN_STMXCSR:
11161 target = assign_386_stack_local (SImode, SLOT_TEMP);
11162 emit_insn (gen_sse_stmxcsr (target));
11163 return copy_to_mode_reg (SImode, target);
11164
11165 case IX86_BUILTIN_CLFLUSH:
11166 arg0 = CALL_EXPR_ARG (exp, 0);
11167 op0 = expand_normal (arg0);
11168 icode = CODE_FOR_sse2_clflush;
11169 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11170 op0 = ix86_zero_extend_to_Pmode (op0);
11171
11172 emit_insn (gen_sse2_clflush (op0));
11173 return 0;
11174
11175 case IX86_BUILTIN_CLWB:
11176 arg0 = CALL_EXPR_ARG (exp, 0);
11177 op0 = expand_normal (arg0);
11178 icode = CODE_FOR_clwb;
11179 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11180 op0 = ix86_zero_extend_to_Pmode (op0);
11181
11182 emit_insn (gen_clwb (op0));
11183 return 0;
11184
11185 case IX86_BUILTIN_CLFLUSHOPT:
11186 arg0 = CALL_EXPR_ARG (exp, 0);
11187 op0 = expand_normal (arg0);
11188 icode = CODE_FOR_clflushopt;
11189 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11190 op0 = ix86_zero_extend_to_Pmode (op0);
11191
11192 emit_insn (gen_clflushopt (op0));
11193 return 0;
11194
11195 case IX86_BUILTIN_MONITOR:
11196 case IX86_BUILTIN_MONITORX:
11197 arg0 = CALL_EXPR_ARG (exp, 0);
11198 arg1 = CALL_EXPR_ARG (exp, 1);
11199 arg2 = CALL_EXPR_ARG (exp, 2);
11200 op0 = expand_normal (arg0);
11201 op1 = expand_normal (arg1);
11202 op2 = expand_normal (arg2);
11203 if (!REG_P (op0))
11204 op0 = ix86_zero_extend_to_Pmode (op0);
11205 if (!REG_P (op1))
11206 op1 = copy_to_mode_reg (SImode, op1);
11207 if (!REG_P (op2))
11208 op2 = copy_to_mode_reg (SImode, op2);
11209
11210 emit_insn (fcode == IX86_BUILTIN_MONITOR
a963ca40
UB
11211 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11212 : gen_monitorx (Pmode, op0, op1, op2));
2bf6d935
ML
11213 return 0;
11214
11215 case IX86_BUILTIN_MWAIT:
11216 arg0 = CALL_EXPR_ARG (exp, 0);
11217 arg1 = CALL_EXPR_ARG (exp, 1);
11218 op0 = expand_normal (arg0);
11219 op1 = expand_normal (arg1);
11220 if (!REG_P (op0))
11221 op0 = copy_to_mode_reg (SImode, op0);
11222 if (!REG_P (op1))
11223 op1 = copy_to_mode_reg (SImode, op1);
11224 emit_insn (gen_sse3_mwait (op0, op1));
11225 return 0;
11226
11227 case IX86_BUILTIN_MWAITX:
11228 arg0 = CALL_EXPR_ARG (exp, 0);
11229 arg1 = CALL_EXPR_ARG (exp, 1);
11230 arg2 = CALL_EXPR_ARG (exp, 2);
11231 op0 = expand_normal (arg0);
11232 op1 = expand_normal (arg1);
11233 op2 = expand_normal (arg2);
11234 if (!REG_P (op0))
11235 op0 = copy_to_mode_reg (SImode, op0);
11236 if (!REG_P (op1))
11237 op1 = copy_to_mode_reg (SImode, op1);
11238 if (!REG_P (op2))
11239 op2 = copy_to_mode_reg (SImode, op2);
11240 emit_insn (gen_mwaitx (op0, op1, op2));
11241 return 0;
11242
11243 case IX86_BUILTIN_UMONITOR:
11244 arg0 = CALL_EXPR_ARG (exp, 0);
11245 op0 = expand_normal (arg0);
11246
11247 op0 = ix86_zero_extend_to_Pmode (op0);
987a3082 11248 emit_insn (gen_umonitor (Pmode, op0));
2bf6d935
ML
11249 return 0;
11250
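 /* UMWAIT and TPAUSE take a 32-bit control operand and a 64-bit counter;
    on 64-bit targets the DImode counter is split into two SImode halves
    for the insn, and the return value is derived from the carry flag.  */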
11251 case IX86_BUILTIN_UMWAIT:
11252 case IX86_BUILTIN_TPAUSE:
11253 arg0 = CALL_EXPR_ARG (exp, 0);
11254 arg1 = CALL_EXPR_ARG (exp, 1);
11255 op0 = expand_normal (arg0);
11256 op1 = expand_normal (arg1);
11257
11258 if (!REG_P (op0))
11259 op0 = copy_to_mode_reg (SImode, op0);
11260
11261 op1 = force_reg (DImode, op1);
11262
11263 if (TARGET_64BIT)
11264 {
11265 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11266 NULL, 1, OPTAB_DIRECT);
11267 switch (fcode)
11268 {
11269 case IX86_BUILTIN_UMWAIT:
11270 icode = CODE_FOR_umwait_rex64;
11271 break;
11272 case IX86_BUILTIN_TPAUSE:
11273 icode = CODE_FOR_tpause_rex64;
11274 break;
11275 default:
11276 gcc_unreachable ();
11277 }
11278
11279 op2 = gen_lowpart (SImode, op2);
11280 op1 = gen_lowpart (SImode, op1);
11281 pat = GEN_FCN (icode) (op0, op1, op2);
11282 }
11283 else
11284 {
11285 switch (fcode)
11286 {
11287 case IX86_BUILTIN_UMWAIT:
11288 icode = CODE_FOR_umwait;
11289 break;
11290 case IX86_BUILTIN_TPAUSE:
11291 icode = CODE_FOR_tpause;
11292 break;
11293 default:
11294 gcc_unreachable ();
11295 }
11296 pat = GEN_FCN (icode) (op0, op1);
11297 }
11298
11299 if (!pat)
11300 return 0;
11301
11302 emit_insn (pat);
11303
11304 if (target == 0
11305 || !register_operand (target, QImode))
11306 target = gen_reg_rtx (QImode);
11307
11308 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11309 const0_rtx);
11310 emit_insn (gen_rtx_SET (target, pat));
11311
11312 return target;
11313
299a53d7 11314 case IX86_BUILTIN_TESTUI:
11315 emit_insn (gen_testui ());
11316
11317 if (target == 0
11318 || !register_operand (target, QImode))
11319 target = gen_reg_rtx (QImode);
11320
11321 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11322 const0_rtx);
11323 emit_insn (gen_rtx_SET (target, pat));
11324
11325 return target;
11326
2bf6d935
ML
11327 case IX86_BUILTIN_CLZERO:
11328 arg0 = CALL_EXPR_ARG (exp, 0);
11329 op0 = expand_normal (arg0);
11330 if (!REG_P (op0))
11331 op0 = ix86_zero_extend_to_Pmode (op0);
a963ca40 11332 emit_insn (gen_clzero (Pmode, op0));
2bf6d935
ML
11333 return 0;
11334
11335 case IX86_BUILTIN_CLDEMOTE:
11336 arg0 = CALL_EXPR_ARG (exp, 0);
11337 op0 = expand_normal (arg0);
11338 icode = CODE_FOR_cldemote;
11339 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11340 op0 = ix86_zero_extend_to_Pmode (op0);
11341
11342 emit_insn (gen_cldemote (op0));
11343 return 0;
11344
632a2f50 11345 case IX86_BUILTIN_LOADIWKEY:
11346 {
11347 arg0 = CALL_EXPR_ARG (exp, 0);
11348 arg1 = CALL_EXPR_ARG (exp, 1);
11349 arg2 = CALL_EXPR_ARG (exp, 2);
11350 arg3 = CALL_EXPR_ARG (exp, 3);
11351
11352 op0 = expand_normal (arg0);
11353 op1 = expand_normal (arg1);
11354 op2 = expand_normal (arg2);
11355 op3 = expand_normal (arg3);
11356
11357 if (!REG_P (op0))
11358 op0 = copy_to_mode_reg (V2DImode, op0);
11359 if (!REG_P (op1))
11360 op1 = copy_to_mode_reg (V2DImode, op1);
11361 if (!REG_P (op2))
11362 op2 = copy_to_mode_reg (V2DImode, op2);
11363 if (!REG_P (op3))
11364 op3 = copy_to_mode_reg (SImode, op3);
11365
11366 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
11367
11368 return 0;
11369 }
11370
11371 case IX86_BUILTIN_AESDEC128KLU8:
11372 icode = CODE_FOR_aesdec128klu8;
11373 goto aesdecenc_expand;
11374
11375 case IX86_BUILTIN_AESDEC256KLU8:
11376 icode = CODE_FOR_aesdec256klu8;
11377 goto aesdecenc_expand;
11378
11379 case IX86_BUILTIN_AESENC128KLU8:
11380 icode = CODE_FOR_aesenc128klu8;
11381 goto aesdecenc_expand;
11382
11383 case IX86_BUILTIN_AESENC256KLU8:
11384 icode = CODE_FOR_aesenc256klu8;
11385
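 /* Shared expansion for the Key Locker AESENC/AESDEC builtins: idata is
    loaded into a register, the handle is referenced as a BLKmode memory
    operand, the status bit is read from ZF into the return value, and
    the processed block is stored back through odata.  */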
11386 aesdecenc_expand:
11387
11388 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
11389 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
11390 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11391
11392 op0 = expand_normal (arg0);
11393 op1 = expand_normal (arg1);
11394 op2 = expand_normal (arg2);
11395
11396 if (!address_operand (op0, V2DImode))
11397 {
11398 op0 = convert_memory_address (Pmode, op0);
11399 op0 = copy_addr_to_reg (op0);
11400 }
11401 op0 = gen_rtx_MEM (V2DImode, op0);
11402
11403 if (!REG_P (op1))
11404 op1 = copy_to_mode_reg (V2DImode, op1);
11405
11406 if (!address_operand (op2, VOIDmode))
11407 {
11408 op2 = convert_memory_address (Pmode, op2);
11409 op2 = copy_addr_to_reg (op2);
11410 }
11411 op2 = gen_rtx_MEM (BLKmode, op2);
11412
11413 emit_insn (GEN_FCN (icode) (op1, op1, op2));
11414
11415 if (target == 0)
11416 target = gen_reg_rtx (QImode);
11417
11418 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11419 const0_rtx);
11420 emit_insn (gen_rtx_SET (target, pat));
11421
11422 emit_insn (gen_rtx_SET (op0, op1));
11423
11424 return target;
11425
11426 case IX86_BUILTIN_AESDECWIDE128KLU8:
11427 icode = CODE_FOR_aesdecwide128klu8;
11428 goto wideaesdecenc_expand;
11429
11430 case IX86_BUILTIN_AESDECWIDE256KLU8:
11431 icode = CODE_FOR_aesdecwide256klu8;
11432 goto wideaesdecenc_expand;
11433
11434 case IX86_BUILTIN_AESENCWIDE128KLU8:
11435 icode = CODE_FOR_aesencwide128klu8;
11436 goto wideaesdecenc_expand;
11437
11438 case IX86_BUILTIN_AESENCWIDE256KLU8:
11439 icode = CODE_FOR_aesencwide256klu8;
11440
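 /* The WIDE variants process eight 128-bit blocks at once and the insn
    uses xmm0-xmm7 implicitly, so the blocks are loaded into and stored
    back from those hard registers around the insn.  */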
11441 wideaesdecenc_expand:
11442
11443 rtx xmm_regs[8];
11444 rtx op;
11445
11446 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
11447 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
11448 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11449
11450 op0 = expand_normal (arg0);
11451 op1 = expand_normal (arg1);
11452 op2 = expand_normal (arg2);
11453
11454 if (!address_operand (op2, VOIDmode))
11455 {
11456 op2 = convert_memory_address (Pmode, op2);
11457 op2 = copy_addr_to_reg (op2);
11458 }
11459 op2 = gen_rtx_MEM (BLKmode, op2);
11460
11461 for (i = 0; i < 8; i++)
11462 {
11463 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11464
11465 op = gen_rtx_MEM (V2DImode,
11466 plus_constant (Pmode, op1, (i * 16)));
11467
11468 emit_move_insn (xmm_regs[i], op);
11469 }
11470
11471 emit_insn (GEN_FCN (icode) (op2));
11472
11473 if (target == 0)
11474 target = gen_reg_rtx (QImode);
11475
11476 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11477 const0_rtx);
11478 emit_insn (gen_rtx_SET (target, pat));
11479
11480 for (i = 0; i < 8; i++)
11481 {
11482 op = gen_rtx_MEM (V2DImode,
11483 plus_constant (Pmode, op0, (i * 16)));
11484 emit_move_insn (op, xmm_regs[i]);
11485 }
11486
11487 return target;
11488
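 /* ENCODEKEY128/ENCODEKEY256 also use fixed xmm registers: the key is
    placed in xmm0 (and xmm1 for the 256-bit variant) and the resulting
    handle is copied from the low xmm registers into the output buffer.  */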
11489 case IX86_BUILTIN_ENCODEKEY128U32:
11490 {
11491 rtx op, xmm_regs[7];
11492
11493 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11494 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
11495 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
11496
11497 op0 = expand_normal (arg0);
11498 op1 = expand_normal (arg1);
11499 op2 = expand_normal (arg2);
11500
11501 if (!REG_P (op0))
11502 op0 = copy_to_mode_reg (SImode, op0);
11503
11504 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11505 emit_move_insn (op, op1);
11506
11507 for (i = 0; i < 3; i++)
11508 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11509
11510 if (target == 0)
11511 target = gen_reg_rtx (SImode);
11512
11513 emit_insn (gen_encodekey128u32 (target, op0));
11514
11515 for (i = 0; i < 3; i++)
11516 {
11517 op = gen_rtx_MEM (V2DImode,
11518 plus_constant (Pmode, op2, (i * 16)));
11519 emit_move_insn (op, xmm_regs[i]);
11520 }
11521
11522 return target;
11523 }
11524 case IX86_BUILTIN_ENCODEKEY256U32:
11525 {
11526 rtx op, xmm_regs[7];
11527
11528 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11529 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
11530 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
11531 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
11532
11533 op0 = expand_normal (arg0);
11534 op1 = expand_normal (arg1);
11535 op2 = expand_normal (arg2);
11536 op3 = expand_normal (arg3);
11537
11538 if (!REG_P (op0))
11539 op0 = copy_to_mode_reg (SImode, op0);
11540
11541 /* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
11542 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11543 emit_move_insn (op, op1);
11544 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
11545 emit_move_insn (op, op2);
11546
11547 for (i = 0; i < 4; i++)
11548 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11549
11550 if (target == 0)
11551 target = gen_reg_rtx (SImode);
11552
11553 emit_insn (gen_encodekey256u32 (target, op0));
11554
11555 for (i = 0; i < 4; i++)
11556 {
11557 op = gen_rtx_MEM (V2DImode,
11558 plus_constant (Pmode, op3, (i * 16)));
11559 emit_move_insn (op, xmm_regs[i]);
11560 }
11561
11562 return target;
11563 }
11564
2bf6d935
ML
11565 case IX86_BUILTIN_VEC_INIT_V2SI:
11566 case IX86_BUILTIN_VEC_INIT_V4HI:
11567 case IX86_BUILTIN_VEC_INIT_V8QI:
11568 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11569
11570 case IX86_BUILTIN_VEC_EXT_V2DF:
11571 case IX86_BUILTIN_VEC_EXT_V2DI:
11572 case IX86_BUILTIN_VEC_EXT_V4SF:
11573 case IX86_BUILTIN_VEC_EXT_V4SI:
11574 case IX86_BUILTIN_VEC_EXT_V8HI:
11575 case IX86_BUILTIN_VEC_EXT_V2SI:
11576 case IX86_BUILTIN_VEC_EXT_V4HI:
11577 case IX86_BUILTIN_VEC_EXT_V16QI:
11578 return ix86_expand_vec_ext_builtin (exp, target);
11579
11580 case IX86_BUILTIN_VEC_SET_V2DI:
11581 case IX86_BUILTIN_VEC_SET_V4SF:
11582 case IX86_BUILTIN_VEC_SET_V4SI:
11583 case IX86_BUILTIN_VEC_SET_V8HI:
11584 case IX86_BUILTIN_VEC_SET_V4HI:
11585 case IX86_BUILTIN_VEC_SET_V16QI:
11586 return ix86_expand_vec_set_builtin (exp);
11587
11588 case IX86_BUILTIN_NANQ:
11589 case IX86_BUILTIN_NANSQ:
11590 return expand_call (exp, target, ignore);
11591
11592 case IX86_BUILTIN_RDPID:
11593
11594 op0 = gen_reg_rtx (word_mode);
11595
11596 if (TARGET_64BIT)
11597 {
11598 insn = gen_rdpid_rex64 (op0);
11599 op0 = convert_to_mode (SImode, op0, 1);
11600 }
11601 else
11602 insn = gen_rdpid (op0);
11603
11604 emit_insn (insn);
11605
11606 if (target == 0
11607 || !register_operand (target, SImode))
11608 target = gen_reg_rtx (SImode);
11609
11610 emit_move_insn (target, op0);
11611 return target;
11612
e21b52af
HL
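 /* The VP2INTERSECT builtins produce a pair of mask registers, modelled
    as a P2HImode or P2QImode value; its low and high parts are stored
    through the two pointer arguments.  */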
11613 case IX86_BUILTIN_2INTERSECTD512:
11614 case IX86_BUILTIN_2INTERSECTQ512:
11615 case IX86_BUILTIN_2INTERSECTD256:
11616 case IX86_BUILTIN_2INTERSECTQ256:
11617 case IX86_BUILTIN_2INTERSECTD128:
11618 case IX86_BUILTIN_2INTERSECTQ128:
11619 arg0 = CALL_EXPR_ARG (exp, 0);
11620 arg1 = CALL_EXPR_ARG (exp, 1);
11621 arg2 = CALL_EXPR_ARG (exp, 2);
11622 arg3 = CALL_EXPR_ARG (exp, 3);
11623 op0 = expand_normal (arg0);
11624 op1 = expand_normal (arg1);
11625 op2 = expand_normal (arg2);
11626 op3 = expand_normal (arg3);
11627
11628 if (!address_operand (op0, VOIDmode))
11629 {
11630 op0 = convert_memory_address (Pmode, op0);
11631 op0 = copy_addr_to_reg (op0);
11632 }
11633 if (!address_operand (op1, VOIDmode))
11634 {
11635 op1 = convert_memory_address (Pmode, op1);
11636 op1 = copy_addr_to_reg (op1);
11637 }
11638
11639 switch (fcode)
11640 {
11641 case IX86_BUILTIN_2INTERSECTD512:
11642 mode4 = P2HImode;
11643 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11644 break;
11645 case IX86_BUILTIN_2INTERSECTQ512:
11646 mode4 = P2QImode;
11647 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11648 break;
11649 case IX86_BUILTIN_2INTERSECTD256:
11650 mode4 = P2QImode;
11651 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11652 break;
11653 case IX86_BUILTIN_2INTERSECTQ256:
11654 mode4 = P2QImode;
11655 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11656 break;
11657 case IX86_BUILTIN_2INTERSECTD128:
11658 mode4 = P2QImode;
11659 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11660 break;
11661 case IX86_BUILTIN_2INTERSECTQ128:
11662 mode4 = P2QImode;
11663 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11664 break;
11665 default:
11666 gcc_unreachable ();
11667 }
11668
11669 mode2 = insn_data[icode].operand[1].mode;
11670 mode3 = insn_data[icode].operand[2].mode;
11671 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11672 op2 = copy_to_mode_reg (mode2, op2);
11673 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11674 op3 = copy_to_mode_reg (mode3, op3);
11675
11676 op4 = gen_reg_rtx (mode4);
11677 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11678 mode0 = mode4 == P2HImode ? HImode : QImode;
11679 emit_move_insn (gen_rtx_MEM (mode0, op0),
11680 gen_lowpart (mode0, op4));
11681 emit_move_insn (gen_rtx_MEM (mode0, op1),
11682 gen_highpart (mode0, op4));
11683
11684 return 0;
11685
2bf6d935
ML
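 /* RDPMC, RDTSC, RDTSCP and XGETBV return their 64-bit result in
    EDX:EAX; on 64-bit targets the two halves arrive in separate DImode
    registers and are recombined below with a shift and an ior.  */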
11686 case IX86_BUILTIN_RDPMC:
11687 case IX86_BUILTIN_RDTSC:
11688 case IX86_BUILTIN_RDTSCP:
11689 case IX86_BUILTIN_XGETBV:
11690
11691 op0 = gen_reg_rtx (DImode);
11692 op1 = gen_reg_rtx (DImode);
11693
11694 if (fcode == IX86_BUILTIN_RDPMC)
11695 {
11696 arg0 = CALL_EXPR_ARG (exp, 0);
11697 op2 = expand_normal (arg0);
11698 if (!register_operand (op2, SImode))
11699 op2 = copy_to_mode_reg (SImode, op2);
11700
11701 insn = (TARGET_64BIT
11702 ? gen_rdpmc_rex64 (op0, op1, op2)
11703 : gen_rdpmc (op0, op2));
11704 emit_insn (insn);
11705 }
11706 else if (fcode == IX86_BUILTIN_XGETBV)
11707 {
11708 arg0 = CALL_EXPR_ARG (exp, 0);
11709 op2 = expand_normal (arg0);
11710 if (!register_operand (op2, SImode))
11711 op2 = copy_to_mode_reg (SImode, op2);
11712
11713 insn = (TARGET_64BIT
11714 ? gen_xgetbv_rex64 (op0, op1, op2)
11715 : gen_xgetbv (op0, op2));
11716 emit_insn (insn);
11717 }
11718 else if (fcode == IX86_BUILTIN_RDTSC)
11719 {
11720 insn = (TARGET_64BIT
11721 ? gen_rdtsc_rex64 (op0, op1)
11722 : gen_rdtsc (op0));
11723 emit_insn (insn);
11724 }
11725 else
11726 {
11727 op2 = gen_reg_rtx (SImode);
11728
11729 insn = (TARGET_64BIT
11730 ? gen_rdtscp_rex64 (op0, op1, op2)
11731 : gen_rdtscp (op0, op2));
11732 emit_insn (insn);
11733
11734 arg0 = CALL_EXPR_ARG (exp, 0);
11735 op4 = expand_normal (arg0);
11736 if (!address_operand (op4, VOIDmode))
11737 {
11738 op4 = convert_memory_address (Pmode, op4);
11739 op4 = copy_addr_to_reg (op4);
11740 }
11741 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11742 }
11743
11744 if (target == 0
11745 || !register_operand (target, DImode))
11746 target = gen_reg_rtx (DImode);
11747
11748 if (TARGET_64BIT)
11749 {
11750 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11751 op1, 1, OPTAB_DIRECT);
11752 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11753 op0, 1, OPTAB_DIRECT);
11754 }
11755
11756 emit_move_insn (target, op0);
11757 return target;
11758
6a10feda
XG
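 /* MOVDIR64B, ENQCMD and ENQCMDS move a 64-byte (XImode) block from the
    source memory operand; ENQCMD/ENQCMDS additionally return a status
    bit taken from ZF.  */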
11759 case IX86_BUILTIN_ENQCMD:
11760 case IX86_BUILTIN_ENQCMDS:
2bf6d935
ML
11761 case IX86_BUILTIN_MOVDIR64B:
11762
11763 arg0 = CALL_EXPR_ARG (exp, 0);
11764 arg1 = CALL_EXPR_ARG (exp, 1);
11765 op0 = expand_normal (arg0);
11766 op1 = expand_normal (arg1);
11767
11768 op0 = ix86_zero_extend_to_Pmode (op0);
11769 if (!address_operand (op1, VOIDmode))
11770 {
11771 op1 = convert_memory_address (Pmode, op1);
11772 op1 = copy_addr_to_reg (op1);
11773 }
11774 op1 = gen_rtx_MEM (XImode, op1);
11775
6a10feda
XG
11776 if (fcode == IX86_BUILTIN_MOVDIR64B)
11777 {
11778 emit_insn (gen_movdir64b (Pmode, op0, op1));
11779 return 0;
11780 }
11781 else
11782 {
44320665
UB
11783 if (target == 0
11784 || !register_operand (target, SImode))
11785 target = gen_reg_rtx (SImode);
6a10feda 11786
6a10feda
XG
11787 emit_move_insn (target, const0_rtx);
11788 target = gen_rtx_SUBREG (QImode, target, 0);
11789
44320665
UB
11790 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
11791 ? UNSPECV_ENQCMD
11792 : UNSPECV_ENQCMDS);
11793 icode = code_for_enqcmd (unspecv, Pmode);
11794 emit_insn (GEN_FCN (icode) (op0, op1));
6a10feda 11795
44320665
UB
11796 emit_insn
11797 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11798 gen_rtx_fmt_ee (EQ, QImode,
11799 gen_rtx_REG (CCZmode, FLAGS_REG),
11800 const0_rtx)));
6a10feda
XG
11801 return SUBREG_REG (target);
11802 }
2bf6d935
ML
11803
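 /* The FXSAVE/FXRSTOR and FNSTENV/FLDENV builtins operate on an opaque
    memory area, so the pointer argument is wrapped in a BLKmode MEM.  */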
11804 case IX86_BUILTIN_FXSAVE:
11805 case IX86_BUILTIN_FXRSTOR:
11806 case IX86_BUILTIN_FXSAVE64:
11807 case IX86_BUILTIN_FXRSTOR64:
11808 case IX86_BUILTIN_FNSTENV:
11809 case IX86_BUILTIN_FLDENV:
11810 mode0 = BLKmode;
11811 switch (fcode)
11812 {
11813 case IX86_BUILTIN_FXSAVE:
11814 icode = CODE_FOR_fxsave;
11815 break;
11816 case IX86_BUILTIN_FXRSTOR:
11817 icode = CODE_FOR_fxrstor;
11818 break;
11819 case IX86_BUILTIN_FXSAVE64:
11820 icode = CODE_FOR_fxsave64;
11821 break;
11822 case IX86_BUILTIN_FXRSTOR64:
11823 icode = CODE_FOR_fxrstor64;
11824 break;
11825 case IX86_BUILTIN_FNSTENV:
11826 icode = CODE_FOR_fnstenv;
11827 break;
11828 case IX86_BUILTIN_FLDENV:
11829 icode = CODE_FOR_fldenv;
11830 break;
11831 default:
11832 gcc_unreachable ();
11833 }
11834
11835 arg0 = CALL_EXPR_ARG (exp, 0);
11836 op0 = expand_normal (arg0);
11837
11838 if (!address_operand (op0, VOIDmode))
11839 {
11840 op0 = convert_memory_address (Pmode, op0);
11841 op0 = copy_addr_to_reg (op0);
11842 }
11843 op0 = gen_rtx_MEM (mode0, op0);
11844
11845 pat = GEN_FCN (icode) (op0);
11846 if (pat)
11847 emit_insn (pat);
11848 return 0;
11849
11850 case IX86_BUILTIN_XSETBV:
11851 arg0 = CALL_EXPR_ARG (exp, 0);
11852 arg1 = CALL_EXPR_ARG (exp, 1);
11853 op0 = expand_normal (arg0);
11854 op1 = expand_normal (arg1);
11855
11856 if (!REG_P (op0))
11857 op0 = copy_to_mode_reg (SImode, op0);
11858
11859 op1 = force_reg (DImode, op1);
11860
11861 if (TARGET_64BIT)
11862 {
11863 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11864 NULL, 1, OPTAB_DIRECT);
11865
11866 icode = CODE_FOR_xsetbv_rex64;
11867
11868 op2 = gen_lowpart (SImode, op2);
11869 op1 = gen_lowpart (SImode, op1);
11870 pat = GEN_FCN (icode) (op0, op1, op2);
11871 }
11872 else
11873 {
11874 icode = CODE_FOR_xsetbv;
11875
11876 pat = GEN_FCN (icode) (op0, op1);
11877 }
11878 if (pat)
11879 emit_insn (pat);
11880 return 0;
11881
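 /* The XSAVE/XRSTOR family takes a pointer and a 64-bit feature mask;
    on 64-bit targets the DImode mask is split into two SImode halves
    before the pattern is emitted.  */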
11882 case IX86_BUILTIN_XSAVE:
11883 case IX86_BUILTIN_XRSTOR:
11884 case IX86_BUILTIN_XSAVE64:
11885 case IX86_BUILTIN_XRSTOR64:
11886 case IX86_BUILTIN_XSAVEOPT:
11887 case IX86_BUILTIN_XSAVEOPT64:
11888 case IX86_BUILTIN_XSAVES:
11889 case IX86_BUILTIN_XRSTORS:
11890 case IX86_BUILTIN_XSAVES64:
11891 case IX86_BUILTIN_XRSTORS64:
11892 case IX86_BUILTIN_XSAVEC:
11893 case IX86_BUILTIN_XSAVEC64:
11894 arg0 = CALL_EXPR_ARG (exp, 0);
11895 arg1 = CALL_EXPR_ARG (exp, 1);
11896 op0 = expand_normal (arg0);
11897 op1 = expand_normal (arg1);
11898
11899 if (!address_operand (op0, VOIDmode))
11900 {
11901 op0 = convert_memory_address (Pmode, op0);
11902 op0 = copy_addr_to_reg (op0);
11903 }
11904 op0 = gen_rtx_MEM (BLKmode, op0);
11905
11906 op1 = force_reg (DImode, op1);
11907
11908 if (TARGET_64BIT)
11909 {
11910 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11911 NULL, 1, OPTAB_DIRECT);
11912 switch (fcode)
11913 {
11914 case IX86_BUILTIN_XSAVE:
11915 icode = CODE_FOR_xsave_rex64;
11916 break;
11917 case IX86_BUILTIN_XRSTOR:
11918 icode = CODE_FOR_xrstor_rex64;
11919 break;
11920 case IX86_BUILTIN_XSAVE64:
11921 icode = CODE_FOR_xsave64;
11922 break;
11923 case IX86_BUILTIN_XRSTOR64:
11924 icode = CODE_FOR_xrstor64;
11925 break;
11926 case IX86_BUILTIN_XSAVEOPT:
11927 icode = CODE_FOR_xsaveopt_rex64;
11928 break;
11929 case IX86_BUILTIN_XSAVEOPT64:
11930 icode = CODE_FOR_xsaveopt64;
11931 break;
11932 case IX86_BUILTIN_XSAVES:
11933 icode = CODE_FOR_xsaves_rex64;
11934 break;
11935 case IX86_BUILTIN_XRSTORS:
11936 icode = CODE_FOR_xrstors_rex64;
11937 break;
11938 case IX86_BUILTIN_XSAVES64:
11939 icode = CODE_FOR_xsaves64;
11940 break;
11941 case IX86_BUILTIN_XRSTORS64:
11942 icode = CODE_FOR_xrstors64;
11943 break;
11944 case IX86_BUILTIN_XSAVEC:
11945 icode = CODE_FOR_xsavec_rex64;
11946 break;
11947 case IX86_BUILTIN_XSAVEC64:
11948 icode = CODE_FOR_xsavec64;
11949 break;
11950 default:
11951 gcc_unreachable ();
11952 }
11953
11954 op2 = gen_lowpart (SImode, op2);
11955 op1 = gen_lowpart (SImode, op1);
11956 pat = GEN_FCN (icode) (op0, op1, op2);
11957 }
11958 else
11959 {
11960 switch (fcode)
11961 {
11962 case IX86_BUILTIN_XSAVE:
11963 icode = CODE_FOR_xsave;
11964 break;
11965 case IX86_BUILTIN_XRSTOR:
11966 icode = CODE_FOR_xrstor;
11967 break;
11968 case IX86_BUILTIN_XSAVEOPT:
11969 icode = CODE_FOR_xsaveopt;
11970 break;
11971 case IX86_BUILTIN_XSAVES:
11972 icode = CODE_FOR_xsaves;
11973 break;
11974 case IX86_BUILTIN_XRSTORS:
11975 icode = CODE_FOR_xrstors;
11976 break;
11977 case IX86_BUILTIN_XSAVEC:
11978 icode = CODE_FOR_xsavec;
11979 break;
11980 default:
11981 gcc_unreachable ();
11982 }
11983 pat = GEN_FCN (icode) (op0, op1);
11984 }
11985
11986 if (pat)
11987 emit_insn (pat);
11988 return 0;
11989
11990 case IX86_BUILTIN_LLWPCB:
11991 arg0 = CALL_EXPR_ARG (exp, 0);
11992 op0 = expand_normal (arg0);
2398c206
UB
11993
11994 if (!register_operand (op0, Pmode))
2bf6d935 11995 op0 = ix86_zero_extend_to_Pmode (op0);
2398c206 11996 emit_insn (gen_lwp_llwpcb (Pmode, op0));
2bf6d935
ML
11997 return 0;
11998
11999 case IX86_BUILTIN_SLWPCB:
2bf6d935 12000 if (!target
2398c206 12001 || !register_operand (target, Pmode))
2bf6d935 12002 target = gen_reg_rtx (Pmode);
2398c206 12003 emit_insn (gen_lwp_slwpcb (Pmode, target));
2bf6d935
ML
12004 return target;
12005
2398c206
UB
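 /* LWPVAL and LWPINS take a register operand, a 32-bit value and a
    32-bit immediate; LWPINS additionally returns a status bit read
    from the flags register.  */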
12006 case IX86_BUILTIN_LWPVAL32:
12007 case IX86_BUILTIN_LWPVAL64:
12008 case IX86_BUILTIN_LWPINS32:
12009 case IX86_BUILTIN_LWPINS64:
12010 mode = ((fcode == IX86_BUILTIN_LWPVAL32
12011 || fcode == IX86_BUILTIN_LWPINS32)
12012 ? SImode : DImode);
12013
12014 if (fcode == IX86_BUILTIN_LWPVAL32
12015 || fcode == IX86_BUILTIN_LWPVAL64)
12016 icode = code_for_lwp_lwpval (mode);
12017 else
12018 icode = code_for_lwp_lwpins (mode);
12019
12020 arg0 = CALL_EXPR_ARG (exp, 0);
12021 arg1 = CALL_EXPR_ARG (exp, 1);
12022 arg2 = CALL_EXPR_ARG (exp, 2);
12023 op0 = expand_normal (arg0);
12024 op1 = expand_normal (arg1);
12025 op2 = expand_normal (arg2);
12026 mode0 = insn_data[icode].operand[0].mode;
12027
12028 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12029 op0 = copy_to_mode_reg (mode0, op0);
12030 if (!insn_data[icode].operand[1].predicate (op1, SImode))
12031 op1 = copy_to_mode_reg (SImode, op1);
12032
12033 if (!CONST_INT_P (op2))
12034 {
12035 error ("the last argument must be a 32-bit immediate");
12036 return const0_rtx;
12037 }
12038
12039 emit_insn (GEN_FCN (icode) (op0, op1, op2));
12040
12041 if (fcode == IX86_BUILTIN_LWPINS32
12042 || fcode == IX86_BUILTIN_LWPINS64)
12043 {
12044 if (target == 0
12045 || !nonimmediate_operand (target, QImode))
12046 target = gen_reg_rtx (QImode);
12047
12048 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12049 const0_rtx);
12050 emit_insn (gen_rtx_SET (target, pat));
12051
12052 return target;
12053 }
12054 else
12055 return 0;
12056
2bf6d935
ML
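 /* BEXTRI encodes the start bit in the low byte of the immediate and
    the field length in the next byte; degenerate ranges are folded to
    zero and over-long ranges are clamped so the extraction never reads
    past the operand width.  */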
12057 case IX86_BUILTIN_BEXTRI32:
12058 case IX86_BUILTIN_BEXTRI64:
9e026191
UB
12059 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
12060
2bf6d935
ML
12061 arg0 = CALL_EXPR_ARG (exp, 0);
12062 arg1 = CALL_EXPR_ARG (exp, 1);
12063 op0 = expand_normal (arg0);
12064 op1 = expand_normal (arg1);
9e026191 12065
2bf6d935 12066 if (!CONST_INT_P (op1))
9e026191
UB
12067 {
12068 error ("last argument must be an immediate");
12069 return const0_rtx;
12070 }
2bf6d935 12071 else
9e026191
UB
12072 {
12073 unsigned char lsb_index = UINTVAL (op1);
12074 unsigned char length = UINTVAL (op1) >> 8;
12075
12076 unsigned char bitsize = GET_MODE_BITSIZE (mode);
12077
12078 icode = code_for_tbm_bextri (mode);
2bf6d935
ML
12079
12080 mode1 = insn_data[icode].operand[1].mode;
12081 if (!insn_data[icode].operand[1].predicate (op0, mode1))
12082 op0 = copy_to_mode_reg (mode1, op0);
12083
12084 mode0 = insn_data[icode].operand[0].mode;
12085 if (target == 0
12086 || !register_operand (target, mode0))
12087 target = gen_reg_rtx (mode0);
12088
9e026191
UB
12089 if (length == 0 || lsb_index >= bitsize)
12090 {
12091 emit_move_insn (target, const0_rtx);
12092 return target;
12093 }
12094
12095 if (length + lsb_index > bitsize)
12096 length = bitsize - lsb_index;
12097
12098 op1 = GEN_INT (length);
12099 op2 = GEN_INT (lsb_index);
12100
12101 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
12102 return target;
12103 }
2bf6d935
ML
12104
12105 case IX86_BUILTIN_RDRAND16_STEP:
9e026191 12106 mode = HImode;
2bf6d935
ML
12107 goto rdrand_step;
12108
12109 case IX86_BUILTIN_RDRAND32_STEP:
9e026191 12110 mode = SImode;
2bf6d935
ML
12111 goto rdrand_step;
12112
12113 case IX86_BUILTIN_RDRAND64_STEP:
9e026191 12114 mode = DImode;
2bf6d935
ML
12115
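 /* Common expansion for the RDRAND step builtins: the random value is
    stored through the pointer argument and a success indicator derived
    from the carry flag is returned via a conditional move.  */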
12116rdrand_step:
12117 arg0 = CALL_EXPR_ARG (exp, 0);
12118 op1 = expand_normal (arg0);
12119 if (!address_operand (op1, VOIDmode))
12120 {
12121 op1 = convert_memory_address (Pmode, op1);
12122 op1 = copy_addr_to_reg (op1);
12123 }
12124
9e026191
UB
12125 op0 = gen_reg_rtx (mode);
12126 emit_insn (gen_rdrand (mode, op0));
2bf6d935 12127
9e026191 12128 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935 12129
9e026191 12130 op1 = force_reg (SImode, const1_rtx);
2bf6d935
ML
12131
12132 /* Emit SImode conditional move. */
9e026191 12133 if (mode == HImode)
2bf6d935
ML
12134 {
12135 if (TARGET_ZERO_EXTEND_WITH_AND
12136 && optimize_function_for_speed_p (cfun))
12137 {
12138 op2 = force_reg (SImode, const0_rtx);
12139
12140 emit_insn (gen_movstricthi
12141 (gen_lowpart (HImode, op2), op0));
12142 }
12143 else
12144 {
12145 op2 = gen_reg_rtx (SImode);
12146
12147 emit_insn (gen_zero_extendhisi2 (op2, op0));
12148 }
12149 }
9e026191 12150 else if (mode == SImode)
2bf6d935
ML
12151 op2 = op0;
12152 else
12153 op2 = gen_rtx_SUBREG (SImode, op0, 0);
12154
12155 if (target == 0
12156 || !register_operand (target, SImode))
12157 target = gen_reg_rtx (SImode);
12158
12159 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
12160 const0_rtx);
12161 emit_insn (gen_rtx_SET (target,
12162 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
12163 return target;
12164
12165 case IX86_BUILTIN_RDSEED16_STEP:
9e026191 12166 mode = HImode;
2bf6d935
ML
12167 goto rdseed_step;
12168
12169 case IX86_BUILTIN_RDSEED32_STEP:
9e026191 12170 mode = SImode;
2bf6d935
ML
12171 goto rdseed_step;
12172
12173 case IX86_BUILTIN_RDSEED64_STEP:
9e026191 12174 mode = DImode;
2bf6d935
ML
12175
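 /* RDSEED is expanded like RDRAND, except that the carry flag is read
    with a setcc and zero-extended to form the return value.  */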
12176rdseed_step:
12177 arg0 = CALL_EXPR_ARG (exp, 0);
12178 op1 = expand_normal (arg0);
12179 if (!address_operand (op1, VOIDmode))
12180 {
12181 op1 = convert_memory_address (Pmode, op1);
12182 op1 = copy_addr_to_reg (op1);
12183 }
12184
9e026191
UB
12185 op0 = gen_reg_rtx (mode);
12186 emit_insn (gen_rdseed (mode, op0));
2bf6d935 12187
9e026191 12188 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935
ML
12189
12190 op2 = gen_reg_rtx (QImode);
12191
12192 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12193 const0_rtx);
12194 emit_insn (gen_rtx_SET (op2, pat));
12195
12196 if (target == 0
12197 || !register_operand (target, SImode))
12198 target = gen_reg_rtx (SImode);
12199
12200 emit_insn (gen_zero_extendqisi2 (target, op2));
12201 return target;
12202
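 /* The SBB/ADDCARRYX builtins share one expansion: when the incoming
    carry is known to be zero a plain add or sub pattern is used,
    otherwise the carry flag is materialized first and consumed by an
    adc/sbb-style pattern; the result is stored through *sum_out and
    the resulting carry is returned.  */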
12203 case IX86_BUILTIN_SBB32:
12204 icode = CODE_FOR_subborrowsi;
12205 icode2 = CODE_FOR_subborrowsi_0;
12206 mode0 = SImode;
12207 mode1 = DImode;
12208 mode2 = CCmode;
12209 goto handlecarry;
12210
12211 case IX86_BUILTIN_SBB64:
12212 icode = CODE_FOR_subborrowdi;
12213 icode2 = CODE_FOR_subborrowdi_0;
12214 mode0 = DImode;
12215 mode1 = TImode;
12216 mode2 = CCmode;
12217 goto handlecarry;
12218
12219 case IX86_BUILTIN_ADDCARRYX32:
12220 icode = CODE_FOR_addcarrysi;
12221 icode2 = CODE_FOR_addcarrysi_0;
12222 mode0 = SImode;
12223 mode1 = DImode;
12224 mode2 = CCCmode;
12225 goto handlecarry;
12226
12227 case IX86_BUILTIN_ADDCARRYX64:
12228 icode = CODE_FOR_addcarrydi;
12229 icode2 = CODE_FOR_addcarrydi_0;
12230 mode0 = DImode;
12231 mode1 = TImode;
12232 mode2 = CCCmode;
12233
12234 handlecarry:
12235 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
12236 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
12237 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
12238 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
12239
12240 op1 = expand_normal (arg0);
12241 if (!integer_zerop (arg0))
12242 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
12243
12244 op2 = expand_normal (arg1);
12245 if (!register_operand (op2, mode0))
12246 op2 = copy_to_mode_reg (mode0, op2);
12247
12248 op3 = expand_normal (arg2);
12249 if (!register_operand (op3, mode0))
12250 op3 = copy_to_mode_reg (mode0, op3);
12251
12252 op4 = expand_normal (arg3);
12253 if (!address_operand (op4, VOIDmode))
12254 {
12255 op4 = convert_memory_address (Pmode, op4);
12256 op4 = copy_addr_to_reg (op4);
12257 }
12258
12259 op0 = gen_reg_rtx (mode0);
12260 if (integer_zerop (arg0))
12261 {
12262 /* If arg0 is 0, optimize right away into an add or sub
12263 instruction that sets CCCmode flags.  */
12264 op1 = gen_rtx_REG (mode2, FLAGS_REG);
12265 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
12266 }
12267 else
12268 {
12269 /* Generate CF from input operand. */
12270 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
12271
12272 /* Generate instruction that consumes CF. */
12273 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
12274 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
12275 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
12276 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
12277 }
12278
12279 /* Return current CF value. */
12280 if (target == 0)
12281 target = gen_reg_rtx (QImode);
12282
12283 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
12284 emit_insn (gen_rtx_SET (target, pat));
12285
12286 /* Store the result. */
12287 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
12288
12289 return target;
12290
12291 case IX86_BUILTIN_READ_FLAGS:
12292 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
12293
12294 if (optimize
12295 || target == NULL_RTX
12296 || !nonimmediate_operand (target, word_mode)
12297 || GET_MODE (target) != word_mode)
12298 target = gen_reg_rtx (word_mode);
12299
12300 emit_insn (gen_pop (target));
12301 return target;
12302
12303 case IX86_BUILTIN_WRITE_FLAGS:
12304
12305 arg0 = CALL_EXPR_ARG (exp, 0);
12306 op0 = expand_normal (arg0);
12307 if (!general_no_elim_operand (op0, word_mode))
12308 op0 = copy_to_mode_reg (word_mode, op0);
12309
12310 emit_insn (gen_push (op0));
12311 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
12312 return 0;
12313
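 /* The KTEST/KORTEST builtins compare two mask operands and return the
    carry or zero flag, as selected by mode3, via ix86_expand_setcc.  */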
12314 case IX86_BUILTIN_KTESTC8:
12315 icode = CODE_FOR_ktestqi;
12316 mode3 = CCCmode;
12317 goto kortest;
12318
12319 case IX86_BUILTIN_KTESTZ8:
12320 icode = CODE_FOR_ktestqi;
12321 mode3 = CCZmode;
12322 goto kortest;
12323
12324 case IX86_BUILTIN_KTESTC16:
12325 icode = CODE_FOR_ktesthi;
12326 mode3 = CCCmode;
12327 goto kortest;
12328
12329 case IX86_BUILTIN_KTESTZ16:
12330 icode = CODE_FOR_ktesthi;
12331 mode3 = CCZmode;
12332 goto kortest;
12333
12334 case IX86_BUILTIN_KTESTC32:
12335 icode = CODE_FOR_ktestsi;
12336 mode3 = CCCmode;
12337 goto kortest;
12338
12339 case IX86_BUILTIN_KTESTZ32:
12340 icode = CODE_FOR_ktestsi;
12341 mode3 = CCZmode;
12342 goto kortest;
12343
12344 case IX86_BUILTIN_KTESTC64:
12345 icode = CODE_FOR_ktestdi;
12346 mode3 = CCCmode;
12347 goto kortest;
12348
12349 case IX86_BUILTIN_KTESTZ64:
12350 icode = CODE_FOR_ktestdi;
12351 mode3 = CCZmode;
12352 goto kortest;
12353
12354 case IX86_BUILTIN_KORTESTC8:
12355 icode = CODE_FOR_kortestqi;
12356 mode3 = CCCmode;
12357 goto kortest;
12358
12359 case IX86_BUILTIN_KORTESTZ8:
12360 icode = CODE_FOR_kortestqi;
12361 mode3 = CCZmode;
12362 goto kortest;
12363
12364 case IX86_BUILTIN_KORTESTC16:
12365 icode = CODE_FOR_kortesthi;
12366 mode3 = CCCmode;
12367 goto kortest;
12368
12369 case IX86_BUILTIN_KORTESTZ16:
12370 icode = CODE_FOR_kortesthi;
12371 mode3 = CCZmode;
12372 goto kortest;
12373
12374 case IX86_BUILTIN_KORTESTC32:
12375 icode = CODE_FOR_kortestsi;
12376 mode3 = CCCmode;
12377 goto kortest;
12378
12379 case IX86_BUILTIN_KORTESTZ32:
12380 icode = CODE_FOR_kortestsi;
12381 mode3 = CCZmode;
12382 goto kortest;
12383
12384 case IX86_BUILTIN_KORTESTC64:
12385 icode = CODE_FOR_kortestdi;
12386 mode3 = CCCmode;
12387 goto kortest;
12388
12389 case IX86_BUILTIN_KORTESTZ64:
12390 icode = CODE_FOR_kortestdi;
12391 mode3 = CCZmode;
12392
12393 kortest:
12394 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12395 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12396 op0 = expand_normal (arg0);
12397 op1 = expand_normal (arg1);
12398
12399 mode0 = insn_data[icode].operand[0].mode;
12400 mode1 = insn_data[icode].operand[1].mode;
12401
12402 if (GET_MODE (op0) != VOIDmode)
12403 op0 = force_reg (GET_MODE (op0), op0);
12404
12405 op0 = gen_lowpart (mode0, op0);
12406
12407 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12408 op0 = copy_to_mode_reg (mode0, op0);
12409
12410 if (GET_MODE (op1) != VOIDmode)
12411 op1 = force_reg (GET_MODE (op1), op1);
12412
12413 op1 = gen_lowpart (mode1, op1);
12414
12415 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12416 op1 = copy_to_mode_reg (mode1, op1);
12417
12418 target = gen_reg_rtx (QImode);
12419
12420 /* Emit kortest. */
12421 emit_insn (GEN_FCN (icode) (op0, op1));
12422 /* And use setcc to return result from flags. */
12423 ix86_expand_setcc (target, EQ,
12424 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12425 return target;
12426
12427 case IX86_BUILTIN_GATHERSIV2DF:
12428 icode = CODE_FOR_avx2_gathersiv2df;
12429 goto gather_gen;
12430 case IX86_BUILTIN_GATHERSIV4DF:
12431 icode = CODE_FOR_avx2_gathersiv4df;
12432 goto gather_gen;
12433 case IX86_BUILTIN_GATHERDIV2DF:
12434 icode = CODE_FOR_avx2_gatherdiv2df;
12435 goto gather_gen;
12436 case IX86_BUILTIN_GATHERDIV4DF:
12437 icode = CODE_FOR_avx2_gatherdiv4df;
12438 goto gather_gen;
12439 case IX86_BUILTIN_GATHERSIV4SF:
12440 icode = CODE_FOR_avx2_gathersiv4sf;
12441 goto gather_gen;
12442 case IX86_BUILTIN_GATHERSIV8SF:
12443 icode = CODE_FOR_avx2_gathersiv8sf;
12444 goto gather_gen;
12445 case IX86_BUILTIN_GATHERDIV4SF:
12446 icode = CODE_FOR_avx2_gatherdiv4sf;
12447 goto gather_gen;
12448 case IX86_BUILTIN_GATHERDIV8SF:
12449 icode = CODE_FOR_avx2_gatherdiv8sf;
12450 goto gather_gen;
12451 case IX86_BUILTIN_GATHERSIV2DI:
12452 icode = CODE_FOR_avx2_gathersiv2di;
12453 goto gather_gen;
12454 case IX86_BUILTIN_GATHERSIV4DI:
12455 icode = CODE_FOR_avx2_gathersiv4di;
12456 goto gather_gen;
12457 case IX86_BUILTIN_GATHERDIV2DI:
12458 icode = CODE_FOR_avx2_gatherdiv2di;
12459 goto gather_gen;
12460 case IX86_BUILTIN_GATHERDIV4DI:
12461 icode = CODE_FOR_avx2_gatherdiv4di;
12462 goto gather_gen;
12463 case IX86_BUILTIN_GATHERSIV4SI:
12464 icode = CODE_FOR_avx2_gathersiv4si;
12465 goto gather_gen;
12466 case IX86_BUILTIN_GATHERSIV8SI:
12467 icode = CODE_FOR_avx2_gathersiv8si;
12468 goto gather_gen;
12469 case IX86_BUILTIN_GATHERDIV4SI:
12470 icode = CODE_FOR_avx2_gatherdiv4si;
12471 goto gather_gen;
12472 case IX86_BUILTIN_GATHERDIV8SI:
12473 icode = CODE_FOR_avx2_gatherdiv8si;
12474 goto gather_gen;
12475 case IX86_BUILTIN_GATHERALTSIV4DF:
12476 icode = CODE_FOR_avx2_gathersiv4df;
12477 goto gather_gen;
12478 case IX86_BUILTIN_GATHERALTDIV8SF:
12479 icode = CODE_FOR_avx2_gatherdiv8sf;
12480 goto gather_gen;
12481 case IX86_BUILTIN_GATHERALTSIV4DI:
12482 icode = CODE_FOR_avx2_gathersiv4di;
12483 goto gather_gen;
12484 case IX86_BUILTIN_GATHERALTDIV8SI:
12485 icode = CODE_FOR_avx2_gatherdiv8si;
12486 goto gather_gen;
12487 case IX86_BUILTIN_GATHER3SIV16SF:
12488 icode = CODE_FOR_avx512f_gathersiv16sf;
12489 goto gather_gen;
12490 case IX86_BUILTIN_GATHER3SIV8DF:
12491 icode = CODE_FOR_avx512f_gathersiv8df;
12492 goto gather_gen;
12493 case IX86_BUILTIN_GATHER3DIV16SF:
12494 icode = CODE_FOR_avx512f_gatherdiv16sf;
12495 goto gather_gen;
12496 case IX86_BUILTIN_GATHER3DIV8DF:
12497 icode = CODE_FOR_avx512f_gatherdiv8df;
12498 goto gather_gen;
12499 case IX86_BUILTIN_GATHER3SIV16SI:
12500 icode = CODE_FOR_avx512f_gathersiv16si;
12501 goto gather_gen;
12502 case IX86_BUILTIN_GATHER3SIV8DI:
12503 icode = CODE_FOR_avx512f_gathersiv8di;
12504 goto gather_gen;
12505 case IX86_BUILTIN_GATHER3DIV16SI:
12506 icode = CODE_FOR_avx512f_gatherdiv16si;
12507 goto gather_gen;
12508 case IX86_BUILTIN_GATHER3DIV8DI:
12509 icode = CODE_FOR_avx512f_gatherdiv8di;
12510 goto gather_gen;
12511 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12512 icode = CODE_FOR_avx512f_gathersiv8df;
12513 goto gather_gen;
12514 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12515 icode = CODE_FOR_avx512f_gatherdiv16sf;
12516 goto gather_gen;
12517 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12518 icode = CODE_FOR_avx512f_gathersiv8di;
12519 goto gather_gen;
12520 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12521 icode = CODE_FOR_avx512f_gatherdiv16si;
12522 goto gather_gen;
12523 case IX86_BUILTIN_GATHER3SIV2DF:
12524 icode = CODE_FOR_avx512vl_gathersiv2df;
12525 goto gather_gen;
12526 case IX86_BUILTIN_GATHER3SIV4DF:
12527 icode = CODE_FOR_avx512vl_gathersiv4df;
12528 goto gather_gen;
12529 case IX86_BUILTIN_GATHER3DIV2DF:
12530 icode = CODE_FOR_avx512vl_gatherdiv2df;
12531 goto gather_gen;
12532 case IX86_BUILTIN_GATHER3DIV4DF:
12533 icode = CODE_FOR_avx512vl_gatherdiv4df;
12534 goto gather_gen;
12535 case IX86_BUILTIN_GATHER3SIV4SF:
12536 icode = CODE_FOR_avx512vl_gathersiv4sf;
12537 goto gather_gen;
12538 case IX86_BUILTIN_GATHER3SIV8SF:
12539 icode = CODE_FOR_avx512vl_gathersiv8sf;
12540 goto gather_gen;
12541 case IX86_BUILTIN_GATHER3DIV4SF:
12542 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12543 goto gather_gen;
12544 case IX86_BUILTIN_GATHER3DIV8SF:
12545 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12546 goto gather_gen;
12547 case IX86_BUILTIN_GATHER3SIV2DI:
12548 icode = CODE_FOR_avx512vl_gathersiv2di;
12549 goto gather_gen;
12550 case IX86_BUILTIN_GATHER3SIV4DI:
12551 icode = CODE_FOR_avx512vl_gathersiv4di;
12552 goto gather_gen;
12553 case IX86_BUILTIN_GATHER3DIV2DI:
12554 icode = CODE_FOR_avx512vl_gatherdiv2di;
12555 goto gather_gen;
12556 case IX86_BUILTIN_GATHER3DIV4DI:
12557 icode = CODE_FOR_avx512vl_gatherdiv4di;
12558 goto gather_gen;
12559 case IX86_BUILTIN_GATHER3SIV4SI:
12560 icode = CODE_FOR_avx512vl_gathersiv4si;
12561 goto gather_gen;
12562 case IX86_BUILTIN_GATHER3SIV8SI:
12563 icode = CODE_FOR_avx512vl_gathersiv8si;
12564 goto gather_gen;
12565 case IX86_BUILTIN_GATHER3DIV4SI:
12566 icode = CODE_FOR_avx512vl_gatherdiv4si;
12567 goto gather_gen;
12568 case IX86_BUILTIN_GATHER3DIV8SI:
12569 icode = CODE_FOR_avx512vl_gatherdiv8si;
12570 goto gather_gen;
12571 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12572 icode = CODE_FOR_avx512vl_gathersiv4df;
12573 goto gather_gen;
12574 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12575 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12576 goto gather_gen;
12577 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12578 icode = CODE_FOR_avx512vl_gathersiv4di;
12579 goto gather_gen;
12580 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12581 icode = CODE_FOR_avx512vl_gatherdiv8si;
12582 goto gather_gen;
12583 case IX86_BUILTIN_SCATTERSIV16SF:
12584 icode = CODE_FOR_avx512f_scattersiv16sf;
12585 goto scatter_gen;
12586 case IX86_BUILTIN_SCATTERSIV8DF:
12587 icode = CODE_FOR_avx512f_scattersiv8df;
12588 goto scatter_gen;
12589 case IX86_BUILTIN_SCATTERDIV16SF:
12590 icode = CODE_FOR_avx512f_scatterdiv16sf;
12591 goto scatter_gen;
12592 case IX86_BUILTIN_SCATTERDIV8DF:
12593 icode = CODE_FOR_avx512f_scatterdiv8df;
12594 goto scatter_gen;
12595 case IX86_BUILTIN_SCATTERSIV16SI:
12596 icode = CODE_FOR_avx512f_scattersiv16si;
12597 goto scatter_gen;
12598 case IX86_BUILTIN_SCATTERSIV8DI:
12599 icode = CODE_FOR_avx512f_scattersiv8di;
12600 goto scatter_gen;
12601 case IX86_BUILTIN_SCATTERDIV16SI:
12602 icode = CODE_FOR_avx512f_scatterdiv16si;
12603 goto scatter_gen;
12604 case IX86_BUILTIN_SCATTERDIV8DI:
12605 icode = CODE_FOR_avx512f_scatterdiv8di;
12606 goto scatter_gen;
12607 case IX86_BUILTIN_SCATTERSIV8SF:
12608 icode = CODE_FOR_avx512vl_scattersiv8sf;
12609 goto scatter_gen;
12610 case IX86_BUILTIN_SCATTERSIV4SF:
12611 icode = CODE_FOR_avx512vl_scattersiv4sf;
12612 goto scatter_gen;
12613 case IX86_BUILTIN_SCATTERSIV4DF:
12614 icode = CODE_FOR_avx512vl_scattersiv4df;
12615 goto scatter_gen;
12616 case IX86_BUILTIN_SCATTERSIV2DF:
12617 icode = CODE_FOR_avx512vl_scattersiv2df;
12618 goto scatter_gen;
12619 case IX86_BUILTIN_SCATTERDIV8SF:
12620 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12621 goto scatter_gen;
12622 case IX86_BUILTIN_SCATTERDIV4SF:
12623 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12624 goto scatter_gen;
12625 case IX86_BUILTIN_SCATTERDIV4DF:
12626 icode = CODE_FOR_avx512vl_scatterdiv4df;
12627 goto scatter_gen;
12628 case IX86_BUILTIN_SCATTERDIV2DF:
12629 icode = CODE_FOR_avx512vl_scatterdiv2df;
12630 goto scatter_gen;
12631 case IX86_BUILTIN_SCATTERSIV8SI:
12632 icode = CODE_FOR_avx512vl_scattersiv8si;
12633 goto scatter_gen;
12634 case IX86_BUILTIN_SCATTERSIV4SI:
12635 icode = CODE_FOR_avx512vl_scattersiv4si;
12636 goto scatter_gen;
12637 case IX86_BUILTIN_SCATTERSIV4DI:
12638 icode = CODE_FOR_avx512vl_scattersiv4di;
12639 goto scatter_gen;
12640 case IX86_BUILTIN_SCATTERSIV2DI:
12641 icode = CODE_FOR_avx512vl_scattersiv2di;
12642 goto scatter_gen;
12643 case IX86_BUILTIN_SCATTERDIV8SI:
12644 icode = CODE_FOR_avx512vl_scatterdiv8si;
12645 goto scatter_gen;
12646 case IX86_BUILTIN_SCATTERDIV4SI:
12647 icode = CODE_FOR_avx512vl_scatterdiv4si;
12648 goto scatter_gen;
12649 case IX86_BUILTIN_SCATTERDIV4DI:
12650 icode = CODE_FOR_avx512vl_scatterdiv4di;
12651 goto scatter_gen;
12652 case IX86_BUILTIN_SCATTERDIV2DI:
12653 icode = CODE_FOR_avx512vl_scatterdiv2di;
12654 goto scatter_gen;
12655 case IX86_BUILTIN_GATHERPFDPD:
12656 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12657 goto vec_prefetch_gen;
12658 case IX86_BUILTIN_SCATTERALTSIV8DF:
12659 icode = CODE_FOR_avx512f_scattersiv8df;
12660 goto scatter_gen;
12661 case IX86_BUILTIN_SCATTERALTDIV16SF:
12662 icode = CODE_FOR_avx512f_scatterdiv16sf;
12663 goto scatter_gen;
12664 case IX86_BUILTIN_SCATTERALTSIV8DI:
12665 icode = CODE_FOR_avx512f_scattersiv8di;
12666 goto scatter_gen;
12667 case IX86_BUILTIN_SCATTERALTDIV16SI:
12668 icode = CODE_FOR_avx512f_scatterdiv16si;
12669 goto scatter_gen;
12670 case IX86_BUILTIN_SCATTERALTSIV4DF:
12671 icode = CODE_FOR_avx512vl_scattersiv4df;
12672 goto scatter_gen;
12673 case IX86_BUILTIN_SCATTERALTDIV8SF:
12674 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12675 goto scatter_gen;
12676 case IX86_BUILTIN_SCATTERALTSIV4DI:
12677 icode = CODE_FOR_avx512vl_scattersiv4di;
12678 goto scatter_gen;
12679 case IX86_BUILTIN_SCATTERALTDIV8SI:
12680 icode = CODE_FOR_avx512vl_scatterdiv8si;
12681 goto scatter_gen;
12682 case IX86_BUILTIN_SCATTERALTSIV2DF:
12683 icode = CODE_FOR_avx512vl_scattersiv2df;
12684 goto scatter_gen;
12685 case IX86_BUILTIN_SCATTERALTDIV4SF:
12686 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12687 goto scatter_gen;
12688 case IX86_BUILTIN_SCATTERALTSIV2DI:
12689 icode = CODE_FOR_avx512vl_scattersiv2di;
12690 goto scatter_gen;
12691 case IX86_BUILTIN_SCATTERALTDIV4SI:
12692 icode = CODE_FOR_avx512vl_scatterdiv4si;
12693 goto scatter_gen;
12694 case IX86_BUILTIN_GATHERPFDPS:
12695 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12696 goto vec_prefetch_gen;
12697 case IX86_BUILTIN_GATHERPFQPD:
12698 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12699 goto vec_prefetch_gen;
12700 case IX86_BUILTIN_GATHERPFQPS:
12701 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12702 goto vec_prefetch_gen;
12703 case IX86_BUILTIN_SCATTERPFDPD:
12704 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12705 goto vec_prefetch_gen;
12706 case IX86_BUILTIN_SCATTERPFDPS:
12707 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12708 goto vec_prefetch_gen;
12709 case IX86_BUILTIN_SCATTERPFQPD:
12710 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12711 goto vec_prefetch_gen;
12712 case IX86_BUILTIN_SCATTERPFQPS:
12713 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12714 goto vec_prefetch_gen;
12715
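 /* Shared expansion for the AVX2 and AVX-512 gather builtins.  The
    arguments are (src, base pointer, index vector, mask, scale); the
    *ALT* variants have index and data vectors of different lengths,
    so only the low half of the wider operand is used.  */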
12716 gather_gen:
12717 rtx half;
12718 rtx (*gen) (rtx, rtx);
12719
12720 arg0 = CALL_EXPR_ARG (exp, 0);
12721 arg1 = CALL_EXPR_ARG (exp, 1);
12722 arg2 = CALL_EXPR_ARG (exp, 2);
12723 arg3 = CALL_EXPR_ARG (exp, 3);
12724 arg4 = CALL_EXPR_ARG (exp, 4);
12725 op0 = expand_normal (arg0);
12726 op1 = expand_normal (arg1);
12727 op2 = expand_normal (arg2);
12728 op3 = expand_normal (arg3);
12729 op4 = expand_normal (arg4);
12730 /* Note the arg order is different from the operand order. */
12731 mode0 = insn_data[icode].operand[1].mode;
12732 mode2 = insn_data[icode].operand[3].mode;
12733 mode3 = insn_data[icode].operand[4].mode;
12734 mode4 = insn_data[icode].operand[5].mode;
12735
12736 if (target == NULL_RTX
12737 || GET_MODE (target) != insn_data[icode].operand[0].mode
12738 || !insn_data[icode].operand[0].predicate (target,
12739 GET_MODE (target)))
12740 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12741 else
12742 subtarget = target;
12743
12744 switch (fcode)
12745 {
12746 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12747 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12748 half = gen_reg_rtx (V8SImode);
12749 if (!nonimmediate_operand (op2, V16SImode))
12750 op2 = copy_to_mode_reg (V16SImode, op2);
12751 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12752 op2 = half;
12753 break;
12754 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12755 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12756 case IX86_BUILTIN_GATHERALTSIV4DF:
12757 case IX86_BUILTIN_GATHERALTSIV4DI:
12758 half = gen_reg_rtx (V4SImode);
12759 if (!nonimmediate_operand (op2, V8SImode))
12760 op2 = copy_to_mode_reg (V8SImode, op2);
12761 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12762 op2 = half;
12763 break;
12764 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12765 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12766 half = gen_reg_rtx (mode0);
12767 if (mode0 == V8SFmode)
12768 gen = gen_vec_extract_lo_v16sf;
12769 else
12770 gen = gen_vec_extract_lo_v16si;
12771 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12772 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12773 emit_insn (gen (half, op0));
12774 op0 = half;
12775 op3 = lowpart_subreg (QImode, op3, HImode);
12776 break;
12777 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12778 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12779 case IX86_BUILTIN_GATHERALTDIV8SF:
12780 case IX86_BUILTIN_GATHERALTDIV8SI:
12781 half = gen_reg_rtx (mode0);
12782 if (mode0 == V4SFmode)
12783 gen = gen_vec_extract_lo_v8sf;
12784 else
12785 gen = gen_vec_extract_lo_v8si;
12786 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12787 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12788 emit_insn (gen (half, op0));
12789 op0 = half;
12790 if (VECTOR_MODE_P (GET_MODE (op3)))
12791 {
12792 half = gen_reg_rtx (mode0);
12793 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12794 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12795 emit_insn (gen (half, op3));
12796 op3 = half;
12797 }
12798 break;
12799 default:
12800 break;
12801 }
12802
12803 /* Force the memory operand to use only a base register here.  We
12804 don't want to do this for the memory operands of other builtin
12805 functions.  */
12806 op1 = ix86_zero_extend_to_Pmode (op1);
12807
12808 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12809 op0 = copy_to_mode_reg (mode0, op0);
12810 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12811 op1 = copy_to_mode_reg (Pmode, op1);
12812 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12813 op2 = copy_to_mode_reg (mode2, op2);
12814
12815 op3 = fixup_modeless_constant (op3, mode3);
12816
12817 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12818 {
12819 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12820 op3 = copy_to_mode_reg (mode3, op3);
12821 }
12822 else
12823 {
12824 op3 = copy_to_reg (op3);
12825 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12826 }
12827 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12828 {
12829 error ("the last argument must be scale 1, 2, 4, 8");
12830 return const0_rtx;
12831 }
12832
12833 /* Optimize. If mask is known to have all high bits set,
12834 replace op0 with pc_rtx to signal that the instruction
12835 overwrites the whole destination and doesn't use its
12836 previous contents. */
12837 if (optimize)
12838 {
12839 if (TREE_CODE (arg3) == INTEGER_CST)
12840 {
12841 if (integer_all_onesp (arg3))
12842 op0 = pc_rtx;
12843 }
12844 else if (TREE_CODE (arg3) == VECTOR_CST)
12845 {
12846 unsigned int negative = 0;
12847 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12848 {
12849 tree cst = VECTOR_CST_ELT (arg3, i);
12850 if (TREE_CODE (cst) == INTEGER_CST
12851 && tree_int_cst_sign_bit (cst))
12852 negative++;
12853 else if (TREE_CODE (cst) == REAL_CST
12854 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12855 negative++;
12856 }
12857 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12858 op0 = pc_rtx;
12859 }
12860 else if (TREE_CODE (arg3) == SSA_NAME
12861 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12862 {
12863 /* Recognize also when mask is like:
12864 __v2df src = _mm_setzero_pd ();
12865 __v2df mask = _mm_cmpeq_pd (src, src);
12866 or
12867 __v8sf src = _mm256_setzero_ps ();
12868 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12869 as that is a cheaper way to load all ones into
12870 a register than having to load a constant from
12871 memory. */
12872 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12873 if (is_gimple_call (def_stmt))
12874 {
12875 tree fndecl = gimple_call_fndecl (def_stmt);
12876 if (fndecl
12877 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
4d732405 12878 switch (DECL_MD_FUNCTION_CODE (fndecl))
2bf6d935
ML
12879 {
12880 case IX86_BUILTIN_CMPPD:
12881 case IX86_BUILTIN_CMPPS:
12882 case IX86_BUILTIN_CMPPD256:
12883 case IX86_BUILTIN_CMPPS256:
12884 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12885 break;
12886 /* FALLTHRU */
12887 case IX86_BUILTIN_CMPEQPD:
12888 case IX86_BUILTIN_CMPEQPS:
12889 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12890 && initializer_zerop (gimple_call_arg (def_stmt,
12891 1)))
12892 op0 = pc_rtx;
12893 break;
12894 default:
12895 break;
12896 }
12897 }
12898 }
12899 }
12900
12901 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12902 if (! pat)
12903 return const0_rtx;
12904 emit_insn (pat);
12905
12906 switch (fcode)
12907 {
12908 case IX86_BUILTIN_GATHER3DIV16SF:
12909 if (target == NULL_RTX)
12910 target = gen_reg_rtx (V8SFmode);
12911 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12912 break;
12913 case IX86_BUILTIN_GATHER3DIV16SI:
12914 if (target == NULL_RTX)
12915 target = gen_reg_rtx (V8SImode);
12916 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12917 break;
12918 case IX86_BUILTIN_GATHER3DIV8SF:
12919 case IX86_BUILTIN_GATHERDIV8SF:
12920 if (target == NULL_RTX)
12921 target = gen_reg_rtx (V4SFmode);
12922 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12923 break;
12924 case IX86_BUILTIN_GATHER3DIV8SI:
12925 case IX86_BUILTIN_GATHERDIV8SI:
12926 if (target == NULL_RTX)
12927 target = gen_reg_rtx (V4SImode);
12928 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12929 break;
12930 default:
12931 target = subtarget;
12932 break;
12933 }
12934 return target;
12935
12936 scatter_gen:
12937 arg0 = CALL_EXPR_ARG (exp, 0);
12938 arg1 = CALL_EXPR_ARG (exp, 1);
12939 arg2 = CALL_EXPR_ARG (exp, 2);
12940 arg3 = CALL_EXPR_ARG (exp, 3);
12941 arg4 = CALL_EXPR_ARG (exp, 4);
12942 op0 = expand_normal (arg0);
12943 op1 = expand_normal (arg1);
12944 op2 = expand_normal (arg2);
12945 op3 = expand_normal (arg3);
12946 op4 = expand_normal (arg4);
12947 mode1 = insn_data[icode].operand[1].mode;
12948 mode2 = insn_data[icode].operand[2].mode;
12949 mode3 = insn_data[icode].operand[3].mode;
12950 mode4 = insn_data[icode].operand[4].mode;
12951
12952 /* Scatter instruction stores operand op3 to memory with
12953 indices from op2 and scale from op4 under writemask op1.
12954 If index operand op2 has more elements than source operand
12955 op3, only its low half needs to be used, and vice versa.  */
12956 switch (fcode)
12957 {
12958 case IX86_BUILTIN_SCATTERALTSIV8DF:
12959 case IX86_BUILTIN_SCATTERALTSIV8DI:
12960 half = gen_reg_rtx (V8SImode);
12961 if (!nonimmediate_operand (op2, V16SImode))
12962 op2 = copy_to_mode_reg (V16SImode, op2);
12963 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12964 op2 = half;
12965 break;
12966 case IX86_BUILTIN_SCATTERALTDIV16SF:
12967 case IX86_BUILTIN_SCATTERALTDIV16SI:
12968 half = gen_reg_rtx (mode3);
12969 if (mode3 == V8SFmode)
12970 gen = gen_vec_extract_lo_v16sf;
12971 else
12972 gen = gen_vec_extract_lo_v16si;
12973 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12974 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12975 emit_insn (gen (half, op3));
12976 op3 = half;
12977 break;
12978 case IX86_BUILTIN_SCATTERALTSIV4DF:
12979 case IX86_BUILTIN_SCATTERALTSIV4DI:
12980 half = gen_reg_rtx (V4SImode);
12981 if (!nonimmediate_operand (op2, V8SImode))
12982 op2 = copy_to_mode_reg (V8SImode, op2);
12983 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12984 op2 = half;
12985 break;
12986 case IX86_BUILTIN_SCATTERALTDIV8SF:
12987 case IX86_BUILTIN_SCATTERALTDIV8SI:
12988 half = gen_reg_rtx (mode3);
12989 if (mode3 == V4SFmode)
12990 gen = gen_vec_extract_lo_v8sf;
12991 else
12992 gen = gen_vec_extract_lo_v8si;
12993 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12994 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12995 emit_insn (gen (half, op3));
12996 op3 = half;
12997 break;
12998 case IX86_BUILTIN_SCATTERALTSIV2DF:
12999 case IX86_BUILTIN_SCATTERALTSIV2DI:
13000 if (!nonimmediate_operand (op2, V4SImode))
13001 op2 = copy_to_mode_reg (V4SImode, op2);
13002 break;
13003 case IX86_BUILTIN_SCATTERALTDIV4SF:
13004 case IX86_BUILTIN_SCATTERALTDIV4SI:
13005 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13006 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13007 break;
13008 default:
13009 break;
13010 }
13011
 13012	      /* Force the memory operand into a base register here.  But we
 13013		 don't want to do that to the memory operands of other builtin
 13014		 functions.  */
13015 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13016
13017 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13018 op0 = copy_to_mode_reg (Pmode, op0);
13019
13020 op1 = fixup_modeless_constant (op1, mode1);
13021
13022 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
13023 {
13024 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13025 op1 = copy_to_mode_reg (mode1, op1);
13026 }
13027 else
13028 {
13029 op1 = copy_to_reg (op1);
13030 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
13031 }
13032
13033 if (!insn_data[icode].operand[2].predicate (op2, mode2))
13034 op2 = copy_to_mode_reg (mode2, op2);
13035
13036 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13037 op3 = copy_to_mode_reg (mode3, op3);
13038
13039 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13040 {
13041 error ("the last argument must be scale 1, 2, 4, 8");
13042 return const0_rtx;
13043 }
13044
13045 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13046 if (! pat)
13047 return const0_rtx;
13048
13049 emit_insn (pat);
13050 return 0;
13051
13052 vec_prefetch_gen:
13053 arg0 = CALL_EXPR_ARG (exp, 0);
13054 arg1 = CALL_EXPR_ARG (exp, 1);
13055 arg2 = CALL_EXPR_ARG (exp, 2);
13056 arg3 = CALL_EXPR_ARG (exp, 3);
13057 arg4 = CALL_EXPR_ARG (exp, 4);
13058 op0 = expand_normal (arg0);
13059 op1 = expand_normal (arg1);
13060 op2 = expand_normal (arg2);
13061 op3 = expand_normal (arg3);
13062 op4 = expand_normal (arg4);
13063 mode0 = insn_data[icode].operand[0].mode;
13064 mode1 = insn_data[icode].operand[1].mode;
13065 mode3 = insn_data[icode].operand[3].mode;
13066 mode4 = insn_data[icode].operand[4].mode;
13067
13068 op0 = fixup_modeless_constant (op0, mode0);
13069
13070 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
13071 {
13072 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13073 op0 = copy_to_mode_reg (mode0, op0);
13074 }
13075 else
13076 {
13077 op0 = copy_to_reg (op0);
13078 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
13079 }
13080
13081 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13082 op1 = copy_to_mode_reg (mode1, op1);
13083
 13084	      /* Force the memory operand into a base register here.  But we
 13085		 don't want to do that to the memory operands of other builtin
 13086		 functions.  */
13087 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13088
13089 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
13090 op2 = copy_to_mode_reg (Pmode, op2);
13091
13092 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13093 {
 13094		  error ("the fourth argument must be scale 1, 2, 4, 8");
13095 return const0_rtx;
13096 }
13097
13098 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13099 {
13100 error ("incorrect hint operand");
13101 return const0_rtx;
13102 }
13103
13104 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13105 if (! pat)
13106 return const0_rtx;
13107
13108 emit_insn (pat);
13109
13110 return 0;
13111
13112 case IX86_BUILTIN_XABORT:
13113 icode = CODE_FOR_xabort;
13114 arg0 = CALL_EXPR_ARG (exp, 0);
13115 op0 = expand_normal (arg0);
13116 mode0 = insn_data[icode].operand[0].mode;
13117 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13118 {
13119 error ("the argument to %<xabort%> intrinsic must "
13120 "be an 8-bit immediate");
13121 return const0_rtx;
13122 }
13123 emit_insn (gen_xabort (op0));
13124 return 0;
13125
b5034abb
UB
13126 case IX86_BUILTIN_RDSSPD:
13127 case IX86_BUILTIN_RDSSPQ:
13128 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13129
13130 if (target == 0
13131 || !register_operand (target, mode))
13132 target = gen_reg_rtx (mode);
13133
13134 op0 = force_reg (mode, const0_rtx);
13135
13136 emit_insn (gen_rdssp (mode, target, op0));
13137 return target;
13138
13139 case IX86_BUILTIN_INCSSPD:
13140 case IX86_BUILTIN_INCSSPQ:
13141 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13142
13143 arg0 = CALL_EXPR_ARG (exp, 0);
13144 op0 = expand_normal (arg0);
13145
13146 op0 = force_reg (mode, op0);
13147
13148 emit_insn (gen_incssp (mode, op0));
13149 return 0;
13150
83927c63
HW
13151 case IX86_BUILTIN_HRESET:
13152 icode = CODE_FOR_hreset;
13153 arg0 = CALL_EXPR_ARG (exp, 0);
13154 op0 = expand_normal (arg0);
13155 op0 = force_reg (SImode, op0);
13156 emit_insn (gen_hreset (op0));
13157 return 0;
13158
2bf6d935
ML
13159 case IX86_BUILTIN_RSTORSSP:
13160 case IX86_BUILTIN_CLRSSBSY:
13161 arg0 = CALL_EXPR_ARG (exp, 0);
13162 op0 = expand_normal (arg0);
13163 icode = (fcode == IX86_BUILTIN_RSTORSSP
b5034abb
UB
13164 ? CODE_FOR_rstorssp
13165 : CODE_FOR_clrssbsy);
13166
2bf6d935
ML
13167 if (!address_operand (op0, VOIDmode))
13168 {
b5034abb
UB
13169 op0 = convert_memory_address (Pmode, op0);
13170 op0 = copy_addr_to_reg (op0);
2bf6d935 13171 }
b5034abb 13172 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
2bf6d935
ML
13173 return 0;
13174
13175 case IX86_BUILTIN_WRSSD:
13176 case IX86_BUILTIN_WRSSQ:
13177 case IX86_BUILTIN_WRUSSD:
13178 case IX86_BUILTIN_WRUSSQ:
b5034abb
UB
13179 mode = ((fcode == IX86_BUILTIN_WRSSD
13180 || fcode == IX86_BUILTIN_WRUSSD)
13181 ? SImode : DImode);
13182
2bf6d935
ML
13183 arg0 = CALL_EXPR_ARG (exp, 0);
13184 op0 = expand_normal (arg0);
13185 arg1 = CALL_EXPR_ARG (exp, 1);
13186 op1 = expand_normal (arg1);
b5034abb 13187
2bf6d935 13188 op0 = force_reg (mode, op0);
b5034abb 13189
2bf6d935
ML
13190 if (!address_operand (op1, VOIDmode))
13191 {
b5034abb
UB
13192 op1 = convert_memory_address (Pmode, op1);
13193 op1 = copy_addr_to_reg (op1);
2bf6d935 13194 }
b5034abb
UB
13195 op1 = gen_rtx_MEM (mode, op1);
13196
44320665
UB
13197 icode = ((fcode == IX86_BUILTIN_WRSSD
13198 || fcode == IX86_BUILTIN_WRSSQ)
13199 ? code_for_wrss (mode)
13200 : code_for_wruss (mode));
13201 emit_insn (GEN_FCN (icode) (op0, op1));
13202
2bf6d935
ML
13203 return 0;
13204
13205 default:
13206 break;
13207 }
13208
13209 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13210 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13211 {
13212 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13213 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13214 target);
13215 }
13216
13217 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13218 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13219 {
13220 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13221 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13222 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13223 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13224 int masked = 1;
13225 machine_mode mode, wide_mode, nar_mode;
13226
13227 nar_mode = V4SFmode;
13228 mode = V16SFmode;
13229 wide_mode = V64SFmode;
13230 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
13231 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13232
13233 switch (fcode)
13234 {
13235 case IX86_BUILTIN_4FMAPS:
13236 fcn = gen_avx5124fmaddps_4fmaddps;
13237 masked = 0;
13238 goto v4fma_expand;
13239
13240 case IX86_BUILTIN_4DPWSSD:
13241 nar_mode = V4SImode;
13242 mode = V16SImode;
13243 wide_mode = V64SImode;
13244 fcn = gen_avx5124vnniw_vp4dpwssd;
13245 masked = 0;
13246 goto v4fma_expand;
13247
13248 case IX86_BUILTIN_4DPWSSDS:
13249 nar_mode = V4SImode;
13250 mode = V16SImode;
13251 wide_mode = V64SImode;
13252 fcn = gen_avx5124vnniw_vp4dpwssds;
13253 masked = 0;
13254 goto v4fma_expand;
13255
13256 case IX86_BUILTIN_4FNMAPS:
13257 fcn = gen_avx5124fmaddps_4fnmaddps;
13258 masked = 0;
13259 goto v4fma_expand;
13260
13261 case IX86_BUILTIN_4FNMAPS_MASK:
13262 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
13263 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13264 goto v4fma_expand;
13265
13266 case IX86_BUILTIN_4DPWSSD_MASK:
13267 nar_mode = V4SImode;
13268 mode = V16SImode;
13269 wide_mode = V64SImode;
13270 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
13271 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13272 goto v4fma_expand;
13273
13274 case IX86_BUILTIN_4DPWSSDS_MASK:
13275 nar_mode = V4SImode;
13276 mode = V16SImode;
13277 wide_mode = V64SImode;
13278 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
13279 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13280 goto v4fma_expand;
13281
13282 case IX86_BUILTIN_4FMAPS_MASK:
13283 {
13284 tree args[4];
13285 rtx ops[4];
13286 rtx wide_reg;
13287 rtx accum;
13288 rtx addr;
13289 rtx mem;
13290
13291v4fma_expand:
13292 wide_reg = gen_reg_rtx (wide_mode);
13293 for (i = 0; i < 4; i++)
13294 {
13295 args[i] = CALL_EXPR_ARG (exp, i);
13296 ops[i] = expand_normal (args[i]);
13297
13298 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
13299 ops[i]);
13300 }
13301
13302 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13303 accum = force_reg (mode, accum);
13304
13305 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13306 addr = force_reg (Pmode, addr);
13307
13308 mem = gen_rtx_MEM (nar_mode, addr);
13309
13310 target = gen_reg_rtx (mode);
13311
13312 emit_move_insn (target, accum);
13313
13314 if (! masked)
13315 emit_insn (fcn (target, accum, wide_reg, mem));
13316 else
13317 {
13318 rtx merge, mask;
13319 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13320
13321 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13322
13323 if (CONST_INT_P (mask))
13324 mask = fixup_modeless_constant (mask, HImode);
13325
13326 mask = force_reg (HImode, mask);
13327
13328 if (GET_MODE (mask) != HImode)
13329 mask = gen_rtx_SUBREG (HImode, mask, 0);
13330
13331 /* If merge is 0 then we're about to emit z-masked variant. */
13332 if (const0_operand (merge, mode))
13333 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13334 /* If merge is the same as accum then emit merge-masked variant. */
13335 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13336 {
13337 merge = force_reg (mode, merge);
13338 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13339 }
13340 /* Merge with something unknown might happen if we z-mask w/ -O0. */
13341 else
13342 {
13343 target = gen_reg_rtx (mode);
13344 emit_move_insn (target, merge);
13345 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13346 }
13347 }
13348 return target;
13349 }
13350
13351 case IX86_BUILTIN_4FNMASS:
13352 fcn = gen_avx5124fmaddps_4fnmaddss;
13353 masked = 0;
13354 goto s4fma_expand;
13355
13356 case IX86_BUILTIN_4FMASS:
13357 fcn = gen_avx5124fmaddps_4fmaddss;
13358 masked = 0;
13359 goto s4fma_expand;
13360
13361 case IX86_BUILTIN_4FNMASS_MASK:
13362 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13363 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13364 goto s4fma_expand;
13365
13366 case IX86_BUILTIN_4FMASS_MASK:
13367 {
13368 tree args[4];
13369 rtx ops[4];
13370 rtx wide_reg;
13371 rtx accum;
13372 rtx addr;
13373 rtx mem;
13374
13375 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13376 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13377
13378s4fma_expand:
13379 mode = V4SFmode;
13380 wide_reg = gen_reg_rtx (V64SFmode);
13381 for (i = 0; i < 4; i++)
13382 {
13383 rtx tmp;
13384 args[i] = CALL_EXPR_ARG (exp, i);
13385 ops[i] = expand_normal (args[i]);
13386
13387 tmp = gen_reg_rtx (SFmode);
13388 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13389
13390 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13391 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13392 }
13393
13394 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13395 accum = force_reg (V4SFmode, accum);
13396
13397 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13398 addr = force_reg (Pmode, addr);
13399
13400 mem = gen_rtx_MEM (V4SFmode, addr);
13401
13402 target = gen_reg_rtx (V4SFmode);
13403
13404 emit_move_insn (target, accum);
13405
13406 if (! masked)
13407 emit_insn (fcn (target, accum, wide_reg, mem));
13408 else
13409 {
13410 rtx merge, mask;
13411 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13412
13413 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13414
13415 if (CONST_INT_P (mask))
13416 mask = fixup_modeless_constant (mask, QImode);
13417
13418 mask = force_reg (QImode, mask);
13419
13420 if (GET_MODE (mask) != QImode)
13421 mask = gen_rtx_SUBREG (QImode, mask, 0);
13422
13423 /* If merge is 0 then we're about to emit z-masked variant. */
13424 if (const0_operand (merge, mode))
13425 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13426 /* If merge is the same as accum then emit merge-masked
13427 variant. */
13428 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13429 {
13430 merge = force_reg (mode, merge);
13431 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13432 }
13433 /* Merge with something unknown might happen if we z-mask
13434 w/ -O0. */
13435 else
13436 {
13437 target = gen_reg_rtx (mode);
13438 emit_move_insn (target, merge);
13439 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13440 }
13441 }
13442 return target;
13443 }
13444 case IX86_BUILTIN_RDPID:
13445 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13446 target);
13447 case IX86_BUILTIN_FABSQ:
13448 case IX86_BUILTIN_COPYSIGNQ:
13449 if (!TARGET_SSE)
13450 /* Emit a normal call if SSE isn't available. */
13451 return expand_call (exp, target, ignore);
13452 /* FALLTHRU */
13453 default:
13454 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13455 }
13456 }
13457
13458 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13459 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13460 {
13461 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13462 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13463 }
13464
13465 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13466 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13467 {
13468 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13469 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13470 }
13471
13472 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13473 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13474 {
13475 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13476 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13477 }
13478
13479 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13480 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13481 {
13482 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13483 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13484 }
13485
13486 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13487 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13488 {
13489 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13490 const struct builtin_description *d = bdesc_multi_arg + i;
13491 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13492 (enum ix86_builtin_func_type)
13493 d->flag, d->comparison);
13494 }
13495
13496 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13497 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13498 {
13499 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13500 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13501 target);
13502 }
13503
2bf6d935
ML
13504 gcc_unreachable ();
13505}
13506
13507/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13508 fill target with val via vec_duplicate. */
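/* For illustration (not part of the source): for V4SImode the insn tried
   first is simply

     (set (reg:V4SI target)
	  (vec_duplicate:V4SI (reg:SI val)))

   and if that fails to match, VAL is forced into an inner-mode register
   and the SET source is rewritten in place before re-recognizing.  */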
13509
13510static bool
13511ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13512{
13513 bool ok;
13514 rtx_insn *insn;
13515 rtx dup;
13516
13517 /* First attempt to recognize VAL as-is. */
13518 dup = gen_vec_duplicate (mode, val);
13519 insn = emit_insn (gen_rtx_SET (target, dup));
13520 if (recog_memoized (insn) < 0)
13521 {
13522 rtx_insn *seq;
13523 machine_mode innermode = GET_MODE_INNER (mode);
13524 rtx reg;
13525
13526 /* If that fails, force VAL into a register. */
13527
13528 start_sequence ();
13529 reg = force_reg (innermode, val);
13530 if (GET_MODE (reg) != innermode)
13531 reg = gen_lowpart (innermode, reg);
13532 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13533 seq = get_insns ();
13534 end_sequence ();
13535 if (seq)
13536 emit_insn_before (seq, insn);
13537
13538 ok = recog_memoized (insn) >= 0;
13539 gcc_assert (ok);
13540 }
13541 return true;
13542}
13543
13544/* Get a vector mode of the same size as the original but with elements
13545 twice as wide. This is only guaranteed to apply to integral vectors. */
13546
13547static machine_mode
13548get_mode_wider_vector (machine_mode o)
13549{
13550 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13551 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13552 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13553 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13554 return n;
13555}
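/* E.g. (illustrative) get_mode_wider_vector (V16QImode) gives V8HImode and
   get_mode_wider_vector (V8HImode) gives V4SImode: the vector size stays
   the same while the element count halves, as the asserts above verify.  */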
13556
13557static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13558static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13559
13560/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13561 with all elements equal to VAR. Return true if successful. */
13562
13563static bool
13564ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13565 rtx target, rtx val)
13566{
13567 bool ok;
13568
13569 switch (mode)
13570 {
13571 case E_V2SImode:
13572 case E_V2SFmode:
13573 if (!mmx_ok)
13574 return false;
13575 /* FALLTHRU */
13576
13577 case E_V4DFmode:
13578 case E_V4DImode:
13579 case E_V8SFmode:
13580 case E_V8SImode:
13581 case E_V2DFmode:
13582 case E_V2DImode:
13583 case E_V4SFmode:
13584 case E_V4SImode:
13585 case E_V16SImode:
13586 case E_V8DImode:
13587 case E_V16SFmode:
13588 case E_V8DFmode:
13589 return ix86_vector_duplicate_value (mode, target, val);
13590
13591 case E_V4HImode:
13592 if (!mmx_ok)
13593 return false;
13594 if (TARGET_SSE || TARGET_3DNOW_A)
13595 {
13596 rtx x;
13597
13598 val = gen_lowpart (SImode, val);
13599 x = gen_rtx_TRUNCATE (HImode, val);
13600 x = gen_rtx_VEC_DUPLICATE (mode, x);
13601 emit_insn (gen_rtx_SET (target, x));
13602 return true;
13603 }
13604 goto widen;
13605
13606 case E_V8QImode:
13607 if (!mmx_ok)
13608 return false;
13609 goto widen;
13610
13611 case E_V8HImode:
13612 if (TARGET_AVX2)
13613 return ix86_vector_duplicate_value (mode, target, val);
13614
13615 if (TARGET_SSE2)
13616 {
13617 struct expand_vec_perm_d dperm;
13618 rtx tmp1, tmp2;
13619
13620 permute:
13621 memset (&dperm, 0, sizeof (dperm));
13622 dperm.target = target;
13623 dperm.vmode = mode;
13624 dperm.nelt = GET_MODE_NUNITS (mode);
13625 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13626 dperm.one_operand_p = true;
13627
13628 /* Extend to SImode using a paradoxical SUBREG. */
13629 tmp1 = gen_reg_rtx (SImode);
13630 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13631
13632 /* Insert the SImode value as low element of a V4SImode vector. */
13633 tmp2 = gen_reg_rtx (V4SImode);
13634 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13635 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13636
13637 ok = (expand_vec_perm_1 (&dperm)
13638 || expand_vec_perm_broadcast_1 (&dperm));
13639 gcc_assert (ok);
13640 return ok;
13641 }
13642 goto widen;
13643
13644 case E_V16QImode:
13645 if (TARGET_AVX2)
13646 return ix86_vector_duplicate_value (mode, target, val);
13647
13648 if (TARGET_SSE2)
13649 goto permute;
13650 goto widen;
13651
13652 widen:
13653 /* Replicate the value once into the next wider mode and recurse. */
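      /* Illustrative sketch of the widening step for a QImode VAL:

	   wide = (unsigned short) val;
	   wide |= wide << 8;	  (both bytes of WIDE now equal VAL)

	 WIDE is then broadcast in the twice-as-wide vector mode and the
	 result is reinterpreted in the original vector mode.  */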
13654 {
13655 machine_mode smode, wsmode, wvmode;
13656 rtx x;
13657
13658 smode = GET_MODE_INNER (mode);
13659 wvmode = get_mode_wider_vector (mode);
13660 wsmode = GET_MODE_INNER (wvmode);
13661
13662 val = convert_modes (wsmode, smode, val, true);
13663 x = expand_simple_binop (wsmode, ASHIFT, val,
13664 GEN_INT (GET_MODE_BITSIZE (smode)),
13665 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13666 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13667
13668 x = gen_reg_rtx (wvmode);
13669 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13670 gcc_assert (ok);
13671 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13672 return ok;
13673 }
13674
13675 case E_V16HImode:
13676 case E_V32QImode:
13677 if (TARGET_AVX2)
13678 return ix86_vector_duplicate_value (mode, target, val);
13679 else
13680 {
13681 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13682 rtx x = gen_reg_rtx (hvmode);
13683
13684 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13685 gcc_assert (ok);
13686
13687 x = gen_rtx_VEC_CONCAT (mode, x, x);
13688 emit_insn (gen_rtx_SET (target, x));
13689 }
13690 return true;
13691
13692 case E_V64QImode:
13693 case E_V32HImode:
13694 if (TARGET_AVX512BW)
13695 return ix86_vector_duplicate_value (mode, target, val);
13696 else
13697 {
13698 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13699 rtx x = gen_reg_rtx (hvmode);
13700
13701 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13702 gcc_assert (ok);
13703
13704 x = gen_rtx_VEC_CONCAT (mode, x, x);
13705 emit_insn (gen_rtx_SET (target, x));
13706 }
13707 return true;
13708
13709 default:
13710 return false;
13711 }
13712}
13713
13714/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13715 whose ONE_VAR element is VAR, and other elements are zero. Return true
13716 if successful. */
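/* E.g. (sketch only) for V2DImode with ONE_VAR == 0 the result is built
   with a single concatenation,

     (set (reg:V2DI target)
	  (vec_concat:V2DI (reg:DI var) (const_int 0)))

   while other modes use a vector-set insn or shuffles, as below.  */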
13717
13718static bool
13719ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13720 rtx target, rtx var, int one_var)
13721{
13722 machine_mode vsimode;
13723 rtx new_target;
13724 rtx x, tmp;
13725 bool use_vector_set = false;
13726 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13727
13728 switch (mode)
13729 {
13730 case E_V2DImode:
13731 /* For SSE4.1, we normally use vector set. But if the second
13732 element is zero and inter-unit moves are OK, we use movq
13733 instead. */
13734 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13735 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13736 && one_var == 0));
13737 break;
13738 case E_V16QImode:
13739 case E_V4SImode:
13740 case E_V4SFmode:
13741 use_vector_set = TARGET_SSE4_1;
13742 break;
13743 case E_V8HImode:
13744 use_vector_set = TARGET_SSE2;
13745 break;
8a0eb0cd
UB
13746 case E_V8QImode:
13747 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13748 break;
2bf6d935
ML
13749 case E_V4HImode:
13750 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13751 break;
13752 case E_V32QImode:
13753 case E_V16HImode:
13754 use_vector_set = TARGET_AVX;
13755 break;
13756 case E_V8SImode:
13757 use_vector_set = TARGET_AVX;
13758 gen_vec_set_0 = gen_vec_setv8si_0;
13759 break;
13760 case E_V8SFmode:
13761 use_vector_set = TARGET_AVX;
13762 gen_vec_set_0 = gen_vec_setv8sf_0;
13763 break;
13764 case E_V4DFmode:
13765 use_vector_set = TARGET_AVX;
13766 gen_vec_set_0 = gen_vec_setv4df_0;
13767 break;
13768 case E_V4DImode:
13769 /* Use ix86_expand_vector_set in 64bit mode only. */
13770 use_vector_set = TARGET_AVX && TARGET_64BIT;
13771 gen_vec_set_0 = gen_vec_setv4di_0;
13772 break;
13773 case E_V16SImode:
13774 use_vector_set = TARGET_AVX512F && one_var == 0;
13775 gen_vec_set_0 = gen_vec_setv16si_0;
13776 break;
13777 case E_V16SFmode:
13778 use_vector_set = TARGET_AVX512F && one_var == 0;
13779 gen_vec_set_0 = gen_vec_setv16sf_0;
13780 break;
13781 case E_V8DFmode:
13782 use_vector_set = TARGET_AVX512F && one_var == 0;
13783 gen_vec_set_0 = gen_vec_setv8df_0;
13784 break;
13785 case E_V8DImode:
13786 /* Use ix86_expand_vector_set in 64bit mode only. */
13787 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13788 gen_vec_set_0 = gen_vec_setv8di_0;
13789 break;
13790 default:
13791 break;
13792 }
13793
13794 if (use_vector_set)
13795 {
13796 if (gen_vec_set_0 && one_var == 0)
13797 {
13798 var = force_reg (GET_MODE_INNER (mode), var);
13799 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13800 return true;
13801 }
13802 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13803 var = force_reg (GET_MODE_INNER (mode), var);
13804 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13805 return true;
13806 }
13807
13808 switch (mode)
13809 {
13810 case E_V2SFmode:
13811 case E_V2SImode:
13812 if (!mmx_ok)
13813 return false;
13814 /* FALLTHRU */
13815
13816 case E_V2DFmode:
13817 case E_V2DImode:
13818 if (one_var != 0)
13819 return false;
13820 var = force_reg (GET_MODE_INNER (mode), var);
13821 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13822 emit_insn (gen_rtx_SET (target, x));
13823 return true;
13824
13825 case E_V4SFmode:
13826 case E_V4SImode:
13827 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13828 new_target = gen_reg_rtx (mode);
13829 else
13830 new_target = target;
13831 var = force_reg (GET_MODE_INNER (mode), var);
13832 x = gen_rtx_VEC_DUPLICATE (mode, var);
13833 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13834 emit_insn (gen_rtx_SET (new_target, x));
13835 if (one_var != 0)
13836 {
13837 /* We need to shuffle the value to the correct position, so
13838 create a new pseudo to store the intermediate result. */
13839
13840 /* With SSE2, we can use the integer shuffle insns. */
13841 if (mode != V4SFmode && TARGET_SSE2)
13842 {
13843 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13844 const1_rtx,
13845 GEN_INT (one_var == 1 ? 0 : 1),
13846 GEN_INT (one_var == 2 ? 0 : 1),
13847 GEN_INT (one_var == 3 ? 0 : 1)));
13848 if (target != new_target)
13849 emit_move_insn (target, new_target);
13850 return true;
13851 }
13852
13853 /* Otherwise convert the intermediate result to V4SFmode and
13854 use the SSE1 shuffle instructions. */
13855 if (mode != V4SFmode)
13856 {
13857 tmp = gen_reg_rtx (V4SFmode);
13858 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13859 }
13860 else
13861 tmp = new_target;
13862
13863 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13864 const1_rtx,
13865 GEN_INT (one_var == 1 ? 0 : 1),
13866 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13867 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13868
13869 if (mode != V4SFmode)
13870 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13871 else if (tmp != target)
13872 emit_move_insn (target, tmp);
13873 }
13874 else if (target != new_target)
13875 emit_move_insn (target, new_target);
13876 return true;
13877
13878 case E_V8HImode:
13879 case E_V16QImode:
13880 vsimode = V4SImode;
13881 goto widen;
13882 case E_V4HImode:
13883 case E_V8QImode:
13884 if (!mmx_ok)
13885 return false;
13886 vsimode = V2SImode;
13887 goto widen;
13888 widen:
13889 if (one_var != 0)
13890 return false;
13891
13892 /* Zero extend the variable element to SImode and recurse. */
13893 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13894
13895 x = gen_reg_rtx (vsimode);
13896 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13897 var, one_var))
13898 gcc_unreachable ();
13899
13900 emit_move_insn (target, gen_lowpart (mode, x));
13901 return true;
13902
13903 default:
13904 return false;
13905 }
13906}
13907
13908/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13909 consisting of the values in VALS. It is known that all elements
13910 except ONE_VAR are constants. Return true if successful. */
13911
13912static bool
13913ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13914 rtx target, rtx vals, int one_var)
13915{
13916 rtx var = XVECEXP (vals, 0, one_var);
13917 machine_mode wmode;
13918 rtx const_vec, x;
13919
13920 const_vec = copy_rtx (vals);
13921 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13922 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13923
13924 switch (mode)
13925 {
13926 case E_V2DFmode:
13927 case E_V2DImode:
13928 case E_V2SFmode:
13929 case E_V2SImode:
13930 /* For the two element vectors, it's just as easy to use
13931 the general case. */
13932 return false;
13933
13934 case E_V4DImode:
13935 /* Use ix86_expand_vector_set in 64bit mode only. */
13936 if (!TARGET_64BIT)
13937 return false;
13938 /* FALLTHRU */
13939 case E_V4DFmode:
13940 case E_V8SFmode:
13941 case E_V8SImode:
13942 case E_V16HImode:
13943 case E_V32QImode:
13944 case E_V4SFmode:
13945 case E_V4SImode:
13946 case E_V8HImode:
13947 case E_V4HImode:
13948 break;
13949
13950 case E_V16QImode:
13951 if (TARGET_SSE4_1)
13952 break;
13953 wmode = V8HImode;
13954 goto widen;
13955 case E_V8QImode:
8a0eb0cd
UB
13956 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13957 break;
2bf6d935
ML
13958 wmode = V4HImode;
13959 goto widen;
13960 widen:
13961 /* There's no way to set one QImode entry easily. Combine
13962 the variable value with its adjacent constant value, and
13963 promote to an HImode set. */
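      /* For example (sketch only), with ONE_VAR == 3 in a V8QImode vector
	 the variable byte is paired with constant byte 2:

	    hi = (var << 8) | (c2 & 0xff);

	 and HI is then inserted as HImode element ONE_VAR >> 1 == 1.  */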
13964 x = XVECEXP (vals, 0, one_var ^ 1);
13965 if (one_var & 1)
13966 {
13967 var = convert_modes (HImode, QImode, var, true);
13968 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13969 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13970 x = GEN_INT (INTVAL (x) & 0xff);
13971 }
13972 else
13973 {
13974 var = convert_modes (HImode, QImode, var, true);
13975 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13976 }
13977 if (x != const0_rtx)
13978 var = expand_simple_binop (HImode, IOR, var, x, var,
13979 1, OPTAB_LIB_WIDEN);
13980
13981 x = gen_reg_rtx (wmode);
13982 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13983 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13984
13985 emit_move_insn (target, gen_lowpart (mode, x));
13986 return true;
13987
13988 default:
13989 return false;
13990 }
13991
13992 emit_move_insn (target, const_vec);
13993 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13994 return true;
13995}
13996
13997/* A subroutine of ix86_expand_vector_init_general. Use vector
13998 concatenate to handle the most general case: all values variable,
13999 and none identical. */
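/* For instance (illustrative only), a V8SImode vector built from eight
   scalar operands is first reduced to two V4SImode halves,

     half[1] = { ops[4], ops[5], ops[6], ops[7] }
     half[0] = { ops[0], ops[1], ops[2], ops[3] }

   built in that order (see the PR 36222 note below), and the halves are
   then combined with a single VEC_CONCAT.  */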
14000
14001static void
14002ix86_expand_vector_init_concat (machine_mode mode,
14003 rtx target, rtx *ops, int n)
14004{
1aeecaf5
HL
14005 machine_mode half_mode = VOIDmode;
14006 rtx half[2];
2bf6d935
ML
14007 rtvec v;
14008 int i, j;
14009
14010 switch (n)
14011 {
14012 case 2:
14013 switch (mode)
14014 {
14015 case E_V16SImode:
1aeecaf5 14016 half_mode = V8SImode;
2bf6d935
ML
14017 break;
14018 case E_V16SFmode:
1aeecaf5 14019 half_mode = V8SFmode;
2bf6d935
ML
14020 break;
14021 case E_V8DImode:
1aeecaf5 14022 half_mode = V4DImode;
2bf6d935
ML
14023 break;
14024 case E_V8DFmode:
1aeecaf5 14025 half_mode = V4DFmode;
2bf6d935
ML
14026 break;
14027 case E_V8SImode:
1aeecaf5 14028 half_mode = V4SImode;
2bf6d935
ML
14029 break;
14030 case E_V8SFmode:
1aeecaf5 14031 half_mode = V4SFmode;
2bf6d935
ML
14032 break;
14033 case E_V4DImode:
1aeecaf5 14034 half_mode = V2DImode;
2bf6d935
ML
14035 break;
14036 case E_V4DFmode:
1aeecaf5 14037 half_mode = V2DFmode;
2bf6d935
ML
14038 break;
14039 case E_V4SImode:
1aeecaf5 14040 half_mode = V2SImode;
2bf6d935
ML
14041 break;
14042 case E_V4SFmode:
1aeecaf5 14043 half_mode = V2SFmode;
2bf6d935
ML
14044 break;
14045 case E_V2DImode:
1aeecaf5 14046 half_mode = DImode;
2bf6d935
ML
14047 break;
14048 case E_V2SImode:
1aeecaf5 14049 half_mode = SImode;
2bf6d935
ML
14050 break;
14051 case E_V2DFmode:
1aeecaf5 14052 half_mode = DFmode;
2bf6d935
ML
14053 break;
14054 case E_V2SFmode:
1aeecaf5 14055 half_mode = SFmode;
2bf6d935
ML
14056 break;
14057 default:
14058 gcc_unreachable ();
14059 }
14060
1aeecaf5
HL
14061 if (!register_operand (ops[1], half_mode))
14062 ops[1] = force_reg (half_mode, ops[1]);
14063 if (!register_operand (ops[0], half_mode))
14064 ops[0] = force_reg (half_mode, ops[0]);
2bf6d935
ML
14065 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
14066 ops[1])));
14067 break;
14068
14069 case 4:
14070 switch (mode)
14071 {
14072 case E_V4DImode:
1aeecaf5 14073 half_mode = V2DImode;
2bf6d935
ML
14074 break;
14075 case E_V4DFmode:
1aeecaf5 14076 half_mode = V2DFmode;
2bf6d935
ML
14077 break;
14078 case E_V4SImode:
1aeecaf5 14079 half_mode = V2SImode;
2bf6d935
ML
14080 break;
14081 case E_V4SFmode:
1aeecaf5 14082 half_mode = V2SFmode;
2bf6d935
ML
14083 break;
14084 default:
14085 gcc_unreachable ();
14086 }
14087 goto half;
14088
14089 case 8:
14090 switch (mode)
14091 {
14092 case E_V8DImode:
1aeecaf5 14093 half_mode = V4DImode;
2bf6d935
ML
14094 break;
14095 case E_V8DFmode:
1aeecaf5 14096 half_mode = V4DFmode;
2bf6d935
ML
14097 break;
14098 case E_V8SImode:
1aeecaf5 14099 half_mode = V4SImode;
2bf6d935
ML
14100 break;
14101 case E_V8SFmode:
1aeecaf5 14102 half_mode = V4SFmode;
2bf6d935
ML
14103 break;
14104 default:
14105 gcc_unreachable ();
14106 }
14107 goto half;
14108
14109 case 16:
14110 switch (mode)
14111 {
14112 case E_V16SImode:
1aeecaf5 14113 half_mode = V8SImode;
2bf6d935
ML
14114 break;
14115 case E_V16SFmode:
1aeecaf5 14116 half_mode = V8SFmode;
2bf6d935
ML
14117 break;
14118 default:
14119 gcc_unreachable ();
14120 }
14121 goto half;
14122
14123half:
14124 /* FIXME: We process inputs backward to help RA. PR 36222. */
14125 i = n - 1;
1aeecaf5 14126 for (j = 1; j != -1; j--)
2bf6d935 14127 {
1aeecaf5
HL
14128 half[j] = gen_reg_rtx (half_mode);
14129 switch (n >> 1)
2bf6d935 14130 {
1aeecaf5
HL
14131 case 2:
14132 v = gen_rtvec (2, ops[i-1], ops[i]);
14133 i -= 2;
14134 break;
14135 case 4:
14136 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
14137 i -= 4;
14138 break;
14139 case 8:
14140 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
14141 ops[i-3], ops[i-2], ops[i-1], ops[i]);
14142 i -= 8;
14143 break;
14144 default:
14145 gcc_unreachable ();
2bf6d935 14146 }
1aeecaf5
HL
14147 ix86_expand_vector_init (false, half[j],
14148 gen_rtx_PARALLEL (half_mode, v));
2bf6d935 14149 }
1aeecaf5
HL
14150
14151 ix86_expand_vector_init_concat (mode, target, half, 2);
2bf6d935
ML
14152 break;
14153
14154 default:
14155 gcc_unreachable ();
14156 }
14157}
14158
14159/* A subroutine of ix86_expand_vector_init_general. Use vector
14160 interleave to handle the most general case: all values variable,
14161 and none identical. */
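/* Rough sketch (illustrative) for V8HImode: each pair of scalar operands
   is first packed into the low 32 bits of a temporary vector,

     t[i] = { ops[2*i], ops[2*i + 1], ... }   viewed as V4SImode,

   and the temporaries are then combined with low interleaves, first at
   SImode and then at DImode granularity, until one V8HImode vector holds
   all eight operands.  */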
14162
14163static void
14164ix86_expand_vector_init_interleave (machine_mode mode,
14165 rtx target, rtx *ops, int n)
14166{
14167 machine_mode first_imode, second_imode, third_imode, inner_mode;
14168 int i, j;
14169 rtx op0, op1;
14170 rtx (*gen_load_even) (rtx, rtx, rtx);
14171 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
14172 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
14173
14174 switch (mode)
14175 {
14176 case E_V8HImode:
14177 gen_load_even = gen_vec_setv8hi;
14178 gen_interleave_first_low = gen_vec_interleave_lowv4si;
14179 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14180 inner_mode = HImode;
14181 first_imode = V4SImode;
14182 second_imode = V2DImode;
14183 third_imode = VOIDmode;
14184 break;
14185 case E_V16QImode:
14186 gen_load_even = gen_vec_setv16qi;
14187 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
14188 gen_interleave_second_low = gen_vec_interleave_lowv4si;
14189 inner_mode = QImode;
14190 first_imode = V8HImode;
14191 second_imode = V4SImode;
14192 third_imode = V2DImode;
14193 break;
14194 default:
14195 gcc_unreachable ();
14196 }
14197
14198 for (i = 0; i < n; i++)
14199 {
 14200       /* Extend the odd element to SImode using a paradoxical SUBREG.  */
14201 op0 = gen_reg_rtx (SImode);
14202 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
14203
14204 /* Insert the SImode value as low element of V4SImode vector. */
14205 op1 = gen_reg_rtx (V4SImode);
14206 op0 = gen_rtx_VEC_MERGE (V4SImode,
14207 gen_rtx_VEC_DUPLICATE (V4SImode,
14208 op0),
14209 CONST0_RTX (V4SImode),
14210 const1_rtx);
14211 emit_insn (gen_rtx_SET (op1, op0));
14212
 14213       /* Cast the V4SImode vector back to a vector in the original mode.  */
14214 op0 = gen_reg_rtx (mode);
14215 emit_move_insn (op0, gen_lowpart (mode, op1));
14216
14217 /* Load even elements into the second position. */
14218 emit_insn (gen_load_even (op0,
14219 force_reg (inner_mode,
14220 ops [i + i + 1]),
14221 const1_rtx));
14222
14223 /* Cast vector to FIRST_IMODE vector. */
14224 ops[i] = gen_reg_rtx (first_imode);
14225 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
14226 }
14227
14228 /* Interleave low FIRST_IMODE vectors. */
14229 for (i = j = 0; i < n; i += 2, j++)
14230 {
14231 op0 = gen_reg_rtx (first_imode);
14232 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
14233
14234 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14235 ops[j] = gen_reg_rtx (second_imode);
14236 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
14237 }
14238
14239 /* Interleave low SECOND_IMODE vectors. */
14240 switch (second_imode)
14241 {
14242 case E_V4SImode:
14243 for (i = j = 0; i < n / 2; i += 2, j++)
14244 {
14245 op0 = gen_reg_rtx (second_imode);
14246 emit_insn (gen_interleave_second_low (op0, ops[i],
14247 ops[i + 1]));
14248
14249 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14250 vector. */
14251 ops[j] = gen_reg_rtx (third_imode);
14252 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
14253 }
14254 second_imode = V2DImode;
14255 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14256 /* FALLTHRU */
14257
14258 case E_V2DImode:
14259 op0 = gen_reg_rtx (second_imode);
14260 emit_insn (gen_interleave_second_low (op0, ops[0],
14261 ops[1]));
14262
 14263       /* Cast the SECOND_IMODE vector back to a vector in the original
 14264	  mode.  */
14265 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
14266 break;
14267
14268 default:
14269 gcc_unreachable ();
14270 }
14271}
14272
14273/* A subroutine of ix86_expand_vector_init. Handle the most general case:
14274 all values variable, and none identical. */
14275
14276static void
14277ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
14278 rtx target, rtx vals)
14279{
14280 rtx ops[64], op0, op1, op2, op3, op4, op5;
14281 machine_mode half_mode = VOIDmode;
14282 machine_mode quarter_mode = VOIDmode;
14283 int n, i;
14284
14285 switch (mode)
14286 {
14287 case E_V2SFmode:
14288 case E_V2SImode:
14289 if (!mmx_ok && !TARGET_SSE)
14290 break;
14291 /* FALLTHRU */
14292
14293 case E_V16SImode:
14294 case E_V16SFmode:
14295 case E_V8DFmode:
14296 case E_V8DImode:
14297 case E_V8SFmode:
14298 case E_V8SImode:
14299 case E_V4DFmode:
14300 case E_V4DImode:
14301 case E_V4SFmode:
14302 case E_V4SImode:
14303 case E_V2DFmode:
14304 case E_V2DImode:
14305 n = GET_MODE_NUNITS (mode);
14306 for (i = 0; i < n; i++)
14307 ops[i] = XVECEXP (vals, 0, i);
14308 ix86_expand_vector_init_concat (mode, target, ops, n);
14309 return;
14310
14311 case E_V2TImode:
14312 for (i = 0; i < 2; i++)
14313 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14314 op0 = gen_reg_rtx (V4DImode);
14315 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
14316 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14317 return;
14318
14319 case E_V4TImode:
14320 for (i = 0; i < 4; i++)
14321 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14322 ops[4] = gen_reg_rtx (V4DImode);
14323 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14324 ops[5] = gen_reg_rtx (V4DImode);
14325 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14326 op0 = gen_reg_rtx (V8DImode);
14327 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14328 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14329 return;
14330
14331 case E_V32QImode:
14332 half_mode = V16QImode;
14333 goto half;
14334
14335 case E_V16HImode:
14336 half_mode = V8HImode;
14337 goto half;
14338
14339half:
14340 n = GET_MODE_NUNITS (mode);
14341 for (i = 0; i < n; i++)
14342 ops[i] = XVECEXP (vals, 0, i);
14343 op0 = gen_reg_rtx (half_mode);
14344 op1 = gen_reg_rtx (half_mode);
14345 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14346 n >> 2);
14347 ix86_expand_vector_init_interleave (half_mode, op1,
14348 &ops [n >> 1], n >> 2);
14349 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14350 return;
14351
14352 case E_V64QImode:
14353 quarter_mode = V16QImode;
14354 half_mode = V32QImode;
14355 goto quarter;
14356
14357 case E_V32HImode:
14358 quarter_mode = V8HImode;
14359 half_mode = V16HImode;
14360 goto quarter;
14361
14362quarter:
14363 n = GET_MODE_NUNITS (mode);
14364 for (i = 0; i < n; i++)
14365 ops[i] = XVECEXP (vals, 0, i);
14366 op0 = gen_reg_rtx (quarter_mode);
14367 op1 = gen_reg_rtx (quarter_mode);
14368 op2 = gen_reg_rtx (quarter_mode);
14369 op3 = gen_reg_rtx (quarter_mode);
14370 op4 = gen_reg_rtx (half_mode);
14371 op5 = gen_reg_rtx (half_mode);
14372 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14373 n >> 3);
14374 ix86_expand_vector_init_interleave (quarter_mode, op1,
14375 &ops [n >> 2], n >> 3);
14376 ix86_expand_vector_init_interleave (quarter_mode, op2,
14377 &ops [n >> 1], n >> 3);
14378 ix86_expand_vector_init_interleave (quarter_mode, op3,
14379 &ops [(n >> 1) | (n >> 2)], n >> 3);
14380 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14381 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14382 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14383 return;
14384
14385 case E_V16QImode:
14386 if (!TARGET_SSE4_1)
14387 break;
14388 /* FALLTHRU */
14389
14390 case E_V8HImode:
14391 if (!TARGET_SSE2)
14392 break;
14393
14394 /* Don't use ix86_expand_vector_init_interleave if we can't
14395 move from GPR to SSE register directly. */
14396 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14397 break;
14398
14399 n = GET_MODE_NUNITS (mode);
14400 for (i = 0; i < n; i++)
14401 ops[i] = XVECEXP (vals, 0, i);
14402 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14403 return;
14404
14405 case E_V4HImode:
14406 case E_V8QImode:
14407 break;
14408
14409 default:
14410 gcc_unreachable ();
14411 }
14412
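  /* The fallback below packs the elements into word_mode integers with
     shift/IOR and builds the vector from those words.  Illustrative
     sketch for V4HImode on a 32-bit target:

	word0 = (elt1 << 16) | elt0;
	word1 = (elt3 << 16) | elt2;

     the two words then become the low and high halves of the result.  */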
14413 {
14414 int i, j, n_elts, n_words, n_elt_per_word;
14415 machine_mode inner_mode;
14416 rtx words[4], shift;
14417
14418 inner_mode = GET_MODE_INNER (mode);
14419 n_elts = GET_MODE_NUNITS (mode);
14420 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14421 n_elt_per_word = n_elts / n_words;
14422 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14423
14424 for (i = 0; i < n_words; ++i)
14425 {
14426 rtx word = NULL_RTX;
14427
14428 for (j = 0; j < n_elt_per_word; ++j)
14429 {
14430 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14431 elt = convert_modes (word_mode, inner_mode, elt, true);
14432
14433 if (j == 0)
14434 word = elt;
14435 else
14436 {
14437 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14438 word, 1, OPTAB_LIB_WIDEN);
14439 word = expand_simple_binop (word_mode, IOR, word, elt,
14440 word, 1, OPTAB_LIB_WIDEN);
14441 }
14442 }
14443
14444 words[i] = word;
14445 }
14446
14447 if (n_words == 1)
14448 emit_move_insn (target, gen_lowpart (mode, words[0]));
14449 else if (n_words == 2)
14450 {
14451 rtx tmp = gen_reg_rtx (mode);
14452 emit_clobber (tmp);
14453 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14454 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14455 emit_move_insn (target, tmp);
14456 }
14457 else if (n_words == 4)
14458 {
14459 rtx tmp = gen_reg_rtx (V4SImode);
14460 gcc_assert (word_mode == SImode);
14461 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14462 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14463 emit_move_insn (target, gen_lowpart (mode, tmp));
14464 }
14465 else
14466 gcc_unreachable ();
14467 }
14468}
14469
14470/* Initialize vector TARGET via VALS. Suppress the use of MMX
14471 instructions unless MMX_OK is true. */
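/* For example (illustrative only), a V4SImode constructor { x, x, x, x }
   takes the broadcast path, { x, 0, 0, 0 } is handled by
   ix86_expand_vector_init_one_nonzero, { x, 1, 2, 3 } by
   ix86_expand_vector_init_one_var, and { x, y, z, w } falls through to
   ix86_expand_vector_init_general.  */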
14472
14473void
14474ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14475{
14476 machine_mode mode = GET_MODE (target);
14477 machine_mode inner_mode = GET_MODE_INNER (mode);
14478 int n_elts = GET_MODE_NUNITS (mode);
14479 int n_var = 0, one_var = -1;
14480 bool all_same = true, all_const_zero = true;
14481 int i;
14482 rtx x;
14483
 14484   /* Handle initialization from vector elements first.  */
14485 if (n_elts != XVECLEN (vals, 0))
14486 {
14487 rtx subtarget = target;
14488 x = XVECEXP (vals, 0, 0);
14489 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14490 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14491 {
14492 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14493 if (inner_mode == QImode || inner_mode == HImode)
14494 {
14495 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14496 mode = mode_for_vector (SImode, n_bits / 4).require ();
14497 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14498 ops[0] = gen_lowpart (inner_mode, ops[0]);
14499 ops[1] = gen_lowpart (inner_mode, ops[1]);
14500 subtarget = gen_reg_rtx (mode);
14501 }
14502 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14503 if (subtarget != target)
14504 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14505 return;
14506 }
14507 gcc_unreachable ();
14508 }
14509
14510 for (i = 0; i < n_elts; ++i)
14511 {
14512 x = XVECEXP (vals, 0, i);
14513 if (!(CONST_SCALAR_INT_P (x)
14514 || CONST_DOUBLE_P (x)
14515 || CONST_FIXED_P (x)))
14516 n_var++, one_var = i;
14517 else if (x != CONST0_RTX (inner_mode))
14518 all_const_zero = false;
14519 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14520 all_same = false;
14521 }
14522
14523 /* Constants are best loaded from the constant pool. */
14524 if (n_var == 0)
14525 {
14526 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14527 return;
14528 }
14529
14530 /* If all values are identical, broadcast the value. */
14531 if (all_same
14532 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14533 XVECEXP (vals, 0, 0)))
14534 return;
14535
14536 /* Values where only one field is non-constant are best loaded from
14537 the pool and overwritten via move later. */
14538 if (n_var == 1)
14539 {
14540 if (all_const_zero
14541 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14542 XVECEXP (vals, 0, one_var),
14543 one_var))
14544 return;
14545
14546 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14547 return;
14548 }
14549
14550 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14551}
14552
287cc750 14553/* Implemented as
14554 V setg (V v, int idx, T val)
14555 {
14556 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14557 V valv = (V){val, val, val, val, val, val, val, val};
14558 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14559 v = (v & ~mask) | (valv & mask);
14560 return v;
14561 }. */
14562void
14563ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
14564{
14565 rtx vec[64];
14566 machine_mode mode = GET_MODE (target);
14567 machine_mode cmp_mode = mode;
14568 int n_elts = GET_MODE_NUNITS (mode);
 14569   rtx valv, idxv, constv, idx_tmp;
14570 bool ok = false;
14571
 14572   /* 512-bit vector byte/word broadcast and comparison are only available
 14573      under TARGET_AVX512BW; without it, split the 512-bit vector into two
 14574      256-bit vectors.  */
14575 if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
14576 {
14577 gcc_assert (TARGET_AVX512F);
14578 rtx vhi, vlo, idx_hi;
14579 machine_mode half_mode;
14580 rtx (*extract_hi)(rtx, rtx);
14581 rtx (*extract_lo)(rtx, rtx);
14582
14583 if (mode == V32HImode)
14584 {
14585 half_mode = V16HImode;
14586 extract_hi = gen_vec_extract_hi_v32hi;
14587 extract_lo = gen_vec_extract_lo_v32hi;
14588 }
14589 else
14590 {
14591 half_mode = V32QImode;
14592 extract_hi = gen_vec_extract_hi_v64qi;
14593 extract_lo = gen_vec_extract_lo_v64qi;
14594 }
14595
14596 vhi = gen_reg_rtx (half_mode);
14597 vlo = gen_reg_rtx (half_mode);
14598 idx_hi = gen_reg_rtx (GET_MODE (idx));
14599 emit_insn (extract_hi (vhi, target));
14600 emit_insn (extract_lo (vlo, target));
14601 vec[0] = idx_hi;
14602 vec[1] = idx;
14603 vec[2] = GEN_INT (n_elts/2);
14604 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
14605 ix86_expand_vector_set_var (vhi, val, idx_hi);
14606 ix86_expand_vector_set_var (vlo, val, idx);
14607 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
14608 return;
14609 }
14610
14611 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
14612 {
14613 switch (mode)
14614 {
14615 case E_V2DFmode:
14616 cmp_mode = V2DImode;
14617 break;
14618 case E_V4DFmode:
14619 cmp_mode = V4DImode;
14620 break;
14621 case E_V8DFmode:
14622 cmp_mode = V8DImode;
14623 break;
14624 case E_V4SFmode:
14625 cmp_mode = V4SImode;
14626 break;
14627 case E_V8SFmode:
14628 cmp_mode = V8SImode;
14629 break;
14630 case E_V16SFmode:
14631 cmp_mode = V16SImode;
14632 break;
14633 default:
14634 gcc_unreachable ();
14635 }
14636 }
14637
14638 for (int i = 0; i != n_elts; i++)
14639 vec[i] = GEN_INT (i);
14640 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
14641 valv = gen_reg_rtx (mode);
14642 idxv = gen_reg_rtx (cmp_mode);
14643 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
14644
14645 ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
14646 gcc_assert (ok);
14647 ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
14648 gcc_assert (ok);
14649 vec[0] = target;
14650 vec[1] = valv;
14651 vec[2] = target;
14652 vec[3] = gen_rtx_EQ (mode, idxv, constv);
14653 vec[4] = idxv;
14654 vec[5] = constv;
14655 ok = ix86_expand_int_vcond (vec);
14656 gcc_assert (ok);
14657}
14658
2bf6d935
ML
14659void
14660ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14661{
14662 machine_mode mode = GET_MODE (target);
14663 machine_mode inner_mode = GET_MODE_INNER (mode);
14664 machine_mode half_mode;
14665 bool use_vec_merge = false;
14666 rtx tmp;
14667 static rtx (*gen_extract[6][2]) (rtx, rtx)
14668 = {
14669 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14670 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14671 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14672 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14673 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14674 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14675 };
14676 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14677 = {
14678 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14679 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14680 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14681 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14682 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14683 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14684 };
14685 int i, j, n;
14686 machine_mode mmode = VOIDmode;
14687 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14688
14689 switch (mode)
14690 {
2bf6d935 14691 case E_V2SImode:
f15c7bd1
UB
14692 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14693 if (use_vec_merge)
14694 break;
14695 /* FALLTHRU */
14696
14697 case E_V2SFmode:
2bf6d935
ML
14698 if (mmx_ok)
14699 {
14700 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14701 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14702 if (elt == 0)
14703 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14704 else
14705 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14706 emit_insn (gen_rtx_SET (target, tmp));
14707 return;
14708 }
14709 break;
14710
14711 case E_V2DImode:
14712 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14713 if (use_vec_merge)
14714 break;
14715
14716 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14717 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14718 if (elt == 0)
14719 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14720 else
14721 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14722 emit_insn (gen_rtx_SET (target, tmp));
14723 return;
14724
14725 case E_V2DFmode:
ac173024
L
14726 /* NB: For ELT == 0, use standard scalar operation patterns which
14727 preserve the rest of the vector for combiner:
14728
14729 (vec_merge:V2DF
14730 (vec_duplicate:V2DF (reg:DF))
14731 (reg:V2DF)
14732 (const_int 1))
14733 */
14734 if (elt == 0)
14735 goto do_vec_merge;
14736
2bf6d935
ML
14737 {
14738 rtx op0, op1;
14739
14740 /* For the two element vectors, we implement a VEC_CONCAT with
14741 the extraction of the other element. */
14742
14743 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14744 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14745
14746 if (elt == 0)
14747 op0 = val, op1 = tmp;
14748 else
14749 op0 = tmp, op1 = val;
14750
14751 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14752 emit_insn (gen_rtx_SET (target, tmp));
14753 }
14754 return;
14755
14756 case E_V4SFmode:
14757 use_vec_merge = TARGET_SSE4_1;
14758 if (use_vec_merge)
14759 break;
14760
14761 switch (elt)
14762 {
14763 case 0:
14764 use_vec_merge = true;
14765 break;
14766
14767 case 1:
14768 /* tmp = target = A B C D */
14769 tmp = copy_to_reg (target);
14770 /* target = A A B B */
14771 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14772 /* target = X A B B */
14773 ix86_expand_vector_set (false, target, val, 0);
14774 /* target = A X C D */
14775 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14776 const1_rtx, const0_rtx,
14777 GEN_INT (2+4), GEN_INT (3+4)));
14778 return;
14779
14780 case 2:
14781 /* tmp = target = A B C D */
14782 tmp = copy_to_reg (target);
14783 /* tmp = X B C D */
14784 ix86_expand_vector_set (false, tmp, val, 0);
14785 /* target = A B X D */
14786 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14787 const0_rtx, const1_rtx,
14788 GEN_INT (0+4), GEN_INT (3+4)));
14789 return;
14790
14791 case 3:
14792 /* tmp = target = A B C D */
14793 tmp = copy_to_reg (target);
14794 /* tmp = X B C D */
14795 ix86_expand_vector_set (false, tmp, val, 0);
 14796	  /* target = A B C X */
14797 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14798 const0_rtx, const1_rtx,
14799 GEN_INT (2+4), GEN_INT (0+4)));
14800 return;
14801
14802 default:
14803 gcc_unreachable ();
14804 }
14805 break;
14806
14807 case E_V4SImode:
14808 use_vec_merge = TARGET_SSE4_1;
14809 if (use_vec_merge)
14810 break;
14811
14812 /* Element 0 handled by vec_merge below. */
14813 if (elt == 0)
14814 {
14815 use_vec_merge = true;
14816 break;
14817 }
14818
14819 if (TARGET_SSE2)
14820 {
14821 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14822 store into element 0, then shuffle them back. */
14823
14824 rtx order[4];
14825
14826 order[0] = GEN_INT (elt);
14827 order[1] = const1_rtx;
14828 order[2] = const2_rtx;
14829 order[3] = GEN_INT (3);
14830 order[elt] = const0_rtx;
14831
14832 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14833 order[1], order[2], order[3]));
14834
14835 ix86_expand_vector_set (false, target, val, 0);
14836
14837 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14838 order[1], order[2], order[3]));
14839 }
14840 else
14841 {
14842 /* For SSE1, we have to reuse the V4SF code. */
14843 rtx t = gen_reg_rtx (V4SFmode);
14844 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14845 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14846 emit_move_insn (target, gen_lowpart (mode, t));
14847 }
14848 return;
14849
14850 case E_V8HImode:
14851 use_vec_merge = TARGET_SSE2;
14852 break;
14853 case E_V4HImode:
14854 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14855 break;
14856
14857 case E_V16QImode:
14858 use_vec_merge = TARGET_SSE4_1;
14859 break;
14860
14861 case E_V8QImode:
f15c7bd1 14862 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935
ML
14863 break;
14864
14865 case E_V32QImode:
14866 half_mode = V16QImode;
14867 j = 0;
14868 n = 16;
14869 goto half;
14870
14871 case E_V16HImode:
14872 half_mode = V8HImode;
14873 j = 1;
14874 n = 8;
14875 goto half;
14876
14877 case E_V8SImode:
14878 half_mode = V4SImode;
14879 j = 2;
14880 n = 4;
14881 goto half;
14882
14883 case E_V4DImode:
14884 half_mode = V2DImode;
14885 j = 3;
14886 n = 2;
14887 goto half;
14888
14889 case E_V8SFmode:
14890 half_mode = V4SFmode;
14891 j = 4;
14892 n = 4;
14893 goto half;
14894
14895 case E_V4DFmode:
14896 half_mode = V2DFmode;
14897 j = 5;
14898 n = 2;
14899 goto half;
14900
14901half:
14902 /* Compute offset. */
14903 i = elt / n;
14904 elt %= n;
14905
14906 gcc_assert (i <= 1);
14907
14908 /* Extract the half. */
14909 tmp = gen_reg_rtx (half_mode);
14910 emit_insn (gen_extract[j][i] (tmp, target));
14911
14912 /* Put val in tmp at elt. */
14913 ix86_expand_vector_set (false, tmp, val, elt);
14914
14915 /* Put it back. */
14916 emit_insn (gen_insert[j][i] (target, target, tmp));
14917 return;
14918
14919 case E_V8DFmode:
14920 if (TARGET_AVX512F)
14921 {
14922 mmode = QImode;
14923 gen_blendm = gen_avx512f_blendmv8df;
14924 }
14925 break;
14926
14927 case E_V8DImode:
14928 if (TARGET_AVX512F)
14929 {
14930 mmode = QImode;
14931 gen_blendm = gen_avx512f_blendmv8di;
14932 }
14933 break;
14934
14935 case E_V16SFmode:
14936 if (TARGET_AVX512F)
14937 {
14938 mmode = HImode;
14939 gen_blendm = gen_avx512f_blendmv16sf;
14940 }
14941 break;
14942
14943 case E_V16SImode:
14944 if (TARGET_AVX512F)
14945 {
14946 mmode = HImode;
14947 gen_blendm = gen_avx512f_blendmv16si;
14948 }
14949 break;
14950
14951 case E_V32HImode:
14952 if (TARGET_AVX512BW)
14953 {
14954 mmode = SImode;
14955 gen_blendm = gen_avx512bw_blendmv32hi;
14956 }
14957 else if (TARGET_AVX512F)
14958 {
14959 half_mode = E_V8HImode;
14960 n = 8;
14961 goto quarter;
14962 }
14963 break;
14964
14965 case E_V64QImode:
14966 if (TARGET_AVX512BW)
14967 {
14968 mmode = DImode;
14969 gen_blendm = gen_avx512bw_blendmv64qi;
14970 }
14971 else if (TARGET_AVX512F)
14972 {
14973 half_mode = E_V16QImode;
14974 n = 16;
14975 goto quarter;
14976 }
14977 break;
14978
14979quarter:
14980 /* Compute offset. */
14981 i = elt / n;
14982 elt %= n;
14983
14984 gcc_assert (i <= 3);
14985
14986 {
14987 /* Extract the quarter. */
14988 tmp = gen_reg_rtx (V4SImode);
14989 rtx tmp2 = gen_lowpart (V16SImode, target);
14990 rtx mask = gen_reg_rtx (QImode);
14991
14992 emit_move_insn (mask, constm1_rtx);
14993 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14994 tmp, mask));
14995
14996 tmp2 = gen_reg_rtx (half_mode);
14997 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14998 tmp = tmp2;
14999
15000 /* Put val in tmp at elt. */
15001 ix86_expand_vector_set (false, tmp, val, elt);
15002
15003 /* Put it back. */
15004 tmp2 = gen_reg_rtx (V16SImode);
15005 rtx tmp3 = gen_lowpart (V16SImode, target);
15006 mask = gen_reg_rtx (HImode);
15007 emit_move_insn (mask, constm1_rtx);
15008 tmp = gen_lowpart (V4SImode, tmp);
15009 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
15010 tmp3, mask));
15011 emit_move_insn (target, gen_lowpart (mode, tmp2));
15012 }
15013 return;
15014
15015 default:
15016 break;
15017 }
15018
15019 if (mmode != VOIDmode)
15020 {
15021 tmp = gen_reg_rtx (mode);
15022 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
15023 /* The avx512*_blendm<mode> expanders have different operand order
15024 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15025 elements where the mask is set and second input operand otherwise,
15026 in {sse,avx}*_*blend* the first input operand is used for elements
15027 where the mask is clear and second input operand otherwise. */
15028 emit_insn (gen_blendm (target, target, tmp,
15029 force_reg (mmode,
15030 gen_int_mode (HOST_WIDE_INT_1U << elt,
15031 mmode))));
15032 }
15033 else if (use_vec_merge)
15034 {
15035do_vec_merge:
15036 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
15037 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
15038 GEN_INT (HOST_WIDE_INT_1U << elt));
15039 emit_insn (gen_rtx_SET (target, tmp));
15040 }
15041 else
15042 {
15043 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15044
15045 emit_move_insn (mem, target);
15046
15047 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
15048 emit_move_insn (tmp, val);
15049
15050 emit_move_insn (target, mem);
15051 }
15052}
15053
15054void
15055ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
15056{
15057 machine_mode mode = GET_MODE (vec);
15058 machine_mode inner_mode = GET_MODE_INNER (mode);
15059 bool use_vec_extr = false;
15060 rtx tmp;
15061
15062 switch (mode)
15063 {
15064 case E_V2SImode:
15065 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15066 if (use_vec_extr)
15067 break;
15068 /* FALLTHRU */
15069
15070 case E_V2SFmode:
15071 if (!mmx_ok)
15072 break;
15073 /* FALLTHRU */
15074
15075 case E_V2DFmode:
15076 case E_V2DImode:
15077 case E_V2TImode:
15078 case E_V4TImode:
15079 use_vec_extr = true;
15080 break;
15081
15082 case E_V4SFmode:
15083 use_vec_extr = TARGET_SSE4_1;
15084 if (use_vec_extr)
15085 break;
15086
15087 switch (elt)
15088 {
15089 case 0:
15090 tmp = vec;
15091 break;
15092
15093 case 1:
15094 case 3:
15095 tmp = gen_reg_rtx (mode);
15096 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
15097 GEN_INT (elt), GEN_INT (elt),
15098 GEN_INT (elt+4), GEN_INT (elt+4)));
15099 break;
15100
15101 case 2:
15102 tmp = gen_reg_rtx (mode);
15103 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
15104 break;
15105
15106 default:
15107 gcc_unreachable ();
15108 }
15109 vec = tmp;
15110 use_vec_extr = true;
15111 elt = 0;
15112 break;
15113
15114 case E_V4SImode:
15115 use_vec_extr = TARGET_SSE4_1;
15116 if (use_vec_extr)
15117 break;
15118
15119 if (TARGET_SSE2)
15120 {
15121 switch (elt)
15122 {
15123 case 0:
15124 tmp = vec;
15125 break;
15126
15127 case 1:
15128 case 3:
15129 tmp = gen_reg_rtx (mode);
15130 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
15131 GEN_INT (elt), GEN_INT (elt),
15132 GEN_INT (elt), GEN_INT (elt)));
15133 break;
15134
15135 case 2:
15136 tmp = gen_reg_rtx (mode);
15137 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
15138 break;
15139
15140 default:
15141 gcc_unreachable ();
15142 }
15143 vec = tmp;
15144 use_vec_extr = true;
15145 elt = 0;
15146 }
15147 else
15148 {
15149 /* For SSE1, we have to reuse the V4SF code. */
15150 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
15151 gen_lowpart (V4SFmode, vec), elt);
15152 return;
15153 }
15154 break;
15155
15156 case E_V8HImode:
15157 use_vec_extr = TARGET_SSE2;
15158 break;
15159 case E_V4HImode:
15160 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15161 break;
15162
15163 case E_V16QImode:
15164 use_vec_extr = TARGET_SSE4_1;
15165 if (!use_vec_extr
15166 && TARGET_SSE2
15167 && elt == 0
15168 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
15169 {
15170 tmp = gen_reg_rtx (SImode);
15171 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
15172 0);
15173 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
15174 return;
15175 }
15176 break;
15177
15178 case E_V8SFmode:
15179 if (TARGET_AVX)
15180 {
15181 tmp = gen_reg_rtx (V4SFmode);
15182 if (elt < 4)
15183 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
15184 else
15185 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
15186 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15187 return;
15188 }
15189 break;
15190
15191 case E_V4DFmode:
15192 if (TARGET_AVX)
15193 {
15194 tmp = gen_reg_rtx (V2DFmode);
15195 if (elt < 2)
15196 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
15197 else
15198 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
15199 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15200 return;
15201 }
15202 break;
15203
15204 case E_V32QImode:
15205 if (TARGET_AVX)
15206 {
15207 tmp = gen_reg_rtx (V16QImode);
15208 if (elt < 16)
15209 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
15210 else
15211 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
15212 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15213 return;
15214 }
15215 break;
15216
15217 case E_V16HImode:
15218 if (TARGET_AVX)
15219 {
15220 tmp = gen_reg_rtx (V8HImode);
15221 if (elt < 8)
15222 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
15223 else
15224 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
15225 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15226 return;
15227 }
15228 break;
15229
15230 case E_V8SImode:
15231 if (TARGET_AVX)
15232 {
15233 tmp = gen_reg_rtx (V4SImode);
15234 if (elt < 4)
15235 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
15236 else
15237 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
15238 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15239 return;
15240 }
15241 break;
15242
15243 case E_V4DImode:
15244 if (TARGET_AVX)
15245 {
15246 tmp = gen_reg_rtx (V2DImode);
15247 if (elt < 2)
15248 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
15249 else
15250 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
15251 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15252 return;
15253 }
15254 break;
15255
15256 case E_V32HImode:
15257 if (TARGET_AVX512BW)
15258 {
15259 tmp = gen_reg_rtx (V16HImode);
15260 if (elt < 16)
15261 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
15262 else
15263 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
15264 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15265 return;
15266 }
15267 break;
15268
15269 case E_V64QImode:
15270 if (TARGET_AVX512BW)
15271 {
15272 tmp = gen_reg_rtx (V32QImode);
15273 if (elt < 32)
15274 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
15275 else
15276 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
15277 ix86_expand_vector_extract (false, target, tmp, elt & 31);
15278 return;
15279 }
15280 break;
15281
15282 case E_V16SFmode:
15283 tmp = gen_reg_rtx (V8SFmode);
15284 if (elt < 8)
15285 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
15286 else
15287 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
15288 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15289 return;
15290
15291 case E_V8DFmode:
15292 tmp = gen_reg_rtx (V4DFmode);
15293 if (elt < 4)
15294 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
15295 else
15296 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
15297 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15298 return;
15299
15300 case E_V16SImode:
15301 tmp = gen_reg_rtx (V8SImode);
15302 if (elt < 8)
15303 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
15304 else
15305 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
15306 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15307 return;
15308
15309 case E_V8DImode:
15310 tmp = gen_reg_rtx (V4DImode);
15311 if (elt < 4)
15312 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
15313 else
15314 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
15315 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15316 return;
15317
15318 case E_V8QImode:
15319      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15320      /* ??? Could extract the appropriate HImode element and shift.  */
15321 break;
15322
15323 default:
15324 break;
15325 }
15326
15327 if (use_vec_extr)
15328 {
15329 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
15330 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
15331
15332 /* Let the rtl optimizers know about the zero extension performed. */
15333 if (inner_mode == QImode || inner_mode == HImode)
15334 {
15335 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
15336 target = gen_lowpart (SImode, target);
15337 }
15338
15339 emit_insn (gen_rtx_SET (target, tmp));
15340 }
15341 else
15342 {
15343 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15344
15345 emit_move_insn (mem, vec);
15346
15347 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
15348 emit_move_insn (target, tmp);
15349 }
15350}
15351
15352/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15353 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15354 The upper bits of DEST are undefined, though they shouldn't cause
15355 exceptions (some bits from src or all zeros are ok). */
15356
15357static void
15358emit_reduc_half (rtx dest, rtx src, int i)
15359{
15360 rtx tem, d = dest;
15361 switch (GET_MODE (src))
15362 {
15363 case E_V4SFmode:
15364 if (i == 128)
15365 tem = gen_sse_movhlps (dest, src, src);
15366 else
15367 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
15368 GEN_INT (1 + 4), GEN_INT (1 + 4));
15369 break;
15370 case E_V2DFmode:
15371 tem = gen_vec_interleave_highv2df (dest, src, src);
15372 break;
15373 case E_V16QImode:
15374 case E_V8HImode:
15375 case E_V4SImode:
15376 case E_V2DImode:
15377 d = gen_reg_rtx (V1TImode);
15378 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
15379 GEN_INT (i / 2));
15380 break;
15381 case E_V8SFmode:
15382 if (i == 256)
15383 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
15384 else
15385 tem = gen_avx_shufps256 (dest, src, src,
15386 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
15387 break;
15388 case E_V4DFmode:
15389 if (i == 256)
15390 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
15391 else
15392 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
15393 break;
15394 case E_V32QImode:
15395 case E_V16HImode:
15396 case E_V8SImode:
15397 case E_V4DImode:
15398 if (i == 256)
15399 {
15400 if (GET_MODE (dest) != V4DImode)
15401 d = gen_reg_rtx (V4DImode);
15402 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
15403 gen_lowpart (V4DImode, src),
15404 const1_rtx);
15405 }
15406 else
15407 {
15408 d = gen_reg_rtx (V2TImode);
15409 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
15410 GEN_INT (i / 2));
15411 }
15412 break;
15413 case E_V64QImode:
15414 case E_V32HImode:
15415 if (i < 64)
15416 {
15417 d = gen_reg_rtx (V4TImode);
15418 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
15419 GEN_INT (i / 2));
15420 break;
15421 }
15422 /* FALLTHRU */
15423 case E_V16SImode:
15424 case E_V16SFmode:
15425 case E_V8DImode:
15426 case E_V8DFmode:
15427 if (i > 128)
15428 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
15429 gen_lowpart (V16SImode, src),
15430 gen_lowpart (V16SImode, src),
15431 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
15432 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
15433 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
15434 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
15435 GEN_INT (0xC), GEN_INT (0xD),
15436 GEN_INT (0xE), GEN_INT (0xF),
15437 GEN_INT (0x10), GEN_INT (0x11),
15438 GEN_INT (0x12), GEN_INT (0x13),
15439 GEN_INT (0x14), GEN_INT (0x15),
15440 GEN_INT (0x16), GEN_INT (0x17));
15441 else
15442 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
15443 gen_lowpart (V16SImode, src),
15444 GEN_INT (i == 128 ? 0x2 : 0x1),
15445 GEN_INT (0x3),
15446 GEN_INT (0x3),
15447 GEN_INT (0x3),
15448 GEN_INT (i == 128 ? 0x6 : 0x5),
15449 GEN_INT (0x7),
15450 GEN_INT (0x7),
15451 GEN_INT (0x7),
15452 GEN_INT (i == 128 ? 0xA : 0x9),
15453 GEN_INT (0xB),
15454 GEN_INT (0xB),
15455 GEN_INT (0xB),
15456 GEN_INT (i == 128 ? 0xE : 0xD),
15457 GEN_INT (0xF),
15458 GEN_INT (0xF),
15459 GEN_INT (0xF));
15460 break;
15461 default:
15462 gcc_unreachable ();
15463 }
15464 emit_insn (tem);
15465 if (d != dest)
15466 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15467}
15468
15469/* Expand a vector reduction. FN is the binary pattern to reduce;
15470 DEST is the destination; IN is the input vector. */
15471
15472void
15473ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15474{
15475 rtx half, dst, vec = in;
15476 machine_mode mode = GET_MODE (in);
15477 int i;
15478
15479 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15480 if (TARGET_SSE4_1
15481 && mode == V8HImode
15482 && fn == gen_uminv8hi3)
15483 {
15484 emit_insn (gen_sse4_1_phminposuw (dest, in));
15485 return;
15486 }
15487
15488 for (i = GET_MODE_BITSIZE (mode);
15489 i > GET_MODE_UNIT_BITSIZE (mode);
15490 i >>= 1)
15491 {
15492 half = gen_reg_rtx (mode);
15493 emit_reduc_half (half, vec, i);
15494 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15495 dst = dest;
15496 else
15497 dst = gen_reg_rtx (mode);
15498 emit_insn (fn (dst, half, vec));
15499 vec = dst;
15500 }
15501}
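/* Editorial note (illustration only, not part of the original source):
   the loop above folds the vector in half log2(nelts) times.  A scalar
   sketch of the same strategy, assuming a power-of-two element count:

	int reduce_add (int *v, int n)
	{
	  for (int half = n / 2; half >= 1; half /= 2)
	    for (int k = 0; k < half; k++)
	      v[k] += v[k + half];	// fold the upper half into the lower half
	  return v[0];
	}

   emit_reduc_half provides the "move the upper half down" step and FN
   combines the two halves element-wise.  */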
15502
15503/* Output code to perform a conditional jump to LABEL, if C2 flag in
15504 FP status register is set. */
15505
15506void
15507ix86_emit_fp_unordered_jump (rtx label)
15508{
15509 rtx reg = gen_reg_rtx (HImode);
15510 rtx_insn *insn;
15511 rtx temp;
15512
15513 emit_insn (gen_x86_fnstsw_1 (reg));
15514
15515 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15516 {
15517 emit_insn (gen_x86_sahf_1 (reg));
15518
15519 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15520 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15521 }
15522 else
15523 {
15524 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15525
15526 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15527 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15528 }
15529
15530 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15531 gen_rtx_LABEL_REF (VOIDmode, label),
15532 pc_rtx);
15533 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15534 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15535 JUMP_LABEL (insn) = label;
15536}
15537
15538/* Output code to perform a sinh XFmode calculation.  */
15539
15540void ix86_emit_i387_sinh (rtx op0, rtx op1)
15541{
15542 rtx e1 = gen_reg_rtx (XFmode);
15543 rtx e2 = gen_reg_rtx (XFmode);
15544 rtx scratch = gen_reg_rtx (HImode);
15545 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15546 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15547 rtx cst1, tmp;
15548 rtx_code_label *jump_label = gen_label_rtx ();
15549 rtx_insn *insn;
15550
15551 /* scratch = fxam (op1) */
15552 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15553
15554 /* e1 = expm1 (|op1|) */
15555 emit_insn (gen_absxf2 (e2, op1));
15556 emit_insn (gen_expm1xf2 (e1, e2));
15557
15558 /* e2 = e1 / (e1 + 1.0) + e1 */
15559 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15560 emit_insn (gen_addxf3 (e2, e1, cst1));
15561 emit_insn (gen_divxf3 (e2, e1, e2));
15562 emit_insn (gen_addxf3 (e2, e2, e1));
15563
15564 /* flags = signbit (op1) */
15565 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15566
15567 /* if (flags) then e2 = -e2 */
15568 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15569 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15570 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15571 pc_rtx);
15572 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15573 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15574 JUMP_LABEL (insn) = jump_label;
15575
15576 emit_insn (gen_negxf2 (e2, e2));
15577
15578 emit_label (jump_label);
15579 LABEL_NUSES (jump_label) = 1;
15580
15581 /* op0 = 0.5 * e2 */
15582 half = force_reg (XFmode, half);
15583 emit_insn (gen_mulxf3 (op0, e2, half));
15584}
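/* Editorial note (illustration only, not part of the original source):
   the identity used above, as a scalar sketch assuming <math.h>:

	double sinh_ref (double x)
	{
	  double e = expm1 (fabs (x));		// e^|x| - 1
	  double r = 0.5 * (e / (e + 1.0) + e);	// 0.5 * (e^|x| - e^-|x|)
	  return signbit (x) ? -r : r;
	}

   Since e / (e + 1.0) == 1 - e^-|x|, this avoids the cancellation a
   direct 0.5 * (exp (x) - exp (-x)) would suffer for small |x|.  */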
15585
15586/* Output code to perform a cosh XFmode calculation.  */
15587
15588void ix86_emit_i387_cosh (rtx op0, rtx op1)
15589{
15590 rtx e1 = gen_reg_rtx (XFmode);
15591 rtx e2 = gen_reg_rtx (XFmode);
15592 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15593 rtx cst1;
15594
15595 /* e1 = exp (op1) */
15596 emit_insn (gen_expxf2 (e1, op1));
15597
15598 /* e2 = e1 + 1.0 / e1 */
15599 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15600 emit_insn (gen_divxf3 (e2, cst1, e1));
15601 emit_insn (gen_addxf3 (e2, e1, e2));
15602
15603 /* op0 = 0.5 * e2 */
15604 half = force_reg (XFmode, half);
15605 emit_insn (gen_mulxf3 (op0, e2, half));
15606}
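/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the identity used above, assuming <math.h>:

	double cosh_ref (double x)
	{
	  double e = exp (x);
	  return 0.5 * (e + 1.0 / e);	// 0.5 * (e^x + e^-x)
	}
*/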
15607
15608/* Output code to perform a tanh XFmode calculation.  */
15609
15610void ix86_emit_i387_tanh (rtx op0, rtx op1)
15611{
15612 rtx e1 = gen_reg_rtx (XFmode);
15613 rtx e2 = gen_reg_rtx (XFmode);
15614 rtx scratch = gen_reg_rtx (HImode);
15615 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15616 rtx cst2, tmp;
15617 rtx_code_label *jump_label = gen_label_rtx ();
15618 rtx_insn *insn;
15619
15620 /* scratch = fxam (op1) */
15621 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15622
15623 /* e1 = expm1 (-|2 * op1|) */
15624 emit_insn (gen_addxf3 (e2, op1, op1));
15625 emit_insn (gen_absxf2 (e2, e2));
15626 emit_insn (gen_negxf2 (e2, e2));
15627 emit_insn (gen_expm1xf2 (e1, e2));
15628
15629 /* e2 = e1 / (e1 + 2.0) */
15630 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15631 emit_insn (gen_addxf3 (e2, e1, cst2));
15632 emit_insn (gen_divxf3 (e2, e1, e2));
15633
15634 /* flags = signbit (op1) */
15635 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15636
15637 /* if (!flags) then e2 = -e2 */
15638 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15639 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15640 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15641 pc_rtx);
15642 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15643 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15644 JUMP_LABEL (insn) = jump_label;
15645
15646 emit_insn (gen_negxf2 (e2, e2));
15647
15648 emit_label (jump_label);
15649 LABEL_NUSES (jump_label) = 1;
15650
15651 emit_move_insn (op0, e2);
15652}
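/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the identity used above, assuming <math.h>:

	double tanh_ref (double x)
	{
	  double e = expm1 (-fabs (2.0 * x));	// e^(-2|x|) - 1
	  double r = -e / (e + 2.0);		// == tanh (|x|)
	  return signbit (x) ? -r : r;
	}

   The expander keeps e / (e + 2.0) and folds the negation into the
   final sign handling, which yields the same value.  */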
15653
15654/* Output code to perform an asinh XFmode calculation. */
15655
15656void ix86_emit_i387_asinh (rtx op0, rtx op1)
15657{
15658 rtx e1 = gen_reg_rtx (XFmode);
15659 rtx e2 = gen_reg_rtx (XFmode);
15660 rtx scratch = gen_reg_rtx (HImode);
15661 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15662 rtx cst1, tmp;
15663 rtx_code_label *jump_label = gen_label_rtx ();
15664 rtx_insn *insn;
15665
15666 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15667 emit_insn (gen_mulxf3 (e1, op1, op1));
15668 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15669 emit_insn (gen_addxf3 (e2, e1, cst1));
15670 emit_insn (gen_sqrtxf2 (e2, e2));
15671 emit_insn (gen_addxf3 (e2, e2, cst1));
15672
15673 /* e1 = e1 / e2 */
15674 emit_insn (gen_divxf3 (e1, e1, e2));
15675
15676 /* scratch = fxam (op1) */
15677 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15678
15679 /* e1 = e1 + |op1| */
15680 emit_insn (gen_absxf2 (e2, op1));
15681 emit_insn (gen_addxf3 (e1, e1, e2));
15682
15683 /* e2 = log1p (e1) */
15684 ix86_emit_i387_log1p (e2, e1);
15685
15686 /* flags = signbit (op1) */
15687 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15688
15689 /* if (flags) then e2 = -e2 */
15690 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15691 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15692 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15693 pc_rtx);
15694 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15695 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15696 JUMP_LABEL (insn) = jump_label;
15697
15698 emit_insn (gen_negxf2 (e2, e2));
15699
15700 emit_label (jump_label);
15701 LABEL_NUSES (jump_label) = 1;
15702
15703 emit_move_insn (op0, e2);
15704}
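/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the identity used above, assuming <math.h>:

	double asinh_ref (double x)
	{
	  double ax = fabs (x);
	  double t = ax * ax / (sqrt (ax * ax + 1.0) + 1.0); // sqrt (x*x + 1) - 1
	  double r = log1p (t + ax);	// log (|x| + sqrt (x*x + 1))
	  return signbit (x) ? -r : r;
	}
*/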
15705
15706/* Output code to perform an acosh XFmode calculation. */
15707
15708void ix86_emit_i387_acosh (rtx op0, rtx op1)
15709{
15710 rtx e1 = gen_reg_rtx (XFmode);
15711 rtx e2 = gen_reg_rtx (XFmode);
15712 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15713
15714 /* e2 = sqrt (op1 + 1.0) */
15715 emit_insn (gen_addxf3 (e2, op1, cst1));
15716 emit_insn (gen_sqrtxf2 (e2, e2));
15717
15718 /* e1 = sqrt (op1 - 1.0) */
15719 emit_insn (gen_subxf3 (e1, op1, cst1));
15720 emit_insn (gen_sqrtxf2 (e1, e1));
15721
15722 /* e1 = e1 * e2 */
15723 emit_insn (gen_mulxf3 (e1, e1, e2));
15724
15725 /* e1 = e1 + op1 */
15726 emit_insn (gen_addxf3 (e1, e1, op1));
15727
15728 /* op0 = log (e1) */
15729 emit_insn (gen_logxf2 (op0, e1));
15730}
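/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the identity used above for x >= 1, assuming <math.h>:

	double acosh_ref (double x)
	{
	  return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
	}

   sqrt (x - 1.0) * sqrt (x + 1.0) equals sqrt (x*x - 1.0) but loses
   less precision when x is close to 1.  */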
15731
15732/* Output code to perform an atanh XFmode calculation. */
15733
15734void ix86_emit_i387_atanh (rtx op0, rtx op1)
15735{
15736 rtx e1 = gen_reg_rtx (XFmode);
15737 rtx e2 = gen_reg_rtx (XFmode);
15738 rtx scratch = gen_reg_rtx (HImode);
15739 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15740 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15741 rtx cst1, tmp;
15742 rtx_code_label *jump_label = gen_label_rtx ();
15743 rtx_insn *insn;
15744
15745 /* scratch = fxam (op1) */
15746 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15747
15748 /* e2 = |op1| */
15749 emit_insn (gen_absxf2 (e2, op1));
15750
15751 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15752 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15753 emit_insn (gen_addxf3 (e1, e2, cst1));
15754 emit_insn (gen_addxf3 (e2, e2, e2));
15755 emit_insn (gen_negxf2 (e2, e2));
15756 emit_insn (gen_divxf3 (e1, e2, e1));
15757
15758 /* e2 = log1p (e1) */
15759 ix86_emit_i387_log1p (e2, e1);
15760
15761 /* flags = signbit (op1) */
15762 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15763
15764 /* if (!flags) then e2 = -e2 */
15765 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15766 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15767 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15768 pc_rtx);
15769 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15770 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15771 JUMP_LABEL (insn) = jump_label;
15772
15773 emit_insn (gen_negxf2 (e2, e2));
15774
15775 emit_label (jump_label);
15776 LABEL_NUSES (jump_label) = 1;
15777
15778 /* op0 = 0.5 * e2 */
15779 half = force_reg (XFmode, half);
15780 emit_insn (gen_mulxf3 (op0, e2, half));
15781}
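/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the identity used above for |x| < 1, assuming <math.h>:

	double atanh_ref (double x)
	{
	  double ax = fabs (x);
	  double r = -0.5 * log1p (-2.0 * ax / (ax + 1.0));
	  return signbit (x) ? -r : r;	// == 0.5 * log ((1 + x) / (1 - x))
	}
*/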
15782
15783/* Output code to perform a log1p XFmode calculation. */
15784
15785void ix86_emit_i387_log1p (rtx op0, rtx op1)
15786{
15787 rtx_code_label *label1 = gen_label_rtx ();
15788 rtx_code_label *label2 = gen_label_rtx ();
15789
15790 rtx tmp = gen_reg_rtx (XFmode);
15791 rtx res = gen_reg_rtx (XFmode);
15792 rtx cst, cstln2, cst1;
15793 rtx_insn *insn;
15794
15795 cst = const_double_from_real_value
15796 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15797 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15798
15799 emit_insn (gen_absxf2 (tmp, op1));
15800
15801 cst = force_reg (XFmode, cst);
15802 ix86_expand_branch (GE, tmp, cst, label1);
15803 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15804 insn = get_last_insn ();
15805 JUMP_LABEL (insn) = label1;
15806
15807 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15808 emit_jump (label2);
15809
15810 emit_label (label1);
15811 LABEL_NUSES (label1) = 1;
15812
15813 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15814 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15815 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15816
15817 emit_label (label2);
15818 LABEL_NUSES (label2) = 1;
15819
15820 emit_move_insn (op0, res);
15821}
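/* Editorial note (illustration only, not part of the original source):
   the threshold constant above is 1 - sqrt(2)/2 ~= 0.2928932188, the
   documented input range limit of the fyl2xp1 instruction.  Both
   branches compute the same value,

	log1p (x) = ln (2) * log2 (1 + x),

   but fyl2xp1 takes x directly and stays accurate for small |x|,
   while the fallback forms 1 + x explicitly and uses fyl2x.  */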
15822
15823/* Emit code for round calculation. */
15824void ix86_emit_i387_round (rtx op0, rtx op1)
15825{
15826 machine_mode inmode = GET_MODE (op1);
15827 machine_mode outmode = GET_MODE (op0);
15828 rtx e1 = gen_reg_rtx (XFmode);
15829 rtx e2 = gen_reg_rtx (XFmode);
15830 rtx scratch = gen_reg_rtx (HImode);
15831 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15832 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15833 rtx res = gen_reg_rtx (outmode);
15834 rtx_code_label *jump_label = gen_label_rtx ();
15835 rtx (*floor_insn) (rtx, rtx);
15836 rtx (*neg_insn) (rtx, rtx);
15837 rtx_insn *insn;
15838 rtx tmp;
15839
15840 switch (inmode)
15841 {
15842 case E_SFmode:
15843 case E_DFmode:
15844 tmp = gen_reg_rtx (XFmode);
15845
15846 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15847 op1 = tmp;
15848 break;
15849 case E_XFmode:
15850 break;
15851 default:
15852 gcc_unreachable ();
15853 }
15854
15855 switch (outmode)
15856 {
15857 case E_SFmode:
15858 floor_insn = gen_frndintxf2_floor;
15859 neg_insn = gen_negsf2;
15860 break;
15861 case E_DFmode:
15862 floor_insn = gen_frndintxf2_floor;
15863 neg_insn = gen_negdf2;
15864 break;
15865 case E_XFmode:
15866 floor_insn = gen_frndintxf2_floor;
15867 neg_insn = gen_negxf2;
15868 break;
15869 case E_HImode:
15870 floor_insn = gen_lfloorxfhi2;
15871 neg_insn = gen_neghi2;
15872 break;
15873 case E_SImode:
15874 floor_insn = gen_lfloorxfsi2;
15875 neg_insn = gen_negsi2;
15876 break;
15877 case E_DImode:
15878 floor_insn = gen_lfloorxfdi2;
15879 neg_insn = gen_negdi2;
15880 break;
15881 default:
15882 gcc_unreachable ();
15883 }
15884
15885 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15886
15887 /* scratch = fxam(op1) */
15888 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15889
15890 /* e1 = fabs(op1) */
15891 emit_insn (gen_absxf2 (e1, op1));
15892
15893 /* e2 = e1 + 0.5 */
15894 half = force_reg (XFmode, half);
15895 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15896
15897 /* res = floor(e2) */
15898 switch (outmode)
15899 {
15900 case E_SFmode:
15901 case E_DFmode:
15902 {
15903 tmp = gen_reg_rtx (XFmode);
15904
15905 emit_insn (floor_insn (tmp, e2));
15906 emit_insn (gen_rtx_SET (res,
15907 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15908 UNSPEC_TRUNC_NOOP)));
15909 }
15910 break;
15911 default:
15912 emit_insn (floor_insn (res, e2));
15913 }
15914
15915 /* flags = signbit(a) */
15916 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15917
15918 /* if (flags) then res = -res */
15919 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15920 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15921 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15922 pc_rtx);
15923 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15924 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15925 JUMP_LABEL (insn) = jump_label;
15926
15927 emit_insn (neg_insn (res, res));
15928
15929 emit_label (jump_label);
15930 LABEL_NUSES (jump_label) = 1;
15931
15932 emit_move_insn (op0, res);
15933}
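/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the rounding rule implemented above, assuming <math.h>:

	double round_i387_ref (double x)
	{
	  double r = floor (fabs (x) + 0.5);	// halfway cases go away from zero
	  return signbit (x) ? -r : r;
	}
*/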
15934
15935/* Output code to perform a Newton-Raphson approximation of a single precision
15936 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15937
15938void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15939{
15940 rtx x0, x1, e0, e1;
15941
15942 x0 = gen_reg_rtx (mode);
15943 e0 = gen_reg_rtx (mode);
15944 e1 = gen_reg_rtx (mode);
15945 x1 = gen_reg_rtx (mode);
15946
15947 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15948
15949 b = force_reg (mode, b);
15950
15951 /* x0 = rcp(b) estimate */
15952 if (mode == V16SFmode || mode == V8DFmode)
15953 {
15954 if (TARGET_AVX512ER)
15955 {
15956 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15957 UNSPEC_RCP28)));
15958 /* res = a * x0 */
15959 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15960 return;
15961 }
15962 else
15963 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15964 UNSPEC_RCP14)));
15965 }
15966 else
15967 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15968 UNSPEC_RCP)));
15969
15970 /* e0 = x0 * b */
15971 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15972
15973 /* e0 = x0 * e0 */
15974 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15975
15976 /* e1 = x0 + x0 */
15977 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15978
15979 /* x1 = e1 - e0 */
15980 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15981
15982 /* res = a * x1 */
15983 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15984}
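/* Editorial note (illustration only, not part of the original source):
   the Newton-Raphson step used above, written as a scalar sketch:

	float swdiv_ref (float a, float b, float x0)	// x0 ~ 1/b estimate
	{
	  float x1 = 2.0f * x0 - b * x0 * x0;	// one refinement of 1/b
	  return a * x1;
	}

   One iteration roughly doubles the number of correct bits in the
   initial rcpps / rcp14 estimate; the rcp28 path above is already
   accurate enough to skip the refinement.  */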
15985
15986/* Output code to perform a Newton-Raphson approximation of a
15987 single precision floating point [reciprocal] square root. */
15988
15989void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15990{
15991 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15992 REAL_VALUE_TYPE r;
15993 int unspec;
15994
15995 x0 = gen_reg_rtx (mode);
15996 e0 = gen_reg_rtx (mode);
15997 e1 = gen_reg_rtx (mode);
15998 e2 = gen_reg_rtx (mode);
15999 e3 = gen_reg_rtx (mode);
16000
16001 if (TARGET_AVX512ER && mode == V16SFmode)
16002 {
16003 if (recip)
16004 /* res = rsqrt28(a) estimate */
16005 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16006 UNSPEC_RSQRT28)));
16007 else
16008 {
16009 /* x0 = rsqrt28(a) estimate */
16010 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16011 UNSPEC_RSQRT28)));
16012 /* res = rcp28(x0) estimate */
16013 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
16014 UNSPEC_RCP28)));
16015 }
16016 return;
16017 }
16018
16019 real_from_integer (&r, VOIDmode, -3, SIGNED);
16020 mthree = const_double_from_real_value (r, SFmode);
16021
16022 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
16023 mhalf = const_double_from_real_value (r, SFmode);
16024 unspec = UNSPEC_RSQRT;
16025
16026 if (VECTOR_MODE_P (mode))
16027 {
16028 mthree = ix86_build_const_vector (mode, true, mthree);
16029 mhalf = ix86_build_const_vector (mode, true, mhalf);
16030 /* There is no 512-bit rsqrt. There is however rsqrt14. */
16031 if (GET_MODE_SIZE (mode) == 64)
16032 unspec = UNSPEC_RSQRT14;
16033 }
16034
16035 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16036 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16037
16038 a = force_reg (mode, a);
16039
16040 /* x0 = rsqrt(a) estimate */
16041 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16042 unspec)));
16043
16044 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
16045 if (!recip)
16046 {
16047 rtx zero = force_reg (mode, CONST0_RTX(mode));
16048 rtx mask;
16049
16050 /* Handle masked compare. */
16051 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
16052 {
16053 mask = gen_reg_rtx (HImode);
16054 /* Imm value 0x4 corresponds to not-equal comparison. */
16055 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
16056 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
16057 }
16058 else
16059 {
16060 mask = gen_reg_rtx (mode);
16061 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
16062 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
16063 }
16064 }
16065
16066 mthree = force_reg (mode, mthree);
16067
16068 /* e0 = x0 * a */
16069 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
16070
16071 unsigned vector_size = GET_MODE_SIZE (mode);
16072 if (TARGET_FMA
16073 || (TARGET_AVX512F && vector_size == 64)
16074 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
16075 emit_insn (gen_rtx_SET (e2,
16076 gen_rtx_FMA (mode, e0, x0, mthree)));
16077 else
16078 {
16079 /* e1 = e0 * x0 */
16080 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
16081
16082 /* e2 = e1 - 3. */
16083 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
16084 }
16085
16086 mhalf = force_reg (mode, mhalf);
16087 if (recip)
16088 /* e3 = -.5 * x0 */
16089 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
16090 else
16091 /* e3 = -.5 * e0 */
16092 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
16093 /* ret = e2 * e3 */
16094 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
16095}
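/* Editorial note (illustration only, not part of the original source):
   the Newton-Raphson step used above, as a scalar sketch:

	float rsqrt_ref (float a, float x0)	// x0 ~ 1/sqrt(a) estimate
	{
	  return 0.5f * x0 * (3.0f - a * x0 * x0);
	}

	float sqrt_ref (float a, float x0)
	{
	  return 0.5f * (a * x0) * (3.0f - a * x0 * x0);	// a * rsqrt (a)
	}

   The expander computes (a * x0 * x0 - 3.0) with an FMA when
   available and folds the sign change into the -0.5 factor.  */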
16096
16097/* Expand fabs (OP0) and return a new rtx that holds the result. The
16098 mask for masking out the sign-bit is stored in *SMASK, if that is
16099 non-null. */
16100
16101static rtx
16102ix86_expand_sse_fabs (rtx op0, rtx *smask)
16103{
16104 machine_mode vmode, mode = GET_MODE (op0);
16105 rtx xa, mask;
16106
16107 xa = gen_reg_rtx (mode);
16108 if (mode == SFmode)
16109 vmode = V4SFmode;
16110 else if (mode == DFmode)
16111 vmode = V2DFmode;
16112 else
16113 vmode = mode;
16114 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
16115 if (!VECTOR_MODE_P (mode))
16116 {
16117 /* We need to generate a scalar mode mask in this case. */
16118 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16119 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16120 mask = gen_reg_rtx (mode);
16121 emit_insn (gen_rtx_SET (mask, tmp));
16122 }
16123 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
16124
16125 if (smask)
16126 *smask = mask;
16127
16128 return xa;
16129}
16130
16131/* Expands a comparison of OP0 with OP1 using comparison code CODE,
16132 swapping the operands if SWAP_OPERANDS is true. The expanded
16133 code is a forward jump to a newly created label in case the
16134 comparison is true. The generated label rtx is returned. */
16135static rtx_code_label *
16136ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
16137 bool swap_operands)
16138{
16139 bool unordered_compare = ix86_unordered_fp_compare (code);
16140 rtx_code_label *label;
16141 rtx tmp, reg;
16142
16143 if (swap_operands)
16144 std::swap (op0, op1);
16145
16146 label = gen_label_rtx ();
16147 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
16148 if (unordered_compare)
16149 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
16150 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
16151 emit_insn (gen_rtx_SET (reg, tmp));
16152 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
16153 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
16154 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
16155 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16156 JUMP_LABEL (tmp) = label;
16157
16158 return label;
16159}
16160
16161/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16162 using comparison code CODE. Operands are swapped for the comparison if
16163 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
16164static rtx
16165ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
16166 bool swap_operands)
16167{
16168 rtx (*insn)(rtx, rtx, rtx, rtx);
16169 machine_mode mode = GET_MODE (op0);
16170 rtx mask = gen_reg_rtx (mode);
16171
16172 if (swap_operands)
16173 std::swap (op0, op1);
16174
16175 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
16176
16177 emit_insn (insn (mask, op0, op1,
16178 gen_rtx_fmt_ee (code, mode, op0, op1)));
16179 return mask;
16180}
16181
16182/* Expand copysign from SIGN to the positive value ABS_VALUE
16183 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
16184 the sign-bit. */
16185
16186static void
16187ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
16188{
16189 machine_mode mode = GET_MODE (sign);
16190 rtx sgn = gen_reg_rtx (mode);
16191 if (mask == NULL_RTX)
16192 {
16193 machine_mode vmode;
16194
16195 if (mode == SFmode)
16196 vmode = V4SFmode;
16197 else if (mode == DFmode)
16198 vmode = V2DFmode;
16199 else
16200 vmode = mode;
16201
16202 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
16203 if (!VECTOR_MODE_P (mode))
16204 {
16205 /* We need to generate a scalar mode mask in this case. */
16206 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16207 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16208 mask = gen_reg_rtx (mode);
16209 emit_insn (gen_rtx_SET (mask, tmp));
16210 }
16211 }
16212 else
16213 mask = gen_rtx_NOT (mode, mask);
16214 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
16215 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
16216}
16217
16218/* Expand SSE sequence for computing lround from OP1 storing
16219 into OP0. */
16220
16221void
16222ix86_expand_lround (rtx op0, rtx op1)
16223{
16224 /* C code for the stuff we're doing below:
16225 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16226 return (long)tmp;
16227 */
16228 machine_mode mode = GET_MODE (op1);
16229 const struct real_format *fmt;
16230 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16231 rtx adj;
16232
16233 /* load nextafter (0.5, 0.0) */
16234 fmt = REAL_MODE_FORMAT (mode);
16235 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16236 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16237
16238 /* adj = copysign (0.5, op1) */
16239 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
16240 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
16241
16242 /* adj = op1 + adj */
16243 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
16244
16245 /* op0 = (imode)adj */
16246 expand_fix (op0, adj, 0);
16247}
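/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the sequence above, assuming <math.h>:

	long lround_ref (double x)
	{
	  double adj = copysign (nextafter (0.5, 0.0), x);
	  return (long) (x + adj);	// the cast truncates toward zero
	}

   nextafter (0.5, 0.0) is used instead of 0.5 so that inputs just
   below a .5 boundary are not pushed across it by the addition.  */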
16248
16250/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
16250 into OPERAND0. */
16251
16252void
16253ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
16254{
16255 /* C code for the stuff we're doing below (for do_floor):
16256 xi = (long)op1;
16257 xi -= (double)xi > op1 ? 1 : 0;
16258 return xi;
16259 */
16260 machine_mode fmode = GET_MODE (op1);
16261 machine_mode imode = GET_MODE (op0);
16262 rtx ireg, freg, tmp;
16263 rtx_code_label *label;
16264
16265 /* reg = (long)op1 */
16266 ireg = gen_reg_rtx (imode);
16267 expand_fix (ireg, op1, 0);
16268
16269 /* freg = (double)reg */
16270 freg = gen_reg_rtx (fmode);
16271 expand_float (freg, ireg, 0);
16272
16273 /* ireg = (freg > op1) ? ireg - 1 : ireg */
16274 label = ix86_expand_sse_compare_and_jump (UNLE,
16275 freg, op1, !do_floor);
16276 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
16277 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
16278 emit_move_insn (ireg, tmp);
16279
16280 emit_label (label);
16281 LABEL_NUSES (label) = 1;
16282
16283 emit_move_insn (op0, ireg);
16284}
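/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the floor variant above:

	long lfloor_ref (double x)
	{
	  long i = (long) x;		// truncates toward zero
	  return i - ((double) i > x);	// truncation went up for negative x: step back
	}

   The ceil variant instead adds 1 when (double) i < x.  */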
16285
16286/* Generate and return a rtx of mode MODE for 2**n where n is the number
16287 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
16288
16289static rtx
16290ix86_gen_TWO52 (machine_mode mode)
16291{
16292 REAL_VALUE_TYPE TWO52r;
16293 rtx TWO52;
16294
16295 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
16296 TWO52 = const_double_from_real_value (TWO52r, mode);
16297 TWO52 = force_reg (mode, TWO52);
16298
16299 return TWO52;
16300}
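/* Editorial note (illustration only, not part of the original source):
   2**52 (DFmode) and 2**23 (SFmode) are the smallest magnitudes at
   which the significand has no room left for fraction bits, so any
   |x| >= TWO52 is already an integer and the rounding expanders below
   can return the input unchanged.  */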
16301
16302/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16303
16304void
16305ix86_expand_rint (rtx operand0, rtx operand1)
16306{
16307 /* C code for the stuff we're doing below:
16308 xa = fabs (operand1);
16309 if (!isless (xa, 2**52))
16310 return operand1;
16311 two52 = 2**52;
16312 if (flag_rounding_math)
16313 {
16314 two52 = copysign (two52, operand1);
16315 xa = operand1;
16316 }
16317 xa = xa + two52 - two52;
16318 return copysign (xa, operand1);
16319 */
16320 machine_mode mode = GET_MODE (operand0);
16321 rtx res, xa, TWO52, two52, mask;
16322 rtx_code_label *label;
16323
16324 res = gen_reg_rtx (mode);
16325 emit_move_insn (res, operand1);
16326
16327 /* xa = abs (operand1) */
16328 xa = ix86_expand_sse_fabs (res, &mask);
16329
16330 /* if (!isless (xa, TWO52)) goto label; */
16331 TWO52 = ix86_gen_TWO52 (mode);
16332 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16333
16334 two52 = TWO52;
16335 if (flag_rounding_math)
16336 {
16337 two52 = gen_reg_rtx (mode);
16338 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
16339 xa = res;
16340 }
16341
16342 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
16343 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
16344
16345 ix86_sse_copysign_to_positive (res, xa, res, mask);
16346
16347 emit_label (label);
16348 LABEL_NUSES (label) = 1;
16349
16350 emit_move_insn (operand0, res);
16351}
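/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the TWO52 trick above for round-to-nearest,
   assuming <math.h>:

	double rint_ref (double x)
	{
	  const double two52 = 4503599627370496.0;	// 2**52
	  double xa = fabs (x);
	  if (!(xa < two52))
	    return x;			// already an integer (or NaN)
	  xa = (xa + two52) - two52;	// the add's rounding does the work
	  return copysign (xa, x);
	}
*/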
16352
16353/* Expand SSE2 sequence for computing floor or ceil
16354 from OPERAND1 storing into OPERAND0. */
16355void
16356ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
16357{
16358 /* C code for the stuff we expand below.
16359 double xa = fabs (x), x2;
16360 if (!isless (xa, TWO52))
16361 return x;
16362 x2 = (double)(long)x;
16363 Compensate. Floor:
16364 if (x2 > x)
16365 x2 -= 1;
16366 Compensate. Ceil:
16367 if (x2 < x)
16368 x2 += 1;
16369 if (HONOR_SIGNED_ZEROS (mode))
16370 return copysign (x2, x);
16371 return x2;
16372 */
16373 machine_mode mode = GET_MODE (operand0);
16374 rtx xa, xi, TWO52, tmp, one, res, mask;
16375 rtx_code_label *label;
16376
16377 TWO52 = ix86_gen_TWO52 (mode);
16378
16379 /* Temporary for holding the result, initialized to the input
16380 operand to ease control flow. */
16381 res = gen_reg_rtx (mode);
16382 emit_move_insn (res, operand1);
16383
16384 /* xa = abs (operand1) */
16385 xa = ix86_expand_sse_fabs (res, &mask);
16386
16387 /* if (!isless (xa, TWO52)) goto label; */
16388 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16389
16390 /* xa = (double)(long)x */
16391 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16392 expand_fix (xi, res, 0);
16393 expand_float (xa, xi, 0);
16394
16395 /* generate 1.0 */
16396 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16397
16398 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16399 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16400 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16401 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16402 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16403 emit_move_insn (res, tmp);
16404
16405 if (HONOR_SIGNED_ZEROS (mode))
16406 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16407
16408 emit_label (label);
16409 LABEL_NUSES (label) = 1;
16410
16411 emit_move_insn (operand0, res);
16412}
16413
16414/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16415 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16416 that is only available on 64bit targets. */
16417 void
16418ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
16419{
16420 /* C code for the stuff we expand below.
16421 double xa = fabs (x), x2;
16422 if (!isless (xa, TWO52))
16423 return x;
16424 xa = xa + TWO52 - TWO52;
16425 x2 = copysign (xa, x);
16426 Compensate. Floor:
16427 if (x2 > x)
16428 x2 -= 1;
16429 Compensate. Ceil:
16430 if (x2 < x)
16431 x2 += 1;
16432 if (HONOR_SIGNED_ZEROS (mode))
16433 x2 = copysign (x2, x);
16434 return x2;
16435 */
16436 machine_mode mode = GET_MODE (operand0);
16437 rtx xa, TWO52, tmp, one, res, mask;
16438 rtx_code_label *label;
16439
16440 TWO52 = ix86_gen_TWO52 (mode);
16441
16442 /* Temporary for holding the result, initialized to the input
16443 operand to ease control flow. */
16444 res = gen_reg_rtx (mode);
16445 emit_move_insn (res, operand1);
16446
16447 /* xa = abs (operand1) */
16448 xa = ix86_expand_sse_fabs (res, &mask);
16449
16450 /* if (!isless (xa, TWO52)) goto label; */
16451 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16452
16453 /* xa = xa + TWO52 - TWO52; */
16454 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16455 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16456
16457 /* xa = copysign (xa, operand1) */
16458 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16459
16460 /* generate 1.0 */
16461 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16462
16463 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16464 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16465 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16466 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16467 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16468 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
16469 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16470 emit_move_insn (res, tmp);
16471
16472 emit_label (label);
16473 LABEL_NUSES (label) = 1;
16474
16475 emit_move_insn (operand0, res);
16476}
16477
16478/* Expand SSE sequence for computing trunc
16479 from OPERAND1 storing into OPERAND0. */
16480void
16481ix86_expand_trunc (rtx operand0, rtx operand1)
16482{
16483 /* C code for SSE variant we expand below.
16484 double xa = fabs (x), x2;
16485 if (!isless (xa, TWO52))
16486 return x;
16487 x2 = (double)(long)x;
16488 if (HONOR_SIGNED_ZEROS (mode))
16489 return copysign (x2, x);
16490 return x2;
16491 */
16492 machine_mode mode = GET_MODE (operand0);
16493 rtx xa, xi, TWO52, res, mask;
16494 rtx_code_label *label;
16495
16496 TWO52 = ix86_gen_TWO52 (mode);
16497
16498 /* Temporary for holding the result, initialized to the input
16499 operand to ease control flow. */
16500 res = gen_reg_rtx (mode);
16501 emit_move_insn (res, operand1);
16502
16503 /* xa = abs (operand1) */
16504 xa = ix86_expand_sse_fabs (res, &mask);
16505
16506 /* if (!isless (xa, TWO52)) goto label; */
16507 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16508
16509 /* x = (double)(long)x */
16510 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16511 expand_fix (xi, res, 0);
16512 expand_float (res, xi, 0);
16513
16514 if (HONOR_SIGNED_ZEROS (mode))
16515 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16516
16517 emit_label (label);
16518 LABEL_NUSES (label) = 1;
16519
16520 emit_move_insn (operand0, res);
16521}
16522
16523/* Expand SSE sequence for computing trunc from OPERAND1 storing
16524 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16525 that is only available on 64bit targets. */
16526void
16527ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16528{
16529 machine_mode mode = GET_MODE (operand0);
16530 rtx xa, mask, TWO52, one, res, smask, tmp;
16531 rtx_code_label *label;
16532
16533 /* C code for SSE variant we expand below.
16534 double xa = fabs (x), x2;
16535 if (!isless (xa, TWO52))
16536 return x;
16537 xa2 = xa + TWO52 - TWO52;
16538 Compensate:
16539 if (xa2 > xa)
16540 xa2 -= 1.0;
16541 x2 = copysign (xa2, x);
16542 return x2;
16543 */
16544
16545 TWO52 = ix86_gen_TWO52 (mode);
16546
16547 /* Temporary for holding the result, initialized to the input
16548 operand to ease control flow. */
16549 res = gen_reg_rtx (mode);
16550 emit_move_insn (res, operand1);
16551
16552 /* xa = abs (operand1) */
16553 xa = ix86_expand_sse_fabs (res, &smask);
16554
16555 /* if (!isless (xa, TWO52)) goto label; */
16556 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16557
16558 /* res = xa + TWO52 - TWO52; */
16559 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16560 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16561 emit_move_insn (res, tmp);
16562
16563 /* generate 1.0 */
16564 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16565
16566 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16567 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16568 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16569 tmp = expand_simple_binop (mode, MINUS,
16570 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16571 emit_move_insn (res, tmp);
16572
16573 /* res = copysign (res, operand1) */
16574 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16575
16576 emit_label (label);
16577 LABEL_NUSES (label) = 1;
16578
16579 emit_move_insn (operand0, res);
16580}
16581
16582/* Expand SSE sequence for computing round
16583 from OPERAND1 storing into OPERAND0. */
16584void
16585ix86_expand_round (rtx operand0, rtx operand1)
16586{
16587 /* C code for the stuff we're doing below:
16588 double xa = fabs (x);
16589 if (!isless (xa, TWO52))
16590 return x;
16591 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16592 return copysign (xa, x);
16593 */
16594 machine_mode mode = GET_MODE (operand0);
16595 rtx res, TWO52, xa, xi, half, mask;
16596 rtx_code_label *label;
16597 const struct real_format *fmt;
16598 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16599
16600 /* Temporary for holding the result, initialized to the input
16601 operand to ease control flow. */
16602 res = gen_reg_rtx (mode);
16603 emit_move_insn (res, operand1);
16604
16605 TWO52 = ix86_gen_TWO52 (mode);
16606 xa = ix86_expand_sse_fabs (res, &mask);
16607 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16608
16609 /* load nextafter (0.5, 0.0) */
16610 fmt = REAL_MODE_FORMAT (mode);
16611 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16612 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16613
16614 /* xa = xa + 0.5 */
16615 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16616 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16617
16618 /* xa = (double)(int64_t)xa */
16619 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16620 expand_fix (xi, xa, 0);
16621 expand_float (xa, xi, 0);
16622
16623 /* res = copysign (xa, operand1) */
16624 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16625
16626 emit_label (label);
16627 LABEL_NUSES (label) = 1;
16628
16629 emit_move_insn (operand0, res);
16630}
16631
16632/* Expand SSE sequence for computing round from OPERAND1 storing
16633 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16634 that is only available on 64bit targets. */
16635void
16636ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16637{
16638 /* C code for the stuff we expand below.
16639 double xa = fabs (x), xa2, x2;
16640 if (!isless (xa, TWO52))
16641 return x;
16642 Using the absolute value and copying back sign makes
16643 -0.0 -> -0.0 correct.
16644 xa2 = xa + TWO52 - TWO52;
16645 Compensate.
16646 dxa = xa2 - xa;
16647 if (dxa <= -0.5)
16648 xa2 += 1;
16649 else if (dxa > 0.5)
16650 xa2 -= 1;
16651 x2 = copysign (xa2, x);
16652 return x2;
16653 */
16654 machine_mode mode = GET_MODE (operand0);
16655 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16656 rtx_code_label *label;
16657
16658 TWO52 = ix86_gen_TWO52 (mode);
16659
16660 /* Temporary for holding the result, initialized to the input
16661 operand to ease control flow. */
16662 res = gen_reg_rtx (mode);
16663 emit_move_insn (res, operand1);
16664
16665 /* xa = abs (operand1) */
16666 xa = ix86_expand_sse_fabs (res, &mask);
16667
16668 /* if (!isless (xa, TWO52)) goto label; */
16669 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16670
16671 /* xa2 = xa + TWO52 - TWO52; */
16672 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16673 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16674
16675 /* dxa = xa2 - xa; */
16676 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16677
16678 /* generate 0.5, 1.0 and -0.5 */
16679 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16680 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16681 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16682 0, OPTAB_DIRECT);
16683
16684 /* Compensate. */
16685 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16686 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16687 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16688 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16689 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16690 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16691 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16692 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16693
16694 /* res = copysign (xa2, operand1) */
16695 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16696
16697 emit_label (label);
16698 LABEL_NUSES (label) = 1;
16699
16700 emit_move_insn (operand0, res);
16701}
16702
16703/* Expand SSE sequence for computing round
16704 from OP1 storing into OP0 using sse4 round insn. */
16705void
16706ix86_expand_round_sse4 (rtx op0, rtx op1)
16707{
16708 machine_mode mode = GET_MODE (op0);
16709 rtx e1, e2, res, half;
16710 const struct real_format *fmt;
16711 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16712 rtx (*gen_copysign) (rtx, rtx, rtx);
16713 rtx (*gen_round) (rtx, rtx, rtx);
16714
16715 switch (mode)
16716 {
16717 case E_SFmode:
16718 gen_copysign = gen_copysignsf3;
16719 gen_round = gen_sse4_1_roundsf2;
16720 break;
16721 case E_DFmode:
16722 gen_copysign = gen_copysigndf3;
16723 gen_round = gen_sse4_1_rounddf2;
16724 break;
16725 default:
16726 gcc_unreachable ();
16727 }
16728
16729 /* round (a) = trunc (a + copysign (0.5, a)) */
16730
16731 /* load nextafter (0.5, 0.0) */
16732 fmt = REAL_MODE_FORMAT (mode);
16733 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16734 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16735 half = const_double_from_real_value (pred_half, mode);
16736
16737 /* e1 = copysign (0.5, op1) */
16738 e1 = gen_reg_rtx (mode);
16739 emit_insn (gen_copysign (e1, half, op1));
16740
16741 /* e2 = op1 + e1 */
16742 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16743
16744 /* res = trunc (e2) */
16745 res = gen_reg_rtx (mode);
16746 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16747
16748 emit_move_insn (op0, res);
16749}
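/* Editorial note (illustration only, not part of the original source):
   scalar sketch of the sequence above, assuming <math.h>:

	double round_sse4_ref (double x)
	{
	  double half = nextafter (0.5, 0.0);	// 0.5 - 1 ulp
	  return trunc (x + copysign (half, x));
	}
*/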
16750
16751/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16752 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16753 insn every time. */
16754
16755static GTY(()) rtx_insn *vselect_insn;
16756
16757/* Initialize vselect_insn. */
16758
16759static void
16760init_vselect_insn (void)
16761{
16762 unsigned i;
16763 rtx x;
16764
16765 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16766 for (i = 0; i < MAX_VECT_LEN; ++i)
16767 XVECEXP (x, 0, i) = const0_rtx;
16768 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16769 const0_rtx), x);
16770 x = gen_rtx_SET (const0_rtx, x);
16771 start_sequence ();
16772 vselect_insn = emit_insn (x);
16773 end_sequence ();
16774}
16775
16776/* Construct (set target (vec_select op0 (parallel perm))) and
16777 return true if that's a valid instruction in the active ISA. */
16778
16779static bool
16780expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16781 unsigned nelt, bool testing_p)
16782{
16783 unsigned int i;
16784 rtx x, save_vconcat;
16785 int icode;
16786
16787 if (vselect_insn == NULL_RTX)
16788 init_vselect_insn ();
16789
16790 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16791 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16792 for (i = 0; i < nelt; ++i)
16793 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16794 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16795 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16796 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16797 SET_DEST (PATTERN (vselect_insn)) = target;
16798 icode = recog_memoized (vselect_insn);
16799
16800 if (icode >= 0 && !testing_p)
16801 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16802
16803 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16804 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16805 INSN_CODE (vselect_insn) = -1;
16806
16807 return icode >= 0;
16808}
16809
16810/* Similar, but generate a vec_concat from op0 and op1 as well. */
16811
16812static bool
16813expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16814 const unsigned char *perm, unsigned nelt,
16815 bool testing_p)
16816{
16817 machine_mode v2mode;
16818 rtx x;
16819 bool ok;
16820
16821 if (vselect_insn == NULL_RTX)
16822 init_vselect_insn ();
16823
16824 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16825 return false;
16826 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16827 PUT_MODE (x, v2mode);
16828 XEXP (x, 0) = op0;
16829 XEXP (x, 1) = op1;
16830 ok = expand_vselect (target, x, perm, nelt, testing_p);
16831 XEXP (x, 0) = const0_rtx;
16832 XEXP (x, 1) = const0_rtx;
16833 return ok;
16834}
16835
16836/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16837 using movss or movsd. */
16838static bool
16839expand_vec_perm_movs (struct expand_vec_perm_d *d)
16840{
16841 machine_mode vmode = d->vmode;
16842 unsigned i, nelt = d->nelt;
16843 rtx x;
16844
16845 if (d->one_operand_p)
16846 return false;
16847
16848 if (!(TARGET_SSE && vmode == V4SFmode)
16849      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16850 && !(TARGET_SSE2 && vmode == V2DFmode))
16851 return false;
16852
16853 /* Only the first element is changed. */
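  /* E.g. for V4SFmode this matches { 4, 1, 2, 3 } (take element 0 from
     op1, the rest from op0) and { 0, 5, 6, 7 } (element 0 from op0, the
     rest from op1).  */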
16854 if (d->perm[0] != nelt && d->perm[0] != 0)
16855 return false;
16856 for (i = 1; i < nelt; ++i)
16857 if (d->perm[i] != i + nelt - d->perm[0])
16858 return false;
16859
16860 if (d->testing_p)
16861 return true;
16862
16863 if (d->perm[0] == nelt)
16864 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16865 else
16866 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16867
16868 emit_insn (gen_rtx_SET (d->target, x));
16869
16870 return true;
16871}
16872
16873/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
16874 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16875
16876static bool
16877expand_vec_perm_blend (struct expand_vec_perm_d *d)
16878{
16879 machine_mode mmode, vmode = d->vmode;
16880 unsigned i, nelt = d->nelt;
16881 unsigned HOST_WIDE_INT mask;
16882 rtx target, op0, op1, maskop, x;
16883 rtx rperm[32], vperm;
16884
16885 if (d->one_operand_p)
16886 return false;
16887 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16888 && (TARGET_AVX512BW
16889 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16890 ;
16891 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16892 ;
16893 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16894 ;
16895 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16896 ;
16897 else
16898 return false;
16899
16900 /* This is a blend, not a permute. Elements must stay in their
16901 respective lanes. */
16902 for (i = 0; i < nelt; ++i)
16903 {
16904 unsigned e = d->perm[i];
16905 if (!(e == i || e == i + nelt))
16906 return false;
16907 }
16908
16909 if (d->testing_p)
16910 return true;
16911
16912 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16913 decision should be extracted elsewhere, so that we only try that
16914 sequence once all budget==3 options have been tried. */
16915 target = d->target;
16916 op0 = d->op0;
16917 op1 = d->op1;
16918 mask = 0;
16919
16920 switch (vmode)
16921 {
16922 case E_V8DFmode:
16923 case E_V16SFmode:
16924 case E_V4DFmode:
16925 case E_V8SFmode:
16926 case E_V2DFmode:
16927 case E_V4SFmode:
16928 case E_V8HImode:
16929 case E_V8SImode:
16930 case E_V32HImode:
16931 case E_V64QImode:
16932 case E_V16SImode:
16933 case E_V8DImode:
16934 for (i = 0; i < nelt; ++i)
16935	  mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16936 break;
16937
16938 case E_V2DImode:
16939 for (i = 0; i < 2; ++i)
16940 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16941 vmode = V8HImode;
16942 goto do_subreg;
16943
16944 case E_V4SImode:
16945 for (i = 0; i < 4; ++i)
16946 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
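      /* E.g. { 0, 5, 2, 7 } takes dwords 1 and 3 from op1; viewed as
	 words for pblendw, the immediate becomes 0xcc.  */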
16947 vmode = V8HImode;
16948 goto do_subreg;
16949
16950 case E_V16QImode:
16951 /* See if bytes move in pairs so we can use pblendw with
16952 an immediate argument, rather than pblendvb with a vector
16953 argument. */
16954 for (i = 0; i < 16; i += 2)
16955 if (d->perm[i] + 1 != d->perm[i + 1])
16956 {
16957 use_pblendvb:
16958 for (i = 0; i < nelt; ++i)
16959 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16960
16961 finish_pblendvb:
16962 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16963 vperm = force_reg (vmode, vperm);
16964
16965 if (GET_MODE_SIZE (vmode) == 16)
16966 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16967 else
16968 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16969 if (target != d->target)
16970 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16971 return true;
16972 }
16973
16974 for (i = 0; i < 8; ++i)
16975 mask |= (d->perm[i * 2] >= 16) << i;
16976 vmode = V8HImode;
16977 /* FALLTHRU */
16978
16979 do_subreg:
16980 target = gen_reg_rtx (vmode);
16981 op0 = gen_lowpart (vmode, op0);
16982 op1 = gen_lowpart (vmode, op1);
16983 break;
16984
16985 case E_V32QImode:
16986 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16987 for (i = 0; i < 32; i += 2)
16988 if (d->perm[i] + 1 != d->perm[i + 1])
16989 goto use_pblendvb;
16990 /* See if bytes move in quadruplets. If yes, vpblendd
16991 with immediate can be used. */
16992 for (i = 0; i < 32; i += 4)
16993 if (d->perm[i] + 2 != d->perm[i + 2])
16994 break;
16995 if (i < 32)
16996 {
16997 /* See if bytes move the same in both lanes. If yes,
16998 vpblendw with immediate can be used. */
16999 for (i = 0; i < 16; i += 2)
17000 if (d->perm[i] + 16 != d->perm[i + 16])
17001 goto use_pblendvb;
17002
17003 /* Use vpblendw. */
17004 for (i = 0; i < 16; ++i)
17005 mask |= (d->perm[i * 2] >= 32) << i;
17006 vmode = V16HImode;
17007 goto do_subreg;
17008 }
17009
17010 /* Use vpblendd. */
17011 for (i = 0; i < 8; ++i)
17012 mask |= (d->perm[i * 4] >= 32) << i;
17013 vmode = V8SImode;
17014 goto do_subreg;
17015
17016 case E_V16HImode:
17017 /* See if words move in pairs. If yes, vpblendd can be used. */
17018 for (i = 0; i < 16; i += 2)
17019 if (d->perm[i] + 1 != d->perm[i + 1])
17020 break;
17021 if (i < 16)
17022 {
17023 /* See if words move the same in both lanes. If not,
17024 vpblendvb must be used. */
17025 for (i = 0; i < 8; i++)
17026 if (d->perm[i] + 8 != d->perm[i + 8])
17027 {
17028 /* Use vpblendvb. */
17029 for (i = 0; i < 32; ++i)
17030 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
17031
17032 vmode = V32QImode;
17033 nelt = 32;
17034 target = gen_reg_rtx (vmode);
17035 op0 = gen_lowpart (vmode, op0);
17036 op1 = gen_lowpart (vmode, op1);
17037 goto finish_pblendvb;
17038 }
17039
17040 /* Use vpblendw. */
17041 for (i = 0; i < 16; ++i)
17042 mask |= (d->perm[i] >= 16) << i;
17043 break;
17044 }
17045
17046 /* Use vpblendd. */
17047 for (i = 0; i < 8; ++i)
17048 mask |= (d->perm[i * 2] >= 16) << i;
17049 vmode = V8SImode;
17050 goto do_subreg;
17051
17052 case E_V4DImode:
17053 /* Use vpblendd. */
17054 for (i = 0; i < 4; ++i)
17055 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17056 vmode = V8SImode;
17057 goto do_subreg;
17058
17059 default:
17060 gcc_unreachable ();
17061 }
17062
17063 switch (vmode)
17064 {
17065 case E_V8DFmode:
17066 case E_V8DImode:
17067 mmode = QImode;
17068 break;
17069 case E_V16SFmode:
17070 case E_V16SImode:
17071 mmode = HImode;
17072 break;
17073 case E_V32HImode:
17074 mmode = SImode;
17075 break;
17076 case E_V64QImode:
17077 mmode = DImode;
17078 break;
17079 default:
17080 mmode = VOIDmode;
17081 }
17082
17083 if (mmode != VOIDmode)
17084 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
17085 else
17086 maskop = GEN_INT (mask);
17087
17088 /* This matches five different patterns with the different modes. */
17089 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
17090 x = gen_rtx_SET (target, x);
17091 emit_insn (x);
17092 if (target != d->target)
17093 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17094
17095 return true;
17096}
17097
17098/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17099 in terms of the variable form of vpermilps.
17100
17101 Note that we will have already failed the immediate input vpermilps,
17102 which requires that the high and low part shuffle be identical; the
17103 variable form doesn't require that. */
17104
17105static bool
17106expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
17107{
17108 rtx rperm[8], vperm;
17109 unsigned i;
17110
17111 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
17112 return false;
17113
17114 /* We can only permute within the 128-bit lane. */
17115 for (i = 0; i < 8; ++i)
17116 {
17117 unsigned e = d->perm[i];
17118 if (i < 4 ? e >= 4 : e < 4)
17119 return false;
17120 }
17121
17122 if (d->testing_p)
17123 return true;
17124
17125 for (i = 0; i < 8; ++i)
17126 {
17127 unsigned e = d->perm[i];
17128
17129 /* Within each 128-bit lane, the elements of op0 are numbered
17130 from 0 and the elements of op1 are numbered from 4. */
17131 if (e >= 8 + 4)
17132 e -= 8;
17133 else if (e >= 4)
17134 e -= 4;
17135
17136 rperm[i] = GEN_INT (e);
17137 }
17138
17139 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
17140 vperm = force_reg (V8SImode, vperm);
17141 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
17142
17143 return true;
17144}
17145
17146/* Return true if permutation D can be performed as a VMODE permutation
17147 instead. */
17148
17149static bool
17150valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
17151{
17152 unsigned int i, j, chunk;
17153
17154 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
17155 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
17156 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
17157 return false;
17158
17159 if (GET_MODE_NUNITS (vmode) >= d->nelt)
17160 return true;
17161
17162 chunk = d->nelt / GET_MODE_NUNITS (vmode);
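  /* E.g. when checking whether a V16QImode permutation is valid as a
     V4SImode one, chunk == 4 and each group of four byte indices must be
     { 4k, 4k + 1, 4k + 2, 4k + 3 } for some k.  */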
17163 for (i = 0; i < d->nelt; i += chunk)
17164 if (d->perm[i] & (chunk - 1))
17165 return false;
17166 else
17167 for (j = 1; j < chunk; ++j)
17168 if (d->perm[i] + j != d->perm[i + j])
17169 return false;
17170
17171 return true;
17172}
17173
17174/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17175 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17176
17177static bool
17178expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
17179{
17180 unsigned i, nelt, eltsz, mask;
17181 unsigned char perm[64];
17182 machine_mode vmode = V16QImode;
17183 rtx rperm[64], vperm, target, op0, op1;
17184
17185 nelt = d->nelt;
17186
17187 if (!d->one_operand_p)
17188 {
17189 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
17190 {
17191 if (TARGET_AVX2
17192 && valid_perm_using_mode_p (V2TImode, d))
17193 {
17194 if (d->testing_p)
17195 return true;
17196
17197 /* Use vperm2i128 insn. The pattern uses
17198 V4DImode instead of V2TImode. */
17199 target = d->target;
17200 if (d->vmode != V4DImode)
17201 target = gen_reg_rtx (V4DImode);
17202 op0 = gen_lowpart (V4DImode, d->op0);
17203 op1 = gen_lowpart (V4DImode, d->op1);
17204 rperm[0]
17205 = GEN_INT ((d->perm[0] / (nelt / 2))
17206 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
17207 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
17208 if (target != d->target)
17209 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17210 return true;
17211 }
17212 return false;
17213 }
17214 }
17215 else
17216 {
17217 if (GET_MODE_SIZE (d->vmode) == 16)
17218 {
17219 if (!TARGET_SSSE3)
17220 return false;
17221 }
17222 else if (GET_MODE_SIZE (d->vmode) == 32)
17223 {
17224 if (!TARGET_AVX2)
17225 return false;
17226
17227 /* V4DImode should be already handled through
17228 expand_vselect by vpermq instruction. */
17229 gcc_assert (d->vmode != V4DImode);
17230
17231 vmode = V32QImode;
17232 if (d->vmode == V8SImode
17233 || d->vmode == V16HImode
17234 || d->vmode == V32QImode)
17235 {
17236 /* First see if vpermq can be used for
17237 V8SImode/V16HImode/V32QImode. */
17238 if (valid_perm_using_mode_p (V4DImode, d))
17239 {
17240 for (i = 0; i < 4; i++)
17241 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
17242 if (d->testing_p)
17243 return true;
17244 target = gen_reg_rtx (V4DImode);
17245 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
17246 perm, 4, false))
17247 {
17248 emit_move_insn (d->target,
17249 gen_lowpart (d->vmode, target));
17250 return true;
17251 }
17252 return false;
17253 }
17254
17255 /* Next see if vpermd can be used. */
17256 if (valid_perm_using_mode_p (V8SImode, d))
17257 vmode = V8SImode;
17258 }
17259 /* Or if vpermps can be used. */
17260 else if (d->vmode == V8SFmode)
17261 vmode = V8SImode;
17262
17263 if (vmode == V32QImode)
17264 {
17265		  /* vpshufb only works within lanes; it is not
17266		     possible to shuffle bytes between the lanes.  */
17267 for (i = 0; i < nelt; ++i)
17268 if ((d->perm[i] ^ i) & (nelt / 2))
17269 return false;
17270 }
17271 }
17272 else if (GET_MODE_SIZE (d->vmode) == 64)
17273 {
17274 if (!TARGET_AVX512BW)
17275 return false;
17276
17277 /* If vpermq didn't work, vpshufb won't work either. */
17278 if (d->vmode == V8DFmode || d->vmode == V8DImode)
17279 return false;
17280
17281 vmode = V64QImode;
17282 if (d->vmode == V16SImode
17283 || d->vmode == V32HImode
17284 || d->vmode == V64QImode)
17285 {
17286 /* First see if vpermq can be used for
17287 V16SImode/V32HImode/V64QImode. */
17288 if (valid_perm_using_mode_p (V8DImode, d))
17289 {
17290 for (i = 0; i < 8; i++)
17291 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
17292 if (d->testing_p)
17293 return true;
17294 target = gen_reg_rtx (V8DImode);
17295 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
17296 perm, 8, false))
17297 {
17298 emit_move_insn (d->target,
17299 gen_lowpart (d->vmode, target));
17300 return true;
17301 }
17302 return false;
17303 }
17304
17305 /* Next see if vpermd can be used. */
17306 if (valid_perm_using_mode_p (V16SImode, d))
17307 vmode = V16SImode;
17308 }
17309 /* Or if vpermps can be used. */
17310 else if (d->vmode == V16SFmode)
17311 vmode = V16SImode;
17312 if (vmode == V64QImode)
17313 {
17314		  /* vpshufb only works within lanes; it is not
17315		     possible to shuffle bytes between the lanes.  */
17316 for (i = 0; i < nelt; ++i)
17317	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
17318 return false;
17319 }
17320 }
17321 else
17322 return false;
17323 }
17324
17325 if (d->testing_p)
17326 return true;
17327
17328 if (vmode == V8SImode)
17329 for (i = 0; i < 8; ++i)
17330 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
17331 else if (vmode == V16SImode)
17332 for (i = 0; i < 16; ++i)
17333 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
17334 else
17335 {
17336 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
17337 if (!d->one_operand_p)
17338 mask = 2 * nelt - 1;
17339 else if (vmode == V16QImode)
17340 mask = nelt - 1;
17341 else if (vmode == V64QImode)
17342 mask = nelt / 4 - 1;
17343 else
17344 mask = nelt / 2 - 1;
17345
17346 for (i = 0; i < nelt; ++i)
17347 {
17348 unsigned j, e = d->perm[i] & mask;
17349 for (j = 0; j < eltsz; ++j)
17350 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
17351 }
17352 }
17353
17354 vperm = gen_rtx_CONST_VECTOR (vmode,
17355 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
17356 vperm = force_reg (vmode, vperm);
17357
17358 target = d->target;
17359 if (d->vmode != vmode)
17360 target = gen_reg_rtx (vmode);
17361 op0 = gen_lowpart (vmode, d->op0);
17362 if (d->one_operand_p)
17363 {
17364 if (vmode == V16QImode)
17365 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
17366 else if (vmode == V32QImode)
17367 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
17368 else if (vmode == V64QImode)
17369 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
17370 else if (vmode == V8SFmode)
17371 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
17372 else if (vmode == V8SImode)
17373 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
17374 else if (vmode == V16SFmode)
17375 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
17376 else if (vmode == V16SImode)
17377 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
17378 else
17379 gcc_unreachable ();
17380 }
17381 else
17382 {
17383 op1 = gen_lowpart (vmode, d->op1);
17384 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
17385 }
17386 if (target != d->target)
17387 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17388
17389 return true;
17390}
17391
17392/* For V*[QHS]Imode permutations, check whether the same permutation
17393   can be performed in a 2x, 4x or 8x wider inner mode instead.  */
17394
17395static bool
17396canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
17397 struct expand_vec_perm_d *nd)
17398{
17399 int i;
17400 machine_mode mode = VOIDmode;
17401
17402 switch (d->vmode)
17403 {
17404 case E_V16QImode: mode = V8HImode; break;
17405 case E_V32QImode: mode = V16HImode; break;
17406 case E_V64QImode: mode = V32HImode; break;
17407 case E_V8HImode: mode = V4SImode; break;
17408 case E_V16HImode: mode = V8SImode; break;
17409 case E_V32HImode: mode = V16SImode; break;
17410 case E_V4SImode: mode = V2DImode; break;
17411 case E_V8SImode: mode = V4DImode; break;
17412 case E_V16SImode: mode = V8DImode; break;
17413 default: return false;
17414 }
17415 for (i = 0; i < d->nelt; i += 2)
17416 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
17417 return false;
17418 nd->vmode = mode;
17419 nd->nelt = d->nelt / 2;
17420 for (i = 0; i < nd->nelt; i++)
17421 nd->perm[i] = d->perm[2 * i] / 2;
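  /* E.g. a V8HImode permutation { 2, 3, 6, 7, 0, 1, 4, 5 } becomes the
     V4SImode permutation { 1, 3, 0, 2 } at this point.  */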
17422 if (GET_MODE_INNER (mode) != DImode)
17423 canonicalize_vector_int_perm (nd, nd);
17424 if (nd != d)
17425 {
17426 nd->one_operand_p = d->one_operand_p;
17427 nd->testing_p = d->testing_p;
17428 if (d->op0 == d->op1)
17429 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
17430 else
17431 {
17432 nd->op0 = gen_lowpart (nd->vmode, d->op0);
17433 nd->op1 = gen_lowpart (nd->vmode, d->op1);
17434 }
17435 if (d->testing_p)
17436 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
17437 else
17438 nd->target = gen_reg_rtx (nd->vmode);
17439 }
17440 return true;
17441}
17442
17443/* Try to expand one-operand permutation with constant mask. */
17444
17445static bool
17446ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17447{
17448 machine_mode mode = GET_MODE (d->op0);
17449 machine_mode maskmode = mode;
17450 rtx (*gen) (rtx, rtx, rtx) = NULL;
17451 rtx target, op0, mask;
17452 rtx vec[64];
17453
17454 if (!rtx_equal_p (d->op0, d->op1))
17455 return false;
17456
17457 if (!TARGET_AVX512F)
17458 return false;
17459
17460 switch (mode)
17461 {
17462 case E_V16SImode:
17463 gen = gen_avx512f_permvarv16si;
17464 break;
17465 case E_V16SFmode:
17466 gen = gen_avx512f_permvarv16sf;
17467 maskmode = V16SImode;
17468 break;
17469 case E_V8DImode:
17470 gen = gen_avx512f_permvarv8di;
17471 break;
17472 case E_V8DFmode:
17473 gen = gen_avx512f_permvarv8df;
17474 maskmode = V8DImode;
17475 break;
17476 default:
17477 return false;
17478 }
17479
17480 target = d->target;
17481 op0 = d->op0;
17482 for (int i = 0; i < d->nelt; ++i)
17483 vec[i] = GEN_INT (d->perm[i]);
17484 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17485 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17486 return true;
17487}
17488
17489static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17490
17491/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
17492 in a single instruction. */
17493
17494static bool
17495expand_vec_perm_1 (struct expand_vec_perm_d *d)
17496{
17497 unsigned i, nelt = d->nelt;
17498 struct expand_vec_perm_d nd;
17499
17500 /* Check plain VEC_SELECT first, because AVX has instructions that could
17501 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17502 input where SEL+CONCAT may not. */
17503 if (d->one_operand_p)
17504 {
17505 int mask = nelt - 1;
17506 bool identity_perm = true;
17507 bool broadcast_perm = true;
17508
17509 for (i = 0; i < nelt; i++)
17510 {
17511 nd.perm[i] = d->perm[i] & mask;
17512 if (nd.perm[i] != i)
17513 identity_perm = false;
17514 if (nd.perm[i])
17515 broadcast_perm = false;
17516 }
17517
17518 if (identity_perm)
17519 {
17520 if (!d->testing_p)
17521 emit_move_insn (d->target, d->op0);
17522 return true;
17523 }
17524 else if (broadcast_perm && TARGET_AVX2)
17525 {
17526 /* Use vpbroadcast{b,w,d}. */
17527 rtx (*gen) (rtx, rtx) = NULL;
17528 switch (d->vmode)
17529 {
17530 case E_V64QImode:
17531 if (TARGET_AVX512BW)
17532 gen = gen_avx512bw_vec_dupv64qi_1;
17533 break;
17534 case E_V32QImode:
17535 gen = gen_avx2_pbroadcastv32qi_1;
17536 break;
17537 case E_V32HImode:
17538 if (TARGET_AVX512BW)
17539 gen = gen_avx512bw_vec_dupv32hi_1;
17540 break;
17541 case E_V16HImode:
17542 gen = gen_avx2_pbroadcastv16hi_1;
17543 break;
17544 case E_V16SImode:
17545 if (TARGET_AVX512F)
17546 gen = gen_avx512f_vec_dupv16si_1;
17547 break;
17548 case E_V8SImode:
17549 gen = gen_avx2_pbroadcastv8si_1;
17550 break;
17551 case E_V16QImode:
17552 gen = gen_avx2_pbroadcastv16qi;
17553 break;
17554 case E_V8HImode:
17555 gen = gen_avx2_pbroadcastv8hi;
17556 break;
17557 case E_V16SFmode:
17558 if (TARGET_AVX512F)
17559 gen = gen_avx512f_vec_dupv16sf_1;
17560 break;
17561 case E_V8SFmode:
17562 gen = gen_avx2_vec_dupv8sf_1;
17563 break;
17564 case E_V8DFmode:
17565 if (TARGET_AVX512F)
17566 gen = gen_avx512f_vec_dupv8df_1;
17567 break;
17568 case E_V8DImode:
17569 if (TARGET_AVX512F)
17570 gen = gen_avx512f_vec_dupv8di_1;
17571 break;
17572 /* For other modes prefer other shuffles this function creates. */
17573 default: break;
17574 }
17575 if (gen != NULL)
17576 {
17577 if (!d->testing_p)
17578 emit_insn (gen (d->target, d->op0));
17579 return true;
17580 }
17581 }
17582
17583 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17584 return true;
17585
17586 /* There are plenty of patterns in sse.md that are written for
17587 SEL+CONCAT and are not replicated for a single op. Perhaps
17588 that should be changed, to avoid the nastiness here. */
17589
17590 /* Recognize interleave style patterns, which means incrementing
17591 every other permutation operand. */
17592 for (i = 0; i < nelt; i += 2)
17593 {
17594 nd.perm[i] = d->perm[i] & mask;
17595 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17596 }
17597 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17598 d->testing_p))
17599 return true;
17600
17601 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17602 if (nelt >= 4)
17603 {
17604 for (i = 0; i < nelt; i += 4)
17605 {
17606 nd.perm[i + 0] = d->perm[i + 0] & mask;
17607 nd.perm[i + 1] = d->perm[i + 1] & mask;
17608 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17609 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17610 }
17611
17612 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17613 d->testing_p))
17614 return true;
17615 }
17616 }
17617
17618 /* Try movss/movsd instructions. */
17619 if (expand_vec_perm_movs (d))
17620 return true;
17621
17622 /* Finally, try the fully general two operand permute. */
17623 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17624 d->testing_p))
17625 return true;
17626
17627 /* Recognize interleave style patterns with reversed operands. */
17628 if (!d->one_operand_p)
17629 {
17630 for (i = 0; i < nelt; ++i)
17631 {
17632 unsigned e = d->perm[i];
17633 if (e >= nelt)
17634 e -= nelt;
17635 else
17636 e += nelt;
17637 nd.perm[i] = e;
17638 }
17639
17640 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17641 d->testing_p))
17642 return true;
17643 }
17644
17645 /* Try the SSE4.1 blend variable merge instructions. */
17646 if (expand_vec_perm_blend (d))
17647 return true;
17648
17649 /* Try one of the AVX vpermil variable permutations. */
17650 if (expand_vec_perm_vpermil (d))
17651 return true;
17652
17653 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17654 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17655 if (expand_vec_perm_pshufb (d))
17656 return true;
17657
17658 /* Try the AVX2 vpalignr instruction. */
17659 if (expand_vec_perm_palignr (d, true))
17660 return true;
17661
17662 /* Try the AVX512F vperm{s,d} instructions. */
17663 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17664 return true;
17665
17666 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17667 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17668 return true;
17669
17670 /* See if we can get the same permutation in different vector integer
17671 mode. */
17672 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17673 {
17674 if (!d->testing_p)
17675 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17676 return true;
17677 }
17678 return false;
17679}
17680
17681/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
17682 in terms of a pair of pshuflw + pshufhw instructions. */
17683
17684static bool
17685expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17686{
17687 unsigned char perm2[MAX_VECT_LEN];
17688 unsigned i;
17689 bool ok;
17690
17691 if (d->vmode != V8HImode || !d->one_operand_p)
17692 return false;
17693
17694 /* The two permutations only operate in 64-bit lanes. */
17695 for (i = 0; i < 4; ++i)
17696 if (d->perm[i] >= 4)
17697 return false;
17698 for (i = 4; i < 8; ++i)
17699 if (d->perm[i] < 4)
17700 return false;
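  /* E.g. { 2, 0, 3, 1, 7, 5, 6, 4 } qualifies: the low four indices stay
     below 4 and the high four stay at or above 4.  */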
17701
17702 if (d->testing_p)
17703 return true;
17704
17705 /* Emit the pshuflw. */
17706 memcpy (perm2, d->perm, 4);
17707 for (i = 4; i < 8; ++i)
17708 perm2[i] = i;
17709 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17710 gcc_assert (ok);
17711
17712 /* Emit the pshufhw. */
17713 memcpy (perm2 + 4, d->perm + 4, 4);
17714 for (i = 0; i < 4; ++i)
17715 perm2[i] = i;
17716 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17717 gcc_assert (ok);
17718
17719 return true;
17720}
17721
17722/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17723 the permutation using the SSSE3 palignr instruction. This succeeds
17724 when all of the elements in PERM fit within one vector and we merely
17725 need to shift them down so that a single vector permutation has a
17726 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17727 the vpalignr instruction itself can perform the requested permutation. */
17728
17729static bool
17730expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17731{
17732 unsigned i, nelt = d->nelt;
17733 unsigned min, max, minswap, maxswap;
17734 bool in_order, ok, swap = false;
17735 rtx shift, target;
17736 struct expand_vec_perm_d dcopy;
17737
17738  /* Even with AVX, palignr only operates on 128-bit vectors;
17739     in AVX2, palignr operates on both 128-bit lanes.  */
17740 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17741 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17742 return false;
17743
17744 min = 2 * nelt;
17745 max = 0;
17746 minswap = 2 * nelt;
17747 maxswap = 0;
17748 for (i = 0; i < nelt; ++i)
17749 {
17750 unsigned e = d->perm[i];
17751 unsigned eswap = d->perm[i] ^ nelt;
17752 if (GET_MODE_SIZE (d->vmode) == 32)
17753 {
17754 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17755 eswap = e ^ (nelt / 2);
17756 }
17757 if (e < min)
17758 min = e;
17759 if (e > max)
17760 max = e;
17761 if (eswap < minswap)
17762 minswap = eswap;
17763 if (eswap > maxswap)
17764 maxswap = eswap;
17765 }
17766 if (min == 0
17767 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17768 {
17769 if (d->one_operand_p
17770 || minswap == 0
17771 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17772 ? nelt / 2 : nelt))
17773 return false;
17774 swap = true;
17775 min = minswap;
17776 max = maxswap;
17777 }
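  /* At this point all selected elements fall within one vector-sized
     window starting at MIN; e.g. a V16QImode permutation { 5, 6, ..., 20 }
     has min == 5, and the palignr below shifts by 5 bytes so that the
     remaining single-operand permutation is the identity.  */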
17778
17779 /* Given that we have SSSE3, we know we'll be able to implement the
17780 single operand permutation after the palignr with pshufb for
17781 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17782 first. */
17783 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17784 return true;
17785
17786 dcopy = *d;
17787 if (swap)
17788 {
17789 dcopy.op0 = d->op1;
17790 dcopy.op1 = d->op0;
17791 for (i = 0; i < nelt; ++i)
17792 dcopy.perm[i] ^= nelt;
17793 }
17794
17795 in_order = true;
17796 for (i = 0; i < nelt; ++i)
17797 {
17798 unsigned e = dcopy.perm[i];
17799 if (GET_MODE_SIZE (d->vmode) == 32
17800 && e >= nelt
17801 && (e & (nelt / 2 - 1)) < min)
17802 e = e - min - (nelt / 2);
17803 else
17804 e = e - min;
17805 if (e != i)
17806 in_order = false;
17807 dcopy.perm[i] = e;
17808 }
17809 dcopy.one_operand_p = true;
17810
17811 if (single_insn_only_p && !in_order)
17812 return false;
17813
17814 /* For AVX2, test whether we can permute the result in one instruction. */
17815 if (d->testing_p)
17816 {
17817 if (in_order)
17818 return true;
17819 dcopy.op1 = dcopy.op0;
17820 return expand_vec_perm_1 (&dcopy);
17821 }
17822
17823 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17824 if (GET_MODE_SIZE (d->vmode) == 16)
17825 {
17826 target = gen_reg_rtx (TImode);
17827 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17828 gen_lowpart (TImode, dcopy.op0), shift));
17829 }
17830 else
17831 {
17832 target = gen_reg_rtx (V2TImode);
17833 emit_insn (gen_avx2_palignrv2ti (target,
17834 gen_lowpart (V2TImode, dcopy.op1),
17835 gen_lowpart (V2TImode, dcopy.op0),
17836 shift));
17837 }
17838
17839 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17840
17841 /* Test for the degenerate case where the alignment by itself
17842 produces the desired permutation. */
17843 if (in_order)
17844 {
17845 emit_move_insn (d->target, dcopy.op0);
17846 return true;
17847 }
17848
17849 ok = expand_vec_perm_1 (&dcopy);
17850 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17851
17852 return ok;
17853}
17854
17855/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17856 the permutation using the SSE4_1 pblendv instruction. Potentially
17857   reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and pblendv.  */
17858
17859static bool
17860expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17861{
17862 unsigned i, which, nelt = d->nelt;
17863 struct expand_vec_perm_d dcopy, dcopy1;
17864 machine_mode vmode = d->vmode;
17865 bool ok;
17866
17867 /* Use the same checks as in expand_vec_perm_blend. */
17868 if (d->one_operand_p)
17869 return false;
17870 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17871 ;
17872 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17873 ;
17874 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17875 ;
17876 else
17877 return false;
17878
17879  /* Figure out which permutation elements do not stay in their
17880     respective lanes.  */
17881 for (i = 0, which = 0; i < nelt; ++i)
17882 {
17883 unsigned e = d->perm[i];
17884 if (e != i)
17885 which |= (e < nelt ? 1 : 2);
17886 }
17887  /* We can pblend the part where elements do not stay in their
17888     respective lanes only when these elements all come from one
17889     half of the permutation.
17890     {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not at their respective
17891     lanes, but both 8 and 9 >= 8;
17892     {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not at their
17893     respective lanes, and 8 >= 8 but 2 is not.  */
17894 if (which != 1 && which != 2)
17895 return false;
17896 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17897 return true;
17898
17899 /* First we apply one operand permutation to the part where
17900 elements stay not in their respective lanes. */
17901 dcopy = *d;
17902 if (which == 2)
17903 dcopy.op0 = dcopy.op1 = d->op1;
17904 else
17905 dcopy.op0 = dcopy.op1 = d->op0;
17906 if (!d->testing_p)
17907 dcopy.target = gen_reg_rtx (vmode);
17908 dcopy.one_operand_p = true;
17909
17910 for (i = 0; i < nelt; ++i)
17911 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17912
17913 ok = expand_vec_perm_1 (&dcopy);
17914 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17915 return false;
17916 else
17917 gcc_assert (ok);
17918 if (d->testing_p)
17919 return true;
17920
17921 /* Next we put permuted elements into their positions. */
17922 dcopy1 = *d;
17923 if (which == 2)
17924 dcopy1.op1 = dcopy.target;
17925 else
17926 dcopy1.op0 = dcopy.target;
17927
17928 for (i = 0; i < nelt; ++i)
17929 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17930
17931 ok = expand_vec_perm_blend (&dcopy1);
17932 gcc_assert (ok);
17933
17934 return true;
17935}
17936
17937static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17938
17939/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
17940 a two vector permutation into a single vector permutation by using
17941 an interleave operation to merge the vectors. */
17942
17943static bool
17944expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17945{
17946 struct expand_vec_perm_d dremap, dfinal;
17947 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17948 unsigned HOST_WIDE_INT contents;
17949 unsigned char remap[2 * MAX_VECT_LEN];
17950 rtx_insn *seq;
17951 bool ok, same_halves = false;
17952
17953 if (GET_MODE_SIZE (d->vmode) == 16)
17954 {
17955 if (d->one_operand_p)
17956 return false;
17957 }
17958 else if (GET_MODE_SIZE (d->vmode) == 32)
17959 {
17960 if (!TARGET_AVX)
17961 return false;
17962 /* For 32-byte modes allow even d->one_operand_p.
17963 The lack of cross-lane shuffling in some instructions
17964 might prevent a single insn shuffle. */
17965 dfinal = *d;
17966 dfinal.testing_p = true;
17967      /* If expand_vec_perm_interleave3 can expand this into
17968	 a 3 insn sequence, give up and let it be expanded as
17969	 a 3 insn sequence.  While that is one insn longer,
17970	 it doesn't need a memory operand, and in the common
17971	 case where both the interleave low and interleave high
17972	 permutations with the same operands are adjacent, it
17973	 needs only 4 insns for both after CSE.  */
17974 if (expand_vec_perm_interleave3 (&dfinal))
17975 return false;
17976 }
17977 else
17978 return false;
17979
17980 /* Examine from whence the elements come. */
17981 contents = 0;
17982 for (i = 0; i < nelt; ++i)
17983 contents |= HOST_WIDE_INT_1U << d->perm[i];
17984
17985 memset (remap, 0xff, sizeof (remap));
17986 dremap = *d;
17987
17988 if (GET_MODE_SIZE (d->vmode) == 16)
17989 {
17990 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17991
17992 /* Split the two input vectors into 4 halves. */
17993 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17994 h2 = h1 << nelt2;
17995 h3 = h2 << nelt2;
17996 h4 = h3 << nelt2;
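      /* For V8HImode (nelt == 8) these are 0x000f, 0x00f0, 0x0f00 and
	 0xf000: the low/high halves of op0 and of op1 respectively.  */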
17997
17998      /* If the elements all come from the low halves, use interleave low;
17999	 similarly for interleave high.  If the elements are from mis-matched
18000	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
18001 if ((contents & (h1 | h3)) == contents)
18002 {
18003 /* punpckl* */
18004 for (i = 0; i < nelt2; ++i)
18005 {
18006 remap[i] = i * 2;
18007 remap[i + nelt] = i * 2 + 1;
18008 dremap.perm[i * 2] = i;
18009 dremap.perm[i * 2 + 1] = i + nelt;
18010 }
18011 if (!TARGET_SSE2 && d->vmode == V4SImode)
18012 dremap.vmode = V4SFmode;
18013 }
18014 else if ((contents & (h2 | h4)) == contents)
18015 {
18016 /* punpckh* */
18017 for (i = 0; i < nelt2; ++i)
18018 {
18019 remap[i + nelt2] = i * 2;
18020 remap[i + nelt + nelt2] = i * 2 + 1;
18021 dremap.perm[i * 2] = i + nelt2;
18022 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18023 }
18024 if (!TARGET_SSE2 && d->vmode == V4SImode)
18025 dremap.vmode = V4SFmode;
18026 }
18027 else if ((contents & (h1 | h4)) == contents)
18028 {
18029 /* shufps */
18030 for (i = 0; i < nelt2; ++i)
18031 {
18032 remap[i] = i;
18033 remap[i + nelt + nelt2] = i + nelt2;
18034 dremap.perm[i] = i;
18035 dremap.perm[i + nelt2] = i + nelt + nelt2;
18036 }
18037 if (nelt != 4)
18038 {
18039 /* shufpd */
18040 dremap.vmode = V2DImode;
18041 dremap.nelt = 2;
18042 dremap.perm[0] = 0;
18043 dremap.perm[1] = 3;
18044 }
18045 }
18046 else if ((contents & (h2 | h3)) == contents)
18047 {
18048 /* shufps */
18049 for (i = 0; i < nelt2; ++i)
18050 {
18051 remap[i + nelt2] = i;
18052 remap[i + nelt] = i + nelt2;
18053 dremap.perm[i] = i + nelt2;
18054 dremap.perm[i + nelt2] = i + nelt;
18055 }
18056 if (nelt != 4)
18057 {
18058 /* shufpd */
18059 dremap.vmode = V2DImode;
18060 dremap.nelt = 2;
18061 dremap.perm[0] = 1;
18062 dremap.perm[1] = 2;
18063 }
18064 }
18065 else
18066 return false;
18067 }
18068 else
18069 {
18070 unsigned int nelt4 = nelt / 4, nzcnt = 0;
18071 unsigned HOST_WIDE_INT q[8];
18072 unsigned int nonzero_halves[4];
18073
18074 /* Split the two input vectors into 8 quarters. */
18075 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
18076 for (i = 1; i < 8; ++i)
18077 q[i] = q[0] << (nelt4 * i);
18078 for (i = 0; i < 4; ++i)
18079 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
18080 {
18081 nonzero_halves[nzcnt] = i;
18082 ++nzcnt;
18083 }
18084
18085 if (nzcnt == 1)
18086 {
18087 gcc_assert (d->one_operand_p);
18088 nonzero_halves[1] = nonzero_halves[0];
18089 same_halves = true;
18090 }
18091 else if (d->one_operand_p)
18092 {
18093 gcc_assert (nonzero_halves[0] == 0);
18094 gcc_assert (nonzero_halves[1] == 1);
18095 }
18096
18097 if (nzcnt <= 2)
18098 {
18099 if (d->perm[0] / nelt2 == nonzero_halves[1])
18100 {
18101 /* Attempt to increase the likelihood that dfinal
18102 shuffle will be intra-lane. */
18103 std::swap (nonzero_halves[0], nonzero_halves[1]);
18104 }
18105
18106 /* vperm2f128 or vperm2i128. */
18107 for (i = 0; i < nelt2; ++i)
18108 {
18109 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
18110 remap[i + nonzero_halves[0] * nelt2] = i;
18111 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
18112 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
18113 }
18114
18115 if (d->vmode != V8SFmode
18116 && d->vmode != V4DFmode
18117 && d->vmode != V8SImode)
18118 {
18119 dremap.vmode = V8SImode;
18120 dremap.nelt = 8;
18121 for (i = 0; i < 4; ++i)
18122 {
18123 dremap.perm[i] = i + nonzero_halves[0] * 4;
18124 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
18125 }
18126 }
18127 }
18128 else if (d->one_operand_p)
18129 return false;
18130 else if (TARGET_AVX2
18131 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
18132 {
18133 /* vpunpckl* */
18134 for (i = 0; i < nelt4; ++i)
18135 {
18136 remap[i] = i * 2;
18137 remap[i + nelt] = i * 2 + 1;
18138 remap[i + nelt2] = i * 2 + nelt2;
18139 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
18140 dremap.perm[i * 2] = i;
18141 dremap.perm[i * 2 + 1] = i + nelt;
18142 dremap.perm[i * 2 + nelt2] = i + nelt2;
18143 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
18144 }
18145 }
18146 else if (TARGET_AVX2
18147 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
18148 {
18149 /* vpunpckh* */
18150 for (i = 0; i < nelt4; ++i)
18151 {
18152 remap[i + nelt4] = i * 2;
18153 remap[i + nelt + nelt4] = i * 2 + 1;
18154 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
18155 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
18156 dremap.perm[i * 2] = i + nelt4;
18157 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
18158 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
18159 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
18160 }
18161 }
18162 else
18163 return false;
18164 }
18165
18166 /* Use the remapping array set up above to move the elements from their
18167 swizzled locations into their final destinations. */
18168 dfinal = *d;
18169 for (i = 0; i < nelt; ++i)
18170 {
18171 unsigned e = remap[d->perm[i]];
18172 gcc_assert (e < nelt);
18173 /* If same_halves is true, both halves of the remapped vector are the
18174 same. Avoid cross-lane accesses if possible. */
18175 if (same_halves && i >= nelt2)
18176 {
18177 gcc_assert (e < nelt2);
18178 dfinal.perm[i] = e + nelt2;
18179 }
18180 else
18181 dfinal.perm[i] = e;
18182 }
18183 if (!d->testing_p)
18184 {
18185 dremap.target = gen_reg_rtx (dremap.vmode);
18186 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18187 }
18188 dfinal.op1 = dfinal.op0;
18189 dfinal.one_operand_p = true;
18190
18191 /* Test if the final remap can be done with a single insn. For V4SFmode or
18192 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18193 start_sequence ();
18194 ok = expand_vec_perm_1 (&dfinal);
18195 seq = get_insns ();
18196 end_sequence ();
18197
18198 if (!ok)
18199 return false;
18200
18201 if (d->testing_p)
18202 return true;
18203
18204 if (dremap.vmode != dfinal.vmode)
18205 {
18206 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
18207 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
18208 }
18209
18210 ok = expand_vec_perm_1 (&dremap);
18211 gcc_assert (ok);
18212
18213 emit_insn (seq);
18214 return true;
18215}
18216
18217/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
18218 a single vector cross-lane permutation into vpermq followed
18219 by any of the single insn permutations. */
18220
18221static bool
18222expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
18223{
18224 struct expand_vec_perm_d dremap, dfinal;
18225 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
18226 unsigned contents[2];
18227 bool ok;
18228
18229 if (!(TARGET_AVX2
18230 && (d->vmode == V32QImode || d->vmode == V16HImode)
18231 && d->one_operand_p))
18232 return false;
18233
18234 contents[0] = 0;
18235 contents[1] = 0;
18236 for (i = 0; i < nelt2; ++i)
18237 {
18238 contents[0] |= 1u << (d->perm[i] / nelt4);
18239 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
18240 }
18241
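  /* contents[0] and contents[1] record which of the four 64-bit quarters
     of the input feed the low and high halves of the result; the loop
     below rejects the permutation unless each half needs at most two
     quarters, so a single vpermq can gather them.  */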
18242 for (i = 0; i < 2; ++i)
18243 {
18244 unsigned int cnt = 0;
18245 for (j = 0; j < 4; ++j)
18246 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
18247 return false;
18248 }
18249
18250 if (d->testing_p)
18251 return true;
18252
18253 dremap = *d;
18254 dremap.vmode = V4DImode;
18255 dremap.nelt = 4;
18256 dremap.target = gen_reg_rtx (V4DImode);
18257 dremap.op0 = gen_lowpart (V4DImode, d->op0);
18258 dremap.op1 = dremap.op0;
18259 dremap.one_operand_p = true;
18260 for (i = 0; i < 2; ++i)
18261 {
18262 unsigned int cnt = 0;
18263 for (j = 0; j < 4; ++j)
18264 if ((contents[i] & (1u << j)) != 0)
18265 dremap.perm[2 * i + cnt++] = j;
18266 for (; cnt < 2; ++cnt)
18267 dremap.perm[2 * i + cnt] = 0;
18268 }
18269
18270 dfinal = *d;
18271 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18272 dfinal.op1 = dfinal.op0;
18273 dfinal.one_operand_p = true;
18274 for (i = 0, j = 0; i < nelt; ++i)
18275 {
18276 if (i == nelt2)
18277 j = 2;
18278 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
18279 if ((d->perm[i] / nelt4) == dremap.perm[j])
18280 ;
18281 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
18282 dfinal.perm[i] |= nelt4;
18283 else
18284 gcc_unreachable ();
18285 }
18286
18287 ok = expand_vec_perm_1 (&dremap);
18288 gcc_assert (ok);
18289
18290 ok = expand_vec_perm_1 (&dfinal);
18291 gcc_assert (ok);
18292
18293 return true;
18294}
18295
18296static bool canonicalize_perm (struct expand_vec_perm_d *d);
18297
18298/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
18299 a vector permutation using two instructions, vperm2f128 resp.
18300 vperm2i128 followed by any single in-lane permutation. */
18301
18302static bool
18303expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
18304{
18305 struct expand_vec_perm_d dfirst, dsecond;
18306 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
18307 bool ok;
18308
18309 if (!TARGET_AVX
18310 || GET_MODE_SIZE (d->vmode) != 32
18311 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
18312 return false;
18313
18314 dsecond = *d;
18315 dsecond.one_operand_p = false;
18316 dsecond.testing_p = true;
18317
18318 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18319 immediate. For perm < 16 the second permutation uses
18320 d->op0 as first operand, for perm >= 16 it uses d->op1
18321 as first operand. The second operand is the result of
18322 vperm2[fi]128. */
18323 for (perm = 0; perm < 32; perm++)
18324 {
18325 /* Ignore permutations which do not move anything cross-lane. */
18326 if (perm < 16)
18327 {
18328 /* The second shuffle for e.g. V4DFmode has
18329 0123 and ABCD operands.
18330 Ignore AB23, as 23 is already in the second lane
18331 of the first operand. */
18332 if ((perm & 0xc) == (1 << 2)) continue;
18333 /* And 01CD, as 01 is in the first lane of the first
18334 operand. */
18335 if ((perm & 3) == 0) continue;
18336 /* And 4567, as then the vperm2[fi]128 doesn't change
18337 anything on the original 4567 second operand. */
18338 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
18339 }
18340 else
18341 {
18342 /* The second shuffle for e.g. V4DFmode has
18343 4567 and ABCD operands.
18344 Ignore AB67, as 67 is already in the second lane
18345 of the first operand. */
18346 if ((perm & 0xc) == (3 << 2)) continue;
18347 /* And 45CD, as 45 is in the first lane of the first
18348 operand. */
18349 if ((perm & 3) == 2) continue;
18350 /* And 0123, as then the vperm2[fi]128 doesn't change
18351 anything on the original 0123 first operand. */
18352 if ((perm & 0xf) == (1 << 2)) continue;
18353 }
18354
18355 for (i = 0; i < nelt; i++)
18356 {
18357 j = d->perm[i] / nelt2;
18358 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
18359 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
18360 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
18361 dsecond.perm[i] = d->perm[i] & (nelt - 1);
18362 else
18363 break;
18364 }
18365
18366 if (i == nelt)
18367 {
18368 start_sequence ();
18369 ok = expand_vec_perm_1 (&dsecond);
18370 end_sequence ();
18371 }
18372 else
18373 ok = false;
18374
18375 if (ok)
18376 {
18377 if (d->testing_p)
18378 return true;
18379
18380 /* Found a usable second shuffle. dfirst will be
18381 vperm2f128 on d->op0 and d->op1. */
18382 dsecond.testing_p = false;
18383 dfirst = *d;
18384 dfirst.target = gen_reg_rtx (d->vmode);
18385 for (i = 0; i < nelt; i++)
18386 dfirst.perm[i] = (i & (nelt2 - 1))
18387 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
18388
18389 canonicalize_perm (&dfirst);
18390 ok = expand_vec_perm_1 (&dfirst);
18391 gcc_assert (ok);
18392
18393 /* And dsecond is some single insn shuffle, taking
18394 d->op0 and result of vperm2f128 (if perm < 16) or
18395 d->op1 and result of vperm2f128 (otherwise). */
18396 if (perm >= 16)
18397 dsecond.op0 = dsecond.op1;
18398 dsecond.op1 = dfirst.target;
18399
18400 ok = expand_vec_perm_1 (&dsecond);
18401 gcc_assert (ok);
18402
18403 return true;
18404 }
18405
18406 /* For one operand, the only useful vperm2f128 permutation is 0x01
18407 aka lanes swap. */
18408 if (d->one_operand_p)
18409 return false;
18410 }
18411
18412 return false;
18413}
18414
18415/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
18416 a two vector permutation using 2 intra-lane interleave insns
18417 and cross-lane shuffle for 32-byte vectors. */
18418
18419static bool
18420expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
18421{
18422 unsigned i, nelt;
18423 rtx (*gen) (rtx, rtx, rtx);
18424
18425 if (d->one_operand_p)
18426 return false;
18427 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
18428 ;
18429 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
18430 ;
18431 else
18432 return false;
18433
18434 nelt = d->nelt;
18435 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
18436 return false;
18437 for (i = 0; i < nelt; i += 2)
18438 if (d->perm[i] != d->perm[0] + i / 2
18439 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
18440 return false;
18441
18442 if (d->testing_p)
18443 return true;
18444
18445 switch (d->vmode)
18446 {
18447 case E_V32QImode:
18448 if (d->perm[0])
18449 gen = gen_vec_interleave_highv32qi;
18450 else
18451 gen = gen_vec_interleave_lowv32qi;
18452 break;
18453 case E_V16HImode:
18454 if (d->perm[0])
18455 gen = gen_vec_interleave_highv16hi;
18456 else
18457 gen = gen_vec_interleave_lowv16hi;
18458 break;
18459 case E_V8SImode:
18460 if (d->perm[0])
18461 gen = gen_vec_interleave_highv8si;
18462 else
18463 gen = gen_vec_interleave_lowv8si;
18464 break;
18465 case E_V4DImode:
18466 if (d->perm[0])
18467 gen = gen_vec_interleave_highv4di;
18468 else
18469 gen = gen_vec_interleave_lowv4di;
18470 break;
18471 case E_V8SFmode:
18472 if (d->perm[0])
18473 gen = gen_vec_interleave_highv8sf;
18474 else
18475 gen = gen_vec_interleave_lowv8sf;
18476 break;
18477 case E_V4DFmode:
18478 if (d->perm[0])
18479 gen = gen_vec_interleave_highv4df;
18480 else
18481 gen = gen_vec_interleave_lowv4df;
18482 break;
18483 default:
18484 gcc_unreachable ();
18485 }
18486
18487 emit_insn (gen (d->target, d->op0, d->op1));
18488 return true;
18489}
18490
18491/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
18492 a single vector permutation using a single intra-lane vector
18493 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18494 the non-swapped and swapped vectors together. */
18495
18496static bool
18497expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18498{
18499 struct expand_vec_perm_d dfirst, dsecond;
18500 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18501 rtx_insn *seq;
18502 bool ok;
18503 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18504
18505 if (!TARGET_AVX
18506 || TARGET_AVX2
18507 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18508 || !d->one_operand_p)
18509 return false;
18510
18511 dfirst = *d;
18512 for (i = 0; i < nelt; i++)
18513 dfirst.perm[i] = 0xff;
18514 for (i = 0, msk = 0; i < nelt; i++)
18515 {
18516 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18517 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18518 return false;
18519 dfirst.perm[j] = d->perm[i];
18520 if (j != i)
18521 msk |= (1 << i);
18522 }
18523 for (i = 0; i < nelt; i++)
18524 if (dfirst.perm[i] == 0xff)
18525 dfirst.perm[i] = i;
18526
18527 if (!d->testing_p)
18528 dfirst.target = gen_reg_rtx (dfirst.vmode);
18529
18530 start_sequence ();
18531 ok = expand_vec_perm_1 (&dfirst);
18532 seq = get_insns ();
18533 end_sequence ();
18534
18535 if (!ok)
18536 return false;
18537
18538 if (d->testing_p)
18539 return true;
18540
18541 emit_insn (seq);
18542
18543 dsecond = *d;
18544 dsecond.op0 = dfirst.target;
18545 dsecond.op1 = dfirst.target;
18546 dsecond.one_operand_p = true;
18547 dsecond.target = gen_reg_rtx (dsecond.vmode);
18548 for (i = 0; i < nelt; i++)
18549 dsecond.perm[i] = i ^ nelt2;
18550
18551 ok = expand_vec_perm_1 (&dsecond);
18552 gcc_assert (ok);
18553
18554 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18555 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18556 return true;
18557}
18558
18559/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
18560 permutation using two vperm2f128, followed by a vshufpd insn blending
18561 the two vectors together. */
18562
18563static bool
18564expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18565{
18566 struct expand_vec_perm_d dfirst, dsecond, dthird;
18567 bool ok;
18568
18569 if (!TARGET_AVX || (d->vmode != V4DFmode))
18570 return false;
18571
18572 if (d->testing_p)
18573 return true;
18574
18575 dfirst = *d;
18576 dsecond = *d;
18577 dthird = *d;
18578
18579 dfirst.perm[0] = (d->perm[0] & ~1);
18580 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18581 dfirst.perm[2] = (d->perm[2] & ~1);
18582 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18583 dsecond.perm[0] = (d->perm[1] & ~1);
18584 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18585 dsecond.perm[2] = (d->perm[3] & ~1);
18586 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18587 dthird.perm[0] = (d->perm[0] % 2);
18588 dthird.perm[1] = (d->perm[1] % 2) + 4;
18589 dthird.perm[2] = (d->perm[2] % 2) + 2;
18590 dthird.perm[3] = (d->perm[3] % 2) + 6;
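  /* E.g. for d->perm == { 1, 4, 6, 3 } this gives dfirst == { 0, 1, 6, 7 },
     dsecond == { 4, 5, 2, 3 } and dthird == { 1, 4, 2, 7 }, which the
     final vshufpd-style shuffle combines into the desired result.  */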
18591
18592 dfirst.target = gen_reg_rtx (dfirst.vmode);
18593 dsecond.target = gen_reg_rtx (dsecond.vmode);
18594 dthird.op0 = dfirst.target;
18595 dthird.op1 = dsecond.target;
18596 dthird.one_operand_p = false;
18597
18598 canonicalize_perm (&dfirst);
18599 canonicalize_perm (&dsecond);
18600
18601 ok = expand_vec_perm_1 (&dfirst)
18602 && expand_vec_perm_1 (&dsecond)
18603 && expand_vec_perm_1 (&dthird);
18604
18605 gcc_assert (ok);
18606
18607 return true;
18608}
18609
18610static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18611
18612/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18613 a two vector permutation using two intra-lane vector
18614 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18615 the non-swapped and swapped vectors together. */
18616
18617static bool
18618expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18619{
18620 struct expand_vec_perm_d dfirst, dsecond, dthird;
18621 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18622 rtx_insn *seq1, *seq2;
18623 bool ok;
18624 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18625
18626 if (!TARGET_AVX
18627 || TARGET_AVX2
18628 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18629 || d->one_operand_p)
18630 return false;
18631
18632 dfirst = *d;
18633 dsecond = *d;
18634 for (i = 0; i < nelt; i++)
18635 {
18636 dfirst.perm[i] = 0xff;
18637 dsecond.perm[i] = 0xff;
18638 }
18639 for (i = 0, msk = 0; i < nelt; i++)
18640 {
18641 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18642 if (j == i)
18643 {
18644 dfirst.perm[j] = d->perm[i];
18645 which1 |= (d->perm[i] < nelt ? 1 : 2);
18646 }
18647 else
18648 {
18649 dsecond.perm[j] = d->perm[i];
18650 which2 |= (d->perm[i] < nelt ? 1 : 2);
18651 msk |= (1U << i);
18652 }
18653 }
18654 if (msk == 0 || msk == (1U << nelt) - 1)
18655 return false;
18656
18657 if (!d->testing_p)
18658 {
18659 dfirst.target = gen_reg_rtx (dfirst.vmode);
18660 dsecond.target = gen_reg_rtx (dsecond.vmode);
18661 }
18662
18663 for (i = 0; i < nelt; i++)
18664 {
18665 if (dfirst.perm[i] == 0xff)
18666 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18667 if (dsecond.perm[i] == 0xff)
18668 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18669 }
18670 canonicalize_perm (&dfirst);
18671 start_sequence ();
18672 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18673 seq1 = get_insns ();
18674 end_sequence ();
18675
18676 if (!ok)
18677 return false;
18678
18679 canonicalize_perm (&dsecond);
18680 start_sequence ();
18681 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18682 seq2 = get_insns ();
18683 end_sequence ();
18684
18685 if (!ok)
18686 return false;
18687
18688 if (d->testing_p)
18689 return true;
18690
18691 emit_insn (seq1);
18692 emit_insn (seq2);
18693
18694 dthird = *d;
18695 dthird.op0 = dsecond.target;
18696 dthird.op1 = dsecond.target;
18697 dthird.one_operand_p = true;
18698 dthird.target = gen_reg_rtx (dthird.vmode);
18699 for (i = 0; i < nelt; i++)
18700 dthird.perm[i] = i ^ nelt2;
18701
18702 ok = expand_vec_perm_1 (&dthird);
18703 gcc_assert (ok);
18704
18705 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18706 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18707 return true;
18708}
18709
18710/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18711 permutation with two pshufb insns and an ior. We should have already
18712 failed all two instruction sequences. */
18713
18714static bool
18715expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18716{
18717 rtx rperm[2][16], vperm, l, h, op, m128;
18718 unsigned int i, nelt, eltsz;
18719
18720 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18721 return false;
18722 gcc_assert (!d->one_operand_p);
18723
18724 if (d->testing_p)
18725 return true;
18726
18727 nelt = d->nelt;
18728 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18729
18730 /* Generate two permutation masks. If the required element is within
18731 the given vector it is shuffled into the proper lane. If the required
18732 element is in the other vector, force a zero into the lane by setting
18733 bit 7 in the permutation mask. */
18734 m128 = GEN_INT (-128);
18735 for (i = 0; i < nelt; ++i)
18736 {
18737 unsigned j, e = d->perm[i];
18738 unsigned which = (e >= nelt);
18739 if (e >= nelt)
18740 e -= nelt;
18741
18742 for (j = 0; j < eltsz; ++j)
18743 {
18744 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18745 rperm[1-which][i*eltsz + j] = m128;
18746 }
18747 }
18748
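  /* Each pshufb picks its bytes from one operand and leaves zeros where the
     other operand must supply the byte, so OR-ing the two results yields
     the full two-operand permutation.  */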
18749 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18750 vperm = force_reg (V16QImode, vperm);
18751
18752 l = gen_reg_rtx (V16QImode);
18753 op = gen_lowpart (V16QImode, d->op0);
18754 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18755
18756 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18757 vperm = force_reg (V16QImode, vperm);
18758
18759 h = gen_reg_rtx (V16QImode);
18760 op = gen_lowpart (V16QImode, d->op1);
18761 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18762
18763 op = d->target;
18764 if (d->vmode != V16QImode)
18765 op = gen_reg_rtx (V16QImode);
18766 emit_insn (gen_iorv16qi3 (op, l, h));
18767 if (op != d->target)
18768 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18769
18770 return true;
18771}
18772
18773/* Implement an arbitrary permutation of a single V32QImode or V16HImode
18774   operand with two vpshufb insns, one vpermq and one vpor.  We should
18775   have already failed all two or three instruction sequences.  */
18776
18777static bool
18778expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18779{
18780 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18781 unsigned int i, nelt, eltsz;
18782
18783 if (!TARGET_AVX2
18784 || !d->one_operand_p
18785 || (d->vmode != V32QImode && d->vmode != V16HImode))
18786 return false;
18787
18788 if (d->testing_p)
18789 return true;
18790
18791 nelt = d->nelt;
18792 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18793
18794  /* Generate two permutation masks.  If the required element lies
18795     within the same lane, the first mask shuffles it into place and the
18796     second mask zeroes that byte by setting bit 7.  If it lies in the
18797     other lane, the second mask holds its index at the mirrored
18798     position in the opposite lane and the first mask zeroes the byte,
18799     so that swapping the two V2TImode halves of the second vpshufb
18800     result moves the byte where it belongs.  */
18801 m128 = GEN_INT (-128);
18802 for (i = 0; i < nelt; ++i)
18803 {
18804 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18805 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18806
18807 for (j = 0; j < eltsz; ++j)
18808 {
18809 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18810 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18811 }
18812 }
18813
18814 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18815 vperm = force_reg (V32QImode, vperm);
18816
18817 h = gen_reg_rtx (V32QImode);
18818 op = gen_lowpart (V32QImode, d->op0);
18819 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18820
18821  /* Swap the 128-bit lanes of h into hp.  */
18822 hp = gen_reg_rtx (V4DImode);
18823 op = gen_lowpart (V4DImode, h);
18824 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18825 const1_rtx));
18826
18827 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18828 vperm = force_reg (V32QImode, vperm);
18829
18830 l = gen_reg_rtx (V32QImode);
18831 op = gen_lowpart (V32QImode, d->op0);
18832 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18833
18834 op = d->target;
18835 if (d->vmode != V32QImode)
18836 op = gen_reg_rtx (V32QImode);
18837 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18838 if (op != d->target)
18839 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18840
18841 return true;
18842}
18843
18844/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18845   and extract-odd permutations of two V32QImode or V16HImode operands
18846 with two vpshufb insns, vpor and vpermq. We should have already
18847 failed all two or three instruction sequences. */
18848
18849static bool
18850expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18851{
18852 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18853 unsigned int i, nelt, eltsz;
18854
18855 if (!TARGET_AVX2
18856 || d->one_operand_p
18857 || (d->vmode != V32QImode && d->vmode != V16HImode))
18858 return false;
18859
18860 for (i = 0; i < d->nelt; ++i)
18861 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18862 return false;
18863
18864 if (d->testing_p)
18865 return true;
18866
18867 nelt = d->nelt;
18868 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18869
18870 /* Generate two permutation masks. In the first permutation mask
18871 the first quarter will contain indexes for the first half
18872 of the op0, the second quarter will contain bit 7 set, third quarter
18873 will contain indexes for the second half of the op0 and the
18874 last quarter bit 7 set. In the second permutation mask
18875 the first quarter will contain bit 7 set, the second quarter
18876 indexes for the first half of the op1, the third quarter bit 7 set
18877 and last quarter indexes for the second half of the op1.
18878 I.e. the first mask e.g. for V32QImode extract even will be:
18879 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18880 (all values masked with 0xf except for -128) and second mask
18881 for extract even will be
18882 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18883 m128 = GEN_INT (-128);
18884 for (i = 0; i < nelt; ++i)
18885 {
18886 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18887 unsigned which = d->perm[i] >= nelt;
18888 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18889
18890 for (j = 0; j < eltsz; ++j)
18891 {
18892 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18893 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18894 }
18895 }
18896
18897 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18898 vperm = force_reg (V32QImode, vperm);
18899
18900 l = gen_reg_rtx (V32QImode);
18901 op = gen_lowpart (V32QImode, d->op0);
18902 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18903
18904 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18905 vperm = force_reg (V32QImode, vperm);
18906
18907 h = gen_reg_rtx (V32QImode);
18908 op = gen_lowpart (V32QImode, d->op1);
18909 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18910
18911 ior = gen_reg_rtx (V32QImode);
18912 emit_insn (gen_iorv32qi3 (ior, l, h));
18913
18914 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18915 op = gen_reg_rtx (V4DImode);
18916 ior = gen_lowpart (V4DImode, ior);
18917 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18918 const1_rtx, GEN_INT (3)));
18919 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18920
18921 return true;
18922}
18923
18924/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18925 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18926 with two "and" and "pack" or two "shift" and "pack" insns. We should
18927 have already failed all two instruction sequences. */
18928
18929static bool
18930expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18931{
18932 rtx op, dop0, dop1, t;
18933 unsigned i, odd, c, s, nelt = d->nelt;
18934 bool end_perm = false;
18935 machine_mode half_mode;
18936 rtx (*gen_and) (rtx, rtx, rtx);
18937 rtx (*gen_pack) (rtx, rtx, rtx);
18938 rtx (*gen_shift) (rtx, rtx, rtx);
18939
18940 if (d->one_operand_p)
18941 return false;
18942
18943 switch (d->vmode)
18944 {
18945 case E_V8HImode:
18946 /* Required for "pack". */
18947 if (!TARGET_SSE4_1)
18948 return false;
18949 c = 0xffff;
18950 s = 16;
18951 half_mode = V4SImode;
18952 gen_and = gen_andv4si3;
18953 gen_pack = gen_sse4_1_packusdw;
18954 gen_shift = gen_lshrv4si3;
18955 break;
18956 case E_V16QImode:
18957 /* No check as all instructions are SSE2. */
18958 c = 0xff;
18959 s = 8;
18960 half_mode = V8HImode;
18961 gen_and = gen_andv8hi3;
18962 gen_pack = gen_sse2_packuswb;
18963 gen_shift = gen_lshrv8hi3;
18964 break;
18965 case E_V16HImode:
18966 if (!TARGET_AVX2)
18967 return false;
18968 c = 0xffff;
18969 s = 16;
18970 half_mode = V8SImode;
18971 gen_and = gen_andv8si3;
18972 gen_pack = gen_avx2_packusdw;
18973 gen_shift = gen_lshrv8si3;
18974 end_perm = true;
18975 break;
18976 case E_V32QImode:
18977 if (!TARGET_AVX2)
18978 return false;
18979 c = 0xff;
18980 s = 8;
18981 half_mode = V16HImode;
18982 gen_and = gen_andv16hi3;
18983 gen_pack = gen_avx2_packuswb;
18984 gen_shift = gen_lshrv16hi3;
18985 end_perm = true;
18986 break;
18987 default:
18988 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18989 general shuffles. */
18990 return false;
18991 }
18992
18993 /* Check that permutation is even or odd. */
18994 odd = d->perm[0];
18995 if (odd > 1)
18996 return false;
18997
18998 for (i = 1; i < nelt; ++i)
18999 if (d->perm[i] != 2 * i + odd)
19000 return false;
19001
19002 if (d->testing_p)
19003 return true;
19004
19005 dop0 = gen_reg_rtx (half_mode);
19006 dop1 = gen_reg_rtx (half_mode);
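  /* For the even extraction, AND with C keeps only the low half of each
     wide element; for the odd extraction, a logical right shift by S moves
     the high half down.  The unsigned-saturating pack then narrows the wide
     elements without altering them, producing the requested even or odd
     elements.  */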
19007 if (odd == 0)
19008 {
19009 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
19010 t = force_reg (half_mode, t);
19011 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
19012 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
19013 }
19014 else
19015 {
19016 emit_insn (gen_shift (dop0,
19017 gen_lowpart (half_mode, d->op0),
19018 GEN_INT (s)));
19019 emit_insn (gen_shift (dop1,
19020 gen_lowpart (half_mode, d->op1),
19021 GEN_INT (s)));
19022 }
19023  /* For the AVX2 256-bit case we need to permute the pack result.  */
19024 if (TARGET_AVX2 && end_perm)
19025 {
19026 op = gen_reg_rtx (d->vmode);
19027 t = gen_reg_rtx (V4DImode);
19028 emit_insn (gen_pack (op, dop0, dop1));
19029 emit_insn (gen_avx2_permv4di_1 (t,
19030 gen_lowpart (V4DImode, op),
19031 const0_rtx,
19032 const2_rtx,
19033 const1_rtx,
19034 GEN_INT (3)));
19035 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
19036 }
19037 else
19038 emit_insn (gen_pack (d->target, dop0, dop1));
19039
19040 return true;
19041}
19042
19043/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
19044 and extract-odd permutations of two V64QI operands
19045   with two "shifts", two "truncs" and one "concat" insn for "odd",
19046   and two "truncs" and one "concat" insn for "even".
19047   We should have already failed all two instruction sequences.  */
19048
19049static bool
19050expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
19051{
19052 rtx t1, t2, t3, t4;
19053 unsigned i, odd, nelt = d->nelt;
19054
19055 if (!TARGET_AVX512BW
19056 || d->one_operand_p
19057 || d->vmode != V64QImode)
19058 return false;
19059
19060 /* Check that permutation is even or odd. */
19061 odd = d->perm[0];
19062 if (odd > 1)
19063 return false;
19064
19065 for (i = 1; i < nelt; ++i)
19066 if (d->perm[i] != 2 * i + odd)
19067 return false;
19068
19069 if (d->testing_p)
19070 return true;
19071
19072
19073 if (odd)
19074 {
19075 t1 = gen_reg_rtx (V32HImode);
19076 t2 = gen_reg_rtx (V32HImode);
19077 emit_insn (gen_lshrv32hi3 (t1,
19078 gen_lowpart (V32HImode, d->op0),
19079 GEN_INT (8)));
19080 emit_insn (gen_lshrv32hi3 (t2,
19081 gen_lowpart (V32HImode, d->op1),
19082 GEN_INT (8)));
19083 }
19084 else
19085 {
19086 t1 = gen_lowpart (V32HImode, d->op0);
19087 t2 = gen_lowpart (V32HImode, d->op1);
19088 }
19089
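  /* The truncations keep the low byte of every word, which after the
     optional shift is exactly the requested even or odd byte; concatenating
     the two 256-bit results forms the V64QImode destination.  */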
19090 t3 = gen_reg_rtx (V32QImode);
19091 t4 = gen_reg_rtx (V32QImode);
19092 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
19093 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
19094 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
19095
19096 return true;
19097}
19098
4bf4c103 19099/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
2bf6d935
ML
19100 and extract-odd permutations. */
19101
19102static bool
19103expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
19104{
19105 rtx t1, t2, t3, t4, t5;
19106
19107 switch (d->vmode)
19108 {
19109 case E_V4DFmode:
19110 if (d->testing_p)
19111 break;
19112 t1 = gen_reg_rtx (V4DFmode);
19113 t2 = gen_reg_rtx (V4DFmode);
19114
19115 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19116 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
19117 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
19118
19119 /* Now an unpck[lh]pd will produce the result required. */
19120 if (odd)
19121 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
19122 else
19123 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
19124 emit_insn (t3);
19125 break;
19126
19127 case E_V8SFmode:
19128 {
19129 int mask = odd ? 0xdd : 0x88;
19130
19131 if (d->testing_p)
19132 break;
19133 t1 = gen_reg_rtx (V8SFmode);
19134 t2 = gen_reg_rtx (V8SFmode);
19135 t3 = gen_reg_rtx (V8SFmode);
19136
19137 /* Shuffle within the 128-bit lanes to produce:
19138 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19139 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
19140 GEN_INT (mask)));
19141
19142 /* Shuffle the lanes around to produce:
19143 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19144 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
19145 GEN_INT (0x3)));
19146
19147 /* Shuffle within the 128-bit lanes to produce:
19148 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19149 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
19150
19151 /* Shuffle within the 128-bit lanes to produce:
19152 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19153 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
19154
19155 /* Shuffle the lanes around to produce:
19156 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19157 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
19158 GEN_INT (0x20)));
19159 }
19160 break;
19161
19162 case E_V2DFmode:
19163 case E_V4SFmode:
19164 case E_V2DImode:
9b8579a6 19165 case E_V2SImode:
2bf6d935
ML
19166 case E_V4SImode:
19167 /* These are always directly implementable by expand_vec_perm_1. */
19168 gcc_unreachable ();
19169
240198fe
UB
19170 case E_V2SFmode:
19171 gcc_assert (TARGET_MMX_WITH_SSE);
19172 /* We have no suitable instructions. */
19173 if (d->testing_p)
19174 return false;
19175 break;
19176
9b8579a6
UB
19177 case E_V4HImode:
19178 if (d->testing_p)
19179 break;
19180 /* We need 2*log2(N)-1 operations to achieve odd/even
19181 with interleave. */
19182 t1 = gen_reg_rtx (V4HImode);
19183 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
19184 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
19185 if (odd)
19186 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
19187 else
19188 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
19189 emit_insn (t2);
19190 break;
19191
2bf6d935
ML
19192 case E_V8HImode:
19193 if (TARGET_SSE4_1)
19194 return expand_vec_perm_even_odd_pack (d);
19195 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
19196 return expand_vec_perm_pshufb2 (d);
19197 else
19198 {
19199 if (d->testing_p)
19200 break;
19201 /* We need 2*log2(N)-1 operations to achieve odd/even
19202 with interleave. */
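          /* For example, extracting the even elements of { a0 ... a7 } and
             { b0 ... b7 } proceeds as
               interleave low/high: { a0 b0 a1 b1 a2 b2 a3 b3 }
                                    { a4 b4 a5 b5 a6 b6 a7 b7 }
               interleave again:    { a0 a4 b0 b4 a1 a5 b1 b5 }
                                    { a2 a6 b2 b6 a3 a7 b3 b7 }
               final interleave:    { a0 a2 a4 a6 b0 b2 b4 b6 }.  */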
19203 t1 = gen_reg_rtx (V8HImode);
19204 t2 = gen_reg_rtx (V8HImode);
19205 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
19206 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
19207 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
19208 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
19209 if (odd)
19210 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
19211 else
19212 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
19213 emit_insn (t3);
19214 }
19215 break;
19216
19217 case E_V16QImode:
19218 return expand_vec_perm_even_odd_pack (d);
19219
19220 case E_V16HImode:
19221 case E_V32QImode:
19222 return expand_vec_perm_even_odd_pack (d);
19223
19224 case E_V64QImode:
19225 return expand_vec_perm_even_odd_trunc (d);
19226
19227 case E_V4DImode:
19228 if (!TARGET_AVX2)
19229 {
19230 struct expand_vec_perm_d d_copy = *d;
19231 d_copy.vmode = V4DFmode;
19232 if (d->testing_p)
19233 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
19234 else
19235 d_copy.target = gen_reg_rtx (V4DFmode);
19236 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
19237 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
19238 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19239 {
19240 if (!d->testing_p)
19241 emit_move_insn (d->target,
19242 gen_lowpart (V4DImode, d_copy.target));
19243 return true;
19244 }
19245 return false;
19246 }
19247
19248 if (d->testing_p)
19249 break;
19250
19251 t1 = gen_reg_rtx (V4DImode);
19252 t2 = gen_reg_rtx (V4DImode);
19253
19254 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19255 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
19256 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
19257
19258   /* Now a vpunpck[lh]qdq will produce the result required.  */
19259 if (odd)
19260 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
19261 else
19262 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
19263 emit_insn (t3);
19264 break;
19265
19266 case E_V8SImode:
19267 if (!TARGET_AVX2)
19268 {
19269 struct expand_vec_perm_d d_copy = *d;
19270 d_copy.vmode = V8SFmode;
19271 if (d->testing_p)
19272 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
19273 else
19274 d_copy.target = gen_reg_rtx (V8SFmode);
19275 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
19276 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
19277 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19278 {
19279 if (!d->testing_p)
19280 emit_move_insn (d->target,
19281 gen_lowpart (V8SImode, d_copy.target));
19282 return true;
19283 }
19284 return false;
19285 }
19286
19287 if (d->testing_p)
19288 break;
19289
19290 t1 = gen_reg_rtx (V8SImode);
19291 t2 = gen_reg_rtx (V8SImode);
19292 t3 = gen_reg_rtx (V4DImode);
19293 t4 = gen_reg_rtx (V4DImode);
19294 t5 = gen_reg_rtx (V4DImode);
19295
19296 /* Shuffle the lanes around into
19297 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19298 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
19299 gen_lowpart (V4DImode, d->op1),
19300 GEN_INT (0x20)));
19301 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
19302 gen_lowpart (V4DImode, d->op1),
19303 GEN_INT (0x31)));
19304
19305 /* Swap the 2nd and 3rd position in each lane into
19306 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19307 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
19308 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19309 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
19310 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19311
19312      /* Now a vpunpck[lh]qdq will produce
19313 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19314 if (odd)
19315 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
19316 gen_lowpart (V4DImode, t2));
19317 else
19318 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
19319 gen_lowpart (V4DImode, t2));
19320 emit_insn (t3);
19321 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
19322 break;
19323
19324 default:
19325 gcc_unreachable ();
19326 }
19327
19328 return true;
19329}
19330
4bf4c103 19331/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
19332 extract-even and extract-odd permutations. */
19333
19334static bool
19335expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
19336{
19337 unsigned i, odd, nelt = d->nelt;
19338
19339 odd = d->perm[0];
19340 if (odd != 0 && odd != 1)
19341 return false;
19342
19343 for (i = 1; i < nelt; ++i)
19344 if (d->perm[i] != 2 * i + odd)
19345 return false;
19346
19347 return expand_vec_perm_even_odd_1 (d, odd);
19348}
19349
4bf4c103 19350/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
2bf6d935
ML
19351 permutations. We assume that expand_vec_perm_1 has already failed. */
19352
19353static bool
19354expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
19355{
19356 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
19357 machine_mode vmode = d->vmode;
19358 unsigned char perm2[4];
19359 rtx op0 = d->op0, dest;
19360 bool ok;
19361
19362 switch (vmode)
19363 {
19364 case E_V4DFmode:
19365 case E_V8SFmode:
19366 /* These are special-cased in sse.md so that we can optionally
19367 use the vbroadcast instruction. They expand to two insns
19368 if the input happens to be in a register. */
19369 gcc_unreachable ();
19370
19371 case E_V2DFmode:
240198fe 19372 case E_V2SFmode:
2bf6d935 19373 case E_V4SFmode:
240198fe 19374 case E_V2DImode:
9b8579a6 19375 case E_V2SImode:
2bf6d935
ML
19376 case E_V4SImode:
19377 /* These are always implementable using standard shuffle patterns. */
19378 gcc_unreachable ();
19379
19380 case E_V8HImode:
19381 case E_V16QImode:
19382 /* These can be implemented via interleave. We save one insn by
19383 stopping once we have promoted to V4SImode and then use pshufd. */
19384 if (d->testing_p)
19385 return true;
19386 do
19387 {
19388 rtx dest;
19389 rtx (*gen) (rtx, rtx, rtx)
19390 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
19391 : gen_vec_interleave_lowv8hi;
19392
19393 if (elt >= nelt2)
19394 {
19395 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
19396 : gen_vec_interleave_highv8hi;
19397 elt -= nelt2;
19398 }
19399 nelt2 /= 2;
19400
19401 dest = gen_reg_rtx (vmode);
19402 emit_insn (gen (dest, op0, op0));
19403 vmode = get_mode_wider_vector (vmode);
19404 op0 = gen_lowpart (vmode, dest);
19405 }
19406 while (vmode != V4SImode);
19407
19408 memset (perm2, elt, 4);
19409 dest = gen_reg_rtx (V4SImode);
19410 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
19411 gcc_assert (ok);
19412 if (!d->testing_p)
19413 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
19414 return true;
19415
19416 case E_V64QImode:
19417 case E_V32QImode:
19418 case E_V16HImode:
19419 case E_V8SImode:
19420 case E_V4DImode:
19421 /* For AVX2 broadcasts of the first element vpbroadcast* or
19422 vpermq should be used by expand_vec_perm_1. */
19423 gcc_assert (!TARGET_AVX2 || d->perm[0]);
19424 return false;
19425
19426 default:
19427 gcc_unreachable ();
19428 }
19429}
19430
4bf4c103 19431/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
19432 broadcast permutations. */
19433
19434static bool
19435expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
19436{
19437 unsigned i, elt, nelt = d->nelt;
19438
19439 if (!d->one_operand_p)
19440 return false;
19441
19442 elt = d->perm[0];
19443 for (i = 1; i < nelt; ++i)
19444 if (d->perm[i] != elt)
19445 return false;
19446
19447 return expand_vec_perm_broadcast_1 (d);
19448}
19449
19450/* Implement arbitrary permutations of two V64QImode operands
19451 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19452static bool
19453expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
19454{
19455 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
19456 return false;
19457
19458 if (d->testing_p)
19459 return true;
19460
19461 struct expand_vec_perm_d ds[2];
19462 rtx rperm[128], vperm, target0, target1;
19463 unsigned int i, nelt;
19464 machine_mode vmode;
19465
19466 nelt = d->nelt;
19467 vmode = V64QImode;
19468
19469 for (i = 0; i < 2; i++)
19470 {
19471 ds[i] = *d;
19472 ds[i].vmode = V32HImode;
19473 ds[i].nelt = 32;
19474 ds[i].target = gen_reg_rtx (V32HImode);
19475 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19476 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19477 }
19478
19479  /* Prepare the permutations such that the first one (ds[0]) puts each
19480     even destination byte into its final position or one position
19481     higher, and the second one (ds[1]) puts each odd destination byte
19482     into its final position or one position lower; the vpshufb masks
19483     built below fix up the remaining off-by-one bytes.  */
19484
19485 for (i = 0; i < nelt; i++)
19486 {
19487 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19488 if (i & 1)
19489 {
19490 rperm[i] = constm1_rtx;
19491 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19492 }
19493 else
19494 {
19495 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19496 rperm[i + 64] = constm1_rtx;
19497 }
19498 }
19499
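  /* The two word permutations bring the word containing each requested byte
     into place; the vpshufb masks built above then select the correct byte
     within each word, with -1 (all-ones) entries zeroing the bytes that the
     other result supplies, so a final vpor merges the two halves.  */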
19500 bool ok = expand_vec_perm_1 (&ds[0]);
19501 gcc_assert (ok);
19502 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19503
19504 ok = expand_vec_perm_1 (&ds[1]);
19505 gcc_assert (ok);
19506 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19507
19508 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19509 vperm = force_reg (vmode, vperm);
19510 target0 = gen_reg_rtx (V64QImode);
19511 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19512
19513 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19514 vperm = force_reg (vmode, vperm);
19515 target1 = gen_reg_rtx (V64QImode);
19516 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19517
19518 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19519 return true;
19520}
19521
19522/* Implement an arbitrary permutation of two V32QImode or V16HImode operands
19523 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19524 all the shorter instruction sequences. */
19525
19526static bool
19527expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19528{
19529 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19530 unsigned int i, nelt, eltsz;
19531 bool used[4];
19532
19533 if (!TARGET_AVX2
19534 || d->one_operand_p
19535 || (d->vmode != V32QImode && d->vmode != V16HImode))
19536 return false;
19537
19538 if (d->testing_p)
19539 return true;
19540
19541 nelt = d->nelt;
19542 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19543
19544  /* Generate 4 permutation masks, an in-lane and a cross-lane mask for
19545     each operand; every entry not explicitly set below stays -128 and
19546     thus yields a zero byte.  If the required element lies within the
19547     same lane, its index goes into the in-lane mask.  If it lies in the
19548     other lane, its index goes into the cross-lane mask at the mirrored
19549     position in the opposite lane, so that swapping the two V2TImode
19550     halves of that vpshufb result moves the byte where it belongs.  */
19551 m128 = GEN_INT (-128);
19552 for (i = 0; i < 32; ++i)
19553 {
19554 rperm[0][i] = m128;
19555 rperm[1][i] = m128;
19556 rperm[2][i] = m128;
19557 rperm[3][i] = m128;
19558 }
19559 used[0] = false;
19560 used[1] = false;
19561 used[2] = false;
19562 used[3] = false;
19563 for (i = 0; i < nelt; ++i)
19564 {
19565 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19566 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19567 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19568
19569 for (j = 0; j < eltsz; ++j)
19570 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19571 used[which] = true;
19572 }
19573
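  /* used[] records which of the four masks (op0/op1 crossed with
     in-lane/cross-lane) received at least one index, so the vpshufb and the
     lane-swapping vpermq below are emitted only for the masks that are
     actually needed.  */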
19574 for (i = 0; i < 2; ++i)
19575 {
19576 if (!used[2 * i + 1])
19577 {
19578 h[i] = NULL_RTX;
19579 continue;
19580 }
19581 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19582 gen_rtvec_v (32, rperm[2 * i + 1]));
19583 vperm = force_reg (V32QImode, vperm);
19584 h[i] = gen_reg_rtx (V32QImode);
19585 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19586 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19587 }
19588
19589  /* Swap the 128-bit lanes of h[X].  */
19590 for (i = 0; i < 2; ++i)
19591 {
19592 if (h[i] == NULL_RTX)
19593 continue;
19594 op = gen_reg_rtx (V4DImode);
19595 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19596 const2_rtx, GEN_INT (3), const0_rtx,
19597 const1_rtx));
19598 h[i] = gen_lowpart (V32QImode, op);
19599 }
19600
19601 for (i = 0; i < 2; ++i)
19602 {
19603 if (!used[2 * i])
19604 {
19605 l[i] = NULL_RTX;
19606 continue;
19607 }
19608 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19609 vperm = force_reg (V32QImode, vperm);
19610 l[i] = gen_reg_rtx (V32QImode);
19611 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19612 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19613 }
19614
19615 for (i = 0; i < 2; ++i)
19616 {
19617 if (h[i] && l[i])
19618 {
19619 op = gen_reg_rtx (V32QImode);
19620 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19621 l[i] = op;
19622 }
19623 else if (h[i])
19624 l[i] = h[i];
19625 }
19626
19627 gcc_assert (l[0] && l[1]);
19628 op = d->target;
19629 if (d->vmode != V32QImode)
19630 op = gen_reg_rtx (V32QImode);
19631 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19632 if (op != d->target)
19633 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19634 return true;
19635}
19636
19637/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19638 taken care of, perform the expansion in D and return true on success. */
19639
19640static bool
19641ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19642{
19643 /* Try a single instruction expansion. */
19644 if (expand_vec_perm_1 (d))
19645 return true;
19646
19647 /* Try sequences of two instructions. */
19648
19649 if (expand_vec_perm_pshuflw_pshufhw (d))
19650 return true;
19651
19652 if (expand_vec_perm_palignr (d, false))
19653 return true;
19654
19655 if (expand_vec_perm_interleave2 (d))
19656 return true;
19657
19658 if (expand_vec_perm_broadcast (d))
19659 return true;
19660
19661 if (expand_vec_perm_vpermq_perm_1 (d))
19662 return true;
19663
19664 if (expand_vec_perm_vperm2f128 (d))
19665 return true;
19666
19667 if (expand_vec_perm_pblendv (d))
19668 return true;
19669
19670 /* Try sequences of three instructions. */
19671
19672 if (expand_vec_perm_even_odd_pack (d))
19673 return true;
19674
19675 if (expand_vec_perm_2vperm2f128_vshuf (d))
19676 return true;
19677
19678 if (expand_vec_perm_pshufb2 (d))
19679 return true;
19680
19681 if (expand_vec_perm_interleave3 (d))
19682 return true;
19683
19684 if (expand_vec_perm_vperm2f128_vblend (d))
19685 return true;
19686
19687 /* Try sequences of four instructions. */
19688
19689 if (expand_vec_perm_even_odd_trunc (d))
19690 return true;
19691 if (expand_vec_perm_vpshufb2_vpermq (d))
19692 return true;
19693
19694 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19695 return true;
19696
19697 if (expand_vec_perm_vpermt2_vpshub2 (d))
19698 return true;
19699
19700 /* ??? Look for narrow permutations whose element orderings would
19701 allow the promotion to a wider mode. */
19702
19703 /* ??? Look for sequences of interleave or a wider permute that place
19704 the data into the correct lanes for a half-vector shuffle like
19705 pshuf[lh]w or vpermilps. */
19706
19707 /* ??? Look for sequences of interleave that produce the desired results.
19708 The combinatorics of punpck[lh] get pretty ugly... */
19709
19710 if (expand_vec_perm_even_odd (d))
19711 return true;
19712
19713 /* Even longer sequences. */
19714 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19715 return true;
19716
19717 /* See if we can get the same permutation in different vector integer
19718 mode. */
19719 struct expand_vec_perm_d nd;
19720 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19721 {
19722 if (!d->testing_p)
19723 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19724 return true;
19725 }
19726
4bf4c103
JJ
19727 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19728 if (expand_vec_perm2_vperm2f128_vblend (d))
19729 return true;
19730
2bf6d935
ML
19731 return false;
19732}
19733
19734/* If a permutation only uses one operand, make it clear. Returns true
19735 if the permutation references both operands. */
19736
19737static bool
19738canonicalize_perm (struct expand_vec_perm_d *d)
19739{
19740 int i, which, nelt = d->nelt;
19741
19742 for (i = which = 0; i < nelt; ++i)
4bf4c103 19743 which |= (d->perm[i] < nelt ? 1 : 2);
2bf6d935
ML
19744
19745 d->one_operand_p = true;
19746 switch (which)
19747 {
19748 default:
19749 gcc_unreachable();
19750
19751 case 3:
19752 if (!rtx_equal_p (d->op0, d->op1))
19753 {
19754 d->one_operand_p = false;
19755 break;
19756 }
19757 /* The elements of PERM do not suggest that only the first operand
19758 is used, but both operands are identical. Allow easier matching
19759 of the permutation by folding the permutation into the single
19760 input vector. */
19761 /* FALLTHRU */
19762
19763 case 2:
19764 for (i = 0; i < nelt; ++i)
19765 d->perm[i] &= nelt - 1;
19766 d->op0 = d->op1;
19767 break;
19768
19769 case 1:
19770 d->op1 = d->op0;
19771 break;
19772 }
19773
19774 return (which == 3);
19775}
19776
19777/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19778
19779bool
19780ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19781 rtx op1, const vec_perm_indices &sel)
19782{
19783 struct expand_vec_perm_d d;
19784 unsigned char perm[MAX_VECT_LEN];
19785 unsigned int i, nelt, which;
19786 bool two_args;
19787
19788 d.target = target;
19789 d.op0 = op0;
19790 d.op1 = op1;
19791
19792 d.vmode = vmode;
19793 gcc_assert (VECTOR_MODE_P (d.vmode));
19794 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19795 d.testing_p = !target;
19796
19797 gcc_assert (sel.length () == nelt);
19798 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19799
19800 /* Given sufficient ISA support we can just return true here
19801 for selected vector modes. */
19802 switch (d.vmode)
19803 {
19804 case E_V16SFmode:
19805 case E_V16SImode:
19806 case E_V8DImode:
19807 case E_V8DFmode:
19808 if (!TARGET_AVX512F)
19809 return false;
19810 /* All implementable with a single vperm[it]2 insn. */
19811 if (d.testing_p)
19812 return true;
19813 break;
19814 case E_V32HImode:
19815 if (!TARGET_AVX512BW)
19816 return false;
19817 if (d.testing_p)
19818 /* All implementable with a single vperm[it]2 insn. */
19819 return true;
19820 break;
19821 case E_V64QImode:
19822 if (!TARGET_AVX512BW)
19823 return false;
19824 if (d.testing_p)
19825 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19826 return true;
19827 break;
19828 case E_V8SImode:
19829 case E_V8SFmode:
19830 case E_V4DFmode:
19831 case E_V4DImode:
19832 if (!TARGET_AVX)
19833 return false;
19834 if (d.testing_p && TARGET_AVX512VL)
19835 /* All implementable with a single vperm[it]2 insn. */
19836 return true;
19837 break;
19838 case E_V16HImode:
19839 if (!TARGET_SSE2)
19840 return false;
19841 if (d.testing_p && TARGET_AVX2)
19842 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19843 return true;
19844 break;
19845 case E_V32QImode:
19846 if (!TARGET_SSE2)
19847 return false;
19848 if (d.testing_p && TARGET_AVX2)
19849 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19850 return true;
19851 break;
19852 case E_V8HImode:
19853 case E_V16QImode:
19854 if (!TARGET_SSE2)
19855 return false;
19856 /* Fall through. */
19857 case E_V4SImode:
19858 case E_V4SFmode:
19859 if (!TARGET_SSE)
19860 return false;
19861 /* All implementable with a single vpperm insn. */
19862 if (d.testing_p && TARGET_XOP)
19863 return true;
19864 /* All implementable with 2 pshufb + 1 ior. */
19865 if (d.testing_p && TARGET_SSSE3)
19866 return true;
19867 break;
240198fe 19868 case E_V2SFmode:
9b8579a6
UB
19869 case E_V2SImode:
19870 case E_V4HImode:
19871 if (!TARGET_MMX_WITH_SSE)
19872 return false;
19873 break;
2bf6d935
ML
19874 case E_V2DImode:
19875 case E_V2DFmode:
19876 if (!TARGET_SSE)
19877 return false;
19878 /* All implementable with shufpd or unpck[lh]pd. */
19879 if (d.testing_p)
19880 return true;
19881 break;
19882 default:
19883 return false;
19884 }
19885
19886 for (i = which = 0; i < nelt; ++i)
19887 {
19888 unsigned char e = sel[i];
19889 gcc_assert (e < 2 * nelt);
19890 d.perm[i] = e;
19891 perm[i] = e;
19892 which |= (e < nelt ? 1 : 2);
19893 }
19894
19895 if (d.testing_p)
19896 {
19897      /* If all elements come from the second vector, fold them onto the first.  */
19898 if (which == 2)
19899 for (i = 0; i < nelt; ++i)
19900 d.perm[i] -= nelt;
19901
19902 /* Check whether the mask can be applied to the vector type. */
19903 d.one_operand_p = (which != 3);
19904
19905 /* Implementable with shufps or pshufd. */
9b8579a6 19906 if (d.one_operand_p
240198fe 19907 && (d.vmode == V4SFmode || d.vmode == V2SFmode
9b8579a6 19908 || d.vmode == V4SImode || d.vmode == V2SImode))
2bf6d935
ML
19909 return true;
19910
19911 /* Otherwise we have to go through the motions and see if we can
19912 figure out how to generate the requested permutation. */
19913 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19914 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19915 if (!d.one_operand_p)
19916 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19917
19918 start_sequence ();
19919 bool ret = ix86_expand_vec_perm_const_1 (&d);
19920 end_sequence ();
19921
19922 return ret;
19923 }
19924
19925 two_args = canonicalize_perm (&d);
19926
19927 if (ix86_expand_vec_perm_const_1 (&d))
19928 return true;
19929
19930 /* If the selector says both arguments are needed, but the operands are the
19931 same, the above tried to expand with one_operand_p and flattened selector.
19932 If that didn't work, retry without one_operand_p; we succeeded with that
19933 during testing. */
19934 if (two_args && d.one_operand_p)
19935 {
19936 d.one_operand_p = false;
19937 memcpy (d.perm, perm, sizeof (perm));
19938 return ix86_expand_vec_perm_const_1 (&d);
19939 }
19940
19941 return false;
19942}
19943
19944void
19945ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19946{
19947 struct expand_vec_perm_d d;
19948 unsigned i, nelt;
19949
19950 d.target = targ;
19951 d.op0 = op0;
19952 d.op1 = op1;
19953 d.vmode = GET_MODE (targ);
19954 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19955 d.one_operand_p = false;
19956 d.testing_p = false;
19957
19958 for (i = 0; i < nelt; ++i)
19959 d.perm[i] = i * 2 + odd;
19960
19961 /* We'll either be able to implement the permutation directly... */
19962 if (expand_vec_perm_1 (&d))
19963 return;
19964
19965 /* ... or we use the special-case patterns. */
19966 expand_vec_perm_even_odd_1 (&d, odd);
19967}
19968
19969static void
19970ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19971{
19972 struct expand_vec_perm_d d;
19973 unsigned i, nelt, base;
19974 bool ok;
19975
19976 d.target = targ;
19977 d.op0 = op0;
19978 d.op1 = op1;
19979 d.vmode = GET_MODE (targ);
19980 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19981 d.one_operand_p = false;
19982 d.testing_p = false;
19983
19984 base = high_p ? nelt / 2 : 0;
19985 for (i = 0; i < nelt / 2; ++i)
19986 {
19987 d.perm[i * 2] = i + base;
19988 d.perm[i * 2 + 1] = i + base + nelt;
19989 }
19990
19991 /* Note that for AVX this isn't one instruction. */
19992 ok = ix86_expand_vec_perm_const_1 (&d);
19993 gcc_assert (ok);
19994}
19995
54cdb2f5 19996/* Optimize vector MUL generation for V8QI, V16QI and V32QI
19997   under TARGET_AVX512BW; i.e. for v16qi a * b it generates
19998
19999 vpmovzxbw ymm2, xmm0
20000 vpmovzxbw ymm3, xmm1
20001 vpmullw ymm4, ymm2, ymm3
20002 vpmovwb xmm0, ymm4
20003
20004   which takes fewer instructions than ix86_expand_vecop_qihi.
20005   Return true on success.  */
20006
20007bool
20008ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
20009{
20010 machine_mode himode, qimode = GET_MODE (dest);
20011 rtx hop1, hop2, hdest;
20012 rtx (*gen_extend)(rtx, rtx);
20013 rtx (*gen_truncate)(rtx, rtx);
20014
20015 /* There's no V64HImode multiplication instruction. */
20016 if (qimode == E_V64QImode)
20017 return false;
20018
20019  /* vpmovwb is only available under AVX512BW.  */
20020 if (!TARGET_AVX512BW)
20021 return false;
20022 if ((qimode == V8QImode || qimode == V16QImode)
20023 && !TARGET_AVX512VL)
20024 return false;
20025  /* Do not generate zmm instructions when a 128/256 bit vector width is preferred.  */
20026 if (qimode == V32QImode
20027 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
20028 return false;
20029
20030 switch (qimode)
20031 {
20032 case E_V8QImode:
20033 himode = V8HImode;
20034 gen_extend = gen_zero_extendv8qiv8hi2;
20035 gen_truncate = gen_truncv8hiv8qi2;
20036 break;
20037 case E_V16QImode:
20038 himode = V16HImode;
20039 gen_extend = gen_zero_extendv16qiv16hi2;
20040 gen_truncate = gen_truncv16hiv16qi2;
20041 break;
20042 case E_V32QImode:
20043 himode = V32HImode;
20044 gen_extend = gen_zero_extendv32qiv32hi2;
20045 gen_truncate = gen_truncv32hiv32qi2;
20046 break;
20047 default:
20048 gcc_unreachable ();
20049 }
20050
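  /* Zero-extend both operands to HImode, multiply there, and truncate each
     product back to its low byte; since only the low 8 bits of each product
     are kept, this is equivalent to the QImode multiplication.  */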
20051 hop1 = gen_reg_rtx (himode);
20052 hop2 = gen_reg_rtx (himode);
20053 hdest = gen_reg_rtx (himode);
20054 emit_insn (gen_extend (hop1, op1));
20055 emit_insn (gen_extend (hop2, op2));
20056 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
20057 hop1, hop2)));
20058 emit_insn (gen_truncate (dest, hdest));
20059 return true;
20060}
2bf6d935 20061
c7199fb6 20062/* Expand a vector shift by a constant for a V*QImode in terms of the
20063   same operation on V*HImode.  Return true on success.  */
20064bool
20065ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20066{
20067 machine_mode qimode, himode;
c44c2a3b 20068 HOST_WIDE_INT and_constant, xor_constant;
c7199fb6 20069 HOST_WIDE_INT shift_amount;
20070 rtx vec_const_and, vec_const_xor;
20071 rtx tmp, op1_subreg;
20072 rtx (*gen_shift) (rtx, rtx, rtx);
20073 rtx (*gen_and) (rtx, rtx, rtx);
20074 rtx (*gen_xor) (rtx, rtx, rtx);
20075 rtx (*gen_sub) (rtx, rtx, rtx);
20076
20077 /* Only optimize shift by constant. */
20078 if (!CONST_INT_P (op2))
20079 return false;
20080
20081 qimode = GET_MODE (dest);
20082 shift_amount = INTVAL (op2);
20083  /* Do nothing when the shift amount is greater than or equal to 8.  */
20084 if (shift_amount > 7)
20085 return false;
20086
20087 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
20088  /* Record the position of the sign bit after the shift (for ASHIFTRT).  */
20089 xor_constant = 1 << (8 - shift_amount - 1);
20090
20091  /* Mask to zero out the bits shifted in from the adjacent byte within each word.  */
20092 and_constant
20093 = (code == ASHIFT ? 256 - (1 << shift_amount)
20094 : (1 << (8 - shift_amount)) - 1);
20095
20096 switch (qimode)
20097 {
20098 case V16QImode:
20099 himode = V8HImode;
20100 gen_shift =
20101 ((code == ASHIFT)
20102 ? gen_ashlv8hi3
20103 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
20104 gen_and = gen_andv16qi3;
20105 gen_xor = gen_xorv16qi3;
20106 gen_sub = gen_subv16qi3;
20107 break;
20108 case V32QImode:
20109 himode = V16HImode;
20110 gen_shift =
20111 ((code == ASHIFT)
20112 ? gen_ashlv16hi3
20113 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
20114 gen_and = gen_andv32qi3;
20115 gen_xor = gen_xorv32qi3;
20116 gen_sub = gen_subv32qi3;
20117 break;
20118 case V64QImode:
20119 himode = V32HImode;
20120 gen_shift =
20121 ((code == ASHIFT)
20122 ? gen_ashlv32hi3
20123 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
20124 gen_and = gen_andv64qi3;
20125 gen_xor = gen_xorv64qi3;
20126 gen_sub = gen_subv64qi3;
20127 break;
20128 default:
20129 gcc_unreachable ();
20130 }
20131
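  /* The shift is performed on HImode words; the AND mask then discards the
     bits that crossed a byte boundary within each word.  For ASHIFTRT the
     masked result is sign-extended as (x ^ C) - C, where C = xor_constant
     marks the position of the shifted-down sign bit.  */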
20132 tmp = gen_reg_rtx (himode);
20133 vec_const_and = gen_reg_rtx (qimode);
20134 op1_subreg = lowpart_subreg (himode, op1, qimode);
20135
20136  /* For ASHIFT and LSHIFTRT the operation is performed as
20137       vpsllw/vpsrlw $shift_amount, %op1, %dest
20138       vpand %vec_const_and, %dest.  */
20139 emit_insn (gen_shift (tmp, op1_subreg, op2));
20140 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
20141 emit_move_insn (vec_const_and,
20142 ix86_build_const_vector (qimode, true,
c44c2a3b 20143 gen_int_mode (and_constant, QImode)));
c7199fb6 20144 emit_insn (gen_and (dest, dest, vec_const_and));
20145
20146  /* For ASHIFTRT, additionally sign-extend each byte via
20147       vpxor %vec_const_xor, %dest, %dest
20148       vpsubb %vec_const_xor, %dest, %dest.  */
20149 if (code == ASHIFTRT)
20150 {
20151 vec_const_xor = gen_reg_rtx (qimode);
20152 emit_move_insn (vec_const_xor,
20153 ix86_build_const_vector (qimode, true,
c44c2a3b 20154 gen_int_mode (xor_constant, QImode)));
c7199fb6 20155 emit_insn (gen_xor (dest, dest, vec_const_xor));
20156 emit_insn (gen_sub (dest, dest, vec_const_xor));
20157 }
20158 return true;
20159}
20160
2bf6d935
ML
20161/* Expand a vector operation CODE for a V*QImode in terms of the
20162 same operation on V*HImode. */
20163
20164void
20165ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20166{
20167 machine_mode qimode = GET_MODE (dest);
20168 machine_mode himode;
20169 rtx (*gen_il) (rtx, rtx, rtx);
20170 rtx (*gen_ih) (rtx, rtx, rtx);
20171 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
20172 struct expand_vec_perm_d d;
20173 bool ok, full_interleave;
20174 bool uns_p = false;
20175 int i;
20176
20177 switch (qimode)
20178 {
20179 case E_V16QImode:
20180 himode = V8HImode;
20181 gen_il = gen_vec_interleave_lowv16qi;
20182 gen_ih = gen_vec_interleave_highv16qi;
20183 break;
20184 case E_V32QImode:
20185 himode = V16HImode;
20186 gen_il = gen_avx2_interleave_lowv32qi;
20187 gen_ih = gen_avx2_interleave_highv32qi;
20188 break;
20189 case E_V64QImode:
20190 himode = V32HImode;
20191 gen_il = gen_avx512bw_interleave_lowv64qi;
20192 gen_ih = gen_avx512bw_interleave_highv64qi;
20193 break;
20194 default:
20195 gcc_unreachable ();
20196 }
20197
20198 op2_l = op2_h = op2;
20199 switch (code)
20200 {
20201 case MULT:
20202 /* Unpack data such that we've got a source byte in each low byte of
20203 each word. We don't care what goes into the high byte of each word.
20204	 Rather than trying to get zero in there, it is most convenient to let
20205 it be a copy of the low byte. */
20206 op2_l = gen_reg_rtx (qimode);
20207 op2_h = gen_reg_rtx (qimode);
20208 emit_insn (gen_il (op2_l, op2, op2));
20209 emit_insn (gen_ih (op2_h, op2, op2));
20210
20211 op1_l = gen_reg_rtx (qimode);
20212 op1_h = gen_reg_rtx (qimode);
20213 emit_insn (gen_il (op1_l, op1, op1));
20214 emit_insn (gen_ih (op1_h, op1, op1));
20215 full_interleave = qimode == V16QImode;
20216 break;
20217
20218 case ASHIFT:
20219 case LSHIFTRT:
20220 uns_p = true;
20221 /* FALLTHRU */
20222 case ASHIFTRT:
20223 op1_l = gen_reg_rtx (himode);
20224 op1_h = gen_reg_rtx (himode);
20225 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
20226 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
20227 full_interleave = true;
20228 break;
20229 default:
20230 gcc_unreachable ();
20231 }
20232
20233 /* Perform the operation. */
20234 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
20235 1, OPTAB_DIRECT);
20236 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
20237 1, OPTAB_DIRECT);
20238 gcc_assert (res_l && res_h);
20239
20240 /* Merge the data back into the right place. */
20241 d.target = dest;
20242 d.op0 = gen_lowpart (qimode, res_l);
20243 d.op1 = gen_lowpart (qimode, res_h);
20244 d.vmode = qimode;
20245 d.nelt = GET_MODE_NUNITS (qimode);
20246 d.one_operand_p = false;
20247 d.testing_p = false;
20248
20249 if (full_interleave)
20250 {
20251      /* For SSE2, we used a full interleave, so the desired
20252 results are in the even elements. */
20253 for (i = 0; i < d.nelt; ++i)
20254 d.perm[i] = i * 2;
20255 }
20256 else
20257 {
20258 /* For AVX, the interleave used above was not cross-lane. So the
20259 extraction is evens but with the second and third quarter swapped.
20260 Happily, that is even one insn shorter than even extraction.
20261 For AVX512BW we have 4 lanes. We extract evens from within a lane,
20262 always first from the first and then from the second source operand,
20263	 the index bits above the low 4 bits remain the same.
20264 Thus, for d.nelt == 32 we want permutation
20265 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20266 and for d.nelt == 64 we want permutation
20267 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20268 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
20269 for (i = 0; i < d.nelt; ++i)
20270 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
20271 }
20272
20273 ok = ix86_expand_vec_perm_const_1 (&d);
20274 gcc_assert (ok);
20275
20276 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20277 gen_rtx_fmt_ee (code, qimode, op1, op2));
20278}
20279
20280/* Helper function of ix86_expand_mul_widen_evenodd. Return true
20281 if op is CONST_VECTOR with all odd elements equal to their
20282 preceding element. */
20283
20284static bool
20285const_vector_equal_evenodd_p (rtx op)
20286{
20287 machine_mode mode = GET_MODE (op);
20288 int i, nunits = GET_MODE_NUNITS (mode);
20289 if (GET_CODE (op) != CONST_VECTOR
20290 || nunits != CONST_VECTOR_NUNITS (op))
20291 return false;
20292 for (i = 0; i < nunits; i += 2)
20293 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
20294 return false;
20295 return true;
20296}
20297
20298void
20299ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
20300 bool uns_p, bool odd_p)
20301{
20302 machine_mode mode = GET_MODE (op1);
20303 machine_mode wmode = GET_MODE (dest);
20304 rtx x;
20305 rtx orig_op1 = op1, orig_op2 = op2;
20306
20307 if (!nonimmediate_operand (op1, mode))
20308 op1 = force_reg (mode, op1);
20309 if (!nonimmediate_operand (op2, mode))
20310 op2 = force_reg (mode, op2);
20311
20312 /* We only play even/odd games with vectors of SImode. */
20313 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
20314
20315 /* If we're looking for the odd results, shift those members down to
20316 the even slots. For some cpus this is faster than a PSHUFD. */
20317 if (odd_p)
20318 {
20319 /* For XOP use vpmacsdqh, but only for smult, as it is only
20320 signed. */
20321 if (TARGET_XOP && mode == V4SImode && !uns_p)
20322 {
20323 x = force_reg (wmode, CONST0_RTX (wmode));
20324 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
20325 return;
20326 }
20327
20328 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
20329 if (!const_vector_equal_evenodd_p (orig_op1))
20330 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
20331 x, NULL, 1, OPTAB_DIRECT);
20332 if (!const_vector_equal_evenodd_p (orig_op2))
20333 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
20334 x, NULL, 1, OPTAB_DIRECT);
20335 op1 = gen_lowpart (mode, op1);
20336 op2 = gen_lowpart (mode, op2);
20337 }
20338
20339 if (mode == V16SImode)
20340 {
20341 if (uns_p)
20342 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
20343 else
20344 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
20345 }
20346 else if (mode == V8SImode)
20347 {
20348 if (uns_p)
20349 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
20350 else
20351 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
20352 }
20353 else if (uns_p)
20354 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
20355 else if (TARGET_SSE4_1)
20356 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
20357 else
20358 {
20359 rtx s1, s2, t0, t1, t2;
20360
20361 /* The easiest way to implement this without PMULDQ is to go through
20362 the motions as if we are performing a full 64-bit multiply. With
20363 the exception that we need to do less shuffling of the elements. */
20364
20365 /* Compute the sign-extension, aka highparts, of the two operands. */
20366 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20367 op1, pc_rtx, pc_rtx);
20368 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20369 op2, pc_rtx, pc_rtx);
20370
20371 /* Multiply LO(A) * HI(B), and vice-versa. */
20372 t1 = gen_reg_rtx (wmode);
20373 t2 = gen_reg_rtx (wmode);
20374 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
20375 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
20376
20377 /* Multiply LO(A) * LO(B). */
20378 t0 = gen_reg_rtx (wmode);
20379 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
20380
20381 /* Combine and shift the highparts into place. */
20382 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
20383 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
20384 1, OPTAB_DIRECT);
20385
20386 /* Combine high and low parts. */
20387 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
20388 return;
20389 }
20390 emit_insn (x);
20391}
20392
20393void
20394ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
20395 bool uns_p, bool high_p)
20396{
20397 machine_mode wmode = GET_MODE (dest);
20398 machine_mode mode = GET_MODE (op1);
20399 rtx t1, t2, t3, t4, mask;
20400
20401 switch (mode)
20402 {
20403 case E_V4SImode:
20404 t1 = gen_reg_rtx (mode);
20405 t2 = gen_reg_rtx (mode);
20406 if (TARGET_XOP && !uns_p)
20407 {
20408 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
20409 shuffle the elements once so that all elements are in the right
20410 place for immediate use: { A C B D }. */
20411 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
20412 const1_rtx, GEN_INT (3)));
20413 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
20414 const1_rtx, GEN_INT (3)));
20415 }
20416 else
20417 {
20418 /* Put the elements into place for the multiply. */
20419 ix86_expand_vec_interleave (t1, op1, op1, high_p);
20420 ix86_expand_vec_interleave (t2, op2, op2, high_p);
20421 high_p = false;
20422 }
20423 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
20424 break;
20425
20426 case E_V8SImode:
20427 /* Shuffle the elements between the lanes. After this we
20428 have { A B E F | C D G H } for each operand. */
20429 t1 = gen_reg_rtx (V4DImode);
20430 t2 = gen_reg_rtx (V4DImode);
20431 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
20432 const0_rtx, const2_rtx,
20433 const1_rtx, GEN_INT (3)));
20434 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
20435 const0_rtx, const2_rtx,
20436 const1_rtx, GEN_INT (3)));
20437
20438 /* Shuffle the elements within the lanes. After this we
20439 have { A A B B | C C D D } or { E E F F | G G H H }. */
20440 t3 = gen_reg_rtx (V8SImode);
20441 t4 = gen_reg_rtx (V8SImode);
20442 mask = GEN_INT (high_p
20443 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
20444 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
20445 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
20446 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
20447
20448 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
20449 break;
20450
20451 case E_V8HImode:
20452 case E_V16HImode:
20453 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
20454 uns_p, OPTAB_DIRECT);
20455 t2 = expand_binop (mode,
20456 uns_p ? umul_highpart_optab : smul_highpart_optab,
20457 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
20458 gcc_assert (t1 && t2);
20459
20460 t3 = gen_reg_rtx (mode);
20461 ix86_expand_vec_interleave (t3, t1, t2, high_p);
20462 emit_move_insn (dest, gen_lowpart (wmode, t3));
20463 break;
20464
20465 case E_V16QImode:
20466 case E_V32QImode:
20467 case E_V32HImode:
20468 case E_V16SImode:
20469 case E_V64QImode:
20470 t1 = gen_reg_rtx (wmode);
20471 t2 = gen_reg_rtx (wmode);
20472 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
20473 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
20474
20475 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
20476 break;
20477
20478 default:
20479 gcc_unreachable ();
20480 }
20481}
20482
20483void
20484ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
20485{
20486 rtx res_1, res_2, res_3, res_4;
20487
20488 res_1 = gen_reg_rtx (V4SImode);
20489 res_2 = gen_reg_rtx (V4SImode);
20490 res_3 = gen_reg_rtx (V2DImode);
20491 res_4 = gen_reg_rtx (V2DImode);
20492 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
20493 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
20494
20495 /* Move the results in element 2 down to element 1; we don't care
20496 what goes in elements 2 and 3. Then we can merge the parts
20497 back together with an interleave.
20498
20499 Note that two other sequences were tried:
20500 (1) Use interleaves at the start instead of psrldq, which allows
20501 us to use a single shufps to merge things back at the end.
20502 (2) Use shufps here to combine the two vectors, then pshufd to
20503 put the elements in the correct order.
20504 In both cases the cost of the reformatting stall was too high
20505 and the overall sequence slower. */
20506
20507 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
20508 const0_rtx, const2_rtx,
20509 const0_rtx, const0_rtx));
20510 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
20511 const0_rtx, const2_rtx,
20512 const0_rtx, const0_rtx));
20513 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
20514
20515 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
20516}
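
/* Editorial sketch, not part of GCC: a scalar model of the V4SImode
   multiply built above from even/odd widening multiplies.  Only the low
   32 bits of each 64-bit product survive the pshufd/interleave step,
   which is exactly the wrap-around 32-bit element product.  Names are
   illustrative only.  */

static void
sketch_mulv4si_via_evenodd (unsigned int dest[4], const unsigned int a[4],
                            const unsigned int b[4])
{
  /* pmuludq-style widening multiplies of the even and odd elements.  */
  unsigned long long even0 = (unsigned long long) a[0] * b[0];
  unsigned long long even1 = (unsigned long long) a[2] * b[2];
  unsigned long long odd0 = (unsigned long long) a[1] * b[1];
  unsigned long long odd1 = (unsigned long long) a[3] * b[3];

  /* Gather the low halves back into element order.  */
  dest[0] = (unsigned int) even0;
  dest[1] = (unsigned int) odd0;
  dest[2] = (unsigned int) even1;
  dest[3] = (unsigned int) odd1;
}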
20517
20518void
20519ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
20520{
20521 machine_mode mode = GET_MODE (op0);
20522 rtx t1, t2, t3, t4, t5, t6;
20523
20524 if (TARGET_AVX512DQ && mode == V8DImode)
20525 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
20526 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
20527 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20528 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20529 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20530 else if (TARGET_XOP && mode == V2DImode)
20531 {
20532 /* op1: A,B,C,D, op2: E,F,G,H */
20533 op1 = gen_lowpart (V4SImode, op1);
20534 op2 = gen_lowpart (V4SImode, op2);
20535
20536 t1 = gen_reg_rtx (V4SImode);
20537 t2 = gen_reg_rtx (V4SImode);
20538 t3 = gen_reg_rtx (V2DImode);
20539 t4 = gen_reg_rtx (V2DImode);
20540
20541 /* t1: B,A,D,C */
20542 emit_insn (gen_sse2_pshufd_1 (t1, op1,
20543 GEN_INT (1),
20544 GEN_INT (0),
20545 GEN_INT (3),
20546 GEN_INT (2)));
20547
20548 /* t2: (B*E),(A*F),(D*G),(C*H) */
20549 emit_insn (gen_mulv4si3 (t2, t1, op2));
20550
20551 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20552 emit_insn (gen_xop_phadddq (t3, t2));
20553
20554 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20555 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20556
20557	      /* Multiply the low parts and add everything together.  */
20558 t5 = gen_reg_rtx (V2DImode);
20559 emit_insn (gen_vec_widen_umult_even_v4si (t5,
20560 gen_lowpart (V4SImode, op1),
20561 gen_lowpart (V4SImode, op2)));
8ba6ea87 20562 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20563 }
20564 else
20565 {
20566 machine_mode nmode;
20567 rtx (*umul) (rtx, rtx, rtx);
20568
20569 if (mode == V2DImode)
20570 {
20571 umul = gen_vec_widen_umult_even_v4si;
20572 nmode = V4SImode;
20573 }
20574 else if (mode == V4DImode)
20575 {
20576 umul = gen_vec_widen_umult_even_v8si;
20577 nmode = V8SImode;
20578 }
20579 else if (mode == V8DImode)
20580 {
20581 umul = gen_vec_widen_umult_even_v16si;
20582 nmode = V16SImode;
20583 }
20584 else
20585 gcc_unreachable ();
20586
20587
20588 /* Multiply low parts. */
20589 t1 = gen_reg_rtx (mode);
20590 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20591
20592 /* Shift input vectors right 32 bits so we can multiply high parts. */
20593 t6 = GEN_INT (32);
20594 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20595 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20596
20597 /* Multiply high parts by low parts. */
20598 t4 = gen_reg_rtx (mode);
20599 t5 = gen_reg_rtx (mode);
20600 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20601 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20602
20603	      /* Combine the high parts and shift them back into position.  */
20604 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20605 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20606
20607 /* Combine high and low parts. */
20608 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20609 }
20610
20611 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20612 gen_rtx_MULT (mode, op1, op2));
20613}
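
/* Editorial sketch, not part of GCC: the per-element identity behind the
   generic (non-XOP, non-AVX512DQ) path above.  A 64x64->64 multiply is
   rebuilt from 32-bit pieces; the cross products only affect the upper
   32 bits of the result.  Names are illustrative only.  */

static unsigned long long
sketch_mul64_from_32 (unsigned long long a, unsigned long long b)
{
  unsigned long long a_lo = a & 0xffffffffULL, a_hi = a >> 32;
  unsigned long long b_lo = b & 0xffffffffULL, b_hi = b >> 32;

  unsigned long long lo = a_lo * b_lo;                   /* t1 above */
  unsigned long long cross = a_hi * b_lo + a_lo * b_hi;  /* t4 + t5 above */

  return lo + (cross << 32);                             /* final add */
}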
20614
20615/* Return true if control transfer instruction INSN
20616   should be encoded with the notrack prefix.  */
20617
20618bool
e8b0314a 20619ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20620{
20621 if (!insn || !((flag_cf_protection & CF_BRANCH)))
20622 return false;
20623
20624 if (CALL_P (insn))
20625 {
20626 rtx call = get_call_rtx_from (insn);
20627 gcc_assert (call != NULL_RTX);
20628 rtx addr = XEXP (call, 0);
20629
20630 /* Do not emit 'notrack' if it's not an indirect call. */
20631 if (MEM_P (addr)
20632 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20633 return false;
20634 else
20635 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20636 }
20637
20638 if (JUMP_P (insn) && !flag_cet_switch)
20639 {
20640 rtx target = JUMP_LABEL (insn);
20641 if (target == NULL_RTX || ANY_RETURN_P (target))
20642 return false;
20643
20644      /* Check whether the jump dispatches through a switch jump table.  */
20645 rtx_insn *label = as_a<rtx_insn *> (target);
20646 rtx_insn *table = next_insn (label);
20647 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20648 return false;
20649 else
20650 return true;
20651 }
20652 return false;
20653}
20654
20655/* Calculate integer abs() using only SSE2 instructions. */
20656
20657void
20658ix86_expand_sse2_abs (rtx target, rtx input)
20659{
20660 machine_mode mode = GET_MODE (target);
20661 rtx tmp0, tmp1, x;
20662
20663 switch (mode)
20664 {
20665 case E_V2DImode:
20666 case E_V4DImode:
20667      /* For 64-bit signed integer X, with SSE4.2 use
20668	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20669	 Otherwise handle it as for V4SImode, except use 64 as W instead of
20670	 32, and because a 64-bit arithmetic right shift is not implemented,
20671	 form the sign mask with a logical right shift followed by a negation.  */
20672 if (TARGET_SSE4_2)
20673 {
20674 tmp0 = gen_reg_rtx (mode);
20675 tmp1 = gen_reg_rtx (mode);
20676 emit_move_insn (tmp1, CONST0_RTX (mode));
20677 if (mode == E_V2DImode)
20678 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20679 else
20680 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20681 }
20682 else
20683 {
20684 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20685 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20686 - 1), NULL, 0, OPTAB_DIRECT);
20687 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20688 }
20689
20690 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20691 NULL, 0, OPTAB_DIRECT);
20692 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20693 target, 0, OPTAB_DIRECT);
20694 break;
20695
20696 case E_V4SImode:
20697 /* For 32-bit signed integer X, the best way to calculate the absolute
20698 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
20699 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20700 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20701 NULL, 0, OPTAB_DIRECT);
20702 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20703 NULL, 0, OPTAB_DIRECT);
20704 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20705 target, 0, OPTAB_DIRECT);
20706 break;
20707
20708 case E_V8HImode:
20709 /* For 16-bit signed integer X, the best way to calculate the absolute
20710 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20711 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20712
20713 x = expand_simple_binop (mode, SMAX, tmp0, input,
20714 target, 0, OPTAB_DIRECT);
20715 break;
20716
20717 case E_V16QImode:
20718 /* For 8-bit signed integer X, the best way to calculate the absolute
20719 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20720 as SSE2 provides the PMINUB insn. */
20721 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20722
20723 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20724 target, 0, OPTAB_DIRECT);
20725 break;
20726
20727 default:
20728 gcc_unreachable ();
20729 }
20730
20731 if (x != target)
20732 emit_move_insn (target, x);
20733}
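
/* Editorial sketch, not part of GCC: the scalar identities behind the
   expansions above.  The 32-bit form assumes an arithmetic right shift
   of negative values, matching the target semantics.  Names are
   illustrative only.  */

static int
sketch_abs32 (int x)
{
  int mask = x >> 31;           /* 0 for non-negative, -1 for negative */
  return (x ^ mask) - mask;
}

static short
sketch_abs16 (short x)
{
  short neg = (short) -x;
  return x > neg ? x : neg;     /* max (X, -X), as PMAXSW does per lane */
}

static signed char
sketch_abs8 (signed char x)
{
  unsigned char ux = (unsigned char) x;
  unsigned char un = (unsigned char) -x;
  return (signed char) (ux < un ? ux : un); /* min on the unsigned bytes */
}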
20734
20735/* Expand an extract from a vector register through pextr insn.
20736 Return true if successful. */
20737
20738bool
20739ix86_expand_pextr (rtx *operands)
20740{
20741 rtx dst = operands[0];
20742 rtx src = operands[1];
20743
20744 unsigned int size = INTVAL (operands[2]);
20745 unsigned int pos = INTVAL (operands[3]);
20746
20747 if (SUBREG_P (dst))
20748 {
20749 /* Reject non-lowpart subregs. */
20750 if (SUBREG_BYTE (dst) > 0)
20751 return false;
20752 dst = SUBREG_REG (dst);
20753 }
20754
20755 if (SUBREG_P (src))
20756 {
20757 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20758 src = SUBREG_REG (src);
20759 }
20760
20761 switch (GET_MODE (src))
20762 {
20763 case E_V16QImode:
20764 case E_V8HImode:
20765 case E_V4SImode:
20766 case E_V2DImode:
20767 case E_V1TImode:
20768 {
20769 machine_mode srcmode, dstmode;
20770 rtx d, pat;
20771
20772 if (!int_mode_for_size (size, 0).exists (&dstmode))
20773 return false;
20774
20775 switch (dstmode)
20776 {
20777 case E_QImode:
20778 if (!TARGET_SSE4_1)
20779 return false;
20780 srcmode = V16QImode;
20781 break;
20782
20783 case E_HImode:
20784 if (!TARGET_SSE2)
20785 return false;
20786 srcmode = V8HImode;
20787 break;
20788
20789 case E_SImode:
20790 if (!TARGET_SSE4_1)
20791 return false;
20792 srcmode = V4SImode;
20793 break;
20794
20795 case E_DImode:
20796 gcc_assert (TARGET_64BIT);
20797 if (!TARGET_SSE4_1)
20798 return false;
20799 srcmode = V2DImode;
20800 break;
20801
20802 default:
20803 return false;
20804 }
20805
20806 /* Reject extractions from misaligned positions. */
20807 if (pos & (size-1))
20808 return false;
20809
20810 if (GET_MODE (dst) == dstmode)
20811 d = dst;
20812 else
20813 d = gen_reg_rtx (dstmode);
20814
20815 /* Construct insn pattern. */
20816 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20817 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20818
20819 /* Let the rtl optimizers know about the zero extension performed. */
20820 if (dstmode == QImode || dstmode == HImode)
20821 {
20822 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20823 d = gen_lowpart (SImode, d);
20824 }
20825
20826 emit_insn (gen_rtx_SET (d, pat));
20827
20828 if (d != dst)
20829 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20830 return true;
20831 }
20832
20833 default:
20834 return false;
20835 }
20836}
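
/* Editorial sketch, not part of GCC: a scalar model of the extraction
   above, viewing the 128-bit source as little-endian SIZE-bit lanes and
   reading lane POS / SIZE.  Misaligned positions are rejected, matching
   the check in the expander.  Names are illustrative only.  */

static unsigned long long
sketch_pextr (const unsigned char src[16], unsigned int size_bits,
              unsigned int pos_bits)
{
  if (pos_bits & (size_bits - 1))
    return 0;                   /* the expander returns false here */

  unsigned int bytes = size_bits / 8;
  unsigned int lane = pos_bits / size_bits;
  unsigned long long val = 0;

  /* Little-endian lane load; PEXTR-style extracts zero-extend the lane
     into the scalar destination.  */
  for (unsigned int i = 0; i < bytes; i++)
    val |= (unsigned long long) src[lane * bytes + i] << (8 * i);
  return val;
}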
20837
20838/* Expand an insert into a vector register through pinsr insn.
20839 Return true if successful. */
20840
20841bool
20842ix86_expand_pinsr (rtx *operands)
20843{
20844 rtx dst = operands[0];
20845 rtx src = operands[3];
20846
20847 unsigned int size = INTVAL (operands[1]);
20848 unsigned int pos = INTVAL (operands[2]);
20849
20850 if (SUBREG_P (dst))
20851 {
20852 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20853 dst = SUBREG_REG (dst);
20854 }
20855
20856 switch (GET_MODE (dst))
20857 {
20858 case E_V16QImode:
20859 case E_V8HImode:
20860 case E_V4SImode:
20861 case E_V2DImode:
20862 case E_V1TImode:
20863 {
20864 machine_mode srcmode, dstmode;
20865 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20866 rtx d;
20867
20868 if (!int_mode_for_size (size, 0).exists (&srcmode))
20869 return false;
20870
20871 switch (srcmode)
20872 {
20873 case E_QImode:
20874 if (!TARGET_SSE4_1)
20875 return false;
20876 dstmode = V16QImode;
20877 pinsr = gen_sse4_1_pinsrb;
20878 break;
20879
20880 case E_HImode:
20881 if (!TARGET_SSE2)
20882 return false;
20883 dstmode = V8HImode;
20884 pinsr = gen_sse2_pinsrw;
20885 break;
20886
20887 case E_SImode:
20888 if (!TARGET_SSE4_1)
20889 return false;
20890 dstmode = V4SImode;
20891 pinsr = gen_sse4_1_pinsrd;
20892 break;
20893
20894 case E_DImode:
20895 gcc_assert (TARGET_64BIT);
20896 if (!TARGET_SSE4_1)
20897 return false;
20898 dstmode = V2DImode;
20899 pinsr = gen_sse4_1_pinsrq;
20900 break;
20901
20902 default:
20903 return false;
20904 }
20905
20906	/* Reject insertions at misaligned positions.  */
20907 if (pos & (size-1))
20908 return false;
20909
20910 if (SUBREG_P (src))
20911 {
20912 unsigned int srcpos = SUBREG_BYTE (src);
20913
20914 if (srcpos > 0)
20915 {
20916 rtx extr_ops[4];
20917
20918 extr_ops[0] = gen_reg_rtx (srcmode);
20919 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20920 extr_ops[2] = GEN_INT (size);
20921 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20922
20923 if (!ix86_expand_pextr (extr_ops))
20924 return false;
20925
20926 src = extr_ops[0];
20927 }
20928 else
20929 src = gen_lowpart (srcmode, SUBREG_REG (src));
20930 }
20931
20932 if (GET_MODE (dst) == dstmode)
20933 d = dst;
20934 else
20935 d = gen_reg_rtx (dstmode);
20936
20937 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20938 gen_lowpart (srcmode, src),
20939 GEN_INT (1 << (pos / size))));
20940 if (d != dst)
20941 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20942 return true;
20943 }
20944
20945 default:
20946 return false;
20947 }
20948}
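
/* Editorial sketch, not part of GCC: the corresponding scalar model of
   the insertion above; one SIZE-bit lane at POS / SIZE is overwritten
   and the remaining lanes are preserved.  Names are illustrative
   only.  */

static void
sketch_pinsr (unsigned char dst[16], unsigned long long src,
              unsigned int size_bits, unsigned int pos_bits)
{
  if (pos_bits & (size_bits - 1))
    return;                     /* the expander returns false here */

  unsigned int bytes = size_bits / 8;
  unsigned int lane = pos_bits / size_bits;

  for (unsigned int i = 0; i < bytes; i++)
    dst[lane * bytes + i] = (unsigned char) (src >> (8 * i));
}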
20949
20950/* All CPUs prefer to avoid cross-lane operations, so reduce the upper
20951   half against the lower half until we are down to a single SSE register.  */
20952
20953machine_mode
20954ix86_split_reduction (machine_mode mode)
20955{
20956 /* Reduce lowpart against highpart until we reach SSE reg width to
20957 avoid cross-lane operations. */
20958 switch (mode)
20959 {
20960 case E_V8DImode:
20961 case E_V4DImode:
20962 return V2DImode;
20963 case E_V16SImode:
20964 case E_V8SImode:
20965 return V4SImode;
20966 case E_V32HImode:
20967 case E_V16HImode:
20968 return V8HImode;
20969 case E_V64QImode:
20970 case E_V32QImode:
20971 return V16QImode;
20972 case E_V16SFmode:
20973 case E_V8SFmode:
20974 return V4SFmode;
20975 case E_V8DFmode:
20976 case E_V4DFmode:
20977 return V2DFmode;
20978 default:
20979 return mode;
20980 }
20981}
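
/* Editorial sketch, not part of GCC: how a vectorized add reduction of a
   V8SImode value proceeds once the mode returned above (V4SImode) is
   used.  The upper half is folded into the lower half first; the
   remaining steps then stay inside one 128-bit register.  Names are
   illustrative only.  */

static unsigned int
sketch_reduce_add_v8si (const unsigned int v[8])
{
  unsigned int half[4], quarter[2];

  for (int i = 0; i < 4; i++)   /* V8SI -> V4SI: fold upper into lower */
    half[i] = v[i] + v[i + 4];

  for (int i = 0; i < 2; i++)   /* further halving within the SSE register */
    quarter[i] = half[i] + half[i + 2];

  return quarter[0] + quarter[1];
}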
20982
20983/* Generate a call to a divmod libfunc such as __divmoddi4.  */
20984
20985void
20986ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20987 rtx op0, rtx op1,
20988 rtx *quot_p, rtx *rem_p)
20989{
20990 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20991
20992 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20993 mode, op0, mode, op1, mode,
20994 XEXP (rem, 0), Pmode);
20995 *quot_p = quot;
20996 *rem_p = rem;
20997}
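
/* Editorial sketch, not part of GCC: the contract the call above relies
   on, assuming the usual libgcc convention that the divmod libfunc
   (e.g. __divmoddi4) returns the quotient and stores the remainder
   through its pointer argument.  Names are illustrative only.  */

static long long
sketch_divmod (long long a, long long b, long long *rem)
{
  long long quot = a / b;
  *rem = a - quot * b;
  return quot;
}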
20998
20999#include "gt-i386-expand.h"