1/* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2
3This file is part of GCC.
4
5GCC is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3, or (at your option)
8any later version.
9
10GCC is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with GCC; see the file COPYING3. If not see
17<http://www.gnu.org/licenses/>. */
18
19#define IN_TARGET_CODE 1
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "backend.h"
25#include "rtl.h"
26#include "tree.h"
27#include "memmodel.h"
28#include "gimple.h"
29#include "cfghooks.h"
30#include "cfgloop.h"
31#include "df.h"
32#include "tm_p.h"
33#include "stringpool.h"
34#include "expmed.h"
35#include "optabs.h"
36#include "regs.h"
37#include "emit-rtl.h"
38#include "recog.h"
39#include "cgraph.h"
40#include "diagnostic.h"
41#include "cfgbuild.h"
42#include "alias.h"
43#include "fold-const.h"
44#include "attribs.h"
45#include "calls.h"
46#include "stor-layout.h"
47#include "varasm.h"
48#include "output.h"
49#include "insn-attr.h"
50#include "flags.h"
51#include "except.h"
52#include "explow.h"
53#include "expr.h"
54#include "cfgrtl.h"
55#include "common/common-target.h"
56#include "langhooks.h"
57#include "reload.h"
58#include "gimplify.h"
59#include "dwarf2.h"
60#include "tm-constrs.h"
61#include "cselib.h"
62#include "sched-int.h"
63#include "opts.h"
64#include "tree-pass.h"
65#include "context.h"
66#include "pass_manager.h"
67#include "target-globals.h"
68#include "gimple-iterator.h"
69#include "tree-vectorizer.h"
70#include "shrink-wrap.h"
71#include "builtins.h"
72#include "rtl-iter.h"
73#include "tree-iterator.h"
74#include "dbgcnt.h"
75#include "case-cfn-macros.h"
76#include "dojump.h"
77#include "fold-const-call.h"
78#include "tree-vrp.h"
79#include "tree-ssanames.h"
80#include "selftest.h"
81#include "selftest-rtl.h"
82#include "print-rtl.h"
83#include "intl.h"
84#include "ifcvt.h"
85#include "symbol-summary.h"
86#include "ipa-prop.h"
87#include "ipa-fnsummary.h"
88#include "wide-int-bitmask.h"
89#include "tree-vector-builder.h"
90#include "debug.h"
91#include "dwarf2out.h"
92#include "i386-options.h"
93#include "i386-builtins.h"
94#include "i386-expand.h"
95
96/* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
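 /* For example, with MODE == TImode and operands[i] a TImode register,
 lo_half[i] and hi_half[i] become the DImode subregs covering bytes 0-7
 and 8-15 of that register; MEM operands are instead rewritten with
 adjust_address so that volatile memory references survive the split. */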
101
102void
103split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105{
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 default:
120 gcc_unreachable ();
121 }
122
123 byte = GET_MODE_SIZE (half_mode);
124
125 while (num--)
126 {
127 rtx op = operands[num];
128
 129 /* simplify_subreg refuses to split volatile memory addresses,
 130 but we still have to handle them. */
131 if (MEM_P (op))
132 {
133 if (mem_op && rtx_equal_p (op, mem_op))
134 {
135 lo_half[num] = lo_half[mem_num];
136 hi_half[num] = hi_half[mem_num];
137 }
138 else
139 {
140 mem_op = op;
141 mem_num = num;
142 lo_half[num] = adjust_address (op, half_mode, 0);
143 hi_half[num] = adjust_address (op, half_mode, byte);
144 }
145 }
146 else
147 {
148 lo_half[num] = simplify_gen_subreg (half_mode, op,
149 GET_MODE (op) == VOIDmode
150 ? mode : GET_MODE (op), 0);
151 hi_half[num] = simplify_gen_subreg (half_mode, op,
152 GET_MODE (op) == VOIDmode
153 ? mode : GET_MODE (op), byte);
154 }
155 }
156}
157
158/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
159 for the target. */
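 /* For example, clearing %eax emits "xor %eax, %eax" together with a
 FLAGS_REG clobber unless TARGET_USE_MOV0 is set and the insn is not
 optimized for size, in which case "mov $0, %eax" is emitted instead;
 QImode and HImode destinations are first widened to SImode to avoid
 the operand-size prefix. */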
160
161void
162ix86_expand_clear (rtx dest)
163{
164 rtx tmp;
165
166 /* We play register width games, which are only valid after reload. */
167 gcc_assert (reload_completed);
168
169 /* Avoid HImode and its attendant prefix byte. */
170 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
171 dest = gen_rtx_REG (SImode, REGNO (dest));
172 tmp = gen_rtx_SET (dest, const0_rtx);
173
174 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
175 {
176 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
177 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
178 }
179
180 emit_insn (tmp);
181}
182
183void
184ix86_expand_move (machine_mode mode, rtx operands[])
185{
186 rtx op0, op1;
187 rtx tmp, addend = NULL_RTX;
188 enum tls_model model;
189
190 op0 = operands[0];
191 op1 = operands[1];
192
193 switch (GET_CODE (op1))
194 {
195 case CONST:
196 tmp = XEXP (op1, 0);
197
198 if (GET_CODE (tmp) != PLUS
199 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
200 break;
201
202 op1 = XEXP (tmp, 0);
203 addend = XEXP (tmp, 1);
204 /* FALLTHRU */
205
206 case SYMBOL_REF:
207 model = SYMBOL_REF_TLS_MODEL (op1);
208
209 if (model)
210 op1 = legitimize_tls_address (op1, model, true);
211 else if (ix86_force_load_from_GOT_p (op1))
212 {
213 /* Load the external function address via GOT slot to avoid PLT. */
214 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
215 (TARGET_64BIT
216 ? UNSPEC_GOTPCREL
217 : UNSPEC_GOT));
218 op1 = gen_rtx_CONST (Pmode, op1);
219 op1 = gen_const_mem (Pmode, op1);
220 set_mem_alias_set (op1, ix86_GOT_alias_set ());
221 }
222 else
223 {
224 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
225 if (tmp)
226 {
227 op1 = tmp;
228 if (!addend)
229 break;
230 }
231 else
232 {
233 op1 = operands[1];
234 break;
235 }
236 }
237
238 if (addend)
239 {
240 op1 = force_operand (op1, NULL_RTX);
241 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
242 op0, 1, OPTAB_DIRECT);
243 }
244 else
245 op1 = force_operand (op1, op0);
246
247 if (op1 == op0)
248 return;
249
250 op1 = convert_to_mode (mode, op1, 1);
251
252 default:
253 break;
254 }
255
256 if ((flag_pic || MACHOPIC_INDIRECT)
257 && symbolic_operand (op1, mode))
258 {
259 if (TARGET_MACHO && !TARGET_64BIT)
260 {
261#if TARGET_MACHO
262 /* dynamic-no-pic */
263 if (MACHOPIC_INDIRECT)
264 {
265 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
266 ? op0 : gen_reg_rtx (Pmode);
267 op1 = machopic_indirect_data_reference (op1, temp);
268 if (MACHOPIC_PURE)
269 op1 = machopic_legitimize_pic_address (op1, mode,
270 temp == op1 ? 0 : temp);
271 }
272 if (op0 != op1 && GET_CODE (op0) != MEM)
273 {
274 rtx insn = gen_rtx_SET (op0, op1);
275 emit_insn (insn);
276 return;
277 }
278 if (GET_CODE (op0) == MEM)
279 op1 = force_reg (Pmode, op1);
280 else
281 {
282 rtx temp = op0;
283 if (GET_CODE (temp) != REG)
284 temp = gen_reg_rtx (Pmode);
285 temp = legitimize_pic_address (op1, temp);
286 if (temp == op0)
287 return;
288 op1 = temp;
289 }
290 /* dynamic-no-pic */
291#endif
292 }
293 else
294 {
295 if (MEM_P (op0))
296 op1 = force_reg (mode, op1);
297 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
298 {
299 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
300 op1 = legitimize_pic_address (op1, reg);
301 if (op0 == op1)
302 return;
303 op1 = convert_to_mode (mode, op1, 1);
304 }
305 }
306 }
307 else
308 {
309 if (MEM_P (op0)
310 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
311 || !push_operand (op0, mode))
312 && MEM_P (op1))
313 op1 = force_reg (mode, op1);
314
315 if (push_operand (op0, mode)
316 && ! general_no_elim_operand (op1, mode))
317 op1 = copy_to_mode_reg (mode, op1);
318
319 /* Force large constants in 64bit compilation into register
320 to get them CSEed. */
321 if (can_create_pseudo_p ()
322 && (mode == DImode) && TARGET_64BIT
323 && immediate_operand (op1, mode)
324 && !x86_64_zext_immediate_operand (op1, VOIDmode)
325 && !register_operand (op0, mode)
326 && optimize)
327 op1 = copy_to_mode_reg (mode, op1);
328
329 if (can_create_pseudo_p ()
330 && CONST_DOUBLE_P (op1))
331 {
332 /* If we are loading a floating point constant to a register,
333 force the value to memory now, since we'll get better code
334 out the back end. */
335
336 op1 = validize_mem (force_const_mem (mode, op1));
337 if (!register_operand (op0, mode))
338 {
339 rtx temp = gen_reg_rtx (mode);
340 emit_insn (gen_rtx_SET (temp, op1));
341 emit_move_insn (op0, temp);
342 return;
343 }
344 }
345 }
346
347 emit_insn (gen_rtx_SET (op0, op1));
348}
349
350void
351ix86_expand_vector_move (machine_mode mode, rtx operands[])
352{
353 rtx op0 = operands[0], op1 = operands[1];
 354 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
 355 psABI, since the biggest alignment is only 4 bytes for that psABI. */
356 unsigned int align = (TARGET_IAMCU
357 ? GET_MODE_BITSIZE (mode)
358 : GET_MODE_ALIGNMENT (mode));
359
360 if (push_operand (op0, VOIDmode))
361 op0 = emit_move_resolve_push (mode, op0);
362
363 /* Force constants other than zero into memory. We do not know how
364 the instructions used to build constants modify the upper 64 bits
 365 of the register; once we have that information we may be able
 366 to handle some of them more efficiently. */
367 if (can_create_pseudo_p ()
368 && (CONSTANT_P (op1)
369 || (SUBREG_P (op1)
370 && CONSTANT_P (SUBREG_REG (op1))))
371 && ((register_operand (op0, mode)
372 && !standard_sse_constant_p (op1, mode))
373 /* ix86_expand_vector_move_misalign() does not like constants. */
374 || (SSE_REG_MODE_P (mode)
375 && MEM_P (op0)
376 && MEM_ALIGN (op0) < align)))
377 {
378 if (SUBREG_P (op1))
379 {
380 machine_mode imode = GET_MODE (SUBREG_REG (op1));
381 rtx r = force_const_mem (imode, SUBREG_REG (op1));
382 if (r)
383 r = validize_mem (r);
384 else
385 r = force_reg (imode, SUBREG_REG (op1));
386 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
387 }
388 else
389 op1 = validize_mem (force_const_mem (mode, op1));
390 }
391
 392 /* We need to check memory alignment for SSE modes since attributes
 393 can make operands unaligned. */
394 if (can_create_pseudo_p ()
395 && SSE_REG_MODE_P (mode)
396 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
397 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
398 {
399 rtx tmp[2];
400
401 /* ix86_expand_vector_move_misalign() does not like both
402 arguments in memory. */
403 if (!register_operand (op0, mode)
404 && !register_operand (op1, mode))
405 op1 = force_reg (mode, op1);
406
407 tmp[0] = op0; tmp[1] = op1;
408 ix86_expand_vector_move_misalign (mode, tmp);
409 return;
410 }
411
412 /* Make operand1 a register if it isn't already. */
413 if (can_create_pseudo_p ()
414 && !register_operand (op0, mode)
415 && !register_operand (op1, mode))
416 {
417 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
418 return;
419 }
420
421 emit_insn (gen_rtx_SET (op0, op1));
422}
423
424/* Split 32-byte AVX unaligned load and store if needed. */
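/* For a misaligned 32-byte load this emits two 16-byte loads whose halves
 are recombined with a vec_concat (typically a vinsertf128), and for a
 misaligned 32-byte store it emits two vextractf128 stores of the low and
 high 128-bit halves; when the corresponding
 TARGET_AVX256_SPLIT_UNALIGNED_{LOAD,STORE} tuning flag is clear, the
 access is left as a single unaligned move. */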
425
426static void
427ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
428{
429 rtx m;
430 rtx (*extract) (rtx, rtx, rtx);
431 machine_mode mode;
432
433 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
434 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
435 {
436 emit_insn (gen_rtx_SET (op0, op1));
437 return;
438 }
439
440 rtx orig_op0 = NULL_RTX;
441 mode = GET_MODE (op0);
442 switch (GET_MODE_CLASS (mode))
443 {
444 case MODE_VECTOR_INT:
445 case MODE_INT:
446 if (mode != V32QImode)
447 {
448 if (!MEM_P (op0))
449 {
450 orig_op0 = op0;
451 op0 = gen_reg_rtx (V32QImode);
452 }
453 else
454 op0 = gen_lowpart (V32QImode, op0);
455 op1 = gen_lowpart (V32QImode, op1);
456 mode = V32QImode;
457 }
458 break;
459 case MODE_VECTOR_FLOAT:
460 break;
461 default:
462 gcc_unreachable ();
463 }
464
465 switch (mode)
466 {
467 default:
468 gcc_unreachable ();
469 case E_V32QImode:
470 extract = gen_avx_vextractf128v32qi;
471 mode = V16QImode;
472 break;
473 case E_V8SFmode:
474 extract = gen_avx_vextractf128v8sf;
475 mode = V4SFmode;
476 break;
477 case E_V4DFmode:
478 extract = gen_avx_vextractf128v4df;
479 mode = V2DFmode;
480 break;
481 }
482
483 if (MEM_P (op1))
484 {
485 rtx r = gen_reg_rtx (mode);
486 m = adjust_address (op1, mode, 0);
487 emit_move_insn (r, m);
488 m = adjust_address (op1, mode, 16);
489 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
490 emit_move_insn (op0, r);
491 }
492 else if (MEM_P (op0))
493 {
494 m = adjust_address (op0, mode, 0);
495 emit_insn (extract (m, op1, const0_rtx));
496 m = adjust_address (op0, mode, 16);
497 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
498 }
499 else
500 gcc_unreachable ();
501
502 if (orig_op0)
503 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
504}
505
506/* Implement the movmisalign patterns for SSE. Non-SSE modes go
507 straight to ix86_expand_vector_move. */
508/* Code generation for scalar reg-reg moves of single and double precision data:
509 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
510 movaps reg, reg
511 else
512 movss reg, reg
513 if (x86_sse_partial_reg_dependency == true)
514 movapd reg, reg
515 else
516 movsd reg, reg
517
518 Code generation for scalar loads of double precision data:
519 if (x86_sse_split_regs == true)
520 movlpd mem, reg (gas syntax)
521 else
522 movsd mem, reg
523
524 Code generation for unaligned packed loads of single precision data
525 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
526 if (x86_sse_unaligned_move_optimal)
527 movups mem, reg
528
529 if (x86_sse_partial_reg_dependency == true)
530 {
531 xorps reg, reg
532 movlps mem, reg
533 movhps mem+8, reg
534 }
535 else
536 {
537 movlps mem, reg
538 movhps mem+8, reg
539 }
540
541 Code generation for unaligned packed loads of double precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
543 if (x86_sse_unaligned_move_optimal)
544 movupd mem, reg
545
546 if (x86_sse_split_regs == true)
547 {
548 movlpd mem, reg
549 movhpd mem+8, reg
550 }
551 else
552 {
553 movsd mem, reg
554 movhpd mem+8, reg
555 }
556 */
557
558void
559ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
560{
561 rtx op0, op1, m;
562
563 op0 = operands[0];
564 op1 = operands[1];
565
566 /* Use unaligned load/store for AVX512 or when optimizing for size. */
567 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
568 {
569 emit_insn (gen_rtx_SET (op0, op1));
570 return;
571 }
572
573 if (TARGET_AVX)
574 {
575 if (GET_MODE_SIZE (mode) == 32)
576 ix86_avx256_split_vector_move_misalign (op0, op1);
577 else
578 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
579 emit_insn (gen_rtx_SET (op0, op1));
580 return;
581 }
582
583 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
584 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
585 {
586 emit_insn (gen_rtx_SET (op0, op1));
587 return;
588 }
589
590 /* ??? If we have typed data, then it would appear that using
591 movdqu is the only way to get unaligned data loaded with
592 integer type. */
593 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
594 {
595 emit_insn (gen_rtx_SET (op0, op1));
596 return;
597 }
598
599 if (MEM_P (op1))
600 {
601 if (TARGET_SSE2 && mode == V2DFmode)
602 {
603 rtx zero;
604
605 /* When SSE registers are split into halves, we can avoid
606 writing to the top half twice. */
607 if (TARGET_SSE_SPLIT_REGS)
608 {
609 emit_clobber (op0);
610 zero = op0;
611 }
612 else
613 {
614 /* ??? Not sure about the best option for the Intel chips.
615 The following would seem to satisfy; the register is
616 entirely cleared, breaking the dependency chain. We
617 then store to the upper half, with a dependency depth
618 of one. A rumor has it that Intel recommends two movsd
619 followed by an unpacklpd, but this is unconfirmed. And
620 given that the dependency depth of the unpacklpd would
621 still be one, I'm not sure why this would be better. */
622 zero = CONST0_RTX (V2DFmode);
623 }
624
625 m = adjust_address (op1, DFmode, 0);
626 emit_insn (gen_sse2_loadlpd (op0, zero, m));
627 m = adjust_address (op1, DFmode, 8);
628 emit_insn (gen_sse2_loadhpd (op0, op0, m));
629 }
630 else
631 {
632 rtx t;
633
634 if (mode != V4SFmode)
635 t = gen_reg_rtx (V4SFmode);
636 else
637 t = op0;
638
639 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
640 emit_move_insn (t, CONST0_RTX (V4SFmode));
641 else
642 emit_clobber (t);
643
644 m = adjust_address (op1, V2SFmode, 0);
645 emit_insn (gen_sse_loadlps (t, t, m));
646 m = adjust_address (op1, V2SFmode, 8);
647 emit_insn (gen_sse_loadhps (t, t, m));
648 if (mode != V4SFmode)
649 emit_move_insn (op0, gen_lowpart (mode, t));
650 }
651 }
652 else if (MEM_P (op0))
653 {
654 if (TARGET_SSE2 && mode == V2DFmode)
655 {
656 m = adjust_address (op0, DFmode, 0);
657 emit_insn (gen_sse2_storelpd (m, op1));
658 m = adjust_address (op0, DFmode, 8);
659 emit_insn (gen_sse2_storehpd (m, op1));
660 }
661 else
662 {
663 if (mode != V4SFmode)
664 op1 = gen_lowpart (V4SFmode, op1);
665
666 m = adjust_address (op0, V2SFmode, 0);
667 emit_insn (gen_sse_storelps (m, op1));
668 m = adjust_address (op0, V2SFmode, 8);
669 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
670 }
671 }
672 else
673 gcc_unreachable ();
674}
675
676/* Move bits 64:95 to bits 32:63. */
677
678void
679ix86_move_vector_high_sse_to_mmx (rtx op)
680{
681 rtx mask = gen_rtx_PARALLEL (VOIDmode,
682 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
683 GEN_INT (0), GEN_INT (0)));
684 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
685 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
686 rtx insn = gen_rtx_SET (dest, op);
687 emit_insn (insn);
688}
689
690/* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
691
692void
693ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
694{
695 rtx op0 = operands[0];
696 rtx op1 = operands[1];
697 rtx op2 = operands[2];
698
699 machine_mode dmode = GET_MODE (op0);
700 machine_mode smode = GET_MODE (op1);
701 machine_mode inner_dmode = GET_MODE_INNER (dmode);
702 machine_mode inner_smode = GET_MODE_INNER (smode);
703
704 /* Get the corresponding SSE mode for destination. */
705 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
706 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
707 nunits).require ();
708 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
709 nunits / 2).require ();
710
711 /* Get the corresponding SSE mode for source. */
712 nunits = 16 / GET_MODE_SIZE (inner_smode);
713 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
714 nunits).require ();
715
716 /* Generate SSE pack with signed/unsigned saturation. */
717 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
718 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
719 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
720
721 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
722 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
723 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
724 op1, op2));
725 emit_insn (insn);
726
727 ix86_move_vector_high_sse_to_mmx (op0);
728}
729
730/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
731
732void
733ix86_split_mmx_punpck (rtx operands[], bool high_p)
734{
735 rtx op0 = operands[0];
736 rtx op1 = operands[1];
737 rtx op2 = operands[2];
738 machine_mode mode = GET_MODE (op0);
739 rtx mask;
740 /* The corresponding SSE mode. */
741 machine_mode sse_mode, double_sse_mode;
742
743 switch (mode)
744 {
745 case E_V8QImode:
746 sse_mode = V16QImode;
747 double_sse_mode = V32QImode;
748 mask = gen_rtx_PARALLEL (VOIDmode,
749 gen_rtvec (16,
750 GEN_INT (0), GEN_INT (16),
751 GEN_INT (1), GEN_INT (17),
752 GEN_INT (2), GEN_INT (18),
753 GEN_INT (3), GEN_INT (19),
754 GEN_INT (4), GEN_INT (20),
755 GEN_INT (5), GEN_INT (21),
756 GEN_INT (6), GEN_INT (22),
757 GEN_INT (7), GEN_INT (23)));
758 break;
759
760 case E_V4HImode:
761 sse_mode = V8HImode;
762 double_sse_mode = V16HImode;
763 mask = gen_rtx_PARALLEL (VOIDmode,
764 gen_rtvec (8,
765 GEN_INT (0), GEN_INT (8),
766 GEN_INT (1), GEN_INT (9),
767 GEN_INT (2), GEN_INT (10),
768 GEN_INT (3), GEN_INT (11)));
769 break;
770
771 case E_V2SImode:
772 sse_mode = V4SImode;
773 double_sse_mode = V8SImode;
774 mask = gen_rtx_PARALLEL (VOIDmode,
775 gen_rtvec (4,
776 GEN_INT (0), GEN_INT (4),
777 GEN_INT (1), GEN_INT (5)));
778 break;
779
780 default:
781 gcc_unreachable ();
782 }
783
784 /* Generate SSE punpcklXX. */
785 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
786 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
787 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
788
789 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
790 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
791 rtx insn = gen_rtx_SET (dest, op2);
792 emit_insn (insn);
793
794 if (high_p)
795 {
796 /* Move bits 64:127 to bits 0:63. */
797 mask = gen_rtx_PARALLEL (VOIDmode,
798 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
799 GEN_INT (0), GEN_INT (0)));
800 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
801 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
802 insn = gen_rtx_SET (dest, op1);
803 emit_insn (insn);
804 }
805}
806
807/* Helper function of ix86_fixup_binary_operands to canonicalize
808 operand order. Returns true if the operands should be swapped. */
809
810static bool
811ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
812 rtx operands[])
813{
814 rtx dst = operands[0];
815 rtx src1 = operands[1];
816 rtx src2 = operands[2];
817
818 /* If the operation is not commutative, we can't do anything. */
819 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
820 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
821 return false;
822
823 /* Highest priority is that src1 should match dst. */
824 if (rtx_equal_p (dst, src1))
825 return false;
826 if (rtx_equal_p (dst, src2))
827 return true;
828
829 /* Next highest priority is that immediate constants come second. */
830 if (immediate_operand (src2, mode))
831 return false;
832 if (immediate_operand (src1, mode))
833 return true;
834
835 /* Lowest priority is that memory references should come second. */
836 if (MEM_P (src2))
837 return false;
838 if (MEM_P (src1))
839 return true;
840
841 return false;
842}
843
844
845/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
846 destination to use for the operation. If different from the true
847 destination in operands[0], a copy operation will be required. */
848
849rtx
850ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
851 rtx operands[])
852{
853 rtx dst = operands[0];
854 rtx src1 = operands[1];
855 rtx src2 = operands[2];
856
857 /* Canonicalize operand order. */
858 if (ix86_swap_binary_operands_p (code, mode, operands))
859 {
860 /* It is invalid to swap operands of different modes. */
861 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
862
863 std::swap (src1, src2);
864 }
865
866 /* Both source operands cannot be in memory. */
867 if (MEM_P (src1) && MEM_P (src2))
868 {
869 /* Optimization: Only read from memory once. */
870 if (rtx_equal_p (src1, src2))
871 {
872 src2 = force_reg (mode, src2);
873 src1 = src2;
874 }
875 else if (rtx_equal_p (dst, src1))
876 src2 = force_reg (mode, src2);
877 else
878 src1 = force_reg (mode, src1);
879 }
880
881 /* If the destination is memory, and we do not have matching source
882 operands, do things in registers. */
883 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
884 dst = gen_reg_rtx (mode);
885
886 /* Source 1 cannot be a constant. */
887 if (CONSTANT_P (src1))
888 src1 = force_reg (mode, src1);
889
890 /* Source 1 cannot be a non-matching memory. */
891 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
892 src1 = force_reg (mode, src1);
893
894 /* Improve address combine. */
895 if (code == PLUS
896 && GET_MODE_CLASS (mode) == MODE_INT
897 && MEM_P (src2))
898 src2 = force_reg (mode, src2);
899
900 operands[1] = src1;
901 operands[2] = src2;
902 return dst;
903}
904
905/* Similarly, but assume that the destination has already been
906 set up properly. */
907
908void
909ix86_fixup_binary_operands_no_copy (enum rtx_code code,
910 machine_mode mode, rtx operands[])
911{
912 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
913 gcc_assert (dst == operands[0]);
914}
915
916/* Attempt to expand a binary operator. Make the expansion closer to the
 917 actual machine than just general_operand, which would allow 3 separate
 918 memory references (one output, two input) in a single insn. */
919
920void
921ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
922 rtx operands[])
923{
924 rtx src1, src2, dst, op, clob;
925
926 dst = ix86_fixup_binary_operands (code, mode, operands);
927 src1 = operands[1];
928 src2 = operands[2];
929
930 /* Emit the instruction. */
931
932 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
933
934 if (reload_completed
935 && code == PLUS
936 && !rtx_equal_p (dst, src1))
937 {
938 /* This is going to be an LEA; avoid splitting it later. */
939 emit_insn (op);
940 }
941 else
942 {
943 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
944 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
945 }
946
947 /* Fix up the destination if needed. */
948 if (dst != operands[0])
949 emit_move_insn (operands[0], dst);
950}
951
952/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
953 the given OPERANDS. */
954
955void
956ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
957 rtx operands[])
958{
959 rtx op1 = NULL_RTX, op2 = NULL_RTX;
960 if (SUBREG_P (operands[1]))
961 {
962 op1 = operands[1];
963 op2 = operands[2];
964 }
965 else if (SUBREG_P (operands[2]))
966 {
967 op1 = operands[2];
968 op2 = operands[1];
969 }
970 /* Optimize (__m128i) d | (__m128i) e and similar code
971 when d and e are float vectors into float vector logical
972 insn. In C/C++ without using intrinsics there is no other way
973 to express vector logical operation on float vectors than
974 to cast them temporarily to integer vectors. */
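 /* For instance, with __m128 values a and b, the expression
 (__m128i) a & (__m128i) b is emitted as an AND in V4SFmode (an andps)
 on the original float vectors rather than bouncing through an integer
 vector mode, and only when TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL is
 not set. */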
975 if (op1
976 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
977 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
978 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
979 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
980 && SUBREG_BYTE (op1) == 0
981 && (GET_CODE (op2) == CONST_VECTOR
982 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
983 && SUBREG_BYTE (op2) == 0))
984 && can_create_pseudo_p ())
985 {
986 rtx dst;
987 switch (GET_MODE (SUBREG_REG (op1)))
988 {
989 case E_V4SFmode:
990 case E_V8SFmode:
991 case E_V16SFmode:
992 case E_V2DFmode:
993 case E_V4DFmode:
994 case E_V8DFmode:
995 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
996 if (GET_CODE (op2) == CONST_VECTOR)
997 {
998 op2 = gen_lowpart (GET_MODE (dst), op2);
999 op2 = force_reg (GET_MODE (dst), op2);
1000 }
1001 else
1002 {
1003 op1 = operands[1];
1004 op2 = SUBREG_REG (operands[2]);
1005 if (!vector_operand (op2, GET_MODE (dst)))
1006 op2 = force_reg (GET_MODE (dst), op2);
1007 }
1008 op1 = SUBREG_REG (op1);
1009 if (!vector_operand (op1, GET_MODE (dst)))
1010 op1 = force_reg (GET_MODE (dst), op1);
1011 emit_insn (gen_rtx_SET (dst,
1012 gen_rtx_fmt_ee (code, GET_MODE (dst),
1013 op1, op2)));
1014 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1015 return;
1016 default:
1017 break;
1018 }
1019 }
1020 if (!vector_operand (operands[1], mode))
1021 operands[1] = force_reg (mode, operands[1]);
1022 if (!vector_operand (operands[2], mode))
1023 operands[2] = force_reg (mode, operands[2]);
1024 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1025 emit_insn (gen_rtx_SET (operands[0],
1026 gen_rtx_fmt_ee (code, mode, operands[1],
1027 operands[2])));
1028}
1029
1030/* Return TRUE or FALSE depending on whether the binary operator meets the
1031 appropriate constraints. */
1032
1033bool
1034ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1035 rtx operands[3])
1036{
1037 rtx dst = operands[0];
1038 rtx src1 = operands[1];
1039 rtx src2 = operands[2];
1040
1041 /* Both source operands cannot be in memory. */
1042 if (MEM_P (src1) && MEM_P (src2))
1043 return false;
1044
1045 /* Canonicalize operand order for commutative operators. */
1046 if (ix86_swap_binary_operands_p (code, mode, operands))
1047 std::swap (src1, src2);
1048
1049 /* If the destination is memory, we must have a matching source operand. */
1050 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1051 return false;
1052
1053 /* Source 1 cannot be a constant. */
1054 if (CONSTANT_P (src1))
1055 return false;
1056
1057 /* Source 1 cannot be a non-matching memory. */
1058 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1059 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1060 return (code == AND
1061 && (mode == HImode
1062 || mode == SImode
1063 || (TARGET_64BIT && mode == DImode))
1064 && satisfies_constraint_L (src2));
1065
1066 return true;
1067}
1068
1069/* Attempt to expand a unary operator. Make the expansion closer to the
 1070 actual machine than just general_operand, which would allow 2 separate
 1071 memory references (one output, one input) in a single insn. */
1072
1073void
1074ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1075 rtx operands[])
1076{
1077 bool matching_memory = false;
1078 rtx src, dst, op, clob;
1079
1080 dst = operands[0];
1081 src = operands[1];
1082
1083 /* If the destination is memory, and we do not have matching source
1084 operands, do things in registers. */
1085 if (MEM_P (dst))
1086 {
1087 if (rtx_equal_p (dst, src))
1088 matching_memory = true;
1089 else
1090 dst = gen_reg_rtx (mode);
1091 }
1092
1093 /* When source operand is memory, destination must match. */
1094 if (MEM_P (src) && !matching_memory)
1095 src = force_reg (mode, src);
1096
1097 /* Emit the instruction. */
1098
1099 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1100
1101 if (code == NOT)
1102 emit_insn (op);
1103 else
1104 {
1105 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1106 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1107 }
1108
1109 /* Fix up the destination if needed. */
1110 if (dst != operands[0])
1111 emit_move_insn (operands[0], dst);
1112}
1113
1114/* Predict just emitted jump instruction to be taken with probability PROB. */
1115
1116static void
1117predict_jump (int prob)
1118{
1119 rtx_insn *insn = get_last_insn ();
1120 gcc_assert (JUMP_P (insn));
1121 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1122}
1123
1124/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1125 divisor are within the range [0-255]. */
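 /* The expansion below ORs the dividend and divisor into a scratch
 register and tests the result against ~0xff; if no bits above bit 7
 are set it jumps to a fast path that performs one unsigned HImode by
 QImode divide and reads the quotient from AL and the remainder from
 AH, otherwise it falls through to the ordinary full-width divide. */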
1126
1127void
1128ix86_split_idivmod (machine_mode mode, rtx operands[],
 1129 bool unsigned_p)
1130{
1131 rtx_code_label *end_label, *qimode_label;
1132 rtx div, mod;
1133 rtx_insn *insn;
1134 rtx scratch, tmp0, tmp1, tmp2;
1135 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1136
1137 switch (mode)
1138 {
1139 case E_SImode:
1140 if (GET_MODE (operands[0]) == SImode)
1141 {
1142 if (GET_MODE (operands[1]) == SImode)
 1143 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1144 else
1145 gen_divmod4_1
 1146 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1147 }
1148 else
 1149 gen_divmod4_1
 1150 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
 1151 break;
 1152
 1153 case E_DImode:
 1154 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
 1155 break;
 1156
1157 default:
1158 gcc_unreachable ();
1159 }
1160
1161 end_label = gen_label_rtx ();
1162 qimode_label = gen_label_rtx ();
1163
1164 scratch = gen_reg_rtx (mode);
1165
 1166 /* Use 8bit unsigned divmod if dividend and divisor are within
 1167 the range [0-255]. */
1168 emit_move_insn (scratch, operands[2]);
1169 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1170 scratch, 1, OPTAB_DIRECT);
 1171 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1172 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1173 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1174 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1175 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1176 pc_rtx);
1177 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1178 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1179 JUMP_LABEL (insn) = qimode_label;
1180
 1181 /* Generate original signed/unsigned divmod. */
1182 div = gen_divmod4_1 (operands[0], operands[1],
1183 operands[2], operands[3]);
1184 emit_insn (div);
1185
1186 /* Branch to the end. */
1187 emit_jump_insn (gen_jump (end_label));
1188 emit_barrier ();
1189
1190 /* Generate 8bit unsigned divide. */
1191 emit_label (qimode_label);
1192 /* Don't use operands[0] for result of 8bit divide since not all
1193 registers support QImode ZERO_EXTRACT. */
1194 tmp0 = lowpart_subreg (HImode, scratch, mode);
1195 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1196 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1197 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1198
 1199 if (unsigned_p)
 1200 {
1201 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1202 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1203 }
1204 else
1205 {
1206 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1207 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1208 }
1209 if (mode == SImode)
1210 {
1211 if (GET_MODE (operands[0]) != SImode)
1212 div = gen_rtx_ZERO_EXTEND (DImode, div);
1213 if (GET_MODE (operands[1]) != SImode)
1214 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1215 }
1216
1217 /* Extract remainder from AH. */
1218 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1219 tmp0, GEN_INT (8), GEN_INT (8));
1220 if (REG_P (operands[1]))
1221 insn = emit_move_insn (operands[1], tmp1);
1222 else
1223 {
1224 /* Need a new scratch register since the old one has result
1225 of 8bit divide. */
1226 scratch = gen_reg_rtx (GET_MODE (operands[1]));
1227 emit_move_insn (scratch, tmp1);
1228 insn = emit_move_insn (operands[1], scratch);
1229 }
1230 set_unique_reg_note (insn, REG_EQUAL, mod);
1231
1232 /* Zero extend quotient from AL. */
1233 tmp1 = gen_lowpart (QImode, tmp0);
1234 insn = emit_insn (gen_extend_insn
1235 (operands[0], tmp1,
1236 GET_MODE (operands[0]), QImode, 1));
1237 set_unique_reg_note (insn, REG_EQUAL, div);
1238
1239 emit_label (end_label);
1240}
1241
1242/* Emit x86 binary operand CODE in mode MODE, where the first operand
1243 matches destination. RTX includes clobber of FLAGS_REG. */
1244
1245void
1246ix86_emit_binop (enum rtx_code code, machine_mode mode,
1247 rtx dst, rtx src)
1248{
1249 rtx op, clob;
1250
1251 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1252 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1253
1254 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1255}
1256
1257/* Return true if regno1 def is nearest to the insn. */
1258
1259static bool
1260find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1261{
1262 rtx_insn *prev = insn;
1263 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1264
1265 if (insn == start)
1266 return false;
1267 while (prev && prev != start)
1268 {
1269 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1270 {
1271 prev = PREV_INSN (prev);
1272 continue;
1273 }
1274 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1275 return true;
1276 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1277 return false;
1278 prev = PREV_INSN (prev);
1279 }
1280
1281 /* None of the regs is defined in the bb. */
1282 return false;
1283}
1284
1285/* Split lea instructions into a sequence of instructions
 1286 which are executed on the ALU to avoid AGU stalls.
 1287 It is assumed that the flags register may be clobbered
 1288 at the position of the lea. */
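/* As an illustration, "lea 4(%rbx,%rcx,2), %rax" can be rewritten as
 "mov %rcx, %rax; shl $1, %rax; add %rbx, %rax; add $4, %rax"; in the
 special case where the destination already holds the base register,
 the scaled index is instead added parts.scale times so that no
 multiplication is needed. */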
1289
1290void
1291ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1292{
1293 unsigned int regno0, regno1, regno2;
1294 struct ix86_address parts;
1295 rtx target, tmp;
1296 int ok, adds;
1297
1298 ok = ix86_decompose_address (operands[1], &parts);
1299 gcc_assert (ok);
1300
1301 target = gen_lowpart (mode, operands[0]);
1302
1303 regno0 = true_regnum (target);
1304 regno1 = INVALID_REGNUM;
1305 regno2 = INVALID_REGNUM;
1306
1307 if (parts.base)
1308 {
1309 parts.base = gen_lowpart (mode, parts.base);
1310 regno1 = true_regnum (parts.base);
1311 }
1312
1313 if (parts.index)
1314 {
1315 parts.index = gen_lowpart (mode, parts.index);
1316 regno2 = true_regnum (parts.index);
1317 }
1318
1319 if (parts.disp)
1320 parts.disp = gen_lowpart (mode, parts.disp);
1321
1322 if (parts.scale > 1)
1323 {
1324 /* Case r1 = r1 + ... */
1325 if (regno1 == regno0)
1326 {
1327 /* If we have a case r1 = r1 + C * r2 then we
1328 should use multiplication which is very
1329 expensive. Assume cost model is wrong if we
1330 have such case here. */
1331 gcc_assert (regno2 != regno0);
1332
1333 for (adds = parts.scale; adds > 0; adds--)
1334 ix86_emit_binop (PLUS, mode, target, parts.index);
1335 }
1336 else
1337 {
1338 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1339 if (regno0 != regno2)
1340 emit_insn (gen_rtx_SET (target, parts.index));
1341
1342 /* Use shift for scaling. */
1343 ix86_emit_binop (ASHIFT, mode, target,
1344 GEN_INT (exact_log2 (parts.scale)));
1345
1346 if (parts.base)
1347 ix86_emit_binop (PLUS, mode, target, parts.base);
1348
1349 if (parts.disp && parts.disp != const0_rtx)
1350 ix86_emit_binop (PLUS, mode, target, parts.disp);
1351 }
1352 }
1353 else if (!parts.base && !parts.index)
1354 {
1355 gcc_assert(parts.disp);
1356 emit_insn (gen_rtx_SET (target, parts.disp));
1357 }
1358 else
1359 {
1360 if (!parts.base)
1361 {
1362 if (regno0 != regno2)
1363 emit_insn (gen_rtx_SET (target, parts.index));
1364 }
1365 else if (!parts.index)
1366 {
1367 if (regno0 != regno1)
1368 emit_insn (gen_rtx_SET (target, parts.base));
1369 }
1370 else
1371 {
1372 if (regno0 == regno1)
1373 tmp = parts.index;
1374 else if (regno0 == regno2)
1375 tmp = parts.base;
1376 else
1377 {
1378 rtx tmp1;
1379
1380 /* Find better operand for SET instruction, depending
1381 on which definition is farther from the insn. */
1382 if (find_nearest_reg_def (insn, regno1, regno2))
1383 tmp = parts.index, tmp1 = parts.base;
1384 else
1385 tmp = parts.base, tmp1 = parts.index;
1386
1387 emit_insn (gen_rtx_SET (target, tmp));
1388
1389 if (parts.disp && parts.disp != const0_rtx)
1390 ix86_emit_binop (PLUS, mode, target, parts.disp);
1391
1392 ix86_emit_binop (PLUS, mode, target, tmp1);
1393 return;
1394 }
1395
1396 ix86_emit_binop (PLUS, mode, target, tmp);
1397 }
1398
1399 if (parts.disp && parts.disp != const0_rtx)
1400 ix86_emit_binop (PLUS, mode, target, parts.disp);
1401 }
1402}
1403
1404/* Post-reload splitter for converting an SF or DFmode value in an
1405 SSE register into an unsigned SImode. */
1406
1407void
1408ix86_split_convert_uns_si_sse (rtx operands[])
1409{
1410 machine_mode vecmode;
1411 rtx value, large, zero_or_two31, input, two31, x;
1412
1413 large = operands[1];
1414 zero_or_two31 = operands[2];
1415 input = operands[3];
1416 two31 = operands[4];
1417 vecmode = GET_MODE (large);
1418 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1419
1420 /* Load up the value into the low element. We must ensure that the other
1421 elements are valid floats -- zero is the easiest such value. */
1422 if (MEM_P (input))
1423 {
1424 if (vecmode == V4SFmode)
1425 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1426 else
1427 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1428 }
1429 else
1430 {
1431 input = gen_rtx_REG (vecmode, REGNO (input));
1432 emit_move_insn (value, CONST0_RTX (vecmode));
1433 if (vecmode == V4SFmode)
1434 emit_insn (gen_sse_movss (value, value, input));
1435 else
1436 emit_insn (gen_sse2_movsd (value, value, input));
1437 }
1438
1439 emit_move_insn (large, two31);
1440 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1441
1442 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1443 emit_insn (gen_rtx_SET (large, x));
1444
1445 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1446 emit_insn (gen_rtx_SET (zero_or_two31, x));
1447
1448 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1449 emit_insn (gen_rtx_SET (value, x));
1450
1451 large = gen_rtx_REG (V4SImode, REGNO (large));
1452 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1453
1454 x = gen_rtx_REG (V4SImode, REGNO (value));
1455 if (vecmode == V4SFmode)
1456 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1457 else
1458 emit_insn (gen_sse2_cvttpd2dq (x, value));
1459 value = x;
1460
1461 emit_insn (gen_xorv4si3 (value, value, large));
1462}
1463
1464static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1465 machine_mode mode, rtx target,
1466 rtx var, int one_var);
1467
1468/* Convert an unsigned DImode value into a DFmode, using only SSE.
1469 Expects the 64-bit DImode to be supplied in a pair of integral
1470 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1471 -mfpmath=sse, !optimize_size only. */
1472
1473void
1474ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1475{
1476 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1477 rtx int_xmm, fp_xmm;
1478 rtx biases, exponents;
1479 rtx x;
1480
1481 int_xmm = gen_reg_rtx (V4SImode);
1482 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1483 emit_insn (gen_movdi_to_sse (int_xmm, input));
1484 else if (TARGET_SSE_SPLIT_REGS)
1485 {
1486 emit_clobber (int_xmm);
1487 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1488 }
1489 else
1490 {
1491 x = gen_reg_rtx (V2DImode);
1492 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1493 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1494 }
1495
1496 x = gen_rtx_CONST_VECTOR (V4SImode,
1497 gen_rtvec (4, GEN_INT (0x43300000UL),
1498 GEN_INT (0x45300000UL),
1499 const0_rtx, const0_rtx));
1500 exponents = validize_mem (force_const_mem (V4SImode, x));
1501
1502 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1503 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1504
1505 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1506 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1507 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1508 (0x1.0p84 + double(fp_value_hi_xmm)).
1509 Note these exponents differ by 32. */
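 /* Concretely, for an input with high word 1 and low word 2 the two
 doubles are 0x1.0p84 + 0x1.0p32 and 0x1.0p52 + 2.0; subtracting the
 biases below leaves 0x1.0p32 and 2.0, whose sum 2^32 + 2 is exactly
 the original unsigned 64-bit value. */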
1510
1511 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1512
1513 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1514 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1515 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1516 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1517 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1518 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1519 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1520 biases = validize_mem (force_const_mem (V2DFmode, biases));
1521 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1522
1523 /* Add the upper and lower DFmode values together. */
1524 if (TARGET_SSE3)
1525 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1526 else
1527 {
1528 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1529 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1530 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1531 }
1532
1533 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1534}
1535
1536/* Not used, but eases macroization of patterns. */
1537void
1538ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1539{
1540 gcc_unreachable ();
1541}
1542
1543/* Convert an unsigned SImode value into a DFmode. Only currently used
1544 for SSE, but applicable anywhere. */
1545
1546void
1547ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1548{
1549 REAL_VALUE_TYPE TWO31r;
1550 rtx x, fp;
1551
1552 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1553 NULL, 1, OPTAB_DIRECT);
1554
1555 fp = gen_reg_rtx (DFmode);
1556 emit_insn (gen_floatsidf2 (fp, x));
1557
1558 real_ldexp (&TWO31r, &dconst1, 31);
1559 x = const_double_from_real_value (TWO31r, DFmode);
1560
1561 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1562 if (x != target)
1563 emit_move_insn (target, x);
1564}
1565
1566/* Convert a signed DImode value into a DFmode. Only used for SSE in
1567 32-bit mode; otherwise we have a direct convert instruction. */
1568
1569void
1570ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1571{
1572 REAL_VALUE_TYPE TWO32r;
1573 rtx fp_lo, fp_hi, x;
1574
1575 fp_lo = gen_reg_rtx (DFmode);
1576 fp_hi = gen_reg_rtx (DFmode);
1577
1578 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1579
1580 real_ldexp (&TWO32r, &dconst1, 32);
1581 x = const_double_from_real_value (TWO32r, DFmode);
1582 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1583
1584 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1585
1586 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1587 0, OPTAB_DIRECT);
1588 if (x != target)
1589 emit_move_insn (target, x);
1590}
1591
1592/* Convert an unsigned SImode value into a SFmode, using only SSE.
1593 For x86_32, -mfpmath=sse, !optimize_size only. */
1594void
1595ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1596{
1597 REAL_VALUE_TYPE ONE16r;
1598 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1599
1600 real_ldexp (&ONE16r, &dconst1, 16);
1601 x = const_double_from_real_value (ONE16r, SFmode);
1602 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1603 NULL, 0, OPTAB_DIRECT);
1604 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1605 NULL, 0, OPTAB_DIRECT);
1606 fp_hi = gen_reg_rtx (SFmode);
1607 fp_lo = gen_reg_rtx (SFmode);
1608 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1609 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1610 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1611 0, OPTAB_DIRECT);
1612 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1613 0, OPTAB_DIRECT);
1614 if (!rtx_equal_p (target, fp_hi))
1615 emit_move_insn (target, fp_hi);
1616}
1617
1618/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1619 a vector of unsigned ints VAL to vector of floats TARGET. */
1620
1621void
1622ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1623{
1624 rtx tmp[8];
1625 REAL_VALUE_TYPE TWO16r;
1626 machine_mode intmode = GET_MODE (val);
1627 machine_mode fltmode = GET_MODE (target);
1628 rtx (*cvt) (rtx, rtx);
1629
1630 if (intmode == V4SImode)
1631 cvt = gen_floatv4siv4sf2;
1632 else
1633 cvt = gen_floatv8siv8sf2;
1634 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1635 tmp[0] = force_reg (intmode, tmp[0]);
1636 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1637 OPTAB_DIRECT);
1638 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1639 NULL_RTX, 1, OPTAB_DIRECT);
1640 tmp[3] = gen_reg_rtx (fltmode);
1641 emit_insn (cvt (tmp[3], tmp[1]));
1642 tmp[4] = gen_reg_rtx (fltmode);
1643 emit_insn (cvt (tmp[4], tmp[2]));
1644 real_ldexp (&TWO16r, &dconst1, 16);
1645 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1646 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1647 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1648 OPTAB_DIRECT);
1649 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1650 OPTAB_DIRECT);
1651 if (tmp[7] != target)
1652 emit_move_insn (target, tmp[7]);
1653}
1654
1655/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1656 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1657 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1658 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
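 /* Worked example: a lane holding 2415919104.0 (0x1.2p31) is >= 0x1p31,
 so 0x1p31 is subtracted to give 268435456.0; the later signed
 conversion produces 0x10000000, and XORing in the 0x80000000 recorded
 via *XORP restores the intended unsigned result 0x90000000. */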
1659
1660rtx
1661ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1662{
1663 REAL_VALUE_TYPE TWO31r;
1664 rtx two31r, tmp[4];
1665 machine_mode mode = GET_MODE (val);
1666 machine_mode scalarmode = GET_MODE_INNER (mode);
1667 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1668 rtx (*cmp) (rtx, rtx, rtx, rtx);
1669 int i;
1670
1671 for (i = 0; i < 3; i++)
1672 tmp[i] = gen_reg_rtx (mode);
1673 real_ldexp (&TWO31r, &dconst1, 31);
1674 two31r = const_double_from_real_value (TWO31r, scalarmode);
1675 two31r = ix86_build_const_vector (mode, 1, two31r);
1676 two31r = force_reg (mode, two31r);
1677 switch (mode)
1678 {
1679 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1680 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1681 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1682 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1683 default: gcc_unreachable ();
1684 }
1685 tmp[3] = gen_rtx_LE (mode, two31r, val);
1686 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1687 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1688 0, OPTAB_DIRECT);
1689 if (intmode == V4SImode || TARGET_AVX2)
1690 *xorp = expand_simple_binop (intmode, ASHIFT,
1691 gen_lowpart (intmode, tmp[0]),
1692 GEN_INT (31), NULL_RTX, 0,
1693 OPTAB_DIRECT);
1694 else
1695 {
 1696 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1697 two31 = ix86_build_const_vector (intmode, 1, two31);
1698 *xorp = expand_simple_binop (intmode, AND,
1699 gen_lowpart (intmode, tmp[0]),
1700 two31, NULL_RTX, 0,
1701 OPTAB_DIRECT);
1702 }
1703 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1704 0, OPTAB_DIRECT);
1705}
1706
1707/* Generate code for floating point ABS or NEG. */
1708
1709void
1710ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1711 rtx operands[])
1712{
 1713 rtx set, dst, src;
1714 bool use_sse = false;
1715 bool vector_mode = VECTOR_MODE_P (mode);
1716 machine_mode vmode = mode;
 1717 rtvec par;
 1718
 1719 if (vector_mode || mode == TFmode)
1720 use_sse = true;
1721 else if (TARGET_SSE_MATH)
1722 {
1723 use_sse = SSE_FLOAT_MODE_P (mode);
1724 if (mode == SFmode)
1725 vmode = V4SFmode;
1726 else if (mode == DFmode)
1727 vmode = V2DFmode;
1728 }
1729
1730 dst = operands[0];
1731 src = operands[1];
1732
1733 set = gen_rtx_fmt_e (code, mode, src);
1734 set = gen_rtx_SET (dst, set);
1735
 1736 if (use_sse)
 1737 {
 1738 rtx mask, use, clob;
 1739
1740 /* NEG and ABS performed with SSE use bitwise mask operations.
1741 Create the appropriate mask now. */
1742 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
 1743 use = gen_rtx_USE (VOIDmode, mask);
 1744 if (vector_mode || mode == TFmode)
1745 par = gen_rtvec (2, set, use);
1746 else
1747 {
1748 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1749 par = gen_rtvec (3, set, use, clob);
1750 }
1751 }
1752 else
1753 {
1754 rtx clob;
1755
 1756 /* Changing the sign of FP values can also be done using the integer unit. */
1757 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1758 par = gen_rtvec (2, set, clob);
1759 }
1760
1761 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1762}
1763
1764/* Deconstruct a floating point ABS or NEG operation
1765 with integer registers into integer operations. */
1766
1767void
1768ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1769 rtx operands[])
1770{
1771 enum rtx_code absneg_op;
1772 rtx dst, set;
1773
1774 gcc_assert (operands_match_p (operands[0], operands[1]));
1775
1776 switch (mode)
1777 {
1778 case E_SFmode:
1779 dst = gen_lowpart (SImode, operands[0]);
1780
1781 if (code == ABS)
1782 {
1783 set = gen_int_mode (0x7fffffff, SImode);
1784 absneg_op = AND;
1785 }
1786 else
1787 {
1788 set = gen_int_mode (0x80000000, SImode);
1789 absneg_op = XOR;
1790 }
1791 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1792 break;
1793
1794 case E_DFmode:
1795 if (TARGET_64BIT)
1796 {
1797 dst = gen_lowpart (DImode, operands[0]);
1798 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1799
1800 if (code == ABS)
1801 set = const0_rtx;
1802 else
1803 set = gen_rtx_NOT (DImode, dst);
1804 }
1805 else
1806 {
1807 dst = gen_highpart (SImode, operands[0]);
1808
1809 if (code == ABS)
1810 {
1811 set = gen_int_mode (0x7fffffff, SImode);
1812 absneg_op = AND;
1813 }
1814 else
1815 {
1816 set = gen_int_mode (0x80000000, SImode);
1817 absneg_op = XOR;
1818 }
1819 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1820 }
1821 break;
1822
1823 case E_XFmode:
1824 dst = gen_rtx_REG (SImode,
1825 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1826 if (code == ABS)
1827 {
1828 set = GEN_INT (0x7fff);
1829 absneg_op = AND;
1830 }
1831 else
1832 {
1833 set = GEN_INT (0x8000);
1834 absneg_op = XOR;
1835 }
1836 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1837 break;
1838
1839 default:
1840 gcc_unreachable ();
1841 }
1842
1843 set = gen_rtx_SET (dst, set);
1844
1845 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1846 rtvec par = gen_rtvec (2, set, clob);
1847
1848 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1849}
1850
1851/* Expand a copysign operation. Special case operand 0 being a constant. */
1852
1853void
1854ix86_expand_copysign (rtx operands[])
1855{
1856 machine_mode mode, vmode;
 1857 rtx dest, op0, op1, mask;
1858
1859 dest = operands[0];
1860 op0 = operands[1];
1861 op1 = operands[2];
1862
1863 mode = GET_MODE (dest);
1864
1865 if (mode == SFmode)
1866 vmode = V4SFmode;
1867 else if (mode == DFmode)
1868 vmode = V2DFmode;
 1869 else if (mode == TFmode)
 1870 vmode = mode;
1871 else
1872 gcc_unreachable ();
1873
1874 mask = ix86_build_signbit_mask (vmode, 0, 0);
1875
1876 if (CONST_DOUBLE_P (op0))
1877 {
1878 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1879 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1880
1881 if (mode == SFmode || mode == DFmode)
1882 {
1883 if (op0 == CONST0_RTX (mode))
1884 op0 = CONST0_RTX (vmode);
1885 else
1886 {
1887 rtx v = ix86_build_const_vector (vmode, false, op0);
1888
1889 op0 = force_reg (vmode, v);
1890 }
1891 }
1892 else if (op0 != CONST0_RTX (mode))
1893 op0 = force_reg (mode, op0);
1894
 1895 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1896 }
1897 else
1898 {
 1899 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
 1900
1901 emit_insn (gen_copysign3_var
1902 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1903 }
1904}
1905
1906/* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1907 be a constant, and so has already been expanded into a vector constant. */
1908
1909void
1910ix86_split_copysign_const (rtx operands[])
1911{
1912 machine_mode mode, vmode;
1913 rtx dest, op0, mask, x;
1914
1915 dest = operands[0];
1916 op0 = operands[1];
1917 mask = operands[3];
1918
1919 mode = GET_MODE (dest);
1920 vmode = GET_MODE (mask);
1921
1922 dest = lowpart_subreg (vmode, dest, mode);
1923 x = gen_rtx_AND (vmode, dest, mask);
1924 emit_insn (gen_rtx_SET (dest, x));
1925
1926 if (op0 != CONST0_RTX (vmode))
1927 {
1928 x = gen_rtx_IOR (vmode, dest, op0);
1929 emit_insn (gen_rtx_SET (dest, x));
1930 }
1931}
1932
1933/* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1934 so we have to do two masks. */
1935
1936void
1937ix86_split_copysign_var (rtx operands[])
1938{
1939 machine_mode mode, vmode;
1940 rtx dest, scratch, op0, op1, mask, nmask, x;
1941
1942 dest = operands[0];
1943 scratch = operands[1];
1944 op0 = operands[2];
1945 op1 = operands[3];
1946 nmask = operands[4];
1947 mask = operands[5];
1948
1949 mode = GET_MODE (dest);
1950 vmode = GET_MODE (mask);
1951
1952 if (rtx_equal_p (op0, op1))
1953 {
1954 /* Shouldn't happen often (it's useless, obviously), but when it does
1955 we'd generate incorrect code if we continue below. */
1956 emit_move_insn (dest, op0);
1957 return;
1958 }
1959
1960 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1961 {
1962 gcc_assert (REGNO (op1) == REGNO (scratch));
1963
1964 x = gen_rtx_AND (vmode, scratch, mask);
1965 emit_insn (gen_rtx_SET (scratch, x));
1966
1967 dest = mask;
1968 op0 = lowpart_subreg (vmode, op0, mode);
1969 x = gen_rtx_NOT (vmode, dest);
1970 x = gen_rtx_AND (vmode, x, op0);
1971 emit_insn (gen_rtx_SET (dest, x));
1972 }
1973 else
1974 {
1975 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1976 {
1977 x = gen_rtx_AND (vmode, scratch, mask);
1978 }
1979 else /* alternative 2,4 */
1980 {
1981 gcc_assert (REGNO (mask) == REGNO (scratch));
1982 op1 = lowpart_subreg (vmode, op1, mode);
1983 x = gen_rtx_AND (vmode, scratch, op1);
1984 }
1985 emit_insn (gen_rtx_SET (scratch, x));
1986
1987 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1988 {
1989 dest = lowpart_subreg (vmode, op0, mode);
1990 x = gen_rtx_AND (vmode, dest, nmask);
1991 }
1992 else /* alternative 3,4 */
1993 {
1994 gcc_assert (REGNO (nmask) == REGNO (dest));
1995 dest = nmask;
1996 op0 = lowpart_subreg (vmode, op0, mode);
1997 x = gen_rtx_AND (vmode, dest, op0);
1998 }
1999 emit_insn (gen_rtx_SET (dest, x));
2000 }
2001
2002 x = gen_rtx_IOR (vmode, dest, scratch);
2003 emit_insn (gen_rtx_SET (dest, x));
2004}
2005
2006/* Expand an xorsign operation. */
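/* For reference: xorsign (x, y) is x with its sign bit xored with the sign
   bit of y, i.e. x ^ (y & signmask), which implements x * copysign (1.0, y).  */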
2007
2008void
2009ix86_expand_xorsign (rtx operands[])
2010{
2011 machine_mode mode, vmode;
2012 rtx dest, op0, op1, mask;
2013
2014 dest = operands[0];
2015 op0 = operands[1];
2016 op1 = operands[2];
2017
2018 mode = GET_MODE (dest);
2019
2020 if (mode == SFmode)
987a3082 2021 vmode = V4SFmode;
2bf6d935 2022 else if (mode == DFmode)
987a3082 2023 vmode = V2DFmode;
2024 else
2025 gcc_unreachable ();
2026
2027 mask = ix86_build_signbit_mask (vmode, 0, 0);
2028
987a3082 2029 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2030}
2031
2032/* Deconstruct an xorsign operation into bit masks. */
2033
2034void
2035ix86_split_xorsign (rtx operands[])
2036{
2037 machine_mode mode, vmode;
2038 rtx dest, op0, mask, x;
2039
2040 dest = operands[0];
2041 op0 = operands[1];
2042 mask = operands[3];
2043
2044 mode = GET_MODE (dest);
2045 vmode = GET_MODE (mask);
2046
2047 dest = lowpart_subreg (vmode, dest, mode);
2048 x = gen_rtx_AND (vmode, dest, mask);
2049 emit_insn (gen_rtx_SET (dest, x));
2050
2051 op0 = lowpart_subreg (vmode, op0, mode);
2052 x = gen_rtx_XOR (vmode, dest, op0);
2053 emit_insn (gen_rtx_SET (dest, x));
2054}
2055
2056static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2057
2058void
2059ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2060{
2061 machine_mode mode = GET_MODE (op0);
2062 rtx tmp;
2063
2064  /* Handle the special case of a vector comparison with a boolean result;
2065     transform it using the ptest instruction.  */
2066 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2067 {
2068 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2069 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2070
2071 gcc_assert (code == EQ || code == NE);
2072 /* Generate XOR since we can't check that one operand is zero vector. */
2073 tmp = gen_reg_rtx (mode);
2074 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2075 tmp = gen_lowpart (p_mode, tmp);
2076 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2077 gen_rtx_UNSPEC (CCmode,
2078 gen_rtvec (2, tmp, tmp),
2079 UNSPEC_PTEST)));
2080 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2081 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2082 gen_rtx_LABEL_REF (VOIDmode, label),
2083 pc_rtx);
2084 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2085 return;
2086 }
2087
2088 switch (mode)
2089 {
2090 case E_SFmode:
2091 case E_DFmode:
2092 case E_XFmode:
2093 case E_QImode:
2094 case E_HImode:
2095 case E_SImode:
2096 simple:
2097 tmp = ix86_expand_compare (code, op0, op1);
2098 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2099 gen_rtx_LABEL_REF (VOIDmode, label),
2100 pc_rtx);
2101 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2102 return;
2103
2104 case E_DImode:
2105 if (TARGET_64BIT)
2106 goto simple;
2107      /* For a 32-bit target, a DImode comparison may be performed in
2108	 SSE registers.  To allow this we avoid splitting to SImode,
2109	 which is achieved by doing the xor in DImode and then
2110	 comparing against zero (a form recognized by the STV pass).
2111	 We don't use the xor form when optimizing
2112	 for size.  */
2113 if (!optimize_insn_for_size_p ()
2114 && TARGET_STV
2115 && (code == EQ || code == NE))
2116 {
2117 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2118 op1 = const0_rtx;
2119 }
2120 /* FALLTHRU */
2121 case E_TImode:
2122 /* Expand DImode branch into multiple compare+branch. */
2123 {
2124 rtx lo[2], hi[2];
2125 rtx_code_label *label2;
2126 enum rtx_code code1, code2, code3;
2127 machine_mode submode;
2128
2129 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2130 {
2131 std::swap (op0, op1);
2132 code = swap_condition (code);
2133 }
2134
2135 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2136 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2137
2138 submode = mode == DImode ? SImode : DImode;
2139
2140 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2141 avoid two branches. This costs one extra insn, so disable when
2142 optimizing for size. */
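	 /* E.g. for a DImode compare on a 32-bit target (illustrative):
	      (a == b)  <=>  ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0
	    and an xor against a constant-zero half is simply dropped.  */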
2143
2144 if ((code == EQ || code == NE)
2145 && (!optimize_insn_for_size_p ()
2146 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2147 {
2148 rtx xor0, xor1;
2149
2150 xor1 = hi[0];
2151 if (hi[1] != const0_rtx)
2152 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2153 NULL_RTX, 0, OPTAB_WIDEN);
2154
2155 xor0 = lo[0];
2156 if (lo[1] != const0_rtx)
2157 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2158 NULL_RTX, 0, OPTAB_WIDEN);
2159
2160 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2161 NULL_RTX, 0, OPTAB_WIDEN);
2162
2163 ix86_expand_branch (code, tmp, const0_rtx, label);
2164 return;
2165 }
2166
2167	 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
2168	    op1 is a constant and the low word is zero, then we can just
2169	    examine the high word.  Similarly for a low word of -1 and
2170	    less-or-equal or greater-than.  */
2171
2172 if (CONST_INT_P (hi[1]))
2173 switch (code)
2174 {
2175 case LT: case LTU: case GE: case GEU:
2176 if (lo[1] == const0_rtx)
2177 {
2178 ix86_expand_branch (code, hi[0], hi[1], label);
2179 return;
2180 }
2181 break;
2182 case LE: case LEU: case GT: case GTU:
2183 if (lo[1] == constm1_rtx)
2184 {
2185 ix86_expand_branch (code, hi[0], hi[1], label);
2186 return;
2187 }
2188 break;
2189 default:
2190 break;
2191 }
2192
2193 /* Emulate comparisons that do not depend on Zero flag with
2194 double-word subtraction. Note that only Overflow, Sign
2195 and Carry flags are valid, so swap arguments and condition
2196 of comparisons that would otherwise test Zero flag. */
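	 /* Illustrative shape of the emitted sequence for a signed
	    double-word a < b (the unsigned case tests the carry flag):

	       cmp   lo(a), lo(b)    ; compare low halves, sets CF
	       sbb   hi(a), hi(b)    ; hi(a) - hi(b) - CF, result discarded
	       jl    label           ; jb for the unsigned variants

	    hence only the Overflow, Sign and Carry flags are meaningful.  */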
2197
2198 switch (code)
2199 {
2200 case LE: case LEU: case GT: case GTU:
2201 std::swap (lo[0], lo[1]);
2202 std::swap (hi[0], hi[1]);
2203 code = swap_condition (code);
2204 /* FALLTHRU */
2205
2206 case LT: case LTU: case GE: case GEU:
2207 {
2bf6d935 2208 bool uns = (code == LTU || code == GEU);
2209 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2210 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2211
2212 if (!nonimmediate_operand (lo[0], submode))
2213 lo[0] = force_reg (submode, lo[0]);
2214 if (!x86_64_general_operand (lo[1], submode))
2215 lo[1] = force_reg (submode, lo[1]);
2216
2217 if (!register_operand (hi[0], submode))
2218 hi[0] = force_reg (submode, hi[0]);
2219 if ((uns && !nonimmediate_operand (hi[1], submode))
2220 || (!uns && !x86_64_general_operand (hi[1], submode)))
2221 hi[1] = force_reg (submode, hi[1]);
2222
987a3082 2223 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2bf6d935 2224
2225 tmp = gen_rtx_SCRATCH (submode);
2226 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2bf6d935 2227
987a3082 2228 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2229 ix86_expand_branch (code, tmp, const0_rtx, label);
2230 return;
2231 }
2232
2233 default:
2234 break;
2235 }
2236
2237 /* Otherwise, we need two or three jumps. */
2238
2239 label2 = gen_label_rtx ();
2240
2241 code1 = code;
2242 code2 = swap_condition (code);
2243 code3 = unsigned_condition (code);
2244
2245 switch (code)
2246 {
2247 case LT: case GT: case LTU: case GTU:
2248 break;
2249
2250 case LE: code1 = LT; code2 = GT; break;
2251 case GE: code1 = GT; code2 = LT; break;
2252 case LEU: code1 = LTU; code2 = GTU; break;
2253 case GEU: code1 = GTU; code2 = LTU; break;
2254
2255 case EQ: code1 = UNKNOWN; code2 = NE; break;
2256 case NE: code2 = UNKNOWN; break;
2257
2258 default:
2259 gcc_unreachable ();
2260 }
2261
2262 /*
2263 * a < b =>
2264 * if (hi(a) < hi(b)) goto true;
2265 * if (hi(a) > hi(b)) goto false;
2266 * if (lo(a) < lo(b)) goto true;
2267 * false:
2268 */
2269
2270 if (code1 != UNKNOWN)
2271 ix86_expand_branch (code1, hi[0], hi[1], label);
2272 if (code2 != UNKNOWN)
2273 ix86_expand_branch (code2, hi[0], hi[1], label2);
2274
2275 ix86_expand_branch (code3, lo[0], lo[1], label);
2276
2277 if (code2 != UNKNOWN)
2278 emit_label (label2);
2279 return;
2280 }
2281
2282 default:
2283 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2284 goto simple;
2285 }
2286}
2287
2288/* Figure out whether to use unordered fp comparisons. */
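/* Roughly: in IEEE mode the ordered relations LT, LE, GT, GE and LTGT may use
   a signaling compare, since IEEE requires them to raise "invalid" even for
   quiet NaNs, while EQ, NE and the unordered predicates (UNLT, UNEQ, ORDERED,
   UNORDERED, ...) must not trap on quiet NaNs and so use the quiet form.  */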
2289
2290static bool
2291ix86_unordered_fp_compare (enum rtx_code code)
2292{
2293 if (!TARGET_IEEE_FP)
2294 return false;
2295
2296 switch (code)
2297 {
2298 case LT:
2299 case LE:
2300 case GT:
2301 case GE:
2302 case LTGT:
2303 return false;
2304
2305 case EQ:
2306 case NE:
2307
2308 case UNORDERED:
2309 case ORDERED:
2310 case UNLT:
2311 case UNLE:
2312 case UNGT:
2313 case UNGE:
2314 case UNEQ:
2315 return true;
2316
2317 default:
2318 gcc_unreachable ();
2319 }
2320}
2321
2322/* Return a comparison we can do that is equivalent to
2323   swap_condition (code), apart possibly from orderedness.
2324   But never change orderedness if TARGET_IEEE_FP, returning
2325   UNKNOWN in that case if necessary.  */
2326
2327static enum rtx_code
2328ix86_fp_swap_condition (enum rtx_code code)
2329{
2330 switch (code)
2331 {
2332 case GT: /* GTU - CF=0 & ZF=0 */
2333 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2334 case GE: /* GEU - CF=0 */
2335 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2336 case UNLT: /* LTU - CF=1 */
2337 return TARGET_IEEE_FP ? UNKNOWN : GT;
2338 case UNLE: /* LEU - CF=1 | ZF=1 */
2339 return TARGET_IEEE_FP ? UNKNOWN : GE;
2340 default:
2341 return swap_condition (code);
2342 }
2343}
2344
2345/* Return cost of comparison CODE using the best strategy for performance.
2346   All of the following functions use the number of instructions as the cost
2347   metric.  In the future this should be tweaked to compute bytes for
2348   optimize_size and to account for instruction performance on various CPUs.  */
2349
2350static int
2351ix86_fp_comparison_cost (enum rtx_code code)
2352{
2353 int arith_cost;
2354
2355 /* The cost of code using bit-twiddling on %ah. */
2356 switch (code)
2357 {
2358 case UNLE:
2359 case UNLT:
2360 case LTGT:
2361 case GT:
2362 case GE:
2363 case UNORDERED:
2364 case ORDERED:
2365 case UNEQ:
2366 arith_cost = 4;
2367 break;
2368 case LT:
2369 case NE:
2370 case EQ:
2371 case UNGE:
2372 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2373 break;
2374 case LE:
2375 case UNGT:
2376 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2377 break;
2378 default:
2379 gcc_unreachable ();
2380 }
2381
2382 switch (ix86_fp_comparison_strategy (code))
2383 {
2384 case IX86_FPCMP_COMI:
2385 return arith_cost > 4 ? 3 : 2;
2386 case IX86_FPCMP_SAHF:
2387 return arith_cost > 4 ? 4 : 3;
2388 default:
2389 return arith_cost;
2390 }
2391}
2392
2393/* Swap, force into registers, or otherwise massage the two operands
2394 to a fp comparison. The operands are updated in place; the new
2395 comparison code is returned. */
2396
2397static enum rtx_code
2398ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2399{
2400 bool unordered_compare = ix86_unordered_fp_compare (code);
2401 rtx op0 = *pop0, op1 = *pop1;
2402 machine_mode op_mode = GET_MODE (op0);
2403 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2404
2405 /* All of the unordered compare instructions only work on registers.
2406 The same is true of the fcomi compare instructions. The XFmode
2407 compare instructions require registers except when comparing
2408 against zero or when converting operand 1 from fixed point to
2409 floating point. */
2410
2411 if (!is_sse
2412 && (unordered_compare
2413 || (op_mode == XFmode
2414 && ! (standard_80387_constant_p (op0) == 1
2415 || standard_80387_constant_p (op1) == 1)
2416 && GET_CODE (op1) != FLOAT)
2417 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2418 {
2419 op0 = force_reg (op_mode, op0);
2420 op1 = force_reg (op_mode, op1);
2421 }
2422 else
2423 {
2424 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2425 things around if they appear profitable, otherwise force op0
2426 into a register. */
2427
2428 if (standard_80387_constant_p (op0) == 0
2429 || (MEM_P (op0)
2430 && ! (standard_80387_constant_p (op1) == 0
2431 || MEM_P (op1))))
2432 {
2433 enum rtx_code new_code = ix86_fp_swap_condition (code);
2434 if (new_code != UNKNOWN)
2435 {
2436 std::swap (op0, op1);
2437 code = new_code;
2438 }
2439 }
2440
2441 if (!REG_P (op0))
2442 op0 = force_reg (op_mode, op0);
2443
2444 if (CONSTANT_P (op1))
2445 {
2446 int tmp = standard_80387_constant_p (op1);
2447 if (tmp == 0)
2448 op1 = validize_mem (force_const_mem (op_mode, op1));
2449 else if (tmp == 1)
2450 {
2451 if (TARGET_CMOVE)
2452 op1 = force_reg (op_mode, op1);
2453 }
2454 else
2455 op1 = force_reg (op_mode, op1);
2456 }
2457 }
2458
2459 /* Try to rearrange the comparison to make it cheaper. */
2460 if (ix86_fp_comparison_cost (code)
2461 > ix86_fp_comparison_cost (swap_condition (code))
2462 && (REG_P (op1) || can_create_pseudo_p ()))
2463 {
2464 std::swap (op0, op1);
2465 code = swap_condition (code);
2466 if (!REG_P (op0))
2467 op0 = force_reg (op_mode, op0);
2468 }
2469
2470 *pop0 = op0;
2471 *pop1 = op1;
2472 return code;
2473}
2474
2475/* Generate insn patterns to do a floating point compare of OPERANDS. */
2476
2477static rtx
2478ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2479{
2480 bool unordered_compare = ix86_unordered_fp_compare (code);
2481 machine_mode cmp_mode;
2482 rtx tmp, scratch;
2483
2484 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2485
2486 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2487 if (unordered_compare)
2488 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2489
2490 /* Do fcomi/sahf based test when profitable. */
2491 switch (ix86_fp_comparison_strategy (code))
2492 {
2493 case IX86_FPCMP_COMI:
2494 cmp_mode = CCFPmode;
2495 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2496 break;
2497
2498 case IX86_FPCMP_SAHF:
2499 cmp_mode = CCFPmode;
2500 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2501 scratch = gen_reg_rtx (HImode);
2502 emit_insn (gen_rtx_SET (scratch, tmp));
2503 emit_insn (gen_x86_sahf_1 (scratch));
2504 break;
2505
2506 case IX86_FPCMP_ARITH:
2507 cmp_mode = CCNOmode;
2508 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2509 scratch = gen_reg_rtx (HImode);
2510 emit_insn (gen_rtx_SET (scratch, tmp));
2511
2512 /* In the unordered case, we have to check C2 for NaN's, which
2513 doesn't happen to work out to anything nice combination-wise.
2514 So do some bit twiddling on the value we've got in AH to come
2515 up with an appropriate set of condition codes. */
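      /* For reference: after fnstsw the x87 condition bits sit in AH as
	 C0 = 0x01, C2 = 0x04 and C3 = 0x40, so masks such as 0x45 (C0|C2|C3)
	 or 0x05 (C0|C2) below select combinations of "below", "unordered"
	 and "equal".  */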
2516
2517 switch (code)
2518 {
2519 case GT:
2520 case UNGT:
2521 if (code == GT || !TARGET_IEEE_FP)
2522 {
2523 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2524 code = EQ;
2525 }
2526 else
2527 {
2528 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2529 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2530 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2531 cmp_mode = CCmode;
2532 code = GEU;
2533 }
2534 break;
2535 case LT:
2536 case UNLT:
2537 if (code == LT && TARGET_IEEE_FP)
2538 {
2539 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2540 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2541 cmp_mode = CCmode;
2542 code = EQ;
2543 }
2544 else
2545 {
2546 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2547 code = NE;
2548 }
2549 break;
2550 case GE:
2551 case UNGE:
2552 if (code == GE || !TARGET_IEEE_FP)
2553 {
2554 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2555 code = EQ;
2556 }
2557 else
2558 {
2559 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2560 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2561 code = NE;
2562 }
2563 break;
2564 case LE:
2565 case UNLE:
2566 if (code == LE && TARGET_IEEE_FP)
2567 {
2568 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2569 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2570 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2571 cmp_mode = CCmode;
2572 code = LTU;
2573 }
2574 else
2575 {
2576 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2577 code = NE;
2578 }
2579 break;
2580 case EQ:
2581 case UNEQ:
2582 if (code == EQ && TARGET_IEEE_FP)
2583 {
2584 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2585 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2586 cmp_mode = CCmode;
2587 code = EQ;
2588 }
2589 else
2590 {
2591 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2592 code = NE;
2593 }
2594 break;
2595 case NE:
2596 case LTGT:
2597 if (code == NE && TARGET_IEEE_FP)
2598 {
2599 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2600 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2601 GEN_INT (0x40)));
2602 code = NE;
2603 }
2604 else
2605 {
2606 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2607 code = EQ;
2608 }
2609 break;
2610
2611 case UNORDERED:
2612 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2613 code = NE;
2614 break;
2615 case ORDERED:
2616 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2617 code = EQ;
2618 break;
2619
2620 default:
2621 gcc_unreachable ();
2622 }
2623 break;
2624
2625 default:
2626 gcc_unreachable();
2627 }
2628
2629 /* Return the test that should be put into the flags user, i.e.
2630 the bcc, scc, or cmov instruction. */
2631 return gen_rtx_fmt_ee (code, VOIDmode,
2632 gen_rtx_REG (cmp_mode, FLAGS_REG),
2633 const0_rtx);
2634}
2635
2636/* Generate insn patterns to do an integer compare of OPERANDS. */
2637
2638static rtx
2639ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2640{
2641 machine_mode cmpmode;
2642 rtx tmp, flags;
2643
2644 cmpmode = SELECT_CC_MODE (code, op0, op1);
2645 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2646
2647 /* This is very simple, but making the interface the same as in the
2648 FP case makes the rest of the code easier. */
2649 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2650 emit_insn (gen_rtx_SET (flags, tmp));
2651
2652 /* Return the test that should be put into the flags user, i.e.
2653 the bcc, scc, or cmov instruction. */
2654 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2655}
2656
2657static rtx
2658ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2659{
2660 rtx ret;
2661
2662 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2663 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2664
2665 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2666 {
2667 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2668 ret = ix86_expand_fp_compare (code, op0, op1);
2669 }
2670 else
2671 ret = ix86_expand_int_compare (code, op0, op1);
2672
2673 return ret;
2674}
2675
2676void
2677ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2678{
2679 rtx ret;
2680
2681 gcc_assert (GET_MODE (dest) == QImode);
2682
2683 ret = ix86_expand_compare (code, op0, op1);
2684 PUT_MODE (ret, QImode);
2685 emit_insn (gen_rtx_SET (dest, ret));
2686}
2687
2688/* Expand a comparison that sets or clears the carry flag.  Return true
2689   when successful and set *POP to the resulting comparison operation.  */
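/* A carry-flag form lets the caller consume the result directly with
   adc/sbb or a single cmov; e.g. (illustrative) a == 0 is rewritten below
   as (unsigned) a < 1, whose compare sets the carry flag exactly when
   a is zero.  */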
2690static bool
2691ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2692{
2693 machine_mode mode
2694 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2695
2696  /* Do not handle double-mode compares, which go through a special path.  */
2697 if (mode == (TARGET_64BIT ? TImode : DImode))
2698 return false;
2699
2700 if (SCALAR_FLOAT_MODE_P (mode))
2701 {
2702 rtx compare_op;
2703 rtx_insn *compare_seq;
2704
2705 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2706
2707      /* Shortcut: the following common codes never translate
2708	 into carry-flag compares.  */
2709 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2710 || code == ORDERED || code == UNORDERED)
2711 return false;
2712
2713 /* These comparisons require zero flag; swap operands so they won't. */
2714 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2715 && !TARGET_IEEE_FP)
2716 {
2717 std::swap (op0, op1);
2718 code = swap_condition (code);
2719 }
2720
2721      /* Try to expand the comparison and verify that we end up with
2722	 a carry-flag-based comparison.  This fails only when we decide to
2723	 expand the comparison using arithmetic, which is not a common
2724	 scenario.  */
2725 start_sequence ();
2726 compare_op = ix86_expand_fp_compare (code, op0, op1);
2727 compare_seq = get_insns ();
2728 end_sequence ();
2729
2730 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2731 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2732 else
2733 code = GET_CODE (compare_op);
2734
2735 if (code != LTU && code != GEU)
2736 return false;
2737
2738 emit_insn (compare_seq);
2739 *pop = compare_op;
2740 return true;
2741 }
2742
2743 if (!INTEGRAL_MODE_P (mode))
2744 return false;
2745
2746 switch (code)
2747 {
2748 case LTU:
2749 case GEU:
2750 break;
2751
2752 /* Convert a==0 into (unsigned)a<1. */
2753 case EQ:
2754 case NE:
2755 if (op1 != const0_rtx)
2756 return false;
2757 op1 = const1_rtx;
2758 code = (code == EQ ? LTU : GEU);
2759 break;
2760
2761    /* Convert a>b into b<a or a>=b+1.  */
2762 case GTU:
2763 case LEU:
2764 if (CONST_INT_P (op1))
2765 {
2766 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2767 /* Bail out on overflow. We still can swap operands but that
2768 would force loading of the constant into register. */
2769 if (op1 == const0_rtx
2770 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2771 return false;
2772 code = (code == GTU ? GEU : LTU);
2773 }
2774 else
2775 {
2776 std::swap (op0, op1);
2777 code = (code == GTU ? LTU : GEU);
2778 }
2779 break;
2780
2781 /* Convert a>=0 into (unsigned)a<0x80000000. */
2782 case LT:
2783 case GE:
2784 if (mode == DImode || op1 != const0_rtx)
2785 return false;
2786 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2787 code = (code == LT ? GEU : LTU);
2788 break;
2789 case LE:
2790 case GT:
2791 if (mode == DImode || op1 != constm1_rtx)
2792 return false;
2793 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2794 code = (code == LE ? GEU : LTU);
2795 break;
2796
2797 default:
2798 return false;
2799 }
2800  /* Swapping operands may cause a constant to appear as the first operand.  */
2801 if (!nonimmediate_operand (op0, VOIDmode))
2802 {
2803 if (!can_create_pseudo_p ())
2804 return false;
2805 op0 = force_reg (mode, op0);
2806 }
2807 *pop = ix86_expand_compare (code, op0, op1);
2808 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2809 return true;
2810}
2811
2812/* Expand conditional increment or decrement using adc/sbb instructions.
2813 The default case using setcc followed by the conditional move can be
2814 done by generic code. */
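/* Illustrative example of the transformation (hypothetical source):

     if (a < b)       // unsigned comparison
       x += 1;

   becomes roughly

     cmp  a, b        ; CF = (a < b)
     adc  $0, x       ; x += CF

   with sbb used instead when the constant is -1 or the condition is
   reversed.  */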
2815bool
2816ix86_expand_int_addcc (rtx operands[])
2817{
2818 enum rtx_code code = GET_CODE (operands[1]);
2819 rtx flags;
987a3082 2820 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2821 rtx compare_op;
2822 rtx val = const0_rtx;
2823 bool fpcmp = false;
2824 machine_mode mode;
2825 rtx op0 = XEXP (operands[1], 0);
2826 rtx op1 = XEXP (operands[1], 1);
2827
2828 if (operands[3] != const1_rtx
2829 && operands[3] != constm1_rtx)
2830 return false;
2831 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2832 return false;
2833 code = GET_CODE (compare_op);
2834
2835 flags = XEXP (compare_op, 0);
2836
2837 if (GET_MODE (flags) == CCFPmode)
2838 {
2839 fpcmp = true;
2840 code = ix86_fp_compare_code_to_integer (code);
2841 }
2842
2843 if (code != LTU)
2844 {
2845 val = constm1_rtx;
2846 if (fpcmp)
2847 PUT_CODE (compare_op,
2848 reverse_condition_maybe_unordered
2849 (GET_CODE (compare_op)));
2850 else
2851 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2852 }
2853
2854 mode = GET_MODE (operands[0]);
2855
2856 /* Construct either adc or sbb insn. */
2857 if ((code == LTU) == (operands[3] == constm1_rtx))
987a3082 2858 insn = gen_sub3_carry;
2bf6d935 2859 else
2860 insn = gen_add3_carry;
2861
2862 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2863
2864 return true;
2865}
2866
2867bool
2868ix86_expand_int_movcc (rtx operands[])
2869{
2870 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2871 rtx_insn *compare_seq;
2872 rtx compare_op;
2873 machine_mode mode = GET_MODE (operands[0]);
2874 bool sign_bit_compare_p = false;
2875 rtx op0 = XEXP (operands[1], 0);
2876 rtx op1 = XEXP (operands[1], 1);
2877
2878 if (GET_MODE (op0) == TImode
2879 || (GET_MODE (op0) == DImode
2880 && !TARGET_64BIT))
2881 return false;
2882
2883 start_sequence ();
2884 compare_op = ix86_expand_compare (code, op0, op1);
2885 compare_seq = get_insns ();
2886 end_sequence ();
2887
2888 compare_code = GET_CODE (compare_op);
2889
2890 if ((op1 == const0_rtx && (code == GE || code == LT))
2891 || (op1 == constm1_rtx && (code == GT || code == LE)))
2892 sign_bit_compare_p = true;
2893
2894 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2895 HImode insns, we'd be swallowed in word prefix ops. */
2896
2897 if ((mode != HImode || TARGET_FAST_PREFIX)
2898 && (mode != (TARGET_64BIT ? TImode : DImode))
2899 && CONST_INT_P (operands[2])
2900 && CONST_INT_P (operands[3]))
2901 {
2902 rtx out = operands[0];
2903 HOST_WIDE_INT ct = INTVAL (operands[2]);
2904 HOST_WIDE_INT cf = INTVAL (operands[3]);
2905 HOST_WIDE_INT diff;
2906
2907 diff = ct - cf;
2908 /* Sign bit compares are better done using shifts than we do by using
2909 sbb. */
2910 if (sign_bit_compare_p
2911 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2912 {
2913 /* Detect overlap between destination and compare sources. */
2914 rtx tmp = out;
2915
2916 if (!sign_bit_compare_p)
2917 {
2918 rtx flags;
2919 bool fpcmp = false;
2920
2921 compare_code = GET_CODE (compare_op);
2922
2923 flags = XEXP (compare_op, 0);
2924
2925 if (GET_MODE (flags) == CCFPmode)
2926 {
2927 fpcmp = true;
2928 compare_code
2929 = ix86_fp_compare_code_to_integer (compare_code);
2930 }
2931
2932 /* To simplify rest of code, restrict to the GEU case. */
2933 if (compare_code == LTU)
2934 {
2935 std::swap (ct, cf);
2936 compare_code = reverse_condition (compare_code);
2937 code = reverse_condition (code);
2938 }
2939 else
2940 {
2941 if (fpcmp)
2942 PUT_CODE (compare_op,
2943 reverse_condition_maybe_unordered
2944 (GET_CODE (compare_op)));
2945 else
2946 PUT_CODE (compare_op,
2947 reverse_condition (GET_CODE (compare_op)));
2948 }
2949 diff = ct - cf;
2950
2951 if (reg_overlap_mentioned_p (out, op0)
2952 || reg_overlap_mentioned_p (out, op1))
2953 tmp = gen_reg_rtx (mode);
2954
2955 if (mode == DImode)
2956 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2957 else
2958 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2959 flags, compare_op));
2960 }
2961 else
2962 {
2963 if (code == GT || code == GE)
2964 code = reverse_condition (code);
2965 else
2966 {
2967 std::swap (ct, cf);
2968 diff = ct - cf;
2969 }
2970 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2971 }
2972
2973 if (diff == 1)
2974 {
2975 /*
2976 * cmpl op0,op1
2977 * sbbl dest,dest
2978 * [addl dest, ct]
2979 *
2980 * Size 5 - 8.
2981 */
2982 if (ct)
2983 tmp = expand_simple_binop (mode, PLUS,
2984 tmp, GEN_INT (ct),
2985 copy_rtx (tmp), 1, OPTAB_DIRECT);
2986 }
2987 else if (cf == -1)
2988 {
2989 /*
2990 * cmpl op0,op1
2991 * sbbl dest,dest
2992 * orl $ct, dest
2993 *
2994 * Size 8.
2995 */
2996 tmp = expand_simple_binop (mode, IOR,
2997 tmp, GEN_INT (ct),
2998 copy_rtx (tmp), 1, OPTAB_DIRECT);
2999 }
3000 else if (diff == -1 && ct)
3001 {
3002 /*
3003 * cmpl op0,op1
3004 * sbbl dest,dest
3005 * notl dest
3006 * [addl dest, cf]
3007 *
3008 * Size 8 - 11.
3009 */
3010 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3011 if (cf)
3012 tmp = expand_simple_binop (mode, PLUS,
3013 copy_rtx (tmp), GEN_INT (cf),
3014 copy_rtx (tmp), 1, OPTAB_DIRECT);
3015 }
3016 else
3017 {
3018 /*
3019 * cmpl op0,op1
3020 * sbbl dest,dest
3021 * [notl dest]
3022 * andl cf - ct, dest
3023 * [addl dest, ct]
3024 *
3025 * Size 8 - 11.
3026 */
3027
3028 if (cf == 0)
3029 {
3030 cf = ct;
3031 ct = 0;
3032 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3033 }
3034
3035 tmp = expand_simple_binop (mode, AND,
3036 copy_rtx (tmp),
3037 gen_int_mode (cf - ct, mode),
3038 copy_rtx (tmp), 1, OPTAB_DIRECT);
3039 if (ct)
3040 tmp = expand_simple_binop (mode, PLUS,
3041 copy_rtx (tmp), GEN_INT (ct),
3042 copy_rtx (tmp), 1, OPTAB_DIRECT);
3043 }
3044
3045 if (!rtx_equal_p (tmp, out))
3046 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3047
3048 return true;
3049 }
3050
3051 if (diff < 0)
3052 {
3053 machine_mode cmp_mode = GET_MODE (op0);
3054 enum rtx_code new_code;
3055
3056 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3057 {
3058 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3059
3060 /* We may be reversing a non-trapping
3061 comparison to a trapping comparison. */
3062 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3063 && code != EQ && code != NE
3064 && code != ORDERED && code != UNORDERED)
3065 new_code = UNKNOWN;
3066 else
3067 new_code = reverse_condition_maybe_unordered (code);
3068 }
3069 else
3070 new_code = ix86_reverse_condition (code, cmp_mode);
3071 if (new_code != UNKNOWN)
3072 {
3073 std::swap (ct, cf);
3074 diff = -diff;
3075 code = new_code;
3076 }
3077 }
3078
3079 compare_code = UNKNOWN;
3080 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3081 && CONST_INT_P (op1))
3082 {
3083 if (op1 == const0_rtx
3084 && (code == LT || code == GE))
3085 compare_code = code;
3086 else if (op1 == constm1_rtx)
3087 {
3088 if (code == LE)
3089 compare_code = LT;
3090 else if (code == GT)
3091 compare_code = GE;
3092 }
3093 }
3094
3095 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3096 if (compare_code != UNKNOWN
3097 && GET_MODE (op0) == GET_MODE (out)
3098 && (cf == -1 || ct == -1))
3099 {
3100 /* If lea code below could be used, only optimize
3101 if it results in a 2 insn sequence. */
3102
3103 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3104 || diff == 3 || diff == 5 || diff == 9)
3105 || (compare_code == LT && ct == -1)
3106 || (compare_code == GE && cf == -1))
3107 {
3108 /*
3109 * notl op1 (if necessary)
3110 * sarl $31, op1
3111 * orl cf, op1
3112 */
3113 if (ct != -1)
3114 {
3115 cf = ct;
3116 ct = -1;
3117 code = reverse_condition (code);
3118 }
3119
3120 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3121
3122 out = expand_simple_binop (mode, IOR,
3123 out, GEN_INT (cf),
3124 out, 1, OPTAB_DIRECT);
3125 if (out != operands[0])
3126 emit_move_insn (operands[0], out);
3127
3128 return true;
3129 }
3130 }
3131
3132
3133 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3134 || diff == 3 || diff == 5 || diff == 9)
3135 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3136 && (mode != DImode
3137 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3138 {
3139 /*
3140 * xorl dest,dest
3141 * cmpl op1,op2
3142 * setcc dest
3143 * lea cf(dest*(ct-cf)),dest
3144 *
3145 * Size 14.
3146 *
3147 * This also catches the degenerate setcc-only case.
3148 */
3149
3150 rtx tmp;
3151 int nops;
3152
3153 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3154
3155 nops = 0;
3156	  /* On x86_64 the lea instruction operates on Pmode, so we need
3157	     to do the arithmetic in the proper mode to match.  */
3158 if (diff == 1)
3159 tmp = copy_rtx (out);
3160 else
3161 {
3162 rtx out1;
3163 out1 = copy_rtx (out);
3164 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3165 nops++;
3166 if (diff & 1)
3167 {
3168 tmp = gen_rtx_PLUS (mode, tmp, out1);
3169 nops++;
3170 }
3171 }
3172 if (cf != 0)
3173 {
c3185b64 3174 tmp = plus_constant (mode, tmp, cf);
3175 nops++;
3176 }
3177 if (!rtx_equal_p (tmp, out))
3178 {
3179 if (nops == 1)
3180 out = force_operand (tmp, copy_rtx (out));
3181 else
3182 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3183 }
3184 if (!rtx_equal_p (out, operands[0]))
3185 emit_move_insn (operands[0], copy_rtx (out));
3186
3187 return true;
3188 }
3189
3190 /*
3191 * General case: Jumpful:
3192 * xorl dest,dest cmpl op1, op2
3193 * cmpl op1, op2 movl ct, dest
3194 * setcc dest jcc 1f
3195 * decl dest movl cf, dest
3196 * andl (cf-ct),dest 1:
3197 * addl ct,dest
3198 *
3199 * Size 20. Size 14.
3200 *
3201 * This is reasonably steep, but branch mispredict costs are
3202 * high on modern cpus, so consider failing only if optimizing
3203 * for space.
3204 */
3205
3206 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3207 && BRANCH_COST (optimize_insn_for_speed_p (),
3208 false) >= 2)
3209 {
3210 if (cf == 0)
3211 {
3212 machine_mode cmp_mode = GET_MODE (op0);
3213 enum rtx_code new_code;
3214
3215 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3216 {
3217 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3218
3219 /* We may be reversing a non-trapping
3220 comparison to a trapping comparison. */
3221 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3222 && code != EQ && code != NE
3223 && code != ORDERED && code != UNORDERED)
3224 new_code = UNKNOWN;
3225 else
3226 new_code = reverse_condition_maybe_unordered (code);
3227
3228 }
3229 else
3230 {
3231 new_code = ix86_reverse_condition (code, cmp_mode);
3232 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3233 compare_code = reverse_condition (compare_code);
3234 }
3235
3236 if (new_code != UNKNOWN)
3237 {
3238 cf = ct;
3239 ct = 0;
3240 code = new_code;
3241 }
3242 }
3243
3244 if (compare_code != UNKNOWN)
3245 {
3246 /* notl op1 (if needed)
3247 sarl $31, op1
3248 andl (cf-ct), op1
3249 addl ct, op1
3250
3251 For x < 0 (resp. x <= -1) there will be no notl,
3252 so if possible swap the constants to get rid of the
3253 complement.
3254 True/false will be -1/0 while code below (store flag
3255 followed by decrement) is 0/-1, so the constants need
3256 to be exchanged once more. */
3257
3258 if (compare_code == GE || !cf)
3259 {
3260 code = reverse_condition (code);
3261 compare_code = LT;
3262 }
3263 else
3264 std::swap (ct, cf);
3265
3266 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3267 }
3268 else
3269 {
3270 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3271
3272 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3273 constm1_rtx,
3274 copy_rtx (out), 1, OPTAB_DIRECT);
3275 }
3276
3277 out = expand_simple_binop (mode, AND, copy_rtx (out),
3278 gen_int_mode (cf - ct, mode),
3279 copy_rtx (out), 1, OPTAB_DIRECT);
3280 if (ct)
3281 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3282 copy_rtx (out), 1, OPTAB_DIRECT);
3283 if (!rtx_equal_p (out, operands[0]))
3284 emit_move_insn (operands[0], copy_rtx (out));
3285
3286 return true;
3287 }
3288 }
3289
3290 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3291 {
3292 /* Try a few things more with specific constants and a variable. */
3293
3294 optab op;
3295 rtx var, orig_out, out, tmp;
3296
3297 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3298 return false;
3299
3300 /* If one of the two operands is an interesting constant, load a
3301 constant with the above and mask it in with a logical operation. */
3302
3303 if (CONST_INT_P (operands[2]))
3304 {
3305 var = operands[3];
3306 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3307 operands[3] = constm1_rtx, op = and_optab;
3308 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3309 operands[3] = const0_rtx, op = ior_optab;
3310 else
3311 return false;
3312 }
3313 else if (CONST_INT_P (operands[3]))
3314 {
3315 var = operands[2];
3316 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3317	    operands[2] = constm1_rtx, op = and_optab;
3318	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3319 operands[2] = const0_rtx, op = ior_optab;
3320 else
3321 return false;
3322 }
3323 else
3324 return false;
3325
3326 orig_out = operands[0];
3327 tmp = gen_reg_rtx (mode);
3328 operands[0] = tmp;
3329
3330 /* Recurse to get the constant loaded. */
3331 if (!ix86_expand_int_movcc (operands))
3332 return false;
3333
3334 /* Mask in the interesting variable. */
3335 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3336 OPTAB_WIDEN);
3337 if (!rtx_equal_p (out, orig_out))
3338 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3339
3340 return true;
3341 }
3342
3343 /*
3344 * For comparison with above,
3345 *
3346 * movl cf,dest
3347 * movl ct,tmp
3348 * cmpl op1,op2
3349 * cmovcc tmp,dest
3350 *
3351 * Size 15.
3352 */
3353
3354 if (! nonimmediate_operand (operands[2], mode))
3355 operands[2] = force_reg (mode, operands[2]);
3356 if (! nonimmediate_operand (operands[3], mode))
3357 operands[3] = force_reg (mode, operands[3]);
3358
3359 if (! register_operand (operands[2], VOIDmode)
3360 && (mode == QImode
3361 || ! register_operand (operands[3], VOIDmode)))
3362 operands[2] = force_reg (mode, operands[2]);
3363
3364 if (mode == QImode
3365 && ! register_operand (operands[3], VOIDmode))
3366 operands[3] = force_reg (mode, operands[3]);
3367
3368 emit_insn (compare_seq);
3369 emit_insn (gen_rtx_SET (operands[0],
3370 gen_rtx_IF_THEN_ELSE (mode,
3371 compare_op, operands[2],
3372 operands[3])));
3373 return true;
3374}
3375
3376/* Detect conditional moves that exactly match min/max operational
3377 semantics. Note that this is IEEE safe, as long as we don't
3378 interchange the operands.
3379
3380 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3381 and TRUE if the operation is successful and instructions are emitted. */
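/* For reference: the SSE min/max instructions are not commutative;
   minps (a, b) is a < b ? a : b, so the second operand is returned for
   NaN operands and for +0.0 vs -0.0, which is why only the exact LT/UNGE
   shapes are matched and the operand order is preserved.  */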
3382
3383static bool
3384ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3385 rtx cmp_op1, rtx if_true, rtx if_false)
3386{
3387 machine_mode mode;
3388 bool is_min;
3389 rtx tmp;
3390
3391 if (code == LT)
3392 ;
3393 else if (code == UNGE)
3394 std::swap (if_true, if_false);
3395 else
3396 return false;
3397
3398 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3399 is_min = true;
3400 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3401 is_min = false;
3402 else
3403 return false;
3404
3405 mode = GET_MODE (dest);
3406
3407 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3408 but MODE may be a vector mode and thus not appropriate. */
3409 if (!flag_finite_math_only || flag_signed_zeros)
3410 {
3411 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3412 rtvec v;
3413
3414 if_true = force_reg (mode, if_true);
3415 v = gen_rtvec (2, if_true, if_false);
3416 tmp = gen_rtx_UNSPEC (mode, v, u);
3417 }
3418 else
3419 {
3420 code = is_min ? SMIN : SMAX;
3421 if (MEM_P (if_true) && MEM_P (if_false))
3422 if_true = force_reg (mode, if_true);
3423 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3424 }
3425
3426 emit_insn (gen_rtx_SET (dest, tmp));
3427 return true;
3428}
3429
3430/* Return true if MODE is valid for a vector comparison into a mask register,
3431   and likewise for a conditional vector move using a mask register.  */
3432static bool
3433ix86_valid_mask_cmp_mode (machine_mode mode)
3434{
3435 /* XOP has its own vector conditional movement. */
a8654147 3436 if (TARGET_XOP && !TARGET_AVX512F)
3437 return false;
3438
3439 /* AVX512F is needed for mask operation. */
3440 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3441 return false;
3442
3443 /* AVX512BW is needed for vector QI/HImode,
3444 AVX512VL is needed for 128/256-bit vector. */
3445 machine_mode inner_mode = GET_MODE_INNER (mode);
3446 int vector_size = GET_MODE_SIZE (mode);
3447 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3448 return false;
3449
3450 return vector_size == 64 || TARGET_AVX512VL;
3451}
3452
3453/* Expand an SSE comparison. Return the register with the result. */
3454
3455static rtx
3456ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3457 rtx op_true, rtx op_false)
3458{
3459 machine_mode mode = GET_MODE (dest);
3460 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3461
3462  /* In the general case, the comparison result type can differ from the operands' type.  */
3463 machine_mode cmp_mode;
3464
3465 /* In AVX512F the result of comparison is an integer mask. */
3466 bool maskcmp = false;
3467 rtx x;
3468
8b905e9b 3469 if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
3470 {
3471 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
2bf6d935 3472 maskcmp = true;
8b905e9b 3473 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3474 }
3475 else
3476 cmp_mode = cmp_ops_mode;
3477
3478 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3479
3480 int (*op1_predicate)(rtx, machine_mode)
3481 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3482
3483 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3484 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3485
3486 if (optimize
3487 || (maskcmp && cmp_mode != mode)
3488 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3489 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3490 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3491
3492 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3493
3494 if (cmp_mode != mode && !maskcmp)
3495 {
3496 x = force_reg (cmp_ops_mode, x);
3497 convert_move (dest, x, false);
3498 }
3499 else
3500 emit_insn (gen_rtx_SET (dest, x));
3501
3502 return dest;
3503}
3504
3505/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3506 operations. This is used for both scalar and vector conditional moves. */
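/* When no blend or mask instruction applies, the fallback at the end of
   this function uses the classic mask-select idiom (illustrative):

     dest = (cmp & op_true) | (~cmp & op_false);

   which relies on CMP being an all-ones / all-zeros element mask.  */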
3507
3508void
3509ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3510{
3511 machine_mode mode = GET_MODE (dest);
3512 machine_mode cmpmode = GET_MODE (cmp);
3513
3514 /* In AVX512F the result of comparison is an integer mask. */
8b905e9b 3515 bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
3516
3517 rtx t2, t3, x;
3518
3519 /* If we have an integer mask and FP value then we need
3520 to cast mask to FP mode. */
3521 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3522 {
3523 cmp = force_reg (cmpmode, cmp);
3524 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3525 }
3526
3527 if (maskcmp)
3528 {
3529 /* Using vector move with mask register. */
3530 cmp = force_reg (cmpmode, cmp);
3531 /* Optimize for mask zero. */
3532 op_true = (op_true != CONST0_RTX (mode)
3533 ? force_reg (mode, op_true) : op_true);
3534 op_false = (op_false != CONST0_RTX (mode)
3535 ? force_reg (mode, op_false) : op_false);
3536 if (op_true == CONST0_RTX (mode))
2bf6d935 3537 {
3538 rtx (*gen_not) (rtx, rtx);
3539 switch (cmpmode)
2bf6d935 3540 {
3541 case E_QImode: gen_not = gen_knotqi; break;
3542 case E_HImode: gen_not = gen_knothi; break;
3543 case E_SImode: gen_not = gen_knotsi; break;
3544 case E_DImode: gen_not = gen_knotdi; break;
3545 default: gcc_unreachable ();
2bf6d935 3546 }
3547 rtx n = gen_reg_rtx (cmpmode);
3548 emit_insn (gen_not (n, cmp));
3549 cmp = n;
3550	  /* Swap op_true and op_false.  */
3551 std::swap (op_true, op_false);
2bf6d935 3552 }
3553
3554 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3555 emit_insn (gen_rtx_SET (dest, vec_merge));
3556 return;
3557 }
3558 else if (vector_all_ones_operand (op_true, mode)
3559 && op_false == CONST0_RTX (mode))
3560 {
3561 emit_insn (gen_rtx_SET (dest, cmp));
3562 return;
3563 }
3564 else if (op_false == CONST0_RTX (mode))
3565 {
3566 op_true = force_reg (mode, op_true);
3567 x = gen_rtx_AND (mode, cmp, op_true);
3568 emit_insn (gen_rtx_SET (dest, x));
3569 return;
3570 }
3571 else if (op_true == CONST0_RTX (mode))
3572 {
3573 op_false = force_reg (mode, op_false);
3574 x = gen_rtx_NOT (mode, cmp);
3575 x = gen_rtx_AND (mode, x, op_false);
3576 emit_insn (gen_rtx_SET (dest, x));
3577 return;
3578 }
3579 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3580 {
3581 op_false = force_reg (mode, op_false);
3582 x = gen_rtx_IOR (mode, cmp, op_false);
3583 emit_insn (gen_rtx_SET (dest, x));
3584 return;
3585 }
3586 else if (TARGET_XOP)
3587 {
3588 op_true = force_reg (mode, op_true);
3589
3590 if (!nonimmediate_operand (op_false, mode))
3591 op_false = force_reg (mode, op_false);
3592
3593 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3594 op_true,
3595 op_false)));
3596 return;
3597 }
3598
3599 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3600 rtx d = dest;
3601
3602 if (!vector_operand (op_true, mode))
3603 op_true = force_reg (mode, op_true);
3604
3605 op_false = force_reg (mode, op_false);
3606
3607 switch (mode)
3608 {
3609 case E_V4SFmode:
3610 if (TARGET_SSE4_1)
3611 gen = gen_sse4_1_blendvps;
3612 break;
3613 case E_V2DFmode:
3614 if (TARGET_SSE4_1)
3615 gen = gen_sse4_1_blendvpd;
3616 break;
3617 case E_SFmode:
3618 if (TARGET_SSE4_1)
3619 {
3620 gen = gen_sse4_1_blendvss;
3621 op_true = force_reg (mode, op_true);
3622 }
3623 break;
3624 case E_DFmode:
3625 if (TARGET_SSE4_1)
3626 {
3627 gen = gen_sse4_1_blendvsd;
3628 op_true = force_reg (mode, op_true);
3629 }
3630 break;
3631 case E_V16QImode:
3632 case E_V8HImode:
3633 case E_V4SImode:
3634 case E_V2DImode:
3635 if (TARGET_SSE4_1)
3636 {
3637 gen = gen_sse4_1_pblendvb;
3638 if (mode != V16QImode)
3639 d = gen_reg_rtx (V16QImode);
3640 op_false = gen_lowpart (V16QImode, op_false);
3641 op_true = gen_lowpart (V16QImode, op_true);
3642 cmp = gen_lowpart (V16QImode, cmp);
3643 }
3644 break;
3645 case E_V8SFmode:
3646 if (TARGET_AVX)
3647 gen = gen_avx_blendvps256;
3648 break;
3649 case E_V4DFmode:
3650 if (TARGET_AVX)
3651 gen = gen_avx_blendvpd256;
3652 break;
3653 case E_V32QImode:
3654 case E_V16HImode:
3655 case E_V8SImode:
3656 case E_V4DImode:
3657 if (TARGET_AVX2)
3658 {
3659 gen = gen_avx2_pblendvb;
3660 if (mode != V32QImode)
3661 d = gen_reg_rtx (V32QImode);
3662 op_false = gen_lowpart (V32QImode, op_false);
3663 op_true = gen_lowpart (V32QImode, op_true);
3664 cmp = gen_lowpart (V32QImode, cmp);
3665 }
3666 break;
3667
3668 case E_V64QImode:
3669 gen = gen_avx512bw_blendmv64qi;
3670 break;
3671 case E_V32HImode:
3672 gen = gen_avx512bw_blendmv32hi;
3673 break;
3674 case E_V16SImode:
3675 gen = gen_avx512f_blendmv16si;
3676 break;
3677 case E_V8DImode:
3678 gen = gen_avx512f_blendmv8di;
3679 break;
3680 case E_V8DFmode:
3681 gen = gen_avx512f_blendmv8df;
3682 break;
3683 case E_V16SFmode:
3684 gen = gen_avx512f_blendmv16sf;
3685 break;
3686
3687 default:
3688 break;
3689 }
3690
3691 if (gen != NULL)
3692 {
3693 emit_insn (gen (d, op_false, op_true, cmp));
3694 if (d != dest)
3695 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3696 }
3697 else
3698 {
3699 op_true = force_reg (mode, op_true);
3700
3701 t2 = gen_reg_rtx (mode);
3702 if (optimize)
3703 t3 = gen_reg_rtx (mode);
3704 else
3705 t3 = dest;
3706
3707 x = gen_rtx_AND (mode, op_true, cmp);
3708 emit_insn (gen_rtx_SET (t2, x));
3709
3710 x = gen_rtx_NOT (mode, cmp);
3711 x = gen_rtx_AND (mode, x, op_false);
3712 emit_insn (gen_rtx_SET (t3, x));
3713
3714 x = gen_rtx_IOR (mode, t3, t2);
3715 emit_insn (gen_rtx_SET (dest, x));
3716 }
3717}
3718
3719/* Swap, force into registers, or otherwise massage the two operands
3720 to an sse comparison with a mask result. Thus we differ a bit from
3721 ix86_prepare_fp_compare_args which expects to produce a flags result.
3722
3723 The DEST operand exists to help determine whether to commute commutative
3724 operators. The POP0/POP1 operands are updated in place. The new
3725 comparison code is returned, or UNKNOWN if not implementable. */
3726
3727static enum rtx_code
3728ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3729 rtx *pop0, rtx *pop1)
3730{
3731 switch (code)
3732 {
3733 case LTGT:
3734 case UNEQ:
3735 /* AVX supports all the needed comparisons. */
3736 if (TARGET_AVX)
3737 break;
3738 /* We have no LTGT as an operator. We could implement it with
3739 NE & ORDERED, but this requires an extra temporary. It's
3740 not clear that it's worth it. */
3741 return UNKNOWN;
3742
3743 case LT:
3744 case LE:
3745 case UNGT:
3746 case UNGE:
3747 /* These are supported directly. */
3748 break;
3749
3750 case EQ:
3751 case NE:
3752 case UNORDERED:
3753 case ORDERED:
3754 /* AVX has 3 operand comparisons, no need to swap anything. */
3755 if (TARGET_AVX)
3756 break;
3757 /* For commutative operators, try to canonicalize the destination
3758 operand to be first in the comparison - this helps reload to
3759 avoid extra moves. */
3760 if (!dest || !rtx_equal_p (dest, *pop1))
3761 break;
3762 /* FALLTHRU */
3763
3764 case GE:
3765 case GT:
3766 case UNLE:
3767 case UNLT:
3768 /* These are not supported directly before AVX, and furthermore
3769 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3770 comparison operands to transform into something that is
3771 supported. */
3772 std::swap (*pop0, *pop1);
3773 code = swap_condition (code);
3774 break;
3775
3776 default:
3777 gcc_unreachable ();
3778 }
3779
3780 return code;
3781}
3782
3783/* Expand a floating-point conditional move. Return true if successful. */
3784
3785bool
3786ix86_expand_fp_movcc (rtx operands[])
3787{
3788 machine_mode mode = GET_MODE (operands[0]);
3789 enum rtx_code code = GET_CODE (operands[1]);
3790 rtx tmp, compare_op;
3791 rtx op0 = XEXP (operands[1], 0);
3792 rtx op1 = XEXP (operands[1], 1);
3793
3794 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3795 {
3796 machine_mode cmode;
3797
3798 /* Since we've no cmove for sse registers, don't force bad register
3799 allocation just to gain access to it. Deny movcc when the
3800 comparison mode doesn't match the move mode. */
3801 cmode = GET_MODE (op0);
3802 if (cmode == VOIDmode)
3803 cmode = GET_MODE (op1);
3804 if (cmode != mode)
3805 return false;
3806
3807 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3808 if (code == UNKNOWN)
3809 return false;
3810
3811 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3812 operands[2], operands[3]))
3813 return true;
3814
3815 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3816 operands[2], operands[3]);
3817 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3818 return true;
3819 }
3820
3821 if (GET_MODE (op0) == TImode
3822 || (GET_MODE (op0) == DImode
3823 && !TARGET_64BIT))
3824 return false;
3825
3826 /* The floating point conditional move instructions don't directly
3827 support conditions resulting from a signed integer comparison. */
3828
3829 compare_op = ix86_expand_compare (code, op0, op1);
3830 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3831 {
3832 tmp = gen_reg_rtx (QImode);
3833 ix86_expand_setcc (tmp, code, op0, op1);
3834
3835 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3836 }
3837
3838 emit_insn (gen_rtx_SET (operands[0],
3839 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3840 operands[2], operands[3])));
3841
3842 return true;
3843}
3844
3845/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3846
3847static int
3848ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3849{
3850 switch (code)
3851 {
3852 case EQ:
3853 return 0;
3854 case LT:
3855 case LTU:
3856 return 1;
3857 case LE:
3858 case LEU:
3859 return 2;
3860 case NE:
3861 return 4;
3862 case GE:
3863 case GEU:
3864 return 5;
3865 case GT:
3866 case GTU:
3867 return 6;
3868 default:
3869 gcc_unreachable ();
3870 }
3871}
3872
3873/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
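/* The values below follow the VCMPPS/VCMPPD predicate-immediate encoding,
   e.g. 0x00 is EQ_OQ, 0x01 LT_OS, 0x0e GT_OS and 0x03 the unordered test
   (illustrative subset; see the instruction set reference for the rest).  */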
3874
3875static int
3876ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3877{
3878 switch (code)
3879 {
3880 case EQ:
3881 return 0x00;
3882 case NE:
3883 return 0x04;
3884 case GT:
3885 return 0x0e;
3886 case LE:
3887 return 0x02;
3888 case GE:
3889 return 0x0d;
3890 case LT:
3891 return 0x01;
3892 case UNLE:
3893 return 0x0a;
3894 case UNLT:
3895 return 0x09;
3896 case UNGE:
3897 return 0x05;
3898 case UNGT:
3899 return 0x06;
3900 case UNEQ:
3901 return 0x18;
3902 case LTGT:
3903 return 0x0c;
3904 case ORDERED:
3905 return 0x07;
3906 case UNORDERED:
3907 return 0x03;
3908 default:
3909 gcc_unreachable ();
3910 }
3911}
3912
3913/* Return immediate value to be used in UNSPEC_PCMP
3914 for comparison CODE in MODE. */
3915
3916static int
3917ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3918{
3919 if (FLOAT_MODE_P (mode))
3920 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3921 return ix86_int_cmp_code_to_pcmp_immediate (code);
3922}
3923
3924/* Expand AVX-512 vector comparison. */
3925
3926bool
3927ix86_expand_mask_vec_cmp (rtx operands[])
3928{
3929 machine_mode mask_mode = GET_MODE (operands[0]);
3930 machine_mode cmp_mode = GET_MODE (operands[2]);
3931 enum rtx_code code = GET_CODE (operands[1]);
3932 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3933 int unspec_code;
3934 rtx unspec;
3935
3936 switch (code)
3937 {
3938 case LEU:
3939 case GTU:
3940 case GEU:
3941 case LTU:
3942 unspec_code = UNSPEC_UNSIGNED_PCMP;
3943 break;
3944
3945 default:
3946 unspec_code = UNSPEC_PCMP;
3947 }
3948
3949 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
3950 operands[3], imm),
3951 unspec_code);
3952 emit_insn (gen_rtx_SET (operands[0], unspec));
3953
3954 return true;
3955}
3956
3957/* Expand fp vector comparison. */
3958
3959bool
3960ix86_expand_fp_vec_cmp (rtx operands[])
3961{
3962 enum rtx_code code = GET_CODE (operands[1]);
3963 rtx cmp;
3964
3965 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
3966 &operands[2], &operands[3]);
3967 if (code == UNKNOWN)
3968 {
3969 rtx temp;
3970 switch (GET_CODE (operands[1]))
3971 {
3972 case LTGT:
3973 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
3974 operands[3], NULL, NULL);
3975 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
3976 operands[3], NULL, NULL);
3977 code = AND;
3978 break;
3979 case UNEQ:
3980 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
3981 operands[3], NULL, NULL);
3982 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
3983 operands[3], NULL, NULL);
3984 code = IOR;
3985 break;
3986 default:
3987 gcc_unreachable ();
3988 }
3989 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
3990 OPTAB_DIRECT);
3991 }
3992 else
3993 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
3994 operands[1], operands[2]);
3995
3996 if (operands[0] != cmp)
3997 emit_move_insn (operands[0], cmp);
3998
3999 return true;
4000}
4001
4002static rtx
4003ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4004 rtx op_true, rtx op_false, bool *negate)
4005{
4006 machine_mode data_mode = GET_MODE (dest);
4007 machine_mode mode = GET_MODE (cop0);
4008 rtx x;
4009
4010 *negate = false;
4011
4012 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4013 if (TARGET_XOP
4014 && (mode == V16QImode || mode == V8HImode
4015 || mode == V4SImode || mode == V2DImode))
4016 ;
4017  /* AVX512F supports all of the comparisons
4018     on all 128/256/512-bit vector int types.  */
4019 else if (ix86_valid_mask_cmp_mode (mode))
4020 ;
4021 else
4022 {
4023 /* Canonicalize the comparison to EQ, GT, GTU. */
4024 switch (code)
4025 {
4026 case EQ:
4027 case GT:
4028 case GTU:
4029 break;
4030
4031 case NE:
4032 case LE:
4033 case LEU:
4034 code = reverse_condition (code);
4035 *negate = true;
4036 break;
4037
4038 case GE:
4039 case GEU:
4040 code = reverse_condition (code);
4041 *negate = true;
4042 /* FALLTHRU */
4043
4044 case LT:
4045 case LTU:
4046 std::swap (cop0, cop1);
4047 code = swap_condition (code);
4048 break;
4049
4050 default:
4051 gcc_unreachable ();
4052 }
4053
4054 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4055 if (mode == V2DImode)
4056 {
4057 switch (code)
4058 {
4059 case EQ:
4060 /* SSE4.1 supports EQ. */
4061 if (!TARGET_SSE4_1)
4062 return NULL;
4063 break;
4064
4065 case GT:
4066 case GTU:
4067 /* SSE4.2 supports GT/GTU. */
4068 if (!TARGET_SSE4_2)
4069 return NULL;
4070 break;
4071
4072 default:
4073 gcc_unreachable ();
4074 }
4075 }
4076
4077 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4078 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4079 if (*negate)
4080 std::swap (optrue, opfalse);
4081
 4082 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
 4083 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
 4084 min (x, y) == x). While we add one instruction (the minimum),
 4085 we remove the need for the two instructions of the negation, as
 4086 the comparison already produces the required result.
 4087 When using masks, do it for SI/DImode element types, as it is shorter
 4088 than the two subtractions. */
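      /* Worked example: with unsigned elements x = 5 and y = 9, x <= y is
	 true and umin (x, y) = 5 == x; with x = 9 and y = 5,
	 umin (x, y) = 5 != 9, matching x <= y being false.  The EQ result
	 is therefore already the -1/0 value we want, with no extra
	 negation.  */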
4089 if ((code != EQ
4090 && GET_MODE_SIZE (mode) != 64
4091 && vector_all_ones_operand (opfalse, data_mode)
4092 && optrue == CONST0_RTX (data_mode))
4093 || (code == GTU
4094 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4095 /* Don't do it if not using integer masks and we'd end up with
4096 the right values in the registers though. */
4097 && (GET_MODE_SIZE (mode) == 64
4098 || !vector_all_ones_operand (optrue, data_mode)
4099 || opfalse != CONST0_RTX (data_mode))))
4100 {
4101 rtx (*gen) (rtx, rtx, rtx) = NULL;
4102
4103 switch (mode)
4104 {
4105 case E_V16SImode:
4106 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4107 break;
4108 case E_V8DImode:
4109 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4110 cop0 = force_reg (mode, cop0);
4111 cop1 = force_reg (mode, cop1);
4112 break;
4113 case E_V32QImode:
4114 if (TARGET_AVX2)
4115 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4116 break;
4117 case E_V16HImode:
4118 if (TARGET_AVX2)
4119 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4120 break;
4121 case E_V8SImode:
4122 if (TARGET_AVX2)
4123 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4124 break;
4125 case E_V4DImode:
4126 if (TARGET_AVX512VL)
4127 {
4128 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4129 cop0 = force_reg (mode, cop0);
4130 cop1 = force_reg (mode, cop1);
4131 }
4132 break;
4133 case E_V16QImode:
4134 if (code == GTU && TARGET_SSE2)
4135 gen = gen_uminv16qi3;
4136 else if (code == GT && TARGET_SSE4_1)
4137 gen = gen_sminv16qi3;
4138 break;
4139 case E_V8HImode:
4140 if (code == GTU && TARGET_SSE4_1)
4141 gen = gen_uminv8hi3;
4142 else if (code == GT && TARGET_SSE2)
4143 gen = gen_sminv8hi3;
4144 break;
4145 case E_V4SImode:
4146 if (TARGET_SSE4_1)
4147 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4148 break;
4149 case E_V2DImode:
4150 if (TARGET_AVX512VL)
4151 {
4152 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4153 cop0 = force_reg (mode, cop0);
4154 cop1 = force_reg (mode, cop1);
4155 }
4156 break;
4157 default:
4158 break;
4159 }
4160
4161 if (gen)
4162 {
4163 rtx tem = gen_reg_rtx (mode);
4164 if (!vector_operand (cop0, mode))
4165 cop0 = force_reg (mode, cop0);
4166 if (!vector_operand (cop1, mode))
4167 cop1 = force_reg (mode, cop1);
4168 *negate = !*negate;
4169 emit_insn (gen (tem, cop0, cop1));
4170 cop1 = tem;
4171 code = EQ;
4172 }
4173 }
4174
4175 /* Unsigned parallel compare is not supported by the hardware.
4176 Play some tricks to turn this into a signed comparison
4177 against 0. */
4178 if (code == GTU)
4179 {
4180 cop0 = force_reg (mode, cop0);
4181
4182 switch (mode)
4183 {
4184 case E_V16SImode:
4185 case E_V8DImode:
4186 case E_V8SImode:
4187 case E_V4DImode:
4188 case E_V4SImode:
4189 case E_V2DImode:
4190 {
4191 rtx t1, t2, mask;
83bc5e44 4192
4193 /* Subtract (-(INT MAX) - 1) from both operands to make
4194 them signed. */
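	      /* E.g. for V4SImode the mask is { 0x80000000, 0x80000000,
		 0x80000000, 0x80000000 }; the wrapping subtraction biases
		 each lane so that unsigned order becomes signed order:
		 0xffffffff - 0x80000000 = 0x7fffffff (signed max) while
		 0 - 0x80000000 = 0x80000000 (signed min), so a >u b is
		 equivalent to (a - bias) >s (b - bias).  */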
4195 mask = ix86_build_signbit_mask (mode, true, false);
4196 t1 = gen_reg_rtx (mode);
83bc5e44 4197 emit_insn (gen_sub3_insn (t1, cop0, mask));
4198
4199 t2 = gen_reg_rtx (mode);
83bc5e44 4200 emit_insn (gen_sub3_insn (t2, cop1, mask));
4201
4202 cop0 = t1;
4203 cop1 = t2;
4204 code = GT;
4205 }
4206 break;
4207
4208 case E_V64QImode:
4209 case E_V32HImode:
4210 case E_V32QImode:
4211 case E_V16HImode:
4212 case E_V16QImode:
4213 case E_V8HImode:
4214 /* Perform a parallel unsigned saturating subtraction. */
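	    /* a >u b is the same as (a -us b) != 0: e.g. for bytes,
	       200 -us 100 = 100 (nonzero, so 200 >u 100) while
	       100 -us 200 = 0 and 100 -us 100 = 0.  Hence the code below
	       compares the saturating difference against zero with EQ and
	       flips *NEGATE to obtain the != sense.  */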
4215 x = gen_reg_rtx (mode);
4216 emit_insn (gen_rtx_SET
4217 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4218 cop0 = x;
4219 cop1 = CONST0_RTX (mode);
4220 code = EQ;
4221 *negate = !*negate;
4222 break;
4223
4224 default:
4225 gcc_unreachable ();
4226 }
4227 }
4228 }
4229
4230 if (*negate)
4231 std::swap (op_true, op_false);
4232
4233 /* Allow the comparison to be done in one mode, but the movcc to
4234 happen in another mode. */
4235 if (data_mode == mode)
4236 {
4237 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4238 op_true, op_false);
4239 }
4240 else
4241 {
4242 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4243 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4244 op_true, op_false);
4245 if (GET_MODE (x) == mode)
4246 x = gen_lowpart (data_mode, x);
4247 }
4248
4249 return x;
4250}
4251
4252/* Expand integer vector comparison. */
4253
4254bool
4255ix86_expand_int_vec_cmp (rtx operands[])
4256{
4257 rtx_code code = GET_CODE (operands[1]);
4258 bool negate = false;
4259 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4260 operands[3], NULL, NULL, &negate);
4261
4262 if (!cmp)
4263 return false;
4264
4265 if (negate)
4266 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4267 CONST0_RTX (GET_MODE (cmp)),
4268 NULL, NULL, &negate);
4269
4270 gcc_assert (!negate);
4271
4272 if (operands[0] != cmp)
4273 emit_move_insn (operands[0], cmp);
4274
4275 return true;
4276}
4277
4278/* Expand a floating-point vector conditional move; a vcond operation
4279 rather than a movcc operation. */
4280
4281bool
4282ix86_expand_fp_vcond (rtx operands[])
4283{
4284 enum rtx_code code = GET_CODE (operands[3]);
4285 rtx cmp;
4286
4287 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4288 &operands[4], &operands[5]);
4289 if (code == UNKNOWN)
4290 {
4291 rtx temp;
4292 switch (GET_CODE (operands[3]))
4293 {
4294 case LTGT:
4295 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4296 operands[5], operands[0], operands[0]);
4297 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4298 operands[5], operands[1], operands[2]);
4299 code = AND;
4300 break;
4301 case UNEQ:
4302 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4303 operands[5], operands[0], operands[0]);
4304 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4305 operands[5], operands[1], operands[2]);
4306 code = IOR;
4307 break;
4308 default:
4309 gcc_unreachable ();
4310 }
4311 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4312 OPTAB_DIRECT);
4313 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4314 return true;
4315 }
4316
4317 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4318 operands[5], operands[1], operands[2]))
4319 return true;
4320
4321 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4322 operands[1], operands[2]);
4323 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4324 return true;
4325}
4326
4327/* Expand a signed/unsigned integral vector conditional move. */
4328
4329bool
4330ix86_expand_int_vcond (rtx operands[])
4331{
4332 machine_mode data_mode = GET_MODE (operands[0]);
4333 machine_mode mode = GET_MODE (operands[4]);
4334 enum rtx_code code = GET_CODE (operands[3]);
4335 bool negate = false;
4336 rtx x, cop0, cop1;
4337
4338 cop0 = operands[4];
4339 cop1 = operands[5];
4340
4341 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4342 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
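  /* E.g. for 32-bit elements, an arithmetic right shift by 31 replicates
     the sign bit (0x80000005 >> 31 = 0xffffffff, 7 >> 31 = 0), while a
     logical right shift by 31 keeps only the sign bit (0x80000005 >> 31
     = 1), which is exactly the -1/0 resp. 1/0 selection described
     above.  */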
4343 if ((code == LT || code == GE)
4344 && data_mode == mode
4345 && cop1 == CONST0_RTX (mode)
4346 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4347 && GET_MODE_UNIT_SIZE (data_mode) > 1
4348 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4349 && (GET_MODE_SIZE (data_mode) == 16
4350 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4351 {
4352 rtx negop = operands[2 - (code == LT)];
4353 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4354 if (negop == CONST1_RTX (data_mode))
4355 {
4356 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4357 operands[0], 1, OPTAB_DIRECT);
4358 if (res != operands[0])
4359 emit_move_insn (operands[0], res);
4360 return true;
4361 }
4362 else if (GET_MODE_INNER (data_mode) != DImode
4363 && vector_all_ones_operand (negop, data_mode))
4364 {
4365 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4366 operands[0], 0, OPTAB_DIRECT);
4367 if (res != operands[0])
4368 emit_move_insn (operands[0], res);
4369 return true;
4370 }
4371 }
4372
4373 if (!nonimmediate_operand (cop1, mode))
4374 cop1 = force_reg (mode, cop1);
4375 if (!general_operand (operands[1], data_mode))
4376 operands[1] = force_reg (data_mode, operands[1]);
4377 if (!general_operand (operands[2], data_mode))
4378 operands[2] = force_reg (data_mode, operands[2]);
4379
4380 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4381 operands[1], operands[2], &negate);
4382
4383 if (!x)
4384 return false;
4385
4386 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4387 operands[2-negate]);
4388 return true;
4389}
4390
4391static bool
4392ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4393 struct expand_vec_perm_d *d)
4394{
4395 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4396 expander, so args are either in d, or in op0, op1 etc. */
4397 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4398 machine_mode maskmode = mode;
4399 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4400
4401 switch (mode)
4402 {
4403 case E_V8HImode:
4404 if (TARGET_AVX512VL && TARGET_AVX512BW)
4405 gen = gen_avx512vl_vpermt2varv8hi3;
4406 break;
4407 case E_V16HImode:
4408 if (TARGET_AVX512VL && TARGET_AVX512BW)
4409 gen = gen_avx512vl_vpermt2varv16hi3;
4410 break;
4411 case E_V64QImode:
4412 if (TARGET_AVX512VBMI)
4413 gen = gen_avx512bw_vpermt2varv64qi3;
4414 break;
4415 case E_V32HImode:
4416 if (TARGET_AVX512BW)
4417 gen = gen_avx512bw_vpermt2varv32hi3;
4418 break;
4419 case E_V4SImode:
4420 if (TARGET_AVX512VL)
4421 gen = gen_avx512vl_vpermt2varv4si3;
4422 break;
4423 case E_V8SImode:
4424 if (TARGET_AVX512VL)
4425 gen = gen_avx512vl_vpermt2varv8si3;
4426 break;
4427 case E_V16SImode:
4428 if (TARGET_AVX512F)
4429 gen = gen_avx512f_vpermt2varv16si3;
4430 break;
4431 case E_V4SFmode:
4432 if (TARGET_AVX512VL)
4433 {
4434 gen = gen_avx512vl_vpermt2varv4sf3;
4435 maskmode = V4SImode;
4436 }
4437 break;
4438 case E_V8SFmode:
4439 if (TARGET_AVX512VL)
4440 {
4441 gen = gen_avx512vl_vpermt2varv8sf3;
4442 maskmode = V8SImode;
4443 }
4444 break;
4445 case E_V16SFmode:
4446 if (TARGET_AVX512F)
4447 {
4448 gen = gen_avx512f_vpermt2varv16sf3;
4449 maskmode = V16SImode;
4450 }
4451 break;
4452 case E_V2DImode:
4453 if (TARGET_AVX512VL)
4454 gen = gen_avx512vl_vpermt2varv2di3;
4455 break;
4456 case E_V4DImode:
4457 if (TARGET_AVX512VL)
4458 gen = gen_avx512vl_vpermt2varv4di3;
4459 break;
4460 case E_V8DImode:
4461 if (TARGET_AVX512F)
4462 gen = gen_avx512f_vpermt2varv8di3;
4463 break;
4464 case E_V2DFmode:
4465 if (TARGET_AVX512VL)
4466 {
4467 gen = gen_avx512vl_vpermt2varv2df3;
4468 maskmode = V2DImode;
4469 }
4470 break;
4471 case E_V4DFmode:
4472 if (TARGET_AVX512VL)
4473 {
4474 gen = gen_avx512vl_vpermt2varv4df3;
4475 maskmode = V4DImode;
4476 }
4477 break;
4478 case E_V8DFmode:
4479 if (TARGET_AVX512F)
4480 {
4481 gen = gen_avx512f_vpermt2varv8df3;
4482 maskmode = V8DImode;
4483 }
4484 break;
4485 default:
4486 break;
4487 }
4488
4489 if (gen == NULL)
4490 return false;
4491
4492 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4493 expander, so args are either in d, or in op0, op1 etc. */
4494 if (d)
4495 {
4496 rtx vec[64];
4497 target = d->target;
4498 op0 = d->op0;
4499 op1 = d->op1;
4500 for (int i = 0; i < d->nelt; ++i)
4501 vec[i] = GEN_INT (d->perm[i]);
4502 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4503 }
4504
4505 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4506 return true;
4507}
4508
4509/* Expand a variable vector permutation. */
4510
4511void
4512ix86_expand_vec_perm (rtx operands[])
4513{
4514 rtx target = operands[0];
4515 rtx op0 = operands[1];
4516 rtx op1 = operands[2];
4517 rtx mask = operands[3];
4518 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4519 machine_mode mode = GET_MODE (op0);
4520 machine_mode maskmode = GET_MODE (mask);
4521 int w, e, i;
4522 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4523
4524 /* Number of elements in the vector. */
4525 w = GET_MODE_NUNITS (mode);
4526 e = GET_MODE_UNIT_SIZE (mode);
4527 gcc_assert (w <= 64);
4528
4529 if (TARGET_AVX512F && one_operand_shuffle)
4530 {
4531 rtx (*gen) (rtx, rtx, rtx) = NULL;
4532 switch (mode)
4533 {
4534 case E_V16SImode:
4535 gen =gen_avx512f_permvarv16si;
4536 break;
4537 case E_V16SFmode:
4538 gen = gen_avx512f_permvarv16sf;
4539 break;
4540 case E_V8DImode:
4541 gen = gen_avx512f_permvarv8di;
4542 break;
4543 case E_V8DFmode:
4544 gen = gen_avx512f_permvarv8df;
4545 break;
4546 default:
4547 break;
4548 }
4549 if (gen != NULL)
4550 {
4551 emit_insn (gen (target, op0, mask));
4552 return;
4553 }
4554 }
4555
4556 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4557 return;
4558
4559 if (TARGET_AVX2)
4560 {
4561 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4562 {
 4563 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
 4564 a constant shuffle operand. With a tiny bit of effort we can
 4565 use VPERMD instead. A re-interpretation stall for V4DFmode is
 4566 unfortunate but there's no avoiding it.
 4567 Similarly, for V16HImode we don't have instructions for variable
 4568 shuffling, while for V32QImode we can, after preparing suitable
 4569 masks, use vpshufb; vpshufb; vpermq; vpor. */
4570
4571 if (mode == V16HImode)
4572 {
4573 maskmode = mode = V32QImode;
4574 w = 32;
4575 e = 1;
4576 }
4577 else
4578 {
4579 maskmode = mode = V8SImode;
4580 w = 8;
4581 e = 4;
4582 }
4583 t1 = gen_reg_rtx (maskmode);
4584
4585 /* Replicate the low bits of the V4DImode mask into V8SImode:
4586 mask = { A B C D }
4587 t1 = { A A B B C C D D }. */
4588 for (i = 0; i < w / 2; ++i)
4589 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4590 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4591 vt = force_reg (maskmode, vt);
4592 mask = gen_lowpart (maskmode, mask);
4593 if (maskmode == V8SImode)
4594 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4595 else
4596 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4597
 4598 /* Multiply the shuffle indices by two. */
4599 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4600 OPTAB_DIRECT);
4601
 4602 /* Add one to the odd shuffle indices:
4603 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4604 for (i = 0; i < w / 2; ++i)
4605 {
4606 vec[i * 2] = const0_rtx;
4607 vec[i * 2 + 1] = const1_rtx;
4608 }
4609 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4610 vt = validize_mem (force_const_mem (maskmode, vt));
4611 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4612 OPTAB_DIRECT);
4613
4614 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4615 operands[3] = mask = t1;
4616 target = gen_reg_rtx (mode);
4617 op0 = gen_lowpart (mode, op0);
4618 op1 = gen_lowpart (mode, op1);
4619 }
4620
4621 switch (mode)
4622 {
4623 case E_V8SImode:
4624 /* The VPERMD and VPERMPS instructions already properly ignore
4625 the high bits of the shuffle elements. No need for us to
4626 perform an AND ourselves. */
4627 if (one_operand_shuffle)
4628 {
4629 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4630 if (target != operands[0])
4631 emit_move_insn (operands[0],
4632 gen_lowpart (GET_MODE (operands[0]), target));
4633 }
4634 else
4635 {
4636 t1 = gen_reg_rtx (V8SImode);
4637 t2 = gen_reg_rtx (V8SImode);
4638 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4639 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4640 goto merge_two;
4641 }
4642 return;
4643
4644 case E_V8SFmode:
4645 mask = gen_lowpart (V8SImode, mask);
4646 if (one_operand_shuffle)
4647 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4648 else
4649 {
4650 t1 = gen_reg_rtx (V8SFmode);
4651 t2 = gen_reg_rtx (V8SFmode);
4652 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4653 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4654 goto merge_two;
4655 }
4656 return;
4657
4658 case E_V4SImode:
4659 /* By combining the two 128-bit input vectors into one 256-bit
4660 input vector, we can use VPERMD and VPERMPS for the full
4661 two-operand shuffle. */
4662 t1 = gen_reg_rtx (V8SImode);
4663 t2 = gen_reg_rtx (V8SImode);
4664 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4665 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4666 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4667 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4668 return;
4669
4670 case E_V4SFmode:
4671 t1 = gen_reg_rtx (V8SFmode);
4672 t2 = gen_reg_rtx (V8SImode);
4673 mask = gen_lowpart (V4SImode, mask);
4674 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4675 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4676 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4677 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4678 return;
4679
4680 case E_V32QImode:
4681 t1 = gen_reg_rtx (V32QImode);
4682 t2 = gen_reg_rtx (V32QImode);
4683 t3 = gen_reg_rtx (V32QImode);
4684 vt2 = GEN_INT (-128);
4685 vt = gen_const_vec_duplicate (V32QImode, vt2);
4686 vt = force_reg (V32QImode, vt);
4687 for (i = 0; i < 32; i++)
4688 vec[i] = i < 16 ? vt2 : const0_rtx;
4689 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4690 vt2 = force_reg (V32QImode, vt2);
4691 /* From mask create two adjusted masks, which contain the same
4692 bits as mask in the low 7 bits of each vector element.
4693 The first mask will have the most significant bit clear
4694 if it requests element from the same 128-bit lane
4695 and MSB set if it requests element from the other 128-bit lane.
4696 The second mask will have the opposite values of the MSB,
4697 and additionally will have its 128-bit lanes swapped.
4698 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4699 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4700 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4701 stands for other 12 bytes. */
 4702 /* The bit that says whether an element is from the same lane or the
 4703 other lane is bit 4, so shift it up by 3 to the MSB position. */
4704 t5 = gen_reg_rtx (V4DImode);
4705 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4706 GEN_INT (3)));
4707 /* Clear MSB bits from the mask just in case it had them set. */
4708 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4709 /* After this t1 will have MSB set for elements from other lane. */
4710 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4711 /* Clear bits other than MSB. */
4712 emit_insn (gen_andv32qi3 (t1, t1, vt));
4713 /* Or in the lower bits from mask into t3. */
4714 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4715 /* And invert MSB bits in t1, so MSB is set for elements from the same
4716 lane. */
4717 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4718 /* Swap 128-bit lanes in t3. */
4719 t6 = gen_reg_rtx (V4DImode);
4720 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4721 const2_rtx, GEN_INT (3),
4722 const0_rtx, const1_rtx));
4723 /* And or in the lower bits from mask into t1. */
4724 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4725 if (one_operand_shuffle)
4726 {
4727 /* Each of these shuffles will put 0s in places where
4728 element from the other 128-bit lane is needed, otherwise
4729 will shuffle in the requested value. */
4730 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4731 gen_lowpart (V32QImode, t6)));
4732 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4733 /* For t3 the 128-bit lanes are swapped again. */
4734 t7 = gen_reg_rtx (V4DImode);
4735 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4736 const2_rtx, GEN_INT (3),
4737 const0_rtx, const1_rtx));
4738 /* And oring both together leads to the result. */
4739 emit_insn (gen_iorv32qi3 (target, t1,
4740 gen_lowpart (V32QImode, t7)));
4741 if (target != operands[0])
4742 emit_move_insn (operands[0],
4743 gen_lowpart (GET_MODE (operands[0]), target));
4744 return;
4745 }
4746
4747 t4 = gen_reg_rtx (V32QImode);
 4748 /* Similarly to the above one_operand_shuffle code,
 4749 just repeated twice for each operand. The merge_two:
 4750 code will merge the two results together. */
4751 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4752 gen_lowpart (V32QImode, t6)));
4753 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4754 gen_lowpart (V32QImode, t6)));
4755 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4756 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4757 t7 = gen_reg_rtx (V4DImode);
4758 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4759 const2_rtx, GEN_INT (3),
4760 const0_rtx, const1_rtx));
4761 t8 = gen_reg_rtx (V4DImode);
4762 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4763 const2_rtx, GEN_INT (3),
4764 const0_rtx, const1_rtx));
4765 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4766 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4767 t1 = t4;
4768 t2 = t3;
4769 goto merge_two;
4770
4771 default:
4772 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4773 break;
4774 }
4775 }
4776
4777 if (TARGET_XOP)
4778 {
4779 /* The XOP VPPERM insn supports three inputs. By ignoring the
4780 one_operand_shuffle special case, we avoid creating another
4781 set of constant vectors in memory. */
4782 one_operand_shuffle = false;
4783
4784 /* mask = mask & {2*w-1, ...} */
4785 vt = GEN_INT (2*w - 1);
4786 }
4787 else
4788 {
4789 /* mask = mask & {w-1, ...} */
4790 vt = GEN_INT (w - 1);
4791 }
4792
4793 vt = gen_const_vec_duplicate (maskmode, vt);
4794 mask = expand_simple_binop (maskmode, AND, mask, vt,
4795 NULL_RTX, 0, OPTAB_DIRECT);
4796
4797 /* For non-QImode operations, convert the word permutation control
4798 into a byte permutation control. */
4799 if (mode != V16QImode)
4800 {
4801 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4802 GEN_INT (exact_log2 (e)),
4803 NULL_RTX, 0, OPTAB_DIRECT);
4804
4805 /* Convert mask to vector of chars. */
4806 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4807
4808 /* Replicate each of the input bytes into byte positions:
4809 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4810 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4811 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4812 for (i = 0; i < 16; ++i)
4813 vec[i] = GEN_INT (i/e * e);
4814 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4815 vt = validize_mem (force_const_mem (V16QImode, vt));
4816 if (TARGET_XOP)
4817 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4818 else
4819 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4820
4821 /* Convert it into the byte positions by doing
4822 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4823 for (i = 0; i < 16; ++i)
4824 vec[i] = GEN_INT (i % e);
4825 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4826 vt = validize_mem (force_const_mem (V16QImode, vt));
4827 emit_insn (gen_addv16qi3 (mask, mask, vt));
4828 }
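  /* Worked example for V4SImode (e = 4): a word index k is first scaled
     to 4*k, the pshufb with { 0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12 }
     copies that scaled value into all four byte slots of its lane, and
     the final addition of { 0,1,2,3, 0,1,2,3, ... } turns it into the
     byte indices { 4k, 4k+1, 4k+2, 4k+3 }; e.g. k = 2 becomes
     { 8, 9, 10, 11 }, the bytes of word 2.  */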
4829
4830 /* The actual shuffle operations all operate on V16QImode. */
4831 op0 = gen_lowpart (V16QImode, op0);
4832 op1 = gen_lowpart (V16QImode, op1);
4833
4834 if (TARGET_XOP)
4835 {
4836 if (GET_MODE (target) != V16QImode)
4837 target = gen_reg_rtx (V16QImode);
4838 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4839 if (target != operands[0])
4840 emit_move_insn (operands[0],
4841 gen_lowpart (GET_MODE (operands[0]), target));
4842 }
4843 else if (one_operand_shuffle)
4844 {
4845 if (GET_MODE (target) != V16QImode)
4846 target = gen_reg_rtx (V16QImode);
4847 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4848 if (target != operands[0])
4849 emit_move_insn (operands[0],
4850 gen_lowpart (GET_MODE (operands[0]), target));
4851 }
4852 else
4853 {
4854 rtx xops[6];
4855 bool ok;
4856
4857 /* Shuffle the two input vectors independently. */
4858 t1 = gen_reg_rtx (V16QImode);
4859 t2 = gen_reg_rtx (V16QImode);
4860 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4861 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4862
4863 merge_two:
4864 /* Then merge them together. The key is whether any given control
4865 element contained a bit set that indicates the second word. */
4866 mask = operands[3];
4867 vt = GEN_INT (w);
4868 if (maskmode == V2DImode && !TARGET_SSE4_1)
4869 {
 4870 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
 4871 more shuffle to convert the V2DI input mask into a V4SI
 4872 input mask. At which point the masking that expand_int_vcond
 4873 performs will work as desired. */
4874 rtx t3 = gen_reg_rtx (V4SImode);
4875 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4876 const0_rtx, const0_rtx,
4877 const2_rtx, const2_rtx));
4878 mask = t3;
4879 maskmode = V4SImode;
4880 e = w = 4;
4881 }
4882
4883 vt = gen_const_vec_duplicate (maskmode, vt);
4884 vt = force_reg (maskmode, vt);
4885 mask = expand_simple_binop (maskmode, AND, mask, vt,
4886 NULL_RTX, 0, OPTAB_DIRECT);
4887
4888 if (GET_MODE (target) != mode)
4889 target = gen_reg_rtx (mode);
4890 xops[0] = target;
4891 xops[1] = gen_lowpart (mode, t2);
4892 xops[2] = gen_lowpart (mode, t1);
4893 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4894 xops[4] = mask;
4895 xops[5] = vt;
4896 ok = ix86_expand_int_vcond (xops);
4897 gcc_assert (ok);
4898 if (target != operands[0])
4899 emit_move_insn (operands[0],
4900 gen_lowpart (GET_MODE (operands[0]), target));
4901 }
4902}
4903
 4904/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
 4905 true if we should do zero extension, else sign extension. HIGH_P is
 4906 true if we want the N/2 high elements, else the low elements. */
4907
4908void
4909ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4910{
4911 machine_mode imode = GET_MODE (src);
4912 rtx tmp;
4913
4914 if (TARGET_SSE4_1)
4915 {
4916 rtx (*unpack)(rtx, rtx);
4917 rtx (*extract)(rtx, rtx) = NULL;
4918 machine_mode halfmode = BLKmode;
4919
4920 switch (imode)
4921 {
4922 case E_V64QImode:
4923 if (unsigned_p)
4924 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4925 else
4926 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4927 halfmode = V32QImode;
4928 extract
4929 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4930 break;
4931 case E_V32QImode:
4932 if (unsigned_p)
4933 unpack = gen_avx2_zero_extendv16qiv16hi2;
4934 else
4935 unpack = gen_avx2_sign_extendv16qiv16hi2;
4936 halfmode = V16QImode;
4937 extract
4938 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4939 break;
4940 case E_V32HImode:
4941 if (unsigned_p)
4942 unpack = gen_avx512f_zero_extendv16hiv16si2;
4943 else
4944 unpack = gen_avx512f_sign_extendv16hiv16si2;
4945 halfmode = V16HImode;
4946 extract
4947 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4948 break;
4949 case E_V16HImode:
4950 if (unsigned_p)
4951 unpack = gen_avx2_zero_extendv8hiv8si2;
4952 else
4953 unpack = gen_avx2_sign_extendv8hiv8si2;
4954 halfmode = V8HImode;
4955 extract
4956 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4957 break;
4958 case E_V16SImode:
4959 if (unsigned_p)
4960 unpack = gen_avx512f_zero_extendv8siv8di2;
4961 else
4962 unpack = gen_avx512f_sign_extendv8siv8di2;
4963 halfmode = V8SImode;
4964 extract
4965 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
4966 break;
4967 case E_V8SImode:
4968 if (unsigned_p)
4969 unpack = gen_avx2_zero_extendv4siv4di2;
4970 else
4971 unpack = gen_avx2_sign_extendv4siv4di2;
4972 halfmode = V4SImode;
4973 extract
4974 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
4975 break;
4976 case E_V16QImode:
4977 if (unsigned_p)
4978 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
4979 else
4980 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
4981 break;
4982 case E_V8HImode:
4983 if (unsigned_p)
4984 unpack = gen_sse4_1_zero_extendv4hiv4si2;
4985 else
4986 unpack = gen_sse4_1_sign_extendv4hiv4si2;
4987 break;
4988 case E_V4SImode:
4989 if (unsigned_p)
4990 unpack = gen_sse4_1_zero_extendv2siv2di2;
4991 else
4992 unpack = gen_sse4_1_sign_extendv2siv2di2;
4993 break;
4994 default:
4995 gcc_unreachable ();
4996 }
4997
4998 if (GET_MODE_SIZE (imode) >= 32)
4999 {
5000 tmp = gen_reg_rtx (halfmode);
5001 emit_insn (extract (tmp, src));
5002 }
5003 else if (high_p)
5004 {
5005 /* Shift higher 8 bytes to lower 8 bytes. */
5006 tmp = gen_reg_rtx (V1TImode);
5007 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5008 GEN_INT (64)));
5009 tmp = gen_lowpart (imode, tmp);
5010 }
5011 else
5012 tmp = src;
5013
5014 emit_insn (unpack (dest, tmp));
5015 }
5016 else
5017 {
5018 rtx (*unpack)(rtx, rtx, rtx);
5019
5020 switch (imode)
5021 {
5022 case E_V16QImode:
5023 if (high_p)
5024 unpack = gen_vec_interleave_highv16qi;
5025 else
5026 unpack = gen_vec_interleave_lowv16qi;
5027 break;
5028 case E_V8HImode:
5029 if (high_p)
5030 unpack = gen_vec_interleave_highv8hi;
5031 else
5032 unpack = gen_vec_interleave_lowv8hi;
5033 break;
5034 case E_V4SImode:
5035 if (high_p)
5036 unpack = gen_vec_interleave_highv4si;
5037 else
5038 unpack = gen_vec_interleave_lowv4si;
5039 break;
5040 default:
5041 gcc_unreachable ();
5042 }
5043
5044 if (unsigned_p)
5045 tmp = force_reg (imode, CONST0_RTX (imode));
5046 else
5047 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5048 src, pc_rtx, pc_rtx);
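      /* In the signed case TMP is now -1 in every element that is negative
	 in SRC and 0 elsewhere, so interleaving SRC with TMP puts the
	 replicated sign bit into the high half of each widened element:
	 e.g. the HImode value 0xff85 (-123) paired with 0xffff yields
	 0xffffff85, i.e. -123 as SImode, while 0x0042 paired with 0x0000
	 stays 0x00000042.  */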
5049
5050 rtx tmp2 = gen_reg_rtx (imode);
5051 emit_insn (unpack (tmp2, src, tmp));
5052 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5053 }
5054}
5055
 5056/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
 5057 but works for floating point parameters and nonoffsettable memories.
 5058 For pushes, it returns just stack offsets; the values will be saved
 5059 in the right order. Maximally four parts are generated. */
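/* As a sketch of the splits done below: with !TARGET_64BIT, DImode and
   DFmode split into two SImode parts, XFmode into three and TFmode into
   four; with TARGET_64BIT, TImode and TFmode split into two DImode parts
   and XFmode into a DImode part plus an SImode upper part.  */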
5060
5061static int
5062ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5063{
5064 int size;
5065
5066 if (!TARGET_64BIT)
5067 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5068 else
5069 size = (GET_MODE_SIZE (mode) + 4) / 8;
5070
5071 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5072 gcc_assert (size >= 2 && size <= 4);
5073
5074 /* Optimize constant pool reference to immediates. This is used by fp
5075 moves, that force all constants to memory to allow combining. */
5076 if (MEM_P (operand) && MEM_READONLY_P (operand))
5077 operand = avoid_constant_pool_reference (operand);
5078
5079 if (MEM_P (operand) && !offsettable_memref_p (operand))
5080 {
 5081 /* The only non-offsettable memories we handle are pushes. */
5082 int ok = push_operand (operand, VOIDmode);
5083
5084 gcc_assert (ok);
5085
5086 operand = copy_rtx (operand);
5087 PUT_MODE (operand, word_mode);
5088 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5089 return size;
5090 }
5091
5092 if (GET_CODE (operand) == CONST_VECTOR)
5093 {
5094 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5095 /* Caution: if we looked through a constant pool memory above,
5096 the operand may actually have a different mode now. That's
5097 ok, since we want to pun this all the way back to an integer. */
5098 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5099 gcc_assert (operand != NULL);
5100 mode = imode;
5101 }
5102
5103 if (!TARGET_64BIT)
5104 {
5105 if (mode == DImode)
5106 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5107 else
5108 {
5109 int i;
5110
5111 if (REG_P (operand))
5112 {
5113 gcc_assert (reload_completed);
5114 for (i = 0; i < size; i++)
5115 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5116 }
5117 else if (offsettable_memref_p (operand))
5118 {
5119 operand = adjust_address (operand, SImode, 0);
5120 parts[0] = operand;
5121 for (i = 1; i < size; i++)
5122 parts[i] = adjust_address (operand, SImode, 4 * i);
5123 }
5124 else if (CONST_DOUBLE_P (operand))
5125 {
5126 const REAL_VALUE_TYPE *r;
5127 long l[4];
5128
5129 r = CONST_DOUBLE_REAL_VALUE (operand);
5130 switch (mode)
5131 {
5132 case E_TFmode:
5133 real_to_target (l, r, mode);
5134 parts[3] = gen_int_mode (l[3], SImode);
5135 parts[2] = gen_int_mode (l[2], SImode);
5136 break;
5137 case E_XFmode:
5138 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5139 long double may not be 80-bit. */
5140 real_to_target (l, r, mode);
5141 parts[2] = gen_int_mode (l[2], SImode);
5142 break;
5143 case E_DFmode:
5144 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5145 break;
5146 default:
5147 gcc_unreachable ();
5148 }
5149 parts[1] = gen_int_mode (l[1], SImode);
5150 parts[0] = gen_int_mode (l[0], SImode);
5151 }
5152 else
5153 gcc_unreachable ();
5154 }
5155 }
5156 else
5157 {
5158 if (mode == TImode)
5159 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5160 if (mode == XFmode || mode == TFmode)
5161 {
5162 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5163 if (REG_P (operand))
5164 {
5165 gcc_assert (reload_completed);
5166 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5167 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5168 }
5169 else if (offsettable_memref_p (operand))
5170 {
5171 operand = adjust_address (operand, DImode, 0);
5172 parts[0] = operand;
5173 parts[1] = adjust_address (operand, upper_mode, 8);
5174 }
5175 else if (CONST_DOUBLE_P (operand))
5176 {
5177 long l[4];
5178
5179 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5180
5181 /* real_to_target puts 32-bit pieces in each long. */
5182 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5183 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5184 << 32), DImode);
5185
5186 if (upper_mode == SImode)
5187 parts[1] = gen_int_mode (l[2], SImode);
5188 else
5189 parts[1]
5190 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5191 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5192 << 32), DImode);
5193 }
5194 else
5195 gcc_unreachable ();
5196 }
5197 }
5198
5199 return size;
5200}
5201
5202/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5203 Return false when normal moves are needed; true when all required
5204 insns have been emitted. Operands 2-4 contain the input values
 5205 in the correct order; operands 5-7 contain the output values. */
5206
5207void
5208ix86_split_long_move (rtx operands[])
5209{
5210 rtx part[2][4];
5211 int nparts, i, j;
5212 int push = 0;
5213 int collisions = 0;
5214 machine_mode mode = GET_MODE (operands[0]);
5215 bool collisionparts[4];
5216
 5217 /* The DFmode expanders may ask us to move a double.
 5218 For a 64-bit target this is a single move. By hiding that fact
 5219 here we simplify the i386.md splitters. */
5220 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5221 {
5222 /* Optimize constant pool reference to immediates. This is used by
5223 fp moves, that force all constants to memory to allow combining. */
5224
5225 if (MEM_P (operands[1])
5226 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5227 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5228 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5229 if (push_operand (operands[0], VOIDmode))
5230 {
5231 operands[0] = copy_rtx (operands[0]);
5232 PUT_MODE (operands[0], word_mode);
5233 }
5234 else
5235 operands[0] = gen_lowpart (DImode, operands[0]);
5236 operands[1] = gen_lowpart (DImode, operands[1]);
5237 emit_move_insn (operands[0], operands[1]);
5238 return;
5239 }
5240
5241 /* The only non-offsettable memory we handle is push. */
5242 if (push_operand (operands[0], VOIDmode))
5243 push = 1;
5244 else
5245 gcc_assert (!MEM_P (operands[0])
5246 || offsettable_memref_p (operands[0]));
5247
5248 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5249 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5250
5251 /* When emitting push, take care for source operands on the stack. */
5252 if (push && MEM_P (operands[1])
5253 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5254 {
5255 rtx src_base = XEXP (part[1][nparts - 1], 0);
5256
5257 /* Compensate for the stack decrement by 4. */
5258 if (!TARGET_64BIT && nparts == 3
5259 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5260 src_base = plus_constant (Pmode, src_base, 4);
5261
5262 /* src_base refers to the stack pointer and is
5263 automatically decreased by emitted push. */
5264 for (i = 0; i < nparts; i++)
5265 part[1][i] = change_address (part[1][i],
5266 GET_MODE (part[1][i]), src_base);
5267 }
5268
 5269 /* We need to do the copy in the right order in case an address register
 5270 of the source overlaps the destination. */
5271 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5272 {
5273 rtx tmp;
5274
5275 for (i = 0; i < nparts; i++)
5276 {
5277 collisionparts[i]
5278 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5279 if (collisionparts[i])
5280 collisions++;
5281 }
5282
5283 /* Collision in the middle part can be handled by reordering. */
5284 if (collisions == 1 && nparts == 3 && collisionparts [1])
5285 {
5286 std::swap (part[0][1], part[0][2]);
5287 std::swap (part[1][1], part[1][2]);
5288 }
5289 else if (collisions == 1
5290 && nparts == 4
5291 && (collisionparts [1] || collisionparts [2]))
5292 {
5293 if (collisionparts [1])
5294 {
5295 std::swap (part[0][1], part[0][2]);
5296 std::swap (part[1][1], part[1][2]);
5297 }
5298 else
5299 {
5300 std::swap (part[0][2], part[0][3]);
5301 std::swap (part[1][2], part[1][3]);
5302 }
5303 }
5304
5305 /* If there are more collisions, we can't handle it by reordering.
5306 Do an lea to the last part and use only one colliding move. */
5307 else if (collisions > 1)
5308 {
5309 rtx base, addr;
5310
5311 collisions = 1;
5312
5313 base = part[0][nparts - 1];
5314
5315 /* Handle the case when the last part isn't valid for lea.
5316 Happens in 64-bit mode storing the 12-byte XFmode. */
5317 if (GET_MODE (base) != Pmode)
5318 base = gen_rtx_REG (Pmode, REGNO (base));
5319
5320 addr = XEXP (part[1][0], 0);
5321 if (TARGET_TLS_DIRECT_SEG_REFS)
5322 {
5323 struct ix86_address parts;
5324 int ok = ix86_decompose_address (addr, &parts);
5325 gcc_assert (ok);
5326 /* It is not valid to use %gs: or %fs: in lea. */
5327 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5328 }
5329 emit_insn (gen_rtx_SET (base, addr));
5330 part[1][0] = replace_equiv_address (part[1][0], base);
5331 for (i = 1; i < nparts; i++)
5332 {
5333 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5334 part[1][i] = replace_equiv_address (part[1][i], tmp);
5335 }
5336 }
5337 }
5338
5339 if (push)
5340 {
5341 if (!TARGET_64BIT)
5342 {
5343 if (nparts == 3)
5344 {
5345 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
d9330fb5 5346 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5347 emit_move_insn (part[0][2], part[1][2]);
5348 }
5349 else if (nparts == 4)
5350 {
5351 emit_move_insn (part[0][3], part[1][3]);
5352 emit_move_insn (part[0][2], part[1][2]);
5353 }
5354 }
5355 else
5356 {
 5357 /* In 64-bit mode we don't have a 32-bit push available. In case this is
 5358 a register, that is OK - we will just use the larger counterpart. We also
 5359 retype memory - this comes from an attempt to avoid a REX prefix on
 5360 the move of the second half of a TFmode value. */
5361 if (GET_MODE (part[1][1]) == SImode)
5362 {
5363 switch (GET_CODE (part[1][1]))
5364 {
5365 case MEM:
5366 part[1][1] = adjust_address (part[1][1], DImode, 0);
5367 break;
5368
5369 case REG:
5370 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5371 break;
5372
5373 default:
5374 gcc_unreachable ();
5375 }
5376
5377 if (GET_MODE (part[1][0]) == SImode)
5378 part[1][0] = part[1][1];
5379 }
5380 }
5381 emit_move_insn (part[0][1], part[1][1]);
5382 emit_move_insn (part[0][0], part[1][0]);
5383 return;
5384 }
5385
5386 /* Choose correct order to not overwrite the source before it is copied. */
5387 if ((REG_P (part[0][0])
5388 && REG_P (part[1][1])
5389 && (REGNO (part[0][0]) == REGNO (part[1][1])
5390 || (nparts == 3
5391 && REGNO (part[0][0]) == REGNO (part[1][2]))
5392 || (nparts == 4
5393 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5394 || (collisions > 0
5395 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5396 {
5397 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5398 {
5399 operands[2 + i] = part[0][j];
5400 operands[6 + i] = part[1][j];
5401 }
5402 }
5403 else
5404 {
5405 for (i = 0; i < nparts; i++)
5406 {
5407 operands[2 + i] = part[0][i];
5408 operands[6 + i] = part[1][i];
5409 }
5410 }
5411
5412 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5413 if (optimize_insn_for_size_p ())
5414 {
5415 for (j = 0; j < nparts - 1; j++)
5416 if (CONST_INT_P (operands[6 + j])
5417 && operands[6 + j] != const0_rtx
5418 && REG_P (operands[2 + j]))
5419 for (i = j; i < nparts - 1; i++)
5420 if (CONST_INT_P (operands[7 + i])
5421 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5422 operands[7 + i] = operands[2 + j];
5423 }
5424
5425 for (i = 0; i < nparts; i++)
5426 emit_move_insn (operands[2 + i], operands[6 + i]);
5427
5428 return;
5429}
5430
5431/* Helper function of ix86_split_ashl used to generate an SImode/DImode
5432 left shift by a constant, either using a single shift or
5433 a sequence of add instructions. */
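/* E.g. "x << 2" can be emitted as two self-additions, x += x; x += x
   (3 becomes 6 and then 12, i.e. 3 << 2); the add path below is taken
   whenever COUNT such additions are estimated to be no more expensive
   than a single shift by a constant.  */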
5434
5435static void
5436ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5437{
5438 if (count == 1
5439 || (count * ix86_cost->add <= ix86_cost->shift_const
5440 && !optimize_insn_for_size_p ()))
5441 {
2bf6d935 5442 while (count-- > 0)
83bc5e44 5443 emit_insn (gen_add2_insn (operand, operand));
5444 }
5445 else
5446 {
5447 rtx (*insn)(rtx, rtx, rtx);
5448
5449 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5450 emit_insn (insn (operand, operand, GEN_INT (count)));
5451 }
5452}
5453
5454void
5455ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5456{
5457 rtx (*gen_ashl3)(rtx, rtx, rtx);
5458 rtx (*gen_shld)(rtx, rtx, rtx);
5459 int half_width = GET_MODE_BITSIZE (mode) >> 1;
987a3082 5460 machine_mode half_mode;
5461
5462 rtx low[2], high[2];
5463 int count;
5464
5465 if (CONST_INT_P (operands[2]))
5466 {
5467 split_double_mode (mode, operands, 2, low, high);
5468 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5469
5470 if (count >= half_width)
5471 {
5472 emit_move_insn (high[0], low[1]);
5473 emit_move_insn (low[0], const0_rtx);
5474
5475 if (count > half_width)
5476 ix86_expand_ashl_const (high[0], count - half_width, mode);
5477 }
5478 else
5479 {
5480 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5481
5482 if (!rtx_equal_p (operands[0], operands[1]))
5483 emit_move_insn (operands[0], operands[1]);
5484
5485 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5486 ix86_expand_ashl_const (low[0], count, mode);
5487 }
5488 return;
5489 }
5490
5491 split_double_mode (mode, operands, 1, low, high);
987a3082 5492 half_mode = mode == DImode ? SImode : DImode;
5493
5494 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5495
5496 if (operands[1] == const1_rtx)
5497 {
 5498 /* Assuming we've chosen QImode-capable registers, 1 << N
 5499 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5500 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5501 {
5502 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5503
5504 ix86_expand_clear (low[0]);
5505 ix86_expand_clear (high[0]);
5506 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5507
5508 d = gen_lowpart (QImode, low[0]);
5509 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5510 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5511 emit_insn (gen_rtx_SET (d, s));
5512
5513 d = gen_lowpart (QImode, high[0]);
5514 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5515 s = gen_rtx_NE (QImode, flags, const0_rtx);
5516 emit_insn (gen_rtx_SET (d, s));
5517 }
5518
5519 /* Otherwise, we can get the same results by manually performing
5520 a bit extract operation on bit 5/6, and then performing the two
5521 shifts. The two methods of getting 0/1 into low/high are exactly
5522 the same size. Avoiding the shift in the bit extract case helps
5523 pentium4 a bit; no one else seems to care much either way. */
5524 else
5525 {
5526 rtx (*gen_lshr3)(rtx, rtx, rtx);
5527 rtx (*gen_and3)(rtx, rtx, rtx);
5528 rtx (*gen_xor3)(rtx, rtx, rtx);
5529 HOST_WIDE_INT bits;
5530 rtx x;
5531
5532 if (mode == DImode)
5533 {
5534 gen_lshr3 = gen_lshrsi3;
5535 gen_and3 = gen_andsi3;
5536 gen_xor3 = gen_xorsi3;
5537 bits = 5;
5538 }
5539 else
5540 {
5541 gen_lshr3 = gen_lshrdi3;
5542 gen_and3 = gen_anddi3;
5543 gen_xor3 = gen_xordi3;
5544 bits = 6;
5545 }
5546
5547 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5548 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5549 else
5550 x = gen_lowpart (half_mode, operands[2]);
5551 emit_insn (gen_rtx_SET (high[0], x));
5552
5553 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5554 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5555 emit_move_insn (low[0], high[0]);
5556 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5557 }
5558
5559 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5560 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5561 return;
5562 }
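  /* Example of the 1 << N expansion above for DImode on a 32-bit target:
     both halves are cleared, the test of bit 5 of the count (value 32)
     puts a 1 into the low half when N < 32 and into the high half
     otherwise, and the final pair of 32-bit shifts (whose count the
     hardware truncates to 5 bits) moves that 1 into place; e.g. N = 40
     gives low = 0 and high = 1 << 8, i.e. 2^40 overall.  */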
5563
5564 if (operands[1] == constm1_rtx)
5565 {
5566 /* For -1 << N, we can avoid the shld instruction, because we
5567 know that we're shifting 0...31/63 ones into a -1. */
5568 emit_move_insn (low[0], constm1_rtx);
5569 if (optimize_insn_for_size_p ())
5570 emit_move_insn (high[0], low[0]);
5571 else
5572 emit_move_insn (high[0], constm1_rtx);
5573 }
5574 else
5575 {
5576 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5577
5578 if (!rtx_equal_p (operands[0], operands[1]))
5579 emit_move_insn (operands[0], operands[1]);
5580
5581 split_double_mode (mode, operands, 1, low, high);
5582 emit_insn (gen_shld (high[0], low[0], operands[2]));
5583 }
5584
5585 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5586
5587 if (TARGET_CMOVE && scratch)
5588 {
2bf6d935 5589 ix86_expand_clear (scratch);
5590 emit_insn (gen_x86_shift_adj_1
5591 (half_mode, high[0], low[0], operands[2], scratch));
5592 }
5593 else
987a3082 5594 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5595}
5596
5597void
5598ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5599{
5600 rtx (*gen_ashr3)(rtx, rtx, rtx)
5601 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5602 rtx (*gen_shrd)(rtx, rtx, rtx);
5603 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5604
5605 rtx low[2], high[2];
5606 int count;
5607
5608 if (CONST_INT_P (operands[2]))
5609 {
5610 split_double_mode (mode, operands, 2, low, high);
5611 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5612
5613 if (count == GET_MODE_BITSIZE (mode) - 1)
5614 {
5615 emit_move_insn (high[0], high[1]);
5616 emit_insn (gen_ashr3 (high[0], high[0],
5617 GEN_INT (half_width - 1)));
5618 emit_move_insn (low[0], high[0]);
5619
5620 }
5621 else if (count >= half_width)
5622 {
5623 emit_move_insn (low[0], high[1]);
5624 emit_move_insn (high[0], low[0]);
5625 emit_insn (gen_ashr3 (high[0], high[0],
5626 GEN_INT (half_width - 1)));
5627
5628 if (count > half_width)
5629 emit_insn (gen_ashr3 (low[0], low[0],
5630 GEN_INT (count - half_width)));
5631 }
5632 else
5633 {
5634 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5635
5636 if (!rtx_equal_p (operands[0], operands[1]))
5637 emit_move_insn (operands[0], operands[1]);
5638
5639 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5640 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5641 }
5642 }
5643 else
5644 {
5645 machine_mode half_mode;
5646
5647 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5648
5649 if (!rtx_equal_p (operands[0], operands[1]))
5650 emit_move_insn (operands[0], operands[1]);
5651
5652 split_double_mode (mode, operands, 1, low, high);
987a3082 5653 half_mode = mode == DImode ? SImode : DImode;
5654
5655 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5656 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5657
5658 if (TARGET_CMOVE && scratch)
5659 {
5660 emit_move_insn (scratch, high[0]);
5661 emit_insn (gen_ashr3 (scratch, scratch,
5662 GEN_INT (half_width - 1)));
5663 emit_insn (gen_x86_shift_adj_1
5664 (half_mode, low[0], high[0], operands[2], scratch));
5665 }
5666 else
5667 emit_insn (gen_x86_shift_adj_3
5668 (half_mode, low[0], high[0], operands[2]));
5669 }
5670}
5671
5672void
5673ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5674{
5675 rtx (*gen_lshr3)(rtx, rtx, rtx)
5676 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5677 rtx (*gen_shrd)(rtx, rtx, rtx);
5678 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5679
5680 rtx low[2], high[2];
5681 int count;
5682
5683 if (CONST_INT_P (operands[2]))
5684 {
5685 split_double_mode (mode, operands, 2, low, high);
5686 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5687
5688 if (count >= half_width)
5689 {
5690 emit_move_insn (low[0], high[1]);
5691 ix86_expand_clear (high[0]);
5692
5693 if (count > half_width)
5694 emit_insn (gen_lshr3 (low[0], low[0],
5695 GEN_INT (count - half_width)));
5696 }
5697 else
5698 {
5699 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5700
5701 if (!rtx_equal_p (operands[0], operands[1]))
5702 emit_move_insn (operands[0], operands[1]);
5703
5704 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5705 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5706 }
5707 }
5708 else
5709 {
5710 machine_mode half_mode;
5711
5712 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5713
5714 if (!rtx_equal_p (operands[0], operands[1]))
5715 emit_move_insn (operands[0], operands[1]);
5716
5717 split_double_mode (mode, operands, 1, low, high);
987a3082 5718 half_mode = mode == DImode ? SImode : DImode;
5719
5720 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5721 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5722
5723 if (TARGET_CMOVE && scratch)
5724 {
2bf6d935 5725 ix86_expand_clear (scratch);
5726 emit_insn (gen_x86_shift_adj_1
5727 (half_mode, low[0], high[0], operands[2], scratch));
5728 }
5729 else
5730 emit_insn (gen_x86_shift_adj_2
5731 (half_mode, low[0], high[0], operands[2]));
5732 }
5733}
5734
5735/* Return mode for the memcpy/memset loop counter. Prefer SImode over
5736 DImode for constant loop counts. */
5737
5738static machine_mode
5739counter_mode (rtx count_exp)
5740{
5741 if (GET_MODE (count_exp) != VOIDmode)
5742 return GET_MODE (count_exp);
5743 if (!CONST_INT_P (count_exp))
5744 return Pmode;
5745 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5746 return DImode;
5747 return SImode;
5748}
5749
 5750 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
 5751 by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
 5752 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
 5753 equivalent loop to set memory to VALUE (supposed to be in MODE).
 5754
 5755 The size is rounded down to a whole number of chunks moved at once.
 5756 SRCMEM and DESTMEM provide the MEM rtxes needed to feed proper aliasing info. */
5757
5758
5759static void
76715c32 5760expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5761 rtx destptr, rtx srcptr, rtx value,
5762 rtx count, machine_mode mode, int unroll,
5763 int expected_size, bool issetmem)
5764{
5765 rtx_code_label *out_label, *top_label;
5766 rtx iter, tmp;
5767 machine_mode iter_mode = counter_mode (count);
5768 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5769 rtx piece_size = GEN_INT (piece_size_n);
5770 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5771 rtx size;
5772 int i;
5773
5774 top_label = gen_label_rtx ();
5775 out_label = gen_label_rtx ();
5776 iter = gen_reg_rtx (iter_mode);
5777
5778 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5779 NULL, 1, OPTAB_DIRECT);
5780 /* Those two should combine. */
5781 if (piece_size == const1_rtx)
5782 {
5783 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5784 true, out_label);
5785 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5786 }
5787 emit_move_insn (iter, const0_rtx);
5788
5789 emit_label (top_label);
5790
5791 tmp = convert_modes (Pmode, iter_mode, iter, true);
5792
 5793 /* This assert could be relaxed - in that case we'll need to compute
 5794 the smallest power of two contained in PIECE_SIZE_N and pass it to
 5795 offset_address. */
5796 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5797 destmem = offset_address (destmem, tmp, piece_size_n);
5798 destmem = adjust_address (destmem, mode, 0);
5799
5800 if (!issetmem)
5801 {
5802 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5803 srcmem = adjust_address (srcmem, mode, 0);
5804
 5805 /* When unrolling for chips that reorder memory reads and writes,
 5806 we can save registers by using a single temporary.
 5807 Also, using 4 temporaries is overkill in 32-bit mode. */
5808 if (!TARGET_64BIT && 0)
5809 {
5810 for (i = 0; i < unroll; i++)
5811 {
5812 if (i)
5813 {
5814 destmem = adjust_address (copy_rtx (destmem), mode,
5815 GET_MODE_SIZE (mode));
5816 srcmem = adjust_address (copy_rtx (srcmem), mode,
5817 GET_MODE_SIZE (mode));
5818 }
5819 emit_move_insn (destmem, srcmem);
5820 }
5821 }
5822 else
5823 {
5824 rtx tmpreg[4];
5825 gcc_assert (unroll <= 4);
5826 for (i = 0; i < unroll; i++)
5827 {
5828 tmpreg[i] = gen_reg_rtx (mode);
5829 if (i)
5830 srcmem = adjust_address (copy_rtx (srcmem), mode,
5831 GET_MODE_SIZE (mode));
5832 emit_move_insn (tmpreg[i], srcmem);
5833 }
5834 for (i = 0; i < unroll; i++)
5835 {
5836 if (i)
5837 destmem = adjust_address (copy_rtx (destmem), mode,
5838 GET_MODE_SIZE (mode));
5839 emit_move_insn (destmem, tmpreg[i]);
5840 }
5841 }
5842 }
5843 else
5844 for (i = 0; i < unroll; i++)
5845 {
5846 if (i)
5847 destmem = adjust_address (copy_rtx (destmem), mode,
5848 GET_MODE_SIZE (mode));
5849 emit_move_insn (destmem, value);
5850 }
5851
5852 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5853 true, OPTAB_LIB_WIDEN);
5854 if (tmp != iter)
5855 emit_move_insn (iter, tmp);
5856
5857 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5858 true, top_label);
5859 if (expected_size != -1)
5860 {
5861 expected_size /= GET_MODE_SIZE (mode) * unroll;
5862 if (expected_size == 0)
5863 predict_jump (0);
5864 else if (expected_size > REG_BR_PROB_BASE)
5865 predict_jump (REG_BR_PROB_BASE - 1);
5866 else
5867 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5868 / expected_size);
5869 }
5870 else
5871 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5872 iter = ix86_zero_extend_to_Pmode (iter);
5873 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5874 true, OPTAB_LIB_WIDEN);
5875 if (tmp != destptr)
5876 emit_move_insn (destptr, tmp);
5877 if (!issetmem)
5878 {
5879 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5880 true, OPTAB_LIB_WIDEN);
5881 if (tmp != srcptr)
5882 emit_move_insn (srcptr, tmp);
5883 }
5884 emit_label (out_label);
5885}
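/* Schematically (a rough C-level sketch, not the exact RTL), the loop
   emitted above for the copy case is:

     size = count & ~(piece_size_n - 1);
     iter = 0;
     do
       {
	 copy piece_size_n bytes from srcptr + iter to destptr + iter
	   (as UNROLL moves of MODE through temporaries);
	 iter += piece_size_n;
       }
     while (iter < size);
     destptr += iter;
     srcptr += iter;

   with the copies replaced by stores of VALUE for the memset case.  */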
5886
5887/* Divide COUNTREG by SCALE. */
5888static rtx
5889scale_counter (rtx countreg, int scale)
5890{
5891 rtx sc;
5892
5893 if (scale == 1)
5894 return countreg;
5895 if (CONST_INT_P (countreg))
5896 return GEN_INT (INTVAL (countreg) / scale);
5897 gcc_assert (REG_P (countreg));
5898
5899 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5900 GEN_INT (exact_log2 (scale)),
5901 NULL, 1, OPTAB_DIRECT);
5902 return sc;
5903}
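/* E.g. in scale_counter above, a constant COUNT of 37 with SCALE 4 yields
   9, the number of whole 4-byte chunks (the remaining bytes are not
   covered here), while a register COUNT is simply shifted right by
   exact_log2 (SCALE).  */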
5904
5905/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5906 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5907 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
 5908 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
 5909 ORIG_VALUE is the original value passed to memset to fill the memory with.
 5910 Other arguments have the same meaning as for the previous function. */
5911
5912static void
76715c32 5913expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5914 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5915 rtx count,
5916 machine_mode mode, bool issetmem)
5917{
5918 rtx destexp;
5919 rtx srcexp;
5920 rtx countreg;
5921 HOST_WIDE_INT rounded_count;
5922
5923 /* If possible, it is shorter to use rep movs.
5924 TODO: Maybe it is better to move this logic to decide_alg. */
5925 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5926 && (!issetmem || orig_value == const0_rtx))
5927 mode = SImode;
5928
5929 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5930 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5931
5932 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5933 GET_MODE_SIZE (mode)));
5934 if (mode != QImode)
5935 {
5936 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5937 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5938 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5939 }
5940 else
5941 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5942 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5943 {
5944 rounded_count
5945 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5946 destmem = shallow_copy_rtx (destmem);
5947 set_mem_size (destmem, rounded_count);
5948 }
5949 else if (MEM_SIZE_KNOWN_P (destmem))
5950 clear_mem_size (destmem);
5951
5952 if (issetmem)
5953 {
5954 value = force_reg (mode, gen_lowpart (mode, value));
5955 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5956 }
5957 else
5958 {
5959 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5960 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
5961 if (mode != QImode)
5962 {
5963 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
5964 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5965 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
5966 }
5967 else
5968 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
5969 if (CONST_INT_P (count))
5970 {
5971 rounded_count
5972 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5973 srcmem = shallow_copy_rtx (srcmem);
5974 set_mem_size (srcmem, rounded_count);
5975 }
5976 else
5977 {
5978 if (MEM_SIZE_KNOWN_P (srcmem))
5979 clear_mem_size (srcmem);
5980 }
5981 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
5982 destexp, srcexp));
5983 }
5984}
5985
5986/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
5987 DESTMEM.
 5988 SRCMEM is passed by pointer so it can be updated on return.
 5989 The return value is the updated DESTMEM. */
5990static rtx
5991emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
5992 HOST_WIDE_INT size_to_move)
5993{
c3185b64 5994 rtx dst = destmem, src = *srcmem, tempreg;
5995 enum insn_code code;
5996 machine_mode move_mode;
5997 int piece_size, i;
5998
5999 /* Find the widest mode in which we could perform moves.
 6000 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
 6001 it until a move of that size is supported. */
6002 piece_size = 1 << floor_log2 (size_to_move);
6003 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6004 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6005 {
6006 gcc_assert (piece_size > 1);
6007 piece_size >>= 1;
6008 }
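  /* For instance, a 16-byte SIZE_TO_MOVE on a target without a 16-byte
     move pattern ends up with piece_size 8, so two 8-byte moves (each
     through a temporary register) are emitted further below.  */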
6009
6010 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6011 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6012 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6013 {
6014 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6015 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6016 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6017 {
6018 move_mode = word_mode;
6019 piece_size = GET_MODE_SIZE (move_mode);
6020 code = optab_handler (mov_optab, move_mode);
6021 }
6022 }
6023 gcc_assert (code != CODE_FOR_nothing);
6024
6025 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6026 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6027
 6028 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6029 gcc_assert (size_to_move % piece_size == 0);
c3185b64 6030
6031 for (i = 0; i < size_to_move; i += piece_size)
6032 {
6033 /* We move from memory to memory, so we'll need to do it via
6034 a temporary register. */
6035 tempreg = gen_reg_rtx (move_mode);
6036 emit_insn (GEN_FCN (code) (tempreg, src));
6037 emit_insn (GEN_FCN (code) (dst, tempreg));
6038
6039 emit_move_insn (destptr,
c3185b64 6040 plus_constant (Pmode, copy_rtx (destptr), piece_size));
2bf6d935 6041 emit_move_insn (srcptr,
c3185b64 6042 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6043
6044 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6045 piece_size);
6046 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6047 piece_size);
6048 }
6049
6050 /* Update DST and SRC rtx. */
6051 *srcmem = src;
6052 return dst;
6053}
6054
6055/* Helper function for the string operations below. Test VARIABLE for whether
 6056 it is aligned to VALUE bytes. If so, jump to the returned label. */
6057
6058static rtx_code_label *
6059ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6060{
6061 rtx_code_label *label = gen_label_rtx ();
6062 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6063 if (GET_MODE (variable) == DImode)
6064 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6065 else
6066 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6067 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6068 1, label);
6069 if (epilogue)
6070 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6071 else
6072 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6073 return label;
6074}
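
/* For example, ix86_expand_aligntest (count, 2, true) masks COUNT with 2 and
   jumps past the caller's 2-byte chunk (to the returned label) when bit 1 is
   clear; the EPILOGUE flag only changes the branch-probability hint.  */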
6075
6076
6077/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6078
6079static void
76715c32 6080expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6081 rtx destptr, rtx srcptr, rtx count, int max_size)
6082{
6083 rtx src, dest;
6084 if (CONST_INT_P (count))
6085 {
6086 HOST_WIDE_INT countval = INTVAL (count);
6087 HOST_WIDE_INT epilogue_size = countval % max_size;
6088 int i;
6089
6090 /* For now MAX_SIZE should be a power of 2. This assert could be
6091 relaxed, but it'll require a bit more complicated epilogue
6092 expanding. */
6093 gcc_assert ((max_size & (max_size - 1)) == 0);
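      /* For instance, countval 23 with max_size 16 leaves an epilogue of
         7 bytes, emitted below as one 4-byte, one 2-byte and one 1-byte
         copy.  */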
6094 for (i = max_size; i >= 1; i >>= 1)
6095 {
6096 if (epilogue_size & i)
6097 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6098 }
6099 return;
6100 }
6101 if (max_size > 8)
6102 {
6103 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6104 count, 1, OPTAB_DIRECT);
76715c32 6105 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6106 count, QImode, 1, 4, false);
6107 return;
6108 }
6109
 6110 /* When single string operations are available, we can cheaply advance the
 6111 dest and src pointers. Otherwise we save code size by maintaining an offset
 6112 (zero is readily available from the preceding rep operation) and using x86
 6113 addressing modes. */
6114 if (TARGET_SINGLE_STRINGOP)
6115 {
6116 if (max_size > 4)
6117 {
6118 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6119 src = change_address (srcmem, SImode, srcptr);
6120 dest = change_address (destmem, SImode, destptr);
6121 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6122 emit_label (label);
6123 LABEL_NUSES (label) = 1;
6124 }
6125 if (max_size > 2)
6126 {
6127 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6128 src = change_address (srcmem, HImode, srcptr);
6129 dest = change_address (destmem, HImode, destptr);
6130 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6131 emit_label (label);
6132 LABEL_NUSES (label) = 1;
6133 }
6134 if (max_size > 1)
6135 {
6136 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6137 src = change_address (srcmem, QImode, srcptr);
6138 dest = change_address (destmem, QImode, destptr);
6139 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6140 emit_label (label);
6141 LABEL_NUSES (label) = 1;
6142 }
6143 }
6144 else
6145 {
6146 rtx offset = force_reg (Pmode, const0_rtx);
6147 rtx tmp;
6148
6149 if (max_size > 4)
6150 {
6151 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6152 src = change_address (srcmem, SImode, srcptr);
6153 dest = change_address (destmem, SImode, destptr);
6154 emit_move_insn (dest, src);
6155 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6156 true, OPTAB_LIB_WIDEN);
6157 if (tmp != offset)
6158 emit_move_insn (offset, tmp);
6159 emit_label (label);
6160 LABEL_NUSES (label) = 1;
6161 }
6162 if (max_size > 2)
6163 {
6164 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6165 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6166 src = change_address (srcmem, HImode, tmp);
6167 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6168 dest = change_address (destmem, HImode, tmp);
6169 emit_move_insn (dest, src);
6170 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6171 true, OPTAB_LIB_WIDEN);
6172 if (tmp != offset)
6173 emit_move_insn (offset, tmp);
6174 emit_label (label);
6175 LABEL_NUSES (label) = 1;
6176 }
6177 if (max_size > 1)
6178 {
6179 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6180 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6181 src = change_address (srcmem, QImode, tmp);
6182 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6183 dest = change_address (destmem, QImode, tmp);
6184 emit_move_insn (dest, src);
6185 emit_label (label);
6186 LABEL_NUSES (label) = 1;
6187 }
6188 }
6189}
6190
6191/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6192 with value PROMOTED_VAL.
 6193 There is no source operand here; the return value is the updated
 6194 DESTMEM. */
6195static rtx
6196emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6197 HOST_WIDE_INT size_to_move)
6198{
c3185b64 6199 rtx dst = destmem;
6200 enum insn_code code;
6201 machine_mode move_mode;
6202 int piece_size, i;
6203
 6204 /* Find the widest mode in which we could perform moves.
 6205 Start with the mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE
 6206 is smaller than that mode's size. */
6207 move_mode = GET_MODE (promoted_val);
6208 if (move_mode == VOIDmode)
6209 move_mode = QImode;
6210 if (size_to_move < GET_MODE_SIZE (move_mode))
6211 {
6212 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6213 move_mode = int_mode_for_size (move_bits, 0).require ();
6214 promoted_val = gen_lowpart (move_mode, promoted_val);
6215 }
6216 piece_size = GET_MODE_SIZE (move_mode);
6217 code = optab_handler (mov_optab, move_mode);
6218 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6219
6220 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6221
 6222 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6223 gcc_assert (size_to_move % piece_size == 0);
c3185b64 6224
6225 for (i = 0; i < size_to_move; i += piece_size)
6226 {
6227 if (piece_size <= GET_MODE_SIZE (word_mode))
6228 {
6229 emit_insn (gen_strset (destptr, dst, promoted_val));
6230 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6231 piece_size);
6232 continue;
6233 }
6234
6235 emit_insn (GEN_FCN (code) (dst, promoted_val));
6236
6237 emit_move_insn (destptr,
c3185b64 6238 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6239
6240 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6241 piece_size);
6242 }
6243
6244 /* Update DST rtx. */
6245 return dst;
6246}
6247/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6248static void
6249expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6250 rtx count, int max_size)
6251{
6252 count = expand_simple_binop (counter_mode (count), AND, count,
6253 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
76715c32 6254 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6255 gen_lowpart (QImode, value), count, QImode,
6256 1, max_size / 2, true);
6257}
6258
6259/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6260static void
6261expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6262 rtx count, int max_size)
6263{
6264 rtx dest;
6265
6266 if (CONST_INT_P (count))
6267 {
6268 HOST_WIDE_INT countval = INTVAL (count);
6269 HOST_WIDE_INT epilogue_size = countval % max_size;
6270 int i;
6271
6272 /* For now MAX_SIZE should be a power of 2. This assert could be
6273 relaxed, but it'll require a bit more complicated epilogue
6274 expanding. */
6275 gcc_assert ((max_size & (max_size - 1)) == 0);
6276 for (i = max_size; i >= 1; i >>= 1)
6277 {
6278 if (epilogue_size & i)
6279 {
6280 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6281 destmem = emit_memset (destmem, destptr, vec_value, i);
6282 else
6283 destmem = emit_memset (destmem, destptr, value, i);
6284 }
6285 }
6286 return;
6287 }
6288 if (max_size > 32)
6289 {
6290 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6291 return;
6292 }
6293 if (max_size > 16)
6294 {
6295 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6296 if (TARGET_64BIT)
6297 {
6298 dest = change_address (destmem, DImode, destptr);
6299 emit_insn (gen_strset (destptr, dest, value));
6300 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6301 emit_insn (gen_strset (destptr, dest, value));
6302 }
6303 else
6304 {
6305 dest = change_address (destmem, SImode, destptr);
6306 emit_insn (gen_strset (destptr, dest, value));
6307 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6308 emit_insn (gen_strset (destptr, dest, value));
6309 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6310 emit_insn (gen_strset (destptr, dest, value));
6311 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6312 emit_insn (gen_strset (destptr, dest, value));
6313 }
6314 emit_label (label);
6315 LABEL_NUSES (label) = 1;
6316 }
6317 if (max_size > 8)
6318 {
6319 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6320 if (TARGET_64BIT)
6321 {
6322 dest = change_address (destmem, DImode, destptr);
6323 emit_insn (gen_strset (destptr, dest, value));
6324 }
6325 else
6326 {
6327 dest = change_address (destmem, SImode, destptr);
6328 emit_insn (gen_strset (destptr, dest, value));
6329 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6330 emit_insn (gen_strset (destptr, dest, value));
6331 }
6332 emit_label (label);
6333 LABEL_NUSES (label) = 1;
6334 }
6335 if (max_size > 4)
6336 {
6337 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6338 dest = change_address (destmem, SImode, destptr);
6339 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6340 emit_label (label);
6341 LABEL_NUSES (label) = 1;
6342 }
6343 if (max_size > 2)
6344 {
6345 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6346 dest = change_address (destmem, HImode, destptr);
6347 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6348 emit_label (label);
6349 LABEL_NUSES (label) = 1;
6350 }
6351 if (max_size > 1)
6352 {
6353 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6354 dest = change_address (destmem, QImode, destptr);
6355 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6356 emit_label (label);
6357 LABEL_NUSES (label) = 1;
6358 }
6359}
6360
6361/* Decrease COUNTREG by VALUE. */
6362static void
6363ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6364{
83bc5e44 6365 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6366}
6367
6368/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6369 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6370 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6371 ignored.
6372 Return value is updated DESTMEM. */
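/* For example, with ALIGN 1 and DESIRED_ALIGNMENT 8 the loop below emits
   three alignment tests (on bits 1, 2 and 4 of DESTPTR), each guarding a
   1-, 2- or 4-byte copy or store plus the matching COUNT adjustment.  */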
6373
6374static rtx
76715c32 6375expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6376 rtx destptr, rtx srcptr, rtx value,
6377 rtx vec_value, rtx count, int align,
6378 int desired_alignment, bool issetmem)
6379{
6380 int i;
6381 for (i = 1; i < desired_alignment; i <<= 1)
6382 {
6383 if (align <= i)
6384 {
6385 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6386 if (issetmem)
6387 {
6388 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6389 destmem = emit_memset (destmem, destptr, vec_value, i);
6390 else
6391 destmem = emit_memset (destmem, destptr, value, i);
6392 }
6393 else
6394 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6395 ix86_adjust_counter (count, i);
6396 emit_label (label);
6397 LABEL_NUSES (label) = 1;
6398 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6399 }
6400 }
6401 return destmem;
6402}
6403
6404/* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
 6405 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6406 and jump to DONE_LABEL. */
6407static void
76715c32 6408expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6409 rtx destptr, rtx srcptr,
6410 rtx value, rtx vec_value,
6411 rtx count, int size,
6412 rtx done_label, bool issetmem)
6413{
6414 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6415 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6416 rtx modesize;
6417 int n;
6418
 6419 /* If we do not have a vector value to copy, we must reduce the size. */
6420 if (issetmem)
6421 {
6422 if (!vec_value)
6423 {
6424 if (GET_MODE (value) == VOIDmode && size > 8)
6425 mode = Pmode;
6426 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6427 mode = GET_MODE (value);
6428 }
6429 else
6430 mode = GET_MODE (vec_value), value = vec_value;
6431 }
6432 else
6433 {
6434 /* Choose appropriate vector mode. */
6435 if (size >= 32)
6436 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6437 else if (size >= 16)
6438 mode = TARGET_SSE ? V16QImode : DImode;
6439 srcmem = change_address (srcmem, mode, srcptr);
6440 }
6441 destmem = change_address (destmem, mode, destptr);
6442 modesize = GEN_INT (GET_MODE_SIZE (mode));
6443 gcc_assert (GET_MODE_SIZE (mode) <= size);
6444 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6445 {
6446 if (issetmem)
6447 emit_move_insn (destmem, gen_lowpart (mode, value));
6448 else
6449 {
6450 emit_move_insn (destmem, srcmem);
6451 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6452 }
6453 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6454 }
6455
6456 destmem = offset_address (destmem, count, 1);
6457 destmem = offset_address (destmem, GEN_INT (-2 * size),
6458 GET_MODE_SIZE (mode));
6459 if (!issetmem)
6460 {
6461 srcmem = offset_address (srcmem, count, 1);
6462 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6463 GET_MODE_SIZE (mode));
6464 }
6465 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6466 {
6467 if (issetmem)
6468 emit_move_insn (destmem, gen_lowpart (mode, value));
6469 else
6470 {
6471 emit_move_insn (destmem, srcmem);
6472 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6473 }
6474 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6475 }
6476 emit_jump_insn (gen_jump (done_label));
6477 emit_barrier ();
6478
6479 emit_label (label);
6480 LABEL_NUSES (label) = 1;
6481}
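
/* A minimal sketch (hypothetical helper, not part of the original file) of
   the overlapping-copy trick relied upon above: for any N with
   SIZE <= N < 2*SIZE, one block copy at the start and one ending at the last
   byte cover the whole range without a loop or a length-dependent branch.  */
static inline void
copy_size_to_2size_example (unsigned char *dst, const unsigned char *src,
                            size_t n, size_t size)
{
  __builtin_memcpy (dst, src, size);
  __builtin_memcpy (dst + n - size, src + n - size, size);
}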
6482
6483/* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
 6484 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
 6485 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
 6486 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6487 DONE_LABEL is a label after the whole copying sequence. The label is created
6488 on demand if *DONE_LABEL is NULL.
6489 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6490 bounds after the initial copies.
6491
 6492 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
 6493 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6494 we will dispatch to a library call for large blocks.
6495
6496 In pseudocode we do:
6497
6498 if (COUNT < SIZE)
6499 {
6500 Assume that SIZE is 4. Bigger sizes are handled analogously
6501 if (COUNT & 4)
6502 {
6503 copy 4 bytes from SRCPTR to DESTPTR
6504 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6505 goto done_label
6506 }
6507 if (!COUNT)
6508 goto done_label;
6509 copy 1 byte from SRCPTR to DESTPTR
6510 if (COUNT & 2)
6511 {
6512 copy 2 bytes from SRCPTR to DESTPTR
6513 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6514 }
6515 }
6516 else
6517 {
6518 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6519 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6520
 6521 OLD_DESTPTR = DESTPTR;
 6522 Align DESTPTR up to DESIRED_ALIGN
 6523 SRCPTR += DESTPTR - OLD_DESTPTR
 6524 COUNT -= DESTPTR - OLD_DESTPTR
6525 if (DYNAMIC_CHECK)
6526 Round COUNT down to multiple of SIZE
6527 << optional caller supplied zero size guard is here >>
6528 << optional caller supplied dynamic check is here >>
6529 << caller supplied main copy loop is here >>
6530 }
6531 done_label:
6532 */
6533static void
76715c32 6534expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6535 rtx *destptr, rtx *srcptr,
6536 machine_mode mode,
6537 rtx value, rtx vec_value,
6538 rtx *count,
6539 rtx_code_label **done_label,
6540 int size,
6541 int desired_align,
6542 int align,
6543 unsigned HOST_WIDE_INT *min_size,
6544 bool dynamic_check,
6545 bool issetmem)
6546{
6547 rtx_code_label *loop_label = NULL, *label;
6548 int n;
6549 rtx modesize;
6550 int prolog_size = 0;
6551 rtx mode_value;
6552
 6553 /* Choose the proper value to copy. */
6554 if (issetmem && VECTOR_MODE_P (mode))
6555 mode_value = vec_value;
6556 else
6557 mode_value = value;
6558 gcc_assert (GET_MODE_SIZE (mode) <= size);
6559
6560 /* See if block is big or small, handle small blocks. */
6561 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6562 {
6563 int size2 = size;
6564 loop_label = gen_label_rtx ();
6565
6566 if (!*done_label)
6567 *done_label = gen_label_rtx ();
6568
6569 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6570 1, loop_label);
6571 size2 >>= 1;
6572
6573 /* Handle sizes > 3. */
6574 for (;size2 > 2; size2 >>= 1)
76715c32 6575 expand_small_cpymem_or_setmem (destmem, srcmem,
2bf6d935
ML
6576 *destptr, *srcptr,
6577 value, vec_value,
6578 *count,
6579 size2, *done_label, issetmem);
 6580 /* Nothing to copy? Jump to DONE_LABEL if so. */
6581 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6582 1, *done_label);
6583
6584 /* Do a byte copy. */
6585 destmem = change_address (destmem, QImode, *destptr);
6586 if (issetmem)
6587 emit_move_insn (destmem, gen_lowpart (QImode, value));
6588 else
6589 {
6590 srcmem = change_address (srcmem, QImode, *srcptr);
6591 emit_move_insn (destmem, srcmem);
6592 }
6593
6594 /* Handle sizes 2 and 3. */
6595 label = ix86_expand_aligntest (*count, 2, false);
6596 destmem = change_address (destmem, HImode, *destptr);
6597 destmem = offset_address (destmem, *count, 1);
6598 destmem = offset_address (destmem, GEN_INT (-2), 2);
6599 if (issetmem)
6600 emit_move_insn (destmem, gen_lowpart (HImode, value));
6601 else
6602 {
6603 srcmem = change_address (srcmem, HImode, *srcptr);
6604 srcmem = offset_address (srcmem, *count, 1);
6605 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6606 emit_move_insn (destmem, srcmem);
6607 }
6608
6609 emit_label (label);
6610 LABEL_NUSES (label) = 1;
6611 emit_jump_insn (gen_jump (*done_label));
6612 emit_barrier ();
6613 }
6614 else
6615 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6616 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6617
6618 /* Start memcpy for COUNT >= SIZE. */
6619 if (loop_label)
6620 {
6621 emit_label (loop_label);
6622 LABEL_NUSES (loop_label) = 1;
6623 }
6624
6625 /* Copy first desired_align bytes. */
6626 if (!issetmem)
6627 srcmem = change_address (srcmem, mode, *srcptr);
6628 destmem = change_address (destmem, mode, *destptr);
6629 modesize = GEN_INT (GET_MODE_SIZE (mode));
6630 for (n = 0; prolog_size < desired_align - align; n++)
6631 {
6632 if (issetmem)
6633 emit_move_insn (destmem, mode_value);
6634 else
6635 {
6636 emit_move_insn (destmem, srcmem);
6637 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6638 }
6639 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6640 prolog_size += GET_MODE_SIZE (mode);
6641 }
6642
6643
6644 /* Copy last SIZE bytes. */
6645 destmem = offset_address (destmem, *count, 1);
6646 destmem = offset_address (destmem,
6647 GEN_INT (-size - prolog_size),
6648 1);
6649 if (issetmem)
6650 emit_move_insn (destmem, mode_value);
6651 else
6652 {
6653 srcmem = offset_address (srcmem, *count, 1);
6654 srcmem = offset_address (srcmem,
6655 GEN_INT (-size - prolog_size),
6656 1);
6657 emit_move_insn (destmem, srcmem);
6658 }
6659 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6660 {
6661 destmem = offset_address (destmem, modesize, 1);
6662 if (issetmem)
6663 emit_move_insn (destmem, mode_value);
6664 else
6665 {
6666 srcmem = offset_address (srcmem, modesize, 1);
6667 emit_move_insn (destmem, srcmem);
6668 }
6669 }
6670
6671 /* Align destination. */
6672 if (desired_align > 1 && desired_align > align)
6673 {
6674 rtx saveddest = *destptr;
6675
6676 gcc_assert (desired_align <= size);
6677 /* Align destptr up, place it to new register. */
6678 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6679 GEN_INT (prolog_size),
6680 NULL_RTX, 1, OPTAB_DIRECT);
6681 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6682 REG_POINTER (*destptr) = 1;
6683 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6684 GEN_INT (-desired_align),
6685 *destptr, 1, OPTAB_DIRECT);
6686 /* See how many bytes we skipped. */
6687 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6688 *destptr,
6689 saveddest, 1, OPTAB_DIRECT);
6690 /* Adjust srcptr and count. */
6691 if (!issetmem)
6692 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6693 saveddest, *srcptr, 1, OPTAB_DIRECT);
6694 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6695 saveddest, *count, 1, OPTAB_DIRECT);
6696 /* We copied at most size + prolog_size. */
6697 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6698 *min_size
6699 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6700 else
6701 *min_size = 0;
6702
6703 /* Our loops always round down the block size, but for dispatch to
 6704 the library we need the precise value. */
6705 if (dynamic_check)
6706 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6707 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6708 }
6709 else
6710 {
6711 gcc_assert (prolog_size == 0);
6712 /* Decrease count, so we won't end up copying last word twice. */
6713 if (!CONST_INT_P (*count))
6714 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6715 constm1_rtx, *count, 1, OPTAB_DIRECT);
6716 else
6717 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6718 (unsigned HOST_WIDE_INT)size));
6719 if (*min_size)
6720 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6721 }
6722}
6723
6724
6725/* This function is like the previous one, except here we know how many bytes
6726 need to be copied. That allows us to update alignment not only of DST, which
6727 is returned, but also of SRC, which is passed as a pointer for that
6728 reason. */
6729static rtx
76715c32 6730expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6731 rtx srcreg, rtx value, rtx vec_value,
6732 int desired_align, int align_bytes,
6733 bool issetmem)
6734{
6735 rtx src = NULL;
6736 rtx orig_dst = dst;
6737 rtx orig_src = NULL;
6738 int piece_size = 1;
6739 int copied_bytes = 0;
6740
6741 if (!issetmem)
6742 {
6743 gcc_assert (srcp != NULL);
6744 src = *srcp;
6745 orig_src = src;
6746 }
6747
6748 for (piece_size = 1;
6749 piece_size <= desired_align && copied_bytes < align_bytes;
6750 piece_size <<= 1)
6751 {
6752 if (align_bytes & piece_size)
6753 {
6754 if (issetmem)
6755 {
6756 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6757 dst = emit_memset (dst, destreg, vec_value, piece_size);
6758 else
6759 dst = emit_memset (dst, destreg, value, piece_size);
6760 }
6761 else
6762 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6763 copied_bytes += piece_size;
6764 }
6765 }
6766 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6767 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6768 if (MEM_SIZE_KNOWN_P (orig_dst))
6769 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6770
6771 if (!issetmem)
6772 {
6773 int src_align_bytes = get_mem_align_offset (src, desired_align
6774 * BITS_PER_UNIT);
6775 if (src_align_bytes >= 0)
6776 src_align_bytes = desired_align - src_align_bytes;
6777 if (src_align_bytes >= 0)
6778 {
6779 unsigned int src_align;
6780 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6781 {
6782 if ((src_align_bytes & (src_align - 1))
6783 == (align_bytes & (src_align - 1)))
6784 break;
6785 }
6786 if (src_align > (unsigned int) desired_align)
6787 src_align = desired_align;
6788 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6789 set_mem_align (src, src_align * BITS_PER_UNIT);
6790 }
6791 if (MEM_SIZE_KNOWN_P (orig_src))
6792 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6793 *srcp = src;
6794 }
6795
6796 return dst;
6797}
6798
6799/* Return true if ALG can be used in current context.
6800 Assume we expand memset if MEMSET is true. */
6801static bool
6802alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6803{
6804 if (alg == no_stringop)
6805 return false;
6806 if (alg == vector_loop)
6807 return TARGET_SSE || TARGET_AVX;
6808 /* Algorithms using the rep prefix want at least edi and ecx;
6809 additionally, memset wants eax and memcpy wants esi. Don't
6810 consider such algorithms if the user has appropriated those
6811 registers for their own purposes, or if we have a non-default
6812 address space, since some string insns cannot override the segment. */
6813 if (alg == rep_prefix_1_byte
6814 || alg == rep_prefix_4_byte
6815 || alg == rep_prefix_8_byte)
6816 {
6817 if (have_as)
6818 return false;
6819 if (fixed_regs[CX_REG]
6820 || fixed_regs[DI_REG]
6821 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6822 return false;
6823 }
6824 return true;
6825}
6826
6827/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6828static enum stringop_alg
6829decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6830 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6831 bool memset, bool zero_memset, bool have_as,
6832 int *dynamic_check, bool *noalign, bool recur)
6833{
6834 const struct stringop_algs *algs;
6835 bool optimize_for_speed;
6836 int max = 0;
6837 const struct processor_costs *cost;
6838 int i;
6839 bool any_alg_usable_p = false;
6840
6841 *noalign = false;
6842 *dynamic_check = -1;
6843
6844 /* Even if the string operation call is cold, we still might spend a lot
6845 of time processing large blocks. */
6846 if (optimize_function_for_size_p (cfun)
6847 || (optimize_insn_for_size_p ()
6848 && (max_size < 256
6849 || (expected_size != -1 && expected_size < 256))))
6850 optimize_for_speed = false;
6851 else
6852 optimize_for_speed = true;
6853
6854 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6855 if (memset)
6856 algs = &cost->memset[TARGET_64BIT != 0];
6857 else
6858 algs = &cost->memcpy[TARGET_64BIT != 0];
6859
6860 /* See maximal size for user defined algorithm. */
6861 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6862 {
6863 enum stringop_alg candidate = algs->size[i].alg;
6864 bool usable = alg_usable_p (candidate, memset, have_as);
6865 any_alg_usable_p |= usable;
6866
6867 if (candidate != libcall && candidate && usable)
6868 max = algs->size[i].max;
6869 }
6870
 6871 /* If the expected size is not known but the max size is small enough
 6872 that the inline version is a win, set the expected size to the middle
 6873 of the range. */
6874 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6875 && expected_size == -1)
6876 expected_size = min_size / 2 + max_size / 2;
6877
6878 /* If user specified the algorithm, honor it if possible. */
6879 if (ix86_stringop_alg != no_stringop
6880 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6881 return ix86_stringop_alg;
6882 /* rep; movq or rep; movl is the smallest variant. */
6883 else if (!optimize_for_speed)
6884 {
6885 *noalign = true;
6886 if (!count || (count & 3) || (memset && !zero_memset))
6887 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6888 ? rep_prefix_1_byte : loop_1_byte;
6889 else
6890 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6891 ? rep_prefix_4_byte : loop;
6892 }
 6893 /* Very tiny blocks are best handled via the loop, since REP is expensive
 6894 to set up. */
6895 else if (expected_size != -1 && expected_size < 4)
6896 return loop_1_byte;
6897 else if (expected_size != -1)
6898 {
6899 enum stringop_alg alg = libcall;
6900 bool alg_noalign = false;
6901 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6902 {
6903 /* We get here if the algorithms that were not libcall-based
6904 were rep-prefix based and we are unable to use rep prefixes
6905 based on global register usage. Break out of the loop and
6906 use the heuristic below. */
6907 if (algs->size[i].max == 0)
6908 break;
6909 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6910 {
6911 enum stringop_alg candidate = algs->size[i].alg;
6912
6913 if (candidate != libcall
6914 && alg_usable_p (candidate, memset, have_as))
6915 {
6916 alg = candidate;
6917 alg_noalign = algs->size[i].noalign;
6918 }
6919 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6920 last non-libcall inline algorithm. */
6921 if (TARGET_INLINE_ALL_STRINGOPS)
6922 {
6923 /* When the current size is best to be copied by a libcall,
6924 but we are still forced to inline, run the heuristic below
6925 that will pick code for medium sized blocks. */
6926 if (alg != libcall)
6927 {
6928 *noalign = alg_noalign;
6929 return alg;
6930 }
6931 else if (!any_alg_usable_p)
6932 break;
6933 }
6934 else if (alg_usable_p (candidate, memset, have_as))
6935 {
6936 *noalign = algs->size[i].noalign;
6937 return candidate;
6938 }
6939 }
6940 }
6941 }
 6942 /* When asked to inline the call anyway, try to pick a meaningful choice.
 6943 We look for the maximal block size that is faster to copy by hand and
 6944 take blocks of at most that size, guessing that the average size will
 6945 be roughly half of that size.
6946
6947 If this turns out to be bad, we might simply specify the preferred
6948 choice in ix86_costs. */
6949 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6950 && (algs->unknown_size == libcall
6951 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6952 {
6953 enum stringop_alg alg;
6954 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6955
6956 /* If there aren't any usable algorithms or if recursing already,
6957 then recursing on smaller sizes or same size isn't going to
6958 find anything. Just return the simple byte-at-a-time copy loop. */
6959 if (!any_alg_usable_p || recur)
6960 {
6961 /* Pick something reasonable. */
6962 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
6963 *dynamic_check = 128;
6964 return loop_1_byte;
6965 }
6966 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
6967 zero_memset, have_as, dynamic_check, noalign, true);
6968 gcc_assert (*dynamic_check == -1);
6969 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6970 *dynamic_check = max;
6971 else
6972 gcc_assert (alg != libcall);
6973 return alg;
6974 }
6975 return (alg_usable_p (algs->unknown_size, memset, have_as)
6976 ? algs->unknown_size : libcall);
6977}
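
/* For illustration (not in the original source): with a hypothetical cost
   table whose size entries are {{256, rep_prefix_4_byte, false},
   {-1, libcall, false}}, an expected size of 100 bytes makes the loop above
   pick rep_prefix_4_byte, while a larger expected size falls back to the
   libcall (or to the TARGET_INLINE_ALL_STRINGOPS heuristics).  */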
6978
6979/* Decide on alignment. We know that the operand is already aligned to ALIGN
6980 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
6981static int
6982decide_alignment (int align,
6983 enum stringop_alg alg,
6984 int expected_size,
6985 machine_mode move_mode)
6986{
6987 int desired_align = 0;
6988
6989 gcc_assert (alg != no_stringop);
6990
6991 if (alg == libcall)
6992 return 0;
6993 if (move_mode == VOIDmode)
6994 return 0;
6995
6996 desired_align = GET_MODE_SIZE (move_mode);
 6997 /* PentiumPro has special logic that triggers for 8-byte aligned blocks,
 6998 copying a whole cache line at once. */
6999 if (TARGET_PENTIUMPRO
7000 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7001 desired_align = 8;
7002
7003 if (optimize_size)
7004 desired_align = 1;
7005 if (desired_align < align)
7006 desired_align = align;
7007 if (expected_size != -1 && expected_size < 4)
7008 desired_align = align;
7009
7010 return desired_align;
7011}
7012
7013
7014/* Helper function for memset. For the QImode value 0xXY produce
 7015 0xXYXYXYXY of the width specified by MODE. This is essentially
 7016 a * 0x01010101, but we can do slightly better than
7017 synth_mult by unwinding the sequence by hand on CPUs with
7018 slow multiply. */
7019static rtx
7020promote_duplicated_reg (machine_mode mode, rtx val)
7021{
7022 machine_mode valmode = GET_MODE (val);
7023 rtx tmp;
7024 int nops = mode == DImode ? 3 : 2;
7025
7026 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7027 if (val == const0_rtx)
7028 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7029 if (CONST_INT_P (val))
7030 {
7031 HOST_WIDE_INT v = INTVAL (val) & 255;
7032
7033 v |= v << 8;
7034 v |= v << 16;
7035 if (mode == DImode)
7036 v |= (v << 16) << 16;
7037 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7038 }
7039
7040 if (valmode == VOIDmode)
7041 valmode = QImode;
7042 if (valmode != QImode)
7043 val = gen_lowpart (QImode, val);
7044 if (mode == QImode)
7045 return val;
7046 if (!TARGET_PARTIAL_REG_STALL)
7047 nops--;
7048 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7049 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7050 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7051 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7052 {
7053 rtx reg = convert_modes (mode, QImode, val, true);
7054 tmp = promote_duplicated_reg (mode, const1_rtx);
7055 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7056 OPTAB_DIRECT);
7057 }
7058 else
7059 {
7060 rtx reg = convert_modes (mode, QImode, val, true);
7061
7062 if (!TARGET_PARTIAL_REG_STALL)
7063 if (mode == SImode)
7064 emit_insn (gen_insvsi_1 (reg, reg));
7065 else
7066 emit_insn (gen_insvdi_1 (reg, reg));
7067 else
7068 {
7069 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7070 NULL, 1, OPTAB_DIRECT);
7071 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7072 OPTAB_DIRECT);
7073 }
7074 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7075 NULL, 1, OPTAB_DIRECT);
7076 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7077 if (mode == SImode)
7078 return reg;
7079 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7080 NULL, 1, OPTAB_DIRECT);
7081 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7082 return reg;
7083 }
7084}
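
/* A minimal sketch (hypothetical helper, not part of the original file) of
   the value pattern built above, computed with plain host arithmetic:
   byte 0xAB promoted to a 4-byte mode gives 0xABABABAB, to an 8-byte mode
   0xABABABABABABABAB.  */
static inline unsigned HOST_WIDE_INT
promote_duplicated_example (unsigned char byte, int mode_bytes)
{
  unsigned HOST_WIDE_INT v = byte;
  v |= v << 8;                  /* 0x00AB -> 0xABAB.  */
  v |= v << 16;                 /* 0xABAB -> 0xABABABAB.  */
  if (mode_bytes == 8)
    v |= (v << 16) << 16;       /* -> 0xABABABABABABABAB.  */
  return v;
}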
7085
7086/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
 7087 will be needed by the main loop copying SIZE_NEEDED chunks and by the
 7088 prologue raising the alignment from ALIGN to DESIRED_ALIGN. */
7089static rtx
7090promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7091 int align)
7092{
7093 rtx promoted_val;
7094
7095 if (TARGET_64BIT
7096 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7097 promoted_val = promote_duplicated_reg (DImode, val);
7098 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7099 promoted_val = promote_duplicated_reg (SImode, val);
7100 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7101 promoted_val = promote_duplicated_reg (HImode, val);
7102 else
7103 promoted_val = val;
7104
7105 return promoted_val;
7106}
7107
7108/* Copy the address to a Pmode register. This is used for x32 to
7109 truncate DImode TLS address to a SImode register. */
7110
7111static rtx
7112ix86_copy_addr_to_reg (rtx addr)
7113{
7114 rtx reg;
7115 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7116 {
7117 reg = copy_addr_to_reg (addr);
7118 REG_POINTER (reg) = 1;
7119 return reg;
7120 }
7121 else
7122 {
7123 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7124 reg = copy_to_mode_reg (DImode, addr);
7125 REG_POINTER (reg) = 1;
7126 return gen_rtx_SUBREG (SImode, reg, 0);
7127 }
7128}
7129
7130/* Expand string move (memcpy) or store (memset) operation. Use i386 string
7131 operations when profitable. The code depends upon architecture, block size
7132 and alignment, but always has one of the following overall structures:
7133
7134 Aligned move sequence:
7135
7136 1) Prologue guard: Conditional that jumps up to epilogues for small
 7137 blocks that can be handled by the epilogue alone. This is faster
 7138 but also needed for correctness, since the prologue assumes the block
7139 is larger than the desired alignment.
7140
7141 Optional dynamic check for size and libcall for large
7142 blocks is emitted here too, with -minline-stringops-dynamically.
7143
7144 2) Prologue: copy first few bytes in order to get destination
7145 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7146 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7147 copied. We emit either a jump tree on power of two sized
7148 blocks, or a byte loop.
7149
7150 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7151 with specified algorithm.
7152
7153 4) Epilogue: code copying tail of the block that is too small to be
7154 handled by main body (or up to size guarded by prologue guard).
7155
7156 Misaligned move sequence
7157
 7158 1) misaligned move prologue/epilogue containing:
7159 a) Prologue handling small memory blocks and jumping to done_label
7160 (skipped if blocks are known to be large enough)
 7161 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes, if alignment
 7162 is needed, by a single possibly misaligned move
7163 (skipped if alignment is not needed)
7164 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7165
7166 2) Zero size guard dispatching to done_label, if needed
7167
 7168 3) Dispatch to a library call, if needed
 7169
 7170 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
 7171 with the specified algorithm. */
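/* As a concrete illustration: a 64-bit memcpy expanded with the unrolled_loop
   strategy uses word_mode (DImode) moves with unroll factor 4, so SIZE_NEEDED
   is 32 bytes; the prologue aligns the destination, the main loop copies
   32-byte chunks, and the epilogue handles the remaining tail of up to 31
   bytes.  */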
7172bool
76715c32 7173ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7174 rtx align_exp, rtx expected_align_exp,
7175 rtx expected_size_exp, rtx min_size_exp,
7176 rtx max_size_exp, rtx probable_max_size_exp,
7177 bool issetmem)
7178{
7179 rtx destreg;
7180 rtx srcreg = NULL;
7181 rtx_code_label *label = NULL;
7182 rtx tmp;
7183 rtx_code_label *jump_around_label = NULL;
7184 HOST_WIDE_INT align = 1;
7185 unsigned HOST_WIDE_INT count = 0;
7186 HOST_WIDE_INT expected_size = -1;
7187 int size_needed = 0, epilogue_size_needed;
7188 int desired_align = 0, align_bytes = 0;
7189 enum stringop_alg alg;
7190 rtx promoted_val = NULL;
7191 rtx vec_promoted_val = NULL;
7192 bool force_loopy_epilogue = false;
7193 int dynamic_check;
7194 bool need_zero_guard = false;
7195 bool noalign;
7196 machine_mode move_mode = VOIDmode;
7197 machine_mode wider_mode;
7198 int unroll_factor = 1;
7199 /* TODO: Once value ranges are available, fill in proper data. */
7200 unsigned HOST_WIDE_INT min_size = 0;
7201 unsigned HOST_WIDE_INT max_size = -1;
7202 unsigned HOST_WIDE_INT probable_max_size = -1;
7203 bool misaligned_prologue_used = false;
7204 bool have_as;
7205
7206 if (CONST_INT_P (align_exp))
7207 align = INTVAL (align_exp);
 7208 /* i386 can do misaligned access at a reasonably small extra cost. */
7209 if (CONST_INT_P (expected_align_exp)
7210 && INTVAL (expected_align_exp) > align)
7211 align = INTVAL (expected_align_exp);
7212 /* ALIGN is the minimum of destination and source alignment, but we care here
7213 just about destination alignment. */
7214 else if (!issetmem
7215 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7216 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7217
7218 if (CONST_INT_P (count_exp))
7219 {
7220 min_size = max_size = probable_max_size = count = expected_size
7221 = INTVAL (count_exp);
7222 /* When COUNT is 0, there is nothing to do. */
7223 if (!count)
7224 return true;
7225 }
7226 else
7227 {
7228 if (min_size_exp)
7229 min_size = INTVAL (min_size_exp);
7230 if (max_size_exp)
7231 max_size = INTVAL (max_size_exp);
7232 if (probable_max_size_exp)
7233 probable_max_size = INTVAL (probable_max_size_exp);
7234 if (CONST_INT_P (expected_size_exp))
7235 expected_size = INTVAL (expected_size_exp);
7236 }
7237
7238 /* Make sure we don't need to care about overflow later on. */
7239 if (count > (HOST_WIDE_INT_1U << 30))
7240 return false;
7241
7242 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7243 if (!issetmem)
7244 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7245
7246 /* Step 0: Decide on preferred algorithm, desired alignment and
7247 size of chunks to be copied by main loop. */
7248 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7249 issetmem,
7250 issetmem && val_exp == const0_rtx, have_as,
7251 &dynamic_check, &noalign, false);
7252
7253 if (dump_file)
7254 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7255 stringop_alg_names[alg]);
7256
7257 if (alg == libcall)
7258 return false;
7259 gcc_assert (alg != no_stringop);
7260
 7261 /* For now the vector version of memset is generated only for memory zeroing,
 7262 as creating the promoted vector value is very cheap in this case. */
7263 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7264 alg = unrolled_loop;
7265
7266 if (!count)
7267 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7268 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7269 if (!issetmem)
7270 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7271
7272 unroll_factor = 1;
7273 move_mode = word_mode;
7274 switch (alg)
7275 {
7276 case libcall:
7277 case no_stringop:
7278 case last_alg:
7279 gcc_unreachable ();
7280 case loop_1_byte:
7281 need_zero_guard = true;
7282 move_mode = QImode;
7283 break;
7284 case loop:
7285 need_zero_guard = true;
7286 break;
7287 case unrolled_loop:
7288 need_zero_guard = true;
7289 unroll_factor = (TARGET_64BIT ? 4 : 2);
7290 break;
7291 case vector_loop:
7292 need_zero_guard = true;
7293 unroll_factor = 4;
7294 /* Find the widest supported mode. */
7295 move_mode = word_mode;
7296 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7297 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7298 move_mode = wider_mode;
7299
586bbef1 7300 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7301 move_mode = TImode;
7302
7303 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7304 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7305 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7306 {
7307 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7308 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7309 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7310 move_mode = word_mode;
7311 }
7312 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7313 break;
7314 case rep_prefix_8_byte:
7315 move_mode = DImode;
7316 break;
7317 case rep_prefix_4_byte:
7318 move_mode = SImode;
7319 break;
7320 case rep_prefix_1_byte:
7321 move_mode = QImode;
7322 break;
7323 }
7324 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7325 epilogue_size_needed = size_needed;
7326
7327 /* If we are going to call any library calls conditionally, make sure any
7328 pending stack adjustment happen before the first conditional branch,
7329 otherwise they will be emitted before the library call only and won't
7330 happen from the other branches. */
7331 if (dynamic_check != -1)
7332 do_pending_stack_adjust ();
7333
7334 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7335 if (!TARGET_ALIGN_STRINGOPS || noalign)
7336 align = desired_align;
7337
7338 /* Step 1: Prologue guard. */
7339
7340 /* Alignment code needs count to be in register. */
7341 if (CONST_INT_P (count_exp) && desired_align > align)
7342 {
7343 if (INTVAL (count_exp) > desired_align
7344 && INTVAL (count_exp) > size_needed)
7345 {
7346 align_bytes
7347 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7348 if (align_bytes <= 0)
7349 align_bytes = 0;
7350 else
7351 align_bytes = desired_align - align_bytes;
7352 }
7353 if (align_bytes == 0)
7354 count_exp = force_reg (counter_mode (count_exp), count_exp);
7355 }
7356 gcc_assert (desired_align >= 1 && align >= 1);
7357
7358 /* Misaligned move sequences handle both prologue and epilogue at once.
 7359 Default code generation results in smaller code for large alignments
 7360 and also avoids redundant work when sizes are known precisely. */
7361 misaligned_prologue_used
7362 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7363 && MAX (desired_align, epilogue_size_needed) <= 32
7364 && desired_align <= epilogue_size_needed
7365 && ((desired_align > align && !align_bytes)
7366 || (!count && epilogue_size_needed > 1)));
7367
7368 /* Do the cheap promotion to allow better CSE across the
 7369 main loop and epilogue (i.e. one load of the big constant in
 7370 front of all the code).
 7371 For now the misaligned move sequences do not have a fast path
7372 without broadcasting. */
7373 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7374 {
7375 if (alg == vector_loop)
7376 {
7377 gcc_assert (val_exp == const0_rtx);
7378 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7379 promoted_val = promote_duplicated_reg_to_size (val_exp,
7380 GET_MODE_SIZE (word_mode),
7381 desired_align, align);
7382 }
7383 else
7384 {
7385 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7386 desired_align, align);
7387 }
7388 }
 7389 /* Misaligned move sequences handle both prologues and epilogues at once.
 7390 Default code generation results in smaller code for large alignments and
 7391 also avoids redundant work when sizes are known precisely. */
7392 if (misaligned_prologue_used)
7393 {
 7394 /* The misaligned move prologue handles small blocks by itself. */
76715c32 7395 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7396 (dst, src, &destreg, &srcreg,
7397 move_mode, promoted_val, vec_promoted_val,
7398 &count_exp,
7399 &jump_around_label,
7400 desired_align < align
7401 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7402 desired_align, align, &min_size, dynamic_check, issetmem);
7403 if (!issetmem)
7404 src = change_address (src, BLKmode, srcreg);
7405 dst = change_address (dst, BLKmode, destreg);
7406 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7407 epilogue_size_needed = 0;
7408 if (need_zero_guard
7409 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7410 {
7411 /* It is possible that we copied enough so the main loop will not
7412 execute. */
7413 gcc_assert (size_needed > 1);
7414 if (jump_around_label == NULL_RTX)
7415 jump_around_label = gen_label_rtx ();
7416 emit_cmp_and_jump_insns (count_exp,
7417 GEN_INT (size_needed),
7418 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7419 if (expected_size == -1
7420 || expected_size < (desired_align - align) / 2 + size_needed)
7421 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7422 else
7423 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7424 }
7425 }
7426 /* Ensure that alignment prologue won't copy past end of block. */
7427 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7428 {
7429 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7430 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7431 Make sure it is power of 2. */
7432 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7433
 7434 /* To improve the performance of small blocks, we jump around the VAL
 7435 promoting code. This means that if the promoted VAL is not constant,
 7436 we might not use it in the epilogue and have to fall back to the byte
 7437 loop variant. */
7438 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7439 force_loopy_epilogue = true;
7440 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7441 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7442 {
7443 /* If main algorithm works on QImode, no epilogue is needed.
7444 For small sizes just don't align anything. */
7445 if (size_needed == 1)
7446 desired_align = align;
7447 else
7448 goto epilogue;
7449 }
7450 else if (!count
7451 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7452 {
7453 label = gen_label_rtx ();
7454 emit_cmp_and_jump_insns (count_exp,
7455 GEN_INT (epilogue_size_needed),
7456 LTU, 0, counter_mode (count_exp), 1, label);
7457 if (expected_size == -1 || expected_size < epilogue_size_needed)
7458 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7459 else
7460 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7461 }
7462 }
7463
7464 /* Emit code to decide on runtime whether library call or inline should be
7465 used. */
7466 if (dynamic_check != -1)
7467 {
7468 if (!issetmem && CONST_INT_P (count_exp))
7469 {
7470 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7471 {
7472 emit_block_copy_via_libcall (dst, src, count_exp);
7473 count_exp = const0_rtx;
7474 goto epilogue;
7475 }
7476 }
7477 else
7478 {
7479 rtx_code_label *hot_label = gen_label_rtx ();
7480 if (jump_around_label == NULL_RTX)
7481 jump_around_label = gen_label_rtx ();
7482 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7483 LEU, 0, counter_mode (count_exp),
7484 1, hot_label);
7485 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7486 if (issetmem)
7487 set_storage_via_libcall (dst, count_exp, val_exp);
7488 else
7489 emit_block_copy_via_libcall (dst, src, count_exp);
7490 emit_jump (jump_around_label);
7491 emit_label (hot_label);
7492 }
7493 }
7494
7495 /* Step 2: Alignment prologue. */
7496 /* Do the expensive promotion once we branched off the small blocks. */
7497 if (issetmem && !promoted_val)
7498 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7499 desired_align, align);
7500
7501 if (desired_align > align && !misaligned_prologue_used)
7502 {
7503 if (align_bytes == 0)
7504 {
 7505 /* Except for the first move in the prologue, we no longer know
 7506 the constant offset in the aliasing info. It doesn't seem worth
 7507 the pain to maintain it for the first move, so throw away
7508 the info early. */
7509 dst = change_address (dst, BLKmode, destreg);
7510 if (!issetmem)
7511 src = change_address (src, BLKmode, srcreg);
76715c32 7512 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7513 promoted_val, vec_promoted_val,
7514 count_exp, align, desired_align,
7515 issetmem);
7516 /* At most desired_align - align bytes are copied. */
7517 if (min_size < (unsigned)(desired_align - align))
7518 min_size = 0;
7519 else
7520 min_size -= desired_align - align;
7521 }
7522 else
7523 {
7524 /* If we know how many bytes need to be stored before dst is
7525 sufficiently aligned, maintain aliasing info accurately. */
76715c32 7526 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7527 srcreg,
7528 promoted_val,
7529 vec_promoted_val,
7530 desired_align,
7531 align_bytes,
7532 issetmem);
7533
7534 count_exp = plus_constant (counter_mode (count_exp),
7535 count_exp, -align_bytes);
7536 count -= align_bytes;
7537 min_size -= align_bytes;
7538 max_size -= align_bytes;
7539 }
7540 if (need_zero_guard
7541 && min_size < (unsigned HOST_WIDE_INT) size_needed
7542 && (count < (unsigned HOST_WIDE_INT) size_needed
7543 || (align_bytes == 0
7544 && count < ((unsigned HOST_WIDE_INT) size_needed
7545 + desired_align - align))))
7546 {
7547 /* It is possible that we copied enough so the main loop will not
7548 execute. */
7549 gcc_assert (size_needed > 1);
7550 if (label == NULL_RTX)
7551 label = gen_label_rtx ();
7552 emit_cmp_and_jump_insns (count_exp,
7553 GEN_INT (size_needed),
7554 LTU, 0, counter_mode (count_exp), 1, label);
7555 if (expected_size == -1
7556 || expected_size < (desired_align - align) / 2 + size_needed)
7557 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7558 else
7559 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7560 }
7561 }
7562 if (label && size_needed == 1)
7563 {
7564 emit_label (label);
7565 LABEL_NUSES (label) = 1;
7566 label = NULL;
7567 epilogue_size_needed = 1;
7568 if (issetmem)
7569 promoted_val = val_exp;
7570 }
7571 else if (label == NULL_RTX && !misaligned_prologue_used)
7572 epilogue_size_needed = size_needed;
7573
7574 /* Step 3: Main loop. */
7575
7576 switch (alg)
7577 {
7578 case libcall:
7579 case no_stringop:
7580 case last_alg:
7581 gcc_unreachable ();
7582 case loop_1_byte:
7583 case loop:
7584 case unrolled_loop:
76715c32 7585 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7586 count_exp, move_mode, unroll_factor,
7587 expected_size, issetmem);
7588 break;
7589 case vector_loop:
76715c32 7590 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7591 vec_promoted_val, count_exp, move_mode,
7592 unroll_factor, expected_size, issetmem);
7593 break;
7594 case rep_prefix_8_byte:
7595 case rep_prefix_4_byte:
7596 case rep_prefix_1_byte:
76715c32 7597 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7598 val_exp, count_exp, move_mode, issetmem);
7599 break;
7600 }
7601 /* Adjust properly the offset of src and dest memory for aliasing. */
7602 if (CONST_INT_P (count_exp))
7603 {
7604 if (!issetmem)
7605 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7606 (count / size_needed) * size_needed);
7607 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7608 (count / size_needed) * size_needed);
7609 }
7610 else
7611 {
7612 if (!issetmem)
7613 src = change_address (src, BLKmode, srcreg);
7614 dst = change_address (dst, BLKmode, destreg);
7615 }
7616
7617 /* Step 4: Epilogue to copy the remaining bytes. */
7618 epilogue:
7619 if (label)
7620 {
 7621      /* When the main loop is done, COUNT_EXP might hold the original count,
 7622	 while we want to copy only COUNT_EXP % SIZE_NEEDED bytes.
 7623	 The epilogue code will actually copy COUNT_EXP % EPILOGUE_SIZE_NEEDED
 7624	 bytes.  Compensate if needed.  */
7625
7626 if (size_needed < epilogue_size_needed)
7627 {
7628 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7629 GEN_INT (size_needed - 1), count_exp, 1,
7630 OPTAB_DIRECT);
7631 if (tmp != count_exp)
7632 emit_move_insn (count_exp, tmp);
7633 }
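	  /* For example, with size_needed == 16 and an original count of 100,
	     the main loop leaves 100 & 15 == 4 bytes, so count_exp is masked
	     down to 4 before the epilogue runs; the AND with size_needed - 1
	     relies on size_needed being a power of two.  */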
7634 emit_label (label);
7635 LABEL_NUSES (label) = 1;
7636 }
7637
7638 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7639 {
7640 if (force_loopy_epilogue)
7641 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7642 epilogue_size_needed);
7643 else
7644 {
7645 if (issetmem)
7646 expand_setmem_epilogue (dst, destreg, promoted_val,
7647 vec_promoted_val, count_exp,
7648 epilogue_size_needed);
7649 else
76715c32 7650 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7651 epilogue_size_needed);
7652 }
7653 }
7654 if (jump_around_label)
7655 emit_label (jump_around_label);
7656 return true;
7657}
7658
7659
7660/* Expand the appropriate insns for doing strlen if not just doing
7661 repnz; scasb
7662
7663 out = result, initialized with the start address
7664 align_rtx = alignment of the address.
 7665   scratch = scratch register, initialized with the start address when
7666 not aligned, otherwise undefined
7667
7668 This is just the body. It needs the initializations mentioned above and
7669 some address computing at the end. These things are done in i386.md. */
7670
7671static void
7672ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7673{
7674 int align;
7675 rtx tmp;
7676 rtx_code_label *align_2_label = NULL;
7677 rtx_code_label *align_3_label = NULL;
7678 rtx_code_label *align_4_label = gen_label_rtx ();
7679 rtx_code_label *end_0_label = gen_label_rtx ();
7680 rtx mem;
7681 rtx tmpreg = gen_reg_rtx (SImode);
7682 rtx scratch = gen_reg_rtx (SImode);
7683 rtx cmp;
7684
7685 align = 0;
7686 if (CONST_INT_P (align_rtx))
7687 align = INTVAL (align_rtx);
7688
7689 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7690
7691 /* Is there a known alignment and is it less than 4? */
7692 if (align < 4)
7693 {
7694 rtx scratch1 = gen_reg_rtx (Pmode);
7695 emit_move_insn (scratch1, out);
7696 /* Is there a known alignment and is it not 2? */
7697 if (align != 2)
7698 {
7699 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
7700 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
7701
7702 /* Leave just the 3 lower bits. */
7703 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7704 NULL_RTX, 0, OPTAB_WIDEN);
7705
7706 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7707 Pmode, 1, align_4_label);
7708 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7709 Pmode, 1, align_2_label);
7710 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7711 Pmode, 1, align_3_label);
7712 }
7713 else
7714 {
 7715	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
 7716	     check whether it is aligned to a 4-byte boundary.  */
7717
7718 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7719 NULL_RTX, 0, OPTAB_WIDEN);
7720
7721 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7722 Pmode, 1, align_4_label);
7723 }
7724
7725 mem = change_address (src, QImode, out);
7726
7727 /* Now compare the bytes. */
7728
 7729      /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7730 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7731 QImode, 1, end_0_label);
7732
7733 /* Increment the address. */
d9330fb5 7734 emit_insn (gen_add2_insn (out, const1_rtx));
7735
7736 /* Not needed with an alignment of 2 */
7737 if (align != 2)
7738 {
7739 emit_label (align_2_label);
7740
7741 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7742 end_0_label);
7743
d9330fb5 7744 emit_insn (gen_add2_insn (out, const1_rtx));
7745
7746 emit_label (align_3_label);
7747 }
7748
7749 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7750 end_0_label);
7751
d9330fb5 7752 emit_insn (gen_add2_insn (out, const1_rtx));
7753 }
7754
 7755   /* Generate a loop to check 4 bytes at a time.  It is not a good idea
 7756      to align this loop; doing so only enlarges the program and does not
 7757      improve speed.  */
7758 emit_label (align_4_label);
7759
7760 mem = change_address (src, SImode, out);
7761 emit_move_insn (scratch, mem);
d9330fb5 7762 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7763
 7764   /* This formula yields a nonzero result iff one of the bytes is zero.
 7765      This saves three branches inside the loop and many cycles.  */
7766
7767 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7768 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7769 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7770 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7771 gen_int_mode (0x80808080, SImode)));
7772 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7773 align_4_label);
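  /* A worked instance of the test above, which computes
     ((v - 0x01010101) & ~v) & 0x80808080:

	 v = 0x41420043			(contains one zero byte)
	 v - 0x01010101	 = 0x4040ff42
	 ~v		 = 0xbebdffbc
	 AND of the two	 = 0x0000ff00
	 & 0x80808080	 = 0x00008000	-> nonzero, fall out of the loop

     while v = 0x41424344, with no zero byte, gives 0 and the loop
     continues.  */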
7774
7775 if (TARGET_CMOVE)
7776 {
7777 rtx reg = gen_reg_rtx (SImode);
7778 rtx reg2 = gen_reg_rtx (Pmode);
7779 emit_move_insn (reg, tmpreg);
7780 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7781
7782 /* If zero is not in the first two bytes, move two bytes forward. */
7783 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7784 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7785 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7786 emit_insn (gen_rtx_SET (tmpreg,
7787 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7788 reg,
7789 tmpreg)));
7790 /* Emit lea manually to avoid clobbering of flags. */
c3185b64 7791 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7792
7793 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7794 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7795 emit_insn (gen_rtx_SET (out,
7796 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7797 reg2,
7798 out)));
7799 }
7800 else
7801 {
7802 rtx_code_label *end_2_label = gen_label_rtx ();
7803 /* Is zero in the first two bytes? */
7804
7805 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7806 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7807 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7808 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7809 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7810 pc_rtx);
7811 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7812 JUMP_LABEL (tmp) = end_2_label;
7813
7814 /* Not in the first two. Move two bytes forward. */
7815 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
d9330fb5 7816 emit_insn (gen_add2_insn (out, const2_rtx));
7817
7818 emit_label (end_2_label);
7819
7820 }
7821
7822 /* Avoid branch in fixing the byte. */
7823 tmpreg = gen_lowpart (QImode, tmpreg);
7824 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7825 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7826 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
d9330fb5 7827 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7828
7829 emit_label (end_0_label);
7830}
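/* A rough C-level model of the code the expander above emits; the real
   output is RTL and the final byte fix-up is done branch-free, so the
   trailing loop below is for clarity only:

     const unsigned char *p = start;
     while (((uintptr_t) p & 3) != 0)	// check at most 3 unaligned bytes
       {
	 if (*p == 0)
	   goto done;
	 p++;
       }
     for (;;)				// then 4 aligned bytes at a time
       {
	 unsigned int v = *(const unsigned int *) p;
	 p += 4;
	 if (((v - 0x01010101U) & ~v) & 0x80808080U)
	   break;			// some byte of V is zero
       }
     p -= 4;				// step back to the loaded word
     while (*p != 0)			// and find the exact zero byte
       p++;
   done:
     out = p;				// address of the terminating NUL  */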
7831
7832/* Expand strlen. */
7833
7834bool
7835ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7836{
7837if (TARGET_UNROLL_STRLEN
7838 && TARGET_INLINE_ALL_STRINGOPS
7839 && eoschar == const0_rtx
7840 && optimize > 1)
7841 {
 7842      /* The generic case of the strlen expander is long.  Avoid expanding
 7843	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
7844 rtx addr = force_reg (Pmode, XEXP (src, 0));
 7845      /* It seems that some optimizers do not combine a call like
 7846	   foo (strlen (bar), strlen (bar));
 7847	 when the move and the subtraction are done here.  The length is
 7848	 still computed just once when these instructions are emitted
 7849	 inside output_strlen_unroll ().  But since &bar[strlen (bar)] is
 7850	 often used, and this needs one fewer register for the lifetime of
 7851	 output_strlen_unroll (), it is a win overall.  */
7852
7853 emit_move_insn (out, addr);
7854
7855 ix86_expand_strlensi_unroll_1 (out, src, align);
7856
7857 /* strlensi_unroll_1 returns the address of the zero at the end of
7858 the string, like memchr(), so compute the length by subtracting
7859 the start address. */
d9330fb5 7860 emit_insn (gen_sub2_insn (out, addr));
7861 return true;
7862 }
7863 else
7864 return false;
7865}
7866
 7867/* For a given symbol (function), construct code to compute the address of
 7868   its PLT entry in the large x86-64 PIC model.  */
7869
7870static rtx
7871construct_plt_address (rtx symbol)
7872{
7873 rtx tmp, unspec;
7874
7875 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7876 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7877 gcc_assert (Pmode == DImode);
7878
7879 tmp = gen_reg_rtx (Pmode);
7880 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7881
7882 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
d9330fb5 7883 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7884 return tmp;
7885}
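/* For a call to foo this emits, in rough RTL terms,

     (set (reg tmp) (const (unspec [foo] UNSPEC_PLTOFF)))
     (set (reg tmp) (plus (reg tmp) pic_offset_table_rtx))

   i.e. the offset of foo's PLT entry from the GOT base added to the GOT
   pointer; the large code model cannot assume that a direct PC-relative
   call reaches the PLT, so the address is materialized explicitly.  */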
7886
7887/* Additional registers that are clobbered by SYSV calls. */
7888
7889static int const x86_64_ms_sysv_extra_clobbered_registers
7890 [NUM_X86_64_MS_CLOBBERED_REGS] =
7891{
7892 SI_REG, DI_REG,
7893 XMM6_REG, XMM7_REG,
7894 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7895 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7896};
7897
7898rtx_insn *
7899ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7900 rtx callarg2,
7901 rtx pop, bool sibcall)
7902{
7903 rtx vec[3];
7904 rtx use = NULL, call;
7905 unsigned int vec_len = 0;
7906 tree fndecl;
7907
7908 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7909 {
7910 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7911 if (fndecl
7912 && (lookup_attribute ("interrupt",
7913 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
a9c697b8 7914 error ("interrupt service routine cannot be called directly");
7915 }
7916 else
7917 fndecl = NULL_TREE;
7918
7919 if (pop == const0_rtx)
7920 pop = NULL;
7921 gcc_assert (!TARGET_64BIT || !pop);
7922
7923 if (TARGET_MACHO && !TARGET_64BIT)
7924 {
7925#if TARGET_MACHO
7926 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7927 fnaddr = machopic_indirect_call_target (fnaddr);
7928#endif
7929 }
7930 else
7931 {
7932 /* Static functions and indirect calls don't need the pic register. Also,
7933 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7934 it an indirect call. */
7935 rtx addr = XEXP (fnaddr, 0);
7936 if (flag_pic
7937 && GET_CODE (addr) == SYMBOL_REF
7938 && !SYMBOL_REF_LOCAL_P (addr))
7939 {
7940 if (flag_plt
7941 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7942 || !lookup_attribute ("noplt",
7943 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7944 {
7945 if (!TARGET_64BIT
7946 || (ix86_cmodel == CM_LARGE_PIC
7947 && DEFAULT_ABI != MS_ABI))
7948 {
7949 use_reg (&use, gen_rtx_REG (Pmode,
7950 REAL_PIC_OFFSET_TABLE_REGNUM));
7951 if (ix86_use_pseudo_pic_reg ())
7952 emit_move_insn (gen_rtx_REG (Pmode,
7953 REAL_PIC_OFFSET_TABLE_REGNUM),
7954 pic_offset_table_rtx);
7955 }
7956 }
7957 else if (!TARGET_PECOFF && !TARGET_MACHO)
7958 {
7959 if (TARGET_64BIT)
7960 {
7961 fnaddr = gen_rtx_UNSPEC (Pmode,
7962 gen_rtvec (1, addr),
7963 UNSPEC_GOTPCREL);
7964 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7965 }
7966 else
7967 {
7968 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
7969 UNSPEC_GOT);
7970 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
7971 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
7972 fnaddr);
7973 }
7974 fnaddr = gen_const_mem (Pmode, fnaddr);
7975 /* Pmode may not be the same as word_mode for x32, which
7976 doesn't support indirect branch via 32-bit memory slot.
7977 Since x32 GOT slot is 64 bit with zero upper 32 bits,
7978 indirect branch via x32 GOT slot is OK. */
7979 if (GET_MODE (fnaddr) != word_mode)
7980 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
7981 fnaddr = gen_rtx_MEM (QImode, fnaddr);
7982 }
7983 }
7984 }
7985
7986 /* Skip setting up RAX register for -mskip-rax-setup when there are no
7987 parameters passed in vector registers. */
7988 if (TARGET_64BIT
7989 && (INTVAL (callarg2) > 0
7990 || (INTVAL (callarg2) == 0
7991 && (TARGET_SSE || !flag_skip_rax_setup))))
7992 {
7993 rtx al = gen_rtx_REG (QImode, AX_REG);
7994 emit_move_insn (al, callarg2);
7995 use_reg (&use, al);
7996 }
7997
7998 if (ix86_cmodel == CM_LARGE_PIC
7999 && !TARGET_PECOFF
8000 && MEM_P (fnaddr)
8001 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8002 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8003 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8004 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8005 branch via x32 GOT slot is OK. */
8006 else if (!(TARGET_X32
8007 && MEM_P (fnaddr)
8008 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8009 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8010 && (sibcall
8011 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8012 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8013 {
8014 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8015 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8016 }
8017
8018 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8019
8020 if (retval)
8021 call = gen_rtx_SET (retval, call);
8022 vec[vec_len++] = call;
8023
8024 if (pop)
8025 {
8026 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8027 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8028 vec[vec_len++] = pop;
8029 }
8030
8031 if (cfun->machine->no_caller_saved_registers
8032 && (!fndecl
8033 || (!TREE_THIS_VOLATILE (fndecl)
8034 && !lookup_attribute ("no_caller_saved_registers",
8035 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8036 {
8037 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8038 bool is_64bit_ms_abi = (TARGET_64BIT
8039 && ix86_function_abi (fndecl) == MS_ABI);
8040 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8041
 8042      /* Since there are no caller-saved registers, add to the clobber list
 8043	 every register that a returning call could clobber.  */
8044 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8045 if (!fixed_regs[i]
8046 && (ix86_call_used_regs[i] == 1
8047 || (ix86_call_used_regs[i] & c_mask))
8048 && !STACK_REGNO_P (i)
8049 && !MMX_REGNO_P (i))
8050 clobber_reg (&use,
8051 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8052 }
8053 else if (TARGET_64BIT_MS_ABI
8054 && (!callarg2 || INTVAL (callarg2) != -2))
8055 {
8056 unsigned i;
8057
8058 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8059 {
8060 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8061 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8062
8063 clobber_reg (&use, gen_rtx_REG (mode, regno));
8064 }
8065
8066 /* Set here, but it may get cleared later. */
8067 if (TARGET_CALL_MS2SYSV_XLOGUES)
8068 {
8069 if (!TARGET_SSE)
8070 ;
8071
8072 /* Don't break hot-patched functions. */
8073 else if (ix86_function_ms_hook_prologue (current_function_decl))
8074 ;
8075
8076 /* TODO: Cases not yet examined. */
8077 else if (flag_split_stack)
8078 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8079
8080 else
8081 {
8082 gcc_assert (!reload_completed);
8083 cfun->machine->call_ms2sysv = true;
8084 }
8085 }
8086 }
8087
8088 if (vec_len > 1)
8089 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8090 rtx_insn *call_insn = emit_call_insn (call);
8091 if (use)
8092 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8093
8094 return call_insn;
8095}
8096
 8097/* Split a simple return that pops POPC bytes from the stack into an
 8098   indirect branch with an explicit stack adjustment.  */
8099
8100void
8101ix86_split_simple_return_pop_internal (rtx popc)
8102{
8103 struct machine_function *m = cfun->machine;
8104 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8105 rtx_insn *insn;
8106
8107 /* There is no "pascal" calling convention in any 64bit ABI. */
8108 gcc_assert (!TARGET_64BIT);
8109
8110 insn = emit_insn (gen_pop (ecx));
8111 m->fs.cfa_offset -= UNITS_PER_WORD;
8112 m->fs.sp_offset -= UNITS_PER_WORD;
8113
8114 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8115 x = gen_rtx_SET (stack_pointer_rtx, x);
8116 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8117 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8118 RTX_FRAME_RELATED_P (insn) = 1;
8119
8120 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8121 x = gen_rtx_SET (stack_pointer_rtx, x);
8122 insn = emit_insn (x);
8123 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8124 RTX_FRAME_RELATED_P (insn) = 1;
8125
8126 /* Now return address is in ECX. */
8127 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8128}
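/* The insns emitted above correspond to assembly along the lines of

     popl  %ecx		# return address -> %ecx
     addl  $POPC, %esp	# drop the POPC bytes of arguments
     jmp   *%ecx	# (or a return thunk, depending on options)

   i.e. a "ret $POPC" rewritten as an indirect branch plus an explicit
   stack adjustment, with the CFA notes kept accurate throughout.  */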
8129
8130/* Errors in the source file can cause expand_expr to return const0_rtx
8131 where we expect a vector. To avoid crashing, use one of the vector
8132 clear instructions. */
8133
8134static rtx
8135safe_vector_operand (rtx x, machine_mode mode)
8136{
8137 if (x == const0_rtx)
8138 x = CONST0_RTX (mode);
8139 return x;
8140}
8141
8142/* Subroutine of ix86_expand_builtin to take care of binop insns. */
8143
8144static rtx
8145ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8146{
8147 rtx pat;
8148 tree arg0 = CALL_EXPR_ARG (exp, 0);
8149 tree arg1 = CALL_EXPR_ARG (exp, 1);
8150 rtx op0 = expand_normal (arg0);
8151 rtx op1 = expand_normal (arg1);
8152 machine_mode tmode = insn_data[icode].operand[0].mode;
8153 machine_mode mode0 = insn_data[icode].operand[1].mode;
8154 machine_mode mode1 = insn_data[icode].operand[2].mode;
8155
8156 if (VECTOR_MODE_P (mode0))
8157 op0 = safe_vector_operand (op0, mode0);
8158 if (VECTOR_MODE_P (mode1))
8159 op1 = safe_vector_operand (op1, mode1);
8160
8161 if (optimize || !target
8162 || GET_MODE (target) != tmode
8163 || !insn_data[icode].operand[0].predicate (target, tmode))
8164 target = gen_reg_rtx (tmode);
8165
8166 if (GET_MODE (op1) == SImode && mode1 == TImode)
8167 {
8168 rtx x = gen_reg_rtx (V4SImode);
8169 emit_insn (gen_sse2_loadd (x, op1));
8170 op1 = gen_lowpart (TImode, x);
8171 }
8172
8173 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8174 op0 = copy_to_mode_reg (mode0, op0);
8175 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8176 op1 = copy_to_mode_reg (mode1, op1);
8177
8178 pat = GEN_FCN (icode) (target, op0, op1);
8179 if (! pat)
8180 return 0;
8181
8182 emit_insn (pat);
8183
8184 return target;
8185}
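/* For instance (assuming the usual intrinsic mapping), _mm_add_epi16 (a, b)
   arrives here as __builtin_ia32_paddw128 and comes out as a single
   addv8hi3 insn, with either operand copied into a register first if the
   insn's predicates reject it as given.  */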
8186
8187/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8188
8189static rtx
8190ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8191 enum ix86_builtin_func_type m_type,
8192 enum rtx_code sub_code)
8193{
8194 rtx pat;
8195 int i;
8196 int nargs;
8197 bool comparison_p = false;
8198 bool tf_p = false;
8199 bool last_arg_constant = false;
8200 int num_memory = 0;
8201 struct {
8202 rtx op;
8203 machine_mode mode;
8204 } args[4];
8205
8206 machine_mode tmode = insn_data[icode].operand[0].mode;
8207
8208 switch (m_type)
8209 {
8210 case MULTI_ARG_4_DF2_DI_I:
8211 case MULTI_ARG_4_DF2_DI_I1:
8212 case MULTI_ARG_4_SF2_SI_I:
8213 case MULTI_ARG_4_SF2_SI_I1:
8214 nargs = 4;
8215 last_arg_constant = true;
8216 break;
8217
8218 case MULTI_ARG_3_SF:
8219 case MULTI_ARG_3_DF:
8220 case MULTI_ARG_3_SF2:
8221 case MULTI_ARG_3_DF2:
8222 case MULTI_ARG_3_DI:
8223 case MULTI_ARG_3_SI:
8224 case MULTI_ARG_3_SI_DI:
8225 case MULTI_ARG_3_HI:
8226 case MULTI_ARG_3_HI_SI:
8227 case MULTI_ARG_3_QI:
8228 case MULTI_ARG_3_DI2:
8229 case MULTI_ARG_3_SI2:
8230 case MULTI_ARG_3_HI2:
8231 case MULTI_ARG_3_QI2:
8232 nargs = 3;
8233 break;
8234
8235 case MULTI_ARG_2_SF:
8236 case MULTI_ARG_2_DF:
8237 case MULTI_ARG_2_DI:
8238 case MULTI_ARG_2_SI:
8239 case MULTI_ARG_2_HI:
8240 case MULTI_ARG_2_QI:
8241 nargs = 2;
8242 break;
8243
8244 case MULTI_ARG_2_DI_IMM:
8245 case MULTI_ARG_2_SI_IMM:
8246 case MULTI_ARG_2_HI_IMM:
8247 case MULTI_ARG_2_QI_IMM:
8248 nargs = 2;
8249 last_arg_constant = true;
8250 break;
8251
8252 case MULTI_ARG_1_SF:
8253 case MULTI_ARG_1_DF:
8254 case MULTI_ARG_1_SF2:
8255 case MULTI_ARG_1_DF2:
8256 case MULTI_ARG_1_DI:
8257 case MULTI_ARG_1_SI:
8258 case MULTI_ARG_1_HI:
8259 case MULTI_ARG_1_QI:
8260 case MULTI_ARG_1_SI_DI:
8261 case MULTI_ARG_1_HI_DI:
8262 case MULTI_ARG_1_HI_SI:
8263 case MULTI_ARG_1_QI_DI:
8264 case MULTI_ARG_1_QI_SI:
8265 case MULTI_ARG_1_QI_HI:
8266 nargs = 1;
8267 break;
8268
8269 case MULTI_ARG_2_DI_CMP:
8270 case MULTI_ARG_2_SI_CMP:
8271 case MULTI_ARG_2_HI_CMP:
8272 case MULTI_ARG_2_QI_CMP:
8273 nargs = 2;
8274 comparison_p = true;
8275 break;
8276
8277 case MULTI_ARG_2_SF_TF:
8278 case MULTI_ARG_2_DF_TF:
8279 case MULTI_ARG_2_DI_TF:
8280 case MULTI_ARG_2_SI_TF:
8281 case MULTI_ARG_2_HI_TF:
8282 case MULTI_ARG_2_QI_TF:
8283 nargs = 2;
8284 tf_p = true;
8285 break;
8286
8287 default:
8288 gcc_unreachable ();
8289 }
8290
8291 if (optimize || !target
8292 || GET_MODE (target) != tmode
8293 || !insn_data[icode].operand[0].predicate (target, tmode))
8294 target = gen_reg_rtx (tmode);
8295 else if (memory_operand (target, tmode))
8296 num_memory++;
8297
8298 gcc_assert (nargs <= 4);
8299
8300 for (i = 0; i < nargs; i++)
8301 {
8302 tree arg = CALL_EXPR_ARG (exp, i);
8303 rtx op = expand_normal (arg);
8304 int adjust = (comparison_p) ? 1 : 0;
8305 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8306
8307 if (last_arg_constant && i == nargs - 1)
8308 {
8309 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8310 {
8311 enum insn_code new_icode = icode;
8312 switch (icode)
8313 {
8314 case CODE_FOR_xop_vpermil2v2df3:
8315 case CODE_FOR_xop_vpermil2v4sf3:
8316 case CODE_FOR_xop_vpermil2v4df3:
8317 case CODE_FOR_xop_vpermil2v8sf3:
8318 error ("the last argument must be a 2-bit immediate");
8319 return gen_reg_rtx (tmode);
8320 case CODE_FOR_xop_rotlv2di3:
8321 new_icode = CODE_FOR_rotlv2di3;
8322 goto xop_rotl;
8323 case CODE_FOR_xop_rotlv4si3:
8324 new_icode = CODE_FOR_rotlv4si3;
8325 goto xop_rotl;
8326 case CODE_FOR_xop_rotlv8hi3:
8327 new_icode = CODE_FOR_rotlv8hi3;
8328 goto xop_rotl;
8329 case CODE_FOR_xop_rotlv16qi3:
8330 new_icode = CODE_FOR_rotlv16qi3;
8331 xop_rotl:
8332 if (CONST_INT_P (op))
8333 {
8334 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8335 op = GEN_INT (INTVAL (op) & mask);
8336 gcc_checking_assert
8337 (insn_data[icode].operand[i + 1].predicate (op, mode));
8338 }
8339 else
8340 {
8341 gcc_checking_assert
8342 (nargs == 2
8343 && insn_data[new_icode].operand[0].mode == tmode
8344 && insn_data[new_icode].operand[1].mode == tmode
8345 && insn_data[new_icode].operand[2].mode == mode
8346 && insn_data[new_icode].operand[0].predicate
8347 == insn_data[icode].operand[0].predicate
8348 && insn_data[new_icode].operand[1].predicate
8349 == insn_data[icode].operand[1].predicate);
8350 icode = new_icode;
8351 goto non_constant;
8352 }
8353 break;
8354 default:
8355 gcc_unreachable ();
8356 }
8357 }
8358 }
8359 else
8360 {
8361 non_constant:
8362 if (VECTOR_MODE_P (mode))
8363 op = safe_vector_operand (op, mode);
8364
8365 /* If we aren't optimizing, only allow one memory operand to be
8366 generated. */
8367 if (memory_operand (op, mode))
8368 num_memory++;
8369
8370 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8371
8372 if (optimize
8373 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8374 || num_memory > 1)
8375 op = force_reg (mode, op);
8376 }
8377
8378 args[i].op = op;
8379 args[i].mode = mode;
8380 }
8381
8382 switch (nargs)
8383 {
8384 case 1:
8385 pat = GEN_FCN (icode) (target, args[0].op);
8386 break;
8387
8388 case 2:
8389 if (tf_p)
8390 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8391 GEN_INT ((int)sub_code));
8392 else if (! comparison_p)
8393 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8394 else
8395 {
8396 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8397 args[0].op,
8398 args[1].op);
8399
8400 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8401 }
8402 break;
8403
8404 case 3:
8405 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8406 break;
8407
8408 case 4:
8409 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8410 break;
8411
8412 default:
8413 gcc_unreachable ();
8414 }
8415
8416 if (! pat)
8417 return 0;
8418
8419 emit_insn (pat);
8420 return target;
8421}
8422
8423/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8424 insns with vec_merge. */
8425
8426static rtx
8427ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8428 rtx target)
8429{
8430 rtx pat;
8431 tree arg0 = CALL_EXPR_ARG (exp, 0);
8432 rtx op1, op0 = expand_normal (arg0);
8433 machine_mode tmode = insn_data[icode].operand[0].mode;
8434 machine_mode mode0 = insn_data[icode].operand[1].mode;
8435
8436 if (optimize || !target
8437 || GET_MODE (target) != tmode
8438 || !insn_data[icode].operand[0].predicate (target, tmode))
8439 target = gen_reg_rtx (tmode);
8440
8441 if (VECTOR_MODE_P (mode0))
8442 op0 = safe_vector_operand (op0, mode0);
8443
8444 if ((optimize && !register_operand (op0, mode0))
8445 || !insn_data[icode].operand[1].predicate (op0, mode0))
8446 op0 = copy_to_mode_reg (mode0, op0);
8447
8448 op1 = op0;
8449 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8450 op1 = copy_to_mode_reg (mode0, op1);
8451
8452 pat = GEN_FCN (icode) (target, op0, op1);
8453 if (! pat)
8454 return 0;
8455 emit_insn (pat);
8456 return target;
8457}
8458
8459/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8460
8461static rtx
8462ix86_expand_sse_compare (const struct builtin_description *d,
8463 tree exp, rtx target, bool swap)
8464{
8465 rtx pat;
8466 tree arg0 = CALL_EXPR_ARG (exp, 0);
8467 tree arg1 = CALL_EXPR_ARG (exp, 1);
8468 rtx op0 = expand_normal (arg0);
8469 rtx op1 = expand_normal (arg1);
8470 rtx op2;
8471 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8472 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8473 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8474 enum rtx_code comparison = d->comparison;
8475
8476 if (VECTOR_MODE_P (mode0))
8477 op0 = safe_vector_operand (op0, mode0);
8478 if (VECTOR_MODE_P (mode1))
8479 op1 = safe_vector_operand (op1, mode1);
8480
8481 /* Swap operands if we have a comparison that isn't available in
8482 hardware. */
8483 if (swap)
8484 std::swap (op0, op1);
8485
8486 if (optimize || !target
8487 || GET_MODE (target) != tmode
8488 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8489 target = gen_reg_rtx (tmode);
8490
8491 if ((optimize && !register_operand (op0, mode0))
8492 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8493 op0 = copy_to_mode_reg (mode0, op0);
8494 if ((optimize && !register_operand (op1, mode1))
8495 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8496 op1 = copy_to_mode_reg (mode1, op1);
8497
8498 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8499 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8500 if (! pat)
8501 return 0;
8502 emit_insn (pat);
8503 return target;
8504}
8505
8506/* Subroutine of ix86_expand_builtin to take care of comi insns. */
8507
8508static rtx
8509ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8510 rtx target)
8511{
8512 rtx pat;
8513 tree arg0 = CALL_EXPR_ARG (exp, 0);
8514 tree arg1 = CALL_EXPR_ARG (exp, 1);
8515 rtx op0 = expand_normal (arg0);
8516 rtx op1 = expand_normal (arg1);
8517 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8518 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8519 enum rtx_code comparison = d->comparison;
8520
8521 if (VECTOR_MODE_P (mode0))
8522 op0 = safe_vector_operand (op0, mode0);
8523 if (VECTOR_MODE_P (mode1))
8524 op1 = safe_vector_operand (op1, mode1);
8525
8526 /* Swap operands if we have a comparison that isn't available in
8527 hardware. */
8528 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8529 std::swap (op0, op1);
8530
8531 target = gen_reg_rtx (SImode);
8532 emit_move_insn (target, const0_rtx);
8533 target = gen_rtx_SUBREG (QImode, target, 0);
8534
8535 if ((optimize && !register_operand (op0, mode0))
8536 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8537 op0 = copy_to_mode_reg (mode0, op0);
8538 if ((optimize && !register_operand (op1, mode1))
8539 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8540 op1 = copy_to_mode_reg (mode1, op1);
8541
8542 pat = GEN_FCN (d->icode) (op0, op1);
8543 if (! pat)
8544 return 0;
8545 emit_insn (pat);
8546 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8547 gen_rtx_fmt_ee (comparison, QImode,
8548 SET_DEST (pat),
8549 const0_rtx)));
8550
8551 return SUBREG_REG (target);
8552}
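/* In other words, a comi-class builtin becomes a flag-setting compare
   followed by a setCC into the low byte of a zeroed 32-bit register,
   roughly

     comiss  %xmm1, %xmm0	# or comisd / ucomis*, per d->icode
     setCC   %al		# condition taken from d->comparison

   so the builtin's int result is always 0 or 1.  */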
8553
8554/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8555
8556static rtx
8557ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8558 rtx target)
8559{
8560 rtx pat;
8561 tree arg0 = CALL_EXPR_ARG (exp, 0);
8562 rtx op1, op0 = expand_normal (arg0);
8563 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8564 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8565
8566 if (optimize || target == 0
8567 || GET_MODE (target) != tmode
8568 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8569 target = gen_reg_rtx (tmode);
8570
8571 if (VECTOR_MODE_P (mode0))
8572 op0 = safe_vector_operand (op0, mode0);
8573
8574 if ((optimize && !register_operand (op0, mode0))
8575 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8576 op0 = copy_to_mode_reg (mode0, op0);
8577
8578 op1 = GEN_INT (d->comparison);
8579
8580 pat = GEN_FCN (d->icode) (target, op0, op1);
8581 if (! pat)
8582 return 0;
8583 emit_insn (pat);
8584 return target;
8585}
8586
8587static rtx
8588ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8589 tree exp, rtx target)
8590{
8591 rtx pat;
8592 tree arg0 = CALL_EXPR_ARG (exp, 0);
8593 tree arg1 = CALL_EXPR_ARG (exp, 1);
8594 rtx op0 = expand_normal (arg0);
8595 rtx op1 = expand_normal (arg1);
8596 rtx op2;
8597 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8598 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8599 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8600
8601 if (optimize || target == 0
8602 || GET_MODE (target) != tmode
8603 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8604 target = gen_reg_rtx (tmode);
8605
8606 op0 = safe_vector_operand (op0, mode0);
8607 op1 = safe_vector_operand (op1, mode1);
8608
8609 if ((optimize && !register_operand (op0, mode0))
8610 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8611 op0 = copy_to_mode_reg (mode0, op0);
8612 if ((optimize && !register_operand (op1, mode1))
8613 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8614 op1 = copy_to_mode_reg (mode1, op1);
8615
8616 op2 = GEN_INT (d->comparison);
8617
8618 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8619 if (! pat)
8620 return 0;
8621 emit_insn (pat);
8622 return target;
8623}
8624
8625/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8626
8627static rtx
8628ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8629 rtx target)
8630{
8631 rtx pat;
8632 tree arg0 = CALL_EXPR_ARG (exp, 0);
8633 tree arg1 = CALL_EXPR_ARG (exp, 1);
8634 rtx op0 = expand_normal (arg0);
8635 rtx op1 = expand_normal (arg1);
8636 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8637 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8638 enum rtx_code comparison = d->comparison;
8639
8640 if (VECTOR_MODE_P (mode0))
8641 op0 = safe_vector_operand (op0, mode0);
8642 if (VECTOR_MODE_P (mode1))
8643 op1 = safe_vector_operand (op1, mode1);
8644
8645 target = gen_reg_rtx (SImode);
8646 emit_move_insn (target, const0_rtx);
8647 target = gen_rtx_SUBREG (QImode, target, 0);
8648
8649 if ((optimize && !register_operand (op0, mode0))
8650 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8651 op0 = copy_to_mode_reg (mode0, op0);
8652 if ((optimize && !register_operand (op1, mode1))
8653 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8654 op1 = copy_to_mode_reg (mode1, op1);
8655
8656 pat = GEN_FCN (d->icode) (op0, op1);
8657 if (! pat)
8658 return 0;
8659 emit_insn (pat);
8660 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8661 gen_rtx_fmt_ee (comparison, QImode,
8662 SET_DEST (pat),
8663 const0_rtx)));
8664
8665 return SUBREG_REG (target);
8666}
8667
8668/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8669
8670static rtx
8671ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8672 tree exp, rtx target)
8673{
8674 rtx pat;
8675 tree arg0 = CALL_EXPR_ARG (exp, 0);
8676 tree arg1 = CALL_EXPR_ARG (exp, 1);
8677 tree arg2 = CALL_EXPR_ARG (exp, 2);
8678 tree arg3 = CALL_EXPR_ARG (exp, 3);
8679 tree arg4 = CALL_EXPR_ARG (exp, 4);
8680 rtx scratch0, scratch1;
8681 rtx op0 = expand_normal (arg0);
8682 rtx op1 = expand_normal (arg1);
8683 rtx op2 = expand_normal (arg2);
8684 rtx op3 = expand_normal (arg3);
8685 rtx op4 = expand_normal (arg4);
8686 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8687
8688 tmode0 = insn_data[d->icode].operand[0].mode;
8689 tmode1 = insn_data[d->icode].operand[1].mode;
8690 modev2 = insn_data[d->icode].operand[2].mode;
8691 modei3 = insn_data[d->icode].operand[3].mode;
8692 modev4 = insn_data[d->icode].operand[4].mode;
8693 modei5 = insn_data[d->icode].operand[5].mode;
8694 modeimm = insn_data[d->icode].operand[6].mode;
8695
8696 if (VECTOR_MODE_P (modev2))
8697 op0 = safe_vector_operand (op0, modev2);
8698 if (VECTOR_MODE_P (modev4))
8699 op2 = safe_vector_operand (op2, modev4);
8700
8701 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8702 op0 = copy_to_mode_reg (modev2, op0);
8703 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8704 op1 = copy_to_mode_reg (modei3, op1);
8705 if ((optimize && !register_operand (op2, modev4))
8706 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8707 op2 = copy_to_mode_reg (modev4, op2);
8708 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8709 op3 = copy_to_mode_reg (modei5, op3);
8710
8711 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8712 {
8713 error ("the fifth argument must be an 8-bit immediate");
8714 return const0_rtx;
8715 }
8716
8717 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8718 {
8719 if (optimize || !target
8720 || GET_MODE (target) != tmode0
8721 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8722 target = gen_reg_rtx (tmode0);
8723
8724 scratch1 = gen_reg_rtx (tmode1);
8725
8726 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8727 }
8728 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8729 {
8730 if (optimize || !target
8731 || GET_MODE (target) != tmode1
8732 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8733 target = gen_reg_rtx (tmode1);
8734
8735 scratch0 = gen_reg_rtx (tmode0);
8736
8737 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8738 }
8739 else
8740 {
8741 gcc_assert (d->flag);
8742
8743 scratch0 = gen_reg_rtx (tmode0);
8744 scratch1 = gen_reg_rtx (tmode1);
8745
8746 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8747 }
8748
8749 if (! pat)
8750 return 0;
8751
8752 emit_insn (pat);
8753
8754 if (d->flag)
8755 {
8756 target = gen_reg_rtx (SImode);
8757 emit_move_insn (target, const0_rtx);
8758 target = gen_rtx_SUBREG (QImode, target, 0);
8759
8760 emit_insn
8761 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8762 gen_rtx_fmt_ee (EQ, QImode,
8763 gen_rtx_REG ((machine_mode) d->flag,
8764 FLAGS_REG),
8765 const0_rtx)));
8766 return SUBREG_REG (target);
8767 }
8768 else
8769 return target;
8770}
8771
8772
8773/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8774
8775static rtx
8776ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8777 tree exp, rtx target)
8778{
8779 rtx pat;
8780 tree arg0 = CALL_EXPR_ARG (exp, 0);
8781 tree arg1 = CALL_EXPR_ARG (exp, 1);
8782 tree arg2 = CALL_EXPR_ARG (exp, 2);
8783 rtx scratch0, scratch1;
8784 rtx op0 = expand_normal (arg0);
8785 rtx op1 = expand_normal (arg1);
8786 rtx op2 = expand_normal (arg2);
8787 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8788
8789 tmode0 = insn_data[d->icode].operand[0].mode;
8790 tmode1 = insn_data[d->icode].operand[1].mode;
8791 modev2 = insn_data[d->icode].operand[2].mode;
8792 modev3 = insn_data[d->icode].operand[3].mode;
8793 modeimm = insn_data[d->icode].operand[4].mode;
8794
8795 if (VECTOR_MODE_P (modev2))
8796 op0 = safe_vector_operand (op0, modev2);
8797 if (VECTOR_MODE_P (modev3))
8798 op1 = safe_vector_operand (op1, modev3);
8799
8800 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8801 op0 = copy_to_mode_reg (modev2, op0);
8802 if ((optimize && !register_operand (op1, modev3))
8803 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8804 op1 = copy_to_mode_reg (modev3, op1);
8805
8806 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8807 {
8808 error ("the third argument must be an 8-bit immediate");
8809 return const0_rtx;
8810 }
8811
8812 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8813 {
8814 if (optimize || !target
8815 || GET_MODE (target) != tmode0
8816 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8817 target = gen_reg_rtx (tmode0);
8818
8819 scratch1 = gen_reg_rtx (tmode1);
8820
8821 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8822 }
8823 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8824 {
8825 if (optimize || !target
8826 || GET_MODE (target) != tmode1
8827 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8828 target = gen_reg_rtx (tmode1);
8829
8830 scratch0 = gen_reg_rtx (tmode0);
8831
8832 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8833 }
8834 else
8835 {
8836 gcc_assert (d->flag);
8837
8838 scratch0 = gen_reg_rtx (tmode0);
8839 scratch1 = gen_reg_rtx (tmode1);
8840
8841 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8842 }
8843
8844 if (! pat)
8845 return 0;
8846
8847 emit_insn (pat);
8848
8849 if (d->flag)
8850 {
8851 target = gen_reg_rtx (SImode);
8852 emit_move_insn (target, const0_rtx);
8853 target = gen_rtx_SUBREG (QImode, target, 0);
8854
8855 emit_insn
8856 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8857 gen_rtx_fmt_ee (EQ, QImode,
8858 gen_rtx_REG ((machine_mode) d->flag,
8859 FLAGS_REG),
8860 const0_rtx)));
8861 return SUBREG_REG (target);
8862 }
8863 else
8864 return target;
8865}
8866
8867/* Fixup modeless constants to fit required mode. */
8868
8869static rtx
8870fixup_modeless_constant (rtx x, machine_mode mode)
8871{
8872 if (GET_MODE (x) == VOIDmode)
8873 x = convert_to_mode (mode, x, 1);
8874 return x;
8875}
8876
8877/* Subroutine of ix86_expand_builtin to take care of insns with
8878 variable number of operands. */
8879
8880static rtx
8881ix86_expand_args_builtin (const struct builtin_description *d,
8882 tree exp, rtx target)
8883{
8884 rtx pat, real_target;
8885 unsigned int i, nargs;
8886 unsigned int nargs_constant = 0;
8887 unsigned int mask_pos = 0;
8888 int num_memory = 0;
8889 struct
8890 {
8891 rtx op;
8892 machine_mode mode;
8893 } args[6];
8894 bool second_arg_count = false;
8895 enum insn_code icode = d->icode;
8896 const struct insn_data_d *insn_p = &insn_data[icode];
8897 machine_mode tmode = insn_p->operand[0].mode;
8898 machine_mode rmode = VOIDmode;
8899 bool swap = false;
8900 enum rtx_code comparison = d->comparison;
8901
8902 switch ((enum ix86_builtin_func_type) d->flag)
8903 {
8904 case V2DF_FTYPE_V2DF_ROUND:
8905 case V4DF_FTYPE_V4DF_ROUND:
8906 case V8DF_FTYPE_V8DF_ROUND:
8907 case V4SF_FTYPE_V4SF_ROUND:
8908 case V8SF_FTYPE_V8SF_ROUND:
8909 case V16SF_FTYPE_V16SF_ROUND:
8910 case V4SI_FTYPE_V4SF_ROUND:
8911 case V8SI_FTYPE_V8SF_ROUND:
8912 case V16SI_FTYPE_V16SF_ROUND:
8913 return ix86_expand_sse_round (d, exp, target);
8914 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8915 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8916 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8917 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8918 case INT_FTYPE_V8SF_V8SF_PTEST:
8919 case INT_FTYPE_V4DI_V4DI_PTEST:
8920 case INT_FTYPE_V4DF_V4DF_PTEST:
8921 case INT_FTYPE_V4SF_V4SF_PTEST:
8922 case INT_FTYPE_V2DI_V2DI_PTEST:
8923 case INT_FTYPE_V2DF_V2DF_PTEST:
8924 return ix86_expand_sse_ptest (d, exp, target);
8925 case FLOAT128_FTYPE_FLOAT128:
8926 case FLOAT_FTYPE_FLOAT:
8927 case INT_FTYPE_INT:
8928 case UINT_FTYPE_UINT:
8929 case UINT16_FTYPE_UINT16:
8930 case UINT64_FTYPE_INT:
8931 case UINT64_FTYPE_UINT64:
8932 case INT64_FTYPE_INT64:
8933 case INT64_FTYPE_V4SF:
8934 case INT64_FTYPE_V2DF:
8935 case INT_FTYPE_V16QI:
8936 case INT_FTYPE_V8QI:
8937 case INT_FTYPE_V8SF:
8938 case INT_FTYPE_V4DF:
8939 case INT_FTYPE_V4SF:
8940 case INT_FTYPE_V2DF:
8941 case INT_FTYPE_V32QI:
8942 case V16QI_FTYPE_V16QI:
8943 case V8SI_FTYPE_V8SF:
8944 case V8SI_FTYPE_V4SI:
8945 case V8HI_FTYPE_V8HI:
8946 case V8HI_FTYPE_V16QI:
8947 case V8QI_FTYPE_V8QI:
8948 case V8SF_FTYPE_V8SF:
8949 case V8SF_FTYPE_V8SI:
8950 case V8SF_FTYPE_V4SF:
8951 case V8SF_FTYPE_V8HI:
8952 case V4SI_FTYPE_V4SI:
8953 case V4SI_FTYPE_V16QI:
8954 case V4SI_FTYPE_V4SF:
8955 case V4SI_FTYPE_V8SI:
8956 case V4SI_FTYPE_V8HI:
8957 case V4SI_FTYPE_V4DF:
8958 case V4SI_FTYPE_V2DF:
8959 case V4HI_FTYPE_V4HI:
8960 case V4DF_FTYPE_V4DF:
8961 case V4DF_FTYPE_V4SI:
8962 case V4DF_FTYPE_V4SF:
8963 case V4DF_FTYPE_V2DF:
8964 case V4SF_FTYPE_V4SF:
8965 case V4SF_FTYPE_V4SI:
8966 case V4SF_FTYPE_V8SF:
8967 case V4SF_FTYPE_V4DF:
8968 case V4SF_FTYPE_V8HI:
8969 case V4SF_FTYPE_V2DF:
8970 case V2DI_FTYPE_V2DI:
8971 case V2DI_FTYPE_V16QI:
8972 case V2DI_FTYPE_V8HI:
8973 case V2DI_FTYPE_V4SI:
8974 case V2DF_FTYPE_V2DF:
8975 case V2DF_FTYPE_V4SI:
8976 case V2DF_FTYPE_V4DF:
8977 case V2DF_FTYPE_V4SF:
8978 case V2DF_FTYPE_V2SI:
8979 case V2SI_FTYPE_V2SI:
8980 case V2SI_FTYPE_V4SF:
8981 case V2SI_FTYPE_V2SF:
8982 case V2SI_FTYPE_V2DF:
8983 case V2SF_FTYPE_V2SF:
8984 case V2SF_FTYPE_V2SI:
8985 case V32QI_FTYPE_V32QI:
8986 case V32QI_FTYPE_V16QI:
8987 case V16HI_FTYPE_V16HI:
8988 case V16HI_FTYPE_V8HI:
8989 case V8SI_FTYPE_V8SI:
8990 case V16HI_FTYPE_V16QI:
8991 case V8SI_FTYPE_V16QI:
8992 case V4DI_FTYPE_V16QI:
8993 case V8SI_FTYPE_V8HI:
8994 case V4DI_FTYPE_V8HI:
8995 case V4DI_FTYPE_V4SI:
8996 case V4DI_FTYPE_V2DI:
8997 case UQI_FTYPE_UQI:
8998 case UHI_FTYPE_UHI:
8999 case USI_FTYPE_USI:
9000 case USI_FTYPE_UQI:
9001 case USI_FTYPE_UHI:
9002 case UDI_FTYPE_UDI:
9003 case UHI_FTYPE_V16QI:
9004 case USI_FTYPE_V32QI:
9005 case UDI_FTYPE_V64QI:
9006 case V16QI_FTYPE_UHI:
9007 case V32QI_FTYPE_USI:
9008 case V64QI_FTYPE_UDI:
9009 case V8HI_FTYPE_UQI:
9010 case V16HI_FTYPE_UHI:
9011 case V32HI_FTYPE_USI:
9012 case V4SI_FTYPE_UQI:
9013 case V8SI_FTYPE_UQI:
9014 case V4SI_FTYPE_UHI:
9015 case V8SI_FTYPE_UHI:
9016 case UQI_FTYPE_V8HI:
9017 case UHI_FTYPE_V16HI:
9018 case USI_FTYPE_V32HI:
9019 case UQI_FTYPE_V4SI:
9020 case UQI_FTYPE_V8SI:
9021 case UHI_FTYPE_V16SI:
9022 case UQI_FTYPE_V2DI:
9023 case UQI_FTYPE_V4DI:
9024 case UQI_FTYPE_V8DI:
9025 case V16SI_FTYPE_UHI:
9026 case V2DI_FTYPE_UQI:
9027 case V4DI_FTYPE_UQI:
9028 case V16SI_FTYPE_INT:
9029 case V16SF_FTYPE_V8SF:
9030 case V16SI_FTYPE_V8SI:
9031 case V16SF_FTYPE_V4SF:
9032 case V16SI_FTYPE_V4SI:
9033 case V16SI_FTYPE_V16SF:
9034 case V16SI_FTYPE_V16SI:
9035 case V64QI_FTYPE_V64QI:
9036 case V32HI_FTYPE_V32HI:
9037 case V16SF_FTYPE_V16SF:
9038 case V8DI_FTYPE_UQI:
9039 case V8DI_FTYPE_V8DI:
9040 case V8DF_FTYPE_V4DF:
9041 case V8DF_FTYPE_V2DF:
9042 case V8DF_FTYPE_V8DF:
9043 case V4DI_FTYPE_V4DI:
9044 case V16HI_FTYPE_V16SF:
9045 case V8HI_FTYPE_V8SF:
9046 case V8HI_FTYPE_V4SF:
9047 nargs = 1;
9048 break;
9049 case V4SF_FTYPE_V4SF_VEC_MERGE:
9050 case V2DF_FTYPE_V2DF_VEC_MERGE:
9051 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9052 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9053 case V16QI_FTYPE_V16QI_V16QI:
9054 case V16QI_FTYPE_V8HI_V8HI:
9055 case V16SF_FTYPE_V16SF_V16SF:
9056 case V8QI_FTYPE_V8QI_V8QI:
9057 case V8QI_FTYPE_V4HI_V4HI:
9058 case V8HI_FTYPE_V8HI_V8HI:
9059 case V8HI_FTYPE_V16QI_V16QI:
9060 case V8HI_FTYPE_V4SI_V4SI:
9061 case V8SF_FTYPE_V8SF_V8SF:
9062 case V8SF_FTYPE_V8SF_V8SI:
9063 case V8DF_FTYPE_V8DF_V8DF:
9064 case V4SI_FTYPE_V4SI_V4SI:
9065 case V4SI_FTYPE_V8HI_V8HI:
9066 case V4SI_FTYPE_V2DF_V2DF:
9067 case V4HI_FTYPE_V4HI_V4HI:
9068 case V4HI_FTYPE_V8QI_V8QI:
9069 case V4HI_FTYPE_V2SI_V2SI:
9070 case V4DF_FTYPE_V4DF_V4DF:
9071 case V4DF_FTYPE_V4DF_V4DI:
9072 case V4SF_FTYPE_V4SF_V4SF:
9073 case V4SF_FTYPE_V4SF_V4SI:
9074 case V4SF_FTYPE_V4SF_V2SI:
9075 case V4SF_FTYPE_V4SF_V2DF:
9076 case V4SF_FTYPE_V4SF_UINT:
9077 case V4SF_FTYPE_V4SF_DI:
9078 case V4SF_FTYPE_V4SF_SI:
9079 case V2DI_FTYPE_V2DI_V2DI:
9080 case V2DI_FTYPE_V16QI_V16QI:
9081 case V2DI_FTYPE_V4SI_V4SI:
9082 case V2DI_FTYPE_V2DI_V16QI:
9083 case V2SI_FTYPE_V2SI_V2SI:
9084 case V2SI_FTYPE_V4HI_V4HI:
9085 case V2SI_FTYPE_V2SF_V2SF:
9086 case V2DF_FTYPE_V2DF_V2DF:
9087 case V2DF_FTYPE_V2DF_V4SF:
9088 case V2DF_FTYPE_V2DF_V2DI:
9089 case V2DF_FTYPE_V2DF_DI:
9090 case V2DF_FTYPE_V2DF_SI:
9091 case V2DF_FTYPE_V2DF_UINT:
9092 case V2SF_FTYPE_V2SF_V2SF:
9093 case V1DI_FTYPE_V1DI_V1DI:
9094 case V1DI_FTYPE_V8QI_V8QI:
9095 case V1DI_FTYPE_V2SI_V2SI:
9096 case V32QI_FTYPE_V16HI_V16HI:
9097 case V16HI_FTYPE_V8SI_V8SI:
9098 case V64QI_FTYPE_V64QI_V64QI:
9099 case V32QI_FTYPE_V32QI_V32QI:
9100 case V16HI_FTYPE_V32QI_V32QI:
9101 case V16HI_FTYPE_V16HI_V16HI:
9102 case V8SI_FTYPE_V4DF_V4DF:
9103 case V8SI_FTYPE_V8SI_V8SI:
9104 case V8SI_FTYPE_V16HI_V16HI:
9105 case V4DI_FTYPE_V4DI_V4DI:
9106 case V4DI_FTYPE_V8SI_V8SI:
9107 case V8DI_FTYPE_V64QI_V64QI:
9108 if (comparison == UNKNOWN)
9109 return ix86_expand_binop_builtin (icode, exp, target);
9110 nargs = 2;
9111 break;
9112 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9113 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9114 gcc_assert (comparison != UNKNOWN);
9115 nargs = 2;
9116 swap = true;
9117 break;
9118 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9119 case V16HI_FTYPE_V16HI_SI_COUNT:
9120 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9121 case V8SI_FTYPE_V8SI_SI_COUNT:
9122 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9123 case V4DI_FTYPE_V4DI_INT_COUNT:
9124 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9125 case V8HI_FTYPE_V8HI_SI_COUNT:
9126 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9127 case V4SI_FTYPE_V4SI_SI_COUNT:
9128 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9129 case V4HI_FTYPE_V4HI_SI_COUNT:
9130 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9131 case V2DI_FTYPE_V2DI_SI_COUNT:
9132 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9133 case V2SI_FTYPE_V2SI_SI_COUNT:
9134 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9135 case V1DI_FTYPE_V1DI_SI_COUNT:
9136 nargs = 2;
9137 second_arg_count = true;
9138 break;
9139 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9140 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9141 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9142 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9143 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9144 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9145 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9146 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9147 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9148 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9149 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9150 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9151 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9152 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9153 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9154 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9155 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9156 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9157 nargs = 4;
9158 second_arg_count = true;
9159 break;
9160 case UINT64_FTYPE_UINT64_UINT64:
9161 case UINT_FTYPE_UINT_UINT:
9162 case UINT_FTYPE_UINT_USHORT:
9163 case UINT_FTYPE_UINT_UCHAR:
9164 case UINT16_FTYPE_UINT16_INT:
9165 case UINT8_FTYPE_UINT8_INT:
9166 case UQI_FTYPE_UQI_UQI:
9167 case UHI_FTYPE_UHI_UHI:
9168 case USI_FTYPE_USI_USI:
9169 case UDI_FTYPE_UDI_UDI:
9170 case V16SI_FTYPE_V8DF_V8DF:
9171 case V32HI_FTYPE_V16SF_V16SF:
9172 case V16HI_FTYPE_V8SF_V8SF:
9173 case V8HI_FTYPE_V4SF_V4SF:
9174 case V16HI_FTYPE_V16SF_UHI:
9175 case V8HI_FTYPE_V8SF_UQI:
9176 case V8HI_FTYPE_V4SF_UQI:
9177 nargs = 2;
9178 break;
9179 case V2DI_FTYPE_V2DI_INT_CONVERT:
9180 nargs = 2;
9181 rmode = V1TImode;
9182 nargs_constant = 1;
9183 break;
9184 case V4DI_FTYPE_V4DI_INT_CONVERT:
9185 nargs = 2;
9186 rmode = V2TImode;
9187 nargs_constant = 1;
9188 break;
9189 case V8DI_FTYPE_V8DI_INT_CONVERT:
9190 nargs = 2;
9191 rmode = V4TImode;
9192 nargs_constant = 1;
9193 break;
9194 case V8HI_FTYPE_V8HI_INT:
9195 case V8HI_FTYPE_V8SF_INT:
9196 case V16HI_FTYPE_V16SF_INT:
9197 case V8HI_FTYPE_V4SF_INT:
9198 case V8SF_FTYPE_V8SF_INT:
9199 case V4SF_FTYPE_V16SF_INT:
9200 case V16SF_FTYPE_V16SF_INT:
9201 case V4SI_FTYPE_V4SI_INT:
9202 case V4SI_FTYPE_V8SI_INT:
9203 case V4HI_FTYPE_V4HI_INT:
9204 case V4DF_FTYPE_V4DF_INT:
9205 case V4DF_FTYPE_V8DF_INT:
9206 case V4SF_FTYPE_V4SF_INT:
9207 case V4SF_FTYPE_V8SF_INT:
9208 case V2DI_FTYPE_V2DI_INT:
9209 case V2DF_FTYPE_V2DF_INT:
9210 case V2DF_FTYPE_V4DF_INT:
9211 case V16HI_FTYPE_V16HI_INT:
9212 case V8SI_FTYPE_V8SI_INT:
9213 case V16SI_FTYPE_V16SI_INT:
9214 case V4SI_FTYPE_V16SI_INT:
9215 case V4DI_FTYPE_V4DI_INT:
9216 case V2DI_FTYPE_V4DI_INT:
9217 case V4DI_FTYPE_V8DI_INT:
9218 case UQI_FTYPE_UQI_UQI_CONST:
9219 case UHI_FTYPE_UHI_UQI:
9220 case USI_FTYPE_USI_UQI:
9221 case UDI_FTYPE_UDI_UQI:
9222 nargs = 2;
9223 nargs_constant = 1;
9224 break;
9225 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9226 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9227 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9228 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9229 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9230 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9231 case UHI_FTYPE_V16SI_V16SI_UHI:
9232 case UQI_FTYPE_V8DI_V8DI_UQI:
9233 case V16HI_FTYPE_V16SI_V16HI_UHI:
9234 case V16QI_FTYPE_V16SI_V16QI_UHI:
9235 case V16QI_FTYPE_V8DI_V16QI_UQI:
9236 case V16SF_FTYPE_V16SF_V16SF_UHI:
9237 case V16SF_FTYPE_V4SF_V16SF_UHI:
9238 case V16SI_FTYPE_SI_V16SI_UHI:
9239 case V16SI_FTYPE_V16HI_V16SI_UHI:
9240 case V16SI_FTYPE_V16QI_V16SI_UHI:
9241 case V8SF_FTYPE_V4SF_V8SF_UQI:
9242 case V4DF_FTYPE_V2DF_V4DF_UQI:
9243 case V8SI_FTYPE_V4SI_V8SI_UQI:
9244 case V8SI_FTYPE_SI_V8SI_UQI:
9245 case V4SI_FTYPE_V4SI_V4SI_UQI:
9246 case V4SI_FTYPE_SI_V4SI_UQI:
9247 case V4DI_FTYPE_V2DI_V4DI_UQI:
9248 case V4DI_FTYPE_DI_V4DI_UQI:
9249 case V2DI_FTYPE_V2DI_V2DI_UQI:
9250 case V2DI_FTYPE_DI_V2DI_UQI:
9251 case V64QI_FTYPE_V64QI_V64QI_UDI:
9252 case V64QI_FTYPE_V16QI_V64QI_UDI:
9253 case V64QI_FTYPE_QI_V64QI_UDI:
9254 case V32QI_FTYPE_V32QI_V32QI_USI:
9255 case V32QI_FTYPE_V16QI_V32QI_USI:
9256 case V32QI_FTYPE_QI_V32QI_USI:
9257 case V16QI_FTYPE_V16QI_V16QI_UHI:
9258 case V16QI_FTYPE_QI_V16QI_UHI:
9259 case V32HI_FTYPE_V8HI_V32HI_USI:
9260 case V32HI_FTYPE_HI_V32HI_USI:
9261 case V16HI_FTYPE_V8HI_V16HI_UHI:
9262 case V16HI_FTYPE_HI_V16HI_UHI:
9263 case V8HI_FTYPE_V8HI_V8HI_UQI:
9264 case V8HI_FTYPE_HI_V8HI_UQI:
9265 case V8SF_FTYPE_V8HI_V8SF_UQI:
9266 case V4SF_FTYPE_V8HI_V4SF_UQI:
9267 case V8SI_FTYPE_V8SF_V8SI_UQI:
9268 case V4SI_FTYPE_V4SF_V4SI_UQI:
9269 case V4DI_FTYPE_V4SF_V4DI_UQI:
9270 case V2DI_FTYPE_V4SF_V2DI_UQI:
9271 case V4SF_FTYPE_V4DI_V4SF_UQI:
9272 case V4SF_FTYPE_V2DI_V4SF_UQI:
9273 case V4DF_FTYPE_V4DI_V4DF_UQI:
9274 case V2DF_FTYPE_V2DI_V2DF_UQI:
9275 case V16QI_FTYPE_V8HI_V16QI_UQI:
9276 case V16QI_FTYPE_V16HI_V16QI_UHI:
9277 case V16QI_FTYPE_V4SI_V16QI_UQI:
9278 case V16QI_FTYPE_V8SI_V16QI_UQI:
9279 case V8HI_FTYPE_V4SI_V8HI_UQI:
9280 case V8HI_FTYPE_V8SI_V8HI_UQI:
9281 case V16QI_FTYPE_V2DI_V16QI_UQI:
9282 case V16QI_FTYPE_V4DI_V16QI_UQI:
9283 case V8HI_FTYPE_V2DI_V8HI_UQI:
9284 case V8HI_FTYPE_V4DI_V8HI_UQI:
9285 case V4SI_FTYPE_V2DI_V4SI_UQI:
9286 case V4SI_FTYPE_V4DI_V4SI_UQI:
9287 case V32QI_FTYPE_V32HI_V32QI_USI:
9288 case UHI_FTYPE_V16QI_V16QI_UHI:
9289 case USI_FTYPE_V32QI_V32QI_USI:
9290 case UDI_FTYPE_V64QI_V64QI_UDI:
9291 case UQI_FTYPE_V8HI_V8HI_UQI:
9292 case UHI_FTYPE_V16HI_V16HI_UHI:
9293 case USI_FTYPE_V32HI_V32HI_USI:
9294 case UQI_FTYPE_V4SI_V4SI_UQI:
9295 case UQI_FTYPE_V8SI_V8SI_UQI:
9296 case UQI_FTYPE_V2DI_V2DI_UQI:
9297 case UQI_FTYPE_V4DI_V4DI_UQI:
9298 case V4SF_FTYPE_V2DF_V4SF_UQI:
9299 case V4SF_FTYPE_V4DF_V4SF_UQI:
9300 case V16SI_FTYPE_V16SI_V16SI_UHI:
9301 case V16SI_FTYPE_V4SI_V16SI_UHI:
9302 case V2DI_FTYPE_V4SI_V2DI_UQI:
9303 case V2DI_FTYPE_V8HI_V2DI_UQI:
9304 case V2DI_FTYPE_V16QI_V2DI_UQI:
9305 case V4DI_FTYPE_V4DI_V4DI_UQI:
9306 case V4DI_FTYPE_V4SI_V4DI_UQI:
9307 case V4DI_FTYPE_V8HI_V4DI_UQI:
9308 case V4DI_FTYPE_V16QI_V4DI_UQI:
9309 case V4DI_FTYPE_V4DF_V4DI_UQI:
9310 case V2DI_FTYPE_V2DF_V2DI_UQI:
9311 case V4SI_FTYPE_V4DF_V4SI_UQI:
9312 case V4SI_FTYPE_V2DF_V4SI_UQI:
9313 case V4SI_FTYPE_V8HI_V4SI_UQI:
9314 case V4SI_FTYPE_V16QI_V4SI_UQI:
9315 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9316 case V8DF_FTYPE_V2DF_V8DF_UQI:
9317 case V8DF_FTYPE_V4DF_V8DF_UQI:
9318 case V8DF_FTYPE_V8DF_V8DF_UQI:
9319 case V8SF_FTYPE_V8SF_V8SF_UQI:
9320 case V8SF_FTYPE_V8SI_V8SF_UQI:
9321 case V4DF_FTYPE_V4DF_V4DF_UQI:
9322 case V4SF_FTYPE_V4SF_V4SF_UQI:
9323 case V2DF_FTYPE_V2DF_V2DF_UQI:
9324 case V2DF_FTYPE_V4SF_V2DF_UQI:
9325 case V2DF_FTYPE_V4SI_V2DF_UQI:
9326 case V4SF_FTYPE_V4SI_V4SF_UQI:
9327 case V4DF_FTYPE_V4SF_V4DF_UQI:
9328 case V4DF_FTYPE_V4SI_V4DF_UQI:
9329 case V8SI_FTYPE_V8SI_V8SI_UQI:
9330 case V8SI_FTYPE_V8HI_V8SI_UQI:
9331 case V8SI_FTYPE_V16QI_V8SI_UQI:
9332 case V8DF_FTYPE_V8SI_V8DF_UQI:
9333 case V8DI_FTYPE_DI_V8DI_UQI:
9334 case V16SF_FTYPE_V8SF_V16SF_UHI:
9335 case V16SI_FTYPE_V8SI_V16SI_UHI:
9336 case V16HI_FTYPE_V16HI_V16HI_UHI:
9337 case V8HI_FTYPE_V16QI_V8HI_UQI:
9338 case V16HI_FTYPE_V16QI_V16HI_UHI:
9339 case V32HI_FTYPE_V32HI_V32HI_USI:
9340 case V32HI_FTYPE_V32QI_V32HI_USI:
9341 case V8DI_FTYPE_V16QI_V8DI_UQI:
9342 case V8DI_FTYPE_V2DI_V8DI_UQI:
9343 case V8DI_FTYPE_V4DI_V8DI_UQI:
9344 case V8DI_FTYPE_V8DI_V8DI_UQI:
9345 case V8DI_FTYPE_V8HI_V8DI_UQI:
9346 case V8DI_FTYPE_V8SI_V8DI_UQI:
9347 case V8HI_FTYPE_V8DI_V8HI_UQI:
9348 case V8SI_FTYPE_V8DI_V8SI_UQI:
9349 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9350 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9351 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9352 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9353 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9354 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9355 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9356 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9357 case V32HI_FTYPE_V16SF_V16SF_USI:
9358 case V16HI_FTYPE_V8SF_V8SF_UHI:
9359 case V8HI_FTYPE_V4SF_V4SF_UQI:
9360 case V16HI_FTYPE_V16SF_V16HI_UHI:
9361 case V8HI_FTYPE_V8SF_V8HI_UQI:
9362 case V8HI_FTYPE_V4SF_V8HI_UQI:
9363 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9364 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9365 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9366 nargs = 3;
9367 break;
9368 case V32QI_FTYPE_V32QI_V32QI_INT:
9369 case V16HI_FTYPE_V16HI_V16HI_INT:
9370 case V16QI_FTYPE_V16QI_V16QI_INT:
9371 case V4DI_FTYPE_V4DI_V4DI_INT:
9372 case V8HI_FTYPE_V8HI_V8HI_INT:
9373 case V8SI_FTYPE_V8SI_V8SI_INT:
9374 case V8SI_FTYPE_V8SI_V4SI_INT:
9375 case V8SF_FTYPE_V8SF_V8SF_INT:
9376 case V8SF_FTYPE_V8SF_V4SF_INT:
9377 case V4SI_FTYPE_V4SI_V4SI_INT:
9378 case V4DF_FTYPE_V4DF_V4DF_INT:
9379 case V16SF_FTYPE_V16SF_V16SF_INT:
9380 case V16SF_FTYPE_V16SF_V4SF_INT:
9381 case V16SI_FTYPE_V16SI_V4SI_INT:
9382 case V4DF_FTYPE_V4DF_V2DF_INT:
9383 case V4SF_FTYPE_V4SF_V4SF_INT:
9384 case V2DI_FTYPE_V2DI_V2DI_INT:
9385 case V4DI_FTYPE_V4DI_V2DI_INT:
9386 case V2DF_FTYPE_V2DF_V2DF_INT:
9387 case UQI_FTYPE_V8DI_V8UDI_INT:
9388 case UQI_FTYPE_V8DF_V8DF_INT:
9389 case UQI_FTYPE_V2DF_V2DF_INT:
9390 case UQI_FTYPE_V4SF_V4SF_INT:
9391 case UHI_FTYPE_V16SI_V16SI_INT:
9392 case UHI_FTYPE_V16SF_V16SF_INT:
9393 case V64QI_FTYPE_V64QI_V64QI_INT:
9394 case V32HI_FTYPE_V32HI_V32HI_INT:
9395 case V16SI_FTYPE_V16SI_V16SI_INT:
9396 case V8DI_FTYPE_V8DI_V8DI_INT:
9397 nargs = 3;
9398 nargs_constant = 1;
9399 break;
9400 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9401 nargs = 3;
9402 rmode = V4DImode;
9403 nargs_constant = 1;
9404 break;
9405 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9406 nargs = 3;
9407 rmode = V2DImode;
9408 nargs_constant = 1;
9409 break;
9410 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9411 nargs = 3;
9412 rmode = DImode;
9413 nargs_constant = 1;
9414 break;
9415 case V2DI_FTYPE_V2DI_UINT_UINT:
9416 nargs = 3;
9417 nargs_constant = 2;
9418 break;
9419 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9420 nargs = 3;
9421 rmode = V8DImode;
9422 nargs_constant = 1;
9423 break;
9424 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9425 nargs = 5;
9426 rmode = V8DImode;
9427 mask_pos = 2;
9428 nargs_constant = 1;
9429 break;
9430 case QI_FTYPE_V8DF_INT_UQI:
9431 case QI_FTYPE_V4DF_INT_UQI:
9432 case QI_FTYPE_V2DF_INT_UQI:
9433 case HI_FTYPE_V16SF_INT_UHI:
9434 case QI_FTYPE_V8SF_INT_UQI:
9435 case QI_FTYPE_V4SF_INT_UQI:
9436 case V4SI_FTYPE_V4SI_V4SI_UHI:
9437 case V8SI_FTYPE_V8SI_V8SI_UHI:
9438 nargs = 3;
9439 mask_pos = 1;
9440 nargs_constant = 1;
9441 break;
9442 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9443 nargs = 5;
9444 rmode = V4DImode;
9445 mask_pos = 2;
9446 nargs_constant = 1;
9447 break;
9448 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9449 nargs = 5;
9450 rmode = V2DImode;
9451 mask_pos = 2;
9452 nargs_constant = 1;
9453 break;
9454 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9455 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9456 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9457 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9458 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9459 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9460 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9461 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9462 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9463 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9464 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9465 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9466 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9467 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9468 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9469 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9470 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9471 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9472 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9473 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9474 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9475 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9476 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9477 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9478 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9479 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9480 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9481 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9482 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9483 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9484 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9485 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9486 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9487 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9488 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9489 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9490 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9491 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9492 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9493 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9494 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9495 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9496 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9497 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9498 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9499 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9500 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9501 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9502 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9503 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9504 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9505 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9506 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9507 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9508 nargs = 4;
9509 break;
9510 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9511 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9512 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9513 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9514 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9515 nargs = 4;
9516 nargs_constant = 1;
9517 break;
9518 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9519 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9520 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9521 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9522 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9523 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9524 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9525 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9526 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9527 case USI_FTYPE_V32QI_V32QI_INT_USI:
9528 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9529 case USI_FTYPE_V32HI_V32HI_INT_USI:
9530 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9531 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9532 nargs = 4;
9533 mask_pos = 1;
9534 nargs_constant = 1;
9535 break;
9536 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9537 nargs = 4;
9538 nargs_constant = 2;
9539 break;
9540 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9541 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9542 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9543 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9544 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9545 nargs = 4;
9546 break;
9547 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9548 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9549 mask_pos = 1;
9550 nargs = 4;
9551 nargs_constant = 1;
9552 break;
9553 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9554 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9555 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9556 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9557 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9558 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9559 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9560 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9561 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9562 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9563 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9564 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9565 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9566 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9567 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9568 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9569 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9570 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9571 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9572 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9573 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9574 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9575 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9576 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9577 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9578 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9579 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9580 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9581 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9582 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9583 nargs = 4;
9584 mask_pos = 2;
9585 nargs_constant = 1;
9586 break;
9587 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9588 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9589 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9590 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9591 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9592 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9593 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9594 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9595 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9596 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9597 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9598 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9599 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9600 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9601 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9602 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9603 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9604 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9605 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9606 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9607 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9608 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9609 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9610 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9611 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9612 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9613 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9614 nargs = 5;
9615 mask_pos = 2;
9616 nargs_constant = 1;
9617 break;
9618 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9619 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9620 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9621 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9622 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9623 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9624 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9625 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9626 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9627 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9628 nargs = 5;
9629 mask_pos = 1;
9630 nargs_constant = 1;
9631 break;
9632 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9633 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9634 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9635 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9636 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9637 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9638 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9639 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9640 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9641 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9642 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9643 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9644 nargs = 5;
9645 mask_pos = 1;
9646 nargs_constant = 2;
9647 break;
9648
9649 default:
9650 gcc_unreachable ();
9651 }
9652
9653 gcc_assert (nargs <= ARRAY_SIZE (args));
9654
9655 if (comparison != UNKNOWN)
9656 {
9657 gcc_assert (nargs == 2);
9658 return ix86_expand_sse_compare (d, exp, target, swap);
9659 }
9660
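  /* Choose the output register.  If the insn's result mode TMODE matches
     the builtin's return mode, TARGET can be reused when it already has
     the right mode and predicate; otherwise a fresh pseudo is used.  When
     RMODE differs, the insn computes into a TMODE register (REAL_TARGET)
     and the builtin's value is its RMODE lowpart.  */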
9661 if (rmode == VOIDmode || rmode == tmode)
9662 {
9663 if (optimize
9664 || target == 0
9665 || GET_MODE (target) != tmode
9666 || !insn_p->operand[0].predicate (target, tmode))
9667 target = gen_reg_rtx (tmode);
9668 else if (memory_operand (target, tmode))
9669 num_memory++;
9670 real_target = target;
9671 }
9672 else
9673 {
9674 real_target = gen_reg_rtx (tmode);
9675 target = lowpart_subreg (rmode, real_target, tmode);
9676 }
9677
9678 for (i = 0; i < nargs; i++)
9679 {
9680 tree arg = CALL_EXPR_ARG (exp, i);
9681 rtx op = expand_normal (arg);
9682 machine_mode mode = insn_p->operand[i + 1].mode;
9683 bool match = insn_p->operand[i + 1].predicate (op, mode);
9684
9685 if (second_arg_count && i == 1)
9686 {
9687	      /* SIMD shift insns take either an 8-bit immediate or a
9688		 register as the count, but the builtin functions take an
9689		 int as the count.  If the count doesn't match, put it in
9690		 a register.  The instructions use a 64-bit count; if OP
9691		 is only 32-bit, zero-extend it, since negative shift
9692		 counts are undefined behavior and zero-extension is more
9693		 efficient.  */
9694 if (!match)
9695 {
9696 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9697 op = convert_modes (mode, GET_MODE (op), op, 1);
9698 else
9699 op = lowpart_subreg (mode, op, GET_MODE (op));
9700 if (!insn_p->operand[i + 1].predicate (op, mode))
9701 op = copy_to_reg (op);
9702 }
9703 }
9704 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9705 (!mask_pos && (nargs - i) <= nargs_constant))
9706 {
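	  /* This operand position must be an immediate constant; if the
	     argument is not a matching CONST_INT, report how wide an
	     immediate the instruction expects.  */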
9707 if (!match)
9708 switch (icode)
9709 {
9710 case CODE_FOR_avx_vinsertf128v4di:
9711 case CODE_FOR_avx_vextractf128v4di:
9712		    error ("the last argument must be a 1-bit immediate");
9713 return const0_rtx;
9714
9715 case CODE_FOR_avx512f_cmpv8di3_mask:
9716 case CODE_FOR_avx512f_cmpv16si3_mask:
9717 case CODE_FOR_avx512f_ucmpv8di3_mask:
9718 case CODE_FOR_avx512f_ucmpv16si3_mask:
9719 case CODE_FOR_avx512vl_cmpv4di3_mask:
9720 case CODE_FOR_avx512vl_cmpv8si3_mask:
9721 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9722 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9723 case CODE_FOR_avx512vl_cmpv2di3_mask:
9724 case CODE_FOR_avx512vl_cmpv4si3_mask:
9725 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9726 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9727 error ("the last argument must be a 3-bit immediate");
9728 return const0_rtx;
9729
9730 case CODE_FOR_sse4_1_roundsd:
9731 case CODE_FOR_sse4_1_roundss:
9732
9733 case CODE_FOR_sse4_1_roundpd:
9734 case CODE_FOR_sse4_1_roundps:
9735 case CODE_FOR_avx_roundpd256:
9736 case CODE_FOR_avx_roundps256:
9737
9738 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9739 case CODE_FOR_sse4_1_roundps_sfix:
9740 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9741 case CODE_FOR_avx_roundps_sfix256:
9742
9743 case CODE_FOR_sse4_1_blendps:
9744 case CODE_FOR_avx_blendpd256:
9745 case CODE_FOR_avx_vpermilv4df:
9746 case CODE_FOR_avx_vpermilv4df_mask:
9747 case CODE_FOR_avx512f_getmantv8df_mask:
9748 case CODE_FOR_avx512f_getmantv16sf_mask:
9749 case CODE_FOR_avx512vl_getmantv8sf_mask:
9750 case CODE_FOR_avx512vl_getmantv4df_mask:
9751 case CODE_FOR_avx512vl_getmantv4sf_mask:
9752 case CODE_FOR_avx512vl_getmantv2df_mask:
9753 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9754 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9755 case CODE_FOR_avx512dq_rangepv4df_mask:
9756 case CODE_FOR_avx512dq_rangepv8sf_mask:
9757 case CODE_FOR_avx512dq_rangepv2df_mask:
9758 case CODE_FOR_avx512dq_rangepv4sf_mask:
9759 case CODE_FOR_avx_shufpd256_mask:
9760 error ("the last argument must be a 4-bit immediate");
9761 return const0_rtx;
9762
9763 case CODE_FOR_sha1rnds4:
9764 case CODE_FOR_sse4_1_blendpd:
9765 case CODE_FOR_avx_vpermilv2df:
9766 case CODE_FOR_avx_vpermilv2df_mask:
9767 case CODE_FOR_xop_vpermil2v2df3:
9768 case CODE_FOR_xop_vpermil2v4sf3:
9769 case CODE_FOR_xop_vpermil2v4df3:
9770 case CODE_FOR_xop_vpermil2v8sf3:
9771 case CODE_FOR_avx512f_vinsertf32x4_mask:
9772 case CODE_FOR_avx512f_vinserti32x4_mask:
9773 case CODE_FOR_avx512f_vextractf32x4_mask:
9774 case CODE_FOR_avx512f_vextracti32x4_mask:
9775 case CODE_FOR_sse2_shufpd:
9776 case CODE_FOR_sse2_shufpd_mask:
9777 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9778 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9779 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9780 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9781 error ("the last argument must be a 2-bit immediate");
9782 return const0_rtx;
9783
9784 case CODE_FOR_avx_vextractf128v4df:
9785 case CODE_FOR_avx_vextractf128v8sf:
9786 case CODE_FOR_avx_vextractf128v8si:
9787 case CODE_FOR_avx_vinsertf128v4df:
9788 case CODE_FOR_avx_vinsertf128v8sf:
9789 case CODE_FOR_avx_vinsertf128v8si:
9790 case CODE_FOR_avx512f_vinsertf64x4_mask:
9791 case CODE_FOR_avx512f_vinserti64x4_mask:
9792 case CODE_FOR_avx512f_vextractf64x4_mask:
9793 case CODE_FOR_avx512f_vextracti64x4_mask:
9794 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9795 case CODE_FOR_avx512dq_vinserti32x8_mask:
9796 case CODE_FOR_avx512vl_vinsertv4df:
9797 case CODE_FOR_avx512vl_vinsertv4di:
9798 case CODE_FOR_avx512vl_vinsertv8sf:
9799 case CODE_FOR_avx512vl_vinsertv8si:
9800 error ("the last argument must be a 1-bit immediate");
9801 return const0_rtx;
9802
9803 case CODE_FOR_avx_vmcmpv2df3:
9804 case CODE_FOR_avx_vmcmpv4sf3:
9805 case CODE_FOR_avx_cmpv2df3:
9806 case CODE_FOR_avx_cmpv4sf3:
9807 case CODE_FOR_avx_cmpv4df3:
9808 case CODE_FOR_avx_cmpv8sf3:
9809 case CODE_FOR_avx512f_cmpv8df3_mask:
9810 case CODE_FOR_avx512f_cmpv16sf3_mask:
9811 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9812 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9813 error ("the last argument must be a 5-bit immediate");
9814 return const0_rtx;
9815
9816 default:
9817 switch (nargs_constant)
9818 {
9819 case 2:
9820 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
9821 (!mask_pos && (nargs - i) == nargs_constant))
9822 {
9823 error ("the next to last argument must be an 8-bit immediate");
9824 break;
9825 }
9826 /* FALLTHRU */
9827 case 1:
9828 error ("the last argument must be an 8-bit immediate");
9829 break;
9830 default:
9831 gcc_unreachable ();
9832 }
9833 return const0_rtx;
9834 }
9835 }
9836 else
9837 {
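	  /* Ordinary (non-immediate) input operand: count memory operands
	     and copy the value into a register of the expected mode when
	     the insn predicate does not already accept it.  */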
9838 if (VECTOR_MODE_P (mode))
9839 op = safe_vector_operand (op, mode);
9840
9841 /* If we aren't optimizing, only allow one memory operand to
9842 be generated. */
9843 if (memory_operand (op, mode))
9844 num_memory++;
9845
9846 op = fixup_modeless_constant (op, mode);
9847
9848 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9849 {
9850 if (optimize || !match || num_memory > 1)
9851 op = copy_to_mode_reg (mode, op);
9852 }
9853 else
9854 {
9855 op = copy_to_reg (op);
9856 op = lowpart_subreg (mode, op, GET_MODE (op));
9857 }
9858 }
9859
9860 args[i].op = op;
9861 args[i].mode = mode;
9862 }
9863
9864 switch (nargs)
9865 {
9866 case 1:
9867 pat = GEN_FCN (icode) (real_target, args[0].op);
9868 break;
9869 case 2:
9870 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9871 break;
9872 case 3:
9873 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9874 args[2].op);
9875 break;
9876 case 4:
9877 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9878 args[2].op, args[3].op);
9879 break;
9880 case 5:
9881 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9882 args[2].op, args[3].op, args[4].op);
9883 break;
9884 case 6:
9885 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9886 args[2].op, args[3].op, args[4].op,
9887 args[5].op);
9888 break;
9889 default:
9890 gcc_unreachable ();
9891 }
9892
9893 if (! pat)
9894 return 0;
9895
9896 emit_insn (pat);
9897 return target;
9898}
9899
9900/* Transform a pattern of the following layout:
9901     (set A
9902       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9903   into:
9904     (set A B)
9905   i.e. strip the embedded-rounding unspec and keep only operand B.  */
9906
9907static rtx
9908ix86_erase_embedded_rounding (rtx pat)
9909{
9910 if (GET_CODE (pat) == INSN)
9911 pat = PATTERN (pat);
9912
9913 gcc_assert (GET_CODE (pat) == SET);
9914 rtx src = SET_SRC (pat);
9915 gcc_assert (XVECLEN (src, 0) == 2);
9916 rtx p0 = XVECEXP (src, 0, 0);
9917 gcc_assert (GET_CODE (src) == UNSPEC
9918 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9919 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9920 return res;
9921}
9922
9923/* Subroutine of ix86_expand_round_builtin to take care of comi insns
9924 with rounding. */
9925static rtx
9926ix86_expand_sse_comi_round (const struct builtin_description *d,
9927 tree exp, rtx target)
9928{
9929 rtx pat, set_dst;
9930 tree arg0 = CALL_EXPR_ARG (exp, 0);
9931 tree arg1 = CALL_EXPR_ARG (exp, 1);
9932 tree arg2 = CALL_EXPR_ARG (exp, 2);
9933 tree arg3 = CALL_EXPR_ARG (exp, 3);
9934 rtx op0 = expand_normal (arg0);
9935 rtx op1 = expand_normal (arg1);
9936 rtx op2 = expand_normal (arg2);
9937 rtx op3 = expand_normal (arg3);
9938 enum insn_code icode = d->icode;
9939 const struct insn_data_d *insn_p = &insn_data[icode];
9940 machine_mode mode0 = insn_p->operand[0].mode;
9941 machine_mode mode1 = insn_p->operand[1].mode;
9942
9943 /* See avxintrin.h for values. */
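  /* The tables below are indexed by the _CMP_* predicate immediate
     (0..31): the RTL comparison code to use, whether the predicate is an
     ordered one, and whether it is the quiet (non-signaling) variant.  */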
9944  static const enum rtx_code comparisons[32] =
9945    {
9946 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9947 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9948 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9949 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9950    };
9951 static const bool ordereds[32] =
9952 {
9953 true, true, true, false, false, false, false, true,
9954 false, false, false, true, true, true, true, false,
9955 true, true, true, false, false, false, false, true,
9956 false, false, false, true, true, true, true, false
9957 };
9958 static const bool non_signalings[32] =
9959 {
9960 true, false, false, true, true, false, false, true,
9961 true, false, false, true, true, false, false, true,
9962 false, true, true, false, false, true, true, false,
9963 false, true, true, false, false, true, true, false
9964 };
9965
9966 if (!CONST_INT_P (op2))
9967 {
9968       error ("the third argument must be a comparison constant");
9969 return const0_rtx;
9970 }
9971 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
9972 {
9973 error ("incorrect comparison mode");
9974 return const0_rtx;
9975 }
9976
9977 if (!insn_p->operand[2].predicate (op3, SImode))
9978 {
9979 error ("incorrect rounding operand");
9980 return const0_rtx;
9981 }
9982
9983 if (VECTOR_MODE_P (mode0))
9984 op0 = safe_vector_operand (op0, mode0);
9985 if (VECTOR_MODE_P (mode1))
9986 op1 = safe_vector_operand (op1, mode1);
9987
9988 enum rtx_code comparison = comparisons[INTVAL (op2)];
9989 bool ordered = ordereds[INTVAL (op2)];
9990 bool non_signaling = non_signalings[INTVAL (op2)];
9991 rtx const_val = const0_rtx;
9992
9993 bool check_unordered = false;
9994 machine_mode mode = CCFPmode;
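  /* Map the requested predicate onto a comparison and a CC mode that the
     comi/ucomi flag output can be tested in, swapping operands or flipping
     between the ordered and unordered forms where needed.  */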
9995 switch (comparison)
9996 {
9997 case ORDERED:
9998 if (!ordered)
9999 {
10000 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10001 if (!non_signaling)
10002 ordered = true;
10003 mode = CCSmode;
10004 }
10005 else
10006 {
10007 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10008 if (non_signaling)
10009 ordered = false;
10010 mode = CCPmode;
10011 }
10012 comparison = NE;
10013 break;
10014 case UNORDERED:
10015 if (ordered)
10016 {
10017 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10018 if (non_signaling)
10019 ordered = false;
10020 mode = CCSmode;
10021 }
10022 else
10023 {
10024 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10025 if (!non_signaling)
10026 ordered = true;
10027 mode = CCPmode;
10028 }
10029 comparison = EQ;
10030 break;
10031
10032 case LE: /* -> GE */
10033 case LT: /* -> GT */
10034 case UNGE: /* -> UNLE */
10035 case UNGT: /* -> UNLT */
10036 std::swap (op0, op1);
10037 comparison = swap_condition (comparison);
10038 /* FALLTHRU */
10039 case GT:
10040 case GE:
10041 case UNEQ:
10042 case UNLT:
10043 case UNLE:
10044 case LTGT:
10045 /* These are supported by CCFPmode. NB: Use ordered/signaling
10046 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10047 with NAN operands. */
10048 if (ordered == non_signaling)
10049 ordered = !ordered;
10050 break;
10051 case EQ:
10052 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10053 _CMP_EQ_OQ/_CMP_EQ_OS. */
10054 check_unordered = true;
10055 mode = CCZmode;
10056 break;
10057 case NE:
10058 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10059 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10060 gcc_assert (!ordered);
10061 check_unordered = true;
10062 mode = CCZmode;
10063 const_val = const1_rtx;
10064 break;
10065 default:
10066 gcc_unreachable ();
10067 }
10068
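  /* Materialize the result: seed an SImode pseudo with CONST_VAL (the
     value kept on the unordered path), then store the comparison outcome
     into its QImode lowpart below.  */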
10069  target = gen_reg_rtx (SImode);
10070  emit_move_insn (target, const_val);
10071 target = gen_rtx_SUBREG (QImode, target, 0);
10072
10073 if ((optimize && !register_operand (op0, mode0))
10074 || !insn_p->operand[0].predicate (op0, mode0))
10075 op0 = copy_to_mode_reg (mode0, op0);
10076 if ((optimize && !register_operand (op1, mode1))
10077 || !insn_p->operand[1].predicate (op1, mode1))
10078 op1 = copy_to_mode_reg (mode1, op1);
10079
10080 /*
10081 1. COMI: ordered and signaling.
10082 2. UCOMI: unordered and non-signaling.
10083 */
10084 if (non_signaling)
10085 icode = (icode == CODE_FOR_sse_comi_round
10086 ? CODE_FOR_sse_ucomi_round
10087 : CODE_FOR_sse2_ucomi_round);
10088
10089 pat = GEN_FCN (icode) (op0, op1, op3);
10090 if (! pat)
10091 return 0;
10092
10093 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10094 if (INTVAL (op3) == NO_ROUND)
10095 {
10096 pat = ix86_erase_embedded_rounding (pat);
10097 if (! pat)
10098 return 0;
10099
10100 set_dst = SET_DEST (pat);
10101 }
10102 else
10103 {
10104 gcc_assert (GET_CODE (pat) == SET);
10105 set_dst = SET_DEST (pat);
10106 }
10107
10108 emit_insn (pat);
10109
10110 rtx_code_label *label = NULL;
10111
10112  /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10113     sufficient with NAN operands.  */
10114 if (check_unordered)
10115 {
10116 gcc_assert (comparison == EQ || comparison == NE);
10117
10118 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10119 label = gen_label_rtx ();
10120 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10121 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10122 gen_rtx_LABEL_REF (VOIDmode, label),
10123 pc_rtx);
10124 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10125 }
10126
10127  /* NB: The comparison sets the flags in CCFPmode, but we test them in
10128     a different CC mode that is a subset of CCFPmode.  */
10129 if (GET_MODE (set_dst) != mode)
10130 {
10131 gcc_assert (mode == CCAmode || mode == CCCmode
10132 || mode == CCOmode || mode == CCPmode
10133 || mode == CCSmode || mode == CCZmode);
10134 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10135 }
10136
10137 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10138 gen_rtx_fmt_ee (comparison, QImode,
10139 set_dst,
10140 const0_rtx)));
10141
10142 if (label)
10143 emit_label (label);
10144
10145 return SUBREG_REG (target);
10146}
10147
10148static rtx
10149ix86_expand_round_builtin (const struct builtin_description *d,
10150 tree exp, rtx target)
10151{
10152 rtx pat;
10153 unsigned int i, nargs;
10154 struct
10155 {
10156 rtx op;
10157 machine_mode mode;
10158 } args[6];
10159 enum insn_code icode = d->icode;
10160 const struct insn_data_d *insn_p = &insn_data[icode];
10161 machine_mode tmode = insn_p->operand[0].mode;
10162 unsigned int nargs_constant = 0;
10163 unsigned int redundant_embed_rnd = 0;
10164
10165 switch ((enum ix86_builtin_func_type) d->flag)
10166 {
10167 case UINT64_FTYPE_V2DF_INT:
10168 case UINT64_FTYPE_V4SF_INT:
10169 case UINT_FTYPE_V2DF_INT:
10170 case UINT_FTYPE_V4SF_INT:
10171 case INT64_FTYPE_V2DF_INT:
10172 case INT64_FTYPE_V4SF_INT:
10173 case INT_FTYPE_V2DF_INT:
10174 case INT_FTYPE_V4SF_INT:
10175 nargs = 2;
10176 break;
10177 case V4SF_FTYPE_V4SF_UINT_INT:
10178 case V4SF_FTYPE_V4SF_UINT64_INT:
10179 case V2DF_FTYPE_V2DF_UINT64_INT:
10180 case V4SF_FTYPE_V4SF_INT_INT:
10181 case V4SF_FTYPE_V4SF_INT64_INT:
10182 case V2DF_FTYPE_V2DF_INT64_INT:
10183 case V4SF_FTYPE_V4SF_V4SF_INT:
10184 case V2DF_FTYPE_V2DF_V2DF_INT:
10185 case V4SF_FTYPE_V4SF_V2DF_INT:
10186 case V2DF_FTYPE_V2DF_V4SF_INT:
10187 nargs = 3;
10188 break;
10189 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10190 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10191 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10192 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10193 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10194 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10195 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10196 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10197 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10198 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10199 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10200 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10201 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10202 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10203 nargs = 4;
10204 break;
10205 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10206 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10207 nargs_constant = 2;
10208 nargs = 4;
10209 break;
10210 case INT_FTYPE_V4SF_V4SF_INT_INT:
10211 case INT_FTYPE_V2DF_V2DF_INT_INT:
10212 return ix86_expand_sse_comi_round (d, exp, target);
10213 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10214 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10215 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10216 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10217 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10218 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10219 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10220 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10221 nargs = 5;
10222 break;
10223 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10224 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10225 nargs_constant = 4;
10226 nargs = 5;
10227 break;
10228 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10229 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10230 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10231 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10232 nargs_constant = 3;
10233 nargs = 5;
10234 break;
10235 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10236 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10237 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10238 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10239 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10240 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10241 nargs = 6;
10242 nargs_constant = 4;
10243 break;
10244 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10245 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10246 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10247 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10248 nargs = 6;
10249 nargs_constant = 3;
10250 break;
10251 default:
10252 gcc_unreachable ();
10253 }
10254 gcc_assert (nargs <= ARRAY_SIZE (args));
10255
10256 if (optimize
10257 || target == 0
10258 || GET_MODE (target) != tmode
10259 || !insn_p->operand[0].predicate (target, tmode))
10260 target = gen_reg_rtx (tmode);
10261
10262 for (i = 0; i < nargs; i++)
10263 {
10264 tree arg = CALL_EXPR_ARG (exp, i);
10265 rtx op = expand_normal (arg);
10266 machine_mode mode = insn_p->operand[i + 1].mode;
10267 bool match = insn_p->operand[i + 1].predicate (op, mode);
10268
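      /* In *_round builtins the immediate argument, if any, sits at
	 position NARGS - NARGS_CONSTANT, and the rounding-mode argument
	 is always the last one.  */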
10269 if (i == nargs - nargs_constant)
10270 {
10271 if (!match)
10272 {
10273 switch (icode)
10274 {
10275 case CODE_FOR_avx512f_getmantv8df_mask_round:
10276 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10277 case CODE_FOR_avx512f_vgetmantv2df_round:
10278 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10279 case CODE_FOR_avx512f_vgetmantv4sf_round:
10280 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10281 error ("the immediate argument must be a 4-bit immediate");
10282 return const0_rtx;
10283 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10284 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10285 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10286 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10287 error ("the immediate argument must be a 5-bit immediate");
10288 return const0_rtx;
10289 default:
10290 error ("the immediate argument must be an 8-bit immediate");
10291 return const0_rtx;
10292 }
10293 }
10294 }
10295 else if (i == nargs-1)
10296 {
10297 if (!insn_p->operand[nargs].predicate (op, SImode))
10298 {
10299 error ("incorrect rounding operand");
10300 return const0_rtx;
10301 }
10302
10303	  /* If there is no rounding, use the normal version of the pattern.  */
10304 if (INTVAL (op) == NO_ROUND)
10305 redundant_embed_rnd = 1;
10306 }
10307 else
10308 {
10309 if (VECTOR_MODE_P (mode))
10310 op = safe_vector_operand (op, mode);
10311
10312 op = fixup_modeless_constant (op, mode);
10313
10314 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10315 {
10316 if (optimize || !match)
10317 op = copy_to_mode_reg (mode, op);
10318 }
10319 else
10320 {
10321 op = copy_to_reg (op);
10322 op = lowpart_subreg (mode, op, GET_MODE (op));
10323 }
10324 }
10325
10326 args[i].op = op;
10327 args[i].mode = mode;
10328 }
10329
10330 switch (nargs)
10331 {
10332 case 1:
10333 pat = GEN_FCN (icode) (target, args[0].op);
10334 break;
10335 case 2:
10336 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10337 break;
10338 case 3:
10339 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10340 args[2].op);
10341 break;
10342 case 4:
10343 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10344 args[2].op, args[3].op);
10345 break;
10346 case 5:
10347 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10348 args[2].op, args[3].op, args[4].op);
10349 break;
10350 case 6:
10351 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10352 args[2].op, args[3].op, args[4].op,
10353 args[5].op);
10354 break;
10355 default:
10356 gcc_unreachable ();
10357 }
10358
10359 if (!pat)
10360 return 0;
10361
10362 if (redundant_embed_rnd)
10363 pat = ix86_erase_embedded_rounding (pat);
10364
10365 emit_insn (pat);
10366 return target;
10367}
10368
10369/* Subroutine of ix86_expand_builtin to take care of special insns
10370 with variable number of operands. */
10371
10372static rtx
10373ix86_expand_special_args_builtin (const struct builtin_description *d,
10374 tree exp, rtx target)
10375{
10376 tree arg;
10377 rtx pat, op;
10378 unsigned int i, nargs, arg_adjust, memory;
10379 bool aligned_mem = false;
10380 struct
10381 {
10382 rtx op;
10383 machine_mode mode;
10384 } args[3];
10385 enum insn_code icode = d->icode;
10386 bool last_arg_constant = false;
10387 const struct insn_data_d *insn_p = &insn_data[icode];
10388 machine_mode tmode = insn_p->operand[0].mode;
10389 enum { load, store } klass;
10390
10391 switch ((enum ix86_builtin_func_type) d->flag)
10392 {
10393 case VOID_FTYPE_VOID:
10394 emit_insn (GEN_FCN (icode) (target));
10395 return 0;
10396 case VOID_FTYPE_UINT64:
10397 case VOID_FTYPE_UNSIGNED:
10398 nargs = 0;
10399 klass = store;
10400 memory = 0;
10401 break;
10402
10403 case INT_FTYPE_VOID:
10404 case USHORT_FTYPE_VOID:
10405 case UINT64_FTYPE_VOID:
10406 case UINT_FTYPE_VOID:
10407 case UNSIGNED_FTYPE_VOID:
10408 nargs = 0;
10409 klass = load;
10410 memory = 0;
10411 break;
10412 case UINT64_FTYPE_PUNSIGNED:
10413 case V2DI_FTYPE_PV2DI:
10414 case V4DI_FTYPE_PV4DI:
10415 case V32QI_FTYPE_PCCHAR:
10416 case V16QI_FTYPE_PCCHAR:
10417 case V8SF_FTYPE_PCV4SF:
10418 case V8SF_FTYPE_PCFLOAT:
10419 case V4SF_FTYPE_PCFLOAT:
10420 case V4DF_FTYPE_PCV2DF:
10421 case V4DF_FTYPE_PCDOUBLE:
10422 case V2DF_FTYPE_PCDOUBLE:
10423 case VOID_FTYPE_PVOID:
10424 case V8DI_FTYPE_PV8DI:
10425 nargs = 1;
10426 klass = load;
10427 memory = 0;
10428 switch (icode)
10429 {
10430 case CODE_FOR_sse4_1_movntdqa:
10431 case CODE_FOR_avx2_movntdqa:
10432 case CODE_FOR_avx512f_movntdqa:
10433 aligned_mem = true;
10434 break;
10435 default:
10436 break;
10437 }
10438 break;
10439 case VOID_FTYPE_PV2SF_V4SF:
10440 case VOID_FTYPE_PV8DI_V8DI:
10441 case VOID_FTYPE_PV4DI_V4DI:
10442 case VOID_FTYPE_PV2DI_V2DI:
10443 case VOID_FTYPE_PCHAR_V32QI:
10444 case VOID_FTYPE_PCHAR_V16QI:
10445 case VOID_FTYPE_PFLOAT_V16SF:
10446 case VOID_FTYPE_PFLOAT_V8SF:
10447 case VOID_FTYPE_PFLOAT_V4SF:
10448 case VOID_FTYPE_PDOUBLE_V8DF:
10449 case VOID_FTYPE_PDOUBLE_V4DF:
10450 case VOID_FTYPE_PDOUBLE_V2DF:
10451 case VOID_FTYPE_PLONGLONG_LONGLONG:
10452 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10453 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10454 case VOID_FTYPE_PINT_INT:
10455 nargs = 1;
10456 klass = store;
10457 /* Reserve memory operand for target. */
10458 memory = ARRAY_SIZE (args);
10459 switch (icode)
10460 {
10461 /* These builtins and instructions require the memory
10462 to be properly aligned. */
10463 case CODE_FOR_avx_movntv4di:
10464 case CODE_FOR_sse2_movntv2di:
10465 case CODE_FOR_avx_movntv8sf:
10466 case CODE_FOR_sse_movntv4sf:
10467 case CODE_FOR_sse4a_vmmovntv4sf:
10468 case CODE_FOR_avx_movntv4df:
10469 case CODE_FOR_sse2_movntv2df:
10470 case CODE_FOR_sse4a_vmmovntv2df:
10471 case CODE_FOR_sse2_movntidi:
10472 case CODE_FOR_sse_movntq:
10473 case CODE_FOR_sse2_movntisi:
10474 case CODE_FOR_avx512f_movntv16sf:
10475 case CODE_FOR_avx512f_movntv8df:
10476 case CODE_FOR_avx512f_movntv8di:
10477 aligned_mem = true;
10478 break;
10479 default:
10480 break;
10481 }
10482 break;
10483 case VOID_FTYPE_PVOID_PCVOID:
10484 nargs = 1;
10485 klass = store;
10486 memory = 0;
10487
10488 break;
10489 case V4SF_FTYPE_V4SF_PCV2SF:
10490 case V2DF_FTYPE_V2DF_PCDOUBLE:
10491 nargs = 2;
10492 klass = load;
10493 memory = 1;
10494 break;
10495 case V8SF_FTYPE_PCV8SF_V8SI:
10496 case V4DF_FTYPE_PCV4DF_V4DI:
10497 case V4SF_FTYPE_PCV4SF_V4SI:
10498 case V2DF_FTYPE_PCV2DF_V2DI:
10499 case V8SI_FTYPE_PCV8SI_V8SI:
10500 case V4DI_FTYPE_PCV4DI_V4DI:
10501 case V4SI_FTYPE_PCV4SI_V4SI:
10502 case V2DI_FTYPE_PCV2DI_V2DI:
10503 case VOID_FTYPE_INT_INT64:
10504 nargs = 2;
10505 klass = load;
10506 memory = 0;
10507 break;
10508 case VOID_FTYPE_PV8DF_V8DF_UQI:
10509 case VOID_FTYPE_PV4DF_V4DF_UQI:
10510 case VOID_FTYPE_PV2DF_V2DF_UQI:
10511 case VOID_FTYPE_PV16SF_V16SF_UHI:
10512 case VOID_FTYPE_PV8SF_V8SF_UQI:
10513 case VOID_FTYPE_PV4SF_V4SF_UQI:
10514 case VOID_FTYPE_PV8DI_V8DI_UQI:
10515 case VOID_FTYPE_PV4DI_V4DI_UQI:
10516 case VOID_FTYPE_PV2DI_V2DI_UQI:
10517 case VOID_FTYPE_PV16SI_V16SI_UHI:
10518 case VOID_FTYPE_PV8SI_V8SI_UQI:
10519 case VOID_FTYPE_PV4SI_V4SI_UQI:
10520 case VOID_FTYPE_PV64QI_V64QI_UDI:
10521 case VOID_FTYPE_PV32HI_V32HI_USI:
10522 case VOID_FTYPE_PV32QI_V32QI_USI:
10523 case VOID_FTYPE_PV16QI_V16QI_UHI:
10524 case VOID_FTYPE_PV16HI_V16HI_UHI:
10525 case VOID_FTYPE_PV8HI_V8HI_UQI:
10526 switch (icode)
10527 {
10528 /* These builtins and instructions require the memory
10529 to be properly aligned. */
10530 case CODE_FOR_avx512f_storev16sf_mask:
10531 case CODE_FOR_avx512f_storev16si_mask:
10532 case CODE_FOR_avx512f_storev8df_mask:
10533 case CODE_FOR_avx512f_storev8di_mask:
10534 case CODE_FOR_avx512vl_storev8sf_mask:
10535 case CODE_FOR_avx512vl_storev8si_mask:
10536 case CODE_FOR_avx512vl_storev4df_mask:
10537 case CODE_FOR_avx512vl_storev4di_mask:
10538 case CODE_FOR_avx512vl_storev4sf_mask:
10539 case CODE_FOR_avx512vl_storev4si_mask:
10540 case CODE_FOR_avx512vl_storev2df_mask:
10541 case CODE_FOR_avx512vl_storev2di_mask:
10542 aligned_mem = true;
10543 break;
10544 default:
10545 break;
10546 }
10547 /* FALLTHRU */
10548 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10549 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10550 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10551 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10552 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10553 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10554 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10555 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10556 case VOID_FTYPE_PV8SI_V8DI_UQI:
10557 case VOID_FTYPE_PV8HI_V8DI_UQI:
10558 case VOID_FTYPE_PV16HI_V16SI_UHI:
10559 case VOID_FTYPE_PV16QI_V8DI_UQI:
10560 case VOID_FTYPE_PV16QI_V16SI_UHI:
10561 case VOID_FTYPE_PV4SI_V4DI_UQI:
10562 case VOID_FTYPE_PV4SI_V2DI_UQI:
10563 case VOID_FTYPE_PV8HI_V4DI_UQI:
10564 case VOID_FTYPE_PV8HI_V2DI_UQI:
10565 case VOID_FTYPE_PV8HI_V8SI_UQI:
10566 case VOID_FTYPE_PV8HI_V4SI_UQI:
10567 case VOID_FTYPE_PV16QI_V4DI_UQI:
10568 case VOID_FTYPE_PV16QI_V2DI_UQI:
10569 case VOID_FTYPE_PV16QI_V8SI_UQI:
10570 case VOID_FTYPE_PV16QI_V4SI_UQI:
10571 case VOID_FTYPE_PCHAR_V64QI_UDI:
10572 case VOID_FTYPE_PCHAR_V32QI_USI:
10573 case VOID_FTYPE_PCHAR_V16QI_UHI:
10574 case VOID_FTYPE_PSHORT_V32HI_USI:
10575 case VOID_FTYPE_PSHORT_V16HI_UHI:
10576 case VOID_FTYPE_PSHORT_V8HI_UQI:
10577 case VOID_FTYPE_PINT_V16SI_UHI:
10578 case VOID_FTYPE_PINT_V8SI_UQI:
10579 case VOID_FTYPE_PINT_V4SI_UQI:
10580 case VOID_FTYPE_PINT64_V8DI_UQI:
10581 case VOID_FTYPE_PINT64_V4DI_UQI:
10582 case VOID_FTYPE_PINT64_V2DI_UQI:
10583 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10584 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10585 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10586 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10587 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10588 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10589 case VOID_FTYPE_PV32QI_V32HI_USI:
10590 case VOID_FTYPE_PV16QI_V16HI_UHI:
10591 case VOID_FTYPE_PV8QI_V8HI_UQI:
10592 nargs = 2;
10593 klass = store;
10594 /* Reserve memory operand for target. */
10595 memory = ARRAY_SIZE (args);
10596 break;
10597 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10598 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10599 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10600 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10601 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10602 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10603 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10604 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10605 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10606 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10607 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10608 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10609 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10610 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10611 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10612 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10613 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10614 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10615 switch (icode)
10616 {
10617 /* These builtins and instructions require the memory
10618 to be properly aligned. */
10619 case CODE_FOR_avx512f_loadv16sf_mask:
10620 case CODE_FOR_avx512f_loadv16si_mask:
10621 case CODE_FOR_avx512f_loadv8df_mask:
10622 case CODE_FOR_avx512f_loadv8di_mask:
10623 case CODE_FOR_avx512vl_loadv8sf_mask:
10624 case CODE_FOR_avx512vl_loadv8si_mask:
10625 case CODE_FOR_avx512vl_loadv4df_mask:
10626 case CODE_FOR_avx512vl_loadv4di_mask:
10627 case CODE_FOR_avx512vl_loadv4sf_mask:
10628 case CODE_FOR_avx512vl_loadv4si_mask:
10629 case CODE_FOR_avx512vl_loadv2df_mask:
10630 case CODE_FOR_avx512vl_loadv2di_mask:
10631 case CODE_FOR_avx512bw_loadv64qi_mask:
10632 case CODE_FOR_avx512vl_loadv32qi_mask:
10633 case CODE_FOR_avx512vl_loadv16qi_mask:
10634 case CODE_FOR_avx512bw_loadv32hi_mask:
10635 case CODE_FOR_avx512vl_loadv16hi_mask:
10636 case CODE_FOR_avx512vl_loadv8hi_mask:
10637 aligned_mem = true;
10638 break;
10639 default:
10640 break;
10641 }
10642 /* FALLTHRU */
10643 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10644 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10645 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10646 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10647 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10648 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10649 case V16SI_FTYPE_PCINT_V16SI_UHI:
10650 case V8SI_FTYPE_PCINT_V8SI_UQI:
10651 case V4SI_FTYPE_PCINT_V4SI_UQI:
10652 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10653 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10654 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10655 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10656 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10657 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10658 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10659 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10660 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10661 nargs = 3;
10662 klass = load;
10663 memory = 0;
10664 break;
10665 case VOID_FTYPE_UINT_UINT_UINT:
10666 case VOID_FTYPE_UINT64_UINT_UINT:
10667 case UCHAR_FTYPE_UINT_UINT_UINT:
10668 case UCHAR_FTYPE_UINT64_UINT_UINT:
10669 nargs = 3;
10670 klass = load;
10671 memory = ARRAY_SIZE (args);
10672 last_arg_constant = true;
10673 break;
10674 default:
10675 gcc_unreachable ();
10676 }
10677
10678 gcc_assert (nargs <= ARRAY_SIZE (args));
10679
10680 if (klass == store)
10681 {
10682 arg = CALL_EXPR_ARG (exp, 0);
10683 op = expand_normal (arg);
10684 gcc_assert (target == 0);
10685 if (memory)
10686 {
10687 op = ix86_zero_extend_to_Pmode (op);
10688 target = gen_rtx_MEM (tmode, op);
10689 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10690 on it. Try to improve it using get_pointer_alignment,
10691 and if the special builtin is one that requires strict
10692	     mode alignment, also from its GET_MODE_ALIGNMENT.
10693 Failure to do so could lead to ix86_legitimate_combined_insn
10694 rejecting all changes to such insns. */
10695 unsigned int align = get_pointer_alignment (arg);
10696 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10697 align = GET_MODE_ALIGNMENT (tmode);
10698 if (MEM_ALIGN (target) < align)
10699 set_mem_align (target, align);
10700 }
10701 else
10702 target = force_reg (tmode, op);
10703 arg_adjust = 1;
10704 }
10705 else
10706 {
10707 arg_adjust = 0;
10708 if (optimize
10709 || target == 0
10710 || !register_operand (target, tmode)
10711 || GET_MODE (target) != tmode)
10712 target = gen_reg_rtx (tmode);
10713 }
10714
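  /* Process the remaining call arguments: operand MEMORY (when in range)
     becomes the memory operand, the last argument may have to be an
     immediate, and everything else is forced into registers.  */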
10715 for (i = 0; i < nargs; i++)
10716 {
10717 machine_mode mode = insn_p->operand[i + 1].mode;
10718 bool match;
10719
10720 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10721 op = expand_normal (arg);
10722 match = insn_p->operand[i + 1].predicate (op, mode);
10723
10724 if (last_arg_constant && (i + 1) == nargs)
10725 {
10726 if (!match)
10727 {
10728 if (icode == CODE_FOR_lwp_lwpvalsi3
10729 || icode == CODE_FOR_lwp_lwpinssi3
10730 || icode == CODE_FOR_lwp_lwpvaldi3
10731 || icode == CODE_FOR_lwp_lwpinsdi3)
10732 error ("the last argument must be a 32-bit immediate");
10733 else
10734 error ("the last argument must be an 8-bit immediate");
10735 return const0_rtx;
10736 }
10737 }
10738 else
10739 {
10740 if (i == memory)
10741 {
10742 /* This must be the memory operand. */
10743 op = ix86_zero_extend_to_Pmode (op);
10744 op = gen_rtx_MEM (mode, op);
10745 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10746 on it. Try to improve it using get_pointer_alignment,
10747 and if the special builtin is one that requires strict
10748		 mode alignment, also from its GET_MODE_ALIGNMENT.
10749 Failure to do so could lead to ix86_legitimate_combined_insn
10750 rejecting all changes to such insns. */
10751 unsigned int align = get_pointer_alignment (arg);
10752 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10753 align = GET_MODE_ALIGNMENT (mode);
10754 if (MEM_ALIGN (op) < align)
10755 set_mem_align (op, align);
10756 }
10757 else
10758 {
10759	      /* This must be a register.  */
10760 if (VECTOR_MODE_P (mode))
10761 op = safe_vector_operand (op, mode);
10762
10763 op = fixup_modeless_constant (op, mode);
10764
10765 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10766 op = copy_to_mode_reg (mode, op);
10767 else
10768 {
10769 op = copy_to_reg (op);
10770 op = lowpart_subreg (mode, op, GET_MODE (op));
10771 }
10772 }
10773 }
10774
10775 args[i].op = op;
10776 args[i].mode = mode;
10777 }
10778
10779 switch (nargs)
10780 {
10781 case 0:
10782 pat = GEN_FCN (icode) (target);
10783 break;
10784 case 1:
10785 pat = GEN_FCN (icode) (target, args[0].op);
10786 break;
10787 case 2:
10788 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10789 break;
10790 case 3:
10791 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10792 break;
10793 default:
10794 gcc_unreachable ();
10795 }
10796
10797 if (! pat)
10798 return 0;
10799 emit_insn (pat);
10800 return klass == store ? 0 : target;
10801}
10802
10803/* Return the integer constant in ARG. Constrain it to be in the range
10804 of the subparts of VEC_TYPE; issue an error if not. */
10805
10806static int
10807get_element_number (tree vec_type, tree arg)
10808{
10809 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10810
10811 if (!tree_fits_uhwi_p (arg)
10812 || (elt = tree_to_uhwi (arg), elt > max))
10813 {
10814 error ("selector must be an integer constant in the range "
10815 "[0, %wi]", max);
10816 return 0;
10817 }
10818
10819 return elt;
10820}
10821
10822/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10823 ix86_expand_vector_init. We DO have language-level syntax for this, in
10824 the form of (type){ init-list }. Except that since we can't place emms
10825 instructions from inside the compiler, we can't allow the use of MMX
10826 registers unless the user explicitly asks for it. So we do *not* define
10827 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10828   we have builtins invoked by mmintrin.h that give us license to emit
10829 these sorts of instructions. */
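/* (For illustration only: the MMX intrinsics in mmintrin.h, e.g.
   _mm_set_pi32, are defined in terms of such __builtin_ia32_vec_init_*
   builtins, which the function below expands.)  */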
10830
10831static rtx
10832ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10833{
10834 machine_mode tmode = TYPE_MODE (type);
10835 machine_mode inner_mode = GET_MODE_INNER (tmode);
10836 int i, n_elt = GET_MODE_NUNITS (tmode);
10837 rtvec v = rtvec_alloc (n_elt);
10838
10839 gcc_assert (VECTOR_MODE_P (tmode));
10840 gcc_assert (call_expr_nargs (exp) == n_elt);
10841
10842 for (i = 0; i < n_elt; ++i)
10843 {
10844 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10845 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10846 }
10847
10848 if (!target || !register_operand (target, tmode))
10849 target = gen_reg_rtx (tmode);
10850
10851 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10852 return target;
10853}
10854
10855/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10856 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10857 had a language-level syntax for referencing vector elements. */
10858
10859static rtx
10860ix86_expand_vec_ext_builtin (tree exp, rtx target)
10861{
10862 machine_mode tmode, mode0;
10863 tree arg0, arg1;
10864 int elt;
10865 rtx op0;
10866
10867 arg0 = CALL_EXPR_ARG (exp, 0);
10868 arg1 = CALL_EXPR_ARG (exp, 1);
10869
10870 op0 = expand_normal (arg0);
10871 elt = get_element_number (TREE_TYPE (arg0), arg1);
10872
10873 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10874 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10875 gcc_assert (VECTOR_MODE_P (mode0));
10876
10877 op0 = force_reg (mode0, op0);
10878
10879 if (optimize || !target || !register_operand (target, tmode))
10880 target = gen_reg_rtx (tmode);
10881
10882 ix86_expand_vector_extract (true, target, op0, elt);
10883
10884 return target;
10885}
10886
10887/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10888 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10889 a language-level syntax for referencing vector elements. */
10890
10891static rtx
10892ix86_expand_vec_set_builtin (tree exp)
10893{
10894 machine_mode tmode, mode1;
10895 tree arg0, arg1, arg2;
10896 int elt;
10897 rtx op0, op1, target;
10898
10899 arg0 = CALL_EXPR_ARG (exp, 0);
10900 arg1 = CALL_EXPR_ARG (exp, 1);
10901 arg2 = CALL_EXPR_ARG (exp, 2);
10902
10903 tmode = TYPE_MODE (TREE_TYPE (arg0));
10904 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10905 gcc_assert (VECTOR_MODE_P (tmode));
10906
10907 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10908 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10909 elt = get_element_number (TREE_TYPE (arg0), arg2);
10910
10911 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10912 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10913
10914 op0 = force_reg (tmode, op0);
10915 op1 = force_reg (mode1, op1);
10916
10917 /* OP0 is the source of these builtin functions and shouldn't be
10918 modified. Create a copy, use it and return it as target. */
10919 target = gen_reg_rtx (tmode);
10920 emit_move_insn (target, op0);
10921 ix86_expand_vector_set (true, target, op1, elt);
10922
10923 return target;
10924}
10925
10926/* Expand an expression EXP that calls a built-in function,
10927 with result going to TARGET if that's convenient
10928 (and in mode MODE if that's convenient).
10929 SUBTARGET may be used as the target for computing one of EXP's operands.
10930 IGNORE is nonzero if the value is to be ignored. */
10931
10932rtx
10933ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10934 machine_mode mode, int ignore)
10935{
10936 size_t i;
10937 enum insn_code icode, icode2;
10938 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10939 tree arg0, arg1, arg2, arg3, arg4;
10940 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10941 machine_mode mode0, mode1, mode2, mode3, mode4;
10942  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
10943
10944 /* For CPU builtins that can be folded, fold first and expand the fold. */
10945 switch (fcode)
10946 {
10947 case IX86_BUILTIN_CPU_INIT:
10948 {
10949 /* Make it call __cpu_indicator_init in libgcc. */
10950 tree call_expr, fndecl, type;
10951 type = build_function_type_list (integer_type_node, NULL_TREE);
10952 fndecl = build_fn_decl ("__cpu_indicator_init", type);
10953 call_expr = build_call_expr (fndecl, 0);
10954 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
10955 }
10956 case IX86_BUILTIN_CPU_IS:
10957 case IX86_BUILTIN_CPU_SUPPORTS:
10958 {
10959 tree arg0 = CALL_EXPR_ARG (exp, 0);
10960 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
10961 gcc_assert (fold_expr != NULL_TREE);
10962 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
10963 }
10964 }
10965
10966 HOST_WIDE_INT isa = ix86_isa_flags;
10967 HOST_WIDE_INT isa2 = ix86_isa_flags2;
10968 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
10969 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
10970 /* The general case is we require all the ISAs specified in bisa{,2}
10971 to be enabled.
10972 The exceptions are:
10973 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
10974 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
10975 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
10976     where for each such pair it is sufficient if either of the ISAs is
10977     enabled; if the pair is ORed with other options, those must be enabled too.
10978     OPTION_MASK_ISA_MMX in bisa is also satisfied if TARGET_MMX_WITH_SSE.  */
10979 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10980 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
10981 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
10982 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
10983 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10984 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
10985 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
10986 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
10987 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10988 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
10989 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
10990 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
10991 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
10992 {
10993 bisa &= ~OPTION_MASK_ISA_MMX;
10994 bisa |= OPTION_MASK_ISA_SSE2;
10995    }
10996 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
10997 {
10998 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
10999 if (TARGET_ABI_X32)
11000 bisa |= OPTION_MASK_ABI_X32;
11001 else
11002 bisa |= OPTION_MASK_ABI_64;
11003 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11004 (enum fpmath_unit) 0,
11005 (enum prefer_vector_width) 0,
11006 false, add_abi_p);
11007 if (!opts)
11008 error ("%qE needs unknown isa option", fndecl);
11009 else
11010 {
11011 gcc_assert (opts != NULL);
11012 error ("%qE needs isa option %s", fndecl, opts);
11013 free (opts);
11014 }
11015 return expand_call (exp, target, ignore);
11016 }
11017
11018 switch (fcode)
11019 {
11020 case IX86_BUILTIN_MASKMOVQ:
11021 case IX86_BUILTIN_MASKMOVDQU:
11022 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11023 ? CODE_FOR_mmx_maskmovq
11024 : CODE_FOR_sse2_maskmovdqu);
11025 /* Note the arg order is different from the operand order. */
11026 arg1 = CALL_EXPR_ARG (exp, 0);
11027 arg2 = CALL_EXPR_ARG (exp, 1);
11028 arg0 = CALL_EXPR_ARG (exp, 2);
11029 op0 = expand_normal (arg0);
11030 op1 = expand_normal (arg1);
11031 op2 = expand_normal (arg2);
11032 mode0 = insn_data[icode].operand[0].mode;
11033 mode1 = insn_data[icode].operand[1].mode;
11034 mode2 = insn_data[icode].operand[2].mode;
11035
11036 op0 = ix86_zero_extend_to_Pmode (op0);
11037 op0 = gen_rtx_MEM (mode1, op0);
11038
11039 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11040 op0 = copy_to_mode_reg (mode0, op0);
11041 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11042 op1 = copy_to_mode_reg (mode1, op1);
11043 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11044 op2 = copy_to_mode_reg (mode2, op2);
11045 pat = GEN_FCN (icode) (op0, op1, op2);
11046 if (! pat)
11047 return 0;
11048 emit_insn (pat);
11049 return 0;
11050
11051 case IX86_BUILTIN_LDMXCSR:
11052 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11053 target = assign_386_stack_local (SImode, SLOT_TEMP);
11054 emit_move_insn (target, op0);
11055 emit_insn (gen_sse_ldmxcsr (target));
11056 return 0;
11057
11058 case IX86_BUILTIN_STMXCSR:
11059 target = assign_386_stack_local (SImode, SLOT_TEMP);
11060 emit_insn (gen_sse_stmxcsr (target));
11061 return copy_to_mode_reg (SImode, target);
11062
11063 case IX86_BUILTIN_CLFLUSH:
11064 arg0 = CALL_EXPR_ARG (exp, 0);
11065 op0 = expand_normal (arg0);
11066 icode = CODE_FOR_sse2_clflush;
11067 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11068 op0 = ix86_zero_extend_to_Pmode (op0);
11069
11070 emit_insn (gen_sse2_clflush (op0));
11071 return 0;
11072
11073 case IX86_BUILTIN_CLWB:
11074 arg0 = CALL_EXPR_ARG (exp, 0);
11075 op0 = expand_normal (arg0);
11076 icode = CODE_FOR_clwb;
11077 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11078 op0 = ix86_zero_extend_to_Pmode (op0);
11079
11080 emit_insn (gen_clwb (op0));
11081 return 0;
11082
11083 case IX86_BUILTIN_CLFLUSHOPT:
11084 arg0 = CALL_EXPR_ARG (exp, 0);
11085 op0 = expand_normal (arg0);
11086 icode = CODE_FOR_clflushopt;
11087 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11088 op0 = ix86_zero_extend_to_Pmode (op0);
11089
11090 emit_insn (gen_clflushopt (op0));
11091 return 0;
11092
11093 case IX86_BUILTIN_MONITOR:
11094 case IX86_BUILTIN_MONITORX:
11095 arg0 = CALL_EXPR_ARG (exp, 0);
11096 arg1 = CALL_EXPR_ARG (exp, 1);
11097 arg2 = CALL_EXPR_ARG (exp, 2);
11098 op0 = expand_normal (arg0);
11099 op1 = expand_normal (arg1);
11100 op2 = expand_normal (arg2);
11101 if (!REG_P (op0))
11102 op0 = ix86_zero_extend_to_Pmode (op0);
11103 if (!REG_P (op1))
11104 op1 = copy_to_mode_reg (SImode, op1);
11105 if (!REG_P (op2))
11106 op2 = copy_to_mode_reg (SImode, op2);
11107
11108 emit_insn (fcode == IX86_BUILTIN_MONITOR
11109 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11110 : gen_monitorx (Pmode, op0, op1, op2));
11111 return 0;
11112
11113 case IX86_BUILTIN_MWAIT:
11114 arg0 = CALL_EXPR_ARG (exp, 0);
11115 arg1 = CALL_EXPR_ARG (exp, 1);
11116 op0 = expand_normal (arg0);
11117 op1 = expand_normal (arg1);
11118 if (!REG_P (op0))
11119 op0 = copy_to_mode_reg (SImode, op0);
11120 if (!REG_P (op1))
11121 op1 = copy_to_mode_reg (SImode, op1);
11122 emit_insn (gen_sse3_mwait (op0, op1));
11123 return 0;
11124
11125 case IX86_BUILTIN_MWAITX:
11126 arg0 = CALL_EXPR_ARG (exp, 0);
11127 arg1 = CALL_EXPR_ARG (exp, 1);
11128 arg2 = CALL_EXPR_ARG (exp, 2);
11129 op0 = expand_normal (arg0);
11130 op1 = expand_normal (arg1);
11131 op2 = expand_normal (arg2);
11132 if (!REG_P (op0))
11133 op0 = copy_to_mode_reg (SImode, op0);
11134 if (!REG_P (op1))
11135 op1 = copy_to_mode_reg (SImode, op1);
11136 if (!REG_P (op2))
11137 op2 = copy_to_mode_reg (SImode, op2);
11138 emit_insn (gen_mwaitx (op0, op1, op2));
11139 return 0;
11140
11141 case IX86_BUILTIN_UMONITOR:
11142 arg0 = CALL_EXPR_ARG (exp, 0);
11143 op0 = expand_normal (arg0);
11144
11145 op0 = ix86_zero_extend_to_Pmode (op0);
11146	  emit_insn (gen_umonitor (Pmode, op0));
11147 return 0;
11148
11149 case IX86_BUILTIN_UMWAIT:
11150 case IX86_BUILTIN_TPAUSE:
11151 arg0 = CALL_EXPR_ARG (exp, 0);
11152 arg1 = CALL_EXPR_ARG (exp, 1);
11153 op0 = expand_normal (arg0);
11154 op1 = expand_normal (arg1);
11155
11156 if (!REG_P (op0))
11157 op0 = copy_to_mode_reg (SImode, op0);
11158
11159 op1 = force_reg (DImode, op1);
11160
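      /* The *_rex64 patterns take the 64-bit argument split into two
	 SImode halves; the 32-bit patterns consume the DImode value
	 directly.  */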
11161 if (TARGET_64BIT)
11162 {
11163 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11164 NULL, 1, OPTAB_DIRECT);
11165 switch (fcode)
11166 {
11167 case IX86_BUILTIN_UMWAIT:
11168 icode = CODE_FOR_umwait_rex64;
11169 break;
11170 case IX86_BUILTIN_TPAUSE:
11171 icode = CODE_FOR_tpause_rex64;
11172 break;
11173 default:
11174 gcc_unreachable ();
11175 }
11176
11177 op2 = gen_lowpart (SImode, op2);
11178 op1 = gen_lowpart (SImode, op1);
11179 pat = GEN_FCN (icode) (op0, op1, op2);
11180 }
11181 else
11182 {
11183 switch (fcode)
11184 {
11185 case IX86_BUILTIN_UMWAIT:
11186 icode = CODE_FOR_umwait;
11187 break;
11188 case IX86_BUILTIN_TPAUSE:
11189 icode = CODE_FOR_tpause;
11190 break;
11191 default:
11192 gcc_unreachable ();
11193 }
11194 pat = GEN_FCN (icode) (op0, op1);
11195 }
11196
11197 if (!pat)
11198 return 0;
11199
11200 emit_insn (pat);
11201
11202 if (target == 0
11203 || !register_operand (target, QImode))
11204 target = gen_reg_rtx (QImode);
11205
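      /* umwait and tpause report their status in the carry flag;
	 materialize it as the QImode result of the builtin.  */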
11206 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11207 const0_rtx);
11208 emit_insn (gen_rtx_SET (target, pat));
11209
11210 return target;
11211
11212 case IX86_BUILTIN_CLZERO:
11213 arg0 = CALL_EXPR_ARG (exp, 0);
11214 op0 = expand_normal (arg0);
11215 if (!REG_P (op0))
11216 op0 = ix86_zero_extend_to_Pmode (op0);
11217	  emit_insn (gen_clzero (Pmode, op0));
11218 return 0;
11219
11220 case IX86_BUILTIN_CLDEMOTE:
11221 arg0 = CALL_EXPR_ARG (exp, 0);
11222 op0 = expand_normal (arg0);
11223 icode = CODE_FOR_cldemote;
11224 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11225 op0 = ix86_zero_extend_to_Pmode (op0);
11226
11227 emit_insn (gen_cldemote (op0));
11228 return 0;
11229
11230 case IX86_BUILTIN_VEC_INIT_V2SI:
11231 case IX86_BUILTIN_VEC_INIT_V4HI:
11232 case IX86_BUILTIN_VEC_INIT_V8QI:
11233 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11234
11235 case IX86_BUILTIN_VEC_EXT_V2DF:
11236 case IX86_BUILTIN_VEC_EXT_V2DI:
11237 case IX86_BUILTIN_VEC_EXT_V4SF:
11238 case IX86_BUILTIN_VEC_EXT_V4SI:
11239 case IX86_BUILTIN_VEC_EXT_V8HI:
11240 case IX86_BUILTIN_VEC_EXT_V2SI:
11241 case IX86_BUILTIN_VEC_EXT_V4HI:
11242 case IX86_BUILTIN_VEC_EXT_V16QI:
11243 return ix86_expand_vec_ext_builtin (exp, target);
11244
11245 case IX86_BUILTIN_VEC_SET_V2DI:
11246 case IX86_BUILTIN_VEC_SET_V4SF:
11247 case IX86_BUILTIN_VEC_SET_V4SI:
11248 case IX86_BUILTIN_VEC_SET_V8HI:
11249 case IX86_BUILTIN_VEC_SET_V4HI:
11250 case IX86_BUILTIN_VEC_SET_V16QI:
11251 return ix86_expand_vec_set_builtin (exp);
11252
11253 case IX86_BUILTIN_NANQ:
11254 case IX86_BUILTIN_NANSQ:
11255 return expand_call (exp, target, ignore);
11256
11257 case IX86_BUILTIN_RDPID:
11258
11259 op0 = gen_reg_rtx (word_mode);
11260
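      /* rdpid produces a word-mode value; the builtin returns an
	 unsigned int, so on 64-bit targets truncate it to SImode.  */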
11261 if (TARGET_64BIT)
11262 {
11263 insn = gen_rdpid_rex64 (op0);
11264 op0 = convert_to_mode (SImode, op0, 1);
11265 }
11266 else
11267 insn = gen_rdpid (op0);
11268
11269 emit_insn (insn);
11270
11271 if (target == 0
11272 || !register_operand (target, SImode))
11273 target = gen_reg_rtx (SImode);
11274
11275 emit_move_insn (target, op0);
11276 return target;
11277
11278 case IX86_BUILTIN_2INTERSECTD512:
11279 case IX86_BUILTIN_2INTERSECTQ512:
11280 case IX86_BUILTIN_2INTERSECTD256:
11281 case IX86_BUILTIN_2INTERSECTQ256:
11282 case IX86_BUILTIN_2INTERSECTD128:
11283 case IX86_BUILTIN_2INTERSECTQ128:
11284 arg0 = CALL_EXPR_ARG (exp, 0);
11285 arg1 = CALL_EXPR_ARG (exp, 1);
11286 arg2 = CALL_EXPR_ARG (exp, 2);
11287 arg3 = CALL_EXPR_ARG (exp, 3);
11288 op0 = expand_normal (arg0);
11289 op1 = expand_normal (arg1);
11290 op2 = expand_normal (arg2);
11291 op3 = expand_normal (arg3);
11292
11293 if (!address_operand (op0, VOIDmode))
11294 {
11295 op0 = convert_memory_address (Pmode, op0);
11296 op0 = copy_addr_to_reg (op0);
11297 }
11298 if (!address_operand (op1, VOIDmode))
11299 {
11300 op1 = convert_memory_address (Pmode, op1);
11301 op1 = copy_addr_to_reg (op1);
11302 }
11303
11304 switch (fcode)
11305 {
11306 case IX86_BUILTIN_2INTERSECTD512:
11307 mode4 = P2HImode;
11308 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11309 break;
11310 case IX86_BUILTIN_2INTERSECTQ512:
11311 mode4 = P2QImode;
11312 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11313 break;
11314 case IX86_BUILTIN_2INTERSECTD256:
11315 mode4 = P2QImode;
11316 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11317 break;
11318 case IX86_BUILTIN_2INTERSECTQ256:
11319 mode4 = P2QImode;
11320 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11321 break;
11322 case IX86_BUILTIN_2INTERSECTD128:
11323 mode4 = P2QImode;
11324 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11325 break;
11326 case IX86_BUILTIN_2INTERSECTQ128:
11327 mode4 = P2QImode;
11328 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11329 break;
11330 default:
11331 gcc_unreachable ();
11332 }
11333
11334 mode2 = insn_data[icode].operand[1].mode;
11335 mode3 = insn_data[icode].operand[2].mode;
11336 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11337 op2 = copy_to_mode_reg (mode2, op2);
11338 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11339 op3 = copy_to_mode_reg (mode3, op3);
11340
11341 op4 = gen_reg_rtx (mode4);
11342 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11343 mode0 = mode4 == P2HImode ? HImode : QImode;
11344 emit_move_insn (gen_rtx_MEM (mode0, op0),
11345 gen_lowpart (mode0, op4));
11346 emit_move_insn (gen_rtx_MEM (mode0, op1),
11347 gen_highpart (mode0, op4));
11348
11349 return 0;
11350
11351 case IX86_BUILTIN_RDPMC:
11352 case IX86_BUILTIN_RDTSC:
11353 case IX86_BUILTIN_RDTSCP:
11354 case IX86_BUILTIN_XGETBV:
11355
11356 op0 = gen_reg_rtx (DImode);
11357 op1 = gen_reg_rtx (DImode);
11358
11359 if (fcode == IX86_BUILTIN_RDPMC)
11360 {
11361 arg0 = CALL_EXPR_ARG (exp, 0);
11362 op2 = expand_normal (arg0);
11363 if (!register_operand (op2, SImode))
11364 op2 = copy_to_mode_reg (SImode, op2);
11365
11366 insn = (TARGET_64BIT
11367 ? gen_rdpmc_rex64 (op0, op1, op2)
11368 : gen_rdpmc (op0, op2));
11369 emit_insn (insn);
11370 }
11371 else if (fcode == IX86_BUILTIN_XGETBV)
11372 {
11373 arg0 = CALL_EXPR_ARG (exp, 0);
11374 op2 = expand_normal (arg0);
11375 if (!register_operand (op2, SImode))
11376 op2 = copy_to_mode_reg (SImode, op2);
11377
11378 insn = (TARGET_64BIT
11379 ? gen_xgetbv_rex64 (op0, op1, op2)
11380 : gen_xgetbv (op0, op2));
11381 emit_insn (insn);
11382 }
11383 else if (fcode == IX86_BUILTIN_RDTSC)
11384 {
11385 insn = (TARGET_64BIT
11386 ? gen_rdtsc_rex64 (op0, op1)
11387 : gen_rdtsc (op0));
11388 emit_insn (insn);
11389 }
11390 else
11391 {
11392 op2 = gen_reg_rtx (SImode);
11393
11394 insn = (TARGET_64BIT
11395 ? gen_rdtscp_rex64 (op0, op1, op2)
11396 : gen_rdtscp (op0, op2));
11397 emit_insn (insn);
11398
11399 arg0 = CALL_EXPR_ARG (exp, 0);
11400 op4 = expand_normal (arg0);
11401 if (!address_operand (op4, VOIDmode))
11402 {
11403 op4 = convert_memory_address (Pmode, op4);
11404 op4 = copy_addr_to_reg (op4);
11405 }
11406 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11407 }
11408
11409 if (target == 0
11410 || !register_operand (target, DImode))
11411 target = gen_reg_rtx (DImode);
11412
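      /* On 64-bit targets the expanders return the low 32 bits in op0
	 and the high 32 bits in op1; combine them into one DImode
	 value.  */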
11413 if (TARGET_64BIT)
11414 {
11415 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11416 op1, 1, OPTAB_DIRECT);
11417 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11418 op0, 1, OPTAB_DIRECT);
11419 }
11420
11421 emit_move_insn (target, op0);
11422 return target;
11423
11424 case IX86_BUILTIN_ENQCMD:
11425 case IX86_BUILTIN_ENQCMDS:
11426 case IX86_BUILTIN_MOVDIR64B:
11427
11428 arg0 = CALL_EXPR_ARG (exp, 0);
11429 arg1 = CALL_EXPR_ARG (exp, 1);
11430 op0 = expand_normal (arg0);
11431 op1 = expand_normal (arg1);
11432
11433 op0 = ix86_zero_extend_to_Pmode (op0);
11434 if (!address_operand (op1, VOIDmode))
11435 {
11436 op1 = convert_memory_address (Pmode, op1);
11437 op1 = copy_addr_to_reg (op1);
11438 }
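	  /* Both movdir64b and enqcmd/enqcmds operate on a 64-byte block
	     of memory, represented here as an XImode MEM.  */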
11439 op1 = gen_rtx_MEM (XImode, op1);
11440
11441 if (fcode == IX86_BUILTIN_MOVDIR64B)
11442 {
11443 emit_insn (gen_movdir64b (Pmode, op0, op1));
11444 return 0;
11445 }
11446 else
11447 {
11448 rtx pat;
11449
11450 target = gen_reg_rtx (SImode);
11451 emit_move_insn (target, const0_rtx);
11452 target = gen_rtx_SUBREG (QImode, target, 0);
11453
11454 if (fcode == IX86_BUILTIN_ENQCMD)
11455 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11456 else
11457 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11458
11459 emit_insn (pat);
11460
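	    /* The enqueue status comes back in the zero flag; copy it
	       into the low byte of the SImode result.  */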
11461 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11462 gen_rtx_fmt_ee (EQ, QImode,
11463 SET_DEST (pat),
11464 const0_rtx)));
11465
11466 return SUBREG_REG (target);
11467 }
11468
11469 case IX86_BUILTIN_FXSAVE:
11470 case IX86_BUILTIN_FXRSTOR:
11471 case IX86_BUILTIN_FXSAVE64:
11472 case IX86_BUILTIN_FXRSTOR64:
11473 case IX86_BUILTIN_FNSTENV:
11474 case IX86_BUILTIN_FLDENV:
11475 mode0 = BLKmode;
11476 switch (fcode)
11477 {
11478 case IX86_BUILTIN_FXSAVE:
11479 icode = CODE_FOR_fxsave;
11480 break;
11481 case IX86_BUILTIN_FXRSTOR:
11482 icode = CODE_FOR_fxrstor;
11483 break;
11484 case IX86_BUILTIN_FXSAVE64:
11485 icode = CODE_FOR_fxsave64;
11486 break;
11487 case IX86_BUILTIN_FXRSTOR64:
11488 icode = CODE_FOR_fxrstor64;
11489 break;
11490 case IX86_BUILTIN_FNSTENV:
11491 icode = CODE_FOR_fnstenv;
11492 break;
11493 case IX86_BUILTIN_FLDENV:
11494 icode = CODE_FOR_fldenv;
11495 break;
11496 default:
11497 gcc_unreachable ();
11498 }
11499
11500 arg0 = CALL_EXPR_ARG (exp, 0);
11501 op0 = expand_normal (arg0);
11502
11503 if (!address_operand (op0, VOIDmode))
11504 {
11505 op0 = convert_memory_address (Pmode, op0);
11506 op0 = copy_addr_to_reg (op0);
11507 }
11508 op0 = gen_rtx_MEM (mode0, op0);
11509
11510 pat = GEN_FCN (icode) (op0);
11511 if (pat)
11512 emit_insn (pat);
11513 return 0;
11514
11515 case IX86_BUILTIN_XSETBV:
11516 arg0 = CALL_EXPR_ARG (exp, 0);
11517 arg1 = CALL_EXPR_ARG (exp, 1);
11518 op0 = expand_normal (arg0);
11519 op1 = expand_normal (arg1);
11520
11521 if (!REG_P (op0))
11522 op0 = copy_to_mode_reg (SImode, op0);
11523
11524 op1 = force_reg (DImode, op1);
11525
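      /* xsetbv writes the 64-bit value in op1 to the XCR selected by
	 op0; the 64-bit pattern takes that value split into two SImode
	 halves (EDX:EAX).  */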
11526 if (TARGET_64BIT)
11527 {
11528 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11529 NULL, 1, OPTAB_DIRECT);
11530
11531 icode = CODE_FOR_xsetbv_rex64;
11532
11533 op2 = gen_lowpart (SImode, op2);
11534 op1 = gen_lowpart (SImode, op1);
11535 pat = GEN_FCN (icode) (op0, op1, op2);
11536 }
11537 else
11538 {
11539 icode = CODE_FOR_xsetbv;
11540
11541 pat = GEN_FCN (icode) (op0, op1);
11542 }
11543 if (pat)
11544 emit_insn (pat);
11545 return 0;
11546
11547 case IX86_BUILTIN_XSAVE:
11548 case IX86_BUILTIN_XRSTOR:
11549 case IX86_BUILTIN_XSAVE64:
11550 case IX86_BUILTIN_XRSTOR64:
11551 case IX86_BUILTIN_XSAVEOPT:
11552 case IX86_BUILTIN_XSAVEOPT64:
11553 case IX86_BUILTIN_XSAVES:
11554 case IX86_BUILTIN_XRSTORS:
11555 case IX86_BUILTIN_XSAVES64:
11556 case IX86_BUILTIN_XRSTORS64:
11557 case IX86_BUILTIN_XSAVEC:
11558 case IX86_BUILTIN_XSAVEC64:
11559 arg0 = CALL_EXPR_ARG (exp, 0);
11560 arg1 = CALL_EXPR_ARG (exp, 1);
11561 op0 = expand_normal (arg0);
11562 op1 = expand_normal (arg1);
11563
11564 if (!address_operand (op0, VOIDmode))
11565 {
11566 op0 = convert_memory_address (Pmode, op0);
11567 op0 = copy_addr_to_reg (op0);
11568 }
11569 op0 = gen_rtx_MEM (BLKmode, op0);
11570
11571 op1 = force_reg (DImode, op1);
11572
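      /* The second argument is the requested state-component bitmap,
	 which the instructions take in EDX:EAX; split the DImode value
	 into two SImode halves for the 64-bit patterns.  */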
11573 if (TARGET_64BIT)
11574 {
11575 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11576 NULL, 1, OPTAB_DIRECT);
11577 switch (fcode)
11578 {
11579 case IX86_BUILTIN_XSAVE:
11580 icode = CODE_FOR_xsave_rex64;
11581 break;
11582 case IX86_BUILTIN_XRSTOR:
11583 icode = CODE_FOR_xrstor_rex64;
11584 break;
11585 case IX86_BUILTIN_XSAVE64:
11586 icode = CODE_FOR_xsave64;
11587 break;
11588 case IX86_BUILTIN_XRSTOR64:
11589 icode = CODE_FOR_xrstor64;
11590 break;
11591 case IX86_BUILTIN_XSAVEOPT:
11592 icode = CODE_FOR_xsaveopt_rex64;
11593 break;
11594 case IX86_BUILTIN_XSAVEOPT64:
11595 icode = CODE_FOR_xsaveopt64;
11596 break;
11597 case IX86_BUILTIN_XSAVES:
11598 icode = CODE_FOR_xsaves_rex64;
11599 break;
11600 case IX86_BUILTIN_XRSTORS:
11601 icode = CODE_FOR_xrstors_rex64;
11602 break;
11603 case IX86_BUILTIN_XSAVES64:
11604 icode = CODE_FOR_xsaves64;
11605 break;
11606 case IX86_BUILTIN_XRSTORS64:
11607 icode = CODE_FOR_xrstors64;
11608 break;
11609 case IX86_BUILTIN_XSAVEC:
11610 icode = CODE_FOR_xsavec_rex64;
11611 break;
11612 case IX86_BUILTIN_XSAVEC64:
11613 icode = CODE_FOR_xsavec64;
11614 break;
11615 default:
11616 gcc_unreachable ();
11617 }
11618
11619 op2 = gen_lowpart (SImode, op2);
11620 op1 = gen_lowpart (SImode, op1);
11621 pat = GEN_FCN (icode) (op0, op1, op2);
11622 }
11623 else
11624 {
11625 switch (fcode)
11626 {
11627 case IX86_BUILTIN_XSAVE:
11628 icode = CODE_FOR_xsave;
11629 break;
11630 case IX86_BUILTIN_XRSTOR:
11631 icode = CODE_FOR_xrstor;
11632 break;
11633 case IX86_BUILTIN_XSAVEOPT:
11634 icode = CODE_FOR_xsaveopt;
11635 break;
11636 case IX86_BUILTIN_XSAVES:
11637 icode = CODE_FOR_xsaves;
11638 break;
11639 case IX86_BUILTIN_XRSTORS:
11640 icode = CODE_FOR_xrstors;
11641 break;
11642 case IX86_BUILTIN_XSAVEC:
11643 icode = CODE_FOR_xsavec;
11644 break;
11645 default:
11646 gcc_unreachable ();
11647 }
11648 pat = GEN_FCN (icode) (op0, op1);
11649 }
11650
11651 if (pat)
11652 emit_insn (pat);
11653 return 0;
11654
11655 case IX86_BUILTIN_LLWPCB:
11656 arg0 = CALL_EXPR_ARG (exp, 0);
11657 op0 = expand_normal (arg0);
11658 icode = CODE_FOR_lwp_llwpcb;
11659 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11660 op0 = ix86_zero_extend_to_Pmode (op0);
11661 emit_insn (gen_lwp_llwpcb (op0));
11662 return 0;
11663
11664 case IX86_BUILTIN_SLWPCB:
11665 icode = CODE_FOR_lwp_slwpcb;
11666 if (!target
11667 || !insn_data[icode].operand[0].predicate (target, Pmode))
11668 target = gen_reg_rtx (Pmode);
11669 emit_insn (gen_lwp_slwpcb (target));
11670 return target;
11671
11672 case IX86_BUILTIN_BEXTRI32:
11673 case IX86_BUILTIN_BEXTRI64:
11674 arg0 = CALL_EXPR_ARG (exp, 0);
11675 arg1 = CALL_EXPR_ARG (exp, 1);
11676 op0 = expand_normal (arg0);
11677 op1 = expand_normal (arg1);
11678 icode = (fcode == IX86_BUILTIN_BEXTRI32
11679 ? CODE_FOR_tbm_bextri_si
11680 : CODE_FOR_tbm_bextri_di);
11681 if (!CONST_INT_P (op1))
11682 {
11683 error ("last argument must be an immediate");
11684 return const0_rtx;
11685 }
11686 else
11687 {
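	    /* The control immediate packs the starting bit position in
	       bits [7:0] and the field length in bits [15:8]; split it
	       into the two separate operands of the bextri pattern.  */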
11688 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11689 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11690 op1 = GEN_INT (length);
11691 op2 = GEN_INT (lsb_index);
11692
11693 mode1 = insn_data[icode].operand[1].mode;
11694 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11695 op0 = copy_to_mode_reg (mode1, op0);
11696
11697 mode0 = insn_data[icode].operand[0].mode;
11698 if (target == 0
11699 || !register_operand (target, mode0))
11700 target = gen_reg_rtx (mode0);
11701
11702 pat = GEN_FCN (icode) (target, op0, op1, op2);
11703 if (pat)
11704 emit_insn (pat);
11705 return target;
11706 }
11707
11708 case IX86_BUILTIN_RDRAND16_STEP:
11709 icode = CODE_FOR_rdrandhi_1;
11710 mode0 = HImode;
11711 goto rdrand_step;
11712
11713 case IX86_BUILTIN_RDRAND32_STEP:
11714 icode = CODE_FOR_rdrandsi_1;
11715 mode0 = SImode;
11716 goto rdrand_step;
11717
11718 case IX86_BUILTIN_RDRAND64_STEP:
11719 icode = CODE_FOR_rdranddi_1;
11720 mode0 = DImode;
11721
11722rdrand_step:
11723 arg0 = CALL_EXPR_ARG (exp, 0);
11724 op1 = expand_normal (arg0);
11725 if (!address_operand (op1, VOIDmode))
11726 {
11727 op1 = convert_memory_address (Pmode, op1);
11728 op1 = copy_addr_to_reg (op1);
11729 }
11730
11731 op0 = gen_reg_rtx (mode0);
11732 emit_insn (GEN_FCN (icode) (op0));
11733
11734 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11735
11736 op1 = gen_reg_rtx (SImode);
11737 emit_move_insn (op1, CONST1_RTX (SImode));
11738
11739 /* Emit SImode conditional move. */
11740 if (mode0 == HImode)
11741 {
11742 if (TARGET_ZERO_EXTEND_WITH_AND
11743 && optimize_function_for_speed_p (cfun))
11744 {
11745 op2 = force_reg (SImode, const0_rtx);
11746
11747 emit_insn (gen_movstricthi
11748 (gen_lowpart (HImode, op2), op0));
11749 }
11750 else
11751 {
11752 op2 = gen_reg_rtx (SImode);
11753
11754 emit_insn (gen_zero_extendhisi2 (op2, op0));
11755 }
11756 }
11757 else if (mode0 == SImode)
11758 op2 = op0;
11759 else
11760 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11761
11762 if (target == 0
11763 || !register_operand (target, SImode))
11764 target = gen_reg_rtx (SImode);
11765
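      /* rdrand zeroes its destination when no random number was
	 available (CF clear), so selecting 1 when CF is set and the
	 stored value otherwise yields the success flag.  */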
11766 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11767 const0_rtx);
11768 emit_insn (gen_rtx_SET (target,
11769 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11770 return target;
11771
11772 case IX86_BUILTIN_RDSEED16_STEP:
11773 icode = CODE_FOR_rdseedhi_1;
11774 mode0 = HImode;
11775 goto rdseed_step;
11776
11777 case IX86_BUILTIN_RDSEED32_STEP:
11778 icode = CODE_FOR_rdseedsi_1;
11779 mode0 = SImode;
11780 goto rdseed_step;
11781
11782 case IX86_BUILTIN_RDSEED64_STEP:
11783 icode = CODE_FOR_rdseeddi_1;
11784 mode0 = DImode;
11785
11786rdseed_step:
11787 arg0 = CALL_EXPR_ARG (exp, 0);
11788 op1 = expand_normal (arg0);
11789 if (!address_operand (op1, VOIDmode))
11790 {
11791 op1 = convert_memory_address (Pmode, op1);
11792 op1 = copy_addr_to_reg (op1);
11793 }
11794
11795 op0 = gen_reg_rtx (mode0);
11796 emit_insn (GEN_FCN (icode) (op0));
11797
11798 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11799
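      /* rdseed reports success in the carry flag; read it with an LTU
	 test on the CCCmode flags and zero-extend it into the SImode
	 result.  */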
11800 op2 = gen_reg_rtx (QImode);
11801
11802 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11803 const0_rtx);
11804 emit_insn (gen_rtx_SET (op2, pat));
11805
11806 if (target == 0
11807 || !register_operand (target, SImode))
11808 target = gen_reg_rtx (SImode);
11809
11810 emit_insn (gen_zero_extendqisi2 (target, op2));
11811 return target;
11812
11813 case IX86_BUILTIN_SBB32:
11814 icode = CODE_FOR_subborrowsi;
11815 icode2 = CODE_FOR_subborrowsi_0;
11816 mode0 = SImode;
11817 mode1 = DImode;
11818 mode2 = CCmode;
11819 goto handlecarry;
11820
11821 case IX86_BUILTIN_SBB64:
11822 icode = CODE_FOR_subborrowdi;
11823 icode2 = CODE_FOR_subborrowdi_0;
11824 mode0 = DImode;
11825 mode1 = TImode;
11826 mode2 = CCmode;
11827 goto handlecarry;
11828
11829 case IX86_BUILTIN_ADDCARRYX32:
11830 icode = CODE_FOR_addcarrysi;
11831 icode2 = CODE_FOR_addcarrysi_0;
11832 mode0 = SImode;
11833 mode1 = DImode;
11834 mode2 = CCCmode;
11835 goto handlecarry;
11836
11837 case IX86_BUILTIN_ADDCARRYX64:
11838 icode = CODE_FOR_addcarrydi;
11839 icode2 = CODE_FOR_addcarrydi_0;
11840 mode0 = DImode;
11841 mode1 = TImode;
11842 mode2 = CCCmode;
11843
11844 handlecarry:
11845 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11846 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11847 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11848 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
11849
11850 op1 = expand_normal (arg0);
11851 if (!integer_zerop (arg0))
11852 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11853
11854 op2 = expand_normal (arg1);
11855 if (!register_operand (op2, mode0))
11856 op2 = copy_to_mode_reg (mode0, op2);
11857
11858 op3 = expand_normal (arg2);
11859 if (!register_operand (op3, mode0))
11860 op3 = copy_to_mode_reg (mode0, op3);
11861
11862 op4 = expand_normal (arg3);
11863 if (!address_operand (op4, VOIDmode))
11864 {
11865 op4 = convert_memory_address (Pmode, op4);
11866 op4 = copy_addr_to_reg (op4);
11867 }
11868
11869 op0 = gen_reg_rtx (mode0);
11870 if (integer_zerop (arg0))
11871 {
 11872	      /* If arg0 is 0, optimize right away into an add or sub
 11873		 instruction that sets the CCCmode flags.  */
11874 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11875 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11876 }
11877 else
11878 {
11879 /* Generate CF from input operand. */
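	    /* Adding 0xff to the nonzero carry-in byte wraps around in
	       QImode and therefore sets CF again.  */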
11880 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11881
11882 /* Generate instruction that consumes CF. */
11883 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11884 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11885 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11886 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11887 }
11888
11889 /* Return current CF value. */
11890 if (target == 0)
11891 target = gen_reg_rtx (QImode);
11892
11893 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11894 emit_insn (gen_rtx_SET (target, pat));
11895
11896 /* Store the result. */
11897 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11898
11899 return target;
11900
11901 case IX86_BUILTIN_READ_FLAGS:
11902 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11903
11904 if (optimize
11905 || target == NULL_RTX
11906 || !nonimmediate_operand (target, word_mode)
11907 || GET_MODE (target) != word_mode)
11908 target = gen_reg_rtx (word_mode);
11909
11910 emit_insn (gen_pop (target));
11911 return target;
11912
11913 case IX86_BUILTIN_WRITE_FLAGS:
11914
11915 arg0 = CALL_EXPR_ARG (exp, 0);
11916 op0 = expand_normal (arg0);
11917 if (!general_no_elim_operand (op0, word_mode))
11918 op0 = copy_to_mode_reg (word_mode, op0);
11919
11920 emit_insn (gen_push (op0));
11921 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11922 return 0;
11923
11924 case IX86_BUILTIN_KTESTC8:
11925 icode = CODE_FOR_ktestqi;
11926 mode3 = CCCmode;
11927 goto kortest;
11928
11929 case IX86_BUILTIN_KTESTZ8:
11930 icode = CODE_FOR_ktestqi;
11931 mode3 = CCZmode;
11932 goto kortest;
11933
11934 case IX86_BUILTIN_KTESTC16:
11935 icode = CODE_FOR_ktesthi;
11936 mode3 = CCCmode;
11937 goto kortest;
11938
11939 case IX86_BUILTIN_KTESTZ16:
11940 icode = CODE_FOR_ktesthi;
11941 mode3 = CCZmode;
11942 goto kortest;
11943
11944 case IX86_BUILTIN_KTESTC32:
11945 icode = CODE_FOR_ktestsi;
11946 mode3 = CCCmode;
11947 goto kortest;
11948
11949 case IX86_BUILTIN_KTESTZ32:
11950 icode = CODE_FOR_ktestsi;
11951 mode3 = CCZmode;
11952 goto kortest;
11953
11954 case IX86_BUILTIN_KTESTC64:
11955 icode = CODE_FOR_ktestdi;
11956 mode3 = CCCmode;
11957 goto kortest;
11958
11959 case IX86_BUILTIN_KTESTZ64:
11960 icode = CODE_FOR_ktestdi;
11961 mode3 = CCZmode;
11962 goto kortest;
11963
11964 case IX86_BUILTIN_KORTESTC8:
11965 icode = CODE_FOR_kortestqi;
11966 mode3 = CCCmode;
11967 goto kortest;
11968
11969 case IX86_BUILTIN_KORTESTZ8:
11970 icode = CODE_FOR_kortestqi;
11971 mode3 = CCZmode;
11972 goto kortest;
11973
11974 case IX86_BUILTIN_KORTESTC16:
11975 icode = CODE_FOR_kortesthi;
11976 mode3 = CCCmode;
11977 goto kortest;
11978
11979 case IX86_BUILTIN_KORTESTZ16:
11980 icode = CODE_FOR_kortesthi;
11981 mode3 = CCZmode;
11982 goto kortest;
11983
11984 case IX86_BUILTIN_KORTESTC32:
11985 icode = CODE_FOR_kortestsi;
11986 mode3 = CCCmode;
11987 goto kortest;
11988
11989 case IX86_BUILTIN_KORTESTZ32:
11990 icode = CODE_FOR_kortestsi;
11991 mode3 = CCZmode;
11992 goto kortest;
11993
11994 case IX86_BUILTIN_KORTESTC64:
11995 icode = CODE_FOR_kortestdi;
11996 mode3 = CCCmode;
11997 goto kortest;
11998
11999 case IX86_BUILTIN_KORTESTZ64:
12000 icode = CODE_FOR_kortestdi;
12001 mode3 = CCZmode;
12002
12003 kortest:
12004 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12005 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12006 op0 = expand_normal (arg0);
12007 op1 = expand_normal (arg1);
12008
12009 mode0 = insn_data[icode].operand[0].mode;
12010 mode1 = insn_data[icode].operand[1].mode;
12011
12012 if (GET_MODE (op0) != VOIDmode)
12013 op0 = force_reg (GET_MODE (op0), op0);
12014
12015 op0 = gen_lowpart (mode0, op0);
12016
12017 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12018 op0 = copy_to_mode_reg (mode0, op0);
12019
12020 if (GET_MODE (op1) != VOIDmode)
12021 op1 = force_reg (GET_MODE (op1), op1);
12022
12023 op1 = gen_lowpart (mode1, op1);
12024
12025 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12026 op1 = copy_to_mode_reg (mode1, op1);
12027
12028 target = gen_reg_rtx (QImode);
12029
12030 /* Emit kortest. */
12031 emit_insn (GEN_FCN (icode) (op0, op1));
12032 /* And use setcc to return result from flags. */
12033 ix86_expand_setcc (target, EQ,
12034 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12035 return target;
12036
12037 case IX86_BUILTIN_GATHERSIV2DF:
12038 icode = CODE_FOR_avx2_gathersiv2df;
12039 goto gather_gen;
12040 case IX86_BUILTIN_GATHERSIV4DF:
12041 icode = CODE_FOR_avx2_gathersiv4df;
12042 goto gather_gen;
12043 case IX86_BUILTIN_GATHERDIV2DF:
12044 icode = CODE_FOR_avx2_gatherdiv2df;
12045 goto gather_gen;
12046 case IX86_BUILTIN_GATHERDIV4DF:
12047 icode = CODE_FOR_avx2_gatherdiv4df;
12048 goto gather_gen;
12049 case IX86_BUILTIN_GATHERSIV4SF:
12050 icode = CODE_FOR_avx2_gathersiv4sf;
12051 goto gather_gen;
12052 case IX86_BUILTIN_GATHERSIV8SF:
12053 icode = CODE_FOR_avx2_gathersiv8sf;
12054 goto gather_gen;
12055 case IX86_BUILTIN_GATHERDIV4SF:
12056 icode = CODE_FOR_avx2_gatherdiv4sf;
12057 goto gather_gen;
12058 case IX86_BUILTIN_GATHERDIV8SF:
12059 icode = CODE_FOR_avx2_gatherdiv8sf;
12060 goto gather_gen;
12061 case IX86_BUILTIN_GATHERSIV2DI:
12062 icode = CODE_FOR_avx2_gathersiv2di;
12063 goto gather_gen;
12064 case IX86_BUILTIN_GATHERSIV4DI:
12065 icode = CODE_FOR_avx2_gathersiv4di;
12066 goto gather_gen;
12067 case IX86_BUILTIN_GATHERDIV2DI:
12068 icode = CODE_FOR_avx2_gatherdiv2di;
12069 goto gather_gen;
12070 case IX86_BUILTIN_GATHERDIV4DI:
12071 icode = CODE_FOR_avx2_gatherdiv4di;
12072 goto gather_gen;
12073 case IX86_BUILTIN_GATHERSIV4SI:
12074 icode = CODE_FOR_avx2_gathersiv4si;
12075 goto gather_gen;
12076 case IX86_BUILTIN_GATHERSIV8SI:
12077 icode = CODE_FOR_avx2_gathersiv8si;
12078 goto gather_gen;
12079 case IX86_BUILTIN_GATHERDIV4SI:
12080 icode = CODE_FOR_avx2_gatherdiv4si;
12081 goto gather_gen;
12082 case IX86_BUILTIN_GATHERDIV8SI:
12083 icode = CODE_FOR_avx2_gatherdiv8si;
12084 goto gather_gen;
12085 case IX86_BUILTIN_GATHERALTSIV4DF:
12086 icode = CODE_FOR_avx2_gathersiv4df;
12087 goto gather_gen;
12088 case IX86_BUILTIN_GATHERALTDIV8SF:
12089 icode = CODE_FOR_avx2_gatherdiv8sf;
12090 goto gather_gen;
12091 case IX86_BUILTIN_GATHERALTSIV4DI:
12092 icode = CODE_FOR_avx2_gathersiv4di;
12093 goto gather_gen;
12094 case IX86_BUILTIN_GATHERALTDIV8SI:
12095 icode = CODE_FOR_avx2_gatherdiv8si;
12096 goto gather_gen;
12097 case IX86_BUILTIN_GATHER3SIV16SF:
12098 icode = CODE_FOR_avx512f_gathersiv16sf;
12099 goto gather_gen;
12100 case IX86_BUILTIN_GATHER3SIV8DF:
12101 icode = CODE_FOR_avx512f_gathersiv8df;
12102 goto gather_gen;
12103 case IX86_BUILTIN_GATHER3DIV16SF:
12104 icode = CODE_FOR_avx512f_gatherdiv16sf;
12105 goto gather_gen;
12106 case IX86_BUILTIN_GATHER3DIV8DF:
12107 icode = CODE_FOR_avx512f_gatherdiv8df;
12108 goto gather_gen;
12109 case IX86_BUILTIN_GATHER3SIV16SI:
12110 icode = CODE_FOR_avx512f_gathersiv16si;
12111 goto gather_gen;
12112 case IX86_BUILTIN_GATHER3SIV8DI:
12113 icode = CODE_FOR_avx512f_gathersiv8di;
12114 goto gather_gen;
12115 case IX86_BUILTIN_GATHER3DIV16SI:
12116 icode = CODE_FOR_avx512f_gatherdiv16si;
12117 goto gather_gen;
12118 case IX86_BUILTIN_GATHER3DIV8DI:
12119 icode = CODE_FOR_avx512f_gatherdiv8di;
12120 goto gather_gen;
12121 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12122 icode = CODE_FOR_avx512f_gathersiv8df;
12123 goto gather_gen;
12124 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12125 icode = CODE_FOR_avx512f_gatherdiv16sf;
12126 goto gather_gen;
12127 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12128 icode = CODE_FOR_avx512f_gathersiv8di;
12129 goto gather_gen;
12130 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12131 icode = CODE_FOR_avx512f_gatherdiv16si;
12132 goto gather_gen;
12133 case IX86_BUILTIN_GATHER3SIV2DF:
12134 icode = CODE_FOR_avx512vl_gathersiv2df;
12135 goto gather_gen;
12136 case IX86_BUILTIN_GATHER3SIV4DF:
12137 icode = CODE_FOR_avx512vl_gathersiv4df;
12138 goto gather_gen;
12139 case IX86_BUILTIN_GATHER3DIV2DF:
12140 icode = CODE_FOR_avx512vl_gatherdiv2df;
12141 goto gather_gen;
12142 case IX86_BUILTIN_GATHER3DIV4DF:
12143 icode = CODE_FOR_avx512vl_gatherdiv4df;
12144 goto gather_gen;
12145 case IX86_BUILTIN_GATHER3SIV4SF:
12146 icode = CODE_FOR_avx512vl_gathersiv4sf;
12147 goto gather_gen;
12148 case IX86_BUILTIN_GATHER3SIV8SF:
12149 icode = CODE_FOR_avx512vl_gathersiv8sf;
12150 goto gather_gen;
12151 case IX86_BUILTIN_GATHER3DIV4SF:
12152 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12153 goto gather_gen;
12154 case IX86_BUILTIN_GATHER3DIV8SF:
12155 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12156 goto gather_gen;
12157 case IX86_BUILTIN_GATHER3SIV2DI:
12158 icode = CODE_FOR_avx512vl_gathersiv2di;
12159 goto gather_gen;
12160 case IX86_BUILTIN_GATHER3SIV4DI:
12161 icode = CODE_FOR_avx512vl_gathersiv4di;
12162 goto gather_gen;
12163 case IX86_BUILTIN_GATHER3DIV2DI:
12164 icode = CODE_FOR_avx512vl_gatherdiv2di;
12165 goto gather_gen;
12166 case IX86_BUILTIN_GATHER3DIV4DI:
12167 icode = CODE_FOR_avx512vl_gatherdiv4di;
12168 goto gather_gen;
12169 case IX86_BUILTIN_GATHER3SIV4SI:
12170 icode = CODE_FOR_avx512vl_gathersiv4si;
12171 goto gather_gen;
12172 case IX86_BUILTIN_GATHER3SIV8SI:
12173 icode = CODE_FOR_avx512vl_gathersiv8si;
12174 goto gather_gen;
12175 case IX86_BUILTIN_GATHER3DIV4SI:
12176 icode = CODE_FOR_avx512vl_gatherdiv4si;
12177 goto gather_gen;
12178 case IX86_BUILTIN_GATHER3DIV8SI:
12179 icode = CODE_FOR_avx512vl_gatherdiv8si;
12180 goto gather_gen;
12181 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12182 icode = CODE_FOR_avx512vl_gathersiv4df;
12183 goto gather_gen;
12184 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12185 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12186 goto gather_gen;
12187 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12188 icode = CODE_FOR_avx512vl_gathersiv4di;
12189 goto gather_gen;
12190 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12191 icode = CODE_FOR_avx512vl_gatherdiv8si;
12192 goto gather_gen;
12193 case IX86_BUILTIN_SCATTERSIV16SF:
12194 icode = CODE_FOR_avx512f_scattersiv16sf;
12195 goto scatter_gen;
12196 case IX86_BUILTIN_SCATTERSIV8DF:
12197 icode = CODE_FOR_avx512f_scattersiv8df;
12198 goto scatter_gen;
12199 case IX86_BUILTIN_SCATTERDIV16SF:
12200 icode = CODE_FOR_avx512f_scatterdiv16sf;
12201 goto scatter_gen;
12202 case IX86_BUILTIN_SCATTERDIV8DF:
12203 icode = CODE_FOR_avx512f_scatterdiv8df;
12204 goto scatter_gen;
12205 case IX86_BUILTIN_SCATTERSIV16SI:
12206 icode = CODE_FOR_avx512f_scattersiv16si;
12207 goto scatter_gen;
12208 case IX86_BUILTIN_SCATTERSIV8DI:
12209 icode = CODE_FOR_avx512f_scattersiv8di;
12210 goto scatter_gen;
12211 case IX86_BUILTIN_SCATTERDIV16SI:
12212 icode = CODE_FOR_avx512f_scatterdiv16si;
12213 goto scatter_gen;
12214 case IX86_BUILTIN_SCATTERDIV8DI:
12215 icode = CODE_FOR_avx512f_scatterdiv8di;
12216 goto scatter_gen;
12217 case IX86_BUILTIN_SCATTERSIV8SF:
12218 icode = CODE_FOR_avx512vl_scattersiv8sf;
12219 goto scatter_gen;
12220 case IX86_BUILTIN_SCATTERSIV4SF:
12221 icode = CODE_FOR_avx512vl_scattersiv4sf;
12222 goto scatter_gen;
12223 case IX86_BUILTIN_SCATTERSIV4DF:
12224 icode = CODE_FOR_avx512vl_scattersiv4df;
12225 goto scatter_gen;
12226 case IX86_BUILTIN_SCATTERSIV2DF:
12227 icode = CODE_FOR_avx512vl_scattersiv2df;
12228 goto scatter_gen;
12229 case IX86_BUILTIN_SCATTERDIV8SF:
12230 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12231 goto scatter_gen;
12232 case IX86_BUILTIN_SCATTERDIV4SF:
12233 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12234 goto scatter_gen;
12235 case IX86_BUILTIN_SCATTERDIV4DF:
12236 icode = CODE_FOR_avx512vl_scatterdiv4df;
12237 goto scatter_gen;
12238 case IX86_BUILTIN_SCATTERDIV2DF:
12239 icode = CODE_FOR_avx512vl_scatterdiv2df;
12240 goto scatter_gen;
12241 case IX86_BUILTIN_SCATTERSIV8SI:
12242 icode = CODE_FOR_avx512vl_scattersiv8si;
12243 goto scatter_gen;
12244 case IX86_BUILTIN_SCATTERSIV4SI:
12245 icode = CODE_FOR_avx512vl_scattersiv4si;
12246 goto scatter_gen;
12247 case IX86_BUILTIN_SCATTERSIV4DI:
12248 icode = CODE_FOR_avx512vl_scattersiv4di;
12249 goto scatter_gen;
12250 case IX86_BUILTIN_SCATTERSIV2DI:
12251 icode = CODE_FOR_avx512vl_scattersiv2di;
12252 goto scatter_gen;
12253 case IX86_BUILTIN_SCATTERDIV8SI:
12254 icode = CODE_FOR_avx512vl_scatterdiv8si;
12255 goto scatter_gen;
12256 case IX86_BUILTIN_SCATTERDIV4SI:
12257 icode = CODE_FOR_avx512vl_scatterdiv4si;
12258 goto scatter_gen;
12259 case IX86_BUILTIN_SCATTERDIV4DI:
12260 icode = CODE_FOR_avx512vl_scatterdiv4di;
12261 goto scatter_gen;
12262 case IX86_BUILTIN_SCATTERDIV2DI:
12263 icode = CODE_FOR_avx512vl_scatterdiv2di;
12264 goto scatter_gen;
12265 case IX86_BUILTIN_GATHERPFDPD:
12266 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12267 goto vec_prefetch_gen;
12268 case IX86_BUILTIN_SCATTERALTSIV8DF:
12269 icode = CODE_FOR_avx512f_scattersiv8df;
12270 goto scatter_gen;
12271 case IX86_BUILTIN_SCATTERALTDIV16SF:
12272 icode = CODE_FOR_avx512f_scatterdiv16sf;
12273 goto scatter_gen;
12274 case IX86_BUILTIN_SCATTERALTSIV8DI:
12275 icode = CODE_FOR_avx512f_scattersiv8di;
12276 goto scatter_gen;
12277 case IX86_BUILTIN_SCATTERALTDIV16SI:
12278 icode = CODE_FOR_avx512f_scatterdiv16si;
12279 goto scatter_gen;
12280 case IX86_BUILTIN_SCATTERALTSIV4DF:
12281 icode = CODE_FOR_avx512vl_scattersiv4df;
12282 goto scatter_gen;
12283 case IX86_BUILTIN_SCATTERALTDIV8SF:
12284 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12285 goto scatter_gen;
12286 case IX86_BUILTIN_SCATTERALTSIV4DI:
12287 icode = CODE_FOR_avx512vl_scattersiv4di;
12288 goto scatter_gen;
12289 case IX86_BUILTIN_SCATTERALTDIV8SI:
12290 icode = CODE_FOR_avx512vl_scatterdiv8si;
12291 goto scatter_gen;
12292 case IX86_BUILTIN_SCATTERALTSIV2DF:
12293 icode = CODE_FOR_avx512vl_scattersiv2df;
12294 goto scatter_gen;
12295 case IX86_BUILTIN_SCATTERALTDIV4SF:
12296 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12297 goto scatter_gen;
12298 case IX86_BUILTIN_SCATTERALTSIV2DI:
12299 icode = CODE_FOR_avx512vl_scattersiv2di;
12300 goto scatter_gen;
12301 case IX86_BUILTIN_SCATTERALTDIV4SI:
12302 icode = CODE_FOR_avx512vl_scatterdiv4si;
12303 goto scatter_gen;
12304 case IX86_BUILTIN_GATHERPFDPS:
12305 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12306 goto vec_prefetch_gen;
12307 case IX86_BUILTIN_GATHERPFQPD:
12308 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12309 goto vec_prefetch_gen;
12310 case IX86_BUILTIN_GATHERPFQPS:
12311 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12312 goto vec_prefetch_gen;
12313 case IX86_BUILTIN_SCATTERPFDPD:
12314 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12315 goto vec_prefetch_gen;
12316 case IX86_BUILTIN_SCATTERPFDPS:
12317 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12318 goto vec_prefetch_gen;
12319 case IX86_BUILTIN_SCATTERPFQPD:
12320 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12321 goto vec_prefetch_gen;
12322 case IX86_BUILTIN_SCATTERPFQPS:
12323 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12324 goto vec_prefetch_gen;
12325
12326 gather_gen:
12327 rtx half;
12328 rtx (*gen) (rtx, rtx);
12329
12330 arg0 = CALL_EXPR_ARG (exp, 0);
12331 arg1 = CALL_EXPR_ARG (exp, 1);
12332 arg2 = CALL_EXPR_ARG (exp, 2);
12333 arg3 = CALL_EXPR_ARG (exp, 3);
12334 arg4 = CALL_EXPR_ARG (exp, 4);
12335 op0 = expand_normal (arg0);
12336 op1 = expand_normal (arg1);
12337 op2 = expand_normal (arg2);
12338 op3 = expand_normal (arg3);
12339 op4 = expand_normal (arg4);
12340 /* Note the arg order is different from the operand order. */
12341 mode0 = insn_data[icode].operand[1].mode;
12342 mode2 = insn_data[icode].operand[3].mode;
12343 mode3 = insn_data[icode].operand[4].mode;
12344 mode4 = insn_data[icode].operand[5].mode;
12345
12346 if (target == NULL_RTX
12347 || GET_MODE (target) != insn_data[icode].operand[0].mode
12348 || !insn_data[icode].operand[0].predicate (target,
12349 GET_MODE (target)))
12350 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12351 else
12352 subtarget = target;
12353
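      /* The *ALT* gather variants pair a wider vector operand with a
	 narrower one; only the low half of the wider operand is used,
	 so extract it first.  */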
12354 switch (fcode)
12355 {
12356 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12357 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12358 half = gen_reg_rtx (V8SImode);
12359 if (!nonimmediate_operand (op2, V16SImode))
12360 op2 = copy_to_mode_reg (V16SImode, op2);
12361 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12362 op2 = half;
12363 break;
12364 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12365 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12366 case IX86_BUILTIN_GATHERALTSIV4DF:
12367 case IX86_BUILTIN_GATHERALTSIV4DI:
12368 half = gen_reg_rtx (V4SImode);
12369 if (!nonimmediate_operand (op2, V8SImode))
12370 op2 = copy_to_mode_reg (V8SImode, op2);
12371 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12372 op2 = half;
12373 break;
12374 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12375 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12376 half = gen_reg_rtx (mode0);
12377 if (mode0 == V8SFmode)
12378 gen = gen_vec_extract_lo_v16sf;
12379 else
12380 gen = gen_vec_extract_lo_v16si;
12381 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12382 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12383 emit_insn (gen (half, op0));
12384 op0 = half;
12385 op3 = lowpart_subreg (QImode, op3, HImode);
12386 break;
12387 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12388 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12389 case IX86_BUILTIN_GATHERALTDIV8SF:
12390 case IX86_BUILTIN_GATHERALTDIV8SI:
12391 half = gen_reg_rtx (mode0);
12392 if (mode0 == V4SFmode)
12393 gen = gen_vec_extract_lo_v8sf;
12394 else
12395 gen = gen_vec_extract_lo_v8si;
12396 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12397 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12398 emit_insn (gen (half, op0));
12399 op0 = half;
12400 if (VECTOR_MODE_P (GET_MODE (op3)))
12401 {
12402 half = gen_reg_rtx (mode0);
12403 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12404 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12405 emit_insn (gen (half, op3));
12406 op3 = half;
12407 }
12408 break;
12409 default:
12410 break;
12411 }
12412
 12413	  /* Force the address into a register so the memory operand uses
 12414	     only a base register.  We don't want to do this for the memory
 12415	     operands of other builtin functions.  */
12416 op1 = ix86_zero_extend_to_Pmode (op1);
12417
12418 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12419 op0 = copy_to_mode_reg (mode0, op0);
12420 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12421 op1 = copy_to_mode_reg (Pmode, op1);
12422 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12423 op2 = copy_to_mode_reg (mode2, op2);
12424
12425 op3 = fixup_modeless_constant (op3, mode3);
12426
12427 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12428 {
12429 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12430 op3 = copy_to_mode_reg (mode3, op3);
12431 }
12432 else
12433 {
12434 op3 = copy_to_reg (op3);
12435 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12436 }
12437 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12438 {
12439 error ("the last argument must be scale 1, 2, 4, 8");
12440 return const0_rtx;
12441 }
12442
12443 /* Optimize. If mask is known to have all high bits set,
12444 replace op0 with pc_rtx to signal that the instruction
12445 overwrites the whole destination and doesn't use its
12446 previous contents. */
12447 if (optimize)
12448 {
12449 if (TREE_CODE (arg3) == INTEGER_CST)
12450 {
12451 if (integer_all_onesp (arg3))
12452 op0 = pc_rtx;
12453 }
12454 else if (TREE_CODE (arg3) == VECTOR_CST)
12455 {
12456 unsigned int negative = 0;
12457 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12458 {
12459 tree cst = VECTOR_CST_ELT (arg3, i);
12460 if (TREE_CODE (cst) == INTEGER_CST
12461 && tree_int_cst_sign_bit (cst))
12462 negative++;
12463 else if (TREE_CODE (cst) == REAL_CST
12464 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12465 negative++;
12466 }
12467 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12468 op0 = pc_rtx;
12469 }
12470 else if (TREE_CODE (arg3) == SSA_NAME
12471 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12472 {
 12473	      /* Also recognize when the mask is like:
12474 __v2df src = _mm_setzero_pd ();
12475 __v2df mask = _mm_cmpeq_pd (src, src);
12476 or
12477 __v8sf src = _mm256_setzero_ps ();
12478 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12479 as that is a cheaper way to load all ones into
12480 a register than having to load a constant from
12481 memory. */
12482 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12483 if (is_gimple_call (def_stmt))
12484 {
12485 tree fndecl = gimple_call_fndecl (def_stmt);
12486 if (fndecl
12487 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12488		switch (DECL_MD_FUNCTION_CODE (fndecl))
12489 {
12490 case IX86_BUILTIN_CMPPD:
12491 case IX86_BUILTIN_CMPPS:
12492 case IX86_BUILTIN_CMPPD256:
12493 case IX86_BUILTIN_CMPPS256:
12494 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12495 break;
12496 /* FALLTHRU */
12497 case IX86_BUILTIN_CMPEQPD:
12498 case IX86_BUILTIN_CMPEQPS:
12499 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12500 && initializer_zerop (gimple_call_arg (def_stmt,
12501 1)))
12502 op0 = pc_rtx;
12503 break;
12504 default:
12505 break;
12506 }
12507 }
12508 }
12509 }
12510
12511 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12512 if (! pat)
12513 return const0_rtx;
12514 emit_insn (pat);
12515
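      /* For gathers whose result type is narrower than the
	 instruction's destination mode, return only the low half.  */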
12516 switch (fcode)
12517 {
12518 case IX86_BUILTIN_GATHER3DIV16SF:
12519 if (target == NULL_RTX)
12520 target = gen_reg_rtx (V8SFmode);
12521 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12522 break;
12523 case IX86_BUILTIN_GATHER3DIV16SI:
12524 if (target == NULL_RTX)
12525 target = gen_reg_rtx (V8SImode);
12526 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12527 break;
12528 case IX86_BUILTIN_GATHER3DIV8SF:
12529 case IX86_BUILTIN_GATHERDIV8SF:
12530 if (target == NULL_RTX)
12531 target = gen_reg_rtx (V4SFmode);
12532 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12533 break;
12534 case IX86_BUILTIN_GATHER3DIV8SI:
12535 case IX86_BUILTIN_GATHERDIV8SI:
12536 if (target == NULL_RTX)
12537 target = gen_reg_rtx (V4SImode);
12538 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12539 break;
12540 default:
12541 target = subtarget;
12542 break;
12543 }
12544 return target;
12545
12546 scatter_gen:
12547 arg0 = CALL_EXPR_ARG (exp, 0);
12548 arg1 = CALL_EXPR_ARG (exp, 1);
12549 arg2 = CALL_EXPR_ARG (exp, 2);
12550 arg3 = CALL_EXPR_ARG (exp, 3);
12551 arg4 = CALL_EXPR_ARG (exp, 4);
12552 op0 = expand_normal (arg0);
12553 op1 = expand_normal (arg1);
12554 op2 = expand_normal (arg2);
12555 op3 = expand_normal (arg3);
12556 op4 = expand_normal (arg4);
12557 mode1 = insn_data[icode].operand[1].mode;
12558 mode2 = insn_data[icode].operand[2].mode;
12559 mode3 = insn_data[icode].operand[3].mode;
12560 mode4 = insn_data[icode].operand[4].mode;
12561
 12562	     /* The scatter instruction stores operand op3 to memory with
 12563		indices from op2 and scale from op4 under writemask op1.
 12564		If index operand op2 has more elements than source operand
 12565		op3, only its low half is used, and vice versa.  */
12566 switch (fcode)
12567 {
12568 case IX86_BUILTIN_SCATTERALTSIV8DF:
12569 case IX86_BUILTIN_SCATTERALTSIV8DI:
12570 half = gen_reg_rtx (V8SImode);
12571 if (!nonimmediate_operand (op2, V16SImode))
12572 op2 = copy_to_mode_reg (V16SImode, op2);
12573 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12574 op2 = half;
12575 break;
12576 case IX86_BUILTIN_SCATTERALTDIV16SF:
12577 case IX86_BUILTIN_SCATTERALTDIV16SI:
12578 half = gen_reg_rtx (mode3);
12579 if (mode3 == V8SFmode)
12580 gen = gen_vec_extract_lo_v16sf;
12581 else
12582 gen = gen_vec_extract_lo_v16si;
12583 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12584 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12585 emit_insn (gen (half, op3));
12586 op3 = half;
12587 break;
12588 case IX86_BUILTIN_SCATTERALTSIV4DF:
12589 case IX86_BUILTIN_SCATTERALTSIV4DI:
12590 half = gen_reg_rtx (V4SImode);
12591 if (!nonimmediate_operand (op2, V8SImode))
12592 op2 = copy_to_mode_reg (V8SImode, op2);
12593 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12594 op2 = half;
12595 break;
12596 case IX86_BUILTIN_SCATTERALTDIV8SF:
12597 case IX86_BUILTIN_SCATTERALTDIV8SI:
12598 half = gen_reg_rtx (mode3);
12599 if (mode3 == V4SFmode)
12600 gen = gen_vec_extract_lo_v8sf;
12601 else
12602 gen = gen_vec_extract_lo_v8si;
12603 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12604 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12605 emit_insn (gen (half, op3));
12606 op3 = half;
12607 break;
12608 case IX86_BUILTIN_SCATTERALTSIV2DF:
12609 case IX86_BUILTIN_SCATTERALTSIV2DI:
12610 if (!nonimmediate_operand (op2, V4SImode))
12611 op2 = copy_to_mode_reg (V4SImode, op2);
12612 break;
12613 case IX86_BUILTIN_SCATTERALTDIV4SF:
12614 case IX86_BUILTIN_SCATTERALTDIV4SI:
12615 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12616 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12617 break;
12618 default:
12619 break;
12620 }
12621
 12622	  /* Force the address into a register so the memory operand uses
 12623	     only a base register.  We don't want to do this for the memory
 12624	     operands of other builtin functions.  */
12625 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12626
12627 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12628 op0 = copy_to_mode_reg (Pmode, op0);
12629
12630 op1 = fixup_modeless_constant (op1, mode1);
12631
12632 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12633 {
12634 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12635 op1 = copy_to_mode_reg (mode1, op1);
12636 }
12637 else
12638 {
12639 op1 = copy_to_reg (op1);
12640 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12641 }
12642
12643 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12644 op2 = copy_to_mode_reg (mode2, op2);
12645
12646 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12647 op3 = copy_to_mode_reg (mode3, op3);
12648
12649 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12650 {
12651 error ("the last argument must be scale 1, 2, 4, 8");
12652 return const0_rtx;
12653 }
12654
12655 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12656 if (! pat)
12657 return const0_rtx;
12658
12659 emit_insn (pat);
12660 return 0;
12661
12662 vec_prefetch_gen:
12663 arg0 = CALL_EXPR_ARG (exp, 0);
12664 arg1 = CALL_EXPR_ARG (exp, 1);
12665 arg2 = CALL_EXPR_ARG (exp, 2);
12666 arg3 = CALL_EXPR_ARG (exp, 3);
12667 arg4 = CALL_EXPR_ARG (exp, 4);
12668 op0 = expand_normal (arg0);
12669 op1 = expand_normal (arg1);
12670 op2 = expand_normal (arg2);
12671 op3 = expand_normal (arg3);
12672 op4 = expand_normal (arg4);
12673 mode0 = insn_data[icode].operand[0].mode;
12674 mode1 = insn_data[icode].operand[1].mode;
12675 mode3 = insn_data[icode].operand[3].mode;
12676 mode4 = insn_data[icode].operand[4].mode;
12677
12678 op0 = fixup_modeless_constant (op0, mode0);
12679
12680 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12681 {
12682 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12683 op0 = copy_to_mode_reg (mode0, op0);
12684 }
12685 else
12686 {
12687 op0 = copy_to_reg (op0);
12688 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12689 }
12690
12691 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12692 op1 = copy_to_mode_reg (mode1, op1);
12693
 12694	  /* Force the address into a register so the memory operand uses
 12695	     only a base register.  We don't want to do this for the memory
 12696	     operands of other builtin functions.  */
12697 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12698
12699 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12700 op2 = copy_to_mode_reg (Pmode, op2);
12701
12702 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12703 {
 12704	      error ("the fourth argument must be scale 1, 2, 4, 8");
12705 return const0_rtx;
12706 }
12707
12708 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12709 {
12710 error ("incorrect hint operand");
12711 return const0_rtx;
12712 }
12713
12714 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12715 if (! pat)
12716 return const0_rtx;
12717
12718 emit_insn (pat);
12719
12720 return 0;
12721
12722 case IX86_BUILTIN_XABORT:
12723 icode = CODE_FOR_xabort;
12724 arg0 = CALL_EXPR_ARG (exp, 0);
12725 op0 = expand_normal (arg0);
12726 mode0 = insn_data[icode].operand[0].mode;
12727 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12728 {
 12729		  error ("the argument to the %<xabort%> intrinsic must "
 12730			 "be an 8-bit immediate");
12731 return const0_rtx;
12732 }
12733 emit_insn (gen_xabort (op0));
12734 return 0;
12735
12736 case IX86_BUILTIN_RSTORSSP:
12737 case IX86_BUILTIN_CLRSSBSY:
12738 arg0 = CALL_EXPR_ARG (exp, 0);
12739 op0 = expand_normal (arg0);
12740 icode = (fcode == IX86_BUILTIN_RSTORSSP
12741 ? CODE_FOR_rstorssp
12742 : CODE_FOR_clrssbsy);
12743 if (!address_operand (op0, VOIDmode))
12744 {
12745 op1 = convert_memory_address (Pmode, op0);
12746 op0 = copy_addr_to_reg (op1);
12747 }
12748 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12749 return 0;
12750
12751 case IX86_BUILTIN_WRSSD:
12752 case IX86_BUILTIN_WRSSQ:
12753 case IX86_BUILTIN_WRUSSD:
12754 case IX86_BUILTIN_WRUSSQ:
12755 arg0 = CALL_EXPR_ARG (exp, 0);
12756 op0 = expand_normal (arg0);
12757 arg1 = CALL_EXPR_ARG (exp, 1);
12758 op1 = expand_normal (arg1);
12759 switch (fcode)
12760 {
12761 case IX86_BUILTIN_WRSSD:
12762 icode = CODE_FOR_wrsssi;
12763 mode = SImode;
12764 break;
12765 case IX86_BUILTIN_WRSSQ:
12766 icode = CODE_FOR_wrssdi;
12767 mode = DImode;
12768 break;
12769 case IX86_BUILTIN_WRUSSD:
12770 icode = CODE_FOR_wrusssi;
12771 mode = SImode;
12772 break;
12773 case IX86_BUILTIN_WRUSSQ:
12774 icode = CODE_FOR_wrussdi;
12775 mode = DImode;
12776 break;
12777 }
12778 op0 = force_reg (mode, op0);
12779 if (!address_operand (op1, VOIDmode))
12780 {
12781 op2 = convert_memory_address (Pmode, op1);
12782 op1 = copy_addr_to_reg (op2);
12783 }
12784 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12785 return 0;
12786
12787 default:
12788 break;
12789 }
12790
12791 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12792 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12793 {
12794 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12795 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12796 target);
12797 }
12798
12799 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12800 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12801 {
12802 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12803 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12804 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12805 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12806 int masked = 1;
12807 machine_mode mode, wide_mode, nar_mode;
12808
12809 nar_mode = V4SFmode;
12810 mode = V16SFmode;
12811 wide_mode = V64SFmode;
12812 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12813 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12814
12815 switch (fcode)
12816 {
12817 case IX86_BUILTIN_4FMAPS:
12818 fcn = gen_avx5124fmaddps_4fmaddps;
12819 masked = 0;
12820 goto v4fma_expand;
12821
12822 case IX86_BUILTIN_4DPWSSD:
12823 nar_mode = V4SImode;
12824 mode = V16SImode;
12825 wide_mode = V64SImode;
12826 fcn = gen_avx5124vnniw_vp4dpwssd;
12827 masked = 0;
12828 goto v4fma_expand;
12829
12830 case IX86_BUILTIN_4DPWSSDS:
12831 nar_mode = V4SImode;
12832 mode = V16SImode;
12833 wide_mode = V64SImode;
12834 fcn = gen_avx5124vnniw_vp4dpwssds;
12835 masked = 0;
12836 goto v4fma_expand;
12837
12838 case IX86_BUILTIN_4FNMAPS:
12839 fcn = gen_avx5124fmaddps_4fnmaddps;
12840 masked = 0;
12841 goto v4fma_expand;
12842
12843 case IX86_BUILTIN_4FNMAPS_MASK:
12844 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12845 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12846 goto v4fma_expand;
12847
12848 case IX86_BUILTIN_4DPWSSD_MASK:
12849 nar_mode = V4SImode;
12850 mode = V16SImode;
12851 wide_mode = V64SImode;
12852 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12853 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12854 goto v4fma_expand;
12855
12856 case IX86_BUILTIN_4DPWSSDS_MASK:
12857 nar_mode = V4SImode;
12858 mode = V16SImode;
12859 wide_mode = V64SImode;
12860 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12861 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12862 goto v4fma_expand;
12863
12864 case IX86_BUILTIN_4FMAPS_MASK:
12865 {
12866 tree args[4];
12867 rtx ops[4];
12868 rtx wide_reg;
12869 rtx accum;
12870 rtx addr;
12871 rtx mem;
12872
12873v4fma_expand:
12874 wide_reg = gen_reg_rtx (wide_mode);
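	    /* These instructions operate on a block of four consecutive
	       vector registers; build one wide pseudo and fill each of
	       its four 64-byte chunks with one source operand.  */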
12875 for (i = 0; i < 4; i++)
12876 {
12877 args[i] = CALL_EXPR_ARG (exp, i);
12878 ops[i] = expand_normal (args[i]);
12879
12880 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12881 ops[i]);
12882 }
12883
12884 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12885 accum = force_reg (mode, accum);
12886
12887 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12888 addr = force_reg (Pmode, addr);
12889
12890 mem = gen_rtx_MEM (nar_mode, addr);
12891
12892 target = gen_reg_rtx (mode);
12893
12894 emit_move_insn (target, accum);
12895
12896 if (! masked)
12897 emit_insn (fcn (target, accum, wide_reg, mem));
12898 else
12899 {
12900 rtx merge, mask;
12901 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12902
12903 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12904
12905 if (CONST_INT_P (mask))
12906 mask = fixup_modeless_constant (mask, HImode);
12907
12908 mask = force_reg (HImode, mask);
12909
12910 if (GET_MODE (mask) != HImode)
12911 mask = gen_rtx_SUBREG (HImode, mask, 0);
12912
 12913	      /* If merge is 0 then we're about to emit the z-masked variant.  */
12914 if (const0_operand (merge, mode))
12915 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
 12916	      /* If merge is the same as accum then emit the merge-masked variant.  */
12917 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12918 {
12919 merge = force_reg (mode, merge);
12920 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12921 }
 12922	      /* Merging with something unknown can happen if we z-mask with -O0.  */
12923 else
12924 {
12925 target = gen_reg_rtx (mode);
12926 emit_move_insn (target, merge);
12927 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12928 }
12929 }
12930 return target;
12931 }
12932
12933 case IX86_BUILTIN_4FNMASS:
12934 fcn = gen_avx5124fmaddps_4fnmaddss;
12935 masked = 0;
12936 goto s4fma_expand;
12937
12938 case IX86_BUILTIN_4FMASS:
12939 fcn = gen_avx5124fmaddps_4fmaddss;
12940 masked = 0;
12941 goto s4fma_expand;
12942
12943 case IX86_BUILTIN_4FNMASS_MASK:
12944 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12945 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
12946 goto s4fma_expand;
12947
12948 case IX86_BUILTIN_4FMASS_MASK:
12949 {
12950 tree args[4];
12951 rtx ops[4];
12952 rtx wide_reg;
12953 rtx accum;
12954 rtx addr;
12955 rtx mem;
12956
12957 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
12958 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
12959
12960s4fma_expand:
12961 mode = V4SFmode;
12962 wide_reg = gen_reg_rtx (V64SFmode);
12963 for (i = 0; i < 4; i++)
12964 {
12965 rtx tmp;
12966 args[i] = CALL_EXPR_ARG (exp, i);
12967 ops[i] = expand_normal (args[i]);
12968
12969 tmp = gen_reg_rtx (SFmode);
12970 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
12971
12972 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
12973 gen_rtx_SUBREG (V16SFmode, tmp, 0));
12974 }
12975
12976 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12977 accum = force_reg (V4SFmode, accum);
12978
12979 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12980 addr = force_reg (Pmode, addr);
12981
12982 mem = gen_rtx_MEM (V4SFmode, addr);
12983
12984 target = gen_reg_rtx (V4SFmode);
12985
12986 emit_move_insn (target, accum);
12987
12988 if (! masked)
12989 emit_insn (fcn (target, accum, wide_reg, mem));
12990 else
12991 {
12992 rtx merge, mask;
12993 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12994
12995 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12996
12997 if (CONST_INT_P (mask))
12998 mask = fixup_modeless_constant (mask, QImode);
12999
13000 mask = force_reg (QImode, mask);
13001
13002 if (GET_MODE (mask) != QImode)
13003 mask = gen_rtx_SUBREG (QImode, mask, 0);
13004
 13005	      /* If merge is 0 then we're about to emit the z-masked variant.  */
13006 if (const0_operand (merge, mode))
13007 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
 13008	      /* If merge is the same as accum then emit the merge-masked
 13009		 variant.  */
13010 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13011 {
13012 merge = force_reg (mode, merge);
13013 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13014 }
13015 /* Merging with something unknown might happen if we z-mask
13016 with -O0. */
13017 else
13018 {
13019 target = gen_reg_rtx (mode);
13020 emit_move_insn (target, merge);
13021 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13022 }
13023 }
13024 return target;
13025 }
13026 case IX86_BUILTIN_RDPID:
13027 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13028 target);
13029 case IX86_BUILTIN_FABSQ:
13030 case IX86_BUILTIN_COPYSIGNQ:
13031 if (!TARGET_SSE)
13032 /* Emit a normal call if SSE isn't available. */
13033 return expand_call (exp, target, ignore);
13034 /* FALLTHRU */
13035 default:
13036 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13037 }
13038 }
13039
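/* The remaining builtins live in contiguous ranges of the fcode enum; compute
   the index from the start of the matching range and dispatch to the
   corresponding descriptor table. */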
13040 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13041 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13042 {
13043 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13044 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13045 }
13046
13047 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13048 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13049 {
13050 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13051 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13052 }
13053
13054 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13055 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13056 {
13057 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13058 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13059 }
13060
13061 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13062 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13063 {
13064 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13065 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13066 }
13067
13068 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13069 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13070 {
13071 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13072 const struct builtin_description *d = bdesc_multi_arg + i;
13073 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13074 (enum ix86_builtin_func_type)
13075 d->flag, d->comparison);
13076 }
13077
13078 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13079 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13080 {
13081 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13082 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13083 target);
13084 }
13085
13086 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13087 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13088 {
13089 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13090 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13091 target);
13092 }
13093
13094 gcc_unreachable ();
13095}
13096
13097/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13098 fill target with val via vec_duplicate. */
13099
13100static bool
13101ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13102{
13103 bool ok;
13104 rtx_insn *insn;
13105 rtx dup;
13106
13107 /* First attempt to recognize VAL as-is. */
13108 dup = gen_vec_duplicate (mode, val);
13109 insn = emit_insn (gen_rtx_SET (target, dup));
13110 if (recog_memoized (insn) < 0)
13111 {
13112 rtx_insn *seq;
13113 machine_mode innermode = GET_MODE_INNER (mode);
13114 rtx reg;
13115
13116 /* If that fails, force VAL into a register. */
13117
13118 start_sequence ();
13119 reg = force_reg (innermode, val);
13120 if (GET_MODE (reg) != innermode)
13121 reg = gen_lowpart (innermode, reg);
13122 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13123 seq = get_insns ();
13124 end_sequence ();
13125 if (seq)
13126 emit_insn_before (seq, insn);
13127
13128 ok = recog_memoized (insn) >= 0;
13129 gcc_assert (ok);
13130 }
13131 return true;
13132}
13133
13134/* Get a vector mode of the same size as the original but with elements
13135 twice as wide. This is only guaranteed to apply to integral vectors. */
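/* For example, V16QImode maps to V8HImode and V8HImode to V4SImode. */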
13136
13137static machine_mode
13138get_mode_wider_vector (machine_mode o)
13139{
13140 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13141 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13142 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13143 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13144 return n;
13145}
13146
13147static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13148static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13149
13150/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13151 with all elements equal to VAR. Return true if successful. */
13152
13153static bool
13154ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13155 rtx target, rtx val)
13156{
13157 bool ok;
13158
13159 switch (mode)
13160 {
13161 case E_V2SImode:
13162 case E_V2SFmode:
13163 if (!mmx_ok)
13164 return false;
13165 /* FALLTHRU */
13166
13167 case E_V4DFmode:
13168 case E_V4DImode:
13169 case E_V8SFmode:
13170 case E_V8SImode:
13171 case E_V2DFmode:
13172 case E_V2DImode:
13173 case E_V4SFmode:
13174 case E_V4SImode:
13175 case E_V16SImode:
13176 case E_V8DImode:
13177 case E_V16SFmode:
13178 case E_V8DFmode:
13179 return ix86_vector_duplicate_value (mode, target, val);
13180
13181 case E_V4HImode:
13182 if (!mmx_ok)
13183 return false;
13184 if (TARGET_SSE || TARGET_3DNOW_A)
13185 {
13186 rtx x;
13187
13188 val = gen_lowpart (SImode, val);
13189 x = gen_rtx_TRUNCATE (HImode, val);
13190 x = gen_rtx_VEC_DUPLICATE (mode, x);
13191 emit_insn (gen_rtx_SET (target, x));
13192 return true;
13193 }
13194 goto widen;
13195
13196 case E_V8QImode:
13197 if (!mmx_ok)
13198 return false;
13199 goto widen;
13200
13201 case E_V8HImode:
13202 if (TARGET_AVX2)
13203 return ix86_vector_duplicate_value (mode, target, val);
13204
13205 if (TARGET_SSE2)
13206 {
13207 struct expand_vec_perm_d dperm;
13208 rtx tmp1, tmp2;
13209
13210 permute:
13211 memset (&dperm, 0, sizeof (dperm));
13212 dperm.target = target;
13213 dperm.vmode = mode;
13214 dperm.nelt = GET_MODE_NUNITS (mode);
13215 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13216 dperm.one_operand_p = true;
13217
13218 /* Extend to SImode using a paradoxical SUBREG. */
13219 tmp1 = gen_reg_rtx (SImode);
13220 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13221
13222 /* Insert the SImode value as low element of a V4SImode vector. */
13223 tmp2 = gen_reg_rtx (V4SImode);
13224 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13225 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13226
13227 ok = (expand_vec_perm_1 (&dperm)
13228 || expand_vec_perm_broadcast_1 (&dperm));
13229 gcc_assert (ok);
13230 return ok;
13231 }
13232 goto widen;
13233
13234 case E_V16QImode:
13235 if (TARGET_AVX2)
13236 return ix86_vector_duplicate_value (mode, target, val);
13237
13238 if (TARGET_SSE2)
13239 goto permute;
13240 goto widen;
13241
13242 widen:
13243 /* Replicate the value once into the next wider mode and recurse. */
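/* For example, broadcasting a QImode VAL into V16QImode zero-extends VAL to
   HImode, forms (VAL << 8) | VAL, and recurses to broadcast that value into
   V8HImode. */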
13244 {
13245 machine_mode smode, wsmode, wvmode;
13246 rtx x;
13247
13248 smode = GET_MODE_INNER (mode);
13249 wvmode = get_mode_wider_vector (mode);
13250 wsmode = GET_MODE_INNER (wvmode);
13251
13252 val = convert_modes (wsmode, smode, val, true);
13253 x = expand_simple_binop (wsmode, ASHIFT, val,
13254 GEN_INT (GET_MODE_BITSIZE (smode)),
13255 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13256 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13257
13258 x = gen_reg_rtx (wvmode);
13259 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13260 gcc_assert (ok);
13261 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13262 return ok;
13263 }
13264
13265 case E_V16HImode:
13266 case E_V32QImode:
13267 if (TARGET_AVX2)
13268 return ix86_vector_duplicate_value (mode, target, val);
13269 else
13270 {
13271 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13272 rtx x = gen_reg_rtx (hvmode);
13273
13274 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13275 gcc_assert (ok);
13276
13277 x = gen_rtx_VEC_CONCAT (mode, x, x);
13278 emit_insn (gen_rtx_SET (target, x));
13279 }
13280 return true;
13281
13282 case E_V64QImode:
13283 case E_V32HImode:
13284 if (TARGET_AVX512BW)
13285 return ix86_vector_duplicate_value (mode, target, val);
13286 else
13287 {
13288 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13289 rtx x = gen_reg_rtx (hvmode);
13290
13291 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13292 gcc_assert (ok);
13293
13294 x = gen_rtx_VEC_CONCAT (mode, x, x);
13295 emit_insn (gen_rtx_SET (target, x));
13296 }
13297 return true;
13298
13299 default:
13300 return false;
13301 }
13302}
13303
13304/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13305 whose ONE_VAR element is VAR, and other elements are zero. Return true
13306 if successful. */
13307
13308static bool
13309ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13310 rtx target, rtx var, int one_var)
13311{
13312 machine_mode vsimode;
13313 rtx new_target;
13314 rtx x, tmp;
13315 bool use_vector_set = false;
13316 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13317
13318 switch (mode)
13319 {
13320 case E_V2DImode:
13321 /* For SSE4.1, we normally use vector set. But if the second
13322 element is zero and inter-unit moves are OK, we use movq
13323 instead. */
13324 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13325 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13326 && one_var == 0));
13327 break;
13328 case E_V16QImode:
13329 case E_V4SImode:
13330 case E_V4SFmode:
13331 use_vector_set = TARGET_SSE4_1;
13332 break;
13333 case E_V8HImode:
13334 use_vector_set = TARGET_SSE2;
13335 break;
13336 case E_V8QImode:
13337 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13338 break;
13339 case E_V4HImode:
13340 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13341 break;
13342 case E_V32QImode:
13343 case E_V16HImode:
13344 use_vector_set = TARGET_AVX;
13345 break;
13346 case E_V8SImode:
13347 use_vector_set = TARGET_AVX;
13348 gen_vec_set_0 = gen_vec_setv8si_0;
13349 break;
13350 case E_V8SFmode:
13351 use_vector_set = TARGET_AVX;
13352 gen_vec_set_0 = gen_vec_setv8sf_0;
13353 break;
13354 case E_V4DFmode:
13355 use_vector_set = TARGET_AVX;
13356 gen_vec_set_0 = gen_vec_setv4df_0;
13357 break;
13358 case E_V4DImode:
13359 /* Use ix86_expand_vector_set in 64bit mode only. */
13360 use_vector_set = TARGET_AVX && TARGET_64BIT;
13361 gen_vec_set_0 = gen_vec_setv4di_0;
13362 break;
13363 case E_V16SImode:
13364 use_vector_set = TARGET_AVX512F && one_var == 0;
13365 gen_vec_set_0 = gen_vec_setv16si_0;
13366 break;
13367 case E_V16SFmode:
13368 use_vector_set = TARGET_AVX512F && one_var == 0;
13369 gen_vec_set_0 = gen_vec_setv16sf_0;
13370 break;
13371 case E_V8DFmode:
13372 use_vector_set = TARGET_AVX512F && one_var == 0;
13373 gen_vec_set_0 = gen_vec_setv8df_0;
13374 break;
13375 case E_V8DImode:
13376 /* Use ix86_expand_vector_set in 64bit mode only. */
13377 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13378 gen_vec_set_0 = gen_vec_setv8di_0;
13379 break;
13380 default:
13381 break;
13382 }
13383
13384 if (use_vector_set)
13385 {
13386 if (gen_vec_set_0 && one_var == 0)
13387 {
13388 var = force_reg (GET_MODE_INNER (mode), var);
13389 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13390 return true;
13391 }
13392 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13393 var = force_reg (GET_MODE_INNER (mode), var);
13394 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13395 return true;
13396 }
13397
13398 switch (mode)
13399 {
13400 case E_V2SFmode:
13401 case E_V2SImode:
13402 if (!mmx_ok)
13403 return false;
13404 /* FALLTHRU */
13405
13406 case E_V2DFmode:
13407 case E_V2DImode:
13408 if (one_var != 0)
13409 return false;
13410 var = force_reg (GET_MODE_INNER (mode), var);
13411 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13412 emit_insn (gen_rtx_SET (target, x));
13413 return true;
13414
13415 case E_V4SFmode:
13416 case E_V4SImode:
13417 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13418 new_target = gen_reg_rtx (mode);
13419 else
13420 new_target = target;
13421 var = force_reg (GET_MODE_INNER (mode), var);
13422 x = gen_rtx_VEC_DUPLICATE (mode, var);
13423 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13424 emit_insn (gen_rtx_SET (new_target, x));
13425 if (one_var != 0)
13426 {
13427 /* We need to shuffle the value to the correct position, so
13428 create a new pseudo to store the intermediate result. */
13429
13430 /* With SSE2, we can use the integer shuffle insns. */
13431 if (mode != V4SFmode && TARGET_SSE2)
13432 {
13433 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13434 const1_rtx,
13435 GEN_INT (one_var == 1 ? 0 : 1),
13436 GEN_INT (one_var == 2 ? 0 : 1),
13437 GEN_INT (one_var == 3 ? 0 : 1)));
13438 if (target != new_target)
13439 emit_move_insn (target, new_target);
13440 return true;
13441 }
13442
13443 /* Otherwise convert the intermediate result to V4SFmode and
13444 use the SSE1 shuffle instructions. */
13445 if (mode != V4SFmode)
13446 {
13447 tmp = gen_reg_rtx (V4SFmode);
13448 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13449 }
13450 else
13451 tmp = new_target;
13452
13453 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13454 const1_rtx,
13455 GEN_INT (one_var == 1 ? 0 : 1),
13456 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13457 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
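/* The selectors above place the value at position ONE_VAR and fill the
   remaining lanes from TMP's zero elements; selector values 4..7 address the
   second shufps source, which is also TMP here. */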
13458
13459 if (mode != V4SFmode)
13460 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13461 else if (tmp != target)
13462 emit_move_insn (target, tmp);
13463 }
13464 else if (target != new_target)
13465 emit_move_insn (target, new_target);
13466 return true;
13467
13468 case E_V8HImode:
13469 case E_V16QImode:
13470 vsimode = V4SImode;
13471 goto widen;
13472 case E_V4HImode:
13473 case E_V8QImode:
13474 if (!mmx_ok)
13475 return false;
13476 vsimode = V2SImode;
13477 goto widen;
13478 widen:
13479 if (one_var != 0)
13480 return false;
13481
13482 /* Zero extend the variable element to SImode and recurse. */
13483 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13484
13485 x = gen_reg_rtx (vsimode);
13486 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13487 var, one_var))
13488 gcc_unreachable ();
13489
13490 emit_move_insn (target, gen_lowpart (mode, x));
13491 return true;
13492
13493 default:
13494 return false;
13495 }
13496}
13497
13498/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13499 consisting of the values in VALS. It is known that all elements
13500 except ONE_VAR are constants. Return true if successful. */
13501
13502static bool
13503ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13504 rtx target, rtx vals, int one_var)
13505{
13506 rtx var = XVECEXP (vals, 0, one_var);
13507 machine_mode wmode;
13508 rtx const_vec, x;
13509
13510 const_vec = copy_rtx (vals);
13511 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13512 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13513
13514 switch (mode)
13515 {
13516 case E_V2DFmode:
13517 case E_V2DImode:
13518 case E_V2SFmode:
13519 case E_V2SImode:
13520 /* For the two element vectors, it's just as easy to use
13521 the general case. */
13522 return false;
13523
13524 case E_V4DImode:
13525 /* Use ix86_expand_vector_set in 64bit mode only. */
13526 if (!TARGET_64BIT)
13527 return false;
13528 /* FALLTHRU */
13529 case E_V4DFmode:
13530 case E_V8SFmode:
13531 case E_V8SImode:
13532 case E_V16HImode:
13533 case E_V32QImode:
13534 case E_V4SFmode:
13535 case E_V4SImode:
13536 case E_V8HImode:
13537 case E_V4HImode:
13538 break;
13539
13540 case E_V16QImode:
13541 if (TARGET_SSE4_1)
13542 break;
13543 wmode = V8HImode;
13544 goto widen;
13545 case E_V8QImode:
13546 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13547 break;
13548 wmode = V4HImode;
13549 goto widen;
13550 widen:
13551 /* There's no way to set one QImode entry easily. Combine
13552 the variable value with its adjacent constant value, and
13553 promote to an HImode set. */
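/* For instance, with ONE_VAR == 5 the variable byte is combined with
   constant byte 4 and stored as HImode element 2 below. */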
13554 x = XVECEXP (vals, 0, one_var ^ 1);
13555 if (one_var & 1)
13556 {
13557 var = convert_modes (HImode, QImode, var, true);
13558 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13559 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13560 x = GEN_INT (INTVAL (x) & 0xff);
13561 }
13562 else
13563 {
13564 var = convert_modes (HImode, QImode, var, true);
13565 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13566 }
13567 if (x != const0_rtx)
13568 var = expand_simple_binop (HImode, IOR, var, x, var,
13569 1, OPTAB_LIB_WIDEN);
13570
13571 x = gen_reg_rtx (wmode);
13572 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13573 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13574
13575 emit_move_insn (target, gen_lowpart (mode, x));
13576 return true;
13577
13578 default:
13579 return false;
13580 }
13581
13582 emit_move_insn (target, const_vec);
13583 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13584 return true;
13585}
13586
13587/* A subroutine of ix86_expand_vector_init_general. Use vector
13588 concatenate to handle the most general case: all values variable,
13589 and none identical. */
13590
13591static void
13592ix86_expand_vector_init_concat (machine_mode mode,
13593 rtx target, rtx *ops, int n)
13594{
13595 machine_mode half_mode = VOIDmode;
13596 rtx half[2];
13597 rtvec v;
13598 int i, j;
13599
13600 switch (n)
13601 {
13602 case 2:
13603 switch (mode)
13604 {
13605 case E_V16SImode:
13606 half_mode = V8SImode;
13607 break;
13608 case E_V16SFmode:
13609 half_mode = V8SFmode;
13610 break;
13611 case E_V8DImode:
13612 half_mode = V4DImode;
13613 break;
13614 case E_V8DFmode:
13615 half_mode = V4DFmode;
13616 break;
13617 case E_V8SImode:
13618 half_mode = V4SImode;
13619 break;
13620 case E_V8SFmode:
13621 half_mode = V4SFmode;
13622 break;
13623 case E_V4DImode:
13624 half_mode = V2DImode;
13625 break;
13626 case E_V4DFmode:
13627 half_mode = V2DFmode;
13628 break;
13629 case E_V4SImode:
13630 half_mode = V2SImode;
13631 break;
13632 case E_V4SFmode:
13633 half_mode = V2SFmode;
13634 break;
13635 case E_V2DImode:
13636 half_mode = DImode;
13637 break;
13638 case E_V2SImode:
13639 half_mode = SImode;
13640 break;
13641 case E_V2DFmode:
13642 half_mode = DFmode;
13643 break;
13644 case E_V2SFmode:
13645 half_mode = SFmode;
13646 break;
13647 default:
13648 gcc_unreachable ();
13649 }
13650
13651 if (!register_operand (ops[1], half_mode))
13652 ops[1] = force_reg (half_mode, ops[1]);
13653 if (!register_operand (ops[0], half_mode))
13654 ops[0] = force_reg (half_mode, ops[0]);
13655 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13656 ops[1])));
13657 break;
13658
13659 case 4:
13660 switch (mode)
13661 {
13662 case E_V4DImode:
13663 half_mode = V2DImode;
13664 break;
13665 case E_V4DFmode:
13666 half_mode = V2DFmode;
13667 break;
13668 case E_V4SImode:
13669 half_mode = V2SImode;
13670 break;
13671 case E_V4SFmode:
13672 half_mode = V2SFmode;
13673 break;
13674 default:
13675 gcc_unreachable ();
13676 }
13677 goto half;
13678
13679 case 8:
13680 switch (mode)
13681 {
13682 case E_V8DImode:
13683 half_mode = V4DImode;
13684 break;
13685 case E_V8DFmode:
13686 half_mode = V4DFmode;
13687 break;
13688 case E_V8SImode:
13689 half_mode = V4SImode;
13690 break;
13691 case E_V8SFmode:
13692 half_mode = V4SFmode;
13693 break;
13694 default:
13695 gcc_unreachable ();
13696 }
13697 goto half;
13698
13699 case 16:
13700 switch (mode)
13701 {
13702 case E_V16SImode:
13703 half_mode = V8SImode;
13704 break;
13705 case E_V16SFmode:
13706 half_mode = V8SFmode;
13707 break;
13708 default:
13709 gcc_unreachable ();
13710 }
13711 goto half;
13712
13713 half:
13714 /* FIXME: We process inputs backward to help RA. PR 36222. */
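/* Build each half from N/2 of the inputs by a recursive call, then
   concatenate the two halves via the N == 2 case above. */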
13715 i = n - 1;
13716 for (j = 1; j != -1; j--)
13717 {
13718 half[j] = gen_reg_rtx (half_mode);
13719 switch (n >> 1)
13720 {
13721 case 2:
13722 v = gen_rtvec (2, ops[i-1], ops[i]);
13723 i -= 2;
13724 break;
13725 case 4:
13726 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
13727 i -= 4;
13728 break;
13729 case 8:
13730 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
13731 ops[i-3], ops[i-2], ops[i-1], ops[i]);
13732 i -= 8;
13733 break;
13734 default:
13735 gcc_unreachable ();
13736 }
13737 ix86_expand_vector_init (false, half[j],
13738 gen_rtx_PARALLEL (half_mode, v));
13739 }
13740
13741 ix86_expand_vector_init_concat (mode, target, half, 2);
13742 break;
13743
13744 default:
13745 gcc_unreachable ();
13746 }
13747}
13748
13749/* A subroutine of ix86_expand_vector_init_general. Use vector
13750 interleave to handle the most general case: all values variable,
13751 and none identical. */
13752
13753static void
13754ix86_expand_vector_init_interleave (machine_mode mode,
13755 rtx target, rtx *ops, int n)
13756{
13757 machine_mode first_imode, second_imode, third_imode, inner_mode;
13758 int i, j;
13759 rtx op0, op1;
13760 rtx (*gen_load_even) (rtx, rtx, rtx);
13761 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13762 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13763
13764 switch (mode)
13765 {
13766 case E_V8HImode:
13767 gen_load_even = gen_vec_setv8hi;
13768 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13769 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13770 inner_mode = HImode;
13771 first_imode = V4SImode;
13772 second_imode = V2DImode;
13773 third_imode = VOIDmode;
13774 break;
13775 case E_V16QImode:
13776 gen_load_even = gen_vec_setv16qi;
13777 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13778 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13779 inner_mode = QImode;
13780 first_imode = V8HImode;
13781 second_imode = V4SImode;
13782 third_imode = V2DImode;
13783 break;
13784 default:
13785 gcc_unreachable ();
13786 }
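/* First pack each pair of scalar inputs into the low elements of one vector;
   the interleave steps below then merge those vectors pairwise until a single
   full-width vector remains. */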
13787
13788 for (i = 0; i < n; i++)
13789 {
13790 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13791 op0 = gen_reg_rtx (SImode);
13792 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13793
13794 /* Insert the SImode value as low element of V4SImode vector. */
13795 op1 = gen_reg_rtx (V4SImode);
13796 op0 = gen_rtx_VEC_MERGE (V4SImode,
13797 gen_rtx_VEC_DUPLICATE (V4SImode,
13798 op0),
13799 CONST0_RTX (V4SImode),
13800 const1_rtx);
13801 emit_insn (gen_rtx_SET (op1, op0));
13802
13803 /* Cast the V4SImode vector back to a vector in the original mode. */
13804 op0 = gen_reg_rtx (mode);
13805 emit_move_insn (op0, gen_lowpart (mode, op1));
13806
13807 /* Load even elements into the second position. */
13808 emit_insn (gen_load_even (op0,
13809 force_reg (inner_mode,
13810 ops [i + i + 1]),
13811 const1_rtx));
13812
13813 /* Cast vector to FIRST_IMODE vector. */
13814 ops[i] = gen_reg_rtx (first_imode);
13815 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13816 }
13817
13818 /* Interleave low FIRST_IMODE vectors. */
13819 for (i = j = 0; i < n; i += 2, j++)
13820 {
13821 op0 = gen_reg_rtx (first_imode);
13822 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13823
13824 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13825 ops[j] = gen_reg_rtx (second_imode);
13826 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13827 }
13828
13829 /* Interleave low SECOND_IMODE vectors. */
13830 switch (second_imode)
13831 {
13832 case E_V4SImode:
13833 for (i = j = 0; i < n / 2; i += 2, j++)
13834 {
13835 op0 = gen_reg_rtx (second_imode);
13836 emit_insn (gen_interleave_second_low (op0, ops[i],
13837 ops[i + 1]));
13838
13839 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13840 vector. */
13841 ops[j] = gen_reg_rtx (third_imode);
13842 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13843 }
13844 second_imode = V2DImode;
13845 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13846 /* FALLTHRU */
13847
13848 case E_V2DImode:
13849 op0 = gen_reg_rtx (second_imode);
13850 emit_insn (gen_interleave_second_low (op0, ops[0],
13851 ops[1]));
13852
13853 /* Cast the SECOND_IMODE vector back to a vector in the original
13854 mode. */
13855 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13856 break;
13857
13858 default:
13859 gcc_unreachable ();
13860 }
13861}
13862
13863/* A subroutine of ix86_expand_vector_init. Handle the most general case:
13864 all values variable, and none identical. */
13865
13866static void
13867ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13868 rtx target, rtx vals)
13869{
13870 rtx ops[64], op0, op1, op2, op3, op4, op5;
13871 machine_mode half_mode = VOIDmode;
13872 machine_mode quarter_mode = VOIDmode;
13873 int n, i;
13874
13875 switch (mode)
13876 {
13877 case E_V2SFmode:
13878 case E_V2SImode:
13879 if (!mmx_ok && !TARGET_SSE)
13880 break;
13881 /* FALLTHRU */
13882
13883 case E_V16SImode:
13884 case E_V16SFmode:
13885 case E_V8DFmode:
13886 case E_V8DImode:
13887 case E_V8SFmode:
13888 case E_V8SImode:
13889 case E_V4DFmode:
13890 case E_V4DImode:
13891 case E_V4SFmode:
13892 case E_V4SImode:
13893 case E_V2DFmode:
13894 case E_V2DImode:
13895 n = GET_MODE_NUNITS (mode);
13896 for (i = 0; i < n; i++)
13897 ops[i] = XVECEXP (vals, 0, i);
13898 ix86_expand_vector_init_concat (mode, target, ops, n);
13899 return;
13900
13901 case E_V2TImode:
13902 for (i = 0; i < 2; i++)
13903 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13904 op0 = gen_reg_rtx (V4DImode);
13905 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13906 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13907 return;
13908
13909 case E_V4TImode:
13910 for (i = 0; i < 4; i++)
13911 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13912 ops[4] = gen_reg_rtx (V4DImode);
13913 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13914 ops[5] = gen_reg_rtx (V4DImode);
13915 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13916 op0 = gen_reg_rtx (V8DImode);
13917 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13918 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13919 return;
13920
13921 case E_V32QImode:
13922 half_mode = V16QImode;
13923 goto half;
13924
13925 case E_V16HImode:
13926 half_mode = V8HImode;
13927 goto half;
13928
13929 half:
13930 n = GET_MODE_NUNITS (mode);
13931 for (i = 0; i < n; i++)
13932 ops[i] = XVECEXP (vals, 0, i);
13933 op0 = gen_reg_rtx (half_mode);
13934 op1 = gen_reg_rtx (half_mode);
13935 ix86_expand_vector_init_interleave (half_mode, op0, ops,
13936 n >> 2);
13937 ix86_expand_vector_init_interleave (half_mode, op1,
13938 &ops [n >> 1], n >> 2);
13939 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
13940 return;
13941
13942 case E_V64QImode:
13943 quarter_mode = V16QImode;
13944 half_mode = V32QImode;
13945 goto quarter;
13946
13947 case E_V32HImode:
13948 quarter_mode = V8HImode;
13949 half_mode = V16HImode;
13950 goto quarter;
13951
13952 quarter:
13953 n = GET_MODE_NUNITS (mode);
13954 for (i = 0; i < n; i++)
13955 ops[i] = XVECEXP (vals, 0, i);
13956 op0 = gen_reg_rtx (quarter_mode);
13957 op1 = gen_reg_rtx (quarter_mode);
13958 op2 = gen_reg_rtx (quarter_mode);
13959 op3 = gen_reg_rtx (quarter_mode);
13960 op4 = gen_reg_rtx (half_mode);
13961 op5 = gen_reg_rtx (half_mode);
13962 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
13963 n >> 3);
13964 ix86_expand_vector_init_interleave (quarter_mode, op1,
13965 &ops [n >> 2], n >> 3);
13966 ix86_expand_vector_init_interleave (quarter_mode, op2,
13967 &ops [n >> 1], n >> 3);
13968 ix86_expand_vector_init_interleave (quarter_mode, op3,
13969 &ops [(n >> 1) | (n >> 2)], n >> 3);
13970 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
13971 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
13972 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
13973 return;
13974
13975 case E_V16QImode:
13976 if (!TARGET_SSE4_1)
13977 break;
13978 /* FALLTHRU */
13979
13980 case E_V8HImode:
13981 if (!TARGET_SSE2)
13982 break;
13983
13984 /* Don't use ix86_expand_vector_init_interleave if we can't
13985 move from GPR to SSE register directly. */
13986 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
13987 break;
13988
13989 n = GET_MODE_NUNITS (mode);
13990 for (i = 0; i < n; i++)
13991 ops[i] = XVECEXP (vals, 0, i);
13992 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
13993 return;
13994
13995 case E_V4HImode:
13996 case E_V8QImode:
13997 break;
13998
13999 default:
14000 gcc_unreachable ();
14001 }
14002
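/* Fallback: assemble each word-sized chunk of the vector in an integer
   register by shifting and OR-ing the elements together, then move the
   word(s) into the vector register. */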
14003 {
14004 int i, j, n_elts, n_words, n_elt_per_word;
14005 machine_mode inner_mode;
14006 rtx words[4], shift;
14007
14008 inner_mode = GET_MODE_INNER (mode);
14009 n_elts = GET_MODE_NUNITS (mode);
14010 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14011 n_elt_per_word = n_elts / n_words;
14012 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14013
14014 for (i = 0; i < n_words; ++i)
14015 {
14016 rtx word = NULL_RTX;
14017
14018 for (j = 0; j < n_elt_per_word; ++j)
14019 {
14020 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14021 elt = convert_modes (word_mode, inner_mode, elt, true);
14022
14023 if (j == 0)
14024 word = elt;
14025 else
14026 {
14027 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14028 word, 1, OPTAB_LIB_WIDEN);
14029 word = expand_simple_binop (word_mode, IOR, word, elt,
14030 word, 1, OPTAB_LIB_WIDEN);
14031 }
14032 }
14033
14034 words[i] = word;
14035 }
14036
14037 if (n_words == 1)
14038 emit_move_insn (target, gen_lowpart (mode, words[0]));
14039 else if (n_words == 2)
14040 {
14041 rtx tmp = gen_reg_rtx (mode);
14042 emit_clobber (tmp);
14043 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14044 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14045 emit_move_insn (target, tmp);
14046 }
14047 else if (n_words == 4)
14048 {
14049 rtx tmp = gen_reg_rtx (V4SImode);
14050 gcc_assert (word_mode == SImode);
14051 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14052 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14053 emit_move_insn (target, gen_lowpart (mode, tmp));
14054 }
14055 else
14056 gcc_unreachable ();
14057 }
14058}
14059
14060/* Initialize vector TARGET via VALS. Suppress the use of MMX
14061 instructions unless MMX_OK is true. */
14062
14063void
14064ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14065{
14066 machine_mode mode = GET_MODE (target);
14067 machine_mode inner_mode = GET_MODE_INNER (mode);
14068 int n_elts = GET_MODE_NUNITS (mode);
14069 int n_var = 0, one_var = -1;
14070 bool all_same = true, all_const_zero = true;
14071 int i;
14072 rtx x;
14073
14074 /* Handle initialization from vector elements first. */
14075 if (n_elts != XVECLEN (vals, 0))
14076 {
14077 rtx subtarget = target;
14078 x = XVECEXP (vals, 0, 0);
14079 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14080 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14081 {
14082 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14083 if (inner_mode == QImode || inner_mode == HImode)
14084 {
14085 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14086 mode = mode_for_vector (SImode, n_bits / 4).require ();
14087 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14088 ops[0] = gen_lowpart (inner_mode, ops[0]);
14089 ops[1] = gen_lowpart (inner_mode, ops[1]);
14090 subtarget = gen_reg_rtx (mode);
14091 }
14092 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14093 if (subtarget != target)
14094 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14095 return;
14096 }
14097 gcc_unreachable ();
14098 }
14099
14100 for (i = 0; i < n_elts; ++i)
14101 {
14102 x = XVECEXP (vals, 0, i);
14103 if (!(CONST_SCALAR_INT_P (x)
14104 || CONST_DOUBLE_P (x)
14105 || CONST_FIXED_P (x)))
14106 n_var++, one_var = i;
14107 else if (x != CONST0_RTX (inner_mode))
14108 all_const_zero = false;
14109 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14110 all_same = false;
14111 }
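/* Dispatch in order: all-constant vectors come from the constant pool,
   all-equal values use a broadcast, a single variable element uses the
   one_nonzero/one_var helpers, and anything else goes to the general
   expander. */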
14112
14113 /* Constants are best loaded from the constant pool. */
14114 if (n_var == 0)
14115 {
14116 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14117 return;
14118 }
14119
14120 /* If all values are identical, broadcast the value. */
14121 if (all_same
14122 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14123 XVECEXP (vals, 0, 0)))
14124 return;
14125
14126 /* Values where only one field is non-constant are best loaded from
14127 the pool and overwritten via move later. */
14128 if (n_var == 1)
14129 {
14130 if (all_const_zero
14131 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14132 XVECEXP (vals, 0, one_var),
14133 one_var))
14134 return;
14135
14136 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14137 return;
14138 }
14139
14140 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14141}
14142
14143void
14144ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14145{
14146 machine_mode mode = GET_MODE (target);
14147 machine_mode inner_mode = GET_MODE_INNER (mode);
14148 machine_mode half_mode;
14149 bool use_vec_merge = false;
14150 rtx tmp;
14151 static rtx (*gen_extract[6][2]) (rtx, rtx)
14152 = {
14153 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14154 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14155 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14156 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14157 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14158 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14159 };
14160 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14161 = {
14162 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14163 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14164 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14165 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14166 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14167 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14168 };
14169 int i, j, n;
14170 machine_mode mmode = VOIDmode;
14171 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14172
14173 switch (mode)
14174 {
14175 case E_V2SImode:
14176 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14177 if (use_vec_merge)
14178 break;
14179 /* FALLTHRU */
14180
14181 case E_V2SFmode:
14182 if (mmx_ok)
14183 {
14184 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14185 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14186 if (elt == 0)
14187 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14188 else
14189 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14190 emit_insn (gen_rtx_SET (target, tmp));
14191 return;
14192 }
14193 break;
14194
14195 case E_V2DImode:
14196 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14197 if (use_vec_merge)
14198 break;
14199
14200 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14201 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14202 if (elt == 0)
14203 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14204 else
14205 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14206 emit_insn (gen_rtx_SET (target, tmp));
14207 return;
14208
14209 case E_V2DFmode:
14210 /* NB: For ELT == 0, use standard scalar operation patterns which
14211 preserve the rest of the vector for combiner:
14212
14213 (vec_merge:V2DF
14214 (vec_duplicate:V2DF (reg:DF))
14215 (reg:V2DF)
14216 (const_int 1))
14217 */
14218 if (elt == 0)
14219 goto do_vec_merge;
14220
14221 {
14222 rtx op0, op1;
14223
14224 /* For the two element vectors, we implement a VEC_CONCAT with
14225 the extraction of the other element. */
14226
14227 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14228 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14229
14230 if (elt == 0)
14231 op0 = val, op1 = tmp;
14232 else
14233 op0 = tmp, op1 = val;
14234
14235 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14236 emit_insn (gen_rtx_SET (target, tmp));
14237 }
14238 return;
14239
14240 case E_V4SFmode:
14241 use_vec_merge = TARGET_SSE4_1;
14242 if (use_vec_merge)
14243 break;
14244
14245 switch (elt)
14246 {
14247 case 0:
14248 use_vec_merge = true;
14249 break;
14250
14251 case 1:
14252 /* tmp = target = A B C D */
14253 tmp = copy_to_reg (target);
14254 /* target = A A B B */
14255 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14256 /* target = X A B B */
14257 ix86_expand_vector_set (false, target, val, 0);
14258 /* target = A X C D */
14259 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14260 const1_rtx, const0_rtx,
14261 GEN_INT (2+4), GEN_INT (3+4)));
14262 return;
14263
14264 case 2:
14265 /* tmp = target = A B C D */
14266 tmp = copy_to_reg (target);
14267 /* tmp = X B C D */
14268 ix86_expand_vector_set (false, tmp, val, 0);
14269 /* target = A B X D */
14270 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14271 const0_rtx, const1_rtx,
14272 GEN_INT (0+4), GEN_INT (3+4)));
14273 return;
14274
14275 case 3:
14276 /* tmp = target = A B C D */
14277 tmp = copy_to_reg (target);
14278 /* tmp = X B C D */
14279 ix86_expand_vector_set (false, tmp, val, 0);
14280 /* target = A B X D */
14281 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14282 const0_rtx, const1_rtx,
14283 GEN_INT (2+4), GEN_INT (0+4)));
14284 return;
14285
14286 default:
14287 gcc_unreachable ();
14288 }
14289 break;
14290
14291 case E_V4SImode:
14292 use_vec_merge = TARGET_SSE4_1;
14293 if (use_vec_merge)
14294 break;
14295
14296 /* Element 0 handled by vec_merge below. */
14297 if (elt == 0)
14298 {
14299 use_vec_merge = true;
14300 break;
14301 }
14302
14303 if (TARGET_SSE2)
14304 {
14305 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14306 store into element 0, then shuffle them back. */
14307
14308 rtx order[4];
14309
14310 order[0] = GEN_INT (elt);
14311 order[1] = const1_rtx;
14312 order[2] = const2_rtx;
14313 order[3] = GEN_INT (3);
14314 order[elt] = const0_rtx;
14315
14316 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14317 order[1], order[2], order[3]));
14318
14319 ix86_expand_vector_set (false, target, val, 0);
14320
14321 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14322 order[1], order[2], order[3]));
14323 }
14324 else
14325 {
14326 /* For SSE1, we have to reuse the V4SF code. */
14327 rtx t = gen_reg_rtx (V4SFmode);
14328 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14329 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14330 emit_move_insn (target, gen_lowpart (mode, t));
14331 }
14332 return;
14333
14334 case E_V8HImode:
14335 use_vec_merge = TARGET_SSE2;
14336 break;
14337 case E_V4HImode:
14338 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14339 break;
14340
14341 case E_V16QImode:
14342 use_vec_merge = TARGET_SSE4_1;
14343 break;
14344
14345 case E_V8QImode:
14346 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14347 break;
14348
14349 case E_V32QImode:
14350 half_mode = V16QImode;
14351 j = 0;
14352 n = 16;
14353 goto half;
14354
14355 case E_V16HImode:
14356 half_mode = V8HImode;
14357 j = 1;
14358 n = 8;
14359 goto half;
14360
14361 case E_V8SImode:
14362 half_mode = V4SImode;
14363 j = 2;
14364 n = 4;
14365 goto half;
14366
14367 case E_V4DImode:
14368 half_mode = V2DImode;
14369 j = 3;
14370 n = 2;
14371 goto half;
14372
14373 case E_V8SFmode:
14374 half_mode = V4SFmode;
14375 j = 4;
14376 n = 4;
14377 goto half;
14378
14379 case E_V4DFmode:
14380 half_mode = V2DFmode;
14381 j = 5;
14382 n = 2;
14383 goto half;
14384
14385 half:
14386 /* Compute offset. */
14387 i = elt / n;
14388 elt %= n;
14389
14390 gcc_assert (i <= 1);
14391
14392 /* Extract the half. */
14393 tmp = gen_reg_rtx (half_mode);
14394 emit_insn (gen_extract[j][i] (tmp, target));
14395
14396 /* Put val in tmp at elt. */
14397 ix86_expand_vector_set (false, tmp, val, elt);
14398
14399 /* Put it back. */
14400 emit_insn (gen_insert[j][i] (target, target, tmp));
14401 return;
14402
14403 case E_V8DFmode:
14404 if (TARGET_AVX512F)
14405 {
14406 mmode = QImode;
14407 gen_blendm = gen_avx512f_blendmv8df;
14408 }
14409 break;
14410
14411 case E_V8DImode:
14412 if (TARGET_AVX512F)
14413 {
14414 mmode = QImode;
14415 gen_blendm = gen_avx512f_blendmv8di;
14416 }
14417 break;
14418
14419 case E_V16SFmode:
14420 if (TARGET_AVX512F)
14421 {
14422 mmode = HImode;
14423 gen_blendm = gen_avx512f_blendmv16sf;
14424 }
14425 break;
14426
14427 case E_V16SImode:
14428 if (TARGET_AVX512F)
14429 {
14430 mmode = HImode;
14431 gen_blendm = gen_avx512f_blendmv16si;
14432 }
14433 break;
14434
14435 case E_V32HImode:
14436 if (TARGET_AVX512BW)
14437 {
14438 mmode = SImode;
14439 gen_blendm = gen_avx512bw_blendmv32hi;
14440 }
14441 else if (TARGET_AVX512F)
14442 {
14443 half_mode = E_V8HImode;
14444 n = 8;
14445 goto quarter;
14446 }
14447 break;
14448
14449 case E_V64QImode:
14450 if (TARGET_AVX512BW)
14451 {
14452 mmode = DImode;
14453 gen_blendm = gen_avx512bw_blendmv64qi;
14454 }
14455 else if (TARGET_AVX512F)
14456 {
14457 half_mode = E_V16QImode;
14458 n = 16;
14459 goto quarter;
14460 }
14461 break;
14462
14463 quarter:
14464 /* Compute offset. */
14465 i = elt / n;
14466 elt %= n;
14467
14468 gcc_assert (i <= 3);
14469
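/* Without AVX512BW there is no byte/word blend, so update a 128-bit chunk
   instead: extract the chunk containing ELT, set the element there
   recursively, and insert the chunk back. */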
14470 {
14471 /* Extract the quarter. */
14472 tmp = gen_reg_rtx (V4SImode);
14473 rtx tmp2 = gen_lowpart (V16SImode, target);
14474 rtx mask = gen_reg_rtx (QImode);
14475
14476 emit_move_insn (mask, constm1_rtx);
14477 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14478 tmp, mask));
14479
14480 tmp2 = gen_reg_rtx (half_mode);
14481 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14482 tmp = tmp2;
14483
14484 /* Put val in tmp at elt. */
14485 ix86_expand_vector_set (false, tmp, val, elt);
14486
14487 /* Put it back. */
14488 tmp2 = gen_reg_rtx (V16SImode);
14489 rtx tmp3 = gen_lowpart (V16SImode, target);
14490 mask = gen_reg_rtx (HImode);
14491 emit_move_insn (mask, constm1_rtx);
14492 tmp = gen_lowpart (V4SImode, tmp);
14493 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14494 tmp3, mask));
14495 emit_move_insn (target, gen_lowpart (mode, tmp2));
14496 }
14497 return;
14498
14499 default:
14500 break;
14501 }
14502
14503 if (mmode != VOIDmode)
14504 {
14505 tmp = gen_reg_rtx (mode);
14506 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14507 /* The avx512*_blendm<mode> expanders have different operand order
14508 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14509 elements where the mask is set and second input operand otherwise,
14510 in {sse,avx}*_*blend* the first input operand is used for elements
14511 where the mask is clear and second input operand otherwise. */
14512 emit_insn (gen_blendm (target, target, tmp,
14513 force_reg (mmode,
14514 gen_int_mode (HOST_WIDE_INT_1U << elt,
14515 mmode))));
14516 }
14517 else if (use_vec_merge)
14518 {
14519 do_vec_merge:
14520 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14521 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14522 GEN_INT (HOST_WIDE_INT_1U << elt));
14523 emit_insn (gen_rtx_SET (target, tmp));
14524 }
14525 else
14526 {
14527 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14528
14529 emit_move_insn (mem, target);
14530
14531 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14532 emit_move_insn (tmp, val);
14533
14534 emit_move_insn (target, mem);
14535 }
14536}
14537
14538void
14539ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14540{
14541 machine_mode mode = GET_MODE (vec);
14542 machine_mode inner_mode = GET_MODE_INNER (mode);
14543 bool use_vec_extr = false;
14544 rtx tmp;
14545
14546 switch (mode)
14547 {
14548 case E_V2SImode:
14549 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14550 if (use_vec_extr)
14551 break;
14552 /* FALLTHRU */
14553
14554 case E_V2SFmode:
14555 if (!mmx_ok)
14556 break;
14557 /* FALLTHRU */
14558
14559 case E_V2DFmode:
14560 case E_V2DImode:
14561 case E_V2TImode:
14562 case E_V4TImode:
14563 use_vec_extr = true;
14564 break;
14565
14566 case E_V4SFmode:
14567 use_vec_extr = TARGET_SSE4_1;
14568 if (use_vec_extr)
14569 break;
14570
14571 switch (elt)
14572 {
14573 case 0:
14574 tmp = vec;
14575 break;
14576
14577 case 1:
14578 case 3:
14579 tmp = gen_reg_rtx (mode);
14580 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14581 GEN_INT (elt), GEN_INT (elt),
14582 GEN_INT (elt+4), GEN_INT (elt+4)));
14583 break;
14584
14585 case 2:
14586 tmp = gen_reg_rtx (mode);
14587 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14588 break;
14589
14590 default:
14591 gcc_unreachable ();
14592 }
14593 vec = tmp;
14594 use_vec_extr = true;
14595 elt = 0;
14596 break;
14597
14598 case E_V4SImode:
14599 use_vec_extr = TARGET_SSE4_1;
14600 if (use_vec_extr)
14601 break;
14602
14603 if (TARGET_SSE2)
14604 {
14605 switch (elt)
14606 {
14607 case 0:
14608 tmp = vec;
14609 break;
14610
14611 case 1:
14612 case 3:
14613 tmp = gen_reg_rtx (mode);
14614 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14615 GEN_INT (elt), GEN_INT (elt),
14616 GEN_INT (elt), GEN_INT (elt)));
14617 break;
14618
14619 case 2:
14620 tmp = gen_reg_rtx (mode);
14621 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14622 break;
14623
14624 default:
14625 gcc_unreachable ();
14626 }
14627 vec = tmp;
14628 use_vec_extr = true;
14629 elt = 0;
14630 }
14631 else
14632 {
14633 /* For SSE1, we have to reuse the V4SF code. */
14634 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14635 gen_lowpart (V4SFmode, vec), elt);
14636 return;
14637 }
14638 break;
14639
14640 case E_V8HImode:
14641 use_vec_extr = TARGET_SSE2;
14642 break;
14643 case E_V4HImode:
14644 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14645 break;
14646
14647 case E_V16QImode:
14648 use_vec_extr = TARGET_SSE4_1;
14649 if (!use_vec_extr
14650 && TARGET_SSE2
14651 && elt == 0
14652 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14653 {
14654 tmp = gen_reg_rtx (SImode);
14655 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14656 0);
14657 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14658 return;
14659 }
14660 break;
14661
14662 case E_V8SFmode:
14663 if (TARGET_AVX)
14664 {
14665 tmp = gen_reg_rtx (V4SFmode);
14666 if (elt < 4)
14667 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14668 else
14669 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14670 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14671 return;
14672 }
14673 break;
14674
14675 case E_V4DFmode:
14676 if (TARGET_AVX)
14677 {
14678 tmp = gen_reg_rtx (V2DFmode);
14679 if (elt < 2)
14680 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14681 else
14682 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14683 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14684 return;
14685 }
14686 break;
14687
14688 case E_V32QImode:
14689 if (TARGET_AVX)
14690 {
14691 tmp = gen_reg_rtx (V16QImode);
14692 if (elt < 16)
14693 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14694 else
14695 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14696 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14697 return;
14698 }
14699 break;
14700
14701 case E_V16HImode:
14702 if (TARGET_AVX)
14703 {
14704 tmp = gen_reg_rtx (V8HImode);
14705 if (elt < 8)
14706 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14707 else
14708 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14709 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14710 return;
14711 }
14712 break;
14713
14714 case E_V8SImode:
14715 if (TARGET_AVX)
14716 {
14717 tmp = gen_reg_rtx (V4SImode);
14718 if (elt < 4)
14719 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14720 else
14721 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14722 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14723 return;
14724 }
14725 break;
14726
14727 case E_V4DImode:
14728 if (TARGET_AVX)
14729 {
14730 tmp = gen_reg_rtx (V2DImode);
14731 if (elt < 2)
14732 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14733 else
14734 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14735 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14736 return;
14737 }
14738 break;
14739
14740 case E_V32HImode:
14741 if (TARGET_AVX512BW)
14742 {
14743 tmp = gen_reg_rtx (V16HImode);
14744 if (elt < 16)
14745 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14746 else
14747 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14748 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14749 return;
14750 }
14751 break;
14752
14753 case E_V64QImode:
14754 if (TARGET_AVX512BW)
14755 {
14756 tmp = gen_reg_rtx (V32QImode);
14757 if (elt < 32)
14758 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14759 else
14760 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14761 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14762 return;
14763 }
14764 break;
14765
14766 case E_V16SFmode:
14767 tmp = gen_reg_rtx (V8SFmode);
14768 if (elt < 8)
14769 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14770 else
14771 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14772 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14773 return;
14774
14775 case E_V8DFmode:
14776 tmp = gen_reg_rtx (V4DFmode);
14777 if (elt < 4)
14778 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14779 else
14780 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14781 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14782 return;
14783
14784 case E_V16SImode:
14785 tmp = gen_reg_rtx (V8SImode);
14786 if (elt < 8)
14787 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14788 else
14789 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14790 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14791 return;
14792
14793 case E_V8DImode:
14794 tmp = gen_reg_rtx (V4DImode);
14795 if (elt < 4)
14796 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14797 else
14798 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14799 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14800 return;
14801
14802 case E_V8QImode:
14803 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14804 /* ??? Could extract the appropriate HImode element and shift. */
14805 break;
14806
14807 default:
14808 break;
14809 }
14810
14811 if (use_vec_extr)
14812 {
14813 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14814 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14815
14816 /* Let the rtl optimizers know about the zero extension performed. */
14817 if (inner_mode == QImode || inner_mode == HImode)
14818 {
14819 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14820 target = gen_lowpart (SImode, target);
14821 }
14822
14823 emit_insn (gen_rtx_SET (target, tmp));
14824 }
14825 else
14826 {
14827 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14828
14829 emit_move_insn (mem, vec);
14830
14831 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14832 emit_move_insn (target, tmp);
14833 }
14834}
14835
14836/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14837 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14838 The upper bits of DEST are undefined, though they shouldn't cause
14839 exceptions (some bits from src or all zeros are ok). */
14840
14841static void
14842emit_reduc_half (rtx dest, rtx src, int i)
14843{
14844 rtx tem, d = dest;
14845 switch (GET_MODE (src))
14846 {
14847 case E_V4SFmode:
14848 if (i == 128)
14849 tem = gen_sse_movhlps (dest, src, src);
14850 else
14851 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14852 GEN_INT (1 + 4), GEN_INT (1 + 4));
14853 break;
14854 case E_V2DFmode:
14855 tem = gen_vec_interleave_highv2df (dest, src, src);
14856 break;
14857 case E_V16QImode:
14858 case E_V8HImode:
14859 case E_V4SImode:
14860 case E_V2DImode:
14861 d = gen_reg_rtx (V1TImode);
14862 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14863 GEN_INT (i / 2));
14864 break;
14865 case E_V8SFmode:
14866 if (i == 256)
14867 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14868 else
14869 tem = gen_avx_shufps256 (dest, src, src,
14870 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14871 break;
14872 case E_V4DFmode:
14873 if (i == 256)
14874 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14875 else
14876 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14877 break;
14878 case E_V32QImode:
14879 case E_V16HImode:
14880 case E_V8SImode:
14881 case E_V4DImode:
14882 if (i == 256)
14883 {
14884 if (GET_MODE (dest) != V4DImode)
14885 d = gen_reg_rtx (V4DImode);
14886 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14887 gen_lowpart (V4DImode, src),
14888 const1_rtx);
14889 }
14890 else
14891 {
14892 d = gen_reg_rtx (V2TImode);
14893 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14894 GEN_INT (i / 2));
14895 }
14896 break;
14897 case E_V64QImode:
14898 case E_V32HImode:
14899 if (i < 64)
14900 {
14901 d = gen_reg_rtx (V4TImode);
14902 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
14903 GEN_INT (i / 2));
14904 break;
14905 }
14906 /* FALLTHRU */
14907 case E_V16SImode:
14908 case E_V16SFmode:
14909 case E_V8DImode:
14910 case E_V8DFmode:
14911 if (i > 128)
14912 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14913 gen_lowpart (V16SImode, src),
14914 gen_lowpart (V16SImode, src),
14915 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14916 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14917 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14918 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14919 GEN_INT (0xC), GEN_INT (0xD),
14920 GEN_INT (0xE), GEN_INT (0xF),
14921 GEN_INT (0x10), GEN_INT (0x11),
14922 GEN_INT (0x12), GEN_INT (0x13),
14923 GEN_INT (0x14), GEN_INT (0x15),
14924 GEN_INT (0x16), GEN_INT (0x17));
14925 else
14926 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14927 gen_lowpart (V16SImode, src),
14928 GEN_INT (i == 128 ? 0x2 : 0x1),
14929 GEN_INT (0x3),
14930 GEN_INT (0x3),
14931 GEN_INT (0x3),
14932 GEN_INT (i == 128 ? 0x6 : 0x5),
14933 GEN_INT (0x7),
14934 GEN_INT (0x7),
14935 GEN_INT (0x7),
14936 GEN_INT (i == 128 ? 0xA : 0x9),
14937 GEN_INT (0xB),
14938 GEN_INT (0xB),
14939 GEN_INT (0xB),
14940 GEN_INT (i == 128 ? 0xE : 0xD),
14941 GEN_INT (0xF),
14942 GEN_INT (0xF),
14943 GEN_INT (0xF));
14944 break;
14945 default:
14946 gcc_unreachable ();
14947 }
14948 emit_insn (tem);
14949 if (d != dest)
14950 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
14951}
14952
14953/* Expand a vector reduction. FN is the binary pattern to reduce;
14954 DEST is the destination; IN is the input vector. */
14955
14956void
14957ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
14958{
14959 rtx half, dst, vec = in;
14960 machine_mode mode = GET_MODE (in);
14961 int i;
14962
14963 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
14964 if (TARGET_SSE4_1
14965 && mode == V8HImode
14966 && fn == gen_uminv8hi3)
14967 {
14968 emit_insn (gen_sse4_1_phminposuw (dest, in));
14969 return;
14970 }
14971
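/* Otherwise reduce by repeated halving: each emit_reduc_half moves the upper
   half of the live elements down and FN combines it with the previous vector,
   until element 0 of DEST holds the result. */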
14972 for (i = GET_MODE_BITSIZE (mode);
14973 i > GET_MODE_UNIT_BITSIZE (mode);
14974 i >>= 1)
14975 {
14976 half = gen_reg_rtx (mode);
14977 emit_reduc_half (half, vec, i);
14978 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
14979 dst = dest;
14980 else
14981 dst = gen_reg_rtx (mode);
14982 emit_insn (fn (dst, half, vec));
14983 vec = dst;
14984 }
14985}
14986
14987 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
14988 the FP status register is set. */
14989
14990void
14991ix86_emit_fp_unordered_jump (rtx label)
14992{
14993 rtx reg = gen_reg_rtx (HImode);
14994 rtx_insn *insn;
14995 rtx temp;
14996
14997 emit_insn (gen_x86_fnstsw_1 (reg));
14998
14999 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15000 {
15001 emit_insn (gen_x86_sahf_1 (reg));
15002
15003 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15004 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15005 }
15006 else
15007 {
15008 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15009
15010 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15011 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15012 }
15013
15014 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15015 gen_rtx_LABEL_REF (VOIDmode, label),
15016 pc_rtx);
15017 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15018 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15019 JUMP_LABEL (insn) = label;
15020}
15021
15022/* Output code to perform a sinh XFmode calculation. */
15023
15024void ix86_emit_i387_sinh (rtx op0, rtx op1)
15025{
15026 rtx e1 = gen_reg_rtx (XFmode);
15027 rtx e2 = gen_reg_rtx (XFmode);
15028 rtx scratch = gen_reg_rtx (HImode);
15029 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15030 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15031 rtx cst1, tmp;
15032 rtx_code_label *jump_label = gen_label_rtx ();
15033 rtx_insn *insn;
15034
15035 /* scratch = fxam (op1) */
15036 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15037
15038 /* e1 = expm1 (|op1|) */
15039 emit_insn (gen_absxf2 (e2, op1));
15040 emit_insn (gen_expm1xf2 (e1, e2));
15041
15042 /* e2 = e1 / (e1 + 1.0) + e1 */
15043 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15044 emit_insn (gen_addxf3 (e2, e1, cst1));
15045 emit_insn (gen_divxf3 (e2, e1, e2));
15046 emit_insn (gen_addxf3 (e2, e2, e1));
15047
15048 /* flags = signbit (op1) */
15049 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15050
15051 /* if (flags) then e2 = -e2 */
15052 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15053 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15054 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15055 pc_rtx);
15056 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15057 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15058 JUMP_LABEL (insn) = jump_label;
15059
15060 emit_insn (gen_negxf2 (e2, e2));
15061
15062 emit_label (jump_label);
15063 LABEL_NUSES (jump_label) = 1;
15064
15065 /* op0 = 0.5 * e2 */
15066 half = force_reg (XFmode, half);
15067 emit_insn (gen_mulxf3 (op0, e2, half));
15068}
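/* Derivation sketch for the sequence above: with u = expm1 (|op1|),
     sinh (|op1|) = (e^|op1| - e^-|op1|) / 2
                  = ((u + 1) - 1 / (u + 1)) / 2
                  = 0.5 * (u / (u + 1) + u),
   which is exactly the e1/e2 computation; the fxam sign bit then restores
   the sign of the argument (sinh is odd).  Using expm1 avoids the
   cancellation of exp (x) - exp (-x) for small |x|.  */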
15069
15070/* Output code to perform a cosh XFmode calculation. */
15071
15072void ix86_emit_i387_cosh (rtx op0, rtx op1)
15073{
15074 rtx e1 = gen_reg_rtx (XFmode);
15075 rtx e2 = gen_reg_rtx (XFmode);
15076 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15077 rtx cst1;
15078
15079 /* e1 = exp (op1) */
15080 emit_insn (gen_expxf2 (e1, op1));
15081
15082 /* e2 = e1 + 1.0 / e1 */
15083 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15084 emit_insn (gen_divxf3 (e2, cst1, e1));
15085 emit_insn (gen_addxf3 (e2, e1, e2));
15086
15087 /* op0 = 0.5 * e2 */
15088 half = force_reg (XFmode, half);
15089 emit_insn (gen_mulxf3 (op0, e2, half));
15090}
15091
15092/* Output code to perform a tanh XFmode calculation. */
15093
15094void ix86_emit_i387_tanh (rtx op0, rtx op1)
15095{
15096 rtx e1 = gen_reg_rtx (XFmode);
15097 rtx e2 = gen_reg_rtx (XFmode);
15098 rtx scratch = gen_reg_rtx (HImode);
15099 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15100 rtx cst2, tmp;
15101 rtx_code_label *jump_label = gen_label_rtx ();
15102 rtx_insn *insn;
15103
15104 /* scratch = fxam (op1) */
15105 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15106
15107 /* e1 = expm1 (-|2 * op1|) */
15108 emit_insn (gen_addxf3 (e2, op1, op1));
15109 emit_insn (gen_absxf2 (e2, e2));
15110 emit_insn (gen_negxf2 (e2, e2));
15111 emit_insn (gen_expm1xf2 (e1, e2));
15112
15113 /* e2 = e1 / (e1 + 2.0) */
15114 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15115 emit_insn (gen_addxf3 (e2, e1, cst2));
15116 emit_insn (gen_divxf3 (e2, e1, e2));
15117
15118 /* flags = signbit (op1) */
15119 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15120
15121 /* if (!flags) then e2 = -e2 */
15122 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15123 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15124 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15125 pc_rtx);
15126 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15127 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15128 JUMP_LABEL (insn) = jump_label;
15129
15130 emit_insn (gen_negxf2 (e2, e2));
15131
15132 emit_label (jump_label);
15133 LABEL_NUSES (jump_label) = 1;
15134
15135 emit_move_insn (op0, e2);
15136}
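/* Derivation sketch: with u = expm1 (-2 * |op1|),
     tanh (|op1|) = (1 - e^(-2*|op1|)) / (1 + e^(-2*|op1|)) = -u / (u + 2),
   so e2 = u / (u + 2) is -tanh (|op1|), and the conditional negation keyed
   on the fxam sign bit yields tanh (op1) for either sign of the input.  */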
15137
15138/* Output code to perform an asinh XFmode calculation. */
15139
15140void ix86_emit_i387_asinh (rtx op0, rtx op1)
15141{
15142 rtx e1 = gen_reg_rtx (XFmode);
15143 rtx e2 = gen_reg_rtx (XFmode);
15144 rtx scratch = gen_reg_rtx (HImode);
15145 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15146 rtx cst1, tmp;
15147 rtx_code_label *jump_label = gen_label_rtx ();
15148 rtx_insn *insn;
15149
15150 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15151 emit_insn (gen_mulxf3 (e1, op1, op1));
15152 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15153 emit_insn (gen_addxf3 (e2, e1, cst1));
15154 emit_insn (gen_sqrtxf2 (e2, e2));
15155 emit_insn (gen_addxf3 (e2, e2, cst1));
15156
15157 /* e1 = e1 / e2 */
15158 emit_insn (gen_divxf3 (e1, e1, e2));
15159
15160 /* scratch = fxam (op1) */
15161 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15162
15163 /* e1 = e1 + |op1| */
15164 emit_insn (gen_absxf2 (e2, op1));
15165 emit_insn (gen_addxf3 (e1, e1, e2));
15166
15167 /* e2 = log1p (e1) */
15168 ix86_emit_i387_log1p (e2, e1);
15169
15170 /* flags = signbit (op1) */
15171 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15172
15173 /* if (flags) then e2 = -e2 */
15174 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15175 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15176 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15177 pc_rtx);
15178 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15179 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15180 JUMP_LABEL (insn) = jump_label;
15181
15182 emit_insn (gen_negxf2 (e2, e2));
15183
15184 emit_label (jump_label);
15185 LABEL_NUSES (jump_label) = 1;
15186
15187 emit_move_insn (op0, e2);
15188}
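/* Derivation sketch: with t = sqrt (op1*op1 + 1.0),
     e1 = op1*op1 / (t + 1.0) + |op1| = (t - 1.0) + |op1|,
   so log1p (e1) = log (t + |op1|) = asinh (|op1|); the fxam sign bit then
   restores the sign (asinh is odd).  Dividing by t + 1.0 rather than
   forming t - 1.0 directly avoids cancellation for small |op1|.  */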
15189
15190/* Output code to perform an acosh XFmode calculation. */
15191
15192void ix86_emit_i387_acosh (rtx op0, rtx op1)
15193{
15194 rtx e1 = gen_reg_rtx (XFmode);
15195 rtx e2 = gen_reg_rtx (XFmode);
15196 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15197
15198 /* e2 = sqrt (op1 + 1.0) */
15199 emit_insn (gen_addxf3 (e2, op1, cst1));
15200 emit_insn (gen_sqrtxf2 (e2, e2));
15201
15202 /* e1 = sqrt (op1 - 1.0) */
15203 emit_insn (gen_subxf3 (e1, op1, cst1));
15204 emit_insn (gen_sqrtxf2 (e1, e1));
15205
15206 /* e1 = e1 * e2 */
15207 emit_insn (gen_mulxf3 (e1, e1, e2));
15208
15209 /* e1 = e1 + op1 */
15210 emit_insn (gen_addxf3 (e1, e1, op1));
15211
15212 /* op0 = log (e1) */
15213 emit_insn (gen_logxf2 (op0, e1));
15214}
15215
15216/* Output code to perform an atanh XFmode calculation. */
15217
15218void ix86_emit_i387_atanh (rtx op0, rtx op1)
15219{
15220 rtx e1 = gen_reg_rtx (XFmode);
15221 rtx e2 = gen_reg_rtx (XFmode);
15222 rtx scratch = gen_reg_rtx (HImode);
15223 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15224 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15225 rtx cst1, tmp;
15226 rtx_code_label *jump_label = gen_label_rtx ();
15227 rtx_insn *insn;
15228
15229 /* scratch = fxam (op1) */
15230 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15231
15232 /* e2 = |op1| */
15233 emit_insn (gen_absxf2 (e2, op1));
15234
15235 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15236 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15237 emit_insn (gen_addxf3 (e1, e2, cst1));
15238 emit_insn (gen_addxf3 (e2, e2, e2));
15239 emit_insn (gen_negxf2 (e2, e2));
15240 emit_insn (gen_divxf3 (e1, e2, e1));
15241
15242 /* e2 = log1p (e1) */
15243 ix86_emit_i387_log1p (e2, e1);
15244
15245 /* flags = signbit (op1) */
15246 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15247
15248 /* if (!flags) then e2 = -e2 */
15249 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15250 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15251 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15252 pc_rtx);
15253 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15254 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15255 JUMP_LABEL (insn) = jump_label;
15256
15257 emit_insn (gen_negxf2 (e2, e2));
15258
15259 emit_label (jump_label);
15260 LABEL_NUSES (jump_label) = 1;
15261
15262 /* op0 = 0.5 * e2 */
15263 half = force_reg (XFmode, half);
15264 emit_insn (gen_mulxf3 (op0, e2, half));
15265}
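/* Derivation sketch: with a = |op1|,
     e1 = -2*a / (a + 1.0), so
     log1p (e1) = log ((1 - a) / (1 + a)) = -2 * atanh (a);
   after the conditional negation and the final multiplication by 0.5 the
   result is atanh (op1) for either sign of the input.  */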
15266
15267/* Output code to perform a log1p XFmode calculation. */
15268
15269void ix86_emit_i387_log1p (rtx op0, rtx op1)
15270{
15271 rtx_code_label *label1 = gen_label_rtx ();
15272 rtx_code_label *label2 = gen_label_rtx ();
15273
15274 rtx tmp = gen_reg_rtx (XFmode);
15275 rtx res = gen_reg_rtx (XFmode);
15276 rtx cst, cstln2, cst1;
15277 rtx_insn *insn;
15278
15279 cst = const_double_from_real_value
15280 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15281 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15282
15283 emit_insn (gen_absxf2 (tmp, op1));
15284
15285 cst = force_reg (XFmode, cst);
15286 ix86_expand_branch (GE, tmp, cst, label1);
15287 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15288 insn = get_last_insn ();
15289 JUMP_LABEL (insn) = label1;
15290
15291 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15292 emit_jump (label2);
15293
15294 emit_label (label1);
15295 LABEL_NUSES (label1) = 1;
15296
15297 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15298 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15299 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15300
15301 emit_label (label2);
15302 LABEL_NUSES (label2) = 1;
15303
15304 emit_move_insn (op0, res);
15305}
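/* A note on the constant above: 0.29289321... is 1 - sqrt(2)/2, the limit
   of the argument range for which the i387 fyl2xp1 instruction is
   specified.  For larger |op1| the code falls back to fyl2x on op1 + 1.0,
   where the rounding error of the explicit addition is no longer
   significant relative to the result.  */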
15306
15307/* Emit code for round calculation. */
15308void ix86_emit_i387_round (rtx op0, rtx op1)
15309{
15310 machine_mode inmode = GET_MODE (op1);
15311 machine_mode outmode = GET_MODE (op0);
15312 rtx e1 = gen_reg_rtx (XFmode);
15313 rtx e2 = gen_reg_rtx (XFmode);
15314 rtx scratch = gen_reg_rtx (HImode);
15315 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15316 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15317 rtx res = gen_reg_rtx (outmode);
15318 rtx_code_label *jump_label = gen_label_rtx ();
15319 rtx (*floor_insn) (rtx, rtx);
15320 rtx (*neg_insn) (rtx, rtx);
15321 rtx_insn *insn;
15322 rtx tmp;
15323
15324 switch (inmode)
15325 {
15326 case E_SFmode:
15327 case E_DFmode:
15328 tmp = gen_reg_rtx (XFmode);
15329
15330 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15331 op1 = tmp;
15332 break;
15333 case E_XFmode:
15334 break;
15335 default:
15336 gcc_unreachable ();
15337 }
15338
15339 switch (outmode)
15340 {
15341 case E_SFmode:
15342 floor_insn = gen_frndintxf2_floor;
15343 neg_insn = gen_negsf2;
15344 break;
15345 case E_DFmode:
15346 floor_insn = gen_frndintxf2_floor;
15347 neg_insn = gen_negdf2;
15348 break;
15349 case E_XFmode:
15350 floor_insn = gen_frndintxf2_floor;
15351 neg_insn = gen_negxf2;
15352 break;
15353 case E_HImode:
15354 floor_insn = gen_lfloorxfhi2;
15355 neg_insn = gen_neghi2;
15356 break;
15357 case E_SImode:
15358 floor_insn = gen_lfloorxfsi2;
15359 neg_insn = gen_negsi2;
15360 break;
15361 case E_DImode:
15362 floor_insn = gen_lfloorxfdi2;
15363 neg_insn = gen_negdi2;
15364 break;
15365 default:
15366 gcc_unreachable ();
15367 }
15368
15369 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15370
15371 /* scratch = fxam(op1) */
15372 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15373
15374 /* e1 = fabs(op1) */
15375 emit_insn (gen_absxf2 (e1, op1));
15376
15377 /* e2 = e1 + 0.5 */
15378 half = force_reg (XFmode, half);
15379 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15380
15381 /* res = floor(e2) */
15382 switch (outmode)
15383 {
15384 case E_SFmode:
15385 case E_DFmode:
15386 {
15387 tmp = gen_reg_rtx (XFmode);
15388
15389 emit_insn (floor_insn (tmp, e2));
15390 emit_insn (gen_rtx_SET (res,
15391 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15392 UNSPEC_TRUNC_NOOP)));
15393 }
15394 break;
15395 default:
15396 emit_insn (floor_insn (res, e2));
15397 }
15398
15399 /* flags = signbit(a) */
15400 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15401
15402 /* if (flags) then res = -res */
15403 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15404 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15405 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15406 pc_rtx);
15407 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15408 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15409 JUMP_LABEL (insn) = jump_label;
15410
15411 emit_insn (neg_insn (res, res));
15412
15413 emit_label (jump_label);
15414 LABEL_NUSES (jump_label) = 1;
15415
15416 emit_move_insn (op0, res);
15417}
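/* Worked example of the identity above: round (-2.5)
     = sgn (-2.5) * floor (|-2.5| + 0.5) = -floor (3.0) = -3.0,
   i.e. halfway cases are rounded away from zero, as round () requires.  */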
15418
15419/* Output code to perform a Newton-Raphson approximation of a single precision
15420 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15421
15422void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15423{
15424 rtx x0, x1, e0, e1;
15425
15426 x0 = gen_reg_rtx (mode);
15427 e0 = gen_reg_rtx (mode);
15428 e1 = gen_reg_rtx (mode);
15429 x1 = gen_reg_rtx (mode);
15430
15431 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15432
15433 b = force_reg (mode, b);
15434
15435 /* x0 = rcp(b) estimate */
15436 if (mode == V16SFmode || mode == V8DFmode)
15437 {
15438 if (TARGET_AVX512ER)
15439 {
15440 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15441 UNSPEC_RCP28)));
15442 /* res = a * x0 */
15443 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15444 return;
15445 }
15446 else
15447 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15448 UNSPEC_RCP14)));
15449 }
15450 else
15451 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15452 UNSPEC_RCP)));
15453
15454 /* e0 = x0 * b */
15455 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15456
15457 /* e0 = x0 * e0 */
15458 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15459
15460 /* e1 = x0 + x0 */
15461 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15462
15463 /* x1 = e1 - e0 */
15464 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15465
15466 /* res = a * x1 */
15467 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15468}
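/* A sketch of the Newton-Raphson step used above: if x0 = (1 - e) / b is
   the rcp estimate with relative error e, then
     x1 = 2*x0 - b*x0*x0 = x0 * (2 - b*x0) = (1 - e*e) / b,
   so one iteration roughly squares the error, turning the ~12-bit
   hardware estimate into a result good to roughly single precision.  */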
15469
15470/* Output code to perform a Newton-Raphson approximation of a
15471 single precision floating point [reciprocal] square root. */
15472
15473void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15474{
15475 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15476 REAL_VALUE_TYPE r;
15477 int unspec;
15478
15479 x0 = gen_reg_rtx (mode);
15480 e0 = gen_reg_rtx (mode);
15481 e1 = gen_reg_rtx (mode);
15482 e2 = gen_reg_rtx (mode);
15483 e3 = gen_reg_rtx (mode);
15484
15485 if (TARGET_AVX512ER && mode == V16SFmode)
15486 {
15487 if (recip)
15488 /* res = rsqrt28(a) estimate */
15489 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15490 UNSPEC_RSQRT28)));
15491 else
15492 {
15493 /* x0 = rsqrt28(a) estimate */
15494 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15495 UNSPEC_RSQRT28)));
15496 /* res = rcp28(x0) estimate */
15497 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15498 UNSPEC_RCP28)));
15499 }
15500 return;
15501 }
15502
15503 real_from_integer (&r, VOIDmode, -3, SIGNED);
15504 mthree = const_double_from_real_value (r, SFmode);
15505
15506 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15507 mhalf = const_double_from_real_value (r, SFmode);
15508 unspec = UNSPEC_RSQRT;
15509
15510 if (VECTOR_MODE_P (mode))
15511 {
15512 mthree = ix86_build_const_vector (mode, true, mthree);
15513 mhalf = ix86_build_const_vector (mode, true, mhalf);
15514 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15515 if (GET_MODE_SIZE (mode) == 64)
15516 unspec = UNSPEC_RSQRT14;
15517 }
15518
15519 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15520 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15521
15522 a = force_reg (mode, a);
15523
15524 /* x0 = rsqrt(a) estimate */
15525 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15526 unspec)));
15527
15528 /* If a == 0.0, mask out the infinite rsqrt estimate so that sqrt (0.0) does not produce a NaN. */
15529 if (!recip)
15530 {
15531 rtx zero = force_reg (mode, CONST0_RTX(mode));
15532 rtx mask;
15533
15534 /* Handle masked compare. */
15535 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15536 {
15537 mask = gen_reg_rtx (HImode);
15538 /* Imm value 0x4 corresponds to not-equal comparison. */
15539 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15540 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15541 }
15542 else
15543 {
15544 mask = gen_reg_rtx (mode);
15545 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15546 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15547 }
15548 }
15549
15550 /* e0 = x0 * a */
15551 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15552 /* e1 = e0 * x0 */
15553 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15554
15555 /* e2 = e1 - 3. */
15556 mthree = force_reg (mode, mthree);
15557 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15558
15559 mhalf = force_reg (mode, mhalf);
15560 if (recip)
15561 /* e3 = -.5 * x0 */
15562 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15563 else
15564 /* e3 = -.5 * e0 */
15565 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15566 /* ret = e2 * e3 */
15567 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15568}
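/* A sketch of the Newton-Raphson step for rsqrt used above: with
   x0 ~ 1/sqrt(a), the classic iteration is
     x1 = 0.5 * x0 * (3 - a*x0*x0) = -0.5 * x0 * (a*x0*x0 - 3),
   which is e3 * e2 in the recip case; using e0 = a*x0 in place of x0 for
   the final scaling gives a * x1 ~ a / sqrt(a) = sqrt(a), the sqrt case.  */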
15569
15570/* Expand fabs (OP0) and return a new rtx that holds the result. The
15571 mask for masking out the sign-bit is stored in *SMASK, if that is
15572 non-null. */
15573
15574static rtx
15575ix86_expand_sse_fabs (rtx op0, rtx *smask)
15576{
15577 machine_mode vmode, mode = GET_MODE (op0);
15578 rtx xa, mask;
15579
15580 xa = gen_reg_rtx (mode);
15581 if (mode == SFmode)
15582 vmode = V4SFmode;
15583 else if (mode == DFmode)
15584 vmode = V2DFmode;
15585 else
15586 vmode = mode;
15587 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15588 if (!VECTOR_MODE_P (mode))
15589 {
15590 /* We need to generate a scalar mode mask in this case. */
15591 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15592 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15593 mask = gen_reg_rtx (mode);
15594 emit_insn (gen_rtx_SET (mask, tmp));
15595 }
15596 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15597
15598 if (smask)
15599 *smask = mask;
15600
15601 return xa;
15602}
15603
15604/* Expands a comparison of OP0 with OP1 using comparison code CODE,
15605 swapping the operands if SWAP_OPERANDS is true. The expanded
15606 code is a forward jump to a newly created label in case the
15607 comparison is true. The generated label rtx is returned. */
15608static rtx_code_label *
15609ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15610 bool swap_operands)
15611{
15612 bool unordered_compare = ix86_unordered_fp_compare (code);
15613 rtx_code_label *label;
15614 rtx tmp, reg;
15615
15616 if (swap_operands)
15617 std::swap (op0, op1);
15618
15619 label = gen_label_rtx ();
15620 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15621 if (unordered_compare)
15622 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15623 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15624 emit_insn (gen_rtx_SET (reg, tmp));
15625 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15626 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15627 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15628 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15629 JUMP_LABEL (tmp) = label;
15630
15631 return label;
15632}
15633
15634/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15635 using comparison code CODE. Operands are swapped for the comparison if
15636 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15637static rtx
15638ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15639 bool swap_operands)
15640{
15641 rtx (*insn)(rtx, rtx, rtx, rtx);
15642 machine_mode mode = GET_MODE (op0);
15643 rtx mask = gen_reg_rtx (mode);
15644
15645 if (swap_operands)
15646 std::swap (op0, op1);
15647
15648 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15649
15650 emit_insn (insn (mask, op0, op1,
15651 gen_rtx_fmt_ee (code, mode, op0, op1)));
15652 return mask;
15653}
15654
15655/* Expand copysign from SIGN to the positive value ABS_VALUE
15656 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15657 the sign-bit. */
15658
15659static void
15660ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15661{
15662 machine_mode mode = GET_MODE (sign);
15663 rtx sgn = gen_reg_rtx (mode);
15664 if (mask == NULL_RTX)
15665 {
15666 machine_mode vmode;
15667
15668 if (mode == SFmode)
15669 vmode = V4SFmode;
15670 else if (mode == DFmode)
15671 vmode = V2DFmode;
15672 else
15673 vmode = mode;
15674
15675 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15676 if (!VECTOR_MODE_P (mode))
15677 {
15678 /* We need to generate a scalar mode mask in this case. */
15679 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15680 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15681 mask = gen_reg_rtx (mode);
15682 emit_insn (gen_rtx_SET (mask, tmp));
15683 }
15684 }
15685 else
15686 mask = gen_rtx_NOT (mode, mask);
15687 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15688 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15689}
15690
15691/* Expand SSE sequence for computing lround from OP1 storing
15692 into OP0. */
15693
15694void
15695ix86_expand_lround (rtx op0, rtx op1)
15696{
15697 /* C code for the stuff we're doing below:
15698 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15699 return (long)tmp;
15700 */
15701 machine_mode mode = GET_MODE (op1);
15702 const struct real_format *fmt;
15703 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15704 rtx adj;
15705
15706 /* load nextafter (0.5, 0.0) */
15707 fmt = REAL_MODE_FORMAT (mode);
15708 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15709 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15710
15711 /* adj = copysign (0.5, op1) */
15712 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15713 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15714
15715 /* adj = op1 + adj */
15716 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15717
15718 /* op0 = (imode)adj */
15719 expand_fix (op0, adj, 0);
15720}
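/* Why nextafter (0.5, 0.0) rather than 0.5: for the largest representable
   value just below 0.5, op1 + 0.5 rounds up to 1.0 in the addition, so a
   plain 0.5 adjustment would yield lround == 1 instead of 0.  Adding the
   predecessor of 0.5 keeps such inputs below 1.0 while still carrying
   every op1 >= 0.5 across the next integer boundary.  */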
15721
15722/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15723 into OPERAND0. */
15724
15725void
15726ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15727{
15728 /* C code for the stuff we're doing below (for do_floor):
15729 xi = (long)op1;
15730 xi -= (double)xi > op1 ? 1 : 0;
15731 return xi;
15732 */
15733 machine_mode fmode = GET_MODE (op1);
15734 machine_mode imode = GET_MODE (op0);
15735 rtx ireg, freg, tmp;
15736 rtx_code_label *label;
15737
15738 /* reg = (long)op1 */
15739 ireg = gen_reg_rtx (imode);
15740 expand_fix (ireg, op1, 0);
15741
15742 /* freg = (double)reg */
15743 freg = gen_reg_rtx (fmode);
15744 expand_float (freg, ireg, 0);
15745
15746 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15747 label = ix86_expand_sse_compare_and_jump (UNLE,
15748 freg, op1, !do_floor);
15749 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15750 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15751 emit_move_insn (ireg, tmp);
15752
15753 emit_label (label);
15754 LABEL_NUSES (label) = 1;
15755
15756 emit_move_insn (op0, ireg);
15757}
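/* Worked example for the floor case: op1 = -1.25 gives ireg = -1 (the
   conversion truncates toward zero) and freg = -1.0 > op1, so the UNLE
   branch is not taken and ireg is adjusted to -2 = lfloor (-1.25).  For
   ceil the comparison operands are swapped and 1 is added instead.  */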
15758
15759/* Generate and return a rtx of mode MODE for 2**n where n is the number
15760 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
15761
15762static rtx
15763ix86_gen_TWO52 (machine_mode mode)
15764{
15765 REAL_VALUE_TYPE TWO52r;
15766 rtx TWO52;
15767
15768 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15769 TWO52 = const_double_from_real_value (TWO52r, mode);
15770 TWO52 = force_reg (mode, TWO52);
15771
15772 return TWO52;
15773}
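/* This constant is used for the usual "x + 2^p - 2^p" rounding idiom:
   provided |x| < 2^p (p = 52 for DFmode, 23 for SFmode), adding 2^p
   pushes every fractional bit out of the significand, so the addition
   itself rounds x to an integer in the current rounding mode and the
   following subtraction recovers that integer exactly.  */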
15774
15775/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15776
15777void
15778ix86_expand_rint (rtx operand0, rtx operand1)
15779{
15780 /* C code for the stuff we're doing below:
15781 xa = fabs (operand1);
15782 if (!isless (xa, 2**52))
15783 return operand1;
15784 two52 = 2**52;
15785 if (flag_rounding_math)
15786 {
15787 two52 = copysign (two52, operand1);
15788 xa = operand1;
15789 }
15790 xa = xa + two52 - two52;
15791 return copysign (xa, operand1);
15792 */
15793 machine_mode mode = GET_MODE (operand0);
15794 rtx res, xa, TWO52, two52, mask;
15795 rtx_code_label *label;
15796
15797 res = gen_reg_rtx (mode);
15798 emit_move_insn (res, operand1);
15799
15800 /* xa = abs (operand1) */
15801 xa = ix86_expand_sse_fabs (res, &mask);
15802
15803 /* if (!isless (xa, TWO52)) goto label; */
15804 TWO52 = ix86_gen_TWO52 (mode);
15805 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15806
15807 two52 = TWO52;
15808 if (flag_rounding_math)
15809 {
15810 two52 = gen_reg_rtx (mode);
15811 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15812 xa = res;
15813 }
15814
15815 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15816 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15817
15818 ix86_sse_copysign_to_positive (res, xa, res, mask);
15819
15820 emit_label (label);
15821 LABEL_NUSES (label) = 1;
15822
15823 emit_move_insn (operand0, res);
15824}
15825
15826/* Expand SSE2 sequence for computing floor or ceil
15827 from OPERAND1 storing into OPERAND0. */
15828void
15829ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15830{
15831 /* C code for the stuff we expand below.
15832 double xa = fabs (x), x2;
15833 if (!isless (xa, TWO52))
15834 return x;
15835 x2 = (double)(long)x;
15836 Compensate. Floor:
15837 if (x2 > x)
15838 x2 -= 1;
15839 Compensate. Ceil:
15840 if (x2 < x)
15841 x2 += 1;
15842 if (HONOR_SIGNED_ZEROS (mode))
15843 return copysign (x2, x);
15844 return x2;
15845 */
15846 machine_mode mode = GET_MODE (operand0);
15847 rtx xa, xi, TWO52, tmp, one, res, mask;
15848 rtx_code_label *label;
15849
15850 TWO52 = ix86_gen_TWO52 (mode);
15851
15852 /* Temporary for holding the result, initialized to the input
15853 operand to ease control flow. */
15854 res = gen_reg_rtx (mode);
15855 emit_move_insn (res, operand1);
15856
15857 /* xa = abs (operand1) */
15858 xa = ix86_expand_sse_fabs (res, &mask);
15859
15860 /* if (!isless (xa, TWO52)) goto label; */
15861 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15862
15863 /* xa = (double)(long)x */
15864 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15865 expand_fix (xi, res, 0);
15866 expand_float (xa, xi, 0);
15867
15868 /* generate 1.0 */
15869 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15870
15871 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15872 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15873 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15874 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15875 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15876 emit_move_insn (res, tmp);
15877
15878 if (HONOR_SIGNED_ZEROS (mode))
15879 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15880
15881 emit_label (label);
15882 LABEL_NUSES (label) = 1;
15883
15884 emit_move_insn (operand0, res);
15885}
15886
15887/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15888 into OPERAND0 without relying on DImode truncation via cvttsd2siq
15889 that is only available on 64bit targets. */
15890void
15891ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15892{
15893 /* C code for the stuff we expand below.
15894 double xa = fabs (x), x2;
15895 if (!isless (xa, TWO52))
15896 return x;
15897 xa = xa + TWO52 - TWO52;
15898 x2 = copysign (xa, x);
15899 Compensate. Floor:
15900 if (x2 > x)
15901 x2 -= 1;
15902 Compensate. Ceil:
15903 if (x2 < x)
15904 x2 += 1;
15905 if (HONOR_SIGNED_ZEROS (mode))
15906 x2 = copysign (x2, x);
15907 return x2;
15908 */
15909 machine_mode mode = GET_MODE (operand0);
15910 rtx xa, TWO52, tmp, one, res, mask;
15911 rtx_code_label *label;
15912
15913 TWO52 = ix86_gen_TWO52 (mode);
15914
15915 /* Temporary for holding the result, initialized to the input
15916 operand to ease control flow. */
15917 res = gen_reg_rtx (mode);
15918 emit_move_insn (res, operand1);
15919
15920 /* xa = abs (operand1) */
15921 xa = ix86_expand_sse_fabs (res, &mask);
15922
15923 /* if (!isless (xa, TWO52)) goto label; */
15924 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15925
15926 /* xa = xa + TWO52 - TWO52; */
15927 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15928 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15929
15930 /* xa = copysign (xa, operand1) */
15931 ix86_sse_copysign_to_positive (xa, xa, res, mask);
15932
15933 /* generate 1.0 */
15934 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15935
15936 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15937 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15938 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15939 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15940 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15941 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
15942 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15943 emit_move_insn (res, tmp);
15944
15945 emit_label (label);
15946 LABEL_NUSES (label) = 1;
15947
15948 emit_move_insn (operand0, res);
15949}
15950
15951/* Expand SSE sequence for computing trunc
15952 from OPERAND1 storing into OPERAND0. */
15953void
15954ix86_expand_trunc (rtx operand0, rtx operand1)
15955{
15956 /* C code for SSE variant we expand below.
15957 double xa = fabs (x), x2;
15958 if (!isless (xa, TWO52))
15959 return x;
15960 x2 = (double)(long)x;
15961 if (HONOR_SIGNED_ZEROS (mode))
15962 return copysign (x2, x);
15963 return x2;
15964 */
15965 machine_mode mode = GET_MODE (operand0);
15966 rtx xa, xi, TWO52, res, mask;
15967 rtx_code_label *label;
15968
15969 TWO52 = ix86_gen_TWO52 (mode);
15970
15971 /* Temporary for holding the result, initialized to the input
15972 operand to ease control flow. */
15973 res = gen_reg_rtx (mode);
15974 emit_move_insn (res, operand1);
15975
15976 /* xa = abs (operand1) */
15977 xa = ix86_expand_sse_fabs (res, &mask);
15978
15979 /* if (!isless (xa, TWO52)) goto label; */
15980 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15981
15982 /* x = (double)(long)x */
15983 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15984 expand_fix (xi, res, 0);
15985 expand_float (res, xi, 0);
15986
15987 if (HONOR_SIGNED_ZEROS (mode))
15988 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15989
15990 emit_label (label);
15991 LABEL_NUSES (label) = 1;
15992
15993 emit_move_insn (operand0, res);
15994}
15995
15996/* Expand SSE sequence for computing trunc from OPERAND1 storing
15997 into OPERAND0 without relying on DImode truncation via cvttsd2siq
15998 that is only available on 64bit targets. */
15999void
16000ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16001{
16002 machine_mode mode = GET_MODE (operand0);
16003 rtx xa, mask, TWO52, one, res, smask, tmp;
16004 rtx_code_label *label;
16005
16006 /* C code for SSE variant we expand below.
16007 double xa = fabs (x), x2;
16008 if (!isless (xa, TWO52))
16009 return x;
16010 xa2 = xa + TWO52 - TWO52;
16011 Compensate:
16012 if (xa2 > xa)
16013 xa2 -= 1.0;
16014 x2 = copysign (xa2, x);
16015 return x2;
16016 */
16017
16018 TWO52 = ix86_gen_TWO52 (mode);
16019
16020 /* Temporary for holding the result, initialized to the input
16021 operand to ease control flow. */
16022 res = gen_reg_rtx (mode);
16023 emit_move_insn (res, operand1);
16024
16025 /* xa = abs (operand1) */
16026 xa = ix86_expand_sse_fabs (res, &smask);
16027
16028 /* if (!isless (xa, TWO52)) goto label; */
16029 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16030
16031 /* res = xa + TWO52 - TWO52; */
16032 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16033 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16034 emit_move_insn (res, tmp);
16035
16036 /* generate 1.0 */
16037 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16038
16039 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16040 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16041 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16042 tmp = expand_simple_binop (mode, MINUS,
16043 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16044 emit_move_insn (res, tmp);
16045
16046 /* res = copysign (res, operand1) */
16047 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16048
16049 emit_label (label);
16050 LABEL_NUSES (label) = 1;
16051
16052 emit_move_insn (operand0, res);
16053}
16054
16055/* Expand SSE sequence for computing round
16056 from OPERAND1 storing into OPERAND0. */
16057void
16058ix86_expand_round (rtx operand0, rtx operand1)
16059{
16060 /* C code for the stuff we're doing below:
16061 double xa = fabs (x);
16062 if (!isless (xa, TWO52))
16063 return x;
16064 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16065 return copysign (xa, x);
16066 */
16067 machine_mode mode = GET_MODE (operand0);
16068 rtx res, TWO52, xa, xi, half, mask;
16069 rtx_code_label *label;
16070 const struct real_format *fmt;
16071 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16072
16073 /* Temporary for holding the result, initialized to the input
16074 operand to ease control flow. */
16075 res = gen_reg_rtx (mode);
16076 emit_move_insn (res, operand1);
16077
16078 TWO52 = ix86_gen_TWO52 (mode);
16079 xa = ix86_expand_sse_fabs (res, &mask);
16080 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16081
16082 /* load nextafter (0.5, 0.0) */
16083 fmt = REAL_MODE_FORMAT (mode);
16084 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16085 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16086
16087 /* xa = xa + 0.5 */
16088 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16089 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16090
16091 /* xa = (double)(int64_t)xa */
16092 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16093 expand_fix (xi, xa, 0);
16094 expand_float (xa, xi, 0);
16095
16096 /* res = copysign (xa, operand1) */
16097 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16098
16099 emit_label (label);
16100 LABEL_NUSES (label) = 1;
16101
16102 emit_move_insn (operand0, res);
16103}
16104
16105/* Expand SSE sequence for computing round from OPERAND1 storing
16106 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16107 that is only available on 64bit targets. */
16108void
16109ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16110{
16111 /* C code for the stuff we expand below.
16112 double xa = fabs (x), xa2, x2;
16113 if (!isless (xa, TWO52))
16114 return x;
16115 Using the absolute value and copying back sign makes
16116 -0.0 -> -0.0 correct.
16117 xa2 = xa + TWO52 - TWO52;
16118 Compensate.
16119 dxa = xa2 - xa;
16120 if (dxa <= -0.5)
16121 xa2 += 1;
16122 else if (dxa > 0.5)
16123 xa2 -= 1;
16124 x2 = copysign (xa2, x);
16125 return x2;
16126 */
16127 machine_mode mode = GET_MODE (operand0);
16128 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16129 rtx_code_label *label;
16130
16131 TWO52 = ix86_gen_TWO52 (mode);
16132
16133 /* Temporary for holding the result, initialized to the input
16134 operand to ease control flow. */
16135 res = gen_reg_rtx (mode);
16136 emit_move_insn (res, operand1);
16137
16138 /* xa = abs (operand1) */
16139 xa = ix86_expand_sse_fabs (res, &mask);
16140
16141 /* if (!isless (xa, TWO52)) goto label; */
16142 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16143
16144 /* xa2 = xa + TWO52 - TWO52; */
16145 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16146 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16147
16148 /* dxa = xa2 - xa; */
16149 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16150
16151 /* generate 0.5, 1.0 and -0.5 */
16152 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16153 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16154 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16155 0, OPTAB_DIRECT);
16156
16157 /* Compensate. */
16158 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16159 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16160 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16161 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16162 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16163 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16164 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16165 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16166
16167 /* res = copysign (xa2, operand1) */
16168 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16169
16170 emit_label (label);
16171 LABEL_NUSES (label) = 1;
16172
16173 emit_move_insn (operand0, res);
16174}
16175
16176/* Expand SSE sequence for computing round
16177 from OP1 storing into OP0 using sse4 round insn. */
16178void
16179ix86_expand_round_sse4 (rtx op0, rtx op1)
16180{
16181 machine_mode mode = GET_MODE (op0);
16182 rtx e1, e2, res, half;
16183 const struct real_format *fmt;
16184 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16185 rtx (*gen_copysign) (rtx, rtx, rtx);
16186 rtx (*gen_round) (rtx, rtx, rtx);
16187
16188 switch (mode)
16189 {
16190 case E_SFmode:
16191 gen_copysign = gen_copysignsf3;
16192 gen_round = gen_sse4_1_roundsf2;
16193 break;
16194 case E_DFmode:
16195 gen_copysign = gen_copysigndf3;
16196 gen_round = gen_sse4_1_rounddf2;
16197 break;
16198 default:
16199 gcc_unreachable ();
16200 }
16201
16202 /* round (a) = trunc (a + copysign (0.5, a)) */
16203
16204 /* load nextafter (0.5, 0.0) */
16205 fmt = REAL_MODE_FORMAT (mode);
16206 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16207 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16208 half = const_double_from_real_value (pred_half, mode);
16209
16210 /* e1 = copysign (0.5, op1) */
16211 e1 = gen_reg_rtx (mode);
16212 emit_insn (gen_copysign (e1, half, op1));
16213
16214 /* e2 = op1 + e1 */
16215 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16216
16217 /* res = trunc (e2) */
16218 res = gen_reg_rtx (mode);
16219 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16220
16221 emit_move_insn (op0, res);
16222}
16223
16224/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16225 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16226 insn every time. */
16227
16228static GTY(()) rtx_insn *vselect_insn;
16229
16230/* Initialize vselect_insn. */
16231
16232static void
16233init_vselect_insn (void)
16234{
16235 unsigned i;
16236 rtx x;
16237
16238 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16239 for (i = 0; i < MAX_VECT_LEN; ++i)
16240 XVECEXP (x, 0, i) = const0_rtx;
16241 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16242 const0_rtx), x);
16243 x = gen_rtx_SET (const0_rtx, x);
16244 start_sequence ();
16245 vselect_insn = emit_insn (x);
16246 end_sequence ();
16247}
16248
16249/* Construct (set target (vec_select op0 (parallel perm))) and
16250 return true if that's a valid instruction in the active ISA. */
16251
16252static bool
16253expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16254 unsigned nelt, bool testing_p)
16255{
16256 unsigned int i;
16257 rtx x, save_vconcat;
16258 int icode;
16259
16260 if (vselect_insn == NULL_RTX)
16261 init_vselect_insn ();
16262
16263 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16264 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16265 for (i = 0; i < nelt; ++i)
16266 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16267 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16268 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16269 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16270 SET_DEST (PATTERN (vselect_insn)) = target;
16271 icode = recog_memoized (vselect_insn);
16272
16273 if (icode >= 0 && !testing_p)
16274 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16275
16276 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16277 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16278 INSN_CODE (vselect_insn) = -1;
16279
16280 return icode >= 0;
16281}
16282
16283/* Similar, but generate a vec_concat from op0 and op1 as well. */
16284
16285static bool
16286expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16287 const unsigned char *perm, unsigned nelt,
16288 bool testing_p)
16289{
16290 machine_mode v2mode;
16291 rtx x;
16292 bool ok;
16293
16294 if (vselect_insn == NULL_RTX)
16295 init_vselect_insn ();
16296
16297 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16298 return false;
16299 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16300 PUT_MODE (x, v2mode);
16301 XEXP (x, 0) = op0;
16302 XEXP (x, 1) = op1;
16303 ok = expand_vselect (target, x, perm, nelt, testing_p);
16304 XEXP (x, 0) = const0_rtx;
16305 XEXP (x, 1) = const0_rtx;
16306 return ok;
16307}
16308
16309/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16310 using movss or movsd. */
16311static bool
16312expand_vec_perm_movs (struct expand_vec_perm_d *d)
16313{
16314 machine_mode vmode = d->vmode;
16315 unsigned i, nelt = d->nelt;
16316 rtx x;
16317
16318 if (d->one_operand_p)
16319 return false;
16320
16321 if (!(TARGET_SSE && vmode == V4SFmode)
16322 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16323 && !(TARGET_SSE2 && vmode == V2DFmode))
16324 return false;
16325
16326 /* Only the first element is changed. */
16327 if (d->perm[0] != nelt && d->perm[0] != 0)
16328 return false;
16329 for (i = 1; i < nelt; ++i)
16330 if (d->perm[i] != i + nelt - d->perm[0])
16331 return false;
16332
16333 if (d->testing_p)
16334 return true;
16335
16336 if (d->perm[0] == nelt)
16337 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16338 else
16339 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16340
16341 emit_insn (gen_rtx_SET (d->target, x));
16342
16343 return true;
16344}
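/* Example of a permutation this accepts: for V4SFmode and
   d->perm = { 4, 1, 2, 3 }, only element 0 changes and it comes from op1,
   so the result is a single movss-style vec_merge of op1 into op0;
   symmetrically, { 0, 5, 6, 7 } merges op0 into op1.  */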
16345
16346/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16347 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16348
16349static bool
16350expand_vec_perm_blend (struct expand_vec_perm_d *d)
16351{
16352 machine_mode mmode, vmode = d->vmode;
16353 unsigned i, nelt = d->nelt;
16354 unsigned HOST_WIDE_INT mask;
16355 rtx target, op0, op1, maskop, x;
16356 rtx rperm[32], vperm;
16357
16358 if (d->one_operand_p)
16359 return false;
16360 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16361 && (TARGET_AVX512BW
16362 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16363 ;
16364 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16365 ;
16366 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16367 ;
16368 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16369 ;
16370 else
16371 return false;
16372
16373 /* This is a blend, not a permute. Elements must stay in their
16374 respective lanes. */
16375 for (i = 0; i < nelt; ++i)
16376 {
16377 unsigned e = d->perm[i];
16378 if (!(e == i || e == i + nelt))
16379 return false;
16380 }
16381
16382 if (d->testing_p)
16383 return true;
16384
16385 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16386 decision should be extracted elsewhere, so that we only try that
16387 sequence once all budget==3 options have been tried. */
16388 target = d->target;
16389 op0 = d->op0;
16390 op1 = d->op1;
16391 mask = 0;
16392
16393 switch (vmode)
16394 {
16395 case E_V8DFmode:
16396 case E_V16SFmode:
16397 case E_V4DFmode:
16398 case E_V8SFmode:
16399 case E_V2DFmode:
16400 case E_V4SFmode:
16401 case E_V8HImode:
16402 case E_V8SImode:
16403 case E_V32HImode:
16404 case E_V64QImode:
16405 case E_V16SImode:
16406 case E_V8DImode:
16407 for (i = 0; i < nelt; ++i)
16408 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16409 break;
16410
16411 case E_V2DImode:
16412 for (i = 0; i < 2; ++i)
16413 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16414 vmode = V8HImode;
16415 goto do_subreg;
16416
16417 case E_V4SImode:
16418 for (i = 0; i < 4; ++i)
16419 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16420 vmode = V8HImode;
16421 goto do_subreg;
16422
16423 case E_V16QImode:
16424 /* See if bytes move in pairs so we can use pblendw with
16425 an immediate argument, rather than pblendvb with a vector
16426 argument. */
16427 for (i = 0; i < 16; i += 2)
16428 if (d->perm[i] + 1 != d->perm[i + 1])
16429 {
16430 use_pblendvb:
16431 for (i = 0; i < nelt; ++i)
16432 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16433
16434 finish_pblendvb:
16435 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16436 vperm = force_reg (vmode, vperm);
16437
16438 if (GET_MODE_SIZE (vmode) == 16)
16439 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16440 else
16441 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16442 if (target != d->target)
16443 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16444 return true;
16445 }
16446
16447 for (i = 0; i < 8; ++i)
16448 mask |= (d->perm[i * 2] >= 16) << i;
16449 vmode = V8HImode;
16450 /* FALLTHRU */
16451
16452 do_subreg:
16453 target = gen_reg_rtx (vmode);
16454 op0 = gen_lowpart (vmode, op0);
16455 op1 = gen_lowpart (vmode, op1);
16456 break;
16457
16458 case E_V32QImode:
16459 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16460 for (i = 0; i < 32; i += 2)
16461 if (d->perm[i] + 1 != d->perm[i + 1])
16462 goto use_pblendvb;
16463 /* See if bytes move in quadruplets. If yes, vpblendd
16464 with immediate can be used. */
16465 for (i = 0; i < 32; i += 4)
16466 if (d->perm[i] + 2 != d->perm[i + 2])
16467 break;
16468 if (i < 32)
16469 {
16470 /* See if bytes move the same in both lanes. If yes,
16471 vpblendw with immediate can be used. */
16472 for (i = 0; i < 16; i += 2)
16473 if (d->perm[i] + 16 != d->perm[i + 16])
16474 goto use_pblendvb;
16475
16476 /* Use vpblendw. */
16477 for (i = 0; i < 16; ++i)
16478 mask |= (d->perm[i * 2] >= 32) << i;
16479 vmode = V16HImode;
16480 goto do_subreg;
16481 }
16482
16483 /* Use vpblendd. */
16484 for (i = 0; i < 8; ++i)
16485 mask |= (d->perm[i * 4] >= 32) << i;
16486 vmode = V8SImode;
16487 goto do_subreg;
16488
16489 case E_V16HImode:
16490 /* See if words move in pairs. If yes, vpblendd can be used. */
16491 for (i = 0; i < 16; i += 2)
16492 if (d->perm[i] + 1 != d->perm[i + 1])
16493 break;
16494 if (i < 16)
16495 {
16496 /* See if words move the same in both lanes. If not,
16497 vpblendvb must be used. */
16498 for (i = 0; i < 8; i++)
16499 if (d->perm[i] + 8 != d->perm[i + 8])
16500 {
16501 /* Use vpblendvb. */
16502 for (i = 0; i < 32; ++i)
16503 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16504
16505 vmode = V32QImode;
16506 nelt = 32;
16507 target = gen_reg_rtx (vmode);
16508 op0 = gen_lowpart (vmode, op0);
16509 op1 = gen_lowpart (vmode, op1);
16510 goto finish_pblendvb;
16511 }
16512
16513 /* Use vpblendw. */
16514 for (i = 0; i < 16; ++i)
16515 mask |= (d->perm[i] >= 16) << i;
16516 break;
16517 }
16518
16519 /* Use vpblendd. */
16520 for (i = 0; i < 8; ++i)
16521 mask |= (d->perm[i * 2] >= 16) << i;
16522 vmode = V8SImode;
16523 goto do_subreg;
16524
16525 case E_V4DImode:
16526 /* Use vpblendd. */
16527 for (i = 0; i < 4; ++i)
16528 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16529 vmode = V8SImode;
16530 goto do_subreg;
16531
16532 default:
16533 gcc_unreachable ();
16534 }
16535
16536 switch (vmode)
16537 {
16538 case E_V8DFmode:
16539 case E_V8DImode:
16540 mmode = QImode;
16541 break;
16542 case E_V16SFmode:
16543 case E_V16SImode:
16544 mmode = HImode;
16545 break;
16546 case E_V32HImode:
16547 mmode = SImode;
16548 break;
16549 case E_V64QImode:
16550 mmode = DImode;
16551 break;
16552 default:
16553 mmode = VOIDmode;
16554 }
16555
16556 if (mmode != VOIDmode)
16557 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16558 else
16559 maskop = GEN_INT (mask);
16560
16561 /* This matches five different patterns with the different modes. */
16562 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16563 x = gen_rtx_SET (target, x);
16564 emit_insn (x);
16565 if (target != d->target)
16566 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16567
16568 return true;
16569}
16570
16571/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16572 in terms of the variable form of vpermilps.
16573
16574 Note that we will have already failed the immediate input vpermilps,
16575 which requires that the high and low part shuffle be identical; the
16576 variable form doesn't require that. */
16577
16578static bool
16579expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16580{
16581 rtx rperm[8], vperm;
16582 unsigned i;
16583
16584 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16585 return false;
16586
16587 /* We can only permute within the 128-bit lane. */
16588 for (i = 0; i < 8; ++i)
16589 {
16590 unsigned e = d->perm[i];
16591 if (i < 4 ? e >= 4 : e < 4)
16592 return false;
16593 }
16594
16595 if (d->testing_p)
16596 return true;
16597
16598 for (i = 0; i < 8; ++i)
16599 {
16600 unsigned e = d->perm[i];
16601
16602 /* Within each 128-bit lane, the elements of op0 are numbered
16603 from 0 and the elements of op1 are numbered from 4. */
16604 if (e >= 8 + 4)
16605 e -= 8;
16606 else if (e >= 4)
16607 e -= 4;
16608
16609 rperm[i] = GEN_INT (e);
16610 }
16611
16612 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16613 vperm = force_reg (V8SImode, vperm);
16614 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16615
16616 return true;
16617}
16618
16619/* Return true if permutation D can be performed as VMODE permutation
16620 instead. */
16621
16622static bool
16623valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16624{
16625 unsigned int i, j, chunk;
16626
16627 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16628 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16629 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16630 return false;
16631
16632 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16633 return true;
16634
16635 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16636 for (i = 0; i < d->nelt; i += chunk)
16637 if (d->perm[i] & (chunk - 1))
16638 return false;
16639 else
16640 for (j = 1; j < chunk; ++j)
16641 if (d->perm[i] + j != d->perm[i + j])
16642 return false;
16643
16644 return true;
16645}
16646
16647/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16648 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16649
16650static bool
16651expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16652{
16653 unsigned i, nelt, eltsz, mask;
16654 unsigned char perm[64];
16655 machine_mode vmode = V16QImode;
16656 rtx rperm[64], vperm, target, op0, op1;
16657
16658 nelt = d->nelt;
16659
16660 if (!d->one_operand_p)
16661 {
16662 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16663 {
16664 if (TARGET_AVX2
16665 && valid_perm_using_mode_p (V2TImode, d))
16666 {
16667 if (d->testing_p)
16668 return true;
16669
16670 /* Use vperm2i128 insn. The pattern uses
16671 V4DImode instead of V2TImode. */
16672 target = d->target;
16673 if (d->vmode != V4DImode)
16674 target = gen_reg_rtx (V4DImode);
16675 op0 = gen_lowpart (V4DImode, d->op0);
16676 op1 = gen_lowpart (V4DImode, d->op1);
16677 rperm[0]
16678 = GEN_INT ((d->perm[0] / (nelt / 2))
16679 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16680 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16681 if (target != d->target)
16682 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16683 return true;
16684 }
16685 return false;
16686 }
16687 }
16688 else
16689 {
16690 if (GET_MODE_SIZE (d->vmode) == 16)
16691 {
16692 if (!TARGET_SSSE3)
16693 return false;
16694 }
16695 else if (GET_MODE_SIZE (d->vmode) == 32)
16696 {
16697 if (!TARGET_AVX2)
16698 return false;
16699
16700 /* V4DImode should be already handled through
16701 expand_vselect by vpermq instruction. */
16702 gcc_assert (d->vmode != V4DImode);
16703
16704 vmode = V32QImode;
16705 if (d->vmode == V8SImode
16706 || d->vmode == V16HImode
16707 || d->vmode == V32QImode)
16708 {
16709 /* First see if vpermq can be used for
16710 V8SImode/V16HImode/V32QImode. */
16711 if (valid_perm_using_mode_p (V4DImode, d))
16712 {
16713 for (i = 0; i < 4; i++)
16714 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16715 if (d->testing_p)
16716 return true;
16717 target = gen_reg_rtx (V4DImode);
16718 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16719 perm, 4, false))
16720 {
16721 emit_move_insn (d->target,
16722 gen_lowpart (d->vmode, target));
16723 return true;
16724 }
16725 return false;
16726 }
16727
16728 /* Next see if vpermd can be used. */
16729 if (valid_perm_using_mode_p (V8SImode, d))
16730 vmode = V8SImode;
16731 }
16732 /* Or if vpermps can be used. */
16733 else if (d->vmode == V8SFmode)
16734 vmode = V8SImode;
16735
16736 if (vmode == V32QImode)
16737 {
16738 /* vpshufb only works intra lanes; it is not
16739 possible to shuffle bytes between the lanes. */
16740 for (i = 0; i < nelt; ++i)
16741 if ((d->perm[i] ^ i) & (nelt / 2))
16742 return false;
16743 }
16744 }
16745 else if (GET_MODE_SIZE (d->vmode) == 64)
16746 {
16747 if (!TARGET_AVX512BW)
16748 return false;
16749
16750 /* If vpermq didn't work, vpshufb won't work either. */
16751 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16752 return false;
16753
16754 vmode = V64QImode;
16755 if (d->vmode == V16SImode
16756 || d->vmode == V32HImode
16757 || d->vmode == V64QImode)
16758 {
16759 /* First see if vpermq can be used for
16760 V16SImode/V32HImode/V64QImode. */
16761 if (valid_perm_using_mode_p (V8DImode, d))
16762 {
16763 for (i = 0; i < 8; i++)
16764 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16765 if (d->testing_p)
16766 return true;
16767 target = gen_reg_rtx (V8DImode);
16768 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16769 perm, 8, false))
16770 {
16771 emit_move_insn (d->target,
16772 gen_lowpart (d->vmode, target));
16773 return true;
16774 }
16775 return false;
16776 }
16777
16778 /* Next see if vpermd can be used. */
16779 if (valid_perm_using_mode_p (V16SImode, d))
16780 vmode = V16SImode;
16781 }
16782 /* Or if vpermps can be used. */
16783 else if (d->vmode == V16SFmode)
16784 vmode = V16SImode;
16785 if (vmode == V64QImode)
16786 {
16787 /* vpshufb only works intra lanes; it is not
16788 possible to shuffle bytes between the lanes. */
16789 for (i = 0; i < nelt; ++i)
16790 if ((d->perm[i] ^ i) & (3 * nelt / 4))
16791 return false;
16792 }
16793 }
16794 else
16795 return false;
16796 }
16797
16798 if (d->testing_p)
16799 return true;
16800
16801 if (vmode == V8SImode)
16802 for (i = 0; i < 8; ++i)
16803 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16804 else if (vmode == V16SImode)
16805 for (i = 0; i < 16; ++i)
16806 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16807 else
16808 {
16809 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16810 if (!d->one_operand_p)
16811 mask = 2 * nelt - 1;
16812 else if (vmode == V16QImode)
16813 mask = nelt - 1;
16814 else if (vmode == V64QImode)
16815 mask = nelt / 4 - 1;
16816 else
16817 mask = nelt / 2 - 1;
16818
16819 for (i = 0; i < nelt; ++i)
16820 {
16821 unsigned j, e = d->perm[i] & mask;
16822 for (j = 0; j < eltsz; ++j)
16823 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16824 }
16825 }
16826
16827 vperm = gen_rtx_CONST_VECTOR (vmode,
16828 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16829 vperm = force_reg (vmode, vperm);
16830
16831 target = d->target;
16832 if (d->vmode != vmode)
16833 target = gen_reg_rtx (vmode);
16834 op0 = gen_lowpart (vmode, d->op0);
16835 if (d->one_operand_p)
16836 {
16837 if (vmode == V16QImode)
16838 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16839 else if (vmode == V32QImode)
16840 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16841 else if (vmode == V64QImode)
16842 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16843 else if (vmode == V8SFmode)
16844 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16845 else if (vmode == V8SImode)
16846 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16847 else if (vmode == V16SFmode)
16848 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16849 else if (vmode == V16SImode)
16850 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16851 else
16852 gcc_unreachable ();
16853 }
16854 else
16855 {
16856 op1 = gen_lowpart (vmode, d->op1);
16857 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16858 }
16859 if (target != d->target)
16860 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16861
16862 return true;
16863}
16864
16865/* For V*[QHS]Imode permutations, check whether the same permutation
16866 can be performed in a 2x, 4x or 8x wider inner mode. */
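 /* For illustration: the V16QImode permutation
 { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } pairs each
 even index with its successor, so it canonicalizes to the V8HImode
 permutation { 1, 0, 3, 2, 5, 4, 7, 6 }; that one has odd values in
 even positions, so the recursion stops there. */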
16867
16868static bool
16869canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16870 struct expand_vec_perm_d *nd)
16871{
16872 int i;
16873 machine_mode mode = VOIDmode;
16874
16875 switch (d->vmode)
16876 {
16877 case E_V16QImode: mode = V8HImode; break;
16878 case E_V32QImode: mode = V16HImode; break;
16879 case E_V64QImode: mode = V32HImode; break;
16880 case E_V8HImode: mode = V4SImode; break;
16881 case E_V16HImode: mode = V8SImode; break;
16882 case E_V32HImode: mode = V16SImode; break;
16883 case E_V4SImode: mode = V2DImode; break;
16884 case E_V8SImode: mode = V4DImode; break;
16885 case E_V16SImode: mode = V8DImode; break;
16886 default: return false;
16887 }
16888 for (i = 0; i < d->nelt; i += 2)
16889 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16890 return false;
16891 nd->vmode = mode;
16892 nd->nelt = d->nelt / 2;
16893 for (i = 0; i < nd->nelt; i++)
16894 nd->perm[i] = d->perm[2 * i] / 2;
16895 if (GET_MODE_INNER (mode) != DImode)
16896 canonicalize_vector_int_perm (nd, nd);
16897 if (nd != d)
16898 {
16899 nd->one_operand_p = d->one_operand_p;
16900 nd->testing_p = d->testing_p;
16901 if (d->op0 == d->op1)
16902 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16903 else
16904 {
16905 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16906 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16907 }
16908 if (d->testing_p)
16909 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16910 else
16911 nd->target = gen_reg_rtx (nd->vmode);
16912 }
16913 return true;
16914}
16915
16916/* Try to expand a one-operand permutation with a constant mask. */
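 /* For illustration: a V8DImode reversal { 7, 6, 5, 4, 3, 2, 1, 0 }
 is handled below by loading that constant vector into a mask
 register and emitting a single gen_avx512f_permvarv8di (vpermq). */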
16917
16918static bool
16919ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16920{
16921 machine_mode mode = GET_MODE (d->op0);
16922 machine_mode maskmode = mode;
16923 rtx (*gen) (rtx, rtx, rtx) = NULL;
16924 rtx target, op0, mask;
16925 rtx vec[64];
16926
16927 if (!rtx_equal_p (d->op0, d->op1))
16928 return false;
16929
16930 if (!TARGET_AVX512F)
16931 return false;
16932
16933 switch (mode)
16934 {
16935 case E_V16SImode:
16936 gen = gen_avx512f_permvarv16si;
16937 break;
16938 case E_V16SFmode:
16939 gen = gen_avx512f_permvarv16sf;
16940 maskmode = V16SImode;
16941 break;
16942 case E_V8DImode:
16943 gen = gen_avx512f_permvarv8di;
16944 break;
16945 case E_V8DFmode:
16946 gen = gen_avx512f_permvarv8df;
16947 maskmode = V8DImode;
16948 break;
16949 default:
16950 return false;
16951 }
16952
16953 target = d->target;
16954 op0 = d->op0;
16955 for (int i = 0; i < d->nelt; ++i)
16956 vec[i] = GEN_INT (d->perm[i]);
16957 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
16958 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
16959 return true;
16960}
16961
16962static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
16963
4bf4c103 16964/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
16965 in a single instruction. */
16966
16967static bool
16968expand_vec_perm_1 (struct expand_vec_perm_d *d)
16969{
16970 unsigned i, nelt = d->nelt;
16971 struct expand_vec_perm_d nd;
16972
16973 /* Check plain VEC_SELECT first, because AVX has instructions that could
16974 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
16975 input where SEL+CONCAT may not. */
16976 if (d->one_operand_p)
16977 {
16978 int mask = nelt - 1;
16979 bool identity_perm = true;
16980 bool broadcast_perm = true;
16981
16982 for (i = 0; i < nelt; i++)
16983 {
16984 nd.perm[i] = d->perm[i] & mask;
16985 if (nd.perm[i] != i)
16986 identity_perm = false;
16987 if (nd.perm[i])
16988 broadcast_perm = false;
16989 }
16990
16991 if (identity_perm)
16992 {
16993 if (!d->testing_p)
16994 emit_move_insn (d->target, d->op0);
16995 return true;
16996 }
16997 else if (broadcast_perm && TARGET_AVX2)
16998 {
16999 /* Use vpbroadcast{b,w,d}. */
17000 rtx (*gen) (rtx, rtx) = NULL;
17001 switch (d->vmode)
17002 {
17003 case E_V64QImode:
17004 if (TARGET_AVX512BW)
17005 gen = gen_avx512bw_vec_dupv64qi_1;
17006 break;
17007 case E_V32QImode:
17008 gen = gen_avx2_pbroadcastv32qi_1;
17009 break;
17010 case E_V32HImode:
17011 if (TARGET_AVX512BW)
17012 gen = gen_avx512bw_vec_dupv32hi_1;
17013 break;
17014 case E_V16HImode:
17015 gen = gen_avx2_pbroadcastv16hi_1;
17016 break;
17017 case E_V16SImode:
17018 if (TARGET_AVX512F)
17019 gen = gen_avx512f_vec_dupv16si_1;
17020 break;
17021 case E_V8SImode:
17022 gen = gen_avx2_pbroadcastv8si_1;
17023 break;
17024 case E_V16QImode:
17025 gen = gen_avx2_pbroadcastv16qi;
17026 break;
17027 case E_V8HImode:
17028 gen = gen_avx2_pbroadcastv8hi;
17029 break;
17030 case E_V16SFmode:
17031 if (TARGET_AVX512F)
17032 gen = gen_avx512f_vec_dupv16sf_1;
17033 break;
17034 case E_V8SFmode:
17035 gen = gen_avx2_vec_dupv8sf_1;
17036 break;
17037 case E_V8DFmode:
17038 if (TARGET_AVX512F)
17039 gen = gen_avx512f_vec_dupv8df_1;
17040 break;
17041 case E_V8DImode:
17042 if (TARGET_AVX512F)
17043 gen = gen_avx512f_vec_dupv8di_1;
17044 break;
17045 /* For other modes prefer other shuffles this function creates. */
17046 default: break;
17047 }
17048 if (gen != NULL)
17049 {
17050 if (!d->testing_p)
17051 emit_insn (gen (d->target, d->op0));
17052 return true;
17053 }
17054 }
17055
17056 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17057 return true;
17058
17059 /* There are plenty of patterns in sse.md that are written for
17060 SEL+CONCAT and are not replicated for a single op. Perhaps
17061 that should be changed, to avoid the nastiness here. */
17062
17063 /* Recognize interleave style patterns, which means incrementing
17064 every other permutation operand. */
17065 for (i = 0; i < nelt; i += 2)
17066 {
17067 nd.perm[i] = d->perm[i] & mask;
17068 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17069 }
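 /* For illustration: the one-operand V16QImode permutation
 { 0, 0, 1, 1, ..., 7, 7 } is rewritten here as
 { 0, 16, 1, 17, ..., 7, 23 }, which the SEL+CONCAT patterns
 should match as a punpcklbw of op0 with itself. */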
17070 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17071 d->testing_p))
17072 return true;
17073
17074 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17075 if (nelt >= 4)
17076 {
17077 for (i = 0; i < nelt; i += 4)
17078 {
17079 nd.perm[i + 0] = d->perm[i + 0] & mask;
17080 nd.perm[i + 1] = d->perm[i + 1] & mask;
17081 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17082 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17083 }
17084
17085 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17086 d->testing_p))
17087 return true;
17088 }
17089 }
17090
17091 /* Try movss/movsd instructions. */
17092 if (expand_vec_perm_movs (d))
17093 return true;
17094
17095 /* Finally, try the fully general two operand permute. */
17096 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17097 d->testing_p))
17098 return true;
17099
17100 /* Recognize interleave style patterns with reversed operands. */
17101 if (!d->one_operand_p)
17102 {
17103 for (i = 0; i < nelt; ++i)
17104 {
17105 unsigned e = d->perm[i];
17106 if (e >= nelt)
17107 e -= nelt;
17108 else
17109 e += nelt;
17110 nd.perm[i] = e;
17111 }
17112
17113 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17114 d->testing_p))
17115 return true;
17116 }
17117
17118 /* Try the SSE4.1 blend variable merge instructions. */
17119 if (expand_vec_perm_blend (d))
17120 return true;
17121
17122 /* Try one of the AVX vpermil variable permutations. */
17123 if (expand_vec_perm_vpermil (d))
17124 return true;
17125
17126 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17127 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17128 if (expand_vec_perm_pshufb (d))
17129 return true;
17130
17131 /* Try the AVX2 vpalignr instruction. */
17132 if (expand_vec_perm_palignr (d, true))
17133 return true;
17134
17135 /* Try the AVX512F vperm{s,d} instructions. */
17136 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17137 return true;
17138
17139 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17140 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17141 return true;
17142
17143 /* See if we can get the same permutation in different vector integer
17144 mode. */
17145 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17146 {
17147 if (!d->testing_p)
17148 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17149 return true;
17150 }
17151 return false;
17152}
17153
4bf4c103 17154/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17155 in terms of a pair of pshuflw + pshufhw instructions. */
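 /* For illustration: the one-operand V8HImode permutation
 { 3, 2, 1, 0, 5, 4, 7, 6 } keeps its first four indices below 4
 and its last four at or above 4, so it is emitted as pshuflw with
 { 3, 2, 1, 0 } followed by pshufhw with { 5, 4, 7, 6 }. */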
17156
17157static bool
17158expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17159{
17160 unsigned char perm2[MAX_VECT_LEN];
17161 unsigned i;
17162 bool ok;
17163
17164 if (d->vmode != V8HImode || !d->one_operand_p)
17165 return false;
17166
17167 /* The two permutations only operate in 64-bit lanes. */
17168 for (i = 0; i < 4; ++i)
17169 if (d->perm[i] >= 4)
17170 return false;
17171 for (i = 4; i < 8; ++i)
17172 if (d->perm[i] < 4)
17173 return false;
17174
17175 if (d->testing_p)
17176 return true;
17177
17178 /* Emit the pshuflw. */
17179 memcpy (perm2, d->perm, 4);
17180 for (i = 4; i < 8; ++i)
17181 perm2[i] = i;
17182 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17183 gcc_assert (ok);
17184
17185 /* Emit the pshufhw. */
17186 memcpy (perm2 + 4, d->perm + 4, 4);
17187 for (i = 0; i < 4; ++i)
17188 perm2[i] = i;
17189 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17190 gcc_assert (ok);
17191
17192 return true;
17193}
17194
4bf4c103 17195/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17196 the permutation using the SSSE3 palignr instruction. This succeeds
17197 when all of the elements in PERM fit within one vector and we merely
17198 need to shift them down so that a single vector permutation has a
17199 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
17200 the vpalignr instruction itself can perform the requested permutation. */
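 /* For illustration: the two-operand V16QImode permutation
 { 3, 4, ..., 18 } has min == 3 and max == 18, so a single palignr
 by 3 bytes of the op1:op0 pair already leaves the elements in
 order and no follow-up one-operand shuffle is needed. */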
17201
17202static bool
17203expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17204{
17205 unsigned i, nelt = d->nelt;
17206 unsigned min, max, minswap, maxswap;
17207 bool in_order, ok, swap = false;
17208 rtx shift, target;
17209 struct expand_vec_perm_d dcopy;
17210
17211 /* Even with AVX, palignr only operates on 128-bit vectors;
17212 with AVX2, palignr operates on both 128-bit lanes. */
17213 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17214 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17215 return false;
17216
17217 min = 2 * nelt;
17218 max = 0;
17219 minswap = 2 * nelt;
17220 maxswap = 0;
17221 for (i = 0; i < nelt; ++i)
17222 {
17223 unsigned e = d->perm[i];
17224 unsigned eswap = d->perm[i] ^ nelt;
17225 if (GET_MODE_SIZE (d->vmode) == 32)
17226 {
17227 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17228 eswap = e ^ (nelt / 2);
17229 }
17230 if (e < min)
17231 min = e;
17232 if (e > max)
17233 max = e;
17234 if (eswap < minswap)
17235 minswap = eswap;
17236 if (eswap > maxswap)
17237 maxswap = eswap;
17238 }
17239 if (min == 0
17240 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17241 {
17242 if (d->one_operand_p
17243 || minswap == 0
17244 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17245 ? nelt / 2 : nelt))
17246 return false;
17247 swap = true;
17248 min = minswap;
17249 max = maxswap;
17250 }
17251
17252 /* Given that we have SSSE3, we know we'll be able to implement the
17253 single operand permutation after the palignr with pshufb for
17254 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17255 first. */
17256 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17257 return true;
17258
17259 dcopy = *d;
17260 if (swap)
17261 {
17262 dcopy.op0 = d->op1;
17263 dcopy.op1 = d->op0;
17264 for (i = 0; i < nelt; ++i)
17265 dcopy.perm[i] ^= nelt;
17266 }
17267
17268 in_order = true;
17269 for (i = 0; i < nelt; ++i)
17270 {
17271 unsigned e = dcopy.perm[i];
17272 if (GET_MODE_SIZE (d->vmode) == 32
17273 && e >= nelt
17274 && (e & (nelt / 2 - 1)) < min)
17275 e = e - min - (nelt / 2);
17276 else
17277 e = e - min;
17278 if (e != i)
17279 in_order = false;
17280 dcopy.perm[i] = e;
17281 }
17282 dcopy.one_operand_p = true;
17283
17284 if (single_insn_only_p && !in_order)
17285 return false;
17286
17287 /* For AVX2, test whether we can permute the result in one instruction. */
17288 if (d->testing_p)
17289 {
17290 if (in_order)
17291 return true;
17292 dcopy.op1 = dcopy.op0;
17293 return expand_vec_perm_1 (&dcopy);
17294 }
17295
17296 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17297 if (GET_MODE_SIZE (d->vmode) == 16)
17298 {
17299 target = gen_reg_rtx (TImode);
17300 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17301 gen_lowpart (TImode, dcopy.op0), shift));
17302 }
17303 else
17304 {
17305 target = gen_reg_rtx (V2TImode);
17306 emit_insn (gen_avx2_palignrv2ti (target,
17307 gen_lowpart (V2TImode, dcopy.op1),
17308 gen_lowpart (V2TImode, dcopy.op0),
17309 shift));
17310 }
17311
17312 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17313
17314 /* Test for the degenerate case where the alignment by itself
17315 produces the desired permutation. */
17316 if (in_order)
17317 {
17318 emit_move_insn (d->target, dcopy.op0);
17319 return true;
17320 }
17321
17322 ok = expand_vec_perm_1 (&dcopy);
17323 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17324
17325 return ok;
17326}
17327
17328/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17329 the permutation using the SSE4_1 pblendv instruction. Potentially
17330 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
17331
17332static bool
17333expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17334{
17335 unsigned i, which, nelt = d->nelt;
17336 struct expand_vec_perm_d dcopy, dcopy1;
17337 machine_mode vmode = d->vmode;
17338 bool ok;
17339
17340 /* Use the same checks as in expand_vec_perm_blend. */
17341 if (d->one_operand_p)
17342 return false;
17343 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17344 ;
17345 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17346 ;
17347 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17348 ;
17349 else
17350 return false;
17351
17352 /* Figure out which permutation elements do not stay in
17353 their respective lanes. */
17354 for (i = 0, which = 0; i < nelt; ++i)
17355 {
17356 unsigned e = d->perm[i];
17357 if (e != i)
17358 which |= (e < nelt ? 1 : 2);
17359 }
17360 /* We can pblend the part where elements do not stay in their
17361 respective lanes only when these elements are all in one
17362 half of a permutation.
17363 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
17364 lanes, but both 8 and 9 are >= 8.
17365 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
17366 respective lanes, and 8 >= 8 but 2 is not. */
17367 if (which != 1 && which != 2)
17368 return false;
17369 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17370 return true;
17371
17372 /* First we apply one operand permutation to the part where
17373 elements stay not in their respective lanes. */
17374 dcopy = *d;
17375 if (which == 2)
17376 dcopy.op0 = dcopy.op1 = d->op1;
17377 else
17378 dcopy.op0 = dcopy.op1 = d->op0;
17379 if (!d->testing_p)
17380 dcopy.target = gen_reg_rtx (vmode);
17381 dcopy.one_operand_p = true;
17382
17383 for (i = 0; i < nelt; ++i)
17384 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17385
17386 ok = expand_vec_perm_1 (&dcopy);
17387 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17388 return false;
17389 else
17390 gcc_assert (ok);
17391 if (d->testing_p)
17392 return true;
17393
17394 /* Next we put permuted elements into their positions. */
17395 dcopy1 = *d;
17396 if (which == 2)
17397 dcopy1.op1 = dcopy.target;
17398 else
17399 dcopy1.op0 = dcopy.target;
17400
17401 for (i = 0; i < nelt; ++i)
17402 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17403
17404 ok = expand_vec_perm_blend (&dcopy1);
17405 gcc_assert (ok);
17406
17407 return true;
17408}
17409
17410static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17411
4bf4c103 17412/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17413 a two vector permutation into a single vector permutation by using
17414 an interleave operation to merge the vectors. */
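 /* For illustration: the two-operand V8HImode permutation
 { 0, 1, 8, 9, 2, 3, 10, 11 } uses only the low halves of both
 inputs, so punpcklwd first produces { 0, 8, 1, 9, 2, 10, 3, 11 }
 and the remaining fix-up is the one-operand shuffle
 { 0, 2, 1, 3, 4, 6, 5, 7 }. */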
17415
17416static bool
17417expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17418{
17419 struct expand_vec_perm_d dremap, dfinal;
17420 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17421 unsigned HOST_WIDE_INT contents;
17422 unsigned char remap[2 * MAX_VECT_LEN];
17423 rtx_insn *seq;
17424 bool ok, same_halves = false;
17425
17426 if (GET_MODE_SIZE (d->vmode) == 16)
17427 {
17428 if (d->one_operand_p)
17429 return false;
17430 }
17431 else if (GET_MODE_SIZE (d->vmode) == 32)
17432 {
17433 if (!TARGET_AVX)
17434 return false;
17435 /* For 32-byte modes allow even d->one_operand_p.
17436 The lack of cross-lane shuffling in some instructions
17437 might prevent a single insn shuffle. */
17438 dfinal = *d;
17439 dfinal.testing_p = true;
17440 /* If expand_vec_perm_interleave3 can expand this into
17441 a 3 insn sequence, give up and let it be expanded as
17442 3 insn sequence. While that is one insn longer,
17443 it doesn't need a memory operand, and in the common
17444 case that both interleave low and high permutations
17445 with the same operands are adjacent it needs 4 insns
17446 for both after CSE. */
17447 if (expand_vec_perm_interleave3 (&dfinal))
17448 return false;
17449 }
17450 else
17451 return false;
17452
17453 /* Examine from whence the elements come. */
17454 contents = 0;
17455 for (i = 0; i < nelt; ++i)
17456 contents |= HOST_WIDE_INT_1U << d->perm[i];
17457
17458 memset (remap, 0xff, sizeof (remap));
17459 dremap = *d;
17460
17461 if (GET_MODE_SIZE (d->vmode) == 16)
17462 {
17463 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17464
17465 /* Split the two input vectors into 4 halves. */
17466 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17467 h2 = h1 << nelt2;
17468 h3 = h2 << nelt2;
17469 h4 = h3 << nelt2;
17470
17471 /* If the elements come from the low halves, use interleave low; similarly
17472 for interleave high. If the elements are from mismatched halves, we
17473 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17474 if ((contents & (h1 | h3)) == contents)
17475 {
17476 /* punpckl* */
17477 for (i = 0; i < nelt2; ++i)
17478 {
17479 remap[i] = i * 2;
17480 remap[i + nelt] = i * 2 + 1;
17481 dremap.perm[i * 2] = i;
17482 dremap.perm[i * 2 + 1] = i + nelt;
17483 }
17484 if (!TARGET_SSE2 && d->vmode == V4SImode)
17485 dremap.vmode = V4SFmode;
17486 }
17487 else if ((contents & (h2 | h4)) == contents)
17488 {
17489 /* punpckh* */
17490 for (i = 0; i < nelt2; ++i)
17491 {
17492 remap[i + nelt2] = i * 2;
17493 remap[i + nelt + nelt2] = i * 2 + 1;
17494 dremap.perm[i * 2] = i + nelt2;
17495 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17496 }
17497 if (!TARGET_SSE2 && d->vmode == V4SImode)
17498 dremap.vmode = V4SFmode;
17499 }
17500 else if ((contents & (h1 | h4)) == contents)
17501 {
17502 /* shufps */
17503 for (i = 0; i < nelt2; ++i)
17504 {
17505 remap[i] = i;
17506 remap[i + nelt + nelt2] = i + nelt2;
17507 dremap.perm[i] = i;
17508 dremap.perm[i + nelt2] = i + nelt + nelt2;
17509 }
17510 if (nelt != 4)
17511 {
17512 /* shufpd */
17513 dremap.vmode = V2DImode;
17514 dremap.nelt = 2;
17515 dremap.perm[0] = 0;
17516 dremap.perm[1] = 3;
17517 }
17518 }
17519 else if ((contents & (h2 | h3)) == contents)
17520 {
17521 /* shufps */
17522 for (i = 0; i < nelt2; ++i)
17523 {
17524 remap[i + nelt2] = i;
17525 remap[i + nelt] = i + nelt2;
17526 dremap.perm[i] = i + nelt2;
17527 dremap.perm[i + nelt2] = i + nelt;
17528 }
17529 if (nelt != 4)
17530 {
17531 /* shufpd */
17532 dremap.vmode = V2DImode;
17533 dremap.nelt = 2;
17534 dremap.perm[0] = 1;
17535 dremap.perm[1] = 2;
17536 }
17537 }
17538 else
17539 return false;
17540 }
17541 else
17542 {
17543 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17544 unsigned HOST_WIDE_INT q[8];
17545 unsigned int nonzero_halves[4];
17546
17547 /* Split the two input vectors into 8 quarters. */
17548 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17549 for (i = 1; i < 8; ++i)
17550 q[i] = q[0] << (nelt4 * i);
17551 for (i = 0; i < 4; ++i)
17552 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17553 {
17554 nonzero_halves[nzcnt] = i;
17555 ++nzcnt;
17556 }
17557
17558 if (nzcnt == 1)
17559 {
17560 gcc_assert (d->one_operand_p);
17561 nonzero_halves[1] = nonzero_halves[0];
17562 same_halves = true;
17563 }
17564 else if (d->one_operand_p)
17565 {
17566 gcc_assert (nonzero_halves[0] == 0);
17567 gcc_assert (nonzero_halves[1] == 1);
17568 }
17569
17570 if (nzcnt <= 2)
17571 {
17572 if (d->perm[0] / nelt2 == nonzero_halves[1])
17573 {
17574 /* Attempt to increase the likelihood that dfinal
17575 shuffle will be intra-lane. */
17576 std::swap (nonzero_halves[0], nonzero_halves[1]);
17577 }
17578
17579 /* vperm2f128 or vperm2i128. */
17580 for (i = 0; i < nelt2; ++i)
17581 {
17582 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17583 remap[i + nonzero_halves[0] * nelt2] = i;
17584 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17585 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17586 }
17587
17588 if (d->vmode != V8SFmode
17589 && d->vmode != V4DFmode
17590 && d->vmode != V8SImode)
17591 {
17592 dremap.vmode = V8SImode;
17593 dremap.nelt = 8;
17594 for (i = 0; i < 4; ++i)
17595 {
17596 dremap.perm[i] = i + nonzero_halves[0] * 4;
17597 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17598 }
17599 }
17600 }
17601 else if (d->one_operand_p)
17602 return false;
17603 else if (TARGET_AVX2
17604 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17605 {
17606 /* vpunpckl* */
17607 for (i = 0; i < nelt4; ++i)
17608 {
17609 remap[i] = i * 2;
17610 remap[i + nelt] = i * 2 + 1;
17611 remap[i + nelt2] = i * 2 + nelt2;
17612 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17613 dremap.perm[i * 2] = i;
17614 dremap.perm[i * 2 + 1] = i + nelt;
17615 dremap.perm[i * 2 + nelt2] = i + nelt2;
17616 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17617 }
17618 }
17619 else if (TARGET_AVX2
17620 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17621 {
17622 /* vpunpckh* */
17623 for (i = 0; i < nelt4; ++i)
17624 {
17625 remap[i + nelt4] = i * 2;
17626 remap[i + nelt + nelt4] = i * 2 + 1;
17627 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17628 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17629 dremap.perm[i * 2] = i + nelt4;
17630 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17631 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17632 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17633 }
17634 }
17635 else
17636 return false;
17637 }
17638
17639 /* Use the remapping array set up above to move the elements from their
17640 swizzled locations into their final destinations. */
17641 dfinal = *d;
17642 for (i = 0; i < nelt; ++i)
17643 {
17644 unsigned e = remap[d->perm[i]];
17645 gcc_assert (e < nelt);
17646 /* If same_halves is true, both halves of the remapped vector are the
17647 same. Avoid cross-lane accesses if possible. */
17648 if (same_halves && i >= nelt2)
17649 {
17650 gcc_assert (e < nelt2);
17651 dfinal.perm[i] = e + nelt2;
17652 }
17653 else
17654 dfinal.perm[i] = e;
17655 }
17656 if (!d->testing_p)
17657 {
17658 dremap.target = gen_reg_rtx (dremap.vmode);
17659 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17660 }
17661 dfinal.op1 = dfinal.op0;
17662 dfinal.one_operand_p = true;
17663
17664 /* Test if the final remap can be done with a single insn. For V4SFmode or
17665 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17666 start_sequence ();
17667 ok = expand_vec_perm_1 (&dfinal);
17668 seq = get_insns ();
17669 end_sequence ();
17670
17671 if (!ok)
17672 return false;
17673
17674 if (d->testing_p)
17675 return true;
17676
17677 if (dremap.vmode != dfinal.vmode)
17678 {
17679 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17680 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17681 }
17682
17683 ok = expand_vec_perm_1 (&dremap);
17684 gcc_assert (ok);
17685
17686 emit_insn (seq);
17687 return true;
17688}
17689
4bf4c103 17690/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17691 a single vector cross-lane permutation into vpermq followed
17692 by any of the single insn permutations. */
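 /* For illustration: if the low 16 bytes of a V32QImode result use
 only source bytes 8..23 (64-bit quarters 1 and 2) and the high 16
 bytes use only bytes 24..31 (quarter 3), the vpermq selector
 computed below is { 1, 2, 3, 0 }, after which every requested byte
 lies in its own 128-bit lane. */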
17693
17694static bool
17695expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17696{
17697 struct expand_vec_perm_d dremap, dfinal;
17698 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17699 unsigned contents[2];
17700 bool ok;
17701
17702 if (!(TARGET_AVX2
17703 && (d->vmode == V32QImode || d->vmode == V16HImode)
17704 && d->one_operand_p))
17705 return false;
17706
17707 contents[0] = 0;
17708 contents[1] = 0;
17709 for (i = 0; i < nelt2; ++i)
17710 {
17711 contents[0] |= 1u << (d->perm[i] / nelt4);
17712 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17713 }
17714
17715 for (i = 0; i < 2; ++i)
17716 {
17717 unsigned int cnt = 0;
17718 for (j = 0; j < 4; ++j)
17719 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17720 return false;
17721 }
17722
17723 if (d->testing_p)
17724 return true;
17725
17726 dremap = *d;
17727 dremap.vmode = V4DImode;
17728 dremap.nelt = 4;
17729 dremap.target = gen_reg_rtx (V4DImode);
17730 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17731 dremap.op1 = dremap.op0;
17732 dremap.one_operand_p = true;
17733 for (i = 0; i < 2; ++i)
17734 {
17735 unsigned int cnt = 0;
17736 for (j = 0; j < 4; ++j)
17737 if ((contents[i] & (1u << j)) != 0)
17738 dremap.perm[2 * i + cnt++] = j;
17739 for (; cnt < 2; ++cnt)
17740 dremap.perm[2 * i + cnt] = 0;
17741 }
17742
17743 dfinal = *d;
17744 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17745 dfinal.op1 = dfinal.op0;
17746 dfinal.one_operand_p = true;
17747 for (i = 0, j = 0; i < nelt; ++i)
17748 {
17749 if (i == nelt2)
17750 j = 2;
17751 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17752 if ((d->perm[i] / nelt4) == dremap.perm[j])
17753 ;
17754 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17755 dfinal.perm[i] |= nelt4;
17756 else
17757 gcc_unreachable ();
17758 }
17759
17760 ok = expand_vec_perm_1 (&dremap);
17761 gcc_assert (ok);
17762
17763 ok = expand_vec_perm_1 (&dfinal);
17764 gcc_assert (ok);
17765
17766 return true;
17767}
17768
17769static bool canonicalize_perm (struct expand_vec_perm_d *d);
17770
4bf4c103 17771/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
17772 a vector permutation using two instructions, vperm2f128 resp.
17773 vperm2i128 followed by any single in-lane permutation. */
17774
17775static bool
17776expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17777{
17778 struct expand_vec_perm_d dfirst, dsecond;
17779 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17780 bool ok;
17781
17782 if (!TARGET_AVX
17783 || GET_MODE_SIZE (d->vmode) != 32
17784 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17785 return false;
17786
17787 dsecond = *d;
17788 dsecond.one_operand_p = false;
17789 dsecond.testing_p = true;
17790
17791 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17792 immediate. For perm < 16 the second permutation uses
17793 d->op0 as first operand, for perm >= 16 it uses d->op1
17794 as first operand. The second operand is the result of
17795 vperm2[fi]128. */
17796 for (perm = 0; perm < 32; perm++)
17797 {
17798 /* Ignore permutations which do not move anything cross-lane. */
17799 if (perm < 16)
17800 {
17801 /* The second shuffle for e.g. V4DFmode has
17802 0123 and ABCD operands.
17803 Ignore AB23, as 23 is already in the second lane
17804 of the first operand. */
17805 if ((perm & 0xc) == (1 << 2)) continue;
17806 /* And 01CD, as 01 is in the first lane of the first
17807 operand. */
17808 if ((perm & 3) == 0) continue;
17809 /* And 4567, as then the vperm2[fi]128 doesn't change
17810 anything on the original 4567 second operand. */
17811 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17812 }
17813 else
17814 {
17815 /* The second shuffle for e.g. V4DFmode has
17816 4567 and ABCD operands.
17817 Ignore AB67, as 67 is already in the second lane
17818 of the first operand. */
17819 if ((perm & 0xc) == (3 << 2)) continue;
17820 /* And 45CD, as 45 is in the first lane of the first
17821 operand. */
17822 if ((perm & 3) == 2) continue;
17823 /* And 0123, as then the vperm2[fi]128 doesn't change
17824 anything on the original 0123 first operand. */
17825 if ((perm & 0xf) == (1 << 2)) continue;
17826 }
17827
17828 for (i = 0; i < nelt; i++)
17829 {
17830 j = d->perm[i] / nelt2;
17831 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17832 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17833 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17834 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17835 else
17836 break;
17837 }
17838
17839 if (i == nelt)
17840 {
17841 start_sequence ();
17842 ok = expand_vec_perm_1 (&dsecond);
17843 end_sequence ();
17844 }
17845 else
17846 ok = false;
17847
17848 if (ok)
17849 {
17850 if (d->testing_p)
17851 return true;
17852
17853 /* Found a usable second shuffle. dfirst will be
17854 vperm2f128 on d->op0 and d->op1. */
17855 dsecond.testing_p = false;
17856 dfirst = *d;
17857 dfirst.target = gen_reg_rtx (d->vmode);
17858 for (i = 0; i < nelt; i++)
17859 dfirst.perm[i] = (i & (nelt2 - 1))
17860 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17861
17862 canonicalize_perm (&dfirst);
17863 ok = expand_vec_perm_1 (&dfirst);
17864 gcc_assert (ok);
17865
17866 /* And dsecond is some single insn shuffle, taking
17867 d->op0 and result of vperm2f128 (if perm < 16) or
17868 d->op1 and result of vperm2f128 (otherwise). */
17869 if (perm >= 16)
17870 dsecond.op0 = dsecond.op1;
17871 dsecond.op1 = dfirst.target;
17872
17873 ok = expand_vec_perm_1 (&dsecond);
17874 gcc_assert (ok);
17875
17876 return true;
17877 }
17878
17879 /* For one operand, the only useful vperm2f128 permutation is 0x01
17880 aka lane swap. */
17881 if (d->one_operand_p)
17882 return false;
17883 }
17884
17885 return false;
17886}
17887
4bf4c103 17888/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17889 a two vector permutation using 2 intra-lane interleave insns
17890 and cross-lane shuffle for 32-byte vectors. */
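 /* For illustration: the two-operand V8SImode permutation
 { 0, 8, 1, 9, 2, 10, 3, 11 } (a full interleave low) matches the
 d->perm[0] == 0 pattern below and is emitted via
 gen_vec_interleave_lowv8si, which on AVX2 expands to the
 two-interleave-plus-cross-lane-shuffle sequence described above. */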
17891
17892static bool
17893expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17894{
17895 unsigned i, nelt;
17896 rtx (*gen) (rtx, rtx, rtx);
17897
17898 if (d->one_operand_p)
17899 return false;
17900 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17901 ;
17902 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17903 ;
17904 else
17905 return false;
17906
17907 nelt = d->nelt;
17908 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17909 return false;
17910 for (i = 0; i < nelt; i += 2)
17911 if (d->perm[i] != d->perm[0] + i / 2
17912 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17913 return false;
17914
17915 if (d->testing_p)
17916 return true;
17917
17918 switch (d->vmode)
17919 {
17920 case E_V32QImode:
17921 if (d->perm[0])
17922 gen = gen_vec_interleave_highv32qi;
17923 else
17924 gen = gen_vec_interleave_lowv32qi;
17925 break;
17926 case E_V16HImode:
17927 if (d->perm[0])
17928 gen = gen_vec_interleave_highv16hi;
17929 else
17930 gen = gen_vec_interleave_lowv16hi;
17931 break;
17932 case E_V8SImode:
17933 if (d->perm[0])
17934 gen = gen_vec_interleave_highv8si;
17935 else
17936 gen = gen_vec_interleave_lowv8si;
17937 break;
17938 case E_V4DImode:
17939 if (d->perm[0])
17940 gen = gen_vec_interleave_highv4di;
17941 else
17942 gen = gen_vec_interleave_lowv4di;
17943 break;
17944 case E_V8SFmode:
17945 if (d->perm[0])
17946 gen = gen_vec_interleave_highv8sf;
17947 else
17948 gen = gen_vec_interleave_lowv8sf;
17949 break;
17950 case E_V4DFmode:
17951 if (d->perm[0])
17952 gen = gen_vec_interleave_highv4df;
17953 else
17954 gen = gen_vec_interleave_lowv4df;
17955 break;
17956 default:
17957 gcc_unreachable ();
17958 }
17959
17960 emit_insn (gen (d->target, d->op0, d->op1));
17961 return true;
17962}
17963
4bf4c103 17964/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
17965 a single vector permutation using a single intra-lane vector
17966 permutation, vperm2f128 swapping the lanes and vblend* insn blending
17967 the non-swapped and swapped vectors together. */
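 /* For illustration: the one-operand V4DFmode permutation
 { 0, 3, 2, 1 } needs no in-lane shuffle (dfirst stays the
 identity), so the sequence becomes a vperm2f128 lane swap followed
 by vblendpd with immediate 0xa, taking elements 1 and 3 from the
 swapped copy. */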
17968
17969static bool
17970expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
17971{
17972 struct expand_vec_perm_d dfirst, dsecond;
17973 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
17974 rtx_insn *seq;
17975 bool ok;
17976 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
17977
17978 if (!TARGET_AVX
17979 || TARGET_AVX2
17980 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
17981 || !d->one_operand_p)
17982 return false;
17983
17984 dfirst = *d;
17985 for (i = 0; i < nelt; i++)
17986 dfirst.perm[i] = 0xff;
17987 for (i = 0, msk = 0; i < nelt; i++)
17988 {
17989 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
17990 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
17991 return false;
17992 dfirst.perm[j] = d->perm[i];
17993 if (j != i)
17994 msk |= (1 << i);
17995 }
17996 for (i = 0; i < nelt; i++)
17997 if (dfirst.perm[i] == 0xff)
17998 dfirst.perm[i] = i;
17999
18000 if (!d->testing_p)
18001 dfirst.target = gen_reg_rtx (dfirst.vmode);
18002
18003 start_sequence ();
18004 ok = expand_vec_perm_1 (&dfirst);
18005 seq = get_insns ();
18006 end_sequence ();
18007
18008 if (!ok)
18009 return false;
18010
18011 if (d->testing_p)
18012 return true;
18013
18014 emit_insn (seq);
18015
18016 dsecond = *d;
18017 dsecond.op0 = dfirst.target;
18018 dsecond.op1 = dfirst.target;
18019 dsecond.one_operand_p = true;
18020 dsecond.target = gen_reg_rtx (dsecond.vmode);
18021 for (i = 0; i < nelt; i++)
18022 dsecond.perm[i] = i ^ nelt2;
18023
18024 ok = expand_vec_perm_1 (&dsecond);
18025 gcc_assert (ok);
18026
18027 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18028 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18029 return true;
18030}
18031
4bf4c103 18032/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18033 permutation using two vperm2f128, followed by a vshufpd insn blending
18034 the two vectors together. */
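 /* For illustration: for the V4DFmode permutation { 2, 5, 0, 7 },
 dfirst becomes the lane swap { 2, 3, 0, 1 } of op0, dsecond is
 simply op1 (elements { 4, 5, 6, 7 }), and the final vshufpd uses
 { 0, 5, 2, 7 } to pick alternately from the two intermediate
 results. */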
18035
18036static bool
18037expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18038{
18039 struct expand_vec_perm_d dfirst, dsecond, dthird;
18040 bool ok;
18041
18042 if (!TARGET_AVX || (d->vmode != V4DFmode))
18043 return false;
18044
18045 if (d->testing_p)
18046 return true;
18047
18048 dfirst = *d;
18049 dsecond = *d;
18050 dthird = *d;
18051
18052 dfirst.perm[0] = (d->perm[0] & ~1);
18053 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18054 dfirst.perm[2] = (d->perm[2] & ~1);
18055 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18056 dsecond.perm[0] = (d->perm[1] & ~1);
18057 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18058 dsecond.perm[2] = (d->perm[3] & ~1);
18059 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18060 dthird.perm[0] = (d->perm[0] % 2);
18061 dthird.perm[1] = (d->perm[1] % 2) + 4;
18062 dthird.perm[2] = (d->perm[2] % 2) + 2;
18063 dthird.perm[3] = (d->perm[3] % 2) + 6;
18064
18065 dfirst.target = gen_reg_rtx (dfirst.vmode);
18066 dsecond.target = gen_reg_rtx (dsecond.vmode);
18067 dthird.op0 = dfirst.target;
18068 dthird.op1 = dsecond.target;
18069 dthird.one_operand_p = false;
18070
18071 canonicalize_perm (&dfirst);
18072 canonicalize_perm (&dsecond);
18073
18074 ok = expand_vec_perm_1 (&dfirst)
18075 && expand_vec_perm_1 (&dsecond)
18076 && expand_vec_perm_1 (&dthird);
18077
18078 gcc_assert (ok);
18079
18080 return true;
18081}
18082
18083static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18084
18085/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18086 a two vector permutation using two intra-lane vector
18087 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18088 the non-swapped and swapped vectors together. */
18089
18090static bool
18091expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18092{
18093 struct expand_vec_perm_d dfirst, dsecond, dthird;
18094 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18095 rtx_insn *seq1, *seq2;
18096 bool ok;
18097 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18098
18099 if (!TARGET_AVX
18100 || TARGET_AVX2
18101 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18102 || d->one_operand_p)
18103 return false;
18104
18105 dfirst = *d;
18106 dsecond = *d;
18107 for (i = 0; i < nelt; i++)
18108 {
18109 dfirst.perm[i] = 0xff;
18110 dsecond.perm[i] = 0xff;
18111 }
18112 for (i = 0, msk = 0; i < nelt; i++)
18113 {
18114 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18115 if (j == i)
18116 {
18117 dfirst.perm[j] = d->perm[i];
18118 which1 |= (d->perm[i] < nelt ? 1 : 2);
18119 }
18120 else
18121 {
18122 dsecond.perm[j] = d->perm[i];
18123 which2 |= (d->perm[i] < nelt ? 1 : 2);
18124 msk |= (1U << i);
18125 }
18126 }
18127 if (msk == 0 || msk == (1U << nelt) - 1)
18128 return false;
18129
18130 if (!d->testing_p)
18131 {
18132 dfirst.target = gen_reg_rtx (dfirst.vmode);
18133 dsecond.target = gen_reg_rtx (dsecond.vmode);
18134 }
18135
18136 for (i = 0; i < nelt; i++)
18137 {
18138 if (dfirst.perm[i] == 0xff)
18139 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18140 if (dsecond.perm[i] == 0xff)
18141 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18142 }
18143 canonicalize_perm (&dfirst);
18144 start_sequence ();
18145 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18146 seq1 = get_insns ();
18147 end_sequence ();
18148
18149 if (!ok)
18150 return false;
18151
18152 canonicalize_perm (&dsecond);
18153 start_sequence ();
18154 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18155 seq2 = get_insns ();
18156 end_sequence ();
18157
18158 if (!ok)
18159 return false;
18160
18161 if (d->testing_p)
18162 return true;
18163
18164 emit_insn (seq1);
18165 emit_insn (seq2);
18166
18167 dthird = *d;
18168 dthird.op0 = dsecond.target;
18169 dthird.op1 = dsecond.target;
18170 dthird.one_operand_p = true;
18171 dthird.target = gen_reg_rtx (dthird.vmode);
18172 for (i = 0; i < nelt; i++)
18173 dthird.perm[i] = i ^ nelt2;
18174
18175 ok = expand_vec_perm_1 (&dthird);
18176 gcc_assert (ok);
18177
18178 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18179 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18180 return true;
18181}
18182
18183/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18184 permutation with two pshufb insns and an ior. We should have already
18185 failed all two instruction sequences. */
18186
18187static bool
18188expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18189{
18190 rtx rperm[2][16], vperm, l, h, op, m128;
18191 unsigned int i, nelt, eltsz;
18192
18193 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18194 return false;
18195 gcc_assert (!d->one_operand_p);
18196
18197 if (d->testing_p)
18198 return true;
18199
18200 nelt = d->nelt;
18201 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18202
18203 /* Generate two permutation masks. If the required element is within
18204 the given vector it is shuffled into the proper lane. If the required
18205 element is in the other vector, force a zero into the lane by setting
18206 bit 7 in the permutation mask. */
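 /* For illustration: for the even extraction { 0, 2, 4, 6, 8, 10,
 12, 14 } of two V8HImode operands (eltsz == 2), the mask applied
 to op0 is { 0, 1, 4, 5, 8, 9, 12, 13, -128 x 8 } and the mask
 applied to op1 is { -128 x 8, 0, 1, 4, 5, 8, 9, 12, 13 }, so each
 pshufb fills one half of the result and the ior merges them. */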
18207 m128 = GEN_INT (-128);
18208 for (i = 0; i < nelt; ++i)
18209 {
18210 unsigned j, e = d->perm[i];
18211 unsigned which = (e >= nelt);
18212 if (e >= nelt)
18213 e -= nelt;
18214
18215 for (j = 0; j < eltsz; ++j)
18216 {
18217 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18218 rperm[1-which][i*eltsz + j] = m128;
18219 }
18220 }
18221
18222 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18223 vperm = force_reg (V16QImode, vperm);
18224
18225 l = gen_reg_rtx (V16QImode);
18226 op = gen_lowpart (V16QImode, d->op0);
18227 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18228
18229 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18230 vperm = force_reg (V16QImode, vperm);
18231
18232 h = gen_reg_rtx (V16QImode);
18233 op = gen_lowpart (V16QImode, d->op1);
18234 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18235
18236 op = d->target;
18237 if (d->vmode != V16QImode)
18238 op = gen_reg_rtx (V16QImode);
18239 emit_insn (gen_iorv16qi3 (op, l, h));
18240 if (op != d->target)
18241 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18242
18243 return true;
18244}
18245
18246/* Implement an arbitrary permutation of one V32QImode or V16HImode operand
18247 with two vpshufb insns, vpermq and vpor. We should have already failed
18248 all two or three instruction sequences. */
18249
18250static bool
18251expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18252{
18253 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18254 unsigned int i, nelt, eltsz;
18255
18256 if (!TARGET_AVX2
18257 || !d->one_operand_p
18258 || (d->vmode != V32QImode && d->vmode != V16HImode))
18259 return false;
18260
18261 if (d->testing_p)
18262 return true;
18263
18264 nelt = d->nelt;
18265 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18266
18267 /* Generate two permutation masks. If the required element is within
18268 the same lane, it is shuffled in. If the required element is from the
18269 other lane, force a zero by setting bit 7 in the permutation mask.
18270 The other mask has non-negative elements where the element
18271 is requested from the other lane, but it is also moved to the other lane,
18272 so that the result of vpshufb can have its two V2TImode halves
18273 swapped. */
18274 m128 = GEN_INT (-128);
18275 for (i = 0; i < nelt; ++i)
18276 {
18277 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18278 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18279
18280 for (j = 0; j < eltsz; ++j)
18281 {
18282 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18283 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18284 }
18285 }
18286
18287 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18288 vperm = force_reg (V32QImode, vperm);
18289
18290 h = gen_reg_rtx (V32QImode);
18291 op = gen_lowpart (V32QImode, d->op0);
18292 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18293
18294 /* Swap the 128-bit lanes of h into hp. */
18295 hp = gen_reg_rtx (V4DImode);
18296 op = gen_lowpart (V4DImode, h);
18297 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18298 const1_rtx));
18299
18300 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18301 vperm = force_reg (V32QImode, vperm);
18302
18303 l = gen_reg_rtx (V32QImode);
18304 op = gen_lowpart (V32QImode, d->op0);
18305 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18306
18307 op = d->target;
18308 if (d->vmode != V32QImode)
18309 op = gen_reg_rtx (V32QImode);
18310 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18311 if (op != d->target)
18312 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18313
18314 return true;
18315}
18316
18317/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18318 and extract-odd permutations of two V32QImode or V16HImode operands
18319 with two vpshufb insns, vpor and vpermq. We should have already
18320 failed all two or three instruction sequences. */
18321
18322static bool
18323expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18324{
18325 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18326 unsigned int i, nelt, eltsz;
18327
18328 if (!TARGET_AVX2
18329 || d->one_operand_p
18330 || (d->vmode != V32QImode && d->vmode != V16HImode))
18331 return false;
18332
18333 for (i = 0; i < d->nelt; ++i)
18334 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18335 return false;
18336
18337 if (d->testing_p)
18338 return true;
18339
18340 nelt = d->nelt;
18341 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18342
18343 /* Generate two permutation masks. In the first permutation mask
18344 the first quarter will contain indexes for the first half
18345 of op0, the second quarter will contain bit 7 set, the third quarter
18346 will contain indexes for the second half of op0 and the
18347 last quarter bit 7 set. In the second permutation mask
18348 the first quarter will contain bit 7 set, the second quarter
18349 indexes for the first half of op1, the third quarter bit 7 set
18350 and the last quarter indexes for the second half of op1.
18351 I.e. the first mask e.g. for V32QImode extract even will be:
18352 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18353 (all values masked with 0xf except for -128) and second mask
18354 for extract even will be
18355 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18356 m128 = GEN_INT (-128);
18357 for (i = 0; i < nelt; ++i)
18358 {
18359 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18360 unsigned which = d->perm[i] >= nelt;
18361 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18362
18363 for (j = 0; j < eltsz; ++j)
18364 {
18365 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18366 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18367 }
18368 }
18369
18370 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18371 vperm = force_reg (V32QImode, vperm);
18372
18373 l = gen_reg_rtx (V32QImode);
18374 op = gen_lowpart (V32QImode, d->op0);
18375 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18376
18377 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18378 vperm = force_reg (V32QImode, vperm);
18379
18380 h = gen_reg_rtx (V32QImode);
18381 op = gen_lowpart (V32QImode, d->op1);
18382 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18383
18384 ior = gen_reg_rtx (V32QImode);
18385 emit_insn (gen_iorv32qi3 (ior, l, h));
18386
18387 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18388 op = gen_reg_rtx (V4DImode);
18389 ior = gen_lowpart (V4DImode, ior);
18390 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18391 const1_rtx, GEN_INT (3)));
18392 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18393
18394 return true;
18395}
18396
18397/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18398 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18399 with two "and" and "pack" or two "shift" and "pack" insns. We should
18400 have already failed all two instruction sequences. */
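 /* For illustration: extracting the even elements of two V16QImode
 vectors views both operands as V8HImode, masks each with 0x00ff to
 clear the odd bytes and packs them with packuswb; for the odd
 elements the mask is replaced by a logical shift right by 8. */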
18401
18402static bool
18403expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18404{
18405 rtx op, dop0, dop1, t;
18406 unsigned i, odd, c, s, nelt = d->nelt;
18407 bool end_perm = false;
18408 machine_mode half_mode;
18409 rtx (*gen_and) (rtx, rtx, rtx);
18410 rtx (*gen_pack) (rtx, rtx, rtx);
18411 rtx (*gen_shift) (rtx, rtx, rtx);
18412
18413 if (d->one_operand_p)
18414 return false;
18415
18416 switch (d->vmode)
18417 {
18418 case E_V8HImode:
18419 /* Required for "pack". */
18420 if (!TARGET_SSE4_1)
18421 return false;
18422 c = 0xffff;
18423 s = 16;
18424 half_mode = V4SImode;
18425 gen_and = gen_andv4si3;
18426 gen_pack = gen_sse4_1_packusdw;
18427 gen_shift = gen_lshrv4si3;
18428 break;
18429 case E_V16QImode:
18430 /* No check as all instructions are SSE2. */
18431 c = 0xff;
18432 s = 8;
18433 half_mode = V8HImode;
18434 gen_and = gen_andv8hi3;
18435 gen_pack = gen_sse2_packuswb;
18436 gen_shift = gen_lshrv8hi3;
18437 break;
18438 case E_V16HImode:
18439 if (!TARGET_AVX2)
18440 return false;
18441 c = 0xffff;
18442 s = 16;
18443 half_mode = V8SImode;
18444 gen_and = gen_andv8si3;
18445 gen_pack = gen_avx2_packusdw;
18446 gen_shift = gen_lshrv8si3;
18447 end_perm = true;
18448 break;
18449 case E_V32QImode:
18450 if (!TARGET_AVX2)
18451 return false;
18452 c = 0xff;
18453 s = 8;
18454 half_mode = V16HImode;
18455 gen_and = gen_andv16hi3;
18456 gen_pack = gen_avx2_packuswb;
18457 gen_shift = gen_lshrv16hi3;
18458 end_perm = true;
18459 break;
18460 default:
18461 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18462 general shuffles. */
18463 return false;
18464 }
18465
18466 /* Check that permutation is even or odd. */
18467 odd = d->perm[0];
18468 if (odd > 1)
18469 return false;
18470
18471 for (i = 1; i < nelt; ++i)
18472 if (d->perm[i] != 2 * i + odd)
18473 return false;
18474
18475 if (d->testing_p)
18476 return true;
18477
18478 dop0 = gen_reg_rtx (half_mode);
18479 dop1 = gen_reg_rtx (half_mode);
18480 if (odd == 0)
18481 {
18482 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18483 t = force_reg (half_mode, t);
18484 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18485 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18486 }
18487 else
18488 {
18489 emit_insn (gen_shift (dop0,
18490 gen_lowpart (half_mode, d->op0),
18491 GEN_INT (s)));
18492 emit_insn (gen_shift (dop1,
18493 gen_lowpart (half_mode, d->op1),
18494 GEN_INT (s)));
18495 }
18496 /* In the AVX2 256-bit case we need to permute the pack result. */
18497 if (TARGET_AVX2 && end_perm)
18498 {
18499 op = gen_reg_rtx (d->vmode);
18500 t = gen_reg_rtx (V4DImode);
18501 emit_insn (gen_pack (op, dop0, dop1));
18502 emit_insn (gen_avx2_permv4di_1 (t,
18503 gen_lowpart (V4DImode, op),
18504 const0_rtx,
18505 const2_rtx,
18506 const1_rtx,
18507 GEN_INT (3)));
18508 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18509 }
18510 else
18511 emit_insn (gen_pack (d->target, dop0, dop1));
18512
18513 return true;
18514}
18515
18516/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18517 and extract-odd permutations of two V64QI operands
18518 with two "shift", two "trunc" and one "concat" insns for "odd"
18519 and two "trunc" and one "concat" insn for "even".
18520 We should have already failed all two instruction sequences. */
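 /* For illustration: the even V64QImode extraction views both
 operands as V32HImode and truncates them to V32QImode (vpmovwb
 keeps the low byte of each word) before concatenating; the odd
 extraction first shifts each 16-bit element right by 8. */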
18521
18522static bool
18523expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18524{
18525 rtx t1, t2, t3, t4;
18526 unsigned i, odd, nelt = d->nelt;
18527
18528 if (!TARGET_AVX512BW
18529 || d->one_operand_p
18530 || d->vmode != V64QImode)
18531 return false;
18532
18533 /* Check that permutation is even or odd. */
18534 odd = d->perm[0];
18535 if (odd > 1)
18536 return false;
18537
18538 for (i = 1; i < nelt; ++i)
18539 if (d->perm[i] != 2 * i + odd)
18540 return false;
18541
18542 if (d->testing_p)
18543 return true;
18544
18545
18546 if (odd)
18547 {
18548 t1 = gen_reg_rtx (V32HImode);
18549 t2 = gen_reg_rtx (V32HImode);
18550 emit_insn (gen_lshrv32hi3 (t1,
18551 gen_lowpart (V32HImode, d->op0),
18552 GEN_INT (8)));
18553 emit_insn (gen_lshrv32hi3 (t2,
18554 gen_lowpart (V32HImode, d->op1),
18555 GEN_INT (8)));
18556 }
18557 else
18558 {
18559 t1 = gen_lowpart (V32HImode, d->op0);
18560 t2 = gen_lowpart (V32HImode, d->op1);
18561 }
18562
18563 t3 = gen_reg_rtx (V32QImode);
18564 t4 = gen_reg_rtx (V32QImode);
18565 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18566 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18567 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18568
18569 return true;
18570}
18571
4bf4c103 18572/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
18573 and extract-odd permutations. */
18574
18575static bool
18576expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18577{
18578 rtx t1, t2, t3, t4, t5;
18579
18580 switch (d->vmode)
18581 {
18582 case E_V4DFmode:
18583 if (d->testing_p)
18584 break;
18585 t1 = gen_reg_rtx (V4DFmode);
18586 t2 = gen_reg_rtx (V4DFmode);
18587
18588 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18589 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18590 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18591
18592 /* Now an unpck[lh]pd will produce the result required. */
18593 if (odd)
18594 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18595 else
18596 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18597 emit_insn (t3);
18598 break;
18599
18600 case E_V8SFmode:
18601 {
18602 int mask = odd ? 0xdd : 0x88;
18603
18604 if (d->testing_p)
18605 break;
18606 t1 = gen_reg_rtx (V8SFmode);
18607 t2 = gen_reg_rtx (V8SFmode);
18608 t3 = gen_reg_rtx (V8SFmode);
18609
18610 /* Shuffle within the 128-bit lanes to produce:
18611 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18612 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18613 GEN_INT (mask)));
18614
18615 /* Shuffle the lanes around to produce:
18616 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18617 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18618 GEN_INT (0x3)));
18619
18620 /* Shuffle within the 128-bit lanes to produce:
18621 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18622 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18623
18624 /* Shuffle within the 128-bit lanes to produce:
18625 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18626 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18627
18628 /* Shuffle the lanes around to produce:
18629 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18630 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18631 GEN_INT (0x20)));
18632 }
18633 break;
18634
18635 case E_V2DFmode:
18636 case E_V4SFmode:
18637 case E_V2DImode:
9b8579a6 18638 case E_V2SImode:
18639 case E_V4SImode:
18640 /* These are always directly implementable by expand_vec_perm_1. */
18641 gcc_unreachable ();
18642
18643 case E_V2SFmode:
18644 gcc_assert (TARGET_MMX_WITH_SSE);
18645 /* We have no suitable instructions. */
18646 if (d->testing_p)
18647 return false;
18648 break;
18649
18650 case E_V4HImode:
18651 if (d->testing_p)
18652 break;
18653 /* We need 2*log2(N)-1 operations to achieve odd/even
18654 with interleave. */
18655 t1 = gen_reg_rtx (V4HImode);
18656 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
18657 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
18658 if (odd)
18659 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
18660 else
18661 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
18662 emit_insn (t2);
18663 break;
18664
18665 case E_V8HImode:
18666 if (TARGET_SSE4_1)
18667 return expand_vec_perm_even_odd_pack (d);
18668 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18669 return expand_vec_perm_pshufb2 (d);
18670 else
18671 {
18672 if (d->testing_p)
18673 break;
18674 /* We need 2*log2(N)-1 operations to achieve odd/even
18675 with interleave. */
18676 t1 = gen_reg_rtx (V8HImode);
18677 t2 = gen_reg_rtx (V8HImode);
18678 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18679 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18680 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18681 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18682 if (odd)
18683 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18684 else
18685 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18686 emit_insn (t3);
18687 }
18688 break;
18689
18690 case E_V16QImode:
18691 return expand_vec_perm_even_odd_pack (d);
18692
18693 case E_V16HImode:
18694 case E_V32QImode:
18695 return expand_vec_perm_even_odd_pack (d);
18696
18697 case E_V64QImode:
18698 return expand_vec_perm_even_odd_trunc (d);
18699
18700 case E_V4DImode:
18701 if (!TARGET_AVX2)
18702 {
18703 struct expand_vec_perm_d d_copy = *d;
18704 d_copy.vmode = V4DFmode;
18705 if (d->testing_p)
18706 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18707 else
18708 d_copy.target = gen_reg_rtx (V4DFmode);
18709 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18710 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18711 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18712 {
18713 if (!d->testing_p)
18714 emit_move_insn (d->target,
18715 gen_lowpart (V4DImode, d_copy.target));
18716 return true;
18717 }
18718 return false;
18719 }
18720
18721 if (d->testing_p)
18722 break;
18723
18724 t1 = gen_reg_rtx (V4DImode);
18725 t2 = gen_reg_rtx (V4DImode);
18726
18727 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18728 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18729 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18730
18731 /* Now a vpunpck[lh]qdq will produce the required result. */
18732 if (odd)
18733 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18734 else
18735 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18736 emit_insn (t3);
18737 break;
18738
18739 case E_V8SImode:
18740 if (!TARGET_AVX2)
18741 {
18742 struct expand_vec_perm_d d_copy = *d;
18743 d_copy.vmode = V8SFmode;
18744 if (d->testing_p)
18745 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18746 else
18747 d_copy.target = gen_reg_rtx (V8SFmode);
18748 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18749 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18750 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18751 {
18752 if (!d->testing_p)
18753 emit_move_insn (d->target,
18754 gen_lowpart (V8SImode, d_copy.target));
18755 return true;
18756 }
18757 return false;
18758 }
18759
18760 if (d->testing_p)
18761 break;
18762
18763 t1 = gen_reg_rtx (V8SImode);
18764 t2 = gen_reg_rtx (V8SImode);
18765 t3 = gen_reg_rtx (V4DImode);
18766 t4 = gen_reg_rtx (V4DImode);
18767 t5 = gen_reg_rtx (V4DImode);
18768
18769 /* Shuffle the lanes around into
18770 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18771 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18772 gen_lowpart (V4DImode, d->op1),
18773 GEN_INT (0x20)));
18774 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18775 gen_lowpart (V4DImode, d->op1),
18776 GEN_INT (0x31)));
18777
18778 /* Swap the 2nd and 3rd position in each lane into
18779 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18780 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18781 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18782 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18783 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
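/* The pshufd selector 2*4 + 1*16 + 3*64 == 0xd8 encodes, two bits per
   result element, the source positions { 0 2 1 3 } within each 128-bit
   lane, i.e. exactly the swap of the middle two doublewords described
   above.  */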
18784
18785 /* Now a vpunpck[lh]qdq will produce
18786 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18787 if (odd)
18788 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18789 gen_lowpart (V4DImode, t2));
18790 else
18791 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18792 gen_lowpart (V4DImode, t2));
18793 emit_insn (t3);
18794 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18795 break;
18796
18797 default:
18798 gcc_unreachable ();
18799 }
18800
18801 return true;
18802}
18803
4bf4c103 18804/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18805 extract-even and extract-odd permutations. */
18806
18807static bool
18808expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18809{
18810 unsigned i, odd, nelt = d->nelt;
18811
18812 odd = d->perm[0];
18813 if (odd != 0 && odd != 1)
18814 return false;
18815
18816 for (i = 1; i < nelt; ++i)
18817 if (d->perm[i] != 2 * i + odd)
18818 return false;
18819
18820 return expand_vec_perm_even_odd_1 (d, odd);
18821}
18822
4bf4c103 18823/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
18824 permutations. We assume that expand_vec_perm_1 has already failed. */
18825
18826static bool
18827expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18828{
18829 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18830 machine_mode vmode = d->vmode;
18831 unsigned char perm2[4];
18832 rtx op0 = d->op0, dest;
18833 bool ok;
18834
18835 switch (vmode)
18836 {
18837 case E_V4DFmode:
18838 case E_V8SFmode:
18839 /* These are special-cased in sse.md so that we can optionally
18840 use the vbroadcast instruction. They expand to two insns
18841 if the input happens to be in a register. */
18842 gcc_unreachable ();
18843
18844 case E_V2DFmode:
240198fe 18845 case E_V2SFmode:
2bf6d935 18846 case E_V4SFmode:
240198fe 18847 case E_V2DImode:
9b8579a6 18848 case E_V2SImode:
18849 case E_V4SImode:
18850 /* These are always implementable using standard shuffle patterns. */
18851 gcc_unreachable ();
18852
18853 case E_V8HImode:
18854 case E_V16QImode:
18855 /* These can be implemented via interleave. We save one insn by
18856 stopping once we have promoted to V4SImode and then using pshufd. */
18857 if (d->testing_p)
18858 return true;
18859 do
18860 {
18861 rtx dest;
18862 rtx (*gen) (rtx, rtx, rtx)
18863 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18864 : gen_vec_interleave_lowv8hi;
18865
18866 if (elt >= nelt2)
18867 {
18868 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18869 : gen_vec_interleave_highv8hi;
18870 elt -= nelt2;
18871 }
18872 nelt2 /= 2;
18873
18874 dest = gen_reg_rtx (vmode);
18875 emit_insn (gen (dest, op0, op0));
18876 vmode = get_mode_wider_vector (vmode);
18877 op0 = gen_lowpart (vmode, dest);
18878 }
18879 while (vmode != V4SImode);
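/* For instance, to broadcast element 5 of a V8HImode vector
   { h0 ... h7 }: elt == 5 >= nelt2 == 4 selects the high interleave,
   which produces { h4 h4 h5 h5 h6 h6 h7 h7 } and leaves elt == 1;
   viewed as V4SImode the wanted h5:h5 pair now sits in element 1, and
   the pshufd below replicates that element across the vector.  */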
18880
18881 memset (perm2, elt, 4);
18882 dest = gen_reg_rtx (V4SImode);
18883 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18884 gcc_assert (ok);
18885 if (!d->testing_p)
18886 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18887 return true;
18888
18889 case E_V64QImode:
18890 case E_V32QImode:
18891 case E_V16HImode:
18892 case E_V8SImode:
18893 case E_V4DImode:
18894 /* For AVX2 broadcasts of the first element vpbroadcast* or
18895 vpermq should be used by expand_vec_perm_1. */
18896 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18897 return false;
18898
18899 default:
18900 gcc_unreachable ();
18901 }
18902}
18903
4bf4c103 18904/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
18905 broadcast permutations. */
18906
18907static bool
18908expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18909{
18910 unsigned i, elt, nelt = d->nelt;
18911
18912 if (!d->one_operand_p)
18913 return false;
18914
18915 elt = d->perm[0];
18916 for (i = 1; i < nelt; ++i)
18917 if (d->perm[i] != elt)
18918 return false;
18919
18920 return expand_vec_perm_broadcast_1 (d);
18921}
18922
18923/* Implement arbitrary permutations of two V64QImode operands
18924 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18925static bool
18926expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18927{
18928 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
18929 return false;
18930
18931 if (d->testing_p)
18932 return true;
18933
18934 struct expand_vec_perm_d ds[2];
18935 rtx rperm[128], vperm, target0, target1;
18936 unsigned int i, nelt;
18937 machine_mode vmode;
18938
18939 nelt = d->nelt;
18940 vmode = V64QImode;
18941
18942 for (i = 0; i < 2; i++)
18943 {
18944 ds[i] = *d;
18945 ds[i].vmode = V32HImode;
18946 ds[i].nelt = 32;
18947 ds[i].target = gen_reg_rtx (V32HImode);
18948 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
18949 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
18950 }
18951
18952 /* Prepare permutations such that the first one takes care of
18953 putting the even bytes into the right positions or one
18954 position higher (ds[0]) and the second one takes care of
18955 putting the odd bytes into the right positions or one
18956 position lower (ds[1]). */
18957
18958 for (i = 0; i < nelt; i++)
18959 {
18960 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
18961 if (i & 1)
18962 {
18963 rperm[i] = constm1_rtx;
18964 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18965 }
18966 else
18967 {
18968 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18969 rperm[i + 64] = constm1_rtx;
18970 }
18971 }
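/* For example, if d->perm[5] == 23 (an odd destination byte asking for
   the high byte of word 11 of op0), then ds[1].perm[2] == 11 moves that
   word into word slot 2, the second pshufb mask selects lane offset
   (5 & 14) + (23 & 1) == 5, i.e. the high byte of that slot, and the
   first mask zeroes byte 5, so the final vpor keeps exactly one
   contribution per byte.  */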
18972
18973 bool ok = expand_vec_perm_1 (&ds[0]);
18974 gcc_assert (ok);
18975 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
18976
18977 ok = expand_vec_perm_1 (&ds[1]);
18978 gcc_assert (ok);
18979 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
18980
18981 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
18982 vperm = force_reg (vmode, vperm);
18983 target0 = gen_reg_rtx (V64QImode);
18984 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
18985
18986 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
18987 vperm = force_reg (vmode, vperm);
18988 target1 = gen_reg_rtx (V64QImode);
18989 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
18990
18991 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
18992 return true;
18993}
18994
18995 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
18996 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
18997 all the shorter instruction sequences. */
18998
18999static bool
19000expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19001{
19002 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19003 unsigned int i, nelt, eltsz;
19004 bool used[4];
19005
19006 if (!TARGET_AVX2
19007 || d->one_operand_p
19008 || (d->vmode != V32QImode && d->vmode != V16HImode))
19009 return false;
19010
19011 if (d->testing_p)
19012 return true;
19013
19014 nelt = d->nelt;
19015 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19016
19017 /* Generate 4 permutation masks. If the required element is within
19018 the same lane, it is shuffled in. If the required element is from
19019 the other lane, force a zero by setting bit 7 in the permutation mask.
19020 The other mask has a non-negative element whenever the element is
19021 requested from the other lane, but that element is also moved to the
19022 other lane, so that the result of vpshufb can have the two V2TImode
19023 halves swapped. */
19024 m128 = GEN_INT (-128);
19025 for (i = 0; i < 32; ++i)
19026 {
19027 rperm[0][i] = m128;
19028 rperm[1][i] = m128;
19029 rperm[2][i] = m128;
19030 rperm[3][i] = m128;
19031 }
19032 used[0] = false;
19033 used[1] = false;
19034 used[2] = false;
19035 used[3] = false;
19036 for (i = 0; i < nelt; ++i)
19037 {
19038 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19039 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19040 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19041
19042 for (j = 0; j < eltsz; ++j)
19043 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19044 used[which] = true;
19045 }
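/* As an example for V32QImode: a request for element 50 at position 3
   has e == (50 & 15) == 2, comes from op1 (50 & 32 is set) and crosses
   lanes since (50 ^ 3) & 16 is set, so it goes into mask 3; the byte
   index 2 is stored at position 19, which the 128-bit lane swap below
   moves back to position 3.  */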
19046
19047 for (i = 0; i < 2; ++i)
19048 {
19049 if (!used[2 * i + 1])
19050 {
19051 h[i] = NULL_RTX;
19052 continue;
19053 }
19054 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19055 gen_rtvec_v (32, rperm[2 * i + 1]));
19056 vperm = force_reg (V32QImode, vperm);
19057 h[i] = gen_reg_rtx (V32QImode);
19058 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19059 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19060 }
19061
19062 /* Swap the 128-bit lanes of h[X]. */
19063 for (i = 0; i < 2; ++i)
19064 {
19065 if (h[i] == NULL_RTX)
19066 continue;
19067 op = gen_reg_rtx (V4DImode);
19068 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19069 const2_rtx, GEN_INT (3), const0_rtx,
19070 const1_rtx));
19071 h[i] = gen_lowpart (V32QImode, op);
19072 }
19073
19074 for (i = 0; i < 2; ++i)
19075 {
19076 if (!used[2 * i])
19077 {
19078 l[i] = NULL_RTX;
19079 continue;
19080 }
19081 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19082 vperm = force_reg (V32QImode, vperm);
19083 l[i] = gen_reg_rtx (V32QImode);
19084 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19085 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19086 }
19087
19088 for (i = 0; i < 2; ++i)
19089 {
19090 if (h[i] && l[i])
19091 {
19092 op = gen_reg_rtx (V32QImode);
19093 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19094 l[i] = op;
19095 }
19096 else if (h[i])
19097 l[i] = h[i];
19098 }
19099
19100 gcc_assert (l[0] && l[1]);
19101 op = d->target;
19102 if (d->vmode != V32QImode)
19103 op = gen_reg_rtx (V32QImode);
19104 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19105 if (op != d->target)
19106 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19107 return true;
19108}
19109
19110/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19111 taken care of, perform the expansion in D and return true on success. */
19112
19113static bool
19114ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19115{
19116 /* Try a single instruction expansion. */
19117 if (expand_vec_perm_1 (d))
19118 return true;
19119
19120 /* Try sequences of two instructions. */
19121
19122 if (expand_vec_perm_pshuflw_pshufhw (d))
19123 return true;
19124
19125 if (expand_vec_perm_palignr (d, false))
19126 return true;
19127
19128 if (expand_vec_perm_interleave2 (d))
19129 return true;
19130
19131 if (expand_vec_perm_broadcast (d))
19132 return true;
19133
19134 if (expand_vec_perm_vpermq_perm_1 (d))
19135 return true;
19136
19137 if (expand_vec_perm_vperm2f128 (d))
19138 return true;
19139
19140 if (expand_vec_perm_pblendv (d))
19141 return true;
19142
19143 /* Try sequences of three instructions. */
19144
19145 if (expand_vec_perm_even_odd_pack (d))
19146 return true;
19147
19148 if (expand_vec_perm_2vperm2f128_vshuf (d))
19149 return true;
19150
19151 if (expand_vec_perm_pshufb2 (d))
19152 return true;
19153
19154 if (expand_vec_perm_interleave3 (d))
19155 return true;
19156
19157 if (expand_vec_perm_vperm2f128_vblend (d))
19158 return true;
19159
19160 /* Try sequences of four instructions. */
19161
19162 if (expand_vec_perm_even_odd_trunc (d))
19163 return true;
19164 if (expand_vec_perm_vpshufb2_vpermq (d))
19165 return true;
19166
19167 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19168 return true;
19169
19170 if (expand_vec_perm_vpermt2_vpshub2 (d))
19171 return true;
19172
19173 /* ??? Look for narrow permutations whose element orderings would
19174 allow the promotion to a wider mode. */
19175
19176 /* ??? Look for sequences of interleave or a wider permute that place
19177 the data into the correct lanes for a half-vector shuffle like
19178 pshuf[lh]w or vpermilps. */
19179
19180 /* ??? Look for sequences of interleave that produce the desired results.
19181 The combinatorics of punpck[lh] get pretty ugly... */
19182
19183 if (expand_vec_perm_even_odd (d))
19184 return true;
19185
19186 /* Even longer sequences. */
19187 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19188 return true;
19189
19190 /* See if we can get the same permutation in different vector integer
19191 mode. */
19192 struct expand_vec_perm_d nd;
19193 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19194 {
19195 if (!d->testing_p)
19196 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19197 return true;
19198 }
19199
19200 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19201 if (expand_vec_perm2_vperm2f128_vblend (d))
19202 return true;
19203
19204 return false;
19205}
19206
19207/* If a permutation only uses one operand, make it clear. Returns true
19208 if the permutation references both operands. */
19209
19210static bool
19211canonicalize_perm (struct expand_vec_perm_d *d)
19212{
19213 int i, which, nelt = d->nelt;
19214
19215 for (i = which = 0; i < nelt; ++i)
4bf4c103 19216 which |= (d->perm[i] < nelt ? 1 : 2);
19217
19218 d->one_operand_p = true;
19219 switch (which)
19220 {
19221 default:
19222 gcc_unreachable();
19223
19224 case 3:
19225 if (!rtx_equal_p (d->op0, d->op1))
19226 {
19227 d->one_operand_p = false;
19228 break;
19229 }
19230 /* The elements of PERM do not suggest that only the first operand
19231 is used, but both operands are identical. Allow easier matching
19232 of the permutation by folding the permutation into the single
19233 input vector. */
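/* For instance, with nelt == 4 and d->op0 equal to d->op1, a selector
   such as { 0 6 1 7 } references both inputs, but the fall-through
   masking below folds it to the one-input permutation { 0 2 1 3 }.  */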
19234 /* FALLTHRU */
19235
19236 case 2:
19237 for (i = 0; i < nelt; ++i)
19238 d->perm[i] &= nelt - 1;
19239 d->op0 = d->op1;
19240 break;
19241
19242 case 1:
19243 d->op1 = d->op0;
19244 break;
19245 }
19246
19247 return (which == 3);
19248}
19249
19250/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19251
19252bool
19253ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19254 rtx op1, const vec_perm_indices &sel)
19255{
19256 struct expand_vec_perm_d d;
19257 unsigned char perm[MAX_VECT_LEN];
19258 unsigned int i, nelt, which;
19259 bool two_args;
19260
19261 d.target = target;
19262 d.op0 = op0;
19263 d.op1 = op1;
19264
19265 d.vmode = vmode;
19266 gcc_assert (VECTOR_MODE_P (d.vmode));
19267 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19268 d.testing_p = !target;
19269
19270 gcc_assert (sel.length () == nelt);
19271 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19272
19273 /* Given sufficient ISA support we can just return true here
19274 for selected vector modes. */
19275 switch (d.vmode)
19276 {
19277 case E_V16SFmode:
19278 case E_V16SImode:
19279 case E_V8DImode:
19280 case E_V8DFmode:
19281 if (!TARGET_AVX512F)
19282 return false;
19283 /* All implementable with a single vperm[it]2 insn. */
19284 if (d.testing_p)
19285 return true;
19286 break;
19287 case E_V32HImode:
19288 if (!TARGET_AVX512BW)
19289 return false;
19290 if (d.testing_p)
19291 /* All implementable with a single vperm[it]2 insn. */
19292 return true;
19293 break;
19294 case E_V64QImode:
19295 if (!TARGET_AVX512BW)
19296 return false;
19297 if (d.testing_p)
19298 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19299 return true;
19300 break;
19301 case E_V8SImode:
19302 case E_V8SFmode:
19303 case E_V4DFmode:
19304 case E_V4DImode:
19305 if (!TARGET_AVX)
19306 return false;
19307 if (d.testing_p && TARGET_AVX512VL)
19308 /* All implementable with a single vperm[it]2 insn. */
19309 return true;
19310 break;
19311 case E_V16HImode:
19312 if (!TARGET_SSE2)
19313 return false;
19314 if (d.testing_p && TARGET_AVX2)
19315 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19316 return true;
19317 break;
19318 case E_V32QImode:
19319 if (!TARGET_SSE2)
19320 return false;
19321 if (d.testing_p && TARGET_AVX2)
19322 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19323 return true;
19324 break;
19325 case E_V8HImode:
19326 case E_V16QImode:
19327 if (!TARGET_SSE2)
19328 return false;
19329 /* Fall through. */
19330 case E_V4SImode:
19331 case E_V4SFmode:
19332 if (!TARGET_SSE)
19333 return false;
19334 /* All implementable with a single vpperm insn. */
19335 if (d.testing_p && TARGET_XOP)
19336 return true;
19337 /* All implementable with 2 pshufb + 1 ior. */
19338 if (d.testing_p && TARGET_SSSE3)
19339 return true;
19340 break;
240198fe 19341 case E_V2SFmode:
19342 case E_V2SImode:
19343 case E_V4HImode:
19344 if (!TARGET_MMX_WITH_SSE)
19345 return false;
19346 break;
19347 case E_V2DImode:
19348 case E_V2DFmode:
19349 if (!TARGET_SSE)
19350 return false;
19351 /* All implementable with shufpd or unpck[lh]pd. */
19352 if (d.testing_p)
19353 return true;
19354 break;
19355 default:
19356 return false;
19357 }
19358
19359 for (i = which = 0; i < nelt; ++i)
19360 {
19361 unsigned char e = sel[i];
19362 gcc_assert (e < 2 * nelt);
19363 d.perm[i] = e;
19364 perm[i] = e;
19365 which |= (e < nelt ? 1 : 2);
19366 }
19367
19368 if (d.testing_p)
19369 {
19370 /* For all elements from the second vector, fold the elements to the first. */
19371 if (which == 2)
19372 for (i = 0; i < nelt; ++i)
19373 d.perm[i] -= nelt;
19374
19375 /* Check whether the mask can be applied to the vector type. */
19376 d.one_operand_p = (which != 3);
19377
19378 /* Implementable with shufps or pshufd. */
9b8579a6 19379 if (d.one_operand_p
240198fe 19380 && (d.vmode == V4SFmode || d.vmode == V2SFmode
9b8579a6 19381 || d.vmode == V4SImode || d.vmode == V2SImode))
19382 return true;
19383
19384 /* Otherwise we have to go through the motions and see if we can
19385 figure out how to generate the requested permutation. */
19386 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19387 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19388 if (!d.one_operand_p)
19389 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19390
19391 start_sequence ();
19392 bool ret = ix86_expand_vec_perm_const_1 (&d);
19393 end_sequence ();
19394
19395 return ret;
19396 }
19397
19398 two_args = canonicalize_perm (&d);
19399
19400 if (ix86_expand_vec_perm_const_1 (&d))
19401 return true;
19402
19403 /* If the selector says both arguments are needed, but the operands are the
19404 same, the above tried to expand with one_operand_p set and a flattened
19405 selector. If that didn't work, retry without one_operand_p; we succeeded
19406 with that during testing. */
19407 if (two_args && d.one_operand_p)
19408 {
19409 d.one_operand_p = false;
19410 memcpy (d.perm, perm, sizeof (perm));
19411 return ix86_expand_vec_perm_const_1 (&d);
19412 }
19413
19414 return false;
19415}
19416
19417void
19418ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19419{
19420 struct expand_vec_perm_d d;
19421 unsigned i, nelt;
19422
19423 d.target = targ;
19424 d.op0 = op0;
19425 d.op1 = op1;
19426 d.vmode = GET_MODE (targ);
19427 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19428 d.one_operand_p = false;
19429 d.testing_p = false;
19430
19431 for (i = 0; i < nelt; ++i)
19432 d.perm[i] = i * 2 + odd;
19433
19434 /* We'll either be able to implement the permutation directly... */
19435 if (expand_vec_perm_1 (&d))
19436 return;
19437
19438 /* ... or we use the special-case patterns. */
19439 expand_vec_perm_even_odd_1 (&d, odd);
19440}
19441
19442static void
19443ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19444{
19445 struct expand_vec_perm_d d;
19446 unsigned i, nelt, base;
19447 bool ok;
19448
19449 d.target = targ;
19450 d.op0 = op0;
19451 d.op1 = op1;
19452 d.vmode = GET_MODE (targ);
19453 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19454 d.one_operand_p = false;
19455 d.testing_p = false;
19456
19457 base = high_p ? nelt / 2 : 0;
19458 for (i = 0; i < nelt / 2; ++i)
19459 {
19460 d.perm[i * 2] = i + base;
19461 d.perm[i * 2 + 1] = i + base + nelt;
19462 }
19463
19464 /* Note that for AVX this isn't one instruction. */
19465 ok = ix86_expand_vec_perm_const_1 (&d);
19466 gcc_assert (ok);
19467}
19468
19469
19470/* Expand a vector operation CODE for a V*QImode in terms of the
19471 same operation on V*HImode. */
19472
19473void
19474ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19475{
19476 machine_mode qimode = GET_MODE (dest);
19477 machine_mode himode;
19478 rtx (*gen_il) (rtx, rtx, rtx);
19479 rtx (*gen_ih) (rtx, rtx, rtx);
19480 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19481 struct expand_vec_perm_d d;
19482 bool ok, full_interleave;
19483 bool uns_p = false;
19484 int i;
19485
19486 switch (qimode)
19487 {
19488 case E_V16QImode:
19489 himode = V8HImode;
19490 gen_il = gen_vec_interleave_lowv16qi;
19491 gen_ih = gen_vec_interleave_highv16qi;
19492 break;
19493 case E_V32QImode:
19494 himode = V16HImode;
19495 gen_il = gen_avx2_interleave_lowv32qi;
19496 gen_ih = gen_avx2_interleave_highv32qi;
19497 break;
19498 case E_V64QImode:
19499 himode = V32HImode;
19500 gen_il = gen_avx512bw_interleave_lowv64qi;
19501 gen_ih = gen_avx512bw_interleave_highv64qi;
19502 break;
19503 default:
19504 gcc_unreachable ();
19505 }
19506
19507 op2_l = op2_h = op2;
19508 switch (code)
19509 {
19510 case MULT:
19511 /* Unpack data such that we've got a source byte in each low byte of
19512 each word. We don't care what goes into the high byte of each word.
19513 Rather than trying to get zero in there, the most convenient approach
19514 is to let it be a copy of the low byte. */
19515 op2_l = gen_reg_rtx (qimode);
19516 op2_h = gen_reg_rtx (qimode);
19517 emit_insn (gen_il (op2_l, op2, op2));
19518 emit_insn (gen_ih (op2_h, op2, op2));
19519
19520 op1_l = gen_reg_rtx (qimode);
19521 op1_h = gen_reg_rtx (qimode);
19522 emit_insn (gen_il (op1_l, op1, op1));
19523 emit_insn (gen_ih (op1_h, op1, op1));
19524 full_interleave = qimode == V16QImode;
19525 break;
19526
19527 case ASHIFT:
19528 case LSHIFTRT:
19529 uns_p = true;
19530 /* FALLTHRU */
19531 case ASHIFTRT:
19532 op1_l = gen_reg_rtx (himode);
19533 op1_h = gen_reg_rtx (himode);
19534 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19535 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19536 full_interleave = true;
19537 break;
19538 default:
19539 gcc_unreachable ();
19540 }
19541
19542 /* Perform the operation. */
19543 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19544 1, OPTAB_DIRECT);
19545 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19546 1, OPTAB_DIRECT);
19547 gcc_assert (res_l && res_h);
19548
19549 /* Merge the data back into the right place. */
19550 d.target = dest;
19551 d.op0 = gen_lowpart (qimode, res_l);
19552 d.op1 = gen_lowpart (qimode, res_h);
19553 d.vmode = qimode;
19554 d.nelt = GET_MODE_NUNITS (qimode);
19555 d.one_operand_p = false;
19556 d.testing_p = false;
19557
19558 if (full_interleave)
19559 {
19560 /* For SSE2, we used a full interleave, so the desired
19561 results are in the even elements. */
19562 for (i = 0; i < d.nelt; ++i)
19563 d.perm[i] = i * 2;
19564 }
19565 else
19566 {
19567 /* For AVX, the interleave used above was not cross-lane. So the
19568 extraction is of the even elements, but with the second and third
19569 quarters swapped. Happily, that is even one insn shorter than a plain
19570 even extraction. For AVX512BW we have 4 lanes. We extract evens from
19571 within a lane, always first from the first and then from the second
19572 source operand; the index bits above the low 4 bits remain the same.
19573 Thus, for d.nelt == 32 we want the permutation
19574 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19575 and for d.nelt == 64 we want the permutation
19576 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19577 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
19578 for (i = 0; i < d.nelt; ++i)
19579 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
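/* E.g. for d.nelt == 32, i == 9 gives (18 & 14) + 32 + 0 == 34 and
   i == 20 gives (40 & 14) + 0 + 16 == 24, matching the sequence quoted
   in the comment above.  */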
19580 }
19581
19582 ok = ix86_expand_vec_perm_const_1 (&d);
19583 gcc_assert (ok);
19584
19585 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19586 gen_rtx_fmt_ee (code, qimode, op1, op2));
19587}
19588
19589/* Helper function of ix86_expand_mul_widen_evenodd. Return true
19590 if op is CONST_VECTOR with all odd elements equal to their
19591 preceding element. */
19592
19593static bool
19594const_vector_equal_evenodd_p (rtx op)
19595{
19596 machine_mode mode = GET_MODE (op);
19597 int i, nunits = GET_MODE_NUNITS (mode);
19598 if (GET_CODE (op) != CONST_VECTOR
19599 || nunits != CONST_VECTOR_NUNITS (op))
19600 return false;
19601 for (i = 0; i < nunits; i += 2)
19602 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19603 return false;
19604 return true;
19605}
19606
19607void
19608ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19609 bool uns_p, bool odd_p)
19610{
19611 machine_mode mode = GET_MODE (op1);
19612 machine_mode wmode = GET_MODE (dest);
19613 rtx x;
19614 rtx orig_op1 = op1, orig_op2 = op2;
19615
19616 if (!nonimmediate_operand (op1, mode))
19617 op1 = force_reg (mode, op1);
19618 if (!nonimmediate_operand (op2, mode))
19619 op2 = force_reg (mode, op2);
19620
19621 /* We only play even/odd games with vectors of SImode. */
19622 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19623
19624 /* If we're looking for the odd results, shift those members down to
19625 the even slots. For some cpus this is faster than a PSHUFD. */
19626 if (odd_p)
19627 {
19628 /* For XOP use vpmacsdqh, but only for smult, as it is only
19629 signed. */
19630 if (TARGET_XOP && mode == V4SImode && !uns_p)
19631 {
19632 x = force_reg (wmode, CONST0_RTX (wmode));
19633 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19634 return;
19635 }
19636
19637 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19638 if (!const_vector_equal_evenodd_p (orig_op1))
19639 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19640 x, NULL, 1, OPTAB_DIRECT);
19641 if (!const_vector_equal_evenodd_p (orig_op2))
19642 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19643 x, NULL, 1, OPTAB_DIRECT);
19644 op1 = gen_lowpart (mode, op1);
19645 op2 = gen_lowpart (mode, op2);
19646 }
19647
19648 if (mode == V16SImode)
19649 {
19650 if (uns_p)
19651 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19652 else
19653 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19654 }
19655 else if (mode == V8SImode)
19656 {
19657 if (uns_p)
19658 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19659 else
19660 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19661 }
19662 else if (uns_p)
19663 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19664 else if (TARGET_SSE4_1)
19665 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19666 else
19667 {
19668 rtx s1, s2, t0, t1, t2;
19669
19670 /* The easiest way to implement this without PMULDQ is to go through
19671 the motions as if we were performing a full 64-bit multiply, except
19672 that we need to do less shuffling of the elements. */
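/* The identity being used: for 32-bit signed A and B, the low 64 bits
   of A * B equal (uint64) (uint32) A * (uint32) B
   + ((SA * (uint32) B + SB * (uint32) A) << 32) modulo 2^64, where SA
   and SB are the all-ones-or-zero masks computed by the compares below.
   A rough scalar sketch in plain C of that identity, assuming the
   <stdint.h> types (the helper name is made up for illustration only):

     static uint64_t
     widen_smul_sketch (int32_t a, int32_t b)
     {
       uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;
       uint32_t sa = a < 0 ? ~0u : 0u;   // the pcmpgtd mask for a
       uint32_t sb = b < 0 ? ~0u : 0u;   // the pcmpgtd mask for b
       uint64_t hi = (uint64_t) sa * (uint32_t) b
                     + (uint64_t) sb * (uint32_t) a;
       return lo + (hi << 32);   // bit pattern of (int64_t) a * b
     }

   It works because SA == 2^32 - 1 exactly when A < 0, so (SA * B) << 32
   contributes -2^32 * B modulo 2^64, the signed high-part correction.  */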
19673
19674 /* Compute the sign-extension, aka highparts, of the two operands. */
19675 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19676 op1, pc_rtx, pc_rtx);
19677 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19678 op2, pc_rtx, pc_rtx);
19679
19680 /* Multiply LO(A) * HI(B), and vice-versa. */
19681 t1 = gen_reg_rtx (wmode);
19682 t2 = gen_reg_rtx (wmode);
19683 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19684 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19685
19686 /* Multiply LO(A) * LO(B). */
19687 t0 = gen_reg_rtx (wmode);
19688 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19689
19690 /* Combine and shift the highparts into place. */
19691 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19692 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19693 1, OPTAB_DIRECT);
19694
19695 /* Combine high and low parts. */
19696 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19697 return;
19698 }
19699 emit_insn (x);
19700}
19701
19702void
19703ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19704 bool uns_p, bool high_p)
19705{
19706 machine_mode wmode = GET_MODE (dest);
19707 machine_mode mode = GET_MODE (op1);
19708 rtx t1, t2, t3, t4, mask;
19709
19710 switch (mode)
19711 {
19712 case E_V4SImode:
19713 t1 = gen_reg_rtx (mode);
19714 t2 = gen_reg_rtx (mode);
19715 if (TARGET_XOP && !uns_p)
19716 {
19717 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19718 shuffle the elements once so that all elements are in the right
19719 place for immediate use: { A C B D }. */
19720 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19721 const1_rtx, GEN_INT (3)));
19722 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19723 const1_rtx, GEN_INT (3)));
19724 }
19725 else
19726 {
19727 /* Put the elements into place for the multiply. */
19728 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19729 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19730 high_p = false;
19731 }
19732 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19733 break;
19734
19735 case E_V8SImode:
19736 /* Shuffle the elements between the lanes. After this we
19737 have { A B E F | C D G H } for each operand. */
19738 t1 = gen_reg_rtx (V4DImode);
19739 t2 = gen_reg_rtx (V4DImode);
19740 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19741 const0_rtx, const2_rtx,
19742 const1_rtx, GEN_INT (3)));
19743 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19744 const0_rtx, const2_rtx,
19745 const1_rtx, GEN_INT (3)));
19746
19747 /* Shuffle the elements within the lanes. After this we
19748 have { A A B B | C C D D } or { E E F F | G G H H }. */
19749 t3 = gen_reg_rtx (V8SImode);
19750 t4 = gen_reg_rtx (V8SImode);
19751 mask = GEN_INT (high_p
19752 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19753 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19754 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19755 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19756
19757 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19758 break;
19759
19760 case E_V8HImode:
19761 case E_V16HImode:
19762 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19763 uns_p, OPTAB_DIRECT);
19764 t2 = expand_binop (mode,
19765 uns_p ? umul_highpart_optab : smul_highpart_optab,
19766 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19767 gcc_assert (t1 && t2);
19768
19769 t3 = gen_reg_rtx (mode);
19770 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19771 emit_move_insn (dest, gen_lowpart (wmode, t3));
19772 break;
19773
19774 case E_V16QImode:
19775 case E_V32QImode:
19776 case E_V32HImode:
19777 case E_V16SImode:
19778 case E_V64QImode:
19779 t1 = gen_reg_rtx (wmode);
19780 t2 = gen_reg_rtx (wmode);
19781 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19782 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19783
19784 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19785 break;
19786
19787 default:
19788 gcc_unreachable ();
19789 }
19790}
19791
19792void
19793ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19794{
19795 rtx res_1, res_2, res_3, res_4;
19796
19797 res_1 = gen_reg_rtx (V4SImode);
19798 res_2 = gen_reg_rtx (V4SImode);
19799 res_3 = gen_reg_rtx (V2DImode);
19800 res_4 = gen_reg_rtx (V2DImode);
19801 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19802 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19803
19804 /* Move the results in element 2 down to element 1; we don't care
19805 what goes in elements 2 and 3. Then we can merge the parts
19806 back together with an interleave.
19807
19808 Note that two other sequences were tried:
19809 (1) Use interleaves at the start instead of psrldq, which allows
19810 us to use a single shufps to merge things back at the end.
19811 (2) Use shufps here to combine the two vectors, then pshufd to
19812 put the elements in the correct order.
19813 In both cases the cost of the reformatting stall was too high
19814 and the overall sequence slower. */
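/* Concretely, for op1 = { a0 a1 a2 a3 } and op2 = { b0 b1 b2 b3 } the
   widening multiplies give res_3 = { a0*b0, a2*b2 } and
   res_4 = { a1*b1, a3*b3 } as V2DImode values; the two pshufd insns
   below pick out the low halves { lo(a0*b0), lo(a2*b2), x, x } and
   { lo(a1*b1), lo(a3*b3), x, x } (x meaning don't-care), and the final
   interleave produces { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }.  */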
19815
19816 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19817 const0_rtx, const2_rtx,
19818 const0_rtx, const0_rtx));
19819 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19820 const0_rtx, const2_rtx,
19821 const0_rtx, const0_rtx));
19822 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19823
19824 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19825}
19826
19827void
19828ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19829{
19830 machine_mode mode = GET_MODE (op0);
19831 rtx t1, t2, t3, t4, t5, t6;
19832
19833 if (TARGET_AVX512DQ && mode == V8DImode)
19834 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19835 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19836 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19837 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19838 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19839 else if (TARGET_XOP && mode == V2DImode)
19840 {
19841 /* op1: A,B,C,D, op2: E,F,G,H */
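/* Viewing each 64-bit element of op1 as A + 2^32*B (A the low half) and
   of op2 as E + 2^32*F, the sequence below computes
   A*E + ((A*F + B*E) << 32), which is the low 64 bits of the product;
   the same holds for C,D and G,H in the upper element.  */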
19842 op1 = gen_lowpart (V4SImode, op1);
19843 op2 = gen_lowpart (V4SImode, op2);
19844
19845 t1 = gen_reg_rtx (V4SImode);
19846 t2 = gen_reg_rtx (V4SImode);
19847 t3 = gen_reg_rtx (V2DImode);
19848 t4 = gen_reg_rtx (V2DImode);
19849
19850 /* t1: B,A,D,C */
19851 emit_insn (gen_sse2_pshufd_1 (t1, op1,
19852 GEN_INT (1),
19853 GEN_INT (0),
19854 GEN_INT (3),
19855 GEN_INT (2)));
19856
19857 /* t2: (B*E),(A*F),(D*G),(C*H) */
19858 emit_insn (gen_mulv4si3 (t2, t1, op2));
19859
19860 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19861 emit_insn (gen_xop_phadddq (t3, t2));
19862
19863 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19864 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19865
19866 /* Multiply lower parts and add all */
19867 t5 = gen_reg_rtx (V2DImode);
19868 emit_insn (gen_vec_widen_umult_even_v4si (t5,
19869 gen_lowpart (V4SImode, op1),
19870 gen_lowpart (V4SImode, op2)));
8ba6ea87 19871 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19872 }
19873 else
19874 {
19875 machine_mode nmode;
19876 rtx (*umul) (rtx, rtx, rtx);
19877
19878 if (mode == V2DImode)
19879 {
19880 umul = gen_vec_widen_umult_even_v4si;
19881 nmode = V4SImode;
19882 }
19883 else if (mode == V4DImode)
19884 {
19885 umul = gen_vec_widen_umult_even_v8si;
19886 nmode = V8SImode;
19887 }
19888 else if (mode == V8DImode)
19889 {
19890 umul = gen_vec_widen_umult_even_v16si;
19891 nmode = V16SImode;
19892 }
19893 else
19894 gcc_unreachable ();
19895
19896
19897 /* Multiply low parts. */
19898 t1 = gen_reg_rtx (mode);
19899 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19900
19901 /* Shift input vectors right 32 bits so we can multiply high parts. */
19902 t6 = GEN_INT (32);
19903 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19904 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19905
19906 /* Multiply high parts by low parts. */
19907 t4 = gen_reg_rtx (mode);
19908 t5 = gen_reg_rtx (mode);
19909 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19910 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19911
19912 /* Combine and shift the highparts back. */
19913 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19914 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19915
19916 /* Combine high and low parts. */
19917 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19918 }
19919
19920 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19921 gen_rtx_MULT (mode, op1, op2));
19922}
19923
19924/* Return true if control transfer instruction INSN
19925 should be encoded with the notrack prefix. */
19926
19927bool
e8b0314a 19928ix86_notrack_prefixed_insn_p (rtx_insn *insn)
19929{
19930 if (!insn || !((flag_cf_protection & CF_BRANCH)))
19931 return false;
19932
19933 if (CALL_P (insn))
19934 {
19935 rtx call = get_call_rtx_from (insn);
19936 gcc_assert (call != NULL_RTX);
19937 rtx addr = XEXP (call, 0);
19938
19939 /* Do not emit 'notrack' if it's not an indirect call. */
19940 if (MEM_P (addr)
19941 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19942 return false;
19943 else
19944 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19945 }
19946
19947 if (JUMP_P (insn) && !flag_cet_switch)
19948 {
19949 rtx target = JUMP_LABEL (insn);
19950 if (target == NULL_RTX || ANY_RETURN_P (target))
19951 return false;
19952
19953 /* Check whether the jump is a switch table jump. */
19954 rtx_insn *label = as_a<rtx_insn *> (target);
19955 rtx_insn *table = next_insn (label);
19956 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
19957 return false;
19958 else
19959 return true;
19960 }
19961 return false;
19962}
19963
19964/* Calculate integer abs() using only SSE2 instructions. */
19965
19966void
19967ix86_expand_sse2_abs (rtx target, rtx input)
19968{
19969 machine_mode mode = GET_MODE (target);
19970 rtx tmp0, tmp1, x;
19971
19972 switch (mode)
19973 {
19974 case E_V2DImode:
19975 case E_V4DImode:
19976 /* For 64-bit signed integer X, with SSE4.2 use
19977 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
19978 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
19979 32 and use logical instead of arithmetic right shift (which is
19980 unimplemented) and subtract. */
19981 if (TARGET_SSE4_2)
19982 {
19983 tmp0 = gen_reg_rtx (mode);
19984 tmp1 = gen_reg_rtx (mode);
19985 emit_move_insn (tmp1, CONST0_RTX (mode));
19986 if (mode == E_V2DImode)
19987 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
19988 else
19989 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
19990 }
19991 else
19992 {
19993 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
19994 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
19995 - 1), NULL, 0, OPTAB_DIRECT);
19996 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
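/* The logical shift leaves 0 or 1 in each element; negating that
   gives the 0 / all-ones mask that pcmpgtq would have produced.  */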
19997 }
19998
19999 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20000 NULL, 0, OPTAB_DIRECT);
20001 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20002 target, 0, OPTAB_DIRECT);
20003 break;
20004
20005 case E_V4SImode:
20006 /* For 32-bit signed integer X, the best way to calculate the absolute
20007 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
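/* E.g. for X == -5: X >> 31 == -1, (-1 ^ -5) == 4 and 4 - (-1) == 5,
   while for non-negative X the shift gives 0 and X is returned
   unchanged.  */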
20008 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20009 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20010 NULL, 0, OPTAB_DIRECT);
20011 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20012 NULL, 0, OPTAB_DIRECT);
20013 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20014 target, 0, OPTAB_DIRECT);
20015 break;
20016
20017 case E_V8HImode:
20018 /* For 16-bit signed integer X, the best way to calculate the absolute
20019 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20020 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20021
20022 x = expand_simple_binop (mode, SMAX, tmp0, input,
20023 target, 0, OPTAB_DIRECT);
20024 break;
20025
20026 case E_V16QImode:
20027 /* For 8-bit signed integer X, the best way to calculate the absolute
20028 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20029 as SSE2 provides the PMINUB insn. */
20030 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20031
20032 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20033 target, 0, OPTAB_DIRECT);
20034 break;
20035
20036 default:
20037 gcc_unreachable ();
20038 }
20039
20040 if (x != target)
20041 emit_move_insn (target, x);
20042}
20043
20044/* Expand an extract from a vector register through pextr insn.
20045 Return true if successful. */
20046
20047bool
20048ix86_expand_pextr (rtx *operands)
20049{
20050 rtx dst = operands[0];
20051 rtx src = operands[1];
20052
20053 unsigned int size = INTVAL (operands[2]);
20054 unsigned int pos = INTVAL (operands[3]);
20055
20056 if (SUBREG_P (dst))
20057 {
20058 /* Reject non-lowpart subregs. */
20059 if (SUBREG_BYTE (dst) > 0)
20060 return false;
20061 dst = SUBREG_REG (dst);
20062 }
20063
20064 if (SUBREG_P (src))
20065 {
20066 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20067 src = SUBREG_REG (src);
20068 }
20069
20070 switch (GET_MODE (src))
20071 {
20072 case E_V16QImode:
20073 case E_V8HImode:
20074 case E_V4SImode:
20075 case E_V2DImode:
20076 case E_V1TImode:
20077 case E_TImode:
20078 {
20079 machine_mode srcmode, dstmode;
20080 rtx d, pat;
20081
20082 if (!int_mode_for_size (size, 0).exists (&dstmode))
20083 return false;
20084
20085 switch (dstmode)
20086 {
20087 case E_QImode:
20088 if (!TARGET_SSE4_1)
20089 return false;
20090 srcmode = V16QImode;
20091 break;
20092
20093 case E_HImode:
20094 if (!TARGET_SSE2)
20095 return false;
20096 srcmode = V8HImode;
20097 break;
20098
20099 case E_SImode:
20100 if (!TARGET_SSE4_1)
20101 return false;
20102 srcmode = V4SImode;
20103 break;
20104
20105 case E_DImode:
20106 gcc_assert (TARGET_64BIT);
20107 if (!TARGET_SSE4_1)
20108 return false;
20109 srcmode = V2DImode;
20110 break;
20111
20112 default:
20113 return false;
20114 }
20115
20116 /* Reject extractions from misaligned positions. */
20117 if (pos & (size-1))
20118 return false;
20119
20120 if (GET_MODE (dst) == dstmode)
20121 d = dst;
20122 else
20123 d = gen_reg_rtx (dstmode);
20124
20125 /* Construct insn pattern. */
20126 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20127 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20128
20129 /* Let the rtl optimizers know about the zero extension performed. */
20130 if (dstmode == QImode || dstmode == HImode)
20131 {
20132 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20133 d = gen_lowpart (SImode, d);
20134 }
20135
20136 emit_insn (gen_rtx_SET (d, pat));
20137
20138 if (d != dst)
20139 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20140 return true;
20141 }
20142
20143 default:
20144 return false;
20145 }
20146}
20147
20148/* Expand an insert into a vector register through pinsr insn.
20149 Return true if successful. */
20150
20151bool
20152ix86_expand_pinsr (rtx *operands)
20153{
20154 rtx dst = operands[0];
20155 rtx src = operands[3];
20156
20157 unsigned int size = INTVAL (operands[1]);
20158 unsigned int pos = INTVAL (operands[2]);
20159
20160 if (SUBREG_P (dst))
20161 {
20162 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20163 dst = SUBREG_REG (dst);
20164 }
20165
20166 switch (GET_MODE (dst))
20167 {
20168 case E_V16QImode:
20169 case E_V8HImode:
20170 case E_V4SImode:
20171 case E_V2DImode:
20172 case E_V1TImode:
20173 case E_TImode:
20174 {
20175 machine_mode srcmode, dstmode;
20176 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20177 rtx d;
20178
20179 if (!int_mode_for_size (size, 0).exists (&srcmode))
20180 return false;
20181
20182 switch (srcmode)
20183 {
20184 case E_QImode:
20185 if (!TARGET_SSE4_1)
20186 return false;
20187 dstmode = V16QImode;
20188 pinsr = gen_sse4_1_pinsrb;
20189 break;
20190
20191 case E_HImode:
20192 if (!TARGET_SSE2)
20193 return false;
20194 dstmode = V8HImode;
20195 pinsr = gen_sse2_pinsrw;
20196 break;
20197
20198 case E_SImode:
20199 if (!TARGET_SSE4_1)
20200 return false;
20201 dstmode = V4SImode;
20202 pinsr = gen_sse4_1_pinsrd;
20203 break;
20204
20205 case E_DImode:
20206 gcc_assert (TARGET_64BIT);
20207 if (!TARGET_SSE4_1)
20208 return false;
20209 dstmode = V2DImode;
20210 pinsr = gen_sse4_1_pinsrq;
20211 break;
20212
20213 default:
20214 return false;
20215 }
20216
20217 /* Reject insertions to misaligned positions. */
20218 if (pos & (size-1))
20219 return false;
20220
20221 if (SUBREG_P (src))
20222 {
20223 unsigned int srcpos = SUBREG_BYTE (src);
20224
20225 if (srcpos > 0)
20226 {
20227 rtx extr_ops[4];
20228
20229 extr_ops[0] = gen_reg_rtx (srcmode);
20230 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20231 extr_ops[2] = GEN_INT (size);
20232 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20233
20234 if (!ix86_expand_pextr (extr_ops))
20235 return false;
20236
20237 src = extr_ops[0];
20238 }
20239 else
20240 src = gen_lowpart (srcmode, SUBREG_REG (src));
20241 }
20242
20243 if (GET_MODE (dst) == dstmode)
20244 d = dst;
20245 else
20246 d = gen_reg_rtx (dstmode);
20247
20248 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20249 gen_lowpart (srcmode, src),
20250 GEN_INT (1 << (pos / size))));
20251 if (d != dst)
20252 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20253 return true;
20254 }
20255
20256 default:
20257 return false;
20258 }
20259}
20260
20261/* All CPUs prefer to avoid cross-lane operations, so perform reductions
20262 of the upper against the lower halves until we reach SSE register width. */
20263
20264machine_mode
20265ix86_split_reduction (machine_mode mode)
20266{
20267 /* Reduce lowpart against highpart until we reach SSE reg width to
20268 avoid cross-lane operations. */
20269 switch (mode)
20270 {
20271 case E_V8DImode:
20272 case E_V4DImode:
20273 return V2DImode;
20274 case E_V16SImode:
20275 case E_V8SImode:
20276 return V4SImode;
20277 case E_V32HImode:
20278 case E_V16HImode:
20279 return V8HImode;
20280 case E_V64QImode:
20281 case E_V32QImode:
20282 return V16QImode;
20283 case E_V16SFmode:
20284 case E_V8SFmode:
20285 return V4SFmode;
20286 case E_V8DFmode:
20287 case E_V4DFmode:
20288 return V2DFmode;
20289 default:
20290 return mode;
20291 }
20292}
20293
20294/* Generate call to __divmoddi4. */
20295
20296void
20297ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20298 rtx op0, rtx op1,
20299 rtx *quot_p, rtx *rem_p)
20300{
20301 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20302
20303 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20304 mode, op0, mode, op1, mode,
20305 XEXP (rem, 0), Pmode);
20306 *quot_p = quot;
20307 *rem_p = rem;
20308}
20309
20310#include "gt-i386-expand.h"