/* Copyright (C) 1988-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"

/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
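
/* For illustration only (not part of the original sources): with
   MODE == TImode and a REG operand, the two halves are the DImode
   subregs at byte offsets 0 and GET_MODE_SIZE (DImode) == 8.  */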

void
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
         but we still have to handle them.  */
      if (MEM_P (op))
        {
          if (mem_op && rtx_equal_p (op, mem_op))
            {
              lo_half[num] = lo_half[mem_num];
              hi_half[num] = hi_half[mem_num];
            }
          else
            {
              mem_op = op;
              mem_num = num;
              lo_half[num] = adjust_address (op, half_mode, 0);
              hi_half[num] = adjust_address (op, half_mode, byte);
            }
        }
      else
        {
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);

          rtx tmp = simplify_gen_subreg (half_mode, op,
                                         GET_MODE (op) == VOIDmode
                                         ? mode : GET_MODE (op), byte);
          /* simplify_gen_subreg will return NULL RTX for the
             high half of the paradoxical subreg.  */
          hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
        }
    }
}

/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}

/* Return true if V can be broadcast from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */
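
/* Illustrative examples: V == 0x1212121212121212 succeeds for WIDTH == 8
   with VAL_BROADCAST == 0x12, while V == 0x1234123412341234 fails for
   WIDTH == 8 but succeeds for WIDTH == 16.  */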

static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
                HOST_WIDE_INT &val_broadcast)
{
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    {
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
        return false;
    }
  val_broadcast = sext_hwi (val_broadcast, width);
  return true;
}

/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */
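
/* Illustrative example (not from the original sources): a 128-bit
   CONST_WIDE_INT whose bytes are all 0x01 can be emitted as a QImode
   broadcast of 0x01 into V16QImode when AVX2 is available, instead of
   being forced to the constant pool.  */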

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
                         val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
           && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
                              val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
                           val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
           && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
                              val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcast from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
                         / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
                                               target,
                                               GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}

void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
        break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
        {
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                (TARGET_64BIT
                                 ? UNSPEC_GOTPCREL
                                 : UNSPEC_GOT));
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
        }
      else
        {
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          if (tmp)
            {
              op1 = tmp;
              if (!addend)
                break;
            }
          else
            {
              op1 = operands[1];
              break;
            }
        }

      if (addend)
        {
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
        }
      else
        op1 = force_operand (op1, op0);

      if (op1 == op0)
        return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
        {
#if TARGET_MACHO
          /* dynamic-no-pic */
          if (MACHOPIC_INDIRECT)
            {
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              if (MACHOPIC_PURE)
                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);
            }
          if (op0 != op1 && GET_CODE (op0) != MEM)
            {
              rtx insn = gen_rtx_SET (op0, op1);
              emit_insn (insn);
              return;
            }
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else
            {
              rtx temp = op0;
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
              if (temp == op0)
                return;
              op1 = temp;
            }
          /* dynamic-no-pic */
#endif
        }
      else
        {
          if (MEM_P (op0))
            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
            {
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
              if (op0 == op1)
                return;
              op1 = convert_to_mode (mode, op1, 1);
            }
        }
    }
  else
    {
      if (MEM_P (op0)
          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
          && MEM_P (op1))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into a register
         to get them CSEd.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
        {
          if (CONST_DOUBLE_P (op1))
            {
              /* If we are loading a floating point constant to a
                 register, force the value to memory now, since we'll
                 get better code out of the back end.  */

              op1 = validize_mem (force_const_mem (mode, op1));
              if (!register_operand (op0, mode))
                {
                  rtx temp = gen_reg_rtx (mode);
                  emit_insn (gen_rtx_SET (temp, op1));
                  emit_move_insn (op0, temp);
                  return;
                }
            }
          else if (GET_MODE_SIZE (mode) >= 16)
            {
              rtx tmp = ix86_convert_const_wide_int_to_broadcast
                (GET_MODE (op0), op1);
              if (tmp != nullptr)
                op1 = tmp;
            }
        }
    }

  emit_insn (gen_rtx_SET (op0, op1));
}

/* OP is a memref of a CONST_VECTOR.  Return the scalar constant if the
   CONST_VECTOR is a vec_duplicate, else return NULL.  */
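
/* Illustrative example: for a V4SImode load from a constant-pool entry
   holding { 5, 5, 5, 5 }, the value returned is the SImode constant 5,
   which the caller can then broadcast.  */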
static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))
    return nullptr;

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!(TARGET_AVX2
        || (TARGET_AVX
            && (GET_MODE_INNER (mode) == SImode
                || GET_MODE_INNER (mode) == DImode))
        || FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))
    return nullptr;

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put a 64-bit integer constant in memory when
     AVX512 embedded broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
      && (!TARGET_AVX512F
          || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
    return nullptr;

  if (GET_MODE_INNER (mode) == TImode)
    return nullptr;

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
    return nullptr;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant),
                                  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
        return nullptr;
    }

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)
    {
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
        return nullptr;
    }

  return first;
}

void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 bytes for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register; once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
          || (SUBREG_P (op1)
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_P (op0)
              && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      else
        {
          machine_mode mode = GET_MODE (op0);
          rtx tmp = ix86_convert_const_wide_int_to_broadcast
            (mode, op1);
          if (tmp == nullptr)
            op1 = validize_mem (force_const_mem (mode, op1));
          else
            op1 = tmp;
        }
    }

  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
          && SYMBOL_REF_P (XEXP (op1, 0))
          && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
        {
          /* Broadcast to XMM/YMM/ZMM register from an integer
             constant or scalar mem.  */
          op1 = gen_reg_rtx (mode);
          if (FLOAT_MODE_P (mode)
              || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
            first = force_const_mem (GET_MODE_INNER (mode), first);
          bool ok = ix86_expand_vector_init_duplicate (false, mode,
                                                       op1, first);
          gcc_assert (ok);
          emit_move_insn (op0, op1);
          return;
        }
    }

  /* We need to check memory alignment for SSE mode since an attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        {
          rtx scratch = ix86_gen_scratch_sse_rtx (mode);
          emit_move_insn (scratch, op1);
          op1 = scratch;
        }

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}

/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
        {
          if (!MEM_P (op0))
            {
              orig_op0 = op0;
              op0 = gen_reg_rtx (V32QImode);
            }
          else
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
          mode = V32QImode;
        }
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}

/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         movhpd mem+8, reg
       }
 */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
      else
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
            {
              emit_clobber (op0);
              zero = op0;
            }
          else
            {
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);
            }

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));
        }
      else
        {
          rtx t;

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);
          else
            t = op0;

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));
          else
            emit_clobber (t);

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
        }
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
        }
    }
  else
    gcc_unreachable ();
}

/* Move bits 64:95 to bits 32:63.  */
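/* Viewed as V4SImode, this emits a VEC_SELECT with selector (0 2 0 0),
   so element 2 of OP (bits 64:95) lands in element 1 (bits 32:63);
   typically only the low 64 bits of the result matter to the callers
   (an explanatory note, not from the original sources).  */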

void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}

/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
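/* Sketch of the strategy (illustrative): the MMX pack is performed as the
   corresponding 128-bit SSE pack on the operands viewed in SSE modes.
   E.g. for V4HI -> V8QI, the useful 32-bit results end up in bits 0:31
   and 64:95 of the SSE destination, and ix86_move_vector_high_sse_to_mmx
   then moves the latter down so the low 64 bits hold the MMX result.  */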

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}

/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V4QImode:
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
        {
          mask = gen_rtx_PARALLEL (VOIDmode,
                                   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                              GEN_INT (4), GEN_INT (5)));
          op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
          op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
        }
      else
        {
          int sz = GET_MODE_SIZE (mode);

          if (sz == 4)
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
                                                GEN_INT (0), GEN_INT (1)));
          else if (sz == 8)
            mask = gen_rtx_PARALLEL (VOIDmode,
                                     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                                GEN_INT (0), GEN_INT (1)));
          else
            gcc_unreachable ();

          dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
          op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
        }

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}

/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */
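
/* For instance (illustrative): for a commutative PLUS where the
   destination equals src2, swapping lets src1 match the destination;
   likewise an immediate or memory src1 is preferably moved into the
   second position.  */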

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}


/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
        {
          src2 = force_reg (mode, src2);
          src1 = src2;
        }
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
      else
        src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}

/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}

/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine than just general_operand, which would allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}

/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
            {
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}

/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
            && (mode == HImode
                || mode == SImode
                || (TARGET_64BIT && mode == DImode))
            && satisfies_constraint_L (src2));

  return true;
}

/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine than just general_operand, which would allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
        matching_memory = true;
      else
        dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}

/* Predict the just-emitted jump instruction to be taken with
   probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}

/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
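
/* Roughly, the emitted sequence is (an illustrative sketch only):

       scratch = operands[2] | operands[3];
       test scratch, $-0x100
       je qimode_label          ; both values fit in 8 bits
       <full-width signed/unsigned divmod>
       jmp end_label
     qimode_label:
       <8-bit divide via udivmodhiqi3; quotient in AL, remainder in AH>
     end_label:  */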

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
        }
      else
        gen_divmod4_1
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divmod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divmod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
        div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
        mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}

/* Emit the x86 binary operator CODE in mode MODE, where the first operand
   matches the destination.  The emitted pattern includes a clobber of
   FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}

/* Return true if the definition of regno1 is nearer to INSN than
   that of regno2.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
        {
          prev = PREV_INSN (prev);
          continue;
        }
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
        return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
        return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}

/* INSN_UID of the last insn emitted by zero store peephole2s.  */
int ix86_last_zero_store_uid;

/* Split lea instructions into a sequence of instructions
   which are executed on the ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber the flags register
   at the lea position.  */
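
/* For example (illustrative only): "lea 4(%rbx,%rcx,2), %rax" may become
   a move of %rcx into %rax, a scale by 2 (emitted as MULT so that it is
   not immediately peephole2'd back into an lea), an add of %rbx and an
   add of the displacement, all executed on ALU ports.  */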

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
        {
          /* If we have a case r1 = r1 + C * r2 then we would have
             to use multiplication, which is very expensive.  Assume
             the cost model is wrong if such a case gets here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);
        }
      else
        {
          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling, but emit it as MULT instead
             to avoid it being immediately peephole2 optimized back
             into lea.  */
          ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

          if (parts.base)
            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
        }
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert (parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
        {
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
        }
      else if (!parts.index)
        {
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
        }
      else
        {
          if (regno0 == regno1)
            tmp = parts.index;
          else if (regno0 == regno2)
            tmp = parts.base;
          else
            {
              rtx tmp1;

              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
              else
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              return;
            }

          ix86_emit_binop (PLUS, mode, target, tmp);
        }

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
1645/* Post-reload splitter for converting an SF or DFmode value in an
1646 SSE register into an unsigned SImode. */
1647
1648void
1649ix86_split_convert_uns_si_sse (rtx operands[])
1650{
1651 machine_mode vecmode;
1652 rtx value, large, zero_or_two31, input, two31, x;
1653
1654 large = operands[1];
1655 zero_or_two31 = operands[2];
1656 input = operands[3];
1657 two31 = operands[4];
1658 vecmode = GET_MODE (large);
1659 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1660
1661 /* Load up the value into the low element. We must ensure that the other
1662 elements are valid floats -- zero is the easiest such value. */
1663 if (MEM_P (input))
1664 {
1665 if (vecmode == V4SFmode)
1666 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1667 else
1668 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1669 }
1670 else
1671 {
1672 input = gen_rtx_REG (vecmode, REGNO (input));
1673 emit_move_insn (value, CONST0_RTX (vecmode));
1674 if (vecmode == V4SFmode)
1675 emit_insn (gen_sse_movss (value, value, input));
1676 else
1677 emit_insn (gen_sse2_movsd (value, value, input));
1678 }
1679
1680 emit_move_insn (large, two31);
1681 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1682
1683 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1684 emit_insn (gen_rtx_SET (large, x));
1685
1686 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1687 emit_insn (gen_rtx_SET (zero_or_two31, x));
1688
1689 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1690 emit_insn (gen_rtx_SET (value, x));
1691
1692 large = gen_rtx_REG (V4SImode, REGNO (large));
1693 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1694
1695 x = gen_rtx_REG (V4SImode, REGNO (value));
1696 if (vecmode == V4SFmode)
1697 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1698 else
1699 emit_insn (gen_sse2_cvttpd2dq (x, value));
1700 value = x;
1701
1702 emit_insn (gen_xorv4si3 (value, value, large));
1703}
1704
1705static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1706 machine_mode mode, rtx target,
1707 rtx var, int one_var);
1708
1709/* Convert an unsigned DImode value into a DFmode, using only SSE.
1710 Expects the 64-bit DImode to be supplied in a pair of integral
1711 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1712 -mfpmath=sse, !optimize_size only. */
1713
1714void
1715ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1716{
1717 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1718 rtx int_xmm, fp_xmm;
1719 rtx biases, exponents;
1720 rtx x;
1721
1722 int_xmm = gen_reg_rtx (V4SImode);
1723 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1724 emit_insn (gen_movdi_to_sse (int_xmm, input));
1725 else if (TARGET_SSE_SPLIT_REGS)
1726 {
1727 emit_clobber (int_xmm);
1728 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1729 }
1730 else
1731 {
1732 x = gen_reg_rtx (V2DImode);
1733 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1734 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1735 }
1736
1737 x = gen_rtx_CONST_VECTOR (V4SImode,
1738 gen_rtvec (4, GEN_INT (0x43300000UL),
1739 GEN_INT (0x45300000UL),
1740 const0_rtx, const0_rtx));
1741 exponents = validize_mem (force_const_mem (V4SImode, x));
1742
1743 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1744 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1745
1746 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1747 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1748 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1749 (0x1.0p84 + double(fp_value_hi_xmm)).
1750 Note these exponents differ by 32. */
1751
1752 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1753
1754 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1755 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1756 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1757 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1758 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1759 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1760 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1761 biases = validize_mem (force_const_mem (V2DFmode, biases));
1762 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1763
1764 /* Add the upper and lower DFmode values together. */
1765 if (TARGET_SSE3)
1766 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1767 else
1768 {
1769 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1770 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1771 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1772 }
1773
1774 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1775}
1776
1777/* Not used, but eases macroization of patterns. */
1778void
1779ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1780{
1781 gcc_unreachable ();
1782}
1783
0cda606d
UB
1784static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1785
2bf6d935
ML
1786/* Convert an unsigned SImode value into a DFmode. Only currently used
1787 for SSE, but applicable anywhere. */
1788
1789void
1790ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1791{
1792 REAL_VALUE_TYPE TWO31r;
1793 rtx x, fp;
1794
1795 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1796 NULL, 1, OPTAB_DIRECT);
1797
1798 fp = gen_reg_rtx (DFmode);
1799 emit_insn (gen_floatsidf2 (fp, x));
1800
1801 real_ldexp (&TWO31r, &dconst1, 31);
1802 x = const_double_from_real_value (TWO31r, DFmode);
1803
1804 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
0cda606d
UB
1805
1806 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1807 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1808 x = ix86_expand_sse_fabs (x, NULL);
1809
2bf6d935
ML
1810 if (x != target)
1811 emit_move_insn (target, x);
1812}
1813
1814/* Convert a signed DImode value into a DFmode. Only used for SSE in
1815 32-bit mode; otherwise we have a direct convert instruction. */
1816
1817void
1818ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1819{
1820 REAL_VALUE_TYPE TWO32r;
1821 rtx fp_lo, fp_hi, x;
1822
1823 fp_lo = gen_reg_rtx (DFmode);
1824 fp_hi = gen_reg_rtx (DFmode);
1825
1826 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1827
1828 real_ldexp (&TWO32r, &dconst1, 32);
1829 x = const_double_from_real_value (TWO32r, DFmode);
1830 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1831
1832 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1833
1834 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1835 0, OPTAB_DIRECT);
1836 if (x != target)
1837 emit_move_insn (target, x);
1838}
1839
1840/* Convert an unsigned SImode value into a SFmode, using only SSE.
1841 For x86_32, -mfpmath=sse, !optimize_size only. */
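/* The idea, roughly (an explanatory note, not from the original sources):
   split INPUT as hi * 0x10000 + lo, convert both halves with the signed
   int-to-float instruction (each fits in 31 bits), and recombine the
   result as (float) hi * 0x1p16 + (float) lo.  */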
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                                   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                                   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
        emit_move_insn (target, fp_hi);
    }
}

/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to a vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
                                    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
                                    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
        emit_move_insn (target, tmp[7]);
    }
}

/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
1924
1925rtx
1926ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1927{
1928 REAL_VALUE_TYPE TWO31r;
1929 rtx two31r, tmp[4];
1930 machine_mode mode = GET_MODE (val);
1931 machine_mode scalarmode = GET_MODE_INNER (mode);
1932 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1933 rtx (*cmp) (rtx, rtx, rtx, rtx);
1934 int i;
1935
1936 for (i = 0; i < 3; i++)
1937 tmp[i] = gen_reg_rtx (mode);
1938 real_ldexp (&TWO31r, &dconst1, 31);
1939 two31r = const_double_from_real_value (TWO31r, scalarmode);
1940 two31r = ix86_build_const_vector (mode, 1, two31r);
1941 two31r = force_reg (mode, two31r);
1942 switch (mode)
1943 {
1944 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1945 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1946 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1947 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1948 default: gcc_unreachable ();
1949 }
1950 tmp[3] = gen_rtx_LE (mode, two31r, val);
1951 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1952 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1953 0, OPTAB_DIRECT);
1954 if (intmode == V4SImode || TARGET_AVX2)
1955 *xorp = expand_simple_binop (intmode, ASHIFT,
1956 gen_lowpart (intmode, tmp[0]),
1957 GEN_INT (31), NULL_RTX, 0,
1958 OPTAB_DIRECT);
1959 else
1960 {
6a556ba4 1961 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2bf6d935
ML
1962 two31 = ix86_build_const_vector (intmode, 1, two31);
1963 *xorp = expand_simple_binop (intmode, AND,
1964 gen_lowpart (intmode, tmp[0]),
1965 two31, NULL_RTX, 0,
1966 OPTAB_DIRECT);
1967 }
1968 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1969 0, OPTAB_DIRECT);
1970}
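
/* Editorial illustration (hypothetical helper): the scalar equivalent of the
   adjustment above, assuming the input lies in [0, 2^32) and a 32-bit
   "int"/"unsigned int".  Values below 2^31 take the plain signed conversion;
   larger values are biased down by 2^31 and the caller xors the 0x80000000
   correction from *XORP back in afterwards.  */

static unsigned int
example_ufix_trunc (double d)
{
  if (d < 0x1p31)
    return (unsigned int) (int) d;
  return ((unsigned int) (int) (d - 0x1p31)) ^ 0x80000000u;
}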
1971
1972/* Generate code for floating point ABS or NEG. */
1973
1974void
1975ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1976 rtx operands[])
1977{
f359611b 1978 rtx set, dst, src;
2bf6d935
ML
1979 bool use_sse = false;
1980 bool vector_mode = VECTOR_MODE_P (mode);
1981 machine_mode vmode = mode;
f359611b 1982 rtvec par;
2bf6d935 1983
75a97b59
L
1984 if (vector_mode || mode == TFmode || mode == HFmode)
1985 {
1986 use_sse = true;
1987 if (mode == HFmode)
1988 vmode = V8HFmode;
1989 }
2bf6d935
ML
1990 else if (TARGET_SSE_MATH)
1991 {
1992 use_sse = SSE_FLOAT_MODE_P (mode);
1993 if (mode == SFmode)
1994 vmode = V4SFmode;
1995 else if (mode == DFmode)
1996 vmode = V2DFmode;
1997 }
1998
2bf6d935
ML
1999 dst = operands[0];
2000 src = operands[1];
2001
2002 set = gen_rtx_fmt_e (code, mode, src);
2003 set = gen_rtx_SET (dst, set);
2004
f359611b 2005 if (use_sse)
2bf6d935 2006 {
f359611b 2007 rtx mask, use, clob;
2bf6d935 2008
f359611b
UB
2009 /* NEG and ABS performed with SSE use bitwise mask operations.
2010 Create the appropriate mask now. */
2011 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2bf6d935 2012 use = gen_rtx_USE (VOIDmode, mask);
94f687bd 2013 if (vector_mode || mode == TFmode)
2bf6d935
ML
2014 par = gen_rtvec (2, set, use);
2015 else
2016 {
2017 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2018 par = gen_rtvec (3, set, use, clob);
2019 }
2bf6d935
ML
2020 }
2021 else
f359611b
UB
2022 {
2023 rtx clob;
2024
2025 /* Changing the sign of FP values can also be done using the integer unit. */
2026 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2027 par = gen_rtvec (2, set, clob);
2028 }
2029
2030 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2031}
2032
2033/* Deconstruct a floating point ABS or NEG operation
2034 with integer registers into integer operations. */
2035
2036void
2037ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2038 rtx operands[])
2039{
2040 enum rtx_code absneg_op;
2041 rtx dst, set;
2042
2043 gcc_assert (operands_match_p (operands[0], operands[1]));
2044
2045 switch (mode)
2046 {
2047 case E_SFmode:
2048 dst = gen_lowpart (SImode, operands[0]);
2049
2050 if (code == ABS)
2051 {
2052 set = gen_int_mode (0x7fffffff, SImode);
2053 absneg_op = AND;
2054 }
2055 else
2056 {
2057 set = gen_int_mode (0x80000000, SImode);
2058 absneg_op = XOR;
2059 }
2060 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2061 break;
2062
2063 case E_DFmode:
2064 if (TARGET_64BIT)
2065 {
2066 dst = gen_lowpart (DImode, operands[0]);
2067 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2068
2069 if (code == ABS)
2070 set = const0_rtx;
2071 else
2072 set = gen_rtx_NOT (DImode, dst);
2073 }
2074 else
2075 {
2076 dst = gen_highpart (SImode, operands[0]);
2077
2078 if (code == ABS)
2079 {
2080 set = gen_int_mode (0x7fffffff, SImode);
2081 absneg_op = AND;
2082 }
2083 else
2084 {
2085 set = gen_int_mode (0x80000000, SImode);
2086 absneg_op = XOR;
2087 }
2088 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2089 }
2090 break;
2091
2092 case E_XFmode:
2093 dst = gen_rtx_REG (SImode,
2094 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2095 if (code == ABS)
2096 {
2097 set = GEN_INT (0x7fff);
2098 absneg_op = AND;
2099 }
2100 else
2101 {
2102 set = GEN_INT (0x8000);
2103 absneg_op = XOR;
2104 }
2105 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2106 break;
2107
2108 default:
2109 gcc_unreachable ();
2110 }
2111
2112 set = gen_rtx_SET (dst, set);
2113
2114 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2115 rtvec par = gen_rtvec (2, set, clob);
2116
2117 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2bf6d935
ML
2118}
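
/* Editorial illustration (hypothetical helper): the bit operations both
   expanders above rely on, shown for IEEE single precision via a union
   (type punning through unions is permitted by GCC).  ABS clears the sign
   bit, NEG flips it; no FP arithmetic is involved.  */

static float
example_fp_absneg (float x, int is_abs)
{
  union { float f; unsigned int i; } u = { x };
  if (is_abs)
    u.i &= 0x7fffffffu;    /* ABS: clear the sign bit */
  else
    u.i ^= 0x80000000u;    /* NEG: flip the sign bit */
  return u.f;
}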
2119
2120/* Expand a copysign operation. Special case operand 0 being a constant. */
2121
2122void
2123ix86_expand_copysign (rtx operands[])
2124{
2125 machine_mode mode, vmode;
60efb1fe 2126 rtx dest, op0, op1, mask, op2, op3;
2bf6d935 2127
60efb1fe 2128 mode = GET_MODE (operands[0]);
2bf6d935 2129
75a97b59
L
2130 if (mode == HFmode)
2131 vmode = V8HFmode;
2132 else if (mode == SFmode)
2bf6d935
ML
2133 vmode = V4SFmode;
2134 else if (mode == DFmode)
2135 vmode = V2DFmode;
987a3082 2136 else if (mode == TFmode)
2bf6d935 2137 vmode = mode;
987a3082
UB
2138 else
2139 gcc_unreachable ();
2140
60efb1fe 2141 if (rtx_equal_p (operands[1], operands[2]))
2bf6d935 2142 {
60efb1fe 2143 emit_move_insn (operands[0], operands[1]);
2bf6d935
ML
2144 return;
2145 }
2146
60efb1fe 2147 dest = lowpart_subreg (vmode, operands[0], mode);
2148 op1 = lowpart_subreg (vmode, operands[2], mode);
2149 mask = ix86_build_signbit_mask (vmode, 0, 0);
2bf6d935 2150
60efb1fe 2151 if (CONST_DOUBLE_P (operands[1]))
2bf6d935 2152 {
60efb1fe 2153 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2154 /* Optimize for 0, simplify b = copysignf (0.0f, a) to b = mask & a. */
2155 if (op0 == CONST0_RTX (mode))
2bf6d935 2156 {
60efb1fe 2157 emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
2158 return;
2bf6d935 2159 }
2bf6d935 2160
60efb1fe 2161 if (GET_MODE_SIZE (mode) < 16)
2162 op0 = ix86_build_const_vector (vmode, false, op0);
2163 op0 = force_reg (vmode, op0);
2bf6d935 2164 }
60efb1fe 2165 else
2166 op0 = lowpart_subreg (vmode, operands[1], mode);
2167
2168 op2 = gen_reg_rtx (vmode);
2169 op3 = gen_reg_rtx (vmode);
2170 emit_move_insn (op2, gen_rtx_AND (vmode,
2171 gen_rtx_NOT (vmode, mask),
2172 op0));
2173 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2174 emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
2bf6d935
ML
2175}
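
/* Editorial illustration (hypothetical helper): the mask arithmetic emitted
   above, for IEEE single precision via a union.  The result takes its
   magnitude from A and its sign from B, i.e. (a & ~mask) | (b & mask) with
   the mask covering only the sign bit.  */

static float
example_copysign (float a, float b)
{
  union { float f; unsigned int i; } ua = { a }, ub = { b };
  unsigned int mask = 0x80000000u;
  ua.i = (ua.i & ~mask) | (ub.i & mask);
  return ua.f;
}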
2176
2177/* Expand an xorsign operation. */
2178
2179void
2180ix86_expand_xorsign (rtx operands[])
2181{
2bf6d935 2182 machine_mode mode, vmode;
7485a525 2183 rtx dest, op0, op1, mask, x, temp;
2bf6d935
ML
2184
2185 dest = operands[0];
2186 op0 = operands[1];
2187 op1 = operands[2];
2188
2189 mode = GET_MODE (dest);
2190
75a97b59
L
2191 if (mode == HFmode)
2192 vmode = V8HFmode;
2193 else if (mode == SFmode)
987a3082 2194 vmode = V4SFmode;
2bf6d935 2195 else if (mode == DFmode)
987a3082 2196 vmode = V2DFmode;
2bf6d935
ML
2197 else
2198 gcc_unreachable ();
2199
7485a525 2200 temp = gen_reg_rtx (vmode);
2bf6d935
ML
2201 mask = ix86_build_signbit_mask (vmode, 0, 0);
2202
7485a525
JJ
2203 op1 = lowpart_subreg (vmode, op1, mode);
2204 x = gen_rtx_AND (vmode, op1, mask);
2205 emit_insn (gen_rtx_SET (temp, x));
2bf6d935 2206
7485a525
JJ
2207 op0 = lowpart_subreg (vmode, op0, mode);
2208 x = gen_rtx_XOR (vmode, temp, op0);
652bef70
L
2209
2210 dest = lowpart_subreg (vmode, dest, mode);
2bf6d935
ML
2211 emit_insn (gen_rtx_SET (dest, x));
2212}
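
/* Editorial illustration (hypothetical helper): the operation expanded above
   for IEEE single precision.  The sign bit of B is isolated and xored into
   A, flipping A's sign exactly when B is negative.  */

static float
example_xorsign (float a, float b)
{
  union { float f; unsigned int i; } ua = { a }, ub = { b };
  ua.i ^= ub.i & 0x80000000u;    /* flip A's sign if B is negative */
  return ua.f;
}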
2213
2214static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2215
2216void
2217ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2218{
2219 machine_mode mode = GET_MODE (op0);
2220 rtx tmp;
2221
2222 /* Handle the special case - vector comparison with boolean result, transform
2223 it using the ptest instruction. */
2224 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2225 {
2226 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2227 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2228
2229 gcc_assert (code == EQ || code == NE);
2230 /* Generate XOR since we can't check that one operand is zero vector. */
2231 tmp = gen_reg_rtx (mode);
2232 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2233 tmp = gen_lowpart (p_mode, tmp);
2234 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2235 gen_rtx_UNSPEC (CCmode,
2236 gen_rtvec (2, tmp, tmp),
2237 UNSPEC_PTEST)));
2238 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2239 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2240 gen_rtx_LABEL_REF (VOIDmode, label),
2241 pc_rtx);
2242 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2243 return;
2244 }
2245
2246 switch (mode)
2247 {
a6841211 2248 case E_HFmode:
2bf6d935
ML
2249 case E_SFmode:
2250 case E_DFmode:
2251 case E_XFmode:
2252 case E_QImode:
2253 case E_HImode:
2254 case E_SImode:
2255 simple:
2256 tmp = ix86_expand_compare (code, op0, op1);
2257 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2258 gen_rtx_LABEL_REF (VOIDmode, label),
2259 pc_rtx);
2260 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2261 return;
2262
2263 case E_DImode:
2264 if (TARGET_64BIT)
2265 goto simple;
2266 /* For a 32-bit target, DI comparisons may be performed on
2267 SSE registers. To allow this we should avoid the split
2268 to SI mode, which is achieved by doing the xor in DI mode
2269 and then comparing with zero (which is recognized by the
2270 STV pass). We don't compare using xor when optimizing
2271 for size. */
2272 if (!optimize_insn_for_size_p ()
2273 && TARGET_STV
2274 && (code == EQ || code == NE))
2275 {
2276 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2277 op1 = const0_rtx;
2278 }
2279 /* FALLTHRU */
2280 case E_TImode:
2281 /* Expand DImode branch into multiple compare+branch. */
2282 {
2283 rtx lo[2], hi[2];
2284 rtx_code_label *label2;
2285 enum rtx_code code1, code2, code3;
2286 machine_mode submode;
2287
2288 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2289 {
2290 std::swap (op0, op1);
2291 code = swap_condition (code);
2292 }
2293
2294 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2295 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2296
2297 submode = mode == DImode ? SImode : DImode;
2298
2299 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2300 avoid two branches. This costs one extra insn, so disable when
2301 optimizing for size. */
2302
2303 if ((code == EQ || code == NE)
2304 && (!optimize_insn_for_size_p ()
2305 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2306 {
2307 rtx xor0, xor1;
2308
2309 xor1 = hi[0];
2310 if (hi[1] != const0_rtx)
2311 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2312 NULL_RTX, 0, OPTAB_WIDEN);
2313
2314 xor0 = lo[0];
2315 if (lo[1] != const0_rtx)
2316 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2317 NULL_RTX, 0, OPTAB_WIDEN);
2318
2319 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2320 NULL_RTX, 0, OPTAB_WIDEN);
2321
2322 ix86_expand_branch (code, tmp, const0_rtx, label);
2323 return;
2324 }
2325
2326 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2327 op1 is a constant and the low word is zero, then we can just
2328 examine the high word. Similarly for low word -1 and
2329 less-or-equal-than or greater-than. */
2330
2331 if (CONST_INT_P (hi[1]))
2332 switch (code)
2333 {
2334 case LT: case LTU: case GE: case GEU:
2335 if (lo[1] == const0_rtx)
2336 {
2337 ix86_expand_branch (code, hi[0], hi[1], label);
2338 return;
2339 }
2340 break;
2341 case LE: case LEU: case GT: case GTU:
2342 if (lo[1] == constm1_rtx)
2343 {
2344 ix86_expand_branch (code, hi[0], hi[1], label);
2345 return;
2346 }
2347 break;
2348 default:
2349 break;
2350 }
2351
2352 /* Emulate comparisons that do not depend on Zero flag with
2353 double-word subtraction. Note that only Overflow, Sign
2354 and Carry flags are valid, so swap arguments and condition
2355 of comparisons that would otherwise test Zero flag. */
2356
2357 switch (code)
2358 {
2359 case LE: case LEU: case GT: case GTU:
2360 std::swap (lo[0], lo[1]);
2361 std::swap (hi[0], hi[1]);
2362 code = swap_condition (code);
2363 /* FALLTHRU */
2364
2365 case LT: case LTU: case GE: case GEU:
2366 {
2bf6d935 2367 bool uns = (code == LTU || code == GEU);
987a3082
UB
2368 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2369 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2bf6d935
ML
2370
2371 if (!nonimmediate_operand (lo[0], submode))
2372 lo[0] = force_reg (submode, lo[0]);
2373 if (!x86_64_general_operand (lo[1], submode))
2374 lo[1] = force_reg (submode, lo[1]);
2375
2376 if (!register_operand (hi[0], submode))
2377 hi[0] = force_reg (submode, hi[0]);
2378 if ((uns && !nonimmediate_operand (hi[1], submode))
2379 || (!uns && !x86_64_general_operand (hi[1], submode)))
2380 hi[1] = force_reg (submode, hi[1]);
2381
987a3082 2382 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2bf6d935 2383
987a3082
UB
2384 tmp = gen_rtx_SCRATCH (submode);
2385 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2bf6d935 2386
987a3082 2387 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2bf6d935
ML
2388 ix86_expand_branch (code, tmp, const0_rtx, label);
2389 return;
2390 }
2391
2392 default:
2393 break;
2394 }
2395
2396 /* Otherwise, we need two or three jumps. */
2397
2398 label2 = gen_label_rtx ();
2399
2400 code1 = code;
2401 code2 = swap_condition (code);
2402 code3 = unsigned_condition (code);
2403
2404 switch (code)
2405 {
2406 case LT: case GT: case LTU: case GTU:
2407 break;
2408
2409 case LE: code1 = LT; code2 = GT; break;
2410 case GE: code1 = GT; code2 = LT; break;
2411 case LEU: code1 = LTU; code2 = GTU; break;
2412 case GEU: code1 = GTU; code2 = LTU; break;
2413
2414 case EQ: code1 = UNKNOWN; code2 = NE; break;
2415 case NE: code2 = UNKNOWN; break;
2416
2417 default:
2418 gcc_unreachable ();
2419 }
2420
2421 /*
2422 * a < b =>
2423 * if (hi(a) < hi(b)) goto true;
2424 * if (hi(a) > hi(b)) goto false;
2425 * if (lo(a) < lo(b)) goto true;
2426 * false:
2427 */
2428
2429 if (code1 != UNKNOWN)
2430 ix86_expand_branch (code1, hi[0], hi[1], label);
2431 if (code2 != UNKNOWN)
2432 ix86_expand_branch (code2, hi[0], hi[1], label2);
2433
2434 ix86_expand_branch (code3, lo[0], lo[1], label);
2435
2436 if (code2 != UNKNOWN)
2437 emit_label (label2);
2438 return;
2439 }
2440
2441 default:
2442 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2443 goto simple;
2444 }
2445}
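
/* Editorial illustration (hypothetical helper): the equality shortcut used
   by the double-word branch expansion above, on the 32-bit halves of a
   64-bit value.  The ored xors are zero iff both halves are equal, so a
   single compare with zero suffices.  The ordered cases follow the
   "compare high word, then low word" scheme shown in the comment block
   inside the function.  */

static int
example_di_equal (unsigned int lo0, unsigned int hi0,
                  unsigned int lo1, unsigned int hi1)
{
  return ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0;
}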
2446
2447/* Figure out whether to use unordered fp comparisons. */
2448
2449static bool
2450ix86_unordered_fp_compare (enum rtx_code code)
2451{
2452 if (!TARGET_IEEE_FP)
2453 return false;
2454
2455 switch (code)
2456 {
2bf6d935
ML
2457 case LT:
2458 case LE:
d6038777
UB
2459 case GT:
2460 case GE:
2461 case LTGT:
2bf6d935
ML
2462 return false;
2463
2464 case EQ:
2465 case NE:
2466
2bf6d935
ML
2467 case UNORDERED:
2468 case ORDERED:
2469 case UNLT:
2470 case UNLE:
2471 case UNGT:
2472 case UNGE:
2473 case UNEQ:
2474 return true;
2475
2476 default:
2477 gcc_unreachable ();
2478 }
2479}
2480
2481/* Return a comparison we can do that is equivalent to
2482 swap_condition (code), apart possibly from orderedness.
2483 But never change orderedness if TARGET_IEEE_FP, returning
2484 UNKNOWN in that case if necessary. */
2485
2486static enum rtx_code
2487ix86_fp_swap_condition (enum rtx_code code)
2488{
2489 switch (code)
2490 {
2491 case GT: /* GTU - CF=0 & ZF=0 */
2492 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2493 case GE: /* GEU - CF=0 */
2494 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2495 case UNLT: /* LTU - CF=1 */
2496 return TARGET_IEEE_FP ? UNKNOWN : GT;
2497 case UNLE: /* LEU - CF=1 | ZF=1 */
2498 return TARGET_IEEE_FP ? UNKNOWN : GE;
2499 default:
2500 return swap_condition (code);
2501 }
2502}
2503
2504/* Return the cost of comparison CODE using the best strategy for performance.
2505 All of the following functions use the number of instructions as the cost metric.
2506 In the future this should be tweaked to compute bytes for optimize_size and
2507 take into account the performance of various instructions on various CPUs. */
2508
2509static int
2510ix86_fp_comparison_cost (enum rtx_code code)
2511{
2512 int arith_cost;
2513
2514 /* The cost of code using bit-twiddling on %ah. */
2515 switch (code)
2516 {
2517 case UNLE:
2518 case UNLT:
2519 case LTGT:
2520 case GT:
2521 case GE:
2522 case UNORDERED:
2523 case ORDERED:
2524 case UNEQ:
2525 arith_cost = 4;
2526 break;
2527 case LT:
2528 case NE:
2529 case EQ:
2530 case UNGE:
2531 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2532 break;
2533 case LE:
2534 case UNGT:
2535 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2536 break;
2537 default:
2538 gcc_unreachable ();
2539 }
2540
2541 switch (ix86_fp_comparison_strategy (code))
2542 {
2543 case IX86_FPCMP_COMI:
2544 return arith_cost > 4 ? 3 : 2;
2545 case IX86_FPCMP_SAHF:
2546 return arith_cost > 4 ? 4 : 3;
2547 default:
2548 return arith_cost;
2549 }
2550}
2551
2552/* Swap, force into registers, or otherwise massage the two operands
2553 to a fp comparison. The operands are updated in place; the new
2554 comparison code is returned. */
2555
2556static enum rtx_code
2557ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2558{
2559 bool unordered_compare = ix86_unordered_fp_compare (code);
2560 rtx op0 = *pop0, op1 = *pop1;
2561 machine_mode op_mode = GET_MODE (op0);
a6841211 2562 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2bf6d935
ML
2563
2564 /* All of the unordered compare instructions only work on registers.
2565 The same is true of the fcomi compare instructions. The XFmode
2566 compare instructions require registers except when comparing
2567 against zero or when converting operand 1 from fixed point to
2568 floating point. */
2569
2570 if (!is_sse
2571 && (unordered_compare
2572 || (op_mode == XFmode
2573 && ! (standard_80387_constant_p (op0) == 1
2574 || standard_80387_constant_p (op1) == 1)
2575 && GET_CODE (op1) != FLOAT)
2576 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2577 {
2578 op0 = force_reg (op_mode, op0);
2579 op1 = force_reg (op_mode, op1);
2580 }
2581 else
2582 {
2583 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2584 things around if they appear profitable, otherwise force op0
2585 into a register. */
2586
2587 if (standard_80387_constant_p (op0) == 0
2588 || (MEM_P (op0)
2589 && ! (standard_80387_constant_p (op1) == 0
2590 || MEM_P (op1))))
2591 {
2592 enum rtx_code new_code = ix86_fp_swap_condition (code);
2593 if (new_code != UNKNOWN)
2594 {
2595 std::swap (op0, op1);
2596 code = new_code;
2597 }
2598 }
2599
2600 if (!REG_P (op0))
2601 op0 = force_reg (op_mode, op0);
2602
2603 if (CONSTANT_P (op1))
2604 {
2605 int tmp = standard_80387_constant_p (op1);
2606 if (tmp == 0)
2607 op1 = validize_mem (force_const_mem (op_mode, op1));
2608 else if (tmp == 1)
2609 {
2610 if (TARGET_CMOVE)
2611 op1 = force_reg (op_mode, op1);
2612 }
2613 else
2614 op1 = force_reg (op_mode, op1);
2615 }
2616 }
2617
2618 /* Try to rearrange the comparison to make it cheaper. */
2619 if (ix86_fp_comparison_cost (code)
2620 > ix86_fp_comparison_cost (swap_condition (code))
2621 && (REG_P (op1) || can_create_pseudo_p ()))
2622 {
2623 std::swap (op0, op1);
2624 code = swap_condition (code);
2625 if (!REG_P (op0))
2626 op0 = force_reg (op_mode, op0);
2627 }
2628
2629 *pop0 = op0;
2630 *pop1 = op1;
2631 return code;
2632}
2633
2634/* Generate insn patterns to do a floating point compare of OPERANDS. */
2635
2636static rtx
2637ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2638{
2639 bool unordered_compare = ix86_unordered_fp_compare (code);
2640 machine_mode cmp_mode;
2641 rtx tmp, scratch;
2642
2643 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2644
2645 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2646 if (unordered_compare)
2647 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2648
2649 /* Do fcomi/sahf based test when profitable. */
2650 switch (ix86_fp_comparison_strategy (code))
2651 {
2652 case IX86_FPCMP_COMI:
2653 cmp_mode = CCFPmode;
2654 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2655 break;
2656
2657 case IX86_FPCMP_SAHF:
2658 cmp_mode = CCFPmode;
2659 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2660 scratch = gen_reg_rtx (HImode);
2661 emit_insn (gen_rtx_SET (scratch, tmp));
2662 emit_insn (gen_x86_sahf_1 (scratch));
2663 break;
2664
2665 case IX86_FPCMP_ARITH:
2666 cmp_mode = CCNOmode;
2667 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2668 scratch = gen_reg_rtx (HImode);
2669 emit_insn (gen_rtx_SET (scratch, tmp));
2670
2671 /* In the unordered case, we have to check C2 for NaN's, which
2672 doesn't happen to work out to anything nice combination-wise.
2673 So do some bit twiddling on the value we've got in AH to come
2674 up with an appropriate set of condition codes. */
2675
2676 switch (code)
2677 {
2678 case GT:
2679 case UNGT:
2680 if (code == GT || !TARGET_IEEE_FP)
2681 {
2682 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2683 code = EQ;
2684 }
2685 else
2686 {
2687 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2688 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2689 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2690 cmp_mode = CCmode;
2691 code = GEU;
2692 }
2693 break;
2694 case LT:
2695 case UNLT:
2696 if (code == LT && TARGET_IEEE_FP)
2697 {
2698 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2699 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2700 cmp_mode = CCmode;
2701 code = EQ;
2702 }
2703 else
2704 {
2705 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2706 code = NE;
2707 }
2708 break;
2709 case GE:
2710 case UNGE:
2711 if (code == GE || !TARGET_IEEE_FP)
2712 {
2713 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2714 code = EQ;
2715 }
2716 else
2717 {
2718 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2719 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2720 code = NE;
2721 }
2722 break;
2723 case LE:
2724 case UNLE:
2725 if (code == LE && TARGET_IEEE_FP)
2726 {
2727 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2728 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2729 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2730 cmp_mode = CCmode;
2731 code = LTU;
2732 }
2733 else
2734 {
2735 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2736 code = NE;
2737 }
2738 break;
2739 case EQ:
2740 case UNEQ:
2741 if (code == EQ && TARGET_IEEE_FP)
2742 {
2743 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2744 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2745 cmp_mode = CCmode;
2746 code = EQ;
2747 }
2748 else
2749 {
2750 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2751 code = NE;
2752 }
2753 break;
2754 case NE:
2755 case LTGT:
2756 if (code == NE && TARGET_IEEE_FP)
2757 {
2758 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2759 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2760 GEN_INT (0x40)));
2761 code = NE;
2762 }
2763 else
2764 {
2765 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2766 code = EQ;
2767 }
2768 break;
2769
2770 case UNORDERED:
2771 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2772 code = NE;
2773 break;
2774 case ORDERED:
2775 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2776 code = EQ;
2777 break;
2778
2779 default:
2780 gcc_unreachable ();
2781 }
2782 break;
2783
2784 default:
2785 gcc_unreachable();
2786 }
2787
2788 /* Return the test that should be put into the flags user, i.e.
2789 the bcc, scc, or cmov instruction. */
2790 return gen_rtx_fmt_ee (code, VOIDmode,
2791 gen_rtx_REG (cmp_mode, FLAGS_REG),
2792 const0_rtx);
2793}
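
/* Editorial note on the magic constants above: after fnstsw the x87
   condition bits land in %ah as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so
   0x45 tests C3|C2|C0, 0x05 tests C2|C0, 0x40 tests C3 alone and 0x04
   tests C2 (the "unordered" bit).  For example, an ordered "a > b" leaves
   all three bits clear, hence the test of %ah against 0x45 combined with
   the EQ condition.  */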
2794
2795/* Generate insn patterns to do an integer compare of OPERANDS. */
2796
2797static rtx
2798ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2799{
2800 machine_mode cmpmode;
2801 rtx tmp, flags;
2802
86403f4e
UB
2803 /* Swap operands to emit carry flag comparison. */
2804 if ((code == GTU || code == LEU)
2805 && nonimmediate_operand (op1, VOIDmode))
2806 {
2807 std::swap (op0, op1);
2808 code = swap_condition (code);
2809 }
2810
2bf6d935
ML
2811 cmpmode = SELECT_CC_MODE (code, op0, op1);
2812 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2813
2814 /* This is very simple, but making the interface the same as in the
2815 FP case makes the rest of the code easier. */
2816 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2817 emit_insn (gen_rtx_SET (flags, tmp));
2818
2819 /* Return the test that should be put into the flags user, i.e.
2820 the bcc, scc, or cmov instruction. */
2821 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2822}
2823
2824static rtx
2825ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2826{
2827 rtx ret;
2828
2829 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2830 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2831
2832 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2833 {
2834 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2835 ret = ix86_expand_fp_compare (code, op0, op1);
2836 }
2837 else
2838 ret = ix86_expand_int_compare (code, op0, op1);
2839
2840 return ret;
2841}
2842
2843void
2844ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2845{
2846 rtx ret;
2847
2848 gcc_assert (GET_MODE (dest) == QImode);
2849
2850 ret = ix86_expand_compare (code, op0, op1);
2851 PUT_MODE (ret, QImode);
2852 emit_insn (gen_rtx_SET (dest, ret));
2853}
2854
2855/* Expand comparison setting or clearing carry flag. Return true when
2856 successful and set pop for the operation. */
2857static bool
2858ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2859{
2860 machine_mode mode
2861 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2862
2863 /* Do not handle double-mode compares; those go through a special path. */
2864 if (mode == (TARGET_64BIT ? TImode : DImode))
2865 return false;
2866
2867 if (SCALAR_FLOAT_MODE_P (mode))
2868 {
2869 rtx compare_op;
2870 rtx_insn *compare_seq;
2871
2872 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2873
2874 /* Shortcut: following common codes never translate
2875 into carry flag compares. */
2876 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2877 || code == ORDERED || code == UNORDERED)
2878 return false;
2879
2880 /* These comparisons require zero flag; swap operands so they won't. */
2881 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2882 && !TARGET_IEEE_FP)
2883 {
2884 std::swap (op0, op1);
2885 code = swap_condition (code);
2886 }
2887
2888 /* Try to expand the comparison and verify that we end up with
2889 a carry flag based comparison. This fails to be true only when
2890 we decide to expand the comparison using arithmetic, which is not
2891 a common scenario. */
2892 start_sequence ();
2893 compare_op = ix86_expand_fp_compare (code, op0, op1);
2894 compare_seq = get_insns ();
2895 end_sequence ();
2896
2897 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2898 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2899 else
2900 code = GET_CODE (compare_op);
2901
2902 if (code != LTU && code != GEU)
2903 return false;
2904
2905 emit_insn (compare_seq);
2906 *pop = compare_op;
2907 return true;
2908 }
2909
2910 if (!INTEGRAL_MODE_P (mode))
2911 return false;
2912
2913 switch (code)
2914 {
2915 case LTU:
2916 case GEU:
2917 break;
2918
2919 /* Convert a==0 into (unsigned)a<1. */
2920 case EQ:
2921 case NE:
2922 if (op1 != const0_rtx)
2923 return false;
2924 op1 = const1_rtx;
2925 code = (code == EQ ? LTU : GEU);
2926 break;
2927
2928 /* Convert a>b into b<a or a>=b-1. */
2929 case GTU:
2930 case LEU:
2931 if (CONST_INT_P (op1))
2932 {
2933 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2934 /* Bail out on overflow. We still can swap operands but that
2935 would force loading of the constant into register. */
2936 if (op1 == const0_rtx
2937 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2938 return false;
2939 code = (code == GTU ? GEU : LTU);
2940 }
2941 else
2942 {
2943 std::swap (op0, op1);
2944 code = (code == GTU ? LTU : GEU);
2945 }
2946 break;
2947
2948 /* Convert a>=0 into (unsigned)a<0x80000000. */
2949 case LT:
2950 case GE:
2951 if (mode == DImode || op1 != const0_rtx)
2952 return false;
2953 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2954 code = (code == LT ? GEU : LTU);
2955 break;
2956 case LE:
2957 case GT:
2958 if (mode == DImode || op1 != constm1_rtx)
2959 return false;
2960 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2961 code = (code == LE ? GEU : LTU);
2962 break;
2963
2964 default:
2965 return false;
2966 }
2967 /* Swapping operands may cause constant to appear as first operand. */
2968 if (!nonimmediate_operand (op0, VOIDmode))
2969 {
2970 if (!can_create_pseudo_p ())
2971 return false;
2972 op0 = force_reg (mode, op0);
2973 }
2974 *pop = ix86_expand_compare (code, op0, op1);
2975 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2976 return true;
2977}
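
/* Editorial illustration: the integer rewrites applied above, written as C
   identities on 32-bit operands (the last one views A's bits as signed):

     a == 0          <=>  (unsigned) a < 1
     a >  b          <=>  (unsigned) a >= b + 1   (b + 1 must not wrap)
     (int) a >= 0    <=>  (unsigned) a < 0x80000000

   Each right-hand side is a plain LTU/GEU test, i.e. exactly the carry
   flag left behind by a cmp instruction.  */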
2978
2979/* Expand conditional increment or decrement using adc/sbb instructions.
2980 The default case using setcc followed by the conditional move can be
2981 done by generic code. */
2982bool
2983ix86_expand_int_addcc (rtx operands[])
2984{
2985 enum rtx_code code = GET_CODE (operands[1]);
2986 rtx flags;
987a3082 2987 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2bf6d935
ML
2988 rtx compare_op;
2989 rtx val = const0_rtx;
2990 bool fpcmp = false;
2991 machine_mode mode;
2992 rtx op0 = XEXP (operands[1], 0);
2993 rtx op1 = XEXP (operands[1], 1);
2994
2995 if (operands[3] != const1_rtx
2996 && operands[3] != constm1_rtx)
2997 return false;
2998 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2999 return false;
3000 code = GET_CODE (compare_op);
3001
3002 flags = XEXP (compare_op, 0);
3003
3004 if (GET_MODE (flags) == CCFPmode)
3005 {
3006 fpcmp = true;
3007 code = ix86_fp_compare_code_to_integer (code);
3008 }
3009
3010 if (code != LTU)
3011 {
3012 val = constm1_rtx;
3013 if (fpcmp)
3014 PUT_CODE (compare_op,
3015 reverse_condition_maybe_unordered
3016 (GET_CODE (compare_op)));
3017 else
3018 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3019 }
3020
3021 mode = GET_MODE (operands[0]);
3022
3023 /* Construct either adc or sbb insn. */
3024 if ((code == LTU) == (operands[3] == constm1_rtx))
987a3082 3025 insn = gen_sub3_carry;
2bf6d935 3026 else
987a3082
UB
3027 insn = gen_add3_carry;
3028
3029 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2bf6d935
ML
3030
3031 return true;
3032}
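
/* Editorial illustration (hypothetical helper): the branch-free shape this
   expander targets for a conditional increment, on 32-bit unsigned
   operands.  The compare leaves the carry flag set when A < B and the adc
   folds that carry into the addition, so no setcc or jump is needed; the
   decrement case uses sbb the same way.  */

static unsigned int
example_int_addcc (unsigned int dest, unsigned int a, unsigned int b)
{
  return dest + (a < b);    /* dest + (a < b ? 1 : 0) */
}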
3033
3034bool
3035ix86_expand_int_movcc (rtx operands[])
3036{
3037 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3038 rtx_insn *compare_seq;
3039 rtx compare_op;
3040 machine_mode mode = GET_MODE (operands[0]);
3041 bool sign_bit_compare_p = false;
3042 rtx op0 = XEXP (operands[1], 0);
3043 rtx op1 = XEXP (operands[1], 1);
3044
3045 if (GET_MODE (op0) == TImode
3046 || (GET_MODE (op0) == DImode
3047 && !TARGET_64BIT))
3048 return false;
3049
3050 start_sequence ();
3051 compare_op = ix86_expand_compare (code, op0, op1);
3052 compare_seq = get_insns ();
3053 end_sequence ();
3054
3055 compare_code = GET_CODE (compare_op);
3056
3057 if ((op1 == const0_rtx && (code == GE || code == LT))
3058 || (op1 == constm1_rtx && (code == GT || code == LE)))
3059 sign_bit_compare_p = true;
3060
3061 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3062 HImode insns, we'd be swallowed in word prefix ops. */
3063
3064 if ((mode != HImode || TARGET_FAST_PREFIX)
3065 && (mode != (TARGET_64BIT ? TImode : DImode))
3066 && CONST_INT_P (operands[2])
3067 && CONST_INT_P (operands[3]))
3068 {
3069 rtx out = operands[0];
3070 HOST_WIDE_INT ct = INTVAL (operands[2]);
3071 HOST_WIDE_INT cf = INTVAL (operands[3]);
3072 HOST_WIDE_INT diff;
3073
3074 diff = ct - cf;
3075 /* Sign bit compares are better done using shifts than by using
3076 sbb. */
3077 if (sign_bit_compare_p
3078 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3079 {
3080 /* Detect overlap between destination and compare sources. */
3081 rtx tmp = out;
3082
3083 if (!sign_bit_compare_p)
3084 {
3085 rtx flags;
3086 bool fpcmp = false;
3087
3088 compare_code = GET_CODE (compare_op);
3089
3090 flags = XEXP (compare_op, 0);
3091
3092 if (GET_MODE (flags) == CCFPmode)
3093 {
3094 fpcmp = true;
3095 compare_code
3096 = ix86_fp_compare_code_to_integer (compare_code);
3097 }
3098
3099 /* To simplify rest of code, restrict to the GEU case. */
3100 if (compare_code == LTU)
3101 {
3102 std::swap (ct, cf);
3103 compare_code = reverse_condition (compare_code);
3104 code = reverse_condition (code);
3105 }
3106 else
3107 {
3108 if (fpcmp)
3109 PUT_CODE (compare_op,
3110 reverse_condition_maybe_unordered
3111 (GET_CODE (compare_op)));
3112 else
3113 PUT_CODE (compare_op,
3114 reverse_condition (GET_CODE (compare_op)));
3115 }
3116 diff = ct - cf;
3117
3118 if (reg_overlap_mentioned_p (out, op0)
3119 || reg_overlap_mentioned_p (out, op1))
3120 tmp = gen_reg_rtx (mode);
3121
3122 if (mode == DImode)
3123 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3124 else
3125 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3126 flags, compare_op));
3127 }
3128 else
3129 {
3130 if (code == GT || code == GE)
3131 code = reverse_condition (code);
3132 else
3133 {
3134 std::swap (ct, cf);
3135 diff = ct - cf;
3136 }
3137 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3138 }
3139
3140 if (diff == 1)
3141 {
3142 /*
3143 * cmpl op0,op1
3144 * sbbl dest,dest
3145 * [addl dest, ct]
3146 *
3147 * Size 5 - 8.
3148 */
3149 if (ct)
3150 tmp = expand_simple_binop (mode, PLUS,
3151 tmp, GEN_INT (ct),
3152 copy_rtx (tmp), 1, OPTAB_DIRECT);
3153 }
3154 else if (cf == -1)
3155 {
3156 /*
3157 * cmpl op0,op1
3158 * sbbl dest,dest
3159 * orl $ct, dest
3160 *
3161 * Size 8.
3162 */
3163 tmp = expand_simple_binop (mode, IOR,
3164 tmp, GEN_INT (ct),
3165 copy_rtx (tmp), 1, OPTAB_DIRECT);
3166 }
3167 else if (diff == -1 && ct)
3168 {
3169 /*
3170 * cmpl op0,op1
3171 * sbbl dest,dest
3172 * notl dest
3173 * [addl dest, cf]
3174 *
3175 * Size 8 - 11.
3176 */
3177 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3178 if (cf)
3179 tmp = expand_simple_binop (mode, PLUS,
3180 copy_rtx (tmp), GEN_INT (cf),
3181 copy_rtx (tmp), 1, OPTAB_DIRECT);
3182 }
3183 else
3184 {
3185 /*
3186 * cmpl op0,op1
3187 * sbbl dest,dest
3188 * [notl dest]
3189 * andl cf - ct, dest
3190 * [addl dest, ct]
3191 *
3192 * Size 8 - 11.
3193 */
3194
3195 if (cf == 0)
3196 {
3197 cf = ct;
3198 ct = 0;
3199 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3200 }
3201
3202 tmp = expand_simple_binop (mode, AND,
3203 copy_rtx (tmp),
3204 gen_int_mode (cf - ct, mode),
3205 copy_rtx (tmp), 1, OPTAB_DIRECT);
3206 if (ct)
3207 tmp = expand_simple_binop (mode, PLUS,
3208 copy_rtx (tmp), GEN_INT (ct),
3209 copy_rtx (tmp), 1, OPTAB_DIRECT);
3210 }
3211
3212 if (!rtx_equal_p (tmp, out))
3213 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3214
3215 return true;
3216 }
3217
3218 if (diff < 0)
3219 {
3220 machine_mode cmp_mode = GET_MODE (op0);
3221 enum rtx_code new_code;
3222
3223 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3224 {
3225 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3226
8f17461b
UB
3227 /* We may be reversing a non-trapping
3228 comparison to a trapping comparison. */
3229 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3230 && code != EQ && code != NE
3231 && code != ORDERED && code != UNORDERED)
3232 new_code = UNKNOWN;
3233 else
3234 new_code = reverse_condition_maybe_unordered (code);
2bf6d935
ML
3235 }
3236 else
3237 new_code = ix86_reverse_condition (code, cmp_mode);
3238 if (new_code != UNKNOWN)
3239 {
3240 std::swap (ct, cf);
3241 diff = -diff;
3242 code = new_code;
3243 }
3244 }
3245
3246 compare_code = UNKNOWN;
3247 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3248 && CONST_INT_P (op1))
3249 {
3250 if (op1 == const0_rtx
3251 && (code == LT || code == GE))
3252 compare_code = code;
3253 else if (op1 == constm1_rtx)
3254 {
3255 if (code == LE)
3256 compare_code = LT;
3257 else if (code == GT)
3258 compare_code = GE;
3259 }
3260 }
3261
3262 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3263 if (compare_code != UNKNOWN
3264 && GET_MODE (op0) == GET_MODE (out)
3265 && (cf == -1 || ct == -1))
3266 {
3267 /* If lea code below could be used, only optimize
3268 if it results in a 2 insn sequence. */
3269
3270 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3271 || diff == 3 || diff == 5 || diff == 9)
3272 || (compare_code == LT && ct == -1)
3273 || (compare_code == GE && cf == -1))
3274 {
3275 /*
3276 * notl op1 (if necessary)
3277 * sarl $31, op1
3278 * orl cf, op1
3279 */
3280 if (ct != -1)
3281 {
3282 cf = ct;
3283 ct = -1;
3284 code = reverse_condition (code);
3285 }
3286
3287 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3288
3289 out = expand_simple_binop (mode, IOR,
3290 out, GEN_INT (cf),
3291 out, 1, OPTAB_DIRECT);
3292 if (out != operands[0])
3293 emit_move_insn (operands[0], out);
3294
3295 return true;
3296 }
3297 }
3298
3299
3300 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3301 || diff == 3 || diff == 5 || diff == 9)
3302 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3303 && (mode != DImode
3304 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3305 {
3306 /*
3307 * xorl dest,dest
3308 * cmpl op1,op2
3309 * setcc dest
3310 * lea cf(dest*(ct-cf)),dest
3311 *
3312 * Size 14.
3313 *
3314 * This also catches the degenerate setcc-only case.
3315 */
3316
3317 rtx tmp;
3318 int nops;
3319
3320 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3321
3322 nops = 0;
3323 /* On x86_64 the lea instruction operates on Pmode, so we need
3324 to get the arithmetic done in the proper mode to match. */
3325 if (diff == 1)
3326 tmp = copy_rtx (out);
3327 else
3328 {
3329 rtx out1;
3330 out1 = copy_rtx (out);
3331 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3332 nops++;
3333 if (diff & 1)
3334 {
3335 tmp = gen_rtx_PLUS (mode, tmp, out1);
3336 nops++;
3337 }
3338 }
3339 if (cf != 0)
3340 {
c3185b64 3341 tmp = plus_constant (mode, tmp, cf);
2bf6d935
ML
3342 nops++;
3343 }
3344 if (!rtx_equal_p (tmp, out))
3345 {
3346 if (nops == 1)
3347 out = force_operand (tmp, copy_rtx (out));
3348 else
3349 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3350 }
3351 if (!rtx_equal_p (out, operands[0]))
3352 emit_move_insn (operands[0], copy_rtx (out));
3353
3354 return true;
3355 }
3356
3357 /*
3358 * General case: Jumpful:
3359 * xorl dest,dest cmpl op1, op2
3360 * cmpl op1, op2 movl ct, dest
3361 * setcc dest jcc 1f
3362 * decl dest movl cf, dest
3363 * andl (cf-ct),dest 1:
3364 * addl ct,dest
3365 *
3366 * Size 20. Size 14.
3367 *
3368 * This is reasonably steep, but branch mispredict costs are
3369 * high on modern cpus, so consider failing only if optimizing
3370 * for space.
3371 */
3372
3373 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3374 && BRANCH_COST (optimize_insn_for_speed_p (),
3375 false) >= 2)
3376 {
3377 if (cf == 0)
3378 {
3379 machine_mode cmp_mode = GET_MODE (op0);
3380 enum rtx_code new_code;
3381
3382 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3383 {
3384 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3385
8f17461b
UB
3386 /* We may be reversing a non-trapping
3387 comparison to a trapping comparison. */
3388 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3389 && code != EQ && code != NE
3390 && code != ORDERED && code != UNORDERED)
3391 new_code = UNKNOWN;
3392 else
3393 new_code = reverse_condition_maybe_unordered (code);
3394
2bf6d935
ML
3395 }
3396 else
3397 {
3398 new_code = ix86_reverse_condition (code, cmp_mode);
3399 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3400 compare_code = reverse_condition (compare_code);
3401 }
3402
3403 if (new_code != UNKNOWN)
3404 {
3405 cf = ct;
3406 ct = 0;
3407 code = new_code;
3408 }
3409 }
3410
3411 if (compare_code != UNKNOWN)
3412 {
3413 /* notl op1 (if needed)
3414 sarl $31, op1
3415 andl (cf-ct), op1
3416 addl ct, op1
3417
3418 For x < 0 (resp. x <= -1) there will be no notl,
3419 so if possible swap the constants to get rid of the
3420 complement.
3421 True/false will be -1/0 while code below (store flag
3422 followed by decrement) is 0/-1, so the constants need
3423 to be exchanged once more. */
3424
3425 if (compare_code == GE || !cf)
3426 {
3427 code = reverse_condition (code);
3428 compare_code = LT;
3429 }
3430 else
3431 std::swap (ct, cf);
3432
3433 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3434 }
3435 else
3436 {
3437 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3438
3439 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3440 constm1_rtx,
3441 copy_rtx (out), 1, OPTAB_DIRECT);
3442 }
3443
3444 out = expand_simple_binop (mode, AND, copy_rtx (out),
3445 gen_int_mode (cf - ct, mode),
3446 copy_rtx (out), 1, OPTAB_DIRECT);
3447 if (ct)
3448 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3449 copy_rtx (out), 1, OPTAB_DIRECT);
3450 if (!rtx_equal_p (out, operands[0]))
3451 emit_move_insn (operands[0], copy_rtx (out));
3452
3453 return true;
3454 }
3455 }
3456
3457 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3458 {
3459 /* Try a few things more with specific constants and a variable. */
3460
3461 optab op;
3462 rtx var, orig_out, out, tmp;
3463
3464 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3465 return false;
3466
3467 /* If one of the two operands is an interesting constant, load a
3468 constant with the above and mask it in with a logical operation. */
3469
3470 if (CONST_INT_P (operands[2]))
3471 {
3472 var = operands[3];
3473 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3474 operands[3] = constm1_rtx, op = and_optab;
3475 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3476 operands[3] = const0_rtx, op = ior_optab;
3477 else
3478 return false;
3479 }
3480 else if (CONST_INT_P (operands[3]))
3481 {
3482 var = operands[2];
3483 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
e4ced0b6
RS
3484 {
3485 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3486 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3487 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3488 operands[1] = simplify_gen_relational (LT, VOIDmode,
3489 GET_MODE (op0),
3490 op0, const0_rtx);
3491
3492 operands[2] = constm1_rtx;
3493 op = and_optab;
3494 }
2bf6d935
ML
3495 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3496 operands[2] = const0_rtx, op = ior_optab;
3497 else
3498 return false;
3499 }
3500 else
3501 return false;
3502
3503 orig_out = operands[0];
3504 tmp = gen_reg_rtx (mode);
3505 operands[0] = tmp;
3506
3507 /* Recurse to get the constant loaded. */
3508 if (!ix86_expand_int_movcc (operands))
3509 return false;
3510
3511 /* Mask in the interesting variable. */
3512 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3513 OPTAB_WIDEN);
3514 if (!rtx_equal_p (out, orig_out))
3515 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3516
3517 return true;
3518 }
3519
3520 /*
3521 * For comparison with above,
3522 *
3523 * movl cf,dest
3524 * movl ct,tmp
3525 * cmpl op1,op2
3526 * cmovcc tmp,dest
3527 *
3528 * Size 15.
3529 */
3530
3531 if (! nonimmediate_operand (operands[2], mode))
3532 operands[2] = force_reg (mode, operands[2]);
3533 if (! nonimmediate_operand (operands[3], mode))
3534 operands[3] = force_reg (mode, operands[3]);
3535
3536 if (! register_operand (operands[2], VOIDmode)
3537 && (mode == QImode
3538 || ! register_operand (operands[3], VOIDmode)))
3539 operands[2] = force_reg (mode, operands[2]);
3540
3541 if (mode == QImode
3542 && ! register_operand (operands[3], VOIDmode))
3543 operands[3] = force_reg (mode, operands[3]);
3544
3545 emit_insn (compare_seq);
3546 emit_insn (gen_rtx_SET (operands[0],
3547 gen_rtx_IF_THEN_ELSE (mode,
3548 compare_op, operands[2],
3549 operands[3])));
3550 return true;
3551}
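
/* Editorial illustration (hypothetical helper): the branch-free pattern the
   constant/constant paths above aim for, on 32-bit unsigned operands.  A
   compare followed by sbb materializes -(a < b), and masking the constant
   difference selects CT or CF without a jump.  */

static unsigned int
example_int_movcc (unsigned int a, unsigned int b,
                   unsigned int ct, unsigned int cf)
{
  unsigned int mask = 0u - (a < b);     /* 0 or 0xffffffff */
  return cf + (mask & (ct - cf));       /* a < b ? ct : cf  */
}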
3552
3553/* Detect conditional moves that exactly match min/max operational
3554 semantics. Note that this is IEEE safe, as long as we don't
3555 interchange the operands.
3556
3557 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3558 and TRUE if the operation is successful and instructions are emitted. */
3559
3560static bool
3561ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3562 rtx cmp_op1, rtx if_true, rtx if_false)
3563{
3564 machine_mode mode;
3565 bool is_min;
3566 rtx tmp;
3567
3568 if (code == LT)
3569 ;
3570 else if (code == UNGE)
3571 std::swap (if_true, if_false);
3572 else
3573 return false;
3574
3575 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3576 is_min = true;
3577 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3578 is_min = false;
3579 else
3580 return false;
3581
3582 mode = GET_MODE (dest);
3583
3584 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3585 but MODE may be a vector mode and thus not appropriate. */
3586 if (!flag_finite_math_only || flag_signed_zeros)
3587 {
3588 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3589 rtvec v;
3590
3591 if_true = force_reg (mode, if_true);
3592 v = gen_rtvec (2, if_true, if_false);
3593 tmp = gen_rtx_UNSPEC (mode, v, u);
3594 }
3595 else
3596 {
3597 code = is_min ? SMIN : SMAX;
3598 if (MEM_P (if_true) && MEM_P (if_false))
3599 if_true = force_reg (mode, if_true);
3600 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3601 }
3602
3603 emit_insn (gen_rtx_SET (dest, tmp));
3604 return true;
3605}
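
/* Editorial illustration (hypothetical helper): the source shape matched
   above.  Keeping the operand order is what makes the match IEEE safe: for
   unordered inputs SSE min/max return the second source operand, which is
   also what the untransformed conditional yields.  */

static double
example_fp_min (double a, double b)
{
  return a < b ? a : b;    /* matched as SMIN / UNSPEC_IEEE_MIN */
}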
3606
8b905e9b
HL
3607/* Return true if MODE is valid for vector compare to mask register.
3608 The same holds for a conditional vector move with mask register. */
3609static bool
3610ix86_valid_mask_cmp_mode (machine_mode mode)
3611{
3612 /* XOP has its own vector conditional movement. */
a8654147 3613 if (TARGET_XOP && !TARGET_AVX512F)
8b905e9b
HL
3614 return false;
3615
3616 /* AVX512F is needed for mask operation. */
3617 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3618 return false;
3619
3620 /* AVX512BW is needed for vector QI/HImode,
3621 AVX512VL is needed for 128/256-bit vector. */
3622 machine_mode inner_mode = GET_MODE_INNER (mode);
3623 int vector_size = GET_MODE_SIZE (mode);
3624 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3625 return false;
3626
3627 return vector_size == 64 || TARGET_AVX512VL;
3628}
3629
8d0737d8 3630/* Return true if integer mask comparison should be used. */
3631static bool
3632ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3633 rtx op_true, rtx op_false)
3634{
92f372f0
UB
3635 int vector_size = GET_MODE_SIZE (mode);
3636
3637 if (vector_size < 16)
3638 return false;
3639 else if (vector_size == 64)
8d0737d8 3640 return true;
3641
3642 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3643 gcc_assert (!op_true == !op_false);
3644
3645 /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3646 a vector dest is required. */
3647 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3648 return false;
3649
3650 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3651 if (op_false == CONST0_RTX (mode)
3652 || op_true == CONST0_RTX (mode)
3653 || (INTEGRAL_MODE_P (mode)
3654 && (op_true == CONSTM1_RTX (mode)
3655 || op_false == CONSTM1_RTX (mode))))
3656 return false;
3657
3658 return true;
3659}
3660
2bf6d935
ML
3661/* Expand an SSE comparison. Return the register with the result. */
3662
3663static rtx
3664ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3665 rtx op_true, rtx op_false)
3666{
3667 machine_mode mode = GET_MODE (dest);
3668 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3669
3670 /* In the general case the result of a comparison can differ from the operands' type. */
3671 machine_mode cmp_mode;
3672
3673 /* In AVX512F the result of comparison is an integer mask. */
3674 bool maskcmp = false;
3675 rtx x;
3676
8d0737d8 3677 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
2bf6d935
ML
3678 {
3679 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
2bf6d935 3680 maskcmp = true;
8b905e9b 3681 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
2bf6d935
ML
3682 }
3683 else
3684 cmp_mode = cmp_ops_mode;
3685
3686 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3687
a86b3453 3688 bool (*op1_predicate)(rtx, machine_mode)
2bf6d935
ML
3689 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3690
3691 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3692 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3693
3694 if (optimize
3695 || (maskcmp && cmp_mode != mode)
3696 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3697 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3698 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3699
99e4891e 3700 if (maskcmp)
3701 {
3702 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3703 gcc_assert (ok);
3704 return dest;
3705 }
3706
2bf6d935
ML
3707 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3708
8d0737d8 3709 if (cmp_mode != mode)
2bf6d935
ML
3710 {
3711 x = force_reg (cmp_ops_mode, x);
3712 convert_move (dest, x, false);
3713 }
3714 else
3715 emit_insn (gen_rtx_SET (dest, x));
3716
3717 return dest;
3718}
3719
3720/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3721 operations. This is used for both scalar and vector conditional moves. */
3722
3723void
3724ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3725{
3726 machine_mode mode = GET_MODE (dest);
3727 machine_mode cmpmode = GET_MODE (cmp);
3728
9b5d50b7 3729 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3730 if (rtx_equal_p (op_true, op_false))
3731 {
3732 emit_move_insn (dest, op_true);
3733 return;
3734 }
3735
2bf6d935
ML
3736 rtx t2, t3, x;
3737
3738 /* If we have an integer mask and FP value then we need
3739 to cast mask to FP mode. */
3740 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3741 {
3742 cmp = force_reg (cmpmode, cmp);
3743 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3744 }
3745
8d0737d8 3746 /* In AVX512F the result of comparison is an integer mask. */
3747 if (mode != cmpmode
3748 && GET_MODE_CLASS (cmpmode) == MODE_INT)
2bf6d935 3749 {
8d0737d8 3750 gcc_assert (ix86_valid_mask_cmp_mode (mode));
8b905e9b
HL
3751 /* Using vector move with mask register. */
3752 cmp = force_reg (cmpmode, cmp);
3753 /* Optimize for mask zero. */
3754 op_true = (op_true != CONST0_RTX (mode)
3755 ? force_reg (mode, op_true) : op_true);
3756 op_false = (op_false != CONST0_RTX (mode)
3757 ? force_reg (mode, op_false) : op_false);
3758 if (op_true == CONST0_RTX (mode))
2bf6d935 3759 {
8b905e9b 3760 rtx n = gen_reg_rtx (cmpmode);
ee78c20e 3761 if (cmpmode == E_DImode && !TARGET_64BIT)
3762 emit_insn (gen_knotdi (n, cmp));
3763 else
3764 emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
8b905e9b
HL
3765 cmp = n;
3766 /* Reverse op_true op_false. */
3767 std::swap (op_true, op_false);
2bf6d935 3768 }
8b905e9b
HL
3769
3770 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3771 emit_insn (gen_rtx_SET (dest, vec_merge));
3772 return;
2bf6d935
ML
3773 }
3774 else if (vector_all_ones_operand (op_true, mode)
3775 && op_false == CONST0_RTX (mode))
3776 {
3777 emit_insn (gen_rtx_SET (dest, cmp));
3778 return;
3779 }
3780 else if (op_false == CONST0_RTX (mode))
3781 {
3782 op_true = force_reg (mode, op_true);
3783 x = gen_rtx_AND (mode, cmp, op_true);
3784 emit_insn (gen_rtx_SET (dest, x));
3785 return;
3786 }
3787 else if (op_true == CONST0_RTX (mode))
3788 {
3789 op_false = force_reg (mode, op_false);
3790 x = gen_rtx_NOT (mode, cmp);
3791 x = gen_rtx_AND (mode, x, op_false);
3792 emit_insn (gen_rtx_SET (dest, x));
3793 return;
3794 }
3795 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3796 {
3797 op_false = force_reg (mode, op_false);
3798 x = gen_rtx_IOR (mode, cmp, op_false);
3799 emit_insn (gen_rtx_SET (dest, x));
3800 return;
3801 }
3802 else if (TARGET_XOP)
3803 {
3804 op_true = force_reg (mode, op_true);
3805
f1693741
UB
3806 if (GET_MODE_SIZE (mode) < 16
3807 || !nonimmediate_operand (op_false, mode))
2bf6d935
ML
3808 op_false = force_reg (mode, op_false);
3809
3810 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3811 op_true,
3812 op_false)));
3813 return;
3814 }
3815
3816 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3817 rtx d = dest;
3818
3819 if (!vector_operand (op_true, mode))
3820 op_true = force_reg (mode, op_true);
3821
3822 op_false = force_reg (mode, op_false);
3823
3824 switch (mode)
3825 {
b1f7fd8a
UB
3826 case E_V2SFmode:
3827 if (TARGET_SSE4_1)
3828 {
3829 gen = gen_mmx_blendvps;
3830 op_true = force_reg (mode, op_true);
3831 }
3832 break;
2bf6d935
ML
3833 case E_V4SFmode:
3834 if (TARGET_SSE4_1)
3835 gen = gen_sse4_1_blendvps;
3836 break;
3837 case E_V2DFmode:
3838 if (TARGET_SSE4_1)
3839 gen = gen_sse4_1_blendvpd;
3840 break;
3841 case E_SFmode:
3842 if (TARGET_SSE4_1)
3843 {
3844 gen = gen_sse4_1_blendvss;
3845 op_true = force_reg (mode, op_true);
3846 }
3847 break;
3848 case E_DFmode:
3849 if (TARGET_SSE4_1)
3850 {
3851 gen = gen_sse4_1_blendvsd;
3852 op_true = force_reg (mode, op_true);
3853 }
3854 break;
5795ec0e
UB
3855 case E_V8QImode:
3856 case E_V4HImode:
3857 case E_V2SImode:
3858 if (TARGET_SSE4_1)
3859 {
69577673
UB
3860 op_true = force_reg (mode, op_true);
3861
2df9d3c5 3862 gen = gen_mmx_pblendvb64;
5795ec0e
UB
3863 if (mode != V8QImode)
3864 d = gen_reg_rtx (V8QImode);
3865 op_false = gen_lowpart (V8QImode, op_false);
3866 op_true = gen_lowpart (V8QImode, op_true);
3867 cmp = gen_lowpart (V8QImode, cmp);
3868 }
3869 break;
2df9d3c5
UB
3870 case E_V4QImode:
3871 case E_V2HImode:
3872 if (TARGET_SSE4_1)
3873 {
3874 op_true = force_reg (mode, op_true);
3875
3876 gen = gen_mmx_pblendvb32;
3877 if (mode != V4QImode)
3878 d = gen_reg_rtx (V4QImode);
3879 op_false = gen_lowpart (V4QImode, op_false);
3880 op_true = gen_lowpart (V4QImode, op_true);
3881 cmp = gen_lowpart (V4QImode, cmp);
3882 }
3883 break;
2bf6d935
ML
3884 case E_V16QImode:
3885 case E_V8HImode:
9e2a82e1 3886 case E_V8HFmode:
2bf6d935
ML
3887 case E_V4SImode:
3888 case E_V2DImode:
3889 if (TARGET_SSE4_1)
3890 {
3891 gen = gen_sse4_1_pblendvb;
3892 if (mode != V16QImode)
3893 d = gen_reg_rtx (V16QImode);
3894 op_false = gen_lowpart (V16QImode, op_false);
3895 op_true = gen_lowpart (V16QImode, op_true);
3896 cmp = gen_lowpart (V16QImode, cmp);
3897 }
3898 break;
3899 case E_V8SFmode:
3900 if (TARGET_AVX)
3901 gen = gen_avx_blendvps256;
3902 break;
3903 case E_V4DFmode:
3904 if (TARGET_AVX)
3905 gen = gen_avx_blendvpd256;
3906 break;
3907 case E_V32QImode:
3908 case E_V16HImode:
9e2a82e1 3909 case E_V16HFmode:
2bf6d935
ML
3910 case E_V8SImode:
3911 case E_V4DImode:
3912 if (TARGET_AVX2)
3913 {
3914 gen = gen_avx2_pblendvb;
3915 if (mode != V32QImode)
3916 d = gen_reg_rtx (V32QImode);
3917 op_false = gen_lowpart (V32QImode, op_false);
3918 op_true = gen_lowpart (V32QImode, op_true);
3919 cmp = gen_lowpart (V32QImode, cmp);
3920 }
3921 break;
3922
3923 case E_V64QImode:
3924 gen = gen_avx512bw_blendmv64qi;
3925 break;
3926 case E_V32HImode:
3927 gen = gen_avx512bw_blendmv32hi;
3928 break;
9e2a82e1 3929 case E_V32HFmode:
3930 gen = gen_avx512bw_blendmv32hf;
3931 break;
2bf6d935
ML
3932 case E_V16SImode:
3933 gen = gen_avx512f_blendmv16si;
3934 break;
3935 case E_V8DImode:
3936 gen = gen_avx512f_blendmv8di;
3937 break;
3938 case E_V8DFmode:
3939 gen = gen_avx512f_blendmv8df;
3940 break;
3941 case E_V16SFmode:
3942 gen = gen_avx512f_blendmv16sf;
3943 break;
3944
3945 default:
3946 break;
3947 }
3948
3949 if (gen != NULL)
3950 {
3951 emit_insn (gen (d, op_false, op_true, cmp));
3952 if (d != dest)
3953 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3954 }
3955 else
3956 {
3957 op_true = force_reg (mode, op_true);
3958
3959 t2 = gen_reg_rtx (mode);
3960 if (optimize)
3961 t3 = gen_reg_rtx (mode);
3962 else
3963 t3 = dest;
3964
3965 x = gen_rtx_AND (mode, op_true, cmp);
3966 emit_insn (gen_rtx_SET (t2, x));
3967
3968 x = gen_rtx_NOT (mode, cmp);
3969 x = gen_rtx_AND (mode, x, op_false);
3970 emit_insn (gen_rtx_SET (t3, x));
3971
3972 x = gen_rtx_IOR (mode, t3, t2);
3973 emit_insn (gen_rtx_SET (dest, x));
3974 }
3975}
3976
3977/* Swap, force into registers, or otherwise massage the two operands
3978 to an sse comparison with a mask result. Thus we differ a bit from
3979 ix86_prepare_fp_compare_args which expects to produce a flags result.
3980
3981 The DEST operand exists to help determine whether to commute commutative
3982 operators. The POP0/POP1 operands are updated in place. The new
3983 comparison code is returned, or UNKNOWN if not implementable. */
3984
3985static enum rtx_code
3986ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3987 rtx *pop0, rtx *pop1)
3988{
3989 switch (code)
3990 {
3991 case LTGT:
3992 case UNEQ:
3993 /* AVX supports all the needed comparisons. */
3994 if (TARGET_AVX)
3995 break;
3996 /* We have no LTGT as an operator. We could implement it with
3997 NE & ORDERED, but this requires an extra temporary. It's
3998 not clear that it's worth it. */
3999 return UNKNOWN;
4000
4001 case LT:
4002 case LE:
4003 case UNGT:
4004 case UNGE:
4005 /* These are supported directly. */
4006 break;
4007
4008 case EQ:
4009 case NE:
4010 case UNORDERED:
4011 case ORDERED:
4012 /* AVX has 3 operand comparisons, no need to swap anything. */
4013 if (TARGET_AVX)
4014 break;
4015 /* For commutative operators, try to canonicalize the destination
4016 operand to be first in the comparison - this helps reload to
4017 avoid extra moves. */
4018 if (!dest || !rtx_equal_p (dest, *pop1))
4019 break;
4020 /* FALLTHRU */
4021
4022 case GE:
4023 case GT:
4024 case UNLE:
4025 case UNLT:
4026 /* These are not supported directly before AVX, and furthermore
4027 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4028 comparison operands to transform into something that is
4029 supported. */
4030 std::swap (*pop0, *pop1);
4031 code = swap_condition (code);
4032 break;
4033
4034 default:
4035 gcc_unreachable ();
4036 }
4037
4038 return code;
4039}
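/* Illustrative note: on pre-AVX SSE only LT, LE, UNGT, UNGE (plus the
   commutative EQ, NE, ORDERED, UNORDERED) map directly onto CMPPS/CMPPD
   predicates, so a request such as  a > b  is rewritten above as  b < a
   by swapping *POP0/*POP1 and calling swap_condition, after which the
   ordinary CMPLT pattern can be used.  */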
4040
4041/* Expand a floating-point conditional move. Return true if successful. */
4042
4043bool
4044ix86_expand_fp_movcc (rtx operands[])
4045{
4046 machine_mode mode = GET_MODE (operands[0]);
4047 enum rtx_code code = GET_CODE (operands[1]);
4048 rtx tmp, compare_op;
4049 rtx op0 = XEXP (operands[1], 0);
4050 rtx op1 = XEXP (operands[1], 1);
4051
 4052  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4053 {
4054 machine_mode cmode;
4055
4056 /* Since we've no cmove for sse registers, don't force bad register
4057 allocation just to gain access to it. Deny movcc when the
4058 comparison mode doesn't match the move mode. */
4059 cmode = GET_MODE (op0);
4060 if (cmode == VOIDmode)
4061 cmode = GET_MODE (op1);
4062 if (cmode != mode)
4063 return false;
4064
4065 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4066 if (code == UNKNOWN)
4067 return false;
4068
4069 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4070 operands[2], operands[3]))
4071 return true;
4072
4073 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4074 operands[2], operands[3]);
4075 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4076 return true;
4077 }
4078
4079 if (GET_MODE (op0) == TImode
4080 || (GET_MODE (op0) == DImode
4081 && !TARGET_64BIT))
4082 return false;
4083
4084 /* The floating point conditional move instructions don't directly
4085 support conditions resulting from a signed integer comparison. */
4086
4087 compare_op = ix86_expand_compare (code, op0, op1);
4088 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4089 {
4090 tmp = gen_reg_rtx (QImode);
4091 ix86_expand_setcc (tmp, code, op0, op1);
4092
4093 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4094 }
4095
4096 emit_insn (gen_rtx_SET (operands[0],
4097 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4098 operands[2], operands[3])));
4099
4100 return true;
4101}
4102
4103/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4104
4105static int
4106ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4107{
4108 switch (code)
4109 {
4110 case EQ:
4111 return 0;
4112 case LT:
4113 case LTU:
4114 return 1;
4115 case LE:
4116 case LEU:
4117 return 2;
4118 case NE:
4119 return 4;
4120 case GE:
4121 case GEU:
4122 return 5;
4123 case GT:
4124 case GTU:
4125 return 6;
4126 default:
4127 gcc_unreachable ();
4128 }
4129}
4130
4131/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4132
4133static int
4134ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4135{
4136 switch (code)
4137 {
4138 case EQ:
4139 return 0x00;
4140 case NE:
4141 return 0x04;
4142 case GT:
4143 return 0x0e;
4144 case LE:
4145 return 0x02;
4146 case GE:
4147 return 0x0d;
4148 case LT:
4149 return 0x01;
4150 case UNLE:
4151 return 0x0a;
4152 case UNLT:
4153 return 0x09;
4154 case UNGE:
4155 return 0x05;
4156 case UNGT:
4157 return 0x06;
4158 case UNEQ:
4159 return 0x18;
4160 case LTGT:
4161 return 0x0c;
4162 case ORDERED:
4163 return 0x07;
4164 case UNORDERED:
4165 return 0x03;
4166 default:
4167 gcc_unreachable ();
4168 }
4169}
4170
4171/* Return immediate value to be used in UNSPEC_PCMP
4172 for comparison CODE in MODE. */
4173
4174static int
4175ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4176{
4177 if (FLOAT_MODE_P (mode))
4178 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4179 return ix86_int_cmp_code_to_pcmp_immediate (code);
4180}
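/* Illustrative examples, derived from the two tables above: the returned
   value becomes the immediate operand of VPCMP/VPCMPU or VCMPPS/VCMPPD on
   the AVX-512 mask-compare path, e.g.

     ix86_cmp_code_to_pcmp_immediate (LEU, V16SImode)  == 2     (vpcmpud $2)
     ix86_cmp_code_to_pcmp_immediate (UNLT, V8DFmode)  == 0x09  (vcmppd $0x09)  */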
4181
4182/* Expand AVX-512 vector comparison. */
4183
4184bool
 4185ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
 4186{
 4187  machine_mode mask_mode = GET_MODE (dest);
 4188  machine_mode cmp_mode = GET_MODE (cmp_op0);
4189 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4190 int unspec_code;
4191 rtx unspec;
4192
4193 switch (code)
4194 {
4195 case LEU:
4196 case GTU:
4197 case GEU:
4198 case LTU:
4199 unspec_code = UNSPEC_UNSIGNED_PCMP;
4200 break;
4201
4202 default:
4203 unspec_code = UNSPEC_PCMP;
4204 }
4205
 4206  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
 4207			   unspec_code);
 4208  emit_insn (gen_rtx_SET (dest, unspec));
4209
4210 return true;
4211}
4212
4213/* Expand fp vector comparison. */
4214
4215bool
4216ix86_expand_fp_vec_cmp (rtx operands[])
4217{
4218 enum rtx_code code = GET_CODE (operands[1]);
4219 rtx cmp;
4220
4221 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4222 &operands[2], &operands[3]);
4223 if (code == UNKNOWN)
4224 {
4225 rtx temp;
4226 switch (GET_CODE (operands[1]))
4227 {
4228 case LTGT:
4229 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4230 operands[3], NULL, NULL);
4231 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4232 operands[3], NULL, NULL);
4233 code = AND;
4234 break;
4235 case UNEQ:
4236 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4237 operands[3], NULL, NULL);
4238 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4239 operands[3], NULL, NULL);
4240 code = IOR;
4241 break;
4242 default:
4243 gcc_unreachable ();
4244 }
4245 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4246 OPTAB_DIRECT);
4247 }
4248 else
4249 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
 4250			     NULL, NULL);
4251
4252 if (operands[0] != cmp)
4253 emit_move_insn (operands[0], cmp);
4254
4255 return true;
4256}
4257
4258static rtx
4259ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4260 rtx op_true, rtx op_false, bool *negate)
4261{
4262 machine_mode data_mode = GET_MODE (dest);
4263 machine_mode mode = GET_MODE (cop0);
4264 rtx x;
4265
4266 *negate = false;
4267
4268 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4269 if (TARGET_XOP
4270 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4271 && GET_MODE_SIZE (mode) <= 16)
 4272    ;
 4273  /* AVX512F supports all of the comparisons
 4274     on all 128/256/512-bit vector int types.  */
 4275  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
 4276    ;
4277 else
4278 {
4279 /* Canonicalize the comparison to EQ, GT, GTU. */
4280 switch (code)
4281 {
4282 case EQ:
4283 case GT:
4284 case GTU:
4285 break;
4286
4287 case NE:
4288 case LE:
4289 case LEU:
4290 code = reverse_condition (code);
4291 *negate = true;
4292 break;
4293
4294 case GE:
4295 case GEU:
4296 code = reverse_condition (code);
4297 *negate = true;
4298 /* FALLTHRU */
4299
4300 case LT:
4301 case LTU:
4302 std::swap (cop0, cop1);
4303 code = swap_condition (code);
4304 break;
4305
4306 default:
4307 gcc_unreachable ();
4308 }
4309
4310 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4311 if (mode == V2DImode)
4312 {
4313 switch (code)
4314 {
4315 case EQ:
4316 /* SSE4.1 supports EQ. */
4317 if (!TARGET_SSE4_1)
4318 return NULL;
4319 break;
4320
4321 case GT:
4322 case GTU:
4323 /* SSE4.2 supports GT/GTU. */
4324 if (!TARGET_SSE4_2)
4325 return NULL;
4326 break;
4327
4328 default:
4329 gcc_unreachable ();
4330 }
4331 }
4332
4333 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4334 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4335 if (*negate)
4336 std::swap (optrue, opfalse);
4337
4338 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4339 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
 4340	 min (x, y) == x).  While we add one instruction (the minimum),
 4341	 we remove the need for two instructions in the negation, as the
 4342	 comparison already produces the result in the desired form.
4343 When using masks, do it for SI/DImode element types, as it is shorter
4344 than the two subtractions. */
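      /* Illustrative example: for V16QImode with SSE2, the unsigned test
	 x <= y (i.e. !(x > y)) can thus be emitted as

	   tem  = umin (x, y);		(pminub)
	   mask = (tem == x);		(pcmpeqb)

	 with no final negation, instead of emulating x > y and then
	 inverting the result.  */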
4345 if ((code != EQ
4346 && GET_MODE_SIZE (mode) != 64
4347 && vector_all_ones_operand (opfalse, data_mode)
4348 && optrue == CONST0_RTX (data_mode))
4349 || (code == GTU
4350 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4351 /* Don't do it if not using integer masks and we'd end up with
4352 the right values in the registers though. */
4353 && (GET_MODE_SIZE (mode) == 64
4354 || !vector_all_ones_operand (optrue, data_mode)
4355 || opfalse != CONST0_RTX (data_mode))))
4356 {
4357 rtx (*gen) (rtx, rtx, rtx) = NULL;
4358
4359 switch (mode)
4360 {
4361 case E_V16SImode:
4362 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4363 break;
4364 case E_V8DImode:
4365 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4366 cop0 = force_reg (mode, cop0);
4367 cop1 = force_reg (mode, cop1);
4368 break;
4369 case E_V32QImode:
4370 if (TARGET_AVX2)
4371 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4372 break;
4373 case E_V16HImode:
4374 if (TARGET_AVX2)
4375 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4376 break;
4377 case E_V8SImode:
4378 if (TARGET_AVX2)
4379 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4380 break;
4381 case E_V4DImode:
4382 if (TARGET_AVX512VL)
4383 {
4384 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4385 cop0 = force_reg (mode, cop0);
4386 cop1 = force_reg (mode, cop1);
4387 }
4388 break;
4389 case E_V16QImode:
4390 if (code == GTU && TARGET_SSE2)
4391 gen = gen_uminv16qi3;
4392 else if (code == GT && TARGET_SSE4_1)
4393 gen = gen_sminv16qi3;
4394 break;
4395 case E_V8QImode:
4396 if (code == GTU && TARGET_SSE2)
4397 gen = gen_uminv8qi3;
4398 else if (code == GT && TARGET_SSE4_1)
4399 gen = gen_sminv8qi3;
4400 break;
4401 case E_V4QImode:
4402 if (code == GTU && TARGET_SSE2)
4403 gen = gen_uminv4qi3;
4404 else if (code == GT && TARGET_SSE4_1)
4405 gen = gen_sminv4qi3;
4406 break;
4407 case E_V8HImode:
4408 if (code == GTU && TARGET_SSE4_1)
4409 gen = gen_uminv8hi3;
4410 else if (code == GT && TARGET_SSE2)
4411 gen = gen_sminv8hi3;
4412 break;
4413 case E_V4HImode:
4414 if (code == GTU && TARGET_SSE4_1)
4415 gen = gen_uminv4hi3;
4416 else if (code == GT && TARGET_SSE2)
4417 gen = gen_sminv4hi3;
4418 break;
4419 case E_V2HImode:
4420 if (code == GTU && TARGET_SSE4_1)
4421 gen = gen_uminv2hi3;
4422 else if (code == GT && TARGET_SSE2)
4423 gen = gen_sminv2hi3;
4424 break;
4425 case E_V4SImode:
4426 if (TARGET_SSE4_1)
4427 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4428 break;
4429 case E_V2SImode:
4430 if (TARGET_SSE4_1)
4431 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4432 break;
4433 case E_V2DImode:
4434 if (TARGET_AVX512VL)
4435 {
4436 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4437 cop0 = force_reg (mode, cop0);
4438 cop1 = force_reg (mode, cop1);
4439 }
4440 break;
4441 default:
4442 break;
4443 }
4444
4445 if (gen)
4446 {
4447 rtx tem = gen_reg_rtx (mode);
4448 if (!vector_operand (cop0, mode))
4449 cop0 = force_reg (mode, cop0);
4450 if (!vector_operand (cop1, mode))
4451 cop1 = force_reg (mode, cop1);
4452 *negate = !*negate;
4453 emit_insn (gen (tem, cop0, cop1));
4454 cop1 = tem;
4455 code = EQ;
4456 }
4457 }
4458
4459 /* Unsigned parallel compare is not supported by the hardware.
4460 Play some tricks to turn this into a signed comparison
4461 against 0. */
4462 if (code == GTU)
4463 {
4464 cop0 = force_reg (mode, cop0);
4465
4466 switch (mode)
4467 {
4468 case E_V16SImode:
4469 case E_V8DImode:
4470 case E_V8SImode:
4471 case E_V4DImode:
4472 case E_V4SImode:
 4473	case E_V2SImode:
4474 case E_V2DImode:
4475 {
4476 rtx t1, t2, mask;
 4477
4478 /* Subtract (-(INT MAX) - 1) from both operands to make
4479 them signed. */
4480 mask = ix86_build_signbit_mask (mode, true, false);
4481 t1 = gen_reg_rtx (mode);
 4482	    emit_insn (gen_sub3_insn (t1, cop0, mask));
4483
4484 t2 = gen_reg_rtx (mode);
 4485	    emit_insn (gen_sub3_insn (t2, cop1, mask));
4486
4487 cop0 = t1;
4488 cop1 = t2;
4489 code = GT;
4490 }
4491 break;
4492
4493 case E_V64QImode:
4494 case E_V32HImode:
4495 case E_V32QImode:
4496 case E_V16HImode:
4497 case E_V16QImode:
 4498	case E_V8QImode:
 4499	case E_V4QImode:
 4500	case E_V8HImode:
 4501	case E_V4HImode:
 4502	case E_V2HImode:
4503 /* Perform a parallel unsigned saturating subtraction. */
4504 x = gen_reg_rtx (mode);
4505 emit_insn (gen_rtx_SET
4506 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4507 cop0 = x;
4508 cop1 = CONST0_RTX (mode);
4509 code = EQ;
4510 *negate = !*negate;
4511 break;
4512
4513 default:
4514 gcc_unreachable ();
4515 }
4516 }
4517 }
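  /* Illustrative note: the two GTU fallbacks above rely on the identities

       a >u b  <->  (a - 0x80000000) >s (b - 0x80000000)   (32-bit elements)
       a >u b  <->  (a -us b) != 0                          (8/16-bit elements)

     where -us is unsigned saturating subtraction; the first is implemented
     with a signed PCMPGT after biasing both operands by the sign-bit mask,
     the second with PSUBUS followed by a compare against zero.  */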
4518
4519 if (*negate)
4520 std::swap (op_true, op_false);
4521
4522 /* Allow the comparison to be done in one mode, but the movcc to
4523 happen in another mode. */
4524 if (data_mode == mode)
4525 {
4526 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4527 op_true, op_false);
4528 }
4529 else
4530 {
4531 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4532 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4533 op_true, op_false);
4534 if (GET_MODE (x) == mode)
4535 x = gen_lowpart (data_mode, x);
4536 }
4537
4538 return x;
4539}
4540
4541/* Expand integer vector comparison. */
4542
4543bool
4544ix86_expand_int_vec_cmp (rtx operands[])
4545{
4546 rtx_code code = GET_CODE (operands[1]);
4547 bool negate = false;
4548 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4549 operands[3], NULL, NULL, &negate);
4550
4551 if (!cmp)
4552 return false;
4553
4554 if (negate)
4555 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4556 CONST0_RTX (GET_MODE (cmp)),
4557 NULL, NULL, &negate);
4558
4559 gcc_assert (!negate);
4560
4561 if (operands[0] != cmp)
4562 emit_move_insn (operands[0], cmp);
4563
4564 return true;
4565}
4566
4567/* Expand a floating-point vector conditional move; a vcond operation
4568 rather than a movcc operation. */
4569
4570bool
4571ix86_expand_fp_vcond (rtx operands[])
4572{
4573 enum rtx_code code = GET_CODE (operands[3]);
4574 rtx cmp;
4575
4576 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4577 &operands[4], &operands[5]);
4578 if (code == UNKNOWN)
4579 {
4580 rtx temp;
4581 switch (GET_CODE (operands[3]))
4582 {
4583 case LTGT:
4584 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4585 operands[5], operands[0], operands[0]);
4586 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4587 operands[5], operands[1], operands[2]);
4588 code = AND;
4589 break;
4590 case UNEQ:
4591 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4592 operands[5], operands[0], operands[0]);
4593 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4594 operands[5], operands[1], operands[2]);
4595 code = IOR;
4596 break;
4597 default:
4598 gcc_unreachable ();
4599 }
4600 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4601 OPTAB_DIRECT);
4602 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4603 return true;
4604 }
4605
4606 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4607 operands[5], operands[1], operands[2]))
4608 return true;
4609
4610 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4611 operands[1], operands[2]);
4612 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4613 return true;
4614}
4615
4616/* Expand a signed/unsigned integral vector conditional move. */
4617
4618bool
4619ix86_expand_int_vcond (rtx operands[])
4620{
4621 machine_mode data_mode = GET_MODE (operands[0]);
4622 machine_mode mode = GET_MODE (operands[4]);
4623 enum rtx_code code = GET_CODE (operands[3]);
4624 bool negate = false;
4625 rtx x, cop0, cop1;
4626
4627 cop0 = operands[4];
4628 cop1 = operands[5];
4629
4630 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4631 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4632 if ((code == LT || code == GE)
4633 && data_mode == mode
4634 && cop1 == CONST0_RTX (mode)
4635 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4636 && GET_MODE_UNIT_SIZE (data_mode) > 1
4637 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4638 && (GET_MODE_SIZE (data_mode) == 16
4639 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4640 {
4641 rtx negop = operands[2 - (code == LT)];
4642 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4643 if (negop == CONST1_RTX (data_mode))
4644 {
4645 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4646 operands[0], 1, OPTAB_DIRECT);
4647 if (res != operands[0])
4648 emit_move_insn (operands[0], res);
4649 return true;
4650 }
4651 else if (GET_MODE_INNER (data_mode) != DImode
4652 && vector_all_ones_operand (negop, data_mode))
4653 {
4654 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4655 operands[0], 0, OPTAB_DIRECT);
4656 if (res != operands[0])
4657 emit_move_insn (operands[0], res);
4658 return true;
4659 }
4660 }
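  /* Illustrative example: for V4SImode the optimization above turns

       x < 0 ? -1 : 0	into an arithmetic shift  x >> 31  (psrad $31)
       x < 0 ?  1 : 0	into a logical shift	  x >> 31  (psrld $31)

     replacing a pcmpgtd-based compare-and-select sequence with a single
     shift.  */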
4661
4662 if (!nonimmediate_operand (cop1, mode))
4663 cop1 = force_reg (mode, cop1);
4664 if (!general_operand (operands[1], data_mode))
4665 operands[1] = force_reg (data_mode, operands[1]);
4666 if (!general_operand (operands[2], data_mode))
4667 operands[2] = force_reg (data_mode, operands[2]);
4668
4669 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4670 operands[1], operands[2], &negate);
4671
4672 if (!x)
4673 return false;
4674
4675 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4676 operands[2-negate]);
4677 return true;
4678}
4679
4680static bool
4681ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4682 struct expand_vec_perm_d *d)
4683{
4684 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4685 expander, so args are either in d, or in op0, op1 etc. */
4686 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4687 machine_mode maskmode = mode;
4688 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4689
4690 switch (mode)
4691 {
 4692    case E_V16QImode:
4693 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4694 gen = gen_avx512vl_vpermt2varv16qi3;
4695 break;
4696 case E_V32QImode:
4697 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4698 gen = gen_avx512vl_vpermt2varv32qi3;
4699 break;
4700 case E_V64QImode:
4701 if (TARGET_AVX512VBMI)
4702 gen = gen_avx512bw_vpermt2varv64qi3;
4703 break;
4704 case E_V8HImode:
4705 if (TARGET_AVX512VL && TARGET_AVX512BW)
4706 gen = gen_avx512vl_vpermt2varv8hi3;
4707 break;
4708 case E_V16HImode:
4709 if (TARGET_AVX512VL && TARGET_AVX512BW)
4710 gen = gen_avx512vl_vpermt2varv16hi3;
4711 break;
4712 case E_V32HImode:
4713 if (TARGET_AVX512BW)
4714 gen = gen_avx512bw_vpermt2varv32hi3;
4715 break;
4716 case E_V4SImode:
4717 if (TARGET_AVX512VL)
4718 gen = gen_avx512vl_vpermt2varv4si3;
4719 break;
4720 case E_V8SImode:
4721 if (TARGET_AVX512VL)
4722 gen = gen_avx512vl_vpermt2varv8si3;
4723 break;
4724 case E_V16SImode:
4725 if (TARGET_AVX512F)
4726 gen = gen_avx512f_vpermt2varv16si3;
4727 break;
4728 case E_V4SFmode:
4729 if (TARGET_AVX512VL)
4730 {
4731 gen = gen_avx512vl_vpermt2varv4sf3;
4732 maskmode = V4SImode;
4733 }
4734 break;
4735 case E_V8SFmode:
4736 if (TARGET_AVX512VL)
4737 {
4738 gen = gen_avx512vl_vpermt2varv8sf3;
4739 maskmode = V8SImode;
4740 }
4741 break;
4742 case E_V16SFmode:
4743 if (TARGET_AVX512F)
4744 {
4745 gen = gen_avx512f_vpermt2varv16sf3;
4746 maskmode = V16SImode;
4747 }
4748 break;
4749 case E_V2DImode:
4750 if (TARGET_AVX512VL)
4751 gen = gen_avx512vl_vpermt2varv2di3;
4752 break;
4753 case E_V4DImode:
4754 if (TARGET_AVX512VL)
4755 gen = gen_avx512vl_vpermt2varv4di3;
4756 break;
4757 case E_V8DImode:
4758 if (TARGET_AVX512F)
4759 gen = gen_avx512f_vpermt2varv8di3;
4760 break;
4761 case E_V2DFmode:
4762 if (TARGET_AVX512VL)
4763 {
4764 gen = gen_avx512vl_vpermt2varv2df3;
4765 maskmode = V2DImode;
4766 }
4767 break;
4768 case E_V4DFmode:
4769 if (TARGET_AVX512VL)
4770 {
4771 gen = gen_avx512vl_vpermt2varv4df3;
4772 maskmode = V4DImode;
4773 }
4774 break;
4775 case E_V8DFmode:
4776 if (TARGET_AVX512F)
4777 {
4778 gen = gen_avx512f_vpermt2varv8df3;
4779 maskmode = V8DImode;
4780 }
4781 break;
4782 default:
4783 break;
4784 }
4785
4786 if (gen == NULL)
4787 return false;
4788
4789 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4790 expander, so args are either in d, or in op0, op1 etc. */
4791 if (d)
4792 {
4793 rtx vec[64];
4794 target = d->target;
4795 op0 = d->op0;
4796 op1 = d->op1;
4797 for (int i = 0; i < d->nelt; ++i)
4798 vec[i] = GEN_INT (d->perm[i]);
4799 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4800 }
4801
4802 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4803 return true;
4804}
4805
4806/* Expand a variable vector permutation. */
4807
4808void
4809ix86_expand_vec_perm (rtx operands[])
4810{
4811 rtx target = operands[0];
4812 rtx op0 = operands[1];
4813 rtx op1 = operands[2];
4814 rtx mask = operands[3];
4815 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4816 machine_mode mode = GET_MODE (op0);
4817 machine_mode maskmode = GET_MODE (mask);
4818 int w, e, i;
4819 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4820
4821 /* Number of elements in the vector. */
4822 w = GET_MODE_NUNITS (mode);
4823 e = GET_MODE_UNIT_SIZE (mode);
4824 gcc_assert (w <= 64);
4825
4826 if (TARGET_AVX512F && one_operand_shuffle)
4827 {
4828 rtx (*gen) (rtx, rtx, rtx) = NULL;
4829 switch (mode)
4830 {
4831 case E_V16SImode:
4832 gen =gen_avx512f_permvarv16si;
4833 break;
4834 case E_V16SFmode:
4835 gen = gen_avx512f_permvarv16sf;
4836 break;
4837 case E_V8DImode:
4838 gen = gen_avx512f_permvarv8di;
4839 break;
4840 case E_V8DFmode:
4841 gen = gen_avx512f_permvarv8df;
4842 break;
4843 default:
4844 break;
4845 }
4846 if (gen != NULL)
4847 {
4848 emit_insn (gen (target, op0, mask));
4849 return;
4850 }
4851 }
4852
4853 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4854 return;
4855
4856 if (TARGET_AVX2)
4857 {
4858 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4859 {
 4860	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
 4861	     a constant shuffle operand.  With a tiny bit of effort we can
 4862	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
 4863	     unfortunate but there's no avoiding it.
 4864	     Similarly, V16HImode has no instruction for variable shuffling,
 4865	     while for V32QImode we can use vpshufb; vpshufb; vpermq; vpor
 4866	     after preparing suitable masks.  */
4867
4868 if (mode == V16HImode)
4869 {
4870 maskmode = mode = V32QImode;
4871 w = 32;
4872 e = 1;
4873 }
4874 else
4875 {
4876 maskmode = mode = V8SImode;
4877 w = 8;
4878 e = 4;
4879 }
4880 t1 = gen_reg_rtx (maskmode);
4881
4882 /* Replicate the low bits of the V4DImode mask into V8SImode:
4883 mask = { A B C D }
4884 t1 = { A A B B C C D D }. */
4885 for (i = 0; i < w / 2; ++i)
4886 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4887 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4888 vt = force_reg (maskmode, vt);
4889 mask = gen_lowpart (maskmode, mask);
4890 if (maskmode == V8SImode)
4891 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4892 else
4893 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4894
 4895	  /* Multiply the shuffle indices by two.  */
4896 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4897 OPTAB_DIRECT);
4898
 4899	  /* Add one to the odd shuffle indices:
4900 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4901 for (i = 0; i < w / 2; ++i)
4902 {
4903 vec[i * 2] = const0_rtx;
4904 vec[i * 2 + 1] = const1_rtx;
4905 }
4906 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4907 vt = validize_mem (force_const_mem (maskmode, vt));
4908 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4909 OPTAB_DIRECT);
4910
4911 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4912 operands[3] = mask = t1;
4913 target = gen_reg_rtx (mode);
4914 op0 = gen_lowpart (mode, op0);
4915 op1 = gen_lowpart (mode, op1);
4916 }
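	  /* Worked example: for a V4DImode shuffle with mask = { 3 0 2 1 },
	     the code above builds the V8SImode control

	       t1 = { 3 3 0 0 2 2 1 1 }	 after the replication step
	       t1 = { 6 6 0 0 4 4 2 2 }	 after doubling (t1 + t1)
	       t1 = { 6 7 0 1 4 5 2 3 }	 after adding { 0 1 0 1 ... }

	     which selects exactly the two SImode halves of each requested
	     DImode element once fed to vpermd.  */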
4917
4918 switch (mode)
4919 {
4920 case E_V8SImode:
4921 /* The VPERMD and VPERMPS instructions already properly ignore
4922 the high bits of the shuffle elements. No need for us to
4923 perform an AND ourselves. */
4924 if (one_operand_shuffle)
4925 {
4926 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4927 if (target != operands[0])
4928 emit_move_insn (operands[0],
4929 gen_lowpart (GET_MODE (operands[0]), target));
4930 }
4931 else
4932 {
4933 t1 = gen_reg_rtx (V8SImode);
4934 t2 = gen_reg_rtx (V8SImode);
4935 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4936 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4937 goto merge_two;
4938 }
4939 return;
4940
4941 case E_V8SFmode:
4942 mask = gen_lowpart (V8SImode, mask);
4943 if (one_operand_shuffle)
4944 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4945 else
4946 {
4947 t1 = gen_reg_rtx (V8SFmode);
4948 t2 = gen_reg_rtx (V8SFmode);
4949 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4950 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4951 goto merge_two;
4952 }
4953 return;
4954
4955 case E_V4SImode:
4956 /* By combining the two 128-bit input vectors into one 256-bit
4957 input vector, we can use VPERMD and VPERMPS for the full
4958 two-operand shuffle. */
4959 t1 = gen_reg_rtx (V8SImode);
4960 t2 = gen_reg_rtx (V8SImode);
4961 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4962 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4963 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4964 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4965 return;
4966
4967 case E_V4SFmode:
4968 t1 = gen_reg_rtx (V8SFmode);
4969 t2 = gen_reg_rtx (V8SImode);
4970 mask = gen_lowpart (V4SImode, mask);
4971 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4972 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4973 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4974 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4975 return;
4976
4977 case E_V32QImode:
4978 t1 = gen_reg_rtx (V32QImode);
4979 t2 = gen_reg_rtx (V32QImode);
4980 t3 = gen_reg_rtx (V32QImode);
4981 vt2 = GEN_INT (-128);
4982 vt = gen_const_vec_duplicate (V32QImode, vt2);
4983 vt = force_reg (V32QImode, vt);
4984 for (i = 0; i < 32; i++)
4985 vec[i] = i < 16 ? vt2 : const0_rtx;
4986 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4987 vt2 = force_reg (V32QImode, vt2);
4988 /* From mask create two adjusted masks, which contain the same
4989 bits as mask in the low 7 bits of each vector element.
 4990	     The first mask will have the most significant bit clear
 4991	     if it requests an element from the same 128-bit lane
 4992	     and the MSB set if it requests an element from the other 128-bit lane.
4993 The second mask will have the opposite values of the MSB,
4994 and additionally will have its 128-bit lanes swapped.
4995 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4996 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4997 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
 4998	     stands for the other 12 bytes.  */
4999 /* The bit whether element is from the same lane or the other
5000 lane is bit 4, so shift it up by 3 to the MSB position. */
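      /* Illustrative note: byte indices 0-15 select the low 128-bit lane and
	 16-31 the high lane, so bit 4 of each index distinguishes the lanes;
	 shifting it into bit 7 puts it where VPSHUFB treats it as the
	 "zero this result byte" flag.  */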
5001 t5 = gen_reg_rtx (V4DImode);
5002 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5003 GEN_INT (3)));
5004 /* Clear MSB bits from the mask just in case it had them set. */
5005 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5006 /* After this t1 will have MSB set for elements from other lane. */
5007 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5008 /* Clear bits other than MSB. */
5009 emit_insn (gen_andv32qi3 (t1, t1, vt));
5010 /* Or in the lower bits from mask into t3. */
5011 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5012 /* And invert MSB bits in t1, so MSB is set for elements from the same
5013 lane. */
5014 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5015 /* Swap 128-bit lanes in t3. */
5016 t6 = gen_reg_rtx (V4DImode);
5017 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5018 const2_rtx, GEN_INT (3),
5019 const0_rtx, const1_rtx));
 5020      /* Also OR in the lower bits from mask into t1.  */
5021 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5022 if (one_operand_shuffle)
5023 {
5024 /* Each of these shuffles will put 0s in places where
5025 element from the other 128-bit lane is needed, otherwise
5026 will shuffle in the requested value. */
5027 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5028 gen_lowpart (V32QImode, t6)));
5029 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5030 /* For t3 the 128-bit lanes are swapped again. */
5031 t7 = gen_reg_rtx (V4DImode);
5032 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5033 const2_rtx, GEN_INT (3),
5034 const0_rtx, const1_rtx));
 5035	  /* ORing both together yields the result.  */
5036 emit_insn (gen_iorv32qi3 (target, t1,
5037 gen_lowpart (V32QImode, t7)));
5038 if (target != operands[0])
5039 emit_move_insn (operands[0],
5040 gen_lowpart (GET_MODE (operands[0]), target));
5041 return;
5042 }
5043
5044 t4 = gen_reg_rtx (V32QImode);
 5045      /* Similarly to the above one_operand_shuffle code, just
 5046	 repeated twice, once for each operand.  The merge_two:
 5047	 code below will merge the two results together.  */
5048 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5049 gen_lowpart (V32QImode, t6)));
5050 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5051 gen_lowpart (V32QImode, t6)));
5052 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5053 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5054 t7 = gen_reg_rtx (V4DImode);
5055 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5056 const2_rtx, GEN_INT (3),
5057 const0_rtx, const1_rtx));
5058 t8 = gen_reg_rtx (V4DImode);
5059 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5060 const2_rtx, GEN_INT (3),
5061 const0_rtx, const1_rtx));
5062 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5063 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5064 t1 = t4;
5065 t2 = t3;
5066 goto merge_two;
5067
5068 default:
5069 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5070 break;
5071 }
5072 }
5073
5074 if (TARGET_XOP)
5075 {
5076 /* The XOP VPPERM insn supports three inputs. By ignoring the
5077 one_operand_shuffle special case, we avoid creating another
5078 set of constant vectors in memory. */
5079 one_operand_shuffle = false;
5080
5081 /* mask = mask & {2*w-1, ...} */
5082 vt = GEN_INT (2*w - 1);
5083 }
5084 else
5085 {
5086 /* mask = mask & {w-1, ...} */
5087 vt = GEN_INT (w - 1);
5088 }
5089
5090 vt = gen_const_vec_duplicate (maskmode, vt);
5091 mask = expand_simple_binop (maskmode, AND, mask, vt,
5092 NULL_RTX, 0, OPTAB_DIRECT);
5093
5094 /* For non-QImode operations, convert the word permutation control
5095 into a byte permutation control. */
5096 if (mode != V16QImode)
5097 {
5098 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5099 GEN_INT (exact_log2 (e)),
5100 NULL_RTX, 0, OPTAB_DIRECT);
5101
5102 /* Convert mask to vector of chars. */
5103 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5104
5105 /* Replicate each of the input bytes into byte positions:
5106 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5107 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5108 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5109 for (i = 0; i < 16; ++i)
5110 vec[i] = GEN_INT (i/e * e);
5111 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5112 vt = validize_mem (force_const_mem (V16QImode, vt));
5113 if (TARGET_XOP)
5114 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5115 else
5116 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5117
5118 /* Convert it into the byte positions by doing
5119 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5120 for (i = 0; i < 16; ++i)
5121 vec[i] = GEN_INT (i % e);
5122 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5123 vt = validize_mem (force_const_mem (V16QImode, vt));
5124 emit_insn (gen_addv16qi3 (mask, mask, vt));
5125 }
5126
5127 /* The actual shuffle operations all operate on V16QImode. */
5128 op0 = gen_lowpart (V16QImode, op0);
5129 op1 = gen_lowpart (V16QImode, op1);
5130
5131 if (TARGET_XOP)
5132 {
5133 if (GET_MODE (target) != V16QImode)
5134 target = gen_reg_rtx (V16QImode);
5135 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5136 if (target != operands[0])
5137 emit_move_insn (operands[0],
5138 gen_lowpart (GET_MODE (operands[0]), target));
5139 }
5140 else if (one_operand_shuffle)
5141 {
5142 if (GET_MODE (target) != V16QImode)
5143 target = gen_reg_rtx (V16QImode);
5144 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5145 if (target != operands[0])
5146 emit_move_insn (operands[0],
5147 gen_lowpart (GET_MODE (operands[0]), target));
5148 }
5149 else
5150 {
5151 rtx xops[6];
5152 bool ok;
5153
5154 /* Shuffle the two input vectors independently. */
5155 t1 = gen_reg_rtx (V16QImode);
5156 t2 = gen_reg_rtx (V16QImode);
5157 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5158 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5159
5160 merge_two:
5161 /* Then merge them together. The key is whether any given control
5162 element contained a bit set that indicates the second word. */
5163 mask = operands[3];
5164 vt = GEN_INT (w);
5165 if (maskmode == V2DImode && !TARGET_SSE4_1)
5166 {
 5167	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
 5168	     more shuffle to convert the V2DI input mask into a V4SI
 5169	     input mask.  At that point the masking that
 5170	     ix86_expand_int_vcond performs will work as desired.  */
5171 rtx t3 = gen_reg_rtx (V4SImode);
5172 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5173 const0_rtx, const0_rtx,
5174 const2_rtx, const2_rtx));
5175 mask = t3;
5176 maskmode = V4SImode;
5177 e = w = 4;
5178 }
5179
5180 vt = gen_const_vec_duplicate (maskmode, vt);
5181 vt = force_reg (maskmode, vt);
5182 mask = expand_simple_binop (maskmode, AND, mask, vt,
5183 NULL_RTX, 0, OPTAB_DIRECT);
5184
5185 if (GET_MODE (target) != mode)
5186 target = gen_reg_rtx (mode);
5187 xops[0] = target;
5188 xops[1] = gen_lowpart (mode, t2);
5189 xops[2] = gen_lowpart (mode, t1);
5190 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5191 xops[4] = mask;
5192 xops[5] = vt;
5193 ok = ix86_expand_int_vcond (xops);
5194 gcc_assert (ok);
5195 if (target != operands[0])
5196 emit_move_insn (operands[0],
5197 gen_lowpart (GET_MODE (operands[0]), target));
5198 }
5199}
5200
 5201/* Unpack SRC into DEST, the next wider integer vector type.  UNSIGNED_P is
5202 true if we should do zero extension, else sign extension. HIGH_P is
5203 true if we want the N/2 high elements, else the low elements. */
5204
5205void
5206ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5207{
5208 machine_mode imode = GET_MODE (src);
5209 rtx tmp;
5210
5211 if (TARGET_SSE4_1)
5212 {
5213 rtx (*unpack)(rtx, rtx);
5214 rtx (*extract)(rtx, rtx) = NULL;
5215 machine_mode halfmode = BLKmode;
5216
5217 switch (imode)
5218 {
5219 case E_V64QImode:
5220 if (unsigned_p)
5221 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5222 else
5223 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5224 halfmode = V32QImode;
5225 extract
5226 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5227 break;
5228 case E_V32QImode:
5229 if (unsigned_p)
5230 unpack = gen_avx2_zero_extendv16qiv16hi2;
5231 else
5232 unpack = gen_avx2_sign_extendv16qiv16hi2;
5233 halfmode = V16QImode;
5234 extract
5235 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5236 break;
5237 case E_V32HImode:
5238 if (unsigned_p)
5239 unpack = gen_avx512f_zero_extendv16hiv16si2;
5240 else
5241 unpack = gen_avx512f_sign_extendv16hiv16si2;
5242 halfmode = V16HImode;
5243 extract
5244 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5245 break;
5246 case E_V16HImode:
5247 if (unsigned_p)
5248 unpack = gen_avx2_zero_extendv8hiv8si2;
5249 else
5250 unpack = gen_avx2_sign_extendv8hiv8si2;
5251 halfmode = V8HImode;
5252 extract
5253 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5254 break;
5255 case E_V16SImode:
5256 if (unsigned_p)
5257 unpack = gen_avx512f_zero_extendv8siv8di2;
5258 else
5259 unpack = gen_avx512f_sign_extendv8siv8di2;
5260 halfmode = V8SImode;
5261 extract
5262 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5263 break;
5264 case E_V8SImode:
5265 if (unsigned_p)
5266 unpack = gen_avx2_zero_extendv4siv4di2;
5267 else
5268 unpack = gen_avx2_sign_extendv4siv4di2;
5269 halfmode = V4SImode;
5270 extract
5271 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5272 break;
5273 case E_V16QImode:
5274 if (unsigned_p)
5275 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5276 else
5277 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5278 break;
5279 case E_V8HImode:
5280 if (unsigned_p)
5281 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5282 else
5283 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5284 break;
5285 case E_V4SImode:
5286 if (unsigned_p)
5287 unpack = gen_sse4_1_zero_extendv2siv2di2;
5288 else
5289 unpack = gen_sse4_1_sign_extendv2siv2di2;
5290 break;
5291 case E_V8QImode:
5292 if (unsigned_p)
5293 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5294 else
5295 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5296 break;
5297 case E_V4HImode:
5298 if (unsigned_p)
5299 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5300 else
5301 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5302 break;
5303 case E_V4QImode:
5304 if (unsigned_p)
5305 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5306 else
5307 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5308 break;
5309 default:
5310 gcc_unreachable ();
5311 }
5312
5313 if (GET_MODE_SIZE (imode) >= 32)
5314 {
5315 tmp = gen_reg_rtx (halfmode);
5316 emit_insn (extract (tmp, src));
5317 }
5318 else if (high_p)
5319 {
5320 switch (GET_MODE_SIZE (imode))
5321 {
5322 case 16:
5323 /* Shift higher 8 bytes to lower 8 bytes. */
5324 tmp = gen_reg_rtx (V1TImode);
5325 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5326 GEN_INT (64)));
5327 break;
5328 case 8:
5329 /* Shift higher 4 bytes to lower 4 bytes. */
5330 tmp = gen_reg_rtx (V1DImode);
5331 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5332 GEN_INT (32)));
5333 break;
5334 case 4:
5335 /* Shift higher 2 bytes to lower 2 bytes. */
5336 tmp = gen_reg_rtx (V1SImode);
5337 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5338 GEN_INT (16)));
5339 break;
5340 default:
5341 gcc_unreachable ();
5342 }
5343
5344 tmp = gen_lowpart (imode, tmp);
5345 }
5346 else
5347 tmp = src;
5348
5349 emit_insn (unpack (dest, tmp));
5350 }
5351 else
5352 {
5353 rtx (*unpack)(rtx, rtx, rtx);
5354
5355 switch (imode)
5356 {
5357 case E_V16QImode:
5358 if (high_p)
5359 unpack = gen_vec_interleave_highv16qi;
5360 else
5361 unpack = gen_vec_interleave_lowv16qi;
5362 break;
5363 case E_V8HImode:
5364 if (high_p)
5365 unpack = gen_vec_interleave_highv8hi;
5366 else
5367 unpack = gen_vec_interleave_lowv8hi;
5368 break;
5369 case E_V4SImode:
5370 if (high_p)
5371 unpack = gen_vec_interleave_highv4si;
5372 else
5373 unpack = gen_vec_interleave_lowv4si;
5374 break;
5375 case E_V8QImode:
5376 if (high_p)
5377 unpack = gen_mmx_punpckhbw;
5378 else
5379 unpack = gen_mmx_punpcklbw;
5380 break;
5381 case E_V4HImode:
5382 if (high_p)
5383 unpack = gen_mmx_punpckhwd;
5384 else
5385 unpack = gen_mmx_punpcklwd;
5386 break;
5387 case E_V4QImode:
5388 if (high_p)
5389 unpack = gen_mmx_punpckhbw_low;
5390 else
5391 unpack = gen_mmx_punpcklbw_low;
5392 break;
5393 default:
5394 gcc_unreachable ();
5395 }
5396
5397 if (unsigned_p)
5398 tmp = force_reg (imode, CONST0_RTX (imode));
5399 else
5400 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5401 src, pc_rtx, pc_rtx);
5402
5403 rtx tmp2 = gen_reg_rtx (imode);
5404 emit_insn (unpack (tmp2, src, tmp));
5405 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5406 }
5407}
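/* Illustrative example: without SSE4.1, sign-extending the low half of a
   V8HImode vector is done roughly as

     sign = (0 > src);			(pcmpgtw: all-ones where negative)
     dest = interleave_low (src, sign);	(punpcklwd)

   pairing each 16-bit element with a copy of its sign mask to form the
   V4SImode result; for zero extension the second operand is simply a
   zero vector.  */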
5408
 5409/* Return true if MEM is a constant-pool reference containing a CONST_VECTOR
 5410   permutation index; if so, store the index into PERM.  */
5411bool
5412ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5413{
5414 machine_mode mode = GET_MODE (mem);
5415 int nelt = GET_MODE_NUNITS (mode);
5416
5417 if (!INTEGRAL_MODE_P (mode))
5418 return false;
5419
5420 /* Needs to be constant pool. */
5421 if (!(MEM_P (mem))
5422 || !SYMBOL_REF_P (XEXP (mem, 0))
5423 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5424 return false;
5425
5426 rtx constant = get_pool_constant (XEXP (mem, 0));
5427
5428 if (GET_CODE (constant) != CONST_VECTOR)
5429 return false;
5430
5431 /* There could be some rtx like
5432 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
 5433     but with "*.LC1" referring to a V2DI constant vector.  */
5434 if (GET_MODE (constant) != mode)
5435 {
5436 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5437
5438 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5439 return false;
5440 }
5441
5442 for (int i = 0; i != nelt; i++)
5443 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5444
5445 return true;
5446}
5447
5448/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
 5449   but works for floating point parameters and non-offsettable memories.
 5450   For pushes, it returns just stack offsets; the values will be saved
 5451   in the right order.  At most four parts are generated.  */
5452
5453static int
5454ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5455{
5456 int size;
5457
5458 if (!TARGET_64BIT)
5459 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5460 else
5461 size = (GET_MODE_SIZE (mode) + 4) / 8;
5462
5463 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5464 gcc_assert (size >= 2 && size <= 4);
5465
 5466  /* Optimize constant pool reference to immediates.  This is used by fp
 5467     moves, which force all constants to memory to allow combining.  */
5468 if (MEM_P (operand) && MEM_READONLY_P (operand))
5469 operand = avoid_constant_pool_reference (operand);
5470
5471 if (MEM_P (operand) && !offsettable_memref_p (operand))
5472 {
 5473      /* The only non-offsettable memories we handle are pushes.  */
5474 int ok = push_operand (operand, VOIDmode);
5475
5476 gcc_assert (ok);
5477
5478 operand = copy_rtx (operand);
5479 PUT_MODE (operand, word_mode);
5480 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5481 return size;
5482 }
5483
5484 if (GET_CODE (operand) == CONST_VECTOR)
5485 {
5486 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5487 /* Caution: if we looked through a constant pool memory above,
5488 the operand may actually have a different mode now. That's
5489 ok, since we want to pun this all the way back to an integer. */
5490 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5491 gcc_assert (operand != NULL);
5492 mode = imode;
5493 }
5494
5495 if (!TARGET_64BIT)
5496 {
5497 if (mode == DImode)
5498 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5499 else
5500 {
5501 int i;
5502
5503 if (REG_P (operand))
5504 {
5505 gcc_assert (reload_completed);
5506 for (i = 0; i < size; i++)
5507 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5508 }
5509 else if (offsettable_memref_p (operand))
5510 {
5511 operand = adjust_address (operand, SImode, 0);
5512 parts[0] = operand;
5513 for (i = 1; i < size; i++)
5514 parts[i] = adjust_address (operand, SImode, 4 * i);
5515 }
5516 else if (CONST_DOUBLE_P (operand))
5517 {
5518 const REAL_VALUE_TYPE *r;
5519 long l[4];
5520
5521 r = CONST_DOUBLE_REAL_VALUE (operand);
5522 switch (mode)
5523 {
5524 case E_TFmode:
5525 real_to_target (l, r, mode);
5526 parts[3] = gen_int_mode (l[3], SImode);
5527 parts[2] = gen_int_mode (l[2], SImode);
5528 break;
5529 case E_XFmode:
5530 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5531 long double may not be 80-bit. */
5532 real_to_target (l, r, mode);
5533 parts[2] = gen_int_mode (l[2], SImode);
5534 break;
5535 case E_DFmode:
5536 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5537 break;
5538 default:
5539 gcc_unreachable ();
5540 }
5541 parts[1] = gen_int_mode (l[1], SImode);
5542 parts[0] = gen_int_mode (l[0], SImode);
5543 }
5544 else
5545 gcc_unreachable ();
5546 }
5547 }
5548 else
5549 {
5550 if (mode == TImode)
5551 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5552 if (mode == XFmode || mode == TFmode)
5553 {
5554 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5555 if (REG_P (operand))
5556 {
5557 gcc_assert (reload_completed);
5558 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5559 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5560 }
5561 else if (offsettable_memref_p (operand))
5562 {
5563 operand = adjust_address (operand, DImode, 0);
5564 parts[0] = operand;
5565 parts[1] = adjust_address (operand, upper_mode, 8);
5566 }
5567 else if (CONST_DOUBLE_P (operand))
5568 {
5569 long l[4];
5570
5571 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5572
5573 /* real_to_target puts 32-bit pieces in each long. */
5574 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5575 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5576 << 32), DImode);
5577
5578 if (upper_mode == SImode)
5579 parts[1] = gen_int_mode (l[2], SImode);
5580 else
5581 parts[1]
5582 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5583 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5584 << 32), DImode);
5585 }
5586 else
5587 gcc_unreachable ();
5588 }
5589 }
5590
5591 return size;
5592}
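/* Illustrative example: with !TARGET_64BIT, an offsettable XFmode memory
   operand OP is returned as three SImode parts

     parts[0] = (mem:SI OP+0)
     parts[1] = (mem:SI OP+4)
     parts[2] = (mem:SI OP+8)

   with a return value of 3, while a DFmode constant is returned as two
   SImode immediates holding its target image.  */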
5593
5594/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
 5595   All required insns are emitted here (the function returns no value).
 5596   Operands 2-5 are used to hold the destination parts in the correct
 5597   order; operands 6-9 hold the corresponding source parts.  */
5598
5599void
5600ix86_split_long_move (rtx operands[])
5601{
5602 rtx part[2][4];
5603 int nparts, i, j;
5604 int push = 0;
5605 int collisions = 0;
5606 machine_mode mode = GET_MODE (operands[0]);
5607 bool collisionparts[4];
5608
 5609  /* The DFmode expanders may ask us to move a double.
 5610     For a 64-bit target this is a single move.  By hiding the fact
 5611     here we simplify the i386.md splitters.  */
5612 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5613 {
5614 /* Optimize constant pool reference to immediates. This is used by
5615 fp moves, that force all constants to memory to allow combining. */
5616
5617 if (MEM_P (operands[1])
5618 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5619 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5620 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5621 if (push_operand (operands[0], VOIDmode))
5622 {
5623 operands[0] = copy_rtx (operands[0]);
5624 PUT_MODE (operands[0], word_mode);
5625 }
5626 else
5627 operands[0] = gen_lowpart (DImode, operands[0]);
5628 operands[1] = gen_lowpart (DImode, operands[1]);
5629 emit_move_insn (operands[0], operands[1]);
5630 return;
5631 }
5632
5633 /* The only non-offsettable memory we handle is push. */
5634 if (push_operand (operands[0], VOIDmode))
5635 push = 1;
5636 else
5637 gcc_assert (!MEM_P (operands[0])
5638 || offsettable_memref_p (operands[0]));
5639
5640 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5641 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5642
 5643  /* When emitting a push, watch out for source operands on the stack.  */
5644 if (push && MEM_P (operands[1])
5645 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5646 {
5647 rtx src_base = XEXP (part[1][nparts - 1], 0);
5648
5649 /* Compensate for the stack decrement by 4. */
5650 if (!TARGET_64BIT && nparts == 3
5651 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5652 src_base = plus_constant (Pmode, src_base, 4);
5653
5654 /* src_base refers to the stack pointer and is
5655 automatically decreased by emitted push. */
5656 for (i = 0; i < nparts; i++)
5657 part[1][i] = change_address (part[1][i],
5658 GET_MODE (part[1][i]), src_base);
5659 }
5660
5661 /* We need to do copy in the right order in case an address register
5662 of the source overlaps the destination. */
5663 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5664 {
5665 rtx tmp;
5666
5667 for (i = 0; i < nparts; i++)
5668 {
5669 collisionparts[i]
5670 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5671 if (collisionparts[i])
5672 collisions++;
5673 }
5674
5675 /* Collision in the middle part can be handled by reordering. */
5676 if (collisions == 1 && nparts == 3 && collisionparts [1])
5677 {
5678 std::swap (part[0][1], part[0][2]);
5679 std::swap (part[1][1], part[1][2]);
5680 }
5681 else if (collisions == 1
5682 && nparts == 4
5683 && (collisionparts [1] || collisionparts [2]))
5684 {
5685 if (collisionparts [1])
5686 {
5687 std::swap (part[0][1], part[0][2]);
5688 std::swap (part[1][1], part[1][2]);
5689 }
5690 else
5691 {
5692 std::swap (part[0][2], part[0][3]);
5693 std::swap (part[1][2], part[1][3]);
5694 }
5695 }
5696
 5697	 /* If there are more collisions, we can't handle them by reordering.
 5698	    Do an lea to the last part and use only one colliding move.  */
5699 else if (collisions > 1)
5700 {
5701 rtx base, addr;
5702
5703 collisions = 1;
5704
5705 base = part[0][nparts - 1];
5706
5707 /* Handle the case when the last part isn't valid for lea.
5708 Happens in 64-bit mode storing the 12-byte XFmode. */
5709 if (GET_MODE (base) != Pmode)
5710 base = gen_rtx_REG (Pmode, REGNO (base));
5711
5712 addr = XEXP (part[1][0], 0);
5713 if (TARGET_TLS_DIRECT_SEG_REFS)
5714 {
5715 struct ix86_address parts;
5716 int ok = ix86_decompose_address (addr, &parts);
5717 gcc_assert (ok);
5718 /* It is not valid to use %gs: or %fs: in lea. */
5719 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5720 }
5721 emit_insn (gen_rtx_SET (base, addr));
5722 part[1][0] = replace_equiv_address (part[1][0], base);
5723 for (i = 1; i < nparts; i++)
5724 {
5725 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5726 part[1][i] = replace_equiv_address (part[1][i], tmp);
5727 }
5728 }
5729 }
5730
5731 if (push)
5732 {
5733 if (!TARGET_64BIT)
5734 {
5735 if (nparts == 3)
5736 {
5737 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
 5738	    emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5739 emit_move_insn (part[0][2], part[1][2]);
5740 }
5741 else if (nparts == 4)
5742 {
5743 emit_move_insn (part[0][3], part[1][3]);
5744 emit_move_insn (part[0][2], part[1][2]);
5745 }
5746 }
5747 else
5748 {
 5749	  /* In 64-bit mode we don't have a 32-bit push available.  If this is
 5750	     a register, that is OK - we will just use the larger counterpart.
 5751	     We also retype memory - this comes from an attempt to avoid a REX
 5752	     prefix on moving the second half of a TFmode value.  */
5753 if (GET_MODE (part[1][1]) == SImode)
5754 {
5755 switch (GET_CODE (part[1][1]))
5756 {
5757 case MEM:
5758 part[1][1] = adjust_address (part[1][1], DImode, 0);
5759 break;
5760
5761 case REG:
5762 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5763 break;
5764
5765 default:
5766 gcc_unreachable ();
5767 }
5768
5769 if (GET_MODE (part[1][0]) == SImode)
5770 part[1][0] = part[1][1];
5771 }
5772 }
5773 emit_move_insn (part[0][1], part[1][1]);
5774 emit_move_insn (part[0][0], part[1][0]);
5775 return;
5776 }
5777
 5778  /* Choose the correct order so we do not overwrite the source before it is copied.  */
5779 if ((REG_P (part[0][0])
5780 && REG_P (part[1][1])
5781 && (REGNO (part[0][0]) == REGNO (part[1][1])
5782 || (nparts == 3
5783 && REGNO (part[0][0]) == REGNO (part[1][2]))
5784 || (nparts == 4
5785 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5786 || (collisions > 0
5787 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5788 {
5789 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5790 {
5791 operands[2 + i] = part[0][j];
5792 operands[6 + i] = part[1][j];
5793 }
5794 }
5795 else
5796 {
5797 for (i = 0; i < nparts; i++)
5798 {
5799 operands[2 + i] = part[0][i];
5800 operands[6 + i] = part[1][i];
5801 }
5802 }
5803
5804 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5805 if (optimize_insn_for_size_p ())
5806 {
5807 for (j = 0; j < nparts - 1; j++)
5808 if (CONST_INT_P (operands[6 + j])
5809 && operands[6 + j] != const0_rtx
5810 && REG_P (operands[2 + j]))
5811 for (i = j; i < nparts - 1; i++)
5812 if (CONST_INT_P (operands[7 + i])
5813 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5814 operands[7 + i] = operands[2 + j];
5815 }
5816
5817 for (i = 0; i < nparts; i++)
5818 emit_move_insn (operands[2 + i], operands[6 + i]);
5819
5820 return;
5821}
5822
5823/* Helper function of ix86_split_ashl used to generate an SImode/DImode
5824 left shift by a constant, either using a single shift or
5825 a sequence of add instructions. */
5826
5827static void
5828ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5829{
5830 if (count == 1
5831 || (count * ix86_cost->add <= ix86_cost->shift_const
5832 && !optimize_insn_for_size_p ()))
5833 {
 5834      while (count-- > 0)
 5835	emit_insn (gen_add2_insn (operand, operand));
5836 }
5837 else
5838 {
5839 rtx (*insn)(rtx, rtx, rtx);
5840
5841 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5842 emit_insn (insn (operand, operand, GEN_INT (count)));
5843 }
5844}
5845
5846void
5847ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5848{
5849 rtx (*gen_ashl3)(rtx, rtx, rtx);
5850 rtx (*gen_shld)(rtx, rtx, rtx);
5851 int half_width = GET_MODE_BITSIZE (mode) >> 1;
 5852  machine_mode half_mode;
5853
5854 rtx low[2], high[2];
5855 int count;
5856
5857 if (CONST_INT_P (operands[2]))
5858 {
5859 split_double_mode (mode, operands, 2, low, high);
5860 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5861
5862 if (count >= half_width)
5863 {
5864 emit_move_insn (high[0], low[1]);
5865 emit_move_insn (low[0], const0_rtx);
5866
5867 if (count > half_width)
5868 ix86_expand_ashl_const (high[0], count - half_width, mode);
5869 }
5870 else
5871 {
5872 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5873
5874 if (!rtx_equal_p (operands[0], operands[1]))
5875 emit_move_insn (operands[0], operands[1]);
5876
5877 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5878 ix86_expand_ashl_const (low[0], count, mode);
5879 }
5880 return;
5881 }
5882
5883 split_double_mode (mode, operands, 1, low, high);
 5884  half_mode = mode == DImode ? SImode : DImode;
5885
5886 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5887
5888 if (operands[1] == const1_rtx)
5889 {
 5890	  /* Assuming we've chosen QImode-capable registers, then 1 << N
 5891	     can be done with two 32/64-bit shifts, no branches, no cmoves. */
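	  /* Concretely, for a DImode shift split into SImode halves this
	     computes low = ((N & 32) == 0) and high = ((N & 32) != 0) with
	     test/sete/setne, then shifts both halves left by N; the
	     hardware masks the variable shift count, so the half holding 1
	     ends up with 1 << (N & 31) while the other half stays zero,
	     which together form 1 << N.  */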
5892 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5893 {
5894 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5895
5896 ix86_expand_clear (low[0]);
5897 ix86_expand_clear (high[0]);
5898 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5899
5900 d = gen_lowpart (QImode, low[0]);
5901 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5902 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5903 emit_insn (gen_rtx_SET (d, s));
5904
5905 d = gen_lowpart (QImode, high[0]);
5906 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5907 s = gen_rtx_NE (QImode, flags, const0_rtx);
5908 emit_insn (gen_rtx_SET (d, s));
5909 }
5910
5911 /* Otherwise, we can get the same results by manually performing
5912 a bit extract operation on bit 5/6, and then performing the two
5913 shifts. The two methods of getting 0/1 into low/high are exactly
5914 the same size. Avoiding the shift in the bit extract case helps
5915 pentium4 a bit; no one else seems to care much either way. */
5916 else
5917 {
5918 rtx (*gen_lshr3)(rtx, rtx, rtx);
5919 rtx (*gen_and3)(rtx, rtx, rtx);
5920 rtx (*gen_xor3)(rtx, rtx, rtx);
5921 HOST_WIDE_INT bits;
5922 rtx x;
5923
5924 if (mode == DImode)
5925 {
5926 gen_lshr3 = gen_lshrsi3;
5927 gen_and3 = gen_andsi3;
5928 gen_xor3 = gen_xorsi3;
5929 bits = 5;
5930 }
5931 else
5932 {
5933 gen_lshr3 = gen_lshrdi3;
5934 gen_and3 = gen_anddi3;
5935 gen_xor3 = gen_xordi3;
5936 bits = 6;
5937 }
5938
5939 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5940 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5941 else
5942 x = gen_lowpart (half_mode, operands[2]);
5943 emit_insn (gen_rtx_SET (high[0], x));
5944
5945 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5946 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5947 emit_move_insn (low[0], high[0]);
5948 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5949 }
5950
5951 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5952 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5953 return;
5954 }
5955
5956 if (operands[1] == constm1_rtx)
5957 {
5958 /* For -1 << N, we can avoid the shld instruction, because we
5959 know that we're shifting 0...31/63 ones into a -1. */
5960 emit_move_insn (low[0], constm1_rtx);
5961 if (optimize_insn_for_size_p ())
5962 emit_move_insn (high[0], low[0]);
5963 else
5964 emit_move_insn (high[0], constm1_rtx);
5965 }
5966 else
5967 {
5968 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5969
5970 if (!rtx_equal_p (operands[0], operands[1]))
5971 emit_move_insn (operands[0], operands[1]);
5972
5973 split_double_mode (mode, operands, 1, low, high);
5974 emit_insn (gen_shld (high[0], low[0], operands[2]));
5975 }
5976
5977 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5978
5979 if (TARGET_CMOVE && scratch)
5980 {
2bf6d935 5981 ix86_expand_clear (scratch);
5982 emit_insn (gen_x86_shift_adj_1
5983 (half_mode, high[0], low[0], operands[2], scratch));
5984 }
5985 else
987a3082 5986 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5987}
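/* For a variable-count DImode shift on a 32-bit target the sequence built
   above is roughly

	shldl %cl, %eax, %edx	; high = high:low << (count & 31)
	sall  %cl, %eax		; low <<= (count & 31)

   followed by a fixup that, when the count has the half-width bit set,
   moves low into high and clears low, using conditional moves
   (gen_x86_shift_adj_1) when available and a branch (gen_x86_shift_adj_2)
   otherwise.  */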
5988
5989void
5990ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5991{
5992 rtx (*gen_ashr3)(rtx, rtx, rtx)
5993 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5994 rtx (*gen_shrd)(rtx, rtx, rtx);
5995 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5996
5997 rtx low[2], high[2];
5998 int count;
5999
6000 if (CONST_INT_P (operands[2]))
6001 {
6002 split_double_mode (mode, operands, 2, low, high);
6003 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6004
6005 if (count == GET_MODE_BITSIZE (mode) - 1)
6006 {
6007 emit_move_insn (high[0], high[1]);
6008 emit_insn (gen_ashr3 (high[0], high[0],
6009 GEN_INT (half_width - 1)));
6010 emit_move_insn (low[0], high[0]);
6011
6012 }
6013 else if (count >= half_width)
6014 {
6015 emit_move_insn (low[0], high[1]);
6016 emit_move_insn (high[0], low[0]);
6017 emit_insn (gen_ashr3 (high[0], high[0],
6018 GEN_INT (half_width - 1)));
6019
6020 if (count > half_width)
6021 emit_insn (gen_ashr3 (low[0], low[0],
6022 GEN_INT (count - half_width)));
6023 }
6024 else
6025 {
6026 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6027
6028 if (!rtx_equal_p (operands[0], operands[1]))
6029 emit_move_insn (operands[0], operands[1]);
6030
6031 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6032 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6033 }
6034 }
6035 else
6036 {
6037 machine_mode half_mode;
6038
6039 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6040
6041 if (!rtx_equal_p (operands[0], operands[1]))
6042 emit_move_insn (operands[0], operands[1]);
6043
6044 split_double_mode (mode, operands, 1, low, high);
987a3082 6045 half_mode = mode == DImode ? SImode : DImode;
6046
6047 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6048 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6049
6050 if (TARGET_CMOVE && scratch)
6051 {
6052 emit_move_insn (scratch, high[0]);
6053 emit_insn (gen_ashr3 (scratch, scratch,
6054 GEN_INT (half_width - 1)));
6055 emit_insn (gen_x86_shift_adj_1
6056 (half_mode, low[0], high[0], operands[2], scratch));
6057 }
6058 else
6059 emit_insn (gen_x86_shift_adj_3
6060 (half_mode, low[0], high[0], operands[2]));
6061 }
6062}
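/* The variable-count arithmetic right shift above mirrors the left-shift
   case: shrd fills the low half from the high half, sar shifts the high
   half, and when the count has the half-width bit set a fixup moves the
   old high half into low and replaces high with a copy of the sign.  With
   cmov the sign copy is precomputed in SCRATCH as high >> (half_width - 1);
   without cmov a branch performs the same adjustment.  */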
6063
6064void
6065ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6066{
6067 rtx (*gen_lshr3)(rtx, rtx, rtx)
6068 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6069 rtx (*gen_shrd)(rtx, rtx, rtx);
6070 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6071
6072 rtx low[2], high[2];
6073 int count;
6074
6075 if (CONST_INT_P (operands[2]))
6076 {
6077 split_double_mode (mode, operands, 2, low, high);
6078 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6079
6080 if (count >= half_width)
6081 {
6082 emit_move_insn (low[0], high[1]);
6083 ix86_expand_clear (high[0]);
6084
6085 if (count > half_width)
6086 emit_insn (gen_lshr3 (low[0], low[0],
6087 GEN_INT (count - half_width)));
6088 }
6089 else
6090 {
6091 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6092
6093 if (!rtx_equal_p (operands[0], operands[1]))
6094 emit_move_insn (operands[0], operands[1]);
6095
6096 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6097 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6098 }
6099 }
6100 else
6101 {
6102 machine_mode half_mode;
6103
6104 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6105
6106 if (!rtx_equal_p (operands[0], operands[1]))
6107 emit_move_insn (operands[0], operands[1]);
6108
6109 split_double_mode (mode, operands, 1, low, high);
987a3082 6110 half_mode = mode == DImode ? SImode : DImode;
6111
6112 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6113 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6114
6115 if (TARGET_CMOVE && scratch)
6116 {
2bf6d935 6117 ix86_expand_clear (scratch);
6118 emit_insn (gen_x86_shift_adj_1
6119 (half_mode, low[0], high[0], operands[2], scratch));
6120 }
6121 else
6122 emit_insn (gen_x86_shift_adj_2
6123 (half_mode, low[0], high[0], operands[2]));
6124 }
6125}
6126
6127/* Return mode for the memcpy/memset loop counter. Prefer SImode over
6128 DImode for constant loop counts. */
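/* For example, a constant count of 0x100000000 does not fit in 32 bits and
   therefore gets a DImode counter on 64-bit targets, while any constant
   that fits uses the cheaper SImode; non-constant counts simply keep the
   mode they already have (or Pmode if they have none).  */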
6129
6130static machine_mode
6131counter_mode (rtx count_exp)
6132{
6133 if (GET_MODE (count_exp) != VOIDmode)
6134 return GET_MODE (count_exp);
6135 if (!CONST_INT_P (count_exp))
6136 return Pmode;
6137 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6138 return DImode;
6139 return SImode;
6140}
6141
6142/* When ISSETMEM is FALSE, output a simple loop that moves the memory pointed
6143 to by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
6144 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
6145 equivalent loop that sets memory to VALUE (supposed to be in MODE).
6146
6147 The size is rounded down to a whole number of chunks moved at once.
6148 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
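/* The generated loop corresponds roughly to this pseudocode, where PIECE is
   GET_MODE_SIZE (MODE) * UNROLL:

	size = count & ~(piece - 1);
	for (iter = 0; iter < size; iter += piece)
	  copy (or, for setmem, store VALUE into) UNROLL chunks of MODE
	    at DESTPTR + iter, reading from SRCPTR + iter for copies;
	destptr += iter;
	srcptr += iter;		(only for the copy case)

   Any tail smaller than one PIECE is left for the caller's epilogue.  */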
6149
6150
6151static void
76715c32 6152expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
6153 rtx destptr, rtx srcptr, rtx value,
6154 rtx count, machine_mode mode, int unroll,
6155 int expected_size, bool issetmem)
6156{
6157 rtx_code_label *out_label, *top_label;
6158 rtx iter, tmp;
6159 machine_mode iter_mode = counter_mode (count);
6160 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
6161 rtx piece_size = GEN_INT (piece_size_n);
6162 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
6163 rtx size;
6164 int i;
6165
6166 top_label = gen_label_rtx ();
6167 out_label = gen_label_rtx ();
6168 iter = gen_reg_rtx (iter_mode);
6169
6170 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
6171 NULL, 1, OPTAB_DIRECT);
6172 /* Those two should combine. */
6173 if (piece_size == const1_rtx)
6174 {
6175 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
6176 true, out_label);
6177 predict_jump (REG_BR_PROB_BASE * 10 / 100);
6178 }
6179 emit_move_insn (iter, const0_rtx);
6180
6181 emit_label (top_label);
6182
6183 tmp = convert_modes (Pmode, iter_mode, iter, true);
6184
6185 /* This assert could be relaxed - in that case we'd need to compute
6186 the smallest power of two containing PIECE_SIZE_N and pass it to
6187 offset_address. */
6188 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
6189 destmem = offset_address (destmem, tmp, piece_size_n);
6190 destmem = adjust_address (destmem, mode, 0);
6191
6192 if (!issetmem)
6193 {
6194 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
6195 srcmem = adjust_address (srcmem, mode, 0);
6196
6197 /* When unrolling for chips that reorder memory reads and writes,
6198 we can save registers by using a single temporary.
6199 Also, using 4 temporaries is overkill in 32-bit mode. */
6200 if (!TARGET_64BIT && 0)
6201 {
6202 for (i = 0; i < unroll; i++)
6203 {
6204 if (i)
6205 {
6206 destmem = adjust_address (copy_rtx (destmem), mode,
6207 GET_MODE_SIZE (mode));
6208 srcmem = adjust_address (copy_rtx (srcmem), mode,
6209 GET_MODE_SIZE (mode));
6210 }
6211 emit_move_insn (destmem, srcmem);
6212 }
6213 }
6214 else
6215 {
6216 rtx tmpreg[4];
6217 gcc_assert (unroll <= 4);
6218 for (i = 0; i < unroll; i++)
6219 {
6220 tmpreg[i] = gen_reg_rtx (mode);
6221 if (i)
6222 srcmem = adjust_address (copy_rtx (srcmem), mode,
6223 GET_MODE_SIZE (mode));
6224 emit_move_insn (tmpreg[i], srcmem);
6225 }
6226 for (i = 0; i < unroll; i++)
6227 {
6228 if (i)
6229 destmem = adjust_address (copy_rtx (destmem), mode,
6230 GET_MODE_SIZE (mode));
6231 emit_move_insn (destmem, tmpreg[i]);
6232 }
6233 }
6234 }
6235 else
6236 for (i = 0; i < unroll; i++)
6237 {
6238 if (i)
6239 destmem = adjust_address (copy_rtx (destmem), mode,
6240 GET_MODE_SIZE (mode));
6241 emit_move_insn (destmem, value);
6242 }
6243
6244 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
6245 true, OPTAB_LIB_WIDEN);
6246 if (tmp != iter)
6247 emit_move_insn (iter, tmp);
6248
6249 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
6250 true, top_label);
6251 if (expected_size != -1)
6252 {
6253 expected_size /= GET_MODE_SIZE (mode) * unroll;
6254 if (expected_size == 0)
6255 predict_jump (0);
6256 else if (expected_size > REG_BR_PROB_BASE)
6257 predict_jump (REG_BR_PROB_BASE - 1);
6258 else
6259 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
6260 / expected_size);
6261 }
6262 else
6263 predict_jump (REG_BR_PROB_BASE * 80 / 100);
6264 iter = ix86_zero_extend_to_Pmode (iter);
6265 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
6266 true, OPTAB_LIB_WIDEN);
6267 if (tmp != destptr)
6268 emit_move_insn (destptr, tmp);
6269 if (!issetmem)
6270 {
6271 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
6272 true, OPTAB_LIB_WIDEN);
6273 if (tmp != srcptr)
6274 emit_move_insn (srcptr, tmp);
6275 }
6276 emit_label (out_label);
6277}
6278
6279/* Divide COUNTREG by SCALE. */
6280static rtx
6281scale_counter (rtx countreg, int scale)
6282{
6283 rtx sc;
6284
6285 if (scale == 1)
6286 return countreg;
6287 if (CONST_INT_P (countreg))
6288 return GEN_INT (INTVAL (countreg) / scale);
6289 gcc_assert (REG_P (countreg));
6290
6291 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
6292 GEN_INT (exact_log2 (scale)),
6293 NULL, 1, OPTAB_DIRECT);
6294 return sc;
6295}
6296
6297/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
6298 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
6299 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
6300 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
6301 ORIG_VALUE is the original value passed to memset to fill the memory with.
6302 Other arguments have the same meaning as for the previous function. */
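/* Example: a memset of 64 known zero bytes reaches this function with
   MODE == QImode; on targets that do not prefer known-size rep stosb, the
   code below promotes it to SImode, so roughly "mov $16, %ecx; rep stosl"
   is emitted instead of 64 byte stores, and DESTEXP encodes the final
   pointer value destptr + 4 * countreg for the rep pattern.  */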
6303
6304static void
76715c32 6305expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
6306 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
6307 rtx count,
6308 machine_mode mode, bool issetmem)
6309{
6310 rtx destexp;
6311 rtx srcexp;
6312 rtx countreg;
6313 HOST_WIDE_INT rounded_count;
6314
6315 /* If possible, it is shorter to use rep movs.
6316 TODO: Maybe it is better to move this logic to decide_alg. */
6317 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
bf24f4ec 6318 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
6319 && (!issetmem || orig_value == const0_rtx))
6320 mode = SImode;
6321
6322 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
6323 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
6324
6325 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
6326 GET_MODE_SIZE (mode)));
6327 if (mode != QImode)
6328 {
6329 destexp = gen_rtx_ASHIFT (Pmode, countreg,
6330 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6331 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
6332 }
6333 else
6334 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
6335 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
6336 {
6337 rounded_count
6338 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6339 destmem = shallow_copy_rtx (destmem);
6340 set_mem_size (destmem, rounded_count);
6341 }
6342 else if (MEM_SIZE_KNOWN_P (destmem))
6343 clear_mem_size (destmem);
6344
6345 if (issetmem)
6346 {
6347 value = force_reg (mode, gen_lowpart (mode, value));
6348 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6349 }
6350 else
6351 {
6352 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6353 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6354 if (mode != QImode)
6355 {
6356 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6357 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6358 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6359 }
6360 else
6361 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6362 if (CONST_INT_P (count))
6363 {
6364 rounded_count
6365 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6366 srcmem = shallow_copy_rtx (srcmem);
6367 set_mem_size (srcmem, rounded_count);
6368 }
6369 else
6370 {
6371 if (MEM_SIZE_KNOWN_P (srcmem))
6372 clear_mem_size (srcmem);
6373 }
6374 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6375 destexp, srcexp));
6376 }
6377}
6378
6379/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6380 DESTMEM.
6381 SRCMEM is passed by pointer so it can be updated on return.
6382 The return value is the updated DESTMEM. */
6383static rtx
6384emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6385 HOST_WIDE_INT size_to_move)
6386{
c3185b64 6387 rtx dst = destmem, src = *srcmem, tempreg;
6388 enum insn_code code;
6389 machine_mode move_mode;
6390 int piece_size, i;
6391
6392 /* Find the widest mode in which we could perform moves.
6393 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6394 it until move of such size is supported. */
6395 piece_size = 1 << floor_log2 (size_to_move);
6396 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6397 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6398 {
6399 gcc_assert (piece_size > 1);
6400 piece_size >>= 1;
6401 }
6402
6403 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6404 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6405 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6406 {
6407 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6408 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6409 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6410 {
6411 move_mode = word_mode;
6412 piece_size = GET_MODE_SIZE (move_mode);
6413 code = optab_handler (mov_optab, move_mode);
6414 }
6415 }
6416 gcc_assert (code != CODE_FOR_nothing);
6417
6418 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6419 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6420
6421 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6422 gcc_assert (size_to_move % piece_size == 0);
c3185b64 6423
6424 for (i = 0; i < size_to_move; i += piece_size)
6425 {
6426 /* We move from memory to memory, so we'll need to do it via
6427 a temporary register. */
6428 tempreg = gen_reg_rtx (move_mode);
6429 emit_insn (GEN_FCN (code) (tempreg, src));
6430 emit_insn (GEN_FCN (code) (dst, tempreg));
6431
6432 emit_move_insn (destptr,
c3185b64 6433 plus_constant (Pmode, copy_rtx (destptr), piece_size));
2bf6d935 6434 emit_move_insn (srcptr,
c3185b64 6435 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6436
6437 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6438 piece_size);
6439 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6440 piece_size);
6441 }
6442
6443 /* Update DST and SRC rtx. */
6444 *srcmem = src;
6445 return dst;
6446}
6447
6448/* Helper function for the string operations below. Test whether VARIABLE
6449 is aligned to VALUE bytes. If it is, jump to the returned label. */
6450
6451static rtx_code_label *
6452ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6453{
6454 rtx_code_label *label = gen_label_rtx ();
6455 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6456 if (GET_MODE (variable) == DImode)
6457 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6458 else
6459 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6460 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6461 1, label);
6462 if (epilogue)
6463 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6464 else
6465 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6466 return label;
6467}
6468
6469
6470/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
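/* For a constant COUNT with MAX_SIZE == 16 and COUNT % 16 == 13, this emits
   one 8-byte, one 4-byte and one 1-byte move, i.e. the binary decomposition
   of the remainder; for a non-constant COUNT it falls back to bit tests
   around individual moves, or to a byte loop when MAX_SIZE > 8.  */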
6471
6472static void
76715c32 6473expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6474 rtx destptr, rtx srcptr, rtx count, int max_size)
6475{
6476 rtx src, dest;
6477 if (CONST_INT_P (count))
6478 {
6479 HOST_WIDE_INT countval = INTVAL (count);
6480 HOST_WIDE_INT epilogue_size = countval % max_size;
6481 int i;
6482
6483 /* For now MAX_SIZE should be a power of 2. This assert could be
6484 relaxed, but it'll require a bit more complicated epilogue
6485 expanding. */
6486 gcc_assert ((max_size & (max_size - 1)) == 0);
6487 for (i = max_size; i >= 1; i >>= 1)
6488 {
6489 if (epilogue_size & i)
6490 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6491 }
6492 return;
6493 }
6494 if (max_size > 8)
6495 {
6496 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6497 count, 1, OPTAB_DIRECT);
76715c32 6498 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6499 count, QImode, 1, 4, false);
6500 return;
6501 }
6502
6503 /* When single-instruction string ops are available, we can cheaply advance
6504 the dest and src pointers. Otherwise we save code size by maintaining an
6505 offset (zero is readily available from the preceding rep operation) and
6506 using x86 addressing modes.
6507 if (TARGET_SINGLE_STRINGOP)
6508 {
6509 if (max_size > 4)
6510 {
6511 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6512 src = change_address (srcmem, SImode, srcptr);
6513 dest = change_address (destmem, SImode, destptr);
6514 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6515 emit_label (label);
6516 LABEL_NUSES (label) = 1;
6517 }
6518 if (max_size > 2)
6519 {
6520 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6521 src = change_address (srcmem, HImode, srcptr);
6522 dest = change_address (destmem, HImode, destptr);
6523 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6524 emit_label (label);
6525 LABEL_NUSES (label) = 1;
6526 }
6527 if (max_size > 1)
6528 {
6529 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6530 src = change_address (srcmem, QImode, srcptr);
6531 dest = change_address (destmem, QImode, destptr);
6532 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6533 emit_label (label);
6534 LABEL_NUSES (label) = 1;
6535 }
6536 }
6537 else
6538 {
6539 rtx offset = force_reg (Pmode, const0_rtx);
6540 rtx tmp;
6541
6542 if (max_size > 4)
6543 {
6544 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6545 src = change_address (srcmem, SImode, srcptr);
6546 dest = change_address (destmem, SImode, destptr);
6547 emit_move_insn (dest, src);
6548 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6549 true, OPTAB_LIB_WIDEN);
6550 if (tmp != offset)
6551 emit_move_insn (offset, tmp);
6552 emit_label (label);
6553 LABEL_NUSES (label) = 1;
6554 }
6555 if (max_size > 2)
6556 {
6557 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6558 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6559 src = change_address (srcmem, HImode, tmp);
6560 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6561 dest = change_address (destmem, HImode, tmp);
6562 emit_move_insn (dest, src);
6563 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6564 true, OPTAB_LIB_WIDEN);
6565 if (tmp != offset)
6566 emit_move_insn (offset, tmp);
6567 emit_label (label);
6568 LABEL_NUSES (label) = 1;
6569 }
6570 if (max_size > 1)
6571 {
6572 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6573 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6574 src = change_address (srcmem, QImode, tmp);
6575 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6576 dest = change_address (destmem, QImode, tmp);
6577 emit_move_insn (dest, src);
6578 emit_label (label);
6579 LABEL_NUSES (label) = 1;
6580 }
6581 }
6582}
6583
6584/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6585 with value PROMOTED_VAL.
6586 DESTPTR is advanced as the stores are emitted.
6587 The return value is the updated DESTMEM. */
6588static rtx
6589emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6590 HOST_WIDE_INT size_to_move)
6591{
c3185b64 6592 rtx dst = destmem;
6593 enum insn_code code;
6594 machine_mode move_mode;
6595 int piece_size, i;
6596
6597 /* Find the widest mode in which we could perform moves.
6598 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6599 it until move of such size is supported. */
6600 move_mode = GET_MODE (promoted_val);
6601 if (move_mode == VOIDmode)
6602 move_mode = QImode;
6603 if (size_to_move < GET_MODE_SIZE (move_mode))
6604 {
6605 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6606 move_mode = int_mode_for_size (move_bits, 0).require ();
6607 promoted_val = gen_lowpart (move_mode, promoted_val);
6608 }
6609 piece_size = GET_MODE_SIZE (move_mode);
6610 code = optab_handler (mov_optab, move_mode);
6611 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6612
6613 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6614
6615 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6616 gcc_assert (size_to_move % piece_size == 0);
c3185b64 6617
6618 for (i = 0; i < size_to_move; i += piece_size)
6619 {
6620 if (piece_size <= GET_MODE_SIZE (word_mode))
6621 {
6622 emit_insn (gen_strset (destptr, dst, promoted_val));
6623 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6624 piece_size);
6625 continue;
6626 }
6627
6628 emit_insn (GEN_FCN (code) (dst, promoted_val));
6629
6630 emit_move_insn (destptr,
c3185b64 6631 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6632
6633 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6634 piece_size);
6635 }
6636
6637 /* Update DST rtx. */
6638 return dst;
6639}
6640/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6641static void
6642expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6643 rtx count, int max_size)
6644{
6645 count = expand_simple_binop (counter_mode (count), AND, count,
6646 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
76715c32 6647 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6648 gen_lowpart (QImode, value), count, QImode,
6649 1, max_size / 2, true);
6650}
6651
6652/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6653static void
6654expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6655 rtx count, int max_size)
6656{
6657 rtx dest;
6658
6659 if (CONST_INT_P (count))
6660 {
6661 HOST_WIDE_INT countval = INTVAL (count);
6662 HOST_WIDE_INT epilogue_size = countval % max_size;
6663 int i;
6664
6665 /* For now MAX_SIZE should be a power of 2. This assert could be
6666 relaxed, but it'll require a bit more complicated epilogue
6667 expanding. */
6668 gcc_assert ((max_size & (max_size - 1)) == 0);
6669 for (i = max_size; i >= 1; i >>= 1)
6670 {
6671 if (epilogue_size & i)
6672 {
6673 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6674 destmem = emit_memset (destmem, destptr, vec_value, i);
6675 else
6676 destmem = emit_memset (destmem, destptr, value, i);
6677 }
6678 }
6679 return;
6680 }
6681 if (max_size > 32)
6682 {
6683 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6684 return;
6685 }
6686 if (max_size > 16)
6687 {
6688 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6689 if (TARGET_64BIT)
6690 {
6691 dest = change_address (destmem, DImode, destptr);
6692 emit_insn (gen_strset (destptr, dest, value));
6693 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6694 emit_insn (gen_strset (destptr, dest, value));
6695 }
6696 else
6697 {
6698 dest = change_address (destmem, SImode, destptr);
6699 emit_insn (gen_strset (destptr, dest, value));
6700 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6701 emit_insn (gen_strset (destptr, dest, value));
6702 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6703 emit_insn (gen_strset (destptr, dest, value));
6704 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6705 emit_insn (gen_strset (destptr, dest, value));
6706 }
6707 emit_label (label);
6708 LABEL_NUSES (label) = 1;
6709 }
6710 if (max_size > 8)
6711 {
6712 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6713 if (TARGET_64BIT)
6714 {
6715 dest = change_address (destmem, DImode, destptr);
6716 emit_insn (gen_strset (destptr, dest, value));
6717 }
6718 else
6719 {
6720 dest = change_address (destmem, SImode, destptr);
6721 emit_insn (gen_strset (destptr, dest, value));
6722 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6723 emit_insn (gen_strset (destptr, dest, value));
6724 }
6725 emit_label (label);
6726 LABEL_NUSES (label) = 1;
6727 }
6728 if (max_size > 4)
6729 {
6730 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6731 dest = change_address (destmem, SImode, destptr);
6732 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6733 emit_label (label);
6734 LABEL_NUSES (label) = 1;
6735 }
6736 if (max_size > 2)
6737 {
6738 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6739 dest = change_address (destmem, HImode, destptr);
6740 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6741 emit_label (label);
6742 LABEL_NUSES (label) = 1;
6743 }
6744 if (max_size > 1)
6745 {
6746 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6747 dest = change_address (destmem, QImode, destptr);
6748 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6749 emit_label (label);
6750 LABEL_NUSES (label) = 1;
6751 }
6752}
6753
6754/* Decrease COUNTREG by VALUE. */
6755static void
6756ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6757{
83bc5e44 6758 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6759}
6760
6761/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6762 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6763 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6764 ignored.
6765 Return value is updated DESTMEM. */
6766
6767static rtx
76715c32 6768expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6769 rtx destptr, rtx srcptr, rtx value,
6770 rtx vec_value, rtx count, int align,
6771 int desired_alignment, bool issetmem)
6772{
6773 int i;
6774 for (i = 1; i < desired_alignment; i <<= 1)
6775 {
6776 if (align <= i)
6777 {
6778 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6779 if (issetmem)
6780 {
6781 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6782 destmem = emit_memset (destmem, destptr, vec_value, i);
6783 else
6784 destmem = emit_memset (destmem, destptr, value, i);
6785 }
6786 else
6787 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6788 ix86_adjust_counter (count, i);
6789 emit_label (label);
6790 LABEL_NUSES (label) = 1;
6791 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6792 }
6793 }
6794 return destmem;
6795}
6796
6797/* Test if COUNT & SIZE is nonzero and if so, expand a cpymem
6798 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6799 and jump to DONE_LABEL. */
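/* The trick used here: for a length known to lie in [SIZE, 2*SIZE - 1],
   copying (or storing) SIZE bytes at the start and SIZE bytes ending
   exactly at DEST + COUNT covers the whole block.  The two regions may
   overlap, which is harmless for this kind of block operation and avoids
   any dependence on the exact value of COUNT.  */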
6800static void
76715c32 6801expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6802 rtx destptr, rtx srcptr,
6803 rtx value, rtx vec_value,
6804 rtx count, int size,
6805 rtx done_label, bool issetmem)
6806{
6807 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6808 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6809 rtx modesize;
6810 int n;
6811
6812 /* If we do not have vector value to copy, we must reduce size. */
6813 if (issetmem)
6814 {
6815 if (!vec_value)
6816 {
6817 if (GET_MODE (value) == VOIDmode && size > 8)
6818 mode = Pmode;
6819 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6820 mode = GET_MODE (value);
6821 }
6822 else
6823 mode = GET_MODE (vec_value), value = vec_value;
6824 }
6825 else
6826 {
6827 /* Choose appropriate vector mode. */
6828 if (size >= 32)
6829 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6830 else if (size >= 16)
6831 mode = TARGET_SSE ? V16QImode : DImode;
6832 srcmem = change_address (srcmem, mode, srcptr);
6833 }
6834 destmem = change_address (destmem, mode, destptr);
6835 modesize = GEN_INT (GET_MODE_SIZE (mode));
6836 gcc_assert (GET_MODE_SIZE (mode) <= size);
6837 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6838 {
6839 if (issetmem)
6840 emit_move_insn (destmem, gen_lowpart (mode, value));
6841 else
6842 {
6843 emit_move_insn (destmem, srcmem);
6844 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6845 }
6846 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6847 }
6848
6849 destmem = offset_address (destmem, count, 1);
6850 destmem = offset_address (destmem, GEN_INT (-2 * size),
6851 GET_MODE_SIZE (mode));
6852 if (!issetmem)
6853 {
6854 srcmem = offset_address (srcmem, count, 1);
6855 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6856 GET_MODE_SIZE (mode));
6857 }
6858 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6859 {
6860 if (issetmem)
6861 emit_move_insn (destmem, gen_lowpart (mode, value));
6862 else
6863 {
6864 emit_move_insn (destmem, srcmem);
6865 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6866 }
6867 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6868 }
6869 emit_jump_insn (gen_jump (done_label));
6870 emit_barrier ();
6871
6872 emit_label (label);
6873 LABEL_NUSES (label) = 1;
6874}
6875
6876/* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
6877 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6878 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
6879 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6880 DONE_LABEL is a label after the whole copying sequence. The label is created
6881 on demand if *DONE_LABEL is NULL.
6882 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6883 bounds after the initial copies.
6884
6885 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6886 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
6887 we will dispatch to a library call for large blocks.
6888
6889 In pseudocode we do:
6890
6891 if (COUNT < SIZE)
6892 {
6893 Assume that SIZE is 4. Bigger sizes are handled analogously
6894 if (COUNT & 4)
6895 {
6896 copy 4 bytes from SRCPTR to DESTPTR
6897 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6898 goto done_label
6899 }
6900 if (!COUNT)
6901 goto done_label;
6902 copy 1 byte from SRCPTR to DESTPTR
6903 if (COUNT & 2)
6904 {
6905 copy 2 bytes from SRCPTR to DESTPTR
6906 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6907 }
6908 }
6909 else
6910 {
6911 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6912 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6913
6914 OLD_DESTPTR = DESTPTR;
6915 Align DESTPTR up to DESIRED_ALIGN
6916 SRCPTR += DESTPTR - OLD_DESTPTR
6917 COUNT -= DESTPTR - OLD_DESTPTR
6918 if (DYNAMIC_CHECK)
6919 Round COUNT down to multiple of SIZE
6920 << optional caller supplied zero size guard is here >>
6921 << optional caller supplied dynamic check is here >>
6922 << caller supplied main copy loop is here >>
6923 }
6924 done_label:
6925 */
6926static void
76715c32 6927expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6928 rtx *destptr, rtx *srcptr,
6929 machine_mode mode,
6930 rtx value, rtx vec_value,
6931 rtx *count,
6932 rtx_code_label **done_label,
6933 int size,
6934 int desired_align,
6935 int align,
6936 unsigned HOST_WIDE_INT *min_size,
6937 bool dynamic_check,
6938 bool issetmem)
6939{
6940 rtx_code_label *loop_label = NULL, *label;
6941 int n;
6942 rtx modesize;
6943 int prolog_size = 0;
6944 rtx mode_value;
6945
6946 /* Choose the proper value to copy. */
6947 if (issetmem && VECTOR_MODE_P (mode))
6948 mode_value = vec_value;
6949 else
6950 mode_value = value;
6951 gcc_assert (GET_MODE_SIZE (mode) <= size);
6952
6953 /* See if block is big or small, handle small blocks. */
6954 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6955 {
6956 int size2 = size;
6957 loop_label = gen_label_rtx ();
6958
6959 if (!*done_label)
6960 *done_label = gen_label_rtx ();
6961
6962 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6963 1, loop_label);
6964 size2 >>= 1;
6965
6966 /* Handle sizes > 3. */
6967 for (;size2 > 2; size2 >>= 1)
76715c32 6968 expand_small_cpymem_or_setmem (destmem, srcmem,
6969 *destptr, *srcptr,
6970 value, vec_value,
6971 *count,
6972 size2, *done_label, issetmem);
6973 /* Nothing to copy? Jump to DONE_LABEL if so */
6974 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6975 1, *done_label);
6976
6977 /* Do a byte copy. */
6978 destmem = change_address (destmem, QImode, *destptr);
6979 if (issetmem)
6980 emit_move_insn (destmem, gen_lowpart (QImode, value));
6981 else
6982 {
6983 srcmem = change_address (srcmem, QImode, *srcptr);
6984 emit_move_insn (destmem, srcmem);
6985 }
6986
6987 /* Handle sizes 2 and 3. */
6988 label = ix86_expand_aligntest (*count, 2, false);
6989 destmem = change_address (destmem, HImode, *destptr);
6990 destmem = offset_address (destmem, *count, 1);
6991 destmem = offset_address (destmem, GEN_INT (-2), 2);
6992 if (issetmem)
6993 emit_move_insn (destmem, gen_lowpart (HImode, value));
6994 else
6995 {
6996 srcmem = change_address (srcmem, HImode, *srcptr);
6997 srcmem = offset_address (srcmem, *count, 1);
6998 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6999 emit_move_insn (destmem, srcmem);
7000 }
7001
7002 emit_label (label);
7003 LABEL_NUSES (label) = 1;
7004 emit_jump_insn (gen_jump (*done_label));
7005 emit_barrier ();
7006 }
7007 else
7008 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7009 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7010
7011 /* Start memcpy for COUNT >= SIZE. */
7012 if (loop_label)
7013 {
7014 emit_label (loop_label);
7015 LABEL_NUSES (loop_label) = 1;
7016 }
7017
7018 /* Copy first desired_align bytes. */
7019 if (!issetmem)
7020 srcmem = change_address (srcmem, mode, *srcptr);
7021 destmem = change_address (destmem, mode, *destptr);
7022 modesize = GEN_INT (GET_MODE_SIZE (mode));
7023 for (n = 0; prolog_size < desired_align - align; n++)
7024 {
7025 if (issetmem)
7026 emit_move_insn (destmem, mode_value);
7027 else
7028 {
7029 emit_move_insn (destmem, srcmem);
7030 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7031 }
7032 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7033 prolog_size += GET_MODE_SIZE (mode);
7034 }
7035
7036
7037 /* Copy last SIZE bytes. */
7038 destmem = offset_address (destmem, *count, 1);
7039 destmem = offset_address (destmem,
7040 GEN_INT (-size - prolog_size),
7041 1);
7042 if (issetmem)
7043 emit_move_insn (destmem, mode_value);
7044 else
7045 {
7046 srcmem = offset_address (srcmem, *count, 1);
7047 srcmem = offset_address (srcmem,
7048 GEN_INT (-size - prolog_size),
7049 1);
7050 emit_move_insn (destmem, srcmem);
7051 }
7052 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
7053 {
7054 destmem = offset_address (destmem, modesize, 1);
7055 if (issetmem)
7056 emit_move_insn (destmem, mode_value);
7057 else
7058 {
7059 srcmem = offset_address (srcmem, modesize, 1);
7060 emit_move_insn (destmem, srcmem);
7061 }
7062 }
7063
7064 /* Align destination. */
7065 if (desired_align > 1 && desired_align > align)
7066 {
7067 rtx saveddest = *destptr;
7068
7069 gcc_assert (desired_align <= size);
7070 /* Align destptr up, place it in a new register. */
7071 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
7072 GEN_INT (prolog_size),
7073 NULL_RTX, 1, OPTAB_DIRECT);
7074 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
7075 REG_POINTER (*destptr) = 1;
7076 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
7077 GEN_INT (-desired_align),
7078 *destptr, 1, OPTAB_DIRECT);
7079 /* See how many bytes we skipped. */
7080 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
7081 *destptr,
7082 saveddest, 1, OPTAB_DIRECT);
7083 /* Adjust srcptr and count. */
7084 if (!issetmem)
7085 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
7086 saveddest, *srcptr, 1, OPTAB_DIRECT);
7087 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7088 saveddest, *count, 1, OPTAB_DIRECT);
7089 /* We copied at most size + prolog_size. */
7090 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
7091 *min_size
7092 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
7093 else
7094 *min_size = 0;
7095
7096 /* Our loops always round down the block size, but for dispatch to
7097 library we need precise value. */
7098 if (dynamic_check)
7099 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
7100 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
7101 }
7102 else
7103 {
7104 gcc_assert (prolog_size == 0);
7105 /* Decrease count, so we won't end up copying last word twice. */
7106 if (!CONST_INT_P (*count))
7107 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7108 constm1_rtx, *count, 1, OPTAB_DIRECT);
7109 else
7110 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
7111 (unsigned HOST_WIDE_INT)size));
7112 if (*min_size)
7113 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
7114 }
7115}
7116
7117
7118/* This function is like the previous one, except here we know how many bytes
7119 need to be copied. That allows us to update alignment not only of DST, which
7120 is returned, but also of SRC, which is passed as a pointer for that
7121 reason. */
7122static rtx
76715c32 7123expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
7124 rtx srcreg, rtx value, rtx vec_value,
7125 int desired_align, int align_bytes,
7126 bool issetmem)
7127{
7128 rtx src = NULL;
7129 rtx orig_dst = dst;
7130 rtx orig_src = NULL;
7131 int piece_size = 1;
7132 int copied_bytes = 0;
7133
7134 if (!issetmem)
7135 {
7136 gcc_assert (srcp != NULL);
7137 src = *srcp;
7138 orig_src = src;
7139 }
7140
7141 for (piece_size = 1;
7142 piece_size <= desired_align && copied_bytes < align_bytes;
7143 piece_size <<= 1)
7144 {
7145 if (align_bytes & piece_size)
7146 {
7147 if (issetmem)
7148 {
7149 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
7150 dst = emit_memset (dst, destreg, vec_value, piece_size);
7151 else
7152 dst = emit_memset (dst, destreg, value, piece_size);
7153 }
7154 else
7155 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
7156 copied_bytes += piece_size;
7157 }
7158 }
7159 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
7160 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7161 if (MEM_SIZE_KNOWN_P (orig_dst))
7162 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
7163
7164 if (!issetmem)
7165 {
7166 int src_align_bytes = get_mem_align_offset (src, desired_align
7167 * BITS_PER_UNIT);
7168 if (src_align_bytes >= 0)
7169 src_align_bytes = desired_align - src_align_bytes;
7170 if (src_align_bytes >= 0)
7171 {
7172 unsigned int src_align;
7173 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
7174 {
7175 if ((src_align_bytes & (src_align - 1))
7176 == (align_bytes & (src_align - 1)))
7177 break;
7178 }
7179 if (src_align > (unsigned int) desired_align)
7180 src_align = desired_align;
7181 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
7182 set_mem_align (src, src_align * BITS_PER_UNIT);
7183 }
7184 if (MEM_SIZE_KNOWN_P (orig_src))
7185 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
7186 *srcp = src;
7187 }
7188
7189 return dst;
7190}
7191
7192/* Return true if ALG can be used in current context.
7193 Assume we expand memset if MEMSET is true. */
7194static bool
7195alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
7196{
7197 if (alg == no_stringop)
7198 return false;
7199 if (alg == vector_loop)
7200 return TARGET_SSE || TARGET_AVX;
7201 /* Algorithms using the rep prefix want at least edi and ecx;
7202 additionally, memset wants eax and memcpy wants esi. Don't
7203 consider such algorithms if the user has appropriated those
7204 registers for their own purposes, or if we have a non-default
7205 address space, since some string insns cannot override the segment. */
7206 if (alg == rep_prefix_1_byte
7207 || alg == rep_prefix_4_byte
7208 || alg == rep_prefix_8_byte)
7209 {
7210 if (have_as)
7211 return false;
7212 if (fixed_regs[CX_REG]
7213 || fixed_regs[DI_REG]
7214 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
7215 return false;
7216 }
7217 return true;
7218}
7219
7220/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
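/* For instance, when optimizing for size the code below usually settles on
   "rep movsb"/"rep stosb"; the 4-byte rep variant is chosen only when the
   byte count is a known multiple of four (and, for memset, the stored
   value is zero), and plain loops are the fallback when the rep-based
   algorithms are unusable.  */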
7221static enum stringop_alg
7222decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
7223 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
7224 bool memset, bool zero_memset, bool have_as,
7225 int *dynamic_check, bool *noalign, bool recur)
7226{
7227 const struct stringop_algs *algs;
7228 bool optimize_for_speed;
7229 int max = 0;
7230 const struct processor_costs *cost;
7231 int i;
7232 bool any_alg_usable_p = false;
7233
7234 *noalign = false;
7235 *dynamic_check = -1;
7236
7237 /* Even if the string operation call is cold, we still might spend a lot
7238 of time processing large blocks. */
7239 if (optimize_function_for_size_p (cfun)
7240 || (optimize_insn_for_size_p ()
7241 && (max_size < 256
7242 || (expected_size != -1 && expected_size < 256))))
7243 optimize_for_speed = false;
7244 else
7245 optimize_for_speed = true;
7246
7247 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
7248 if (memset)
7249 algs = &cost->memset[TARGET_64BIT != 0];
7250 else
7251 algs = &cost->memcpy[TARGET_64BIT != 0];
7252
7253 /* See maximal size for user defined algorithm. */
7254 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
7255 {
7256 enum stringop_alg candidate = algs->size[i].alg;
7257 bool usable = alg_usable_p (candidate, memset, have_as);
7258 any_alg_usable_p |= usable;
7259
7260 if (candidate != libcall && candidate && usable)
7261 max = algs->size[i].max;
7262 }
7263
7264 /* If the expected size is not known but the max size is small enough
7265 that the inline version is a win, set the expected size to the
7266 middle of the range. */
7267 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
7268 && expected_size == -1)
7269 expected_size = min_size / 2 + max_size / 2;
7270
7271 /* If user specified the algorithm, honor it if possible. */
7272 if (ix86_stringop_alg != no_stringop
7273 && alg_usable_p (ix86_stringop_alg, memset, have_as))
7274 return ix86_stringop_alg;
7275 /* rep; movq or rep; movl is the smallest variant. */
7276 else if (!optimize_for_speed)
7277 {
7278 *noalign = true;
7279 if (!count || (count & 3) || (memset && !zero_memset))
7280 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
7281 ? rep_prefix_1_byte : loop_1_byte;
7282 else
7283 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
7284 ? rep_prefix_4_byte : loop;
7285 }
7286 /* Very tiny blocks are best handled via the loop; REP is expensive to
7287 set up. */
7288 else if (expected_size != -1 && expected_size < 4)
7289 return loop_1_byte;
7290 else if (expected_size != -1)
7291 {
7292 enum stringop_alg alg = libcall;
7293 bool alg_noalign = false;
7294 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
7295 {
7296 /* We get here if the algorithms that were not libcall-based
7297 were rep-prefix based and we are unable to use rep prefixes
7298 based on global register usage. Break out of the loop and
7299 use the heuristic below. */
7300 if (algs->size[i].max == 0)
7301 break;
7302 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
7303 {
7304 enum stringop_alg candidate = algs->size[i].alg;
7305
7306 if (candidate != libcall
7307 && alg_usable_p (candidate, memset, have_as))
7308 {
7309 alg = candidate;
7310 alg_noalign = algs->size[i].noalign;
7311 }
7312 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
7313 last non-libcall inline algorithm. */
7314 if (TARGET_INLINE_ALL_STRINGOPS)
7315 {
7316 /* When the current size is best to be copied by a libcall,
7317 but we are still forced to inline, run the heuristic below
7318 that will pick code for medium sized blocks. */
7319 if (alg != libcall)
7320 {
7321 *noalign = alg_noalign;
7322 return alg;
7323 }
7324 else if (!any_alg_usable_p)
7325 break;
7326 }
7327 else if (alg_usable_p (candidate, memset, have_as)
7328 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7329 && candidate == rep_prefix_1_byte
7330 /* NB: If min_size != max_size, size is
7331 unknown. */
7332 && min_size != max_size))
7333 {
7334 *noalign = algs->size[i].noalign;
7335 return candidate;
7336 }
7337 }
7338 }
7339 }
7340 /* When asked to inline the call anyway, try to pick a meaningful choice.
7341 We look for the maximal size of block that is faster to copy by hand and
7342 take blocks of at most that size, guessing that the average size will
7343 be roughly half of the block.
7344
7345 If this turns out to be bad, we might simply specify the preferred
7346 choice in ix86_costs. */
7347 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7348 && (algs->unknown_size == libcall
7349 || !alg_usable_p (algs->unknown_size, memset, have_as)))
7350 {
7351 enum stringop_alg alg;
7352 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
7353
7354 /* If there aren't any usable algorithms or if recursing already,
7355 then recursing on smaller sizes or same size isn't going to
7356 find anything. Just return the simple byte-at-a-time copy loop. */
7357 if (!any_alg_usable_p || recur)
7358 {
7359 /* Pick something reasonable. */
7360 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7361 *dynamic_check = 128;
7362 return loop_1_byte;
7363 }
7364 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7365 zero_memset, have_as, dynamic_check, noalign, true);
7366 gcc_assert (*dynamic_check == -1);
7367 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7368 *dynamic_check = max;
7369 else
7370 gcc_assert (alg != libcall);
7371 return alg;
7372 }
7373 return (alg_usable_p (algs->unknown_size, memset, have_as)
7374 ? algs->unknown_size : libcall);
7375}
7376
7377/* Decide on alignment. We know that the operand is already aligned to ALIGN
7378 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7379static int
7380decide_alignment (int align,
7381 enum stringop_alg alg,
7382 int expected_size,
7383 machine_mode move_mode)
7384{
7385 int desired_align = 0;
7386
7387 gcc_assert (alg != no_stringop);
7388
7389 if (alg == libcall)
7390 return 0;
7391 if (move_mode == VOIDmode)
7392 return 0;
7393
7394 desired_align = GET_MODE_SIZE (move_mode);
7395 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7396 copying a whole cache line at once. */
f23881fc 7397 if (TARGET_CPU_P (PENTIUMPRO)
7398 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7399 desired_align = 8;
7400
7401 if (optimize_size)
7402 desired_align = 1;
7403 if (desired_align < align)
7404 desired_align = align;
7405 if (expected_size != -1 && expected_size < 4)
7406 desired_align = align;
7407
7408 return desired_align;
7409}
7410
7411
7412/* Helper function for memset expansion. For a QImode value 0xXY produce
7413 0xXYXYXYXY of the width specified by MODE. This is essentially
7414 a * 0x01010101, but we can do slightly better than
7415 synth_mult by unwinding the sequence by hand on CPUs with
7416 slow multiply. */
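/* Worked example: VAL = 0xab, MODE = SImode.  The constant path below
   returns a register holding 0xabababab directly; the non-constant path
   either multiplies the zero-extended byte by 0x01010101 or builds the
   value with "v |= v << 8; v |= v << 16" (plus "v |= v << 32" for DImode),
   whichever the cost model prefers.  */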
7417static rtx
7418promote_duplicated_reg (machine_mode mode, rtx val)
7419{
7420 machine_mode valmode = GET_MODE (val);
7421 rtx tmp;
7422 int nops = mode == DImode ? 3 : 2;
7423
7424 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7425 if (val == const0_rtx)
7426 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7427 if (CONST_INT_P (val))
7428 {
7429 HOST_WIDE_INT v = INTVAL (val) & 255;
7430
7431 v |= v << 8;
7432 v |= v << 16;
7433 if (mode == DImode)
7434 v |= (v << 16) << 16;
7435 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7436 }
7437
7438 if (valmode == VOIDmode)
7439 valmode = QImode;
7440 if (valmode != QImode)
7441 val = gen_lowpart (QImode, val);
7442 if (mode == QImode)
7443 return val;
7444 if (!TARGET_PARTIAL_REG_STALL)
7445 nops--;
7446 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7447 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7448 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7449 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7450 {
7451 rtx reg = convert_modes (mode, QImode, val, true);
7452 tmp = promote_duplicated_reg (mode, const1_rtx);
7453 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7454 OPTAB_DIRECT);
7455 }
7456 else
7457 {
7458 rtx reg = convert_modes (mode, QImode, val, true);
7459
7460 if (!TARGET_PARTIAL_REG_STALL)
e9539592 7461 emit_insn (gen_insv_1 (mode, reg, reg));
7462 else
7463 {
7464 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7465 NULL, 1, OPTAB_DIRECT);
7466 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7467 OPTAB_DIRECT);
7468 }
7469 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7470 NULL, 1, OPTAB_DIRECT);
7471 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7472 if (mode == SImode)
7473 return reg;
7474 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7475 NULL, 1, OPTAB_DIRECT);
7476 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7477 return reg;
7478 }
7479}
7480
7481/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7482 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
7483 raising the alignment from ALIGN to DESIRED_ALIGN. */
7484static rtx
7485promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7486 int align)
7487{
7488 rtx promoted_val;
7489
7490 if (TARGET_64BIT
7491 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7492 promoted_val = promote_duplicated_reg (DImode, val);
7493 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7494 promoted_val = promote_duplicated_reg (SImode, val);
7495 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7496 promoted_val = promote_duplicated_reg (HImode, val);
7497 else
7498 promoted_val = val;
7499
7500 return promoted_val;
7501}
7502
7503/* Copy the address to a Pmode register. This is used for x32 to
7504 truncate DImode TLS address to a SImode register. */
7505
7506static rtx
7507ix86_copy_addr_to_reg (rtx addr)
7508{
7509 rtx reg;
7510 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7511 {
7512 reg = copy_addr_to_reg (addr);
7513 REG_POINTER (reg) = 1;
7514 return reg;
7515 }
7516 else
7517 {
7518 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7519 reg = copy_to_mode_reg (DImode, addr);
7520 REG_POINTER (reg) = 1;
7521 return gen_rtx_SUBREG (SImode, reg, 0);
7522 }
7523}
7524
7525/* Expand a string move (memcpy) or store (memset) operation. Use i386 string
7526 operations when profitable. The code depends upon architecture, block size
7527 and alignment, but always has one of the following overall structures:
7528
7529 Aligned move sequence:
7530
7531 1) Prologue guard: Conditional that jumps up to epilogues for small
7532 blocks that can be handled by epilogue alone. This is faster
7533 but also needed for correctness, since the prologue assumes the block
7534 is larger than the desired alignment.
7535
7536 Optional dynamic check for size and libcall for large
7537 blocks is emitted here too, with -minline-stringops-dynamically.
7538
7539 2) Prologue: copy first few bytes in order to get destination
7540 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7541 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7542 copied. We emit either a jump tree on power of two sized
7543 blocks, or a byte loop.
7544
7545 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7546 with specified algorithm.
7547
7548 4) Epilogue: code copying tail of the block that is too small to be
7549 handled by main body (or up to size guarded by prologue guard).
7550
7551 Misaligned move sequence
7552
7553 1) misaligned move prologue/epilogue containing:
7554 a) Prologue handling small memory blocks and jumping to done_label
7555 (skipped if blocks are known to be large enough)
7556 b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
7557 bytes to raise the destination alignment
7558 (skipped if alignment is not needed)
7559 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7560
7561 2) Zero size guard dispatching to done_label, if needed
7562
7563 3) dispatch to library call, if needed,
7564
7565 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7566 with specified algorithm. */
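/* A FALSE return value tells the caller to emit a library call instead; it
   is used both when the cost tables select a libcall and when the constant
   block size exceeds 2^30 bytes, which the expander declines so it need not
   worry about overflow later on.  */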
7567bool
76715c32 7568ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7569 rtx align_exp, rtx expected_align_exp,
7570 rtx expected_size_exp, rtx min_size_exp,
7571 rtx max_size_exp, rtx probable_max_size_exp,
7572 bool issetmem)
7573{
7574 rtx destreg;
7575 rtx srcreg = NULL;
7576 rtx_code_label *label = NULL;
7577 rtx tmp;
7578 rtx_code_label *jump_around_label = NULL;
7579 HOST_WIDE_INT align = 1;
7580 unsigned HOST_WIDE_INT count = 0;
7581 HOST_WIDE_INT expected_size = -1;
7582 int size_needed = 0, epilogue_size_needed;
7583 int desired_align = 0, align_bytes = 0;
7584 enum stringop_alg alg;
7585 rtx promoted_val = NULL;
7586 rtx vec_promoted_val = NULL;
7587 bool force_loopy_epilogue = false;
7588 int dynamic_check;
7589 bool need_zero_guard = false;
7590 bool noalign;
7591 machine_mode move_mode = VOIDmode;
7592 machine_mode wider_mode;
7593 int unroll_factor = 1;
7594 /* TODO: Once value ranges are available, fill in proper data. */
7595 unsigned HOST_WIDE_INT min_size = 0;
7596 unsigned HOST_WIDE_INT max_size = -1;
7597 unsigned HOST_WIDE_INT probable_max_size = -1;
7598 bool misaligned_prologue_used = false;
7599 bool have_as;
7600
7601 if (CONST_INT_P (align_exp))
7602 align = INTVAL (align_exp);
7603 /* i386 can do misaligned access at a reasonably increased cost. */
7604 if (CONST_INT_P (expected_align_exp)
7605 && INTVAL (expected_align_exp) > align)
7606 align = INTVAL (expected_align_exp);
7607 /* ALIGN is the minimum of destination and source alignment, but we care here
7608 just about destination alignment. */
7609 else if (!issetmem
7610 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7611 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7612
7613 if (CONST_INT_P (count_exp))
7614 {
7615 min_size = max_size = probable_max_size = count = expected_size
7616 = INTVAL (count_exp);
7617 /* When COUNT is 0, there is nothing to do. */
7618 if (!count)
7619 return true;
7620 }
7621 else
7622 {
7623 if (min_size_exp)
7624 min_size = INTVAL (min_size_exp);
7625 if (max_size_exp)
7626 max_size = INTVAL (max_size_exp);
7627 if (probable_max_size_exp)
7628 probable_max_size = INTVAL (probable_max_size_exp);
7629 if (CONST_INT_P (expected_size_exp))
7630 expected_size = INTVAL (expected_size_exp);
7631 }
7632
7633 /* Make sure we don't need to care about overflow later on. */
7634 if (count > (HOST_WIDE_INT_1U << 30))
7635 return false;
7636
7637 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7638 if (!issetmem)
7639 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7640
7641 /* Step 0: Decide on preferred algorithm, desired alignment and
7642 size of chunks to be copied by main loop. */
7643 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7644 issetmem,
7645 issetmem && val_exp == const0_rtx, have_as,
7646 &dynamic_check, &noalign, false);
7647
7648 if (dump_file)
7649 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7650 stringop_alg_names[alg]);
7651
7652 if (alg == libcall)
7653 return false;
7654 gcc_assert (alg != no_stringop);
7655
7656 /* For now the vector version of memset is generated only for memory zeroing, as
7657 creating the promoted vector value is very cheap in this case. */
7658 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7659 alg = unrolled_loop;
7660
7661 if (!count)
7662 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7663 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7664 if (!issetmem)
7665 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7666
7667 unroll_factor = 1;
7668 move_mode = word_mode;
7669 switch (alg)
7670 {
7671 case libcall:
7672 case no_stringop:
7673 case last_alg:
7674 gcc_unreachable ();
7675 case loop_1_byte:
7676 need_zero_guard = true;
7677 move_mode = QImode;
7678 break;
7679 case loop:
7680 need_zero_guard = true;
7681 break;
7682 case unrolled_loop:
7683 need_zero_guard = true;
7684 unroll_factor = (TARGET_64BIT ? 4 : 2);
7685 break;
7686 case vector_loop:
7687 need_zero_guard = true;
7688 unroll_factor = 4;
7689 /* Find the widest supported mode. */
7690 move_mode = word_mode;
7691 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7692 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7693 move_mode = wider_mode;
7694
586bbef1 7695 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
2bf6d935
ML
7696 move_mode = TImode;
7697
7698 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7699 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7700 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7701 {
7702 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7703 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7704 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7705 move_mode = word_mode;
7706 }
7707 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7708 break;
7709 case rep_prefix_8_byte:
7710 move_mode = DImode;
7711 break;
7712 case rep_prefix_4_byte:
7713 move_mode = SImode;
7714 break;
7715 case rep_prefix_1_byte:
7716 move_mode = QImode;
7717 break;
7718 }
7719 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7720 epilogue_size_needed = size_needed;
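/* Worked example (illustrative assumption): for the unrolled_loop algorithm
   on a 64-bit target, move_mode is word_mode (DImode) and unroll_factor is 4,
   so size_needed is 8 * 4 = 32 bytes per main-loop iteration and the epilogue
   has to cope with up to 31 leftover bytes.  */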
7721
7722 /* If we are going to make any library calls conditionally, make sure any
7723 pending stack adjustments happen before the first conditional branch;
7724 otherwise they will be emitted before the library call only and won't
7725 happen on the other branches. */
7726 if (dynamic_check != -1)
7727 do_pending_stack_adjust ();
7728
7729 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7730 if (!TARGET_ALIGN_STRINGOPS || noalign)
7731 align = desired_align;
7732
7733 /* Step 1: Prologue guard. */
7734
7735 /* Alignment code needs count to be in a register. */
7736 if (CONST_INT_P (count_exp) && desired_align > align)
7737 {
7738 if (INTVAL (count_exp) > desired_align
7739 && INTVAL (count_exp) > size_needed)
7740 {
7741 align_bytes
7742 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7743 if (align_bytes <= 0)
7744 align_bytes = 0;
7745 else
7746 align_bytes = desired_align - align_bytes;
7747 }
7748 if (align_bytes == 0)
7749 count_exp = force_reg (counter_mode (count_exp), count_exp);
7750 }
7751 gcc_assert (desired_align >= 1 && align >= 1);
7752
7753 /* Misaligned move sequences handle both prologue and epilogue at once.
7754 Default code generation results in smaller code for large alignments
7755 and also avoids redundant work when sizes are known precisely. */
7756 misaligned_prologue_used
7757 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7758 && MAX (desired_align, epilogue_size_needed) <= 32
7759 && desired_align <= epilogue_size_needed
7760 && ((desired_align > align && !align_bytes)
7761 || (!count && epilogue_size_needed > 1)));
7762
7763 /* Do the cheap promotion to allow better CSE across the
7764 main loop and epilogue (i.e. one load of the big constant in
7765 front of all code).
7766 For now the misaligned move sequences do not have a fast path
7767 without broadcasting. */
7768 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7769 {
7770 if (alg == vector_loop)
7771 {
7772 gcc_assert (val_exp == const0_rtx);
7773 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7774 promoted_val = promote_duplicated_reg_to_size (val_exp,
7775 GET_MODE_SIZE (word_mode),
7776 desired_align, align);
7777 }
7778 else
7779 {
7780 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7781 desired_align, align);
7782 }
7783 }
7784 /* Misaligned move sequences handle both prologue and epilogue at once.
7785 Default code generation results in smaller code for large alignments and
7786 also avoids redundant work when sizes are known precisely. */
7787 if (misaligned_prologue_used)
7788 {
7789 /* The misaligned move prologue handles small blocks by itself. */
76715c32 7790 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
2bf6d935
ML
7791 (dst, src, &destreg, &srcreg,
7792 move_mode, promoted_val, vec_promoted_val,
7793 &count_exp,
7794 &jump_around_label,
7795 desired_align < align
7796 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7797 desired_align, align, &min_size, dynamic_check, issetmem);
7798 if (!issetmem)
7799 src = change_address (src, BLKmode, srcreg);
7800 dst = change_address (dst, BLKmode, destreg);
7801 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7802 epilogue_size_needed = 0;
7803 if (need_zero_guard
7804 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7805 {
7806 /* It is possible that we copied enough so the main loop will not
7807 execute. */
7808 gcc_assert (size_needed > 1);
7809 if (jump_around_label == NULL_RTX)
7810 jump_around_label = gen_label_rtx ();
7811 emit_cmp_and_jump_insns (count_exp,
7812 GEN_INT (size_needed),
7813 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7814 if (expected_size == -1
7815 || expected_size < (desired_align - align) / 2 + size_needed)
7816 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7817 else
7818 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7819 }
7820 }
7821 /* Ensure that the alignment prologue won't copy past the end of the block. */
7822 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7823 {
7824 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7825 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
7826 Make sure it is a power of 2. */
7827 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
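/* Worked example (illustrative): with size_needed == 32 and
   desired_align - align == 12 the MAX above is 31, and
   1 << (floor_log2 (31) + 1) rounds it up to the power of two 32.  */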
7828
7829 /* To improve performance of small blocks, we jump around the VAL
7830 promoting code. This means that if the promoted VAL is not constant,
7831 we might not use it in the epilogue and have to use byte
7832 loop variant. */
7833 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7834 force_loopy_epilogue = true;
7835 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7836 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7837 {
7838 /* If the main algorithm works on QImode, no epilogue is needed.
7839 For small sizes just don't align anything. */
7840 if (size_needed == 1)
7841 desired_align = align;
7842 else
7843 goto epilogue;
7844 }
7845 else if (!count
7846 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7847 {
7848 label = gen_label_rtx ();
7849 emit_cmp_and_jump_insns (count_exp,
7850 GEN_INT (epilogue_size_needed),
7851 LTU, 0, counter_mode (count_exp), 1, label);
7852 if (expected_size == -1 || expected_size < epilogue_size_needed)
7853 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7854 else
7855 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7856 }
7857 }
7858
7859 /* Emit code to decide at runtime whether a library call or inline code should
7860 be used. */
7861 if (dynamic_check != -1)
7862 {
7863 if (!issetmem && CONST_INT_P (count_exp))
7864 {
7865 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7866 {
7867 emit_block_copy_via_libcall (dst, src, count_exp);
7868 count_exp = const0_rtx;
7869 goto epilogue;
7870 }
7871 }
7872 else
7873 {
7874 rtx_code_label *hot_label = gen_label_rtx ();
7875 if (jump_around_label == NULL_RTX)
7876 jump_around_label = gen_label_rtx ();
7877 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7878 LEU, 0, counter_mode (count_exp),
7879 1, hot_label);
7880 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7881 if (issetmem)
7882 set_storage_via_libcall (dst, count_exp, val_exp);
7883 else
7884 emit_block_copy_via_libcall (dst, src, count_exp);
7885 emit_jump (jump_around_label);
7886 emit_label (hot_label);
7887 }
7888 }
7889
7890 /* Step 2: Alignment prologue. */
7891 /* Do the expensive promotion once we branched off the small blocks. */
7892 if (issetmem && !promoted_val)
7893 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7894 desired_align, align);
7895
7896 if (desired_align > align && !misaligned_prologue_used)
7897 {
7898 if (align_bytes == 0)
7899 {
7900 /* Except for the first move in the prologue, we no longer know
7901 the constant offset in aliasing info. It doesn't seem worth
7902 the pain to maintain it for the first move, so throw away
7903 the info early. */
7904 dst = change_address (dst, BLKmode, destreg);
7905 if (!issetmem)
7906 src = change_address (src, BLKmode, srcreg);
76715c32 7907 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
2bf6d935
ML
7908 promoted_val, vec_promoted_val,
7909 count_exp, align, desired_align,
7910 issetmem);
7911 /* At most desired_align - align bytes are copied. */
7912 if (min_size < (unsigned)(desired_align - align))
7913 min_size = 0;
7914 else
7915 min_size -= desired_align - align;
7916 }
7917 else
7918 {
7919 /* If we know how many bytes need to be stored before dst is
7920 sufficiently aligned, maintain aliasing info accurately. */
76715c32 7921 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
2bf6d935
ML
7922 srcreg,
7923 promoted_val,
7924 vec_promoted_val,
7925 desired_align,
7926 align_bytes,
7927 issetmem);
7928
7929 count_exp = plus_constant (counter_mode (count_exp),
7930 count_exp, -align_bytes);
7931 count -= align_bytes;
7932 min_size -= align_bytes;
7933 max_size -= align_bytes;
7934 }
7935 if (need_zero_guard
7936 && min_size < (unsigned HOST_WIDE_INT) size_needed
7937 && (count < (unsigned HOST_WIDE_INT) size_needed
7938 || (align_bytes == 0
7939 && count < ((unsigned HOST_WIDE_INT) size_needed
7940 + desired_align - align))))
7941 {
7942 /* It is possible that we copied enough so the main loop will not
7943 execute. */
7944 gcc_assert (size_needed > 1);
7945 if (label == NULL_RTX)
7946 label = gen_label_rtx ();
7947 emit_cmp_and_jump_insns (count_exp,
7948 GEN_INT (size_needed),
7949 LTU, 0, counter_mode (count_exp), 1, label);
7950 if (expected_size == -1
7951 || expected_size < (desired_align - align) / 2 + size_needed)
7952 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7953 else
7954 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7955 }
7956 }
7957 if (label && size_needed == 1)
7958 {
7959 emit_label (label);
7960 LABEL_NUSES (label) = 1;
7961 label = NULL;
7962 epilogue_size_needed = 1;
7963 if (issetmem)
7964 promoted_val = val_exp;
7965 }
7966 else if (label == NULL_RTX && !misaligned_prologue_used)
7967 epilogue_size_needed = size_needed;
7968
7969 /* Step 3: Main loop. */
7970
7971 switch (alg)
7972 {
7973 case libcall:
7974 case no_stringop:
7975 case last_alg:
7976 gcc_unreachable ();
7977 case loop_1_byte:
7978 case loop:
7979 case unrolled_loop:
76715c32 7980 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
2bf6d935
ML
7981 count_exp, move_mode, unroll_factor,
7982 expected_size, issetmem);
7983 break;
7984 case vector_loop:
76715c32 7985 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
2bf6d935
ML
7986 vec_promoted_val, count_exp, move_mode,
7987 unroll_factor, expected_size, issetmem);
7988 break;
7989 case rep_prefix_8_byte:
7990 case rep_prefix_4_byte:
7991 case rep_prefix_1_byte:
76715c32 7992 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
2bf6d935
ML
7993 val_exp, count_exp, move_mode, issetmem);
7994 break;
7995 }
7996 /* Properly adjust the offset of src and dest memory for aliasing. */
7997 if (CONST_INT_P (count_exp))
7998 {
7999 if (!issetmem)
8000 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8001 (count / size_needed) * size_needed);
8002 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8003 (count / size_needed) * size_needed);
8004 }
8005 else
8006 {
8007 if (!issetmem)
8008 src = change_address (src, BLKmode, srcreg);
8009 dst = change_address (dst, BLKmode, destreg);
8010 }
8011
8012 /* Step 4: Epilogue to copy the remaining bytes. */
8013 epilogue:
8014 if (label)
8015 {
8016 /* When the main loop is done, COUNT_EXP might hold the original count,
8017 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
8018 Epilogue code would actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
8019 bytes. Compensate if needed. */
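/* Worked example (illustrative): with an original count of 100 and
   size_needed == 32 the main loop handles 96 bytes, so only
   100 & 31 == 4 bytes remain; masking COUNT_EXP with size_needed - 1
   keeps the epilogue from copying more than that remainder.  */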
8020
8021 if (size_needed < epilogue_size_needed)
8022 {
8023 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8024 GEN_INT (size_needed - 1), count_exp, 1,
8025 OPTAB_DIRECT);
8026 if (tmp != count_exp)
8027 emit_move_insn (count_exp, tmp);
8028 }
8029 emit_label (label);
8030 LABEL_NUSES (label) = 1;
8031 }
8032
8033 if (count_exp != const0_rtx && epilogue_size_needed > 1)
8034 {
8035 if (force_loopy_epilogue)
8036 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8037 epilogue_size_needed);
8038 else
8039 {
8040 if (issetmem)
8041 expand_setmem_epilogue (dst, destreg, promoted_val,
8042 vec_promoted_val, count_exp,
8043 epilogue_size_needed);
8044 else
76715c32 8045 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
2bf6d935
ML
8046 epilogue_size_needed);
8047 }
8048 }
8049 if (jump_around_label)
8050 emit_label (jump_around_label);
8051 return true;
8052}
8053
3edc21af
L
8054/* Expand cmpstrn or memcmp. */
8055
8056bool
8057ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
8058 rtx length, rtx align, bool is_cmpstrn)
8059{
4052c05e
L
8060 /* Expand strncmp and memcmp only with -minline-all-stringops since
8061 "repz cmpsb" can be much slower than strncmp and memcmp functions
8062 implemented with vector instructions, see
8063
8064 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8065 */
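/* Usage sketch (illustrative assumption, not part of the original sources):
   compiling with -O2 -minline-all-stringops allows a call such as
   memcmp (a, b, 16) to be expanded here to "repz cmpsb" instead of a
   library call.  */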
8066 if (!TARGET_INLINE_ALL_STRINGOPS)
3edc21af
L
8067 return false;
8068
8069 /* Can't use this if the user has appropriated ecx, esi or edi. */
8070 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
8071 return false;
8072
8073 if (is_cmpstrn)
8074 {
8075 /* For strncmp, length is the maximum length, which can be larger
8076 than actual string lengths. We can expand the cmpstrn pattern
8077 to "repz cmpsb" only if one of the strings is a constant so
8078 that expand_builtin_strncmp() can write the length argument to
8079 be the minimum of the const string length and the actual length
8080 argument. Otherwise, "repz cmpsb" may scan past the terminating 0 byte. */
8081 tree t1 = MEM_EXPR (src1);
8082 tree t2 = MEM_EXPR (src2);
8083 if (!((t1 && TREE_CODE (t1) == MEM_REF
8084 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
8085 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
8086 == STRING_CST))
8087 || (t2 && TREE_CODE (t2) == MEM_REF
8088 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
8089 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
8090 == STRING_CST))))
8091 return false;
8092 }
3edc21af
L
8093
8094 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
8095 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
8096 if (addr1 != XEXP (src1, 0))
8097 src1 = replace_equiv_address_nv (src1, addr1);
8098 if (addr2 != XEXP (src2, 0))
8099 src2 = replace_equiv_address_nv (src2, addr2);
8100
8101 /* NB: Make a copy of the data length to avoid changing the original
8102 data length by cmpstrnqi patterns. */
8103 length = ix86_zero_extend_to_Pmode (length);
8104 rtx lengthreg = gen_reg_rtx (Pmode);
8105 emit_move_insn (lengthreg, length);
8106
8107 /* If we are testing strict equality, we can use known alignment to
8108 good advantage. This may be possible with combine, particularly
8109 once cc0 is dead. */
8110 if (CONST_INT_P (length))
8111 {
8112 if (length == const0_rtx)
8113 {
8114 emit_move_insn (result, const0_rtx);
8115 return true;
8116 }
8117 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
8118 src1, src2));
8119 }
8120 else
8121 {
8122 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
8123 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
8124 src1, src2));
8125 }
8126
8127 rtx out = gen_lowpart (QImode, result);
8128 emit_insn (gen_cmpintqi (out));
8129 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
8130
8131 return true;
8132}
2bf6d935
ML
8133
8134/* Expand the appropriate insns for doing strlen if not just doing
8135 repnz; scasb
8136
8137 out = result, initialized with the start address
8138 align_rtx = alignment of the address.
8139 scratch = scratch register, initialized with the start address when
8140 not aligned, otherwise undefined
8141
8142 This is just the body. It needs the initializations mentioned above and
8143 some address computation at the end. These things are done in i386.md. */
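/* Rough shape of the generated code (an illustrative sketch, not the exact
   RTL): compare single bytes against zero until OUT is 4-byte aligned, then
   loop reading an SImode word and testing
   (word - 0x01010101) & ~word & 0x80808080 to detect a zero byte, and
   finally adjust OUT back to point at the terminating zero inside that
   word.  */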
8144
8145static void
8146ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
8147{
8148 int align;
8149 rtx tmp;
8150 rtx_code_label *align_2_label = NULL;
8151 rtx_code_label *align_3_label = NULL;
8152 rtx_code_label *align_4_label = gen_label_rtx ();
8153 rtx_code_label *end_0_label = gen_label_rtx ();
8154 rtx mem;
8155 rtx tmpreg = gen_reg_rtx (SImode);
8156 rtx scratch = gen_reg_rtx (SImode);
8157 rtx cmp;
8158
8159 align = 0;
8160 if (CONST_INT_P (align_rtx))
8161 align = INTVAL (align_rtx);
8162
8163 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
8164
8165 /* Is there a known alignment and is it less than 4? */
8166 if (align < 4)
8167 {
8168 rtx scratch1 = gen_reg_rtx (Pmode);
8169 emit_move_insn (scratch1, out);
8170 /* Is there a known alignment and is it not 2? */
8171 if (align != 2)
8172 {
8173 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
8174 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
8175
8176 /* Leave just the 3 lower bits. */
8177 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
8178 NULL_RTX, 0, OPTAB_WIDEN);
8179
8180 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8181 Pmode, 1, align_4_label);
8182 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
8183 Pmode, 1, align_2_label);
8184 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
8185 Pmode, 1, align_3_label);
8186 }
8187 else
8188 {
8189 /* Since the alignment is 2, we have to check 2 or 0 bytes;
8190 check whether it is aligned to 4 bytes. */
8191
8192 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
8193 NULL_RTX, 0, OPTAB_WIDEN);
8194
8195 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
8196 Pmode, 1, align_4_label);
8197 }
8198
8199 mem = change_address (src, QImode, out);
8200
8201 /* Now compare the bytes. */
8202
8203 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
8204 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
8205 QImode, 1, end_0_label);
8206
8207 /* Increment the address. */
d9330fb5 8208 emit_insn (gen_add2_insn (out, const1_rtx));
2bf6d935
ML
8209
8210 /* Not needed with an alignment of 2 */
8211 if (align != 2)
8212 {
8213 emit_label (align_2_label);
8214
8215 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
8216 end_0_label);
8217
d9330fb5 8218 emit_insn (gen_add2_insn (out, const1_rtx));
2bf6d935
ML
8219
8220 emit_label (align_3_label);
8221 }
8222
8223 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
8224 end_0_label);
8225
d9330fb5 8226 emit_insn (gen_add2_insn (out, const1_rtx));
2bf6d935
ML
8227 }
8228
8229 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
8230 align this loop; that only makes the program larger and does not help
8231 speed it up. */
8232 emit_label (align_4_label);
8233
8234 mem = change_address (src, SImode, out);
8235 emit_move_insn (scratch, mem);
d9330fb5 8236 emit_insn (gen_add2_insn (out, GEN_INT (4)));
2bf6d935
ML
8237
8238 /* This formula yields a nonzero result iff one of the bytes is zero.
8239 This saves three branches inside the loop and many cycles. */
8240
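/* Worked example (illustrative): for scratch == 0x12005634 the instructions
   below compute (0x12005634 - 0x01010101) & ~0x12005634 & 0x80808080
   == 0x10ff5533 & 0xedffa9cb & 0x80808080 == 0x00800000, flagging the
   zero byte; for 0x12345678 the same expression is 0.  */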
8241 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
8242 emit_insn (gen_one_cmplsi2 (scratch, scratch));
8243 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
8244 emit_insn (gen_andsi3 (tmpreg, tmpreg,
8245 gen_int_mode (0x80808080, SImode)));
8246 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
8247 align_4_label);
8248
8249 if (TARGET_CMOVE)
8250 {
8251 rtx reg = gen_reg_rtx (SImode);
8252 rtx reg2 = gen_reg_rtx (Pmode);
8253 emit_move_insn (reg, tmpreg);
8254 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
8255
8256 /* If zero is not in the first two bytes, move two bytes forward. */
8257 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
8258 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
8259 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
8260 emit_insn (gen_rtx_SET (tmpreg,
8261 gen_rtx_IF_THEN_ELSE (SImode, tmp,
8262 reg,
8263 tmpreg)));
8264 /* Emit lea manually to avoid clobbering of flags. */
c3185b64 8265 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
2bf6d935
ML
8266
8267 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
8268 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
8269 emit_insn (gen_rtx_SET (out,
8270 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
8271 reg2,
8272 out)));
8273 }
8274 else
8275 {
8276 rtx_code_label *end_2_label = gen_label_rtx ();
8277 /* Is zero in the first two bytes? */
8278
8279 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
8280 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
8281 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
8282 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
8283 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
8284 pc_rtx);
8285 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
8286 JUMP_LABEL (tmp) = end_2_label;
8287
8288 /* Not in the first two. Move two bytes forward. */
8289 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
d9330fb5 8290 emit_insn (gen_add2_insn (out, const2_rtx));
2bf6d935
ML
8291
8292 emit_label (end_2_label);
8293
8294 }
8295
8296 /* Avoid branch in fixing the byte. */
8297 tmpreg = gen_lowpart (QImode, tmpreg);
8298 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
8299 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
8300 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
d9330fb5 8301 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
2bf6d935
ML
8302
8303 emit_label (end_0_label);
8304}
8305
8306/* Expand strlen. */
8307
8308bool
8309ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
8310{
8311 if (TARGET_UNROLL_STRLEN
8312 && TARGET_INLINE_ALL_STRINGOPS
8313 && eoschar == const0_rtx
8314 && optimize > 1)
8315 {
8316 /* The generic case of the strlen expander is long. Avoid expanding
8317 it unless TARGET_INLINE_ALL_STRINGOPS. */
8318 rtx addr = force_reg (Pmode, XEXP (src, 0));
8319 /* Well, it seems that some optimizer does not combine a call like
8320 foo(strlen(bar), strlen(bar));
8321 when the move and the subtraction are done here. It does calculate
8322 the length just once when these instructions are done inside
8323 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
8324 often used and I use one fewer register for the lifetime of
8325 output_strlen_unroll(), this is better. */
8326
8327 emit_move_insn (out, addr);
8328
8329 ix86_expand_strlensi_unroll_1 (out, src, align);
8330
8331 /* strlensi_unroll_1 returns the address of the zero at the end of
8332 the string, like memchr(), so compute the length by subtracting
8333 the start address. */
d9330fb5 8334 emit_insn (gen_sub2_insn (out, addr));
2bf6d935
ML
8335 return true;
8336 }
8337 else
8338 return false;
8339}
8340
8341/* For a given symbol (function), construct code to compute the address of its
8342 PLT entry in the large x86-64 PIC model. */
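/* Illustrative sketch (an assumption about the emitted shape, not literal
   output): in the large PIC model this amounts to

	movabs $symbol@PLTOFF, %tmp
	add    %<pic register>, %tmp

   i.e. the PLT entry address is the PLTOFF constant plus the PIC base.  */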
8343
8344static rtx
8345construct_plt_address (rtx symbol)
8346{
8347 rtx tmp, unspec;
8348
8349 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
8350 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
8351 gcc_assert (Pmode == DImode);
8352
8353 tmp = gen_reg_rtx (Pmode);
8354 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
8355
8356 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
d9330fb5 8357 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
2bf6d935
ML
8358 return tmp;
8359}
8360
8361/* Additional registers that are clobbered by SYSV calls. */
8362
8363static int const x86_64_ms_sysv_extra_clobbered_registers
8364 [NUM_X86_64_MS_CLOBBERED_REGS] =
8365{
8366 SI_REG, DI_REG,
8367 XMM6_REG, XMM7_REG,
8368 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
8369 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
8370};
8371
8372rtx_insn *
8373ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
8374 rtx callarg2,
8375 rtx pop, bool sibcall)
8376{
8377 rtx vec[3];
8378 rtx use = NULL, call;
8379 unsigned int vec_len = 0;
8380 tree fndecl;
8381
8382 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8383 {
8384 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
8385 if (fndecl
8386 && (lookup_attribute ("interrupt",
8387 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
a9c697b8 8388 error ("interrupt service routine cannot be called directly");
2bf6d935
ML
8389 }
8390 else
8391 fndecl = NULL_TREE;
8392
8393 if (pop == const0_rtx)
8394 pop = NULL;
8395 gcc_assert (!TARGET_64BIT || !pop);
8396
41bd1b19 8397 rtx addr = XEXP (fnaddr, 0);
2bf6d935
ML
8398 if (TARGET_MACHO && !TARGET_64BIT)
8399 {
8400#if TARGET_MACHO
8401 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8402 fnaddr = machopic_indirect_call_target (fnaddr);
8403#endif
8404 }
8405 else
8406 {
8407 /* Static functions and indirect calls don't need the pic register. Also,
8408 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute,
8409 making it an indirect call. */
2bf6d935
ML
8410 if (flag_pic
8411 && GET_CODE (addr) == SYMBOL_REF
8412 && !SYMBOL_REF_LOCAL_P (addr))
8413 {
8414 if (flag_plt
8415 && (SYMBOL_REF_DECL (addr) == NULL_TREE
8416 || !lookup_attribute ("noplt",
8417 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8418 {
8419 if (!TARGET_64BIT
8420 || (ix86_cmodel == CM_LARGE_PIC
8421 && DEFAULT_ABI != MS_ABI))
8422 {
8423 use_reg (&use, gen_rtx_REG (Pmode,
8424 REAL_PIC_OFFSET_TABLE_REGNUM));
8425 if (ix86_use_pseudo_pic_reg ())
8426 emit_move_insn (gen_rtx_REG (Pmode,
8427 REAL_PIC_OFFSET_TABLE_REGNUM),
8428 pic_offset_table_rtx);
8429 }
8430 }
8431 else if (!TARGET_PECOFF && !TARGET_MACHO)
8432 {
69157fe7
JJ
8433 if (TARGET_64BIT
8434 && ix86_cmodel == CM_LARGE_PIC
8435 && DEFAULT_ABI != MS_ABI)
8436 {
8437 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8438 UNSPEC_GOT);
8439 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8440 fnaddr = force_reg (Pmode, fnaddr);
8441 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
8442 }
8443 else if (TARGET_64BIT)
2bf6d935
ML
8444 {
8445 fnaddr = gen_rtx_UNSPEC (Pmode,
8446 gen_rtvec (1, addr),
8447 UNSPEC_GOTPCREL);
8448 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8449 }
8450 else
8451 {
8452 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8453 UNSPEC_GOT);
8454 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8455 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8456 fnaddr);
8457 }
8458 fnaddr = gen_const_mem (Pmode, fnaddr);
8459 /* Pmode may not be the same as word_mode for x32, which
8460 doesn't support indirect branch via 32-bit memory slot.
8461 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8462 indirect branch via x32 GOT slot is OK. */
8463 if (GET_MODE (fnaddr) != word_mode)
8464 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8465 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8466 }
8467 }
8468 }
8469
8470 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8471 parameters passed in vector registers. */
8472 if (TARGET_64BIT
8473 && (INTVAL (callarg2) > 0
8474 || (INTVAL (callarg2) == 0
8475 && (TARGET_SSE || !flag_skip_rax_setup))))
8476 {
8477 rtx al = gen_rtx_REG (QImode, AX_REG);
8478 emit_move_insn (al, callarg2);
8479 use_reg (&use, al);
8480 }
8481
8482 if (ix86_cmodel == CM_LARGE_PIC
8483 && !TARGET_PECOFF
8484 && MEM_P (fnaddr)
8485 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8486 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8487 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8488 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8489 branch via x32 GOT slot is OK. */
8490 else if (!(TARGET_X32
8491 && MEM_P (fnaddr)
8492 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8493 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8494 && (sibcall
8495 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8496 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8497 {
8498 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8499 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8500 }
8501
8502 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8503
8504 if (retval)
8505 call = gen_rtx_SET (retval, call);
8506 vec[vec_len++] = call;
8507
8508 if (pop)
8509 {
8510 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8511 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8512 vec[vec_len++] = pop;
8513 }
8514
8515 if (cfun->machine->no_caller_saved_registers
8516 && (!fndecl
8517 || (!TREE_THIS_VOLATILE (fndecl)
8518 && !lookup_attribute ("no_caller_saved_registers",
8519 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8520 {
8521 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8522 bool is_64bit_ms_abi = (TARGET_64BIT
8523 && ix86_function_abi (fndecl) == MS_ABI);
8524 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8525
8526 /* If there are no caller-saved registers, add all registers
8527 that are clobbered by the call which returns. */
8528 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8529 if (!fixed_regs[i]
8530 && (ix86_call_used_regs[i] == 1
8531 || (ix86_call_used_regs[i] & c_mask))
8532 && !STACK_REGNO_P (i)
8533 && !MMX_REGNO_P (i))
8534 clobber_reg (&use,
8535 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8536 }
8537 else if (TARGET_64BIT_MS_ABI
8538 && (!callarg2 || INTVAL (callarg2) != -2))
8539 {
8540 unsigned i;
8541
8542 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8543 {
8544 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8545 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8546
8547 clobber_reg (&use, gen_rtx_REG (mode, regno));
8548 }
8549
8550 /* Set here, but it may get cleared later. */
8551 if (TARGET_CALL_MS2SYSV_XLOGUES)
8552 {
8553 if (!TARGET_SSE)
8554 ;
8555
8556 /* Don't break hot-patched functions. */
8557 else if (ix86_function_ms_hook_prologue (current_function_decl))
8558 ;
8559
8560 /* TODO: Cases not yet examined. */
8561 else if (flag_split_stack)
8562 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8563
8564 else
8565 {
8566 gcc_assert (!reload_completed);
8567 cfun->machine->call_ms2sysv = true;
8568 }
8569 }
8570 }
8571
41bd1b19
IS
8572 if (TARGET_MACHO && TARGET_64BIT && !sibcall
8573 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
8574 || !fndecl || TREE_PUBLIC (fndecl)))
8575 {
8576 /* We allow public functions defined in a TU to bind locally for PIC
8577 code (the default) on 64bit Mach-O.
8578 If such functions are not inlined, we cannot tell at compile-time if
8579 they will be called via the lazy symbol resolver (this can depend on
8580 options given at link-time). Therefore, we must assume that the lazy
8581 resolver could be used which clobbers R11 and R10. */
8582 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
8583 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
8584 }
8585
2bf6d935
ML
8586 if (vec_len > 1)
8587 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8588 rtx_insn *call_insn = emit_call_insn (call);
8589 if (use)
8590 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8591
8592 return call_insn;
8593}
8594
8595/* Split a simple return that pops POPC bytes from the stack into an indirect
8596 branch with a stack adjustment. */
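/* Illustrative shape of the split (an assumption, not literal output):

	pop	%ecx		; fetch the return address
	add	$POPC, %esp	; drop the popped argument bytes
	jmp	*%ecx		; return

   which mirrors the pop, stack adjustment and indirect jump emitted
   below.  */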
8597
8598void
8599ix86_split_simple_return_pop_internal (rtx popc)
8600{
8601 struct machine_function *m = cfun->machine;
8602 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8603 rtx_insn *insn;
8604
8605 /* There is no "pascal" calling convention in any 64bit ABI. */
8606 gcc_assert (!TARGET_64BIT);
8607
8608 insn = emit_insn (gen_pop (ecx));
8609 m->fs.cfa_offset -= UNITS_PER_WORD;
8610 m->fs.sp_offset -= UNITS_PER_WORD;
8611
8612 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8613 x = gen_rtx_SET (stack_pointer_rtx, x);
8614 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8615 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8616 RTX_FRAME_RELATED_P (insn) = 1;
8617
8618 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8619 x = gen_rtx_SET (stack_pointer_rtx, x);
8620 insn = emit_insn (x);
8621 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8622 RTX_FRAME_RELATED_P (insn) = 1;
8623
8624 /* Now return address is in ECX. */
8625 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8626}
8627
8628/* Errors in the source file can cause expand_expr to return const0_rtx
8629 where we expect a vector. To avoid crashing, use one of the vector
8630 clear instructions. */
8631
8632static rtx
8633safe_vector_operand (rtx x, machine_mode mode)
8634{
8635 if (x == const0_rtx)
8636 x = CONST0_RTX (mode);
8637 return x;
8638}
8639
8640/* Subroutine of ix86_expand_builtin to take care of binop insns. */
8641
8642static rtx
8643ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8644{
8645 rtx pat;
8646 tree arg0 = CALL_EXPR_ARG (exp, 0);
8647 tree arg1 = CALL_EXPR_ARG (exp, 1);
8648 rtx op0 = expand_normal (arg0);
8649 rtx op1 = expand_normal (arg1);
8650 machine_mode tmode = insn_data[icode].operand[0].mode;
8651 machine_mode mode0 = insn_data[icode].operand[1].mode;
8652 machine_mode mode1 = insn_data[icode].operand[2].mode;
8653
8654 if (VECTOR_MODE_P (mode0))
8655 op0 = safe_vector_operand (op0, mode0);
8656 if (VECTOR_MODE_P (mode1))
8657 op1 = safe_vector_operand (op1, mode1);
8658
8659 if (optimize || !target
8660 || GET_MODE (target) != tmode
8661 || !insn_data[icode].operand[0].predicate (target, tmode))
8662 target = gen_reg_rtx (tmode);
8663
8664 if (GET_MODE (op1) == SImode && mode1 == TImode)
8665 {
8666 rtx x = gen_reg_rtx (V4SImode);
8667 emit_insn (gen_sse2_loadd (x, op1));
8668 op1 = gen_lowpart (TImode, x);
8669 }
8670
8671 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8672 op0 = copy_to_mode_reg (mode0, op0);
8673 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8674 op1 = copy_to_mode_reg (mode1, op1);
8675
8676 pat = GEN_FCN (icode) (target, op0, op1);
8677 if (! pat)
8678 return 0;
8679
8680 emit_insn (pat);
8681
8682 return target;
8683}
8684
8685/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8686
8687static rtx
8688ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8689 enum ix86_builtin_func_type m_type,
8690 enum rtx_code sub_code)
8691{
8692 rtx pat;
715a8bc8 8693 unsigned int i, nargs;
2bf6d935
ML
8694 bool comparison_p = false;
8695 bool tf_p = false;
8696 bool last_arg_constant = false;
8697 int num_memory = 0;
715a8bc8 8698 rtx xops[4];
2bf6d935
ML
8699
8700 machine_mode tmode = insn_data[icode].operand[0].mode;
8701
8702 switch (m_type)
8703 {
8704 case MULTI_ARG_4_DF2_DI_I:
8705 case MULTI_ARG_4_DF2_DI_I1:
8706 case MULTI_ARG_4_SF2_SI_I:
8707 case MULTI_ARG_4_SF2_SI_I1:
8708 nargs = 4;
8709 last_arg_constant = true;
8710 break;
8711
8712 case MULTI_ARG_3_SF:
8713 case MULTI_ARG_3_DF:
8714 case MULTI_ARG_3_SF2:
8715 case MULTI_ARG_3_DF2:
8716 case MULTI_ARG_3_DI:
8717 case MULTI_ARG_3_SI:
8718 case MULTI_ARG_3_SI_DI:
8719 case MULTI_ARG_3_HI:
8720 case MULTI_ARG_3_HI_SI:
8721 case MULTI_ARG_3_QI:
8722 case MULTI_ARG_3_DI2:
8723 case MULTI_ARG_3_SI2:
8724 case MULTI_ARG_3_HI2:
8725 case MULTI_ARG_3_QI2:
8726 nargs = 3;
8727 break;
8728
8729 case MULTI_ARG_2_SF:
8730 case MULTI_ARG_2_DF:
8731 case MULTI_ARG_2_DI:
8732 case MULTI_ARG_2_SI:
8733 case MULTI_ARG_2_HI:
8734 case MULTI_ARG_2_QI:
8735 nargs = 2;
8736 break;
8737
8738 case MULTI_ARG_2_DI_IMM:
8739 case MULTI_ARG_2_SI_IMM:
8740 case MULTI_ARG_2_HI_IMM:
8741 case MULTI_ARG_2_QI_IMM:
8742 nargs = 2;
8743 last_arg_constant = true;
8744 break;
8745
8746 case MULTI_ARG_1_SF:
8747 case MULTI_ARG_1_DF:
8748 case MULTI_ARG_1_SF2:
8749 case MULTI_ARG_1_DF2:
8750 case MULTI_ARG_1_DI:
8751 case MULTI_ARG_1_SI:
8752 case MULTI_ARG_1_HI:
8753 case MULTI_ARG_1_QI:
8754 case MULTI_ARG_1_SI_DI:
8755 case MULTI_ARG_1_HI_DI:
8756 case MULTI_ARG_1_HI_SI:
8757 case MULTI_ARG_1_QI_DI:
8758 case MULTI_ARG_1_QI_SI:
8759 case MULTI_ARG_1_QI_HI:
8760 nargs = 1;
8761 break;
8762
8763 case MULTI_ARG_2_DI_CMP:
8764 case MULTI_ARG_2_SI_CMP:
8765 case MULTI_ARG_2_HI_CMP:
8766 case MULTI_ARG_2_QI_CMP:
8767 nargs = 2;
8768 comparison_p = true;
8769 break;
8770
8771 case MULTI_ARG_2_SF_TF:
8772 case MULTI_ARG_2_DF_TF:
8773 case MULTI_ARG_2_DI_TF:
8774 case MULTI_ARG_2_SI_TF:
8775 case MULTI_ARG_2_HI_TF:
8776 case MULTI_ARG_2_QI_TF:
8777 nargs = 2;
8778 tf_p = true;
8779 break;
8780
8781 default:
8782 gcc_unreachable ();
8783 }
8784
8785 if (optimize || !target
8786 || GET_MODE (target) != tmode
8787 || !insn_data[icode].operand[0].predicate (target, tmode))
8788 target = gen_reg_rtx (tmode);
8789 else if (memory_operand (target, tmode))
8790 num_memory++;
8791
715a8bc8 8792 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
8793
8794 for (i = 0; i < nargs; i++)
8795 {
8796 tree arg = CALL_EXPR_ARG (exp, i);
8797 rtx op = expand_normal (arg);
8798 int adjust = (comparison_p) ? 1 : 0;
8799 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8800
8801 if (last_arg_constant && i == nargs - 1)
8802 {
8803 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8804 {
8805 enum insn_code new_icode = icode;
8806 switch (icode)
8807 {
8808 case CODE_FOR_xop_vpermil2v2df3:
8809 case CODE_FOR_xop_vpermil2v4sf3:
8810 case CODE_FOR_xop_vpermil2v4df3:
8811 case CODE_FOR_xop_vpermil2v8sf3:
8812 error ("the last argument must be a 2-bit immediate");
8813 return gen_reg_rtx (tmode);
8814 case CODE_FOR_xop_rotlv2di3:
8815 new_icode = CODE_FOR_rotlv2di3;
8816 goto xop_rotl;
8817 case CODE_FOR_xop_rotlv4si3:
8818 new_icode = CODE_FOR_rotlv4si3;
8819 goto xop_rotl;
8820 case CODE_FOR_xop_rotlv8hi3:
8821 new_icode = CODE_FOR_rotlv8hi3;
8822 goto xop_rotl;
8823 case CODE_FOR_xop_rotlv16qi3:
8824 new_icode = CODE_FOR_rotlv16qi3;
8825 xop_rotl:
8826 if (CONST_INT_P (op))
8827 {
8828 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8829 op = GEN_INT (INTVAL (op) & mask);
8830 gcc_checking_assert
8831 (insn_data[icode].operand[i + 1].predicate (op, mode));
8832 }
8833 else
8834 {
8835 gcc_checking_assert
8836 (nargs == 2
8837 && insn_data[new_icode].operand[0].mode == tmode
8838 && insn_data[new_icode].operand[1].mode == tmode
8839 && insn_data[new_icode].operand[2].mode == mode
8840 && insn_data[new_icode].operand[0].predicate
8841 == insn_data[icode].operand[0].predicate
8842 && insn_data[new_icode].operand[1].predicate
8843 == insn_data[icode].operand[1].predicate);
8844 icode = new_icode;
8845 goto non_constant;
8846 }
8847 break;
8848 default:
8849 gcc_unreachable ();
8850 }
8851 }
8852 }
8853 else
8854 {
8855 non_constant:
8856 if (VECTOR_MODE_P (mode))
8857 op = safe_vector_operand (op, mode);
8858
8859 /* If we aren't optimizing, only allow one memory operand to be
8860 generated. */
8861 if (memory_operand (op, mode))
8862 num_memory++;
8863
8864 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8865
8866 if (optimize
8867 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8868 || num_memory > 1)
8869 op = force_reg (mode, op);
8870 }
8871
715a8bc8 8872 xops[i] = op;
2bf6d935
ML
8873 }
8874
8875 switch (nargs)
8876 {
8877 case 1:
715a8bc8 8878 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
8879 break;
8880
8881 case 2:
8882 if (tf_p)
715a8bc8 8883 pat = GEN_FCN (icode) (target, xops[0], xops[1],
2bf6d935
ML
8884 GEN_INT ((int)sub_code));
8885 else if (! comparison_p)
715a8bc8 8886 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
8887 else
8888 {
8889 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
715a8bc8 8890 xops[0], xops[1]);
2bf6d935 8891
715a8bc8 8892 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
2bf6d935
ML
8893 }
8894 break;
8895
8896 case 3:
715a8bc8 8897 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
8898 break;
8899
8900 case 4:
715a8bc8 8901 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
2bf6d935
ML
8902 break;
8903
8904 default:
8905 gcc_unreachable ();
8906 }
8907
8908 if (! pat)
8909 return 0;
8910
8911 emit_insn (pat);
8912 return target;
8913}
8914
8915/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8916 insns with vec_merge. */
8917
8918static rtx
8919ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8920 rtx target)
8921{
8922 rtx pat;
8923 tree arg0 = CALL_EXPR_ARG (exp, 0);
8924 rtx op1, op0 = expand_normal (arg0);
8925 machine_mode tmode = insn_data[icode].operand[0].mode;
8926 machine_mode mode0 = insn_data[icode].operand[1].mode;
8927
8928 if (optimize || !target
8929 || GET_MODE (target) != tmode
8930 || !insn_data[icode].operand[0].predicate (target, tmode))
8931 target = gen_reg_rtx (tmode);
8932
8933 if (VECTOR_MODE_P (mode0))
8934 op0 = safe_vector_operand (op0, mode0);
8935
8936 if ((optimize && !register_operand (op0, mode0))
8937 || !insn_data[icode].operand[1].predicate (op0, mode0))
8938 op0 = copy_to_mode_reg (mode0, op0);
8939
8940 op1 = op0;
8941 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8942 op1 = copy_to_mode_reg (mode0, op1);
8943
8944 pat = GEN_FCN (icode) (target, op0, op1);
8945 if (! pat)
8946 return 0;
8947 emit_insn (pat);
8948 return target;
8949}
8950
8951/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8952
8953static rtx
8954ix86_expand_sse_compare (const struct builtin_description *d,
8955 tree exp, rtx target, bool swap)
8956{
8957 rtx pat;
8958 tree arg0 = CALL_EXPR_ARG (exp, 0);
8959 tree arg1 = CALL_EXPR_ARG (exp, 1);
8960 rtx op0 = expand_normal (arg0);
8961 rtx op1 = expand_normal (arg1);
8962 rtx op2;
8963 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8964 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8965 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8966 enum rtx_code comparison = d->comparison;
8967
8968 if (VECTOR_MODE_P (mode0))
8969 op0 = safe_vector_operand (op0, mode0);
8970 if (VECTOR_MODE_P (mode1))
8971 op1 = safe_vector_operand (op1, mode1);
8972
8973 /* Swap operands if we have a comparison that isn't available in
8974 hardware. */
8975 if (swap)
8976 std::swap (op0, op1);
8977
8978 if (optimize || !target
8979 || GET_MODE (target) != tmode
8980 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8981 target = gen_reg_rtx (tmode);
8982
8983 if ((optimize && !register_operand (op0, mode0))
8984 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8985 op0 = copy_to_mode_reg (mode0, op0);
8986 if ((optimize && !register_operand (op1, mode1))
8987 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8988 op1 = copy_to_mode_reg (mode1, op1);
8989
8990 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8991 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8992 if (! pat)
8993 return 0;
8994 emit_insn (pat);
8995 return target;
8996}
8997
8998/* Subroutine of ix86_expand_builtin to take care of comi insns. */
8999
9000static rtx
9001ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
9002 rtx target)
9003{
9004 rtx pat;
9005 tree arg0 = CALL_EXPR_ARG (exp, 0);
9006 tree arg1 = CALL_EXPR_ARG (exp, 1);
9007 rtx op0 = expand_normal (arg0);
9008 rtx op1 = expand_normal (arg1);
9009 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9010 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9011 enum rtx_code comparison = d->comparison;
9012
9013 if (VECTOR_MODE_P (mode0))
9014 op0 = safe_vector_operand (op0, mode0);
9015 if (VECTOR_MODE_P (mode1))
9016 op1 = safe_vector_operand (op1, mode1);
9017
2bf6d935
ML
9018 target = gen_reg_rtx (SImode);
9019 emit_move_insn (target, const0_rtx);
9020 target = gen_rtx_SUBREG (QImode, target, 0);
9021
9022 if ((optimize && !register_operand (op0, mode0))
9023 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9024 op0 = copy_to_mode_reg (mode0, op0);
9025 if ((optimize && !register_operand (op1, mode1))
9026 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9027 op1 = copy_to_mode_reg (mode1, op1);
9028
9029 pat = GEN_FCN (d->icode) (op0, op1);
9030 if (! pat)
9031 return 0;
9032 emit_insn (pat);
9033 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9034 gen_rtx_fmt_ee (comparison, QImode,
9035 SET_DEST (pat),
9036 const0_rtx)));
9037
9038 return SUBREG_REG (target);
9039}
9040
9041/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
9042
9043static rtx
9044ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9045 rtx target)
9046{
9047 rtx pat;
9048 tree arg0 = CALL_EXPR_ARG (exp, 0);
9049 rtx op1, op0 = expand_normal (arg0);
9050 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9051 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9052
9053 if (optimize || target == 0
9054 || GET_MODE (target) != tmode
9055 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9056 target = gen_reg_rtx (tmode);
9057
9058 if (VECTOR_MODE_P (mode0))
9059 op0 = safe_vector_operand (op0, mode0);
9060
9061 if ((optimize && !register_operand (op0, mode0))
9062 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9063 op0 = copy_to_mode_reg (mode0, op0);
9064
9065 op1 = GEN_INT (d->comparison);
9066
9067 pat = GEN_FCN (d->icode) (target, op0, op1);
9068 if (! pat)
9069 return 0;
9070 emit_insn (pat);
9071 return target;
9072}
9073
9074static rtx
9075ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
9076 tree exp, rtx target)
9077{
9078 rtx pat;
9079 tree arg0 = CALL_EXPR_ARG (exp, 0);
9080 tree arg1 = CALL_EXPR_ARG (exp, 1);
9081 rtx op0 = expand_normal (arg0);
9082 rtx op1 = expand_normal (arg1);
9083 rtx op2;
9084 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9085 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9086 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9087
9088 if (optimize || target == 0
9089 || GET_MODE (target) != tmode
9090 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9091 target = gen_reg_rtx (tmode);
9092
9093 op0 = safe_vector_operand (op0, mode0);
9094 op1 = safe_vector_operand (op1, mode1);
9095
9096 if ((optimize && !register_operand (op0, mode0))
9097 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9098 op0 = copy_to_mode_reg (mode0, op0);
9099 if ((optimize && !register_operand (op1, mode1))
9100 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9101 op1 = copy_to_mode_reg (mode1, op1);
9102
9103 op2 = GEN_INT (d->comparison);
9104
9105 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9106 if (! pat)
9107 return 0;
9108 emit_insn (pat);
9109 return target;
9110}
9111
9112/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
9113
9114static rtx
9115ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
9116 rtx target)
9117{
9118 rtx pat;
9119 tree arg0 = CALL_EXPR_ARG (exp, 0);
9120 tree arg1 = CALL_EXPR_ARG (exp, 1);
9121 rtx op0 = expand_normal (arg0);
9122 rtx op1 = expand_normal (arg1);
9123 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
9124 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
9125 enum rtx_code comparison = d->comparison;
9126
9127 if (VECTOR_MODE_P (mode0))
9128 op0 = safe_vector_operand (op0, mode0);
9129 if (VECTOR_MODE_P (mode1))
9130 op1 = safe_vector_operand (op1, mode1);
9131
9132 target = gen_reg_rtx (SImode);
9133 emit_move_insn (target, const0_rtx);
9134 target = gen_rtx_SUBREG (QImode, target, 0);
9135
9136 if ((optimize && !register_operand (op0, mode0))
9137 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9138 op0 = copy_to_mode_reg (mode0, op0);
9139 if ((optimize && !register_operand (op1, mode1))
9140 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9141 op1 = copy_to_mode_reg (mode1, op1);
9142
9143 pat = GEN_FCN (d->icode) (op0, op1);
9144 if (! pat)
9145 return 0;
9146 emit_insn (pat);
9147 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9148 gen_rtx_fmt_ee (comparison, QImode,
9149 SET_DEST (pat),
9150 const0_rtx)));
9151
9152 return SUBREG_REG (target);
9153}
9154
9155/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
9156
9157static rtx
9158ix86_expand_sse_pcmpestr (const struct builtin_description *d,
9159 tree exp, rtx target)
9160{
9161 rtx pat;
9162 tree arg0 = CALL_EXPR_ARG (exp, 0);
9163 tree arg1 = CALL_EXPR_ARG (exp, 1);
9164 tree arg2 = CALL_EXPR_ARG (exp, 2);
9165 tree arg3 = CALL_EXPR_ARG (exp, 3);
9166 tree arg4 = CALL_EXPR_ARG (exp, 4);
9167 rtx scratch0, scratch1;
9168 rtx op0 = expand_normal (arg0);
9169 rtx op1 = expand_normal (arg1);
9170 rtx op2 = expand_normal (arg2);
9171 rtx op3 = expand_normal (arg3);
9172 rtx op4 = expand_normal (arg4);
9173 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
9174
9175 tmode0 = insn_data[d->icode].operand[0].mode;
9176 tmode1 = insn_data[d->icode].operand[1].mode;
9177 modev2 = insn_data[d->icode].operand[2].mode;
9178 modei3 = insn_data[d->icode].operand[3].mode;
9179 modev4 = insn_data[d->icode].operand[4].mode;
9180 modei5 = insn_data[d->icode].operand[5].mode;
9181 modeimm = insn_data[d->icode].operand[6].mode;
9182
9183 if (VECTOR_MODE_P (modev2))
9184 op0 = safe_vector_operand (op0, modev2);
9185 if (VECTOR_MODE_P (modev4))
9186 op2 = safe_vector_operand (op2, modev4);
9187
9188 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
9189 op0 = copy_to_mode_reg (modev2, op0);
9190 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
9191 op1 = copy_to_mode_reg (modei3, op1);
9192 if ((optimize && !register_operand (op2, modev4))
9193 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
9194 op2 = copy_to_mode_reg (modev4, op2);
9195 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
9196 op3 = copy_to_mode_reg (modei5, op3);
9197
9198 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
9199 {
9200 error ("the fifth argument must be an 8-bit immediate");
9201 return const0_rtx;
9202 }
9203
9204 if (d->code == IX86_BUILTIN_PCMPESTRI128)
9205 {
9206 if (optimize || !target
9207 || GET_MODE (target) != tmode0
9208 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
9209 target = gen_reg_rtx (tmode0);
9210
9211 scratch1 = gen_reg_rtx (tmode1);
9212
9213 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
9214 }
9215 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
9216 {
9217 if (optimize || !target
9218 || GET_MODE (target) != tmode1
9219 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
9220 target = gen_reg_rtx (tmode1);
9221
9222 scratch0 = gen_reg_rtx (tmode0);
9223
9224 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
9225 }
9226 else
9227 {
9228 gcc_assert (d->flag);
9229
9230 scratch0 = gen_reg_rtx (tmode0);
9231 scratch1 = gen_reg_rtx (tmode1);
9232
9233 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
9234 }
9235
9236 if (! pat)
9237 return 0;
9238
9239 emit_insn (pat);
9240
9241 if (d->flag)
9242 {
9243 target = gen_reg_rtx (SImode);
9244 emit_move_insn (target, const0_rtx);
9245 target = gen_rtx_SUBREG (QImode, target, 0);
9246
9247 emit_insn
9248 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9249 gen_rtx_fmt_ee (EQ, QImode,
9250 gen_rtx_REG ((machine_mode) d->flag,
9251 FLAGS_REG),
9252 const0_rtx)));
9253 return SUBREG_REG (target);
9254 }
9255 else
9256 return target;
9257}
9258
9259
9260/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
9261
9262static rtx
9263ix86_expand_sse_pcmpistr (const struct builtin_description *d,
9264 tree exp, rtx target)
9265{
9266 rtx pat;
9267 tree arg0 = CALL_EXPR_ARG (exp, 0);
9268 tree arg1 = CALL_EXPR_ARG (exp, 1);
9269 tree arg2 = CALL_EXPR_ARG (exp, 2);
9270 rtx scratch0, scratch1;
9271 rtx op0 = expand_normal (arg0);
9272 rtx op1 = expand_normal (arg1);
9273 rtx op2 = expand_normal (arg2);
9274 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
9275
9276 tmode0 = insn_data[d->icode].operand[0].mode;
9277 tmode1 = insn_data[d->icode].operand[1].mode;
9278 modev2 = insn_data[d->icode].operand[2].mode;
9279 modev3 = insn_data[d->icode].operand[3].mode;
9280 modeimm = insn_data[d->icode].operand[4].mode;
9281
9282 if (VECTOR_MODE_P (modev2))
9283 op0 = safe_vector_operand (op0, modev2);
9284 if (VECTOR_MODE_P (modev3))
9285 op1 = safe_vector_operand (op1, modev3);
9286
9287 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
9288 op0 = copy_to_mode_reg (modev2, op0);
9289 if ((optimize && !register_operand (op1, modev3))
9290 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
9291 op1 = copy_to_mode_reg (modev3, op1);
9292
9293 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
9294 {
9295 error ("the third argument must be an 8-bit immediate");
9296 return const0_rtx;
9297 }
9298
9299 if (d->code == IX86_BUILTIN_PCMPISTRI128)
9300 {
9301 if (optimize || !target
9302 || GET_MODE (target) != tmode0
9303 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
9304 target = gen_reg_rtx (tmode0);
9305
9306 scratch1 = gen_reg_rtx (tmode1);
9307
9308 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
9309 }
9310 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
9311 {
9312 if (optimize || !target
9313 || GET_MODE (target) != tmode1
9314 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
9315 target = gen_reg_rtx (tmode1);
9316
9317 scratch0 = gen_reg_rtx (tmode0);
9318
9319 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
9320 }
9321 else
9322 {
9323 gcc_assert (d->flag);
9324
9325 scratch0 = gen_reg_rtx (tmode0);
9326 scratch1 = gen_reg_rtx (tmode1);
9327
9328 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
9329 }
9330
9331 if (! pat)
9332 return 0;
9333
9334 emit_insn (pat);
9335
9336 if (d->flag)
9337 {
9338 target = gen_reg_rtx (SImode);
9339 emit_move_insn (target, const0_rtx);
9340 target = gen_rtx_SUBREG (QImode, target, 0);
9341
9342 emit_insn
9343 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9344 gen_rtx_fmt_ee (EQ, QImode,
9345 gen_rtx_REG ((machine_mode) d->flag,
9346 FLAGS_REG),
9347 const0_rtx)));
9348 return SUBREG_REG (target);
9349 }
9350 else
9351 return target;
9352}
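/* Editor's note (illustrative sketch, not part of the original sources): for
   the flag-returning string-compare builtins, e.g. __builtin_ia32_pcmpistric128
   behind _mm_cmpistrc, d->flag names the CC mode to test, and the code above
   emits roughly

     (set (reg:SI tmp) (const_int 0))
     (set (strict_low_part (subreg:QI (reg:SI tmp) 0))
          (eq:QI (reg:CCC flags) (const_int 0)))

   so the builtin's SImode result ends up as 0 or 1 according to the selected
   flag; the CCC mode shown here is an assumption made only for the example.  */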
9353
9354/* Fixup modeless constants to fit required mode. */
9355
9356static rtx
9357fixup_modeless_constant (rtx x, machine_mode mode)
9358{
9359 if (GET_MODE (x) == VOIDmode)
9360 x = convert_to_mode (mode, x, 1);
9361 return x;
9362}
9363
9364/* Subroutine of ix86_expand_builtin to take care of insns with
9365 variable number of operands. */
9366
9367static rtx
9368ix86_expand_args_builtin (const struct builtin_description *d,
9369 tree exp, rtx target)
9370{
9371 rtx pat, real_target;
9372 unsigned int i, nargs;
9373 unsigned int nargs_constant = 0;
9374 unsigned int mask_pos = 0;
9375 int num_memory = 0;
715a8bc8 9376 rtx xops[6];
9377 bool second_arg_count = false;
9378 enum insn_code icode = d->icode;
9379 const struct insn_data_d *insn_p = &insn_data[icode];
9380 machine_mode tmode = insn_p->operand[0].mode;
9381 machine_mode rmode = VOIDmode;
9382 bool swap = false;
9383 enum rtx_code comparison = d->comparison;
9384
9385 switch ((enum ix86_builtin_func_type) d->flag)
9386 {
9387 case V2DF_FTYPE_V2DF_ROUND:
9388 case V4DF_FTYPE_V4DF_ROUND:
9389 case V8DF_FTYPE_V8DF_ROUND:
9390 case V4SF_FTYPE_V4SF_ROUND:
9391 case V8SF_FTYPE_V8SF_ROUND:
9392 case V16SF_FTYPE_V16SF_ROUND:
9393 case V4SI_FTYPE_V4SF_ROUND:
9394 case V8SI_FTYPE_V8SF_ROUND:
9395 case V16SI_FTYPE_V16SF_ROUND:
9396 return ix86_expand_sse_round (d, exp, target);
9397 case V4SI_FTYPE_V2DF_V2DF_ROUND:
9398 case V8SI_FTYPE_V4DF_V4DF_ROUND:
9399 case V16SI_FTYPE_V8DF_V8DF_ROUND:
9400 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
9401 case INT_FTYPE_V8SF_V8SF_PTEST:
9402 case INT_FTYPE_V4DI_V4DI_PTEST:
9403 case INT_FTYPE_V4DF_V4DF_PTEST:
9404 case INT_FTYPE_V4SF_V4SF_PTEST:
9405 case INT_FTYPE_V2DI_V2DI_PTEST:
9406 case INT_FTYPE_V2DF_V2DF_PTEST:
9407 return ix86_expand_sse_ptest (d, exp, target);
9408 case FLOAT128_FTYPE_FLOAT128:
9409 case FLOAT_FTYPE_FLOAT:
9410 case INT_FTYPE_INT:
9411 case UINT_FTYPE_UINT:
9412 case UINT16_FTYPE_UINT16:
9413 case UINT64_FTYPE_INT:
9414 case UINT64_FTYPE_UINT64:
9415 case INT64_FTYPE_INT64:
9416 case INT64_FTYPE_V4SF:
9417 case INT64_FTYPE_V2DF:
9418 case INT_FTYPE_V16QI:
9419 case INT_FTYPE_V8QI:
9420 case INT_FTYPE_V8SF:
9421 case INT_FTYPE_V4DF:
9422 case INT_FTYPE_V4SF:
9423 case INT_FTYPE_V2DF:
9424 case INT_FTYPE_V32QI:
9425 case V16QI_FTYPE_V16QI:
9426 case V8SI_FTYPE_V8SF:
9427 case V8SI_FTYPE_V4SI:
9428 case V8HI_FTYPE_V8HI:
9429 case V8HI_FTYPE_V16QI:
9430 case V8QI_FTYPE_V8QI:
9431 case V8SF_FTYPE_V8SF:
9432 case V8SF_FTYPE_V8SI:
9433 case V8SF_FTYPE_V4SF:
9434 case V8SF_FTYPE_V8HI:
9435 case V4SI_FTYPE_V4SI:
9436 case V4SI_FTYPE_V16QI:
9437 case V4SI_FTYPE_V4SF:
9438 case V4SI_FTYPE_V8SI:
9439 case V4SI_FTYPE_V8HI:
9440 case V4SI_FTYPE_V4DF:
9441 case V4SI_FTYPE_V2DF:
9442 case V4HI_FTYPE_V4HI:
9443 case V4DF_FTYPE_V4DF:
9444 case V4DF_FTYPE_V4SI:
9445 case V4DF_FTYPE_V4SF:
9446 case V4DF_FTYPE_V2DF:
9447 case V4SF_FTYPE_V4SF:
9448 case V4SF_FTYPE_V4SI:
9449 case V4SF_FTYPE_V8SF:
9450 case V4SF_FTYPE_V4DF:
9451 case V4SF_FTYPE_V8HI:
9452 case V4SF_FTYPE_V2DF:
9453 case V2DI_FTYPE_V2DI:
9454 case V2DI_FTYPE_V16QI:
9455 case V2DI_FTYPE_V8HI:
9456 case V2DI_FTYPE_V4SI:
9457 case V2DF_FTYPE_V2DF:
9458 case V2DF_FTYPE_V4SI:
9459 case V2DF_FTYPE_V4DF:
9460 case V2DF_FTYPE_V4SF:
9461 case V2DF_FTYPE_V2SI:
9462 case V2SI_FTYPE_V2SI:
9463 case V2SI_FTYPE_V4SF:
9464 case V2SI_FTYPE_V2SF:
9465 case V2SI_FTYPE_V2DF:
9466 case V2SF_FTYPE_V2SF:
9467 case V2SF_FTYPE_V2SI:
9468 case V32QI_FTYPE_V32QI:
9469 case V32QI_FTYPE_V16QI:
9470 case V16HI_FTYPE_V16HI:
9471 case V16HI_FTYPE_V8HI:
9472 case V8SI_FTYPE_V8SI:
9473 case V16HI_FTYPE_V16QI:
9474 case V8SI_FTYPE_V16QI:
9475 case V4DI_FTYPE_V16QI:
9476 case V8SI_FTYPE_V8HI:
9477 case V4DI_FTYPE_V8HI:
9478 case V4DI_FTYPE_V4SI:
9479 case V4DI_FTYPE_V2DI:
9480 case UQI_FTYPE_UQI:
9481 case UHI_FTYPE_UHI:
9482 case USI_FTYPE_USI:
9483 case USI_FTYPE_UQI:
9484 case USI_FTYPE_UHI:
9485 case UDI_FTYPE_UDI:
9486 case UHI_FTYPE_V16QI:
9487 case USI_FTYPE_V32QI:
9488 case UDI_FTYPE_V64QI:
9489 case V16QI_FTYPE_UHI:
9490 case V32QI_FTYPE_USI:
9491 case V64QI_FTYPE_UDI:
9492 case V8HI_FTYPE_UQI:
9493 case V16HI_FTYPE_UHI:
9494 case V32HI_FTYPE_USI:
9495 case V4SI_FTYPE_UQI:
9496 case V8SI_FTYPE_UQI:
9497 case V4SI_FTYPE_UHI:
9498 case V8SI_FTYPE_UHI:
9499 case UQI_FTYPE_V8HI:
9500 case UHI_FTYPE_V16HI:
9501 case USI_FTYPE_V32HI:
9502 case UQI_FTYPE_V4SI:
9503 case UQI_FTYPE_V8SI:
9504 case UHI_FTYPE_V16SI:
9505 case UQI_FTYPE_V2DI:
9506 case UQI_FTYPE_V4DI:
9507 case UQI_FTYPE_V8DI:
9508 case V16SI_FTYPE_UHI:
9509 case V2DI_FTYPE_UQI:
9510 case V4DI_FTYPE_UQI:
9511 case V16SI_FTYPE_INT:
9512 case V16SF_FTYPE_V8SF:
9513 case V16SI_FTYPE_V8SI:
9514 case V16SF_FTYPE_V4SF:
9515 case V16SI_FTYPE_V4SI:
9516 case V16SI_FTYPE_V16SF:
9517 case V16SI_FTYPE_V16SI:
9518 case V64QI_FTYPE_V64QI:
9519 case V32HI_FTYPE_V32HI:
9520 case V16SF_FTYPE_V16SF:
9521 case V8DI_FTYPE_UQI:
9522 case V8DI_FTYPE_V8DI:
9523 case V8DF_FTYPE_V4DF:
9524 case V8DF_FTYPE_V2DF:
9525 case V8DF_FTYPE_V8DF:
9526 case V4DI_FTYPE_V4DI:
9527 case V16HI_FTYPE_V16SF:
9528 case V8HI_FTYPE_V8SF:
9529 case V8HI_FTYPE_V4SF:
9530 nargs = 1;
9531 break;
9532 case V4SF_FTYPE_V4SF_VEC_MERGE:
9533 case V2DF_FTYPE_V2DF_VEC_MERGE:
9534 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9535 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9536 case V16QI_FTYPE_V16QI_V16QI:
9537 case V16QI_FTYPE_V8HI_V8HI:
b96cb2ca 9538 case V16HF_FTYPE_V16HF_V16HF:
9539 case V16SF_FTYPE_V16SF_V16SF:
9540 case V8QI_FTYPE_V8QI_V8QI:
9541 case V8QI_FTYPE_V4HI_V4HI:
9542 case V8HI_FTYPE_V8HI_V8HI:
9543 case V8HI_FTYPE_V16QI_V16QI:
9544 case V8HI_FTYPE_V4SI_V4SI:
b96cb2ca 9545 case V8HF_FTYPE_V8HF_V8HF:
9546 case V8SF_FTYPE_V8SF_V8SF:
9547 case V8SF_FTYPE_V8SF_V8SI:
9548 case V8DF_FTYPE_V8DF_V8DF:
9549 case V4SI_FTYPE_V4SI_V4SI:
9550 case V4SI_FTYPE_V8HI_V8HI:
9551 case V4SI_FTYPE_V2DF_V2DF:
9552 case V4HI_FTYPE_V4HI_V4HI:
9553 case V4HI_FTYPE_V8QI_V8QI:
9554 case V4HI_FTYPE_V2SI_V2SI:
9555 case V4DF_FTYPE_V4DF_V4DF:
9556 case V4DF_FTYPE_V4DF_V4DI:
9557 case V4SF_FTYPE_V4SF_V4SF:
9558 case V4SF_FTYPE_V4SF_V4SI:
9559 case V4SF_FTYPE_V4SF_V2SI:
9560 case V4SF_FTYPE_V4SF_V2DF:
9561 case V4SF_FTYPE_V4SF_UINT:
9562 case V4SF_FTYPE_V4SF_DI:
9563 case V4SF_FTYPE_V4SF_SI:
9564 case V2DI_FTYPE_V2DI_V2DI:
9565 case V2DI_FTYPE_V16QI_V16QI:
9566 case V2DI_FTYPE_V4SI_V4SI:
9567 case V2DI_FTYPE_V2DI_V16QI:
9568 case V2SI_FTYPE_V2SI_V2SI:
9569 case V2SI_FTYPE_V4HI_V4HI:
9570 case V2SI_FTYPE_V2SF_V2SF:
9571 case V2DF_FTYPE_V2DF_V2DF:
9572 case V2DF_FTYPE_V2DF_V4SF:
9573 case V2DF_FTYPE_V2DF_V2DI:
9574 case V2DF_FTYPE_V2DF_DI:
9575 case V2DF_FTYPE_V2DF_SI:
9576 case V2DF_FTYPE_V2DF_UINT:
9577 case V2SF_FTYPE_V2SF_V2SF:
9578 case V1DI_FTYPE_V1DI_V1DI:
9579 case V1DI_FTYPE_V8QI_V8QI:
9580 case V1DI_FTYPE_V2SI_V2SI:
9581 case V32QI_FTYPE_V16HI_V16HI:
9582 case V16HI_FTYPE_V8SI_V8SI:
9583 case V64QI_FTYPE_V64QI_V64QI:
9584 case V32QI_FTYPE_V32QI_V32QI:
9585 case V16HI_FTYPE_V32QI_V32QI:
9586 case V16HI_FTYPE_V16HI_V16HI:
9587 case V8SI_FTYPE_V4DF_V4DF:
9588 case V8SI_FTYPE_V8SI_V8SI:
9589 case V8SI_FTYPE_V16HI_V16HI:
9590 case V4DI_FTYPE_V4DI_V4DI:
9591 case V4DI_FTYPE_V8SI_V8SI:
9592 case V8DI_FTYPE_V64QI_V64QI:
9593 if (comparison == UNKNOWN)
9594 return ix86_expand_binop_builtin (icode, exp, target);
9595 nargs = 2;
9596 break;
9597 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9598 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9599 gcc_assert (comparison != UNKNOWN);
9600 nargs = 2;
9601 swap = true;
9602 break;
9603 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9604 case V16HI_FTYPE_V16HI_SI_COUNT:
9605 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9606 case V8SI_FTYPE_V8SI_SI_COUNT:
9607 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9608 case V4DI_FTYPE_V4DI_INT_COUNT:
9609 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9610 case V8HI_FTYPE_V8HI_SI_COUNT:
9611 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9612 case V4SI_FTYPE_V4SI_SI_COUNT:
9613 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9614 case V4HI_FTYPE_V4HI_SI_COUNT:
9615 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9616 case V2DI_FTYPE_V2DI_SI_COUNT:
9617 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9618 case V2SI_FTYPE_V2SI_SI_COUNT:
9619 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9620 case V1DI_FTYPE_V1DI_SI_COUNT:
9621 nargs = 2;
9622 second_arg_count = true;
9623 break;
9624 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9625 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9626 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9627 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9628 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9629 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9630 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9631 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9632 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9633 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9634 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9635 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9636 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9637 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9638 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9639 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9640 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9641 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9642 nargs = 4;
9643 second_arg_count = true;
9644 break;
9645 case UINT64_FTYPE_UINT64_UINT64:
9646 case UINT_FTYPE_UINT_UINT:
9647 case UINT_FTYPE_UINT_USHORT:
9648 case UINT_FTYPE_UINT_UCHAR:
9649 case UINT16_FTYPE_UINT16_INT:
9650 case UINT8_FTYPE_UINT8_INT:
9651 case UQI_FTYPE_UQI_UQI:
9652 case UHI_FTYPE_UHI_UHI:
9653 case USI_FTYPE_USI_USI:
9654 case UDI_FTYPE_UDI_UDI:
9655 case V16SI_FTYPE_V8DF_V8DF:
9656 case V32HI_FTYPE_V16SF_V16SF:
9657 case V16HI_FTYPE_V8SF_V8SF:
9658 case V8HI_FTYPE_V4SF_V4SF:
9659 case V16HI_FTYPE_V16SF_UHI:
9660 case V8HI_FTYPE_V8SF_UQI:
9661 case V8HI_FTYPE_V4SF_UQI:
9662 nargs = 2;
9663 break;
9664 case V2DI_FTYPE_V2DI_INT_CONVERT:
9665 nargs = 2;
9666 rmode = V1TImode;
9667 nargs_constant = 1;
9668 break;
9669 case V4DI_FTYPE_V4DI_INT_CONVERT:
9670 nargs = 2;
9671 rmode = V2TImode;
9672 nargs_constant = 1;
9673 break;
9674 case V8DI_FTYPE_V8DI_INT_CONVERT:
9675 nargs = 2;
9676 rmode = V4TImode;
9677 nargs_constant = 1;
9678 break;
9679 case V8HI_FTYPE_V8HI_INT:
9680 case V8HI_FTYPE_V8SF_INT:
9681 case V16HI_FTYPE_V16SF_INT:
9682 case V8HI_FTYPE_V4SF_INT:
9683 case V8SF_FTYPE_V8SF_INT:
9684 case V4SF_FTYPE_V16SF_INT:
9685 case V16SF_FTYPE_V16SF_INT:
9686 case V4SI_FTYPE_V4SI_INT:
9687 case V4SI_FTYPE_V8SI_INT:
9688 case V4HI_FTYPE_V4HI_INT:
9689 case V4DF_FTYPE_V4DF_INT:
9690 case V4DF_FTYPE_V8DF_INT:
9691 case V4SF_FTYPE_V4SF_INT:
9692 case V4SF_FTYPE_V8SF_INT:
9693 case V2DI_FTYPE_V2DI_INT:
9694 case V2DF_FTYPE_V2DF_INT:
9695 case V2DF_FTYPE_V4DF_INT:
9696 case V16HI_FTYPE_V16HI_INT:
9697 case V8SI_FTYPE_V8SI_INT:
9698 case V16SI_FTYPE_V16SI_INT:
9699 case V4SI_FTYPE_V16SI_INT:
9700 case V4DI_FTYPE_V4DI_INT:
9701 case V2DI_FTYPE_V4DI_INT:
9702 case V4DI_FTYPE_V8DI_INT:
9703 case UQI_FTYPE_UQI_UQI_CONST:
9704 case UHI_FTYPE_UHI_UQI:
9705 case USI_FTYPE_USI_UQI:
9706 case UDI_FTYPE_UDI_UQI:
9707 nargs = 2;
9708 nargs_constant = 1;
9709 break;
9710 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9711 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9712 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9713 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9714 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9715 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9716 case UHI_FTYPE_V16SI_V16SI_UHI:
9717 case UQI_FTYPE_V8DI_V8DI_UQI:
9718 case V16HI_FTYPE_V16SI_V16HI_UHI:
9719 case V16QI_FTYPE_V16SI_V16QI_UHI:
9720 case V16QI_FTYPE_V8DI_V16QI_UQI:
4204740f 9721 case V32HF_FTYPE_V32HF_V32HF_USI:
9722 case V16SF_FTYPE_V16SF_V16SF_UHI:
9723 case V16SF_FTYPE_V4SF_V16SF_UHI:
9724 case V16SI_FTYPE_SI_V16SI_UHI:
9725 case V16SI_FTYPE_V16HI_V16SI_UHI:
9726 case V16SI_FTYPE_V16QI_V16SI_UHI:
9727 case V8SF_FTYPE_V4SF_V8SF_UQI:
9728 case V4DF_FTYPE_V2DF_V4DF_UQI:
9729 case V8SI_FTYPE_V4SI_V8SI_UQI:
9730 case V8SI_FTYPE_SI_V8SI_UQI:
9731 case V4SI_FTYPE_V4SI_V4SI_UQI:
9732 case V4SI_FTYPE_SI_V4SI_UQI:
9733 case V4DI_FTYPE_V2DI_V4DI_UQI:
9734 case V4DI_FTYPE_DI_V4DI_UQI:
9735 case V2DI_FTYPE_V2DI_V2DI_UQI:
9736 case V2DI_FTYPE_DI_V2DI_UQI:
9737 case V64QI_FTYPE_V64QI_V64QI_UDI:
9738 case V64QI_FTYPE_V16QI_V64QI_UDI:
9739 case V64QI_FTYPE_QI_V64QI_UDI:
9740 case V32QI_FTYPE_V32QI_V32QI_USI:
9741 case V32QI_FTYPE_V16QI_V32QI_USI:
9742 case V32QI_FTYPE_QI_V32QI_USI:
9743 case V16QI_FTYPE_V16QI_V16QI_UHI:
9744 case V16QI_FTYPE_QI_V16QI_UHI:
9745 case V32HI_FTYPE_V8HI_V32HI_USI:
9746 case V32HI_FTYPE_HI_V32HI_USI:
9747 case V16HI_FTYPE_V8HI_V16HI_UHI:
9748 case V16HI_FTYPE_HI_V16HI_UHI:
9749 case V8HI_FTYPE_V8HI_V8HI_UQI:
9750 case V8HI_FTYPE_HI_V8HI_UQI:
4204740f 9751 case V16HF_FTYPE_V16HF_V16HF_UHI:
9752 case V8SF_FTYPE_V8HI_V8SF_UQI:
9753 case V4SF_FTYPE_V8HI_V4SF_UQI:
bd610db0 9754 case V8SI_FTYPE_V8HF_V8SI_UQI:
5a744e50 9755 case V8SF_FTYPE_V8HF_V8SF_UQI:
9756 case V8SI_FTYPE_V8SF_V8SI_UQI:
9757 case V4SI_FTYPE_V4SF_V4SI_UQI:
bd610db0 9758 case V4SI_FTYPE_V8HF_V4SI_UQI:
5a744e50 9759 case V4SF_FTYPE_V8HF_V4SF_UQI:
bd610db0 9760 case V4DI_FTYPE_V8HF_V4DI_UQI:
2bf6d935 9761 case V4DI_FTYPE_V4SF_V4DI_UQI:
bd610db0 9762 case V2DI_FTYPE_V8HF_V2DI_UQI:
2bf6d935 9763 case V2DI_FTYPE_V4SF_V2DI_UQI:
4204740f 9764 case V8HF_FTYPE_V8HF_V8HF_UQI:
be0e4c32 9765 case V8HF_FTYPE_V8HI_V8HF_UQI:
9766 case V8HF_FTYPE_V8SI_V8HF_UQI:
5a744e50 9767 case V8HF_FTYPE_V8SF_V8HF_UQI:
be0e4c32 9768 case V8HF_FTYPE_V4SI_V8HF_UQI:
5a744e50 9769 case V8HF_FTYPE_V4SF_V8HF_UQI:
be0e4c32 9770 case V8HF_FTYPE_V4DI_V8HF_UQI:
5a744e50 9771 case V8HF_FTYPE_V4DF_V8HF_UQI:
be0e4c32 9772 case V8HF_FTYPE_V2DI_V8HF_UQI:
5a744e50 9773 case V8HF_FTYPE_V2DF_V8HF_UQI:
9774 case V4SF_FTYPE_V4DI_V4SF_UQI:
9775 case V4SF_FTYPE_V2DI_V4SF_UQI:
9776 case V4DF_FTYPE_V4DI_V4DF_UQI:
5a744e50 9777 case V4DF_FTYPE_V8HF_V4DF_UQI:
9778 case V2DF_FTYPE_V8HF_V2DF_UQI:
9779 case V2DF_FTYPE_V2DI_V2DF_UQI:
9780 case V16QI_FTYPE_V8HI_V16QI_UQI:
9781 case V16QI_FTYPE_V16HI_V16QI_UHI:
9782 case V16QI_FTYPE_V4SI_V16QI_UQI:
9783 case V16QI_FTYPE_V8SI_V16QI_UQI:
bd610db0 9784 case V8HI_FTYPE_V8HF_V8HI_UQI:
9785 case V8HI_FTYPE_V4SI_V8HI_UQI:
9786 case V8HI_FTYPE_V8SI_V8HI_UQI:
9787 case V16QI_FTYPE_V2DI_V16QI_UQI:
9788 case V16QI_FTYPE_V4DI_V16QI_UQI:
9789 case V8HI_FTYPE_V2DI_V8HI_UQI:
9790 case V8HI_FTYPE_V4DI_V8HI_UQI:
9791 case V4SI_FTYPE_V2DI_V4SI_UQI:
9792 case V4SI_FTYPE_V4DI_V4SI_UQI:
9793 case V32QI_FTYPE_V32HI_V32QI_USI:
9794 case UHI_FTYPE_V16QI_V16QI_UHI:
9795 case USI_FTYPE_V32QI_V32QI_USI:
9796 case UDI_FTYPE_V64QI_V64QI_UDI:
9797 case UQI_FTYPE_V8HI_V8HI_UQI:
9798 case UHI_FTYPE_V16HI_V16HI_UHI:
9799 case USI_FTYPE_V32HI_V32HI_USI:
9800 case UQI_FTYPE_V4SI_V4SI_UQI:
9801 case UQI_FTYPE_V8SI_V8SI_UQI:
9802 case UQI_FTYPE_V2DI_V2DI_UQI:
9803 case UQI_FTYPE_V4DI_V4DI_UQI:
9804 case V4SF_FTYPE_V2DF_V4SF_UQI:
9805 case V4SF_FTYPE_V4DF_V4SF_UQI:
9806 case V16SI_FTYPE_V16SI_V16SI_UHI:
9807 case V16SI_FTYPE_V4SI_V16SI_UHI:
9808 case V2DI_FTYPE_V4SI_V2DI_UQI:
9809 case V2DI_FTYPE_V8HI_V2DI_UQI:
9810 case V2DI_FTYPE_V16QI_V2DI_UQI:
9811 case V4DI_FTYPE_V4DI_V4DI_UQI:
9812 case V4DI_FTYPE_V4SI_V4DI_UQI:
9813 case V4DI_FTYPE_V8HI_V4DI_UQI:
9814 case V4DI_FTYPE_V16QI_V4DI_UQI:
9815 case V4DI_FTYPE_V4DF_V4DI_UQI:
9816 case V2DI_FTYPE_V2DF_V2DI_UQI:
9817 case V4SI_FTYPE_V4DF_V4SI_UQI:
9818 case V4SI_FTYPE_V2DF_V4SI_UQI:
9819 case V4SI_FTYPE_V8HI_V4SI_UQI:
9820 case V4SI_FTYPE_V16QI_V4SI_UQI:
9821 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9822 case V8DF_FTYPE_V2DF_V8DF_UQI:
9823 case V8DF_FTYPE_V4DF_V8DF_UQI:
9824 case V8DF_FTYPE_V8DF_V8DF_UQI:
9825 case V8SF_FTYPE_V8SF_V8SF_UQI:
9826 case V8SF_FTYPE_V8SI_V8SF_UQI:
9827 case V4DF_FTYPE_V4DF_V4DF_UQI:
9828 case V4SF_FTYPE_V4SF_V4SF_UQI:
9829 case V2DF_FTYPE_V2DF_V2DF_UQI:
9830 case V2DF_FTYPE_V4SF_V2DF_UQI:
9831 case V2DF_FTYPE_V4SI_V2DF_UQI:
9832 case V4SF_FTYPE_V4SI_V4SF_UQI:
9833 case V4DF_FTYPE_V4SF_V4DF_UQI:
9834 case V4DF_FTYPE_V4SI_V4DF_UQI:
9835 case V8SI_FTYPE_V8SI_V8SI_UQI:
9836 case V8SI_FTYPE_V8HI_V8SI_UQI:
9837 case V8SI_FTYPE_V16QI_V8SI_UQI:
9838 case V8DF_FTYPE_V8SI_V8DF_UQI:
9839 case V8DI_FTYPE_DI_V8DI_UQI:
9840 case V16SF_FTYPE_V8SF_V16SF_UHI:
9841 case V16SI_FTYPE_V8SI_V16SI_UHI:
be0e4c32 9842 case V16HF_FTYPE_V16HI_V16HF_UHI:
bd610db0 9843 case V16HI_FTYPE_V16HF_V16HI_UHI:
9844 case V16HI_FTYPE_V16HI_V16HI_UHI:
9845 case V8HI_FTYPE_V16QI_V8HI_UQI:
9846 case V16HI_FTYPE_V16QI_V16HI_UHI:
9847 case V32HI_FTYPE_V32HI_V32HI_USI:
9848 case V32HI_FTYPE_V32QI_V32HI_USI:
9849 case V8DI_FTYPE_V16QI_V8DI_UQI:
9850 case V8DI_FTYPE_V2DI_V8DI_UQI:
9851 case V8DI_FTYPE_V4DI_V8DI_UQI:
9852 case V8DI_FTYPE_V8DI_V8DI_UQI:
9853 case V8DI_FTYPE_V8HI_V8DI_UQI:
9854 case V8DI_FTYPE_V8SI_V8DI_UQI:
9855 case V8HI_FTYPE_V8DI_V8HI_UQI:
9856 case V8SI_FTYPE_V8DI_V8SI_UQI:
9857 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9858 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9859 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9860 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9861 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9862 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9863 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9864 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9865 case V32HI_FTYPE_V16SF_V16SF_USI:
9866 case V16HI_FTYPE_V8SF_V8SF_UHI:
9867 case V8HI_FTYPE_V4SF_V4SF_UQI:
9868 case V16HI_FTYPE_V16SF_V16HI_UHI:
9869 case V8HI_FTYPE_V8SF_V8HI_UQI:
9870 case V8HI_FTYPE_V4SF_V8HI_UQI:
9871 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9872 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9873 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9874 nargs = 3;
9875 break;
9876 case V32QI_FTYPE_V32QI_V32QI_INT:
9877 case V16HI_FTYPE_V16HI_V16HI_INT:
9878 case V16QI_FTYPE_V16QI_V16QI_INT:
9879 case V4DI_FTYPE_V4DI_V4DI_INT:
9880 case V8HI_FTYPE_V8HI_V8HI_INT:
9881 case V8SI_FTYPE_V8SI_V8SI_INT:
9882 case V8SI_FTYPE_V8SI_V4SI_INT:
9883 case V8SF_FTYPE_V8SF_V8SF_INT:
9884 case V8SF_FTYPE_V8SF_V4SF_INT:
9885 case V4SI_FTYPE_V4SI_V4SI_INT:
9886 case V4DF_FTYPE_V4DF_V4DF_INT:
9887 case V16SF_FTYPE_V16SF_V16SF_INT:
9888 case V16SF_FTYPE_V16SF_V4SF_INT:
9889 case V16SI_FTYPE_V16SI_V4SI_INT:
9890 case V4DF_FTYPE_V4DF_V2DF_INT:
9891 case V4SF_FTYPE_V4SF_V4SF_INT:
9892 case V2DI_FTYPE_V2DI_V2DI_INT:
9893 case V4DI_FTYPE_V4DI_V2DI_INT:
9894 case V2DF_FTYPE_V2DF_V2DF_INT:
9895 case UQI_FTYPE_V8DI_V8UDI_INT:
9896 case UQI_FTYPE_V8DF_V8DF_INT:
9897 case UQI_FTYPE_V2DF_V2DF_INT:
9898 case UQI_FTYPE_V4SF_V4SF_INT:
9899 case UHI_FTYPE_V16SI_V16SI_INT:
9900 case UHI_FTYPE_V16SF_V16SF_INT:
9901 case V64QI_FTYPE_V64QI_V64QI_INT:
9902 case V32HI_FTYPE_V32HI_V32HI_INT:
9903 case V16SI_FTYPE_V16SI_V16SI_INT:
9904 case V8DI_FTYPE_V8DI_V8DI_INT:
9905 nargs = 3;
9906 nargs_constant = 1;
9907 break;
9908 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9909 nargs = 3;
9910 rmode = V4DImode;
9911 nargs_constant = 1;
9912 break;
9913 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9914 nargs = 3;
9915 rmode = V2DImode;
9916 nargs_constant = 1;
9917 break;
9918 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9919 nargs = 3;
9920 rmode = DImode;
9921 nargs_constant = 1;
9922 break;
9923 case V2DI_FTYPE_V2DI_UINT_UINT:
9924 nargs = 3;
9925 nargs_constant = 2;
9926 break;
9927 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9928 nargs = 3;
9929 rmode = V8DImode;
9930 nargs_constant = 1;
9931 break;
9932 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9933 nargs = 5;
9934 rmode = V8DImode;
9935 mask_pos = 2;
9936 nargs_constant = 1;
9937 break;
9938 case QI_FTYPE_V8DF_INT_UQI:
9939 case QI_FTYPE_V4DF_INT_UQI:
9940 case QI_FTYPE_V2DF_INT_UQI:
9941 case HI_FTYPE_V16SF_INT_UHI:
9942 case QI_FTYPE_V8SF_INT_UQI:
9943 case QI_FTYPE_V4SF_INT_UQI:
8486e9f2 9944 case QI_FTYPE_V8HF_INT_UQI:
9945 case HI_FTYPE_V16HF_INT_UHI:
9946 case SI_FTYPE_V32HF_INT_USI:
9947 case V4SI_FTYPE_V4SI_V4SI_UHI:
9948 case V8SI_FTYPE_V8SI_V8SI_UHI:
9949 nargs = 3;
9950 mask_pos = 1;
9951 nargs_constant = 1;
9952 break;
9953 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9954 nargs = 5;
9955 rmode = V4DImode;
9956 mask_pos = 2;
9957 nargs_constant = 1;
9958 break;
9959 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9960 nargs = 5;
9961 rmode = V2DImode;
9962 mask_pos = 2;
9963 nargs_constant = 1;
9964 break;
9965 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9966 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9967 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9968 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9969 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9970 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9971 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9972 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9973 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9974 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9975 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9976 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9977 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9978 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9979 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9980 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
bd7a34ef 9981 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
9982 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9983 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9984 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9985 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9986 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9987 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9988 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9989 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9990 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9991 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9992 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9993 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9994 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9995 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9996 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9997 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9998 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
bd7a34ef 9999 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
10000 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10001 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10002 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10003 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10004 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10005 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10006 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
bd7a34ef 10007 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
10008 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10009 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10010 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10011 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10012 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10013 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10014 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10015 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10016 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10017 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10018 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
10019 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
10020 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
10021 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
10022 nargs = 4;
10023 break;
10024 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10025 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10026 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10027 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10028 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10029 nargs = 4;
10030 nargs_constant = 1;
10031 break;
10032 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10033 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10034 case QI_FTYPE_V4DF_V4DF_INT_UQI:
10035 case QI_FTYPE_V8SF_V8SF_INT_UQI:
0f200733 10036 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
10037 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10038 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10039 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10040 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
0f200733 10041 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
10042 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10043 case USI_FTYPE_V32QI_V32QI_INT_USI:
10044 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10045 case USI_FTYPE_V32HI_V32HI_INT_USI:
0f200733 10046 case USI_FTYPE_V32HF_V32HF_INT_USI:
10047 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10048 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
10049 nargs = 4;
10050 mask_pos = 1;
10051 nargs_constant = 1;
10052 break;
10053 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10054 nargs = 4;
10055 nargs_constant = 2;
10056 break;
10057 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
10058 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
10059 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
10060 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
10061 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
10062 nargs = 4;
10063 break;
10064 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
10065 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
10066 mask_pos = 1;
10067 nargs = 4;
10068 nargs_constant = 1;
10069 break;
10070 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
10071 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
10072 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
10073 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
10074 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
10075 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
10076 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
10077 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
10078 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
10079 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
10080 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
10081 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
10082 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
10083 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
10084 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
10085 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
10086 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
10087 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
10088 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
10089 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
10090 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
10091 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
10092 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
10093 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
10094 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
8bed7617 10095 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
10096 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
10097 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
10098 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
10099 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
10100 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
10101 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
10102 nargs = 4;
10103 mask_pos = 2;
10104 nargs_constant = 1;
10105 break;
10106 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
10107 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
10108 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
10109 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
10110 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
10111 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
10112 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
10113 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
10114 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
10115 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
10116 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
10117 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
10118 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
10119 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
10120 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
10121 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
10122 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
10123 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
10124 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
10125 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
10126 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
10127 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
10128 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
10129 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
10130 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
10131 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
10132 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
10133 nargs = 5;
10134 mask_pos = 2;
10135 nargs_constant = 1;
10136 break;
10137 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
10138 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
10139 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
10140 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
10141 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
10142 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
10143 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
10144 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
10145 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
10146 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
10147 nargs = 5;
10148 mask_pos = 1;
10149 nargs_constant = 1;
10150 break;
10151 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
10152 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
10153 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
10154 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
10155 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
10156 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
10157 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
10158 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
10159 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
10160 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
10161 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
10162 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
10163 nargs = 5;
10164 mask_pos = 1;
10165 nargs_constant = 2;
10166 break;
10167
10168 default:
10169 gcc_unreachable ();
10170 }
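/* Worked example (editor's note): V8SF_FTYPE_V8SF_INT_V8SF_UQI above sets
   nargs = 4, mask_pos = 2, nargs_constant = 1.  In the operand loop below,
   nargs - i - mask_pos == nargs_constant only holds for i == 1, so the INT
   argument must be an immediate, while the trailing UQI mask (i == 3) is
   handled like any other operand and may be copied into a register.  */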
10171
715a8bc8 10172 gcc_assert (nargs <= ARRAY_SIZE (xops));
10173
10174 if (comparison != UNKNOWN)
10175 {
10176 gcc_assert (nargs == 2);
10177 return ix86_expand_sse_compare (d, exp, target, swap);
10178 }
10179
10180 if (rmode == VOIDmode || rmode == tmode)
10181 {
10182 if (optimize
10183 || target == 0
10184 || GET_MODE (target) != tmode
10185 || !insn_p->operand[0].predicate (target, tmode))
10186 target = gen_reg_rtx (tmode);
10187 else if (memory_operand (target, tmode))
10188 num_memory++;
10189 real_target = target;
10190 }
10191 else
10192 {
10193 real_target = gen_reg_rtx (tmode);
10194 target = lowpart_subreg (rmode, real_target, tmode);
10195 }
10196
10197 for (i = 0; i < nargs; i++)
10198 {
10199 tree arg = CALL_EXPR_ARG (exp, i);
10200 rtx op = expand_normal (arg);
10201 machine_mode mode = insn_p->operand[i + 1].mode;
10202 bool match = insn_p->operand[i + 1].predicate (op, mode);
10203
10204 if (second_arg_count && i == 1)
10205 {
10206 /* SIMD shift insns take either an 8-bit immediate or a
10207 register as the count, but the builtin functions take an
10208 int. If the count does not match the operand predicate,
10209 put it in a register. The instructions use a 64-bit count;
10210 if op is only 32-bit, zero-extend it, since negative shift
10211 counts are undefined behavior and zero-extension is more
10212 efficient. */
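/* Example (editor's sketch): _mm_slli_epi32 expands through
   __builtin_ia32_pslldi128 (V4SI_FTYPE_V4SI_SI_COUNT); a non-immediate
   count arrives as a 32-bit int and is zero-extended to the insn's
   wider count mode by convert_modes, then copied into a register if
   the predicate still rejects it.  */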
10213 if (!match)
10214 {
10215 if (SCALAR_INT_MODE_P (GET_MODE (op)))
10216 op = convert_modes (mode, GET_MODE (op), op, 1);
10217 else
10218 op = lowpart_subreg (mode, op, GET_MODE (op));
10219 if (!insn_p->operand[i + 1].predicate (op, mode))
10220 op = copy_to_reg (op);
10221 }
10222 }
10223 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
10224 (!mask_pos && (nargs - i) <= nargs_constant))
10225 {
10226 if (!match)
10227 switch (icode)
10228 {
10229 case CODE_FOR_avx_vinsertf128v4di:
10230 case CODE_FOR_avx_vextractf128v4di:
10231 error ("the last argument must be a 1-bit immediate");
10232 return const0_rtx;
10233
10234 case CODE_FOR_avx512f_cmpv8di3_mask:
10235 case CODE_FOR_avx512f_cmpv16si3_mask:
10236 case CODE_FOR_avx512f_ucmpv8di3_mask:
10237 case CODE_FOR_avx512f_ucmpv16si3_mask:
10238 case CODE_FOR_avx512vl_cmpv4di3_mask:
10239 case CODE_FOR_avx512vl_cmpv8si3_mask:
10240 case CODE_FOR_avx512vl_ucmpv4di3_mask:
10241 case CODE_FOR_avx512vl_ucmpv8si3_mask:
10242 case CODE_FOR_avx512vl_cmpv2di3_mask:
10243 case CODE_FOR_avx512vl_cmpv4si3_mask:
10244 case CODE_FOR_avx512vl_ucmpv2di3_mask:
10245 case CODE_FOR_avx512vl_ucmpv4si3_mask:
10246 error ("the last argument must be a 3-bit immediate");
10247 return const0_rtx;
10248
10249 case CODE_FOR_sse4_1_roundsd:
10250 case CODE_FOR_sse4_1_roundss:
10251
10252 case CODE_FOR_sse4_1_roundpd:
10253 case CODE_FOR_sse4_1_roundps:
10254 case CODE_FOR_avx_roundpd256:
10255 case CODE_FOR_avx_roundps256:
10256
10257 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
10258 case CODE_FOR_sse4_1_roundps_sfix:
10259 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
10260 case CODE_FOR_avx_roundps_sfix256:
10261
10262 case CODE_FOR_sse4_1_blendps:
10263 case CODE_FOR_avx_blendpd256:
10264 case CODE_FOR_avx_vpermilv4df:
10265 case CODE_FOR_avx_vpermilv4df_mask:
10266 case CODE_FOR_avx512f_getmantv8df_mask:
10267 case CODE_FOR_avx512f_getmantv16sf_mask:
8486e9f2 10268 case CODE_FOR_avx512vl_getmantv16hf_mask:
10269 case CODE_FOR_avx512vl_getmantv8sf_mask:
10270 case CODE_FOR_avx512vl_getmantv4df_mask:
8486e9f2 10271 case CODE_FOR_avx512fp16_getmantv8hf_mask:
10272 case CODE_FOR_avx512vl_getmantv4sf_mask:
10273 case CODE_FOR_avx512vl_getmantv2df_mask:
10274 case CODE_FOR_avx512dq_rangepv8df_mask_round:
10275 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
10276 case CODE_FOR_avx512dq_rangepv4df_mask:
10277 case CODE_FOR_avx512dq_rangepv8sf_mask:
10278 case CODE_FOR_avx512dq_rangepv2df_mask:
10279 case CODE_FOR_avx512dq_rangepv4sf_mask:
10280 case CODE_FOR_avx_shufpd256_mask:
10281 error ("the last argument must be a 4-bit immediate");
10282 return const0_rtx;
10283
10284 case CODE_FOR_sha1rnds4:
10285 case CODE_FOR_sse4_1_blendpd:
10286 case CODE_FOR_avx_vpermilv2df:
10287 case CODE_FOR_avx_vpermilv2df_mask:
10288 case CODE_FOR_xop_vpermil2v2df3:
10289 case CODE_FOR_xop_vpermil2v4sf3:
10290 case CODE_FOR_xop_vpermil2v4df3:
10291 case CODE_FOR_xop_vpermil2v8sf3:
10292 case CODE_FOR_avx512f_vinsertf32x4_mask:
10293 case CODE_FOR_avx512f_vinserti32x4_mask:
10294 case CODE_FOR_avx512f_vextractf32x4_mask:
10295 case CODE_FOR_avx512f_vextracti32x4_mask:
10296 case CODE_FOR_sse2_shufpd:
10297 case CODE_FOR_sse2_shufpd_mask:
10298 case CODE_FOR_avx512dq_shuf_f64x2_mask:
10299 case CODE_FOR_avx512dq_shuf_i64x2_mask:
10300 case CODE_FOR_avx512vl_shuf_i32x4_mask:
10301 case CODE_FOR_avx512vl_shuf_f32x4_mask:
10302 error ("the last argument must be a 2-bit immediate");
10303 return const0_rtx;
10304
10305 case CODE_FOR_avx_vextractf128v4df:
10306 case CODE_FOR_avx_vextractf128v8sf:
10307 case CODE_FOR_avx_vextractf128v8si:
10308 case CODE_FOR_avx_vinsertf128v4df:
10309 case CODE_FOR_avx_vinsertf128v8sf:
10310 case CODE_FOR_avx_vinsertf128v8si:
10311 case CODE_FOR_avx512f_vinsertf64x4_mask:
10312 case CODE_FOR_avx512f_vinserti64x4_mask:
10313 case CODE_FOR_avx512f_vextractf64x4_mask:
10314 case CODE_FOR_avx512f_vextracti64x4_mask:
10315 case CODE_FOR_avx512dq_vinsertf32x8_mask:
10316 case CODE_FOR_avx512dq_vinserti32x8_mask:
10317 case CODE_FOR_avx512vl_vinsertv4df:
10318 case CODE_FOR_avx512vl_vinsertv4di:
10319 case CODE_FOR_avx512vl_vinsertv8sf:
10320 case CODE_FOR_avx512vl_vinsertv8si:
10321 error ("the last argument must be a 1-bit immediate");
10322 return const0_rtx;
10323
10324 case CODE_FOR_avx_vmcmpv2df3:
10325 case CODE_FOR_avx_vmcmpv4sf3:
10326 case CODE_FOR_avx_cmpv2df3:
10327 case CODE_FOR_avx_cmpv4sf3:
10328 case CODE_FOR_avx_cmpv4df3:
10329 case CODE_FOR_avx_cmpv8sf3:
10330 case CODE_FOR_avx512f_cmpv8df3_mask:
10331 case CODE_FOR_avx512f_cmpv16sf3_mask:
10332 case CODE_FOR_avx512f_vmcmpv2df3_mask:
10333 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
0f200733 10334 case CODE_FOR_avx512bw_cmpv32hf3_mask:
10335 case CODE_FOR_avx512vl_cmpv16hf3_mask:
10336 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
10337 error ("the last argument must be a 5-bit immediate");
10338 return const0_rtx;
10339
10340 default:
10341 switch (nargs_constant)
10342 {
10343 case 2:
10344 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
10345 (!mask_pos && (nargs - i) == nargs_constant))
10346 {
10347 error ("the next to last argument must be an 8-bit immediate");
10348 break;
10349 }
10350 /* FALLTHRU */
10351 case 1:
10352 error ("the last argument must be an 8-bit immediate");
10353 break;
10354 default:
10355 gcc_unreachable ();
10356 }
10357 return const0_rtx;
10358 }
10359 }
10360 else
10361 {
10362 if (VECTOR_MODE_P (mode))
10363 op = safe_vector_operand (op, mode);
10364
10365 /* If we aren't optimizing, only allow one memory operand to
10366 be generated. */
10367 if (memory_operand (op, mode))
10368 num_memory++;
10369
10370 op = fixup_modeless_constant (op, mode);
10371
10372 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10373 {
10374 if (optimize || !match || num_memory > 1)
10375 op = copy_to_mode_reg (mode, op);
10376 }
10377 else
10378 {
10379 op = copy_to_reg (op);
10380 op = lowpart_subreg (mode, op, GET_MODE (op));
10381 }
10382 }
10383
715a8bc8 10384 xops[i] = op;
10385 }
10386
10387 switch (nargs)
10388 {
10389 case 1:
715a8bc8 10390 pat = GEN_FCN (icode) (real_target, xops[0]);
10391 break;
10392 case 2:
715a8bc8 10393 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
10394 break;
10395 case 3:
715a8bc8 10396 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
10397 break;
10398 case 4:
10399 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10400 xops[2], xops[3]);
10401 break;
10402 case 5:
10403 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10404 xops[2], xops[3], xops[4]);
10405 break;
10406 case 6:
10407 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10408 xops[2], xops[3], xops[4], xops[5]);
10409 break;
10410 default:
10411 gcc_unreachable ();
10412 }
10413
10414 if (! pat)
10415 return 0;
10416
10417 emit_insn (pat);
10418 return target;
10419}
10420
10421/* Transform a pattern of the following layout:
10422 (set A
10423 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10424 into:
10425 (set A B)
10426 i.e. drop the UNSPEC_EMBEDDED_ROUNDING wrapper and keep only operand B. */
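/* Editor's illustration (modes and operands chosen only for the example):
   a pattern with a redundant embedded-rounding operand such as

     (set (reg:CCFP flags)
          (unspec:CCFP [(compare:CCFP (reg:V4SF x) (reg:V4SF y))
                        (const_int NO_ROUND)] UNSPEC_EMBEDDED_ROUNDING))

   is rewritten by the function below into

     (set (reg:CCFP flags) (compare:CCFP (reg:V4SF x) (reg:V4SF y)))

   keeping operand B of the unspec and dropping the rounding operand C.  */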
10427
10428static rtx
10429ix86_erase_embedded_rounding (rtx pat)
10430{
10431 if (GET_CODE (pat) == INSN)
10432 pat = PATTERN (pat);
10433
10434 gcc_assert (GET_CODE (pat) == SET);
10435 rtx src = SET_SRC (pat);
10436 gcc_assert (XVECLEN (src, 0) == 2);
10437 rtx p0 = XVECEXP (src, 0, 0);
10438 gcc_assert (GET_CODE (src) == UNSPEC
10439 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
10440 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
10441 return res;
10442}
10443
10444/* Subroutine of ix86_expand_round_builtin to take care of comi insns
10445 with rounding. */
10446static rtx
10447ix86_expand_sse_comi_round (const struct builtin_description *d,
10448 tree exp, rtx target)
10449{
10450 rtx pat, set_dst;
10451 tree arg0 = CALL_EXPR_ARG (exp, 0);
10452 tree arg1 = CALL_EXPR_ARG (exp, 1);
10453 tree arg2 = CALL_EXPR_ARG (exp, 2);
10454 tree arg3 = CALL_EXPR_ARG (exp, 3);
10455 rtx op0 = expand_normal (arg0);
10456 rtx op1 = expand_normal (arg1);
10457 rtx op2 = expand_normal (arg2);
10458 rtx op3 = expand_normal (arg3);
10459 enum insn_code icode = d->icode;
10460 const struct insn_data_d *insn_p = &insn_data[icode];
10461 machine_mode mode0 = insn_p->operand[0].mode;
10462 machine_mode mode1 = insn_p->operand[1].mode;
10463
10464 /* See avxintrin.h for values. */
467e9f38 10465 static const enum rtx_code comparisons[32] =
2bf6d935 10466 {
10467 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10468 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10469 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10470 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
2bf6d935 10471 };
10472 static const bool ordereds[32] =
10473 {
10474 true, true, true, false, false, false, false, true,
10475 false, false, false, true, true, true, true, false,
10476 true, true, true, false, false, false, false, true,
10477 false, false, false, true, true, true, true, false
10478 };
10479 static const bool non_signalings[32] =
10480 {
10481 true, false, false, true, true, false, false, true,
10482 true, false, false, true, true, false, false, true,
10483 false, true, true, false, false, true, true, false,
10484 false, true, true, false, false, true, true, false
10485 };
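/* Worked example (editor's note): _CMP_GE_OS is immediate 13, so
   comparisons[13] = GE, ordereds[13] = true and non_signalings[13] = false,
   and the expander keeps the ordered, signaling COMI form for it.  */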
10486
10487 if (!CONST_INT_P (op2))
10488 {
10489 error ("the third argument must be a comparison constant");
10490 return const0_rtx;
10491 }
10492 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10493 {
10494 error ("incorrect comparison mode");
10495 return const0_rtx;
10496 }
10497
10498 if (!insn_p->operand[2].predicate (op3, SImode))
10499 {
10500 error ("incorrect rounding operand");
10501 return const0_rtx;
10502 }
10503
10504 if (VECTOR_MODE_P (mode0))
10505 op0 = safe_vector_operand (op0, mode0);
10506 if (VECTOR_MODE_P (mode1))
10507 op1 = safe_vector_operand (op1, mode1);
10508
10509 enum rtx_code comparison = comparisons[INTVAL (op2)];
10510 bool ordered = ordereds[INTVAL (op2)];
10511 bool non_signaling = non_signalings[INTVAL (op2)];
10512 rtx const_val = const0_rtx;
10513
10514 bool check_unordered = false;
10515 machine_mode mode = CCFPmode;
10516 switch (comparison)
10517 {
10518 case ORDERED:
10519 if (!ordered)
10520 {
10521 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10522 if (!non_signaling)
10523 ordered = true;
10524 mode = CCSmode;
10525 }
10526 else
10527 {
10528 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10529 if (non_signaling)
10530 ordered = false;
10531 mode = CCPmode;
10532 }
10533 comparison = NE;
10534 break;
10535 case UNORDERED:
10536 if (ordered)
10537 {
10538 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10539 if (non_signaling)
10540 ordered = false;
10541 mode = CCSmode;
10542 }
10543 else
10544 {
10545 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10546 if (!non_signaling)
10547 ordered = true;
10548 mode = CCPmode;
10549 }
10550 comparison = EQ;
10551 break;
10552
10553 case LE: /* -> GE */
10554 case LT: /* -> GT */
10555 case UNGE: /* -> UNLE */
10556 case UNGT: /* -> UNLT */
10557 std::swap (op0, op1);
10558 comparison = swap_condition (comparison);
10559 /* FALLTHRU */
10560 case GT:
10561 case GE:
10562 case UNEQ:
10563 case UNLT:
10564 case UNLE:
10565 case LTGT:
10566 /* These are supported by CCFPmode. NB: Use ordered/signaling
10567 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10568 with NAN operands. */
10569 if (ordered == non_signaling)
10570 ordered = !ordered;
10571 break;
10572 case EQ:
10573 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10574 _CMP_EQ_OQ/_CMP_EQ_OS. */
10575 check_unordered = true;
10576 mode = CCZmode;
10577 break;
10578 case NE:
10579 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10580 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10581 gcc_assert (!ordered);
10582 check_unordered = true;
10583 mode = CCZmode;
10584 const_val = const1_rtx;
10585 break;
10586 default:
10587 gcc_unreachable ();
10588 }
10589
2bf6d935 10590 target = gen_reg_rtx (SImode);
467e9f38 10591 emit_move_insn (target, const_val);
10592 target = gen_rtx_SUBREG (QImode, target, 0);
10593
10594 if ((optimize && !register_operand (op0, mode0))
10595 || !insn_p->operand[0].predicate (op0, mode0))
10596 op0 = copy_to_mode_reg (mode0, op0);
10597 if ((optimize && !register_operand (op1, mode1))
10598 || !insn_p->operand[1].predicate (op1, mode1))
10599 op1 = copy_to_mode_reg (mode1, op1);
10600
10601 /*
10602 1. COMI: ordered and signaling.
10603 2. UCOMI: unordered and non-signaling.
10604 */
10605 if (non_signaling)
10606 icode = (icode == CODE_FOR_sse_comi_round
10607 ? CODE_FOR_sse_ucomi_round
10608 : CODE_FOR_sse2_ucomi_round);
10609
10610 pat = GEN_FCN (icode) (op0, op1, op3);
10611 if (! pat)
10612 return 0;
10613
10614 /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10615 if (INTVAL (op3) == NO_ROUND)
10616 {
10617 pat = ix86_erase_embedded_rounding (pat);
10618 if (! pat)
10619 return 0;
10620
10621 set_dst = SET_DEST (pat);
10622 }
10623 else
10624 {
10625 gcc_assert (GET_CODE (pat) == SET);
10626 set_dst = SET_DEST (pat);
10627 }
10628
10629 emit_insn (pat);
10630
10631 rtx_code_label *label = NULL;
10632
10633 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10634 sufficient with NAN operands. */
10635 if (check_unordered)
10636 {
10637 gcc_assert (comparison == EQ || comparison == NE);
10638
10639 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10640 label = gen_label_rtx ();
10641 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10642 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10643 gen_rtx_LABEL_REF (VOIDmode, label),
10644 pc_rtx);
10645 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10646 }
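/* Worked example (editor's note): for _CMP_EQ_OQ (0) the result register
   starts at 0, so when the jump above is taken on an unordered (NaN) result
   the setcc below is skipped and the ordered EQ predicate yields false;
   for _CMP_NEQ_UQ (4) const_val is 1, so a NaN operand yields true.  */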
10647
10648 /* NB: The compare sets the flags in CCFPmode; check them in a
10649 different CCmode that is a subset of CCFPmode. */
10650 if (GET_MODE (set_dst) != mode)
10651 {
10652 gcc_assert (mode == CCAmode || mode == CCCmode
10653 || mode == CCOmode || mode == CCPmode
10654 || mode == CCSmode || mode == CCZmode);
10655 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10656 }
10657
10658 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10659 gen_rtx_fmt_ee (comparison, QImode,
10660 set_dst,
10661 const0_rtx)));
10662
10663 if (label)
10664 emit_label (label);
10665
10666 return SUBREG_REG (target);
10667}
10668
10669static rtx
10670ix86_expand_round_builtin (const struct builtin_description *d,
10671 tree exp, rtx target)
10672{
10673 rtx pat;
10674 unsigned int i, nargs;
715a8bc8 10675 rtx xops[6];
10676 enum insn_code icode = d->icode;
10677 const struct insn_data_d *insn_p = &insn_data[icode];
10678 machine_mode tmode = insn_p->operand[0].mode;
10679 unsigned int nargs_constant = 0;
10680 unsigned int redundant_embed_rnd = 0;
10681
10682 switch ((enum ix86_builtin_func_type) d->flag)
10683 {
10684 case UINT64_FTYPE_V2DF_INT:
10685 case UINT64_FTYPE_V4SF_INT:
3069a2e5 10686 case UINT64_FTYPE_V8HF_INT:
10687 case UINT_FTYPE_V2DF_INT:
10688 case UINT_FTYPE_V4SF_INT:
3069a2e5 10689 case UINT_FTYPE_V8HF_INT:
10690 case INT64_FTYPE_V2DF_INT:
10691 case INT64_FTYPE_V4SF_INT:
3069a2e5 10692 case INT64_FTYPE_V8HF_INT:
10693 case INT_FTYPE_V2DF_INT:
10694 case INT_FTYPE_V4SF_INT:
3069a2e5 10695 case INT_FTYPE_V8HF_INT:
10696 nargs = 2;
10697 break;
bd7a34ef 10698 case V32HF_FTYPE_V32HF_V32HF_INT:
71838266 10699 case V8HF_FTYPE_V8HF_V8HF_INT:
3069a2e5 10700 case V8HF_FTYPE_V8HF_INT_INT:
10701 case V8HF_FTYPE_V8HF_UINT_INT:
10702 case V8HF_FTYPE_V8HF_INT64_INT:
10703 case V8HF_FTYPE_V8HF_UINT64_INT:
10704 case V4SF_FTYPE_V4SF_UINT_INT:
10705 case V4SF_FTYPE_V4SF_UINT64_INT:
10706 case V2DF_FTYPE_V2DF_UINT64_INT:
10707 case V4SF_FTYPE_V4SF_INT_INT:
10708 case V4SF_FTYPE_V4SF_INT64_INT:
10709 case V2DF_FTYPE_V2DF_INT64_INT:
10710 case V4SF_FTYPE_V4SF_V4SF_INT:
10711 case V2DF_FTYPE_V2DF_V2DF_INT:
10712 case V4SF_FTYPE_V4SF_V2DF_INT:
10713 case V2DF_FTYPE_V2DF_V4SF_INT:
10714 nargs = 3;
10715 break;
10716 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10717 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
bd610db0 10718 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
2bf6d935 10719 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
bd610db0 10720 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
10721 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10722 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10723 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
5a744e50 10724 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
10725 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
be0e4c32 10726 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
4204740f 10727 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
10728 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10729 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10730 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10731 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
bd610db0 10732 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
be0e4c32 10733 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
10734 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10735 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10736 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10737 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
be0e4c32 10738 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
5a744e50 10739 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
10740 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
10741 nargs = 4;
10742 break;
10743 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10744 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10745 nargs_constant = 2;
10746 nargs = 4;
10747 break;
10748 case INT_FTYPE_V4SF_V4SF_INT_INT:
10749 case INT_FTYPE_V2DF_V2DF_INT_INT:
10750 return ix86_expand_sse_comi_round (d, exp, target);
10751 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10752 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10753 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
90429b96 10754 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
2bf6d935 10755 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
bd7a34ef 10756 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
90429b96 10757 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
10758 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10759 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
93103603 10760 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
10761 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10762 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
93103603 10763 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
71838266 10764 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
90429b96 10765 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
10766 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
10767 nargs = 5;
10768 break;
8bed7617 10769 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
10770 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10771 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10772 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
10773 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
10774 nargs_constant = 4;
10775 nargs = 5;
10776 break;
10777 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10778 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10779 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10780 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
0f200733 10781 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
10782 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
10783 nargs_constant = 3;
10784 nargs = 5;
10785 break;
10786 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10787 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10788 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10789 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10790 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10791 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
8bed7617 10792 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
10793 nargs = 6;
10794 nargs_constant = 4;
10795 break;
10796 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10797 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10798 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10799 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10800 nargs = 6;
10801 nargs_constant = 3;
10802 break;
10803 default:
10804 gcc_unreachable ();
10805 }
715a8bc8 10806 gcc_assert (nargs <= ARRAY_SIZE (xops));
10807
10808 if (optimize
10809 || target == 0
10810 || GET_MODE (target) != tmode
10811 || !insn_p->operand[0].predicate (target, tmode))
10812 target = gen_reg_rtx (tmode);
10813
10814 for (i = 0; i < nargs; i++)
10815 {
10816 tree arg = CALL_EXPR_ARG (exp, i);
10817 rtx op = expand_normal (arg);
10818 machine_mode mode = insn_p->operand[i + 1].mode;
10819 bool match = insn_p->operand[i + 1].predicate (op, mode);
10820
10821 if (i == nargs - nargs_constant)
10822 {
10823 if (!match)
10824 {
10825 switch (icode)
10826 {
10827 case CODE_FOR_avx512f_getmantv8df_mask_round:
10828 case CODE_FOR_avx512f_getmantv16sf_mask_round:
8486e9f2 10829 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
10830 case CODE_FOR_avx512f_vgetmantv2df_round:
10831 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10832 case CODE_FOR_avx512f_vgetmantv4sf_round:
10833 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
8486e9f2 10834 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
10835 error ("the immediate argument must be a 4-bit immediate");
10836 return const0_rtx;
10837 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10838 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10839 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10840 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
0f200733 10841 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
10842 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
10843 error ("the immediate argument must be a 5-bit immediate");
10844 return const0_rtx;
10845 default:
10846 error ("the immediate argument must be an 8-bit immediate");
10847 return const0_rtx;
10848 }
10849 }
10850 }
10851 else if (i == nargs-1)
10852 {
10853 if (!insn_p->operand[nargs].predicate (op, SImode))
10854 {
10855 error ("incorrect rounding operand");
10856 return const0_rtx;
10857 }
10858
10859 /* If there is no rounding, use the normal version of the pattern. */
10860 if (INTVAL (op) == NO_ROUND)
10861 redundant_embed_rnd = 1;
10862 }
10863 else
10864 {
10865 if (VECTOR_MODE_P (mode))
10866 op = safe_vector_operand (op, mode);
10867
10868 op = fixup_modeless_constant (op, mode);
10869
10870 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10871 {
10872 if (optimize || !match)
10873 op = copy_to_mode_reg (mode, op);
10874 }
10875 else
10876 {
10877 op = copy_to_reg (op);
10878 op = lowpart_subreg (mode, op, GET_MODE (op));
10879 }
10880 }
10881
715a8bc8 10882 xops[i] = op;
10883 }
10884
10885 switch (nargs)
10886 {
10887 case 1:
715a8bc8 10888 pat = GEN_FCN (icode) (target, xops[0]);
10889 break;
10890 case 2:
715a8bc8 10891 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10892 break;
10893 case 3:
715a8bc8 10894 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10895 break;
10896 case 4:
10897 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10898 xops[2], xops[3]);
10899 break;
10900 case 5:
10901 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10902 xops[2], xops[3], xops[4]);
10903 break;
10904 case 6:
10905 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10906 xops[2], xops[3], xops[4], xops[5]);
10907 break;
10908 default:
10909 gcc_unreachable ();
10910 }
10911
10912 if (!pat)
10913 return 0;
10914
10915 if (redundant_embed_rnd)
10916 pat = ix86_erase_embedded_rounding (pat);
10917
10918 emit_insn (pat);
10919 return target;
10920}
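/* Editor's illustration (hedged example, intrinsic mapping assumed): an
   AVX-512 rounding builtin such as the V2DF_FTYPE_V2DF_V2DF_INT one behind
   _mm_add_round_sd (a, b, _MM_FROUND_CUR_DIRECTION) carries the rounding
   immediate as its last argument; since that value is NO_ROUND,
   redundant_embed_rnd is set and ix86_erase_embedded_rounding strips the
   UNSPEC_EMBEDDED_ROUNDING wrapper so the normal pattern is emitted.  */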
10921
10922/* Subroutine of ix86_expand_builtin to take care of special insns
10923 with variable number of operands. */
10924
10925static rtx
10926ix86_expand_special_args_builtin (const struct builtin_description *d,
10927 tree exp, rtx target)
10928{
10929 tree arg;
10930 rtx pat, op;
10931 unsigned int i, nargs, arg_adjust, memory;
10932 bool aligned_mem = false;
715a8bc8 10933 rtx xops[3];
2bf6d935 10934 enum insn_code icode = d->icode;
10935 const struct insn_data_d *insn_p = &insn_data[icode];
10936 machine_mode tmode = insn_p->operand[0].mode;
10937 enum { load, store } klass;
10938
10939 switch ((enum ix86_builtin_func_type) d->flag)
10940 {
10941 case VOID_FTYPE_VOID:
10942 emit_insn (GEN_FCN (icode) (target));
10943 return 0;
10944 case VOID_FTYPE_UINT64:
10945 case VOID_FTYPE_UNSIGNED:
10946 nargs = 0;
10947 klass = store;
10948 memory = 0;
10949 break;
10950
10951 case INT_FTYPE_VOID:
10952 case USHORT_FTYPE_VOID:
10953 case UINT64_FTYPE_VOID:
10954 case UINT_FTYPE_VOID:
299a53d7 10955 case UINT8_FTYPE_VOID:
10956 case UNSIGNED_FTYPE_VOID:
10957 nargs = 0;
10958 klass = load;
10959 memory = 0;
10960 break;
10961 case UINT64_FTYPE_PUNSIGNED:
10962 case V2DI_FTYPE_PV2DI:
10963 case V4DI_FTYPE_PV4DI:
10964 case V32QI_FTYPE_PCCHAR:
10965 case V16QI_FTYPE_PCCHAR:
10966 case V8SF_FTYPE_PCV4SF:
10967 case V8SF_FTYPE_PCFLOAT:
10968 case V4SF_FTYPE_PCFLOAT:
10969 case V4DF_FTYPE_PCV2DF:
10970 case V4DF_FTYPE_PCDOUBLE:
10971 case V2DF_FTYPE_PCDOUBLE:
10972 case VOID_FTYPE_PVOID:
10973 case V8DI_FTYPE_PV8DI:
10974 nargs = 1;
10975 klass = load;
10976 memory = 0;
10977 switch (icode)
10978 {
10979 case CODE_FOR_sse4_1_movntdqa:
10980 case CODE_FOR_avx2_movntdqa:
10981 case CODE_FOR_avx512f_movntdqa:
10982 aligned_mem = true;
10983 break;
10984 default:
10985 break;
10986 }
10987 break;
10988 case VOID_FTYPE_PV2SF_V4SF:
10989 case VOID_FTYPE_PV8DI_V8DI:
10990 case VOID_FTYPE_PV4DI_V4DI:
10991 case VOID_FTYPE_PV2DI_V2DI:
10992 case VOID_FTYPE_PCHAR_V32QI:
10993 case VOID_FTYPE_PCHAR_V16QI:
10994 case VOID_FTYPE_PFLOAT_V16SF:
10995 case VOID_FTYPE_PFLOAT_V8SF:
10996 case VOID_FTYPE_PFLOAT_V4SF:
10997 case VOID_FTYPE_PDOUBLE_V8DF:
10998 case VOID_FTYPE_PDOUBLE_V4DF:
10999 case VOID_FTYPE_PDOUBLE_V2DF:
11000 case VOID_FTYPE_PLONGLONG_LONGLONG:
11001 case VOID_FTYPE_PULONGLONG_ULONGLONG:
11002 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
11003 case VOID_FTYPE_PINT_INT:
11004 nargs = 1;
11005 klass = store;
11006 /* Reserve memory operand for target. */
715a8bc8 11007 memory = ARRAY_SIZE (xops);
11008 switch (icode)
11009 {
11010 /* These builtins and instructions require the memory
11011 to be properly aligned. */
11012 case CODE_FOR_avx_movntv4di:
11013 case CODE_FOR_sse2_movntv2di:
11014 case CODE_FOR_avx_movntv8sf:
11015 case CODE_FOR_sse_movntv4sf:
11016 case CODE_FOR_sse4a_vmmovntv4sf:
11017 case CODE_FOR_avx_movntv4df:
11018 case CODE_FOR_sse2_movntv2df:
11019 case CODE_FOR_sse4a_vmmovntv2df:
11020 case CODE_FOR_sse2_movntidi:
11021 case CODE_FOR_sse_movntq:
11022 case CODE_FOR_sse2_movntisi:
11023 case CODE_FOR_avx512f_movntv16sf:
11024 case CODE_FOR_avx512f_movntv8df:
11025 case CODE_FOR_avx512f_movntv8di:
11026 aligned_mem = true;
11027 break;
11028 default:
11029 break;
11030 }
11031 break;
11032 case VOID_FTYPE_PVOID_PCVOID:
11033 nargs = 1;
11034 klass = store;
11035 memory = 0;
11036
11037 break;
11038 case V4SF_FTYPE_V4SF_PCV2SF:
11039 case V2DF_FTYPE_V2DF_PCDOUBLE:
11040 nargs = 2;
11041 klass = load;
11042 memory = 1;
11043 break;
11044 case V8SF_FTYPE_PCV8SF_V8SI:
11045 case V4DF_FTYPE_PCV4DF_V4DI:
11046 case V4SF_FTYPE_PCV4SF_V4SI:
11047 case V2DF_FTYPE_PCV2DF_V2DI:
11048 case V8SI_FTYPE_PCV8SI_V8SI:
11049 case V4DI_FTYPE_PCV4DI_V4DI:
11050 case V4SI_FTYPE_PCV4SI_V4SI:
11051 case V2DI_FTYPE_PCV2DI_V2DI:
11052 case VOID_FTYPE_INT_INT64:
11053 nargs = 2;
11054 klass = load;
11055 memory = 0;
11056 break;
11057 case VOID_FTYPE_PV8DF_V8DF_UQI:
11058 case VOID_FTYPE_PV4DF_V4DF_UQI:
11059 case VOID_FTYPE_PV2DF_V2DF_UQI:
11060 case VOID_FTYPE_PV16SF_V16SF_UHI:
11061 case VOID_FTYPE_PV8SF_V8SF_UQI:
11062 case VOID_FTYPE_PV4SF_V4SF_UQI:
11063 case VOID_FTYPE_PV8DI_V8DI_UQI:
11064 case VOID_FTYPE_PV4DI_V4DI_UQI:
11065 case VOID_FTYPE_PV2DI_V2DI_UQI:
11066 case VOID_FTYPE_PV16SI_V16SI_UHI:
11067 case VOID_FTYPE_PV8SI_V8SI_UQI:
11068 case VOID_FTYPE_PV4SI_V4SI_UQI:
11069 case VOID_FTYPE_PV64QI_V64QI_UDI:
11070 case VOID_FTYPE_PV32HI_V32HI_USI:
11071 case VOID_FTYPE_PV32QI_V32QI_USI:
11072 case VOID_FTYPE_PV16QI_V16QI_UHI:
11073 case VOID_FTYPE_PV16HI_V16HI_UHI:
11074 case VOID_FTYPE_PV8HI_V8HI_UQI:
11075 switch (icode)
11076 {
11077 /* These builtins and instructions require the memory
11078 to be properly aligned. */
11079 case CODE_FOR_avx512f_storev16sf_mask:
11080 case CODE_FOR_avx512f_storev16si_mask:
11081 case CODE_FOR_avx512f_storev8df_mask:
11082 case CODE_FOR_avx512f_storev8di_mask:
11083 case CODE_FOR_avx512vl_storev8sf_mask:
11084 case CODE_FOR_avx512vl_storev8si_mask:
11085 case CODE_FOR_avx512vl_storev4df_mask:
11086 case CODE_FOR_avx512vl_storev4di_mask:
11087 case CODE_FOR_avx512vl_storev4sf_mask:
11088 case CODE_FOR_avx512vl_storev4si_mask:
11089 case CODE_FOR_avx512vl_storev2df_mask:
11090 case CODE_FOR_avx512vl_storev2di_mask:
11091 aligned_mem = true;
11092 break;
11093 default:
11094 break;
11095 }
11096 /* FALLTHRU */
11097 case VOID_FTYPE_PV8SF_V8SI_V8SF:
11098 case VOID_FTYPE_PV4DF_V4DI_V4DF:
11099 case VOID_FTYPE_PV4SF_V4SI_V4SF:
11100 case VOID_FTYPE_PV2DF_V2DI_V2DF:
11101 case VOID_FTYPE_PV8SI_V8SI_V8SI:
11102 case VOID_FTYPE_PV4DI_V4DI_V4DI:
11103 case VOID_FTYPE_PV4SI_V4SI_V4SI:
11104 case VOID_FTYPE_PV2DI_V2DI_V2DI:
11105 case VOID_FTYPE_PV8SI_V8DI_UQI:
11106 case VOID_FTYPE_PV8HI_V8DI_UQI:
11107 case VOID_FTYPE_PV16HI_V16SI_UHI:
4a948703 11108 case VOID_FTYPE_PUDI_V8DI_UQI:
2bf6d935
ML
11109 case VOID_FTYPE_PV16QI_V16SI_UHI:
11110 case VOID_FTYPE_PV4SI_V4DI_UQI:
4a948703 11111 case VOID_FTYPE_PUDI_V2DI_UQI:
11112 case VOID_FTYPE_PUDI_V4DI_UQI:
11113 case VOID_FTYPE_PUSI_V2DI_UQI:
2bf6d935 11114 case VOID_FTYPE_PV8HI_V8SI_UQI:
4a948703 11115 case VOID_FTYPE_PUDI_V4SI_UQI:
11116 case VOID_FTYPE_PUSI_V4DI_UQI:
11117 case VOID_FTYPE_PUHI_V2DI_UQI:
11118 case VOID_FTYPE_PUDI_V8SI_UQI:
11119 case VOID_FTYPE_PUSI_V4SI_UQI:
2bf6d935
ML
11120 case VOID_FTYPE_PCHAR_V64QI_UDI:
11121 case VOID_FTYPE_PCHAR_V32QI_USI:
11122 case VOID_FTYPE_PCHAR_V16QI_UHI:
11123 case VOID_FTYPE_PSHORT_V32HI_USI:
11124 case VOID_FTYPE_PSHORT_V16HI_UHI:
11125 case VOID_FTYPE_PSHORT_V8HI_UQI:
11126 case VOID_FTYPE_PINT_V16SI_UHI:
11127 case VOID_FTYPE_PINT_V8SI_UQI:
11128 case VOID_FTYPE_PINT_V4SI_UQI:
11129 case VOID_FTYPE_PINT64_V8DI_UQI:
11130 case VOID_FTYPE_PINT64_V4DI_UQI:
11131 case VOID_FTYPE_PINT64_V2DI_UQI:
11132 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
11133 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
11134 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
11135 case VOID_FTYPE_PFLOAT_V16SF_UHI:
11136 case VOID_FTYPE_PFLOAT_V8SF_UQI:
11137 case VOID_FTYPE_PFLOAT_V4SF_UQI:
c4d423c7 11138 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
11139 case VOID_FTYPE_PV32QI_V32HI_USI:
11140 case VOID_FTYPE_PV16QI_V16HI_UHI:
4a948703 11141 case VOID_FTYPE_PUDI_V8HI_UQI:
2bf6d935
ML
11142 nargs = 2;
11143 klass = store;
11144 /* Reserve memory operand for target. */
715a8bc8 11145 memory = ARRAY_SIZE (xops);
2bf6d935
ML
11146 break;
11147 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
11148 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
11149 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
11150 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
11151 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
11152 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
11153 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
11154 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
11155 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
11156 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
11157 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
11158 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
11159 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
11160 case V32HI_FTYPE_PCV32HI_V32HI_USI:
11161 case V32QI_FTYPE_PCV32QI_V32QI_USI:
11162 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
11163 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
11164 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
11165 switch (icode)
11166 {
11167 /* These builtins and instructions require the memory
11168 to be properly aligned. */
11169 case CODE_FOR_avx512f_loadv16sf_mask:
11170 case CODE_FOR_avx512f_loadv16si_mask:
11171 case CODE_FOR_avx512f_loadv8df_mask:
11172 case CODE_FOR_avx512f_loadv8di_mask:
11173 case CODE_FOR_avx512vl_loadv8sf_mask:
11174 case CODE_FOR_avx512vl_loadv8si_mask:
11175 case CODE_FOR_avx512vl_loadv4df_mask:
11176 case CODE_FOR_avx512vl_loadv4di_mask:
11177 case CODE_FOR_avx512vl_loadv4sf_mask:
11178 case CODE_FOR_avx512vl_loadv4si_mask:
11179 case CODE_FOR_avx512vl_loadv2df_mask:
11180 case CODE_FOR_avx512vl_loadv2di_mask:
11181 case CODE_FOR_avx512bw_loadv64qi_mask:
11182 case CODE_FOR_avx512vl_loadv32qi_mask:
11183 case CODE_FOR_avx512vl_loadv16qi_mask:
11184 case CODE_FOR_avx512bw_loadv32hi_mask:
11185 case CODE_FOR_avx512vl_loadv16hi_mask:
11186 case CODE_FOR_avx512vl_loadv8hi_mask:
11187 aligned_mem = true;
11188 break;
11189 default:
11190 break;
11191 }
11192 /* FALLTHRU */
11193 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
11194 case V32QI_FTYPE_PCCHAR_V32QI_USI:
11195 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
11196 case V32HI_FTYPE_PCSHORT_V32HI_USI:
11197 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
11198 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
11199 case V16SI_FTYPE_PCINT_V16SI_UHI:
11200 case V8SI_FTYPE_PCINT_V8SI_UQI:
11201 case V4SI_FTYPE_PCINT_V4SI_UQI:
11202 case V8DI_FTYPE_PCINT64_V8DI_UQI:
11203 case V4DI_FTYPE_PCINT64_V4DI_UQI:
11204 case V2DI_FTYPE_PCINT64_V2DI_UQI:
11205 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
11206 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
11207 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
11208 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
11209 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
11210 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
c4d423c7 11211 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
2bf6d935
ML
11212 nargs = 3;
11213 klass = load;
11214 memory = 0;
11215 break;
2bf6d935
ML
11216 default:
11217 gcc_unreachable ();
11218 }
11219
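  /* At this point NARGS is the number of remaining call arguments to
     expand, KLASS says whether the builtin is a load or a store, and
     MEMORY is the index into XOPS of the operand that must be a memory
     reference (ARRAY_SIZE (xops) if none of the remaining operands is).  */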
715a8bc8 11220 gcc_assert (nargs <= ARRAY_SIZE (xops));
2bf6d935
ML
11221
11222 if (klass == store)
11223 {
11224 arg = CALL_EXPR_ARG (exp, 0);
11225 op = expand_normal (arg);
11226 gcc_assert (target == 0);
11227 if (memory)
11228 {
11229 op = ix86_zero_extend_to_Pmode (op);
11230 target = gen_rtx_MEM (tmode, op);
11231 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
11232 on it. Try to improve it using get_pointer_alignment,
11233 and if the special builtin is one that requires strict
 11234			 mode alignment, also from its GET_MODE_ALIGNMENT.
11235 Failure to do so could lead to ix86_legitimate_combined_insn
11236 rejecting all changes to such insns. */
11237 unsigned int align = get_pointer_alignment (arg);
11238 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
11239 align = GET_MODE_ALIGNMENT (tmode);
11240 if (MEM_ALIGN (target) < align)
11241 set_mem_align (target, align);
11242 }
11243 else
11244 target = force_reg (tmode, op);
11245 arg_adjust = 1;
11246 }
11247 else
11248 {
11249 arg_adjust = 0;
11250 if (optimize
11251 || target == 0
11252 || !register_operand (target, tmode)
11253 || GET_MODE (target) != tmode)
11254 target = gen_reg_rtx (tmode);
11255 }
11256
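  /* Expand the remaining call arguments: the operand at index MEMORY is
     turned into a MEM with the best alignment we can prove, while every
     other operand is coerced to the mode the insn pattern expects (an
     all-ones mask operand is kept as a constant so the expander can
     simplify it).  */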
11257 for (i = 0; i < nargs; i++)
11258 {
11259 machine_mode mode = insn_p->operand[i + 1].mode;
2bf6d935
ML
11260
11261 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
11262 op = expand_normal (arg);
2bf6d935 11263
776a37f6 11264 if (i == memory)
2bf6d935 11265 {
776a37f6 11266 /* This must be the memory operand. */
11267 op = ix86_zero_extend_to_Pmode (op);
11268 op = gen_rtx_MEM (mode, op);
11269 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
11270 on it. Try to improve it using get_pointer_alignment,
11271 and if the special builtin is one that requires strict
 11272		     mode alignment, also from its GET_MODE_ALIGNMENT.
11273 Failure to do so could lead to ix86_legitimate_combined_insn
11274 rejecting all changes to such insns. */
11275 unsigned int align = get_pointer_alignment (arg);
11276 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
11277 align = GET_MODE_ALIGNMENT (mode);
11278 if (MEM_ALIGN (op) < align)
11279 set_mem_align (op, align);
2bf6d935
ML
11280 }
11281 else
11282 {
776a37f6 11283	  /* This must be a register.  */
11284 if (VECTOR_MODE_P (mode))
11285 op = safe_vector_operand (op, mode);
2bf6d935 11286
776a37f6 11287 op = fixup_modeless_constant (op, mode);
2bf6d935 11288
b6efffa5 11289	  /* NB: A 3-operand load implies a mask load or v{p}expand*,
35c4c67e 11290	     and the mask operand should be at the end.
 11291	     Keep an all-ones mask, which will be simplified by the expander.  */
11292 if (nargs == 3 && i == 2 && klass == load
b6efffa5 11293 && constm1_operand (op, mode)
11294 && insn_p->operand[i].predicate (op, mode))
35c4c67e 11295 ;
11296 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
776a37f6 11297 op = copy_to_mode_reg (mode, op);
11298 else
11299 {
11300 op = copy_to_reg (op);
11301 op = lowpart_subreg (mode, op, GET_MODE (op));
2bf6d935
ML
11302 }
11303 }
11304
715a8bc8 11305       xops[i] = op;
2bf6d935
ML
11306 }
11307
11308 switch (nargs)
11309 {
11310 case 0:
11311 pat = GEN_FCN (icode) (target);
11312 break;
11313 case 1:
715a8bc8 11314 pat = GEN_FCN (icode) (target, xops[0]);
2bf6d935
ML
11315 break;
11316 case 2:
715a8bc8 11317 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
2bf6d935
ML
11318 break;
11319 case 3:
715a8bc8 11320 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
2bf6d935
ML
11321 break;
11322 default:
11323 gcc_unreachable ();
11324 }
11325
11326 if (! pat)
11327 return 0;
715a8bc8 11328
2bf6d935
ML
11329 emit_insn (pat);
11330 return klass == store ? 0 : target;
11331}
11332
11333/* Return the integer constant in ARG. Constrain it to be in the range
11334 of the subparts of VEC_TYPE; issue an error if not. */
11335
11336static int
11337get_element_number (tree vec_type, tree arg)
11338{
11339 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
11340
11341 if (!tree_fits_uhwi_p (arg)
11342 || (elt = tree_to_uhwi (arg), elt > max))
11343 {
a9c697b8
MS
11344 error ("selector must be an integer constant in the range "
11345 "[0, %wi]", max);
2bf6d935
ML
11346 return 0;
11347 }
11348
11349 return elt;
11350}
11351
11352/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
11353 ix86_expand_vector_init. We DO have language-level syntax for this, in
11354 the form of (type){ init-list }. Except that since we can't place emms
11355 instructions from inside the compiler, we can't allow the use of MMX
11356 registers unless the user explicitly asks for it. So we do *not* define
11357 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
 11358   we have builtins invoked by mmintrin.h that give us license to emit
11359 these sorts of instructions. */
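/* For instance, _mm_set_pi32 in mmintrin.h is implemented via
   __builtin_ia32_vec_init_v2si, which is dispatched to this routine and
   lowered through ix86_expand_vector_init.  */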
11360
11361static rtx
11362ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
11363{
11364 machine_mode tmode = TYPE_MODE (type);
11365 machine_mode inner_mode = GET_MODE_INNER (tmode);
11366 int i, n_elt = GET_MODE_NUNITS (tmode);
11367 rtvec v = rtvec_alloc (n_elt);
11368
11369 gcc_assert (VECTOR_MODE_P (tmode));
11370 gcc_assert (call_expr_nargs (exp) == n_elt);
11371
11372 for (i = 0; i < n_elt; ++i)
11373 {
11374 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
11375 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
11376 }
11377
11378 if (!target || !register_operand (target, tmode))
11379 target = gen_reg_rtx (tmode);
11380
11381 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
11382 return target;
11383}
11384
11385/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
11386 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
11387 had a language-level syntax for referencing vector elements. */
11388
11389static rtx
11390ix86_expand_vec_ext_builtin (tree exp, rtx target)
11391{
11392 machine_mode tmode, mode0;
11393 tree arg0, arg1;
11394 int elt;
11395 rtx op0;
11396
11397 arg0 = CALL_EXPR_ARG (exp, 0);
11398 arg1 = CALL_EXPR_ARG (exp, 1);
11399
11400 op0 = expand_normal (arg0);
11401 elt = get_element_number (TREE_TYPE (arg0), arg1);
11402
11403 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
11404 mode0 = TYPE_MODE (TREE_TYPE (arg0));
11405 gcc_assert (VECTOR_MODE_P (mode0));
11406
11407 op0 = force_reg (mode0, op0);
11408
11409 if (optimize || !target || !register_operand (target, tmode))
11410 target = gen_reg_rtx (tmode);
11411
11412 ix86_expand_vector_extract (true, target, op0, elt);
11413
11414 return target;
11415}
11416
11417/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
11418 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
11419 a language-level syntax for referencing vector elements. */
11420
11421static rtx
11422ix86_expand_vec_set_builtin (tree exp)
11423{
11424 machine_mode tmode, mode1;
11425 tree arg0, arg1, arg2;
11426 int elt;
11427 rtx op0, op1, target;
11428
11429 arg0 = CALL_EXPR_ARG (exp, 0);
11430 arg1 = CALL_EXPR_ARG (exp, 1);
11431 arg2 = CALL_EXPR_ARG (exp, 2);
11432
11433 tmode = TYPE_MODE (TREE_TYPE (arg0));
11434 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
11435 gcc_assert (VECTOR_MODE_P (tmode));
11436
11437 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
11438 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
11439 elt = get_element_number (TREE_TYPE (arg0), arg2);
11440
11441 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
11442 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
11443
11444 op0 = force_reg (tmode, op0);
11445 op1 = force_reg (mode1, op1);
11446
11447 /* OP0 is the source of these builtin functions and shouldn't be
 11448      modified.  Create a copy, use it, and return it as the target.  */
11449 target = gen_reg_rtx (tmode);
11450 emit_move_insn (target, op0);
11451 ix86_expand_vector_set (true, target, op1, elt);
11452
11453 return target;
11454}
11455
11456/* Expand an expression EXP that calls a built-in function,
11457 with result going to TARGET if that's convenient
11458 (and in mode MODE if that's convenient).
11459 SUBTARGET may be used as the target for computing one of EXP's operands.
11460 IGNORE is nonzero if the value is to be ignored. */
11461
11462rtx
11463ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
11464 machine_mode mode, int ignore)
11465{
11466 size_t i;
11467 enum insn_code icode, icode2;
11468 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11469 tree arg0, arg1, arg2, arg3, arg4;
11470 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
11471 machine_mode mode0, mode1, mode2, mode3, mode4;
4d732405 11472 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
2bf6d935
ML
11473
11474 /* For CPU builtins that can be folded, fold first and expand the fold. */
11475 switch (fcode)
11476 {
11477 case IX86_BUILTIN_CPU_INIT:
11478 {
11479 /* Make it call __cpu_indicator_init in libgcc. */
11480 tree call_expr, fndecl, type;
11481 type = build_function_type_list (integer_type_node, NULL_TREE);
11482 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11483 call_expr = build_call_expr (fndecl, 0);
11484 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11485 }
11486 case IX86_BUILTIN_CPU_IS:
11487 case IX86_BUILTIN_CPU_SUPPORTS:
11488 {
11489 tree arg0 = CALL_EXPR_ARG (exp, 0);
11490 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11491 gcc_assert (fold_expr != NULL_TREE);
11492 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11493 }
11494 }
11495
11496 HOST_WIDE_INT isa = ix86_isa_flags;
11497 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11498 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11499 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11500 /* The general case is we require all the ISAs specified in bisa{,2}
11501 to be enabled.
11502 The exceptions are:
11503 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11504 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11505 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
ca813880 11506 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11507 OPTION_MASK_ISA2_AVXVNNI
a13d6ec8
JJ
11508 where for each such pair it is sufficient if either of the ISAs is
 11509     enabled, and if the pair is ORed with other options, those others too.
11510 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
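/* For example, a builtin whose descriptor requires
   OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is usable when either
   -mfma or -mfma4 is enabled; the adjustments below widen the effective
   ISA/ISA2 so that the subset check further down still succeeds in that
   case.  */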
2bf6d935
ML
11511 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11512 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11513 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11514 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
db3f0d21 11515
2bf6d935
ML
11516 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11517 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11518 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11519 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
db3f0d21 11520
2bf6d935
ML
11521 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11522 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11523 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11524 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
6058b874 11525
ca813880 11526 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11527 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11528 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
11529 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11530 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11531 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
11532 {
11533 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
11534 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
11535 }
11536
db3f0d21
UB
11537 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
11538 /* __builtin_ia32_maskmovq requires MMX registers. */
6058b874 11539 && fcode != IX86_BUILTIN_MASKMOVQ)
a13d6ec8
JJ
11540 {
11541 bisa &= ~OPTION_MASK_ISA_MMX;
11542 bisa |= OPTION_MASK_ISA_SSE2;
ecfdb16c 11543 }
6058b874 11544
2bf6d935
ML
11545 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11546 {
11547 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11548 if (TARGET_ABI_X32)
11549 bisa |= OPTION_MASK_ABI_X32;
11550 else
11551 bisa |= OPTION_MASK_ABI_64;
11552 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
46e6341f
JJ
11553 (enum fpmath_unit) 0,
11554 (enum prefer_vector_width) 0,
11555 false, add_abi_p);
2bf6d935
ML
11556 if (!opts)
11557 error ("%qE needs unknown isa option", fndecl);
11558 else
11559 {
11560 gcc_assert (opts != NULL);
11561 error ("%qE needs isa option %s", fndecl, opts);
11562 free (opts);
11563 }
11564 return expand_call (exp, target, ignore);
11565 }
11566
11567 switch (fcode)
11568 {
11569 case IX86_BUILTIN_MASKMOVQ:
11570 case IX86_BUILTIN_MASKMOVDQU:
11571 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11572 ? CODE_FOR_mmx_maskmovq
11573 : CODE_FOR_sse2_maskmovdqu);
11574 /* Note the arg order is different from the operand order. */
11575 arg1 = CALL_EXPR_ARG (exp, 0);
11576 arg2 = CALL_EXPR_ARG (exp, 1);
11577 arg0 = CALL_EXPR_ARG (exp, 2);
11578 op0 = expand_normal (arg0);
11579 op1 = expand_normal (arg1);
11580 op2 = expand_normal (arg2);
11581 mode0 = insn_data[icode].operand[0].mode;
11582 mode1 = insn_data[icode].operand[1].mode;
11583 mode2 = insn_data[icode].operand[2].mode;
11584
11585 op0 = ix86_zero_extend_to_Pmode (op0);
11586 op0 = gen_rtx_MEM (mode1, op0);
11587
11588 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11589 op0 = copy_to_mode_reg (mode0, op0);
11590 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11591 op1 = copy_to_mode_reg (mode1, op1);
11592 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11593 op2 = copy_to_mode_reg (mode2, op2);
11594 pat = GEN_FCN (icode) (op0, op1, op2);
11595 if (! pat)
11596 return 0;
11597 emit_insn (pat);
11598 return 0;
11599
11600 case IX86_BUILTIN_LDMXCSR:
11601 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11602 target = assign_386_stack_local (SImode, SLOT_TEMP);
11603 emit_move_insn (target, op0);
11604 emit_insn (gen_sse_ldmxcsr (target));
11605 return 0;
11606
11607 case IX86_BUILTIN_STMXCSR:
11608 target = assign_386_stack_local (SImode, SLOT_TEMP);
11609 emit_insn (gen_sse_stmxcsr (target));
11610 return copy_to_mode_reg (SImode, target);
11611
11612 case IX86_BUILTIN_CLFLUSH:
11613 arg0 = CALL_EXPR_ARG (exp, 0);
11614 op0 = expand_normal (arg0);
11615 icode = CODE_FOR_sse2_clflush;
11616 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11617 op0 = ix86_zero_extend_to_Pmode (op0);
11618
11619 emit_insn (gen_sse2_clflush (op0));
11620 return 0;
11621
11622 case IX86_BUILTIN_CLWB:
11623 arg0 = CALL_EXPR_ARG (exp, 0);
11624 op0 = expand_normal (arg0);
11625 icode = CODE_FOR_clwb;
11626 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11627 op0 = ix86_zero_extend_to_Pmode (op0);
11628
11629 emit_insn (gen_clwb (op0));
11630 return 0;
11631
11632 case IX86_BUILTIN_CLFLUSHOPT:
11633 arg0 = CALL_EXPR_ARG (exp, 0);
11634 op0 = expand_normal (arg0);
11635 icode = CODE_FOR_clflushopt;
11636 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11637 op0 = ix86_zero_extend_to_Pmode (op0);
11638
11639 emit_insn (gen_clflushopt (op0));
11640 return 0;
11641
11642 case IX86_BUILTIN_MONITOR:
11643 case IX86_BUILTIN_MONITORX:
11644 arg0 = CALL_EXPR_ARG (exp, 0);
11645 arg1 = CALL_EXPR_ARG (exp, 1);
11646 arg2 = CALL_EXPR_ARG (exp, 2);
11647 op0 = expand_normal (arg0);
11648 op1 = expand_normal (arg1);
11649 op2 = expand_normal (arg2);
11650 if (!REG_P (op0))
11651 op0 = ix86_zero_extend_to_Pmode (op0);
11652 if (!REG_P (op1))
11653 op1 = copy_to_mode_reg (SImode, op1);
11654 if (!REG_P (op2))
11655 op2 = copy_to_mode_reg (SImode, op2);
11656
11657 emit_insn (fcode == IX86_BUILTIN_MONITOR
a963ca40
UB
11658 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11659 : gen_monitorx (Pmode, op0, op1, op2));
2bf6d935
ML
11660 return 0;
11661
11662 case IX86_BUILTIN_MWAIT:
11663 arg0 = CALL_EXPR_ARG (exp, 0);
11664 arg1 = CALL_EXPR_ARG (exp, 1);
11665 op0 = expand_normal (arg0);
11666 op1 = expand_normal (arg1);
11667 if (!REG_P (op0))
11668 op0 = copy_to_mode_reg (SImode, op0);
11669 if (!REG_P (op1))
11670 op1 = copy_to_mode_reg (SImode, op1);
11671 emit_insn (gen_sse3_mwait (op0, op1));
11672 return 0;
11673
11674 case IX86_BUILTIN_MWAITX:
11675 arg0 = CALL_EXPR_ARG (exp, 0);
11676 arg1 = CALL_EXPR_ARG (exp, 1);
11677 arg2 = CALL_EXPR_ARG (exp, 2);
11678 op0 = expand_normal (arg0);
11679 op1 = expand_normal (arg1);
11680 op2 = expand_normal (arg2);
11681 if (!REG_P (op0))
11682 op0 = copy_to_mode_reg (SImode, op0);
11683 if (!REG_P (op1))
11684 op1 = copy_to_mode_reg (SImode, op1);
11685 if (!REG_P (op2))
11686 op2 = copy_to_mode_reg (SImode, op2);
11687 emit_insn (gen_mwaitx (op0, op1, op2));
11688 return 0;
11689
11690 case IX86_BUILTIN_UMONITOR:
11691 arg0 = CALL_EXPR_ARG (exp, 0);
11692 op0 = expand_normal (arg0);
11693
11694 op0 = ix86_zero_extend_to_Pmode (op0);
987a3082 11695 emit_insn (gen_umonitor (Pmode, op0));
2bf6d935
ML
11696 return 0;
11697
11698 case IX86_BUILTIN_UMWAIT:
11699 case IX86_BUILTIN_TPAUSE:
11700 arg0 = CALL_EXPR_ARG (exp, 0);
11701 arg1 = CALL_EXPR_ARG (exp, 1);
11702 op0 = expand_normal (arg0);
11703 op1 = expand_normal (arg1);
11704
11705 if (!REG_P (op0))
11706 op0 = copy_to_mode_reg (SImode, op0);
11707
11708 op1 = force_reg (DImode, op1);
11709
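      /* On 64-bit targets the 64-bit counter argument is split into two
	 SImode halves with a 32-bit logical shift before invoking the
	 *_rex64 patterns.  */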
11710 if (TARGET_64BIT)
11711 {
11712 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11713 NULL, 1, OPTAB_DIRECT);
11714 switch (fcode)
11715 {
11716 case IX86_BUILTIN_UMWAIT:
11717 icode = CODE_FOR_umwait_rex64;
11718 break;
11719 case IX86_BUILTIN_TPAUSE:
11720 icode = CODE_FOR_tpause_rex64;
11721 break;
11722 default:
11723 gcc_unreachable ();
11724 }
11725
11726 op2 = gen_lowpart (SImode, op2);
11727 op1 = gen_lowpart (SImode, op1);
11728 pat = GEN_FCN (icode) (op0, op1, op2);
11729 }
11730 else
11731 {
11732 switch (fcode)
11733 {
11734 case IX86_BUILTIN_UMWAIT:
11735 icode = CODE_FOR_umwait;
11736 break;
11737 case IX86_BUILTIN_TPAUSE:
11738 icode = CODE_FOR_tpause;
11739 break;
11740 default:
11741 gcc_unreachable ();
11742 }
11743 pat = GEN_FCN (icode) (op0, op1);
11744 }
11745
11746 if (!pat)
11747 return 0;
11748
11749 emit_insn (pat);
11750
11751 if (target == 0
11752 || !register_operand (target, QImode))
11753 target = gen_reg_rtx (QImode);
11754
11755 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11756 const0_rtx);
11757 emit_insn (gen_rtx_SET (target, pat));
11758
11759 return target;
11760
299a53d7 11761 case IX86_BUILTIN_TESTUI:
11762 emit_insn (gen_testui ());
11763
11764 if (target == 0
11765 || !register_operand (target, QImode))
11766 target = gen_reg_rtx (QImode);
11767
11768 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11769 const0_rtx);
11770 emit_insn (gen_rtx_SET (target, pat));
11771
11772 return target;
11773
2bf6d935
ML
11774 case IX86_BUILTIN_CLZERO:
11775 arg0 = CALL_EXPR_ARG (exp, 0);
11776 op0 = expand_normal (arg0);
11777 if (!REG_P (op0))
11778 op0 = ix86_zero_extend_to_Pmode (op0);
a963ca40 11779 emit_insn (gen_clzero (Pmode, op0));
2bf6d935
ML
11780 return 0;
11781
11782 case IX86_BUILTIN_CLDEMOTE:
11783 arg0 = CALL_EXPR_ARG (exp, 0);
11784 op0 = expand_normal (arg0);
11785 icode = CODE_FOR_cldemote;
11786 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11787 op0 = ix86_zero_extend_to_Pmode (op0);
11788
11789 emit_insn (gen_cldemote (op0));
11790 return 0;
11791
632a2f50 11792 case IX86_BUILTIN_LOADIWKEY:
11793 {
11794 arg0 = CALL_EXPR_ARG (exp, 0);
11795 arg1 = CALL_EXPR_ARG (exp, 1);
11796 arg2 = CALL_EXPR_ARG (exp, 2);
11797 arg3 = CALL_EXPR_ARG (exp, 3);
11798
11799 op0 = expand_normal (arg0);
11800 op1 = expand_normal (arg1);
11801 op2 = expand_normal (arg2);
11802 op3 = expand_normal (arg3);
11803
11804 if (!REG_P (op0))
11805 op0 = copy_to_mode_reg (V2DImode, op0);
11806 if (!REG_P (op1))
11807 op1 = copy_to_mode_reg (V2DImode, op1);
11808 if (!REG_P (op2))
11809 op2 = copy_to_mode_reg (V2DImode, op2);
11810 if (!REG_P (op3))
11811 op3 = copy_to_mode_reg (SImode, op3);
11812
11813 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
11814
11815 return 0;
11816 }
11817
11818 case IX86_BUILTIN_AESDEC128KLU8:
11819 icode = CODE_FOR_aesdec128klu8;
11820 goto aesdecenc_expand;
11821
11822 case IX86_BUILTIN_AESDEC256KLU8:
11823 icode = CODE_FOR_aesdec256klu8;
11824 goto aesdecenc_expand;
11825
11826 case IX86_BUILTIN_AESENC128KLU8:
11827 icode = CODE_FOR_aesenc128klu8;
11828 goto aesdecenc_expand;
11829
11830 case IX86_BUILTIN_AESENC256KLU8:
11831 icode = CODE_FOR_aesenc256klu8;
11832
11833 aesdecenc_expand:
11834
11835 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
11836 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
11837 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11838
11839 op0 = expand_normal (arg0);
11840 op1 = expand_normal (arg1);
11841 op2 = expand_normal (arg2);
11842
11843 if (!address_operand (op0, V2DImode))
11844 {
11845 op0 = convert_memory_address (Pmode, op0);
11846 op0 = copy_addr_to_reg (op0);
11847 }
11848 op0 = gen_rtx_MEM (V2DImode, op0);
11849
11850 if (!REG_P (op1))
11851 op1 = copy_to_mode_reg (V2DImode, op1);
11852
11853 if (!address_operand (op2, VOIDmode))
11854 {
11855 op2 = convert_memory_address (Pmode, op2);
11856 op2 = copy_addr_to_reg (op2);
11857 }
11858 op2 = gen_rtx_MEM (BLKmode, op2);
11859
11860 emit_insn (GEN_FCN (icode) (op1, op1, op2));
11861
11862 if (target == 0)
11863 target = gen_reg_rtx (QImode);
11864
1aeefa57
HW
 11865       /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a
 11866	 runtime error occurs; in that case the output should be cleared
	 for safety.  */
11867 rtx_code_label *ok_label;
11868 rtx tmp;
11869
11870 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
11871 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
11872 ok_label = gen_label_rtx ();
11873 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
11874 true, ok_label);
 11875       /* The runtime error seldom occurs, so predict the OK path as hot
 11876	 and lay it out as the fallthrough block.  */
11877 predict_jump (REG_BR_PROB_BASE * 90 / 100);
11878
11879 emit_insn (gen_rtx_SET (op1, const0_rtx));
632a2f50 11880
1aeefa57
HW
11881 emit_label (ok_label);
11882 emit_insn (gen_rtx_SET (target, pat));
632a2f50 11883 emit_insn (gen_rtx_SET (op0, op1));
11884
11885 return target;
11886
11887 case IX86_BUILTIN_AESDECWIDE128KLU8:
11888 icode = CODE_FOR_aesdecwide128klu8;
11889 goto wideaesdecenc_expand;
11890
11891 case IX86_BUILTIN_AESDECWIDE256KLU8:
11892 icode = CODE_FOR_aesdecwide256klu8;
11893 goto wideaesdecenc_expand;
11894
11895 case IX86_BUILTIN_AESENCWIDE128KLU8:
11896 icode = CODE_FOR_aesencwide128klu8;
11897 goto wideaesdecenc_expand;
11898
11899 case IX86_BUILTIN_AESENCWIDE256KLU8:
11900 icode = CODE_FOR_aesencwide256klu8;
11901
11902 wideaesdecenc_expand:
11903
11904 rtx xmm_regs[8];
11905 rtx op;
11906
11907 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
11908 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
11909 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11910
11911 op0 = expand_normal (arg0);
11912 op1 = expand_normal (arg1);
11913 op2 = expand_normal (arg2);
11914
11915 if (!address_operand (op2, VOIDmode))
11916 {
11917 op2 = convert_memory_address (Pmode, op2);
11918 op2 = copy_addr_to_reg (op2);
11919 }
11920 op2 = gen_rtx_MEM (BLKmode, op2);
11921
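      /* The wide Key Locker patterns operate on the fixed hard registers
	 xmm0..xmm7, so load the eight 128-bit input blocks into those
	 registers before emitting the insn.  */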
11922 for (i = 0; i < 8; i++)
11923 {
11924 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11925
11926 op = gen_rtx_MEM (V2DImode,
11927 plus_constant (Pmode, op1, (i * 16)));
11928
11929 emit_move_insn (xmm_regs[i], op);
11930 }
11931
11932 emit_insn (GEN_FCN (icode) (op2));
11933
11934 if (target == 0)
11935 target = gen_reg_rtx (QImode);
11936
1aeefa57
HW
11937 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
11938 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
11939 ok_label = gen_label_rtx ();
11940 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
11941 true, ok_label);
11942 predict_jump (REG_BR_PROB_BASE * 90 / 100);
11943
11944 for (i = 0; i < 8; i++)
11945 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
11946
11947 emit_label (ok_label);
632a2f50 11948 emit_insn (gen_rtx_SET (target, pat));
11949
11950 for (i = 0; i < 8; i++)
11951 {
11952 op = gen_rtx_MEM (V2DImode,
11953 plus_constant (Pmode, op0, (i * 16)));
11954 emit_move_insn (op, xmm_regs[i]);
11955 }
11956
11957 return target;
11958
11959 case IX86_BUILTIN_ENCODEKEY128U32:
11960 {
11961 rtx op, xmm_regs[7];
11962
11963 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11964 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
11965 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
11966
11967 op0 = expand_normal (arg0);
11968 op1 = expand_normal (arg1);
11969 op2 = expand_normal (arg2);
11970
11971 if (!REG_P (op0))
11972 op0 = copy_to_mode_reg (SImode, op0);
11973
11974 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11975 emit_move_insn (op, op1);
11976
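	/* The key has been placed in xmm0; ENCODEKEY128 hands its handle
	   back in xmm0..xmm2, so remember those hard registers and store
	   them to the destination buffer after the insn is emitted.  */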
11977 for (i = 0; i < 3; i++)
11978 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11979
11980 if (target == 0)
11981 target = gen_reg_rtx (SImode);
11982
11983 emit_insn (gen_encodekey128u32 (target, op0));
11984
11985 for (i = 0; i < 3; i++)
11986 {
11987 op = gen_rtx_MEM (V2DImode,
11988 plus_constant (Pmode, op2, (i * 16)));
11989 emit_move_insn (op, xmm_regs[i]);
11990 }
11991
11992 return target;
11993 }
11994 case IX86_BUILTIN_ENCODEKEY256U32:
11995 {
11996 rtx op, xmm_regs[7];
11997
11998 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11999 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
12000 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
12001 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
12002
12003 op0 = expand_normal (arg0);
12004 op1 = expand_normal (arg1);
12005 op2 = expand_normal (arg2);
12006 op3 = expand_normal (arg3);
12007
12008 if (!REG_P (op0))
12009 op0 = copy_to_mode_reg (SImode, op0);
12010
 12011	/* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
12012 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12013 emit_move_insn (op, op1);
12014 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
12015 emit_move_insn (op, op2);
12016
12017 for (i = 0; i < 4; i++)
12018 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12019
12020 if (target == 0)
12021 target = gen_reg_rtx (SImode);
12022
12023 emit_insn (gen_encodekey256u32 (target, op0));
12024
12025 for (i = 0; i < 4; i++)
12026 {
12027 op = gen_rtx_MEM (V2DImode,
12028 plus_constant (Pmode, op3, (i * 16)));
12029 emit_move_insn (op, xmm_regs[i]);
12030 }
12031
12032 return target;
12033 }
12034
2bf6d935
ML
12035 case IX86_BUILTIN_VEC_INIT_V2SI:
12036 case IX86_BUILTIN_VEC_INIT_V4HI:
12037 case IX86_BUILTIN_VEC_INIT_V8QI:
12038 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
12039
12040 case IX86_BUILTIN_VEC_EXT_V2DF:
12041 case IX86_BUILTIN_VEC_EXT_V2DI:
12042 case IX86_BUILTIN_VEC_EXT_V4SF:
12043 case IX86_BUILTIN_VEC_EXT_V4SI:
12044 case IX86_BUILTIN_VEC_EXT_V8HI:
12045 case IX86_BUILTIN_VEC_EXT_V2SI:
12046 case IX86_BUILTIN_VEC_EXT_V4HI:
12047 case IX86_BUILTIN_VEC_EXT_V16QI:
12048 return ix86_expand_vec_ext_builtin (exp, target);
12049
12050 case IX86_BUILTIN_VEC_SET_V2DI:
12051 case IX86_BUILTIN_VEC_SET_V4SF:
12052 case IX86_BUILTIN_VEC_SET_V4SI:
12053 case IX86_BUILTIN_VEC_SET_V8HI:
12054 case IX86_BUILTIN_VEC_SET_V4HI:
12055 case IX86_BUILTIN_VEC_SET_V16QI:
12056 return ix86_expand_vec_set_builtin (exp);
12057
12058 case IX86_BUILTIN_NANQ:
12059 case IX86_BUILTIN_NANSQ:
12060 return expand_call (exp, target, ignore);
12061
12062 case IX86_BUILTIN_RDPID:
12063
12064 op0 = gen_reg_rtx (word_mode);
12065
12066 if (TARGET_64BIT)
12067 {
12068 insn = gen_rdpid_rex64 (op0);
12069 op0 = convert_to_mode (SImode, op0, 1);
12070 }
12071 else
12072 insn = gen_rdpid (op0);
12073
12074 emit_insn (insn);
12075
12076 if (target == 0
12077 || !register_operand (target, SImode))
12078 target = gen_reg_rtx (SImode);
12079
12080 emit_move_insn (target, op0);
12081 return target;
12082
e21b52af
HL
12083 case IX86_BUILTIN_2INTERSECTD512:
12084 case IX86_BUILTIN_2INTERSECTQ512:
12085 case IX86_BUILTIN_2INTERSECTD256:
12086 case IX86_BUILTIN_2INTERSECTQ256:
12087 case IX86_BUILTIN_2INTERSECTD128:
12088 case IX86_BUILTIN_2INTERSECTQ128:
12089 arg0 = CALL_EXPR_ARG (exp, 0);
12090 arg1 = CALL_EXPR_ARG (exp, 1);
12091 arg2 = CALL_EXPR_ARG (exp, 2);
12092 arg3 = CALL_EXPR_ARG (exp, 3);
12093 op0 = expand_normal (arg0);
12094 op1 = expand_normal (arg1);
12095 op2 = expand_normal (arg2);
12096 op3 = expand_normal (arg3);
12097
12098 if (!address_operand (op0, VOIDmode))
12099 {
12100 op0 = convert_memory_address (Pmode, op0);
12101 op0 = copy_addr_to_reg (op0);
12102 }
12103 if (!address_operand (op1, VOIDmode))
12104 {
12105 op1 = convert_memory_address (Pmode, op1);
12106 op1 = copy_addr_to_reg (op1);
12107 }
12108
12109 switch (fcode)
12110 {
12111 case IX86_BUILTIN_2INTERSECTD512:
12112 mode4 = P2HImode;
12113 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
12114 break;
12115 case IX86_BUILTIN_2INTERSECTQ512:
12116 mode4 = P2QImode;
12117 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
12118 break;
12119 case IX86_BUILTIN_2INTERSECTD256:
12120 mode4 = P2QImode;
12121 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
12122 break;
12123 case IX86_BUILTIN_2INTERSECTQ256:
12124 mode4 = P2QImode;
12125 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
12126 break;
12127 case IX86_BUILTIN_2INTERSECTD128:
12128 mode4 = P2QImode;
12129 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
12130 break;
12131 case IX86_BUILTIN_2INTERSECTQ128:
12132 mode4 = P2QImode;
12133 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
12134 break;
12135 default:
12136 gcc_unreachable ();
12137 }
12138
12139 mode2 = insn_data[icode].operand[1].mode;
12140 mode3 = insn_data[icode].operand[2].mode;
12141 if (!insn_data[icode].operand[1].predicate (op2, mode2))
12142 op2 = copy_to_mode_reg (mode2, op2);
12143 if (!insn_data[icode].operand[2].predicate (op3, mode3))
12144 op3 = copy_to_mode_reg (mode3, op3);
12145
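      /* The vp2intersect patterns produce a pair of mask registers,
	 modelled here as a single P2QI/P2HImode value; store its low half
	 through the first pointer and its high half through the second.  */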
12146 op4 = gen_reg_rtx (mode4);
12147 emit_insn (GEN_FCN (icode) (op4, op2, op3));
12148 mode0 = mode4 == P2HImode ? HImode : QImode;
12149 emit_move_insn (gen_rtx_MEM (mode0, op0),
12150 gen_lowpart (mode0, op4));
12151 emit_move_insn (gen_rtx_MEM (mode0, op1),
12152 gen_highpart (mode0, op4));
12153
12154 return 0;
12155
2bf6d935
ML
12156 case IX86_BUILTIN_RDPMC:
12157 case IX86_BUILTIN_RDTSC:
12158 case IX86_BUILTIN_RDTSCP:
12159 case IX86_BUILTIN_XGETBV:
12160
12161 op0 = gen_reg_rtx (DImode);
12162 op1 = gen_reg_rtx (DImode);
12163
12164 if (fcode == IX86_BUILTIN_RDPMC)
12165 {
12166 arg0 = CALL_EXPR_ARG (exp, 0);
12167 op2 = expand_normal (arg0);
12168 if (!register_operand (op2, SImode))
12169 op2 = copy_to_mode_reg (SImode, op2);
12170
12171 insn = (TARGET_64BIT
12172 ? gen_rdpmc_rex64 (op0, op1, op2)
12173 : gen_rdpmc (op0, op2));
12174 emit_insn (insn);
12175 }
12176 else if (fcode == IX86_BUILTIN_XGETBV)
12177 {
12178 arg0 = CALL_EXPR_ARG (exp, 0);
12179 op2 = expand_normal (arg0);
12180 if (!register_operand (op2, SImode))
12181 op2 = copy_to_mode_reg (SImode, op2);
12182
12183 insn = (TARGET_64BIT
12184 ? gen_xgetbv_rex64 (op0, op1, op2)
12185 : gen_xgetbv (op0, op2));
12186 emit_insn (insn);
12187 }
12188 else if (fcode == IX86_BUILTIN_RDTSC)
12189 {
12190 insn = (TARGET_64BIT
12191 ? gen_rdtsc_rex64 (op0, op1)
12192 : gen_rdtsc (op0));
12193 emit_insn (insn);
12194 }
12195 else
12196 {
12197 op2 = gen_reg_rtx (SImode);
12198
12199 insn = (TARGET_64BIT
12200 ? gen_rdtscp_rex64 (op0, op1, op2)
12201 : gen_rdtscp (op0, op2));
12202 emit_insn (insn);
12203
12204 arg0 = CALL_EXPR_ARG (exp, 0);
12205 op4 = expand_normal (arg0);
12206 if (!address_operand (op4, VOIDmode))
12207 {
12208 op4 = convert_memory_address (Pmode, op4);
12209 op4 = copy_addr_to_reg (op4);
12210 }
12211 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
12212 }
12213
12214 if (target == 0
12215 || !register_operand (target, DImode))
12216 target = gen_reg_rtx (DImode);
12217
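      /* The result comes back as two DImode registers holding the low and
	 high 32-bit halves; on 64-bit targets combine them into a single
	 value with a shift and an IOR.  */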
12218 if (TARGET_64BIT)
12219 {
12220 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
12221 op1, 1, OPTAB_DIRECT);
12222 op0 = expand_simple_binop (DImode, IOR, op0, op1,
12223 op0, 1, OPTAB_DIRECT);
12224 }
12225
12226 emit_move_insn (target, op0);
12227 return target;
12228
6a10feda
XG
12229 case IX86_BUILTIN_ENQCMD:
12230 case IX86_BUILTIN_ENQCMDS:
2bf6d935
ML
12231 case IX86_BUILTIN_MOVDIR64B:
12232
12233 arg0 = CALL_EXPR_ARG (exp, 0);
12234 arg1 = CALL_EXPR_ARG (exp, 1);
12235 op0 = expand_normal (arg0);
12236 op1 = expand_normal (arg1);
12237
12238 op0 = ix86_zero_extend_to_Pmode (op0);
12239 if (!address_operand (op1, VOIDmode))
12240 {
12241 op1 = convert_memory_address (Pmode, op1);
12242 op1 = copy_addr_to_reg (op1);
12243 }
12244 op1 = gen_rtx_MEM (XImode, op1);
12245
6a10feda
XG
12246 if (fcode == IX86_BUILTIN_MOVDIR64B)
12247 {
12248 emit_insn (gen_movdir64b (Pmode, op0, op1));
12249 return 0;
12250 }
12251 else
12252 {
44320665
UB
12253 if (target == 0
12254 || !register_operand (target, SImode))
12255 target = gen_reg_rtx (SImode);
6a10feda 12256
6a10feda
XG
12257 emit_move_insn (target, const0_rtx);
12258 target = gen_rtx_SUBREG (QImode, target, 0);
12259
44320665
UB
12260 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
12261 ? UNSPECV_ENQCMD
12262 : UNSPECV_ENQCMDS);
12263 icode = code_for_enqcmd (unspecv, Pmode);
12264 emit_insn (GEN_FCN (icode) (op0, op1));
6a10feda 12265
44320665
UB
12266 emit_insn
12267 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
12268 gen_rtx_fmt_ee (EQ, QImode,
12269 gen_rtx_REG (CCZmode, FLAGS_REG),
12270 const0_rtx)));
6a10feda
XG
12271 return SUBREG_REG (target);
12272 }
2bf6d935
ML
12273
12274 case IX86_BUILTIN_FXSAVE:
12275 case IX86_BUILTIN_FXRSTOR:
12276 case IX86_BUILTIN_FXSAVE64:
12277 case IX86_BUILTIN_FXRSTOR64:
12278 case IX86_BUILTIN_FNSTENV:
12279 case IX86_BUILTIN_FLDENV:
12280 mode0 = BLKmode;
12281 switch (fcode)
12282 {
12283 case IX86_BUILTIN_FXSAVE:
12284 icode = CODE_FOR_fxsave;
12285 break;
12286 case IX86_BUILTIN_FXRSTOR:
12287 icode = CODE_FOR_fxrstor;
12288 break;
12289 case IX86_BUILTIN_FXSAVE64:
12290 icode = CODE_FOR_fxsave64;
12291 break;
12292 case IX86_BUILTIN_FXRSTOR64:
12293 icode = CODE_FOR_fxrstor64;
12294 break;
12295 case IX86_BUILTIN_FNSTENV:
12296 icode = CODE_FOR_fnstenv;
12297 break;
12298 case IX86_BUILTIN_FLDENV:
12299 icode = CODE_FOR_fldenv;
12300 break;
12301 default:
12302 gcc_unreachable ();
12303 }
12304
12305 arg0 = CALL_EXPR_ARG (exp, 0);
12306 op0 = expand_normal (arg0);
12307
12308 if (!address_operand (op0, VOIDmode))
12309 {
12310 op0 = convert_memory_address (Pmode, op0);
12311 op0 = copy_addr_to_reg (op0);
12312 }
12313 op0 = gen_rtx_MEM (mode0, op0);
12314
12315 pat = GEN_FCN (icode) (op0);
12316 if (pat)
12317 emit_insn (pat);
12318 return 0;
12319
12320 case IX86_BUILTIN_XSETBV:
12321 arg0 = CALL_EXPR_ARG (exp, 0);
12322 arg1 = CALL_EXPR_ARG (exp, 1);
12323 op0 = expand_normal (arg0);
12324 op1 = expand_normal (arg1);
12325
12326 if (!REG_P (op0))
12327 op0 = copy_to_mode_reg (SImode, op0);
12328
12329 op1 = force_reg (DImode, op1);
12330
12331 if (TARGET_64BIT)
12332 {
12333 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12334 NULL, 1, OPTAB_DIRECT);
12335
12336 icode = CODE_FOR_xsetbv_rex64;
12337
12338 op2 = gen_lowpart (SImode, op2);
12339 op1 = gen_lowpart (SImode, op1);
12340 pat = GEN_FCN (icode) (op0, op1, op2);
12341 }
12342 else
12343 {
12344 icode = CODE_FOR_xsetbv;
12345
12346 pat = GEN_FCN (icode) (op0, op1);
12347 }
12348 if (pat)
12349 emit_insn (pat);
12350 return 0;
12351
12352 case IX86_BUILTIN_XSAVE:
12353 case IX86_BUILTIN_XRSTOR:
12354 case IX86_BUILTIN_XSAVE64:
12355 case IX86_BUILTIN_XRSTOR64:
12356 case IX86_BUILTIN_XSAVEOPT:
12357 case IX86_BUILTIN_XSAVEOPT64:
12358 case IX86_BUILTIN_XSAVES:
12359 case IX86_BUILTIN_XRSTORS:
12360 case IX86_BUILTIN_XSAVES64:
12361 case IX86_BUILTIN_XRSTORS64:
12362 case IX86_BUILTIN_XSAVEC:
12363 case IX86_BUILTIN_XSAVEC64:
12364 arg0 = CALL_EXPR_ARG (exp, 0);
12365 arg1 = CALL_EXPR_ARG (exp, 1);
12366 op0 = expand_normal (arg0);
12367 op1 = expand_normal (arg1);
12368
12369 if (!address_operand (op0, VOIDmode))
12370 {
12371 op0 = convert_memory_address (Pmode, op0);
12372 op0 = copy_addr_to_reg (op0);
12373 }
12374 op0 = gen_rtx_MEM (BLKmode, op0);
12375
12376 op1 = force_reg (DImode, op1);
12377
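      /* On 64-bit targets the 64-bit feature mask is split into two SImode
	 halves for the *_rex64 patterns; on 32-bit targets the patterns
	 take the DImode value directly.  */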
12378 if (TARGET_64BIT)
12379 {
12380 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12381 NULL, 1, OPTAB_DIRECT);
12382 switch (fcode)
12383 {
12384 case IX86_BUILTIN_XSAVE:
12385 icode = CODE_FOR_xsave_rex64;
12386 break;
12387 case IX86_BUILTIN_XRSTOR:
12388 icode = CODE_FOR_xrstor_rex64;
12389 break;
12390 case IX86_BUILTIN_XSAVE64:
12391 icode = CODE_FOR_xsave64;
12392 break;
12393 case IX86_BUILTIN_XRSTOR64:
12394 icode = CODE_FOR_xrstor64;
12395 break;
12396 case IX86_BUILTIN_XSAVEOPT:
12397 icode = CODE_FOR_xsaveopt_rex64;
12398 break;
12399 case IX86_BUILTIN_XSAVEOPT64:
12400 icode = CODE_FOR_xsaveopt64;
12401 break;
12402 case IX86_BUILTIN_XSAVES:
12403 icode = CODE_FOR_xsaves_rex64;
12404 break;
12405 case IX86_BUILTIN_XRSTORS:
12406 icode = CODE_FOR_xrstors_rex64;
12407 break;
12408 case IX86_BUILTIN_XSAVES64:
12409 icode = CODE_FOR_xsaves64;
12410 break;
12411 case IX86_BUILTIN_XRSTORS64:
12412 icode = CODE_FOR_xrstors64;
12413 break;
12414 case IX86_BUILTIN_XSAVEC:
12415 icode = CODE_FOR_xsavec_rex64;
12416 break;
12417 case IX86_BUILTIN_XSAVEC64:
12418 icode = CODE_FOR_xsavec64;
12419 break;
12420 default:
12421 gcc_unreachable ();
12422 }
12423
12424 op2 = gen_lowpart (SImode, op2);
12425 op1 = gen_lowpart (SImode, op1);
12426 pat = GEN_FCN (icode) (op0, op1, op2);
12427 }
12428 else
12429 {
12430 switch (fcode)
12431 {
12432 case IX86_BUILTIN_XSAVE:
12433 icode = CODE_FOR_xsave;
12434 break;
12435 case IX86_BUILTIN_XRSTOR:
12436 icode = CODE_FOR_xrstor;
12437 break;
12438 case IX86_BUILTIN_XSAVEOPT:
12439 icode = CODE_FOR_xsaveopt;
12440 break;
12441 case IX86_BUILTIN_XSAVES:
12442 icode = CODE_FOR_xsaves;
12443 break;
12444 case IX86_BUILTIN_XRSTORS:
12445 icode = CODE_FOR_xrstors;
12446 break;
12447 case IX86_BUILTIN_XSAVEC:
12448 icode = CODE_FOR_xsavec;
12449 break;
12450 default:
12451 gcc_unreachable ();
12452 }
12453 pat = GEN_FCN (icode) (op0, op1);
12454 }
12455
12456 if (pat)
12457 emit_insn (pat);
12458 return 0;
12459
12460 case IX86_BUILTIN_LLWPCB:
12461 arg0 = CALL_EXPR_ARG (exp, 0);
12462 op0 = expand_normal (arg0);
2398c206
UB
12463
12464 if (!register_operand (op0, Pmode))
2bf6d935 12465 op0 = ix86_zero_extend_to_Pmode (op0);
2398c206 12466 emit_insn (gen_lwp_llwpcb (Pmode, op0));
2bf6d935
ML
12467 return 0;
12468
12469 case IX86_BUILTIN_SLWPCB:
2bf6d935 12470 if (!target
2398c206 12471 || !register_operand (target, Pmode))
2bf6d935 12472 target = gen_reg_rtx (Pmode);
2398c206 12473 emit_insn (gen_lwp_slwpcb (Pmode, target));
2bf6d935
ML
12474 return target;
12475
2398c206
UB
12476 case IX86_BUILTIN_LWPVAL32:
12477 case IX86_BUILTIN_LWPVAL64:
12478 case IX86_BUILTIN_LWPINS32:
12479 case IX86_BUILTIN_LWPINS64:
12480 mode = ((fcode == IX86_BUILTIN_LWPVAL32
12481 || fcode == IX86_BUILTIN_LWPINS32)
12482 ? SImode : DImode);
12483
12484 if (fcode == IX86_BUILTIN_LWPVAL32
12485 || fcode == IX86_BUILTIN_LWPVAL64)
12486 icode = code_for_lwp_lwpval (mode);
12487 else
12488 icode = code_for_lwp_lwpins (mode);
12489
12490 arg0 = CALL_EXPR_ARG (exp, 0);
12491 arg1 = CALL_EXPR_ARG (exp, 1);
12492 arg2 = CALL_EXPR_ARG (exp, 2);
12493 op0 = expand_normal (arg0);
12494 op1 = expand_normal (arg1);
12495 op2 = expand_normal (arg2);
12496 mode0 = insn_data[icode].operand[0].mode;
12497
12498 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12499 op0 = copy_to_mode_reg (mode0, op0);
12500 if (!insn_data[icode].operand[1].predicate (op1, SImode))
12501 op1 = copy_to_mode_reg (SImode, op1);
12502
12503 if (!CONST_INT_P (op2))
12504 {
12505 error ("the last argument must be a 32-bit immediate");
12506 return const0_rtx;
12507 }
12508
12509 emit_insn (GEN_FCN (icode) (op0, op1, op2));
12510
12511 if (fcode == IX86_BUILTIN_LWPINS32
12512 || fcode == IX86_BUILTIN_LWPINS64)
12513 {
12514 if (target == 0
12515 || !nonimmediate_operand (target, QImode))
12516 target = gen_reg_rtx (QImode);
12517
12518 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12519 const0_rtx);
12520 emit_insn (gen_rtx_SET (target, pat));
12521
12522 return target;
12523 }
12524 else
12525 return 0;
12526
2bf6d935
ML
12527 case IX86_BUILTIN_BEXTRI32:
12528 case IX86_BUILTIN_BEXTRI64:
9e026191
UB
12529 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
12530
2bf6d935
ML
12531 arg0 = CALL_EXPR_ARG (exp, 0);
12532 arg1 = CALL_EXPR_ARG (exp, 1);
12533 op0 = expand_normal (arg0);
12534 op1 = expand_normal (arg1);
9e026191 12535
2bf6d935 12536 if (!CONST_INT_P (op1))
9e026191
UB
12537 {
12538 error ("last argument must be an immediate");
12539 return const0_rtx;
12540 }
2bf6d935 12541 else
9e026191
UB
12542 {
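	  /* The immediate control operand packs the starting bit position
	     in its low byte and the field length in the next byte; extract
	     both, fold degenerate cases to zero, and clamp the length to
	     the operand width.  */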
12543 unsigned char lsb_index = UINTVAL (op1);
12544 unsigned char length = UINTVAL (op1) >> 8;
12545
12546 unsigned char bitsize = GET_MODE_BITSIZE (mode);
12547
12548 icode = code_for_tbm_bextri (mode);
2bf6d935
ML
12549
12550 mode1 = insn_data[icode].operand[1].mode;
12551 if (!insn_data[icode].operand[1].predicate (op0, mode1))
12552 op0 = copy_to_mode_reg (mode1, op0);
12553
12554 mode0 = insn_data[icode].operand[0].mode;
12555 if (target == 0
12556 || !register_operand (target, mode0))
12557 target = gen_reg_rtx (mode0);
12558
9e026191
UB
12559 if (length == 0 || lsb_index >= bitsize)
12560 {
12561 emit_move_insn (target, const0_rtx);
12562 return target;
12563 }
12564
12565 if (length + lsb_index > bitsize)
12566 length = bitsize - lsb_index;
12567
12568 op1 = GEN_INT (length);
12569 op2 = GEN_INT (lsb_index);
12570
12571 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
12572 return target;
12573 }
2bf6d935
ML
12574
12575 case IX86_BUILTIN_RDRAND16_STEP:
9e026191 12576 mode = HImode;
2bf6d935
ML
12577 goto rdrand_step;
12578
12579 case IX86_BUILTIN_RDRAND32_STEP:
9e026191 12580 mode = SImode;
2bf6d935
ML
12581 goto rdrand_step;
12582
12583 case IX86_BUILTIN_RDRAND64_STEP:
9e026191 12584 mode = DImode;
2bf6d935
ML
12585
12586rdrand_step:
12587 arg0 = CALL_EXPR_ARG (exp, 0);
12588 op1 = expand_normal (arg0);
12589 if (!address_operand (op1, VOIDmode))
12590 {
12591 op1 = convert_memory_address (Pmode, op1);
12592 op1 = copy_addr_to_reg (op1);
12593 }
12594
9e026191
UB
12595 op0 = gen_reg_rtx (mode);
12596 emit_insn (gen_rdrand (mode, op0));
2bf6d935 12597
9e026191 12598 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935 12599
9e026191 12600 op1 = force_reg (SImode, const1_rtx);
2bf6d935
ML
12601
12602 /* Emit SImode conditional move. */
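      /* This relies on rdrand zeroing its destination when it fails
	 (CF clear): the conditional move yields that zero on failure and
	 the constant 1 on success, i.e. the 0/1 result the *_step builtins
	 return.  */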
9e026191 12603 if (mode == HImode)
2bf6d935
ML
12604 {
12605 if (TARGET_ZERO_EXTEND_WITH_AND
12606 && optimize_function_for_speed_p (cfun))
12607 {
12608 op2 = force_reg (SImode, const0_rtx);
12609
12610 emit_insn (gen_movstricthi
12611 (gen_lowpart (HImode, op2), op0));
12612 }
12613 else
12614 {
12615 op2 = gen_reg_rtx (SImode);
12616
12617 emit_insn (gen_zero_extendhisi2 (op2, op0));
12618 }
12619 }
9e026191 12620 else if (mode == SImode)
2bf6d935
ML
12621 op2 = op0;
12622 else
12623 op2 = gen_rtx_SUBREG (SImode, op0, 0);
12624
12625 if (target == 0
12626 || !register_operand (target, SImode))
12627 target = gen_reg_rtx (SImode);
12628
12629 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
12630 const0_rtx);
12631 emit_insn (gen_rtx_SET (target,
12632 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
12633 return target;
12634
12635 case IX86_BUILTIN_RDSEED16_STEP:
9e026191 12636 mode = HImode;
2bf6d935
ML
12637 goto rdseed_step;
12638
12639 case IX86_BUILTIN_RDSEED32_STEP:
9e026191 12640 mode = SImode;
2bf6d935
ML
12641 goto rdseed_step;
12642
12643 case IX86_BUILTIN_RDSEED64_STEP:
9e026191 12644 mode = DImode;
2bf6d935
ML
12645
12646rdseed_step:
12647 arg0 = CALL_EXPR_ARG (exp, 0);
12648 op1 = expand_normal (arg0);
12649 if (!address_operand (op1, VOIDmode))
12650 {
12651 op1 = convert_memory_address (Pmode, op1);
12652 op1 = copy_addr_to_reg (op1);
12653 }
12654
9e026191
UB
12655 op0 = gen_reg_rtx (mode);
12656 emit_insn (gen_rdseed (mode, op0));
2bf6d935 12657
9e026191 12658 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
2bf6d935
ML
12659
12660 op2 = gen_reg_rtx (QImode);
12661
12662 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12663 const0_rtx);
12664 emit_insn (gen_rtx_SET (op2, pat));
12665
12666 if (target == 0
12667 || !register_operand (target, SImode))
12668 target = gen_reg_rtx (SImode);
12669
12670 emit_insn (gen_zero_extendqisi2 (target, op2));
12671 return target;
12672
12673 case IX86_BUILTIN_SBB32:
12674 icode = CODE_FOR_subborrowsi;
12675 icode2 = CODE_FOR_subborrowsi_0;
12676 mode0 = SImode;
12677 mode1 = DImode;
12678 mode2 = CCmode;
12679 goto handlecarry;
12680
12681 case IX86_BUILTIN_SBB64:
12682 icode = CODE_FOR_subborrowdi;
12683 icode2 = CODE_FOR_subborrowdi_0;
12684 mode0 = DImode;
12685 mode1 = TImode;
12686 mode2 = CCmode;
12687 goto handlecarry;
12688
12689 case IX86_BUILTIN_ADDCARRYX32:
12690 icode = CODE_FOR_addcarrysi;
12691 icode2 = CODE_FOR_addcarrysi_0;
12692 mode0 = SImode;
12693 mode1 = DImode;
12694 mode2 = CCCmode;
12695 goto handlecarry;
12696
12697 case IX86_BUILTIN_ADDCARRYX64:
12698 icode = CODE_FOR_addcarrydi;
12699 icode2 = CODE_FOR_addcarrydi_0;
12700 mode0 = DImode;
12701 mode1 = TImode;
12702 mode2 = CCCmode;
12703
12704 handlecarry:
12705 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
12706 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
12707 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
12708 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
12709
12710 op1 = expand_normal (arg0);
12711 if (!integer_zerop (arg0))
12712 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
12713
12714 op2 = expand_normal (arg1);
12715 if (!register_operand (op2, mode0))
12716 op2 = copy_to_mode_reg (mode0, op2);
12717
12718 op3 = expand_normal (arg2);
12719 if (!register_operand (op3, mode0))
12720 op3 = copy_to_mode_reg (mode0, op3);
12721
12722 op4 = expand_normal (arg3);
12723 if (!address_operand (op4, VOIDmode))
12724 {
12725 op4 = convert_memory_address (Pmode, op4);
12726 op4 = copy_addr_to_reg (op4);
12727 }
12728
12729 op0 = gen_reg_rtx (mode0);
12730 if (integer_zerop (arg0))
12731 {
 12732	  /* If arg0 is 0, optimize right away into an add or sub
 12733	     instruction that sets the CCCmode flags.  */
12734 op1 = gen_rtx_REG (mode2, FLAGS_REG);
12735 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
12736 }
12737 else
12738 {
12739 /* Generate CF from input operand. */
12740 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
12741
12742 /* Generate instruction that consumes CF. */
12743 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
12744 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
12745 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
12746 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
12747 }
12748
12749 /* Return current CF value. */
12750 if (target == 0)
12751 target = gen_reg_rtx (QImode);
12752
12753 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
12754 emit_insn (gen_rtx_SET (target, pat));
12755
12756 /* Store the result. */
12757 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
12758
12759 return target;
12760
12761 case IX86_BUILTIN_READ_FLAGS:
12762 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
12763
12764 if (optimize
12765 || target == NULL_RTX
12766 || !nonimmediate_operand (target, word_mode)
12767 || GET_MODE (target) != word_mode)
12768 target = gen_reg_rtx (word_mode);
12769
12770 emit_insn (gen_pop (target));
12771 return target;
12772
12773 case IX86_BUILTIN_WRITE_FLAGS:
12774
12775 arg0 = CALL_EXPR_ARG (exp, 0);
12776 op0 = expand_normal (arg0);
12777 if (!general_no_elim_operand (op0, word_mode))
12778 op0 = copy_to_mode_reg (word_mode, op0);
12779
12780 emit_insn (gen_push (op0));
12781 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
12782 return 0;
12783
12784 case IX86_BUILTIN_KTESTC8:
12785 icode = CODE_FOR_ktestqi;
12786 mode3 = CCCmode;
12787 goto kortest;
12788
12789 case IX86_BUILTIN_KTESTZ8:
12790 icode = CODE_FOR_ktestqi;
12791 mode3 = CCZmode;
12792 goto kortest;
12793
12794 case IX86_BUILTIN_KTESTC16:
12795 icode = CODE_FOR_ktesthi;
12796 mode3 = CCCmode;
12797 goto kortest;
12798
12799 case IX86_BUILTIN_KTESTZ16:
12800 icode = CODE_FOR_ktesthi;
12801 mode3 = CCZmode;
12802 goto kortest;
12803
12804 case IX86_BUILTIN_KTESTC32:
12805 icode = CODE_FOR_ktestsi;
12806 mode3 = CCCmode;
12807 goto kortest;
12808
12809 case IX86_BUILTIN_KTESTZ32:
12810 icode = CODE_FOR_ktestsi;
12811 mode3 = CCZmode;
12812 goto kortest;
12813
12814 case IX86_BUILTIN_KTESTC64:
12815 icode = CODE_FOR_ktestdi;
12816 mode3 = CCCmode;
12817 goto kortest;
12818
12819 case IX86_BUILTIN_KTESTZ64:
12820 icode = CODE_FOR_ktestdi;
12821 mode3 = CCZmode;
12822 goto kortest;
12823
12824 case IX86_BUILTIN_KORTESTC8:
12825 icode = CODE_FOR_kortestqi;
12826 mode3 = CCCmode;
12827 goto kortest;
12828
12829 case IX86_BUILTIN_KORTESTZ8:
12830 icode = CODE_FOR_kortestqi;
12831 mode3 = CCZmode;
12832 goto kortest;
12833
12834 case IX86_BUILTIN_KORTESTC16:
12835 icode = CODE_FOR_kortesthi;
12836 mode3 = CCCmode;
12837 goto kortest;
12838
12839 case IX86_BUILTIN_KORTESTZ16:
12840 icode = CODE_FOR_kortesthi;
12841 mode3 = CCZmode;
12842 goto kortest;
12843
12844 case IX86_BUILTIN_KORTESTC32:
12845 icode = CODE_FOR_kortestsi;
12846 mode3 = CCCmode;
12847 goto kortest;
12848
12849 case IX86_BUILTIN_KORTESTZ32:
12850 icode = CODE_FOR_kortestsi;
12851 mode3 = CCZmode;
12852 goto kortest;
12853
12854 case IX86_BUILTIN_KORTESTC64:
12855 icode = CODE_FOR_kortestdi;
12856 mode3 = CCCmode;
12857 goto kortest;
12858
12859 case IX86_BUILTIN_KORTESTZ64:
12860 icode = CODE_FOR_kortestdi;
12861 mode3 = CCZmode;
12862
12863 kortest:
12864 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12865 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12866 op0 = expand_normal (arg0);
12867 op1 = expand_normal (arg1);
12868
12869 mode0 = insn_data[icode].operand[0].mode;
12870 mode1 = insn_data[icode].operand[1].mode;
12871
12872 if (GET_MODE (op0) != VOIDmode)
12873 op0 = force_reg (GET_MODE (op0), op0);
12874
12875 op0 = gen_lowpart (mode0, op0);
12876
12877 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12878 op0 = copy_to_mode_reg (mode0, op0);
12879
12880 if (GET_MODE (op1) != VOIDmode)
12881 op1 = force_reg (GET_MODE (op1), op1);
12882
12883 op1 = gen_lowpart (mode1, op1);
12884
12885 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12886 op1 = copy_to_mode_reg (mode1, op1);
12887
12888 target = gen_reg_rtx (QImode);
12889
12890 /* Emit kortest. */
12891 emit_insn (GEN_FCN (icode) (op0, op1));
12892 /* And use setcc to return result from flags. */
12893 ix86_expand_setcc (target, EQ,
12894 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12895 return target;
12896
12897 case IX86_BUILTIN_GATHERSIV2DF:
12898 icode = CODE_FOR_avx2_gathersiv2df;
12899 goto gather_gen;
12900 case IX86_BUILTIN_GATHERSIV4DF:
12901 icode = CODE_FOR_avx2_gathersiv4df;
12902 goto gather_gen;
12903 case IX86_BUILTIN_GATHERDIV2DF:
12904 icode = CODE_FOR_avx2_gatherdiv2df;
12905 goto gather_gen;
12906 case IX86_BUILTIN_GATHERDIV4DF:
12907 icode = CODE_FOR_avx2_gatherdiv4df;
12908 goto gather_gen;
12909 case IX86_BUILTIN_GATHERSIV4SF:
12910 icode = CODE_FOR_avx2_gathersiv4sf;
12911 goto gather_gen;
12912 case IX86_BUILTIN_GATHERSIV8SF:
12913 icode = CODE_FOR_avx2_gathersiv8sf;
12914 goto gather_gen;
12915 case IX86_BUILTIN_GATHERDIV4SF:
12916 icode = CODE_FOR_avx2_gatherdiv4sf;
12917 goto gather_gen;
12918 case IX86_BUILTIN_GATHERDIV8SF:
12919 icode = CODE_FOR_avx2_gatherdiv8sf;
12920 goto gather_gen;
12921 case IX86_BUILTIN_GATHERSIV2DI:
12922 icode = CODE_FOR_avx2_gathersiv2di;
12923 goto gather_gen;
12924 case IX86_BUILTIN_GATHERSIV4DI:
12925 icode = CODE_FOR_avx2_gathersiv4di;
12926 goto gather_gen;
12927 case IX86_BUILTIN_GATHERDIV2DI:
12928 icode = CODE_FOR_avx2_gatherdiv2di;
12929 goto gather_gen;
12930 case IX86_BUILTIN_GATHERDIV4DI:
12931 icode = CODE_FOR_avx2_gatherdiv4di;
12932 goto gather_gen;
12933 case IX86_BUILTIN_GATHERSIV4SI:
12934 icode = CODE_FOR_avx2_gathersiv4si;
12935 goto gather_gen;
12936 case IX86_BUILTIN_GATHERSIV8SI:
12937 icode = CODE_FOR_avx2_gathersiv8si;
12938 goto gather_gen;
12939 case IX86_BUILTIN_GATHERDIV4SI:
12940 icode = CODE_FOR_avx2_gatherdiv4si;
12941 goto gather_gen;
12942 case IX86_BUILTIN_GATHERDIV8SI:
12943 icode = CODE_FOR_avx2_gatherdiv8si;
12944 goto gather_gen;
12945 case IX86_BUILTIN_GATHERALTSIV4DF:
12946 icode = CODE_FOR_avx2_gathersiv4df;
12947 goto gather_gen;
12948 case IX86_BUILTIN_GATHERALTDIV8SF:
12949 icode = CODE_FOR_avx2_gatherdiv8sf;
12950 goto gather_gen;
12951 case IX86_BUILTIN_GATHERALTSIV4DI:
12952 icode = CODE_FOR_avx2_gathersiv4di;
12953 goto gather_gen;
12954 case IX86_BUILTIN_GATHERALTDIV8SI:
12955 icode = CODE_FOR_avx2_gatherdiv8si;
12956 goto gather_gen;
12957 case IX86_BUILTIN_GATHER3SIV16SF:
12958 icode = CODE_FOR_avx512f_gathersiv16sf;
12959 goto gather_gen;
12960 case IX86_BUILTIN_GATHER3SIV8DF:
12961 icode = CODE_FOR_avx512f_gathersiv8df;
12962 goto gather_gen;
12963 case IX86_BUILTIN_GATHER3DIV16SF:
12964 icode = CODE_FOR_avx512f_gatherdiv16sf;
12965 goto gather_gen;
12966 case IX86_BUILTIN_GATHER3DIV8DF:
12967 icode = CODE_FOR_avx512f_gatherdiv8df;
12968 goto gather_gen;
12969 case IX86_BUILTIN_GATHER3SIV16SI:
12970 icode = CODE_FOR_avx512f_gathersiv16si;
12971 goto gather_gen;
12972 case IX86_BUILTIN_GATHER3SIV8DI:
12973 icode = CODE_FOR_avx512f_gathersiv8di;
12974 goto gather_gen;
12975 case IX86_BUILTIN_GATHER3DIV16SI:
12976 icode = CODE_FOR_avx512f_gatherdiv16si;
12977 goto gather_gen;
12978 case IX86_BUILTIN_GATHER3DIV8DI:
12979 icode = CODE_FOR_avx512f_gatherdiv8di;
12980 goto gather_gen;
12981 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12982 icode = CODE_FOR_avx512f_gathersiv8df;
12983 goto gather_gen;
12984 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12985 icode = CODE_FOR_avx512f_gatherdiv16sf;
12986 goto gather_gen;
12987 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12988 icode = CODE_FOR_avx512f_gathersiv8di;
12989 goto gather_gen;
12990 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12991 icode = CODE_FOR_avx512f_gatherdiv16si;
12992 goto gather_gen;
12993 case IX86_BUILTIN_GATHER3SIV2DF:
12994 icode = CODE_FOR_avx512vl_gathersiv2df;
12995 goto gather_gen;
12996 case IX86_BUILTIN_GATHER3SIV4DF:
12997 icode = CODE_FOR_avx512vl_gathersiv4df;
12998 goto gather_gen;
12999 case IX86_BUILTIN_GATHER3DIV2DF:
13000 icode = CODE_FOR_avx512vl_gatherdiv2df;
13001 goto gather_gen;
13002 case IX86_BUILTIN_GATHER3DIV4DF:
13003 icode = CODE_FOR_avx512vl_gatherdiv4df;
13004 goto gather_gen;
13005 case IX86_BUILTIN_GATHER3SIV4SF:
13006 icode = CODE_FOR_avx512vl_gathersiv4sf;
13007 goto gather_gen;
13008 case IX86_BUILTIN_GATHER3SIV8SF:
13009 icode = CODE_FOR_avx512vl_gathersiv8sf;
13010 goto gather_gen;
13011 case IX86_BUILTIN_GATHER3DIV4SF:
13012 icode = CODE_FOR_avx512vl_gatherdiv4sf;
13013 goto gather_gen;
13014 case IX86_BUILTIN_GATHER3DIV8SF:
13015 icode = CODE_FOR_avx512vl_gatherdiv8sf;
13016 goto gather_gen;
13017 case IX86_BUILTIN_GATHER3SIV2DI:
13018 icode = CODE_FOR_avx512vl_gathersiv2di;
13019 goto gather_gen;
13020 case IX86_BUILTIN_GATHER3SIV4DI:
13021 icode = CODE_FOR_avx512vl_gathersiv4di;
13022 goto gather_gen;
13023 case IX86_BUILTIN_GATHER3DIV2DI:
13024 icode = CODE_FOR_avx512vl_gatherdiv2di;
13025 goto gather_gen;
13026 case IX86_BUILTIN_GATHER3DIV4DI:
13027 icode = CODE_FOR_avx512vl_gatherdiv4di;
13028 goto gather_gen;
13029 case IX86_BUILTIN_GATHER3SIV4SI:
13030 icode = CODE_FOR_avx512vl_gathersiv4si;
13031 goto gather_gen;
13032 case IX86_BUILTIN_GATHER3SIV8SI:
13033 icode = CODE_FOR_avx512vl_gathersiv8si;
13034 goto gather_gen;
13035 case IX86_BUILTIN_GATHER3DIV4SI:
13036 icode = CODE_FOR_avx512vl_gatherdiv4si;
13037 goto gather_gen;
13038 case IX86_BUILTIN_GATHER3DIV8SI:
13039 icode = CODE_FOR_avx512vl_gatherdiv8si;
13040 goto gather_gen;
13041 case IX86_BUILTIN_GATHER3ALTSIV4DF:
13042 icode = CODE_FOR_avx512vl_gathersiv4df;
13043 goto gather_gen;
13044 case IX86_BUILTIN_GATHER3ALTDIV8SF:
13045 icode = CODE_FOR_avx512vl_gatherdiv8sf;
13046 goto gather_gen;
13047 case IX86_BUILTIN_GATHER3ALTSIV4DI:
13048 icode = CODE_FOR_avx512vl_gathersiv4di;
13049 goto gather_gen;
13050 case IX86_BUILTIN_GATHER3ALTDIV8SI:
13051 icode = CODE_FOR_avx512vl_gatherdiv8si;
13052 goto gather_gen;
13053 case IX86_BUILTIN_SCATTERSIV16SF:
13054 icode = CODE_FOR_avx512f_scattersiv16sf;
13055 goto scatter_gen;
13056 case IX86_BUILTIN_SCATTERSIV8DF:
13057 icode = CODE_FOR_avx512f_scattersiv8df;
13058 goto scatter_gen;
13059 case IX86_BUILTIN_SCATTERDIV16SF:
13060 icode = CODE_FOR_avx512f_scatterdiv16sf;
13061 goto scatter_gen;
13062 case IX86_BUILTIN_SCATTERDIV8DF:
13063 icode = CODE_FOR_avx512f_scatterdiv8df;
13064 goto scatter_gen;
13065 case IX86_BUILTIN_SCATTERSIV16SI:
13066 icode = CODE_FOR_avx512f_scattersiv16si;
13067 goto scatter_gen;
13068 case IX86_BUILTIN_SCATTERSIV8DI:
13069 icode = CODE_FOR_avx512f_scattersiv8di;
13070 goto scatter_gen;
13071 case IX86_BUILTIN_SCATTERDIV16SI:
13072 icode = CODE_FOR_avx512f_scatterdiv16si;
13073 goto scatter_gen;
13074 case IX86_BUILTIN_SCATTERDIV8DI:
13075 icode = CODE_FOR_avx512f_scatterdiv8di;
13076 goto scatter_gen;
13077 case IX86_BUILTIN_SCATTERSIV8SF:
13078 icode = CODE_FOR_avx512vl_scattersiv8sf;
13079 goto scatter_gen;
13080 case IX86_BUILTIN_SCATTERSIV4SF:
13081 icode = CODE_FOR_avx512vl_scattersiv4sf;
13082 goto scatter_gen;
13083 case IX86_BUILTIN_SCATTERSIV4DF:
13084 icode = CODE_FOR_avx512vl_scattersiv4df;
13085 goto scatter_gen;
13086 case IX86_BUILTIN_SCATTERSIV2DF:
13087 icode = CODE_FOR_avx512vl_scattersiv2df;
13088 goto scatter_gen;
13089 case IX86_BUILTIN_SCATTERDIV8SF:
13090 icode = CODE_FOR_avx512vl_scatterdiv8sf;
13091 goto scatter_gen;
13092 case IX86_BUILTIN_SCATTERDIV4SF:
13093 icode = CODE_FOR_avx512vl_scatterdiv4sf;
13094 goto scatter_gen;
13095 case IX86_BUILTIN_SCATTERDIV4DF:
13096 icode = CODE_FOR_avx512vl_scatterdiv4df;
13097 goto scatter_gen;
13098 case IX86_BUILTIN_SCATTERDIV2DF:
13099 icode = CODE_FOR_avx512vl_scatterdiv2df;
13100 goto scatter_gen;
13101 case IX86_BUILTIN_SCATTERSIV8SI:
13102 icode = CODE_FOR_avx512vl_scattersiv8si;
13103 goto scatter_gen;
13104 case IX86_BUILTIN_SCATTERSIV4SI:
13105 icode = CODE_FOR_avx512vl_scattersiv4si;
13106 goto scatter_gen;
13107 case IX86_BUILTIN_SCATTERSIV4DI:
13108 icode = CODE_FOR_avx512vl_scattersiv4di;
13109 goto scatter_gen;
13110 case IX86_BUILTIN_SCATTERSIV2DI:
13111 icode = CODE_FOR_avx512vl_scattersiv2di;
13112 goto scatter_gen;
13113 case IX86_BUILTIN_SCATTERDIV8SI:
13114 icode = CODE_FOR_avx512vl_scatterdiv8si;
13115 goto scatter_gen;
13116 case IX86_BUILTIN_SCATTERDIV4SI:
13117 icode = CODE_FOR_avx512vl_scatterdiv4si;
13118 goto scatter_gen;
13119 case IX86_BUILTIN_SCATTERDIV4DI:
13120 icode = CODE_FOR_avx512vl_scatterdiv4di;
13121 goto scatter_gen;
13122 case IX86_BUILTIN_SCATTERDIV2DI:
13123 icode = CODE_FOR_avx512vl_scatterdiv2di;
13124 goto scatter_gen;
13125 case IX86_BUILTIN_GATHERPFDPD:
13126 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
13127 goto vec_prefetch_gen;
13128 case IX86_BUILTIN_SCATTERALTSIV8DF:
13129 icode = CODE_FOR_avx512f_scattersiv8df;
13130 goto scatter_gen;
13131 case IX86_BUILTIN_SCATTERALTDIV16SF:
13132 icode = CODE_FOR_avx512f_scatterdiv16sf;
13133 goto scatter_gen;
13134 case IX86_BUILTIN_SCATTERALTSIV8DI:
13135 icode = CODE_FOR_avx512f_scattersiv8di;
13136 goto scatter_gen;
13137 case IX86_BUILTIN_SCATTERALTDIV16SI:
13138 icode = CODE_FOR_avx512f_scatterdiv16si;
13139 goto scatter_gen;
13140 case IX86_BUILTIN_SCATTERALTSIV4DF:
13141 icode = CODE_FOR_avx512vl_scattersiv4df;
13142 goto scatter_gen;
13143 case IX86_BUILTIN_SCATTERALTDIV8SF:
13144 icode = CODE_FOR_avx512vl_scatterdiv8sf;
13145 goto scatter_gen;
13146 case IX86_BUILTIN_SCATTERALTSIV4DI:
13147 icode = CODE_FOR_avx512vl_scattersiv4di;
13148 goto scatter_gen;
13149 case IX86_BUILTIN_SCATTERALTDIV8SI:
13150 icode = CODE_FOR_avx512vl_scatterdiv8si;
13151 goto scatter_gen;
13152 case IX86_BUILTIN_SCATTERALTSIV2DF:
13153 icode = CODE_FOR_avx512vl_scattersiv2df;
13154 goto scatter_gen;
13155 case IX86_BUILTIN_SCATTERALTDIV4SF:
13156 icode = CODE_FOR_avx512vl_scatterdiv4sf;
13157 goto scatter_gen;
13158 case IX86_BUILTIN_SCATTERALTSIV2DI:
13159 icode = CODE_FOR_avx512vl_scattersiv2di;
13160 goto scatter_gen;
13161 case IX86_BUILTIN_SCATTERALTDIV4SI:
13162 icode = CODE_FOR_avx512vl_scatterdiv4si;
13163 goto scatter_gen;
13164 case IX86_BUILTIN_GATHERPFDPS:
13165 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
13166 goto vec_prefetch_gen;
13167 case IX86_BUILTIN_GATHERPFQPD:
13168 icode = CODE_FOR_avx512pf_gatherpfv8didf;
13169 goto vec_prefetch_gen;
13170 case IX86_BUILTIN_GATHERPFQPS:
13171 icode = CODE_FOR_avx512pf_gatherpfv8disf;
13172 goto vec_prefetch_gen;
13173 case IX86_BUILTIN_SCATTERPFDPD:
13174 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
13175 goto vec_prefetch_gen;
13176 case IX86_BUILTIN_SCATTERPFDPS:
13177 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
13178 goto vec_prefetch_gen;
13179 case IX86_BUILTIN_SCATTERPFQPD:
13180 icode = CODE_FOR_avx512pf_scatterpfv8didf;
13181 goto vec_prefetch_gen;
13182 case IX86_BUILTIN_SCATTERPFQPS:
13183 icode = CODE_FOR_avx512pf_scatterpfv8disf;
13184 goto vec_prefetch_gen;
13185
13186 gather_gen:
13187 rtx half;
13188 rtx (*gen) (rtx, rtx);
13189
13190 arg0 = CALL_EXPR_ARG (exp, 0);
13191 arg1 = CALL_EXPR_ARG (exp, 1);
13192 arg2 = CALL_EXPR_ARG (exp, 2);
13193 arg3 = CALL_EXPR_ARG (exp, 3);
13194 arg4 = CALL_EXPR_ARG (exp, 4);
13195 op0 = expand_normal (arg0);
13196 op1 = expand_normal (arg1);
13197 op2 = expand_normal (arg2);
13198 op3 = expand_normal (arg3);
13199 op4 = expand_normal (arg4);
13200 /* Note the arg order is different from the operand order. */
13201 mode0 = insn_data[icode].operand[1].mode;
13202 mode2 = insn_data[icode].operand[3].mode;
13203 mode3 = insn_data[icode].operand[4].mode;
13204 mode4 = insn_data[icode].operand[5].mode;
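	  /* As used below, the expanded arguments map onto the gather insn
	     operands as: op0 = merge/source vector (operand 1), op1 = base
	     address (operand 2), op2 = index vector (operand 3), op3 = mask
	     (operand 4), op4 = scale (operand 5); operand 0 is the
	     destination.  */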
13205
13206 if (target == NULL_RTX
13207 || GET_MODE (target) != insn_data[icode].operand[0].mode
13208 || !insn_data[icode].operand[0].predicate (target,
13209 GET_MODE (target)))
13210 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
13211 else
13212 subtarget = target;
13213
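	  /* For the *ALT* gather variants the index vector and the data
	     vector have different element counts, so only the low half of
	     the wider operand is meaningful; the cases below extract that
	     half.  */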
13214 switch (fcode)
13215 {
13216 case IX86_BUILTIN_GATHER3ALTSIV8DF:
13217 case IX86_BUILTIN_GATHER3ALTSIV8DI:
13218 half = gen_reg_rtx (V8SImode);
13219 if (!nonimmediate_operand (op2, V16SImode))
13220 op2 = copy_to_mode_reg (V16SImode, op2);
13221 emit_insn (gen_vec_extract_lo_v16si (half, op2));
13222 op2 = half;
13223 break;
13224 case IX86_BUILTIN_GATHER3ALTSIV4DF:
13225 case IX86_BUILTIN_GATHER3ALTSIV4DI:
13226 case IX86_BUILTIN_GATHERALTSIV4DF:
13227 case IX86_BUILTIN_GATHERALTSIV4DI:
13228 half = gen_reg_rtx (V4SImode);
13229 if (!nonimmediate_operand (op2, V8SImode))
13230 op2 = copy_to_mode_reg (V8SImode, op2);
13231 emit_insn (gen_vec_extract_lo_v8si (half, op2));
13232 op2 = half;
13233 break;
13234 case IX86_BUILTIN_GATHER3ALTDIV16SF:
13235 case IX86_BUILTIN_GATHER3ALTDIV16SI:
13236 half = gen_reg_rtx (mode0);
13237 if (mode0 == V8SFmode)
13238 gen = gen_vec_extract_lo_v16sf;
13239 else
13240 gen = gen_vec_extract_lo_v16si;
13241 if (!nonimmediate_operand (op0, GET_MODE (op0)))
13242 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
13243 emit_insn (gen (half, op0));
13244 op0 = half;
13245 op3 = lowpart_subreg (QImode, op3, HImode);
13246 break;
13247 case IX86_BUILTIN_GATHER3ALTDIV8SF:
13248 case IX86_BUILTIN_GATHER3ALTDIV8SI:
13249 case IX86_BUILTIN_GATHERALTDIV8SF:
13250 case IX86_BUILTIN_GATHERALTDIV8SI:
13251 half = gen_reg_rtx (mode0);
13252 if (mode0 == V4SFmode)
13253 gen = gen_vec_extract_lo_v8sf;
13254 else
13255 gen = gen_vec_extract_lo_v8si;
13256 if (!nonimmediate_operand (op0, GET_MODE (op0)))
13257 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
13258 emit_insn (gen (half, op0));
13259 op0 = half;
13260 if (VECTOR_MODE_P (GET_MODE (op3)))
13261 {
13262 half = gen_reg_rtx (mode0);
13263 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13264 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13265 emit_insn (gen (half, op3));
13266 op3 = half;
13267 }
13268 break;
13269 default:
13270 break;
13271 }
13272
13273 /* Force the memory operand to use only a base register here. But we
13274 don't want to do that for the memory operands of other builtin
13275 functions. */
13276 op1 = ix86_zero_extend_to_Pmode (op1);
13277
13278 if (!insn_data[icode].operand[1].predicate (op0, mode0))
13279 op0 = copy_to_mode_reg (mode0, op0);
13280 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
13281 op1 = copy_to_mode_reg (Pmode, op1);
13282 if (!insn_data[icode].operand[3].predicate (op2, mode2))
13283 op2 = copy_to_mode_reg (mode2, op2);
13284
13285 op3 = fixup_modeless_constant (op3, mode3);
13286
13287 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
13288 {
13289 if (!insn_data[icode].operand[4].predicate (op3, mode3))
13290 op3 = copy_to_mode_reg (mode3, op3);
13291 }
13292 else
13293 {
13294 op3 = copy_to_reg (op3);
13295 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
13296 }
13297 if (!insn_data[icode].operand[5].predicate (op4, mode4))
13298 {
13299 error ("the last argument must be scale 1, 2, 4, 8");
13300 return const0_rtx;
13301 }
13302
13303 /* Optimize. If mask is known to have all high bits set,
13304 replace op0 with pc_rtx to signal that the instruction
13305 overwrites the whole destination and doesn't use its
13306 previous contents. */
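	  /* A literal all-ones integer mask, or a vector constant in which
	     every element has its sign bit set, qualifies here; the SSA_NAME
	     case below additionally recognizes the cmpeq (x, x) idiom shown
	     in its comment.  */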
13307 if (optimize)
13308 {
13309 if (TREE_CODE (arg3) == INTEGER_CST)
13310 {
13311 if (integer_all_onesp (arg3))
13312 op0 = pc_rtx;
13313 }
13314 else if (TREE_CODE (arg3) == VECTOR_CST)
13315 {
13316 unsigned int negative = 0;
13317 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
13318 {
13319 tree cst = VECTOR_CST_ELT (arg3, i);
13320 if (TREE_CODE (cst) == INTEGER_CST
13321 && tree_int_cst_sign_bit (cst))
13322 negative++;
13323 else if (TREE_CODE (cst) == REAL_CST
13324 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
13325 negative++;
13326 }
13327 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
13328 op0 = pc_rtx;
13329 }
13330 else if (TREE_CODE (arg3) == SSA_NAME
13331 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
13332 {
13333 /* Recognize also when mask is like:
13334 __v2df src = _mm_setzero_pd ();
13335 __v2df mask = _mm_cmpeq_pd (src, src);
13336 or
13337 __v8sf src = _mm256_setzero_ps ();
13338 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
13339 as that is a cheaper way to load all ones into
13340 a register than having to load a constant from
13341 memory. */
13342 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
13343 if (is_gimple_call (def_stmt))
13344 {
13345 tree fndecl = gimple_call_fndecl (def_stmt);
13346 if (fndecl
13347 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
4d732405 13348 switch (DECL_MD_FUNCTION_CODE (fndecl))
13349 {
13350 case IX86_BUILTIN_CMPPD:
13351 case IX86_BUILTIN_CMPPS:
13352 case IX86_BUILTIN_CMPPD256:
13353 case IX86_BUILTIN_CMPPS256:
13354 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
13355 break;
13356 /* FALLTHRU */
13357 case IX86_BUILTIN_CMPEQPD:
13358 case IX86_BUILTIN_CMPEQPS:
13359 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
13360 && initializer_zerop (gimple_call_arg (def_stmt,
13361 1)))
13362 op0 = pc_rtx;
13363 break;
13364 default:
13365 break;
13366 }
13367 }
13368 }
13369 }
13370
13371 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
13372 if (! pat)
13373 return const0_rtx;
13374 emit_insn (pat);
13375
13376 switch (fcode)
13377 {
13378 case IX86_BUILTIN_GATHER3DIV16SF:
13379 if (target == NULL_RTX)
13380 target = gen_reg_rtx (V8SFmode);
13381 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
13382 break;
13383 case IX86_BUILTIN_GATHER3DIV16SI:
13384 if (target == NULL_RTX)
13385 target = gen_reg_rtx (V8SImode);
13386 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
13387 break;
13388 case IX86_BUILTIN_GATHER3DIV8SF:
13389 case IX86_BUILTIN_GATHERDIV8SF:
13390 if (target == NULL_RTX)
13391 target = gen_reg_rtx (V4SFmode);
13392 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
13393 break;
13394 case IX86_BUILTIN_GATHER3DIV8SI:
13395 case IX86_BUILTIN_GATHERDIV8SI:
13396 if (target == NULL_RTX)
13397 target = gen_reg_rtx (V4SImode);
13398 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
13399 break;
13400 default:
13401 target = subtarget;
13402 break;
13403 }
13404 return target;
13405
13406 scatter_gen:
13407 arg0 = CALL_EXPR_ARG (exp, 0);
13408 arg1 = CALL_EXPR_ARG (exp, 1);
13409 arg2 = CALL_EXPR_ARG (exp, 2);
13410 arg3 = CALL_EXPR_ARG (exp, 3);
13411 arg4 = CALL_EXPR_ARG (exp, 4);
13412 op0 = expand_normal (arg0);
13413 op1 = expand_normal (arg1);
13414 op2 = expand_normal (arg2);
13415 op3 = expand_normal (arg3);
13416 op4 = expand_normal (arg4);
13417 mode1 = insn_data[icode].operand[1].mode;
13418 mode2 = insn_data[icode].operand[2].mode;
13419 mode3 = insn_data[icode].operand[3].mode;
13420 mode4 = insn_data[icode].operand[4].mode;
13421
13422 /* Scatter instruction stores operand op3 to memory with
13423 indices from op2 and scale from op4 under writemask op1.
13424 If index operand op2 has more elements than source operand
13425 op3, only its low half needs to be used. And vice versa. */
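	  /* For example, IX86_BUILTIN_SCATTERALTSIV8DF stores eight DFmode
	     elements but receives a V16SImode index, so only the low
	     V8SImode half of the index is extracted below.  */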
13426 switch (fcode)
13427 {
13428 case IX86_BUILTIN_SCATTERALTSIV8DF:
13429 case IX86_BUILTIN_SCATTERALTSIV8DI:
13430 half = gen_reg_rtx (V8SImode);
13431 if (!nonimmediate_operand (op2, V16SImode))
13432 op2 = copy_to_mode_reg (V16SImode, op2);
13433 emit_insn (gen_vec_extract_lo_v16si (half, op2));
13434 op2 = half;
13435 break;
13436 case IX86_BUILTIN_SCATTERALTDIV16SF:
13437 case IX86_BUILTIN_SCATTERALTDIV16SI:
13438 half = gen_reg_rtx (mode3);
13439 if (mode3 == V8SFmode)
13440 gen = gen_vec_extract_lo_v16sf;
13441 else
13442 gen = gen_vec_extract_lo_v16si;
13443 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13444 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13445 emit_insn (gen (half, op3));
13446 op3 = half;
13447 break;
13448 case IX86_BUILTIN_SCATTERALTSIV4DF:
13449 case IX86_BUILTIN_SCATTERALTSIV4DI:
13450 half = gen_reg_rtx (V4SImode);
13451 if (!nonimmediate_operand (op2, V8SImode))
13452 op2 = copy_to_mode_reg (V8SImode, op2);
13453 emit_insn (gen_vec_extract_lo_v8si (half, op2));
13454 op2 = half;
13455 break;
13456 case IX86_BUILTIN_SCATTERALTDIV8SF:
13457 case IX86_BUILTIN_SCATTERALTDIV8SI:
13458 half = gen_reg_rtx (mode3);
13459 if (mode3 == V4SFmode)
13460 gen = gen_vec_extract_lo_v8sf;
13461 else
13462 gen = gen_vec_extract_lo_v8si;
13463 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13464 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13465 emit_insn (gen (half, op3));
13466 op3 = half;
13467 break;
13468 case IX86_BUILTIN_SCATTERALTSIV2DF:
13469 case IX86_BUILTIN_SCATTERALTSIV2DI:
13470 if (!nonimmediate_operand (op2, V4SImode))
13471 op2 = copy_to_mode_reg (V4SImode, op2);
13472 break;
13473 case IX86_BUILTIN_SCATTERALTDIV4SF:
13474 case IX86_BUILTIN_SCATTERALTDIV4SI:
13475 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13476 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13477 break;
13478 default:
13479 break;
13480 }
13481
13482 /* Force the memory operand to use only a base register here. But we
13483 don't want to do that for the memory operands of other builtin
13484 functions. */
13485 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13486
13487 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13488 op0 = copy_to_mode_reg (Pmode, op0);
13489
13490 op1 = fixup_modeless_constant (op1, mode1);
13491
13492 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
13493 {
13494 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13495 op1 = copy_to_mode_reg (mode1, op1);
13496 }
13497 else
13498 {
13499 op1 = copy_to_reg (op1);
13500 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
13501 }
13502
13503 if (!insn_data[icode].operand[2].predicate (op2, mode2))
13504 op2 = copy_to_mode_reg (mode2, op2);
13505
13506 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13507 op3 = copy_to_mode_reg (mode3, op3);
13508
13509 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13510 {
13511 error ("the last argument must be scale 1, 2, 4, 8");
13512 return const0_rtx;
13513 }
13514
13515 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13516 if (! pat)
13517 return const0_rtx;
13518
13519 emit_insn (pat);
13520 return 0;
13521
13522 vec_prefetch_gen:
13523 arg0 = CALL_EXPR_ARG (exp, 0);
13524 arg1 = CALL_EXPR_ARG (exp, 1);
13525 arg2 = CALL_EXPR_ARG (exp, 2);
13526 arg3 = CALL_EXPR_ARG (exp, 3);
13527 arg4 = CALL_EXPR_ARG (exp, 4);
13528 op0 = expand_normal (arg0);
13529 op1 = expand_normal (arg1);
13530 op2 = expand_normal (arg2);
13531 op3 = expand_normal (arg3);
13532 op4 = expand_normal (arg4);
13533 mode0 = insn_data[icode].operand[0].mode;
13534 mode1 = insn_data[icode].operand[1].mode;
13535 mode3 = insn_data[icode].operand[3].mode;
13536 mode4 = insn_data[icode].operand[4].mode;
13537
13538 op0 = fixup_modeless_constant (op0, mode0);
13539
13540 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
13541 {
13542 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13543 op0 = copy_to_mode_reg (mode0, op0);
13544 }
13545 else
13546 {
13547 op0 = copy_to_reg (op0);
13548 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
13549 }
13550
13551 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13552 op1 = copy_to_mode_reg (mode1, op1);
13553
13554 /* Force the memory operand to use only a base register here. But we
13555 don't want to do that for the memory operands of other builtin
13556 functions. */
13557 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13558
13559 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
13560 op2 = copy_to_mode_reg (Pmode, op2);
13561
13562 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13563 {
13564 error ("the fourth argument must be scale 1, 2, 4, 8");
13565 return const0_rtx;
13566 }
13567
13568 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13569 {
13570 error ("incorrect hint operand");
13571 return const0_rtx;
13572 }
13573
13574 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13575 if (! pat)
13576 return const0_rtx;
13577
13578 emit_insn (pat);
13579
13580 return 0;
13581
13582 case IX86_BUILTIN_XABORT:
13583 icode = CODE_FOR_xabort;
13584 arg0 = CALL_EXPR_ARG (exp, 0);
13585 op0 = expand_normal (arg0);
13586 mode0 = insn_data[icode].operand[0].mode;
13587 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13588 {
13589 error ("the argument to %<xabort%> intrinsic must "
13590 "be an 8-bit immediate");
13591 return const0_rtx;
13592 }
13593 emit_insn (gen_xabort (op0));
13594 return 0;
13595
13596 case IX86_BUILTIN_RDSSPD:
13597 case IX86_BUILTIN_RDSSPQ:
13598 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13599
13600 if (target == 0
13601 || !register_operand (target, mode))
13602 target = gen_reg_rtx (mode);
13603
13604 op0 = force_reg (mode, const0_rtx);
13605
13606 emit_insn (gen_rdssp (mode, target, op0));
13607 return target;
13608
13609 case IX86_BUILTIN_INCSSPD:
13610 case IX86_BUILTIN_INCSSPQ:
13611 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13612
13613 arg0 = CALL_EXPR_ARG (exp, 0);
13614 op0 = expand_normal (arg0);
13615
13616 op0 = force_reg (mode, op0);
13617
13618 emit_insn (gen_incssp (mode, op0));
13619 return 0;
13620
13621 case IX86_BUILTIN_HRESET:
13622 icode = CODE_FOR_hreset;
13623 arg0 = CALL_EXPR_ARG (exp, 0);
13624 op0 = expand_normal (arg0);
13625 op0 = force_reg (SImode, op0);
13626 emit_insn (gen_hreset (op0));
13627 return 0;
13628
13629 case IX86_BUILTIN_RSTORSSP:
13630 case IX86_BUILTIN_CLRSSBSY:
13631 arg0 = CALL_EXPR_ARG (exp, 0);
13632 op0 = expand_normal (arg0);
13633 icode = (fcode == IX86_BUILTIN_RSTORSSP
13634 ? CODE_FOR_rstorssp
13635 : CODE_FOR_clrssbsy);
13636
13637 if (!address_operand (op0, VOIDmode))
13638 {
13639 op0 = convert_memory_address (Pmode, op0);
13640 op0 = copy_addr_to_reg (op0);
2bf6d935 13641 }
b5034abb 13642 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
13643 return 0;
13644
13645 case IX86_BUILTIN_WRSSD:
13646 case IX86_BUILTIN_WRSSQ:
13647 case IX86_BUILTIN_WRUSSD:
13648 case IX86_BUILTIN_WRUSSQ:
13649 mode = ((fcode == IX86_BUILTIN_WRSSD
13650 || fcode == IX86_BUILTIN_WRUSSD)
13651 ? SImode : DImode);
13652
13653 arg0 = CALL_EXPR_ARG (exp, 0);
13654 op0 = expand_normal (arg0);
13655 arg1 = CALL_EXPR_ARG (exp, 1);
13656 op1 = expand_normal (arg1);
b5034abb 13657
2bf6d935 13658 op0 = force_reg (mode, op0);
b5034abb 13659
13660 if (!address_operand (op1, VOIDmode))
13661 {
13662 op1 = convert_memory_address (Pmode, op1);
13663 op1 = copy_addr_to_reg (op1);
2bf6d935 13664 }
13665 op1 = gen_rtx_MEM (mode, op1);
13666
13667 icode = ((fcode == IX86_BUILTIN_WRSSD
13668 || fcode == IX86_BUILTIN_WRSSQ)
13669 ? code_for_wrss (mode)
13670 : code_for_wruss (mode));
13671 emit_insn (GEN_FCN (icode) (op0, op1));
13672
13673 return 0;
13674
13675 default:
13676 break;
13677 }
13678
13679 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13680 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13681 {
13682 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13683 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13684 target);
13685 }
13686
13687 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
13688 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
13689 {
13690 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
13691 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
13692 target);
13693 }
13694
13695 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13696 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13697 {
13698 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13699 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13700 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13701 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13702 int masked = 1;
13703 machine_mode mode, wide_mode, nar_mode;
13704
13705 nar_mode = V4SFmode;
13706 mode = V16SFmode;
13707 wide_mode = V64SFmode;
13708 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
13709 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13710
13711 switch (fcode)
13712 {
13713 case IX86_BUILTIN_4FMAPS:
13714 fcn = gen_avx5124fmaddps_4fmaddps;
13715 masked = 0;
13716 goto v4fma_expand;
13717
13718 case IX86_BUILTIN_4DPWSSD:
13719 nar_mode = V4SImode;
13720 mode = V16SImode;
13721 wide_mode = V64SImode;
13722 fcn = gen_avx5124vnniw_vp4dpwssd;
13723 masked = 0;
13724 goto v4fma_expand;
13725
13726 case IX86_BUILTIN_4DPWSSDS:
13727 nar_mode = V4SImode;
13728 mode = V16SImode;
13729 wide_mode = V64SImode;
13730 fcn = gen_avx5124vnniw_vp4dpwssds;
13731 masked = 0;
13732 goto v4fma_expand;
13733
13734 case IX86_BUILTIN_4FNMAPS:
13735 fcn = gen_avx5124fmaddps_4fnmaddps;
13736 masked = 0;
13737 goto v4fma_expand;
13738
13739 case IX86_BUILTIN_4FNMAPS_MASK:
13740 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
13741 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13742 goto v4fma_expand;
13743
13744 case IX86_BUILTIN_4DPWSSD_MASK:
13745 nar_mode = V4SImode;
13746 mode = V16SImode;
13747 wide_mode = V64SImode;
13748 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
13749 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13750 goto v4fma_expand;
13751
13752 case IX86_BUILTIN_4DPWSSDS_MASK:
13753 nar_mode = V4SImode;
13754 mode = V16SImode;
13755 wide_mode = V64SImode;
13756 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
13757 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13758 goto v4fma_expand;
13759
13760 case IX86_BUILTIN_4FMAPS_MASK:
13761 {
13762 tree args[4];
13763 rtx ops[4];
13764 rtx wide_reg;
13765 rtx accum;
13766 rtx addr;
13767 rtx mem;
13768
13769 v4fma_expand:
13770 wide_reg = gen_reg_rtx (wide_mode);
13771 for (i = 0; i < 4; i++)
13772 {
13773 args[i] = CALL_EXPR_ARG (exp, i);
13774 ops[i] = expand_normal (args[i]);
13775
13776 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
13777 ops[i]);
13778 }
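	    /* wide_reg now holds the four narrow source vectors back to
	       back, one MODE-sized (64-byte) chunk per iteration above.  */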
13779
13780 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13781 accum = force_reg (mode, accum);
13782
13783 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13784 addr = force_reg (Pmode, addr);
13785
13786 mem = gen_rtx_MEM (nar_mode, addr);
13787
13788 target = gen_reg_rtx (mode);
13789
13790 emit_move_insn (target, accum);
13791
13792 if (! masked)
13793 emit_insn (fcn (target, accum, wide_reg, mem));
13794 else
13795 {
13796 rtx merge, mask;
13797 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13798
13799 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13800
13801 if (CONST_INT_P (mask))
13802 mask = fixup_modeless_constant (mask, HImode);
13803
13804 mask = force_reg (HImode, mask);
13805
13806 if (GET_MODE (mask) != HImode)
13807 mask = gen_rtx_SUBREG (HImode, mask, 0);
13808
13809 /* If merge is 0 then we're about to emit z-masked variant. */
13810 if (const0_operand (merge, mode))
13811 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13812 /* If merge is the same as accum then emit merge-masked variant. */
13813 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13814 {
13815 merge = force_reg (mode, merge);
13816 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13817 }
13818 /* Merge with something unknown might happen if we z-mask w/ -O0. */
13819 else
13820 {
13821 target = gen_reg_rtx (mode);
13822 emit_move_insn (target, merge);
13823 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13824 }
13825 }
13826 return target;
13827 }
13828
13829 case IX86_BUILTIN_4FNMASS:
13830 fcn = gen_avx5124fmaddps_4fnmaddss;
13831 masked = 0;
13832 goto s4fma_expand;
13833
13834 case IX86_BUILTIN_4FMASS:
13835 fcn = gen_avx5124fmaddps_4fmaddss;
13836 masked = 0;
13837 goto s4fma_expand;
13838
13839 case IX86_BUILTIN_4FNMASS_MASK:
13840 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13841 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13842 goto s4fma_expand;
13843
13844 case IX86_BUILTIN_4FMASS_MASK:
13845 {
13846 tree args[4];
13847 rtx ops[4];
13848 rtx wide_reg;
13849 rtx accum;
13850 rtx addr;
13851 rtx mem;
13852
13853 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13854 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13855
13856s4fma_expand:
13857 mode = V4SFmode;
13858 wide_reg = gen_reg_rtx (V64SFmode);
13859 for (i = 0; i < 4; i++)
13860 {
13861 rtx tmp;
13862 args[i] = CALL_EXPR_ARG (exp, i);
13863 ops[i] = expand_normal (args[i]);
13864
13865 tmp = gen_reg_rtx (SFmode);
13866 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13867
13868 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13869 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13870 }
13871
13872 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13873 accum = force_reg (V4SFmode, accum);
13874
13875 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13876 addr = force_reg (Pmode, addr);
13877
13878 mem = gen_rtx_MEM (V4SFmode, addr);
13879
13880 target = gen_reg_rtx (V4SFmode);
13881
13882 emit_move_insn (target, accum);
13883
13884 if (! masked)
13885 emit_insn (fcn (target, accum, wide_reg, mem));
13886 else
13887 {
13888 rtx merge, mask;
13889 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13890
13891 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13892
13893 if (CONST_INT_P (mask))
13894 mask = fixup_modeless_constant (mask, QImode);
13895
13896 mask = force_reg (QImode, mask);
13897
13898 if (GET_MODE (mask) != QImode)
13899 mask = gen_rtx_SUBREG (QImode, mask, 0);
13900
13901 /* If merge is 0 then we're about to emit z-masked variant. */
13902 if (const0_operand (merge, mode))
13903 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13904 /* If merge is the same as accum then emit merge-masked
13905 variant. */
13906 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13907 {
13908 merge = force_reg (mode, merge);
13909 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13910 }
13911 /* Merge with something unknown might happen if we z-mask
13912 w/ -O0. */
13913 else
13914 {
13915 target = gen_reg_rtx (mode);
13916 emit_move_insn (target, merge);
13917 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13918 }
13919 }
13920 return target;
13921 }
13922 case IX86_BUILTIN_RDPID:
13923 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13924 target);
13925 case IX86_BUILTIN_FABSQ:
13926 case IX86_BUILTIN_COPYSIGNQ:
13927 if (!TARGET_SSE)
13928 /* Emit a normal call if SSE isn't available. */
13929 return expand_call (exp, target, ignore);
13930 /* FALLTHRU */
13931 default:
13932 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13933 }
13934 }
13935
13936 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13937 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13938 {
13939 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13940 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13941 }
13942
13943 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13944 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13945 {
13946 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13947 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13948 }
13949
13950 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13951 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13952 {
13953 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13954 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13955 }
13956
13957 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13958 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13959 {
13960 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13961 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13962 }
13963
13964 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13965 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13966 {
13967 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13968 const struct builtin_description *d = bdesc_multi_arg + i;
13969 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13970 (enum ix86_builtin_func_type)
13971 d->flag, d->comparison);
13972 }
13973
13974 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13975 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13976 {
13977 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13978 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13979 target);
13980 }
13981
13982 gcc_unreachable ();
13983 }
13984
13985 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13986 fill target with val via vec_duplicate. */
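/* For example, with MODE equal to V4SImode and an SImode VAL the first
   attempt below emits (set TARGET (vec_duplicate:V4SI VAL)) directly; if
   that form is not recognized, VAL is forced into a register and the
   SET_SRC of the emitted insn is rewritten before re-recognizing it.  */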
13987
13988 static bool
13989 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13990 {
13991 bool ok;
13992 rtx_insn *insn;
13993 rtx dup;
13994
13995 /* First attempt to recognize VAL as-is. */
13996 dup = gen_vec_duplicate (mode, val);
13997 insn = emit_insn (gen_rtx_SET (target, dup));
13998 if (recog_memoized (insn) < 0)
13999 {
14000 rtx_insn *seq;
14001 machine_mode innermode = GET_MODE_INNER (mode);
14002 rtx reg;
14003
14004 /* If that fails, force VAL into a register. */
14005
14006 start_sequence ();
14007 reg = force_reg (innermode, val);
14008 if (GET_MODE (reg) != innermode)
14009 reg = gen_lowpart (innermode, reg);
14010 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
14011 seq = get_insns ();
14012 end_sequence ();
14013 if (seq)
14014 emit_insn_before (seq, insn);
14015
14016 ok = recog_memoized (insn) >= 0;
14017 gcc_assert (ok);
14018 }
14019 return true;
14020 }
14021
14022 /* Get a vector mode of the same size as the original but with elements
14023 twice as wide. This is only guaranteed to apply to integral vectors. */
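/* For example, V16QImode yields V8HImode: the same 16-byte size with half
   as many elements, each twice as wide (enforced by the asserts below).  */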
14024
14025 static machine_mode
14026 get_mode_wider_vector (machine_mode o)
14027 {
14028 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
14029 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
14030 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
14031 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
14032 return n;
14033 }
14034
14035 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
14036 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
14037
14038 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
14039 with all elements equal to VAR. Return true if successful. */
14040
51c30227 14041bool
14042 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
14043 rtx target, rtx val)
14044 {
14045 bool ok;
14046
14047 switch (mode)
14048 {
14049 case E_V2SImode:
14050 case E_V2SFmode:
14051 if (!mmx_ok)
14052 return false;
14053 /* FALLTHRU */
14054
14055 case E_V4DFmode:
14056 case E_V4DImode:
14057 case E_V8SFmode:
14058 case E_V8SImode:
14059 case E_V2DFmode:
14060 case E_V2DImode:
14061 case E_V4SFmode:
14062 case E_V4SImode:
14063 case E_V16SImode:
14064 case E_V8DImode:
14065 case E_V16SFmode:
14066 case E_V8DFmode:
14067 return ix86_vector_duplicate_value (mode, target, val);
14068
14069 case E_V4HImode:
14070 if (!mmx_ok)
14071 return false;
14072 if (TARGET_SSE || TARGET_3DNOW_A)
14073 {
14074 rtx x;
14075
14076 val = gen_lowpart (SImode, val);
14077 x = gen_rtx_TRUNCATE (HImode, val);
14078 x = gen_rtx_VEC_DUPLICATE (mode, x);
14079 emit_insn (gen_rtx_SET (target, x));
14080 return true;
14081 }
14082 goto widen;
14083
14084 case E_V2HImode:
14085 if (TARGET_SSE2)
14086 {
14087 rtx x;
14088
14089 val = gen_lowpart (SImode, val);
14090 x = gen_rtx_TRUNCATE (HImode, val);
14091 x = gen_rtx_VEC_DUPLICATE (mode, x);
14092 emit_insn (gen_rtx_SET (target, x));
14093 return true;
14094 }
14095 return false;
14096
2bf6d935 14097 case E_V8QImode:
64735dc9 14098 case E_V4QImode:
14099 if (!mmx_ok)
14100 return false;
14101 goto widen;
14102
14103 case E_V8HImode:
14104 if (TARGET_AVX2)
14105 return ix86_vector_duplicate_value (mode, target, val);
14106
14107 if (TARGET_SSE2)
14108 {
14109 struct expand_vec_perm_d dperm;
14110 rtx tmp1, tmp2;
14111
14112 permute:
14113 memset (&dperm, 0, sizeof (dperm));
14114 dperm.target = target;
14115 dperm.vmode = mode;
14116 dperm.nelt = GET_MODE_NUNITS (mode);
14117 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
14118 dperm.one_operand_p = true;
14119
14120 /* Extend to SImode using a paradoxical SUBREG. */
14121 tmp1 = gen_reg_rtx (SImode);
14122 emit_move_insn (tmp1, gen_lowpart (SImode, val));
14123
7fc4d600 14124 /* Insert the SImode value as low element of a V4SImode vector. */
14125 tmp2 = gen_reg_rtx (V4SImode);
14126 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
14127 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
14128
14129 ok = (expand_vec_perm_1 (&dperm)
14130 || expand_vec_perm_broadcast_1 (&dperm));
14131 gcc_assert (ok);
14132 return ok;
14133 }
14134 goto widen;
14135
14136 case E_V16QImode:
14137 if (TARGET_AVX2)
14138 return ix86_vector_duplicate_value (mode, target, val);
14139
14140 if (TARGET_SSE2)
14141 goto permute;
14142 goto widen;
14143
14144 widen:
14145 /* Replicate the value once into the next wider mode and recurse. */
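      /* For example, for V8QImode this folds VAL into an HImode value
	 ((VAL << 8) | VAL, or via the insv path below), duplicates that
	 across V4HImode, and then takes the low part in the original
	 mode.  */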
14146 {
14147 machine_mode smode, wsmode, wvmode;
14148 rtx x;
14149
14150 smode = GET_MODE_INNER (mode);
14151 wvmode = get_mode_wider_vector (mode);
14152 wsmode = GET_MODE_INNER (wvmode);
14153
14154 val = convert_modes (wsmode, smode, val, true);
14155
14156 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
14157 emit_insn (gen_insv_1 (wsmode, val, val));
14158 else
14159 {
14160 x = expand_simple_binop (wsmode, ASHIFT, val,
14161 GEN_INT (GET_MODE_BITSIZE (smode)),
14162 NULL_RTX, 1, OPTAB_LIB_WIDEN);
14163 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
14164 OPTAB_LIB_WIDEN);
14165 }
14166
14167 x = gen_reg_rtx (wvmode);
14168 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
14169 gcc_assert (ok);
14170 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
14171 return ok;
14172 }
14173
14174 case E_V16HImode:
14175 case E_V32QImode:
14176 if (TARGET_AVX2)
14177 return ix86_vector_duplicate_value (mode, target, val);
14178 else
14179 {
14180 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
14181 rtx x = gen_reg_rtx (hvmode);
14182
14183 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
14184 gcc_assert (ok);
14185
14186 x = gen_rtx_VEC_CONCAT (mode, x, x);
14187 emit_insn (gen_rtx_SET (target, x));
14188 }
14189 return true;
14190
14191 case E_V64QImode:
14192 case E_V32HImode:
14193 if (TARGET_AVX512BW)
14194 return ix86_vector_duplicate_value (mode, target, val);
14195 else
14196 {
14197 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
14198 rtx x = gen_reg_rtx (hvmode);
14199
14200 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
14201 gcc_assert (ok);
14202
14203 x = gen_rtx_VEC_CONCAT (mode, x, x);
14204 emit_insn (gen_rtx_SET (target, x));
14205 }
14206 return true;
14207
9e2a82e1 14208 case E_V8HFmode:
14209 case E_V16HFmode:
14210 case E_V32HFmode:
14211 return ix86_vector_duplicate_value (mode, target, val);
14212
14213 default:
14214 return false;
14215 }
14216 }
14217
14218 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
14219 whose ONE_VAR element is VAR, and other elements are zero. Return true
14220 if successful. */
14221
14222 static bool
14223 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
14224 rtx target, rtx var, int one_var)
14225 {
14226 machine_mode vsimode;
14227 rtx new_target;
14228 rtx x, tmp;
14229 bool use_vector_set = false;
14230 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
14231
14232 switch (mode)
14233 {
14234 case E_V2DImode:
14235 /* For SSE4.1, we normally use vector set. But if the second
14236 element is zero and inter-unit moves are OK, we use movq
14237 instead. */
14238 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
14239 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
14240 && one_var == 0));
14241 break;
14242 case E_V16QImode:
14243 case E_V4SImode:
14244 case E_V4SFmode:
14245 use_vector_set = TARGET_SSE4_1;
14246 break;
14247 case E_V8HImode:
14248 use_vector_set = TARGET_SSE2;
c4d423c7 14249 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
14250 ? gen_vec_setv8hi_0 : NULL;
2bf6d935 14251 break;
14252 case E_V8QImode:
14253 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14254 break;
14255 case E_V4HImode:
14256 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
14257 break;
14258 case E_V4QImode:
14259 use_vector_set = TARGET_SSE4_1;
14260 break;
2bf6d935 14261 case E_V32QImode:
c4d423c7 14262 use_vector_set = TARGET_AVX;
14263 break;
14264 case E_V16HImode:
14265 use_vector_set = TARGET_AVX;
c4d423c7 14266 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
14267 ? gen_vec_setv16hi_0 : NULL;
14268 break;
14269 case E_V8SImode:
14270 use_vector_set = TARGET_AVX;
14271 gen_vec_set_0 = gen_vec_setv8si_0;
14272 break;
14273 case E_V8SFmode:
14274 use_vector_set = TARGET_AVX;
14275 gen_vec_set_0 = gen_vec_setv8sf_0;
14276 break;
14277 case E_V4DFmode:
14278 use_vector_set = TARGET_AVX;
14279 gen_vec_set_0 = gen_vec_setv4df_0;
14280 break;
14281 case E_V4DImode:
14282 /* Use ix86_expand_vector_set in 64bit mode only. */
14283 use_vector_set = TARGET_AVX && TARGET_64BIT;
14284 gen_vec_set_0 = gen_vec_setv4di_0;
14285 break;
14286 case E_V16SImode:
14287 use_vector_set = TARGET_AVX512F && one_var == 0;
14288 gen_vec_set_0 = gen_vec_setv16si_0;
14289 break;
14290 case E_V16SFmode:
14291 use_vector_set = TARGET_AVX512F && one_var == 0;
14292 gen_vec_set_0 = gen_vec_setv16sf_0;
14293 break;
14294 case E_V8DFmode:
14295 use_vector_set = TARGET_AVX512F && one_var == 0;
14296 gen_vec_set_0 = gen_vec_setv8df_0;
14297 break;
14298 case E_V8DImode:
14299 /* Use ix86_expand_vector_set in 64bit mode only. */
14300 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
14301 gen_vec_set_0 = gen_vec_setv8di_0;
14302 break;
9e2a82e1 14303 case E_V8HFmode:
14304 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
14305 gen_vec_set_0 = gen_vec_setv8hf_0;
14306 break;
14307 case E_V16HFmode:
14308 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
14309 gen_vec_set_0 = gen_vec_setv16hf_0;
14310 break;
14311 case E_V32HFmode:
14312 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
14313 gen_vec_set_0 = gen_vec_setv32hf_0;
14314 break;
c4d423c7 14315 case E_V32HImode:
14316 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
14317 gen_vec_set_0 = gen_vec_setv32hi_0;
      break;
14318 default:
14319 break;
14320 }
14321
14322 if (use_vector_set)
14323 {
14324 if (gen_vec_set_0 && one_var == 0)
14325 {
14326 var = force_reg (GET_MODE_INNER (mode), var);
14327 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
14328 return true;
14329 }
14330 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
14331 var = force_reg (GET_MODE_INNER (mode), var);
14332 ix86_expand_vector_set (mmx_ok, target, var, one_var);
14333 return true;
14334 }
14335
14336 switch (mode)
14337 {
14338 case E_V2SFmode:
14339 case E_V2SImode:
14340 if (!mmx_ok)
14341 return false;
14342 /* FALLTHRU */
14343
14344 case E_V2DFmode:
14345 case E_V2DImode:
14346 if (one_var != 0)
14347 return false;
14348 var = force_reg (GET_MODE_INNER (mode), var);
14349 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
14350 emit_insn (gen_rtx_SET (target, x));
14351 return true;
14352
14353 case E_V4SFmode:
14354 case E_V4SImode:
14355 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
14356 new_target = gen_reg_rtx (mode);
14357 else
14358 new_target = target;
14359 var = force_reg (GET_MODE_INNER (mode), var);
14360 x = gen_rtx_VEC_DUPLICATE (mode, var);
14361 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
14362 emit_insn (gen_rtx_SET (new_target, x));
14363 if (one_var != 0)
14364 {
14365 /* We need to shuffle the value to the correct position, so
14366 create a new pseudo to store the intermediate result. */
14367
14368 /* With SSE2, we can use the integer shuffle insns. */
14369 if (mode != V4SFmode && TARGET_SSE2)
14370 {
14371 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
14372 const1_rtx,
14373 GEN_INT (one_var == 1 ? 0 : 1),
14374 GEN_INT (one_var == 2 ? 0 : 1),
14375 GEN_INT (one_var == 3 ? 0 : 1)));
14376 if (target != new_target)
14377 emit_move_insn (target, new_target);
14378 return true;
14379 }
14380
14381 /* Otherwise convert the intermediate result to V4SFmode and
14382 use the SSE1 shuffle instructions. */
14383 if (mode != V4SFmode)
14384 {
14385 tmp = gen_reg_rtx (V4SFmode);
14386 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
14387 }
14388 else
14389 tmp = new_target;
14390
14391 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
14392 const1_rtx,
14393 GEN_INT (one_var == 1 ? 0 : 1),
14394 GEN_INT (one_var == 2 ? 0+4 : 1+4),
14395 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
14396
14397 if (mode != V4SFmode)
14398 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
14399 else if (tmp != target)
14400 emit_move_insn (target, tmp);
14401 }
14402 else if (target != new_target)
14403 emit_move_insn (target, new_target);
14404 return true;
14405
14406 case E_V8HImode:
14407 case E_V16QImode:
14408 vsimode = V4SImode;
14409 goto widen;
14410 case E_V4HImode:
14411 case E_V8QImode:
14412 if (!mmx_ok)
14413 return false;
14414 vsimode = V2SImode;
14415 goto widen;
14416 widen:
14417 if (one_var != 0)
14418 return false;
14419
14420 /* Zero extend the variable element to SImode and recurse. */
14421 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
14422
14423 x = gen_reg_rtx (vsimode);
14424 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
14425 var, one_var))
14426 gcc_unreachable ();
14427
14428 emit_move_insn (target, gen_lowpart (mode, x));
14429 return true;
14430
14431 default:
14432 return false;
14433 }
14434 }
14435
14436 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
14437 consisting of the values in VALS. It is known that all elements
14438 except ONE_VAR are constants. Return true if successful. */
14439
14440 static bool
14441 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
14442 rtx target, rtx vals, int one_var)
14443 {
14444 rtx var = XVECEXP (vals, 0, one_var);
14445 machine_mode wmode;
14446 rtx const_vec, x;
14447
14448 const_vec = copy_rtx (vals);
14449 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
14450 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
14451
14452 switch (mode)
14453 {
14454 case E_V2DFmode:
14455 case E_V2DImode:
14456 case E_V2SFmode:
14457 case E_V2SImode:
14458 /* For the two element vectors, it's just as easy to use
14459 the general case. */
14460 return false;
14461
14462 case E_V4DImode:
14463 /* Use ix86_expand_vector_set in 64bit mode only. */
14464 if (!TARGET_64BIT)
14465 return false;
14466 /* FALLTHRU */
9e2a82e1 14467 case E_V8HFmode:
14468 case E_V16HFmode:
14469 case E_V4DFmode:
14470 case E_V8SFmode:
14471 case E_V8SImode:
14472 case E_V16HImode:
14473 case E_V32QImode:
14474 case E_V4SFmode:
14475 case E_V4SImode:
14476 case E_V8HImode:
14477 case E_V4HImode:
14478 break;
14479
14480 case E_V16QImode:
14481 if (TARGET_SSE4_1)
14482 break;
14483 wmode = V8HImode;
14484 goto widen;
14485 case E_V8QImode:
14486 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
14487 break;
14488 wmode = V4HImode;
14489 goto widen;
14490 case E_V4QImode:
14491 if (TARGET_SSE4_1)
14492 break;
14493 wmode = V2HImode;
14494 widen:
14495 /* There's no way to set one QImode entry easily. Combine
14496 the variable value with its adjacent constant value, and
14497 promote to an HImode set. */
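	 /* For example, with the variable element at an odd index the HImode
	    value built below is (VAR << 8) | (CONST & 0xff); at an even
	    index it is VAR | (CONST << 8).  */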
14498 x = XVECEXP (vals, 0, one_var ^ 1);
14499 if (one_var & 1)
14500 {
14501 var = convert_modes (HImode, QImode, var, true);
14502 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
14503 NULL_RTX, 1, OPTAB_LIB_WIDEN);
14504 x = GEN_INT (INTVAL (x) & 0xff);
14505 }
14506 else
14507 {
14508 var = convert_modes (HImode, QImode, var, true);
14509 x = gen_int_mode (UINTVAL (x) << 8, HImode);
14510 }
14511 if (x != const0_rtx)
14512 var = expand_simple_binop (HImode, IOR, var, x, var,
14513 1, OPTAB_LIB_WIDEN);
14514
14515 x = gen_reg_rtx (wmode);
14516 emit_move_insn (x, gen_lowpart (wmode, const_vec));
14517 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
14518
14519 emit_move_insn (target, gen_lowpart (mode, x));
14520 return true;
14521
14522 default:
14523 return false;
14524 }
14525
14526 emit_move_insn (target, const_vec);
14527 ix86_expand_vector_set (mmx_ok, target, var, one_var);
14528 return true;
14529 }
14530
14531 /* A subroutine of ix86_expand_vector_init_general. Use vector
14532 concatenate to handle the most general case: all values variable,
14533 and none identical. */
14534
14535 static void
14536 ix86_expand_vector_init_concat (machine_mode mode,
14537 rtx target, rtx *ops, int n)
14538 {
14539 machine_mode half_mode = VOIDmode;
14540 rtx half[2];
14541 rtvec v;
14542 int i, j;
14543
14544 switch (n)
14545 {
14546 case 2:
14547 switch (mode)
14548 {
9e2a82e1 14549 case E_V32HFmode:
14550 half_mode = V16HFmode;
14551 break;
2bf6d935 14552 case E_V16SImode:
1aeecaf5 14553 half_mode = V8SImode;
14554 break;
14555 case E_V16SFmode:
1aeecaf5 14556 half_mode = V8SFmode;
14557 break;
14558 case E_V8DImode:
1aeecaf5 14559 half_mode = V4DImode;
14560 break;
14561 case E_V8DFmode:
1aeecaf5 14562 half_mode = V4DFmode;
2bf6d935 14563 break;
9e2a82e1 14564 case E_V16HFmode:
14565 half_mode = V8HFmode;
14566 break;
2bf6d935 14567 case E_V8SImode:
1aeecaf5 14568 half_mode = V4SImode;
14569 break;
14570 case E_V8SFmode:
1aeecaf5 14571 half_mode = V4SFmode;
14572 break;
14573 case E_V4DImode:
1aeecaf5 14574 half_mode = V2DImode;
14575 break;
14576 case E_V4DFmode:
1aeecaf5 14577 half_mode = V2DFmode;
14578 break;
14579 case E_V4SImode:
1aeecaf5 14580 half_mode = V2SImode;
14581 break;
14582 case E_V4SFmode:
1aeecaf5 14583 half_mode = V2SFmode;
14584 break;
14585 case E_V2DImode:
1aeecaf5 14586 half_mode = DImode;
14587 break;
14588 case E_V2SImode:
1aeecaf5 14589 half_mode = SImode;
14590 break;
14591 case E_V2DFmode:
1aeecaf5 14592 half_mode = DFmode;
14593 break;
14594 case E_V2SFmode:
1aeecaf5 14595 half_mode = SFmode;
14596 break;
14597 default:
14598 gcc_unreachable ();
14599 }
14600
14601 if (!register_operand (ops[1], half_mode))
14602 ops[1] = force_reg (half_mode, ops[1]);
14603 if (!register_operand (ops[0], half_mode))
14604 ops[0] = force_reg (half_mode, ops[0]);
14605 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
14606 ops[1])));
14607 break;
14608
14609 case 4:
14610 switch (mode)
14611 {
14612 case E_V4DImode:
1aeecaf5 14613 half_mode = V2DImode;
14614 break;
14615 case E_V4DFmode:
1aeecaf5 14616 half_mode = V2DFmode;
14617 break;
14618 case E_V4SImode:
1aeecaf5 14619 half_mode = V2SImode;
14620 break;
14621 case E_V4SFmode:
1aeecaf5 14622 half_mode = V2SFmode;
14623 break;
14624 default:
14625 gcc_unreachable ();
14626 }
14627 goto half;
14628
14629 case 8:
14630 switch (mode)
14631 {
14632 case E_V8DImode:
1aeecaf5 14633 half_mode = V4DImode;
14634 break;
14635 case E_V8DFmode:
1aeecaf5 14636 half_mode = V4DFmode;
14637 break;
14638 case E_V8SImode:
1aeecaf5 14639 half_mode = V4SImode;
14640 break;
14641 case E_V8SFmode:
1aeecaf5 14642 half_mode = V4SFmode;
14643 break;
14644 default:
14645 gcc_unreachable ();
14646 }
14647 goto half;
14648
14649 case 16:
14650 switch (mode)
14651 {
14652 case E_V16SImode:
1aeecaf5 14653 half_mode = V8SImode;
14654 break;
14655 case E_V16SFmode:
1aeecaf5 14656 half_mode = V8SFmode;
14657 break;
14658 default:
14659 gcc_unreachable ();
14660 }
14661 goto half;
14662
14663 half:
14664 /* FIXME: We process inputs backward to help RA. PR 36222. */
14665 i = n - 1;
1aeecaf5 14666 for (j = 1; j != -1; j--)
2bf6d935 14667 {
14668 half[j] = gen_reg_rtx (half_mode);
14669 switch (n >> 1)
2bf6d935 14670 {
14671 case 2:
14672 v = gen_rtvec (2, ops[i-1], ops[i]);
14673 i -= 2;
14674 break;
14675 case 4:
14676 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
14677 i -= 4;
14678 break;
14679 case 8:
14680 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
14681 ops[i-3], ops[i-2], ops[i-1], ops[i]);
14682 i -= 8;
14683 break;
14684 default:
14685 gcc_unreachable ();
2bf6d935 14686 }
1aeecaf5
HL
14687 ix86_expand_vector_init (false, half[j],
14688 gen_rtx_PARALLEL (half_mode, v));
2bf6d935 14689 }
1aeecaf5
HL
14690
14691 ix86_expand_vector_init_concat (mode, target, half, 2);
2bf6d935
ML
14692 break;
14693
14694 default:
14695 gcc_unreachable ();
14696 }
14697}
14698
14699/* A subroutine of ix86_expand_vector_init_general. Use vector
14700 interleave to handle the most general case: all values variable,
14701 and none identical. */
14702
14703static void
14704ix86_expand_vector_init_interleave (machine_mode mode,
14705 rtx target, rtx *ops, int n)
14706{
14707 machine_mode first_imode, second_imode, third_imode, inner_mode;
14708 int i, j;
9e2a82e1 14709 rtx op, op0, op1;
2bf6d935
ML
14710 rtx (*gen_load_even) (rtx, rtx, rtx);
14711 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
14712 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
14713
14714 switch (mode)
14715 {
9e2a82e1 14716 case E_V8HFmode:
7fc4d600 14717 gen_load_even = gen_vec_interleave_lowv8hf;
9e2a82e1 14718 gen_interleave_first_low = gen_vec_interleave_lowv4si;
14719 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14720 inner_mode = HFmode;
14721 first_imode = V4SImode;
14722 second_imode = V2DImode;
14723 third_imode = VOIDmode;
14724 break;
2bf6d935
ML
14725 case E_V8HImode:
14726 gen_load_even = gen_vec_setv8hi;
14727 gen_interleave_first_low = gen_vec_interleave_lowv4si;
14728 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14729 inner_mode = HImode;
14730 first_imode = V4SImode;
14731 second_imode = V2DImode;
14732 third_imode = VOIDmode;
14733 break;
14734 case E_V16QImode:
14735 gen_load_even = gen_vec_setv16qi;
14736 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
14737 gen_interleave_second_low = gen_vec_interleave_lowv4si;
14738 inner_mode = QImode;
14739 first_imode = V8HImode;
14740 second_imode = V4SImode;
14741 third_imode = V2DImode;
14742 break;
14743 default:
14744 gcc_unreachable ();
14745 }
14746
14747 for (i = 0; i < n; i++)
14748 {
9e2a82e1 14749 op = ops [i + i];
14750 if (inner_mode == HFmode)
14751 {
7fc4d600 14752 rtx even, odd;
 14753	    /* Use vpunpcklwd to pack two HFmode values.  */
14754 op0 = gen_reg_rtx (V8HFmode);
14755 even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
14756 odd = lowpart_subreg (V8HFmode,
14757 force_reg (HFmode, ops[i + i + 1]),
14758 HFmode);
14759 emit_insn (gen_load_even (op0, even, odd));
9e2a82e1 14760 }
7fc4d600 14761 else
14762 {
 14763	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
14764 op0 = gen_reg_rtx (SImode);
14765 emit_move_insn (op0, gen_lowpart (SImode, op));
9e2a82e1 14766
7fc4d600 14767 /* Insert the SImode value as low element of V4SImode vector. */
14768 op1 = gen_reg_rtx (V4SImode);
14769 op0 = gen_rtx_VEC_MERGE (V4SImode,
14770 gen_rtx_VEC_DUPLICATE (V4SImode,
14771 op0),
14772 CONST0_RTX (V4SImode),
14773 const1_rtx);
14774 emit_insn (gen_rtx_SET (op1, op0));
2bf6d935 14775
7fc4d600 14776	  /* Cast the V4SImode vector back to a vector in the original mode.  */
14777 op0 = gen_reg_rtx (mode);
14778 emit_move_insn (op0, gen_lowpart (mode, op1));
2bf6d935 14779
7fc4d600 14780 /* Load even elements into the second position. */
14781 emit_insn (gen_load_even (op0,
14782 force_reg (inner_mode,
14783 ops[i + i + 1]),
14784 const1_rtx));
14785 }
2bf6d935
ML
14786
14787 /* Cast vector to FIRST_IMODE vector. */
14788 ops[i] = gen_reg_rtx (first_imode);
14789 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
14790 }
14791
14792 /* Interleave low FIRST_IMODE vectors. */
14793 for (i = j = 0; i < n; i += 2, j++)
14794 {
14795 op0 = gen_reg_rtx (first_imode);
14796 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
14797
14798 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14799 ops[j] = gen_reg_rtx (second_imode);
14800 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
14801 }
14802
14803 /* Interleave low SECOND_IMODE vectors. */
14804 switch (second_imode)
14805 {
14806 case E_V4SImode:
14807 for (i = j = 0; i < n / 2; i += 2, j++)
14808 {
14809 op0 = gen_reg_rtx (second_imode);
14810 emit_insn (gen_interleave_second_low (op0, ops[i],
14811 ops[i + 1]));
14812
14813 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14814 vector. */
14815 ops[j] = gen_reg_rtx (third_imode);
14816 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
14817 }
14818 second_imode = V2DImode;
14819 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14820 /* FALLTHRU */
14821
14822 case E_V2DImode:
14823 op0 = gen_reg_rtx (second_imode);
14824 emit_insn (gen_interleave_second_low (op0, ops[0],
14825 ops[1]));
14826
 14827	      /* Cast the SECOND_IMODE vector back to a vector in the original
 14828		 mode.  */
14829 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
14830 break;
14831
14832 default:
14833 gcc_unreachable ();
14834 }
14835}
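/* Editor's worked example (illustrative, not taken from the source): for
   V8HImode the routine above is called with N == 4 and, for elements
   {a, b, c, d, e, f, g, h}, proceeds roughly as

     t0 = { a, b, x, ... }  t1 = { c, d, x, ... }
     t2 = { e, f, x, ... }  t3 = { g, h, x, ... }            (gen_load_even)
     u0 = interleave-low V4SI (t0, t1) = { a, b, c, d, x, ... }
     u1 = interleave-low V4SI (t2, t3) = { e, f, g, h, x, ... }
     target = interleave-low V2DI (u0, u1) = { a, b, c, d, e, f, g, h }

   where 'x' marks don't-care lanes.  */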
14836
14837/* A subroutine of ix86_expand_vector_init. Handle the most general case:
14838 all values variable, and none identical. */
14839
14840static void
14841ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
14842 rtx target, rtx vals)
14843{
14844 rtx ops[64], op0, op1, op2, op3, op4, op5;
14845 machine_mode half_mode = VOIDmode;
14846 machine_mode quarter_mode = VOIDmode;
14847 int n, i;
14848
14849 switch (mode)
14850 {
14851 case E_V2SFmode:
14852 case E_V2SImode:
14853 if (!mmx_ok && !TARGET_SSE)
14854 break;
14855 /* FALLTHRU */
14856
14857 case E_V16SImode:
14858 case E_V16SFmode:
14859 case E_V8DFmode:
14860 case E_V8DImode:
14861 case E_V8SFmode:
14862 case E_V8SImode:
14863 case E_V4DFmode:
14864 case E_V4DImode:
14865 case E_V4SFmode:
14866 case E_V4SImode:
14867 case E_V2DFmode:
14868 case E_V2DImode:
14869 n = GET_MODE_NUNITS (mode);
14870 for (i = 0; i < n; i++)
14871 ops[i] = XVECEXP (vals, 0, i);
14872 ix86_expand_vector_init_concat (mode, target, ops, n);
14873 return;
14874
14875 case E_V2TImode:
14876 for (i = 0; i < 2; i++)
14877 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14878 op0 = gen_reg_rtx (V4DImode);
14879 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
14880 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14881 return;
14882
14883 case E_V4TImode:
14884 for (i = 0; i < 4; i++)
14885 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14886 ops[4] = gen_reg_rtx (V4DImode);
14887 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14888 ops[5] = gen_reg_rtx (V4DImode);
14889 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14890 op0 = gen_reg_rtx (V8DImode);
14891 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14892 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14893 return;
14894
14895 case E_V32QImode:
14896 half_mode = V16QImode;
14897 goto half;
14898
14899 case E_V16HImode:
14900 half_mode = V8HImode;
14901 goto half;
14902
9e2a82e1 14903 case E_V16HFmode:
14904 half_mode = V8HFmode;
14905 goto half;
14906
2bf6d935
ML
14907half:
14908 n = GET_MODE_NUNITS (mode);
14909 for (i = 0; i < n; i++)
14910 ops[i] = XVECEXP (vals, 0, i);
14911 op0 = gen_reg_rtx (half_mode);
14912 op1 = gen_reg_rtx (half_mode);
14913 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14914 n >> 2);
14915 ix86_expand_vector_init_interleave (half_mode, op1,
14916 &ops [n >> 1], n >> 2);
14917 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14918 return;
14919
14920 case E_V64QImode:
14921 quarter_mode = V16QImode;
14922 half_mode = V32QImode;
14923 goto quarter;
14924
14925 case E_V32HImode:
14926 quarter_mode = V8HImode;
14927 half_mode = V16HImode;
14928 goto quarter;
14929
9e2a82e1 14930 case E_V32HFmode:
14931 quarter_mode = V8HFmode;
14932 half_mode = V16HFmode;
14933 goto quarter;
14934
2bf6d935
ML
14935quarter:
14936 n = GET_MODE_NUNITS (mode);
14937 for (i = 0; i < n; i++)
14938 ops[i] = XVECEXP (vals, 0, i);
14939 op0 = gen_reg_rtx (quarter_mode);
14940 op1 = gen_reg_rtx (quarter_mode);
14941 op2 = gen_reg_rtx (quarter_mode);
14942 op3 = gen_reg_rtx (quarter_mode);
14943 op4 = gen_reg_rtx (half_mode);
14944 op5 = gen_reg_rtx (half_mode);
14945 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14946 n >> 3);
14947 ix86_expand_vector_init_interleave (quarter_mode, op1,
14948 &ops [n >> 2], n >> 3);
14949 ix86_expand_vector_init_interleave (quarter_mode, op2,
14950 &ops [n >> 1], n >> 3);
14951 ix86_expand_vector_init_interleave (quarter_mode, op3,
14952 &ops [(n >> 1) | (n >> 2)], n >> 3);
14953 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14954 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14955 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14956 return;
14957
14958 case E_V16QImode:
14959 if (!TARGET_SSE4_1)
14960 break;
14961 /* FALLTHRU */
14962
14963 case E_V8HImode:
14964 if (!TARGET_SSE2)
14965 break;
14966
14967 /* Don't use ix86_expand_vector_init_interleave if we can't
14968 move from GPR to SSE register directly. */
14969 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14970 break;
9e2a82e1 14971 /* FALLTHRU */
14972
14973 case E_V8HFmode:
2bf6d935
ML
14974
14975 n = GET_MODE_NUNITS (mode);
14976 for (i = 0; i < n; i++)
14977 ops[i] = XVECEXP (vals, 0, i);
14978 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14979 return;
14980
14981 case E_V4HImode:
14982 case E_V8QImode:
8d7dae0e
UB
14983
14984 case E_V2HImode:
64735dc9 14985 case E_V4QImode:
2bf6d935
ML
14986 break;
14987
14988 default:
14989 gcc_unreachable ();
14990 }
14991
14992 {
14993 int i, j, n_elts, n_words, n_elt_per_word;
8d7dae0e 14994 machine_mode tmp_mode, inner_mode;
2bf6d935
ML
14995 rtx words[4], shift;
14996
8d7dae0e
UB
14997 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
14998
2bf6d935
ML
14999 inner_mode = GET_MODE_INNER (mode);
15000 n_elts = GET_MODE_NUNITS (mode);
8d7dae0e 15001 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
2bf6d935
ML
15002 n_elt_per_word = n_elts / n_words;
15003 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
15004
15005 for (i = 0; i < n_words; ++i)
15006 {
15007 rtx word = NULL_RTX;
15008
15009 for (j = 0; j < n_elt_per_word; ++j)
15010 {
15011 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
8d7dae0e 15012 elt = convert_modes (tmp_mode, inner_mode, elt, true);
2bf6d935
ML
15013
15014 if (j == 0)
15015 word = elt;
15016 else
15017 {
8d7dae0e 15018 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
2bf6d935 15019 word, 1, OPTAB_LIB_WIDEN);
8d7dae0e 15020 word = expand_simple_binop (tmp_mode, IOR, word, elt,
2bf6d935
ML
15021 word, 1, OPTAB_LIB_WIDEN);
15022 }
15023 }
15024
15025 words[i] = word;
15026 }
15027
15028 if (n_words == 1)
15029 emit_move_insn (target, gen_lowpart (mode, words[0]));
15030 else if (n_words == 2)
15031 {
15032 rtx tmp = gen_reg_rtx (mode);
15033 emit_clobber (tmp);
8d7dae0e
UB
15034 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
15035 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
2bf6d935
ML
15036 emit_move_insn (target, tmp);
15037 }
15038 else if (n_words == 4)
15039 {
15040 rtx tmp = gen_reg_rtx (V4SImode);
8d7dae0e 15041 gcc_assert (tmp_mode == SImode);
2bf6d935
ML
15042 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
15043 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
15044 emit_move_insn (target, gen_lowpart (mode, tmp));
15045 }
15046 else
15047 gcc_unreachable ();
15048 }
15049}
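/* Editor's sketch of the word-packing fallback above (illustrative only,
   assuming 16-bit elements packed into 32-bit words, e.g. V2HImode):

     unsigned int
     pack_word (const unsigned short *elt, int n_elt_per_word)
     {
       unsigned int word = 0;
       /* Start from the most significant element of the word and shift the
	  previously packed elements left by the element width, mirroring
	  the ASHIFT/IOR sequence emitted above.  */
       for (int j = 0; j < n_elt_per_word; j++)
	 word = (word << 16) | elt[n_elt_per_word - 1 - j];
       return word;
     }

   The packed words are then moved into the vector register directly
   (n_words == 1 or 2) or recursively assembled as a V4SImode build
   (n_words == 4).  */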
15050
15051/* Initialize vector TARGET via VALS. Suppress the use of MMX
15052 instructions unless MMX_OK is true. */
15053
15054void
15055ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
15056{
15057 machine_mode mode = GET_MODE (target);
15058 machine_mode inner_mode = GET_MODE_INNER (mode);
15059 int n_elts = GET_MODE_NUNITS (mode);
15060 int n_var = 0, one_var = -1;
15061 bool all_same = true, all_const_zero = true;
15062 int i;
15063 rtx x;
15064
 15065  /* Handle initialization from vector elts first.  */
15066 if (n_elts != XVECLEN (vals, 0))
15067 {
15068 rtx subtarget = target;
15069 x = XVECEXP (vals, 0, 0);
15070 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
15071 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
15072 {
15073 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
b7dd2e4e
JJ
15074 if (inner_mode == QImode
15075 || inner_mode == HImode
15076 || inner_mode == TImode)
2bf6d935
ML
15077 {
15078 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
b7dd2e4e
JJ
15079 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
15080 n_bits /= GET_MODE_SIZE (elt_mode);
15081 mode = mode_for_vector (elt_mode, n_bits).require ();
15082 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
2bf6d935
ML
15083 ops[0] = gen_lowpart (inner_mode, ops[0]);
15084 ops[1] = gen_lowpart (inner_mode, ops[1]);
15085 subtarget = gen_reg_rtx (mode);
15086 }
15087 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
15088 if (subtarget != target)
15089 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
15090 return;
15091 }
15092 gcc_unreachable ();
15093 }
15094
15095 for (i = 0; i < n_elts; ++i)
15096 {
15097 x = XVECEXP (vals, 0, i);
15098 if (!(CONST_SCALAR_INT_P (x)
15099 || CONST_DOUBLE_P (x)
15100 || CONST_FIXED_P (x)))
15101 n_var++, one_var = i;
15102 else if (x != CONST0_RTX (inner_mode))
15103 all_const_zero = false;
15104 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
15105 all_same = false;
15106 }
15107
15108 /* Constants are best loaded from the constant pool. */
15109 if (n_var == 0)
15110 {
15111 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
15112 return;
15113 }
15114
15115 /* If all values are identical, broadcast the value. */
15116 if (all_same
15117 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
15118 XVECEXP (vals, 0, 0)))
15119 return;
15120
15121 /* Values where only one field is non-constant are best loaded from
15122 the pool and overwritten via move later. */
15123 if (n_var == 1)
15124 {
15125 if (all_const_zero
15126 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
15127 XVECEXP (vals, 0, one_var),
15128 one_var))
15129 return;
15130
15131 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
15132 return;
15133 }
15134
15135 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
15136}
15137
287cc750 15138/* Implemented as
15139 V setg (V v, int idx, T val)
15140 {
15141 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
15142 V valv = (V){val, val, val, val, val, val, val, val};
15143 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
15144 v = (v & ~mask) | (valv & mask);
15145 return v;
15146 }. */
15147void
15148ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
15149{
15150 rtx vec[64];
15151 machine_mode mode = GET_MODE (target);
15152 machine_mode cmp_mode = mode;
15153 int n_elts = GET_MODE_NUNITS (mode);
15154 rtx valv,idxv,constv,idx_tmp;
15155 bool ok = false;
15156
 15157  /* 512-bit vector byte/word broadcast and comparison are only available
 15158     under TARGET_AVX512BW; without it, split the 512-bit vector into two
 15159     256-bit vectors.  */
15160 if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
15161 {
15162 gcc_assert (TARGET_AVX512F);
15163 rtx vhi, vlo, idx_hi;
15164 machine_mode half_mode;
15165 rtx (*extract_hi)(rtx, rtx);
15166 rtx (*extract_lo)(rtx, rtx);
15167
15168 if (mode == V32HImode)
15169 {
15170 half_mode = V16HImode;
15171 extract_hi = gen_vec_extract_hi_v32hi;
15172 extract_lo = gen_vec_extract_lo_v32hi;
15173 }
15174 else
15175 {
15176 half_mode = V32QImode;
15177 extract_hi = gen_vec_extract_hi_v64qi;
15178 extract_lo = gen_vec_extract_lo_v64qi;
15179 }
15180
15181 vhi = gen_reg_rtx (half_mode);
15182 vlo = gen_reg_rtx (half_mode);
15183 idx_hi = gen_reg_rtx (GET_MODE (idx));
15184 emit_insn (extract_hi (vhi, target));
15185 emit_insn (extract_lo (vlo, target));
15186 vec[0] = idx_hi;
15187 vec[1] = idx;
15188 vec[2] = GEN_INT (n_elts/2);
15189 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
15190 ix86_expand_vector_set_var (vhi, val, idx_hi);
15191 ix86_expand_vector_set_var (vlo, val, idx);
15192 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
15193 return;
15194 }
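      /* Editor's worked example (illustrative): for V32HImode with a
	 variable index of 20, the code above recurses on both 256-bit
	 halves with indices 20 - 16 = 4 (high half) and 20 (low half);
	 the low-half compare against {0, ..., 15} matches no lane, so
	 only element 4 of the high half is actually replaced.  */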
15195
15196 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
15197 {
15198 switch (mode)
15199 {
15200 case E_V2DFmode:
15201 cmp_mode = V2DImode;
15202 break;
15203 case E_V4DFmode:
15204 cmp_mode = V4DImode;
15205 break;
15206 case E_V8DFmode:
15207 cmp_mode = V8DImode;
15208 break;
20a2c8ac
UB
15209 case E_V2SFmode:
15210 cmp_mode = V2SImode;
15211 break;
287cc750 15212 case E_V4SFmode:
15213 cmp_mode = V4SImode;
15214 break;
15215 case E_V8SFmode:
15216 cmp_mode = V8SImode;
15217 break;
15218 case E_V16SFmode:
15219 cmp_mode = V16SImode;
15220 break;
9e2a82e1 15221 /* TARGET_AVX512FP16 implies TARGET_AVX512BW. */
15222 case E_V8HFmode:
15223 cmp_mode = V8HImode;
15224 break;
15225 case E_V16HFmode:
15226 cmp_mode = V16HImode;
15227 break;
15228 case E_V32HFmode:
15229 cmp_mode = V32HImode;
15230 break;
287cc750 15231 default:
15232 gcc_unreachable ();
15233 }
15234 }
15235
15236 for (int i = 0; i != n_elts; i++)
15237 vec[i] = GEN_INT (i);
15238 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
15239 valv = gen_reg_rtx (mode);
15240 idxv = gen_reg_rtx (cmp_mode);
15241 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
15242
20a2c8ac
UB
15243 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
15244 mode, valv, val);
287cc750 15245 gcc_assert (ok);
20a2c8ac
UB
15246 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
15247 cmp_mode, idxv, idx_tmp);
287cc750 15248 gcc_assert (ok);
15249 vec[0] = target;
15250 vec[1] = valv;
15251 vec[2] = target;
15252 vec[3] = gen_rtx_EQ (mode, idxv, constv);
15253 vec[4] = idxv;
15254 vec[5] = constv;
15255 ok = ix86_expand_int_vcond (vec);
15256 gcc_assert (ok);
15257}
15258
2bf6d935
ML
15259void
15260ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
15261{
15262 machine_mode mode = GET_MODE (target);
15263 machine_mode inner_mode = GET_MODE_INNER (mode);
15264 machine_mode half_mode;
15265 bool use_vec_merge = false;
7fc4d600 15266 bool blendm_const = false;
2bf6d935 15267 rtx tmp;
9e2a82e1 15268 static rtx (*gen_extract[7][2]) (rtx, rtx)
2bf6d935
ML
15269 = {
15270 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
15271 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
15272 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
15273 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
15274 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
9e2a82e1 15275 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
15276 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
2bf6d935 15277 };
9e2a82e1 15278 static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
2bf6d935
ML
15279 = {
15280 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
15281 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
15282 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
15283 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
15284 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
9e2a82e1 15285 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
15286 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
2bf6d935
ML
15287 };
15288 int i, j, n;
15289 machine_mode mmode = VOIDmode;
15290 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
15291
15292 switch (mode)
15293 {
2bf6d935 15294 case E_V2SImode:
f15c7bd1
UB
15295 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15296 if (use_vec_merge)
15297 break;
15298 /* FALLTHRU */
15299
15300 case E_V2SFmode:
2bf6d935
ML
15301 if (mmx_ok)
15302 {
15303 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
15304 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
15305 if (elt == 0)
15306 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
15307 else
15308 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
15309 emit_insn (gen_rtx_SET (target, tmp));
15310 return;
15311 }
15312 break;
15313
15314 case E_V2DImode:
15315 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
15316 if (use_vec_merge)
15317 break;
15318
15319 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
15320 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
15321 if (elt == 0)
15322 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
15323 else
15324 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
15325 emit_insn (gen_rtx_SET (target, tmp));
15326 return;
15327
15328 case E_V2DFmode:
ac173024
L
15329 /* NB: For ELT == 0, use standard scalar operation patterns which
15330 preserve the rest of the vector for combiner:
15331
15332 (vec_merge:V2DF
15333 (vec_duplicate:V2DF (reg:DF))
15334 (reg:V2DF)
15335 (const_int 1))
15336 */
15337 if (elt == 0)
15338 goto do_vec_merge;
15339
2bf6d935
ML
15340 {
15341 rtx op0, op1;
15342
15343 /* For the two element vectors, we implement a VEC_CONCAT with
15344 the extraction of the other element. */
15345
15346 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
15347 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
15348
15349 if (elt == 0)
15350 op0 = val, op1 = tmp;
15351 else
15352 op0 = tmp, op1 = val;
15353
15354 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
15355 emit_insn (gen_rtx_SET (target, tmp));
15356 }
15357 return;
15358
15359 case E_V4SFmode:
15360 use_vec_merge = TARGET_SSE4_1;
15361 if (use_vec_merge)
15362 break;
15363
15364 switch (elt)
15365 {
15366 case 0:
15367 use_vec_merge = true;
15368 break;
15369
15370 case 1:
15371 /* tmp = target = A B C D */
15372 tmp = copy_to_reg (target);
15373 /* target = A A B B */
15374 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
15375 /* target = X A B B */
15376 ix86_expand_vector_set (false, target, val, 0);
15377 /* target = A X C D */
15378 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
15379 const1_rtx, const0_rtx,
15380 GEN_INT (2+4), GEN_INT (3+4)));
15381 return;
15382
15383 case 2:
15384 /* tmp = target = A B C D */
15385 tmp = copy_to_reg (target);
15386 /* tmp = X B C D */
15387 ix86_expand_vector_set (false, tmp, val, 0);
15388 /* target = A B X D */
15389 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
15390 const0_rtx, const1_rtx,
15391 GEN_INT (0+4), GEN_INT (3+4)));
15392 return;
15393
15394 case 3:
15395 /* tmp = target = A B C D */
15396 tmp = copy_to_reg (target);
15397 /* tmp = X B C D */
15398 ix86_expand_vector_set (false, tmp, val, 0);
 15399	  /* target = A B C X */
15400 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
15401 const0_rtx, const1_rtx,
15402 GEN_INT (2+4), GEN_INT (0+4)));
15403 return;
15404
15405 default:
15406 gcc_unreachable ();
15407 }
15408 break;
15409
15410 case E_V4SImode:
15411 use_vec_merge = TARGET_SSE4_1;
15412 if (use_vec_merge)
15413 break;
15414
15415 /* Element 0 handled by vec_merge below. */
15416 if (elt == 0)
15417 {
15418 use_vec_merge = true;
15419 break;
15420 }
15421
15422 if (TARGET_SSE2)
15423 {
15424 /* With SSE2, use integer shuffles to swap element 0 and ELT,
15425 store into element 0, then shuffle them back. */
15426
15427 rtx order[4];
15428
15429 order[0] = GEN_INT (elt);
15430 order[1] = const1_rtx;
15431 order[2] = const2_rtx;
15432 order[3] = GEN_INT (3);
15433 order[elt] = const0_rtx;
15434
15435 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
15436 order[1], order[2], order[3]));
15437
15438 ix86_expand_vector_set (false, target, val, 0);
15439
15440 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
15441 order[1], order[2], order[3]));
15442 }
15443 else
15444 {
15445 /* For SSE1, we have to reuse the V4SF code. */
15446 rtx t = gen_reg_rtx (V4SFmode);
15447 emit_move_insn (t, gen_lowpart (V4SFmode, target));
15448 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
15449 emit_move_insn (target, gen_lowpart (mode, t));
15450 }
15451 return;
15452
9e2a82e1 15453 case E_V8HFmode:
7fc4d600 15454 if (TARGET_AVX2)
15455 {
15456 mmode = SImode;
15457 gen_blendm = gen_sse4_1_pblendph;
15458 blendm_const = true;
15459 }
15460 else
15461 use_vec_merge = true;
9e2a82e1 15462 break;
15463
2bf6d935 15464 case E_V8HImode:
5883e567 15465 case E_V2HImode:
2bf6d935
ML
15466 use_vec_merge = TARGET_SSE2;
15467 break;
15468 case E_V4HImode:
15469 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15470 break;
15471
15472 case E_V16QImode:
5883e567 15473 case E_V4QImode:
2bf6d935
ML
15474 use_vec_merge = TARGET_SSE4_1;
15475 break;
15476
15477 case E_V8QImode:
f15c7bd1 15478 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935
ML
15479 break;
15480
15481 case E_V32QImode:
15482 half_mode = V16QImode;
15483 j = 0;
15484 n = 16;
15485 goto half;
15486
9e2a82e1 15487 case E_V16HFmode:
7fc4d600 15488 if (TARGET_AVX2)
15489 {
15490 mmode = SImode;
15491 gen_blendm = gen_avx2_pblendph;
15492 blendm_const = true;
15493 break;
15494 }
15495 else
15496 {
15497 half_mode = V8HFmode;
15498 j = 6;
15499 n = 8;
15500 goto half;
15501 }
9e2a82e1 15502
2bf6d935
ML
15503 case E_V16HImode:
15504 half_mode = V8HImode;
15505 j = 1;
15506 n = 8;
15507 goto half;
15508
15509 case E_V8SImode:
15510 half_mode = V4SImode;
15511 j = 2;
15512 n = 4;
15513 goto half;
15514
15515 case E_V4DImode:
15516 half_mode = V2DImode;
15517 j = 3;
15518 n = 2;
15519 goto half;
15520
15521 case E_V8SFmode:
15522 half_mode = V4SFmode;
15523 j = 4;
15524 n = 4;
15525 goto half;
15526
15527 case E_V4DFmode:
15528 half_mode = V2DFmode;
15529 j = 5;
15530 n = 2;
15531 goto half;
15532
15533half:
15534 /* Compute offset. */
15535 i = elt / n;
15536 elt %= n;
15537
15538 gcc_assert (i <= 1);
15539
15540 /* Extract the half. */
15541 tmp = gen_reg_rtx (half_mode);
15542 emit_insn (gen_extract[j][i] (tmp, target));
15543
15544 /* Put val in tmp at elt. */
15545 ix86_expand_vector_set (false, tmp, val, elt);
15546
15547 /* Put it back. */
15548 emit_insn (gen_insert[j][i] (target, target, tmp));
15549 return;
15550
15551 case E_V8DFmode:
15552 if (TARGET_AVX512F)
15553 {
15554 mmode = QImode;
15555 gen_blendm = gen_avx512f_blendmv8df;
15556 }
15557 break;
15558
15559 case E_V8DImode:
15560 if (TARGET_AVX512F)
15561 {
15562 mmode = QImode;
15563 gen_blendm = gen_avx512f_blendmv8di;
15564 }
15565 break;
15566
15567 case E_V16SFmode:
15568 if (TARGET_AVX512F)
15569 {
15570 mmode = HImode;
15571 gen_blendm = gen_avx512f_blendmv16sf;
15572 }
15573 break;
15574
15575 case E_V16SImode:
15576 if (TARGET_AVX512F)
15577 {
15578 mmode = HImode;
15579 gen_blendm = gen_avx512f_blendmv16si;
15580 }
15581 break;
15582
9e2a82e1 15583 case E_V32HFmode:
15584 if (TARGET_AVX512BW)
15585 {
15586 mmode = SImode;
15587 gen_blendm = gen_avx512bw_blendmv32hf;
15588 }
15589 break;
2bf6d935
ML
15590 case E_V32HImode:
15591 if (TARGET_AVX512BW)
15592 {
15593 mmode = SImode;
15594 gen_blendm = gen_avx512bw_blendmv32hi;
15595 }
15596 else if (TARGET_AVX512F)
15597 {
15598 half_mode = E_V8HImode;
15599 n = 8;
15600 goto quarter;
15601 }
15602 break;
15603
15604 case E_V64QImode:
15605 if (TARGET_AVX512BW)
15606 {
15607 mmode = DImode;
15608 gen_blendm = gen_avx512bw_blendmv64qi;
15609 }
15610 else if (TARGET_AVX512F)
15611 {
15612 half_mode = E_V16QImode;
15613 n = 16;
15614 goto quarter;
15615 }
15616 break;
15617
15618quarter:
15619 /* Compute offset. */
15620 i = elt / n;
15621 elt %= n;
15622
15623 gcc_assert (i <= 3);
15624
15625 {
15626 /* Extract the quarter. */
15627 tmp = gen_reg_rtx (V4SImode);
15628 rtx tmp2 = gen_lowpart (V16SImode, target);
15629 rtx mask = gen_reg_rtx (QImode);
15630
15631 emit_move_insn (mask, constm1_rtx);
15632 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
15633 tmp, mask));
15634
15635 tmp2 = gen_reg_rtx (half_mode);
15636 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
15637 tmp = tmp2;
15638
15639 /* Put val in tmp at elt. */
15640 ix86_expand_vector_set (false, tmp, val, elt);
15641
15642 /* Put it back. */
15643 tmp2 = gen_reg_rtx (V16SImode);
15644 rtx tmp3 = gen_lowpart (V16SImode, target);
15645 mask = gen_reg_rtx (HImode);
15646 emit_move_insn (mask, constm1_rtx);
15647 tmp = gen_lowpart (V4SImode, tmp);
15648 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
15649 tmp3, mask));
15650 emit_move_insn (target, gen_lowpart (mode, tmp2));
15651 }
15652 return;
15653
15654 default:
15655 break;
15656 }
15657
15658 if (mmode != VOIDmode)
15659 {
15660 tmp = gen_reg_rtx (mode);
15661 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
7fc4d600 15662 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
2bf6d935
ML
15663 /* The avx512*_blendm<mode> expanders have different operand order
15664 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15665 elements where the mask is set and second input operand otherwise,
15666 in {sse,avx}*_*blend* the first input operand is used for elements
15667 where the mask is clear and second input operand otherwise. */
7fc4d600 15668 if (!blendm_const)
15669 merge_mask = force_reg (mmode, merge_mask);
15670 emit_insn (gen_blendm (target, target, tmp, merge_mask));
2bf6d935
ML
15671 }
15672 else if (use_vec_merge)
15673 {
ac173024 15674do_vec_merge:
2bf6d935
ML
15675 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
15676 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
15677 GEN_INT (HOST_WIDE_INT_1U << elt));
15678 emit_insn (gen_rtx_SET (target, tmp));
15679 }
15680 else
15681 {
15682 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15683
15684 emit_move_insn (mem, target);
15685
15686 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
15687 emit_move_insn (tmp, val);
15688
15689 emit_move_insn (target, mem);
15690 }
15691}
15692
15693void
15694ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
15695{
15696 machine_mode mode = GET_MODE (vec);
15697 machine_mode inner_mode = GET_MODE_INNER (mode);
15698 bool use_vec_extr = false;
15699 rtx tmp;
15700
15701 switch (mode)
15702 {
15703 case E_V2SImode:
5fbc8ab4
UB
15704 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15705 if (use_vec_extr)
15706 break;
15707 /* FALLTHRU */
15708
2bf6d935
ML
15709 case E_V2SFmode:
15710 if (!mmx_ok)
15711 break;
15712 /* FALLTHRU */
15713
15714 case E_V2DFmode:
15715 case E_V2DImode:
15716 case E_V2TImode:
15717 case E_V4TImode:
15718 use_vec_extr = true;
15719 break;
15720
15721 case E_V4SFmode:
15722 use_vec_extr = TARGET_SSE4_1;
15723 if (use_vec_extr)
15724 break;
15725
15726 switch (elt)
15727 {
15728 case 0:
15729 tmp = vec;
15730 break;
15731
15732 case 1:
15733 case 3:
15734 tmp = gen_reg_rtx (mode);
15735 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
15736 GEN_INT (elt), GEN_INT (elt),
15737 GEN_INT (elt+4), GEN_INT (elt+4)));
15738 break;
15739
15740 case 2:
15741 tmp = gen_reg_rtx (mode);
15742 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
15743 break;
15744
15745 default:
15746 gcc_unreachable ();
15747 }
15748 vec = tmp;
15749 use_vec_extr = true;
15750 elt = 0;
15751 break;
15752
15753 case E_V4SImode:
15754 use_vec_extr = TARGET_SSE4_1;
15755 if (use_vec_extr)
15756 break;
15757
15758 if (TARGET_SSE2)
15759 {
15760 switch (elt)
15761 {
15762 case 0:
15763 tmp = vec;
15764 break;
15765
15766 case 1:
15767 case 3:
15768 tmp = gen_reg_rtx (mode);
15769 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
15770 GEN_INT (elt), GEN_INT (elt),
15771 GEN_INT (elt), GEN_INT (elt)));
15772 break;
15773
15774 case 2:
15775 tmp = gen_reg_rtx (mode);
15776 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
15777 break;
15778
15779 default:
15780 gcc_unreachable ();
15781 }
15782 vec = tmp;
15783 use_vec_extr = true;
15784 elt = 0;
15785 }
15786 else
15787 {
15788 /* For SSE1, we have to reuse the V4SF code. */
15789 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
15790 gen_lowpart (V4SFmode, vec), elt);
15791 return;
15792 }
15793 break;
15794
15795 case E_V8HImode:
5883e567 15796 case E_V2HImode:
2bf6d935
ML
15797 use_vec_extr = TARGET_SSE2;
15798 break;
15799 case E_V4HImode:
15800 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15801 break;
15802
15803 case E_V16QImode:
15804 use_vec_extr = TARGET_SSE4_1;
f66e6e2b
JJ
15805 if (!use_vec_extr
15806 && TARGET_SSE2
15807 && elt == 0
15808 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
15809 {
15810 tmp = gen_reg_rtx (SImode);
15811 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
15812 0);
15813 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
15814 return;
15815 }
2bf6d935 15816 break;
5883e567
UB
15817 case E_V4QImode:
15818 use_vec_extr = TARGET_SSE4_1;
15819 break;
2bf6d935
ML
15820
15821 case E_V8SFmode:
15822 if (TARGET_AVX)
15823 {
15824 tmp = gen_reg_rtx (V4SFmode);
15825 if (elt < 4)
15826 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
15827 else
15828 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
15829 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15830 return;
15831 }
15832 break;
15833
15834 case E_V4DFmode:
15835 if (TARGET_AVX)
15836 {
15837 tmp = gen_reg_rtx (V2DFmode);
15838 if (elt < 2)
15839 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
15840 else
15841 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
15842 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15843 return;
15844 }
15845 break;
15846
15847 case E_V32QImode:
15848 if (TARGET_AVX)
15849 {
15850 tmp = gen_reg_rtx (V16QImode);
15851 if (elt < 16)
15852 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
15853 else
15854 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
15855 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15856 return;
15857 }
15858 break;
15859
15860 case E_V16HImode:
15861 if (TARGET_AVX)
15862 {
15863 tmp = gen_reg_rtx (V8HImode);
15864 if (elt < 8)
15865 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
15866 else
15867 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
15868 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15869 return;
15870 }
15871 break;
15872
15873 case E_V8SImode:
15874 if (TARGET_AVX)
15875 {
15876 tmp = gen_reg_rtx (V4SImode);
15877 if (elt < 4)
15878 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
15879 else
15880 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
15881 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15882 return;
15883 }
15884 break;
15885
15886 case E_V4DImode:
15887 if (TARGET_AVX)
15888 {
15889 tmp = gen_reg_rtx (V2DImode);
15890 if (elt < 2)
15891 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
15892 else
15893 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
15894 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15895 return;
15896 }
15897 break;
15898
15899 case E_V32HImode:
15900 if (TARGET_AVX512BW)
15901 {
15902 tmp = gen_reg_rtx (V16HImode);
15903 if (elt < 16)
15904 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
15905 else
15906 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
15907 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15908 return;
15909 }
15910 break;
15911
15912 case E_V64QImode:
15913 if (TARGET_AVX512BW)
15914 {
15915 tmp = gen_reg_rtx (V32QImode);
15916 if (elt < 32)
15917 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
15918 else
15919 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
15920 ix86_expand_vector_extract (false, target, tmp, elt & 31);
15921 return;
15922 }
15923 break;
15924
15925 case E_V16SFmode:
15926 tmp = gen_reg_rtx (V8SFmode);
15927 if (elt < 8)
15928 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
15929 else
15930 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
15931 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15932 return;
15933
15934 case E_V8DFmode:
15935 tmp = gen_reg_rtx (V4DFmode);
15936 if (elt < 4)
15937 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
15938 else
15939 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
15940 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15941 return;
15942
15943 case E_V16SImode:
15944 tmp = gen_reg_rtx (V8SImode);
15945 if (elt < 8)
15946 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
15947 else
15948 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
15949 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15950 return;
15951
15952 case E_V8DImode:
15953 tmp = gen_reg_rtx (V4DImode);
15954 if (elt < 4)
15955 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
15956 else
15957 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
15958 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15959 return;
15960
9e2a82e1 15961 case E_V32HFmode:
15962 tmp = gen_reg_rtx (V16HFmode);
15963 if (elt < 16)
15964 emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
15965 else
15966 emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
15967 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15968 return;
15969
15970 case E_V16HFmode:
15971 tmp = gen_reg_rtx (V8HFmode);
15972 if (elt < 8)
15973 emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
15974 else
15975 emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
15976 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15977 return;
15978
15979 case E_V8HFmode:
15980 use_vec_extr = true;
15981 break;
15982
2bf6d935 15983 case E_V8QImode:
5fbc8ab4 15984 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
2bf6d935 15985 /* ??? Could extract the appropriate HImode element and shift. */
5fbc8ab4
UB
15986 break;
15987
2bf6d935
ML
15988 default:
15989 break;
15990 }
15991
15992 if (use_vec_extr)
15993 {
15994 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
15995 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
15996
15997 /* Let the rtl optimizers know about the zero extension performed. */
15998 if (inner_mode == QImode || inner_mode == HImode)
15999 {
16000 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
16001 target = gen_lowpart (SImode, target);
16002 }
16003
16004 emit_insn (gen_rtx_SET (target, tmp));
16005 }
16006 else
16007 {
16008 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16009
16010 emit_move_insn (mem, vec);
16011
16012 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
16013 emit_move_insn (target, tmp);
16014 }
16015}
16016
16017/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
16018 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
16019 The upper bits of DEST are undefined, though they shouldn't cause
16020 exceptions (some bits from src or all zeros are ok). */
16021
16022static void
16023emit_reduc_half (rtx dest, rtx src, int i)
16024{
16025 rtx tem, d = dest;
16026 switch (GET_MODE (src))
16027 {
16028 case E_V4SFmode:
16029 if (i == 128)
16030 tem = gen_sse_movhlps (dest, src, src);
16031 else
16032 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
16033 GEN_INT (1 + 4), GEN_INT (1 + 4));
16034 break;
16035 case E_V2DFmode:
16036 tem = gen_vec_interleave_highv2df (dest, src, src);
16037 break;
16038 case E_V16QImode:
16039 case E_V8HImode:
16040 case E_V4SImode:
16041 case E_V2DImode:
16042 d = gen_reg_rtx (V1TImode);
16043 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
16044 GEN_INT (i / 2));
16045 break;
16046 case E_V8SFmode:
16047 if (i == 256)
16048 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
16049 else
16050 tem = gen_avx_shufps256 (dest, src, src,
16051 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
16052 break;
16053 case E_V4DFmode:
16054 if (i == 256)
16055 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
16056 else
16057 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
16058 break;
16059 case E_V32QImode:
16060 case E_V16HImode:
16061 case E_V8SImode:
16062 case E_V4DImode:
16063 if (i == 256)
16064 {
16065 if (GET_MODE (dest) != V4DImode)
16066 d = gen_reg_rtx (V4DImode);
16067 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
16068 gen_lowpart (V4DImode, src),
16069 const1_rtx);
16070 }
16071 else
16072 {
16073 d = gen_reg_rtx (V2TImode);
16074 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
16075 GEN_INT (i / 2));
16076 }
16077 break;
16078 case E_V64QImode:
16079 case E_V32HImode:
bee27152
JJ
16080 if (i < 64)
16081 {
16082 d = gen_reg_rtx (V4TImode);
16083 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
16084 GEN_INT (i / 2));
16085 break;
16086 }
16087 /* FALLTHRU */
2bf6d935
ML
16088 case E_V16SImode:
16089 case E_V16SFmode:
16090 case E_V8DImode:
16091 case E_V8DFmode:
16092 if (i > 128)
16093 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
16094 gen_lowpart (V16SImode, src),
16095 gen_lowpart (V16SImode, src),
16096 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
16097 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
16098 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
16099 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
16100 GEN_INT (0xC), GEN_INT (0xD),
16101 GEN_INT (0xE), GEN_INT (0xF),
16102 GEN_INT (0x10), GEN_INT (0x11),
16103 GEN_INT (0x12), GEN_INT (0x13),
16104 GEN_INT (0x14), GEN_INT (0x15),
16105 GEN_INT (0x16), GEN_INT (0x17));
2bf6d935
ML
16106 else
16107 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
bee27152
JJ
16108 gen_lowpart (V16SImode, src),
16109 GEN_INT (i == 128 ? 0x2 : 0x1),
16110 GEN_INT (0x3),
16111 GEN_INT (0x3),
16112 GEN_INT (0x3),
16113 GEN_INT (i == 128 ? 0x6 : 0x5),
16114 GEN_INT (0x7),
16115 GEN_INT (0x7),
16116 GEN_INT (0x7),
16117 GEN_INT (i == 128 ? 0xA : 0x9),
16118 GEN_INT (0xB),
16119 GEN_INT (0xB),
16120 GEN_INT (0xB),
16121 GEN_INT (i == 128 ? 0xE : 0xD),
16122 GEN_INT (0xF),
16123 GEN_INT (0xF),
16124 GEN_INT (0xF));
2bf6d935
ML
16125 break;
16126 default:
16127 gcc_unreachable ();
16128 }
16129 emit_insn (tem);
16130 if (d != dest)
16131 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
16132}
16133
16134/* Expand a vector reduction. FN is the binary pattern to reduce;
16135 DEST is the destination; IN is the input vector. */
16136
16137void
16138ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
16139{
16140 rtx half, dst, vec = in;
16141 machine_mode mode = GET_MODE (in);
16142 int i;
16143
16144 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
16145 if (TARGET_SSE4_1
16146 && mode == V8HImode
16147 && fn == gen_uminv8hi3)
16148 {
16149 emit_insn (gen_sse4_1_phminposuw (dest, in));
16150 return;
16151 }
16152
16153 for (i = GET_MODE_BITSIZE (mode);
16154 i > GET_MODE_UNIT_BITSIZE (mode);
16155 i >>= 1)
16156 {
16157 half = gen_reg_rtx (mode);
16158 emit_reduc_half (half, vec, i);
16159 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
16160 dst = dest;
16161 else
16162 dst = gen_reg_rtx (mode);
16163 emit_insn (fn (dst, half, vec));
16164 vec = dst;
16165 }
16166}
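/* Editor's sketch (illustrative, not part of the expander): the loop above
   performs a log2(N) tree reduction.  A scalar model for a 4-element vector
   and a commutative, associative operation OP:

     float
     reduce4 (const float v[4], float (*op) (float, float))
     {
       /* i == 128: fold the upper half onto the lower half.  */
       float a = op (v[0], v[2]), b = op (v[1], v[3]);
       /* i == 64: fold the remaining pair.  */
       return op (a, b);
     }

   The vector version keeps partial results in every lane; the full
   reduction ends up in lane 0 of DEST.  */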
16167
16168/* Output code to perform a conditional jump to LABEL, if C2 flag in
16169 FP status register is set. */
16170
16171void
16172ix86_emit_fp_unordered_jump (rtx label)
16173{
16174 rtx reg = gen_reg_rtx (HImode);
16175 rtx_insn *insn;
16176 rtx temp;
16177
16178 emit_insn (gen_x86_fnstsw_1 (reg));
16179
16180 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
16181 {
16182 emit_insn (gen_x86_sahf_1 (reg));
16183
16184 temp = gen_rtx_REG (CCmode, FLAGS_REG);
16185 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
16186 }
16187 else
16188 {
16189 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
16190
16191 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
16192 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
16193 }
16194
16195 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
16196 gen_rtx_LABEL_REF (VOIDmode, label),
16197 pc_rtx);
16198 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
16199 predict_jump (REG_BR_PROB_BASE * 10 / 100);
16200 JUMP_LABEL (insn) = label;
16201}
16202
16203/* Output code to perform a sinh XFmode calculation. */
16204
16205void ix86_emit_i387_sinh (rtx op0, rtx op1)
16206{
16207 rtx e1 = gen_reg_rtx (XFmode);
16208 rtx e2 = gen_reg_rtx (XFmode);
16209 rtx scratch = gen_reg_rtx (HImode);
16210 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
16211 rtx half = const_double_from_real_value (dconsthalf, XFmode);
16212 rtx cst1, tmp;
16213 rtx_code_label *jump_label = gen_label_rtx ();
16214 rtx_insn *insn;
16215
16216 /* scratch = fxam (op1) */
16217 emit_insn (gen_fxamxf2_i387 (scratch, op1));
16218
16219 /* e1 = expm1 (|op1|) */
16220 emit_insn (gen_absxf2 (e2, op1));
16221 emit_insn (gen_expm1xf2 (e1, e2));
16222
16223 /* e2 = e1 / (e1 + 1.0) + e1 */
16224 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16225 emit_insn (gen_addxf3 (e2, e1, cst1));
16226 emit_insn (gen_divxf3 (e2, e1, e2));
16227 emit_insn (gen_addxf3 (e2, e2, e1));
16228
16229 /* flags = signbit (op1) */
16230 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
16231
16232 /* if (flags) then e2 = -e2 */
16233 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
16234 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
16235 gen_rtx_LABEL_REF (VOIDmode, jump_label),
16236 pc_rtx);
16237 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16238 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16239 JUMP_LABEL (insn) = jump_label;
16240
16241 emit_insn (gen_negxf2 (e2, e2));
16242
16243 emit_label (jump_label);
16244 LABEL_NUSES (jump_label) = 1;
16245
16246 /* op0 = 0.5 * e2 */
16247 half = force_reg (XFmode, half);
16248 emit_insn (gen_mulxf3 (op0, e2, half));
16249}
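/* Editor's note (illustrative): the sequence above uses, for
   t = expm1 (|x|),

     sinh (|x|) = (e^|x| - e^-|x|) / 2 = 0.5 * (t + t / (t + 1)),

   which avoids cancellation for small |x|.  A minimal scalar sketch,
   assuming <math.h> expm1/fabs/copysign (sinh_model is a hypothetical
   helper, not part of GCC):

     double
     sinh_model (double x)
     {
       double t = expm1 (fabs (x));
       return copysign (0.5 * (t + t / (t + 1.0)), x);
     }
*/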
16250
16251/* Output code to perform a cosh XFmode calculation. */
16252
16253void ix86_emit_i387_cosh (rtx op0, rtx op1)
16254{
16255 rtx e1 = gen_reg_rtx (XFmode);
16256 rtx e2 = gen_reg_rtx (XFmode);
16257 rtx half = const_double_from_real_value (dconsthalf, XFmode);
16258 rtx cst1;
16259
16260 /* e1 = exp (op1) */
16261 emit_insn (gen_expxf2 (e1, op1));
16262
16263 /* e2 = e1 + 1.0 / e1 */
16264 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16265 emit_insn (gen_divxf3 (e2, cst1, e1));
16266 emit_insn (gen_addxf3 (e2, e1, e2));
16267
16268 /* op0 = 0.5 * e2 */
16269 half = force_reg (XFmode, half);
16270 emit_insn (gen_mulxf3 (op0, e2, half));
16271}
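/* Editor's note (illustrative): cosh (x) = (e^x + e^-x) / 2 is computed as
   0.5 * (e1 + 1.0 / e1) with e1 = exp (x); no sign handling is needed since
   cosh is even.  Minimal scalar sketch, assuming <math.h> exp (cosh_model
   is a hypothetical helper):

     double
     cosh_model (double x)
     {
       double e = exp (x);
       return 0.5 * (e + 1.0 / e);
     }
*/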
16272
16273/* Output code to perform a tanh XFmode calculation. */
16274
16275void ix86_emit_i387_tanh (rtx op0, rtx op1)
16276{
16277 rtx e1 = gen_reg_rtx (XFmode);
16278 rtx e2 = gen_reg_rtx (XFmode);
16279 rtx scratch = gen_reg_rtx (HImode);
16280 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
16281 rtx cst2, tmp;
16282 rtx_code_label *jump_label = gen_label_rtx ();
16283 rtx_insn *insn;
16284
16285 /* scratch = fxam (op1) */
16286 emit_insn (gen_fxamxf2_i387 (scratch, op1));
16287
16288 /* e1 = expm1 (-|2 * op1|) */
16289 emit_insn (gen_addxf3 (e2, op1, op1));
16290 emit_insn (gen_absxf2 (e2, e2));
16291 emit_insn (gen_negxf2 (e2, e2));
16292 emit_insn (gen_expm1xf2 (e1, e2));
16293
16294 /* e2 = e1 / (e1 + 2.0) */
16295 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
16296 emit_insn (gen_addxf3 (e2, e1, cst2));
16297 emit_insn (gen_divxf3 (e2, e1, e2));
16298
16299 /* flags = signbit (op1) */
16300 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
16301
16302 /* if (!flags) then e2 = -e2 */
16303 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
16304 gen_rtx_NE (VOIDmode, flags, const0_rtx),
16305 gen_rtx_LABEL_REF (VOIDmode, jump_label),
16306 pc_rtx);
16307 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16308 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16309 JUMP_LABEL (insn) = jump_label;
16310
16311 emit_insn (gen_negxf2 (e2, e2));
16312
16313 emit_label (jump_label);
16314 LABEL_NUSES (jump_label) = 1;
16315
16316 emit_move_insn (op0, e2);
16317}
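/* Editor's note (illustrative): with t = expm1 (-2 * |x|),

     tanh (|x|) = (1 - e^-2|x|) / (1 + e^-2|x|) = -t / (t + 2),

   so the code above negates t / (t + 2) when op1 is non-negative and keeps
   it as-is otherwise.  Minimal scalar sketch, assuming <math.h>
   expm1/fabs/copysign (tanh_model is a hypothetical helper):

     double
     tanh_model (double x)
     {
       double t = expm1 (-2.0 * fabs (x));
       return copysign (-t / (t + 2.0), x);
     }
*/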
16318
16319/* Output code to perform an asinh XFmode calculation. */
16320
16321void ix86_emit_i387_asinh (rtx op0, rtx op1)
16322{
16323 rtx e1 = gen_reg_rtx (XFmode);
16324 rtx e2 = gen_reg_rtx (XFmode);
16325 rtx scratch = gen_reg_rtx (HImode);
16326 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
16327 rtx cst1, tmp;
16328 rtx_code_label *jump_label = gen_label_rtx ();
16329 rtx_insn *insn;
16330
16331 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
16332 emit_insn (gen_mulxf3 (e1, op1, op1));
16333 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16334 emit_insn (gen_addxf3 (e2, e1, cst1));
16335 emit_insn (gen_sqrtxf2 (e2, e2));
16336 emit_insn (gen_addxf3 (e2, e2, cst1));
16337
16338 /* e1 = e1 / e2 */
16339 emit_insn (gen_divxf3 (e1, e1, e2));
16340
16341 /* scratch = fxam (op1) */
16342 emit_insn (gen_fxamxf2_i387 (scratch, op1));
16343
16344 /* e1 = e1 + |op1| */
16345 emit_insn (gen_absxf2 (e2, op1));
16346 emit_insn (gen_addxf3 (e1, e1, e2));
16347
16348 /* e2 = log1p (e1) */
16349 ix86_emit_i387_log1p (e2, e1);
16350
16351 /* flags = signbit (op1) */
16352 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
16353
16354 /* if (flags) then e2 = -e2 */
16355 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
16356 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
16357 gen_rtx_LABEL_REF (VOIDmode, jump_label),
16358 pc_rtx);
16359 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16360 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16361 JUMP_LABEL (insn) = jump_label;
16362
16363 emit_insn (gen_negxf2 (e2, e2));
16364
16365 emit_label (jump_label);
16366 LABEL_NUSES (jump_label) = 1;
16367
16368 emit_move_insn (op0, e2);
16369}
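/* Editor's note (illustrative): using
   sqrt (x*x + 1) - 1 = x*x / (sqrt (x*x + 1) + 1),

     asinh (|x|) = log (|x| + sqrt (x*x + 1))
		 = log1p (|x| + x*x / (sqrt (x*x + 1) + 1)),

   which is the quantity fed to ix86_emit_i387_log1p above before the final
   sign adjustment.  Minimal scalar sketch, assuming <math.h>
   log1p/sqrt/fabs/copysign (asinh_model is a hypothetical helper):

     double
     asinh_model (double x)
     {
       double a = fabs (x), s = sqrt (x * x + 1.0);
       return copysign (log1p (a + x * x / (s + 1.0)), x);
     }
*/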
16370
16371/* Output code to perform an acosh XFmode calculation. */
16372
16373void ix86_emit_i387_acosh (rtx op0, rtx op1)
16374{
16375 rtx e1 = gen_reg_rtx (XFmode);
16376 rtx e2 = gen_reg_rtx (XFmode);
16377 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16378
16379 /* e2 = sqrt (op1 + 1.0) */
16380 emit_insn (gen_addxf3 (e2, op1, cst1));
16381 emit_insn (gen_sqrtxf2 (e2, e2));
16382
16383 /* e1 = sqrt (op1 - 1.0) */
16384 emit_insn (gen_subxf3 (e1, op1, cst1));
16385 emit_insn (gen_sqrtxf2 (e1, e1));
16386
16387 /* e1 = e1 * e2 */
16388 emit_insn (gen_mulxf3 (e1, e1, e2));
16389
16390 /* e1 = e1 + op1 */
16391 emit_insn (gen_addxf3 (e1, e1, op1));
16392
16393 /* op0 = log (e1) */
16394 emit_insn (gen_logxf2 (op0, e1));
16395}
16396
16397/* Output code to perform an atanh XFmode calculation. */
16398
16399void ix86_emit_i387_atanh (rtx op0, rtx op1)
16400{
16401 rtx e1 = gen_reg_rtx (XFmode);
16402 rtx e2 = gen_reg_rtx (XFmode);
16403 rtx scratch = gen_reg_rtx (HImode);
16404 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
16405 rtx half = const_double_from_real_value (dconsthalf, XFmode);
16406 rtx cst1, tmp;
16407 rtx_code_label *jump_label = gen_label_rtx ();
16408 rtx_insn *insn;
16409
16410 /* scratch = fxam (op1) */
16411 emit_insn (gen_fxamxf2_i387 (scratch, op1));
16412
16413 /* e2 = |op1| */
16414 emit_insn (gen_absxf2 (e2, op1));
16415
16416 /* e1 = -(e2 + e2) / (e2 + 1.0) */
16417 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16418 emit_insn (gen_addxf3 (e1, e2, cst1));
16419 emit_insn (gen_addxf3 (e2, e2, e2));
16420 emit_insn (gen_negxf2 (e2, e2));
16421 emit_insn (gen_divxf3 (e1, e2, e1));
16422
16423 /* e2 = log1p (e1) */
16424 ix86_emit_i387_log1p (e2, e1);
16425
16426 /* flags = signbit (op1) */
16427 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
16428
16429 /* if (!flags) then e2 = -e2 */
16430 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
16431 gen_rtx_NE (VOIDmode, flags, const0_rtx),
16432 gen_rtx_LABEL_REF (VOIDmode, jump_label),
16433 pc_rtx);
16434 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16435 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16436 JUMP_LABEL (insn) = jump_label;
16437
16438 emit_insn (gen_negxf2 (e2, e2));
16439
16440 emit_label (jump_label);
16441 LABEL_NUSES (jump_label) = 1;
16442
16443 /* op0 = 0.5 * e2 */
16444 half = force_reg (XFmode, half);
16445 emit_insn (gen_mulxf3 (op0, e2, half));
16446}
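/* Editor's note (illustrative): with a = |x| and u = -2 * a / (a + 1),

     log1p (u) = log ((1 - a) / (1 + a)) = -2 * atanh (a),

   so negating the log1p result for non-negative op1 and multiplying by 0.5
   yields atanh (x).  Minimal scalar sketch, assuming <math.h>
   log1p/fabs/copysign (atanh_model is a hypothetical helper):

     double
     atanh_model (double x)
     {
       double a = fabs (x);
       return copysign (-0.5 * log1p (-2.0 * a / (a + 1.0)), x);
     }
*/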
16447
16448/* Output code to perform a log1p XFmode calculation. */
16449
16450void ix86_emit_i387_log1p (rtx op0, rtx op1)
16451{
16452 rtx_code_label *label1 = gen_label_rtx ();
16453 rtx_code_label *label2 = gen_label_rtx ();
16454
16455 rtx tmp = gen_reg_rtx (XFmode);
16456 rtx res = gen_reg_rtx (XFmode);
16457 rtx cst, cstln2, cst1;
16458 rtx_insn *insn;
16459
16460 cst = const_double_from_real_value
16461 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
16462 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
16463
16464 emit_insn (gen_absxf2 (tmp, op1));
16465
16466 cst = force_reg (XFmode, cst);
16467 ix86_expand_branch (GE, tmp, cst, label1);
16468 predict_jump (REG_BR_PROB_BASE * 10 / 100);
16469 insn = get_last_insn ();
16470 JUMP_LABEL (insn) = label1;
16471
16472 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
16473 emit_jump (label2);
16474
16475 emit_label (label1);
16476 LABEL_NUSES (label1) = 1;
16477
16478 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
16479 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
16480 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
16481
16482 emit_label (label2);
16483 LABEL_NUSES (label2) = 1;
16484
16485 emit_move_insn (op0, res);
16486}
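/* Editor's note (illustrative): the threshold 0.29289321881... equals
   1 - sqrt(2)/2, the documented argument bound for the x87 FYL2XP1
   instruction.  For |x| below that bound the code computes
   ln2 * log2 (x + 1) directly via fyl2xp1, which preserves accuracy for
   small x; otherwise it falls back to fyl2x on 1 + x, i.e. an ordinary
   log (1 + x).  */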
16487
16488/* Emit code for round calculation. */
16489void ix86_emit_i387_round (rtx op0, rtx op1)
16490{
16491 machine_mode inmode = GET_MODE (op1);
16492 machine_mode outmode = GET_MODE (op0);
16493 rtx e1 = gen_reg_rtx (XFmode);
16494 rtx e2 = gen_reg_rtx (XFmode);
16495 rtx scratch = gen_reg_rtx (HImode);
16496 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
16497 rtx half = const_double_from_real_value (dconsthalf, XFmode);
16498 rtx res = gen_reg_rtx (outmode);
16499 rtx_code_label *jump_label = gen_label_rtx ();
16500 rtx (*floor_insn) (rtx, rtx);
16501 rtx (*neg_insn) (rtx, rtx);
16502 rtx_insn *insn;
16503 rtx tmp;
16504
16505 switch (inmode)
16506 {
16507 case E_SFmode:
16508 case E_DFmode:
16509 tmp = gen_reg_rtx (XFmode);
16510
16511 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
16512 op1 = tmp;
16513 break;
16514 case E_XFmode:
16515 break;
16516 default:
16517 gcc_unreachable ();
16518 }
16519
16520 switch (outmode)
16521 {
16522 case E_SFmode:
16523 floor_insn = gen_frndintxf2_floor;
16524 neg_insn = gen_negsf2;
16525 break;
16526 case E_DFmode:
16527 floor_insn = gen_frndintxf2_floor;
16528 neg_insn = gen_negdf2;
16529 break;
16530 case E_XFmode:
16531 floor_insn = gen_frndintxf2_floor;
16532 neg_insn = gen_negxf2;
16533 break;
16534 case E_HImode:
16535 floor_insn = gen_lfloorxfhi2;
16536 neg_insn = gen_neghi2;
16537 break;
16538 case E_SImode:
16539 floor_insn = gen_lfloorxfsi2;
16540 neg_insn = gen_negsi2;
16541 break;
16542 case E_DImode:
16543 floor_insn = gen_lfloorxfdi2;
16544 neg_insn = gen_negdi2;
16545 break;
16546 default:
16547 gcc_unreachable ();
16548 }
16549
16550 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
16551
16552 /* scratch = fxam(op1) */
16553 emit_insn (gen_fxamxf2_i387 (scratch, op1));
16554
16555 /* e1 = fabs(op1) */
16556 emit_insn (gen_absxf2 (e1, op1));
16557
16558 /* e2 = e1 + 0.5 */
16559 half = force_reg (XFmode, half);
16560 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
16561
16562 /* res = floor(e2) */
16563 switch (outmode)
16564 {
16565 case E_SFmode:
16566 case E_DFmode:
16567 {
16568 tmp = gen_reg_rtx (XFmode);
16569
16570 emit_insn (floor_insn (tmp, e2));
16571 emit_insn (gen_rtx_SET (res,
16572 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
16573 UNSPEC_TRUNC_NOOP)));
16574 }
16575 break;
16576 default:
16577 emit_insn (floor_insn (res, e2));
16578 }
16579
16580 /* flags = signbit(a) */
16581 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
16582
16583 /* if (flags) then res = -res */
16584 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
16585 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
16586 gen_rtx_LABEL_REF (VOIDmode, jump_label),
16587 pc_rtx);
16588 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16589 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16590 JUMP_LABEL (insn) = jump_label;
16591
16592 emit_insn (neg_insn (res, res));
16593
16594 emit_label (jump_label);
16595 LABEL_NUSES (jump_label) = 1;
16596
16597 emit_move_insn (op0, res);
16598}
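/* Editor's sketch (illustrative): the sequence implements the comment's
   round (a) = sgn (a) * floor (fabs (a) + 0.5), i.e. halfway cases round
   away from zero.  Minimal scalar model, assuming <math.h>
   floor/fabs/copysign (round_model is a hypothetical helper):

     double
     round_model (double a)
     {
       return copysign (floor (fabs (a) + 0.5), a);
     }
*/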
16599
16600/* Output code to perform a Newton-Raphson approximation of a single precision
16601 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
16602
16603void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
16604{
16605 rtx x0, x1, e0, e1;
16606
16607 x0 = gen_reg_rtx (mode);
16608 e0 = gen_reg_rtx (mode);
16609 e1 = gen_reg_rtx (mode);
16610 x1 = gen_reg_rtx (mode);
16611
16612 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
16613
16614 b = force_reg (mode, b);
16615
16616 /* x0 = rcp(b) estimate */
16617 if (mode == V16SFmode || mode == V8DFmode)
16618 {
16619 if (TARGET_AVX512ER)
16620 {
16621 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
16622 UNSPEC_RCP28)));
16623 /* res = a * x0 */
16624 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
16625 return;
16626 }
16627 else
16628 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
16629 UNSPEC_RCP14)));
16630 }
16631 else
16632 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
16633 UNSPEC_RCP)));
16634
16635 /* e0 = x0 * b */
16636 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
16637
16638 /* e0 = x0 * e0 */
16639 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
16640
16641 /* e1 = x0 + x0 */
16642 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
16643
16644 /* x1 = e1 - e0 */
16645 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
16646
16647 /* res = a * x1 */
16648 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
16649}
16650
16651/* Output code to perform a Newton-Raphson approximation of a
16652 single precision floating point [reciprocal] square root. */
16653
16654void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
16655{
16656 rtx x0, e0, e1, e2, e3, mthree, mhalf;
16657 REAL_VALUE_TYPE r;
16658 int unspec;
16659
16660 x0 = gen_reg_rtx (mode);
16661 e0 = gen_reg_rtx (mode);
16662 e1 = gen_reg_rtx (mode);
16663 e2 = gen_reg_rtx (mode);
16664 e3 = gen_reg_rtx (mode);
16665
16666 if (TARGET_AVX512ER && mode == V16SFmode)
16667 {
16668 if (recip)
16669 /* res = rsqrt28(a) estimate */
16670 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16671 UNSPEC_RSQRT28)));
16672 else
16673 {
16674 /* x0 = rsqrt28(a) estimate */
16675 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16676 UNSPEC_RSQRT28)));
16677 /* res = rcp28(x0) estimate */
16678 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
16679 UNSPEC_RCP28)));
16680 }
16681 return;
16682 }
16683
16684 real_from_integer (&r, VOIDmode, -3, SIGNED);
16685 mthree = const_double_from_real_value (r, SFmode);
16686
16687 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
16688 mhalf = const_double_from_real_value (r, SFmode);
16689 unspec = UNSPEC_RSQRT;
16690
16691 if (VECTOR_MODE_P (mode))
16692 {
16693 mthree = ix86_build_const_vector (mode, true, mthree);
16694 mhalf = ix86_build_const_vector (mode, true, mhalf);
16695 /* There is no 512-bit rsqrt. There is however rsqrt14. */
16696 if (GET_MODE_SIZE (mode) == 64)
16697 unspec = UNSPEC_RSQRT14;
16698 }
16699
16700 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16701 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
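  /* These follow from the Newton-Raphson step for f (x) = 1/(x*x) - a:
	x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3)
     together with sqrt (a) = a * rsqrt (a), which is why the sqrt variant
     below scales by e0 = a * x0 instead of by x0.  */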
16702
16703 a = force_reg (mode, a);
16704
16705 /* x0 = rsqrt(a) estimate */
16706 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16707 unspec)));
16708
16709 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
16710 if (!recip)
16711 {
16712 rtx zero = force_reg (mode, CONST0_RTX(mode));
16713 rtx mask;
16714
16715 /* Handle masked compare. */
16716 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
16717 {
16718 mask = gen_reg_rtx (HImode);
16719 /* Imm value 0x4 corresponds to not-equal comparison. */
16720 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
16721 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
16722 }
16723 else
16724 {
16725 mask = gen_reg_rtx (mode);
16726 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
16727 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
16728 }
16729 }
16730
16731 mthree = force_reg (mode, mthree);
16732
16733 /* e0 = x0 * a */
16734 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
 16735
16736 unsigned vector_size = GET_MODE_SIZE (mode);
16737 if (TARGET_FMA
16738 || (TARGET_AVX512F && vector_size == 64)
16739 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
16740 emit_insn (gen_rtx_SET (e2,
16741 gen_rtx_FMA (mode, e0, x0, mthree)));
16742 else
16743 {
16744 /* e1 = e0 * x0 */
16745 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
16746
16747 /* e2 = e1 - 3. */
16748 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
16749 }
16750
16751 mhalf = force_reg (mode, mhalf);
16752 if (recip)
16753 /* e3 = -.5 * x0 */
16754 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
16755 else
16756 /* e3 = -.5 * e0 */
16757 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
16758 /* ret = e2 * e3 */
16759 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
16760}
16761
16762/* Expand fabs (OP0) and return a new rtx that holds the result. The
16763 mask for masking out the sign-bit is stored in *SMASK, if that is
16764 non-null. */
16765
16766static rtx
16767ix86_expand_sse_fabs (rtx op0, rtx *smask)
16768{
16769 machine_mode vmode, mode = GET_MODE (op0);
16770 rtx xa, mask;
16771
16772 xa = gen_reg_rtx (mode);
16773 if (mode == SFmode)
16774 vmode = V4SFmode;
16775 else if (mode == DFmode)
16776 vmode = V2DFmode;
16777 else
16778 vmode = mode;
16779 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
16780 if (!VECTOR_MODE_P (mode))
16781 {
16782 /* We need to generate a scalar mode mask in this case. */
16783 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16784 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16785 mask = gen_reg_rtx (mode);
16786 emit_insn (gen_rtx_SET (mask, tmp));
16787 }
16788 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
16789
16790 if (smask)
16791 *smask = mask;
16792
16793 return xa;
16794}
16795
16796/* Expands a comparison of OP0 with OP1 using comparison code CODE,
16797 swapping the operands if SWAP_OPERANDS is true. The expanded
16798 code is a forward jump to a newly created label in case the
16799 comparison is true. The generated label rtx is returned. */
16800static rtx_code_label *
16801ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
16802 bool swap_operands)
16803{
16804 bool unordered_compare = ix86_unordered_fp_compare (code);
16805 rtx_code_label *label;
16806 rtx tmp, reg;
16807
16808 if (swap_operands)
16809 std::swap (op0, op1);
16810
16811 label = gen_label_rtx ();
16812 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
16813 if (unordered_compare)
16814 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
16815 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
16816 emit_insn (gen_rtx_SET (reg, tmp));
16817 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
16818 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
16819 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
16820 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16821 JUMP_LABEL (tmp) = label;
16822
16823 return label;
16824}
16825
16826/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16827 using comparison code CODE. Operands are swapped for the comparison if
16828 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
16829static rtx
16830ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
16831 bool swap_operands)
16832{
16833 rtx (*insn)(rtx, rtx, rtx, rtx);
16834 machine_mode mode = GET_MODE (op0);
16835 rtx mask = gen_reg_rtx (mode);
16836
16837 if (swap_operands)
16838 std::swap (op0, op1);
16839
16840 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
16841
16842 emit_insn (insn (mask, op0, op1,
16843 gen_rtx_fmt_ee (code, mode, op0, op1)));
16844 return mask;
16845}
16846
16847/* Expand copysign from SIGN to the positive value ABS_VALUE
16848 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
16849 the sign-bit. */
16850
16851static void
16852ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
16853{
16854 machine_mode mode = GET_MODE (sign);
16855 rtx sgn = gen_reg_rtx (mode);
16856 if (mask == NULL_RTX)
16857 {
16858 machine_mode vmode;
16859
16860 if (mode == SFmode)
16861 vmode = V4SFmode;
16862 else if (mode == DFmode)
16863 vmode = V2DFmode;
16864 else
16865 vmode = mode;
16866
16867 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
16868 if (!VECTOR_MODE_P (mode))
16869 {
16870 /* We need to generate a scalar mode mask in this case. */
16871 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16872 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16873 mask = gen_reg_rtx (mode);
16874 emit_insn (gen_rtx_SET (mask, tmp));
16875 }
16876 }
16877 else
16878 mask = gen_rtx_NOT (mode, mask);
16879 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
16880 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
16881}
16882
16883/* Expand SSE sequence for computing lround from OP1 storing
16884 into OP0. */
16885
16886void
16887ix86_expand_lround (rtx op0, rtx op1)
16888{
16889 /* C code for the stuff we're doing below:
16890 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16891 return (long)tmp;
16892 */
16893 machine_mode mode = GET_MODE (op1);
16894 const struct real_format *fmt;
16895 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16896 rtx adj;
16897
16898 /* load nextafter (0.5, 0.0) */
16899 fmt = REAL_MODE_FORMAT (mode);
16900 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16901 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
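  /* pred_half is 0.5 - 2**(-p-1), i.e. nextafter (0.5, 0.0) in MODE.
     Using it instead of plain 0.5 keeps op1 + adj from rounding up to the
     next integer when op1 is just below a halfway point, which would make
     the final conversion round the wrong way.  */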
16902
16903 /* adj = copysign (0.5, op1) */
16904 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
16905 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
16906
16907 /* adj = op1 + adj */
16908 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
16909
16910 /* op0 = (imode)adj */
16911 expand_fix (op0, adj, 0);
16912}
16913
16914/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
16915 storing into OPERAND0. */
16916
16917void
16918ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
16919{
16920 /* C code for the stuff we're doing below (for do_floor):
16921 xi = (long)op1;
16922 xi -= (double)xi > op1 ? 1 : 0;
16923 return xi;
16924 */
16925 machine_mode fmode = GET_MODE (op1);
16926 machine_mode imode = GET_MODE (op0);
16927 rtx ireg, freg, tmp;
16928 rtx_code_label *label;
16929
16930 /* reg = (long)op1 */
16931 ireg = gen_reg_rtx (imode);
16932 expand_fix (ireg, op1, 0);
16933
16934 /* freg = (double)reg */
16935 freg = gen_reg_rtx (fmode);
16936 expand_float (freg, ireg, 0);
16937
16938 /* ireg = (freg > op1) ? ireg - 1 : ireg */
16939 label = ix86_expand_sse_compare_and_jump (UNLE,
16940 freg, op1, !do_floor);
16941 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
16942 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
16943 emit_move_insn (ireg, tmp);
16944
16945 emit_label (label);
16946 LABEL_NUSES (label) = 1;
16947
16948 emit_move_insn (op0, ireg);
16949}
16950
16951/* Generate and return a rtx of mode MODE for 2**n where n is the number
16952 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
16953
16954static rtx
16955ix86_gen_TWO52 (machine_mode mode)
16956{
 16957 const struct real_format *fmt;
16958 REAL_VALUE_TYPE TWO52r;
16959 rtx TWO52;
16960
16961 fmt = REAL_MODE_FORMAT (mode);
16962 real_2expN (&TWO52r, fmt->p - 1, mode);
16963 TWO52 = const_double_from_real_value (TWO52r, mode);
16964 TWO52 = force_reg (mode, TWO52);
16965
16966 return TWO52;
16967}
16968
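/* The expanders below rely on the usual add-and-subtract trick: once
   |x| >= 2**(p-1) every representable value is already an integer, and
   for smaller |x| the addition pushes the fractional bits out of the
   mantissa, so (x + TWO52) - TWO52 is x rounded to integer in the current
   rounding mode; e.g. (0.75 + 2**52) - 2**52 == 1.0 for DFmode under
   round-to-nearest.  The guarding compare against TWO52 leaves too-large
   (already integral) inputs untouched.  For SFmode the constant is really
   2**23; the TWO52 name reflects the DFmode case.  */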
16969/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16970
16971void
16972ix86_expand_rint (rtx operand0, rtx operand1)
16973{
16974 /* C code for the stuff we're doing below:
16975 xa = fabs (operand1);
 16976	 if (!isless (xa, 2**52))
 16977	   return operand1;
16978 two52 = 2**52;
16979 if (flag_rounding_math)
16980 {
16981 two52 = copysign (two52, operand1);
16982 xa = operand1;
16983 }
16984 xa = xa + two52 - two52;
16985 return copysign (xa, operand1);
16986 */
16987 machine_mode mode = GET_MODE (operand0);
 16988 rtx res, xa, TWO52, mask;
16989 rtx_code_label *label;
16990
16991 TWO52 = ix86_gen_TWO52 (mode);
16992
16993 /* Temporary for holding the result, initialized to the input
16994 operand to ease control flow. */
16995 res = copy_to_reg (operand1);
16996
16997 /* xa = abs (operand1) */
16998 xa = ix86_expand_sse_fabs (res, &mask);
16999
17000 /* if (!isless (xa, TWO52)) goto label; */
17001 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17002
17003 if (flag_rounding_math)
17004 {
 17005 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
17006 xa = res;
17007 }
17008
17009 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17010 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
17011
17012 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
17013 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
17014 xa = ix86_expand_sse_fabs (xa, NULL);
17015
17016 ix86_sse_copysign_to_positive (res, xa, res, mask);
17017
17018 emit_label (label);
17019 LABEL_NUSES (label) = 1;
17020
17021 emit_move_insn (operand0, res);
17022}
17023
17024/* Expand SSE2 sequence for computing floor or ceil
17025 from OPERAND1 storing into OPERAND0. */
17026void
17027ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
17028{
17029 /* C code for the stuff we expand below.
17030 double xa = fabs (x), x2;
17031 if (!isless (xa, TWO52))
17032 return x;
 17033 x2 = (double)(long)x;
 17034
17035 Compensate. Floor:
17036 if (x2 > x)
17037 x2 -= 1;
17038 Compensate. Ceil:
17039 if (x2 < x)
17040 x2 += 1;
 17041
17042 if (HONOR_SIGNED_ZEROS (mode))
17043 return copysign (x2, x);
17044 return x2;
17045 */
17046 machine_mode mode = GET_MODE (operand0);
17047 rtx xa, xi, TWO52, tmp, one, res, mask;
17048 rtx_code_label *label;
17049
17050 TWO52 = ix86_gen_TWO52 (mode);
17051
17052 /* Temporary for holding the result, initialized to the input
17053 operand to ease control flow. */
 17054 res = copy_to_reg (operand1);
17055
17056 /* xa = abs (operand1) */
17057 xa = ix86_expand_sse_fabs (res, &mask);
17058
17059 /* if (!isless (xa, TWO52)) goto label; */
17060 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17061
17062 /* xa = (double)(long)x */
 17063 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
17064 expand_fix (xi, res, 0);
17065 expand_float (xa, xi, 0);
17066
17067 /* generate 1.0 */
17068 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
17069
17070 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
17071 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
17072 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
17073 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
17074 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
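  /* The compare mask is all-ones where the condition holds and all-zeros
     elsewhere, so ANDing it with 1.0 yields exactly 1.0 or 0.0 and the
     binop above applies the +/-1 compensation without a branch.  */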
 17075 if (HONOR_SIGNED_ZEROS (mode))
17076 {
17077 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
17078 if (do_floor && flag_rounding_math)
17079 tmp = ix86_expand_sse_fabs (tmp, NULL);
17080
17081 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
17082 }
17083 emit_move_insn (res, tmp);
17084
17085 emit_label (label);
17086 LABEL_NUSES (label) = 1;
17087
17088 emit_move_insn (operand0, res);
17089}
17090
17091/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
17092 into OPERAND0 without relying on DImode truncation via cvttsd2siq
17093 that is only available on 64bit targets. */
 17094 void
 17095 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
17096{
17097 /* C code for the stuff we expand below.
17098 double xa = fabs (x), x2;
17099 if (!isless (xa, TWO52))
17100 return x;
17101 xa = xa + TWO52 - TWO52;
17102 x2 = copysign (xa, x);
 17103
 17104 Compensate. Floor:
17105 if (x2 > x)
17106 x2 -= 1;
 17107 Compensate. Ceil:
17108 if (x2 < x)
17109 x2 += 1;
 17110
17111 if (HONOR_SIGNED_ZEROS (mode))
17112 x2 = copysign (x2, x);
17113 return x2;
17114 */
17115 machine_mode mode = GET_MODE (operand0);
 17116 rtx xa, TWO52, tmp, one, res, mask;
17117 rtx_code_label *label;
17118
17119 TWO52 = ix86_gen_TWO52 (mode);
17120
17121 /* Temporary for holding the result, initialized to the input
17122 operand to ease control flow. */
 17123 res = copy_to_reg (operand1);
17124
17125 /* xa = abs (operand1) */
17126 xa = ix86_expand_sse_fabs (res, &mask);
17127
17128 /* if (!isless (xa, TWO52)) goto label; */
17129 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17130
17131 /* xa = xa + TWO52 - TWO52; */
17132 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17133 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
 17134
17135 /* xa = copysign (xa, operand1) */
17136 ix86_sse_copysign_to_positive (xa, xa, res, mask);
 17137
17138 /* generate 1.0 */
17139 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
 17140
17141 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
17142 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
17143 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
17144 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
17145 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17146 if (HONOR_SIGNED_ZEROS (mode))
17147 {
17148 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
17149 if (do_floor && flag_rounding_math)
17150 tmp = ix86_expand_sse_fabs (tmp, NULL);
17151
17152 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
17153 }
 17154 emit_move_insn (res, tmp);
17155
17156 emit_label (label);
17157 LABEL_NUSES (label) = 1;
17158
17159 emit_move_insn (operand0, res);
17160}
17161
17162/* Expand SSE sequence for computing trunc
17163 from OPERAND1 storing into OPERAND0. */
17164void
17165ix86_expand_trunc (rtx operand0, rtx operand1)
17166{
17167 /* C code for SSE variant we expand below.
17168 double xa = fabs (x), x2;
17169 if (!isless (xa, TWO52))
17170 return x;
17171 x2 = (double)(long)x;
17172 if (HONOR_SIGNED_ZEROS (mode))
17173 return copysign (x2, x);
17174 return x2;
17175 */
17176 machine_mode mode = GET_MODE (operand0);
17177 rtx xa, xi, TWO52, res, mask;
17178 rtx_code_label *label;
17179
17180 TWO52 = ix86_gen_TWO52 (mode);
17181
17182 /* Temporary for holding the result, initialized to the input
17183 operand to ease control flow. */
 17184 res = copy_to_reg (operand1);
17185
17186 /* xa = abs (operand1) */
17187 xa = ix86_expand_sse_fabs (res, &mask);
17188
17189 /* if (!isless (xa, TWO52)) goto label; */
17190 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17191
 17192 /* xa = (double)(long)x */
 17193 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
 17194 expand_fix (xi, res, 0);
 17195 expand_float (xa, xi, 0);
17196
17197 if (HONOR_SIGNED_ZEROS (mode))
17198 ix86_sse_copysign_to_positive (xa, xa, res, mask);
17199
17200 emit_move_insn (res, xa);
17201
17202 emit_label (label);
17203 LABEL_NUSES (label) = 1;
17204
17205 emit_move_insn (operand0, res);
17206}
17207
17208/* Expand SSE sequence for computing trunc from OPERAND1 storing
17209 into OPERAND0 without relying on DImode truncation via cvttsd2siq
17210 that is only available on 64bit targets. */
17211void
17212ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
17213{
17214 machine_mode mode = GET_MODE (operand0);
 17215 rtx xa, xa2, TWO52, tmp, one, res, mask;
17216 rtx_code_label *label;
17217
17218 /* C code for SSE variant we expand below.
17219 double xa = fabs (x), x2;
17220 if (!isless (xa, TWO52))
17221 return x;
17222 xa2 = xa + TWO52 - TWO52;
 17223 Compensate:
17224 if (xa2 > xa)
17225 xa2 -= 1.0;
17226 x2 = copysign (xa2, x);
17227 return x2;
17228 */
17229
17230 TWO52 = ix86_gen_TWO52 (mode);
17231
17232 /* Temporary for holding the result, initialized to the input
17233 operand to ease control flow. */
 17234 res = copy_to_reg (operand1);
17235
17236 /* xa = abs (operand1) */
 17237 xa = ix86_expand_sse_fabs (res, &mask);
17238
17239 /* if (!isless (xa, TWO52)) goto label; */
17240 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17241
17242 /* xa2 = xa + TWO52 - TWO52; */
17243 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17244 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
17245
17246 /* generate 1.0 */
17247 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
17248
17249 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
17250 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
17251 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
 17252 tmp = expand_simple_binop (mode, MINUS,
17253 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17254 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
 17255 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
 17256 tmp = ix86_expand_sse_fabs (tmp, NULL);
 17257
17258 /* res = copysign (xa2, operand1) */
17259 ix86_sse_copysign_to_positive (res, tmp, res, mask);
17260
17261 emit_label (label);
17262 LABEL_NUSES (label) = 1;
17263
17264 emit_move_insn (operand0, res);
17265}
17266
17267/* Expand SSE sequence for computing round
17268 from OPERAND1 storing into OPERAND0. */
17269void
17270ix86_expand_round (rtx operand0, rtx operand1)
17271{
17272 /* C code for the stuff we're doing below:
17273 double xa = fabs (x);
17274 if (!isless (xa, TWO52))
17275 return x;
17276 xa = (double)(long)(xa + nextafter (0.5, 0.0));
17277 return copysign (xa, x);
17278 */
17279 machine_mode mode = GET_MODE (operand0);
17280 rtx res, TWO52, xa, xi, half, mask;
17281 rtx_code_label *label;
17282 const struct real_format *fmt;
17283 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17284
17285 /* Temporary for holding the result, initialized to the input
17286 operand to ease control flow. */
 17287 res = copy_to_reg (operand1);
17288
17289 TWO52 = ix86_gen_TWO52 (mode);
17290 xa = ix86_expand_sse_fabs (res, &mask);
17291 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17292
17293 /* load nextafter (0.5, 0.0) */
17294 fmt = REAL_MODE_FORMAT (mode);
17295 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17296 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17297
17298 /* xa = xa + 0.5 */
17299 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
17300 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
17301
17302 /* xa = (double)(int64_t)xa */
 17303 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
17304 expand_fix (xi, xa, 0);
17305 expand_float (xa, xi, 0);
17306
17307 /* res = copysign (xa, operand1) */
 17308 ix86_sse_copysign_to_positive (res, xa, res, mask);
17309
17310 emit_label (label);
17311 LABEL_NUSES (label) = 1;
17312
17313 emit_move_insn (operand0, res);
17314}
17315
17316/* Expand SSE sequence for computing round from OPERAND1 storing
17317 into OPERAND0 without relying on DImode truncation via cvttsd2siq
17318 that is only available on 64bit targets. */
17319void
17320ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
17321{
17322 /* C code for the stuff we expand below.
17323 double xa = fabs (x), xa2, x2;
17324 if (!isless (xa, TWO52))
17325 return x;
17326 Using the absolute value and copying back sign makes
17327 -0.0 -> -0.0 correct.
 17328 xa2 = xa + TWO52 - TWO52;
17329 Compensate.
17330 dxa = xa2 - xa;
17331 if (dxa <= -0.5)
17332 xa2 += 1;
17333 else if (dxa > 0.5)
17334 xa2 -= 1;
17335 x2 = copysign (xa2, x);
17336 return x2;
17337 */
17338 machine_mode mode = GET_MODE (operand0);
17339 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
17340 rtx_code_label *label;
17341
17342 TWO52 = ix86_gen_TWO52 (mode);
17343
17344 /* Temporary for holding the result, initialized to the input
17345 operand to ease control flow. */
 17346 res = copy_to_reg (operand1);
17347
17348 /* xa = abs (operand1) */
17349 xa = ix86_expand_sse_fabs (res, &mask);
17350
17351 /* if (!isless (xa, TWO52)) goto label; */
17352 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
17353
17354 /* xa2 = xa + TWO52 - TWO52; */
17355 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
17356 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
17357
17358 /* dxa = xa2 - xa; */
17359 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
17360
17361 /* generate 0.5, 1.0 and -0.5 */
17362 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
17363 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
17364 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
17365 0, OPTAB_DIRECT);
17366
17367 /* Compensate. */
17368 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
17369 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
17370 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
17371 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17372 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
17373 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
17374 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
17375 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
17376
17377 /* res = copysign (xa2, operand1) */
 17378 ix86_sse_copysign_to_positive (res, xa2, res, mask);
17379
17380 emit_label (label);
17381 LABEL_NUSES (label) = 1;
17382
17383 emit_move_insn (operand0, res);
17384}
17385
17386/* Expand SSE sequence for computing round
17387 from OP1 storing into OP0 using sse4 round insn. */
17388void
17389ix86_expand_round_sse4 (rtx op0, rtx op1)
17390{
17391 machine_mode mode = GET_MODE (op0);
17392 rtx e1, e2, res, half;
17393 const struct real_format *fmt;
17394 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17395 rtx (*gen_copysign) (rtx, rtx, rtx);
17396 rtx (*gen_round) (rtx, rtx, rtx);
17397
17398 switch (mode)
17399 {
17400 case E_SFmode:
17401 gen_copysign = gen_copysignsf3;
17402 gen_round = gen_sse4_1_roundsf2;
17403 break;
17404 case E_DFmode:
17405 gen_copysign = gen_copysigndf3;
17406 gen_round = gen_sse4_1_rounddf2;
17407 break;
17408 default:
17409 gcc_unreachable ();
17410 }
17411
17412 /* round (a) = trunc (a + copysign (0.5, a)) */
17413
17414 /* load nextafter (0.5, 0.0) */
17415 fmt = REAL_MODE_FORMAT (mode);
17416 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17417 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17418 half = const_double_from_real_value (pred_half, mode);
17419
17420 /* e1 = copysign (0.5, op1) */
17421 e1 = gen_reg_rtx (mode);
17422 emit_insn (gen_copysign (e1, half, op1));
17423
17424 /* e2 = op1 + e1 */
17425 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
17426
17427 /* res = trunc (e2) */
17428 res = gen_reg_rtx (mode);
17429 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
17430
17431 emit_move_insn (op0, res);
17432}
17433
17434/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
17435 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
17436 insn every time. */
17437
17438static GTY(()) rtx_insn *vselect_insn;
17439
17440/* Initialize vselect_insn. */
17441
17442static void
17443init_vselect_insn (void)
17444{
17445 unsigned i;
17446 rtx x;
17447
17448 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
17449 for (i = 0; i < MAX_VECT_LEN; ++i)
17450 XVECEXP (x, 0, i) = const0_rtx;
17451 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
17452 const0_rtx), x);
17453 x = gen_rtx_SET (const0_rtx, x);
17454 start_sequence ();
17455 vselect_insn = emit_insn (x);
17456 end_sequence ();
17457}
17458
17459/* Construct (set target (vec_select op0 (parallel perm))) and
17460 return true if that's a valid instruction in the active ISA. */
17461
17462static bool
17463expand_vselect (rtx target, rtx op0, const unsigned char *perm,
17464 unsigned nelt, bool testing_p)
17465{
17466 unsigned int i;
17467 rtx x, save_vconcat;
17468 int icode;
17469
17470 if (vselect_insn == NULL_RTX)
17471 init_vselect_insn ();
17472
17473 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
17474 PUT_NUM_ELEM (XVEC (x, 0), nelt);
17475 for (i = 0; i < nelt; ++i)
17476 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
17477 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
17478 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
17479 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
17480 SET_DEST (PATTERN (vselect_insn)) = target;
17481 icode = recog_memoized (vselect_insn);
17482
17483 if (icode >= 0 && !testing_p)
17484 emit_insn (copy_rtx (PATTERN (vselect_insn)));
17485
17486 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
17487 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
17488 INSN_CODE (vselect_insn) = -1;
17489
17490 return icode >= 0;
17491}
17492
17493/* Similar, but generate a vec_concat from op0 and op1 as well. */
17494
17495static bool
17496expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
17497 const unsigned char *perm, unsigned nelt,
17498 bool testing_p)
17499{
17500 machine_mode v2mode;
17501 rtx x;
17502 bool ok;
17503
17504 if (vselect_insn == NULL_RTX)
17505 init_vselect_insn ();
17506
17507 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
17508 return false;
17509 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
17510 PUT_MODE (x, v2mode);
17511 XEXP (x, 0) = op0;
17512 XEXP (x, 1) = op1;
17513 ok = expand_vselect (target, x, perm, nelt, testing_p);
17514 XEXP (x, 0) = const0_rtx;
17515 XEXP (x, 1) = const0_rtx;
17516 return ok;
17517}
17518
17519/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17520 using movss or movsd. */
17521static bool
17522expand_vec_perm_movs (struct expand_vec_perm_d *d)
17523{
17524 machine_mode vmode = d->vmode;
17525 unsigned i, nelt = d->nelt;
17526 rtx x;
17527
17528 if (d->one_operand_p)
17529 return false;
17530
17531 if (!(TARGET_SSE && vmode == V4SFmode)
 17532 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
17533 && !(TARGET_SSE2 && vmode == V2DFmode))
17534 return false;
17535
17536 /* Only the first element is changed. */
17537 if (d->perm[0] != nelt && d->perm[0] != 0)
17538 return false;
17539 for (i = 1; i < nelt; ++i)
17540 if (d->perm[i] != i + nelt - d->perm[0])
17541 return false;
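  /* I.e. the permutation must be { nelt, 1, 2, ... } or
     { 0, nelt + 1, nelt + 2, ... }.  For V4SFmode, { 4, 1, 2, 3 } takes
     element 0 from the second operand and everything else from the first,
     which is exactly what movss provides.  */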
17542
17543 if (d->testing_p)
17544 return true;
17545
17546 if (d->perm[0] == nelt)
17547 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
17548 else
17549 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
17550
17551 emit_insn (gen_rtx_SET (d->target, x));
17552
17553 return true;
17554}
17555
17556/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17557 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
17558
17559static bool
17560expand_vec_perm_blend (struct expand_vec_perm_d *d)
17561{
17562 machine_mode mmode, vmode = d->vmode;
17563 unsigned i, nelt = d->nelt;
17564 unsigned HOST_WIDE_INT mask;
17565 rtx target, op0, op1, maskop, x;
17566 rtx rperm[32], vperm;
17567
17568 if (d->one_operand_p)
17569 return false;
17570 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
17571 && (TARGET_AVX512BW
17572 || GET_MODE_UNIT_SIZE (vmode) >= 4))
17573 ;
17574 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17575 ;
17576 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17577 ;
 17578 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
17579 || GET_MODE_SIZE (vmode) == 8
17580 || GET_MODE_SIZE (vmode) == 4))
17581 ;
17582 else
17583 return false;
17584
17585 /* This is a blend, not a permute. Elements must stay in their
17586 respective lanes. */
17587 for (i = 0; i < nelt; ++i)
17588 {
17589 unsigned e = d->perm[i];
17590 if (!(e == i || e == i + nelt))
17591 return false;
17592 }
17593
17594 if (d->testing_p)
17595 return true;
17596
17597 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
17598 decision should be extracted elsewhere, so that we only try that
17599 sequence once all budget==3 options have been tried. */
17600 target = d->target;
17601 op0 = d->op0;
17602 op1 = d->op1;
17603 mask = 0;
17604
17605 switch (vmode)
17606 {
17607 case E_V8DFmode:
17608 case E_V16SFmode:
17609 case E_V4DFmode:
17610 case E_V8SFmode:
17611 case E_V2DFmode:
17612 case E_V4SFmode:
 17613 case E_V4HImode:
17614 case E_V8HImode:
17615 case E_V8SImode:
17616 case E_V32HImode:
17617 case E_V64QImode:
17618 case E_V16SImode:
17619 case E_V8DImode:
17620 for (i = 0; i < nelt; ++i)
 17621 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
17622 break;
17623
17624 case E_V2DImode:
17625 for (i = 0; i < 2; ++i)
17626 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
17627 vmode = V8HImode;
17628 goto do_subreg;
17629
17630 case E_V2SImode:
17631 for (i = 0; i < 2; ++i)
17632 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
17633 vmode = V4HImode;
17634 goto do_subreg;
17635
17636 case E_V4SImode:
17637 for (i = 0; i < 4; ++i)
17638 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17639 vmode = V8HImode;
17640 goto do_subreg;
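      /* Example: for a V4SImode blend { 0, 5, 6, 3 }, elements 1 and 2 come
	 from the second operand, so mask = (3 << 2) | (3 << 4) = 0x3c,
	 which becomes the pblendw immediate once the operands are recast
	 to V8HImode.  */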
17641
17642 case E_V16QImode:
17643 /* See if bytes move in pairs so we can use pblendw with
17644 an immediate argument, rather than pblendvb with a vector
17645 argument. */
17646 for (i = 0; i < 16; i += 2)
17647 if (d->perm[i] + 1 != d->perm[i + 1])
17648 {
17649 use_pblendvb:
17650 for (i = 0; i < nelt; ++i)
17651 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
17652
17653 finish_pblendvb:
17654 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17655 vperm = force_reg (vmode, vperm);
17656
17657 if (GET_MODE_SIZE (vmode) == 4)
17658 emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm));
17659 else if (GET_MODE_SIZE (vmode) == 8)
17660 emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
17661 else if (GET_MODE_SIZE (vmode) == 16)
17662 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
17663 else
17664 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
17665 if (target != d->target)
17666 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17667 return true;
17668 }
17669
17670 for (i = 0; i < 8; ++i)
17671 mask |= (d->perm[i * 2] >= 16) << i;
17672 vmode = V8HImode;
17673 /* FALLTHRU */
17674
17675 do_subreg:
17676 target = gen_reg_rtx (vmode);
17677 op0 = gen_lowpart (vmode, op0);
17678 op1 = gen_lowpart (vmode, op1);
17679 break;
17680
17681 case E_V8QImode:
17682 for (i = 0; i < 8; i += 2)
17683 if (d->perm[i] + 1 != d->perm[i + 1])
17684 goto use_pblendvb;
17685
17686 for (i = 0; i < 4; ++i)
17687 mask |= (d->perm[i * 2] >= 8) << i;
17688 vmode = V4HImode;
17689 goto do_subreg;
17690
17691 case E_V4QImode:
17692 for (i = 0; i < 4; i += 2)
17693 if (d->perm[i] + 1 != d->perm[i + 1])
17694 goto use_pblendvb;
17695
17696 for (i = 0; i < 2; ++i)
17697 mask |= (d->perm[i * 2] >= 4) << i;
17698 vmode = V2HImode;
17699 goto do_subreg;
17700
17701 case E_V32QImode:
17702 /* See if bytes move in pairs. If not, vpblendvb must be used. */
17703 for (i = 0; i < 32; i += 2)
17704 if (d->perm[i] + 1 != d->perm[i + 1])
17705 goto use_pblendvb;
17706 /* See if bytes move in quadruplets. If yes, vpblendd
17707 with immediate can be used. */
17708 for (i = 0; i < 32; i += 4)
17709 if (d->perm[i] + 2 != d->perm[i + 2])
17710 break;
17711 if (i < 32)
17712 {
17713 /* See if bytes move the same in both lanes. If yes,
17714 vpblendw with immediate can be used. */
17715 for (i = 0; i < 16; i += 2)
17716 if (d->perm[i] + 16 != d->perm[i + 16])
17717 goto use_pblendvb;
17718
17719 /* Use vpblendw. */
17720 for (i = 0; i < 16; ++i)
17721 mask |= (d->perm[i * 2] >= 32) << i;
17722 vmode = V16HImode;
17723 goto do_subreg;
17724 }
17725
17726 /* Use vpblendd. */
17727 for (i = 0; i < 8; ++i)
17728 mask |= (d->perm[i * 4] >= 32) << i;
17729 vmode = V8SImode;
17730 goto do_subreg;
17731
17732 case E_V16HImode:
17733 /* See if words move in pairs. If yes, vpblendd can be used. */
17734 for (i = 0; i < 16; i += 2)
17735 if (d->perm[i] + 1 != d->perm[i + 1])
17736 break;
17737 if (i < 16)
17738 {
17739 /* See if words move the same in both lanes. If not,
17740 vpblendvb must be used. */
17741 for (i = 0; i < 8; i++)
17742 if (d->perm[i] + 8 != d->perm[i + 8])
17743 {
17744 /* Use vpblendvb. */
17745 for (i = 0; i < 32; ++i)
17746 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
17747
17748 vmode = V32QImode;
17749 nelt = 32;
17750 target = gen_reg_rtx (vmode);
17751 op0 = gen_lowpart (vmode, op0);
17752 op1 = gen_lowpart (vmode, op1);
17753 goto finish_pblendvb;
17754 }
17755
17756 /* Use vpblendw. */
17757 for (i = 0; i < 16; ++i)
17758 mask |= (d->perm[i] >= 16) << i;
17759 break;
17760 }
17761
17762 /* Use vpblendd. */
17763 for (i = 0; i < 8; ++i)
17764 mask |= (d->perm[i * 2] >= 16) << i;
17765 vmode = V8SImode;
17766 goto do_subreg;
17767
17768 case E_V4DImode:
17769 /* Use vpblendd. */
17770 for (i = 0; i < 4; ++i)
17771 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17772 vmode = V8SImode;
17773 goto do_subreg;
17774
17775 default:
17776 gcc_unreachable ();
17777 }
17778
17779 switch (vmode)
17780 {
17781 case E_V8DFmode:
17782 case E_V8DImode:
17783 mmode = QImode;
17784 break;
17785 case E_V16SFmode:
17786 case E_V16SImode:
17787 mmode = HImode;
17788 break;
17789 case E_V32HImode:
17790 mmode = SImode;
17791 break;
17792 case E_V64QImode:
17793 mmode = DImode;
17794 break;
17795 default:
17796 mmode = VOIDmode;
17797 }
17798
17799 if (mmode != VOIDmode)
17800 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
17801 else
17802 maskop = GEN_INT (mask);
17803
17804 /* This matches five different patterns with the different modes. */
17805 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
17806 x = gen_rtx_SET (target, x);
17807 emit_insn (x);
17808 if (target != d->target)
17809 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17810
17811 return true;
17812}
17813
17814/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17815 in terms of the variable form of vpermilps.
17816
17817 Note that we will have already failed the immediate input vpermilps,
17818 which requires that the high and low part shuffle be identical; the
17819 variable form doesn't require that. */
17820
17821static bool
17822expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
17823{
17824 rtx rperm[8], vperm;
17825 unsigned i;
17826
17827 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
17828 return false;
17829
17830 /* We can only permute within the 128-bit lane. */
17831 for (i = 0; i < 8; ++i)
17832 {
17833 unsigned e = d->perm[i];
17834 if (i < 4 ? e >= 4 : e < 4)
17835 return false;
17836 }
17837
17838 if (d->testing_p)
17839 return true;
17840
17841 for (i = 0; i < 8; ++i)
17842 {
17843 unsigned e = d->perm[i];
17844
17845 /* Within each 128-bit lane, the elements of op0 are numbered
17846 from 0 and the elements of op1 are numbered from 4. */
17847 if (e >= 8 + 4)
17848 e -= 8;
17849 else if (e >= 4)
17850 e -= 4;
17851
17852 rperm[i] = GEN_INT (e);
17853 }
17854
17855 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
17856 vperm = force_reg (V8SImode, vperm);
17857 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
17858
17859 return true;
17860}
17861
17862/* For V*[QHS]Imode permutations, check if the same permutation
17863 can't be performed in a 2x, 4x or 8x wider inner mode. */
17864
17865static bool
17866canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
17867 struct expand_vec_perm_d *nd)
17868{
17869 int i;
17870 machine_mode mode = VOIDmode;
17871
17872 switch (d->vmode)
17873 {
17874 case E_V8QImode: mode = V4HImode; break;
17875 case E_V16QImode: mode = V8HImode; break;
17876 case E_V32QImode: mode = V16HImode; break;
17877 case E_V64QImode: mode = V32HImode; break;
17878 case E_V4HImode: mode = V2SImode; break;
17879 case E_V8HImode: mode = V4SImode; break;
17880 case E_V16HImode: mode = V8SImode; break;
17881 case E_V32HImode: mode = V16SImode; break;
17882 case E_V4SImode: mode = V2DImode; break;
17883 case E_V8SImode: mode = V4DImode; break;
17884 case E_V16SImode: mode = V8DImode; break;
17885 default: return false;
17886 }
17887 for (i = 0; i < d->nelt; i += 2)
17888 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
17889 return false;
17890 nd->vmode = mode;
17891 nd->nelt = d->nelt / 2;
17892 for (i = 0; i < nd->nelt; i++)
17893 nd->perm[i] = d->perm[2 * i] / 2;
17894 if (GET_MODE_INNER (mode) != DImode)
17895 canonicalize_vector_int_perm (nd, nd);
17896 if (nd != d)
17897 {
17898 nd->one_operand_p = d->one_operand_p;
17899 nd->testing_p = d->testing_p;
17900 if (d->op0 == d->op1)
17901 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
17902 else
17903 {
17904 nd->op0 = gen_lowpart (nd->vmode, d->op0);
17905 nd->op1 = gen_lowpart (nd->vmode, d->op1);
17906 }
17907 if (d->testing_p)
17908 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
17909 else
17910 nd->target = gen_reg_rtx (nd->vmode);
17911 }
17912 return true;
17913}
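/* For instance, a V16QImode permutation that moves whole aligned byte
   pairs, such as { 2, 3, 0, 1, ... }, is rewritten above as the V8HImode
   permutation { 1, 0, ... }, for which an existing single-insn pattern is
   more likely to match.  */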
17914
17915/* Return true if permutation D can be performed as VMODE permutation
17916 instead. */
17917
17918static bool
17919valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
17920{
17921 unsigned int i, j, chunk;
17922
17923 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
17924 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
17925 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
17926 return false;
17927
17928 if (GET_MODE_NUNITS (vmode) >= d->nelt)
17929 return true;
17930
17931 chunk = d->nelt / GET_MODE_NUNITS (vmode);
17932 for (i = 0; i < d->nelt; i += chunk)
17933 if (d->perm[i] & (chunk - 1))
17934 return false;
17935 else
17936 for (j = 1; j < chunk; ++j)
17937 if (d->perm[i] + j != d->perm[i + j])
17938 return false;
17939
17940 return true;
17941}
17942
17943/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17944 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17945
17946static bool
17947expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
17948{
17949 unsigned i, nelt, eltsz, mask;
17950 unsigned char perm[64];
17951 machine_mode vmode = V16QImode;
 17952 struct expand_vec_perm_d nd;
17953 rtx rperm[64], vperm, target, op0, op1;
17954
17955 nelt = d->nelt;
17956
17957 if (!d->one_operand_p)
17958 switch (GET_MODE_SIZE (d->vmode))
17959 {
17960 case 4:
17961 if (!TARGET_XOP)
17962 return false;
17963 vmode = V4QImode;
17964 break;
 17965
17966 case 8:
17967 if (!TARGET_XOP)
17968 return false;
17969 vmode = V8QImode;
17970 break;
 17971
17972 case 16:
17973 if (!TARGET_XOP)
 17974 return false;
17975 break;
17976
17977 case 32:
17978 if (!TARGET_AVX2)
17979 return false;
17980
17981 if (valid_perm_using_mode_p (V2TImode, d))
17982 {
17983 if (d->testing_p)
17984 return true;
17985
17986 /* Use vperm2i128 insn. The pattern uses
17987 V4DImode instead of V2TImode. */
17988 target = d->target;
17989 if (d->vmode != V4DImode)
17990 target = gen_reg_rtx (V4DImode);
17991 op0 = gen_lowpart (V4DImode, d->op0);
17992 op1 = gen_lowpart (V4DImode, d->op1);
17993 rperm[0]
17994 = GEN_INT ((d->perm[0] / (nelt / 2))
17995 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
17996 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
17997 if (target != d->target)
17998 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17999 return true;
18000 }
18001 /* FALLTHRU */
18002
18003 default:
 18004 return false;
 18005 }
 18006 else
18007 switch (GET_MODE_SIZE (d->vmode))
18008 {
18009 case 4:
18010 if (!TARGET_SSSE3)
18011 return false;
18012 vmode = V4QImode;
18013 break;
 18014
18015 case 8:
18016 if (!TARGET_SSSE3)
18017 return false;
18018 vmode = V8QImode;
18019 break;
 18020
18021 case 16:
18022 if (!TARGET_SSSE3)
18023 return false;
18024 break;
18025
18026 case 32:
18027 if (!TARGET_AVX2)
18028 return false;
18029
18030 /* V4DImode should be already handled through
18031 expand_vselect by vpermq instruction. */
18032 gcc_assert (d->vmode != V4DImode);
18033
18034 vmode = V32QImode;
18035 if (d->vmode == V8SImode
18036 || d->vmode == V16HImode
18037 || d->vmode == V32QImode)
18038 {
18039 /* First see if vpermq can be used for
18040 V8SImode/V16HImode/V32QImode. */
18041 if (valid_perm_using_mode_p (V4DImode, d))
18042 {
18043 for (i = 0; i < 4; i++)
18044 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
18045 if (d->testing_p)
18046 return true;
18047 target = gen_reg_rtx (V4DImode);
18048 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
18049 perm, 4, false))
18050 {
18051 emit_move_insn (d->target,
18052 gen_lowpart (d->vmode, target));
 18053 return true;
18054 }
18055 return false;
18056 }
 18057
18058 /* Next see if vpermd can be used. */
18059 if (valid_perm_using_mode_p (V8SImode, d))
18060 vmode = V8SImode;
18061 }
18062 /* Or if vpermps can be used. */
18063 else if (d->vmode == V8SFmode)
18064 vmode = V8SImode;
 18065
18066 if (vmode == V32QImode)
18067 {
18068 /* vpshufb only works intra lanes, it is not
18069 possible to shuffle bytes in between the lanes. */
18070 for (i = 0; i < nelt; ++i)
18071 if ((d->perm[i] ^ i) & (nelt / 2))
18072 return false;
18073 }
18074 break;
 18075
18076 case 64:
18077 if (!TARGET_AVX512BW)
18078 return false;
 18079
18080 /* If vpermq didn't work, vpshufb won't work either. */
18081 if (d->vmode == V8DFmode || d->vmode == V8DImode)
18082 return false;
18083
18084 vmode = V64QImode;
18085 if (d->vmode == V16SImode
18086 || d->vmode == V32HImode
18087 || d->vmode == V64QImode)
18088 {
18089 /* First see if vpermq can be used for
18090 V16SImode/V32HImode/V64QImode. */
18091 if (valid_perm_using_mode_p (V8DImode, d))
18092 {
18093 for (i = 0; i < 8; i++)
18094 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
18095 if (d->testing_p)
18096 return true;
18097 target = gen_reg_rtx (V8DImode);
18098 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
18099 perm, 8, false))
18100 {
18101 emit_move_insn (d->target,
18102 gen_lowpart (d->vmode, target));
 18103 return true;
18104 }
18105 return false;
18106 }
 18107
18108 /* Next see if vpermd can be used. */
18109 if (valid_perm_using_mode_p (V16SImode, d))
18110 vmode = V16SImode;
18111 }
18112 /* Or if vpermps can be used. */
18113 else if (d->vmode == V16SFmode)
18114 vmode = V16SImode;
18115 if (vmode == V64QImode)
18116 {
18117 /* vpshufb only works intra lanes, it is not
18118 possible to shuffle bytes in between the lanes. */
18119 for (i = 0; i < nelt; ++i)
18120 if ((d->perm[i] ^ i) & (3 * nelt / 4))
18121 return false;
18122 }
18123 break;
18124
18125 default:
 18126 return false;
 18127 }
18128
18129 if (d->testing_p)
18130 return true;
18131
18132 /* Try to avoid variable permutation instruction. */
18133 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
18134 {
18135 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
18136 return true;
18137 }
18138
18139 if (vmode == V8SImode)
18140 for (i = 0; i < 8; ++i)
18141 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
18142 else if (vmode == V16SImode)
18143 for (i = 0; i < 16; ++i)
18144 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
18145 else
18146 {
18147 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18148 if (!d->one_operand_p)
18149 mask = 2 * nelt - 1;
18150 else if (vmode == V64QImode)
18151 mask = nelt / 4 - 1;
 18152 else if (vmode == V32QImode)
 18153 mask = nelt / 2 - 1;
18154 else
18155 mask = nelt - 1;
18156
18157 for (i = 0; i < nelt; ++i)
18158 {
18159 unsigned j, e = d->perm[i] & mask;
18160 for (j = 0; j < eltsz; ++j)
18161 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
18162 }
18163 }
18164
18165 machine_mode vpmode = vmode;
18166
18167 if (vmode == V4QImode
18168 || vmode == V8QImode)
 18169 {
18170 rtx m128 = GEN_INT (-128);
18171
 18172 /* Remap elements from the second operand, as we have to
 18173 account for inactive top elements from the first operand. */
 18174 if (!d->one_operand_p)
18175 {
18176 int sz = GET_MODE_SIZE (vmode);
18177
18178 for (i = 0; i < nelt; ++i)
18179 {
18180 int ival = INTVAL (rperm[i]);
18181 if (ival >= sz)
18182 ival += 16-sz;
18183 rperm[i] = GEN_INT (ival);
18184 }
18185 }
 18186
18187 /* V4QI/V8QI is emulated with V16QI instruction, fill inactive
18188 elements in the top positions with zeros. */
 18189 for (i = nelt; i < 16; ++i)
 18190 rperm[i] = m128;
 18191
18192 vpmode = V16QImode;
18193 }
18194
18195 vperm = gen_rtx_CONST_VECTOR (vpmode,
18196 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
18197 vperm = force_reg (vpmode, vperm);
 18198
18199 if (vmode == d->vmode)
18200 target = d->target;
18201 else
 18202 target = gen_reg_rtx (vmode);
 18203
 18204 op0 = gen_lowpart (vmode, d->op0);
 18205
18206 if (d->one_operand_p)
18207 {
18208 rtx (*gen) (rtx, rtx, rtx);
18209
18210 if (vmode == V4QImode)
18211 gen = gen_mmx_pshufbv4qi3;
18212 else if (vmode == V8QImode)
 18213 gen = gen_mmx_pshufbv8qi3;
 18214 else if (vmode == V16QImode)
 18215 gen = gen_ssse3_pshufbv16qi3;
 18216 else if (vmode == V32QImode)
 18217 gen = gen_avx2_pshufbv32qi3;
 18218 else if (vmode == V64QImode)
 18219 gen = gen_avx512bw_pshufbv64qi3;
 18220 else if (vmode == V8SFmode)
 18221 gen = gen_avx2_permvarv8sf;
 18222 else if (vmode == V8SImode)
 18223 gen = gen_avx2_permvarv8si;
 18224 else if (vmode == V16SFmode)
 18225 gen = gen_avx512f_permvarv16sf;
 18226 else if (vmode == V16SImode)
 18227 gen = gen_avx512f_permvarv16si;
18228 else
18229 gcc_unreachable ();
18230
18231 emit_insn (gen (target, op0, vperm));
18232 }
18233 else
18234 {
18235 rtx (*gen) (rtx, rtx, rtx, rtx);
18236
 18237 op1 = gen_lowpart (vmode, d->op1);
 18238
18239 if (vmode == V4QImode)
18240 gen = gen_mmx_ppermv32;
18241 else if (vmode == V8QImode)
18242 gen = gen_mmx_ppermv64;
18243 else if (vmode == V16QImode)
18244 gen = gen_xop_pperm;
18245 else
18246 gcc_unreachable ();
18247
18248 emit_insn (gen (target, op0, op1, vperm));
 18249 }
 18250
18251 if (target != d->target)
18252 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18253
18254 return true;
18255}
18256
18257/* Try to expand one-operand permutation with constant mask. */
18258
18259static bool
18260ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
18261{
18262 machine_mode mode = GET_MODE (d->op0);
18263 machine_mode maskmode = mode;
 18264 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
18265 rtx (*gen) (rtx, rtx, rtx) = NULL;
18266 rtx target, op0, mask;
18267 rtx vec[64];
18268
18269 if (!rtx_equal_p (d->op0, d->op1))
18270 return false;
18271
18272 if (!TARGET_AVX512F)
18273 return false;
18274
 18275 /* Accept VNxHImode and VNxQImode now. */
18276 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
18277 return false;
18278
18279 /* vpermw. */
18280 if (!TARGET_AVX512BW && inner_size == 2)
18281 return false;
18282
18283 /* vpermb. */
18284 if (!TARGET_AVX512VBMI && inner_size == 1)
18285 return false;
18286
18287 switch (mode)
18288 {
18289 case E_V16SImode:
18290 gen = gen_avx512f_permvarv16si;
18291 break;
18292 case E_V16SFmode:
18293 gen = gen_avx512f_permvarv16sf;
18294 maskmode = V16SImode;
18295 break;
18296 case E_V8DImode:
18297 gen = gen_avx512f_permvarv8di;
18298 break;
18299 case E_V8DFmode:
18300 gen = gen_avx512f_permvarv8df;
18301 maskmode = V8DImode;
18302 break;
 18303 case E_V32HImode:
18304 gen = gen_avx512bw_permvarv32hi;
18305 break;
18306 case E_V16HImode:
18307 gen = gen_avx512vl_permvarv16hi;
18308 break;
18309 case E_V8HImode:
18310 gen = gen_avx512vl_permvarv8hi;
18311 break;
18312 case E_V64QImode:
18313 gen = gen_avx512bw_permvarv64qi;
18314 break;
18315 case E_V32QImode:
18316 gen = gen_avx512vl_permvarv32qi;
18317 break;
18318 case E_V16QImode:
18319 gen = gen_avx512vl_permvarv16qi;
18320 break;
18321
18322 default:
18323 return false;
18324 }
18325
18326 if (d->testing_p)
18327 return true;
18328
18329 target = d->target;
18330 op0 = d->op0;
18331 for (int i = 0; i < d->nelt; ++i)
18332 vec[i] = GEN_INT (d->perm[i]);
18333 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
18334 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
18335 return true;
18336}
18337
18338static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
18339
18340/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
18341 in a single instruction. */
18342
18343static bool
18344expand_vec_perm_1 (struct expand_vec_perm_d *d)
18345{
18346 unsigned i, nelt = d->nelt;
18347 struct expand_vec_perm_d nd;
18348
18349 /* Check plain VEC_SELECT first, because AVX has instructions that could
18350 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
18351 input where SEL+CONCAT may not. */
18352 if (d->one_operand_p)
18353 {
18354 int mask = nelt - 1;
18355 bool identity_perm = true;
18356 bool broadcast_perm = true;
18357
18358 for (i = 0; i < nelt; i++)
18359 {
18360 nd.perm[i] = d->perm[i] & mask;
18361 if (nd.perm[i] != i)
18362 identity_perm = false;
18363 if (nd.perm[i])
18364 broadcast_perm = false;
18365 }
18366
18367 if (identity_perm)
18368 {
18369 if (!d->testing_p)
18370 emit_move_insn (d->target, d->op0);
18371 return true;
18372 }
18373 else if (broadcast_perm && TARGET_AVX2)
18374 {
18375 /* Use vpbroadcast{b,w,d}. */
18376 rtx (*gen) (rtx, rtx) = NULL;
18377 switch (d->vmode)
18378 {
18379 case E_V64QImode:
18380 if (TARGET_AVX512BW)
18381 gen = gen_avx512bw_vec_dupv64qi_1;
18382 break;
18383 case E_V32QImode:
18384 gen = gen_avx2_pbroadcastv32qi_1;
18385 break;
18386 case E_V32HImode:
18387 if (TARGET_AVX512BW)
18388 gen = gen_avx512bw_vec_dupv32hi_1;
18389 break;
18390 case E_V16HImode:
18391 gen = gen_avx2_pbroadcastv16hi_1;
18392 break;
18393 case E_V16SImode:
18394 if (TARGET_AVX512F)
18395 gen = gen_avx512f_vec_dupv16si_1;
18396 break;
18397 case E_V8SImode:
18398 gen = gen_avx2_pbroadcastv8si_1;
18399 break;
18400 case E_V16QImode:
18401 gen = gen_avx2_pbroadcastv16qi;
18402 break;
18403 case E_V8HImode:
18404 gen = gen_avx2_pbroadcastv8hi;
18405 break;
18406 case E_V16SFmode:
18407 if (TARGET_AVX512F)
18408 gen = gen_avx512f_vec_dupv16sf_1;
18409 break;
18410 case E_V8SFmode:
18411 gen = gen_avx2_vec_dupv8sf_1;
18412 break;
18413 case E_V8DFmode:
18414 if (TARGET_AVX512F)
18415 gen = gen_avx512f_vec_dupv8df_1;
18416 break;
18417 case E_V8DImode:
18418 if (TARGET_AVX512F)
18419 gen = gen_avx512f_vec_dupv8di_1;
18420 break;
18421 /* For other modes prefer other shuffles this function creates. */
18422 default: break;
18423 }
18424 if (gen != NULL)
18425 {
18426 if (!d->testing_p)
18427 emit_insn (gen (d->target, d->op0));
18428 return true;
18429 }
18430 }
18431
18432 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
18433 return true;
18434
18435 /* There are plenty of patterns in sse.md that are written for
18436 SEL+CONCAT and are not replicated for a single op. Perhaps
18437 that should be changed, to avoid the nastiness here. */
18438
18439 /* Recognize interleave style patterns, which means incrementing
18440 every other permutation operand. */
18441 for (i = 0; i < nelt; i += 2)
18442 {
18443 nd.perm[i] = d->perm[i] & mask;
18444 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
18445 }
18446 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
18447 d->testing_p))
18448 return true;
18449
18450 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
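 /* E.g. for V4SFmode the one-operand permutation {2, 0, 2, 0} becomes
    {2, 0, 6, 4}, which matches a shufps of op0 with itself. */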
18451 if (nelt >= 4)
18452 {
18453 for (i = 0; i < nelt; i += 4)
18454 {
18455 nd.perm[i + 0] = d->perm[i + 0] & mask;
18456 nd.perm[i + 1] = d->perm[i + 1] & mask;
18457 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
18458 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
18459 }
18460
18461 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
18462 d->testing_p))
18463 return true;
18464 }
18465 }
18466
18467 /* Try movss/movsd instructions. */
18468 if (expand_vec_perm_movs (d))
18469 return true;
18470
18471 /* Finally, try the fully general two operand permute. */
18472 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
18473 d->testing_p))
18474 return true;
18475
18476 /* Recognize interleave style patterns with reversed operands. */
18477 if (!d->one_operand_p)
18478 {
18479 for (i = 0; i < nelt; ++i)
18480 {
18481 unsigned e = d->perm[i];
18482 if (e >= nelt)
18483 e -= nelt;
18484 else
18485 e += nelt;
18486 nd.perm[i] = e;
18487 }
18488
18489 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
18490 d->testing_p))
18491 return true;
18492 }
18493
18494 /* Try the SSE4.1 blend variable merge instructions. */
18495 if (expand_vec_perm_blend (d))
18496 return true;
18497
18498 /* Try one of the AVX vpermil variable permutations. */
18499 if (expand_vec_perm_vpermil (d))
18500 return true;
18501
18502 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
18503 vpshufb, vpermd, vpermps or vpermq variable permutation. */
18504 if (expand_vec_perm_pshufb (d))
18505 return true;
18506
18507 /* Try the AVX2 vpalignr instruction. */
18508 if (expand_vec_perm_palignr (d, true))
18509 return true;
18510
18511 /* Try the AVX512F vperm{w,b,s,d} instructions */
18512 if (ix86_expand_vec_one_operand_perm_avx512 (d))
18513 return true;
18514
18515 /* Try the AVX512F vpermt2/vpermi2 instructions. */
18516 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
18517 return true;
18518
18519 /* See if we can get the same permutation in different vector integer
18520 mode. */
18521 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
18522 {
18523 if (!d->testing_p)
18524 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
18525 return true;
18526 }
18527 return false;
18528}
18529
18530/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18531 in terms of a pair of pshuflw + pshufhw instructions. */
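/* E.g. the V8HImode permutation {3, 2, 1, 0, 7, 6, 5, 4} is split into
   pshuflw with {3, 2, 1, 0, 4, 5, 6, 7} followed by pshufhw with
   {0, 1, 2, 3, 7, 6, 5, 4} on the intermediate result. */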
18532
18533static bool
18534expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
18535{
18536 unsigned char perm2[MAX_VECT_LEN];
18537 unsigned i;
18538 bool ok;
18539
18540 if (d->vmode != V8HImode || !d->one_operand_p)
18541 return false;
18542
18543 /* The two permutations only operate in 64-bit lanes. */
18544 for (i = 0; i < 4; ++i)
18545 if (d->perm[i] >= 4)
18546 return false;
18547 for (i = 4; i < 8; ++i)
18548 if (d->perm[i] < 4)
18549 return false;
18550
18551 if (d->testing_p)
18552 return true;
18553
18554 /* Emit the pshuflw. */
18555 memcpy (perm2, d->perm, 4);
18556 for (i = 4; i < 8; ++i)
18557 perm2[i] = i;
18558 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
18559 gcc_assert (ok);
18560
18561 /* Emit the pshufhw. */
18562 memcpy (perm2 + 4, d->perm + 4, 4);
18563 for (i = 0; i < 4; ++i)
18564 perm2[i] = i;
18565 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
18566 gcc_assert (ok);
18567
18568 return true;
18569}
18570
18571/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18572 the permutation using the SSSE3 palignr instruction. This succeeds
18573 when all of the elements in PERM fit within one vector and we merely
18574 need to shift them down so that a single vector permutation has a
18575 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
18576 the vpalignr instruction itself can perform the requested permutation. */
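/* E.g. for V16QImode the permutation {8, 9, ..., 23} has min == 8, and
   shifting the op1:op0 concatenation down by 8 bytes already leaves the
   elements in order, so the palignr alone implements it. */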
18577
18578static bool
18579expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
18580{
18581 unsigned i, nelt = d->nelt;
18582 unsigned min, max, minswap, maxswap;
18583 bool in_order, ok, swap = false;
18584 rtx shift, target;
18585 struct expand_vec_perm_d dcopy;
18586
18587 /* Even with AVX, palignr only operates on 128-bit vectors,
18588 in AVX2 palignr operates on both 128-bit lanes. */
18589 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18590 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
18591 return false;
18592
18593 min = 2 * nelt;
18594 max = 0;
18595 minswap = 2 * nelt;
18596 maxswap = 0;
18597 for (i = 0; i < nelt; ++i)
18598 {
18599 unsigned e = d->perm[i];
18600 unsigned eswap = d->perm[i] ^ nelt;
18601 if (GET_MODE_SIZE (d->vmode) == 32)
18602 {
18603 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
18604 eswap = e ^ (nelt / 2);
18605 }
18606 if (e < min)
18607 min = e;
18608 if (e > max)
18609 max = e;
18610 if (eswap < minswap)
18611 minswap = eswap;
18612 if (eswap > maxswap)
18613 maxswap = eswap;
18614 }
18615 if (min == 0
18616 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
18617 {
18618 if (d->one_operand_p
18619 || minswap == 0
18620 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
18621 ? nelt / 2 : nelt))
18622 return false;
18623 swap = true;
18624 min = minswap;
18625 max = maxswap;
18626 }
18627
18628 /* Given that we have SSSE3, we know we'll be able to implement the
18629 single operand permutation after the palignr with pshufb for
18630 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
18631 first. */
18632 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
18633 return true;
18634
18635 dcopy = *d;
18636 if (swap)
18637 {
18638 dcopy.op0 = d->op1;
18639 dcopy.op1 = d->op0;
18640 for (i = 0; i < nelt; ++i)
18641 dcopy.perm[i] ^= nelt;
18642 }
18643
18644 in_order = true;
18645 for (i = 0; i < nelt; ++i)
18646 {
18647 unsigned e = dcopy.perm[i];
18648 if (GET_MODE_SIZE (d->vmode) == 32
18649 && e >= nelt
18650 && (e & (nelt / 2 - 1)) < min)
18651 e = e - min - (nelt / 2);
18652 else
18653 e = e - min;
18654 if (e != i)
18655 in_order = false;
18656 dcopy.perm[i] = e;
18657 }
18658 dcopy.one_operand_p = true;
18659
18660 if (single_insn_only_p && !in_order)
18661 return false;
18662
18663 /* For AVX2, test whether we can permute the result in one instruction. */
18664 if (d->testing_p)
18665 {
18666 if (in_order)
18667 return true;
18668 dcopy.op1 = dcopy.op0;
18669 return expand_vec_perm_1 (&dcopy);
18670 }
18671
18672 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
18673 if (GET_MODE_SIZE (d->vmode) == 16)
18674 {
18675 target = gen_reg_rtx (TImode);
18676 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
18677 gen_lowpart (TImode, dcopy.op0), shift));
18678 }
18679 else
18680 {
18681 target = gen_reg_rtx (V2TImode);
18682 emit_insn (gen_avx2_palignrv2ti (target,
18683 gen_lowpart (V2TImode, dcopy.op1),
18684 gen_lowpart (V2TImode, dcopy.op0),
18685 shift));
18686 }
18687
18688 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
18689
18690 /* Test for the degenerate case where the alignment by itself
18691 produces the desired permutation. */
18692 if (in_order)
18693 {
18694 emit_move_insn (d->target, dcopy.op0);
18695 return true;
18696 }
18697
18698 ok = expand_vec_perm_1 (&dcopy);
18699 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
18700
18701 return ok;
18702}
18703
18704/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18705 the permutation using the SSE4_1 pblendv instruction. Potentially
18706 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
18707
18708static bool
18709expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
18710{
18711 unsigned i, which, nelt = d->nelt;
18712 struct expand_vec_perm_d dcopy, dcopy1;
18713 machine_mode vmode = d->vmode;
18714 bool ok;
18715
18716 /* Use the same checks as in expand_vec_perm_blend. */
18717 if (d->one_operand_p)
18718 return false;
18719 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18720 ;
18721 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18722 ;
18723 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
18724 || GET_MODE_SIZE (vmode) == 8
18725 || GET_MODE_SIZE (vmode) == 16))
18726 ;
18727 else
18728 return false;
18729
18730 /* Figure out which permutation elements do not stay in their
18731 respective lanes. */
18732 for (i = 0, which = 0; i < nelt; ++i)
18733 {
18734 unsigned e = d->perm[i];
18735 if (e != i)
18736 which |= (e < nelt ? 1 : 2);
18737 }
18738 /* We can pblend the part where elements do not stay in their
18739 respective lanes only when these elements all come from one
18740 half of the permutation.
18741 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
18742 lanes, but both 8 and 9 are >= 8.
18743 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
18744 respective lanes, and 8 >= 8 but 2 is not. */
18745 if (which != 1 && which != 2)
18746 return false;
18747 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
18748 return true;
18749
18750 /* First we apply a one-operand permutation to the part whose
18751 elements do not stay in their respective lanes. */
18752 dcopy = *d;
18753 if (which == 2)
18754 dcopy.op0 = dcopy.op1 = d->op1;
18755 else
18756 dcopy.op0 = dcopy.op1 = d->op0;
18757 if (!d->testing_p)
18758 dcopy.target = gen_reg_rtx (vmode);
18759 dcopy.one_operand_p = true;
18760
18761 for (i = 0; i < nelt; ++i)
18762 dcopy.perm[i] = d->perm[i] & (nelt - 1);
18763
18764 ok = expand_vec_perm_1 (&dcopy);
18765 if (GET_MODE_SIZE (vmode) != 16 && !ok)
18766 return false;
18767 else
18768 gcc_assert (ok);
18769 if (d->testing_p)
18770 return true;
18771
18772 /* Next we put permuted elements into their positions. */
18773 dcopy1 = *d;
18774 if (which == 2)
18775 dcopy1.op1 = dcopy.target;
18776 else
18777 dcopy1.op0 = dcopy.target;
18778
18779 for (i = 0; i < nelt; ++i)
18780 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
18781
18782 ok = expand_vec_perm_blend (&dcopy1);
18783 gcc_assert (ok);
18784
18785 return true;
18786}
18787
18788static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
18789
18790/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18791 a two vector permutation into a single vector permutation by using
18792 an interleave operation to merge the vectors. */
18793
18794static bool
18795expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
18796{
18797 struct expand_vec_perm_d dremap, dfinal;
18798 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
18799 unsigned HOST_WIDE_INT contents;
18800 unsigned char remap[2 * MAX_VECT_LEN];
18801 rtx_insn *seq;
18802 bool ok, same_halves = false;
18803
18804 if (GET_MODE_SIZE (d->vmode) == 4
18805 || GET_MODE_SIZE (d->vmode) == 8
18806 || GET_MODE_SIZE (d->vmode) == 16)
18807 {
18808 if (d->one_operand_p)
18809 return false;
18810 }
18811 else if (GET_MODE_SIZE (d->vmode) == 32)
18812 {
18813 if (!TARGET_AVX)
18814 return false;
18815 /* For 32-byte modes allow even d->one_operand_p.
18816 The lack of cross-lane shuffling in some instructions
18817 might prevent a single insn shuffle. */
18818 dfinal = *d;
18819 dfinal.testing_p = true;
18820 /* If expand_vec_perm_interleave3 can expand this into
18821 a 3 insn sequence, give up and let it be expanded as
18822 a 3 insn sequence. While that is one insn longer,
18823 it doesn't need a memory operand, and in the common
18824 case where both the interleave low and high permutations
18825 with the same operands are adjacent, only 4 insns are
18826 needed for both after CSE. */
18827 if (expand_vec_perm_interleave3 (&dfinal))
18828 return false;
18829 }
18830 else
18831 return false;
18832
18833 /* Examine from whence the elements come. */
18834 contents = 0;
18835 for (i = 0; i < nelt; ++i)
18836 contents |= HOST_WIDE_INT_1U << d->perm[i];
18837
18838 memset (remap, 0xff, sizeof (remap));
18839 dremap = *d;
18840
18841 if (GET_MODE_SIZE (d->vmode) == 4
18842 || GET_MODE_SIZE (d->vmode) == 8)
18843 {
18844 unsigned HOST_WIDE_INT h1, h2, h3, h4;
18845
18846 /* Split the two input vectors into 4 halves. */
18847 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
18848 h2 = h1 << nelt2;
18849 h3 = h2 << nelt2;
18850 h4 = h3 << nelt2;
18851
18852 /* If the elements come from the low halves, use interleave low,
18853 and similarly for interleave high. */
18854 if ((contents & (h1 | h3)) == contents)
18855 {
18856 /* punpckl* */
18857 for (i = 0; i < nelt2; ++i)
18858 {
18859 remap[i] = i * 2;
18860 remap[i + nelt] = i * 2 + 1;
18861 dremap.perm[i * 2] = i;
18862 dremap.perm[i * 2 + 1] = i + nelt;
18863 }
18864 }
18865 else if ((contents & (h2 | h4)) == contents)
18866 {
18867 /* punpckh* */
18868 for (i = 0; i < nelt2; ++i)
18869 {
18870 remap[i + nelt2] = i * 2;
18871 remap[i + nelt + nelt2] = i * 2 + 1;
18872 dremap.perm[i * 2] = i + nelt2;
18873 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18874 }
18875 }
18876 else
18877 return false;
18878 }
18879 else if (GET_MODE_SIZE (d->vmode) == 16)
18880 {
18881 unsigned HOST_WIDE_INT h1, h2, h3, h4;
18882
18883 /* Split the two input vectors into 4 halves. */
18884 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
18885 h2 = h1 << nelt2;
18886 h3 = h2 << nelt2;
18887 h4 = h3 << nelt2;
18888
18889 /* If the elements come from the low halves, use interleave low, and
18890 similarly for interleave high. If the elements are from mis-matched
18891 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
18892 if ((contents & (h1 | h3)) == contents)
18893 {
18894 /* punpckl* */
18895 for (i = 0; i < nelt2; ++i)
18896 {
18897 remap[i] = i * 2;
18898 remap[i + nelt] = i * 2 + 1;
18899 dremap.perm[i * 2] = i;
18900 dremap.perm[i * 2 + 1] = i + nelt;
18901 }
18902 if (!TARGET_SSE2 && d->vmode == V4SImode)
18903 dremap.vmode = V4SFmode;
18904 }
18905 else if ((contents & (h2 | h4)) == contents)
18906 {
18907 /* punpckh* */
18908 for (i = 0; i < nelt2; ++i)
18909 {
18910 remap[i + nelt2] = i * 2;
18911 remap[i + nelt + nelt2] = i * 2 + 1;
18912 dremap.perm[i * 2] = i + nelt2;
18913 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18914 }
18915 if (!TARGET_SSE2 && d->vmode == V4SImode)
18916 dremap.vmode = V4SFmode;
18917 }
18918 else if ((contents & (h1 | h4)) == contents)
18919 {
18920 /* shufps */
18921 for (i = 0; i < nelt2; ++i)
18922 {
18923 remap[i] = i;
18924 remap[i + nelt + nelt2] = i + nelt2;
18925 dremap.perm[i] = i;
18926 dremap.perm[i + nelt2] = i + nelt + nelt2;
18927 }
18928 if (nelt != 4)
18929 {
18930 /* shufpd */
18931 dremap.vmode = V2DImode;
18932 dremap.nelt = 2;
18933 dremap.perm[0] = 0;
18934 dremap.perm[1] = 3;
18935 }
18936 }
18937 else if ((contents & (h2 | h3)) == contents)
18938 {
18939 /* shufps */
18940 for (i = 0; i < nelt2; ++i)
18941 {
18942 remap[i + nelt2] = i;
18943 remap[i + nelt] = i + nelt2;
18944 dremap.perm[i] = i + nelt2;
18945 dremap.perm[i + nelt2] = i + nelt;
18946 }
18947 if (nelt != 4)
18948 {
18949 /* shufpd */
18950 dremap.vmode = V2DImode;
18951 dremap.nelt = 2;
18952 dremap.perm[0] = 1;
18953 dremap.perm[1] = 2;
18954 }
18955 }
18956 else
18957 return false;
18958 }
18959 else
18960 {
18961 unsigned int nelt4 = nelt / 4, nzcnt = 0;
18962 unsigned HOST_WIDE_INT q[8];
18963 unsigned int nonzero_halves[4];
18964
18965 /* Split the two input vectors into 8 quarters. */
18966 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
18967 for (i = 1; i < 8; ++i)
18968 q[i] = q[0] << (nelt4 * i);
18969 for (i = 0; i < 4; ++i)
18970 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
18971 {
18972 nonzero_halves[nzcnt] = i;
18973 ++nzcnt;
18974 }
18975
18976 if (nzcnt == 1)
18977 {
18978 gcc_assert (d->one_operand_p);
18979 nonzero_halves[1] = nonzero_halves[0];
18980 same_halves = true;
18981 }
18982 else if (d->one_operand_p)
18983 {
18984 gcc_assert (nonzero_halves[0] == 0);
18985 gcc_assert (nonzero_halves[1] == 1);
18986 }
18987
18988 if (nzcnt <= 2)
18989 {
18990 if (d->perm[0] / nelt2 == nonzero_halves[1])
18991 {
18992 /* Attempt to increase the likelihood that dfinal
18993 shuffle will be intra-lane. */
18994 std::swap (nonzero_halves[0], nonzero_halves[1]);
18995 }
18996
18997 /* vperm2f128 or vperm2i128. */
18998 for (i = 0; i < nelt2; ++i)
18999 {
19000 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
19001 remap[i + nonzero_halves[0] * nelt2] = i;
19002 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
19003 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
19004 }
19005
19006 if (d->vmode != V8SFmode
19007 && d->vmode != V4DFmode
19008 && d->vmode != V8SImode)
19009 {
19010 dremap.vmode = V8SImode;
19011 dremap.nelt = 8;
19012 for (i = 0; i < 4; ++i)
19013 {
19014 dremap.perm[i] = i + nonzero_halves[0] * 4;
19015 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
19016 }
19017 }
19018 }
19019 else if (d->one_operand_p)
19020 return false;
19021 else if (TARGET_AVX2
19022 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
19023 {
19024 /* vpunpckl* */
19025 for (i = 0; i < nelt4; ++i)
19026 {
19027 remap[i] = i * 2;
19028 remap[i + nelt] = i * 2 + 1;
19029 remap[i + nelt2] = i * 2 + nelt2;
19030 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
19031 dremap.perm[i * 2] = i;
19032 dremap.perm[i * 2 + 1] = i + nelt;
19033 dremap.perm[i * 2 + nelt2] = i + nelt2;
19034 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
19035 }
19036 }
19037 else if (TARGET_AVX2
19038 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
19039 {
19040 /* vpunpckh* */
19041 for (i = 0; i < nelt4; ++i)
19042 {
19043 remap[i + nelt4] = i * 2;
19044 remap[i + nelt + nelt4] = i * 2 + 1;
19045 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
19046 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
19047 dremap.perm[i * 2] = i + nelt4;
19048 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
19049 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
19050 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
19051 }
19052 }
19053 else
19054 return false;
19055 }
19056
19057 /* Use the remapping array set up above to move the elements from their
19058 swizzled locations into their final destinations. */
19059 dfinal = *d;
19060 for (i = 0; i < nelt; ++i)
19061 {
19062 unsigned e = remap[d->perm[i]];
19063 gcc_assert (e < nelt);
19064 /* If same_halves is true, both halves of the remapped vector are the
19065 same. Avoid cross-lane accesses if possible. */
19066 if (same_halves && i >= nelt2)
19067 {
19068 gcc_assert (e < nelt2);
19069 dfinal.perm[i] = e + nelt2;
19070 }
19071 else
19072 dfinal.perm[i] = e;
19073 }
19074 if (!d->testing_p)
19075 {
19076 dremap.target = gen_reg_rtx (dremap.vmode);
19077 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
19078 }
19079 dfinal.op1 = dfinal.op0;
19080 dfinal.one_operand_p = true;
19081
19082 /* Test if the final remap can be done with a single insn. For V4SFmode or
19083 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
19084 start_sequence ();
19085 ok = expand_vec_perm_1 (&dfinal);
19086 seq = get_insns ();
19087 end_sequence ();
19088
19089 if (!ok)
19090 return false;
19091
19092 if (d->testing_p)
19093 return true;
19094
19095 if (dremap.vmode != dfinal.vmode)
19096 {
19097 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
19098 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
19099 }
19100
19101 ok = expand_vec_perm_1 (&dremap);
19102 gcc_assert (ok);
19103
19104 emit_insn (seq);
19105 return true;
19106}
19107
19108/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19109 a single vector cross-lane permutation into vpermq followed
19110 by any of the single insn permutations. */
19111
19112static bool
19113expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
19114{
19115 struct expand_vec_perm_d dremap, dfinal;
19116 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
19117 unsigned contents[2];
19118 bool ok;
19119
19120 if (!(TARGET_AVX2
19121 && (d->vmode == V32QImode || d->vmode == V16HImode)
19122 && d->one_operand_p))
19123 return false;
19124
19125 contents[0] = 0;
19126 contents[1] = 0;
19127 for (i = 0; i < nelt2; ++i)
19128 {
19129 contents[0] |= 1u << (d->perm[i] / nelt4);
19130 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
19131 }
19132
19133 for (i = 0; i < 2; ++i)
19134 {
19135 unsigned int cnt = 0;
19136 for (j = 0; j < 4; ++j)
19137 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
19138 return false;
19139 }
19140
19141 if (d->testing_p)
19142 return true;
19143
19144 dremap = *d;
19145 dremap.vmode = V4DImode;
19146 dremap.nelt = 4;
19147 dremap.target = gen_reg_rtx (V4DImode);
19148 dremap.op0 = gen_lowpart (V4DImode, d->op0);
19149 dremap.op1 = dremap.op0;
19150 dremap.one_operand_p = true;
19151 for (i = 0; i < 2; ++i)
19152 {
19153 unsigned int cnt = 0;
19154 for (j = 0; j < 4; ++j)
19155 if ((contents[i] & (1u << j)) != 0)
19156 dremap.perm[2 * i + cnt++] = j;
19157 for (; cnt < 2; ++cnt)
19158 dremap.perm[2 * i + cnt] = 0;
19159 }
19160
19161 dfinal = *d;
19162 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
19163 dfinal.op1 = dfinal.op0;
19164 dfinal.one_operand_p = true;
19165 for (i = 0, j = 0; i < nelt; ++i)
19166 {
19167 if (i == nelt2)
19168 j = 2;
19169 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
19170 if ((d->perm[i] / nelt4) == dremap.perm[j])
19171 ;
19172 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
19173 dfinal.perm[i] |= nelt4;
19174 else
19175 gcc_unreachable ();
19176 }
19177
19178 ok = expand_vec_perm_1 (&dremap);
19179 gcc_assert (ok);
19180
19181 ok = expand_vec_perm_1 (&dfinal);
19182 gcc_assert (ok);
19183
19184 return true;
19185}
19186
19187static bool canonicalize_perm (struct expand_vec_perm_d *d);
19188
19189/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
19190 a vector permutation using two instructions, vperm2f128 resp.
19191 vperm2i128 followed by any single in-lane permutation. */
19192
19193static bool
19194expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
19195{
19196 struct expand_vec_perm_d dfirst, dsecond;
19197 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
19198 bool ok;
19199
19200 if (!TARGET_AVX
19201 || GET_MODE_SIZE (d->vmode) != 32
19202 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
19203 return false;
19204
19205 dsecond = *d;
19206 dsecond.one_operand_p = false;
19207 dsecond.testing_p = true;
19208
19209 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
19210 immediate. For perm < 16 the second permutation uses
19211 d->op0 as first operand, for perm >= 16 it uses d->op1
19212 as first operand. The second operand is the result of
19213 vperm2[fi]128. */
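 /* E.g. perm == 6 selects lane 2 (the low lane of d->op1) for the low
    half and lane 1 (the high lane of d->op0) for the high half, giving
    the immediate ((6 << 2) | 6) & 0x33 == 0x12. */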
19214 for (perm = 0; perm < 32; perm++)
19215 {
19216 /* Ignore permutations which do not move anything cross-lane. */
19217 if (perm < 16)
19218 {
19219 /* The second shuffle for e.g. V4DFmode has
19220 0123 and ABCD operands.
19221 Ignore AB23, as 23 is already in the second lane
19222 of the first operand. */
19223 if ((perm & 0xc) == (1 << 2)) continue;
19224 /* And 01CD, as 01 is in the first lane of the first
19225 operand. */
19226 if ((perm & 3) == 0) continue;
19227 /* And 4567, as then the vperm2[fi]128 doesn't change
19228 anything on the original 4567 second operand. */
19229 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
19230 }
19231 else
19232 {
19233 /* The second shuffle for e.g. V4DFmode has
19234 4567 and ABCD operands.
19235 Ignore AB67, as 67 is already in the second lane
19236 of the first operand. */
19237 if ((perm & 0xc) == (3 << 2)) continue;
19238 /* And 45CD, as 45 is in the first lane of the first
19239 operand. */
19240 if ((perm & 3) == 2) continue;
19241 /* And 0123, as then the vperm2[fi]128 doesn't change
19242 anything on the original 0123 first operand. */
19243 if ((perm & 0xf) == (1 << 2)) continue;
19244 }
19245
19246 for (i = 0; i < nelt; i++)
19247 {
19248 j = d->perm[i] / nelt2;
19249 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
19250 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
19251 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
19252 dsecond.perm[i] = d->perm[i] & (nelt - 1);
19253 else
19254 break;
19255 }
19256
19257 if (i == nelt)
19258 {
19259 start_sequence ();
19260 ok = expand_vec_perm_1 (&dsecond);
19261 end_sequence ();
19262 }
19263 else
19264 ok = false;
19265
19266 if (ok)
19267 {
19268 if (d->testing_p)
19269 return true;
19270
19271 /* Found a usable second shuffle. dfirst will be
19272 vperm2f128 on d->op0 and d->op1. */
19273 dsecond.testing_p = false;
19274 dfirst = *d;
19275 dfirst.target = gen_reg_rtx (d->vmode);
19276 for (i = 0; i < nelt; i++)
19277 dfirst.perm[i] = (i & (nelt2 - 1))
19278 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
19279
19280 canonicalize_perm (&dfirst);
19281 ok = expand_vec_perm_1 (&dfirst);
19282 gcc_assert (ok);
19283
19284 /* And dsecond is some single insn shuffle, taking
19285 d->op0 and result of vperm2f128 (if perm < 16) or
19286 d->op1 and result of vperm2f128 (otherwise). */
19287 if (perm >= 16)
19288 dsecond.op0 = dsecond.op1;
19289 dsecond.op1 = dfirst.target;
19290
19291 ok = expand_vec_perm_1 (&dsecond);
19292 gcc_assert (ok);
19293
19294 return true;
19295 }
19296
19297 /* For one operand, the only useful vperm2f128 permutation is 0x01
19298 aka lanes swap. */
19299 if (d->one_operand_p)
19300 return false;
19301 }
19302
19303 return false;
19304}
19305
19306/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19307 a two vector permutation using 2 intra-lane interleave insns
19308 and cross-lane shuffle for 32-byte vectors. */
19309
19310static bool
19311expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
19312{
19313 unsigned i, nelt;
19314 rtx (*gen) (rtx, rtx, rtx);
19315
19316 if (d->one_operand_p)
19317 return false;
19318 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
19319 ;
19320 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
19321 ;
19322 else
19323 return false;
19324
19325 nelt = d->nelt;
19326 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
19327 return false;
19328 for (i = 0; i < nelt; i += 2)
19329 if (d->perm[i] != d->perm[0] + i / 2
19330 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
19331 return false;
19332
19333 if (d->testing_p)
19334 return true;
19335
19336 switch (d->vmode)
19337 {
19338 case E_V32QImode:
19339 if (d->perm[0])
19340 gen = gen_vec_interleave_highv32qi;
19341 else
19342 gen = gen_vec_interleave_lowv32qi;
19343 break;
19344 case E_V16HImode:
19345 if (d->perm[0])
19346 gen = gen_vec_interleave_highv16hi;
19347 else
19348 gen = gen_vec_interleave_lowv16hi;
19349 break;
19350 case E_V8SImode:
19351 if (d->perm[0])
19352 gen = gen_vec_interleave_highv8si;
19353 else
19354 gen = gen_vec_interleave_lowv8si;
19355 break;
19356 case E_V4DImode:
19357 if (d->perm[0])
19358 gen = gen_vec_interleave_highv4di;
19359 else
19360 gen = gen_vec_interleave_lowv4di;
19361 break;
19362 case E_V8SFmode:
19363 if (d->perm[0])
19364 gen = gen_vec_interleave_highv8sf;
19365 else
19366 gen = gen_vec_interleave_lowv8sf;
19367 break;
19368 case E_V4DFmode:
19369 if (d->perm[0])
19370 gen = gen_vec_interleave_highv4df;
19371 else
19372 gen = gen_vec_interleave_lowv4df;
19373 break;
19374 default:
19375 gcc_unreachable ();
19376 }
19377
19378 emit_insn (gen (d->target, d->op0, d->op1));
19379 return true;
19380}
19381
19382/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
19383 a single vector permutation using a single intra-lane vector
19384 permutation, vperm2f128 swapping the lanes and vblend* insn blending
19385 the non-swapped and swapped vectors together. */
19386
19387static bool
19388expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
19389{
19390 struct expand_vec_perm_d dfirst, dsecond;
19391 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
19392 rtx_insn *seq;
19393 bool ok;
19394 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
19395
19396 if (!TARGET_AVX
19397 || TARGET_AVX2
19398 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
19399 || !d->one_operand_p)
19400 return false;
19401
19402 dfirst = *d;
19403 for (i = 0; i < nelt; i++)
19404 dfirst.perm[i] = 0xff;
19405 for (i = 0, msk = 0; i < nelt; i++)
19406 {
19407 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
19408 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
19409 return false;
19410 dfirst.perm[j] = d->perm[i];
19411 if (j != i)
19412 msk |= (1 << i);
19413 }
19414 for (i = 0; i < nelt; i++)
19415 if (dfirst.perm[i] == 0xff)
19416 dfirst.perm[i] = i;
19417
19418 if (!d->testing_p)
19419 dfirst.target = gen_reg_rtx (dfirst.vmode);
19420
19421 start_sequence ();
19422 ok = expand_vec_perm_1 (&dfirst);
19423 seq = get_insns ();
19424 end_sequence ();
19425
19426 if (!ok)
19427 return false;
19428
19429 if (d->testing_p)
19430 return true;
19431
19432 emit_insn (seq);
19433
19434 dsecond = *d;
19435 dsecond.op0 = dfirst.target;
19436 dsecond.op1 = dfirst.target;
19437 dsecond.one_operand_p = true;
19438 dsecond.target = gen_reg_rtx (dsecond.vmode);
19439 for (i = 0; i < nelt; i++)
19440 dsecond.perm[i] = i ^ nelt2;
19441
19442 ok = expand_vec_perm_1 (&dsecond);
19443 gcc_assert (ok);
19444
19445 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
19446 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
19447 return true;
19448}
19449
19450/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
19451 a two vector permutation using two single vector permutations and
19452 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
19453 of dfirst or dsecond is identity permutation. */
19454
19455static bool
19456expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
19457{
19458 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
19459 struct expand_vec_perm_d dfirst, dsecond, dfinal;
19460 bool ident1 = true, ident2 = true;
19461
19462 if (d->one_operand_p)
19463 return false;
19464
19465 if (GET_MODE_SIZE (d->vmode) == 16)
19466 {
19467 if (!TARGET_SSE)
19468 return false;
19469 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
19470 return false;
19471 }
19472 else if (GET_MODE_SIZE (d->vmode) == 32)
19473 {
19474 if (!TARGET_AVX)
19475 return false;
19476 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
19477 return false;
19478 lane = nelt2;
19479 }
19480 else
19481 return false;
19482
19483 for (i = 1; i < nelt; i++)
19484 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
19485 return false;
19486
19487 dfirst = *d;
19488 dsecond = *d;
19489 dfinal = *d;
19490 dfirst.op1 = dfirst.op0;
19491 dfirst.one_operand_p = true;
19492 dsecond.op0 = dsecond.op1;
19493 dsecond.one_operand_p = true;
19494
19495 for (i = 0; i < nelt; i++)
19496 if (d->perm[i] >= nelt)
19497 {
19498 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
19499 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
19500 ident2 = false;
19501 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
19502 = d->perm[i] - nelt;
19503 }
19504 else
19505 {
19506 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
19507 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
19508 ident1 = false;
19509 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
19510 }
19511
19512 if (two_insn && !ident1 && !ident2)
19513 return false;
19514
19515 if (!d->testing_p)
19516 {
19517 if (!ident1)
19518 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
19519 if (!ident2)
19520 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
19521 if (d->perm[0] >= nelt)
19522 std::swap (dfinal.op0, dfinal.op1);
19523 }
19524
19525 bool ok;
19526 rtx_insn *seq1 = NULL, *seq2 = NULL;
19527
19528 if (!ident1)
19529 {
19530 start_sequence ();
19531 ok = expand_vec_perm_1 (&dfirst);
19532 seq1 = get_insns ();
19533 end_sequence ();
19534
19535 if (!ok)
19536 return false;
19537 }
19538
19539 if (!ident2)
19540 {
19541 start_sequence ();
19542 ok = expand_vec_perm_1 (&dsecond);
19543 seq2 = get_insns ();
19544 end_sequence ();
19545
19546 if (!ok)
19547 return false;
19548 }
19549
19550 if (d->testing_p)
19551 return true;
19552
19553 for (i = 0; i < nelt; i++)
19554 {
19555 dfinal.perm[i] = i / 2;
19556 if (i >= lane)
19557 dfinal.perm[i] += lane / 2;
19558 if ((i & 1) != 0)
19559 dfinal.perm[i] += nelt;
19560 }
19561 emit_insn (seq1);
19562 emit_insn (seq2);
19563 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
19564 dfinal.perm, dfinal.nelt, false);
19565 gcc_assert (ok);
19566 return true;
19567}
19568
19569/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19570 the permutation using two single vector permutations and the SSE4_1 pblendv
19571 instruction. If two_insn, succeed only if one of dfirst or dsecond is
19572 identity permutation. */
19573
19574static bool
19575expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
19576{
19577 unsigned i, nelt = d->nelt;
19578 struct expand_vec_perm_d dfirst, dsecond, dfinal;
19579 machine_mode vmode = d->vmode;
19580 bool ident1 = true, ident2 = true;
19581
19582 /* Use the same checks as in expand_vec_perm_blend. */
19583 if (d->one_operand_p)
19584 return false;
19585 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
19586 ;
19587 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
19588 ;
19589 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
19590 || GET_MODE_SIZE (vmode) == 8
19591 || GET_MODE_SIZE (vmode) == 4))
19592 ;
19593 else
19594 return false;
19595
19596 dfirst = *d;
19597 dsecond = *d;
19598 dfinal = *d;
19599 dfirst.op1 = dfirst.op0;
19600 dfirst.one_operand_p = true;
19601 dsecond.op0 = dsecond.op1;
19602 dsecond.one_operand_p = true;
19603
19604 for (i = 0; i < nelt; ++i)
19605 if (d->perm[i] >= nelt)
19606 {
19607 dfirst.perm[i] = 0xff;
19608 dsecond.perm[i] = d->perm[i] - nelt;
19609 if (d->perm[i] != i + nelt)
19610 ident2 = false;
19611 }
19612 else
19613 {
19614 dsecond.perm[i] = 0xff;
19615 dfirst.perm[i] = d->perm[i];
19616 if (d->perm[i] != i)
19617 ident1 = false;
19618 }
19619
19620 if (two_insn && !ident1 && !ident2)
19621 return false;
19622
19623 /* For now. Ideally treat 0xff as a wildcard. */
19624 for (i = 0; i < nelt; ++i)
19625 if (dfirst.perm[i] == 0xff)
19626 {
19627 if (GET_MODE_SIZE (vmode) == 32
19628 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
19629 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
19630 else
19631 dfirst.perm[i] = i;
19632 }
19633 else
19634 {
19635 if (GET_MODE_SIZE (vmode) == 32
19636 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
19637 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
19638 else
19639 dsecond.perm[i] = i;
19640 }
19641
19642 if (!d->testing_p)
19643 {
19644 if (!ident1)
19645 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
19646 if (!ident2)
19647 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
19648 }
19649
19650 bool ok;
19651 rtx_insn *seq1 = NULL, *seq2 = NULL;
19652
19653 if (!ident1)
19654 {
19655 start_sequence ();
19656 ok = expand_vec_perm_1 (&dfirst);
19657 seq1 = get_insns ();
19658 end_sequence ();
19659
19660 if (!ok)
19661 return false;
19662 }
19663
19664 if (!ident2)
19665 {
19666 start_sequence ();
19667 ok = expand_vec_perm_1 (&dsecond);
19668 seq2 = get_insns ();
19669 end_sequence ();
19670
19671 if (!ok)
19672 return false;
19673 }
19674
19675 if (d->testing_p)
19676 return true;
19677
19678 for (i = 0; i < nelt; ++i)
19679 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
19680
19681 emit_insn (seq1);
19682 emit_insn (seq2);
19683 ok = expand_vec_perm_blend (&dfinal);
19684 gcc_assert (ok);
19685 return true;
19686}
19687
19688/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
19689 permutation using two vperm2f128, followed by a vshufpd insn blending
19690 the two vectors together. */
19691
19692static bool
19693expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
19694{
19695 struct expand_vec_perm_d dfirst, dsecond, dthird;
19696 bool ok;
19697
19698 if (!TARGET_AVX || (d->vmode != V4DFmode))
19699 return false;
19700
19701 if (d->testing_p)
19702 return true;
19703
19704 dfirst = *d;
19705 dsecond = *d;
19706 dthird = *d;
19707
19708 dfirst.perm[0] = (d->perm[0] & ~1);
19709 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
19710 dfirst.perm[2] = (d->perm[2] & ~1);
19711 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
19712 dsecond.perm[0] = (d->perm[1] & ~1);
19713 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
19714 dsecond.perm[2] = (d->perm[3] & ~1);
19715 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
19716 dthird.perm[0] = (d->perm[0] % 2);
19717 dthird.perm[1] = (d->perm[1] % 2) + 4;
19718 dthird.perm[2] = (d->perm[2] % 2) + 2;
19719 dthird.perm[3] = (d->perm[3] % 2) + 6;
19720
19721 dfirst.target = gen_reg_rtx (dfirst.vmode);
19722 dsecond.target = gen_reg_rtx (dsecond.vmode);
19723 dthird.op0 = dfirst.target;
19724 dthird.op1 = dsecond.target;
19725 dthird.one_operand_p = false;
19726
19727 canonicalize_perm (&dfirst);
19728 canonicalize_perm (&dsecond);
19729
19730 ok = expand_vec_perm_1 (&dfirst)
19731 && expand_vec_perm_1 (&dsecond)
19732 && expand_vec_perm_1 (&dthird);
19733
19734 gcc_assert (ok);
19735
19736 return true;
19737}
19738
19739static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
19740
19741/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
19742 a two vector permutation using two intra-lane vector
19743 permutations, vperm2f128 swapping the lanes and vblend* insn blending
19744 the non-swapped and swapped vectors together. */
19745
19746static bool
19747expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
19748{
19749 struct expand_vec_perm_d dfirst, dsecond, dthird;
19750 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
19751 rtx_insn *seq1, *seq2;
19752 bool ok;
19753 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
19754
19755 if (!TARGET_AVX
19756 || TARGET_AVX2
19757 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
19758 || d->one_operand_p)
19759 return false;
19760
19761 dfirst = *d;
19762 dsecond = *d;
19763 for (i = 0; i < nelt; i++)
19764 {
19765 dfirst.perm[i] = 0xff;
19766 dsecond.perm[i] = 0xff;
19767 }
19768 for (i = 0, msk = 0; i < nelt; i++)
19769 {
19770 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
19771 if (j == i)
19772 {
19773 dfirst.perm[j] = d->perm[i];
19774 which1 |= (d->perm[i] < nelt ? 1 : 2);
19775 }
19776 else
19777 {
19778 dsecond.perm[j] = d->perm[i];
19779 which2 |= (d->perm[i] < nelt ? 1 : 2);
19780 msk |= (1U << i);
19781 }
19782 }
19783 if (msk == 0 || msk == (1U << nelt) - 1)
19784 return false;
19785
19786 if (!d->testing_p)
19787 {
19788 dfirst.target = gen_reg_rtx (dfirst.vmode);
19789 dsecond.target = gen_reg_rtx (dsecond.vmode);
19790 }
19791
19792 for (i = 0; i < nelt; i++)
19793 {
19794 if (dfirst.perm[i] == 0xff)
19795 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
19796 if (dsecond.perm[i] == 0xff)
19797 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
19798 }
19799 canonicalize_perm (&dfirst);
19800 start_sequence ();
19801 ok = ix86_expand_vec_perm_const_1 (&dfirst);
19802 seq1 = get_insns ();
19803 end_sequence ();
19804
19805 if (!ok)
19806 return false;
19807
19808 canonicalize_perm (&dsecond);
19809 start_sequence ();
19810 ok = ix86_expand_vec_perm_const_1 (&dsecond);
19811 seq2 = get_insns ();
19812 end_sequence ();
19813
19814 if (!ok)
19815 return false;
19816
19817 if (d->testing_p)
19818 return true;
19819
19820 emit_insn (seq1);
19821 emit_insn (seq2);
19822
19823 dthird = *d;
19824 dthird.op0 = dsecond.target;
19825 dthird.op1 = dsecond.target;
19826 dthird.one_operand_p = true;
19827 dthird.target = gen_reg_rtx (dthird.vmode);
19828 for (i = 0; i < nelt; i++)
19829 dthird.perm[i] = i ^ nelt2;
19830
19831 ok = expand_vec_perm_1 (&dthird);
19832 gcc_assert (ok);
19833
19834 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
19835 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
19836 return true;
19837}
19838
19839/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
19840 permutation with two pshufb insns and an ior. We should have already
19841 failed all two instruction sequences. */
19842
19843static bool
19844expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
19845{
19846 rtx rperm[2][16], vperm, l, h, op, m128;
19847 unsigned int i, nelt, eltsz;
19848 machine_mode mode;
19849 rtx (*gen) (rtx, rtx, rtx);
19850
19851 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
19852 && GET_MODE_SIZE (d->vmode) != 8
19853 && GET_MODE_SIZE (d->vmode) != 4))
19854 return false;
19855 gcc_assert (!d->one_operand_p);
19856
19857 if (d->testing_p)
19858 return true;
19859
19860 switch (GET_MODE_SIZE (d->vmode))
19861 {
19862 case 4:
19863 mode = V4QImode;
19864 gen = gen_mmx_pshufbv4qi3;
19865 break;
19866 case 8:
19867 mode = V8QImode;
19868 gen = gen_mmx_pshufbv8qi3;
19869 break;
19870 case 16:
19871 mode = V16QImode;
19872 gen = gen_ssse3_pshufbv16qi3;
19873 break;
19874 default:
19875 gcc_unreachable ();
19876 }
19877
19878 nelt = d->nelt;
19879 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19880
19881 /* Generate two permutation masks. If the required element is within
19882 the given vector it is shuffled into the proper lane. If the required
19883 element is in the other vector, force a zero into the lane by setting
19884 bit 7 in the permutation mask. */
19885 m128 = GEN_INT (-128);
19886 for (i = 0; i < nelt; ++i)
19887 {
19888 unsigned j, k, e = d->perm[i];
19889 unsigned which = (e >= nelt);
19890 if (e >= nelt)
19891 e -= nelt;
19892
19893 for (j = 0; j < eltsz; ++j)
19894 {
19895 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
19896 rperm[1-which][i*eltsz + j] = m128;
19897 }
19898
19899 for (k = i*eltsz + j; k < 16; ++k)
19900 rperm[0][k] = rperm[1][k] = m128;
19901 }
19902
19903 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
19904 vperm = force_reg (V16QImode, vperm);
19905
19906 l = gen_reg_rtx (mode);
19907 op = gen_lowpart (mode, d->op0);
19908 emit_insn (gen (l, op, vperm));
19909
19910 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
19911 vperm = force_reg (V16QImode, vperm);
19912
19913 h = gen_reg_rtx (mode);
19914 op = gen_lowpart (mode, d->op1);
19915 emit_insn (gen (h, op, vperm));
19916
19917 op = d->target;
19918 if (d->vmode != mode)
19919 op = gen_reg_rtx (mode);
19920 emit_insn (gen_rtx_SET (op, gen_rtx_IOR (mode, l, h)));
19921 if (op != d->target)
19922 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19923
19924 return true;
19925}
19926
19927/* Implement arbitrary permutation of one V32QImode and V16QImode operand
19928 with two vpshufb insns, vpermq and vpor. We should have already failed
19929 all two or three instruction sequences. */
19930
19931static bool
19932expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
19933{
19934 rtx rperm[2][32], vperm, l, h, hp, op, m128;
19935 unsigned int i, nelt, eltsz;
19936
19937 if (!TARGET_AVX2
19938 || !d->one_operand_p
19939 || (d->vmode != V32QImode && d->vmode != V16HImode))
19940 return false;
19941
19942 if (d->testing_p)
19943 return true;
19944
19945 nelt = d->nelt;
19946 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19947
19948 /* Generate two permutation masks. If the required element is within
19949 the same lane, it is shuffled in. If the required element is from the
19950 other lane, force a zero by setting bit 7 in the permutation mask.
19951 The other mask has non-negative elements where an element is
19952 requested from the other lane; those elements are also moved to the
19953 other lane, so that the result of vpshufb can have its two V2TImode
19954 halves swapped. */
19955 m128 = GEN_INT (-128);
19956 for (i = 0; i < nelt; ++i)
19957 {
19958 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19959 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19960
19961 for (j = 0; j < eltsz; ++j)
19962 {
19963 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
19964 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
19965 }
19966 }
19967
19968 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
19969 vperm = force_reg (V32QImode, vperm);
19970
19971 h = gen_reg_rtx (V32QImode);
19972 op = gen_lowpart (V32QImode, d->op0);
19973 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
19974
19975 /* Swap the 128-bit lanes of h into hp. */
19976 hp = gen_reg_rtx (V4DImode);
19977 op = gen_lowpart (V4DImode, h);
19978 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
19979 const1_rtx));
19980
19981 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
19982 vperm = force_reg (V32QImode, vperm);
19983
19984 l = gen_reg_rtx (V32QImode);
19985 op = gen_lowpart (V32QImode, d->op0);
19986 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
19987
19988 op = d->target;
19989 if (d->vmode != V32QImode)
19990 op = gen_reg_rtx (V32QImode);
19991 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
19992 if (op != d->target)
19993 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19994
19995 return true;
19996}
19997
19998/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
19999 and extract-odd permutations of two V32QImode and V16QImode operand
20000 with two vpshufb insns, vpor and vpermq. We should have already
20001 failed all two or three instruction sequences. */
20002
20003static bool
20004expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
20005{
20006 rtx rperm[2][32], vperm, l, h, ior, op, m128;
20007 unsigned int i, nelt, eltsz;
20008
20009 if (!TARGET_AVX2
20010 || d->one_operand_p
20011 || (d->vmode != V32QImode && d->vmode != V16HImode))
20012 return false;
20013
20014 for (i = 0; i < d->nelt; ++i)
20015 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
20016 return false;
20017
20018 if (d->testing_p)
20019 return true;
20020
20021 nelt = d->nelt;
20022 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20023
20024 /* Generate two permutation masks. In the first permutation mask
20025 the first quarter will contain indexes for the first half
20026 of the op0, the second quarter will contain bit 7 set, third quarter
20027 will contain indexes for the second half of the op0 and the
20028 last quarter bit 7 set. In the second permutation mask
20029 the first quarter will contain bit 7 set, the second quarter
20030 indexes for the first half of the op1, the third quarter bit 7 set
20031 and last quarter indexes for the second half of the op1.
20032 I.e. the first mask e.g. for V32QImode extract even will be:
20033 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
20034 (all values masked with 0xf except for -128) and second mask
20035 for extract even will be
20036 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
20037 m128 = GEN_INT (-128);
20038 for (i = 0; i < nelt; ++i)
20039 {
20040 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20041 unsigned which = d->perm[i] >= nelt;
20042 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
20043
20044 for (j = 0; j < eltsz; ++j)
20045 {
20046 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
20047 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
20048 }
20049 }
20050
20051 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
20052 vperm = force_reg (V32QImode, vperm);
20053
20054 l = gen_reg_rtx (V32QImode);
20055 op = gen_lowpart (V32QImode, d->op0);
20056 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
20057
20058 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
20059 vperm = force_reg (V32QImode, vperm);
20060
20061 h = gen_reg_rtx (V32QImode);
20062 op = gen_lowpart (V32QImode, d->op1);
20063 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
20064
20065 ior = gen_reg_rtx (V32QImode);
20066 emit_insn (gen_iorv32qi3 (ior, l, h));
20067
20068 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
20069 op = gen_reg_rtx (V4DImode);
20070 ior = gen_lowpart (V4DImode, ior);
20071 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
20072 const1_rtx, GEN_INT (3)));
20073 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20074
20075 return true;
20076}
20077
20078/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
20079 and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
20080 operands with two "and" and "pack" or two "shift" and "pack" insns.
20081 We should have already failed all two instruction sequences. */
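/* E.g. for the V16QImode case below this means: for extract-even, view
   both operands as V8HImode, mask each element with 0x00ff and combine
   the two results with packuswb; for extract-odd, shift each element
   right by 8 instead of masking before the pack. */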
20082
20083static bool
20084expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
20085{
20086 rtx op, dop0, dop1, t;
20087 unsigned i, odd, c, s, nelt = d->nelt;
20088 bool end_perm = false;
20089 machine_mode half_mode;
20090 rtx (*gen_and) (rtx, rtx, rtx);
20091 rtx (*gen_pack) (rtx, rtx, rtx);
20092 rtx (*gen_shift) (rtx, rtx, rtx);
20093
20094 if (d->one_operand_p)
20095 return false;
20096
20097 switch (d->vmode)
20098 {
20099 case E_V4HImode:
20100 /* Required for "pack". */
20101 if (!TARGET_SSE4_1)
20102 return false;
20103 c = 0xffff;
20104 s = 16;
20105 half_mode = V2SImode;
20106 gen_and = gen_andv2si3;
20107 gen_pack = gen_mmx_packusdw;
20108 gen_shift = gen_lshrv2si3;
20109 break;
20110 case E_V8HImode:
20111 /* Required for "pack". */
20112 if (!TARGET_SSE4_1)
20113 return false;
20114 c = 0xffff;
20115 s = 16;
20116 half_mode = V4SImode;
20117 gen_and = gen_andv4si3;
20118 gen_pack = gen_sse4_1_packusdw;
20119 gen_shift = gen_lshrv4si3;
20120 break;
20121 case E_V8QImode:
20122 /* No check as all instructions are SSE2. */
20123 c = 0xff;
20124 s = 8;
20125 half_mode = V4HImode;
20126 gen_and = gen_andv4hi3;
20127 gen_pack = gen_mmx_packuswb;
20128 gen_shift = gen_lshrv4hi3;
20129 break;
20130 case E_V16QImode:
20131 /* No check as all instructions are SSE2. */
20132 c = 0xff;
20133 s = 8;
20134 half_mode = V8HImode;
20135 gen_and = gen_andv8hi3;
20136 gen_pack = gen_sse2_packuswb;
20137 gen_shift = gen_lshrv8hi3;
20138 break;
20139 case E_V16HImode:
20140 if (!TARGET_AVX2)
20141 return false;
20142 c = 0xffff;
20143 s = 16;
20144 half_mode = V8SImode;
20145 gen_and = gen_andv8si3;
20146 gen_pack = gen_avx2_packusdw;
20147 gen_shift = gen_lshrv8si3;
20148 end_perm = true;
20149 break;
20150 case E_V32QImode:
20151 if (!TARGET_AVX2)
20152 return false;
20153 c = 0xff;
20154 s = 8;
20155 half_mode = V16HImode;
20156 gen_and = gen_andv16hi3;
20157 gen_pack = gen_avx2_packuswb;
20158 gen_shift = gen_lshrv16hi3;
20159 end_perm = true;
20160 break;
20161 default:
20162 /* Only for V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes is
20163 this more profitable than general shuffles. */
20164 return false;
20165 }
20166
20167 /* Check that permutation is even or odd. */
20168 odd = d->perm[0];
20169 if (odd > 1)
20170 return false;
20171
20172 for (i = 1; i < nelt; ++i)
20173 if (d->perm[i] != 2 * i + odd)
20174 return false;
20175
20176 if (d->testing_p)
20177 return true;
20178
20179 dop0 = gen_reg_rtx (half_mode);
20180 dop1 = gen_reg_rtx (half_mode);
20181 if (odd == 0)
20182 {
20183 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
20184 t = force_reg (half_mode, t);
20185 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
20186 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
20187 }
20188 else
20189 {
20190 emit_insn (gen_shift (dop0,
20191 gen_lowpart (half_mode, d->op0),
20192 GEN_INT (s)));
20193 emit_insn (gen_shift (dop1,
20194 gen_lowpart (half_mode, d->op1),
20195 GEN_INT (s)));
20196 }
20197 /* For the AVX2 256-bit case we need to permute the pack result. */
20198 if (TARGET_AVX2 && end_perm)
20199 {
20200 op = gen_reg_rtx (d->vmode);
20201 t = gen_reg_rtx (V4DImode);
20202 emit_insn (gen_pack (op, dop0, dop1));
20203 emit_insn (gen_avx2_permv4di_1 (t,
20204 gen_lowpart (V4DImode, op),
20205 const0_rtx,
20206 const2_rtx,
20207 const1_rtx,
20208 GEN_INT (3)));
20209 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
20210 }
20211 else
20212 emit_insn (gen_pack (d->target, dop0, dop1));
20213
20214 return true;
20215}
20216
20217/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
20218 and extract-odd permutations of two V64QI operands
20219 with two "shift", two "trunc" and one "concat" insns for "odd"
20220 and two "trunc" and one "concat" insn for "even".
20221 We should have already failed all two instruction sequences. */
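/* For extract-odd the inputs are viewed as V32HImode and shifted right
   by 8 so the odd bytes land in the low byte of each word; the two
   intermediate vectors are then truncated to V32QImode and concatenated. */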
20222
20223static bool
20224expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
20225{
20226 rtx t1, t2, t3, t4;
20227 unsigned i, odd, nelt = d->nelt;
20228
20229 if (!TARGET_AVX512BW
20230 || d->one_operand_p
20231 || d->vmode != V64QImode)
20232 return false;
20233
20234 /* Check that permutation is even or odd. */
20235 odd = d->perm[0];
20236 if (odd > 1)
20237 return false;
20238
20239 for (i = 1; i < nelt; ++i)
20240 if (d->perm[i] != 2 * i + odd)
20241 return false;
20242
20243 if (d->testing_p)
20244 return true;
20245
20246
20247 if (odd)
20248 {
20249 t1 = gen_reg_rtx (V32HImode);
20250 t2 = gen_reg_rtx (V32HImode);
20251 emit_insn (gen_lshrv32hi3 (t1,
20252 gen_lowpart (V32HImode, d->op0),
20253 GEN_INT (8)));
20254 emit_insn (gen_lshrv32hi3 (t2,
20255 gen_lowpart (V32HImode, d->op1),
20256 GEN_INT (8)));
20257 }
20258 else
20259 {
20260 t1 = gen_lowpart (V32HImode, d->op0);
20261 t2 = gen_lowpart (V32HImode, d->op1);
20262 }
20263
20264 t3 = gen_reg_rtx (V32QImode);
20265 t4 = gen_reg_rtx (V32QImode);
20266 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
20267 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
20268 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
20269
20270 return true;
20271}
20272
20273/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
20274 and extract-odd permutations. */
20275
20276static bool
20277expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
20278{
20279 rtx t1, t2, t3, t4, t5;
20280
20281 switch (d->vmode)
20282 {
20283 case E_V4DFmode:
20284 if (d->testing_p)
20285 break;
20286 t1 = gen_reg_rtx (V4DFmode);
20287 t2 = gen_reg_rtx (V4DFmode);
20288
20289 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
20290 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
20291 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
20292
20293 /* Now an unpck[lh]pd will produce the result required. */
20294 if (odd)
20295 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
20296 else
20297 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
20298 emit_insn (t3);
20299 break;
20300
20301 case E_V8SFmode:
20302 {
20303 int mask = odd ? 0xdd : 0x88;
20304
20305 if (d->testing_p)
20306 break;
20307 t1 = gen_reg_rtx (V8SFmode);
20308 t2 = gen_reg_rtx (V8SFmode);
20309 t3 = gen_reg_rtx (V8SFmode);
20310
20311 /* Shuffle within the 128-bit lanes to produce:
20312 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
20313 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
20314 GEN_INT (mask)));
20315
20316 /* Shuffle the lanes around to produce:
20317 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
20318 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
20319 GEN_INT (0x3)));
20320
20321 /* Shuffle within the 128-bit lanes to produce:
20322 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
20323 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
20324
20325 /* Shuffle within the 128-bit lanes to produce:
20326 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
20327 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
20328
20329 /* Shuffle the lanes around to produce:
20330 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
20331 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
20332 GEN_INT (0x20)));
20333 }
20334 break;
20335
20336 case E_V2DFmode:
20337 case E_V4SFmode:
20338 case E_V2DImode:
20339 case E_V2SImode:
20340 case E_V4SImode:
20341 case E_V2HImode:
20342 /* These are always directly implementable by expand_vec_perm_1. */
20343 gcc_unreachable ();
20344
240198fe
UB
20345 case E_V2SFmode:
20346 gcc_assert (TARGET_MMX_WITH_SSE);
20347 /* We have no suitable instructions. */
20348 if (d->testing_p)
20349 return false;
20350 break;
20351
be8749f9
UB
20352 case E_V4QImode:
20353 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
20354 return expand_vec_perm_pshufb2 (d);
20355 else
20356 {
20357 if (d->testing_p)
20358 break;
20359 /* We need 2*log2(N)-1 operations to achieve odd/even
20360 with interleave. */
20361 t1 = gen_reg_rtx (V4QImode);
20362 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
20363 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
20364 if (odd)
20365 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
20366 else
20367 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
20368 emit_insn (t2);
20369 }
20370 break;
20371
9b8579a6 20372 case E_V4HImode:
dd835ec2
UB
20373 if (TARGET_SSE4_1)
20374 return expand_vec_perm_even_odd_pack (d);
20375 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
20376 return expand_vec_perm_pshufb2 (d);
9b8579a6 20377 else
dd835ec2
UB
20378 {
20379 if (d->testing_p)
20380 break;
20381 /* We need 2*log2(N)-1 operations to achieve odd/even
20382 with interleave. */
20383 t1 = gen_reg_rtx (V4HImode);
20384 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
20385 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
20386 if (odd)
20387 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
20388 else
20389 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
20390 emit_insn (t2);
20391 }
9b8579a6
UB
20392 break;
20393
2bf6d935
ML
20394 case E_V8HImode:
20395 if (TARGET_SSE4_1)
20396 return expand_vec_perm_even_odd_pack (d);
20397 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
20398 return expand_vec_perm_pshufb2 (d);
20399 else
20400 {
20401 if (d->testing_p)
20402 break;
20403 /* We need 2*log2(N)-1 operations to achieve odd/even
20404 with interleave. */
20405 t1 = gen_reg_rtx (V8HImode);
20406 t2 = gen_reg_rtx (V8HImode);
20407 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
20408 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
20409 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
20410 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
20411 if (odd)
20412 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
20413 else
20414 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
20415 emit_insn (t3);
20416 }
20417 break;
20418
a325bdd1 20419 case E_V8QImode:
2bf6d935
ML
20420 case E_V16QImode:
20421 return expand_vec_perm_even_odd_pack (d);
20422
20423 case E_V16HImode:
20424 case E_V32QImode:
20425 return expand_vec_perm_even_odd_pack (d);
20426
20427 case E_V64QImode:
20428 return expand_vec_perm_even_odd_trunc (d);
20429
20430 case E_V4DImode:
20431 if (!TARGET_AVX2)
20432 {
20433 struct expand_vec_perm_d d_copy = *d;
20434 d_copy.vmode = V4DFmode;
20435 if (d->testing_p)
20436 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
20437 else
20438 d_copy.target = gen_reg_rtx (V4DFmode);
20439 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
20440 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
20441 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
20442 {
20443 if (!d->testing_p)
20444 emit_move_insn (d->target,
20445 gen_lowpart (V4DImode, d_copy.target));
20446 return true;
20447 }
20448 return false;
20449 }
20450
20451 if (d->testing_p)
20452 break;
20453
20454 t1 = gen_reg_rtx (V4DImode);
20455 t2 = gen_reg_rtx (V4DImode);
20456
20457 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
20458 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
20459 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
20460
20461       /* Now a vpunpck[lh]qdq will produce the result required. */
20462 if (odd)
20463 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
20464 else
20465 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
20466 emit_insn (t3);
20467 break;
20468
20469 case E_V8SImode:
20470 if (!TARGET_AVX2)
20471 {
20472 struct expand_vec_perm_d d_copy = *d;
20473 d_copy.vmode = V8SFmode;
20474 if (d->testing_p)
20475 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
20476 else
20477 d_copy.target = gen_reg_rtx (V8SFmode);
20478 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
20479 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
20480 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
20481 {
20482 if (!d->testing_p)
20483 emit_move_insn (d->target,
20484 gen_lowpart (V8SImode, d_copy.target));
20485 return true;
20486 }
20487 return false;
20488 }
20489
20490 if (d->testing_p)
20491 break;
20492
20493 t1 = gen_reg_rtx (V8SImode);
20494 t2 = gen_reg_rtx (V8SImode);
20495 t3 = gen_reg_rtx (V4DImode);
20496 t4 = gen_reg_rtx (V4DImode);
20497 t5 = gen_reg_rtx (V4DImode);
20498
20499 /* Shuffle the lanes around into
20500 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
20501 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
20502 gen_lowpart (V4DImode, d->op1),
20503 GEN_INT (0x20)));
20504 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
20505 gen_lowpart (V4DImode, d->op1),
20506 GEN_INT (0x31)));
20507
20508 /* Swap the 2nd and 3rd position in each lane into
20509 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
20510 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
20511 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
20512 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
20513 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
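      /* Editor's note, for illustration: 2*4 + 1*16 + 3*64 == 0xd8 encodes
	 the pshufd selector { 0, 2, 1, 3 }, i.e. swap the middle two dwords
	 within each lane, as described above.  */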
20514
20515       /* Now a vpunpck[lh]qdq will produce
20516 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
20517 if (odd)
20518 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
20519 gen_lowpart (V4DImode, t2));
20520 else
20521 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
20522 gen_lowpart (V4DImode, t2));
20523 emit_insn (t3);
20524 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
20525 break;
20526
20527 default:
20528 gcc_unreachable ();
20529 }
20530
20531 return true;
20532}
20533
4bf4c103 20534/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
20535 extract-even and extract-odd permutations. */
20536
20537static bool
20538expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
20539{
20540 unsigned i, odd, nelt = d->nelt;
20541
20542 odd = d->perm[0];
20543 if (odd != 0 && odd != 1)
20544 return false;
20545
20546 for (i = 1; i < nelt; ++i)
20547 if (d->perm[i] != 2 * i + odd)
20548 return false;
20549
50b58779
JJ
20550 if (d->vmode == E_V32HImode
20551 && d->testing_p
20552 && !TARGET_AVX512BW)
20553 return false;
20554
2bf6d935
ML
20555 return expand_vec_perm_even_odd_1 (d, odd);
20556}
20557
4bf4c103 20558/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
2bf6d935
ML
20559 permutations. We assume that expand_vec_perm_1 has already failed. */
20560
20561static bool
20562expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
20563{
20564 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
20565 machine_mode vmode = d->vmode;
be8749f9 20566 rtx (*gen) (rtx, rtx, rtx);
2bf6d935
ML
20567 unsigned char perm2[4];
20568 rtx op0 = d->op0, dest;
20569 bool ok;
20570
20571 switch (vmode)
20572 {
20573 case E_V4DFmode:
20574 case E_V8SFmode:
20575 /* These are special-cased in sse.md so that we can optionally
20576 use the vbroadcast instruction. They expand to two insns
20577 if the input happens to be in a register. */
20578 gcc_unreachable ();
20579
20580 case E_V2DFmode:
240198fe 20581 case E_V2SFmode:
2bf6d935 20582 case E_V4SFmode:
240198fe 20583 case E_V2DImode:
9b8579a6 20584 case E_V2SImode:
2bf6d935 20585 case E_V4SImode:
8d7dae0e
UB
20586 case E_V2HImode:
20587 case E_V4HImode:
2bf6d935
ML
20588 /* These are always implementable using standard shuffle patterns. */
20589 gcc_unreachable ();
20590
be8749f9
UB
20591 case E_V4QImode:
20592 /* This can be implemented via interleave and pshuflw. */
20593 if (d->testing_p)
20594 return true;
20595
20596 if (elt >= nelt2)
20597 {
20598 gen = gen_mmx_punpckhbw_low;
20599 elt -= nelt2;
20600 }
20601 else
20602 gen = gen_mmx_punpcklbw_low;
20603
20604 dest = gen_reg_rtx (vmode);
20605 emit_insn (gen (dest, op0, op0));
20606 vmode = get_mode_wider_vector (vmode);
20607 op0 = gen_lowpart (vmode, dest);
20608
20609 memset (perm2, elt, 2);
20610 dest = gen_reg_rtx (vmode);
20611 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
20612 gcc_assert (ok);
20613
20614 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
20615 return true;
20616
a325bdd1 20617 case E_V8QImode:
be8749f9 20618 /* This can be implemented via interleave. We save one insn by
a325bdd1
PB
20619	 stopping once we have promoted to V2SImode and then using pshufd. */
20620 if (d->testing_p)
20621 return true;
20622 do
20623 {
a325bdd1
PB
20624 if (elt >= nelt2)
20625 {
20626 gen = vmode == V8QImode ? gen_mmx_punpckhbw
20627 : gen_mmx_punpckhwd;
20628 elt -= nelt2;
20629 }
be8749f9
UB
20630 else
20631 gen = vmode == V8QImode ? gen_mmx_punpcklbw
20632 : gen_mmx_punpcklwd;
a325bdd1
PB
20633 nelt2 /= 2;
20634
20635 dest = gen_reg_rtx (vmode);
20636 emit_insn (gen (dest, op0, op0));
20637 vmode = get_mode_wider_vector (vmode);
20638 op0 = gen_lowpart (vmode, dest);
20639 }
20640 while (vmode != V2SImode);
20641
20642 memset (perm2, elt, 2);
be8749f9 20643 dest = gen_reg_rtx (vmode);
a325bdd1
PB
20644 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
20645 gcc_assert (ok);
be8749f9
UB
20646
20647 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
a325bdd1
PB
20648 return true;
20649
2bf6d935
ML
20650 case E_V8HImode:
20651 case E_V16QImode:
20652 /* These can be implemented via interleave. We save one insn by
20653	 stopping once we have promoted to V4SImode and then using pshufd. */
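      /* Worked example (editor's addition): broadcasting byte 5 of a
	 V16QImode vector x.
	   punpcklbw x,x      gives bytes { x0 x0 x1 x1 ... x7 x7 }   (elt 5 < 8)
	   punpckhwd on that  gives words { w4 w4 w5 w5 w6 w6 w7 w7 } (elt -> 1)
	 where wI is the byte pair (xI,xI); viewed as V4SImode, dword 1 is now
	 four copies of x5, and the final pshufd { 1,1,1,1 } broadcasts it.  */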
20654 if (d->testing_p)
20655 return true;
20656 do
20657 {
2bf6d935
ML
20658 if (elt >= nelt2)
20659 {
20660 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
20661 : gen_vec_interleave_highv8hi;
20662 elt -= nelt2;
20663 }
be8749f9
UB
20664 else
20665 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
20666 : gen_vec_interleave_lowv8hi;
2bf6d935
ML
20667 nelt2 /= 2;
20668
20669 dest = gen_reg_rtx (vmode);
20670 emit_insn (gen (dest, op0, op0));
20671 vmode = get_mode_wider_vector (vmode);
20672 op0 = gen_lowpart (vmode, dest);
20673 }
20674 while (vmode != V4SImode);
20675
20676 memset (perm2, elt, 4);
be8749f9 20677 dest = gen_reg_rtx (vmode);
2bf6d935
ML
20678 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
20679 gcc_assert (ok);
be8749f9
UB
20680
20681 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
2bf6d935
ML
20682 return true;
20683
2bf6d935
ML
20684 case E_V32QImode:
20685 case E_V16HImode:
20686 case E_V8SImode:
20687 case E_V4DImode:
20688 /* For AVX2 broadcasts of the first element vpbroadcast* or
20689 vpermq should be used by expand_vec_perm_1. */
20690 gcc_assert (!TARGET_AVX2 || d->perm[0]);
20691 return false;
20692
240f0780
JJ
20693 case E_V64QImode:
20694 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
20695 return false;
20696
04b4f315
JJ
20697 case E_V32HImode:
20698 gcc_assert (!TARGET_AVX512BW);
20699 return false;
20700
2bf6d935
ML
20701 default:
20702 gcc_unreachable ();
20703 }
20704}
20705
4bf4c103 20706/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
2bf6d935
ML
20707 broadcast permutations. */
20708
20709static bool
20710expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
20711{
20712 unsigned i, elt, nelt = d->nelt;
20713
20714 if (!d->one_operand_p)
20715 return false;
20716
20717 elt = d->perm[0];
20718 for (i = 1; i < nelt; ++i)
20719 if (d->perm[i] != elt)
20720 return false;
20721
20722 return expand_vec_perm_broadcast_1 (d);
20723}
20724
20725/* Implement arbitrary permutations of two V64QImode operands
20726 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
20727static bool
20728expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
20729{
20730   if (!TARGET_AVX512BW || d->vmode != V64QImode)
20731 return false;
20732
20733 if (d->testing_p)
20734 return true;
20735
20736 struct expand_vec_perm_d ds[2];
20737 rtx rperm[128], vperm, target0, target1;
20738 unsigned int i, nelt;
20739 machine_mode vmode;
20740
20741 nelt = d->nelt;
20742 vmode = V64QImode;
20743
20744 for (i = 0; i < 2; i++)
20745 {
20746 ds[i] = *d;
20747 ds[i].vmode = V32HImode;
20748 ds[i].nelt = 32;
20749 ds[i].target = gen_reg_rtx (V32HImode);
20750 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
20751 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
20752 }
20753
20754 /* Prepare permutations such that the first one takes care of
20755 putting the even bytes into the right positions or one higher
20756 positions (ds[0]) and the second one takes care of
20757 putting the odd bytes into the right positions or one below
20758 (ds[1]). */
20759
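  /* Worked example (editor's addition): suppose d->perm[5] == 27, i.e. result
     byte 5 must come from byte 27 of op0.  Then ds[1].perm[2] = 13, so the
     word permutation places word 13 (bytes 26/27) at word position 2 (bytes
     4-5) of ds[1].target, and rperm[5 + 64] = (5 & 14) + (27 & 1) = 5 makes
     the second vpshufb pick the high byte of that word.  rperm[5] is -1, so
     the first vpshufb contributes zero at that position and the final vpor
     keeps only the odd-side byte.  */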
20760 for (i = 0; i < nelt; i++)
20761 {
20762 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
20763 if (i & 1)
20764 {
20765 rperm[i] = constm1_rtx;
20766 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
20767 }
20768 else
20769 {
20770 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
20771 rperm[i + 64] = constm1_rtx;
20772 }
20773 }
20774
20775 bool ok = expand_vec_perm_1 (&ds[0]);
20776 gcc_assert (ok);
20777 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
20778
20779 ok = expand_vec_perm_1 (&ds[1]);
20780 gcc_assert (ok);
20781 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
20782
20783 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
20784 vperm = force_reg (vmode, vperm);
20785 target0 = gen_reg_rtx (V64QImode);
20786 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
20787
20788 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
20789 vperm = force_reg (vmode, vperm);
20790 target1 = gen_reg_rtx (V64QImode);
20791 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
20792
20793 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
20794 return true;
20795}
20796
20797/* Implement arbitrary permutation of two V32QImode and V16HImode operands
20798 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
20799 all the shorter instruction sequences. */
20800
20801static bool
20802expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
20803{
20804 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
20805 unsigned int i, nelt, eltsz;
20806 bool used[4];
20807
20808 if (!TARGET_AVX2
20809 || d->one_operand_p
20810 || (d->vmode != V32QImode && d->vmode != V16HImode))
20811 return false;
20812
20813 if (d->testing_p)
20814 return true;
20815
20816 nelt = d->nelt;
20817 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
20818
20819   /* Generate 4 permutation masks.  If the required element is within
20820      the same lane, it is shuffled in.  If the required element is from the
20821      other lane, force a zero by setting bit 7 in the permutation mask.
20822      The other mask has non-negative elements where an element is requested
20823      from the other lane; such elements are also moved to the other lane,
20824      so that the result of vpshufb can have its two V2TImode halves
20825      swapped. */
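  /* Editor's note, for illustration: in the loop below, WHICH encodes which
     of the four masks a byte belongs to -- bit 1 set when the byte comes from
     d->op1, bit 0 set when it must cross a 128-bit lane -- so rperm[0] and
     rperm[2] are the same-lane masks (combined via l[]) while rperm[1] and
     rperm[3] are the cross-lane masks (combined via h[] and then lane-swapped
     with vpermq).  */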
20826 m128 = GEN_INT (-128);
20827 for (i = 0; i < 32; ++i)
20828 {
20829 rperm[0][i] = m128;
20830 rperm[1][i] = m128;
20831 rperm[2][i] = m128;
20832 rperm[3][i] = m128;
20833 }
20834 used[0] = false;
20835 used[1] = false;
20836 used[2] = false;
20837 used[3] = false;
20838 for (i = 0; i < nelt; ++i)
20839 {
20840 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
20841 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
20842 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
20843
20844 for (j = 0; j < eltsz; ++j)
20845 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
20846 used[which] = true;
20847 }
20848
20849 for (i = 0; i < 2; ++i)
20850 {
20851 if (!used[2 * i + 1])
20852 {
20853 h[i] = NULL_RTX;
20854 continue;
20855 }
20856 vperm = gen_rtx_CONST_VECTOR (V32QImode,
20857 gen_rtvec_v (32, rperm[2 * i + 1]));
20858 vperm = force_reg (V32QImode, vperm);
20859 h[i] = gen_reg_rtx (V32QImode);
20860 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
20861 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
20862 }
20863
20864   /* Swap the 128-bit lanes of h[X]. */
20865 for (i = 0; i < 2; ++i)
20866 {
20867 if (h[i] == NULL_RTX)
20868 continue;
20869 op = gen_reg_rtx (V4DImode);
20870 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
20871 const2_rtx, GEN_INT (3), const0_rtx,
20872 const1_rtx));
20873 h[i] = gen_lowpart (V32QImode, op);
20874 }
20875
20876 for (i = 0; i < 2; ++i)
20877 {
20878 if (!used[2 * i])
20879 {
20880 l[i] = NULL_RTX;
20881 continue;
20882 }
20883 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
20884 vperm = force_reg (V32QImode, vperm);
20885 l[i] = gen_reg_rtx (V32QImode);
20886 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
20887 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
20888 }
20889
20890 for (i = 0; i < 2; ++i)
20891 {
20892 if (h[i] && l[i])
20893 {
20894 op = gen_reg_rtx (V32QImode);
20895 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
20896 l[i] = op;
20897 }
20898 else if (h[i])
20899 l[i] = h[i];
20900 }
20901
20902 gcc_assert (l[0] && l[1]);
20903 op = d->target;
20904 if (d->vmode != V32QImode)
20905 op = gen_reg_rtx (V32QImode);
20906 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
20907 if (op != d->target)
20908 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
20909 return true;
20910}
20911
20912/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
20913 taken care of, perform the expansion in D and return true on success. */
20914
20915static bool
20916ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20917{
20918 /* Try a single instruction expansion. */
20919 if (expand_vec_perm_1 (d))
20920 return true;
20921
20922 /* Try sequences of two instructions. */
20923
20924 if (expand_vec_perm_pshuflw_pshufhw (d))
20925 return true;
20926
20927 if (expand_vec_perm_palignr (d, false))
20928 return true;
20929
20930 if (expand_vec_perm_interleave2 (d))
20931 return true;
20932
20933 if (expand_vec_perm_broadcast (d))
20934 return true;
20935
20936 if (expand_vec_perm_vpermq_perm_1 (d))
20937 return true;
20938
20939 if (expand_vec_perm_vperm2f128 (d))
20940 return true;
20941
20942 if (expand_vec_perm_pblendv (d))
20943 return true;
20944
829c4bea
JJ
20945 if (expand_vec_perm_2perm_interleave (d, true))
20946 return true;
20947
20948 if (expand_vec_perm_2perm_pblendv (d, true))
20949 return true;
20950
2bf6d935
ML
20951 /* Try sequences of three instructions. */
20952
20953 if (expand_vec_perm_even_odd_pack (d))
20954 return true;
20955
20956 if (expand_vec_perm_2vperm2f128_vshuf (d))
20957 return true;
20958
20959 if (expand_vec_perm_pshufb2 (d))
20960 return true;
20961
20962 if (expand_vec_perm_interleave3 (d))
20963 return true;
20964
20965 if (expand_vec_perm_vperm2f128_vblend (d))
20966 return true;
20967
829c4bea
JJ
20968 if (expand_vec_perm_2perm_interleave (d, false))
20969 return true;
20970
20971 if (expand_vec_perm_2perm_pblendv (d, false))
20972 return true;
20973
2bf6d935
ML
20974 /* Try sequences of four instructions. */
20975
20976 if (expand_vec_perm_even_odd_trunc (d))
20977 return true;
20978 if (expand_vec_perm_vpshufb2_vpermq (d))
20979 return true;
20980
20981 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
20982 return true;
20983
20984 if (expand_vec_perm_vpermt2_vpshub2 (d))
20985 return true;
20986
20987 /* ??? Look for narrow permutations whose element orderings would
20988 allow the promotion to a wider mode. */
20989
20990 /* ??? Look for sequences of interleave or a wider permute that place
20991 the data into the correct lanes for a half-vector shuffle like
20992 pshuf[lh]w or vpermilps. */
20993
20994 /* ??? Look for sequences of interleave that produce the desired results.
20995 The combinatorics of punpck[lh] get pretty ugly... */
20996
20997 if (expand_vec_perm_even_odd (d))
20998 return true;
20999
21000 /* Even longer sequences. */
21001 if (expand_vec_perm_vpshufb4_vpermq2 (d))
21002 return true;
21003
21004 /* See if we can get the same permutation in different vector integer
21005 mode. */
21006 struct expand_vec_perm_d nd;
21007 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
21008 {
21009 if (!d->testing_p)
21010 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
21011 return true;
21012 }
21013
4bf4c103
JJ
21014 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
21015 if (expand_vec_perm2_vperm2f128_vblend (d))
21016 return true;
21017
2bf6d935
ML
21018 return false;
21019}
21020
21021/* If a permutation only uses one operand, make it clear. Returns true
21022 if the permutation references both operands. */
21023
21024static bool
21025canonicalize_perm (struct expand_vec_perm_d *d)
21026{
21027 int i, which, nelt = d->nelt;
21028
21029 for (i = which = 0; i < nelt; ++i)
4bf4c103 21030 which |= (d->perm[i] < nelt ? 1 : 2);
2bf6d935
ML
21031
21032 d->one_operand_p = true;
21033 switch (which)
21034 {
21035 default:
21036 gcc_unreachable();
21037
21038 case 3:
21039 if (!rtx_equal_p (d->op0, d->op1))
21040 {
21041 d->one_operand_p = false;
21042 break;
21043 }
21044 /* The elements of PERM do not suggest that only the first operand
21045 is used, but both operands are identical. Allow easier matching
21046 of the permutation by folding the permutation into the single
21047 input vector. */
21048 /* FALLTHRU */
21049
21050 case 2:
21051 for (i = 0; i < nelt; ++i)
21052 d->perm[i] &= nelt - 1;
21053 d->op0 = d->op1;
21054 break;
21055
21056 case 1:
21057 d->op1 = d->op0;
21058 break;
21059 }
21060
21061 return (which == 3);
21062}
21063
21064/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
21065
21066bool
21067ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
21068 rtx op1, const vec_perm_indices &sel)
21069{
21070 struct expand_vec_perm_d d;
21071 unsigned char perm[MAX_VECT_LEN];
21072 unsigned int i, nelt, which;
21073 bool two_args;
21074
21075 d.target = target;
21076 d.op0 = op0;
21077 d.op1 = op1;
21078
21079 d.vmode = vmode;
21080 gcc_assert (VECTOR_MODE_P (d.vmode));
21081 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
21082 d.testing_p = !target;
21083
21084 gcc_assert (sel.length () == nelt);
21085 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
21086
21087 /* Given sufficient ISA support we can just return true here
21088 for selected vector modes. */
21089 switch (d.vmode)
21090 {
21091 case E_V16SFmode:
21092 case E_V16SImode:
21093 case E_V8DImode:
21094 case E_V8DFmode:
21095 if (!TARGET_AVX512F)
21096 return false;
21097 /* All implementable with a single vperm[it]2 insn. */
21098 if (d.testing_p)
21099 return true;
21100 break;
21101 case E_V32HImode:
50b58779 21102 if (!TARGET_AVX512F)
2bf6d935 21103 return false;
50b58779 21104 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
21105 /* All implementable with a single vperm[it]2 insn. */
21106 return true;
21107 break;
21108 case E_V64QImode:
50b58779 21109 if (!TARGET_AVX512F)
2bf6d935 21110 return false;
50b58779 21111 if (d.testing_p && TARGET_AVX512BW)
2bf6d935
ML
21112 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
21113 return true;
21114 break;
21115 case E_V8SImode:
21116 case E_V8SFmode:
21117 case E_V4DFmode:
21118 case E_V4DImode:
21119 if (!TARGET_AVX)
21120 return false;
21121 if (d.testing_p && TARGET_AVX512VL)
21122 /* All implementable with a single vperm[it]2 insn. */
21123 return true;
21124 break;
21125 case E_V16HImode:
21126 if (!TARGET_SSE2)
21127 return false;
21128 if (d.testing_p && TARGET_AVX2)
21129 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
21130 return true;
21131 break;
21132 case E_V32QImode:
21133 if (!TARGET_SSE2)
21134 return false;
21135 if (d.testing_p && TARGET_AVX2)
21136 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
21137 return true;
21138 break;
21139 case E_V8HImode:
21140 case E_V16QImode:
21141 if (!TARGET_SSE2)
21142 return false;
21143 /* Fall through. */
21144 case E_V4SImode:
21145 case E_V4SFmode:
21146 if (!TARGET_SSE)
21147 return false;
21148 /* All implementable with a single vpperm insn. */
21149 if (d.testing_p && TARGET_XOP)
21150 return true;
21151 /* All implementable with 2 pshufb + 1 ior. */
21152 if (d.testing_p && TARGET_SSSE3)
21153 return true;
21154 break;
240198fe 21155 case E_V2SFmode:
9b8579a6
UB
21156 case E_V2SImode:
21157 case E_V4HImode:
a325bdd1 21158 case E_V8QImode:
9b8579a6
UB
21159 if (!TARGET_MMX_WITH_SSE)
21160 return false;
21161 break;
8d7dae0e 21162 case E_V2HImode:
4986946f
UB
21163 if (!TARGET_SSE2)
21164 return false;
21165 /* All implementable with *punpckwd. */
21166 if (d.testing_p)
21167 return true;
21168 break;
be8749f9
UB
21169 case E_V4QImode:
21170 if (!TARGET_SSE2)
21171 return false;
21172 break;
2bf6d935
ML
21173 case E_V2DImode:
21174 case E_V2DFmode:
21175 if (!TARGET_SSE)
21176 return false;
21177 /* All implementable with shufpd or unpck[lh]pd. */
21178 if (d.testing_p)
21179 return true;
21180 break;
21181 default:
21182 return false;
21183 }
21184
21185 for (i = which = 0; i < nelt; ++i)
21186 {
21187 unsigned char e = sel[i];
21188 gcc_assert (e < 2 * nelt);
21189 d.perm[i] = e;
21190 perm[i] = e;
21191 which |= (e < nelt ? 1 : 2);
21192 }
21193
21194 if (d.testing_p)
21195 {
21196       /* If all elements are from the second vector, fold them to the first. */
21197 if (which == 2)
21198 for (i = 0; i < nelt; ++i)
21199 d.perm[i] -= nelt;
21200
21201 /* Check whether the mask can be applied to the vector type. */
21202 d.one_operand_p = (which != 3);
21203
8d7dae0e 21204 /* Implementable with shufps, pshufd or pshuflw. */
9b8579a6 21205 if (d.one_operand_p
240198fe 21206 && (d.vmode == V4SFmode || d.vmode == V2SFmode
8d7dae0e
UB
21207 || d.vmode == V4SImode || d.vmode == V2SImode
21208 || d.vmode == V4HImode || d.vmode == V2HImode))
2bf6d935
ML
21209 return true;
21210
21211 /* Otherwise we have to go through the motions and see if we can
21212 figure out how to generate the requested permutation. */
21213 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
21214 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
21215 if (!d.one_operand_p)
21216 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
21217
21218 start_sequence ();
21219 bool ret = ix86_expand_vec_perm_const_1 (&d);
21220 end_sequence ();
21221
21222 return ret;
21223 }
21224
21225 two_args = canonicalize_perm (&d);
21226
b1d1e2b5
JJ
21227 /* If one of the operands is a zero vector, try to match pmovzx. */
21228 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
21229 {
21230 struct expand_vec_perm_d dzero = d;
21231 if (d.op0 == CONST0_RTX (vmode))
21232 {
21233 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
21234 std::swap (dzero.op0, dzero.op1);
21235 for (i = 0; i < nelt; ++i)
21236 dzero.perm[i] ^= nelt;
21237 }
21238 else
21239 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
21240
21241 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
21242 dzero.perm, nelt, dzero.testing_p))
21243 return true;
21244 }
21245
21246 /* Force operands into registers. */
21247 rtx nop0 = force_reg (vmode, d.op0);
21248 if (d.op0 == d.op1)
21249 d.op1 = nop0;
21250 d.op0 = nop0;
21251 d.op1 = force_reg (vmode, d.op1);
21252
2bf6d935
ML
21253 if (ix86_expand_vec_perm_const_1 (&d))
21254 return true;
21255
21256 /* If the selector says both arguments are needed, but the operands are the
21257      same, the above tried to expand with one_operand_p and a flattened selector.
21258 If that didn't work, retry without one_operand_p; we succeeded with that
21259 during testing. */
21260 if (two_args && d.one_operand_p)
21261 {
21262 d.one_operand_p = false;
21263 memcpy (d.perm, perm, sizeof (perm));
21264 return ix86_expand_vec_perm_const_1 (&d);
21265 }
21266
21267 return false;
21268}
21269
21270void
21271ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
21272{
21273 struct expand_vec_perm_d d;
21274 unsigned i, nelt;
21275
21276 d.target = targ;
21277 d.op0 = op0;
21278 d.op1 = op1;
21279 d.vmode = GET_MODE (targ);
21280 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
21281 d.one_operand_p = false;
21282 d.testing_p = false;
21283
21284 for (i = 0; i < nelt; ++i)
21285 d.perm[i] = i * 2 + odd;
21286
21287 /* We'll either be able to implement the permutation directly... */
21288 if (expand_vec_perm_1 (&d))
21289 return;
21290
21291 /* ... or we use the special-case patterns. */
21292 expand_vec_perm_even_odd_1 (&d, odd);
21293}
21294
21295static void
21296ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
21297{
21298 struct expand_vec_perm_d d;
21299 unsigned i, nelt, base;
21300 bool ok;
21301
21302 d.target = targ;
21303 d.op0 = op0;
21304 d.op1 = op1;
21305 d.vmode = GET_MODE (targ);
21306 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
21307 d.one_operand_p = false;
21308 d.testing_p = false;
21309
21310 base = high_p ? nelt / 2 : 0;
21311 for (i = 0; i < nelt / 2; ++i)
21312 {
21313 d.perm[i * 2] = i + base;
21314 d.perm[i * 2 + 1] = i + base + nelt;
21315 }
21316
21317 /* Note that for AVX this isn't one instruction. */
21318 ok = ix86_expand_vec_perm_const_1 (&d);
21319 gcc_assert (ok);
21320}
21321
3bd86940 21322/* This function is similar to ix86_expand_vecop_qihi,
21323 but optimized under AVX512BW by using vpmovwb.
21324 For example, optimize vector MUL generation like
54cdb2f5 21325
21326 vpmovzxbw ymm2, xmm0
21327 vpmovzxbw ymm3, xmm1
21328 vpmullw ymm4, ymm2, ymm3
21329 vpmovwb xmm0, ymm4
21330
21331   it takes fewer instructions than ix86_expand_vecop_qihi.
21332   Return true on success. */
21333
3bd86940 21334static bool
21335ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
54cdb2f5 21336{
21337 machine_mode himode, qimode = GET_MODE (dest);
21338 rtx hop1, hop2, hdest;
21339 rtx (*gen_extend)(rtx, rtx);
21340 rtx (*gen_truncate)(rtx, rtx);
3bd86940 21341 bool uns_p = (code == ASHIFTRT) ? false : true;
54cdb2f5 21342
21343 /* There's no V64HImode multiplication instruction. */
21344 if (qimode == E_V64QImode)
21345 return false;
21346
21347 /* vpmovwb only available under AVX512BW. */
21348 if (!TARGET_AVX512BW)
21349 return false;
21350 if ((qimode == V8QImode || qimode == V16QImode)
21351 && !TARGET_AVX512VL)
21352 return false;
21353   /* Do not generate zmm instructions when 128/256 bit vector width is preferred. */
21354 if (qimode == V32QImode
21355 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
21356 return false;
21357
21358 switch (qimode)
21359 {
21360 case E_V8QImode:
21361 himode = V8HImode;
3bd86940 21362 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
54cdb2f5 21363 gen_truncate = gen_truncv8hiv8qi2;
21364 break;
21365 case E_V16QImode:
21366 himode = V16HImode;
3bd86940 21367 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
54cdb2f5 21368 gen_truncate = gen_truncv16hiv16qi2;
21369 break;
21370 case E_V32QImode:
21371 himode = V32HImode;
3bd86940 21372 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
54cdb2f5 21373 gen_truncate = gen_truncv32hiv32qi2;
21374 break;
21375 default:
21376 gcc_unreachable ();
21377 }
21378
21379 hop1 = gen_reg_rtx (himode);
21380 hop2 = gen_reg_rtx (himode);
21381 hdest = gen_reg_rtx (himode);
21382 emit_insn (gen_extend (hop1, op1));
21383 emit_insn (gen_extend (hop2, op2));
3bd86940 21384 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
54cdb2f5 21385 hop1, hop2)));
21386 emit_insn (gen_truncate (dest, hdest));
21387 return true;
21388}
2bf6d935 21389
c7199fb6 21390/* Expand a vector shift by a constant for V*QImode in terms of the
21391   same operation on V*HImode. Return true on success. */
3bd86940 21392static bool
21393ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
21394 rtx dest, rtx op1, rtx op2)
c7199fb6 21395{
21396 machine_mode qimode, himode;
c44c2a3b 21397 HOST_WIDE_INT and_constant, xor_constant;
c7199fb6 21398 HOST_WIDE_INT shift_amount;
21399 rtx vec_const_and, vec_const_xor;
21400 rtx tmp, op1_subreg;
21401 rtx (*gen_shift) (rtx, rtx, rtx);
21402 rtx (*gen_and) (rtx, rtx, rtx);
21403 rtx (*gen_xor) (rtx, rtx, rtx);
21404 rtx (*gen_sub) (rtx, rtx, rtx);
21405
21406 /* Only optimize shift by constant. */
21407 if (!CONST_INT_P (op2))
21408 return false;
21409
21410 qimode = GET_MODE (dest);
21411 shift_amount = INTVAL (op2);
21412   /* Do nothing when the shift amount is greater than or equal to 8. */
21413 if (shift_amount > 7)
21414 return false;
21415
21416 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
21417 /* Record sign bit. */
21418 xor_constant = 1 << (8 - shift_amount - 1);
21419
21420   /* Mask that zeroes the bits shifted in from the neighboring element. */
21421 and_constant
21422 = (code == ASHIFT ? 256 - (1 << shift_amount)
21423 : (1 << (8 - shift_amount)) - 1);
21424
21425 switch (qimode)
21426 {
21427 case V16QImode:
21428 himode = V8HImode;
21429 gen_shift =
21430 ((code == ASHIFT)
21431 ? gen_ashlv8hi3
21432 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
21433 gen_and = gen_andv16qi3;
21434 gen_xor = gen_xorv16qi3;
21435 gen_sub = gen_subv16qi3;
21436 break;
21437 case V32QImode:
21438 himode = V16HImode;
21439 gen_shift =
21440 ((code == ASHIFT)
21441 ? gen_ashlv16hi3
21442 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
21443 gen_and = gen_andv32qi3;
21444 gen_xor = gen_xorv32qi3;
21445 gen_sub = gen_subv32qi3;
21446 break;
21447 case V64QImode:
21448 himode = V32HImode;
21449 gen_shift =
21450 ((code == ASHIFT)
21451 ? gen_ashlv32hi3
21452 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
21453 gen_and = gen_andv64qi3;
21454 gen_xor = gen_xorv64qi3;
21455 gen_sub = gen_subv64qi3;
21456 break;
21457 default:
21458 gcc_unreachable ();
21459 }
21460
21461 tmp = gen_reg_rtx (himode);
21462 vec_const_and = gen_reg_rtx (qimode);
21463 op1_subreg = lowpart_subreg (himode, op1, qimode);
21464
21465   /* For ASHIFT and LSHIFTRT, perform an operation like
21466 vpsllw/vpsrlw $shift_amount, %op1, %dest.
21467 vpand %vec_const_and, %dest. */
21468 emit_insn (gen_shift (tmp, op1_subreg, op2));
21469 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
21470 emit_move_insn (vec_const_and,
21471 ix86_build_const_vector (qimode, true,
c44c2a3b 21472 gen_int_mode (and_constant, QImode)));
c7199fb6 21473 emit_insn (gen_and (dest, dest, vec_const_and));
21474
21475   /* For ASHIFTRT, additionally perform operations like
21476 vpxor %vec_const_xor, %dest, %dest
21477 vpsubb %vec_const_xor, %dest, %dest */
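  /* Worked example (editor's addition): arithmetic right shift of the byte
     0x90 (-112) by 3.  The word-level shift followed by the AND with
     and_constant = 0x1f leaves 0x12; xor_constant = 0x10 is the shifted-in
     sign-bit position, and (0x12 ^ 0x10) - 0x10 = 0x02 - 0x10 = 0xf2 = -14,
     which is indeed -112 >> 3.  */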
21478 if (code == ASHIFTRT)
21479 {
21480 vec_const_xor = gen_reg_rtx (qimode);
21481 emit_move_insn (vec_const_xor,
21482 ix86_build_const_vector (qimode, true,
c44c2a3b 21483 gen_int_mode (xor_constant, QImode)));
c7199fb6 21484 emit_insn (gen_xor (dest, dest, vec_const_xor));
21485 emit_insn (gen_sub (dest, dest, vec_const_xor));
21486 }
21487 return true;
21488}
21489
2bf6d935
ML
21490/* Expand a vector operation CODE for a V*QImode in terms of the
21491 same operation on V*HImode. */
21492
21493void
21494ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
21495{
21496 machine_mode qimode = GET_MODE (dest);
21497 machine_mode himode;
21498 rtx (*gen_il) (rtx, rtx, rtx);
21499 rtx (*gen_ih) (rtx, rtx, rtx);
21500 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
21501 struct expand_vec_perm_d d;
21502 bool ok, full_interleave;
21503 bool uns_p = false;
21504 int i;
21505
3bd86940 21506 if (CONST_INT_P (op2)
21507 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
21508 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
21509 return;
21510
21511 if (TARGET_AVX512BW
21512 && VECTOR_MODE_P (GET_MODE (op2))
21513 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
21514 return;
21515
2bf6d935
ML
21516 switch (qimode)
21517 {
21518 case E_V16QImode:
21519 himode = V8HImode;
21520 gen_il = gen_vec_interleave_lowv16qi;
21521 gen_ih = gen_vec_interleave_highv16qi;
21522 break;
21523 case E_V32QImode:
21524 himode = V16HImode;
21525 gen_il = gen_avx2_interleave_lowv32qi;
21526 gen_ih = gen_avx2_interleave_highv32qi;
21527 break;
21528 case E_V64QImode:
21529 himode = V32HImode;
21530 gen_il = gen_avx512bw_interleave_lowv64qi;
21531 gen_ih = gen_avx512bw_interleave_highv64qi;
21532 break;
21533 default:
21534 gcc_unreachable ();
21535 }
21536
2bf6d935
ML
21537 switch (code)
21538 {
21539 case MULT:
21540 /* Unpack data such that we've got a source byte in each low byte of
21541 each word. We don't care what goes into the high byte of each word.
21542 Rather than trying to get zero in there, most convenient is to let
21543 it be a copy of the low byte. */
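      /* Editor's note: copying the low byte into the high byte is harmless
	 because (a + 256*h1) * (b + 256*h2) == a*b (mod 256); only the low
	 byte of each 16-bit product is kept by the final even-element
	 extraction, so the garbage high bytes cannot leak into the result.  */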
21544 op2_l = gen_reg_rtx (qimode);
21545 op2_h = gen_reg_rtx (qimode);
21546 emit_insn (gen_il (op2_l, op2, op2));
21547 emit_insn (gen_ih (op2_h, op2, op2));
21548
21549 op1_l = gen_reg_rtx (qimode);
21550 op1_h = gen_reg_rtx (qimode);
21551 emit_insn (gen_il (op1_l, op1, op1));
21552 emit_insn (gen_ih (op1_h, op1, op1));
21553 full_interleave = qimode == V16QImode;
21554 break;
21555
21556 case ASHIFT:
21557 case LSHIFTRT:
21558 uns_p = true;
21559 /* FALLTHRU */
21560 case ASHIFTRT:
21561 op1_l = gen_reg_rtx (himode);
21562 op1_h = gen_reg_rtx (himode);
21563 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
21564 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
3bd86940 21565 /* vashr/vlshr/vashl */
21566 if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
21567 {
21568 rtx tmp = force_reg (qimode, op2);
21569 op2_l = gen_reg_rtx (himode);
21570 op2_h = gen_reg_rtx (himode);
21571 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
21572 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
21573 }
21574 else
21575 op2_l = op2_h = op2;
21576
2bf6d935
ML
21577 full_interleave = true;
21578 break;
21579 default:
21580 gcc_unreachable ();
21581 }
21582
3bd86940 21583 /* Perform vashr/vlshr/vashl. */
21584 if (code != MULT
21585 && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
21586 {
21587 res_l = gen_reg_rtx (himode);
21588 res_h = gen_reg_rtx (himode);
21589 emit_insn (gen_rtx_SET (res_l,
21590 simplify_gen_binary (code, himode,
21591 op1_l, op2_l)));
21592 emit_insn (gen_rtx_SET (res_h,
21593 simplify_gen_binary (code, himode,
21594 op1_h, op2_h)));
21595 }
21596   /* Perform mult/ashr/lshr/ashl. */
21597 else
21598 {
21599 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
21600 1, OPTAB_DIRECT);
21601 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
21602 1, OPTAB_DIRECT);
21603 }
21604
2bf6d935
ML
21605 gcc_assert (res_l && res_h);
21606
21607 /* Merge the data back into the right place. */
21608 d.target = dest;
21609 d.op0 = gen_lowpart (qimode, res_l);
21610 d.op1 = gen_lowpart (qimode, res_h);
21611 d.vmode = qimode;
21612 d.nelt = GET_MODE_NUNITS (qimode);
21613 d.one_operand_p = false;
21614 d.testing_p = false;
21615
21616 if (full_interleave)
21617 {
21618       /* For SSE2, we used a full interleave, so the desired
21619 results are in the even elements. */
21620 for (i = 0; i < d.nelt; ++i)
21621 d.perm[i] = i * 2;
21622 }
21623 else
21624 {
21625 /* For AVX, the interleave used above was not cross-lane. So the
21626 extraction is evens but with the second and third quarter swapped.
21627 Happily, that is even one insn shorter than even extraction.
21628 For AVX512BW we have 4 lanes. We extract evens from within a lane,
21629 always first from the first and then from the second source operand,
21630	 the index bits above the low 4 bits remain the same.
21631 Thus, for d.nelt == 32 we want permutation
21632 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
21633 and for d.nelt == 64 we want permutation
21634 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
21635 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
21636 for (i = 0; i < d.nelt; ++i)
21637 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
21638 }
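  /* Editor's check of the formula above: for d.nelt == 32, i == 9 gives
     ((18) & 14) + (9 & 8 ? 32 : 0) + (9 & ~15) == 2 + 32 + 0 == 34, matching
     the listed sequence 0,2,..14, 32,34,..46, 16,18,..30, 48,50,..62.  */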
21639
21640 ok = ix86_expand_vec_perm_const_1 (&d);
21641 gcc_assert (ok);
21642
21643 set_unique_reg_note (get_last_insn (), REG_EQUAL,
21644 gen_rtx_fmt_ee (code, qimode, op1, op2));
21645}
21646
21647/* Helper function of ix86_expand_mul_widen_evenodd. Return true
21648 if op is CONST_VECTOR with all odd elements equal to their
21649 preceding element. */
21650
21651static bool
21652const_vector_equal_evenodd_p (rtx op)
21653{
21654 machine_mode mode = GET_MODE (op);
21655 int i, nunits = GET_MODE_NUNITS (mode);
21656 if (GET_CODE (op) != CONST_VECTOR
21657 || nunits != CONST_VECTOR_NUNITS (op))
21658 return false;
21659 for (i = 0; i < nunits; i += 2)
21660 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
21661 return false;
21662 return true;
21663}
21664
21665void
21666ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
21667 bool uns_p, bool odd_p)
21668{
21669 machine_mode mode = GET_MODE (op1);
21670 machine_mode wmode = GET_MODE (dest);
21671 rtx x;
21672 rtx orig_op1 = op1, orig_op2 = op2;
21673
21674 if (!nonimmediate_operand (op1, mode))
21675 op1 = force_reg (mode, op1);
21676 if (!nonimmediate_operand (op2, mode))
21677 op2 = force_reg (mode, op2);
21678
21679 /* We only play even/odd games with vectors of SImode. */
21680 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
21681
21682 /* If we're looking for the odd results, shift those members down to
21683      the even slots. For some CPUs this is faster than a PSHUFD. */
21684 if (odd_p)
21685 {
21686 /* For XOP use vpmacsdqh, but only for smult, as it is only
21687 signed. */
21688 if (TARGET_XOP && mode == V4SImode && !uns_p)
21689 {
21690 x = force_reg (wmode, CONST0_RTX (wmode));
21691 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
21692 return;
21693 }
21694
21695 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
21696 if (!const_vector_equal_evenodd_p (orig_op1))
21697 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
21698 x, NULL, 1, OPTAB_DIRECT);
21699 if (!const_vector_equal_evenodd_p (orig_op2))
21700 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
21701 x, NULL, 1, OPTAB_DIRECT);
21702 op1 = gen_lowpart (mode, op1);
21703 op2 = gen_lowpart (mode, op2);
21704 }
21705
21706 if (mode == V16SImode)
21707 {
21708 if (uns_p)
21709 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
21710 else
21711 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
21712 }
21713 else if (mode == V8SImode)
21714 {
21715 if (uns_p)
21716 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
21717 else
21718 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
21719 }
21720 else if (uns_p)
21721 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
21722 else if (TARGET_SSE4_1)
21723 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
21724 else
21725 {
21726 rtx s1, s2, t0, t1, t2;
21727
21728       /* The easiest way to implement this without PMULDQ is to go through
21729	 the motions as if we were performing a full 64-bit multiply, except
21730	 that we need to do less shuffling of the elements. */
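      /* Editor's sketch of the identity used below: writing the signed value
	 as a_signed = a - 2^32 * [a < 0] (a, b taken unsigned), we have,
	 modulo 2^64,
	   a_signed * b_signed = a*b + 2^32 * ((-[a < 0])*b + (-[b < 0])*a),
	 and s1/s2 computed below are exactly -[a < 0] / -[b < 0] as 32-bit
	 masks, so the two cross products shifted into the high half fix up
	 the unsigned low*low product.  */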
21731
21732 /* Compute the sign-extension, aka highparts, of the two operands. */
21733 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
21734 op1, pc_rtx, pc_rtx);
21735 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
21736 op2, pc_rtx, pc_rtx);
21737
21738 /* Multiply LO(A) * HI(B), and vice-versa. */
21739 t1 = gen_reg_rtx (wmode);
21740 t2 = gen_reg_rtx (wmode);
21741 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
21742 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
21743
21744 /* Multiply LO(A) * LO(B). */
21745 t0 = gen_reg_rtx (wmode);
21746 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
21747
21748 /* Combine and shift the highparts into place. */
21749 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
21750 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
21751 1, OPTAB_DIRECT);
21752
21753 /* Combine high and low parts. */
21754 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
21755 return;
21756 }
21757 emit_insn (x);
21758}
21759
21760void
21761ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
21762 bool uns_p, bool high_p)
21763{
21764 machine_mode wmode = GET_MODE (dest);
21765 machine_mode mode = GET_MODE (op1);
21766 rtx t1, t2, t3, t4, mask;
21767
21768 switch (mode)
21769 {
21770 case E_V4SImode:
21771 t1 = gen_reg_rtx (mode);
21772 t2 = gen_reg_rtx (mode);
21773 if (TARGET_XOP && !uns_p)
21774 {
21775 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
21776 shuffle the elements once so that all elements are in the right
21777 place for immediate use: { A C B D }. */
21778 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
21779 const1_rtx, GEN_INT (3)));
21780 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
21781 const1_rtx, GEN_INT (3)));
21782 }
21783 else
21784 {
21785 /* Put the elements into place for the multiply. */
21786 ix86_expand_vec_interleave (t1, op1, op1, high_p);
21787 ix86_expand_vec_interleave (t2, op2, op2, high_p);
21788 high_p = false;
21789 }
21790 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
21791 break;
21792
21793 case E_V8SImode:
21794 /* Shuffle the elements between the lanes. After this we
21795 have { A B E F | C D G H } for each operand. */
21796 t1 = gen_reg_rtx (V4DImode);
21797 t2 = gen_reg_rtx (V4DImode);
21798 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
21799 const0_rtx, const2_rtx,
21800 const1_rtx, GEN_INT (3)));
21801 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
21802 const0_rtx, const2_rtx,
21803 const1_rtx, GEN_INT (3)));
21804
21805 /* Shuffle the elements within the lanes. After this we
21806 have { A A B B | C C D D } or { E E F F | G G H H }. */
21807 t3 = gen_reg_rtx (V8SImode);
21808 t4 = gen_reg_rtx (V8SImode);
21809 mask = GEN_INT (high_p
21810 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
21811 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
21812 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
21813 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
21814
21815 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
21816 break;
21817
21818 case E_V8HImode:
21819 case E_V16HImode:
21820 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
21821 uns_p, OPTAB_DIRECT);
21822 t2 = expand_binop (mode,
21823 uns_p ? umul_highpart_optab : smul_highpart_optab,
21824 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
21825 gcc_assert (t1 && t2);
21826
21827 t3 = gen_reg_rtx (mode);
21828 ix86_expand_vec_interleave (t3, t1, t2, high_p);
21829 emit_move_insn (dest, gen_lowpart (wmode, t3));
21830 break;
21831
21832 case E_V16QImode:
21833 case E_V32QImode:
21834 case E_V32HImode:
21835 case E_V16SImode:
21836 case E_V64QImode:
21837 t1 = gen_reg_rtx (wmode);
21838 t2 = gen_reg_rtx (wmode);
21839 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
21840 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
21841
21842 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
21843 break;
21844
21845 default:
21846 gcc_unreachable ();
21847 }
21848}
21849
21850void
21851ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
21852{
21853 rtx res_1, res_2, res_3, res_4;
21854
21855 res_1 = gen_reg_rtx (V4SImode);
21856 res_2 = gen_reg_rtx (V4SImode);
21857 res_3 = gen_reg_rtx (V2DImode);
21858 res_4 = gen_reg_rtx (V2DImode);
21859 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
21860 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
21861
21862 /* Move the results in element 2 down to element 1; we don't care
21863 what goes in elements 2 and 3. Then we can merge the parts
21864 back together with an interleave.
21865
21866 Note that two other sequences were tried:
21867 (1) Use interleaves at the start instead of psrldq, which allows
21868 us to use a single shufps to merge things back at the end.
21869 (2) Use shufps here to combine the two vectors, then pshufd to
21870 put the elements in the correct order.
21871 In both cases the cost of the reformatting stall was too high
21872 and the overall sequence slower. */
21873
21874 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
21875 const0_rtx, const2_rtx,
21876 const0_rtx, const0_rtx));
21877 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
21878 const0_rtx, const2_rtx,
21879 const0_rtx, const0_rtx));
21880 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
21881
21882 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
21883}
21884
21885void
21886ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
21887{
21888 machine_mode mode = GET_MODE (op0);
21889 rtx t1, t2, t3, t4, t5, t6;
21890
21891 if (TARGET_AVX512DQ && mode == V8DImode)
21892 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
21893 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
21894 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
21895 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
21896 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
21897 else if (TARGET_XOP && mode == V2DImode)
21898 {
21899 /* op1: A,B,C,D, op2: E,F,G,H */
21900 op1 = gen_lowpart (V4SImode, op1);
21901 op2 = gen_lowpart (V4SImode, op2);
21902
21903 t1 = gen_reg_rtx (V4SImode);
21904 t2 = gen_reg_rtx (V4SImode);
21905 t3 = gen_reg_rtx (V2DImode);
21906 t4 = gen_reg_rtx (V2DImode);
21907
21908 /* t1: B,A,D,C */
21909 emit_insn (gen_sse2_pshufd_1 (t1, op1,
21910 GEN_INT (1),
21911 GEN_INT (0),
21912 GEN_INT (3),
21913 GEN_INT (2)));
21914
21915 /* t2: (B*E),(A*F),(D*G),(C*H) */
21916 emit_insn (gen_mulv4si3 (t2, t1, op2));
21917
21918 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
21919 emit_insn (gen_xop_phadddq (t3, t2));
21920
21921 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
21922 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
21923
21924       /* Multiply the lower parts and add everything together. */
21925 t5 = gen_reg_rtx (V2DImode);
21926 emit_insn (gen_vec_widen_umult_even_v4si (t5,
21927 gen_lowpart (V4SImode, op1),
21928 gen_lowpart (V4SImode, op2)));
8ba6ea87 21929 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
2bf6d935
ML
21930 }
21931 else
21932 {
21933 machine_mode nmode;
21934 rtx (*umul) (rtx, rtx, rtx);
21935
21936 if (mode == V2DImode)
21937 {
21938 umul = gen_vec_widen_umult_even_v4si;
21939 nmode = V4SImode;
21940 }
21941 else if (mode == V4DImode)
21942 {
21943 umul = gen_vec_widen_umult_even_v8si;
21944 nmode = V8SImode;
21945 }
21946 else if (mode == V8DImode)
21947 {
21948 umul = gen_vec_widen_umult_even_v16si;
21949 nmode = V16SImode;
21950 }
21951 else
21952 gcc_unreachable ();
21953
21954
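      /* Editor's note, for illustration: the generic sequence below uses the
	 standard decomposition, modulo 2^64,
	   (hi_a*2^32 + lo_a) * (hi_b*2^32 + lo_b)
	     = lo_a*lo_b + 2^32 * (hi_a*lo_b + hi_b*lo_a),
	 with the widening even multiplies providing the 32x32->64 products.  */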
21955 /* Multiply low parts. */
21956 t1 = gen_reg_rtx (mode);
21957 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
21958
21959 /* Shift input vectors right 32 bits so we can multiply high parts. */
21960 t6 = GEN_INT (32);
21961 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
21962 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
21963
21964 /* Multiply high parts by low parts. */
21965 t4 = gen_reg_rtx (mode);
21966 t5 = gen_reg_rtx (mode);
21967 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
21968 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
21969
21970 /* Combine and shift the highparts back. */
21971 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
21972 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
21973
21974 /* Combine high and low parts. */
21975 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
21976 }
21977
21978 set_unique_reg_note (get_last_insn (), REG_EQUAL,
21979 gen_rtx_MULT (mode, op1, op2));
21980}
21981
21982/* Return true if the control transfer instruction INSN
21983   should be encoded with the notrack prefix. */
21984
21985bool
e8b0314a 21986ix86_notrack_prefixed_insn_p (rtx_insn *insn)
2bf6d935
ML
21987{
21988 if (!insn || !((flag_cf_protection & CF_BRANCH)))
21989 return false;
21990
21991 if (CALL_P (insn))
21992 {
21993 rtx call = get_call_rtx_from (insn);
21994 gcc_assert (call != NULL_RTX);
21995 rtx addr = XEXP (call, 0);
21996
21997 /* Do not emit 'notrack' if it's not an indirect call. */
21998 if (MEM_P (addr)
21999 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
22000 return false;
22001 else
22002 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
22003 }
22004
22005 if (JUMP_P (insn) && !flag_cet_switch)
22006 {
22007 rtx target = JUMP_LABEL (insn);
22008 if (target == NULL_RTX || ANY_RETURN_P (target))
22009 return false;
22010
22011       /* Check that the jump is a switch-table jump. */
22012 rtx_insn *label = as_a<rtx_insn *> (target);
22013 rtx_insn *table = next_insn (label);
22014 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
22015 return false;
22016 else
22017 return true;
22018 }
22019 return false;
22020}
22021
22022/* Calculate integer abs() using only SSE2 instructions. */
22023
22024void
22025ix86_expand_sse2_abs (rtx target, rtx input)
22026{
22027 machine_mode mode = GET_MODE (target);
22028 rtx tmp0, tmp1, x;
22029
22030 switch (mode)
22031 {
22032 case E_V2DImode:
22033 case E_V4DImode:
22034 /* For 64-bit signed integer X, with SSE4.2 use
22035 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
22036 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
22037 32 and use logical instead of arithmetic right shift (which is
22038 unimplemented) and subtract. */
22039 if (TARGET_SSE4_2)
22040 {
22041 tmp0 = gen_reg_rtx (mode);
22042 tmp1 = gen_reg_rtx (mode);
22043 emit_move_insn (tmp1, CONST0_RTX (mode));
22044 if (mode == E_V2DImode)
22045 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
22046 else
22047 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
22048 }
22049 else
22050 {
22051 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
22052 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
22053 - 1), NULL, 0, OPTAB_DIRECT);
22054 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
22055 }
22056
22057 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
22058 NULL, 0, OPTAB_DIRECT);
22059 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
22060 target, 0, OPTAB_DIRECT);
22061 break;
22062
22063 case E_V4SImode:
22064 /* For 32-bit signed integer X, the best way to calculate the absolute
22065 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
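      /* Editor's check: for X = -5, X >> 31 == -1, (-1 ^ -5) == 4 and
	 4 - (-1) == 5; for X >= 0 the shift gives 0 and X is unchanged.  */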
22066 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
22067 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
22068 NULL, 0, OPTAB_DIRECT);
22069 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
22070 NULL, 0, OPTAB_DIRECT);
22071 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
22072 target, 0, OPTAB_DIRECT);
22073 break;
22074
22075 case E_V8HImode:
22076 /* For 16-bit signed integer X, the best way to calculate the absolute
22077 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
22078 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
22079
22080 x = expand_simple_binop (mode, SMAX, tmp0, input,
22081 target, 0, OPTAB_DIRECT);
22082 break;
22083
22084 case E_V16QImode:
22085 /* For 8-bit signed integer X, the best way to calculate the absolute
22086 value of X is min ((unsigned char) X, (unsigned char) (-X)),
22087 as SSE2 provides the PMINUB insn. */
22088 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
22089
22090 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
22091 target, 0, OPTAB_DIRECT);
22092 break;
22093
22094 default:
22095 gcc_unreachable ();
22096 }
22097
22098 if (x != target)
22099 emit_move_insn (target, x);
22100}
22101
22102/* Expand an extract from a vector register through pextr insn.
22103 Return true if successful. */
22104
22105bool
22106ix86_expand_pextr (rtx *operands)
22107{
22108 rtx dst = operands[0];
22109 rtx src = operands[1];
22110
22111 unsigned int size = INTVAL (operands[2]);
22112 unsigned int pos = INTVAL (operands[3]);
22113
22114 if (SUBREG_P (dst))
22115 {
22116 /* Reject non-lowpart subregs. */
22117 if (SUBREG_BYTE (dst) > 0)
22118 return false;
22119 dst = SUBREG_REG (dst);
22120 }
22121
22122 if (SUBREG_P (src))
22123 {
22124 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
22125 src = SUBREG_REG (src);
22126 }
22127
22128 switch (GET_MODE (src))
22129 {
22130 case E_V16QImode:
22131 case E_V8HImode:
22132 case E_V4SImode:
22133 case E_V2DImode:
22134 case E_V1TImode:
2bf6d935
ML
22135 {
22136 machine_mode srcmode, dstmode;
22137 rtx d, pat;
22138
22139 if (!int_mode_for_size (size, 0).exists (&dstmode))
22140 return false;
22141
22142 switch (dstmode)
22143 {
22144 case E_QImode:
22145 if (!TARGET_SSE4_1)
22146 return false;
22147 srcmode = V16QImode;
22148 break;
22149
22150 case E_HImode:
22151 if (!TARGET_SSE2)
22152 return false;
22153 srcmode = V8HImode;
22154 break;
22155
22156 case E_SImode:
22157 if (!TARGET_SSE4_1)
22158 return false;
22159 srcmode = V4SImode;
22160 break;
22161
22162 case E_DImode:
22163 gcc_assert (TARGET_64BIT);
22164 if (!TARGET_SSE4_1)
22165 return false;
22166 srcmode = V2DImode;
22167 break;
22168
22169 default:
22170 return false;
22171 }
22172
22173 /* Reject extractions from misaligned positions. */
22174 if (pos & (size-1))
22175 return false;
22176
22177 if (GET_MODE (dst) == dstmode)
22178 d = dst;
22179 else
22180 d = gen_reg_rtx (dstmode);
22181
22182 /* Construct insn pattern. */
22183 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
22184 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
22185
22186 /* Let the rtl optimizers know about the zero extension performed. */
22187 if (dstmode == QImode || dstmode == HImode)
22188 {
22189 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
22190 d = gen_lowpart (SImode, d);
22191 }
22192
22193 emit_insn (gen_rtx_SET (d, pat));
22194
22195 if (d != dst)
22196 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
22197 return true;
22198 }
22199
22200 default:
22201 return false;
22202 }
22203}
22204
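/* Editorial illustration, not part of GCC: the user-level counterpart of the
   pextr expansion above.  With -msse4.1 the intrinsic below typically
   compiles to a single "pextrd $2, %xmm0, %eax"; without SSE4.1 the SImode
   case above simply returns false.  The helper name is hypothetical.  */
#if 0
#include <smmintrin.h>

static int
extract_lane2_sketch (__m128i v)
{
  return _mm_extract_epi32 (v, 2);	/* an aligned, in-range lane index  */
}
#endif
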
22205/* Expand an insert into a vector register through the pinsr insn.
22206   Return true if successful.  */
22207
22208bool
22209ix86_expand_pinsr (rtx *operands)
22210{
22211 rtx dst = operands[0];
22212 rtx src = operands[3];
22213
22214 unsigned int size = INTVAL (operands[1]);
22215 unsigned int pos = INTVAL (operands[2]);
22216
22217 if (SUBREG_P (dst))
22218 {
22219 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
22220 dst = SUBREG_REG (dst);
22221 }
22222
22223 switch (GET_MODE (dst))
22224 {
22225 case E_V16QImode:
22226 case E_V8HImode:
22227 case E_V4SImode:
22228 case E_V2DImode:
22229 case E_V1TImode:
22230 {
22231 machine_mode srcmode, dstmode;
22232 rtx (*pinsr)(rtx, rtx, rtx, rtx);
22233 rtx d;
22234
22235 if (!int_mode_for_size (size, 0).exists (&srcmode))
22236 return false;
22237
22238 switch (srcmode)
22239 {
22240 case E_QImode:
22241 if (!TARGET_SSE4_1)
22242 return false;
22243 dstmode = V16QImode;
22244 pinsr = gen_sse4_1_pinsrb;
22245 break;
22246
22247 case E_HImode:
22248 if (!TARGET_SSE2)
22249 return false;
22250 dstmode = V8HImode;
22251 pinsr = gen_sse2_pinsrw;
22252 break;
22253
22254 case E_SImode:
22255 if (!TARGET_SSE4_1)
22256 return false;
22257 dstmode = V4SImode;
22258 pinsr = gen_sse4_1_pinsrd;
22259 break;
22260
22261 case E_DImode:
22262 gcc_assert (TARGET_64BIT);
22263 if (!TARGET_SSE4_1)
22264 return false;
22265 dstmode = V2DImode;
22266 pinsr = gen_sse4_1_pinsrq;
22267 break;
22268
22269 default:
22270 return false;
22271 }
22272
22273 /* Reject insertions to misaligned positions. */
22274 if (pos & (size-1))
22275 return false;
22276
22277 if (SUBREG_P (src))
22278 {
22279 unsigned int srcpos = SUBREG_BYTE (src);
22280
22281 if (srcpos > 0)
22282 {
22283 rtx extr_ops[4];
22284
22285 extr_ops[0] = gen_reg_rtx (srcmode);
22286 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
22287 extr_ops[2] = GEN_INT (size);
22288 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
22289
22290 if (!ix86_expand_pextr (extr_ops))
22291 return false;
22292
22293 src = extr_ops[0];
22294 }
22295 else
22296 src = gen_lowpart (srcmode, SUBREG_REG (src));
22297 }
22298
22299 if (GET_MODE (dst) == dstmode)
22300 d = dst;
22301 else
22302 d = gen_reg_rtx (dstmode);
22303
22304 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
22305 gen_lowpart (srcmode, src),
22306 GEN_INT (1 << (pos / size))));
22307 if (d != dst)
22308 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
22309 return true;
22310 }
22311
22312 default:
22313 return false;
22314 }
22315}
22316
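/* Editorial illustration, not part of GCC: the user-level counterpart of the
   pinsr expansion above.  _mm_insert_epi16 needs only SSE2 (pinsrw), while
   _mm_insert_epi32 needs SSE4.1 (pinsrd), matching the TARGET_* checks in
   ix86_expand_pinsr.  The helper name is hypothetical.  */
#if 0
#include <smmintrin.h>

static __m128i
insert_lane1_sketch (__m128i v, int x)
{
  return _mm_insert_epi32 (v, x, 1);	/* "pinsrd $1, ..." with -msse4.1  */
}
#endif
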
22317/* All CPUs prefer to avoid cross-lane operations, so perform reductions by
22318   combining the upper half against the lower half down to SSE register width.  */
22319
22320machine_mode
22321ix86_split_reduction (machine_mode mode)
22322{
22323 /* Reduce lowpart against highpart until we reach SSE reg width to
22324 avoid cross-lane operations. */
22325 switch (mode)
22326 {
22327 case E_V8DImode:
22328 case E_V4DImode:
22329 return V2DImode;
22330 case E_V16SImode:
22331 case E_V8SImode:
22332 return V4SImode;
22333 case E_V32HImode:
22334 case E_V16HImode:
22335 return V8HImode;
22336 case E_V64QImode:
22337 case E_V32QImode:
22338 return V16QImode;
22339 case E_V16SFmode:
22340 case E_V8SFmode:
22341 return V4SFmode;
22342 case E_V8DFmode:
22343 case E_V4DFmode:
22344 return V2DFmode;
22345 default:
22346 return mode;
22347 }
22348}
22349
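/* Editorial illustration, not part of GCC: the halving order chosen by
   ix86_split_reduction, shown as an AVX2 sum reduction of eight 32-bit ints.
   The upper 128-bit half is folded onto the lower half first, then the work
   continues inside a single SSE register.  The helper name is hypothetical;
   compile separately with -mavx2.  */
#if 0
#include <immintrin.h>

static int
sum_epi32_sketch (__m256i v)
{
  __m128i lo = _mm256_castsi256_si128 (v);
  __m128i hi = _mm256_extracti128_si256 (v, 1);
  __m128i s = _mm_add_epi32 (lo, hi);			/* 4 partial sums  */
  s = _mm_add_epi32 (s, _mm_shuffle_epi32 (s, 0x4e));	/* 2 partial sums  */
  s = _mm_add_epi32 (s, _mm_shuffle_epi32 (s, 0xb1));	/* 1 total sum     */
  return _mm_cvtsi128_si32 (s);
}
#endif
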
22350/* Generate a call to a __divmoddi4-style divmod libfunc; the quotient is returned and the remainder is stored through a stack temporary.  */
22351
22352void
22353ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
22354 rtx op0, rtx op1,
22355 rtx *quot_p, rtx *rem_p)
22356{
22357 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
22358
22359 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
22360 mode, op0, mode, op1, mode,
22361 XEXP (rem, 0), Pmode);
22362 *quot_p = quot;
22363 *rem_p = rem;
22364}
22365
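/* Editorial illustration, not part of GCC: the call shape emitted by the
   expander above, assuming libgcc's usual __divmoddi4 prototype (quotient
   returned, remainder stored through the third argument, which above is the
   address of the SLOT_TEMP stack slot).  The wrapper name is hypothetical and
   the sketch only applies on targets where libgcc provides this routine.  */
#if 0
extern long long __divmoddi4 (long long, long long, long long *);

static long long
divmod_sketch (long long a, long long b, long long *rem)
{
  return __divmoddi4 (a, b, rem);	/* quotient returned, *rem written  */
}
#endif
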
22366#include "gt-i386-expand.h"